diff --git a/.github/workflows/unit-tests.yaml b/.github/workflows/unit-tests.yaml index a51e5b7d4e6..80ef402a6cd 100644 --- a/.github/workflows/unit-tests.yaml +++ b/.github/workflows/unit-tests.yaml @@ -53,7 +53,7 @@ jobs: - "-DFLB_SANITIZE_THREAD=On" - "-DFLB_SIMD=On" - "-DFLB_SIMD=Off" - - "-DFLB_ARROW=On" + - "-DFLB_PARQUET_ENCODER=On" - "-DFLB_COMPILER_STRICT_POINTER_TYPES=On" cmake_version: - "3.31.6" @@ -69,7 +69,7 @@ jobs: compiler: cc: clang cxx: clang++ - - flb_option: "-DFLB_ARROW=On" + - flb_option: "-DFLB_PARQUET_ENCODER=On" compiler: cc: clang cxx: clang++ @@ -114,15 +114,15 @@ jobs: with: repository: calyptia/fluent-bit-ci path: ci - - name: Setup Apache Arrow libraries for parquet (-DFLB_ARROW=On Only) - if: matrix.flb_option == '-DFLB_ARROW=On' + - name: Setup Apache Arrow libraries for Parquet encoder (-DFLB_PARQUET_ENCODER=On Only) + if: matrix.flb_option == '-DFLB_PARQUET_ENCODER=On' run: | sudo apt-get update sudo apt-get install -y -V ca-certificates lsb-release wget - wget https://packages.apache.org/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb + wget https://packages.apache.org/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb sudo apt-get install -y -V ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb sudo apt-get update - sudo apt-get install -y -V libarrow-glib-dev libparquet-glib-dev + sudo apt-get install -y -V libarrow-dev libparquet-dev - name: ${{ matrix.compiler.cc }} & ${{ matrix.compiler.cxx }} - ${{ matrix.flb_option }} run: | diff --git a/CMakeLists.txt b/CMakeLists.txt index f1d7e775168..794eee7cc1d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -199,7 +199,7 @@ option(FLB_SIMD "Enable SIMD support" No) option(FLB_CORO_STACK_SIZE "Set coroutine stack size") option(FLB_AVRO_ENCODER "Build with Avro encoding support" No) option(FLB_AWS_ERROR_REPORTER "Build with aws error reporting support" No) -option(FLB_ARROW "Build with Apache Arrow support" No) +option(FLB_PARQUET_ENCODER "Build with Parquet encoding support" No) option(FLB_WINDOWS_DEFAULTS "Build with predefined Windows settings" Yes) option(FLB_WASM "Build with WASM runtime support" Yes) option(FLB_WAMRC "Build with WASM AOT compiler executable" No) @@ -281,6 +281,11 @@ if(FLB_ALL) set(FLB_DEBUG 1) set(FLB_TLS 1) + # Encoders + set(FLB_AVRO_ENCODER 1) + # Note: FLB_PARQUET_ENCODER is not enabled by FLB_ALL due to external dependencies + # Use -DFLB_PARQUET_ENCODER=On explicitly to enable it + # Input plugins set(FLB_IN_CPU 1) set(FLB_IN_MEM 1) @@ -1069,11 +1074,12 @@ if(FLB_CONFIG_YAML) # For non-standard libyaml installation paths such as homebrew bottled libyaml. 
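Since the patch above only shows the build-system side of the FLB_ARROW to FLB_PARQUET_ENCODER rename, here is a minimal sketch (not taken from the patch) of how a source file is expected to gate Parquet-specific code on the FLB_HAVE_PARQUET_ENCODER definition that the new option emits; the helper function below is purely illustrative.

```c
/* Sketch: Parquet-specific code compiles only when the new option is on.
 * FLB_HAVE_PARQUET_ENCODER is the compile definition this patch adds via
 * FLB_DEFINITION() in CMakeLists.txt; parquet_supported() is illustrative. */
#ifdef FLB_HAVE_PARQUET_ENCODER
#include <fluent-bit/flb_parquet.h>   /* header added later in this patch */
#endif

static int parquet_supported(void)
{
#ifdef FLB_HAVE_PARQUET_ENCODER
    return 1;   /* built with -DFLB_PARQUET_ENCODER=On and Arrow/Parquet detected */
#else
    return 0;   /* feature compiled out at build time */
#endif
}
```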
include_directories(${LIBYAML_INCLUDEDIR}) link_directories(${LIBYAML_LIBRARY_DIRS}) + message(STATUS "libyaml found via pkg-config: ${LIBYAML_VERSION}") else() if (FLB_LIBYAML_DIR) set(LIBYAML_LIBRARY_DIRS "${FLB_LIBYAML_DIR}/lib") set(LIBYAML_INCLUDEDIR "${FLB_LIBYAML_DIR}/include") - message(STATUS "specified libyaml dir: ${FLB_LIBYAML_DIR}") + message(STATUS "Using specified libyaml dir: ${FLB_LIBYAML_DIR}") if (MSVC) FLB_DEFINITION(YAML_DECLARE_STATIC) endif () @@ -1082,21 +1088,45 @@ if(FLB_CONFIG_YAML) include_directories(${LIBYAML_INCLUDEDIR}) link_directories(${LIBYAML_LIBRARY_DIRS}) else () - # Requires libyaml support - check_c_source_compiles(" - #include <yaml.h> - int main() { - yaml_parser_t parser; - return 0; - }" FLB_HAVE_LIBYAML) - - if(NOT FLB_HAVE_LIBYAML) - message(FATAL_ERROR - "YAML development dependencies required for YAML configuration format handling.\n" - "This is a build time dependency, you can either install the " - "dependencies or disable the feature setting the CMake option " - "-DFLB_CONFIG_YAML=Off ." - ) + # Try to auto-detect libyaml on macOS via Homebrew + if (FLB_SYSTEM_MACOS) + find_program(BREW_EXECUTABLE brew) + if (BREW_EXECUTABLE) + execute_process( + COMMAND ${BREW_EXECUTABLE} --prefix libyaml + OUTPUT_VARIABLE BREW_LIBYAML_PREFIX + OUTPUT_STRIP_TRAILING_WHITESPACE + ERROR_QUIET + ) + if (BREW_LIBYAML_PREFIX AND EXISTS "${BREW_LIBYAML_PREFIX}") + set(LIBYAML_LIBRARY_DIRS "${BREW_LIBYAML_PREFIX}/lib") + set(LIBYAML_INCLUDEDIR "${BREW_LIBYAML_PREFIX}/include") + set(FLB_HAVE_LIBYAML 1) + FLB_DEFINITION(FLB_HAVE_LIBYAML) + include_directories(${LIBYAML_INCLUDEDIR}) + link_directories(${LIBYAML_LIBRARY_DIRS}) + message(STATUS "libyaml found via Homebrew: ${BREW_LIBYAML_PREFIX}") + endif() + endif() + endif() + + # If still not found, try compile test + if (NOT FLB_HAVE_LIBYAML) + check_c_source_compiles(" + #include <yaml.h> + int main() { + yaml_parser_t parser; + return 0; + }" FLB_HAVE_LIBYAML) + + if(NOT FLB_HAVE_LIBYAML) + message(FATAL_ERROR + "YAML development dependencies required for YAML configuration format handling.\n" + "This is a build time dependency, you can either install the " + "dependencies or disable the feature setting the CMake option " + "-DFLB_CONFIG_YAML=Off ." + ) + endif() endif() endif () @@ -1304,22 +1334,24 @@ if(FLB_OUT_PGSQL AND (NOT PostgreSQL_FOUND)) FLB_OPTION(FLB_OUT_PGSQL OFF) endif() -# Arrow GLib -# ========== -find_package(PkgConfig) -pkg_check_modules(ARROW_GLIB QUIET arrow-glib) -if(FLB_ARROW AND ARROW_GLIB_FOUND) - FLB_DEFINITION(FLB_HAVE_ARROW) -else() - set(FLB_ARROW OFF) -endif() +# Parquet +if(FLB_PARQUET_ENCODER) + # Enable C++ for Parquet support + enable_language(CXX) + set(CMAKE_CXX_STANDARD 17) -# Additional prerequisites for Apache Parquet -pkg_check_modules(ARROW_GLIB_PARQUET QUIET parquet-glib) -if(FLB_ARROW AND ARROW_GLIB_PARQUET_FOUND) - FLB_DEFINITION(FLB_HAVE_ARROW_PARQUET) -else() - message(STATUS "Arrow GLib Parquet not found. 
Disabling parquet compression") + # Detect Arrow and Parquet libraries + include(cmake/parquet.cmake) + + if(ARROW_FOUND AND PARQUET_FOUND) + FLB_DEFINITION(FLB_HAVE_PARQUET_ENCODER) + include_directories(${ARROW_INCLUDE_DIRS}) + include_directories(${PARQUET_INCLUDE_DIRS}) + else() + message(WARNING "FLB_PARQUET_ENCODER is enabled but Arrow/Parquet libraries not found.") + message(WARNING "Disabling Parquet encoder support.") + set(FLB_PARQUET_ENCODER OFF) + endif() endif() # EBPF Support diff --git a/cmake/kafka.cmake b/cmake/kafka.cmake index 09e98dd9228..c0bd713926f 100644 --- a/cmake/kafka.cmake +++ b/cmake/kafka.cmake @@ -1,5 +1,4 @@ # Kafka CMake Configuration -# kafka.cmake - Clean version without internal AWS check FLB_OPTION(RDKAFKA_BUILD_STATIC On) FLB_OPTION(RDKAFKA_BUILD_EXAMPLES Off) FLB_OPTION(RDKAFKA_BUILD_TESTS Off) @@ -7,38 +6,48 @@ FLB_OPTION(ENABLE_LZ4_EXT Off) include(FindPkgConfig) -# Check for libsasl2 (required for SASL authentication) -set(FLB_SASL_ENABLED OFF) +# librdkafka has built-in support for: +# - SASL/PLAIN (built-in, no external deps) +# - SASL/SCRAM (built-in, no external deps) +# - SASL/OAUTHBEARER (built-in, no external deps) +# Only SASL/GSSAPI (Kerberos) requires cyrus-sasl library + +# Check for cyrus-sasl (optional, only needed for GSSAPI/Kerberos) +set(FLB_SASL_CYRUS_ENABLED OFF) if(PkgConfig_FOUND) pkg_check_modules(SASL libsasl2) if(SASL_FOUND) - message(STATUS "Found libsasl2: ${SASL_VERSION}") - set(FLB_SASL_ENABLED ON) + message(STATUS "Found cyrus-sasl: ${SASL_VERSION}") + set(FLB_SASL_CYRUS_ENABLED ON) else() - message(WARNING "libsasl2 not found - SASL authentication will be disabled") + message(STATUS "cyrus-sasl not found - SASL/GSSAPI (Kerberos) will be disabled") endif() else() - message(WARNING "pkg-config not available - trying fallback SASL detection") - # Fallback detection + message(STATUS "pkg-config not available - trying fallback cyrus-sasl detection") find_library(SASL2_LIB NAMES sasl2) find_path(SASL2_INCLUDE NAMES sasl/sasl.h) if(SASL2_LIB AND SASL2_INCLUDE) - set(FLB_SASL_ENABLED ON) - message(STATUS "Found libsasl2 via fallback: ${SASL2_LIB}") + set(FLB_SASL_CYRUS_ENABLED ON) + set(SASL_LIBRARIES ${SASL2_LIB}) + set(SASL_INCLUDE_DIRS ${SASL2_INCLUDE}) + message(STATUS "Found cyrus-sasl via fallback: ${SASL2_LIB}") endif() endif() -# OAuth Bearer is built into librdkafka when SASL is available -set(FLB_SASL_OAUTHBEARER_ENABLED ${FLB_SASL_ENABLED}) +# SASL is always enabled (built-in PLAIN/SCRAM/OAUTHBEARER support) +set(FLB_SASL_ENABLED ON) + +# OAuth Bearer is built into librdkafka (no external deps needed) +set(FLB_SASL_OAUTHBEARER_ENABLED ON) -# MSK IAM requires OAuth Bearer support -set(FLB_KAFKA_MSK_IAM_ENABLED ${FLB_SASL_OAUTHBEARER_ENABLED}) +# MSK IAM requires OAuth Bearer support (which is always available now) +set(FLB_KAFKA_MSK_IAM_ENABLED ON) # Configure librdkafka options -FLB_OPTION(WITH_SASL ${FLB_SASL_ENABLED}) -FLB_OPTION(WITH_SSL On) -FLB_OPTION(WITH_SASL_OAUTHBEARER ${FLB_SASL_OAUTHBEARER_ENABLED}) -FLB_OPTION(WITH_SASL_CYRUS ${FLB_SASL_ENABLED}) +FLB_OPTION(WITH_SASL ON) # Always ON (built-in PLAIN/SCRAM) +FLB_OPTION(WITH_SSL On) # SSL support +FLB_OPTION(WITH_SASL_OAUTHBEARER ON) # Always ON (built-in) +FLB_OPTION(WITH_SASL_CYRUS ${FLB_SASL_CYRUS_ENABLED}) # Only if cyrus-sasl found # Export compile-time definitions using FLB_DEFINITION macro if(FLB_SASL_ENABLED) @@ -66,6 +75,12 @@ add_subdirectory(${FLB_PATH_LIB_RDKAFKA} EXCLUDE_FROM_ALL) set(KAFKA_LIBRARIES "rdkafka") +# Add SASL libraries if 
cyrus-sasl is enabled +if(FLB_SASL_CYRUS_ENABLED AND SASL_LIBRARIES) + list(APPEND KAFKA_LIBRARIES ${SASL_LIBRARIES}) + message(STATUS "Added SASL libraries to Kafka: ${SASL_LIBRARIES}") +endif() + # Summary of what's enabled message(STATUS "=== Kafka Feature Summary ===") message(STATUS "SASL Auth: ${FLB_SASL_ENABLED}") diff --git a/cmake/parquet.cmake b/cmake/parquet.cmake new file mode 100644 index 00000000000..3f40c3c687a --- /dev/null +++ b/cmake/parquet.cmake @@ -0,0 +1,311 @@ +# Parquet detection for Fluent Bit (Enhanced Cross-Platform Version) +# ===================================================================== +# This module detects Apache Arrow and Parquet C++ libraries across multiple platforms. +# +# The following variables are set: +# ARROW_FOUND - System has Arrow library +# ARROW_INCLUDE_DIRS - Arrow include directories +# ARROW_LIBRARIES - Arrow libraries to link +# PARQUET_FOUND - System has Parquet library +# PARQUET_INCLUDE_DIRS - Parquet include directories +# PARQUET_LIBRARIES - Parquet libraries to link + +# Platform detection +if(WIN32) + set(PLATFORM_NAME "Windows") +elseif(APPLE) + set(PLATFORM_NAME "macOS") +elseif(UNIX) + set(PLATFORM_NAME "Linux") +else() + set(PLATFORM_NAME "Unknown") +endif() + +message(STATUS "Detecting Arrow/Parquet libraries on ${PLATFORM_NAME}...") + +# ============================================================================= +# Method 1: Try pkg-config (Linux/macOS) +# ============================================================================= +if(NOT WIN32) + find_package(PkgConfig QUIET) + if(PKG_CONFIG_FOUND) + pkg_check_modules(ARROW QUIET arrow) + pkg_check_modules(PARQUET QUIET parquet) + if(ARROW_FOUND AND PARQUET_FOUND) + message(STATUS "Found via pkg-config") + endif() + endif() +endif() + +# ============================================================================= +# Method 2: Try CMake Config files (All platforms) +# ============================================================================= +if(NOT ARROW_FOUND) + find_package(Arrow QUIET CONFIG) + if(Arrow_FOUND) + set(ARROW_FOUND TRUE) + + # Handle both arrow_shared and arrow_static, and get include directories + if(TARGET arrow_shared) + set(ARROW_LIBRARIES arrow_shared) + get_target_property(ARROW_INCLUDE_DIRS arrow_shared INTERFACE_INCLUDE_DIRECTORIES) + elseif(TARGET arrow_static) + set(ARROW_LIBRARIES arrow_static) + get_target_property(ARROW_INCLUDE_DIRS arrow_static INTERFACE_INCLUDE_DIRECTORIES) + elseif(TARGET Arrow::arrow_shared) + set(ARROW_LIBRARIES Arrow::arrow_shared) + get_target_property(ARROW_INCLUDE_DIRS Arrow::arrow_shared INTERFACE_INCLUDE_DIRECTORIES) + elseif(TARGET Arrow::arrow_static) + set(ARROW_LIBRARIES Arrow::arrow_static) + get_target_property(ARROW_INCLUDE_DIRS Arrow::arrow_static INTERFACE_INCLUDE_DIRECTORIES) + else() + set(ARROW_LIBRARIES arrow) + endif() + + # Fallback: if target property retrieval failed, check exported variables + if(NOT DEFINED ARROW_INCLUDE_DIRS OR ARROW_INCLUDE_DIRS STREQUAL "ARROW_INCLUDE_DIRS-NOTFOUND") + if(DEFINED ARROW_INCLUDE_DIRS) + # Already defined from pkg-config or elsewhere, keep it + elseif(DEFINED ARROW_INCLUDE_DIR) + # Use singular form if available + set(ARROW_INCLUDE_DIRS ${ARROW_INCLUDE_DIR}) + endif() + endif() + + message(STATUS "Found via CMake Config (Arrow)") + endif() +endif() + +if(NOT PARQUET_FOUND) + find_package(Parquet QUIET CONFIG) + if(Parquet_FOUND) + set(PARQUET_FOUND TRUE) + + # Handle both parquet_shared and parquet_static, and get include directories + if(TARGET 
parquet_shared) + set(PARQUET_LIBRARIES parquet_shared) + get_target_property(PARQUET_INCLUDE_DIRS parquet_shared INTERFACE_INCLUDE_DIRECTORIES) + elseif(TARGET parquet_static) + set(PARQUET_LIBRARIES parquet_static) + get_target_property(PARQUET_INCLUDE_DIRS parquet_static INTERFACE_INCLUDE_DIRECTORIES) + elseif(TARGET Parquet::parquet_shared) + set(PARQUET_LIBRARIES Parquet::parquet_shared) + get_target_property(PARQUET_INCLUDE_DIRS Parquet::parquet_shared INTERFACE_INCLUDE_DIRECTORIES) + elseif(TARGET Parquet::parquet_static) + set(PARQUET_LIBRARIES Parquet::parquet_static) + get_target_property(PARQUET_INCLUDE_DIRS Parquet::parquet_static INTERFACE_INCLUDE_DIRECTORIES) + else() + set(PARQUET_LIBRARIES parquet) + endif() + + # Fallback: if target property retrieval failed, check exported variables + if(NOT DEFINED PARQUET_INCLUDE_DIRS OR PARQUET_INCLUDE_DIRS STREQUAL "PARQUET_INCLUDE_DIRS-NOTFOUND") + if(DEFINED PARQUET_INCLUDE_DIRS) + # Already defined from pkg-config or elsewhere, keep it + elseif(DEFINED PARQUET_INCLUDE_DIR) + # Use singular form if available + set(PARQUET_INCLUDE_DIRS ${PARQUET_INCLUDE_DIR}) + endif() + endif() + + message(STATUS "Found via CMake Config (Parquet)") + endif() +endif() + +# ============================================================================= +# Method 3: Manual search with platform-specific paths +# ============================================================================= +if(NOT ARROW_FOUND OR NOT PARQUET_FOUND) + # Define search paths based on platform + if(WIN32) + # Windows paths (vcpkg, manual install, choco) + set(SEARCH_PATHS + "$ENV{VCPKG_ROOT}/installed/${VCPKG_TARGET_TRIPLET}" + "C:/vcpkg/installed/${VCPKG_TARGET_TRIPLET}" + "C:/Program Files/Arrow" + "C:/Program Files (x86)/Arrow" + "$ENV{ProgramFiles}/Arrow" + "$ENV{ProgramFiles(x86)}/Arrow" + "$ENV{LOCALAPPDATA}/Arrow" + ) + set(LIB_SUFFIXES lib) + set(INCLUDE_SUFFIXES include) + set(ARROW_LIB_NAMES arrow arrow_static) + set(PARQUET_LIB_NAMES parquet parquet_static) + elseif(APPLE) + # macOS paths (Homebrew, MacPorts, manual) + # Detect Apple Silicon vs Intel + execute_process( + COMMAND uname -m + OUTPUT_VARIABLE ARCH + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + if(ARCH STREQUAL "arm64") + set(HOMEBREW_PREFIX "/opt/homebrew") + else() + set(HOMEBREW_PREFIX "/usr/local") + endif() + + set(SEARCH_PATHS + ${HOMEBREW_PREFIX} + /opt/local # MacPorts + /usr/local + $ENV{HOME}/.local + ) + set(LIB_SUFFIXES lib) + set(INCLUDE_SUFFIXES include) + set(ARROW_LIB_NAMES arrow libarrow) + set(PARQUET_LIB_NAMES parquet libparquet) + else() + # Linux paths (apt, yum, manual) + set(SEARCH_PATHS + /usr + /usr/local + /opt/arrow + /opt/local + $ENV{HOME}/.local + ) + # Check for 64-bit vs 32-bit + if(CMAKE_SIZEOF_VOID_P EQUAL 8) + set(LIB_SUFFIXES lib64 lib lib/x86_64-linux-gnu) + else() + set(LIB_SUFFIXES lib lib/i386-linux-gnu) + endif() + set(INCLUDE_SUFFIXES include) + set(ARROW_LIB_NAMES arrow libarrow) + set(PARQUET_LIB_NAMES parquet libparquet) + endif() + + # Search for Arrow + if(NOT ARROW_FOUND) + find_path(ARROW_INCLUDE_DIR + NAMES arrow/api.h + PATHS ${SEARCH_PATHS} + PATH_SUFFIXES ${INCLUDE_SUFFIXES} + NO_DEFAULT_PATH + ) + + find_library(ARROW_LIBRARY + NAMES ${ARROW_LIB_NAMES} + PATHS ${SEARCH_PATHS} + PATH_SUFFIXES ${LIB_SUFFIXES} + NO_DEFAULT_PATH + ) + + if(ARROW_INCLUDE_DIR AND ARROW_LIBRARY) + set(ARROW_FOUND TRUE) + set(ARROW_INCLUDE_DIRS ${ARROW_INCLUDE_DIR}) + set(ARROW_LIBRARIES ${ARROW_LIBRARY}) + message(STATUS "Found via manual search (Arrow)") + endif() + endif() + + # 
Search for Parquet + if(NOT PARQUET_FOUND) + find_path(PARQUET_INCLUDE_DIR + NAMES parquet/api/reader.h + PATHS ${SEARCH_PATHS} + PATH_SUFFIXES ${INCLUDE_SUFFIXES} + NO_DEFAULT_PATH + ) + + find_library(PARQUET_LIBRARY + NAMES ${PARQUET_LIB_NAMES} + PATHS ${SEARCH_PATHS} + PATH_SUFFIXES ${LIB_SUFFIXES} + NO_DEFAULT_PATH + ) + + if(PARQUET_INCLUDE_DIR AND PARQUET_LIBRARY) + set(PARQUET_FOUND TRUE) + set(PARQUET_INCLUDE_DIRS ${PARQUET_INCLUDE_DIR}) + set(PARQUET_LIBRARIES ${PARQUET_LIBRARY}) + message(STATUS "Found via manual search (Parquet)") + endif() + endif() +endif() + +# ============================================================================= +# Validation and Version Check +# ============================================================================= +if(ARROW_FOUND) + # Try to detect Arrow version + if(EXISTS "${ARROW_INCLUDE_DIRS}/arrow/util/config.h") + file(STRINGS "${ARROW_INCLUDE_DIRS}/arrow/util/config.h" + ARROW_VERSION_LINE REGEX "^#define ARROW_VERSION_STRING") + if(ARROW_VERSION_LINE) + string(REGEX REPLACE "^#define ARROW_VERSION_STRING \"([0-9.]+)\".*" "\\1" + ARROW_VERSION ${ARROW_VERSION_LINE}) + message(STATUS "Arrow version: ${ARROW_VERSION}") + + # Check minimum version requirement: Arrow >= 11.0.0 + if(ARROW_VERSION VERSION_LESS "11.0.0") + message(FATAL_ERROR "Arrow version ${ARROW_VERSION} is too old. " + "The Parquet encoder requires Arrow >= 11.0.0 for the FileWriter::Open() API. " + "Please upgrade Arrow or disable Parquet support with -DFLB_PARQUET_ENCODER=Off") + endif() + endif() + endif() +endif() + +# ============================================================================= +# Report results with installation hints +# ============================================================================= +if(ARROW_FOUND AND PARQUET_FOUND) + message(STATUS "✓ Arrow found: ${ARROW_LIBRARIES}") + message(STATUS " Include dirs: ${ARROW_INCLUDE_DIRS}") + message(STATUS "✓ Parquet found: ${PARQUET_LIBRARIES}") + message(STATUS " Include dirs: ${PARQUET_INCLUDE_DIRS}") +else() + message(STATUS "✗ Arrow/Parquet not found") + + if(FLB_PARQUET_ENCODER) + message(WARNING "FLB_PARQUET_ENCODER is enabled but Arrow/Parquet libraries not found.") + message(WARNING "") + message(WARNING "Installation instructions:") + + if(WIN32) + message(WARNING " Windows (vcpkg):") + message(WARNING " vcpkg install arrow:x64-windows parquet:x64-windows") + message(WARNING " cmake -DCMAKE_TOOLCHAIN_FILE=[vcpkg root]/scripts/buildsystems/vcpkg.cmake ..") + message(WARNING "") + message(WARNING " Windows (pre-built):") + message(WARNING " Download from: https://arrow.apache.org/install/") + elseif(APPLE) + message(WARNING " macOS (Homebrew):") + message(WARNING " brew install apache-arrow") + message(WARNING "") + message(WARNING " macOS (MacPorts):") + message(WARNING " sudo port install apache-arrow") + else() + message(WARNING " Ubuntu/Debian:") + message(WARNING " sudo apt-get install -y -V ca-certificates lsb-release wget") + message(WARNING " wget https://packages.apache.org/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb") + message(WARNING " sudo apt-get install -y -V ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb") + message(WARNING " sudo apt-get update") + message(WARNING " sudo apt-get install -y -V libarrow-dev libparquet-dev") + message(WARNING "") + message(WARNING " RHEL/CentOS:") + message(WARNING " sudo yum install -y 
https://packages.apache.org/artifactory/arrow/centos/$(cut -d: -f5 /etc/system-release-cpe | cut -d. -f1)/apache-arrow-release-latest.rpm") + message(WARNING " sudo yum install -y arrow-devel parquet-devel") + endif() + + message(WARNING "") + message(WARNING "Or disable with: -DFLB_PARQUET_ENCODER=Off") + endif() +endif() + +# ============================================================================= +# Export variables to parent scope +# ============================================================================= +set(ARROW_FOUND ${ARROW_FOUND} PARENT_SCOPE) +set(ARROW_INCLUDE_DIRS ${ARROW_INCLUDE_DIRS} PARENT_SCOPE) +set(ARROW_LIBRARIES ${ARROW_LIBRARIES} PARENT_SCOPE) +set(PARQUET_FOUND ${PARQUET_FOUND} PARENT_SCOPE) +set(PARQUET_INCLUDE_DIRS ${PARQUET_INCLUDE_DIRS} PARENT_SCOPE) +set(PARQUET_LIBRARIES ${PARQUET_LIBRARIES} PARENT_SCOPE) + +# Export version if found +if(DEFINED ARROW_VERSION) + set(ARROW_VERSION ${ARROW_VERSION} PARENT_SCOPE) +endif() diff --git a/include/fluent-bit/aws/flb_aws_compress.h b/include/fluent-bit/aws/flb_aws_compress.h index 6525e96d867..b3eb2e64f7e 100644 --- a/include/fluent-bit/aws/flb_aws_compress.h +++ b/include/fluent-bit/aws/flb_aws_compress.h @@ -21,11 +21,29 @@ #define FLB_AWS_COMPRESS #include +/* + * Compression algorithms (true compression) + * Valid values: none, gzip, snappy, zstd + */ #define FLB_AWS_COMPRESS_NONE 0 #define FLB_AWS_COMPRESS_GZIP 1 -#define FLB_AWS_COMPRESS_ARROW 2 -#define FLB_AWS_COMPRESS_PARQUET 3 -#define FLB_AWS_COMPRESS_ZSTD 4 +#define FLB_AWS_COMPRESS_SNAPPY 2 +#define FLB_AWS_COMPRESS_ZSTD 3 + +/* + * File format conversion (NOT compression algorithms) + * + * DEPRECATED: FLB_AWS_COMPRESS_ARROW (4) + * - Arrow is not a proper file format for S3 + * - This value is kept only for backward compatibility to avoid compilation errors + * - DO NOT USE in new code + * + * Valid file format: PARQUET (5) + * - Use format=parquet instead of compression=parquet (deprecated usage) + * - Supported S3 output formats: json (FLB_S3_FORMAT_JSON), parquet (FLB_S3_FORMAT_PARQUET) + */ +#define FLB_AWS_COMPRESS_ARROW 4 /* DEPRECATED - Do not use */ +#define FLB_AWS_COMPRESS_PARQUET 5 /* Use format=parquet instead */ /* * Get compression type from compression keyword. The return value is used to identify diff --git a/include/fluent-bit/flb_aws_util.h b/include/fluent-bit/flb_aws_util.h index 0bed165fc1e..5700b613ed7 100644 --- a/include/fluent-bit/flb_aws_util.h +++ b/include/fluent-bit/flb_aws_util.h @@ -197,13 +197,14 @@ int flb_aws_is_auth_error(char *payload, size_t payload_size); int flb_read_file(const char *path, char **out_buf, size_t *out_size); -/* Constructs S3 object key as per the format. */ +/* + * Constructs S3 object key as per the format. + * Supports variables: $TAG, $TAG[0-9], $UUID, $INDEX, time formatters, $FILE_PATH, $FILE_NAME + * file_path: optional file path (can be NULL) - used only when format contains $FILE_PATH or $FILE_NAME + */ flb_sds_t flb_get_s3_key(const char *format, time_t time, const char *tag, - char *tag_delimiter, uint64_t seq_index); - -/* Constructs S3 object key as per the blob format. 
*/ -flb_sds_t flb_get_s3_blob_key(const char *format, const char *tag, - char *tag_delimiter, const char *blob_path); + char *tag_delimiter, uint64_t seq_index, + const char *file_path); /* * This function is an extension to strftime which can support milliseconds with %3N, diff --git a/include/fluent-bit/flb_blob_db.h b/include/fluent-bit/flb_blob_db.h index d201c3689ed..f195e5127b4 100644 --- a/include/fluent-bit/flb_blob_db.h +++ b/include/fluent-bit/flb_blob_db.h @@ -123,6 +123,9 @@ #define SQL_UPDATE_FILE_PART_IN_PROGRESS \ "UPDATE blob_parts SET in_progress=@status WHERE id=@id;" +#define SQL_UPDATE_FILE_PARTS_IN_PROGRESS \ + "UPDATE blob_parts SET in_progress=@status WHERE file_id=@file_id;" + #define SQL_UPDATE_FILE_PART_DELIVERY_ATTEMPT_COUNT \ "UPDATE blob_parts " \ " SET delivery_attempts=@delivery_attempts " \ @@ -137,8 +140,7 @@ #define SQL_RESET_FILE_PART_UPLOAD_STATES \ "UPDATE blob_parts " \ " SET delivery_attempts=0, " \ - " uploaded=0, " \ - " in_progress=0 " \ + " uploaded=0 " \ " WHERE file_id=@id;" #define SQL_GET_NEXT_FILE_PART \ @@ -192,6 +194,28 @@ "ORDER BY f.created ASC " \ "LIMIT 1;" +#define SQL_GET_ALL_PARTS_FOR_FILE \ + "SELECT id, part_id, offset_start, offset_end " \ + " FROM blob_parts " \ + " WHERE file_id = @file_id " \ + "ORDER BY part_id ASC;" + +#define SQL_GET_NEXT_PENDING_FILE \ + "SELECT DISTINCT f.id, f.path, f.destination, f.remote_id, f.tag, " \ + " (SELECT COUNT(*) FROM blob_parts p2 WHERE p2.file_id = f.id) as part_count " \ + "FROM blob_files f " \ + "JOIN blob_parts p ON f.id = p.file_id " \ + "WHERE f.aborted = 0 " \ + " AND p.uploaded = 0 " \ + " AND p.in_progress = 0 " \ + "GROUP BY f.id " \ + "ORDER BY f.created ASC " \ + "LIMIT 1;" + +#define SQL_GET_PART_UPLOAD_STATUS \ + "SELECT uploaded FROM blob_parts WHERE id = @id;" + + #define FLB_BLOB_DB_SUCCESS 0 #define FLB_BLOB_DB_ERROR_NO_BACKEND_AVAILABLE -1 @@ -287,9 +311,17 @@ FLB_BLOB_DB_ERROR_PREPARING_STATEMENT_BASE - 19 #define FLB_BLOB_DB_ERROR_PREPARING_STATEMENT_GET_FILE_PART_COUNT \ FLB_BLOB_DB_ERROR_PREPARING_STATEMENT_BASE - 20 +#define FLB_BLOB_DB_ERROR_PREPARING_STATEMENT_GET_ALL_PARTS_FOR_FILE \ + FLB_BLOB_DB_ERROR_PREPARING_STATEMENT_BASE - 21 +#define FLB_BLOB_DB_ERROR_PREPARING_STATEMENT_GET_NEXT_PENDING_FILE \ + FLB_BLOB_DB_ERROR_PREPARING_STATEMENT_BASE - 22 +#define FLB_BLOB_DB_ERROR_PREPARING_STATEMENT_GET_PART_UPLOAD_STATUS \ + FLB_BLOB_DB_ERROR_PREPARING_STATEMENT_BASE - 23 +#define FLB_BLOB_DB_ERROR_PREPARING_STATEMENT_UPDATE_FILE_PARTS_IN_PROGRESS \ + FLB_BLOB_DB_ERROR_PREPARING_STATEMENT_BASE - 24 #define FLB_BLOB_DB_ERROR_PREPARING_STATEMENT_TOP \ - FLB_BLOB_DB_ERROR_PREPARING_STATEMENT_GET_OLDEST_FILE_WITH_PARTS + FLB_BLOB_DB_ERROR_PREPARING_STATEMENT_UPDATE_FILE_PARTS_IN_PROGRESS #ifdef FLB_HAVE_SQLDB #include @@ -332,6 +364,10 @@ struct flb_blob_db { __internal_sqlite3_stmt *stmt_update_file_part_in_progress; __internal_sqlite3_stmt *stmt_get_oldest_file_with_parts; + __internal_sqlite3_stmt *stmt_get_all_parts_for_file; + __internal_sqlite3_stmt *stmt_get_next_pending_file; + __internal_sqlite3_stmt *stmt_get_part_upload_status; + __internal_sqlite3_stmt *stmt_update_file_parts_in_progress; }; int flb_blob_db_open(struct flb_blob_db *context, @@ -438,14 +474,43 @@ int flb_blob_db_file_fetch_oldest_ready(struct flb_blob_db *context, cfl_sds_t *source, cfl_sds_t *file_remote_id, cfl_sds_t *file_tag, - int *part_count); + int *part_count, + time_t *file_created); int flb_blob_db_file_fetch_part_ids(struct flb_blob_db *context, uint64_t file_id, - cfl_sds_t 
*remote_id_list, + flb_sds_t *remote_id_list, size_t remote_id_list_size, int *remote_id_count); int flb_blob_db_file_fetch_part_count(struct flb_blob_db *context, uint64_t file_id); -#endif \ No newline at end of file + +int flb_blob_db_file_fetch_all_parts(struct flb_blob_db *context, + uint64_t file_id, + uint64_t **part_db_ids, + uint64_t **part_nums, + off_t **offset_starts, + off_t **offset_ends, + int *count); + +int flb_blob_db_file_get_next_pending(struct flb_blob_db *context, + uint64_t *file_id, + cfl_sds_t *path, + cfl_sds_t *destination, + cfl_sds_t *remote_id, + cfl_sds_t *tag, + int *part_count); + +int flb_blob_db_file_part_check_uploaded(struct flb_blob_db *context, + uint64_t part_id, + int *uploaded); + +int flb_blob_db_file_parts_in_progress(struct flb_blob_db *context, + uint64_t file_id, + int status); + +/* Recovery helpers */ +int flb_blob_db_reset_zombie_parts(struct flb_blob_db *context); + +#endif diff --git a/include/fluent-bit/flb_parquet.h b/include/fluent-bit/flb_parquet.h new file mode 100644 index 00000000000..b8a8faf4d38 --- /dev/null +++ b/include/fluent-bit/flb_parquet.h @@ -0,0 +1,98 @@ +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ + +/* Fluent Bit + * ========== + * Copyright (C) 2015-2024 The Fluent Bit Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef FLB_PARQUET_H +#define FLB_PARQUET_H + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Validate Parquet schema at startup (Fail Fast) + * + * @param schema_str JSON schema string to validate + * @param error_msg Output buffer for error message (optional) + * @param error_msg_size Size of error buffer + * @return 0 on success, -1 on failure + */ +int flb_parquet_validate_schema(const char *schema_str, + char *error_msg, + size_t error_msg_size); + +/* + * Opaque handle for Parquet schema + * This provides type safety while keeping implementation details hidden + */ +typedef struct flb_parquet_schema flb_parquet_schema; + +/* + * Create Parquet schema from JSON (avoids stack overflow in coroutine) + * + * This function parses the JSON schema in the main thread context where + * stack space is not limited, avoiding stack overflow when yyjson recursively + * parses deeply nested schemas in Fluent Bit's small coroutine stacks (37KB). + * + * @param schema_str JSON schema string + * @param error_msg Output buffer for error message (optional) + * @param error_msg_size Size of error buffer + * @return Schema handle, or NULL on failure + */ +flb_parquet_schema *flb_parquet_schema_create(const char *schema_str, + char *error_msg, + size_t error_msg_size); + +/* + * Free Parquet schema + * + * @param schema Schema handle returned by flb_parquet_schema_create + */ +void flb_parquet_schema_destroy(flb_parquet_schema *schema); + +/* + * Convert msgpack to Parquet using streaming approach + * + * This function accepts a pre-parsed schema to avoid stack overflow + * in coroutines. 
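The new blob-DB helpers above (pending-file lookup, per-file part listing, per-file in_progress updates) are only declared here. A hedged sketch of how they appear intended to fit together follows; the return-value conventions, ownership of the returned strings/arrays, and the status value 1 are assumptions, and cleanup is elided.

```c
/* Hedged sketch: dispatch the oldest pending blob file using the new helpers.
 * Error/ownership conventions are assumed; freeing of the cfl_sds_t strings
 * and the returned arrays is omitted for brevity. */
#include <stdint.h>
#include <fluent-bit/flb_blob_db.h>

static int dispatch_next_pending(struct flb_blob_db *db)
{
    uint64_t file_id;
    cfl_sds_t path = NULL, destination = NULL, remote_id = NULL, tag = NULL;
    int part_count = 0;
    uint64_t *part_db_ids = NULL, *part_nums = NULL;
    off_t *offset_starts = NULL, *offset_ends = NULL;
    int count = 0;
    int ret;

    /* SQL_GET_NEXT_PENDING_FILE: oldest non-aborted file that still has parts
     * with uploaded=0 and in_progress=0 (assumed: non-SUCCESS when none). */
    ret = flb_blob_db_file_get_next_pending(db, &file_id, &path, &destination,
                                            &remote_id, &tag, &part_count);
    if (ret != FLB_BLOB_DB_SUCCESS) {
        return ret;
    }

    /* SQL_GET_ALL_PARTS_FOR_FILE: every part of that file, ordered by part_id. */
    ret = flb_blob_db_file_fetch_all_parts(db, file_id, &part_db_ids, &part_nums,
                                           &offset_starts, &offset_ends, &count);
    if (ret == FLB_BLOB_DB_SUCCESS && count > 0) {
        /* SQL_UPDATE_FILE_PARTS_IN_PROGRESS: mark the whole file as in
         * progress (assumed: status 1 = in progress) before enqueueing. */
        flb_blob_db_file_parts_in_progress(db, file_id, 1);
        /* ... enqueue each of the 'count' parts for upload here ... */
    }
    return ret;
}
```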
Use flb_parquet_schema_create() during plugin + * initialization to create the schema. + * + * @param msgpack_file_path Path to the msgpack file to read + * @param schema Pre-parsed schema from flb_parquet_schema_create() + * @param compression Compression type (FLB_AWS_COMPRESS_*) + * @param output_file Path where the Parquet file will be written + * @param out_file_size Output: size of the generated Parquet file + * @param total_file_size Configured total_file_size for optimization + * @return 0 on success, -1 on failure + */ +int flb_msgpack_to_parquet_streaming(const char *msgpack_file_path, + flb_parquet_schema *schema, + int compression, + const char *output_file, + size_t *out_file_size, + size_t total_file_size); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/lib/avro/CMakeLists.txt b/lib/avro/CMakeLists.txt index 36b0a028323..242048b5ad3 100644 --- a/lib/avro/CMakeLists.txt +++ b/lib/avro/CMakeLists.txt @@ -16,7 +16,7 @@ # specific language governing permissions and limitations # under the License. # -cmake_minimum_required(VERSION 3.1) +cmake_minimum_required(VERSION 3.12) project(AvroC C) enable_testing() diff --git a/lib/jansson-e23f558/CMakeLists.txt b/lib/jansson-e23f558/CMakeLists.txt index ed33e3c47f7..e22e6e7b549 100644 --- a/lib/jansson-e23f558/CMakeLists.txt +++ b/lib/jansson-e23f558/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required (VERSION 3.1) +cmake_minimum_required (VERSION 3.12) project(jansson C) # Options @@ -564,8 +564,9 @@ if (NOT JANSSON_WITHOUT_TESTS) # Enable using "make check" just like the autotools project. # By default cmake creates a target "make test" - add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND} - DEPENDS json_process ${api_tests}) + # Disabled to avoid conflicts with other libraries (e.g., nghttp2) + # add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND} + # DEPENDS json_process ${api_tests}) endif () # diff --git a/plugins/in_blob/blob_file.c b/plugins/in_blob/blob_file.c index 0a51bd9de81..067afce234b 100644 --- a/plugins/in_blob/blob_file.c +++ b/plugins/in_blob/blob_file.c @@ -78,11 +78,13 @@ int blob_file_append(struct blob_ctx *ctx, char *path, struct stat *st) #ifdef FLB_HAVE_SQLDB /* insert the entry into the database */ - bfile->db_id = blob_db_file_insert(ctx, path, st->st_size); - if (bfile->db_id < 0) { - cfl_sds_destroy(bfile->path); - flb_free(bfile); - return -1; + if (ctx->database_file) { + bfile->db_id = blob_db_file_insert(ctx, path, st->st_size); + if (bfile->db_id < 0) { + cfl_sds_destroy(bfile->path); + flb_free(bfile); + return -1; + } } #endif @@ -120,4 +122,4 @@ void blob_file_list_remove_all(struct blob_ctx *ctx) cfl_list_del(&bfile->_head); blob_file_list_remove(bfile); } -} \ No newline at end of file +} diff --git a/plugins/out_kinesis_firehose/firehose.c b/plugins/out_kinesis_firehose/firehose.c index fb4cb1dc6cb..3dea0ddae2b 100644 --- a/plugins/out_kinesis_firehose/firehose.c +++ b/plugins/out_kinesis_firehose/firehose.c @@ -496,8 +496,7 @@ static struct flb_config_map config_map[] = { FLB_CONFIG_MAP_STR, "compression", NULL, 0, FLB_FALSE, 0, "Compression type for Firehose records. Each log record is individually compressed " - "and sent to Firehose. 'gzip' and 'arrow' are the supported values. " - "'arrow' is only an available if Apache Arrow was enabled at compile time. " + "and sent to Firehose. Supported values: 'gzip', 'snappy', 'zstd'. " "Defaults to no compression." 
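Putting the flb_parquet.h API together, here is a minimal lifecycle sketch under the header's own contract: parse the schema once at initialization (outside the small coroutine stacks), reuse it for each conversion, and destroy it at shutdown. The file paths, the zero total_file_size hint, and the error handling are illustrative, not taken from the patch.

```c
/* Hedged sketch of the intended flb_parquet lifecycle. Paths and the
 * schema string are illustrative. */
#include <stdio.h>
#include <fluent-bit/flb_parquet.h>
#include <fluent-bit/aws/flb_aws_compress.h>

int convert_example(const char *schema_json)
{
    char err[512];
    flb_parquet_schema *schema;
    size_t parquet_size = 0;
    int ret;

    /* init-time: parse and cache the schema (fail fast on bad input) */
    schema = flb_parquet_schema_create(schema_json, err, sizeof(err));
    if (schema == NULL) {
        fprintf(stderr, "schema error: %s\n", err);
        return -1;
    }

    /* flush-time: stream a buffered msgpack file into a Parquet file */
    ret = flb_msgpack_to_parquet_streaming("/tmp/chunk.msgpack", schema,
                                           FLB_AWS_COMPRESS_GZIP,
                                           "/tmp/chunk.parquet",
                                           &parquet_size,
                                           0 /* total_file_size hint, assumed optional */);

    /* shutdown-time: release the cached schema */
    flb_parquet_schema_destroy(schema);
    return ret;
}
```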
}, diff --git a/plugins/out_s3/CMakeLists.txt b/plugins/out_s3/CMakeLists.txt index 94e04861707..010148d1492 100644 --- a/plugins/out_s3/CMakeLists.txt +++ b/plugins/out_s3/CMakeLists.txt @@ -1,6 +1,10 @@ set(src s3.c s3_store.c - s3_multipart.c) + s3_stream.c + s3_multipart.c + s3_blob.c + s3_auth.c + s3_queue.c) FLB_PLUGIN(out_s3 "${src}" "") diff --git a/plugins/out_s3/s3.c b/plugins/out_s3/s3.c index 3016b28d69a..7052d1b8d28 100644 --- a/plugins/out_s3/s3.c +++ b/plugins/out_s3/s3.c @@ -39,8 +39,17 @@ #include +#ifdef FLB_HAVE_PARQUET_ENCODER +#include +#endif + #include "s3.h" +#include "s3_multipart.h" #include "s3_store.h" +#include "s3_stream.h" +#include "s3_blob.h" +#include "s3_auth.h" +#include "s3_queue.h" #define DEFAULT_S3_PORT 443 #define DEFAULT_S3_INSECURE_PORT 80 @@ -60,84 +69,16 @@ static int setenv(const char *name, const char *value, int overwrite) } #endif -static int s3_timer_create(struct flb_s3 *ctx); - -static int construct_request_buffer(struct flb_s3 *ctx, flb_sds_t new_data, - struct s3_file *chunk, - char **out_buf, size_t *out_size); - -static int s3_put_object(struct flb_s3 *ctx, const char *tag, time_t file_first_log_time, - char *body, size_t body_size); - -static int put_all_chunks(struct flb_s3 *ctx); - -static void cb_s3_upload(struct flb_config *ctx, void *data); - -static struct multipart_upload *get_upload(struct flb_s3 *ctx, - const char *tag, int tag_len); - -static struct multipart_upload *create_upload(struct flb_s3 *ctx, - const char *tag, int tag_len, - time_t file_first_log_time); - -static void remove_from_queue(struct upload_queue *entry); - -static int blob_initialize_authorization_endpoint_upstream(struct flb_s3 *context); - -static struct flb_aws_header *get_content_encoding_header(int compression_type) +/* Check if plugin is running under test mode */ +int s3_plugin_under_test() { - static struct flb_aws_header gzip_header = { - .key = "Content-Encoding", - .key_len = 16, - .val = "gzip", - .val_len = 4, - }; - - static struct flb_aws_header zstd_header = { - .key = "Content-Encoding", - .key_len = 16, - .val = "zstd", - .val_len = 4, - }; - - switch (compression_type) { - case FLB_AWS_COMPRESS_GZIP: - return &gzip_header; - case FLB_AWS_COMPRESS_ZSTD: - return &zstd_header; - default: - return NULL; + if (getenv("FLB_S3_PLUGIN_UNDER_TEST") != NULL) { + return FLB_TRUE; } + return FLB_FALSE; } -static struct flb_aws_header content_type_header = { - .key = "Content-Type", - .key_len = 12, - .val = "", - .val_len = 0, -}; - -static struct flb_aws_header canned_acl_header = { - .key = "x-amz-acl", - .key_len = 9, - .val = "", - .val_len = 0, -}; - -static struct flb_aws_header content_md5_header = { - .key = "Content-MD5", - .key_len = 11, - .val = "", - .val_len = 0, -}; - -static struct flb_aws_header storage_class_header = { - .key = "x-amz-storage-class", - .key_len = 19, - .val = "", - .val_len = 0, -}; - +/* Mock error response generator for testing */ static char *mock_error_response(char *error_env_var) { char *err_val = NULL; @@ -151,7 +92,6 @@ static char *mock_error_response(char *error_env_var) flb_errno(); return NULL; } - len = strlen(err_val); memcpy(error, err_val, len); error[len] = '\0'; @@ -161,94 +101,10 @@ static char *mock_error_response(char *error_env_var) return NULL; } -int s3_plugin_under_test() -{ - if (getenv("FLB_S3_PLUGIN_UNDER_TEST") != NULL) { - return FLB_TRUE; - } - - return FLB_FALSE; -} - -int create_headers(struct flb_s3 *ctx, char *body_md5, - struct flb_aws_header **headers, int *num_headers, - int 
multipart_upload) -{ - int n = 0; - int headers_len = 0; - struct flb_aws_header *s3_headers = NULL; - struct flb_aws_header *encoding_header = NULL; - - if (ctx->content_type != NULL) { - headers_len++; - } - if (ctx->compression == FLB_AWS_COMPRESS_GZIP || ctx->compression == FLB_AWS_COMPRESS_ZSTD) { - headers_len++; - } - if (ctx->canned_acl != NULL) { - headers_len++; - } - if (body_md5 != NULL && strlen(body_md5) && multipart_upload == FLB_FALSE) { - headers_len++; - } - if (ctx->storage_class != NULL) { - headers_len++; - } - if (headers_len == 0) { - *num_headers = headers_len; - *headers = s3_headers; - return 0; - } - - s3_headers = flb_calloc(headers_len, sizeof(struct flb_aws_header)); - if (s3_headers == NULL) { - flb_errno(); - return -1; - } - - if (ctx->content_type != NULL) { - s3_headers[n] = content_type_header; - s3_headers[n].val = ctx->content_type; - s3_headers[n].val_len = strlen(ctx->content_type); - n++; - } - if (ctx->compression == FLB_AWS_COMPRESS_GZIP || ctx->compression == FLB_AWS_COMPRESS_ZSTD) { - encoding_header = get_content_encoding_header(ctx->compression); - - if (encoding_header == NULL) { - flb_errno(); - flb_free(s3_headers); - return -1; - } - s3_headers[n] = *encoding_header; - n++; - } - if (ctx->canned_acl != NULL) { - s3_headers[n] = canned_acl_header; - s3_headers[n].val = ctx->canned_acl; - s3_headers[n].val_len = strlen(ctx->canned_acl); - n++; - } - if (body_md5 != NULL && strlen(body_md5) && multipart_upload == FLB_FALSE) { - s3_headers[n] = content_md5_header; - s3_headers[n].val = body_md5; - s3_headers[n].val_len = strlen(body_md5); - n++; - } - if (ctx->storage_class != NULL) { - s3_headers[n] = storage_class_header; - s3_headers[n].val = ctx->storage_class; - s3_headers[n].val_len = strlen(ctx->storage_class); - } - - *num_headers = headers_len; - *headers = s3_headers; - return 0; -}; - -struct flb_http_client *mock_s3_call(char *error_env_var, char *api) +/* Mock S3 API calls for testing */ +struct flb_http_client *mock_s3_call(char *error_env_var, char *api, + const char *body, size_t body_size) { - /* create an http client so that we can set the response */ struct flb_http_client *c = NULL; char *error = mock_error_response(error_env_var); char *resp; @@ -264,7 +120,6 @@ struct flb_http_client *mock_s3_call(char *error_env_var, char *api) if (error != NULL) { c->resp.status = 400; - /* resp.data is freed on destroy, payload is supposed to reference it */ c->resp.data = error; c->resp.payload = c->resp.data; c->resp.payload_size = strlen(error); @@ -273,8 +128,8 @@ struct flb_http_client *mock_s3_call(char *error_env_var, char *api) c->resp.status = 200; c->resp.payload = ""; c->resp.payload_size = 0; + if (strcmp(api, "CreateMultipartUpload") == 0) { - /* mocked success response */ c->resp.payload = "\n" "\n" "example-bucket\n" @@ -283,15 +138,13 @@ struct flb_http_client *mock_s3_call(char *error_env_var, char *api) ""; c->resp.payload_size = strlen(c->resp.payload); } - if (strcmp(api, "AbortMultipartUpload") == 0) { - /* mocked success response */ + else if (strcmp(api, "AbortMultipartUpload") == 0) { c->resp.status = 204; - resp = "Date: Mon, 1 Nov 2010 20:34:56 GMT\n" - "ETag: \"b54357faf0632cce46e942fa68356b38\"\n" - "Content-Length: 0\n" - "Connection: keep-alive\n" - "Server: AmazonS3"; - /* since etag is in the headers, this code uses resp.data */ + resp = "Date: Mon, 1 Nov 2010 20:34:56 GMT\n" + "ETag: \"b54357faf0632cce46e942fa68356b38\"\n" + "Content-Length: 0\n" + "Connection: keep-alive\n" + "Server: AmazonS3"; len = 
strlen(resp); c->resp.data = flb_calloc(len + 1, sizeof(char)); if (!c->resp.data) { @@ -304,13 +157,11 @@ struct flb_http_client *mock_s3_call(char *error_env_var, char *api) c->resp.data_size = len; } else if (strcmp(api, "UploadPart") == 0) { - /* mocked success response */ - resp = "Date: Mon, 1 Nov 2010 20:34:56 GMT\n" - "ETag: \"b54357faf0632cce46e942fa68356b38\"\n" - "Content-Length: 0\n" - "Connection: keep-alive\n" - "Server: AmazonS3"; - /* since etag is in the headers, this code uses resp.data */ + resp = "Date: Mon, 1 Nov 2010 20:34:56 GMT\n" + "ETag: \"b54357faf0632cce46e942fa68356b38\"\n" + "Content-Length: 0\n" + "Connection: keep-alive\n" + "Server: AmazonS3"; len = strlen(resp); c->resp.data = flb_calloc(len + 1, sizeof(char)); if (!c->resp.data) { @@ -322,15 +173,15 @@ struct flb_http_client *mock_s3_call(char *error_env_var, char *api) c->resp.data[len] = '\0'; c->resp.data_size = len; } - else { - c->resp.payload = ""; - c->resp.payload_size = 0; - } } return c; } +/* Timer callback - processes upload queue and retries pending uploads */ +void cb_s3_upload(struct flb_config *ctx, void *data); + +/* Concatenate two path segments with '/' separator */ static flb_sds_t concat_path(char *p1, char *p2) { flb_sds_t dir; @@ -373,7 +224,7 @@ static int read_seq_index(char *seq_index_file, uint64_t *seq_index) } /* Writes index value to metadata file */ -static int write_seq_index(char *seq_index_file, uint64_t seq_index) +int write_seq_index(char *seq_index_file, uint64_t seq_index) { FILE *fp; int ret; @@ -440,7 +291,12 @@ static int init_seq_index(void *context) { return -1; } - sprintf(tmp_buf, "%d", ctx->ins->id); + ret = snprintf(tmp_buf, sizeof(tmp_buf), "%d", ctx->ins->id); + if (ret < 0 || ret >= sizeof(tmp_buf)) { + flb_plg_error(ctx->ins, "Failed to format sequential index file path"); + flb_errno(); + return -1; + } ret = flb_sds_cat_safe(&ctx->seq_index_file, tmp_buf, strlen(tmp_buf)); if (ret < 0) { flb_plg_error(ctx->ins, "Failed to create sequential index file path"); @@ -481,46 +337,23 @@ static int init_seq_index(void *context) { return 0; } -void multipart_upload_destroy(struct multipart_upload *m_upload) -{ - int i; - flb_sds_t etag; - - if (!m_upload) { - return; - } - - if (m_upload->s3_key) { - flb_sds_destroy(m_upload->s3_key); - } - if (m_upload->tag) { - flb_sds_destroy(m_upload->tag); - } - if (m_upload->upload_id) { - flb_sds_destroy(m_upload->upload_id); - } - - for (i = 0; i < m_upload->part_number; i++) { - etag = m_upload->etags[i]; - if (etag) { - flb_sds_destroy(etag); - } - } - - flb_free(m_upload); -} - static void s3_context_destroy(struct flb_s3 *ctx) { struct mk_list *head; struct mk_list *tmp; - struct multipart_upload *m_upload; struct upload_queue *upload_contents; if (!ctx) { return; } +#ifdef FLB_HAVE_PARQUET_ENCODER + if (ctx->cached_arrow_schema) { + flb_parquet_schema_destroy(ctx->cached_arrow_schema); + ctx->cached_arrow_schema = NULL; + } +#endif + if (ctx->base_provider) { flb_aws_provider_destroy(ctx->base_provider); } @@ -569,18 +402,9 @@ static void s3_context_destroy(struct flb_s3 *ctx) flb_tls_destroy(ctx->authorization_endpoint_tls_context); } - /* Remove uploads */ - mk_list_foreach_safe(head, tmp, &ctx->uploads) { - m_upload = mk_list_entry(head, struct multipart_upload, _head); - mk_list_del(&m_upload->_head); - multipart_upload_destroy(m_upload); - } - mk_list_foreach_safe(head, tmp, &ctx->upload_queue) { upload_contents = mk_list_entry(head, struct upload_queue, _head); - s3_store_file_delete(ctx, 
upload_contents->upload_file); - multipart_upload_destroy(upload_contents->m_upload_file); - remove_from_queue(upload_contents); + s3_queue_remove(ctx, upload_contents); } flb_free(ctx); @@ -611,45 +435,40 @@ static int cb_s3_init(struct flb_output_instance *ins, return -1; } ctx->ins = ins; - mk_list_init(&ctx->uploads); mk_list_init(&ctx->upload_queue); + ctx->initial_upload_done = FLB_FALSE; + ctx->retry_time = 0; ctx->upload_queue_success = FLB_FALSE; + ctx->is_exiting = FLB_FALSE; + ctx->needs_recovery = FLB_FALSE; if(ctx->ins->retry_limit < 0) { ctx->ins->retry_limit = MAX_UPLOAD_ERRORS; } - /* Export context */ flb_output_set_context(ins, ctx); - /* initialize config map */ ret = flb_output_config_map_set(ins, (void *) ctx); if (ret == -1) { return -1; } - /* the check against -1 is works here because size_t is unsigned - * and (int) -1 == unsigned max value - * Fluent Bit uses -1 (which becomes max value) to indicate undefined - */ + /* Fluent Bit uses -1 to indicate undefined for size_t fields */ if (ctx->ins->total_limit_size != -1) { flb_plg_warn(ctx->ins, "Please use 'store_dir_limit_size' with s3 output instead of 'storage.total_limit_size'. " "S3 has its own buffer files located in the store_dir."); } - /* Date key */ ctx->date_key = ctx->json_date_key; tmp = flb_output_get_property("json_date_key", ins); if (tmp) { - /* Just check if we have to disable it */ if (flb_utils_bool(tmp) == FLB_FALSE) { ctx->date_key = NULL; } } - /* Date format for JSON output */ ctx->json_date_format = FLB_PACK_JSON_DATE_ISO8601; tmp = flb_output_get_property("json_date_format", ins); if (tmp) { @@ -669,11 +488,7 @@ static int cb_s3_init(struct flb_output_instance *ins, return -1; } - /* - * store_dir is the user input, buffer_dir is what the code uses - * We append the bucket name to the dir, to support multiple instances - * of this plugin using the same buffer dir - */ + /* Append bucket name to store_dir to support multiple plugin instances */ tmp_sds = concat_path(ctx->store_dir, ctx->bucket); if (!tmp_sds) { flb_plg_error(ctx->ins, "Could not construct buffer path"); @@ -681,7 +496,6 @@ static int cb_s3_init(struct flb_output_instance *ins, } ctx->buffer_dir = tmp_sds; - /* Initialize local storage */ ret = s3_store_init(ctx); if (ret == -1) { flb_plg_error(ctx->ins, "Failed to initialize S3 storage: %s", @@ -706,92 +520,144 @@ static int cb_s3_init(struct flb_output_instance *ins, } } - /* validate 'total_file_size' */ if (ctx->file_size <= 0) { flb_plg_error(ctx->ins, "Failed to parse total_file_size %s", tmp); return -1; } - if (ctx->file_size < 1000000) { - flb_plg_error(ctx->ins, "total_file_size must be at least 1MB"); - return -1; - } if (ctx->file_size > MAX_FILE_SIZE) { flb_plg_error(ctx->ins, "Max total_file_size is %s bytes", MAX_FILE_SIZE_STR); return -1; } - flb_plg_info(ctx->ins, "Using upload size %lu bytes", ctx->file_size); + flb_plg_info(ctx->ins, "total_file_size: %llu MiB", + (unsigned long long)(ctx->file_size / S3_MiB)); - if (ctx->use_put_object == FLB_FALSE && ctx->file_size < 2 * MIN_CHUNKED_UPLOAD_SIZE) { - flb_plg_info(ctx->ins, - "total_file_size is less than 10 MB, will use PutObject API"); - ctx->use_put_object = FLB_TRUE; - } + ctx->compression = FLB_AWS_COMPRESS_NONE; tmp = flb_output_get_property("compression", ins); if (tmp) { ret = flb_aws_compression_get_type(tmp); - if (ret == -1) { - flb_plg_error(ctx->ins, "unknown compression: %s", tmp); + + if (ret == FLB_AWS_COMPRESS_ARROW || ret == FLB_AWS_COMPRESS_PARQUET) { +#ifndef FLB_HAVE_PARQUET_ENCODER + 
flb_plg_error(ctx->ins, + "Parquet format is not supported in this build. " + "Rebuild with -DFLB_PARQUET_ENCODER=On."); return -1; +#else + flb_plg_warn(ctx->ins, + "DEPRECATED: compression=%s is deprecated. Use format=parquet instead. " + "Defaulting to GZIP compression for Parquet.", tmp); + ctx->format = FLB_S3_FORMAT_PARQUET; + ctx->compression = FLB_AWS_COMPRESS_GZIP; +#endif } - if (ctx->use_put_object == FLB_FALSE && - (ret == FLB_AWS_COMPRESS_ARROW || - ret == FLB_AWS_COMPRESS_PARQUET)) { - flb_plg_error(ctx->ins, - "use_put_object must be enabled when Apache Arrow or Parquet is enabled"); + else if (ret == -1) { + flb_plg_error(ctx->ins, "Unknown compression type: %s", tmp); return -1; } - ctx->compression = ret; + else { + ctx->compression = ret; + } } - tmp = flb_output_get_property("content_type", ins); + tmp = flb_output_get_property("format", ins); if (tmp) { - ctx->content_type = (char *) tmp; - } - if (ctx->use_put_object == FLB_FALSE) { - /* upload_chunk_size */ - if (ctx->upload_chunk_size <= 0) { - flb_plg_error(ctx->ins, "Failed to parse upload_chunk_size %s", tmp); + if (strcasecmp(tmp, "json") == 0) { + ctx->format = FLB_S3_FORMAT_JSON; + } +#ifdef FLB_HAVE_PARQUET_ENCODER + else if (strcasecmp(tmp, "parquet") == 0) { + ctx->format = FLB_S3_FORMAT_PARQUET; + } +#endif + else { +#ifdef FLB_HAVE_PARQUET_ENCODER + flb_plg_error(ctx->ins, "Invalid format '%s'. Supported formats: (json, parquet)", tmp); +#else + flb_plg_error(ctx->ins, "Invalid format '%s'. Supported formats: (json), parquet requires build with -DFLB_PARQUET_ENCODER=On", tmp); +#endif return -1; } - if (ctx->upload_chunk_size > ctx->file_size) { - flb_plg_error(ctx->ins, - "upload_chunk_size can not be larger than total_file_size"); + } + else if (ctx->format != FLB_S3_FORMAT_PARQUET) { + ctx->format = FLB_S3_FORMAT_JSON; + } + +#ifdef FLB_HAVE_PARQUET_ENCODER + if (ctx->format == FLB_S3_FORMAT_PARQUET) { + if (ctx->schema_str == NULL) { + flb_plg_error(ctx->ins, "schema_str is required when format=parquet"); return -1; } - if (ctx->upload_chunk_size < MIN_CHUNKED_UPLOAD_SIZE) { - flb_plg_error(ctx->ins, "upload_chunk_size must be at least 5,242,880 bytes"); + + /* + * CRITICAL: Pre-parse and cache Arrow schema in main thread context. + * This avoids stack overflow when yyjson recursively parses the schema + * in Fluent Bit's small coroutine stacks (37KB) during chunk processing. + */ + char parse_error[512]; + ctx->cached_arrow_schema = flb_parquet_schema_create( + ctx->schema_str, + parse_error, + sizeof(parse_error) + ); + + if (ctx->cached_arrow_schema == NULL) { + flb_plg_error(ctx->ins, "Failed to parse schema_str: %s", parse_error); return -1; } - if (ctx->compression != FLB_AWS_COMPRESS_NONE) { - if(ctx->upload_chunk_size > MAX_CHUNKED_UPLOAD_COMPRESS_SIZE) { - flb_plg_error(ctx->ins, "upload_chunk_size in compressed multipart upload cannot exceed 5GB"); - return -1; - } - } else { - if (ctx->upload_chunk_size > MAX_CHUNKED_UPLOAD_SIZE) { - flb_plg_error(ctx->ins, "Max upload_chunk_size is 50MB"); - return -1; - } + + flb_plg_info(ctx->ins, "schema_str parsed and cached successfully"); + + if (ctx->compression == FLB_AWS_COMPRESS_NONE) { + flb_plg_warn(ctx->ins, + "format=parquet with compression=none: Parquet files will be uncompressed. " + "For better storage efficiency and query performance, consider enabling compression."); } + + flb_plg_info(ctx->ins, + "format=parquet: using %s compression", + ctx->compression == FLB_AWS_COMPRESS_GZIP ? "GZIP" : + ctx->compression == FLB_AWS_COMPRESS_ZSTD ? 
"ZSTD" : + ctx->compression == FLB_AWS_COMPRESS_SNAPPY ? "SNAPPY" : "NONE"); } +#endif - if (ctx->upload_chunk_size != MIN_CHUNKED_UPLOAD_SIZE && - (ctx->upload_chunk_size * 2) > ctx->file_size) { - flb_plg_error(ctx->ins, "total_file_size is less than 2x upload_chunk_size"); - return -1; + tmp = flb_output_get_property("content_type", ins); + if (tmp) { + ctx->content_type = (char *) tmp; } - if (ctx->use_put_object == FLB_TRUE) { + /* Initialize upload_chunk_size (unified part size for all upload types) */ + { + size_t user_configured = 0; + /* - * code internally uses 'upload_chunk_size' as the unit for each Put, - * regardless of which API is used to send data + * Support both upload_chunk_size and part_size (deprecated) parameters. + * If both are set, upload_chunk_size takes precedence. + * This ensures backward compatibility while unifying the configuration. */ - ctx->upload_chunk_size = ctx->file_size; - if (ctx->file_size > MAX_FILE_SIZE_PUT_OBJECT) { - flb_plg_error(ctx->ins, "Max total_file_size is 50M when use_put_object is enabled"); - return -1; + tmp = flb_output_get_property("upload_chunk_size", ins); + if (tmp) { + user_configured = ctx->upload_chunk_size; + if (user_configured <= 0) { + flb_plg_error(ctx->ins, "Failed to parse upload_chunk_size"); + return -1; + } } + else if (ctx->part_size > 0) { + /* part_size is set but upload_chunk_size is not - use part_size value */ + flb_plg_warn(ctx->ins, "'part_size' is deprecated, please use 'upload_chunk_size' instead"); + user_configured = ctx->part_size; + } + + ctx->upload_chunk_size = flb_s3_calculate_optimal_part_size( + user_configured, + ctx->file_size + ); + + flb_plg_info(ctx->ins, "upload_chunk_size=%lluM", + (unsigned long long)(ctx->upload_chunk_size / S3_MiB)); } tmp = flb_output_get_property("endpoint", ins); @@ -834,7 +700,6 @@ static int cb_s3_init(struct flb_output_instance *ins, flb_utils_split_free(split); } else { - /* default endpoint for the given region */ ctx->endpoint = flb_aws_endpoint("s3", ctx->region); ctx->insecure = FLB_FALSE; ctx->port = DEFAULT_S3_PORT; @@ -876,7 +741,6 @@ static int cb_s3_init(struct flb_output_instance *ins, } } - /* AWS provider needs a separate TLS instance */ ctx->provider_tls = flb_tls_create(FLB_TLS_CLIENT_MODE, FLB_TRUE, ins->tls_debug, @@ -906,11 +770,9 @@ static int cb_s3_init(struct flb_output_instance *ins, tmp = flb_output_get_property("role_arn", ins); if (tmp) { - /* Use the STS Provider */ ctx->base_provider = ctx->provider; role_arn = (char *) tmp; - /* STS provider needs yet another separate TLS instance */ ctx->sts_provider_tls = flb_tls_create(FLB_TLS_CLIENT_MODE, FLB_TRUE, ins->tls_debug, @@ -952,19 +814,8 @@ static int cb_s3_init(struct flb_output_instance *ins, } } - /* read any remaining buffers from previous (failed) executions */ ctx->has_old_buffers = s3_store_has_data(ctx); - ctx->has_old_uploads = s3_store_has_uploads(ctx); - - /* Multipart */ - multipart_read_uploads_from_fs(ctx); - - if (mk_list_size(&ctx->uploads) > 0) { - /* note that these should be sent */ - ctx->has_old_uploads = FLB_TRUE; - } - /* create S3 client */ generator = flb_aws_client_generator(); ctx->s3_client = generator->create(); if (!ctx->s3_client) { @@ -976,7 +827,7 @@ static int cb_s3_init(struct flb_output_instance *ins, ctx->s3_client->region = ctx->region; ctx->s3_client->service = "s3"; ctx->s3_client->port = ctx->port; - ctx->s3_client->flags = 0; + ctx->s3_client->flags = FLB_HTTP_11; ctx->s3_client->proxy = NULL; ctx->s3_client->s3_mode = S3_MODE_SIGNED_PAYLOAD; 
ctx->s3_client->retry_requests = ctx->retry_requests; @@ -995,88 +846,58 @@ static int cb_s3_init(struct flb_output_instance *ins, flb_output_upstream_set(ctx->s3_client->upstream, ctx->ins); + /* + * Disable keepalive by default - multipart uploads have long intervals + * between requests which can cause connection reuse issues. + */ + flb_stream_disable_keepalive(&ctx->s3_client->upstream->base); + ctx->s3_client->host = ctx->endpoint; - /* set to sync mode and initialize credentials */ ctx->provider->provider_vtable->sync(ctx->provider); ctx->provider->provider_vtable->init(ctx->provider); ctx->timer_created = FLB_FALSE; - ctx->timer_ms = (int) (ctx->upload_timeout / 6) * 1000; - if (ctx->timer_ms > UPLOAD_TIMER_MAX_WAIT) { - ctx->timer_ms = UPLOAD_TIMER_MAX_WAIT; - } - else if (ctx->timer_ms < UPLOAD_TIMER_MIN_WAIT) { - ctx->timer_ms = UPLOAD_TIMER_MIN_WAIT; - } + /* Fixed 5-second timer for responsive queue processing */ + ctx->timer_ms = 5000; - /* - * S3 must ALWAYS use sync mode - * In the timer thread we do a mk_list_foreach_safe on the queue of uplaods and chunks - * Iterating over those lists is not concurrent safe. If a flush call ran at the same time - * And deleted an item from the list, this could cause a crash/corruption. - */ + /* Sync mode: avoid race condition in queue/list access. Single worker for same reason. */ flb_stream_disable_async_mode(&ctx->s3_client->upstream->base); if (ctx->authorization_endpoint_url != NULL) { - ret = blob_initialize_authorization_endpoint_upstream(ctx); + ret = s3_auth_init_endpoint(ctx); if (ret != 0) { - flb_plg_error(ctx->ins, - "Failed to initialize authorization endpoint upstream"); - + flb_plg_error(ctx->ins, "Failed to initialize authorization endpoint upstream"); return -1; } ctx->s3_client->has_auth = FLB_FALSE; } - /* clean up any old buffers found on startup */ - if (ctx->has_old_buffers == FLB_TRUE) { - flb_plg_info(ctx->ins, - "Sending locally buffered data from previous " - "executions to S3; buffer=%s", - ctx->fs->root_path); - ctx->has_old_buffers = FLB_FALSE; - ret = put_all_chunks(ctx); - if (ret < 0) { - ctx->has_old_buffers = FLB_TRUE; - flb_plg_error(ctx->ins, - "Failed to send locally buffered data left over " - "from previous executions; will retry. 
Buffer=%s", - ctx->fs->root_path); - } - } - - /* clean up any old uploads found on start up */ - if (ctx->has_old_uploads == FLB_TRUE) { - flb_plg_info(ctx->ins, - "Completing multipart uploads from previous " - "executions to S3; buffer=%s", - ctx->stream_upload->path); - ctx->has_old_uploads = FLB_FALSE; - - /* - * we don't need to worry if this fails; it will retry each - * time the upload callback is called - */ - cb_s3_upload(config, ctx); - } - - /* this is done last since in the previous block we make calls to AWS */ ctx->provider->provider_vtable->upstream_set(ctx->provider, ctx->ins); - /* database file for blob signal handling */ + /* Initialize blob database if configured */ if (ctx->blob_database_file != NULL) { - ret = flb_blob_db_open(&ctx->blob_db, - config, - ctx->blob_database_file); + /* Blob uploads now use the unified upload_chunk_size parameter */ + flb_plg_info(ctx->ins, "Blob upload_chunk_size: %llu MiB (unified with log uploads)", + (unsigned long long)(ctx->upload_chunk_size / S3_MiB)); + ret = flb_blob_db_open(&ctx->blob_db, config, ctx->blob_database_file); if (ret != FLB_BLOB_DB_SUCCESS) { return -1; } } + /* Initialize upload queue mutex for thread-safe access */ + ret = pthread_mutex_init(&ctx->upload_queue_lock, NULL); + if (ret != 0) { + flb_plg_error(ctx->ins, "Failed to initialize upload queue mutex"); + if (ctx->blob_database_file != NULL) { + flb_blob_db_close(&ctx->blob_db); + } + return -1; + } return 0; } @@ -1086,8 +907,7 @@ static int cb_s3_worker_init(void *data, struct flb_config *config) int ret; struct worker_info *info; struct flb_s3 *ctx = data; - - flb_plg_info(ctx->ins, "initializing worker"); + struct flb_sched *sched; info = FLB_TLS_GET(s3_worker_info); if (!info) { @@ -1101,10 +921,30 @@ static int cb_s3_worker_init(void *data, struct flb_config *config) FLB_TLS_SET(s3_worker_info, info); } - ret = s3_timer_create(ctx); - if (ret == -1) { - flb_plg_error(ctx->ins, "failed to create upload timer"); - return -1; + /* + * Create timer for upload processing. + * Timer is created immediately to ensure responsive queue processing. + */ + if (ctx->timer_created == FLB_FALSE) { + sched = flb_sched_ctx_get(); + ret = flb_sched_timer_cb_create(sched, FLB_SCHED_TIMER_CB_PERM, + ctx->timer_ms, cb_s3_upload, ctx, NULL); + if (ret == -1) { + flb_plg_error(ctx->ins, "Failed to create upload timer"); + return -1; + } + ctx->timer_created = FLB_TRUE; + flb_plg_debug(ctx->ins, "Upload timer created (interval: %dms)", ctx->timer_ms); + } + + /* + * Deferred recovery: Defer recovery to first timer callback. + * This avoids blocking worker initialization if there are many pending files. + * The timer will execute recovery on its first invocation (within timer_ms). + */ + if (ctx->blob_database_file != NULL || ctx->has_old_buffers == FLB_TRUE) { + ctx->needs_recovery = FLB_TRUE; + flb_plg_debug(ctx->ins, "Recovery scheduled for first timer callback"); } return 0; @@ -1120,8 +960,6 @@ static int cb_s3_worker_exit(void *data, struct flb_config *config) return 0; } - flb_plg_info(ctx->ins, "terminating worker"); - info = FLB_TLS_GET(s3_worker_info); if (info != NULL) { flb_free(info); @@ -1132,2603 +970,560 @@ static int cb_s3_worker_exit(void *data, struct flb_config *config) } /* - * return value is one of FLB_OK, FLB_RETRY, FLB_ERROR - * - * Chunk is allowed to be NULL + * Create multipart upload and enqueue all parts (Orchestration function) + * This function coordinates between multipart, blob, and queue modules: + * 1. Creates a multipart upload + * 2. 
Saves upload_id to database + * 3. Queries all parts from database for this specific file + * 4. Enqueues each part individually */ -static int upload_data(struct flb_s3 *ctx, struct s3_file *chunk, - struct multipart_upload *m_upload, - char *body, size_t body_size, - const char *tag, int tag_len) +int s3_initiate_multipart_upload(struct flb_s3 *ctx, + uint64_t file_id, + const char *file_path, + const char *tag, + int tag_len) { - int init_upload = FLB_FALSE; - int complete_upload = FLB_FALSE; - int size_check = FLB_FALSE; - int part_num_check = FLB_FALSE; - int timeout_check = FLB_FALSE; + struct multipart_upload *m_upload = NULL; + flb_sds_t pre_signed_url = NULL; + uint64_t *part_db_ids = NULL; + uint64_t *part_nums = NULL; + off_t *offset_starts = NULL; + off_t *offset_ends = NULL; + int part_count = 0; + int total_enqueued = 0; int ret; - void *payload_buf = NULL; - size_t payload_size = 0; - size_t preCompress_size = 0; - time_t file_first_log_time = time(NULL); - - /* - * When chunk does not exist, file_first_log_time will be the current time. - * This is only for unit tests and prevents unit tests from segfaulting when chunk is - * NULL because if so chunk->first_log_time will be NULl either and will cause - * segfault during the process of put_object upload or mutipart upload. - */ - if (chunk != NULL) { - file_first_log_time = chunk->first_log_time; - } - - if (ctx->compression != FLB_AWS_COMPRESS_NONE) { - /* Map payload */ - ret = flb_aws_compression_compress(ctx->compression, body, body_size, &payload_buf, &payload_size); - if (ret == -1) { - flb_plg_error(ctx->ins, "Failed to compress data"); - if (chunk != NULL) { - s3_store_file_unlock(chunk); - chunk->failures += 1; - } - return FLB_RETRY; - } - else { - preCompress_size = body_size; - body = (void *) payload_buf; - body_size = payload_size; - } - } + int i; - if (ctx->use_put_object == FLB_TRUE) { - goto put_object; + /* Create multipart upload structure */ + m_upload = s3_multipart_upload_create(ctx, tag, tag_len, file_path); + if (!m_upload) { + flb_plg_error(ctx->ins, "Failed to create multipart upload structure"); + return -1; } - if (s3_plugin_under_test() == FLB_TRUE) { - init_upload = FLB_TRUE; - complete_upload = FLB_TRUE; - if (ctx->use_put_object == FLB_TRUE) { - goto put_object; - } - else { - goto multipart; - } + /* Fetch presigned URL for CreateMultipartUpload */ + ret = s3_auth_fetch_presigned_url(ctx, &pre_signed_url, + S3_PRESIGNED_URL_CREATE_MULTIPART, + m_upload->s3_key, NULL, 0); + if (ret < 0) { + flb_plg_error(ctx->ins, "Failed to fetch presigned URL for CreateMultipartUpload"); + s3_multipart_upload_destroy(m_upload); + return -1; } - if (m_upload == NULL) { - if (chunk != NULL && time(NULL) > - (chunk->create_time + ctx->upload_timeout + ctx->retry_time)) { - /* timeout already reached, just PutObject */ - goto put_object; - } - else if (body_size >= ctx->file_size) { - /* already big enough, just use PutObject API */ - goto put_object; - } - else if(body_size > MIN_CHUNKED_UPLOAD_SIZE) { - init_upload = FLB_TRUE; - goto multipart; - } - else { - if ((ctx->use_put_object == FLB_FALSE && (ctx->compression == FLB_AWS_COMPRESS_GZIP || ctx->compression == FLB_AWS_COMPRESS_ZSTD))) { - flb_plg_info(ctx->ins, "Pre-compression upload_chunk_size= %zu, After compression, chunk is only %zu bytes, " - "the chunk was too small, using PutObject to upload", preCompress_size, body_size); - } - goto put_object; - } - } - else { - /* existing upload */ - if (body_size < MIN_CHUNKED_UPLOAD_SIZE) { - complete_upload 
= FLB_TRUE; - } + /* Call AWS CreateMultipartUpload API */ + ret = s3_multipart_create(ctx, m_upload, pre_signed_url); + flb_sds_destroy(pre_signed_url); - goto multipart; + if (ret < 0) { + flb_plg_error(ctx->ins, "Failed to create multipart upload for file_id=%" PRIu64, file_id); + s3_multipart_upload_destroy(m_upload); + return -1; } -put_object: - - /* - * remove chunk from buffer list - */ - ret = s3_put_object(ctx, tag, file_first_log_time, body, body_size); - if (ctx->compression != FLB_AWS_COMPRESS_NONE) { - flb_free(payload_buf); - } + /* Save upload_id to database */ + ret = flb_blob_file_update_remote_id(&ctx->blob_db, file_id, m_upload->upload_id); if (ret < 0) { - /* re-add chunk to list */ - if (chunk) { - s3_store_file_unlock(chunk); - chunk->failures += 1; - } - return FLB_RETRY; + flb_plg_error(ctx->ins, "Failed to save upload_id to database"); + s3_multipart_abort(ctx, m_upload, NULL); + s3_multipart_upload_destroy(m_upload); + return -1; } - /* data was sent successfully- delete the local buffer */ - if (chunk) { - s3_store_file_delete(ctx, chunk); + /* Get all parts for this specific file */ + ret = flb_blob_db_file_fetch_all_parts(&ctx->blob_db, file_id, + &part_db_ids, &part_nums, + &offset_starts, &offset_ends, + &part_count); + if (ret < 0) { + flb_plg_error(ctx->ins, "Failed to get parts for file_id=%" PRIu64, file_id); + s3_multipart_upload_destroy(m_upload); + return -1; } - return FLB_OK; - -multipart: - if (init_upload == FLB_TRUE) { - m_upload = create_upload(ctx, tag, tag_len, file_first_log_time); - if (!m_upload) { - flb_plg_error(ctx->ins, "Could not find or create upload for tag %s", tag); - if (chunk) { - s3_store_file_unlock(chunk); - } - if (ctx->compression != FLB_AWS_COMPRESS_NONE) { - flb_free(payload_buf); - } - return FLB_RETRY; - } + if (part_count == 0) { + flb_plg_error(ctx->ins, "No parts found for file_id=%" PRIu64, file_id); + s3_multipart_upload_destroy(m_upload); + return -1; } - if (m_upload->upload_state == MULTIPART_UPLOAD_STATE_NOT_CREATED) { - ret = create_multipart_upload(ctx, m_upload, NULL); - if (ret < 0) { - flb_plg_error(ctx->ins, "Could not initiate multipart upload"); - if (chunk) { - s3_store_file_unlock(chunk); - } - if (ctx->compression != FLB_AWS_COMPRESS_NONE) { - flb_free(payload_buf); - } - return FLB_RETRY; + /* Enqueue all parts */ + for (i = 0; i < part_count; i++) { + ret = s3_queue_add_part(ctx, file_id, part_db_ids[i], part_nums[i], + file_path, offset_starts[i], offset_ends[i], + m_upload->s3_key, m_upload->upload_id, + tag, tag_len); + if (ret == 0) { + total_enqueued++; } - m_upload->upload_state = MULTIPART_UPLOAD_STATE_CREATED; } - ret = upload_part(ctx, m_upload, body, body_size, NULL); - if (ret < 0) { - if (ctx->compression != FLB_AWS_COMPRESS_NONE) { - flb_free(payload_buf); - } - m_upload->upload_errors += 1; - /* re-add chunk to list */ - if (chunk) { - s3_store_file_unlock(chunk); - chunk->failures += 1; - } - return FLB_RETRY; + /* Free allocated arrays */ + if (part_db_ids) { + flb_free(part_db_ids); } - m_upload->part_number += 1; - /* data was sent successfully- delete the local buffer */ - if (chunk) { - s3_store_file_delete(ctx, chunk); - chunk = NULL; + if (part_nums) { + flb_free(part_nums); } - if (ctx->compression != FLB_AWS_COMPRESS_NONE) { - flb_free(payload_buf); + if (offset_starts) { + flb_free(offset_starts); } - if (m_upload->bytes >= ctx->file_size) { - size_check = FLB_TRUE; - flb_plg_info(ctx->ins, "Will complete upload for %s because uploaded data is greater" - " than size set by 
total_file_size", m_upload->s3_key); - } - if (m_upload->part_number >= 10000) { - part_num_check = FLB_TRUE; - flb_plg_info(ctx->ins, "Will complete upload for %s because 10,000 chunks " - "(the API limit) have been uploaded", m_upload->s3_key); - } - if (time(NULL) > - (m_upload->init_time + ctx->upload_timeout + ctx->retry_time)) { - timeout_check = FLB_TRUE; - flb_plg_info(ctx->ins, "Will complete upload for %s because upload_timeout" - " has elapsed", m_upload->s3_key); - } - if (size_check || part_num_check || timeout_check) { - complete_upload = FLB_TRUE; + if (offset_ends) { + flb_free(offset_ends); } - if (complete_upload == FLB_TRUE) { - /* mark for completion- the upload timer will handle actual completion */ - m_upload->upload_state = MULTIPART_UPLOAD_STATE_COMPLETE_IN_PROGRESS; + s3_multipart_upload_destroy(m_upload); + + if (total_enqueued == 0) { + flb_plg_error(ctx->ins, "No parts enqueued for file_id=%" PRIu64, file_id); + return -1; } - return FLB_OK; + flb_plg_info(ctx->ins, "Multipart upload created: file_id=%" PRIu64 " (%d parts)", + file_id, total_enqueued); + return 0; } - -/* - * Attempts to send all chunks to S3 using PutObject - * Used on shut down to try to send all buffered data - * Used on start up to try to send any leftover buffers from previous executions - */ -static int put_all_chunks(struct flb_s3 *ctx) -{ - struct s3_file *chunk; - struct mk_list *tmp; - struct mk_list *head; - struct mk_list *f_head; - struct flb_fstore_file *fsf; - struct flb_fstore_stream *fs_stream; - void *payload_buf = NULL; - size_t payload_size = 0; - char *buffer = NULL; - size_t buffer_size; - int ret; - - mk_list_foreach(head, &ctx->fs->streams) { - /* skip multi upload stream */ - fs_stream = mk_list_entry(head, struct flb_fstore_stream, _head); - if (fs_stream == ctx->stream_upload) { - continue; - } - /* skip metadata stream */ - if (fs_stream == ctx->stream_metadata) { - continue; - } - - mk_list_foreach_safe(f_head, tmp, &fs_stream->files) { - fsf = mk_list_entry(f_head, struct flb_fstore_file, _head); - chunk = fsf->data; - - /* Locked chunks are being processed, skip */ - if (chunk->locked == FLB_TRUE) { - continue; - } - - if (chunk->failures >= ctx->ins->retry_limit) { - flb_plg_warn(ctx->ins, - "Chunk for tag %s failed to send %d/%d times, will not retry", - (char *) fsf->meta_buf, chunk->failures, ctx->ins->retry_limit); - flb_fstore_file_inactive(ctx->fs, fsf); - continue; - } - - ret = construct_request_buffer(ctx, NULL, chunk, - &buffer, &buffer_size); - if (ret < 0) { - flb_plg_error(ctx->ins, - "Could not construct request buffer for %s", - chunk->file_path); - return -1; - } - - if (ctx->compression != FLB_AWS_COMPRESS_NONE) { - /* Map payload */ - ret = flb_aws_compression_compress(ctx->compression, buffer, buffer_size, &payload_buf, &payload_size); - if (ret == -1) { - flb_plg_error(ctx->ins, "Failed to compress data, uploading uncompressed data instead to prevent data loss"); - } else { - flb_plg_info(ctx->ins, "Pre-compression chunk size is %zu, After compression, chunk is %zu bytes", buffer_size, payload_size); - flb_free(buffer); - - buffer = (void *) payload_buf; - buffer_size = payload_size; - } - } - - ret = s3_put_object(ctx, (const char *) - fsf->meta_buf, - chunk->create_time, buffer, buffer_size); - flb_free(buffer); - if (ret < 0) { - s3_store_file_unlock(chunk); - chunk->failures += 1; - return -1; - } - - /* data was sent successfully- delete the local buffer */ - s3_store_file_delete(ctx, chunk); - } - } - - return 0; -} - -/* - * Either 
new_data or chunk can be NULL, but not both - */ -static int construct_request_buffer(struct flb_s3 *ctx, flb_sds_t new_data, - struct s3_file *chunk, - char **out_buf, size_t *out_size) -{ - char *body; - char *tmp; - size_t body_size = 0; - char *buffered_data = NULL; - size_t buffer_size = 0; - int ret; - - if (new_data == NULL && chunk == NULL) { - flb_plg_error(ctx->ins, "[construct_request_buffer] Something went wrong" - " both chunk and new_data are NULL"); - return -1; - } - - if (chunk) { - ret = s3_store_file_read(ctx, chunk, &buffered_data, &buffer_size); - if (ret < 0) { - flb_plg_error(ctx->ins, "Could not read locally buffered data %s", - chunk->file_path); - return -1; - } - - /* - * lock the chunk from buffer list - */ - s3_store_file_lock(chunk); - body = buffered_data; - body_size = buffer_size; - } - - /* - * If new data is arriving, increase the original 'buffered_data' size - * to append the new one. - */ - if (new_data) { - body_size += flb_sds_len(new_data); - - tmp = flb_realloc(buffered_data, body_size + 1); - if (!tmp) { - flb_errno(); - flb_free(buffered_data); - if (chunk) { - s3_store_file_unlock(chunk); - } - return -1; - } - body = buffered_data = tmp; - memcpy(body + buffer_size, new_data, flb_sds_len(new_data)); - body[body_size] = '\0'; - } - - *out_buf = body; - *out_size = body_size; - - return 0; -} - -static int s3_put_object(struct flb_s3 *ctx, const char *tag, time_t file_first_log_time, - char *body, size_t body_size) -{ - flb_sds_t s3_key = NULL; - struct flb_http_client *c = NULL; - struct flb_aws_client *s3_client; - struct flb_aws_header *headers = NULL; - char *random_alphanumeric; - int append_random = FLB_FALSE; - int len; - int ret; - int num_headers = 0; - char *final_key; - flb_sds_t uri; - flb_sds_t tmp; - char final_body_md5[25]; - - s3_key = flb_get_s3_key(ctx->s3_key_format, file_first_log_time, tag, - ctx->tag_delimiters, ctx->seq_index); - if (!s3_key) { - flb_plg_error(ctx->ins, "Failed to construct S3 Object Key for %s", tag); - return -1; - } - - len = strlen(s3_key); - if ((len + 16) <= 1024 && !ctx->key_fmt_has_uuid && !ctx->static_file_path && - !ctx->key_fmt_has_seq_index) { - append_random = FLB_TRUE; - len += 16; - } - len += strlen(ctx->bucket + 1); - - uri = flb_sds_create_size(len); - - if (append_random == FLB_TRUE) { - random_alphanumeric = flb_sts_session_name(); - if (!random_alphanumeric) { - flb_sds_destroy(s3_key); - flb_sds_destroy(uri); - flb_plg_error(ctx->ins, "Failed to create randomness for S3 key %s", tag); - return -1; - } - /* only use 8 chars of the random string */ - random_alphanumeric[8] = '\0'; - - tmp = flb_sds_printf(&uri, "/%s%s-object%s", ctx->bucket, s3_key, - random_alphanumeric); - flb_free(random_alphanumeric); - } - else { - tmp = flb_sds_printf(&uri, "/%s%s", ctx->bucket, s3_key); - } - - if (!tmp) { - flb_sds_destroy(s3_key); - flb_plg_error(ctx->ins, "Failed to create PutObject URI"); - return -1; - } - flb_sds_destroy(s3_key); - uri = tmp; - - memset(final_body_md5, 0, sizeof(final_body_md5)); - if (ctx->send_content_md5 == FLB_TRUE) { - ret = get_md5_base64(body, body_size, - final_body_md5, sizeof(final_body_md5)); - if (ret != 0) { - flb_plg_error(ctx->ins, "Failed to create Content-MD5 header"); - flb_sds_destroy(uri); - return -1; - } - } - - /* Update file and increment index value right before request */ - if (ctx->key_fmt_has_seq_index) { - ctx->seq_index++; - - ret = write_seq_index(ctx->seq_index_file, ctx->seq_index); - if (ret < 0 && access(ctx->seq_index_file, F_OK) == 0) { - 
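Editor's note on the removed PutObject path above: the 25-byte final_body_md5 buffer follows from Content-MD5 being the base64 encoding of the raw 16-byte MD5 digest. A tiny standalone check of that arithmetic (illustrative only, not part of the patch):

    #include <assert.h>

    int main(void)
    {
        /* base64 of 16 bytes: 4 * ceil(16 / 3) = 24 characters, plus NUL = 25 */
        int b64_len = 4 * ((16 + 2) / 3);
        assert(b64_len == 24);
        assert(b64_len + 1 == 25);
        return 0;
    }
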
ctx->seq_index--; - flb_plg_error(ctx->ins, "Failed to update sequential index metadata file"); - return -1; - } - } - - s3_client = ctx->s3_client; - if (s3_plugin_under_test() == FLB_TRUE) { - c = mock_s3_call("TEST_PUT_OBJECT_ERROR", "PutObject"); - } - else { - ret = create_headers(ctx, final_body_md5, &headers, &num_headers, FLB_FALSE); - if (ret == -1) { - flb_plg_error(ctx->ins, "Failed to create headers"); - flb_sds_destroy(uri); - goto decrement_index; - } - c = s3_client->client_vtable->request(s3_client, FLB_HTTP_PUT, - uri, body, body_size, - headers, num_headers); - flb_free(headers); - } - if (c) { - flb_plg_debug(ctx->ins, "PutObject http status=%d", c->resp.status); - if (c->resp.status == 200) { - /* - * URI contains bucket name, so we must advance over it - * to print the object key - */ - final_key = uri + strlen(ctx->bucket) + 1; - flb_plg_info(ctx->ins, "Successfully uploaded object %s", final_key); - flb_sds_destroy(uri); - flb_http_client_destroy(c); - - return 0; - } - flb_aws_print_xml_error(c->resp.payload, c->resp.payload_size, - "PutObject", ctx->ins); - if (c->resp.data != NULL) { - flb_plg_error(ctx->ins, "Raw PutObject response: %s", c->resp.data); - } - flb_http_client_destroy(c); - } - - flb_plg_error(ctx->ins, "PutObject request failed"); - flb_sds_destroy(uri); - goto decrement_index; - -decrement_index: - if (ctx->key_fmt_has_seq_index) { - ctx->seq_index--; - - ret = write_seq_index(ctx->seq_index_file, ctx->seq_index); - if (ret < 0) { - flb_plg_error(ctx->ins, "Failed to decrement index after request error"); - return -1; - } - } - return -1; -} - -int get_md5_base64(char *buf, size_t buf_size, char *md5_str, size_t md5_str_size) -{ - unsigned char md5_bin[16]; - size_t olen; - int ret; - - ret = flb_hash_simple(FLB_HASH_MD5, - (unsigned char *) buf, buf_size, - md5_bin, sizeof(md5_bin)); - - if (ret != FLB_CRYPTO_SUCCESS) { - return -1; - } - - ret = flb_base64_encode((unsigned char*) md5_str, md5_str_size, - &olen, md5_bin, sizeof(md5_bin)); - if (ret != 0) { - return ret; - } - - return 0; -} - -static struct multipart_upload *get_upload(struct flb_s3 *ctx, - const char *tag, int tag_len) -{ - struct multipart_upload *m_upload = NULL; - struct multipart_upload *tmp_upload = NULL; - struct mk_list *tmp; - struct mk_list *head; - - mk_list_foreach_safe(head, tmp, &ctx->uploads) { - tmp_upload = mk_list_entry(head, struct multipart_upload, _head); - - if (tmp_upload->upload_state == MULTIPART_UPLOAD_STATE_COMPLETE_IN_PROGRESS) { - continue; - } - if (tmp_upload->upload_errors >= ctx->ins->retry_limit) { - tmp_upload->upload_state = MULTIPART_UPLOAD_STATE_COMPLETE_IN_PROGRESS; - flb_plg_error(ctx->ins, "Upload for %s has reached max upload errors", - tmp_upload->s3_key); - continue; - } - if (strcmp(tmp_upload->tag, tag) == 0) { - m_upload = tmp_upload; - break; - } - } - - return m_upload; -} - -static struct multipart_upload *create_upload(struct flb_s3 *ctx, const char *tag, - int tag_len, time_t file_first_log_time) +int s3_upload_file(struct flb_s3 *ctx, + flb_sds_t body, size_t body_size, + const char *tag, int tag_len, + time_t file_first_log_time) { int ret; - struct multipart_upload *m_upload = NULL; flb_sds_t s3_key = NULL; - flb_sds_t tmp_sds = NULL; - - /* create new upload for this key */ - m_upload = flb_calloc(1, sizeof(struct multipart_upload)); - if (!m_upload) { - flb_errno(); - return NULL; - } - s3_key = flb_get_s3_key(ctx->s3_key_format, file_first_log_time, tag, - ctx->tag_delimiters, ctx->seq_index); - if (!s3_key) { - 
flb_plg_error(ctx->ins, "Failed to construct S3 Object Key for %s", tag); - flb_free(m_upload); - return NULL; - } - m_upload->s3_key = s3_key; - tmp_sds = flb_sds_create_len(tag, tag_len); - if (!tmp_sds) { - flb_errno(); - flb_sds_destroy(s3_key); - flb_free(m_upload); - return NULL; - } - m_upload->tag = tmp_sds; - m_upload->upload_state = MULTIPART_UPLOAD_STATE_NOT_CREATED; - m_upload->part_number = 1; - m_upload->init_time = time(NULL); - mk_list_add(&m_upload->_head, &ctx->uploads); - - /* Update file and increment index value right before request */ - if (ctx->key_fmt_has_seq_index) { - ctx->seq_index++; - - ret = write_seq_index(ctx->seq_index_file, ctx->seq_index); - if (ret < 0) { - ctx->seq_index--; - - mk_list_del(&m_upload->_head); - - flb_sds_destroy(tmp_sds); - flb_sds_destroy(s3_key); - - flb_free(m_upload); - - flb_plg_error(ctx->ins, "Failed to write to sequential index metadata file"); - - return NULL; - } - } - - return m_upload; -} - -/* Adds an entry to upload queue */ -static int add_to_queue(struct flb_s3 *ctx, struct s3_file *upload_file, - struct multipart_upload *m_upload_file, const char *tag, int tag_len) -{ - struct upload_queue *upload_contents; - flb_sds_t tag_cpy; - - /* Create upload contents object and add to upload queue */ - upload_contents = flb_calloc(1, sizeof(struct upload_queue)); - if (upload_contents == NULL) { - flb_plg_error(ctx->ins, "Error allocating memory for upload_queue entry"); - flb_errno(); - return -1; - } - upload_contents->upload_file = upload_file; - upload_contents->m_upload_file = m_upload_file; - upload_contents->tag_len = tag_len; - upload_contents->retry_counter = 0; - upload_contents->upload_time = -1; - - /* Necessary to create separate string for tag to prevent corruption */ - tag_cpy = flb_sds_create_len(tag, tag_len); - if (!tag_cpy) { - flb_errno(); - flb_free(upload_contents); - return -1; - } - upload_contents->tag = tag_cpy; - - - /* Add entry to upload queue */ - mk_list_add(&upload_contents->_head, &ctx->upload_queue); - return 0; -} - -/* Removes an entry from upload_queue */ -void remove_from_queue(struct upload_queue *entry) -{ - mk_list_del(&entry->_head); - flb_sds_destroy(entry->tag); - flb_free(entry); - return; -} - -/* Validity check for upload queue object */ -static int upload_queue_valid(struct upload_queue *upload_contents, time_t now, - void *out_context) -{ - struct flb_s3 *ctx = out_context; - - if (upload_contents == NULL) { - flb_plg_error(ctx->ins, "Error getting entry from upload_queue"); - return -1; - } - if (upload_contents->_head.next == NULL || upload_contents->_head.prev == NULL) { - flb_plg_debug(ctx->ins, "Encountered previously deleted entry in " - "upload_queue. Deleting invalid entry"); - mk_list_del(&upload_contents->_head); - return -1; - } - if (upload_contents->upload_file->locked == FLB_FALSE) { - flb_plg_debug(ctx->ins, "Encountered unlocked file in upload_queue. " - "Exiting"); - return -1; - } - if (upload_contents->upload_file->size <= 0) { - flb_plg_debug(ctx->ins, "Encountered empty chunk file in upload_queue. 
" - "Deleting empty chunk file"); - remove_from_queue(upload_contents); - return -1; - } - if (now < upload_contents->upload_time) { - flb_plg_debug(ctx->ins, "Found valid chunk file but not ready to upload"); - return -1; - } - return 0; -} - -static int send_upload_request(void *out_context, flb_sds_t chunk, - struct s3_file *upload_file, - struct multipart_upload *m_upload_file, - const char *tag, int tag_len) -{ - int ret; - char *buffer; - size_t buffer_size; - struct flb_s3 *ctx = out_context; - - /* Create buffer to upload to S3 */ - ret = construct_request_buffer(ctx, chunk, upload_file, &buffer, &buffer_size); - flb_sds_destroy(chunk); - if (ret < 0) { - flb_plg_error(ctx->ins, "Could not construct request buffer for %s", - upload_file->file_path); - return -1; - } - - /* Upload to S3 */ - ret = upload_data(ctx, upload_file, m_upload_file, buffer, buffer_size, tag, tag_len); - flb_free(buffer); - - return ret; -} - -static int buffer_chunk(void *out_context, struct s3_file *upload_file, - flb_sds_t chunk, int chunk_size, - const char *tag, int tag_len, - time_t file_first_log_time) -{ - int ret; - struct flb_s3 *ctx = out_context; - - ret = s3_store_buffer_put(ctx, upload_file, tag, - tag_len, chunk, (size_t) chunk_size, file_first_log_time); - flb_sds_destroy(chunk); - if (ret < 0) { - flb_plg_warn(ctx->ins, "Could not buffer chunk. Data order preservation " - "will be compromised"); - return -1; - } - return 0; -} - -/* Uploads all chunk files in queue synchronously */ -static void s3_upload_queue(struct flb_config *config, void *out_context) -{ - int ret; - time_t now; - struct upload_queue *upload_contents; - struct flb_s3 *ctx = out_context; - struct mk_list *tmp; - struct mk_list *head; - - flb_plg_debug(ctx->ins, "Running upload timer callback (upload_queue).."); - /* No chunks in upload queue. Scan for timed out chunks. */ - if (mk_list_size(&ctx->upload_queue) == 0) { - flb_plg_debug(ctx->ins, "No files found in upload_queue. Scanning for timed " - "out chunks"); - cb_s3_upload(config, out_context); - } - - /* Iterate through each file in upload queue */ - mk_list_foreach_safe(head, tmp, &ctx->upload_queue) { - upload_contents = mk_list_entry(head, struct upload_queue, _head); + /* Check if body is a file path marker (for file-based streaming upload) */ + if (strncmp(body, "FILE:", 5) == 0) { + const char *file_path = body + 5; + flb_sds_t compressed_path = NULL; + const char *final_file_path = file_path; - now = time(NULL); + /* Apply compression to temp file if needed (not for Parquet - already compressed) */ + if (ctx->compression != FLB_AWS_COMPRESS_NONE && ctx->format != FLB_S3_FORMAT_PARQUET) { + const char *compression_suffix; - /* Checks if upload_contents is valid */ - ret = upload_queue_valid(upload_contents, now, ctx); - if (ret < 0) { - goto exit; - } - - /* Try to upload file. Return value can be -1, FLB_OK, FLB_ERROR, FLB_RETRY. 
*/ - ret = send_upload_request(ctx, NULL, upload_contents->upload_file, - upload_contents->m_upload_file, - upload_contents->tag, upload_contents->tag_len); - if (ret < 0) { - goto exit; - } - else if (ret == FLB_OK) { - remove_from_queue(upload_contents); - ctx->retry_time = 0; - ctx->upload_queue_success = FLB_TRUE; - } - else { - s3_store_file_lock(upload_contents->upload_file); - ctx->upload_queue_success = FLB_FALSE; - - /* If retry limit was reached, discard file and remove file from queue */ - upload_contents->retry_counter++; - if (upload_contents->retry_counter >= ctx->ins->retry_limit) { - flb_plg_warn(ctx->ins, "Chunk file failed to send %d times, will not " - "retry", upload_contents->retry_counter); - s3_store_file_inactive(ctx, upload_contents->upload_file); - multipart_upload_destroy(upload_contents->m_upload_file); - remove_from_queue(upload_contents); - continue; + /* Determine file suffix based on compression algorithm */ + switch (ctx->compression) { + case FLB_AWS_COMPRESS_GZIP: + compression_suffix = ".gz"; + break; + case FLB_AWS_COMPRESS_ZSTD: + compression_suffix = ".zstd"; + break; + case FLB_AWS_COMPRESS_SNAPPY: + compression_suffix = ".snappy"; + break; + default: + compression_suffix = ".compressed"; + flb_plg_warn(ctx->ins, "Unknown compression type %d, using generic suffix", + ctx->compression); + break; } - /* Retry in N seconds */ - upload_contents->upload_time = now + 2 * upload_contents->retry_counter; - ctx->retry_time += 2 * upload_contents->retry_counter; - flb_plg_debug(ctx->ins, "Failed to upload file in upload_queue. Will not " - "retry for %d seconds", 2 * upload_contents->retry_counter); - break; - } - } - -exit: - return; -} - -static int blob_initialize_authorization_endpoint_upstream(struct flb_s3 *context) -{ - int ret; - struct flb_upstream *upstream; - struct flb_tls *tls_context; - char *scheme = NULL; - char *host = NULL; - char *port = NULL; - char *uri = NULL; - int upstream_flags; - - context->authorization_endpoint_upstream = NULL; - context->authorization_endpoint_tls_context = NULL; - - /* Parse and split URL */ - ret = flb_utils_url_split(context->authorization_endpoint_url, - &scheme, &host, &port, &uri); - - if (ret == -1) { - flb_plg_error(context->ins, - "Invalid URL: %s", - context->authorization_endpoint_url); - - return -1; - } - - if (scheme != NULL) { - flb_free(scheme); - - scheme = NULL; - } - - if (port != NULL) { - flb_free(port); - - port = NULL; - } - - if (host == NULL || uri == NULL) { - flb_plg_error(context->ins, - "Invalid URL: %s", - context->authorization_endpoint_url); - - if (host != NULL) { - flb_free(host); - } - - if (uri != NULL) { - flb_free(uri); - } - - return -2; - } - - tls_context = flb_tls_create(FLB_TLS_CLIENT_MODE, - FLB_FALSE, - FLB_FALSE, - host, - NULL, - NULL, - NULL, - NULL, - NULL); - - flb_free(host); - flb_free(uri); - - if (tls_context == NULL) { - flb_plg_error(context->ins, - "TLS context creation errror"); - - return -2; - } - - upstream = flb_upstream_create_url(context->ins->config, - context->authorization_endpoint_url, - FLB_IO_TCP, - tls_context); - - if (upstream == NULL) { - flb_tls_destroy(tls_context); - - flb_plg_error(context->ins, - "Upstream creation errror"); - - return -3; - } - - upstream_flags = flb_stream_get_flags(&upstream->base); - - flb_output_upstream_set(upstream, context->ins); - - flb_stream_set_flags(&upstream->base, upstream_flags); - - context->authorization_endpoint_upstream = upstream; - context->authorization_endpoint_tls_context = tls_context; - - 
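Editor's note (illustrative, not part of the patch): the queue iteration removed above relied on sync mode and a single worker for safety; the replacement instead initializes ctx->upload_queue_lock in cb_s3_init (earlier in this diff). A minimal sketch of the locking pattern that mutex implies, assuming the plugin's existing headers and reusing the list/entry names visible in this file (the helper name is hypothetical):

    /* Illustrative sketch only; assumes the plugin's own headers for
     * struct flb_s3, struct upload_queue, mk_list and pthread. */
    static void sketch_queue_push(struct flb_s3 *ctx, struct upload_queue *entry)
    {
        pthread_mutex_lock(&ctx->upload_queue_lock);
        mk_list_add(&entry->_head, &ctx->upload_queue);
        pthread_mutex_unlock(&ctx->upload_queue_lock);
    }
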
return 0; -} - -static int blob_request_pre_signed_url(struct flb_s3 *context, - flb_sds_t *result_url, - char *url) -{ - int ret; - size_t b_sent; - struct flb_http_client *http_client; - struct flb_connection *connection; - char *scheme = NULL; - char *host = NULL; - char *port = NULL; - char *uri = NULL; - uint16_t port_as_short; - flb_sds_t tmp; - - /* Parse and split URL */ - ret = flb_utils_url_split(url, - &scheme, &host, &port, &uri); - if (ret == -1) { - flb_plg_error(context->ins, - "Invalid URL: %s", - url); - - return -1; - } - - if (port != NULL) { - port_as_short = (uint16_t) strtoul(port, NULL, 10); - } - else { - if (scheme != NULL) { - if (strcasecmp(scheme, "https") == 0) { - port_as_short = 443; - } - else { - port_as_short = 80; - } - } - } - - if (scheme != NULL) { - flb_free(scheme); - scheme = NULL; - } - - if (port != NULL) { - flb_free(port); - port = NULL; - } - - if (host == NULL || uri == NULL) { - flb_plg_error(context->ins, - "Invalid URL: %s", - context->authorization_endpoint_url); - - if (host != NULL) { - flb_free(host); - } - - if (uri != NULL) { - flb_free(uri); - } - - return -2; - } - - /* Get upstream connection */ - connection = flb_upstream_conn_get(context->authorization_endpoint_upstream); - if (connection == NULL) { - flb_free(host); - flb_free(uri); - - flb_plg_error(context->ins, - "cannot create connection"); - - return -3; - } - - /* Create HTTP client context */ - http_client = flb_http_client(connection, - FLB_HTTP_GET, - uri, - NULL, 0, - host, - (int) port_as_short, - NULL, 0); - if (http_client == NULL) { - flb_upstream_conn_release(connection); - flb_free(host); - flb_free(uri); - - flb_plg_error(context->ins, - "cannot create HTTP client"); - - return -4; - } - - flb_http_add_header(http_client, - "Accept", - strlen("Accept"), - "text/plain", - 10); - - /* User Agent */ - flb_http_add_header(http_client, - "User-Agent", 10, - "Fluent-Bit", 10); - - if (context->authorization_endpoint_username != NULL && - context->authorization_endpoint_password != NULL) { - flb_http_basic_auth(http_client, - context->authorization_endpoint_username, - context->authorization_endpoint_password); - } - else if (context->authorization_endpoint_bearer_token != NULL) { - flb_http_bearer_auth(http_client, - context->authorization_endpoint_bearer_token); - } - - /* Send HTTP request */ - ret = flb_http_do(http_client, &b_sent); - - if (ret == -1) { - flb_plg_error(context->ins, - "Error sending configuration request"); - - ret = -5; - } - else { - if (http_client->resp.status == 200) { - flb_plg_info(context->ins, - "Pre signed url retrieved successfully"); - - if (*result_url != NULL) { - tmp = flb_sds_copy(*result_url, - http_client->resp.payload, - http_client->resp.payload_size); - } - else { - tmp = flb_sds_create_len(http_client->resp.payload, - http_client->resp.payload_size); - } - - if (tmp == NULL) { - flb_plg_error(context->ins, - "Pre signed url duplication error"); - - ret = -7; - } - else { - *result_url = tmp; - } - } - else { - if (http_client->resp.payload_size > 0) { - flb_plg_error(context->ins, - "Pre signed url retrieval failed with status %i\n%s", - http_client->resp.status, - http_client->resp.payload); - } - else { - flb_plg_error(context->ins, - "Pre signed url retrieval failed with status %i", - http_client->resp.status); - } - - ret = -6; - } - } - - flb_http_client_destroy(http_client); - flb_upstream_conn_release(connection); - flb_free(host); - flb_free(uri); - - return ret; -} - -static int blob_fetch_pre_signed_url(struct 
flb_s3 *context, - flb_sds_t *result_url, - char *format, - ...) -{ - va_list arguments[2]; - int url_length; - int ret; - flb_sds_t url; - flb_sds_t tmp; - - va_start(arguments[0], format); - va_copy(arguments[1], arguments[0]); - - url_length = vsnprintf(NULL, 0, format, arguments[0]); - - va_end(arguments[0]); - - if (url_length <= 0) { - va_end(arguments[1]); - - return -1; - } - - url = flb_sds_create_size( - flb_sds_len(context->authorization_endpoint_url) + url_length + 2); - - if (url == NULL) { - va_end(arguments[1]); - - return -2; - } - - tmp = flb_sds_cat(url, - context->authorization_endpoint_url, - flb_sds_len(context->authorization_endpoint_url)); - - url_length = vsnprintf( - &tmp[flb_sds_len(tmp)], - flb_sds_avail(tmp), - format, - arguments[1]); - - va_end(arguments[1]); - - if (url_length <= 0) { - flb_sds_destroy(tmp); - - return -3; - } - - url = tmp; - - flb_sds_len_set(url, flb_sds_len(url) + url_length); - - ret = blob_request_pre_signed_url(context, result_url, (char *) url); - - flb_sds_destroy(url); - - return ret; -} - -static int blob_fetch_put_object_pre_signed_url(struct flb_s3 *context, - flb_sds_t *result_url, - char *tag, - char *bucket, - char *path) -{ - char *valid_path; - - valid_path = (char *) path; - - while (*valid_path == '.' || - *valid_path == '/') { - valid_path++; - } - - return blob_fetch_pre_signed_url(context, - result_url, - "/put_object_presigned_url/%s/%s/%s", - bucket, - tag, - valid_path); -} - -static int blob_fetch_create_multipart_upload_pre_signed_url(struct flb_s3 *context, - flb_sds_t *result_url, - char *tag, - char *bucket, - char *path) -{ - char *valid_path; - - valid_path = (char *) path; - - while (*valid_path == '.' || - *valid_path == '/') { - valid_path++; - } - - return blob_fetch_pre_signed_url(context, - result_url, - "/multipart_creation_presigned_url/%s/%s/%s", - bucket, - tag, - valid_path); -} - -static int blob_fetch_multipart_upload_pre_signed_url(struct flb_s3 *context, - flb_sds_t *result_url, - char *tag, - char *bucket, - char *path, - char *upload_id, - int part_number) -{ - char *valid_path; - - valid_path = (char *) path; - - while (*valid_path == '.' || - *valid_path == '/') { - valid_path++; - } - - return blob_fetch_pre_signed_url(context, - result_url, - "/multipart_upload_presigned_url/%s/%s/%s/%s/%d", - bucket, - tag, - valid_path, - upload_id, - part_number); -} - -static int blob_fetch_multipart_complete_pre_signed_url(struct flb_s3 *context, - flb_sds_t *result_url, - char *tag, - char *bucket, - char *path, - char *upload_id) -{ - char *valid_path; - - valid_path = (char *) path; - - while (*valid_path == '.' || - *valid_path == '/') { - valid_path++; - } - - return blob_fetch_pre_signed_url(context, - result_url, - "/multipart_complete_presigned_url/%s/%s/%s/%s", - bucket, - tag, - valid_path, - upload_id); -} - -static int blob_fetch_multipart_abort_pre_signed_url(struct flb_s3 *context, - flb_sds_t *result_url, - char *tag, - char *bucket, - char *path, - char *upload_id) -{ - char *valid_path; - - valid_path = (char *) path; - - while (*valid_path == '.' 
|| - *valid_path == '/') { - valid_path++; - } - - return blob_fetch_pre_signed_url(context, - result_url, - "/multipart_upload_presigned_url/%s/%s/%s/%s", - bucket, - tag, - valid_path, - upload_id); -} - -static struct multipart_upload *create_blob_upload(struct flb_s3 *ctx, const char *tag, - int tag_len, - const char *path) -{ - int ret; - struct multipart_upload *m_upload = NULL; - flb_sds_t s3_key = NULL; - flb_sds_t tmp_sds = NULL; - - /* create new upload for this key */ - m_upload = flb_calloc(1, sizeof(struct multipart_upload)); - if (!m_upload) { - flb_errno(); - return NULL; - } - - s3_key = flb_get_s3_blob_key("/$TAG/", - tag, - ctx->tag_delimiters, - path); - - if (!s3_key) { - flb_plg_error(ctx->ins, "Failed to construct S3 Object Key for %s", tag); - flb_free(m_upload); - return NULL; - } - m_upload->s3_key = s3_key; - tmp_sds = flb_sds_create_len(tag, tag_len); - if (!tmp_sds) { - flb_errno(); - flb_sds_destroy(s3_key); - flb_free(m_upload); - return NULL; - } - m_upload->tag = tmp_sds; - m_upload->upload_state = MULTIPART_UPLOAD_STATE_NOT_CREATED; - m_upload->part_number = 1; - m_upload->init_time = time(NULL); - mk_list_add(&m_upload->_head, &ctx->uploads); - - /* Update file and increment index value right before request */ - if (ctx->key_fmt_has_seq_index) { - ctx->seq_index++; - - ret = write_seq_index(ctx->seq_index_file, ctx->seq_index); - if (ret < 0) { - ctx->seq_index--; - - mk_list_del(&m_upload->_head); - - flb_sds_destroy(tmp_sds); - flb_sds_destroy(s3_key); - - flb_free(m_upload); - - flb_plg_error(ctx->ins, "Failed to write to sequential index metadata file"); - - return NULL; - } - } - - return m_upload; -} - -static int put_blob_object(struct flb_s3 *ctx, - const char *tag, - const char *path, - char *body, size_t body_size) -{ - flb_sds_t s3_key = NULL; - struct flb_http_client *c = NULL; - struct flb_aws_client *s3_client; - struct flb_aws_header *headers = NULL; - int len; - int ret; - int num_headers = 0; - char *final_key; - flb_sds_t uri; - flb_sds_t tmp; - char final_body_md5[25]; - - if (ctx->authorization_endpoint_url == NULL) { - s3_key = flb_get_s3_blob_key("/$TAG/", - tag, - ctx->tag_delimiters, - path); - - if (!s3_key) { - flb_plg_error(ctx->ins, "Failed to construct S3 Object Key for %s", tag); - return -1; - } - - len = strlen(s3_key); - len += strlen(ctx->bucket + 1); - - uri = flb_sds_create_size(len); - - tmp = flb_sds_printf(&uri, "/%s%s", ctx->bucket, s3_key); - - if (!tmp) { - flb_sds_destroy(s3_key); - flb_plg_error(ctx->ins, "Failed to create PutObject URI"); - return -1; - } - - flb_sds_destroy(s3_key); - uri = tmp; - } - else { - uri = NULL; - - ret = blob_fetch_put_object_pre_signed_url(ctx, &uri, (char *) tag, ctx->bucket, (char *) path); - - if (ret != 0) { - return -1; - } - } - - memset(final_body_md5, 0, sizeof(final_body_md5)); - if (ctx->send_content_md5 == FLB_TRUE) { - ret = get_md5_base64(body, body_size, - final_body_md5, sizeof(final_body_md5)); - if (ret != 0) { - flb_plg_error(ctx->ins, "Failed to create Content-MD5 header"); - flb_sds_destroy(uri); - return -1; - } - } - - s3_client = ctx->s3_client; - if (s3_plugin_under_test() == FLB_TRUE) { - c = mock_s3_call("TEST_PUT_OBJECT_ERROR", "PutObject"); - } - else { - ret = create_headers(ctx, final_body_md5, &headers, &num_headers, FLB_FALSE); - if (ret == -1) { - flb_plg_error(ctx->ins, "Failed to create headers"); - flb_sds_destroy(uri); - return -1; - } - - c = s3_client->client_vtable->request(s3_client, FLB_HTTP_PUT, - uri, body, body_size, - headers, 
num_headers); - flb_free(headers); - } - if (c) { - flb_plg_debug(ctx->ins, "PutObject http status=%d", c->resp.status); - if (c->resp.status == 200) { - /* - * URI contains bucket name, so we must advance over it - * to print the object key - */ - final_key = uri + strlen(ctx->bucket) + 1; - flb_plg_info(ctx->ins, "Successfully uploaded object %s", final_key); - flb_sds_destroy(uri); - flb_http_client_destroy(c); - - return 0; - } - flb_aws_print_xml_error(c->resp.payload, c->resp.payload_size, - "PutObject", ctx->ins); - if (c->resp.data != NULL) { - flb_plg_error(ctx->ins, "Raw PutObject response: %s", c->resp.data); - } - flb_http_client_destroy(c); - } - - flb_plg_error(ctx->ins, "PutObject request failed"); - flb_sds_destroy(uri); - - return -1; -} - -static int abort_blob_upload(struct flb_s3 *ctx, - cfl_sds_t file_tag, - cfl_sds_t file_path, - cfl_sds_t file_remote_id) -{ - struct multipart_upload *m_upload; - flb_sds_t pre_signed_url; - int ret; - - pre_signed_url = NULL; - - m_upload = create_blob_upload(ctx, file_tag, cfl_sds_len(file_tag), file_path); - - if (m_upload == NULL) { - return -1; - } - - mk_list_del(&m_upload->_head); - - m_upload->upload_id = flb_sds_create(file_remote_id); - - if (m_upload->upload_id == NULL) { - m_upload->part_number = 0; - - multipart_upload_destroy(m_upload); - - flb_plg_error(ctx->ins, "Could not allocate upload id copy"); - - return -2; - } - - if (ctx->authorization_endpoint_url != NULL) { - ret = blob_fetch_multipart_abort_pre_signed_url(ctx, - &pre_signed_url, - file_tag, - ctx->bucket, - file_path, - m_upload->upload_id); - - if (ret != 0) { - m_upload->part_number = 0; - - multipart_upload_destroy(m_upload); - - return -3; - } - } - else { - pre_signed_url = NULL; - } - - ret = abort_multipart_upload(ctx, m_upload, pre_signed_url); - - if (pre_signed_url != NULL) { - flb_sds_destroy(pre_signed_url); - - pre_signed_url = NULL; - } - - m_upload->part_number = 0; - - multipart_upload_destroy(m_upload); - - return 0; -} - -static int cb_s3_upload_blob(struct flb_config *config, void *data) -{ - int ret; - char *out_buf = NULL; - size_t out_size; - uint64_t id; - uint64_t file_id; - uint64_t part_id; - uint64_t part_delivery_attempts; - uint64_t file_delivery_attempts; - off_t offset_start; - off_t offset_end; - cfl_sds_t file_remote_id = NULL; - cfl_sds_t file_destination = NULL; - cfl_sds_t file_path = NULL; - cfl_sds_t file_tag = NULL; - cfl_sds_t part_ids = NULL; - cfl_sds_t source = NULL; - struct flb_s3 *ctx = data; - struct worker_info *info; - struct flb_blob_delivery_notification *notification; - struct multipart_upload *m_upload; - int part_count; - int put_object_required; - flb_sds_t pre_signed_url; - - info = FLB_TLS_GET(s3_worker_info); - - if (info->active_upload) { - flb_plg_trace(ctx->ins, "[worker: file upload] upload already in progress..."); - - return 0; - } - - if (ctx->blob_db.db == NULL) { - return 0; - } - - info->active_upload = FLB_TRUE; - pre_signed_url = NULL; - - /* - * Check if is there any file which has been fully uploaded and we need to commit it with - * the Put Block List operation - */ - - flb_blob_db_lock(&ctx->blob_db); - - while (1) { - ret = flb_blob_db_file_get_next_stale(&ctx->blob_db, - &file_id, - &file_path, - ctx->upload_parts_freshness_threshold, - &file_remote_id, - &file_tag, - &part_count); - - if (ret == 1) { - if (part_count > 1) { - ret = abort_blob_upload(ctx, file_tag, file_path, file_remote_id); - - if (ret != 0) { - cfl_sds_destroy(file_tag); - cfl_sds_destroy(file_path); - 
cfl_sds_destroy(file_remote_id); - cfl_sds_destroy(file_destination); - - flb_blob_db_unlock(&ctx->blob_db); - - return -1; - } - } - - flb_blob_file_update_remote_id(&ctx->blob_db, file_id, ""); - flb_blob_db_file_reset_upload_states(&ctx->blob_db, file_id); - flb_blob_db_file_set_aborted_state(&ctx->blob_db, file_id, 0); - - cfl_sds_destroy(file_remote_id); - cfl_sds_destroy(file_path); - cfl_sds_destroy(source); - - file_remote_id = NULL; - file_path = NULL; - source = NULL; - } - else { - break; - } - } - - while (1) { - ret = flb_blob_db_file_get_next_aborted(&ctx->blob_db, - &file_id, - &file_delivery_attempts, - &file_path, - &source, - &file_remote_id, - &file_tag, - &part_count); - - if (ret == 1) { - if (part_count > 1) { - ret = abort_blob_upload(ctx, file_tag, file_path, file_remote_id); - - if (ret != 0) { - cfl_sds_destroy(file_tag); - cfl_sds_destroy(file_path); - cfl_sds_destroy(file_remote_id); - cfl_sds_destroy(file_destination); - - flb_blob_db_unlock(&ctx->blob_db); - - return -1; - } - } - - if (ctx->file_delivery_attempt_limit != FLB_OUT_RETRY_UNLIMITED && - file_delivery_attempts < ctx->file_delivery_attempt_limit) { - - flb_blob_file_update_remote_id(&ctx->blob_db, file_id, ""); - flb_blob_db_file_reset_upload_states(&ctx->blob_db, file_id); - flb_blob_db_file_set_aborted_state(&ctx->blob_db, file_id, 0); - } - else { - ret = flb_blob_db_file_delete(&ctx->blob_db, file_id); - - notification = flb_calloc(1, - sizeof( - struct flb_blob_delivery_notification)); - - if (notification != NULL) { - notification->base.dynamically_allocated = FLB_TRUE; - notification->base.notification_type = FLB_NOTIFICATION_TYPE_BLOB_DELIVERY; - notification->base.destructor = flb_input_blob_delivery_notification_destroy; - notification->success = FLB_FALSE; - notification->path = cfl_sds_create(file_path); - - ret = flb_notification_enqueue(FLB_PLUGIN_INPUT, - source, - ¬ification->base, - config); - - if (ret != 0) { - flb_plg_error(ctx->ins, - "blob file '%s' (id=%" PRIu64 ") notification " \ - "delivery error %d", file_path, file_id, ret); - - flb_notification_cleanup(¬ification->base); - } - } - } - - cfl_sds_destroy(file_remote_id); - cfl_sds_destroy(file_path); - cfl_sds_destroy(source); - - file_remote_id = NULL; - file_path = NULL; - source = NULL; - } - else { - break; - } - } - - ret = flb_blob_db_file_fetch_oldest_ready(&ctx->blob_db, - &file_id, - &file_path, - &part_ids, - &source, - &file_remote_id, - &file_tag, - &part_count); - - if (ret == 0) { - flb_plg_trace(ctx->ins, "no blob files ready to commit"); - } - else if (ret == -1) { - flb_plg_error(ctx->ins, "cannot get oldest blob file ready to upload"); - } - else if (ret == 1) { - /* one file is ready to be committed */ - flb_plg_debug(ctx->ins, "blob file '%s' (id=%" PRIu64 ") ready to upload", file_path, file_id); - - if (part_count > 1) { - m_upload = create_blob_upload(ctx, file_tag, cfl_sds_len(file_tag), file_path); - - if (m_upload == NULL) { - flb_blob_db_unlock(&ctx->blob_db); - - return -1; - } - - mk_list_del(&m_upload->_head); - - m_upload->upload_id = flb_sds_create(file_remote_id); - - if (m_upload->upload_id == NULL) { - m_upload->part_number = 0; - multipart_upload_destroy(m_upload); - - flb_plg_error(ctx->ins, "Could not allocate upload id copy"); - - flb_blob_db_unlock(&ctx->blob_db); - - return -4; - } - - ret = flb_blob_db_file_fetch_part_ids(&ctx->blob_db, - file_id, - m_upload->etags, - 1000, - &part_count); - - if (ret == -1) { - m_upload->part_number = 0; - multipart_upload_destroy(m_upload); - - 
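Editor's note (illustrative, not part of the patch): the blob commit/abort flow removed here is superseded by the queue-driven design, with the new s3_initiate_multipart_upload() shown earlier as the per-file entry point. A hedged usage sketch; the surrounding recovery helper is hypothetical and only the call signature comes from this diff:

    /* Illustrative only: how the new orchestration function is meant to be driven
     * for one file whose id, path and tag were fetched from the blob database. */
    static int sketch_recover_file(struct flb_s3 *ctx,
                                   uint64_t file_id, const char *file_path,
                                   const char *tag, int tag_len)
    {
        int ret;

        ret = s3_initiate_multipart_upload(ctx, file_id, file_path, tag, tag_len);
        if (ret < 0) {
            /* parts stay in the blob DB; the timer callback retries later */
            return -1;
        }

        return 0;
    }
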
flb_plg_error(ctx->ins, "Could not retrieve part ids"); - - flb_blob_db_unlock(&ctx->blob_db); - - return -5; - } - - m_upload->part_number = part_count; - - if (ctx->authorization_endpoint_url != NULL) { - ret = blob_fetch_multipart_complete_pre_signed_url(ctx, - &pre_signed_url, - file_tag, - ctx->bucket, - file_path, - m_upload->upload_id); - - if (ret != 0) { - multipart_upload_destroy(m_upload); - - flb_blob_db_unlock(&ctx->blob_db); - - return -5; - } - } - else { - pre_signed_url = NULL; - } - - ret = complete_multipart_upload(ctx, m_upload, pre_signed_url); - - if (pre_signed_url != NULL) { - flb_sds_destroy(pre_signed_url); - - pre_signed_url = NULL; - } - - if (ret < 0) { - multipart_upload_destroy(m_upload); - - flb_plg_error(ctx->ins, "Could not initiate multipart upload"); - - flb_blob_db_unlock(&ctx->blob_db); - - return -6; - } - - multipart_upload_destroy(m_upload); - } - else { - ret = 0; - } - - if (ret == -1) { - flb_plg_error(ctx->ins, "cannot commit blob file parts for file id=%" PRIu64 " path=%s", - file_id, file_path); - } - else { - flb_plg_info(ctx->ins, "blob file '%s' (id=%" PRIu64 ") committed successfully", file_path, file_id); - /* notify the engine the blob file has been processed */ - - notification = flb_calloc(1, - sizeof( - struct flb_blob_delivery_notification)); - - if (notification != NULL) { - notification->base.dynamically_allocated = FLB_TRUE; - notification->base.notification_type = FLB_NOTIFICATION_TYPE_BLOB_DELIVERY; - notification->base.destructor = flb_input_blob_delivery_notification_destroy; - notification->success = FLB_TRUE; - notification->path = cfl_sds_create(file_path); - - ret = flb_notification_enqueue(FLB_PLUGIN_INPUT, - source, - ¬ification->base, - config); - - if (ret != 0) { - flb_plg_error(ctx->ins, - "blob file '%s' (id=%" PRIu64 ") notification " \ - "delivery error %d", file_path, file_id, ret); - - flb_notification_cleanup(¬ification->base); - } - } - - /* remove the file entry from the database */ - ret = flb_blob_db_file_delete(&ctx->blob_db, file_id); - if (ret == -1) { - flb_plg_error(ctx->ins, "cannot delete blob file '%s' (id=%" PRIu64 ") from the database", - file_path, file_id); - } - } - } - - flb_blob_db_unlock(&ctx->blob_db); - - if (file_tag) { - cfl_sds_destroy(file_tag); - file_tag = NULL; - } - - if (file_path) { - cfl_sds_destroy(file_path); - file_path = NULL; - } - - if (part_ids) { - cfl_sds_destroy(part_ids); - part_ids = NULL; - } - - if (source) { - cfl_sds_destroy(source); - source = NULL; - } - - if (file_remote_id) { - cfl_sds_destroy(file_remote_id); - file_remote_id = NULL; - } - - /* check for a next part file and lock it */ - ret = flb_blob_db_file_part_get_next(&ctx->blob_db, &id, &file_id, &part_id, - &offset_start, &offset_end, - &part_delivery_attempts, - &file_delivery_attempts, - &file_path, - &file_destination, - &file_remote_id, - &file_tag, - &part_count); - - if (ret == -1) { - flb_plg_error(ctx->ins, "cannot get next blob file part"); - info->active_upload = FLB_FALSE; - - return -1; - } - else if (ret == 0) { - flb_plg_trace(ctx->ins, "no more blob file parts to process"); - info->active_upload = FLB_FALSE; - - return -1; - } - else if (ret == 1) { - /* just continue, the row info was retrieved */ - } - - if (strcmp(file_destination, ctx->endpoint) != 0) { - flb_plg_info(ctx->ins, - "endpoint change detected, restarting file : %s\n%s\n%s", - file_path, - file_destination, - ctx->endpoint); - - info->active_upload = FLB_FALSE; - - /* we need to set the aborted state flag to wait for 
existing uploads - * to finish and then wipe the slate and start again but we don't want - * to increment the failure count in this case. - */ - flb_blob_db_file_set_aborted_state(&ctx->blob_db, file_id, 1); - - cfl_sds_destroy(file_tag); - cfl_sds_destroy(file_path); - cfl_sds_destroy(file_remote_id); - cfl_sds_destroy(file_destination); - - return 0; - } - - /* since this is the first part we want to increment the files - * delivery attempt counter. - */ - if (part_id == 0) { - flb_blob_db_file_delivery_attempts(&ctx->blob_db, file_id, ++file_delivery_attempts); - } - - /* read the file content */ - ret = flb_utils_read_file_offset(file_path, offset_start, offset_end, &out_buf, &out_size); - if (ret == -1) { - flb_plg_error(ctx->ins, "cannot read file part %s", file_path); - - info->active_upload = FLB_FALSE; - - cfl_sds_destroy(file_tag); - cfl_sds_destroy(file_path); - cfl_sds_destroy(file_remote_id); - cfl_sds_destroy(file_destination); - - return 0; - } - - flb_blob_db_file_part_update_delivery_attempt_counter(&ctx->blob_db, file_id, part_id, ++part_delivery_attempts); - - flb_plg_debug(ctx->ins, "sending part file %s (id=%" PRIu64 " part_id=%" PRIu64 ")", file_path, id, part_id); - - put_object_required = FLB_FALSE; - - - if (part_id == 0) { - if (part_count == 1) { - if (out_size <= MIN_CHUNKED_UPLOAD_SIZE) { - put_object_required = FLB_TRUE; - } - } - } - - if (put_object_required == FLB_TRUE) { - ret = put_blob_object(ctx, - file_tag, - file_path, - out_buf, - out_size); - - if (ret != 0) { - flb_free(out_buf); - - cfl_sds_destroy(file_tag); - cfl_sds_destroy(file_path); - cfl_sds_destroy(file_remote_id); - cfl_sds_destroy(file_destination); - - return -1; - } - } - else { - m_upload = create_blob_upload(ctx, file_tag, cfl_sds_len(file_tag), file_path); - - if (m_upload == NULL) { - flb_free(out_buf); - - cfl_sds_destroy(file_tag); - cfl_sds_destroy(file_path); - cfl_sds_destroy(file_remote_id); - cfl_sds_destroy(file_destination); - - return -1; - } - - mk_list_del(&m_upload->_head); - - if (part_id == 0) { - if (ctx->authorization_endpoint_url != NULL) { - ret = blob_fetch_create_multipart_upload_pre_signed_url(ctx, - &pre_signed_url, - file_tag, - ctx->bucket, - file_path); - - if (ret != 0) { - flb_free(out_buf); - - cfl_sds_destroy(file_tag); - cfl_sds_destroy(file_path); - cfl_sds_destroy(file_remote_id); - cfl_sds_destroy(file_destination); - - m_upload->part_number = 0; - multipart_upload_destroy(m_upload); - - return -1; - } - } - else { - pre_signed_url = NULL; - } - - ret = create_multipart_upload(ctx, m_upload, pre_signed_url); - - if (pre_signed_url != NULL) { - flb_sds_destroy(pre_signed_url); - - pre_signed_url = NULL; - } - - if (ret < 0) { - flb_free(out_buf); - - cfl_sds_destroy(file_tag); - cfl_sds_destroy(file_path); - cfl_sds_destroy(file_remote_id); - cfl_sds_destroy(file_destination); - - - m_upload->part_number = 0; - multipart_upload_destroy(m_upload); - - flb_plg_error(ctx->ins, "Could not initiate multipart upload"); - - return -2; - } - - ret = flb_blob_file_update_remote_id(&ctx->blob_db, file_id, m_upload->upload_id); - - if (ret != FLB_BLOB_DB_SUCCESS) { - flb_free(out_buf); - - cfl_sds_destroy(file_tag); - cfl_sds_destroy(file_path); - cfl_sds_destroy(file_remote_id); - cfl_sds_destroy(file_destination); - - m_upload->part_number = 0; - multipart_upload_destroy(m_upload); - - flb_plg_error(ctx->ins, "Could not save upload id"); - - return -3; - } - } - else { - m_upload->upload_id = flb_sds_create(file_remote_id); - - if (m_upload->upload_id == 
NULL) { - flb_free(out_buf); - - cfl_sds_destroy(file_tag); - cfl_sds_destroy(file_path); - cfl_sds_destroy(file_remote_id); - cfl_sds_destroy(file_destination); - - m_upload->part_number = 0; - multipart_upload_destroy(m_upload); - - flb_plg_error(ctx->ins, "Could not allocate upload id copy"); - - return -4; - } - } - - m_upload->part_number = part_id + 1; - - if (ctx->authorization_endpoint_url != NULL) { - ret = blob_fetch_multipart_upload_pre_signed_url(ctx, - &pre_signed_url, - file_tag, - ctx->bucket, - file_path, - m_upload->upload_id, - m_upload->part_number); - - if (ret != 0) { - flb_free(out_buf); - - cfl_sds_destroy(file_tag); - cfl_sds_destroy(file_path); - cfl_sds_destroy(file_remote_id); - cfl_sds_destroy(file_destination); - - m_upload->part_number = 0; - multipart_upload_destroy(m_upload); - - return -1; - } - } - else { - pre_signed_url = NULL; - } - - ret = upload_part(ctx, m_upload, out_buf, out_size, pre_signed_url); - - if (pre_signed_url != NULL) { - flb_sds_destroy(pre_signed_url); - - pre_signed_url = NULL; - } - - if (ret == 0) { - ret = flb_blob_db_file_part_update_remote_id(&ctx->blob_db, - id, - m_upload->etags[m_upload->part_number - 1]); - - flb_sds_destroy(m_upload->etags[m_upload->part_number - 1]); - } - - - m_upload->part_number = 0; - multipart_upload_destroy(m_upload); - } - - if (ret == 0) { - ret = flb_blob_db_file_part_uploaded(&ctx->blob_db, id); - } - else { - ret = flb_blob_db_file_part_in_progress(&ctx->blob_db, 0, id); - - if (ctx->part_delivery_attempt_limit != FLB_OUT_RETRY_UNLIMITED && - part_delivery_attempts >= ctx->part_delivery_attempt_limit) { - flb_blob_db_file_set_aborted_state(&ctx->blob_db, file_id, 1); - } - } - - info->active_upload = FLB_FALSE; - - flb_free(out_buf); - - cfl_sds_destroy(file_tag); - cfl_sds_destroy(file_path); - cfl_sds_destroy(file_remote_id); - cfl_sds_destroy(file_destination); - - return 0; -} - - - - - -static void cb_s3_upload(struct flb_config *config, void *data) -{ - struct flb_s3 *ctx = data; - struct s3_file *chunk = NULL; - struct multipart_upload *m_upload = NULL; - struct flb_fstore_file *fsf; - char *buffer = NULL; - size_t buffer_size = 0; - struct mk_list *tmp; - struct mk_list *head; - int complete; - int ret; - time_t now; - - flb_plg_info(ctx->ins, "Running upload timer callback (cb_s3_upload).."); - - now = time(NULL); - - /* Check all chunks and see if any have timed out */ - mk_list_foreach_safe(head, tmp, &ctx->stream_active->files) { - fsf = mk_list_entry(head, struct flb_fstore_file, _head); - chunk = fsf->data; - - if (now < (chunk->create_time + ctx->upload_timeout + ctx->retry_time)) { - continue; /* Only send chunks which have timed out */ - } - - /* Locked chunks are being processed, skip */ - if (chunk->locked == FLB_TRUE) { - continue; - } - - m_upload = get_upload(ctx, (const char *) fsf->meta_buf, fsf->meta_size); - - ret = construct_request_buffer(ctx, NULL, chunk, &buffer, &buffer_size); - if (ret < 0) { - flb_plg_error(ctx->ins, "Could not construct request buffer for %s", - chunk->file_path); - continue; - } - - /* FYI: if construct_request_buffer() succeedeed, the s3_file is locked */ - ret = upload_data(ctx, chunk, m_upload, buffer, buffer_size, - (const char *) fsf->meta_buf, fsf->meta_size); - flb_free(buffer); - if (ret != FLB_OK) { - flb_plg_error(ctx->ins, "Could not send chunk with tag %s", - (char *) fsf->meta_buf); - if(chunk->failures >= ctx->ins->retry_limit){ - flb_plg_warn(ctx->ins, - "Chunk for tag %s failed to send %d/%d times, will not retry", - (char *) 
fsf->meta_buf, chunk->failures, ctx->ins->retry_limit); - flb_fstore_file_inactive(ctx->fs, fsf); - continue; - } - } - } - - /* Check all uploads and see if any need completion */ - mk_list_foreach_safe(head, tmp, &ctx->uploads) { - m_upload = mk_list_entry(head, struct multipart_upload, _head); - complete = FLB_FALSE; - - if (m_upload->complete_errors >= ctx->ins->retry_limit) { - flb_plg_error(ctx->ins, - "Upload for %s has reached max completion errors, " - "plugin will give up", m_upload->s3_key); - mk_list_del(&m_upload->_head); - continue; - } - - if (m_upload->upload_state == MULTIPART_UPLOAD_STATE_NOT_CREATED) { - continue; - } - - if (m_upload->upload_state == MULTIPART_UPLOAD_STATE_COMPLETE_IN_PROGRESS) { - complete = FLB_TRUE; - } - if (time(NULL) > (m_upload->init_time + ctx->upload_timeout + ctx->retry_time)) { - flb_plg_info(ctx->ins, "Completing upload for %s because upload_timeout" - " has passed", m_upload->s3_key); - complete = FLB_TRUE; - } - if (complete == FLB_TRUE) { - m_upload->upload_state = MULTIPART_UPLOAD_STATE_COMPLETE_IN_PROGRESS; - mk_list_del(&m_upload->_head); - ret = complete_multipart_upload(ctx, m_upload, NULL); - if (ret == 0) { - multipart_upload_destroy(m_upload); - } - else { - mk_list_add(&m_upload->_head, &ctx->uploads); - /* data was persisted, this can be retried */ - m_upload->complete_errors += 1; - flb_plg_error(ctx->ins, "Could not complete upload %s, will retry..", - m_upload->s3_key); - } - } - } - -} - -static flb_sds_t flb_pack_msgpack_extract_log_key(void *out_context, const char *data, - uint64_t bytes, struct flb_config *config) -{ - int i; - int records = 0; - int map_size; - int check = FLB_FALSE; - int found = FLB_FALSE; - int log_key_missing = 0; - int ret; - int alloc_error = 0; - struct flb_s3 *ctx = out_context; - char *val_buf; - char *key_str = NULL; - size_t key_str_size = 0; - size_t msgpack_size = bytes + bytes / 4; - size_t val_offset = 0; - flb_sds_t out_buf; - msgpack_object map; - msgpack_object key; - msgpack_object val; - struct flb_log_event_decoder log_decoder; - struct flb_log_event log_event; - - /* Iterate the original buffer and perform adjustments */ - records = flb_mp_count(data, bytes); - if (records <= 0) { - return NULL; - } - - /* Allocate buffer to store log_key contents */ - val_buf = flb_calloc(1, msgpack_size); - if (val_buf == NULL) { - flb_plg_error(ctx->ins, "Could not allocate enough " - "memory to read record"); - flb_errno(); - return NULL; - } - - ret = flb_log_event_decoder_init(&log_decoder, (char *) data, bytes); - - if (ret != FLB_EVENT_DECODER_SUCCESS) { - flb_plg_error(ctx->ins, - "Log event decoder initialization error : %d", ret); + compressed_path = flb_sds_create_size(strlen(file_path) + 12); + if (!compressed_path) { + flb_errno(); + unlink(file_path); + return FLB_RETRY; + } - flb_free(val_buf); + compressed_path = flb_sds_printf(&compressed_path, "%s%s", file_path, compression_suffix); + if (!compressed_path) { + flb_errno(); + unlink(file_path); + return FLB_RETRY; + } - return NULL; - } + ret = stream_compress_file(ctx, file_path, compressed_path, 0, -1); + if (ret < 0) { + unlink(file_path); + flb_sds_destroy(compressed_path); + return FLB_RETRY; + } + if (unlink(file_path) != 0) { + flb_plg_warn(ctx->ins, "Failed to delete uncompressed temp file: %s", + file_path); + } - while (!alloc_error && - (ret = flb_log_event_decoder_next( - &log_decoder, - &log_event)) == FLB_EVENT_DECODER_SUCCESS) { + final_file_path = compressed_path; + } - /* Get the record/map */ - map = 
*log_event.body; + /* Extract filename from path for log data */ + const char *filename = strrchr(final_file_path, '/'); + filename = filename ? filename + 1 : final_file_path; - if (map.type != MSGPACK_OBJECT_MAP) { - continue; + /* Generate s3_key - same logic for all files (with or without DB) */ + s3_key = flb_get_s3_key(ctx->s3_key_format, file_first_log_time, tag, + ctx->tag_delimiters, ctx->seq_index, filename); + if (!s3_key) { + flb_plg_error(ctx->ins, "Failed to generate S3 key for log data"); + if (compressed_path) { + flb_sds_destroy(compressed_path); + } + unlink(final_file_path); + return FLB_RETRY; } - map_size = map.via.map.size; + flb_plg_debug(ctx->ins, "Uploading log file %s to S3 key: %s", filename, s3_key); - /* Reset variables for found log_key and correct type */ - found = FLB_FALSE; - check = FLB_FALSE; + /* Use streaming multipart upload from disk - MEMORY OPTIMIZED */ + ret = s3_multipart_upload_file(ctx, final_file_path, s3_key, tag, tag_len, file_first_log_time); - /* Extract log_key from record and append to output buffer */ - for (i = 0; i < map_size; i++) { - key = map.via.map.ptr[i].key; - val = map.via.map.ptr[i].val; - - if (key.type == MSGPACK_OBJECT_BIN) { - key_str = (char *) key.via.bin.ptr; - key_str_size = key.via.bin.size; - check = FLB_TRUE; - } - if (key.type == MSGPACK_OBJECT_STR) { - key_str = (char *) key.via.str.ptr; - key_str_size = key.via.str.size; - check = FLB_TRUE; - } + flb_sds_destroy(s3_key); - if (check == FLB_TRUE) { - if (strncmp(ctx->log_key, key_str, key_str_size) == 0) { - found = FLB_TRUE; - - /* - * Copy contents of value into buffer. Necessary to copy - * strings because flb_msgpack_to_json does not handle nested - * JSON gracefully and double escapes them. - */ - if (val.type == MSGPACK_OBJECT_BIN) { - memcpy(val_buf + val_offset, val.via.bin.ptr, val.via.bin.size); - val_offset += val.via.bin.size; - val_buf[val_offset] = '\n'; - val_offset++; - } - else if (val.type == MSGPACK_OBJECT_STR) { - memcpy(val_buf + val_offset, val.via.str.ptr, val.via.str.size); - val_offset += val.via.str.size; - val_buf[val_offset] = '\n'; - val_offset++; - } - else { - ret = flb_msgpack_to_json(val_buf + val_offset, - msgpack_size - val_offset, &val, - config->json_escape_unicode); - if (ret < 0) { - break; - } - val_offset += ret; - val_buf[val_offset] = '\n'; - val_offset++; - } - /* Exit early once log_key has been found for current record */ - break; - } + flb_plg_debug(ctx->ins, "Cleaning up temporary file: %s", final_file_path); + if (unlink(final_file_path) != 0) { + if (errno != ENOENT) { + flb_plg_warn(ctx->ins, "Failed to delete temporary file %s: %s", + final_file_path, strerror(errno)); } } - /* If log_key was not found in the current record, mark log key as missing */ - if (found == FLB_FALSE) { - log_key_missing++; + if (compressed_path) { + flb_sds_destroy(compressed_path); } - } - /* Throw error once per chunk if at least one log key was not found */ - if (log_key_missing > 0) { - flb_plg_error(ctx->ins, "Could not find log_key '%s' in %d records", - ctx->log_key, log_key_missing); + return ret; } - flb_log_event_decoder_destroy(&log_decoder); + flb_plg_error(ctx->ins, "Invalid upload path: expected FILE: marker but got direct data"); + return FLB_ERROR; +} + - /* If nothing was read, destroy buffer */ - if (val_offset == 0) { - flb_free(val_buf); - return NULL; +static int s3_stream_to_json(struct flb_s3 *ctx, + struct s3_file *chunk, + flb_sds_t *out_buf, + size_t *out_size) +{ + char chunk_path[PATH_MAX]; + int ret; + + if 
(!chunk || !chunk->stream_path || !chunk->fsf || + !chunk->fsf->chunk || !chunk->fsf->chunk->name) { + flb_plg_error(ctx->ins, "Invalid chunk data"); + return -1; } - val_buf[val_offset] = '\0'; - /* Create output buffer to store contents */ - out_buf = flb_sds_create(val_buf); - if (out_buf == NULL) { - flb_plg_error(ctx->ins, "Error creating buffer to store log_key contents."); - flb_errno(); + /* Construct path on stack - chunk is locked, safe to access */ + ret = snprintf(chunk_path, sizeof(chunk_path), "%s/%s", + chunk->stream_path, chunk->fsf->chunk->name); + if (ret < 0 || ret >= sizeof(chunk_path)) { + flb_plg_error(ctx->ins, "Chunk path too long"); + return -1; } - flb_free(val_buf); - return out_buf; + return stream_process_msgpack_file( + ctx, + chunk_path, + chunk->size, + ".json", + stream_json_processor, + NULL, + out_buf, + out_size + ); } -static void unit_test_flush(void *out_context, struct s3_file *upload_file, - const char *tag, int tag_len, flb_sds_t chunk, - int chunk_size, struct multipart_upload *m_upload_file, - time_t file_first_log_time) +static int s3_stream_extract_log_key(struct flb_s3 *ctx, + struct s3_file *chunk, + flb_sds_t *out_buf, + size_t *out_size) { + char chunk_path[PATH_MAX]; int ret; - char *buffer; - size_t buffer_size; - struct flb_s3 *ctx = out_context; - s3_store_buffer_put(ctx, upload_file, tag, tag_len, - chunk, (size_t) chunk_size, file_first_log_time); - ret = construct_request_buffer(ctx, chunk, upload_file, &buffer, &buffer_size); - if (ret < 0) { - flb_plg_error(ctx->ins, "Could not construct request buffer for %s", - upload_file->file_path); - FLB_OUTPUT_RETURN(FLB_RETRY); + if (!chunk || !chunk->stream_path || !chunk->fsf || + !chunk->fsf->chunk || !chunk->fsf->chunk->name) { + flb_plg_error(ctx->ins, "Invalid chunk data"); + return -1; } - ret = upload_data(ctx, upload_file, m_upload_file, buffer, buffer_size, tag, tag_len); - flb_free(buffer); + /* Construct path on stack - chunk is locked, safe to access */ + ret = snprintf(chunk_path, sizeof(chunk_path), "%s/%s", + chunk->stream_path, chunk->fsf->chunk->name); + if (ret < 0 || ret >= sizeof(chunk_path)) { + flb_plg_error(ctx->ins, "Chunk path too long"); + return -1; + } - FLB_OUTPUT_RETURN(ret); + return stream_process_msgpack_file( + ctx, + chunk_path, + chunk->size, + ".txt", + stream_log_key_processor, + NULL, + out_buf, + out_size + ); } -static void flush_init(void *out_context) +/* + * Convert chunk to upload format (JSON/Parquet). + * Returns FILE: marker pointing to temp file for streaming upload. + * Caller must unlock chunk on success (kept locked for retry on failure). 
+ */ +int s3_format_chunk(struct flb_s3 *ctx, + struct s3_file *chunk, + flb_sds_t *out_buf, size_t *out_size) { int ret; - struct flb_s3 *ctx = out_context; - struct flb_sched *sched; - /* clean up any old buffers found on startup */ - if (ctx->has_old_buffers == FLB_TRUE) { - flb_plg_info(ctx->ins, - "Sending locally buffered data from previous " - "executions to S3; buffer=%s", - ctx->fs->root_path); - ctx->has_old_buffers = FLB_FALSE; - ret = put_all_chunks(ctx); + if (chunk == NULL) { + flb_plg_error(ctx->ins, "[construct_request_buffer] chunk is NULL"); + return -1; + } + + s3_store_file_lock(chunk); + + /* For JSON format with chunk file: use streaming conversion to minimize memory */ + if (ctx->format == FLB_S3_FORMAT_JSON && !ctx->log_key) { + ret = s3_stream_to_json(ctx, chunk, out_buf, out_size); if (ret < 0) { - ctx->has_old_buffers = FLB_TRUE; - flb_plg_error(ctx->ins, - "Failed to send locally buffered data left over " - "from previous executions; will retry. Buffer=%s", - ctx->fs->root_path); - FLB_OUTPUT_RETURN(FLB_RETRY); + s3_store_file_unlock(chunk); } + return ret; } - /* - * create a timer that will run periodically and check if uploads - * are ready for completion - * this is created once on the first flush - */ - if (ctx->timer_created == FLB_FALSE) { - flb_plg_debug(ctx->ins, - "Creating upload timer with frequency %ds", - ctx->timer_ms / 1000); + /* For log_key extraction with chunk file: use streaming extraction to minimize memory */ + if (ctx->log_key) { + ret = s3_stream_extract_log_key(ctx, chunk, out_buf, out_size); + if (ret < 0) { + s3_store_file_unlock(chunk); + } + return ret; + } - sched = flb_sched_ctx_get(); + /* For Parquet format with chunk file: use streaming conversion */ + if (ctx->format == FLB_S3_FORMAT_PARQUET) { +#ifdef FLB_HAVE_PARQUET_ENCODER + char chunk_path[PATH_MAX]; + char temp_path[PATH_MAX]; + size_t parquet_file_size = 0; + flb_sds_t formatted_data; + size_t formatted_size; - if (ctx->preserve_data_ordering) { - ret = flb_sched_timer_cb_create(sched, FLB_SCHED_TIMER_CB_PERM, - ctx->timer_ms, s3_upload_queue, ctx, NULL); + if (!ctx->schema_str) { + flb_plg_error(ctx->ins, "schema_str is required when format=parquet"); + s3_store_file_unlock(chunk); + return -1; } - else { - ret = flb_sched_timer_cb_create(sched, FLB_SCHED_TIMER_CB_PERM, - ctx->timer_ms, cb_s3_upload, ctx, NULL); + + /* Validate chunk data before accessing */ + if (!chunk->stream_path || !chunk->fsf || !chunk->fsf->chunk || !chunk->fsf->chunk->name) { + flb_plg_error(ctx->ins, "Invalid chunk data for Parquet conversion"); + s3_store_file_unlock(chunk); + return -1; } - if (ret == -1) { - flb_plg_error(ctx->ins, "Failed to create upload timer"); - FLB_OUTPUT_RETURN(FLB_RETRY); + + /* Construct paths on stack - chunk is locked, safe to access */ + ret = snprintf(chunk_path, sizeof(chunk_path), "%s/%s", + chunk->stream_path, chunk->fsf->chunk->name); + if (ret < 0 || ret >= sizeof(chunk_path)) { + flb_plg_error(ctx->ins, "Chunk path too long"); + s3_store_file_unlock(chunk); + return -1; } - ctx->timer_created = FLB_TRUE; - } -} -static int blob_chunk_register_parts(struct flb_s3 *ctx, uint64_t file_id, size_t total_size) -{ - int ret; - int64_t parts = 0; - int64_t id; - size_t offset_start = 0; - size_t offset_end = 0; - - /* generate file parts */ - while (offset_start < total_size) { - offset_end = offset_start + ctx->part_size; - - /* do not exceed maximum size */ - if (offset_end > total_size) { - offset_end = total_size; + ret = snprintf(temp_path, 
sizeof(temp_path), "%s/parquet_stream_%p.parquet", + ctx->buffer_dir, (void*)chunk); + if (ret < 0 || ret >= sizeof(temp_path)) { + flb_plg_error(ctx->ins, "Temp path too long"); + s3_store_file_unlock(chunk); + return -1; } - /* insert part */ - ret = flb_blob_db_file_part_insert(&ctx->blob_db, file_id, parts, offset_start, offset_end, &id); - if (ret == -1) { - flb_plg_error(ctx->ins, "cannot insert blob file part into database"); + /* + * CRITICAL: Use streaming version with cached schema. + * This avoids yyjson stack overflow in coroutines by using + * the pre-parsed schema from initialization. + */ + ret = flb_msgpack_to_parquet_streaming( + chunk_path, + ctx->cached_arrow_schema, + ctx->compression, + temp_path, + &parquet_file_size, + ctx->file_size + ); + + if (ret < 0) { + flb_plg_error(ctx->ins, "Streaming Parquet conversion failed"); + unlink(temp_path); + s3_store_file_unlock(chunk); return -1; } - offset_start = offset_end; - parts++; - } + /* Handle empty Parquet output - mark as max failures to skip retry */ + if (parquet_file_size == 0) { + flb_plg_warn(ctx->ins, + "Parquet conversion produced 0 records from %zu bytes of input. " + "Possible causes: empty msgpack data, schema mismatch with incoming data structure, " + "or filtered out by schema. Check your schema_str configuration. " + "File will be marked as failed and cleaned up on next restart.", + chunk->size); - return parts; -} + /* Mark as max failures so it won't be retried */ + chunk->failures = ctx->ins->retry_limit; -static int process_blob_chunk(struct flb_s3 *ctx, struct flb_event_chunk *event_chunk) -{ - int64_t ret; - int64_t file_id; - cfl_sds_t file_path = NULL; - cfl_sds_t source = NULL; - size_t file_size; - msgpack_object map; + unlink(temp_path); + s3_store_file_unlock(chunk); + return -1; + } - struct flb_log_event_decoder log_decoder; - struct flb_log_event log_event; + /* Return file path marker */ + formatted_data = flb_sds_create_size(strlen(temp_path) + 6); + if (!formatted_data) { + flb_plg_error(ctx->ins, "Failed to create path marker"); + unlink(temp_path); + s3_store_file_unlock(chunk); + return -1; + } + + formatted_data = flb_sds_printf(&formatted_data, "FILE:%s", temp_path); + if (!formatted_data) { + flb_plg_error(ctx->ins, "Failed to format path marker"); + unlink(temp_path); + s3_store_file_unlock(chunk); + return -1; + } - if (ctx->blob_db.db == NULL) { - flb_plg_error(ctx->ins, "Cannot process blob because this operation requires a database."); + formatted_size = parquet_file_size; + *out_buf = formatted_data; + *out_size = formatted_size; + return 0; +#else + flb_plg_error(ctx->ins, "Parquet format not supported in this build"); + s3_store_file_unlock(chunk); return -1; +#endif } - ret = flb_log_event_decoder_init(&log_decoder, - (char *) event_chunk->data, - event_chunk->size); + /* Should not reach here - all formats should be handled above */ + flb_plg_error(ctx->ins, "Unknown format in construct_request_buffer"); + s3_store_file_unlock(chunk); + return -1; +} - if (ret != FLB_EVENT_DECODER_SUCCESS) { - flb_plg_error(ctx->ins, - "Log event decoder initialization error : %i", (int) ret); - return -1; +/* Timer callback - scans for timed-out chunks and processes upload queue */ +void cb_s3_upload(struct flb_config *config, void *data) +{ + struct flb_s3 *ctx = data; + struct s3_file *chunk; + struct flb_fstore_file *fsf; + struct upload_queue *entry; + struct mk_list *tmp; + struct mk_list *head; + time_t now; + int ret; + int uploaded = 0; + int enqueued = 0; + /* CRITICAL: Check 
exit flag at the very beginning */ + if (ctx->is_exiting == FLB_TRUE) { + flb_plg_debug(ctx->ins, "Timer callback: exit in progress, skipping"); + return; } - while (flb_log_event_decoder_next(&log_decoder, &log_event) == FLB_EVENT_DECODER_SUCCESS) { - map = *log_event.body; - ret = flb_input_blob_file_get_info(map, &source, &file_path, &file_size); - if (ret == -1) { - flb_plg_error(ctx->ins, "cannot get file info from blob record, skipping"); - continue; + /* + * Deferred recovery: Execute on first timer invocation. + * Worker initialization returns immediately, with recovery deferred to + * the first timer invocation to avoid blocking startup. + */ + if (ctx->needs_recovery == FLB_TRUE) { + ret = s3_queue_recover_all(ctx, config); + if (ret < 0) { + flb_plg_error(ctx->ins, "Recovery failed"); } + ctx->needs_recovery = FLB_FALSE; + } - ret = flb_blob_db_file_insert(&ctx->blob_db, - event_chunk->tag, - source, - ctx->endpoint, - file_path, - file_size); - - if (ret == -1) { - flb_plg_error(ctx->ins, "cannot insert blob file into database: %s (size=%lu)", - file_path, file_size); + now = time(NULL); - cfl_sds_destroy(file_path); - cfl_sds_destroy(source); + /* Scan active stream for timed-out chunks and add them to queue */ + mk_list_foreach_safe(head, tmp, &ctx->stream_active->files) { + fsf = mk_list_entry(head, struct flb_fstore_file, _head); + chunk = fsf->data; + /* Skip if not ready yet */ + if (now < (chunk->create_time + ctx->upload_timeout + ctx->retry_time)) { continue; } - /* generate the parts by using the newest id created (ret) */ - file_id = ret; - ret = blob_chunk_register_parts(ctx, file_id, file_size); - if (ret == -1) { - flb_plg_error(ctx->ins, "cannot register blob file '%s 'parts into database", - file_path); - - cfl_sds_destroy(file_path); - cfl_sds_destroy(source); - - return -1; + /* Skip locked files */ + if (chunk->locked == FLB_TRUE) { + continue; } - flb_plg_debug(ctx->ins, "blob file '%s' (id=%zu) registered with %zu parts", - file_path, file_id, ret); - + /* Check failure limit */ + if (chunk->failures >= ctx->ins->retry_limit) { + flb_plg_warn(ctx->ins, + "Chunk failed %d times, marking inactive (tag=%s)", + chunk->failures, (char*)fsf->meta_buf); + flb_fstore_file_inactive(ctx->fs, fsf); + continue; + } - cfl_sds_destroy(file_path); - cfl_sds_destroy(source); + /* Add to upload queue */ + s3_store_file_lock(chunk); + ret = s3_queue_add_file(ctx, 0, chunk, NULL, + (const char*)fsf->meta_buf, + fsf->meta_size); + if (ret == 0) { + enqueued++; + } + else { + s3_store_file_unlock(chunk); + } } - flb_log_event_decoder_destroy(&log_decoder); + /* Now process all entries in upload queue */ + pthread_mutex_lock(&ctx->upload_queue_lock); - return 0; -} + mk_list_foreach_safe(head, tmp, &ctx->upload_queue) { + /* CRITICAL: Check exit flag in each iteration to avoid blocking exit */ + if (ctx->is_exiting == FLB_TRUE) { + flb_plg_info(ctx->ins, "Timer callback: exit requested during queue processing"); + break; + } -static void cb_s3_blob_file_upload(struct flb_config *config, void *out_context) -{ - cb_s3_upload_blob(config, out_context); + entry = mk_list_entry(head, struct upload_queue, _head); - flb_sched_timer_cb_coro_return(); -} + /* Check if ready to upload */ + if (now < entry->upload_time) { + continue; + } -static int s3_timer_create(struct flb_s3 *ctx) -{ - int ret; - int64_t ms; - struct flb_sched *sched; + /* Remove from queue and unlock for upload */ + mk_list_del(&entry->_head); + pthread_mutex_unlock(&ctx->upload_queue_lock); - sched = 
flb_sched_ctx_get(); + /* Process upload (this calls the helper functions from s3_queue.c) */ + ret = s3_queue_process_entry(ctx, entry, now); - /* convert from seconds to milliseconds (scheduler needs ms) */ - ms = ctx->upload_parts_timeout * 1000; + /* Re-lock for next iteration */ + pthread_mutex_lock(&ctx->upload_queue_lock); - ret = flb_sched_timer_coro_cb_create(sched, FLB_SCHED_TIMER_CB_PERM, ms, - cb_s3_blob_file_upload, ctx, NULL); - if (ret == -1) { - flb_plg_error(ctx->ins, "failed to create upload timer"); - return -1; + if (ret == 1) { + /* Success - entry already freed */ + uploaded++; + } + else if (ret == -1) { + /* Failure - re-add to queue for retry */ + mk_list_add(&entry->_head, &ctx->upload_queue); + } + /* ret == 0 means entry was freed (invalid or retry limit reached) - do nothing */ } - return 0; + pthread_mutex_unlock(&ctx->upload_queue_lock); + + if (enqueued > 0 || uploaded > 0) { + flb_plg_info(ctx->ins, "Timer: enqueued %d, uploaded %d file(s)", + enqueued, uploaded); + } } static void cb_s3_flush(struct flb_event_chunk *event_chunk, @@ -3744,17 +1539,16 @@ static void cb_s3_flush(struct flb_event_chunk *event_chunk, flb_sds_t chunk = NULL; struct s3_file *upload_file = NULL; struct flb_s3 *ctx = out_context; - struct multipart_upload *m_upload_file = NULL; time_t file_first_log_time = 0; struct flb_log_event_decoder log_decoder; struct flb_log_event log_event; if (event_chunk->type == FLB_EVENT_TYPE_BLOBS) { /* - * For Blob types, we use the flush callback to enqueue the file, then cb_azb_blob_file_upload() - * takes care of the rest like reading the file and uploading it to S3. + * For Blob types, we use the flush callback to enqueue the file, then the upload timer + * takes care of processing the queue and uploading parts to S3. */ - ret = process_blob_chunk(ctx, event_chunk); + ret = s3_blob_process_events(ctx, event_chunk); if (ret == -1) { FLB_OUTPUT_RETURN(FLB_RETRY); } @@ -3762,31 +1556,17 @@ static void cb_s3_flush(struct flb_event_chunk *event_chunk, FLB_OUTPUT_RETURN(FLB_OK); } - /* Cleanup old buffers and initialize upload timer */ - flush_init(ctx); - - /* Process chunk */ - if (ctx->log_key) { - chunk = flb_pack_msgpack_extract_log_key(ctx, - event_chunk->data, - event_chunk->size, - config); - } - else { - chunk = flb_pack_msgpack_to_json_format(event_chunk->data, - event_chunk->size, - FLB_PACK_JSON_FORMAT_LINES, - ctx->json_date_format, - ctx->date_key, - config->json_escape_unicode); - } + /* + * Store raw msgpack data directly - format conversion happens during upload. + * This avoids repeated conversions and enables better batching for columnar formats. 
+ */ + chunk = flb_sds_create_len(event_chunk->data, event_chunk->size); if (chunk == NULL) { - flb_plg_error(ctx->ins, "Could not marshal msgpack to output string"); + flb_plg_error(ctx->ins, "Failed to create buffer for msgpack data"); FLB_OUTPUT_RETURN(FLB_ERROR); } - chunk_size = flb_sds_len(chunk); + chunk_size = event_chunk->size; - /* Get a file candidate matching the given 'tag' */ upload_file = s3_store_file_get(ctx, event_chunk->tag, flb_sds_len(event_chunk->tag)); @@ -3817,7 +1597,6 @@ static void cb_s3_flush(struct flb_event_chunk *event_chunk, flb_log_event_decoder_destroy(&log_decoder); } else { - /* Get file_first_log_time from upload_file */ file_first_log_time = upload_file->first_log_time; } @@ -3825,15 +1604,11 @@ static void cb_s3_flush(struct flb_event_chunk *event_chunk, file_first_log_time = time(NULL); } - /* Specific to unit tests, will not get called normally */ - if (s3_plugin_under_test() == FLB_TRUE) { - unit_test_flush(ctx, upload_file, - event_chunk->tag, flb_sds_len(event_chunk->tag), - chunk, chunk_size, - m_upload_file, file_first_log_time); - } + /* + * In test mode, data should be uploaded immediately for validation. + * Set upload_timeout=1s in test config to trigger immediate uploads. + */ - /* Discard upload_file if it has failed to upload retry_limit times */ if (upload_file != NULL && upload_file->failures >= ctx->ins->retry_limit) { flb_plg_warn(ctx->ins, "File with tag %s failed to send %d/%d times, will not retry", event_chunk->tag, upload_file->failures, ctx->ins->retry_limit); @@ -3841,7 +1616,6 @@ static void cb_s3_flush(struct flb_event_chunk *event_chunk, upload_file = NULL; } - /* If upload_timeout has elapsed, upload file */ if (upload_file != NULL && time(NULL) > (upload_file->create_time + ctx->upload_timeout)) { upload_timeout_check = FLB_TRUE; @@ -3849,63 +1623,133 @@ static void cb_s3_flush(struct flb_event_chunk *event_chunk, event_chunk->tag); } - m_upload_file = get_upload(ctx, - event_chunk->tag, flb_sds_len(event_chunk->tag)); - - if (m_upload_file != NULL && time(NULL) > - (m_upload_file->init_time + ctx->upload_timeout)) { - upload_timeout_check = FLB_TRUE; - flb_plg_info(ctx->ins, "upload_timeout reached for %s", event_chunk->tag); - } - - /* If total_file_size has been reached, upload file */ - if ((upload_file && upload_file->size + chunk_size > ctx->upload_chunk_size) || - (m_upload_file && m_upload_file->bytes + chunk_size > ctx->file_size)) { + if (upload_file && upload_file->size + chunk_size > ctx->file_size) { total_file_size_check = FLB_TRUE; + flb_plg_info(ctx->ins, "total_file_size reached for %s", + event_chunk->tag); } - /* File is ready for upload, upload_file != NULL prevents from segfaulting. */ if ((upload_file != NULL) && (upload_timeout_check == FLB_TRUE || total_file_size_check == FLB_TRUE)) { - if (ctx->preserve_data_ordering == FLB_TRUE) { - /* Buffer last chunk in file and lock file to prevent further changes */ - ret = buffer_chunk(ctx, upload_file, chunk, chunk_size, + /* + * Unified Worker Queue Architecture: + * - With DB: Use part-level tracking (same as blob files) + * - Without DB: Use legacy fstore-based upload + */ + + if (ctx->blob_db.db != NULL) { + /* + * CRITICAL ARCHITECTURE FIX (regular log data path): + * DO NOT create multipart upload here in flush callback (coroutine context)! + * + * Instead, just persist metadata to database: + * 1. Buffer the chunk data to temp file + * 2. Insert file record into database (PENDING state) + * 3. Register parts in database + * 4. 
Return immediately + * + * The timer callback will pick up these PENDING files and create + * multipart uploads in proper thread context. + */ + flb_sds_t msgpack_data = NULL; + size_t msgpack_size; + + /* First buffer the chunk to get the complete msgpack data */ + ret = s3_queue_buffer_chunk(ctx, upload_file, chunk, chunk_size, event_chunk->tag, flb_sds_len(event_chunk->tag), file_first_log_time); - if (ret < 0) { FLB_OUTPUT_RETURN(FLB_RETRY); } - s3_store_file_lock(upload_file); - /* Add chunk file to upload queue */ - ret = add_to_queue(ctx, upload_file, m_upload_file, - event_chunk->tag, flb_sds_len(event_chunk->tag)); + /* Read the complete buffered msgpack data */ + ret = s3_format_chunk(ctx, upload_file, &msgpack_data, &msgpack_size); if (ret < 0) { - FLB_OUTPUT_RETURN(FLB_ERROR); + flb_plg_error(ctx->ins, "Failed to read buffered chunk data"); + FLB_OUTPUT_RETURN(FLB_RETRY); } - /* Go through upload queue and return error if something went wrong */ - s3_upload_queue(config, ctx); - if (ctx->upload_queue_success == FLB_FALSE) { - ctx->upload_queue_success = FLB_TRUE; - FLB_OUTPUT_RETURN(FLB_ERROR); + /* Check if it's a FILE: marker (streaming format) */ + if (strncmp(msgpack_data, "FILE:", 5) == 0) { + /* It's already a temp file, extract the path and use it directly */ + const char *temp_file_path = msgpack_data + 5; + struct stat st; + + if (stat(temp_file_path, &st) == 0) { + /* File exists, register in database only (lightweight) */ + int64_t file_id; + int part_count; + + /* Insert file into database */ + ret = flb_blob_db_file_insert(&ctx->blob_db, event_chunk->tag, "chunk", + ctx->endpoint, (char *)temp_file_path, st.st_size); + if (ret < 0) { + flb_plg_error(ctx->ins, "Failed to insert chunk file into database"); + flb_sds_destroy(msgpack_data); + unlink(temp_file_path); + FLB_OUTPUT_RETURN(FLB_RETRY); + } + + file_id = ret; + + /* Register parts */ + part_count = s3_blob_register_parts(ctx, file_id, st.st_size); + if (part_count < 0) { + flb_plg_error(ctx->ins, "Failed to register chunk file parts"); + flb_blob_db_file_delete(&ctx->blob_db, file_id); + flb_sds_destroy(msgpack_data); + unlink(temp_file_path); + FLB_OUTPUT_RETURN(FLB_RETRY); + } + + flb_sds_destroy(msgpack_data); + + ret = s3_queue_add_pending_file(ctx, file_id, temp_file_path, + event_chunk->tag, flb_sds_len(event_chunk->tag)); + if (ret < 0) { + flb_plg_error(ctx->ins, "Failed to enqueue pending file"); + flb_blob_db_file_delete(&ctx->blob_db, file_id); + unlink(temp_file_path); + FLB_OUTPUT_RETURN(FLB_RETRY); + } + + s3_store_file_delete(ctx, upload_file); + + flb_plg_info(ctx->ins, "Chunk file registered (file_id=%" PRId64 ", parts=%d)", + file_id, part_count); + FLB_OUTPUT_RETURN(FLB_OK); + } } - FLB_OUTPUT_RETURN(FLB_OK); + + /* Fallback: shouldn't reach here in normal operation */ + flb_plg_warn(ctx->ins, "Unexpected data format in DB-tracked chunk upload"); + flb_sds_destroy(msgpack_data); + FLB_OUTPUT_RETURN(FLB_RETRY); } else { - /* Send upload directly without upload queue */ - ret = send_upload_request(ctx, chunk, upload_file, m_upload_file, - event_chunk->tag, - flb_sds_len(event_chunk->tag)); + /* Legacy fstore-based upload (no DB tracking) */ + ret = s3_queue_buffer_chunk(ctx, upload_file, chunk, chunk_size, + event_chunk->tag, flb_sds_len(event_chunk->tag), + file_first_log_time); + + if (ret < 0) { + FLB_OUTPUT_RETURN(FLB_RETRY); + } + + s3_store_file_lock(upload_file); + + /* Add to unified worker queue using unified interface (simple mode: file_id=0) */ + ret = s3_queue_add_file(ctx, 0, 
upload_file, NULL, + event_chunk->tag, flb_sds_len(event_chunk->tag)); if (ret < 0) { + s3_store_file_unlock(upload_file); FLB_OUTPUT_RETURN(FLB_ERROR); } - FLB_OUTPUT_RETURN(ret); + + FLB_OUTPUT_RETURN(FLB_OK); } } - /* Buffer current chunk in filesystem and wait for next chunk from engine */ - ret = buffer_chunk(ctx, upload_file, chunk, chunk_size, + ret = s3_queue_buffer_chunk(ctx, upload_file, chunk, chunk_size, event_chunk->tag, flb_sds_len(event_chunk->tag), file_first_log_time); @@ -3917,55 +1761,27 @@ static void cb_s3_flush(struct flb_event_chunk *event_chunk, static int cb_s3_exit(void *data, struct flb_config *config) { - int ret; struct flb_s3 *ctx = data; - struct multipart_upload *m_upload = NULL; - struct mk_list *tmp; - struct mk_list *head; if (!ctx) { return 0; } - if (s3_store_has_data(ctx) == FLB_TRUE) { - flb_plg_info(ctx->ins, "Sending all locally buffered data to S3"); - ret = put_all_chunks(ctx); - if (ret < 0) { - flb_plg_error(ctx->ins, "Could not send all chunks on exit"); - } - } - - if (s3_store_has_uploads(ctx) == FLB_TRUE) { - mk_list_foreach_safe(head, tmp, &ctx->uploads) { - m_upload = mk_list_entry(head, struct multipart_upload, _head); - - if (m_upload->upload_state == MULTIPART_UPLOAD_STATE_NOT_CREATED) { - continue; - } - - if (m_upload->bytes > 0) { - m_upload->upload_state = MULTIPART_UPLOAD_STATE_COMPLETE_IN_PROGRESS; - mk_list_del(&m_upload->_head); - ret = complete_multipart_upload(ctx, m_upload, NULL); - if (ret == 0) { - multipart_upload_destroy(m_upload); - } - else { - mk_list_add(&m_upload->_head, &ctx->uploads); - flb_plg_error(ctx->ins, "Could not complete upload %s", - m_upload->s3_key); - } - } - } - } - - if (ctx->blob_database_file != NULL && - ctx->blob_db.db != NULL) { + /* Signal shutdown */ + ctx->is_exiting = FLB_TRUE; + /* Cleanup blob database if it was initialized */ + if (ctx->blob_database_file != NULL && ctx->blob_db.db != NULL) { flb_blob_db_close(&ctx->blob_db); } + /* Cleanup upload queue mutex */ + pthread_mutex_destroy(&ctx->upload_queue_lock); + + /* Cleanup storage */ s3_store_exit(ctx); + + /* Destroy context - MUST be last as it frees ctx */ s3_context_destroy(ctx); return 0; @@ -3984,30 +1800,17 @@ static struct flb_config_map config_map[] = { "Specifies the name of the date field in output." }, { - FLB_CONFIG_MAP_SIZE, "total_file_size", "100000000", + FLB_CONFIG_MAP_SIZE, "total_file_size", "500M", 0, FLB_TRUE, offsetof(struct flb_s3, file_size), - "Specifies the size of files in S3. Maximum size is 50GB, minimum is 1MB" + "Buffer size threshold that triggers upload. When buffered data reaches this size, it is uploaded to S3. " + "Works together with upload_timeout (either condition triggers upload). Maximum: 5TB, Default: 500M." }, { - FLB_CONFIG_MAP_SIZE, "upload_chunk_size", "5242880", + FLB_CONFIG_MAP_SIZE, "upload_chunk_size", NULL, 0, FLB_TRUE, offsetof(struct flb_s3, upload_chunk_size), - "This plugin uses the S3 Multipart Upload API to stream data to S3, " - "ensuring your data gets-off-the-box as quickly as possible. " - "This parameter configures the size of each “part” in the upload. " - "The total_file_size option configures the size of the file you will see " - "in S3; this option determines the size of chunks uploaded until that " - "size is reached. These chunks are temporarily stored in chunk_buffer_path " - "until their size reaches upload_chunk_size, which point the chunk is " - "uploaded to S3. Default: 5M, Max: 50M, Min: 5M." 
- }, - - { - FLB_CONFIG_MAP_TIME, "upload_timeout", "10m", - 0, FLB_TRUE, offsetof(struct flb_s3, upload_timeout), - "Optionally specify a timeout for uploads. " - "Whenever this amount of time has elapsed, Fluent Bit will complete an " - "upload and create a new file in S3. For example, set this value to 60m " - "and you will get a new file in S3 every hour. Default is 10m." + "Part size for log data multipart uploads. Controls chunk size when uploading buffered data to S3. " + "Default: 100MiB. Automatically adjusted based on total_file_size to stay within AWS 10,000 parts limit. " + "Range: 5MiB - 5GiB. Allocates a buffer of this size per upload, larger values improve throughput but increase memory usage." }, { FLB_CONFIG_MAP_STR, "bucket", NULL, @@ -4042,11 +1845,19 @@ static struct flb_config_map config_map[] = { { FLB_CONFIG_MAP_STR, "compression", NULL, 0, FLB_FALSE, 0, - "Compression type for S3 objects. 'gzip', 'arrow', 'parquet' and 'zstd' are the supported values. " - "'arrow' and 'parquet' are only available if Apache Arrow was enabled at compile time. " - "Defaults to no compression. " - "If 'gzip' is selected, the Content-Encoding HTTP Header will be set to 'gzip'." - "If 'zstd' is selected, the Content-Encoding HTTP Header will be set to 'zstd'." + "Compression type for S3 objects. Supported values: `none`, `gzip`, `snappy`, `zstd`. Default: `none`. " + "`arrow` and `parquet` are deprecated legacy values that will set format=parquet." + }, + { + FLB_CONFIG_MAP_STR, "format", "json", + 0, FLB_FALSE, 0, + "Output format for S3 objects. Supported: json, parquet." + }, + { + FLB_CONFIG_MAP_STR, "schema_str", NULL, + 0, FLB_TRUE, offsetof(struct flb_s3, schema_str), + "JSON schema for output format. Required when `format=parquet`. " + "Example: `{\"fields\":[{\"name\":\"message\",\"type\":{\"name\":\"utf8\"}}]}`" }, { FLB_CONFIG_MAP_STR, "content_type", NULL, @@ -4058,9 +1869,9 @@ static struct flb_config_map config_map[] = { { FLB_CONFIG_MAP_STR, "store_dir", "/tmp/fluent-bit/s3", 0, FLB_TRUE, offsetof(struct flb_s3, store_dir), - "Directory to locally buffer data before sending. Plugin uses the S3 Multipart " - "upload API to send data in chunks of 5 MB at a time- only a small amount of" - " data will be locally buffered at any given point in time." + "Directory to locally buffer data before sending. The plugin buffers data locally until " + "total_file_size or upload_timeout is reached, then uploads using streaming multipart upload " + "for memory efficiency. Upload chunk size is controlled by upload_chunk_size parameter." }, { @@ -4089,7 +1900,7 @@ static struct flb_config_map config_map[] = { { FLB_CONFIG_MAP_STR, "s3_key_format_tag_delimiters", ".", 0, FLB_TRUE, offsetof(struct flb_s3, tag_delimiters), - "A series of characters which will be used to split the tag into “parts” for " + "A series of characters which will be used to split the tag into parts for " "use with the s3_key_format option. See the in depth examples and tutorial in " "the documentation." }, @@ -4106,8 +1917,8 @@ static struct flb_config_map config_map[] = { { FLB_CONFIG_MAP_BOOL, "use_put_object", "false", - 0, FLB_TRUE, offsetof(struct flb_s3, use_put_object), - "Use the S3 PutObject API, instead of the multipart upload API" + 0, FLB_FALSE, 0, + "Deprecated: This option has no effect. The plugin automatically handles uploads efficiently for all file sizes." 
}, { @@ -4118,10 +1929,10 @@ static struct flb_config_map config_map[] = { { FLB_CONFIG_MAP_BOOL, "preserve_data_ordering", "true", - 0, FLB_TRUE, offsetof(struct flb_s3, preserve_data_ordering), - "Normally, when an upload request fails, there is a high chance for the last " - "received chunk to be swapped with a later chunk, resulting in data shuffling. " - "This feature prevents this shuffling by using a queue logic for uploads." + 0, FLB_FALSE, 0, + "DEPRECATED: This parameter has no effect and will be removed in a future version. " + "The plugin now always uses an efficient background worker thread architecture that " + "maintains upload order automatically. Setting this parameter has no impact on behavior." }, { @@ -4168,27 +1979,34 @@ static struct flb_config_map config_map[] = { }, { - FLB_CONFIG_MAP_SIZE, "part_size", "25M", + FLB_CONFIG_MAP_SIZE, "part_size", NULL, 0, FLB_TRUE, offsetof(struct flb_s3, part_size), - "Size of each part when uploading blob files" + "DEPRECATED: Use 'upload_chunk_size' instead. This parameter is kept for backward compatibility. " }, { - FLB_CONFIG_MAP_INT, "file_delivery_attempt_limit", "1", + FLB_CONFIG_MAP_INT, "file_delivery_attempt_limit", "3", 0, FLB_TRUE, offsetof(struct flb_s3, file_delivery_attempt_limit), - "File delivery attempt limit" + "Maximum delivery attempts for entire file upload (including CreateMultipartUpload). " + "Handles file-level failures such as credential expiration, bucket configuration issues, " + "and complete part upload failures. Works with auto_retry_requests for comprehensive " + "error handling. Aligns with AWS SDK standard defaults. Default: 3. " + "Set to 1 for fail-fast behavior if needed." }, { - FLB_CONFIG_MAP_INT, "part_delivery_attempt_limit", "1", + FLB_CONFIG_MAP_INT, "part_delivery_attempt_limit", "5", 0, FLB_TRUE, offsetof(struct flb_s3, part_delivery_attempt_limit), - "File part delivery attempt limit" + "Maximum delivery attempts for individual parts in multipart uploads. " + "Handles transient failures for specific parts such as network timeouts and S3 throttling. " + "Should be >= file_delivery_attempt_limit for optimal reliability. Default: 5." }, { - FLB_CONFIG_MAP_TIME, "upload_parts_timeout", "10M", - 0, FLB_TRUE, offsetof(struct flb_s3, upload_parts_timeout), - "Timeout to upload parts of a blob file" + FLB_CONFIG_MAP_TIME, "upload_parts_timeout", "10m", + 0, FLB_FALSE, 0, + "DEPRECATED: This parameter has no effect and will be removed in a future version. " + "Use 'upload_timeout' instead. Setting this parameter has no impact on behavior." }, { @@ -4197,6 +2015,13 @@ static struct flb_config_map config_map[] = { "Maximum lifespan of an uncommitted file part" }, + { + FLB_CONFIG_MAP_TIME, "upload_timeout", "10m", + 0, FLB_TRUE, offsetof(struct flb_s3, upload_timeout), + "Timeout to trigger upload of buffered data. When buffered data has been waiting for this duration, " + "it will be uploaded to S3 even if total_file_size has not been reached. Default is 10m." 
+ }, + { FLB_CONFIG_MAP_STR, "authorization_endpoint_url", NULL, 0, FLB_TRUE, offsetof(struct flb_s3, authorization_endpoint_url), @@ -4215,14 +2040,14 @@ static struct flb_config_map config_map[] = { "Authorization endpoint basic authentication password" }, - { - FLB_CONFIG_MAP_STR, "authorization_endpoint_bearer_token", NULL, - 0, FLB_TRUE, offsetof(struct flb_s3, authorization_endpoint_bearer_token), - "Authorization endpoint bearer token" - }, + { + FLB_CONFIG_MAP_STR, "authorization_endpoint_bearer_token", NULL, + 0, FLB_TRUE, offsetof(struct flb_s3, authorization_endpoint_bearer_token), + "Authorization endpoint bearer token" + }, - /* EOF */ - {0} + /* EOF */ + {0} }; /* Plugin registration */ diff --git a/plugins/out_s3/s3.h b/plugins/out_s3/s3.h index fc30ff81ff7..763d9d4bcb1 100644 --- a/plugins/out_s3/s3.h +++ b/plugins/out_s3/s3.h @@ -26,51 +26,46 @@ #include #include #include +#include -/* Upload data to S3 in 5MB chunks */ -#define MIN_CHUNKED_UPLOAD_SIZE 5242880 -#define MAX_CHUNKED_UPLOAD_SIZE 50000000 -#define MAX_CHUNKED_UPLOAD_COMPRESS_SIZE 5000000000 +#define MAX_FILE_SIZE 5497558138880ULL /* 5TB (AWS S3 max object size) */ +#define MAX_FILE_SIZE_STR "5TB" -#define UPLOAD_TIMER_MAX_WAIT 60000 -#define UPLOAD_TIMER_MIN_WAIT 6000 +#define MAX_UPLOAD_ERRORS 5 -#define MULTIPART_UPLOAD_STATE_NOT_CREATED 0 -#define MULTIPART_UPLOAD_STATE_CREATED 1 -#define MULTIPART_UPLOAD_STATE_COMPLETE_IN_PROGRESS 2 +/* AWS S3 multipart upload constraints */ +#define S3_MiB 1048576ULL +#define S3_GiB (1024 * S3_MiB) +#define S3_AWS_MIN_PART_SIZE (5 * S3_MiB) +#define S3_AWS_MAX_PART_SIZE (5 * S3_GiB) +#define S3_AWS_MAX_PARTS 10000 +#define S3_DEFAULT_PART_SIZE (100 * S3_MiB) -#define DEFAULT_FILE_SIZE 100000000 -#define MAX_FILE_SIZE 50000000000 -#define MAX_FILE_SIZE_STR "50,000,000,000" +/* Multipart upload error codes */ +#define S3_MULTIPART_ERROR_GENERAL -1 +#define S3_MULTIPART_ERROR_NO_SUCH_UPLOAD -2 -/* Allowed max file size 1 GB for publishing to S3 */ -#define MAX_FILE_SIZE_PUT_OBJECT 1000000000 +struct upload_queue { + uint64_t file_id; + uint64_t part_db_id; + uint64_t part_id; -#define DEFAULT_UPLOAD_TIMEOUT 3600 + struct s3_file *upload_file; -#define MAX_UPLOAD_ERRORS 5 + flb_sds_t file_path; + off_t offset_start; + off_t offset_end; -/* - * If we see repeated errors on an upload/chunk, we will discard it - * This saves us from scenarios where something goes wrong and an upload can - * not proceed (may be some other process completed it or deleted the upload) - * instead of erroring out forever, we eventually discard the upload. - * - * The same is done for chunks, just to be safe, even though realistically - * I can't think of a reason why a chunk could become unsendable. - * - * The retry limit is now configurable via the retry_limit parameter. 
- */ + flb_sds_t s3_key; + flb_sds_t upload_id; -struct upload_queue { - struct s3_file *upload_file; - struct multipart_upload *m_upload_file; flb_sds_t tag; int tag_len; - int retry_counter; time_t upload_time; + int needs_upload_creation; + struct mk_list _head; }; @@ -78,28 +73,16 @@ struct multipart_upload { flb_sds_t s3_key; flb_sds_t tag; flb_sds_t upload_id; - int upload_state; time_t init_time; - /* - * maximum of 10,000 parts in an upload, for each we need to store mapping - * of Part Number to ETag - */ flb_sds_t etags[10000]; int part_number; - /* - * we use async http, so we need to check that all part requests have - * completed before we complete the upload - */ int parts_uploaded; - - /* ongoing tracker of how much data has been sent for this upload */ size_t bytes; struct mk_list _head; - /* see note for retry_limit configuration */ int upload_errors; int complete_errors; }; @@ -119,10 +102,9 @@ struct flb_s3 { char *profile; int free_endpoint; int retry_requests; - int use_put_object; int send_content_md5; int static_file_path; - int compression; + int compression; /* Compression type (for Parquet internal or outer layer) */ int port; int insecure; size_t store_dir_limit_size; @@ -130,7 +112,6 @@ struct flb_s3 { struct flb_blob_db blob_db; flb_sds_t blob_database_file; size_t part_size; - time_t upload_parts_timeout; time_t upload_parts_freshness_threshold; int file_delivery_attempt_limit; int part_delivery_attempt_limit; @@ -141,14 +122,11 @@ struct flb_s3 { struct flb_upstream *authorization_endpoint_upstream; struct flb_tls *authorization_endpoint_tls_context; - /* track the total amount of buffered data */ size_t current_buffer_size; struct flb_aws_provider *provider; struct flb_aws_provider *base_provider; - /* tls instances can't be re-used; aws provider requires a separate one */ struct flb_tls *provider_tls; - /* one for the standard chain provider, one for sts assume role */ struct flb_tls *sts_provider_tls; struct flb_tls *client_tls; @@ -162,22 +140,17 @@ struct flb_s3 { char *store_dir; struct flb_fstore *fs; struct flb_fstore_stream *stream_active; /* default active stream */ - struct flb_fstore_stream *stream_upload; /* multipart upload stream */ struct flb_fstore_stream *stream_metadata; /* s3 metadata stream */ - /* - * used to track that unset buffers were found on startup that have not - * been sent - */ int has_old_buffers; - /* old multipart uploads read on start up */ - int has_old_uploads; - - struct mk_list uploads; + int initial_upload_done; + int is_exiting; + int needs_recovery; int preserve_data_ordering; int upload_queue_success; struct mk_list upload_queue; + pthread_mutex_t upload_queue_lock; /* Protects upload_queue access */ size_t file_size; size_t upload_chunk_size; @@ -194,34 +167,34 @@ struct flb_s3 { flb_sds_t seq_index_file; struct flb_output_instance *ins; -}; - -int upload_part(struct flb_s3 *ctx, struct multipart_upload *m_upload, - char *body, size_t body_size, char *pre_signed_url); - -int create_multipart_upload(struct flb_s3 *ctx, - struct multipart_upload *m_upload, - char *pre_signed_url); -int complete_multipart_upload(struct flb_s3 *ctx, - struct multipart_upload *m_upload, - char *pre_signed_url); - -int abort_multipart_upload(struct flb_s3 *ctx, - struct multipart_upload *m_upload, - char *pre_signed_url); - -void multipart_read_uploads_from_fs(struct flb_s3 *ctx); - -void multipart_upload_destroy(struct multipart_upload *m_upload); + int format; + char *schema_str; + struct flb_parquet_schema *cached_arrow_schema; +}; 
-struct flb_http_client *mock_s3_call(char *error_env_var, char *api); +#define FLB_S3_FORMAT_JSON 0 +#define FLB_S3_FORMAT_PARQUET 1 + +void cb_s3_upload(struct flb_config *config, void *data); +int s3_format_chunk(struct flb_s3 *ctx, + struct s3_file *chunk, + flb_sds_t *out_buf, size_t *out_size); +int s3_upload_file(struct flb_s3 *ctx, + flb_sds_t body, size_t body_size, + const char *tag, int tag_len, + time_t file_first_log_time); + +/* Orchestration: initiate multipart upload and enqueue parts */ +int s3_initiate_multipart_upload(struct flb_s3 *ctx, + uint64_t file_id, + const char *file_path, + const char *tag, + int tag_len); + +/* Mock functions for testing */ +struct flb_http_client *mock_s3_call(char *error_env_var, char *api, + const char *body, size_t body_size); int s3_plugin_under_test(); -int get_md5_base64(char *buf, size_t buf_size, char *md5_str, size_t md5_str_size); - -int create_headers(struct flb_s3 *ctx, char *body_md5, - struct flb_aws_header **headers, int *num_headers, - int multipart_upload); - #endif diff --git a/plugins/out_s3/s3_auth.c b/plugins/out_s3/s3_auth.c new file mode 100644 index 00000000000..06bb78c0fd6 --- /dev/null +++ b/plugins/out_s3/s3_auth.c @@ -0,0 +1,400 @@ +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ + +/* Fluent Bit + * ========== + * Copyright (C) 2015-2024 The Fluent Bit Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +#include "s3.h" +#include "s3_auth.h" + +struct url_parts { + char *scheme; + char *host; + char *port; + char *uri; +}; + +static void url_parts_destroy(struct url_parts *parts) +{ + if (!parts) { + return; + } + + if (parts->scheme) { + flb_free(parts->scheme); + } + if (parts->host) { + flb_free(parts->host); + } + if (parts->port) { + flb_free(parts->port); + } + if (parts->uri) { + flb_free(parts->uri); + } +} + +static int parse_url(struct flb_s3 *ctx, const char *url, struct url_parts *parts) +{ + int ret; + + memset(parts, 0, sizeof(struct url_parts)); + + ret = flb_utils_url_split(url, &parts->scheme, &parts->host, + &parts->port, &parts->uri); + if (ret == -1) { + url_parts_destroy(parts); + flb_plg_error(ctx->ins, "Invalid URL: %s", url); + return -1; + } + + if (!parts->host || !parts->uri) { + url_parts_destroy(parts); + flb_plg_error(ctx->ins, "Invalid URL (missing host or path): %s", url); + return -1; + } + + return 0; +} + +int s3_auth_init_endpoint(struct flb_s3 *ctx) +{ + struct url_parts parts; + struct flb_upstream *upstream = NULL; + struct flb_tls *tls_context = NULL; + int ret; + + ctx->authorization_endpoint_upstream = NULL; + ctx->authorization_endpoint_tls_context = NULL; + + ret = parse_url(ctx, ctx->authorization_endpoint_url, &parts); + if (ret == -1) { + return -1; + } + + /* Determine if HTTPS is used */ + int use_https = (parts.scheme && strcasecmp(parts.scheme, "https") == 0); + int io_flags = use_https ? 
FLB_IO_TLS : FLB_IO_TCP; + + /* Create TLS context only for HTTPS endpoints */ + if (use_https) { + tls_context = flb_tls_create(FLB_TLS_CLIENT_MODE, FLB_TRUE, FLB_FALSE, + parts.host, NULL, NULL, NULL, NULL, NULL); + if (!tls_context) { + flb_plg_error(ctx->ins, "TLS context creation error"); + url_parts_destroy(&parts); + return -1; + } + } + + upstream = flb_upstream_create_url(ctx->ins->config, + ctx->authorization_endpoint_url, + io_flags, tls_context); + if (!upstream) { + flb_plg_error(ctx->ins, "Upstream creation error"); + if (tls_context) { + flb_tls_destroy(tls_context); + } + url_parts_destroy(&parts); + return -1; + } + + flb_output_upstream_set(upstream, ctx->ins); + + ctx->authorization_endpoint_upstream = upstream; + ctx->authorization_endpoint_tls_context = tls_context; + + url_parts_destroy(&parts); + return 0; +} + +static uint16_t get_port_from_url(const char *scheme, const char *port_str) +{ + if (port_str) { + char *endptr; + unsigned long port_val; + + errno = 0; + port_val = strtoul(port_str, &endptr, 10); + + /* Validate conversion: must convert entire string, no overflow, valid port range */ + if (endptr != port_str && *endptr == '\0' && + errno != ERANGE && port_val >= 1 && port_val <= 65535) { + return (uint16_t) port_val; + } + + /* Conversion failed, fall back to scheme-based default */ + } + + if (scheme && strcasecmp(scheme, "https") == 0) { + return 443; + } + + return 80; +} + +static int setup_http_client_headers(struct flb_s3 *ctx, + struct flb_http_client *client) +{ + flb_http_add_header(client, "Accept", 6, "text/plain", 10); + flb_http_add_header(client, "User-Agent", 10, "Fluent-Bit", 10); + + if (ctx->authorization_endpoint_username && + ctx->authorization_endpoint_password) { + flb_http_basic_auth(client, ctx->authorization_endpoint_username, + ctx->authorization_endpoint_password); + } + else if (ctx->authorization_endpoint_bearer_token) { + flb_http_bearer_auth(client, ctx->authorization_endpoint_bearer_token); + } + + return 0; +} + +int s3_auth_request_presigned_url(struct flb_s3 *ctx, + flb_sds_t *result_url, + char *url) +{ + struct url_parts parts; + struct flb_connection *connection = NULL; + struct flb_http_client *http_client = NULL; + uint16_t port; + size_t b_sent; + flb_sds_t tmp; + int ret; + + ret = parse_url(ctx, url, &parts); + if (ret == -1) { + return -1; + } + + port = get_port_from_url(parts.scheme, parts.port); + + connection = flb_upstream_conn_get(ctx->authorization_endpoint_upstream); + if (!connection) { + flb_plg_error(ctx->ins, "Cannot create connection"); + ret = -1; + goto cleanup; + } + + http_client = flb_http_client(connection, FLB_HTTP_GET, parts.uri, + NULL, 0, parts.host, (int) port, NULL, 0); + if (!http_client) { + flb_plg_error(ctx->ins, "Cannot create HTTP client"); + ret = -1; + goto cleanup; + } + + setup_http_client_headers(ctx, http_client); + + ret = flb_http_do(http_client, &b_sent); + if (ret == -1) { + flb_plg_error(ctx->ins, "Error sending configuration request"); + goto cleanup; + } + + if (http_client->resp.status != 200) { + if (http_client->resp.payload_size > 0) { + flb_plg_error(ctx->ins, + "Pre-signed URL retrieval failed with status %i\n%s", + http_client->resp.status, http_client->resp.payload); + } + else { + flb_plg_error(ctx->ins, + "Pre-signed URL retrieval failed with status %i", + http_client->resp.status); + } + ret = -1; + goto cleanup; + } + + flb_plg_info(ctx->ins, "Pre-signed URL retrieved successfully"); + + if (*result_url) { + tmp = flb_sds_copy(*result_url, 
http_client->resp.payload, + http_client->resp.payload_size); + } + else { + tmp = flb_sds_create_len(http_client->resp.payload, + http_client->resp.payload_size); + } + + if (!tmp) { + flb_plg_error(ctx->ins, "Pre-signed URL duplication error"); + ret = -1; + goto cleanup; + } + + *result_url = tmp; + ret = 0; + +cleanup: + if (http_client) { + flb_http_client_destroy(http_client); + } + if (connection) { + flb_upstream_conn_release(connection); + } + url_parts_destroy(&parts); + + return ret; +} + +static flb_sds_t build_presigned_url_path(struct flb_s3 *ctx, + int url_type, + const char *s3_key, + const char *upload_id, + int part_number) +{ + flb_sds_t encoded_key = NULL; + flb_sds_t encoded_id = NULL; + flb_sds_t path = NULL; + flb_sds_t tmp; + + encoded_key = flb_uri_encode(s3_key, strlen(s3_key)); + if (!encoded_key) { + flb_plg_error(ctx->ins, "Failed to URL encode S3 key: %s", s3_key); + return NULL; + } + + if (upload_id) { + encoded_id = flb_uri_encode(upload_id, strlen(upload_id)); + if (!encoded_id) { + flb_plg_error(ctx->ins, "Failed to URL encode upload_id"); + flb_sds_destroy(encoded_key); + return NULL; + } + } + + path = flb_sds_create_size(512); + if (!path) { + flb_errno(); + goto error; + } + + /* Strip leading '/' from encoded_key if present */ + const char *key_to_use = encoded_key; + if (encoded_key[0] == '/') { + key_to_use = encoded_key + 1; + } + + switch (url_type) { + case S3_PRESIGNED_URL_CREATE_MULTIPART: + tmp = flb_sds_printf(&path, "/multipart_creation_presigned_url/%s/%s", + ctx->bucket, key_to_use); + break; + + case S3_PRESIGNED_URL_UPLOAD_PART: + if (!encoded_id) { + goto error; + } + tmp = flb_sds_printf(&path, "/multipart_upload_presigned_url/%s/%s/%s/%d", + ctx->bucket, key_to_use, encoded_id, part_number); + break; + + case S3_PRESIGNED_URL_COMPLETE_MULTIPART: + if (!encoded_id) { + goto error; + } + tmp = flb_sds_printf(&path, "/multipart_complete_presigned_url/%s/%s/%s", + ctx->bucket, key_to_use, encoded_id); + break; + + case S3_PRESIGNED_URL_ABORT_MULTIPART: + if (!encoded_id) { + goto error; + } + tmp = flb_sds_printf(&path, "/multipart_abort_presigned_url/%s/%s/%s", + ctx->bucket, key_to_use, encoded_id); + break; + + default: + flb_plg_error(ctx->ins, "Unknown URL type: %d", url_type); + goto error; + } + + if (!tmp) { + goto error; + } + + flb_sds_destroy(encoded_key); + if (encoded_id) { + flb_sds_destroy(encoded_id); + } + + return tmp; + +error: + if (path) { + flb_sds_destroy(path); + } + if (encoded_key) { + flb_sds_destroy(encoded_key); + } + if (encoded_id) { + flb_sds_destroy(encoded_id); + } + return NULL; +} + +int s3_auth_fetch_presigned_url(struct flb_s3 *ctx, + flb_sds_t *result_url, + int url_type, + const char *s3_key, + const char *upload_id, + int part_number) +{ + flb_sds_t url_path = NULL; + flb_sds_t full_url = NULL; + int ret; + + if (!ctx->authorization_endpoint_url) { + *result_url = NULL; + return 0; + } + + url_path = build_presigned_url_path(ctx, url_type, s3_key, upload_id, part_number); + if (!url_path) { + return -1; + } + + full_url = flb_sds_create_size( + flb_sds_len(ctx->authorization_endpoint_url) + flb_sds_len(url_path) + 1); + if (!full_url) { + flb_sds_destroy(url_path); + return -1; + } + + full_url = flb_sds_printf(&full_url, "%s%s", + ctx->authorization_endpoint_url, url_path); + if (!full_url) { + flb_sds_destroy(url_path); + return -1; + } + + ret = s3_auth_request_presigned_url(ctx, result_url, full_url); + + flb_sds_destroy(url_path); + flb_sds_destroy(full_url); + + return ret; +} diff --git 
a/plugins/out_s3/s3_auth.h b/plugins/out_s3/s3_auth.h new file mode 100644 index 00000000000..81d97ae15ee --- /dev/null +++ b/plugins/out_s3/s3_auth.h @@ -0,0 +1,49 @@ +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ + +/* Fluent Bit + * ========== + * Copyright (C) 2015-2024 The Fluent Bit Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef FLB_OUT_S3_AUTH_H +#define FLB_OUT_S3_AUTH_H + +#include "s3.h" + +/* Unified presigned URL types */ +typedef enum { + S3_PRESIGNED_URL_CREATE_MULTIPART, + S3_PRESIGNED_URL_UPLOAD_PART, + S3_PRESIGNED_URL_COMPLETE_MULTIPART, + S3_PRESIGNED_URL_ABORT_MULTIPART +} s3_presigned_url_type_t; + +/* Initialize authorization endpoint upstream connection */ +int s3_auth_init_endpoint(struct flb_s3 *ctx); + +/* Request presigned URL from authorization endpoint */ +int s3_auth_request_presigned_url(struct flb_s3 *ctx, + flb_sds_t *result_url, + char *url); + +/* Unified presigned URL fetcher - works for both standard and blob uploads */ +int s3_auth_fetch_presigned_url(struct flb_s3 *ctx, + flb_sds_t *result_url, + int url_type, + const char *s3_key, + const char *upload_id, + int part_number); + +#endif diff --git a/plugins/out_s3/s3_blob.c b/plugins/out_s3/s3_blob.c new file mode 100644 index 00000000000..59d309f5e9d --- /dev/null +++ b/plugins/out_s3/s3_blob.c @@ -0,0 +1,448 @@ +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ + +/* Fluent Bit + * ========== + * Copyright (C) 2015-2024 The Fluent Bit Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "s3.h" +#include "s3_multipart.h" +#include "s3_blob.h" +#include "s3_store.h" +#include "s3_stream.h" +#include "s3_auth.h" +#include "s3_queue.h" + +/* Forward declarations */ +static int recover_stale_files(struct flb_s3 *ctx); +static int handle_aborted_files(struct flb_s3 *ctx, struct flb_config *config); + + +static int abort_multipart_upload(struct flb_s3 *ctx, + cfl_sds_t file_tag, + cfl_sds_t file_path, + cfl_sds_t file_remote_id) +{ + struct multipart_upload *m_upload; + flb_sds_t pre_signed_url = NULL; + int ret; + + m_upload = s3_multipart_upload_create(ctx, file_tag, cfl_sds_len(file_tag), file_path); + if (!m_upload) { + return -1; + } + + m_upload->upload_id = flb_sds_create(file_remote_id); + if (!m_upload->upload_id) { + flb_plg_error(ctx->ins, "Could not allocate upload id copy"); + s3_multipart_upload_destroy(m_upload); + return -1; + } + + ret = s3_auth_fetch_presigned_url(ctx, &pre_signed_url, + S3_PRESIGNED_URL_ABORT_MULTIPART, + m_upload->s3_key, m_upload->upload_id, 0); + if (ret < 0) { + s3_multipart_upload_destroy(m_upload); + return -1; + } + + ret = s3_multipart_abort(ctx, m_upload, pre_signed_url); + flb_sds_destroy(pre_signed_url); + s3_multipart_upload_destroy(m_upload); + + return ret; +} + +int s3_blob_notify_delivery(struct flb_s3 *ctx, + struct flb_config *config, + cfl_sds_t source, + cfl_sds_t file_path, + uint64_t file_id, + int success) +{ + struct flb_blob_delivery_notification *notification; + int ret; + + notification = flb_calloc(1, sizeof(struct flb_blob_delivery_notification)); + if (!notification) { + flb_plg_error(ctx->ins, "failed to allocate delivery notification"); + return -1; + } + + notification->base.dynamically_allocated = FLB_TRUE; + notification->base.notification_type = FLB_NOTIFICATION_TYPE_BLOB_DELIVERY; + notification->base.destructor = flb_input_blob_delivery_notification_destroy; + notification->success = success; + notification->path = cfl_sds_create(file_path); + if (!notification->path) { + flb_plg_error(ctx->ins, "failed to allocate path for delivery notification"); + flb_free(notification); + return -1; + } + + ret = flb_notification_enqueue(FLB_PLUGIN_INPUT, source, + ¬ification->base, config); + if (ret != 0) { + flb_plg_error(ctx->ins, "notification delivery failed for '%s' (id=%" PRIu64 ")", + file_path, file_id); + flb_notification_cleanup(¬ification->base); + return -1; + } + + return 0; +} + +/* + * Phase 2: State transitions for special states (STALE, ABORTED) + * This function handles states that need special operations before being re-queued: + * - STALE: Files with old last_delivery_attempt, may need multipart abort + * - ABORTED: Files that failed upload, need retry decision + * + * Note: This does NOT enqueue files. Phase 3 (rebuild_queue_from_storage) handles that. 
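+ *
+ * For illustration, the transitions handled in this phase are roughly:
+ *
+ *   STALE   -> abort the remote multipart upload (if any), clear the
+ *              remote_id and reset part states, back to PENDING
+ *   ABORTED -> below the delivery attempt limit: validate or reset the
+ *              upload_id and clear the aborted flag (PENDING again);
+ *              otherwise: abort remotely, delete the record and emit a
+ *              failure notification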
+ */ +int s3_blob_recover_state(struct flb_s3 *ctx, struct flb_config *config) +{ + if (!ctx->blob_db.db) { + return 0; + } + + flb_plg_debug(ctx->ins, "recovery: phase 2 - processing special states (stale/aborted)"); + + flb_blob_db_lock(&ctx->blob_db); + + /* Handle STALE → PENDING transitions */ + recover_stale_files(ctx); + + /* Handle ABORTED → PENDING or DELETE transitions */ + handle_aborted_files(ctx, config); + + flb_blob_db_unlock(&ctx->blob_db); + + return 0; +} + +static int recover_stale_files(struct flb_s3 *ctx) +{ + uint64_t file_id; + cfl_sds_t file_path = NULL; + cfl_sds_t file_remote_id = NULL; + cfl_sds_t file_tag = NULL; + int part_count; + int ret; + int stale_count = 0; + + while (1) { + ret = flb_blob_db_file_get_next_stale(&ctx->blob_db, &file_id, &file_path, + ctx->upload_parts_freshness_threshold, + &file_remote_id, &file_tag, &part_count); + + if (ret != 1) { + break; + } + + flb_plg_info(ctx->ins, "Stale file detected, resetting upload state " + "(file_id=%" PRIu64 ", parts=%d)", file_id, part_count); + + if (part_count > 1) { + abort_multipart_upload(ctx, file_tag, file_path, file_remote_id); + } + + flb_blob_file_update_remote_id(&ctx->blob_db, file_id, ""); + flb_blob_db_file_reset_upload_states(&ctx->blob_db, file_id); + flb_blob_db_file_set_aborted_state(&ctx->blob_db, file_id, 0); + + cfl_sds_destroy(file_remote_id); + cfl_sds_destroy(file_path); + cfl_sds_destroy(file_tag); + + file_remote_id = NULL; + file_path = NULL; + file_tag = NULL; + stale_count++; + } + + if (stale_count > 0) { + flb_plg_info(ctx->ins, "Recovered %d stale file(s)", stale_count); + } + + return 0; +} + +static int handle_aborted_files(struct flb_s3 *ctx, struct flb_config *config) +{ + uint64_t file_id; + uint64_t file_delivery_attempts; + cfl_sds_t file_path = NULL; + cfl_sds_t source = NULL; + cfl_sds_t file_remote_id = NULL; + cfl_sds_t file_tag = NULL; + int part_count; + int ret; + int upload_valid; + int aborted_count = 0; + int retry_resume_count = 0; + int retry_fresh_count = 0; + int discarded_count = 0; + + while (1) { + ret = flb_blob_db_file_get_next_aborted(&ctx->blob_db, &file_id, + &file_delivery_attempts, + &file_path, &source, + &file_remote_id, &file_tag, + &part_count); + + if (ret != 1) { + break; + } + + aborted_count++; + + if (ctx->file_delivery_attempt_limit != FLB_OUT_RETRY_UNLIMITED && + file_delivery_attempts < ctx->file_delivery_attempt_limit) { + /* Distinguish between two retry scenarios */ + if (file_remote_id && strlen(file_remote_id) > 0) { + /* Scenario A: Has upload_id - validate before deciding */ + /* + * KNOWN LIMITATION: Generate S3 key for validation + * If s3_key_format contains dynamic time variables (e.g., %Y/%m/%d) and recovery + * happens after a date boundary, the regenerated key may differ from the original, + * causing false "NoSuchUpload" and unnecessary file re-upload. + * + * To fully resolve this would require storing the actual S3 key string in the database, + * which is not currently supported. Users should use static key formats for reliable + * recovery, or accept potential re-uploads when using dynamic time-based keys. 
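+ *
+ * Hypothetical example: with s3_key_format "/logs/%Y/%m/%d/$TAG", a part
+ * uploaded under ".../2024/01/31/app" but validated after midnight is
+ * checked against ".../2024/02/01/app", so ListParts reports NoSuchUpload
+ * and the file is re-uploaded from scratch even though the original
+ * upload still exists.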
+ */ + flb_sds_t s3_key = flb_get_s3_key(ctx->s3_key_format, time(NULL), file_tag, + ctx->tag_delimiters, ctx->seq_index, file_path); + + if (!s3_key) { + flb_plg_error(ctx->ins, "Failed to generate S3 key for validation"); + /* Treat as invalid upload_id to be safe */ + upload_valid = 0; + } else { + flb_plg_debug(ctx->ins, "Validating upload_id for file_id=%" PRIu64, file_id); + upload_valid = s3_multipart_check_upload_exists(ctx, s3_key, file_remote_id); + flb_sds_destroy(s3_key); + } + + if (upload_valid == 1) { + /* Upload ID is valid - keep for resume */ + flb_plg_info(ctx->ins, + "Upload ID validated (still exists), will resume upload " + "(file_id=%" PRIu64 ")", file_id); + retry_resume_count++; + } + else if (upload_valid == 0) { + /* Upload ID is invalid - reset to fresh start */ + flb_plg_info(ctx->ins, + "Upload ID no longer valid (expired or aborted), " + "will create fresh upload (file_id=%" PRIu64 ")", file_id); + + flb_blob_file_update_remote_id(&ctx->blob_db, file_id, ""); + flb_blob_db_file_reset_upload_states(&ctx->blob_db, file_id); + retry_fresh_count++; + } + else { + /* Network error or validation failed - treat conservatively */ + flb_plg_warn(ctx->ins, + "Cannot validate upload_id (network error), " + "assuming invalid for safety (file_id=%" PRIu64 ")", file_id); + + flb_blob_file_update_remote_id(&ctx->blob_db, file_id, ""); + flb_blob_db_file_reset_upload_states(&ctx->blob_db, file_id); + retry_fresh_count++; + } + } + else { + /* Scenario B: No upload_id - fresh start needed */ + /* Reset all parts to start fresh */ + flb_blob_db_file_reset_upload_states(&ctx->blob_db, file_id); + retry_fresh_count++; + } + + /* Clear aborted flag to allow retry */ + flb_blob_db_file_set_aborted_state(&ctx->blob_db, file_id, 0); + } + else { + discarded_count++; + + /* Abort the multipart upload before deleting */ + if (part_count > 1 && file_remote_id && strlen(file_remote_id) > 0) { + abort_multipart_upload(ctx, file_tag, file_path, file_remote_id); + } + + flb_blob_db_file_delete(&ctx->blob_db, file_id); + s3_blob_notify_delivery(ctx, config, source, file_path, file_id, FLB_FALSE); + } + + cfl_sds_destroy(file_remote_id); + cfl_sds_destroy(file_path); + cfl_sds_destroy(source); + cfl_sds_destroy(file_tag); + + file_remote_id = NULL; + file_path = NULL; + source = NULL; + file_tag = NULL; + } + + if (aborted_count > 0) { + flb_plg_info(ctx->ins, + "Processed %d aborted file(s): %d resume (valid upload_id), " + "%d fresh start (invalid/no upload_id), %d discarded", + aborted_count, retry_resume_count, retry_fresh_count, discarded_count); + } + + return 0; +} + +int s3_blob_register_parts(struct flb_s3 *ctx, uint64_t file_id, size_t total_size) +{ + size_t offset_start = 0; + size_t offset_end; + size_t actual_part_size; + int64_t parts = 0; + int64_t id; + int ret; + + /* Use unified upload_chunk_size parameter for all upload types */ + actual_part_size = flb_s3_calculate_optimal_part_size( + ctx->upload_chunk_size, + total_size + ); + + while (offset_start < total_size) { + offset_end = offset_start + actual_part_size; + if (offset_end > total_size) { + offset_end = total_size; + } + + ret = flb_blob_db_file_part_insert(&ctx->blob_db, file_id, parts, + offset_start, offset_end, &id); + if (ret == -1) { + flb_plg_error(ctx->ins, "cannot insert blob file part into database"); + return -1; + } + + offset_start = offset_end; + parts++; + } + + return parts; +} + +/* + * Process blob events in flush callback + * + * ARCHITECTURE FIX: + * The flush callback runs in a coroutine 
context with limited stack (37KB). + * We should ONLY do lightweight operations here: + * 1. Parse event and extract metadata + * 2. Persist metadata to database + * 3. Return immediately + * + * Heavy operations (CreateMultipartUpload, API calls) should be deferred + * to the timer callback which runs in a proper thread context. + */ +int s3_blob_process_events(struct flb_s3 *ctx, struct flb_event_chunk *event_chunk) +{ + struct flb_log_event_decoder log_decoder; + struct flb_log_event log_event; + cfl_sds_t file_path = NULL; + cfl_sds_t source = NULL; + size_t file_size; + int64_t file_id; + msgpack_object map; + int ret; + int processed = 0; + + if (!ctx->blob_db.db) { + flb_plg_error(ctx->ins, "Cannot process blob without database"); + return -1; + } + + ret = flb_log_event_decoder_init(&log_decoder, (char *)event_chunk->data, + event_chunk->size); + if (ret != FLB_EVENT_DECODER_SUCCESS) { + flb_plg_error(ctx->ins, "Log event decoder initialization error: %i", ret); + return -1; + } + + while (flb_log_event_decoder_next(&log_decoder, &log_event) == + FLB_EVENT_DECODER_SUCCESS) { + map = *log_event.body; + ret = flb_input_blob_file_get_info(map, &source, &file_path, &file_size); + if (ret == -1) { + flb_plg_error(ctx->ins, "cannot get file info from blob record"); + continue; + } + + /* 1. Insert file metadata into database */ + ret = flb_blob_db_file_insert(&ctx->blob_db, event_chunk->tag, source, + ctx->endpoint, file_path, file_size); + if (ret == -1) { + flb_plg_error(ctx->ins, "cannot insert blob file: %s (size=%lu)", + file_path, file_size); + cfl_sds_destroy(file_path); + cfl_sds_destroy(source); + continue; + } + + file_id = ret; + + /* 2. Register parts for this file */ + ret = s3_blob_register_parts(ctx, file_id, file_size); + if (ret == -1) { + flb_plg_error(ctx->ins, "cannot register blob file parts: %s", file_path); + flb_blob_db_file_delete(&ctx->blob_db, file_id); + cfl_sds_destroy(file_path); + cfl_sds_destroy(source); + continue; + } + + ret = s3_queue_add_pending_file(ctx, file_id, file_path, + event_chunk->tag, strlen(event_chunk->tag)); + if (ret < 0) { + flb_plg_error(ctx->ins, "Failed to enqueue pending file"); + flb_blob_db_file_delete(&ctx->blob_db, file_id); + cfl_sds_destroy(file_path); + cfl_sds_destroy(source); + continue; + } + + cfl_sds_destroy(file_path); + cfl_sds_destroy(source); + file_path = NULL; + source = NULL; + processed++; + } + + flb_log_event_decoder_destroy(&log_decoder); + + if (processed > 0) { + flb_plg_debug(ctx->ins, "Registered %d blob file(s), will upload via timer", processed); + } + + return 0; +} diff --git a/plugins/out_s3/s3_blob.h b/plugins/out_s3/s3_blob.h new file mode 100644 index 00000000000..5a681720d6f --- /dev/null +++ b/plugins/out_s3/s3_blob.h @@ -0,0 +1,43 @@ +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ + +/* Fluent Bit + * ========== + * Copyright (C) 2015-2024 The Fluent Bit Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef FLB_OUT_S3_BLOB_H +#define FLB_OUT_S3_BLOB_H + +#include +#include "s3.h" + +/* Register blob file parts in database */ +int s3_blob_register_parts(struct flb_s3 *ctx, uint64_t file_id, size_t total_size); + +/* Process blob chunk event */ +int s3_blob_process_events(struct flb_s3 *ctx, struct flb_event_chunk *event_chunk); + +/* Recovery: process and cleanup stale/aborted files */ +int s3_blob_recover_state(struct flb_s3 *ctx, struct flb_config *config); + +/* Send delivery notification to input plugin */ +int s3_blob_notify_delivery(struct flb_s3 *ctx, + struct flb_config *config, + cfl_sds_t source, + cfl_sds_t file_path, + uint64_t file_id, + int success); + +#endif diff --git a/plugins/out_s3/s3_multipart.c b/plugins/out_s3/s3_multipart.c index 7ad7b2095b1..4d574fdc50f 100644 --- a/plugins/out_s3/s3_multipart.c +++ b/plugins/out_s3/s3_multipart.c @@ -26,774 +26,1284 @@ #include #include #include +#include +#include +#include #include +#include +#include + +#ifdef _WIN32 +#include +#include +#include +/* Windows-specific macros for file I/O */ +#define flb_open _open +#define flb_close _close +#define flb_read _read +#define flb_lseek _lseeki64 +#define flb_stat _stat64 +#define flb_fstat _fstat64 +/* Cross-platform sleep wrapper */ +static inline void sleep_ms(int milliseconds) { + Sleep(milliseconds); +} +#else +#include +/* POSIX file I/O macros */ +#define flb_open open +#define flb_close close +#define flb_read read +#define flb_lseek lseek +#define flb_stat stat +#define flb_fstat fstat +/* Cross-platform sleep wrapper */ +static inline void sleep_ms(int milliseconds) { + usleep(milliseconds * 1000); +} +#endif #include "s3.h" +#include "s3_multipart.h" #include "s3_store.h" - -#define COMPLETE_MULTIPART_UPLOAD_BASE_LEN 100 -#define COMPLETE_MULTIPART_UPLOAD_PART_LEN 124 - -flb_sds_t get_etag(char *response, size_t size); - -static inline int try_to_write(char *buf, int *off, size_t left, - const char *str, size_t str_len) +#include "s3_auth.h" + +#define S3_MD5_BASE64_BUFFER_SIZE 25 +#define S3_PART_NUMBER_BUFFER_SIZE 11 +#define S3_XML_NAMESPACE "http://s3.amazonaws.com/doc/2006-03-01/" + +static struct flb_aws_header content_type_header = { + .key = "Content-Type", + .key_len = 12, + .val = "", + .val_len = 0, +}; + +static struct flb_aws_header canned_acl_header = { + .key = "x-amz-acl", + .key_len = 9, + .val = "", + .val_len = 0, +}; + +static struct flb_aws_header content_md5_header = { + .key = "Content-MD5", + .key_len = 11, + .val = "", + .val_len = 0, +}; + +static struct flb_aws_header storage_class_header = { + .key = "x-amz-storage-class", + .key_len = 19, + .val = "", + .val_len = 0, +}; + +extern int write_seq_index(char *seq_index_file, uint64_t seq_index); + +static flb_sds_t extract_etag(char *response, size_t size) { - if (str_len <= 0){ - str_len = strlen(str); - } - if (left <= *off+str_len) { - return FLB_FALSE; + char *tmp; + int start; + int end; + int i = 0; + flb_sds_t etag; + + if (response == NULL) { + return NULL; } - memcpy(buf+*off, str, str_len); - *off += str_len; - return FLB_TRUE; -} + tmp = strstr(response, "ETag:"); + if (!tmp) { + return NULL; + } + i = tmp - response; + i += 5; -/* the 'tag' or key in the upload_dir is s3_key + \n + upload_id */ -static flb_sds_t upload_key(struct multipart_upload *m_upload) -{ - flb_sds_t key; - flb_sds_t tmp; + while (i < size && (response[i] == '\"' || isspace(response[i]) != 0)) { + i++; + } + start = i; - key = flb_sds_create_size(64); + while (i < size && (response[i] != '\"' && 
isspace(response[i]) == 0)) { + i++; + } + end = i; - tmp = flb_sds_printf(&key, "%s\n%s", m_upload->s3_key, m_upload->upload_id); - if (!tmp) { + etag = flb_sds_create_len(response + start, end - start); + if (!etag) { flb_errno(); - flb_sds_destroy(key); return NULL; } - key = tmp; - return key; + return etag; } -/* the 'tag' or key in the upload_dir is s3_key + \n + upload_id */ -static int upload_data_from_key(struct multipart_upload *m_upload, char *key) +/* + * Calculate optimal part size for S3 multipart upload. + * + * NOTE: This function enforces AWS S3 multipart constraints where each part + * (except the last) must be at least 5 MiB. For small files (< 5 MiB), this + * function will still return 5 MiB even though only one part is needed, which + * means the "small file optimization" logic does not actually reduce the part + * size below the AWS minimum. + * + * Returns the optimal part size considering: + * - User configuration + * - File size to avoid exceeding 10000 parts limit + * - AWS S3 hard limits (5 MiB minimum, 5 GiB maximum per part) + */ +size_t flb_s3_calculate_optimal_part_size(size_t configured_part_size, + size_t file_size) { - flb_sds_t tmp_sds; - int len = 0; - int original_len; - char *tmp; + size_t part_size; + size_t min_required_chunk; + size_t estimated_parts; - original_len = strlen(key); + /* Step 1: Determine initial part_size */ + if (configured_part_size > 0) { + part_size = configured_part_size; + } + else if (file_size > 0 && file_size <= S3_DEFAULT_PART_SIZE) { + /* Start with file_size for small files, will be clamped to AWS minimum below */ + part_size = file_size; + } + else { + part_size = S3_DEFAULT_PART_SIZE; + } - tmp = strchr(key, '\n'); - if (!tmp) { - return -1; + /* Step 2: Adjust if file_size is known and would exceed parts limit */ + if (file_size > 0) { + estimated_parts = (file_size + part_size - 1) / part_size; + + if (estimated_parts > S3_AWS_MAX_PARTS) { + /* Calculate minimum required chunk size */ + min_required_chunk = (file_size + S3_AWS_MAX_PARTS - 1) / S3_AWS_MAX_PARTS; + + /* Choose larger of min_required or current part_size */ + if (min_required_chunk > part_size) { + part_size = min_required_chunk; + + /* Round up to next MiB for sizes < 1 GiB */ + if (part_size < S3_GiB) { + part_size = ((part_size + S3_MiB - 1) / S3_MiB) * S3_MiB; + } + /* Round up to next GiB for sizes >= 1 GiB */ + else { + part_size = ((part_size + S3_GiB - 1) / S3_GiB) * S3_GiB; + } + } + } } - len = tmp - key; - tmp_sds = flb_sds_create_len(key, len); - if (!tmp_sds) { - flb_errno(); - return -1; + /* Step 3: Enforce AWS S3 hard limits for multipart upload */ + if (part_size < S3_AWS_MIN_PART_SIZE) { + part_size = S3_AWS_MIN_PART_SIZE; + } + else if (part_size > S3_AWS_MAX_PART_SIZE) { + part_size = S3_AWS_MAX_PART_SIZE; } - m_upload->s3_key = tmp_sds; - tmp++; - original_len -= (len + 1); + return part_size; +} - tmp_sds = flb_sds_create_len(tmp, original_len); - if (!tmp_sds) { - flb_errno(); +int s3_multipart_get_md5_base64(char *buf, size_t buf_size, char *md5_str, size_t md5_str_size) +{ + unsigned char md5_bin[16]; + size_t olen; + int ret; + + ret = flb_hash_simple(FLB_HASH_MD5, + (unsigned char *) buf, buf_size, + md5_bin, sizeof(md5_bin)); + + if (ret != FLB_CRYPTO_SUCCESS) { return -1; } - m_upload->upload_id = tmp_sds; + + ret = flb_base64_encode((unsigned char*) md5_str, md5_str_size, + &olen, md5_bin, sizeof(md5_bin)); + if (ret != 0) { + return ret; + } return 0; } -/* parse etags from file data */ -static void parse_etags(struct 
multipart_upload *m_upload, char *data) +static flb_sds_t build_s3_uri(struct flb_s3 *ctx, + const char *pre_signed_url, + const char *s3_key, + const char *query_params) { - char *line = data; - char *start; - char *end; - flb_sds_t etag; - int part_num; - int len; + flb_sds_t uri = NULL; + flb_sds_t tmp; + flb_sds_t encoded_key = NULL; - if (!data) { - return; + if (pre_signed_url != NULL) { + uri = flb_sds_create(pre_signed_url); + if (!uri) { + flb_errno(); + } + return uri; } - line = strtok(data, "\n"); - - if (!line) { - return; + encoded_key = flb_uri_encode(s3_key, strlen(s3_key)); + if (!encoded_key) { + flb_plg_error(ctx->ins, "Failed to URL encode S3 key: %s", s3_key); + return NULL; } - do { - start = strstr(line, "part_number="); - if (!start) { - return; - } - start += 12; - end = strchr(start, '\t'); - if (!end) { - flb_debug("[s3 restart parser] Did not find tab separator in line %s", start); - return; - } - *end = '\0'; - part_num = atoi(start); - if (part_num <= 0) { - flb_debug("[s3 restart parser] Could not parse part_number from %s", start); - return; - } - m_upload->part_number = part_num; - *end = '\t'; + size_t uri_size = strlen(ctx->bucket) + flb_sds_len(encoded_key) + + strlen(query_params) + 16; - start = strstr(line, "tag="); - if (!start) { - flb_debug("[s3 restart parser] Could not find 'etag=' %s", line); - return; - } + uri = flb_sds_create_size(uri_size); + if (!uri) { + flb_errno(); + flb_sds_destroy(encoded_key); + return NULL; + } - start += 4; - len = strlen(start); + if (s3_key[0] == '/') { + tmp = flb_sds_printf(&uri, "/%s%s%s", ctx->bucket, encoded_key, query_params); + } + else { + tmp = flb_sds_printf(&uri, "/%s/%s%s", ctx->bucket, encoded_key, query_params); + } - if (len <= 0) { - flb_debug("[s3 restart parser] Could not find etag %s", line); - return; - } + flb_sds_destroy(encoded_key); - etag = flb_sds_create_len(start, len); - if (!etag) { - flb_debug("[s3 restart parser] Could create etag"); - return; - } - flb_debug("[s3 restart parser] found part number %d=%s", part_num, etag); - m_upload->etags[part_num - 1] = etag; + if (!tmp) { + flb_errno(); + flb_sds_destroy(uri); + return NULL; + } - line = strtok(NULL, "\n"); - } while (line != NULL); + return tmp; } -static struct multipart_upload *upload_from_file(struct flb_s3 *ctx, - struct flb_fstore_file *fsf) +static flb_sds_t build_complete_multipart_xml(struct flb_s3 *ctx, + struct multipart_upload *m_upload) { - struct multipart_upload *m_upload = NULL; - char *buffered_data = NULL; - size_t buffer_size = 0; - int ret; + flb_sds_t xml; + flb_sds_t tmp; + int i; + int valid_parts = 0; - ret = s3_store_file_upload_read(ctx, fsf, &buffered_data, &buffer_size); - if (ret < 0) { - flb_plg_error(ctx->ins, "Could not read locally buffered data %s", - fsf->name); - return NULL; + /* Enforce integrity: check for NULL etags */ + for (i = 0; i < m_upload->part_number; i++) { + if (m_upload->etags[i] == NULL) { + flb_plg_error(ctx->ins, "Cannot complete multipart upload: part %d (index %d) " + "has NULL ETag. 
Total parts: %d", + i + 1, i, m_upload->part_number); + return NULL; + } + valid_parts++; } - /* always make sure we have a fresh copy of metadata */ - ret = s3_store_file_meta_get(ctx, fsf); - if (ret == -1) { - flb_plg_error(ctx->ins, "Could not read file metadata: %s", - fsf->name); - flb_free(buffered_data); + flb_plg_debug(ctx->ins, "Building CompleteMultipartUpload payload: " + "%d valid parts", valid_parts); + + if (valid_parts == 0) { + flb_plg_error(ctx->ins, "No valid ETags found for CompleteMultipartUpload"); return NULL; } - m_upload = flb_calloc(1, sizeof(struct multipart_upload)); - if (!m_upload) { + xml = flb_sds_create(""); + if (!xml) { flb_errno(); - flb_free(buffered_data); return NULL; } - m_upload->init_time = time(NULL); - m_upload->upload_state = MULTIPART_UPLOAD_STATE_COMPLETE_IN_PROGRESS; - ret = upload_data_from_key(m_upload, fsf->meta_buf); - if (ret < 0) { - flb_plg_error(ctx->ins, "Could not extract upload data from: %s", - fsf->name); - flb_free(buffered_data); - multipart_upload_destroy(m_upload); - return NULL; + for (i = 0; i < m_upload->part_number; i++) { + tmp = flb_sds_printf(&xml, + "%s%d", + m_upload->etags[i], i + 1); + if (!tmp) { + flb_errno(); + flb_sds_destroy(xml); + return NULL; + } + xml = tmp; } - parse_etags(m_upload, buffered_data); - flb_free(buffered_data); - if (m_upload->part_number == 0) { - flb_plg_error(ctx->ins, "Could not extract upload data from %s", - fsf->name); - multipart_upload_destroy(m_upload); + tmp = flb_sds_cat(xml, "", + strlen("")); + if (!tmp) { + flb_errno(); + flb_sds_destroy(xml); return NULL; } - /* code expects it to be 1 more than the last part read */ - m_upload->part_number++; + return tmp; +} - return m_upload; +int s3_multipart_create_headers(struct flb_s3 *ctx, char *body_md5, + struct flb_aws_header **headers, int *num_headers, + int is_multipart) +{ + int n = 0; + int headers_len = 0; + struct flb_aws_header *s3_headers = NULL; + + if (ctx->content_type != NULL) { + headers_len++; + } + if (ctx->canned_acl != NULL) { + headers_len++; + } + if (body_md5 != NULL && strlen(body_md5) && is_multipart == FLB_FALSE) { + headers_len++; + } + if (ctx->storage_class != NULL) { + headers_len++; + } + + if (headers_len == 0) { + *num_headers = 0; + *headers = NULL; + return 0; + } + + s3_headers = flb_calloc(headers_len, sizeof(struct flb_aws_header)); + if (s3_headers == NULL) { + flb_errno(); + return -1; + } + + if (ctx->content_type != NULL) { + s3_headers[n] = content_type_header; + s3_headers[n].val = ctx->content_type; + s3_headers[n].val_len = strlen(ctx->content_type); + n++; + } + if (ctx->canned_acl != NULL) { + s3_headers[n] = canned_acl_header; + s3_headers[n].val = ctx->canned_acl; + s3_headers[n].val_len = strlen(ctx->canned_acl); + n++; + } + if (body_md5 != NULL && strlen(body_md5) && is_multipart == FLB_FALSE) { + s3_headers[n] = content_md5_header; + s3_headers[n].val = body_md5; + s3_headers[n].val_len = strlen(body_md5); + n++; + } + if (ctx->storage_class != NULL) { + s3_headers[n] = storage_class_header; + s3_headers[n].val = ctx->storage_class; + s3_headers[n].val_len = strlen(ctx->storage_class); + } + + *num_headers = headers_len; + *headers = s3_headers; + return 0; } -void multipart_read_uploads_from_fs(struct flb_s3 *ctx) +void s3_multipart_upload_destroy(struct multipart_upload *m_upload) { - struct mk_list *tmp; - struct mk_list *head; - struct multipart_upload *m_upload = NULL; - struct flb_fstore_file *fsf; - - mk_list_foreach_safe(head, tmp, &ctx->stream_upload->files) { - fsf = 
mk_list_entry(head, struct flb_fstore_file, _head); - m_upload = upload_from_file(ctx, fsf); - if (!m_upload) { - flb_plg_error(ctx->ins, - "Could not process multipart upload data in %s", - fsf->name); - continue; + int i; + + if (!m_upload) { + return; + } + + if (m_upload->tag) { + flb_sds_destroy(m_upload->tag); + } + if (m_upload->s3_key) { + flb_sds_destroy(m_upload->s3_key); + } + if (m_upload->upload_id) { + flb_sds_destroy(m_upload->upload_id); + } + + for (i = 0; i < m_upload->part_number; i++) { + if (m_upload->etags[i]) { + flb_sds_destroy(m_upload->etags[i]); } - mk_list_add(&m_upload->_head, &ctx->uploads); - flb_plg_info(ctx->ins, - "Successfully read existing upload from file system, s3_key=%s", - m_upload->s3_key); } + + flb_free(m_upload); } -/* store list of part number and etag */ -static flb_sds_t upload_data(flb_sds_t etag, int part_num) +/* + * Create a multipart upload structure with S3 key generation + * This is the common function used by both blob and chunk uploads + */ +struct multipart_upload *s3_multipart_upload_create(struct flb_s3 *ctx, + const char *tag, + int tag_len, + const char *path) { - flb_sds_t data; - flb_sds_t tmp; + struct multipart_upload *m_upload; + flb_sds_t s3_key; + flb_sds_t tmp_sds; + int ret; + + m_upload = flb_calloc(1, sizeof(struct multipart_upload)); + if (!m_upload) { + flb_errno(); + return NULL; + } - data = flb_sds_create_size(64); + /* If seq_index is used, increment it first before generating the key */ + if (ctx->key_fmt_has_seq_index) { + ctx->seq_index++; + ret = write_seq_index(ctx->seq_index_file, ctx->seq_index); + if (ret < 0) { + ctx->seq_index--; + flb_plg_error(ctx->ins, "Failed to write sequential index"); + flb_free(m_upload); + return NULL; + } + } - tmp = flb_sds_printf(&data, "part_number=%d\tetag=%s\n", part_num, etag); - if (!tmp) { + s3_key = flb_get_s3_key(ctx->s3_key_format, time(NULL), tag, + ctx->tag_delimiters, ctx->seq_index, path); + if (!s3_key) { + flb_plg_error(ctx->ins, "Failed to construct S3 Object Key for %s", tag); + flb_free(m_upload); + return NULL; + } + m_upload->s3_key = s3_key; + + tmp_sds = flb_sds_create_len(tag, tag_len); + if (!tmp_sds) { flb_errno(); - flb_sds_destroy(data); + flb_sds_destroy(s3_key); + flb_free(m_upload); return NULL; } - data = tmp; + m_upload->tag = tmp_sds; + m_upload->part_number = 1; + m_upload->init_time = time(NULL); - return data; + return m_upload; } -/* persists upload data to the file system */ -static int save_upload(struct flb_s3 *ctx, struct multipart_upload *m_upload, - flb_sds_t etag) +int s3_multipart_create(struct flb_s3 *ctx, + struct multipart_upload *m_upload, + char *pre_signed_url) { + flb_sds_t uri = NULL; + flb_sds_t tmp; + struct flb_http_client *c = NULL; + struct flb_aws_client *s3_client; + struct flb_aws_header *headers = NULL; + int num_headers = 0; int ret; - flb_sds_t key; - flb_sds_t data; - struct flb_fstore_file *fsf; - - key = upload_key(m_upload); - if (!key) { - flb_plg_debug(ctx->ins, "Could not constuct upload key for buffer dir"); - return -1; - } - data = upload_data(etag, m_upload->part_number); - if (!data) { - flb_plg_debug(ctx->ins, "Could not constuct upload key for buffer dir"); + uri = build_s3_uri(ctx, pre_signed_url, m_upload->s3_key, "?uploads="); + if (!uri) { return -1; } - fsf = s3_store_file_upload_get(ctx, key, flb_sds_len(key)); - - /* Write the key to the file */ - ret = s3_store_file_upload_put(ctx, fsf, key, data); + s3_client = ctx->s3_client; - flb_sds_destroy(key); - flb_sds_destroy(data); + if 
(s3_plugin_under_test() == FLB_TRUE) { + c = mock_s3_call("TEST_CREATE_MULTIPART_UPLOAD_ERROR", + "CreateMultipartUpload", NULL, 0); + } + else { + ret = s3_multipart_create_headers(ctx, NULL, &headers, &num_headers, FLB_TRUE); + if (ret == -1) { + flb_plg_error(ctx->ins, "Failed to create headers"); + flb_sds_destroy(uri); + return -1; + } - return ret; -} + c = s3_client->client_vtable->request(s3_client, FLB_HTTP_POST, + uri, NULL, 0, headers, num_headers); + flb_free(headers); + } -static int remove_upload_from_fs(struct flb_s3 *ctx, struct multipart_upload *m_upload) -{ - flb_sds_t key; - struct flb_fstore_file *fsf; + flb_sds_destroy(uri); - key = upload_key(m_upload); - if (!key) { - flb_plg_debug(ctx->ins, "Could not construct upload key"); + if (!c) { + flb_plg_error(ctx->ins, "CreateMultipartUpload request failed for %s", + m_upload->s3_key); return -1; } - fsf = s3_store_file_upload_get(ctx, key, flb_sds_len(key)); - if (fsf) { - s3_store_file_upload_delete(ctx, fsf); + if (c->resp.status == 200) { + tmp = flb_aws_xml_get_val(c->resp.payload, c->resp.payload_size, + "", ""); + if (!tmp) { + flb_plg_error(ctx->ins, "Could not find UploadId in " + "CreateMultipartUpload response"); + flb_http_client_destroy(c); + return -1; + } + m_upload->upload_id = tmp; + flb_http_client_destroy(c); + return 0; } - flb_sds_destroy(key); - return 0; + + flb_aws_print_xml_error(c->resp.payload, c->resp.payload_size, + "CreateMultipartUpload", ctx->ins); + flb_http_client_destroy(c); + return -1; } -/* - * https://docs.aws.amazon.com/AmazonS3/latest/API/API_CompleteMultipartUpload.html - */ -static int complete_multipart_upload_payload(struct flb_s3 *ctx, - struct multipart_upload *m_upload, - char **out_buf, size_t *out_size) +int s3_multipart_upload_part(struct flb_s3 *ctx, struct multipart_upload *m_upload, + char *body, size_t body_size, char *pre_signed_url) { - char *buf; - int i; - int offset = 0; - flb_sds_t etag; - size_t size = COMPLETE_MULTIPART_UPLOAD_BASE_LEN; - char part_num[11]; - - size = size + (COMPLETE_MULTIPART_UPLOAD_PART_LEN * m_upload->part_number); + flb_sds_t uri = NULL; + flb_sds_t tmp; + flb_sds_t query_params; + int ret; + struct flb_http_client *c = NULL; + struct flb_aws_client *s3_client; + struct flb_aws_header *headers = NULL; + int num_headers = 0; + char body_md5[S3_MD5_BASE64_BUFFER_SIZE]; - buf = flb_malloc(size + 1); - if (!buf) { + query_params = flb_sds_create_size(128); + if (!query_params) { flb_errno(); return -1; } - if (!try_to_write(buf, &offset, size, - "", 73)) { - goto error; + tmp = flb_sds_printf(&query_params, "?partNumber=%d&uploadId=%s", + m_upload->part_number, m_upload->upload_id); + if (!tmp) { + flb_sds_destroy(query_params); + return -1; } + query_params = tmp; - for (i = 0; i < m_upload->part_number; i++) { - etag = m_upload->etags[i]; - if (etag == NULL) { - continue; - } - if (!try_to_write(buf, &offset, size, - "", 12)) { - goto error; - } - - if (!try_to_write(buf, &offset, size, - etag, 0)) { - goto error; - } + uri = build_s3_uri(ctx, pre_signed_url, m_upload->s3_key, query_params); + flb_sds_destroy(query_params); - if (!try_to_write(buf, &offset, size, - "", 19)) { - goto error; - } + if (!uri) { + return -1; + } - if (!sprintf(part_num, "%d", i + 1)) { - goto error; + memset(body_md5, 0, sizeof(body_md5)); + if (ctx->send_content_md5 == FLB_TRUE) { + ret = s3_multipart_get_md5_base64(body, body_size, body_md5, sizeof(body_md5)); + if (ret != 0) { + flb_plg_error(ctx->ins, "Failed to create Content-MD5 header"); + 
flb_sds_destroy(uri); + return -1; } - if (!try_to_write(buf, &offset, size, - part_num, 0)) { - goto error; + headers = flb_malloc(sizeof(struct flb_aws_header)); + if (!headers) { + flb_errno(); + flb_sds_destroy(uri); + return -1; } - if (!try_to_write(buf, &offset, size, - "", 20)) { - goto error; - } + headers[0].key = "Content-MD5"; + headers[0].key_len = 11; + headers[0].val = body_md5; + headers[0].val_len = strlen(body_md5); + num_headers = 1; } - if (!try_to_write(buf, &offset, size, - "", 26)) { - goto error; + s3_client = ctx->s3_client; + + if (s3_plugin_under_test() == FLB_TRUE) { + c = mock_s3_call("TEST_UPLOAD_PART_ERROR", "UploadPart", body, body_size); + } + else { + c = s3_client->client_vtable->request(s3_client, FLB_HTTP_PUT, + uri, body, body_size, + headers, num_headers); } - buf[offset] = '\0'; + flb_free(headers); + flb_sds_destroy(uri); - *out_buf = buf; - *out_size = offset; - return 0; + if (!c) { + flb_plg_error(ctx->ins, "UploadPart request failed"); + return -1; + } -error: - flb_free(buf); - flb_plg_error(ctx->ins, "Failed to construct CompleteMultipartUpload " - "request body"); + if (c->resp.status == 200) { + tmp = extract_etag(c->resp.data, c->resp.data_size); + if (!tmp) { + flb_plg_error(ctx->ins, "Could not find ETag in UploadPart response"); + flb_http_client_destroy(c); + return -1; + } + m_upload->etags[m_upload->part_number - 1] = tmp; + m_upload->bytes += body_size; + flb_http_client_destroy(c); + return 0; + } + + flb_aws_print_xml_error(c->resp.payload, c->resp.payload_size, + "UploadPart", ctx->ins); + if (c->resp.payload != NULL) { + flb_plg_debug(ctx->ins, "Raw UploadPart response: %s", + c->resp.payload); + } + flb_http_client_destroy(c); return -1; } -int complete_multipart_upload(struct flb_s3 *ctx, - struct multipart_upload *m_upload, - char *pre_signed_url) +int s3_multipart_complete(struct flb_s3 *ctx, + struct multipart_upload *m_upload, + char *pre_signed_url) { - char *body; - size_t size; + flb_sds_t body = NULL; flb_sds_t uri = NULL; + flb_sds_t query_params = NULL; flb_sds_t tmp; - int ret; struct flb_http_client *c = NULL; struct flb_aws_client *s3_client; if (!m_upload->upload_id) { flb_plg_error(ctx->ins, "Cannot complete multipart upload for key %s: " - "upload ID is unset ", m_upload->s3_key); - return -1; + "upload_id is unset", m_upload->s3_key); + return S3_MULTIPART_ERROR_GENERAL; } - uri = flb_sds_create_size(flb_sds_len(m_upload->s3_key) + 11 + - flb_sds_len(m_upload->upload_id)); - if (!uri) { - flb_errno(); - return -1; + body = build_complete_multipart_xml(ctx, m_upload); + if (!body) { + flb_plg_error(ctx->ins, "Failed to build CompleteMultipartUpload payload"); + return S3_MULTIPART_ERROR_GENERAL; } - if (pre_signed_url != NULL) { - tmp = flb_sds_copy(uri, pre_signed_url, strlen(pre_signed_url)); - } - else { - tmp = flb_sds_printf(&uri, "/%s%s?uploadId=%s", ctx->bucket, - m_upload->s3_key, m_upload->upload_id); + query_params = flb_sds_create_size(128); + if (!query_params) { + flb_errno(); + flb_sds_destroy(body); + return S3_MULTIPART_ERROR_GENERAL; } + tmp = flb_sds_printf(&query_params, "?uploadId=%s", m_upload->upload_id); if (!tmp) { - flb_sds_destroy(uri); - return -1; + flb_sds_destroy(query_params); + flb_sds_destroy(body); + return S3_MULTIPART_ERROR_GENERAL; } - uri = tmp; + query_params = tmp; - ret = complete_multipart_upload_payload(ctx, m_upload, &body, &size); - if (ret < 0) { - flb_sds_destroy(uri); - return -1; + uri = build_s3_uri(ctx, pre_signed_url, m_upload->s3_key, query_params); + 
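+    /*
+     * Sketch of the resulting URI: a pre-signed URL (when configured) is
+     * used verbatim, otherwise build_s3_uri() yields
+     * "/<bucket>/<url-encoded key>?uploadId=<id>".
+     */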
flb_sds_destroy(query_params); + + if (!uri) { + flb_sds_destroy(body); + return S3_MULTIPART_ERROR_GENERAL; } s3_client = ctx->s3_client; + if (s3_plugin_under_test() == FLB_TRUE) { - c = mock_s3_call("TEST_COMPLETE_MULTIPART_UPLOAD_ERROR", "CompleteMultipartUpload"); + c = mock_s3_call("TEST_COMPLETE_MULTIPART_UPLOAD_ERROR", + "CompleteMultipartUpload", body, flb_sds_len(body)); } else { c = s3_client->client_vtable->request(s3_client, FLB_HTTP_POST, - uri, body, size, + uri, body, flb_sds_len(body), NULL, 0); } + flb_sds_destroy(uri); - flb_free(body); - if (c) { - flb_plg_debug(ctx->ins, "CompleteMultipartUpload http status=%d", - c->resp.status); - if (c->resp.status == 200) { - flb_plg_info(ctx->ins, "Successfully completed multipart upload " - "for %s, UploadId=%s", m_upload->s3_key, - m_upload->upload_id); - flb_http_client_destroy(c); - /* remove this upload from the file system */ - remove_upload_from_fs(ctx, m_upload); - return 0; - } - flb_aws_print_xml_error(c->resp.payload, c->resp.payload_size, - "CompleteMultipartUpload", ctx->ins); - if (c->resp.payload != NULL) { - flb_plg_debug(ctx->ins, "Raw CompleteMultipartUpload response: %s", - c->resp.payload); - } + flb_sds_destroy(body); + + if (!c) { + flb_plg_error(ctx->ins, "CompleteMultipartUpload request failed"); + return S3_MULTIPART_ERROR_GENERAL; + } + + if (c->resp.status == 200) { flb_http_client_destroy(c); + return 0; } - flb_plg_error(ctx->ins, "CompleteMultipartUpload request failed"); - return -1; + if (c->resp.payload != NULL && + strstr(c->resp.payload, "NoSuchUpload") != NULL) { + flb_plg_warn(ctx->ins, "Upload %s does not exist (NoSuchUpload)", + m_upload->upload_id); + flb_http_client_destroy(c); + return S3_MULTIPART_ERROR_NO_SUCH_UPLOAD; + } + + flb_aws_print_xml_error(c->resp.payload, c->resp.payload_size, + "CompleteMultipartUpload", ctx->ins); + if (c->resp.payload != NULL) { + flb_plg_debug(ctx->ins, "Raw CompleteMultipartUpload response: %s", + c->resp.payload); + } + flb_http_client_destroy(c); + return S3_MULTIPART_ERROR_GENERAL; } -int abort_multipart_upload(struct flb_s3 *ctx, - struct multipart_upload *m_upload, - char *pre_signed_url) +int s3_multipart_abort(struct flb_s3 *ctx, + struct multipart_upload *m_upload, + char *pre_signed_url) { flb_sds_t uri = NULL; + flb_sds_t query_params = NULL; flb_sds_t tmp; struct flb_http_client *c = NULL; struct flb_aws_client *s3_client; if (!m_upload->upload_id) { - flb_plg_error(ctx->ins, "Cannot complete multipart upload for key %s: " - "upload ID is unset ", m_upload->s3_key); + flb_plg_error(ctx->ins, "Cannot abort multipart upload for key %s: " + "upload_id is unset", m_upload->s3_key); return -1; } - uri = flb_sds_create_size(flb_sds_len(m_upload->s3_key) + 11 + - flb_sds_len(m_upload->upload_id)); - if (!uri) { + query_params = flb_sds_create_size(128); + if (!query_params) { flb_errno(); return -1; } - if (pre_signed_url != NULL) { - tmp = flb_sds_copy(uri, pre_signed_url, strlen(pre_signed_url)); - } - else { - tmp = flb_sds_printf(&uri, "/%s%s?uploadId=%s", ctx->bucket, - m_upload->s3_key, m_upload->upload_id); + tmp = flb_sds_printf(&query_params, "?uploadId=%s", m_upload->upload_id); + if (!tmp) { + flb_sds_destroy(query_params); + return -1; } + query_params = tmp; - if (!tmp) { - flb_sds_destroy(uri); + uri = build_s3_uri(ctx, pre_signed_url, m_upload->s3_key, query_params); + flb_sds_destroy(query_params); + + if (!uri) { return -1; } - uri = tmp; s3_client = ctx->s3_client; + if (s3_plugin_under_test() == FLB_TRUE) { - c = 
mock_s3_call("TEST_ABORT_MULTIPART_UPLOAD_ERROR", "AbortMultipartUpload"); + c = mock_s3_call("TEST_ABORT_MULTIPART_UPLOAD_ERROR", + "AbortMultipartUpload", NULL, 0); } else { c = s3_client->client_vtable->request(s3_client, FLB_HTTP_DELETE, uri, NULL, 0, NULL, 0); } + flb_sds_destroy(uri); - if (c) { - flb_plg_debug(ctx->ins, "AbortMultipartUpload http status=%d", - c->resp.status); - if (c->resp.status == 204) { - flb_plg_info(ctx->ins, "Successfully completed multipart upload " - "for %s, UploadId=%s", m_upload->s3_key, - m_upload->upload_id); - flb_http_client_destroy(c); - /* remove this upload from the file system */ - remove_upload_from_fs(ctx, m_upload); - return 0; - } - flb_aws_print_xml_error(c->resp.payload, c->resp.payload_size, - "AbortMultipartUpload", ctx->ins); - if (c->resp.payload != NULL) { - flb_plg_debug(ctx->ins, "Raw AbortMultipartUpload response: %s", - c->resp.payload); - } + if (!c) { + flb_plg_error(ctx->ins, "AbortMultipartUpload request failed"); + return -1; + } + + flb_plg_debug(ctx->ins, "AbortMultipartUpload http status=%d", c->resp.status); + + if (c->resp.status == 204) { + flb_plg_info(ctx->ins, "Successfully aborted multipart upload for %s, " + "UploadId=%s", m_upload->s3_key, m_upload->upload_id); flb_http_client_destroy(c); + return 0; } - flb_plg_error(ctx->ins, "AbortMultipartUpload request failed"); + flb_aws_print_xml_error(c->resp.payload, c->resp.payload_size, + "AbortMultipartUpload", ctx->ins); + flb_http_client_destroy(c); return -1; } -int create_multipart_upload(struct flb_s3 *ctx, - struct multipart_upload *m_upload, - char *pre_signed_url) +/* + * Check if a multipart upload exists by calling ListParts API + * Returns: 1 if exists, 0 if not exists (NoSuchUpload), -1 on network/API error + * + * This function is used during recovery to validate whether an upload_id + * stored in the database is still valid on S3. If the upload_id has expired + * or been aborted, S3 will return NoSuchUpload error. + * + * IMPORTANT: s3_key must be the actual persisted key from the original upload. + * This avoids regeneration errors when key format contains timestamps or sequence numbers. 
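+ *
+ * Caller sketch (as used by the recovery path):
+ *
+ *   ret = s3_multipart_check_upload_exists(ctx, s3_key, upload_id);
+ *   if (ret == 1)  resume the existing upload;
+ *   else           clear the remote_id and start a fresh upload;
+ *
+ * A return of -1 (network/API error) is treated like "not found" by the
+ * recovery code, erring on the side of a fresh upload.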
+ */ +int s3_multipart_check_upload_exists(struct flb_s3 *ctx, + const char *s3_key, + const char *upload_id) { flb_sds_t uri = NULL; + flb_sds_t query_params = NULL; flb_sds_t tmp; struct flb_http_client *c = NULL; struct flb_aws_client *s3_client; - struct flb_aws_header *headers = NULL; - int num_headers = 0; - int ret; + int result; - uri = flb_sds_create_size(flb_sds_len(m_upload->s3_key) + 8); - if (!uri) { - flb_errno(); + if (!s3_key || !upload_id) { + flb_plg_error(ctx->ins, "Invalid parameters for upload existence check"); return -1; } - if (pre_signed_url != NULL) { - tmp = flb_sds_copy(uri, pre_signed_url, strlen(pre_signed_url)); - } - else { - tmp = flb_sds_printf(&uri, "/%s%s?uploads=", ctx->bucket, m_upload->s3_key); + /* Build ListParts query (max-parts=1 for minimal response) */ + query_params = flb_sds_create_size(128); + if (!query_params) { + flb_errno(); + flb_sds_destroy(s3_key); + return -1; } + tmp = flb_sds_printf(&query_params, "?uploadId=%s&max-parts=1", upload_id); if (!tmp) { - flb_sds_destroy(uri); + flb_sds_destroy(query_params); + flb_sds_destroy(s3_key); + return -1; + } + query_params = tmp; + + uri = build_s3_uri(ctx, NULL, s3_key, query_params); + flb_sds_destroy(query_params); + + if (!uri) { + flb_sds_destroy(s3_key); return -1; } - uri = tmp; s3_client = ctx->s3_client; - if (s3_plugin_under_test() == FLB_TRUE) { - c = mock_s3_call("TEST_CREATE_MULTIPART_UPLOAD_ERROR", "CreateMultipartUpload"); + + /* Call ListParts API */ + c = s3_client->client_vtable->request(s3_client, FLB_HTTP_GET, + uri, NULL, 0, NULL, 0); + flb_sds_destroy(uri); + + if (!c) { + /* Network error or request failed */ + flb_plg_error(ctx->ins, "ListParts request failed (network/API error)"); + flb_sds_destroy(s3_key); + return -1; } - else { - ret = create_headers(ctx, NULL, &headers, &num_headers, FLB_TRUE); - if (ret == -1) { - flb_plg_error(ctx->ins, "Failed to create headers"); - flb_sds_destroy(uri); - return -1; - } - c = s3_client->client_vtable->request(s3_client, FLB_HTTP_POST, - uri, NULL, 0, headers, num_headers); - if (headers) { - flb_free(headers); - } + + /* Analyze response */ + if (c->resp.status == 200) { + /* Upload exists */ + flb_plg_debug(ctx->ins, "Upload ID validation: exists"); + result = 1; } - flb_sds_destroy(uri); - if (c) { - flb_plg_debug(ctx->ins, "CreateMultipartUpload http status=%d", - c->resp.status); - if (c->resp.status == 200) { - tmp = flb_aws_xml_get_val(c->resp.payload, c->resp.payload_size, - "", ""); - if (!tmp) { - flb_plg_error(ctx->ins, "Could not find upload ID in " - "CreateMultipartUpload response"); - flb_plg_debug(ctx->ins, "Raw CreateMultipartUpload response: %s", - c->resp.payload); - flb_http_client_destroy(c); - return -1; - } - m_upload->upload_id = tmp; - flb_plg_info(ctx->ins, "Successfully initiated multipart upload " - "for %s, UploadId=%s", m_upload->s3_key, - m_upload->upload_id); - flb_http_client_destroy(c); - return 0; - } - flb_aws_print_xml_error(c->resp.payload, c->resp.payload_size, - "CreateMultipartUpload", ctx->ins); - if (c->resp.payload != NULL) { - flb_plg_debug(ctx->ins, "Raw CreateMultipartUpload response: %s", - c->resp.payload); - } - flb_http_client_destroy(c); + else if (c->resp.payload && + strstr(c->resp.payload, "NoSuchUpload")) { + /* Upload does not exist */ + flb_plg_debug(ctx->ins, "Upload ID validation: NoSuchUpload"); + result = 0; + } + else { + /* Other error - log and treat conservatively */ + flb_plg_warn(ctx->ins, "ListParts returned status %d, " + "treating upload_id as potentially 
invalid", c->resp.status); + result = 0; } - flb_plg_error(ctx->ins, "CreateMultipartUpload request failed"); - return -1; + flb_http_client_destroy(c); + return result; } -/* gets the ETag value from response headers */ -flb_sds_t get_etag(char *response, size_t size) +/* + * Upload a file part for multipart upload + * + * MEMORY USAGE: + * This function allocates a buffer equal to the part size (offset_end - offset_start). + * Larger values improve throughput but increase memory usage. For memory-constrained + * environments, configure smaller upload_chunk_size (default: 100MB). + * + * TECHNICAL LIMITATION: + * The S3 UploadPart API requires complete part data in memory to calculate Content-Length. + * True streaming upload would require HTTP client support for chunked transfer encoding. + */ +int s3_multipart_upload_file_part(struct flb_s3 *ctx, + const char *file_path, + off_t offset_start, + off_t offset_end, + struct multipart_upload *m_upload, + flb_sds_t pre_signed_url) { - char *tmp; - int start; - int end; - int len; - int i = 0; - flb_sds_t etag; - - if (response == NULL) { - return NULL; + int fd = -1; + char *stream_buffer = NULL; + char *part_buffer = NULL; + size_t part_size; + size_t total_read = 0; + size_t remaining; + size_t chunk_size; + ssize_t bytes_read; + off_t current_pos; + int ret = -1; + + /* Use 8MB streaming buffer for file reads (helps with I/O, but not memory footprint) */ + #define STREAM_BUFFER_SIZE (8 * 1024 * 1024) + + /* Validate offset ranges to prevent underflow */ + if (offset_start < 0 || offset_end < 0) { + flb_plg_error(ctx->ins, "Invalid negative offsets: start=%lld, end=%lld", + (long long)offset_start, (long long)offset_end); + return -1; } - tmp = strstr(response, "ETag:"); - if (!tmp) { - return NULL; + if (offset_end <= offset_start) { + flb_plg_error(ctx->ins, "Invalid offset range: end (%lld) must be greater than start (%lld)", + (long long)offset_end, (long long)offset_start); + return -1; } - i = tmp - response; - /* advance to end of ETag key */ - i += 5; + part_size = offset_end - offset_start; - /* advance across any whitespace and the opening quote */ - while (i < size && (response[i] == '\"' || isspace(response[i]) != 0)) { - i++; + /* Allocate full part buffer for the complete part data */ + part_buffer = flb_malloc(part_size); + if (!part_buffer) { + flb_errno(); + flb_plg_error(ctx->ins, "Failed to allocate part buffer (%zu bytes)", part_size); + return -1; } - start = i; - /* advance until we hit whitespace or the end quote */ - while (i < size && (response[i] != '\"' && isspace(response[i]) == 0)) { - i++; + + /* Allocate streaming buffer for reading chunks */ + stream_buffer = flb_malloc(STREAM_BUFFER_SIZE); + if (!stream_buffer) { + flb_errno(); + flb_plg_error(ctx->ins, "Failed to allocate stream buffer (%d bytes)", STREAM_BUFFER_SIZE); + flb_free(part_buffer); + return -1; } - end = i; - len = end - start; - etag = flb_sds_create_len(response + start, len); - if (!etag) { + /* Open file */ + fd = flb_open(file_path, O_RDONLY); + if (fd < 0) { flb_errno(); - return NULL; + flb_plg_error(ctx->ins, "Failed to open file: %s", file_path); + flb_free(stream_buffer); + flb_free(part_buffer); + return -1; } - return etag; + /* Seek to start offset */ + if (flb_lseek(fd, offset_start, SEEK_SET) < 0) { + flb_errno(); + flb_plg_error(ctx->ins, "Failed to seek to offset %lld in file", + (long long)offset_start); + flb_close(fd); + flb_free(stream_buffer); + flb_free(part_buffer); + return -1; + } + + /* Stream read the part in chunks 
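       (each read() is capped at STREAM_BUFFER_SIZE, but the full part is
       still assembled in part_buffer, as noted in the comment above)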
*/ + current_pos = offset_start; + while (current_pos < offset_end) { + remaining = offset_end - current_pos; + chunk_size = (remaining < STREAM_BUFFER_SIZE) ? remaining : STREAM_BUFFER_SIZE; + + /* + * Loop to handle short reads: read() may return fewer bytes than requested + * (this is valid behavior on some filesystems like NFS, pipes, etc.) + * Keep reading until we get the full chunk or hit EOF/error. + */ + size_t chunk_read = 0; + while (chunk_read < chunk_size) { + bytes_read = flb_read(fd, stream_buffer + chunk_read, chunk_size - chunk_read); + + if (bytes_read < 0) { + flb_errno(); + flb_plg_error(ctx->ins, "Failed to read at offset %lld", + (long long)(current_pos + chunk_read)); + flb_close(fd); + flb_free(stream_buffer); + flb_free(part_buffer); + return -1; + } + + if (bytes_read == 0) { + /* EOF reached */ + if (current_pos + chunk_read < offset_end) { + /* Unexpected EOF before we read the expected part size */ + flb_plg_error(ctx->ins, "Unexpected EOF at offset %lld (read %zu of %zu bytes)", + (long long)(current_pos + chunk_read), + total_read + chunk_read, part_size); + flb_close(fd); + flb_free(stream_buffer); + flb_free(part_buffer); + return -1; + } + /* EOF at expected position - we're done with this chunk */ + break; + } + + chunk_read += bytes_read; + } + + /* Copy complete chunk to part buffer */ + memcpy(part_buffer + total_read, stream_buffer, chunk_read); + total_read += chunk_read; + current_pos += chunk_read; + } + + /* Close file and free stream buffer */ + flb_close(fd); + fd = -1; + flb_free(stream_buffer); + stream_buffer = NULL; + + /* Verify we read the expected amount */ + if (total_read != part_size) { + flb_plg_error(ctx->ins, "Size mismatch: read %zu bytes, expected %zu bytes", + total_read, part_size); + flb_free(part_buffer); + return -1; + } + + /* Upload the complete part */ + ret = s3_multipart_upload_part(ctx, m_upload, part_buffer, part_size, pre_signed_url); + flb_free(part_buffer); + + if (ret < 0) { + flb_plg_error(ctx->ins, "Failed to upload part %d", m_upload->part_number); + return -1; + } + + return 0; + + #undef STREAM_BUFFER_SIZE } -int upload_part(struct flb_s3 *ctx, struct multipart_upload *m_upload, - char *body, size_t body_size, char *pre_signed_url) +static int initialize_multipart_upload(struct flb_s3 *ctx, + const char *s3_key, + const char *tag, int tag_len, + struct multipart_upload **m_upload) { - flb_sds_t uri = NULL; - flb_sds_t tmp; - int ret; - struct flb_http_client *c = NULL; - struct flb_aws_client *s3_client; - struct flb_aws_header *headers = NULL; - int num_headers = 0; - char body_md5[25]; + struct multipart_upload *upload; - uri = flb_sds_create_size(flb_sds_len(m_upload->s3_key) + 8); - if (!uri) { + upload = flb_calloc(1, sizeof(struct multipart_upload)); + if (!upload) { flb_errno(); - return -1; + flb_plg_error(ctx->ins, "Failed to allocate multipart upload structure"); + return FLB_RETRY; } - if (pre_signed_url != NULL) { - tmp = flb_sds_copy(uri, pre_signed_url, strlen(pre_signed_url)); - } - else { - tmp = flb_sds_printf(&uri, "/%s%s?partNumber=%d&uploadId=%s", - ctx->bucket, m_upload->s3_key, m_upload->part_number, - m_upload->upload_id); + upload->s3_key = flb_sds_create(s3_key); + if (!upload->s3_key) { + flb_errno(); + flb_plg_error(ctx->ins, "Failed to create s3_key"); + flb_free(upload); + return FLB_RETRY; } - if (!tmp) { + upload->tag = flb_sds_create_len(tag, tag_len); + if (!upload->tag) { flb_errno(); - flb_sds_destroy(uri); - return -1; + flb_sds_destroy(upload->s3_key); + flb_free(upload); + 
return FLB_RETRY; } - uri = tmp; - memset(body_md5, 0, sizeof(body_md5)); - if (ctx->send_content_md5 == FLB_TRUE) { - ret = get_md5_base64(body, body_size, body_md5, sizeof(body_md5)); - if (ret != 0) { - flb_plg_error(ctx->ins, "Failed to create Content-MD5 header"); - flb_sds_destroy(uri); + upload->part_number = 0; + upload->bytes = 0; + upload->upload_id = NULL; + + *m_upload = upload; + return 0; +} + +/* Upload all parts of a file sequentially with retry support */ +static int s3_multipart_upload_file_parts(struct flb_s3 *ctx, + const char *file_path, + off_t file_size, + struct multipart_upload *m_upload) +{ + off_t current_offset = 0; + flb_sds_t pre_signed_url = NULL; + int ret; + int part_attempt; + int backoff_ms; + int max_attempts; + + /* Ensure retry loop always runs at least once */ + max_attempts = (ctx->part_delivery_attempt_limit > 0) ? ctx->part_delivery_attempt_limit : 1; + + while (current_offset < file_size) { + /* Check if next part would exceed AWS S3 limit (10000 parts) */ + if (m_upload->part_number >= 10000) { + flb_plg_error(ctx->ins, "Cannot upload part %d: exceeds AWS S3 maximum of 10000 parts", + m_upload->part_number + 1); return -1; } - num_headers = 1; - headers = flb_malloc(sizeof(struct flb_aws_header) * num_headers); - if (headers == NULL) { - flb_errno(); - flb_sds_destroy(uri); - return -1; + m_upload->part_number++; + + off_t offset_start = current_offset; + off_t offset_end = current_offset + ctx->upload_chunk_size; + + if (offset_end > file_size) { + offset_end = file_size; } - headers[0].key = "Content-MD5"; - headers[0].key_len = 11; - headers[0].val = body_md5; - headers[0].val_len = strlen(body_md5); - } + /* Retry logic for this part upload */ + part_attempt = 0; + ret = -1; + + while (part_attempt < max_attempts) { + part_attempt++; + + /* Fetch presigned URL for this attempt */ + ret = s3_auth_fetch_presigned_url(ctx, &pre_signed_url, + S3_PRESIGNED_URL_UPLOAD_PART, + m_upload->s3_key, m_upload->upload_id, + m_upload->part_number); + if (ret < 0) { + flb_plg_error(ctx->ins, "Failed to fetch presigned URL for part %d, " + "attempt %d/%d", + m_upload->part_number, part_attempt, max_attempts); + if (pre_signed_url) { + flb_sds_destroy(pre_signed_url); + pre_signed_url = NULL; + } + + /* Retry presigned URL fetch with backoff */ + if (part_attempt < max_attempts) { + backoff_ms = 1000 * part_attempt; /* 1s, 2s, 3s... 
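       (linear backoff for the pre-signed URL fetch; the part upload retry
        below uses exponential backoff capped at 30 seconds)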
*/ + flb_plg_info(ctx->ins, "Retrying presigned URL fetch after %d ms", + backoff_ms); + sleep_ms(backoff_ms); + } + continue; + } - s3_client = ctx->s3_client; - if (s3_plugin_under_test() == FLB_TRUE) { - c = mock_s3_call("TEST_UPLOAD_PART_ERROR", "UploadPart"); - } - else { - c = s3_client->client_vtable->request(s3_client, FLB_HTTP_PUT, - uri, body, body_size, - headers, num_headers); - } - flb_free(headers); - flb_sds_destroy(uri); - if (c) { - flb_plg_info(ctx->ins, "UploadPart http status=%d", - c->resp.status); - if (c->resp.status == 200) { - tmp = get_etag(c->resp.data, c->resp.data_size); - if (!tmp) { - flb_plg_error(ctx->ins, "Could not find ETag in " - "UploadPart response"); - flb_plg_debug(ctx->ins, "Raw UploadPart response: %s", - c->resp.payload); - flb_http_client_destroy(c); - return -1; + /* Attempt to upload the part */ + ret = s3_multipart_upload_file_part(ctx, file_path, offset_start, offset_end, + m_upload, pre_signed_url); + + if (pre_signed_url) { + flb_sds_destroy(pre_signed_url); + pre_signed_url = NULL; } - m_upload->etags[m_upload->part_number - 1] = tmp; - flb_plg_info(ctx->ins, "Successfully uploaded part #%d " - "for %s, UploadId=%s, ETag=%s", m_upload->part_number, - m_upload->s3_key, m_upload->upload_id, tmp); - flb_http_client_destroy(c); - /* track how many bytes are have gone toward this upload */ - m_upload->bytes += body_size; - /* finally, attempt to persist the data for this upload */ - ret = save_upload(ctx, m_upload, tmp); if (ret == 0) { - flb_plg_debug(ctx->ins, "Successfully persisted upload data, UploadId=%s", - m_upload->upload_id); + /* Success - break out of retry loop */ + break; } - else { - flb_plg_warn(ctx->ins, "Was not able to persisted upload data to disk; " - "if fluent bit dies without completing this upload the part " - "could be lost, UploadId=%s, ETag=%s", - m_upload->upload_id, tmp); + + /* Upload failed */ + flb_plg_warn(ctx->ins, "Failed to upload part %d for %s, attempt %d/%d", + m_upload->part_number, m_upload->s3_key, + part_attempt, max_attempts); + + /* Apply exponential backoff before retry */ + if (part_attempt < max_attempts) { + uint64_t shift_exp; + /* Prevent overflow in shift operation */ + if (part_attempt - 1 >= 30) { + backoff_ms = 30000; /* Max out immediately if exponent too large */ + } + else { + shift_exp = 1ULL << (part_attempt - 1); + if (shift_exp > 30) { + backoff_ms = 30000; + } + else { + backoff_ms = 1000 * (int)shift_exp; + if (backoff_ms > 30000) { + backoff_ms = 30000; /* Cap at 30 seconds */ + } + } + } + flb_plg_info(ctx->ins, "Retrying part upload after %d ms", backoff_ms); + sleep_ms(backoff_ms); } - return 0; } - flb_aws_print_xml_error(c->resp.payload, c->resp.payload_size, - "UploadPart", ctx->ins); - if (c->resp.payload != NULL) { - flb_plg_debug(ctx->ins, "Raw UploadPart response: %s", - c->resp.payload); + + /* Check if all retry attempts failed */ + if (ret < 0) { + flb_plg_error(ctx->ins, "Failed to upload part %d for %s after %d attempts", + m_upload->part_number, m_upload->s3_key, max_attempts); + return -1; } - flb_http_client_destroy(c); + + current_offset = offset_end; } - flb_plg_error(ctx->ins, "UploadPart request failed"); - return -1; + return 0; +} + +int s3_multipart_upload_file(struct flb_s3 *ctx, + const char *file_path, + const char *s3_key, + const char *tag, int tag_len, + time_t file_first_log_time) +{ + struct multipart_upload *m_upload = NULL; +#ifdef _WIN32 + struct _stat64 file_stat; +#else + struct stat file_stat; +#endif + flb_sds_t pre_signed_url = NULL; + 
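+    /*
+     * Note on the retry delay computed in s3_multipart_upload_file_parts()
+     * above: it is a capped exponential backoff, 1000 ms * 2^(attempt - 1)
+     * bounded at 30000 ms, with the shift guarded against overflow. A compact
+     * sketch of the same computation (illustrative only; assumes <stdint.h>):
+     *
+     *     static int backoff_ms_for_attempt(int attempt)
+     *     {
+     *         uint64_t exp;
+     *
+     *         if (attempt < 1) {
+     *             attempt = 1;
+     *         }
+     *         if (attempt - 1 >= 30) {
+     *             return 30000;
+     *         }
+     *
+     *         exp = 1ULL << (attempt - 1);
+     *         if (exp * 1000ULL > 30000ULL) {
+     *             return 30000;
+     *         }
+     *
+     *         return (int) (exp * 1000ULL);
+     *     }
+     *
+     * so attempts 1..6 wait 1s, 2s, 4s, 8s, 16s and then 30s for every
+     * further attempt.
+     */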
int ret; + + if (flb_stat(file_path, &file_stat) != 0) { + flb_errno(); + flb_plg_error(ctx->ins, "Failed to stat file: %s", file_path); + return FLB_RETRY; + } + + ret = initialize_multipart_upload(ctx, s3_key, tag, tag_len, &m_upload); + if (ret != 0) { + return ret; + } + + ret = s3_auth_fetch_presigned_url(ctx, &pre_signed_url, + S3_PRESIGNED_URL_CREATE_MULTIPART, + m_upload->s3_key, NULL, 0); + if (ret < 0) { + flb_plg_error(ctx->ins, "Failed to fetch presigned URL for create multipart"); + s3_multipart_upload_destroy(m_upload); + return FLB_RETRY; + } + + ret = s3_multipart_create(ctx, m_upload, pre_signed_url); + if (pre_signed_url) { + flb_sds_destroy(pre_signed_url); + pre_signed_url = NULL; + } + + if (ret < 0) { + flb_plg_error(ctx->ins, "Failed to create multipart upload for %s", + m_upload->s3_key); + s3_multipart_upload_destroy(m_upload); + return FLB_RETRY; + } + + ret = s3_multipart_upload_file_parts(ctx, file_path, file_stat.st_size, m_upload); + if (ret < 0) { + s3_multipart_abort(ctx, m_upload, NULL); + s3_multipart_upload_destroy(m_upload); + return FLB_RETRY; + } + + if (m_upload->bytes != (size_t)file_stat.st_size) { + flb_plg_error(ctx->ins, "Size mismatch: uploaded %zu bytes, expected %lld bytes", + m_upload->bytes, (long long)file_stat.st_size); + s3_multipart_abort(ctx, m_upload, NULL); + s3_multipart_upload_destroy(m_upload); + return FLB_RETRY; + } + + ret = s3_auth_fetch_presigned_url(ctx, &pre_signed_url, + S3_PRESIGNED_URL_COMPLETE_MULTIPART, + m_upload->s3_key, m_upload->upload_id, 0); + if (ret < 0) { + flb_plg_error(ctx->ins, "Failed to fetch presigned URL for complete multipart"); + if (pre_signed_url) { + flb_sds_destroy(pre_signed_url); + } + s3_multipart_abort(ctx, m_upload, NULL); + s3_multipart_upload_destroy(m_upload); + return FLB_RETRY; + } + + ret = s3_multipart_complete(ctx, m_upload, pre_signed_url); + if (pre_signed_url) { + flb_sds_destroy(pre_signed_url); + } + + if (ret < 0) { + flb_plg_error(ctx->ins, "Failed to complete multipart upload for %s", + m_upload->s3_key); + s3_multipart_upload_destroy(m_upload); + return FLB_RETRY; + } + + flb_plg_info(ctx->ins, "Successfully uploaded %s (%zu bytes, %d parts)", + m_upload->s3_key, m_upload->bytes, m_upload->part_number); + s3_multipart_upload_destroy(m_upload); + return FLB_OK; } diff --git a/plugins/out_s3/s3_multipart.h b/plugins/out_s3/s3_multipart.h new file mode 100644 index 00000000000..b68a201903d --- /dev/null +++ b/plugins/out_s3/s3_multipart.h @@ -0,0 +1,104 @@ +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ + +/* Fluent Bit + * ========== + * Copyright (C) 2015-2024 The Fluent Bit Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef FLB_OUT_S3_MULTIPART_H +#define FLB_OUT_S3_MULTIPART_H + +#include "s3.h" + +/* Calculate optimal part size within AWS S3 limits (5MiB-5GiB, max 10000 parts) */ +size_t flb_s3_calculate_optimal_part_size(size_t configured_part_size, + size_t file_size); + +/* Get MD5 hash in base64 format */ +int s3_multipart_get_md5_base64(char *buf, size_t buf_size, + char *md5_str, size_t md5_str_size); + +/* Create HTTP headers for multipart upload */ +int s3_multipart_create_headers(struct flb_s3 *ctx, char *body_md5, + struct flb_aws_header **headers, int *num_headers, + int multipart_upload); + +/* Create multipart upload structure (used by orchestration layer) */ +struct multipart_upload *s3_multipart_upload_create(struct flb_s3 *ctx, + const char *tag, + int tag_len, + const char *path); + +/* Destroy multipart upload structure */ +void s3_multipart_upload_destroy(struct multipart_upload *m_upload); + +/* + * AWS S3 Multipart Upload API wrappers + */ + +/* Initiate multipart upload and get upload_id */ +int s3_multipart_create(struct flb_s3 *ctx, + struct multipart_upload *m_upload, + char *pre_signed_url); + +/* Upload a single part */ +int s3_multipart_upload_part(struct flb_s3 *ctx, + struct multipart_upload *m_upload, + char *body, size_t body_size, + char *pre_signed_url); + +/* Complete multipart upload */ +int s3_multipart_complete(struct flb_s3 *ctx, + struct multipart_upload *m_upload, + char *pre_signed_url); + +/* Abort multipart upload */ +int s3_multipart_abort(struct flb_s3 *ctx, + struct multipart_upload *m_upload, + char *pre_signed_url); + +/* + * Check if multipart upload exists on S3 + * Returns: 1 if exists, 0 if not exists, -1 on error + * Used during recovery to validate stored upload_id + * + * IMPORTANT: s3_key must be the actual persisted key from the original upload. + * Do NOT regenerate the key using time(NULL) or current seq_index as this may + * not match the original key if the key format includes timestamps or sequence numbers. + */ +int s3_multipart_check_upload_exists(struct flb_s3 *ctx, + const char *s3_key, + const char *upload_id); + +/* + * High-level file upload functions + */ + +/* Upload file part from disk (streaming, memory-efficient) */ +int s3_multipart_upload_file_part(struct flb_s3 *ctx, + const char *file_path, + off_t offset_start, + off_t offset_end, + struct multipart_upload *m_upload, + flb_sds_t pre_signed_url); + +/* Upload entire file using streaming multipart upload */ +int s3_multipart_upload_file(struct flb_s3 *ctx, + const char *file_path, + const char *s3_key, + const char *tag, int tag_len, + time_t file_first_log_time); + +#endif /* FLB_OUT_S3_MULTIPART_H */ diff --git a/plugins/out_s3/s3_queue.c b/plugins/out_s3/s3_queue.c new file mode 100644 index 00000000000..2e2c92f86d2 --- /dev/null +++ b/plugins/out_s3/s3_queue.c @@ -0,0 +1,1150 @@ +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ + +/* Fluent Bit + * ========== + * Copyright (C) 2015-2024 The Fluent Bit Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include "s3.h" +#include "s3_multipart.h" +#include "s3_auth.h" +#include "s3_queue.h" +#include "s3_store.h" +#include "s3_blob.h" + +/* Queue processing return codes */ +#define S3_QUEUE_ENTRY_SUCCESS 1 +#define S3_QUEUE_ENTRY_RETRY -1 +#define S3_QUEUE_ENTRY_INVALID 0 + +/* Upload failure handling codes */ +#define S3_UPLOAD_ENTRY_DESTROYED 1 +#define S3_UPLOAD_ENTRY_REQUEUE 0 + +/* Forward declarations for internal static helper functions */ +static int is_queue_entry_valid(struct upload_queue *entry, struct flb_s3 *ctx); +static int upload_part_with_db_tracking(struct flb_s3 *ctx, struct upload_queue *entry); +static int upload_without_db_tracking(struct flb_s3 *ctx, struct upload_queue *entry); +static int handle_upload_failure(struct flb_s3 *ctx, + struct upload_queue *entry, + time_t now); +static int check_and_complete_multipart(struct flb_s3 *ctx, uint64_t file_id, const char *s3_key); +static int enqueue_file_parts_for_resume(struct flb_s3 *ctx, + uint64_t file_id, + const char *file_path, + const char *upload_id, + const char *tag, + int tag_len); + +/* + * Legacy file-level queue add function + */ +int s3_queue_add_file(struct flb_s3 *ctx, + uint64_t file_id, + struct s3_file *upload_file, + const char *file_path, + const char *tag, + int tag_len) +{ + struct upload_queue *entry; + flb_sds_t tag_copy; + flb_sds_t path_copy = NULL; + + entry = flb_calloc(1, sizeof(struct upload_queue)); + if (!entry) { + flb_errno(); + flb_plg_error(ctx->ins, "Failed to allocate memory for upload queue entry"); + return -1; + } + + tag_copy = flb_sds_create_len(tag, tag_len); + if (!tag_copy) { + flb_errno(); + flb_free(entry); + return -1; + } + + if (file_id > 0 && file_path) { + path_copy = flb_sds_create(file_path); + if (!path_copy) { + flb_errno(); + flb_sds_destroy(tag_copy); + flb_free(entry); + return -1; + } + } + + entry->file_id = file_id; + entry->part_db_id = 0; /* File-level mode */ + entry->part_id = 0; + entry->upload_file = upload_file; + entry->file_path = path_copy; + entry->offset_start = 0; + entry->offset_end = 0; + entry->s3_key = NULL; + entry->upload_id = NULL; + entry->tag = tag_copy; + entry->tag_len = tag_len; + entry->retry_counter = 0; + entry->upload_time = time(NULL); + + pthread_mutex_lock(&ctx->upload_queue_lock); + mk_list_add(&entry->_head, &ctx->upload_queue); + pthread_mutex_unlock(&ctx->upload_queue_lock); + + return 0; +} + +int s3_queue_add_part(struct flb_s3 *ctx, + uint64_t file_id, + uint64_t part_db_id, + uint64_t part_id, + const char *file_path, + off_t offset_start, + off_t offset_end, + const char *s3_key, + const char *upload_id, + const char *tag, + int tag_len) +{ + struct upload_queue *entry; + + entry = flb_calloc(1, sizeof(struct upload_queue)); + if (!entry) { + flb_errno(); + return -1; + } + + entry->file_id = file_id; + entry->part_db_id = part_db_id; + entry->part_id = part_id; + entry->upload_file = NULL; + entry->offset_start = offset_start; + entry->offset_end = offset_end; + entry->retry_counter = 0; + entry->upload_time = time(NULL); + entry->needs_upload_creation = 0; + + entry->file_path = flb_sds_create(file_path); + if (!entry->file_path) { + flb_errno(); + flb_free(entry); + return -1; + } + + entry->s3_key = flb_sds_create(s3_key); + if (!entry->s3_key) { + flb_errno(); + flb_sds_destroy(entry->file_path); + flb_free(entry); + return -1; + } + + entry->upload_id = 
flb_sds_create(upload_id); + if (!entry->upload_id) { + flb_errno(); + flb_sds_destroy(entry->file_path); + flb_sds_destroy(entry->s3_key); + flb_free(entry); + return -1; + } + + entry->tag = flb_sds_create_len(tag, tag_len); + if (!entry->tag) { + flb_errno(); + flb_sds_destroy(entry->file_path); + flb_sds_destroy(entry->s3_key); + flb_sds_destroy(entry->upload_id); + flb_free(entry); + return -1; + } + entry->tag_len = tag_len; + + pthread_mutex_lock(&ctx->upload_queue_lock); + mk_list_add(&entry->_head, &ctx->upload_queue); + pthread_mutex_unlock(&ctx->upload_queue_lock); + + return 0; +} + +int s3_queue_add_pending_file(struct flb_s3 *ctx, + uint64_t file_id, + const char *file_path, + const char *tag, + int tag_len) +{ + struct upload_queue *entry; + + entry = flb_calloc(1, sizeof(struct upload_queue)); + if (!entry) { + flb_errno(); + return -1; + } + + entry->file_id = file_id; + entry->part_db_id = 0; + entry->part_id = 0; + entry->upload_file = NULL; + entry->offset_start = 0; + entry->offset_end = 0; + entry->retry_counter = 0; + entry->upload_time = time(NULL); + entry->needs_upload_creation = 1; + entry->s3_key = NULL; + entry->upload_id = NULL; + + entry->file_path = flb_sds_create(file_path); + if (!entry->file_path) { + flb_errno(); + flb_free(entry); + return -1; + } + + entry->tag = flb_sds_create_len(tag, tag_len); + if (!entry->tag) { + flb_errno(); + flb_sds_destroy(entry->file_path); + flb_free(entry); + return -1; + } + entry->tag_len = tag_len; + + pthread_mutex_lock(&ctx->upload_queue_lock); + mk_list_add(&entry->_head, &ctx->upload_queue); + pthread_mutex_unlock(&ctx->upload_queue_lock); + + return 0; +} + +/* + * Free queue entry memory without removing from list. + * Used when the entry has already been removed from list by caller. + */ +static void s3_queue_entry_destroy(struct flb_s3 *ctx, struct upload_queue *entry) +{ + if (!entry) { + return; + } + + if (entry->tag) { + flb_sds_destroy(entry->tag); + } + + if (entry->file_path) { + flb_sds_destroy(entry->file_path); + } + + if (entry->s3_key) { + flb_sds_destroy(entry->s3_key); + } + + if (entry->upload_id) { + flb_sds_destroy(entry->upload_id); + } + + flb_free(entry); +} + +/* + * Remove entry from list and free memory. + * Used during cleanup/shutdown. 
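+ *
+ * Illustrative shutdown-time usage (a sketch, not a call site added by this
+ * patch; it assumes no worker thread is still consuming the queue):
+ *
+ *     struct mk_list *head;
+ *     struct mk_list *tmp;
+ *     struct upload_queue *entry;
+ *
+ *     pthread_mutex_lock(&ctx->upload_queue_lock);
+ *     mk_list_foreach_safe(head, tmp, &ctx->upload_queue) {
+ *         entry = mk_list_entry(head, struct upload_queue, _head);
+ *         s3_queue_remove(ctx, entry);
+ *     }
+ *     pthread_mutex_unlock(&ctx->upload_queue_lock);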
+ */ +void s3_queue_remove(struct flb_s3 *ctx, struct upload_queue *entry) +{ + if (!entry) { + return; + } + + mk_list_del(&entry->_head); + s3_queue_entry_destroy(ctx, entry); +} + +static int is_queue_entry_valid(struct upload_queue *entry, struct flb_s3 *ctx) +{ + /* Basic NULL checks */ + if (!entry || !entry->tag) { + flb_plg_warn(ctx->ins, "Invalid queue entry: NULL entry or tag"); + return FLB_FALSE; + } + + /* With DB tracking (file_id > 0) - minimal checks */ + if (entry->file_id > 0) { + /* With DB: actual file state will be checked during upload */ + return FLB_TRUE; + } + + /* Without DB tracking (file_id == 0) - detailed checks needed */ + if (!entry->upload_file) { + flb_plg_warn(ctx->ins, "Invalid entry without DB: missing upload_file (tag=%s)", entry->tag); + return FLB_FALSE; + } + + if (entry->upload_file->locked == FLB_FALSE) { + flb_plg_warn(ctx->ins, "Invalid entry without DB: file not locked (tag=%s)", entry->tag); + return FLB_FALSE; + } + + if (entry->upload_file->size <= 0) { + flb_plg_warn(ctx->ins, "Invalid entry without DB: zero size (tag=%s)", entry->tag); + return FLB_FALSE; + } + + return FLB_TRUE; +} + +static int is_ready_to_upload(struct upload_queue *entry, time_t now) +{ + return (now >= entry->upload_time); +} + +int s3_queue_buffer_chunk(void *out_context, + struct s3_file *upload_file, + flb_sds_t chunk, + int chunk_size, + const char *tag, + int tag_len, + time_t file_first_log_time) +{ + struct flb_s3 *ctx = out_context; + int ret; + + ret = s3_store_buffer_put(ctx, upload_file, tag, tag_len, + chunk, (size_t)chunk_size, file_first_log_time); + flb_sds_destroy(chunk); + + if (ret < 0) { + flb_plg_warn(ctx->ins, "Failed to buffer chunk. " + "Data order preservation may be compromised"); + return -1; + } + + return 0; +} + +/* + * Upload part with DB tracking (new, granular resume support) + * - Uses part_db_id to track individual part upload + * - Supports part-level resume after crash + * - Updates database after each part upload + */ +static int upload_part_with_db_tracking(struct flb_s3 *ctx, struct upload_queue *entry) +{ + struct multipart_upload m_upload; + flb_sds_t pre_signed_url = NULL; + int ret; + + /* Check if exit is in progress */ + if (ctx->is_exiting == FLB_TRUE) { + flb_plg_debug(ctx->ins, "Upload interrupted: exit in progress"); + return FLB_RETRY; + } + + if (!entry->file_path || !entry->s3_key || !entry->upload_id) { + flb_plg_error(ctx->ins, "Part entry missing required fields"); + return -1; + } + + /* Setup minimal multipart_upload structure */ + memset(&m_upload, 0, sizeof(m_upload)); + m_upload.s3_key = entry->s3_key; + m_upload.upload_id = entry->upload_id; + m_upload.part_number = (int)entry->part_id + 1; /* AWS uses 1-based part numbers */ + m_upload.tag = entry->tag; + + /* Mark part as in_progress */ + if (ctx->blob_db.db) { + flb_blob_db_file_part_in_progress(&ctx->blob_db, entry->part_db_id, 1); + } + + /* Fetch presigned URL */ + ret = s3_auth_fetch_presigned_url(ctx, &pre_signed_url, + S3_PRESIGNED_URL_UPLOAD_PART, + entry->s3_key, entry->upload_id, + m_upload.part_number); + if (ret < 0) { + flb_plg_error(ctx->ins, "Failed to fetch presigned URL for part upload"); + if (ctx->blob_db.db) { + flb_blob_db_file_part_in_progress(&ctx->blob_db, entry->part_db_id, 0); + } + return FLB_RETRY; + } + + /* Upload the part */ + ret = s3_multipart_upload_file_part(ctx, entry->file_path, + entry->offset_start, entry->offset_end, + &m_upload, pre_signed_url); + flb_sds_destroy(pre_signed_url); + + if (ret == 0) { + /* Success - 
mark part as uploaded and save ETag */ + if (ctx->blob_db.db) { + /* Save ETag to database */ + if (m_upload.part_number > 0 && m_upload.part_number <= 10000 && + m_upload.etags[m_upload.part_number - 1]) { + flb_blob_db_file_part_update_remote_id(&ctx->blob_db, entry->part_db_id, + m_upload.etags[m_upload.part_number - 1]); + /* Free the SDS string after it's been saved to database */ + flb_sds_destroy(m_upload.etags[m_upload.part_number - 1]); + m_upload.etags[m_upload.part_number - 1] = NULL; + } + flb_blob_db_file_part_uploaded(&ctx->blob_db, entry->part_db_id); + } + + /* Check if all parts are uploaded and complete if so */ + ret = check_and_complete_multipart(ctx, entry->file_id, entry->s3_key); + if (ret < 0) { + flb_plg_warn(ctx->ins, "Failed to complete multipart upload for file_id=%"PRIu64, + entry->file_id); + } + + return FLB_OK; + } + else { + /* Upload failed - clean up any allocated etag */ + if (m_upload.part_number > 0 && m_upload.part_number <= 10000 && + m_upload.etags[m_upload.part_number - 1]) { + flb_sds_destroy(m_upload.etags[m_upload.part_number - 1]); + m_upload.etags[m_upload.part_number - 1] = NULL; + } + + flb_plg_warn(ctx->ins, "Failed to upload part %"PRIu64" of file_id=%"PRIu64, + entry->part_id, entry->file_id); + + if (ctx->blob_db.db) { + flb_blob_db_file_part_in_progress(&ctx->blob_db, entry->part_db_id, 0); + } + + return FLB_RETRY; + } +} + +/* + * Upload without DB tracking (no resume support) + * - Uses upload_file pointer directly + * - Failed uploads retry from beginning + * - Lower overhead for small files + */ +static int upload_without_db_tracking(struct flb_s3 *ctx, struct upload_queue *entry) +{ + flb_sds_t buffer = NULL; + size_t buffer_size; + time_t file_first_log_time; + int ret; + + file_first_log_time = entry->upload_file ? 
+ entry->upload_file->first_log_time : time(NULL); + + /* Format chunk data */ + ret = s3_format_chunk(ctx, entry->upload_file, &buffer, &buffer_size); + if (ret < 0) { + flb_plg_error(ctx->ins, "Failed to format upload file for tag %s", + entry->tag); + return -1; + } + + /* Upload to S3 */ + ret = s3_upload_file(ctx, buffer, buffer_size, + entry->tag, entry->tag_len, file_first_log_time); + flb_sds_destroy(buffer); + + if (ret == FLB_OK) { + if (entry->upload_file) { + s3_store_file_delete(ctx, entry->upload_file); + } + return FLB_OK; + } + + if (entry->upload_file) { + s3_store_file_unlock(entry->upload_file); + entry->upload_file->failures++; + } + + return FLB_RETRY; +} + +/* + * Check if all parts are uploaded and complete multipart upload + * Uses the s3_key from the part entry to ensure consistency with the upload + */ +static int check_and_complete_multipart(struct flb_s3 *ctx, uint64_t file_id, const char *s3_key) +{ + uint64_t db_file_id; + cfl_sds_t file_path = NULL; + cfl_sds_t part_ids = NULL; + cfl_sds_t source = NULL; + cfl_sds_t file_remote_id = NULL; + cfl_sds_t file_tag = NULL; + time_t file_created = 0; + int part_count; + struct multipart_upload m_upload; + flb_sds_t pre_signed_url = NULL; + int ret; + int i; + + if (!ctx->blob_db.db) { + return 0; + } + + /* Check if file has all parts uploaded */ + ret = flb_blob_db_file_fetch_oldest_ready(&ctx->blob_db, + &db_file_id, &file_path, + &part_ids, &source, + &file_remote_id, &file_tag, + &part_count, &file_created); + if (ret != 1 || db_file_id != file_id) { + /* Not ready or different file */ + return 0; + } + + /* Setup multipart_upload structure */ + memset(&m_upload, 0, sizeof(m_upload)); + + /* + * IMPORTANT: Use the s3_key from the part entry that was used during upload + * to ensure consistency. Do NOT regenerate s3_key here. 
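+     *
+     * Example of why (illustrative, assuming a time-based key format such as
+     * s3_key_format /logs/%Y/%m/%d/$TAG): parts uploaded on 2024-06-30 live
+     * under the key /logs/2024/06/30/app, but a key rebuilt with time(NULL)
+     * after midnight would resolve to /logs/2024/07/01/app, and the
+     * CompleteMultipartUpload request would then reference an object key
+     * that has no parts associated with it.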
+ */ + m_upload.s3_key = flb_sds_create(s3_key); + if (!m_upload.s3_key) { + flb_plg_error(ctx->ins, "Failed to copy S3 key for complete"); + goto cleanup; + } + + m_upload.tag = flb_sds_create(file_tag); + if (!m_upload.tag) { + flb_plg_error(ctx->ins, "Failed to create tag copy"); + goto cleanup; + } + + m_upload.upload_id = flb_sds_create(file_remote_id); + if (!m_upload.upload_id) { + flb_plg_error(ctx->ins, "Failed to create upload_id copy"); + goto cleanup; + } + + m_upload.part_number = part_count; + + /* Fetch all ETags from database */ + flb_sds_t *remote_id_list = flb_calloc(part_count, sizeof(flb_sds_t)); + if (!remote_id_list) { + goto cleanup; + } + + int remote_id_count = 0; + ret = flb_blob_db_file_fetch_part_ids(&ctx->blob_db, file_id, + remote_id_list, part_count, + &remote_id_count); + if (ret < 0 || remote_id_count != part_count) { + flb_plg_error(ctx->ins, "Failed to fetch part ETags"); + for (i = 0; i < remote_id_count; i++) { + if (remote_id_list[i]) { + flb_sds_destroy(remote_id_list[i]); + } + } + flb_free(remote_id_list); + goto cleanup; + } + + /* Copy ETags to m_upload */ + for (i = 0; i < remote_id_count && i < 10000; i++) { + m_upload.etags[i] = remote_id_list[i]; + } + flb_free(remote_id_list); + + /* Fetch presigned URL for complete */ + ret = s3_auth_fetch_presigned_url(ctx, &pre_signed_url, + S3_PRESIGNED_URL_COMPLETE_MULTIPART, + m_upload.s3_key, file_remote_id, 0); + if (ret < 0) { + flb_plg_error(ctx->ins, "Failed to fetch presigned URL for complete"); + goto cleanup; + } + + /* Complete multipart upload */ + ret = s3_multipart_complete(ctx, &m_upload, pre_signed_url); + flb_sds_destroy(pre_signed_url); + + if (ret == 0) { + flb_plg_info(ctx->ins, "Completed multipart upload: file_id=%"PRIu64" (%d parts)", + file_id, part_count); + + /* Send success notification before deleting */ + s3_blob_notify_delivery(ctx, ctx->ins->config, source, file_path, file_id, FLB_TRUE); + + /* Delete file from database */ + flb_blob_db_file_delete(&ctx->blob_db, file_id); + } + else { + flb_plg_error(ctx->ins, "Failed to complete multipart upload for file_id=%"PRIu64, + file_id); + } + +cleanup: + if (m_upload.s3_key) { + flb_sds_destroy(m_upload.s3_key); + } + if (m_upload.tag) { + flb_sds_destroy(m_upload.tag); + } + if (m_upload.upload_id) { + flb_sds_destroy(m_upload.upload_id); + } + for (i = 0; i < 10000; i++) { + if (m_upload.etags[i]) { + flb_sds_destroy(m_upload.etags[i]); + } + } + if (file_path) { + cfl_sds_destroy(file_path); + } + if (part_ids) { + cfl_sds_destroy(part_ids); + } + if (source) { + cfl_sds_destroy(source); + } + if (file_remote_id) { + cfl_sds_destroy(file_remote_id); + } + if (file_tag) { + cfl_sds_destroy(file_tag); + } + + return (ret == 0) ? 
0 : -1; +} + +static int handle_upload_failure(struct flb_s3 *ctx, + struct upload_queue *entry, + time_t now) +{ + entry->retry_counter++; + + if (entry->retry_counter >= ctx->ins->retry_limit) { + if (entry->file_id > 0) { + flb_plg_warn(ctx->ins, "File with DB tracking failed to send %d times, removing from queue", + entry->retry_counter); + /* Mark file as aborted in database */ + if (ctx->blob_db.db != NULL) { + flb_blob_db_file_set_aborted_state(&ctx->blob_db, entry->file_id, 1); + } + } + else { + flb_plg_warn(ctx->ins, "File without DB tracking failed to send %d times, " + "marking as inactive", entry->retry_counter); + if (entry->upload_file) { + s3_store_file_inactive(ctx, entry->upload_file); + } + } + /* Entry already removed from list by caller, just free memory */ + s3_queue_entry_destroy(ctx, entry); + return S3_UPLOAD_ENTRY_DESTROYED; + } + + /* Schedule retry */ + entry->upload_time = now + 2 * entry->retry_counter; + + if (entry->file_id > 0) { + /* Will retry */ + } + else { + if (entry->upload_file) { + s3_store_file_lock(entry->upload_file); + } + ctx->retry_time += 2 * entry->retry_counter; + ctx->upload_queue_success = FLB_FALSE; + } + + return S3_UPLOAD_ENTRY_REQUEUE; +} + +/* + * Process a queue entry - public function called by timer callback + * Returns: S3_QUEUE_ENTRY_SUCCESS on success (entry freed), + * S3_QUEUE_ENTRY_RETRY on failure (will retry), + * S3_QUEUE_ENTRY_INVALID if invalid (entry freed) + */ +int s3_queue_process_entry(struct flb_s3 *ctx, + struct upload_queue *entry, + time_t now) +{ + int ret; + int uses_db; + int failure_ret; + + if (!is_queue_entry_valid(entry, ctx)) { + flb_plg_warn(ctx->ins, "Invalid queue entry, removing"); + s3_queue_entry_destroy(ctx, entry); + return S3_QUEUE_ENTRY_INVALID; + } + + if (!is_ready_to_upload(entry, now)) { + return S3_QUEUE_ENTRY_RETRY; + } + + if (entry->needs_upload_creation) { + ret = s3_initiate_multipart_upload(ctx, entry->file_id, + entry->file_path, + entry->tag, entry->tag_len); + if (ret == 0) { + s3_queue_entry_destroy(ctx, entry); + return S3_QUEUE_ENTRY_SUCCESS; + } + + /* Only update DB state if database is enabled */ + if (ctx->blob_db.db != NULL) { + flb_blob_db_file_set_aborted_state(&ctx->blob_db, entry->file_id, 1); + } + s3_queue_entry_destroy(ctx, entry); + return S3_QUEUE_ENTRY_INVALID; + } + + uses_db = (entry->file_id > 0); + + if (entry->part_db_id > 0) { + ret = upload_part_with_db_tracking(ctx, entry); + } + else { + ret = upload_without_db_tracking(ctx, entry); + } + + if (ret == FLB_OK) { + s3_queue_entry_destroy(ctx, entry); + if (!uses_db) { + ctx->retry_time = 0; + ctx->upload_queue_success = FLB_TRUE; + } + return S3_QUEUE_ENTRY_SUCCESS; + } + else { + failure_ret = handle_upload_failure(ctx, entry, now); + if (failure_ret == S3_UPLOAD_ENTRY_DESTROYED) { + return S3_QUEUE_ENTRY_INVALID; + } + return S3_QUEUE_ENTRY_RETRY; + } +} + + +/* + * Phase 3: Rebuild queue from persistent storage + * Scans all pending files and enqueues them for upload + */ +static int rebuild_queue_from_storage(struct flb_s3 *ctx) +{ + int blob_files = 0; + int log_files = 0; + int total = 0; + + /* With DB: Scan database for all pending files */ + if (ctx->blob_db.db != NULL) { + blob_files = s3_queue_recover_from_database(ctx); + + if (blob_files > 0) { + total += blob_files; + } + else if (blob_files < 0) { + flb_plg_error(ctx->ins, "Phase 3: database scan error"); + } + } + + /* Without DB: Scan fstore for buffered chunks */ + if (ctx->fs && ctx->has_old_buffers == FLB_TRUE) { + log_files = 
s3_queue_recover_from_fstore(ctx); + + if (log_files > 0) { + total += log_files; + } + } + + return total; +} + +/* + * Simplified recovery interface - Three-phase architecture + * Phase 0: Global cleanup (reset zombie parts once) + * Phase 1: State transitions (stale → pending, aborted → pending/delete) + * Phase 2: Queue rebuild (scan storage and enqueue) + */ +int s3_queue_recover_all(struct flb_s3 *ctx, struct flb_config *config) +{ + int ret; + int total_enqueued = 0; + + flb_plg_info(ctx->ins, "Starting 3-phase recovery"); + + /* Phase 0: Global cleanup - reset all zombie parts once */ + if (ctx->blob_db.db != NULL) { + flb_blob_db_lock(&ctx->blob_db); + ret = flb_blob_db_reset_zombie_parts(&ctx->blob_db); + flb_blob_db_unlock(&ctx->blob_db); + + if (ret < 0) { + flb_plg_error(ctx->ins, "Phase 0: zombie cleanup failed"); + return -1; + } + flb_plg_debug(ctx->ins, "Phase 0: zombie parts reset complete"); + } + + /* Phase 1: State transitions (stale, aborted) */ + if (ctx->blob_db.db != NULL) { + ret = s3_blob_recover_state(ctx, config); + if (ret < 0) { + flb_plg_error(ctx->ins, "Phase 1: state transitions failed"); + return -1; + } + } + + /* Phase 2: Rebuild queue from storage */ + total_enqueued = rebuild_queue_from_storage(ctx); + + if (total_enqueued < 0) { + flb_plg_error(ctx->ins, "Phase 2 failed"); + return -1; + } + + if (total_enqueued > 0) { + flb_plg_info(ctx->ins, "Recovery complete: enqueued %d file(s)", total_enqueued); + } + else { + flb_plg_info(ctx->ins, "Recovery complete: no buffered data found"); + } + + return total_enqueued; +} + +/* + * Resume-aware recovery - File-level scan + * Strategy: + * 1. Query all files that need processing (not aborted, has pending parts) + * 2. For each file: + * a) Has upload_id: Resume by enqueueing unuploaded parts + * b) No upload_id: Create new multipart upload and enqueue all parts + */ +int s3_queue_recover_from_database(struct flb_s3 *ctx) +{ + uint64_t file_id; + cfl_sds_t file_path = NULL; + cfl_sds_t destination = NULL; + cfl_sds_t remote_id = NULL; + cfl_sds_t tag = NULL; + int part_count; + int total_files = 0; + int resumed_files = 0; + int new_files = 0; + int failed_files = 0; + int skipped_files = 0; + int total_enqueued = 0; + int ret; + int progress_interval = 5; + + if (!ctx->blob_db.db) { + return 0; + } + + while (1) { + /* Get next pending file using database interface */ + ret = flb_blob_db_file_get_next_pending(&ctx->blob_db, + &file_id, + &file_path, + &destination, + &remote_id, + &tag, + &part_count); + if (ret == 0) { + /* No more files */ + break; + } + if (ret < 0) { + flb_plg_error(ctx->ins, "Recovery query error"); + break; + } + + /* Check endpoint match */ + if (!destination || !ctx->endpoint || strcmp(destination, ctx->endpoint) != 0) { + flb_plg_debug(ctx->ins, "Skipping file_id=%" PRIu64 " (endpoint mismatch)", file_id); + skipped_files++; + + /* Cleanup allocated strings */ + if (file_path) cfl_sds_destroy(file_path); + if (destination) cfl_sds_destroy(destination); + if (remote_id) cfl_sds_destroy(remote_id); + if (tag) cfl_sds_destroy(tag); + + continue; + } + + /* + * CRITICAL FIX: Mark all parts as in_progress immediately + * This prevents the file from being returned by the next query, + * avoiding infinite loop where the same file is processed repeatedly. 
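+         *
+         * The resulting per-part state flow is roughly:
+         *
+         *     pending -> in_progress (set here) -> uploaded (set after the
+         *     part is sent)
+         *
+         * A crash between the in_progress and uploaded transitions leaves a
+         * "zombie" part; that case is handled by the Phase 0 call to
+         * flb_blob_db_reset_zombie_parts() on the next start, not by this
+         * query.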
+ */ + ret = flb_blob_db_file_parts_in_progress(&ctx->blob_db, file_id, 1); + if (ret < 0) { + flb_plg_error(ctx->ins, "Failed to mark parts as in_progress for file_id=%" PRIu64, file_id); + + /* Cleanup and skip this file */ + if (file_path) cfl_sds_destroy(file_path); + if (destination) cfl_sds_destroy(destination); + if (remote_id) cfl_sds_destroy(remote_id); + if (tag) cfl_sds_destroy(tag); + + failed_files++; + continue; + } + + total_files++; + + /* Decide: Resume or Create New */ + if (remote_id && cfl_sds_len(remote_id) > 0) { + /* Validate upload_id before resuming */ + /* + * KNOWN LIMITATION: Generate S3 key for validation + * If s3_key_format contains dynamic time variables (e.g., %Y/%m/%d) and recovery + * happens after a date boundary, the regenerated key may differ from the original, + * causing false "NoSuchUpload" and unnecessary file re-upload. + * + * To fully resolve this would require storing the actual S3 key string in the database, + * which is not currently supported. Users should use static key formats for reliable + * recovery, or accept potential re-uploads when using dynamic time-based keys. + */ + flb_sds_t s3_key = flb_get_s3_key(ctx->s3_key_format, time(NULL), tag, + ctx->tag_delimiters, ctx->seq_index, file_path); + + if (!s3_key) { + flb_plg_error(ctx->ins, "Failed to generate S3 key for validation"); + /* Treat as validation failure */ + ret = -1; + } else { + flb_plg_debug(ctx->ins, "Validating upload_id for file_id=%" PRIu64, file_id); + ret = s3_multipart_check_upload_exists(ctx, s3_key, remote_id); + flb_sds_destroy(s3_key); + } + + if (ret == 1) { + /* Upload exists: Resume by enqueueing unuploaded parts */ + resumed_files++; + flb_plg_debug(ctx->ins, "Upload_id valid, resuming file_id=%" PRIu64, file_id); + + ret = enqueue_file_parts_for_resume(ctx, file_id, file_path, + remote_id, tag, cfl_sds_len(tag)); + if (ret > 0) { + total_enqueued += ret; + } + else { + failed_files++; + } + } + else if (ret == 0) { + /* Upload does not exist (NoSuchUpload): Create new */ + flb_plg_warn(ctx->ins, "Upload_id expired for file_id=%" PRIu64 ", creating new upload", file_id); + new_files++; + + /* Clear invalid upload_id from database */ + flb_blob_file_update_remote_id(&ctx->blob_db, file_id, ""); + + /* Reset parts to allow re-upload */ + flb_blob_db_file_reset_upload_states(&ctx->blob_db, file_id); + + /* Create new multipart upload */ + ret = s3_initiate_multipart_upload(ctx, file_id, file_path, + tag, cfl_sds_len(tag)); + if (ret == 0) { + total_enqueued += part_count; + } + else { + failed_files++; + flb_plg_error(ctx->ins, "Failed to create new upload for file_id=%" PRIu64, file_id); + flb_blob_db_file_set_aborted_state(&ctx->blob_db, file_id, 1); + } + } + else { + /* Network error or API error: Treat as failure */ + flb_plg_error(ctx->ins, "Failed to validate upload_id for file_id=%" PRIu64 ", marking as failed", file_id); + failed_files++; + flb_blob_db_file_set_aborted_state(&ctx->blob_db, file_id, 1); + } + } + else { + /* Create New: File needs new multipart upload */ + new_files++; + + /* Show progress for large batch operations */ + if (new_files % progress_interval == 0) { + flb_plg_info(ctx->ins, "Creating multipart uploads: progress %d/%d files...", + new_files, total_files); + } + + ret = s3_initiate_multipart_upload(ctx, file_id, file_path, + tag, cfl_sds_len(tag)); + if (ret == 0) { + total_enqueued += part_count; + } + else { + failed_files++; + flb_plg_error(ctx->ins, "Failed to create upload for file_id=%" PRIu64, file_id); + 
flb_blob_db_file_set_aborted_state(&ctx->blob_db, file_id, 1); + } + } + + /* Cleanup allocated strings */ + if (file_path) cfl_sds_destroy(file_path); + if (destination) cfl_sds_destroy(destination); + if (remote_id) cfl_sds_destroy(remote_id); + if (tag) cfl_sds_destroy(tag); + } + + if (total_files > 0 || skipped_files > 0) { + flb_plg_info(ctx->ins, "Recovery: %d file(s) enqueued (%d resumed, %d new), " + "%d skipped, %d failed", + total_enqueued, resumed_files, new_files, skipped_files, failed_files); + } + + return total_enqueued; +} + +/* Helper: Enqueue unuploaded parts for a file with existing upload_id */ +static int enqueue_file_parts_for_resume(struct flb_s3 *ctx, + uint64_t file_id, + const char *file_path, + const char *upload_id, + const char *tag, + int tag_len) +{ + struct multipart_upload *m_upload = NULL; + uint64_t *part_db_ids = NULL; + uint64_t *part_nums = NULL; + off_t *offset_starts = NULL; + off_t *offset_ends = NULL; + int part_count = 0; + int enqueued = 0; + int ret; + int i; + + /* Get s3_key by creating upload structure */ + m_upload = s3_multipart_upload_create(ctx, tag, tag_len, file_path); + if (!m_upload) { + return -1; + } + + /* Get all parts for this file */ + ret = flb_blob_db_file_fetch_all_parts(&ctx->blob_db, file_id, + &part_db_ids, &part_nums, + &offset_starts, &offset_ends, + &part_count); + if (ret < 0 || part_count == 0) { + s3_multipart_upload_destroy(m_upload); + return -1; + } + + /* Enqueue only unuploaded parts */ + for (i = 0; i < part_count; i++) { + /* Check if this part is already uploaded using database interface */ + int uploaded = 0; + + ret = flb_blob_db_file_part_check_uploaded(&ctx->blob_db, part_db_ids[i], &uploaded); + if (ret < 0) { + flb_plg_warn(ctx->ins, "Failed to check upload status for part_id=%" PRIu64, part_db_ids[i]); + continue; + } + + /* Skip already uploaded parts */ + if (uploaded == 1) { + continue; + } + + /* Enqueue this part */ + ret = s3_queue_add_part(ctx, file_id, part_db_ids[i], part_nums[i], + file_path, offset_starts[i], offset_ends[i], + m_upload->s3_key, upload_id, + tag, tag_len); + if (ret == 0) { + enqueued++; + } + } + + /* Cleanup */ + if (part_db_ids) flb_free(part_db_ids); + if (part_nums) flb_free(part_nums); + if (offset_starts) flb_free(offset_starts); + if (offset_ends) flb_free(offset_ends); + s3_multipart_upload_destroy(m_upload); + + return enqueued; +} + +/* Recover buffered files from fstore during restart (without DB tracking) */ +int s3_queue_recover_from_fstore(struct flb_s3 *ctx) +{ + struct s3_file *chunk; + struct flb_fstore_file *fsf; + struct flb_fstore_stream *fs_stream; + struct mk_list *s_head; + struct mk_list *head; + struct mk_list *tmp; + int total_files = 0; + int ret; + + if (!ctx->fs) { + return 0; + } + + /* Iterate through all streams */ + mk_list_foreach(s_head, &ctx->fs->streams) { + fs_stream = mk_list_entry(s_head, struct flb_fstore_stream, _head); + + /* Skip metadata stream */ + if (fs_stream == ctx->stream_metadata) { + continue; + } + + /* Process all files in this stream */ + mk_list_foreach_safe(head, tmp, &fs_stream->files) { + fsf = mk_list_entry(head, struct flb_fstore_file, _head); + chunk = fsf->data; + + if (!chunk) { + continue; + } + + if (chunk->locked == FLB_TRUE) { + continue; + } + + if (chunk->failures >= ctx->ins->retry_limit) { + flb_plg_warn(ctx->ins, + "Chunk failed %d times, marking inactive (tag=%s)", + chunk->failures, (char*)fsf->meta_buf); + flb_fstore_file_inactive(ctx->fs, fsf); + continue; + } + + /* Add to worker queue (without 
DB tracking: file_id=0) */ + s3_store_file_lock(chunk); + ret = s3_queue_add_file(ctx, 0, chunk, NULL, + (const char*)fsf->meta_buf, + fsf->meta_size); + if (ret == 0) { + total_files++; + } + else { + s3_store_file_unlock(chunk); + } + } + } + + return total_files; +} diff --git a/plugins/out_s3/s3_queue.h b/plugins/out_s3/s3_queue.h new file mode 100644 index 00000000000..a3866e4f6ff --- /dev/null +++ b/plugins/out_s3/s3_queue.h @@ -0,0 +1,85 @@ +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ + +/* Fluent Bit + * ========== + * Copyright (C) 2015-2024 The Fluent Bit Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef FLB_OUT_S3_QUEUE_H +#define FLB_OUT_S3_QUEUE_H + +#include "s3.h" + +void s3_queue_remove(struct flb_s3 *ctx, struct upload_queue *entry); + +int s3_queue_buffer_chunk(void *out_context, struct s3_file *upload_file, + flb_sds_t chunk, int chunk_size, + const char *tag, int tag_len, + time_t file_first_log_time); + +/* + * Legacy file-level queue add (backward compatibility) + * Used for: + * - file_id == 0: Without DB tracking (chunks) + * - file_id > 0: File-level DB tracking (legacy mode) + */ +int s3_queue_add_file(struct flb_s3 *ctx, + uint64_t file_id, + struct s3_file *upload_file, + const char *file_path, + const char *tag, + int tag_len); + +int s3_queue_add_part(struct flb_s3 *ctx, + uint64_t file_id, + uint64_t part_db_id, + uint64_t part_id, + const char *file_path, + off_t offset_start, + off_t offset_end, + const char *s3_key, + const char *upload_id, + const char *tag, + int tag_len); + +int s3_queue_add_pending_file(struct flb_s3 *ctx, + uint64_t file_id, + const char *file_path, + const char *tag, + int tag_len); + +/* + * Process a queue entry (called by timer callback) + * Returns: 1 on success, -1 on failure (will retry), 0 if invalid (removed) + */ +int s3_queue_process_entry(struct flb_s3 *ctx, + struct upload_queue *entry, + time_t now); + +/* + * Unified recovery interface - Three-phase architecture + * Phase 1: Cleanup dirty states (zombie parts, locked chunks) + * Phase 2: State transitions (stale → pending, aborted → pending/delete) + * Phase 3: Queue rebuild (scan storage and enqueue all pending files) + * + * This handles both DB-tracked files and fstore-buffered files. 
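+ *
+ * Typical call site (illustrative sketch; the actual caller in the plugin
+ * initialization path is not shown here):
+ *
+ *     ret = s3_queue_recover_all(ctx, config);
+ *     if (ret < 0) {
+ *         flb_plg_error(ctx->ins, "recovery failed");
+ *         return -1;
+ *     }
+ *
+ * A return value >= 0 is the number of files/chunks that were enqueued for
+ * upload.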
+ */ +int s3_queue_recover_all(struct flb_s3 *ctx, struct flb_config *config); + +/* Internal recovery functions (can also be called independently if needed) */ +int s3_queue_recover_from_database(struct flb_s3 *ctx); +int s3_queue_recover_from_fstore(struct flb_s3 *ctx); + +#endif diff --git a/plugins/out_s3/s3_store.c b/plugins/out_s3/s3_store.c index f9daefe2569..3a2078a2fa0 100644 --- a/plugins/out_s3/s3_store.c +++ b/plugins/out_s3/s3_store.c @@ -20,24 +20,28 @@ #include #include #include +#include +#include #include "s3.h" #include "s3_store.h" -static int s3_store_under_travis_ci() -{ - - if (getenv("CI") != NULL && getenv("TRAVIS") != NULL) { - return FLB_TRUE; - } +#define CIO_FILE_HEADER_SIZE 24 +#define CIO_MAGIC_BYTE_1 0xC1 +#define CIO_MAGIC_BYTE_2 0x00 +#define CIO_META_LENGTH_OFFSET 22 +#define BUFFER_WARNING_THRESHOLD 0.95 - return FLB_FALSE; +static int is_running_in_ci(void) +{ + return (getenv("CI") != NULL || + getenv("TRAVIS") != NULL || + getenv("GITHUB_ACTIONS") != NULL || + getenv("GITLAB_CI") != NULL || + getenv("JENKINS_URL") != NULL); } -/* - * Simple and fast hashing algorithm to create keys in the local buffer - */ -static flb_sds_t gen_store_filename(const char *tag) +static flb_sds_t generate_filename_hash(const char *tag) { int c; unsigned long hash = 5381; @@ -46,282 +50,384 @@ static flb_sds_t gen_store_filename(const char *tag) flb_sds_t tmp; struct flb_time tm; - /* get current time */ flb_time_get(&tm); - /* compose hash */ while ((c = *tag++)) { - hash = ((hash << 5) + hash) + c; /* hash * 33 + c */ + hash = ((hash << 5) + hash) + c; } hash2 = (unsigned long) hash2 * tm.tm.tv_sec * tm.tm.tv_nsec; - /* flb_sds_printf allocs if the incoming sds is not at least 64 bytes */ hash_str = flb_sds_create_size(64); if (!hash_str) { flb_errno(); return NULL; } + tmp = flb_sds_printf(&hash_str, "%lu-%lu", hash, hash2); if (!tmp) { flb_errno(); flb_sds_destroy(hash_str); return NULL; } - hash_str = tmp; - return hash_str; + return tmp; } -/* Retrieve a candidate s3 local file using the tag */ -struct s3_file *s3_store_file_get(struct flb_s3 *ctx, const char *tag, - int tag_len) +struct s3_file *s3_store_file_get(struct flb_s3 *ctx, const char *tag, int tag_len) { struct mk_list *head; struct mk_list *tmp; - struct flb_fstore_file *fsf = NULL; + struct flb_fstore_file *fsf; struct s3_file *s3_file; - /* - * Based in the current ctx->stream_name, locate a candidate file to - * store the incoming data using as a lookup pattern the content Tag. 
- */ mk_list_foreach_safe(head, tmp, &ctx->stream_active->files) { fsf = mk_list_entry(head, struct flb_fstore_file, _head); - /* skip and warn on partially initialized chunks */ - if (fsf->data == NULL) { - flb_plg_warn(ctx->ins, "BAD: found flb_fstore_file with NULL data reference, tag=%s, file=%s, will try to delete", tag, fsf->name); + if (!fsf->data) { + flb_plg_warn(ctx->ins, "Found file with NULL data: %s, deleting", fsf->name); flb_fstore_file_delete(ctx->fs, fsf); + continue; } if (fsf->meta_size != tag_len) { - fsf = NULL; continue; } - /* skip locked chunks */ s3_file = fsf->data; if (s3_file->locked == FLB_TRUE) { - fsf = NULL; continue; } - /* compare meta and tag */ if (strncmp((char *) fsf->meta_buf, tag, tag_len) == 0) { - break; + return s3_file; + } + } + + return NULL; +} + +static int check_buffer_space(struct flb_s3 *ctx, size_t new_bytes) +{ + size_t space_remaining; + size_t new_total; + + if (ctx->store_dir_limit_size == 0) { + return 0; + } + + new_total = ctx->current_buffer_size + new_bytes; + + if (new_total >= ctx->store_dir_limit_size) { + flb_plg_error(ctx->ins, "Buffer is full: current=%zu, new=%zu, limit=%zu bytes", + ctx->current_buffer_size, new_bytes, ctx->store_dir_limit_size); + return -1; + } + + space_remaining = ctx->store_dir_limit_size - new_total; + if ((double)space_remaining / ctx->store_dir_limit_size < (1.0 - BUFFER_WARNING_THRESHOLD)) { + flb_plg_warn(ctx->ins, "Buffer almost full: %zu/%zu bytes used", + new_total, ctx->store_dir_limit_size); + } + + return 0; +} + +static void cleanup_failed_file(struct flb_s3 *ctx, + struct flb_fstore_file *fsf, + struct s3_file *s3_file, + const char *reason) +{ + flb_plg_warn(ctx->ins, "Deleting buffer file: %s", reason); + + if (s3_file) { + if (s3_file->stream_path) { + flb_sds_destroy(s3_file->stream_path); } + flb_free(s3_file); + } - /* not found, invalidate the reference */ - fsf = NULL; + if (fsf) { + flb_fstore_file_delete(ctx->fs, fsf); + } +} + +static struct s3_file *create_new_buffer_file(struct flb_s3 *ctx, + const char *tag, + int tag_len, + size_t bytes, + time_t file_first_log_time) +{ + flb_sds_t filename; + struct flb_fstore_file *fsf; + struct s3_file *s3_file = NULL; + int ret; + + filename = generate_filename_hash(tag); + if (!filename) { + flb_plg_error(ctx->ins, "Failed to generate filename"); + return NULL; } + fsf = flb_fstore_file_create(ctx->fs, ctx->stream_active, filename, bytes); + flb_sds_destroy(filename); + if (!fsf) { + flb_plg_error(ctx->ins, "Failed to create file in store"); + return NULL; + } + + ret = flb_fstore_file_meta_set(ctx->fs, fsf, (char *) tag, tag_len); + if (ret == -1) { + cleanup_failed_file(ctx, fsf, NULL, "metadata write failed"); + return NULL; + } + + s3_file = flb_calloc(1, sizeof(struct s3_file)); + if (!s3_file) { + flb_errno(); + cleanup_failed_file(ctx, fsf, NULL, "S3 context allocation failed"); + return NULL; + } + + s3_file->stream_path = flb_sds_create(ctx->stream_active->path); + if (!s3_file->stream_path) { + flb_errno(); + cleanup_failed_file(ctx, fsf, s3_file, "stream path allocation failed"); return NULL; } - return fsf->data; + s3_file->fsf = fsf; + s3_file->first_log_time = file_first_log_time; + s3_file->create_time = time(NULL); + s3_file->size = 0; + s3_file->locked = FLB_FALSE; + + fsf->data = s3_file; + + return s3_file; } -/* Append data to a new or existing fstore file */ -int s3_store_buffer_put(struct flb_s3 *ctx, struct s3_file *s3_file, - const char *tag, int tag_len, - char *data, size_t bytes, +int 
s3_store_buffer_put(struct flb_s3 *ctx, + struct s3_file *s3_file, + const char *tag, + int tag_len, + char *data, + size_t bytes, time_t file_first_log_time) { - int ret; - flb_sds_t name; struct flb_fstore_file *fsf; - size_t space_remaining; + int ret; - if (ctx->store_dir_limit_size > 0 && ctx->current_buffer_size + bytes >= ctx->store_dir_limit_size) { - flb_plg_error(ctx->ins, "Buffer is full: current_buffer_size=%zu, new_data=%zu, store_dir_limit_size=%zu bytes", - ctx->current_buffer_size, bytes, ctx->store_dir_limit_size); + ret = check_buffer_space(ctx, bytes); + if (ret == -1) { return -1; } - /* If no target file was found, create a new one */ if (!s3_file) { - name = gen_store_filename(tag); - if (!name) { - flb_plg_error(ctx->ins, "could not generate chunk file name"); - return -1; - } - - /* Create the file */ - fsf = flb_fstore_file_create(ctx->fs, ctx->stream_active, name, bytes); - if (!fsf) { - flb_plg_error(ctx->ins, "could not create the file '%s' in the store", - name); - flb_sds_destroy(name); - return -1; - } - flb_sds_destroy(name); - - /* Write tag as metadata */ - ret = flb_fstore_file_meta_set(ctx->fs, fsf, (char *) tag, tag_len); - if (ret == -1) { - flb_plg_error(ctx->ins, "error writing tag metadata"); - flb_plg_warn(ctx->ins, "Deleting buffer file because metadata could not be written"); - flb_fstore_file_delete(ctx->fs, fsf); - return -1; - } - - /* Allocate local context */ - s3_file = flb_calloc(1, sizeof(struct s3_file)); + s3_file = create_new_buffer_file(ctx, tag, tag_len, bytes, file_first_log_time); if (!s3_file) { - flb_errno(); - flb_plg_error(ctx->ins, "cannot allocate s3 file context"); - flb_plg_warn(ctx->ins, "Deleting buffer file because S3 context creation failed"); - flb_fstore_file_delete(ctx->fs, fsf); return -1; } - s3_file->fsf = fsf; - s3_file->first_log_time = file_first_log_time; - s3_file->create_time = time(NULL); - - /* Use fstore opaque 'data' reference to keep our context */ - fsf->data = s3_file; - } - else { - fsf = s3_file->fsf; } - /* Append data to the target file */ + fsf = s3_file->fsf; + ret = flb_fstore_file_append(fsf, data, bytes); if (ret != 0) { - flb_plg_error(ctx->ins, "error writing data to local s3 file"); + flb_plg_error(ctx->ins, "Failed to write data to file"); return -1; } + s3_file->size += bytes; ctx->current_buffer_size += bytes; - /* if buffer is 95% full, warn user */ - if (ctx->store_dir_limit_size > 0) { - space_remaining = ctx->store_dir_limit_size - ctx->current_buffer_size; - if ((space_remaining * 20) < ctx->store_dir_limit_size) { - flb_plg_warn(ctx->ins, "Buffer is almost full: current_buffer_size=%zu, store_dir_limit_size=%zu bytes", - ctx->current_buffer_size, ctx->store_dir_limit_size); - return -1; - } + return 0; +} + +static size_t calculate_chunk_data_size(struct flb_s3 *ctx, + const char *stream_path, + const char *chunk_name, + size_t meta_size) +{ + char chunk_path[PATH_MAX]; + struct stat st; + int ret; + + if (!chunk_name || strlen(chunk_name) == 0) { + return 0; } - return 0; + ret = snprintf(chunk_path, sizeof(chunk_path), "%s/%s", stream_path, chunk_name); + if (ret < 0 || ret >= sizeof(chunk_path)) { + flb_plg_warn(ctx->ins, "Chunk path too long"); + return 0; + } + + if (stat(chunk_path, &st) != 0) { + return 0; + } + + if (st.st_size <= CIO_FILE_HEADER_SIZE + meta_size) { + return 0; + } + + return st.st_size - CIO_FILE_HEADER_SIZE - meta_size; } -static int set_files_context(struct flb_s3 *ctx) +static struct s3_file *create_file_context(struct flb_s3 *ctx, + struct 
flb_fstore_stream *fs_stream, + struct flb_fstore_file *fsf) +{ + struct s3_file *s3_file; + size_t chunk_size; + + s3_file = flb_calloc(1, sizeof(struct s3_file)); + if (!s3_file) { + flb_errno(); + return NULL; + } + + s3_file->stream_path = flb_sds_create(fs_stream->path); + if (!s3_file->stream_path) { + flb_errno(); + flb_free(s3_file); + return NULL; + } + + s3_file->fsf = fsf; + s3_file->first_log_time = time(NULL); + s3_file->create_time = time(NULL); + s3_file->locked = FLB_FALSE; + + if (fsf->chunk && fsf->chunk->name) { + chunk_size = calculate_chunk_data_size(ctx, fs_stream->path, + fsf->chunk->name, fsf->meta_size); + s3_file->size = chunk_size; + ctx->current_buffer_size += chunk_size; + } + else { + s3_file->size = 0; + } + + fsf->data = s3_file; + + return s3_file; +} + +static int restore_stream_files(struct flb_s3 *ctx, struct flb_fstore_stream *fs_stream) { - struct mk_list *head; struct mk_list *f_head; - struct flb_fstore_stream *fs_stream; struct flb_fstore_file *fsf; struct s3_file *s3_file; - mk_list_foreach(head, &ctx->fs->streams) { - fs_stream = mk_list_entry(head, struct flb_fstore_stream, _head); + if (!fs_stream->path) { + flb_plg_warn(ctx->ins, "Stream has NULL path, skipping"); + return 0; + } - /* skip current stream since it's new */ - if (fs_stream == ctx->stream_active) { + mk_list_foreach(f_head, &fs_stream->files) { + fsf = mk_list_entry(f_head, struct flb_fstore_file, _head); + + if (fsf->data) { continue; } - /* skip multi-upload */ - if (fs_stream == ctx->stream_upload) { + s3_file = create_file_context(ctx, fs_stream, fsf); + if (!s3_file) { + flb_plg_error(ctx->ins, "Failed to create file context"); continue; } + } - mk_list_foreach(f_head, &fs_stream->files) { - fsf = mk_list_entry(f_head, struct flb_fstore_file, _head); - if (fsf->data) { - continue; - } + return 0; +} - /* Allocate local context */ - s3_file = flb_calloc(1, sizeof(struct s3_file)); - if (!s3_file) { - flb_errno(); - flb_plg_error(ctx->ins, "cannot allocate s3 file context"); - continue; - } - s3_file->fsf = fsf; - s3_file->first_log_time = time(NULL); - s3_file->create_time = time(NULL); +static int restore_buffered_files(struct flb_s3 *ctx) +{ + struct mk_list *head; + struct flb_fstore_stream *fs_stream; - /* Use fstore opaque 'data' reference to keep our context */ - fsf->data = s3_file; + mk_list_foreach(head, &ctx->fs->streams) { + fs_stream = mk_list_entry(head, struct flb_fstore_stream, _head); + + if (fs_stream == ctx->stream_active || fs_stream == ctx->stream_metadata) { + continue; } + + restore_stream_files(ctx, fs_stream); } return 0; } -/* Initialize filesystem storage for S3 plugin */ -int s3_store_init(struct flb_s3 *ctx) +static flb_sds_t create_stream_name(void) { - int type; time_t now; + struct tm tm_buf; char tmp[64]; - struct tm *tm; + + now = time(NULL); + +#ifdef FLB_SYSTEM_WINDOWS + /* Windows: gmtime_s(struct tm*, const time_t*) */ + if (gmtime_s(&tm_buf, &now) != 0) { + return NULL; + } + strftime(tmp, sizeof(tmp) - 1, "%Y-%m-%dT%H-%M-%S", &tm_buf); +#else + /* POSIX: gmtime_r(const time_t*, struct tm*) */ + if (gmtime_r(&now, &tm_buf) == NULL) { + return NULL; + } + strftime(tmp, sizeof(tmp) - 1, "%Y-%m-%dT%H:%M:%S", &tm_buf); +#endif + + return flb_sds_create(tmp); +} + +int s3_store_init(struct flb_s3 *ctx) +{ + int store_type; + flb_sds_t stream_name; struct flb_fstore *fs; struct flb_fstore_stream *fs_stream; - if (s3_store_under_travis_ci() == FLB_TRUE) { - type = FLB_FSTORE_MEM; - flb_plg_warn(ctx->ins, "Travis CI test, using s3 store memory 
backend"); - } - else { - type = FLB_FSTORE_FS; + store_type = is_running_in_ci() ? FLB_FSTORE_MEM : FLB_FSTORE_FS; + + if (store_type == FLB_FSTORE_MEM) { + flb_plg_warn(ctx->ins, "CI environment detected, using memory backend"); } - /* Initialize the storage context */ - fs = flb_fstore_create(ctx->buffer_dir, type); + fs = flb_fstore_create(ctx->buffer_dir, store_type); if (!fs) { + flb_plg_error(ctx->ins, "Failed to create file store"); return -1; } ctx->fs = fs; - /* - * On every start we create a new stream, this stream in the file system - * is directory with the name using the date like '2020-10-03T13:00:02'. So - * all the 'new' data that is generated on this process is stored there. - * - * Note that previous data in similar directories from previous runs is - * considered backlog data, in the S3 plugin we need to differenciate the - * new v/s the older buffered data. - * - * Compose a stream name... - */ - now = time(NULL); - tm = localtime(&now); - -#ifdef FLB_SYSTEM_WINDOWS - /* Windows does not allow ':' in directory names */ - strftime(tmp, sizeof(tmp) - 1, "%Y-%m-%dT%H-%M-%S", tm); -#else - strftime(tmp, sizeof(tmp) - 1, "%Y-%m-%dT%H:%M:%S", tm); -#endif - - /* Create the stream */ - fs_stream = flb_fstore_stream_create(ctx->fs, tmp); - if (!fs_stream) { - /* Upon exception abort */ - flb_plg_error(ctx->ins, "could not initialize active stream: %s", tmp); + stream_name = create_stream_name(); + if (!stream_name) { + flb_plg_error(ctx->ins, "Failed to create stream name"); flb_fstore_destroy(fs); ctx->fs = NULL; return -1; } - ctx->stream_active = fs_stream; - /* Multipart upload stream */ - fs_stream = flb_fstore_stream_create(ctx->fs, "multipart_upload_metadata"); + fs_stream = flb_fstore_stream_create(ctx->fs, stream_name); + flb_sds_destroy(stream_name); + if (!fs_stream) { - flb_plg_error(ctx->ins, "could not initialize multipart_upload stream"); + flb_plg_error(ctx->ins, "Failed to create active stream"); flb_fstore_destroy(fs); ctx->fs = NULL; return -1; } - ctx->stream_upload = fs_stream; - set_files_context(ctx); + ctx->stream_active = fs_stream; + ctx->current_buffer_size = 0; + + restore_buffered_files(ctx); + return 0; } @@ -337,33 +443,29 @@ int s3_store_exit(struct flb_s3 *ctx) return 0; } - /* release local context on non-multi upload files */ mk_list_foreach(head, &ctx->fs->streams) { fs_stream = mk_list_entry(head, struct flb_fstore_stream, _head); - if (fs_stream == ctx->stream_upload) { - continue; - } mk_list_foreach(f_head, &fs_stream->files) { fsf = mk_list_entry(f_head, struct flb_fstore_file, _head); - if (fsf->data != NULL) { + + if (fsf->data) { s3_file = fsf->data; - flb_sds_destroy(s3_file->file_path); + if (s3_file->stream_path) { + flb_sds_destroy(s3_file->stream_path); + } flb_free(s3_file); + fsf->data = NULL; } } } - if (ctx->fs) { - flb_fstore_destroy(ctx->fs); - } + flb_fstore_destroy(ctx->fs); + ctx->fs = NULL; + return 0; } -/* - * Check if the store has data. 
This function is only used on plugin - * initialization - */ int s3_store_has_data(struct flb_s3 *ctx) { struct mk_list *head; @@ -374,11 +476,7 @@ int s3_store_has_data(struct flb_s3 *ctx) } mk_list_foreach(head, &ctx->fs->streams) { - /* skip multi upload stream */ fs_stream = mk_list_entry(head, struct flb_fstore_stream, _head); - if (fs_stream == ctx->stream_upload) { - continue; - } if (mk_list_size(&fs_stream->files) > 0) { return FLB_TRUE; @@ -388,156 +486,101 @@ int s3_store_has_data(struct flb_s3 *ctx) return FLB_FALSE; } -int s3_store_has_uploads(struct flb_s3 *ctx) +int s3_store_file_inactive(struct flb_s3 *ctx, struct s3_file *s3_file) { - if (!ctx || !ctx->stream_upload) { - return FLB_FALSE; - } + struct flb_fstore_file *fsf; - if (mk_list_size(&ctx->stream_upload->files) > 0) { - return FLB_TRUE; + if (!s3_file) { + return 0; } - return FLB_FALSE; -} + fsf = s3_file->fsf; -int s3_store_file_inactive(struct flb_s3 *ctx, struct s3_file *s3_file) -{ - int ret; - struct flb_fstore_file *fsf; + /* Free allocated members before freeing the struct */ + if (s3_file->stream_path) { + flb_sds_destroy(s3_file->stream_path); + } - fsf = s3_file->fsf; flb_free(s3_file); - ret = flb_fstore_file_inactive(ctx->fs, fsf); - return ret; + return flb_fstore_file_inactive(ctx->fs, fsf); } int s3_store_file_delete(struct flb_s3 *ctx, struct s3_file *s3_file) { struct flb_fstore_file *fsf; + if (!s3_file || !s3_file->fsf) { + return 0; + } + fsf = s3_file->fsf; + + if (fsf->data != s3_file) { + return 0; + } + + fsf->data = NULL; ctx->current_buffer_size -= s3_file->size; - /* permanent deletion */ + if (s3_file->stream_path) { + flb_sds_destroy(s3_file->stream_path); + } + flb_fstore_file_delete(ctx->fs, fsf); flb_free(s3_file); return 0; } -int s3_store_file_read(struct flb_s3 *ctx, struct s3_file *s3_file, - char **out_buf, size_t *out_size) -{ - int ret; - - ret = flb_fstore_file_content_copy(ctx->fs, s3_file->fsf, - (void **) out_buf, out_size); - return ret; -} - -int s3_store_file_upload_read(struct flb_s3 *ctx, struct flb_fstore_file *fsf, - char **out_buf, size_t *out_size) -{ - int ret; - - ret = flb_fstore_file_content_copy(ctx->fs, fsf, - (void **) out_buf, out_size); - return ret; -} - -struct flb_fstore_file *s3_store_file_upload_get(struct flb_s3 *ctx, - char *key, int key_len) +FILE *flb_chunk_file_open(const char *chunk_path) { - struct mk_list *head; - struct flb_fstore_file *fsf = NULL; + FILE *fp; + unsigned char header[CIO_FILE_HEADER_SIZE]; + uint16_t meta_len; - mk_list_foreach(head, &ctx->stream_upload->files) { - fsf = mk_list_entry(head, struct flb_fstore_file, _head); - if (fsf->meta_buf == NULL) { - continue; - } - - if (fsf->meta_size != key_len ){ - continue; - } - - if (strncmp(fsf->meta_buf, key, key_len) == 0) { - break; - } - fsf = NULL; + fp = fopen(chunk_path, "rb"); + if (!fp) { + flb_error("[s3_store] Failed to open chunk file: %s", chunk_path); + return NULL; } - return fsf; -} - -/* param fsf can NULL if the file has not yet been created */ -int s3_store_file_upload_put(struct flb_s3 *ctx, - struct flb_fstore_file *fsf, flb_sds_t key, - flb_sds_t data) -{ - int ret; - flb_sds_t name; - - /* If no target file was found, create a new one */ - if (!fsf) { - name = gen_store_filename(key); - if (!name) { - flb_plg_error(ctx->ins, "could not generate chunk file name"); - return -1; - } - - /* Create the file */ - fsf = flb_fstore_file_create(ctx->fs, ctx->stream_upload, name, flb_sds_len(data)); - if (!fsf) { - flb_plg_error(ctx->ins, "could not create the file 
'%s' in the upload store", - name); - flb_sds_destroy(name); - return -1; - } - flb_sds_destroy(name); - - /* Write key as metadata */ - ret = flb_fstore_file_meta_set(ctx->fs, fsf, - key, flb_sds_len(key)); - if (ret == -1) { - flb_plg_error(ctx->ins, "error writing upload metadata"); - flb_plg_warn(ctx->ins, "Deleting s3 upload cache file because metadata could not be written"); - flb_fstore_file_delete(ctx->fs, fsf); - return -1; - } + if (fread(header, 1, CIO_FILE_HEADER_SIZE, fp) != CIO_FILE_HEADER_SIZE) { + flb_error("[s3_store] Failed to read chunk header: %s", chunk_path); + fclose(fp); + return NULL; } - /* Append data to the target file */ - ret = flb_fstore_file_append(fsf, data, flb_sds_len(data)); - if (ret != 0) { - flb_plg_error(ctx->ins, "error writing data to local s3 file"); - return -1; + if (header[0] != CIO_MAGIC_BYTE_1 || header[1] != CIO_MAGIC_BYTE_2) { + flb_error("[s3_store] Invalid chunk magic bytes: 0x%02X 0x%02X in %s", + header[0], header[1], chunk_path); + fclose(fp); + return NULL; } - return 0; -} + meta_len = ((uint16_t)header[CIO_META_LENGTH_OFFSET] << 8) | + (uint16_t)header[CIO_META_LENGTH_OFFSET + 1]; -int s3_store_file_upload_delete(struct flb_s3 *ctx, struct flb_fstore_file *fsf) -{ - /* permanent deletion */ - flb_fstore_file_delete(ctx->fs, fsf); - return 0; -} + if (meta_len > 0 && fseek(fp, meta_len, SEEK_CUR) != 0) { + flb_error("[s3_store] Failed to skip metadata (%d bytes): %s", + meta_len, chunk_path); + fclose(fp); + return NULL; + } -/* Always set an updated copy of metadata into the fs_store_file entry */ -int s3_store_file_meta_get(struct flb_s3 *ctx, struct flb_fstore_file *fsf) -{ - return flb_fstore_file_meta_get(ctx->fs, fsf); + return fp; } void s3_store_file_lock(struct s3_file *s3_file) { - s3_file->locked = FLB_TRUE; + if (s3_file) { + s3_file->locked = FLB_TRUE; + } } void s3_store_file_unlock(struct s3_file *s3_file) { - s3_file->locked = FLB_FALSE; + if (s3_file) { + s3_file->locked = FLB_FALSE; + } } diff --git a/plugins/out_s3/s3_store.h b/plugins/out_s3/s3_store.h index a8134061250..031706fc758 100644 --- a/plugins/out_s3/s3_store.h +++ b/plugins/out_s3/s3_store.h @@ -29,7 +29,7 @@ struct s3_file { size_t size; /* file size */ time_t create_time; /* creation time */ time_t first_log_time; /* first log time */ - flb_sds_t file_path; /* file path */ + flb_sds_t stream_path; /* stream directory path */ struct flb_fstore_file *fsf; /* reference to parent flb_fstore_file */ }; @@ -42,25 +42,14 @@ int s3_store_init(struct flb_s3 *ctx); int s3_store_exit(struct flb_s3 *ctx); int s3_store_has_data(struct flb_s3 *ctx); -int s3_store_has_uploads(struct flb_s3 *ctx); int s3_store_file_inactive(struct flb_s3 *ctx, struct s3_file *s3_file); struct s3_file *s3_store_file_get(struct flb_s3 *ctx, const char *tag, int tag_len); int s3_store_file_delete(struct flb_s3 *ctx, struct s3_file *s3_file); -int s3_store_file_read(struct flb_s3 *ctx, struct s3_file *s3_file, - char **out_buf, size_t *out_size); -int s3_store_file_upload_read(struct flb_s3 *ctx, struct flb_fstore_file *fsf, - char **out_buf, size_t *out_size); -struct flb_fstore_file *s3_store_file_upload_get(struct flb_s3 *ctx, - char *key, int key_len); -int s3_store_file_upload_put(struct flb_s3 *ctx, - struct flb_fstore_file *fsf, flb_sds_t key, - flb_sds_t data); -int s3_store_file_upload_delete(struct flb_s3 *ctx, struct flb_fstore_file *fsf); - -int s3_store_file_meta_get(struct flb_s3 *ctx, struct flb_fstore_file *fsf); +/* Context-free chunk file opener - can be used from 
anywhere including C++ */ +FILE *flb_chunk_file_open(const char *chunk_path); void s3_store_file_lock(struct s3_file *s3_file); void s3_store_file_unlock(struct s3_file *s3_file); diff --git a/plugins/out_s3/s3_stream.c b/plugins/out_s3/s3_stream.c new file mode 100644 index 00000000000..2a9bfccf925 --- /dev/null +++ b/plugins/out_s3/s3_stream.c @@ -0,0 +1,546 @@ +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ + +/* Fluent Bit + * ========== + * Copyright (C) 2015-2024 The Fluent Bit Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "s3.h" +#include "s3_store.h" +#include "s3_stream.h" +#include +#include +#include +#include +#include + +#define S3_STREAM_COMPRESS_BUFFER_SIZE (1024 * 1024) +#define S3_STREAM_READ_BUFFER_SIZE (1024 * 1024) + +int stream_compress_file(struct flb_s3 *ctx, + const char *input_path, + const char *output_path, + off_t offset_start, + off_t offset_end) +{ + FILE *in_fp = NULL; + FILE *out_fp = NULL; + char *read_buffer = NULL; + void *compressed_chunk = NULL; + size_t compressed_chunk_size; + size_t bytes_to_read; + size_t bytes_read; + off_t current_offset; + off_t remaining; + int ret = -1; + + flb_plg_debug(ctx->ins, "Compressing file with %s: %s", + ctx->compression == FLB_AWS_COMPRESS_GZIP ? "gzip" : + ctx->compression == FLB_AWS_COMPRESS_ZSTD ? "zstd" : "snappy", + input_path); + + in_fp = fopen(input_path, "rb"); + if (!in_fp) { + flb_errno(); + flb_plg_error(ctx->ins, "Failed to open file for compression: %s", input_path); + goto cleanup; + } + + if (offset_start > 0 && fseek(in_fp, offset_start, SEEK_SET) != 0) { + flb_errno(); + flb_plg_error(ctx->ins, "Failed to seek in file"); + goto cleanup; + } + + out_fp = fopen(output_path, "wb"); + if (!out_fp) { + flb_errno(); + flb_plg_error(ctx->ins, "Failed to create compressed file: %s", output_path); + goto cleanup; + } + + read_buffer = flb_malloc(S3_STREAM_COMPRESS_BUFFER_SIZE); + if (!read_buffer) { + flb_errno(); + goto cleanup; + } + + current_offset = offset_start; + remaining = (offset_end > 0) ? (offset_end - offset_start) : -1; + + while (1) { + if (remaining > 0) { + bytes_to_read = (remaining < S3_STREAM_COMPRESS_BUFFER_SIZE) ? 
+ remaining : S3_STREAM_COMPRESS_BUFFER_SIZE; + } + else { + bytes_to_read = S3_STREAM_COMPRESS_BUFFER_SIZE; + } + + bytes_read = fread(read_buffer, 1, bytes_to_read, in_fp); + if (bytes_read == 0) { + break; + } + + ret = flb_aws_compression_compress(ctx->compression, read_buffer, bytes_read, + &compressed_chunk, &compressed_chunk_size); + if (ret == -1) { + flb_plg_error(ctx->ins, "Failed to compress chunk"); + goto cleanup; + } + + if (fwrite(compressed_chunk, 1, compressed_chunk_size, out_fp) != compressed_chunk_size) { + flb_errno(); + flb_plg_error(ctx->ins, "Failed to write compressed data"); + flb_free(compressed_chunk); + goto cleanup; + } + + flb_free(compressed_chunk); + compressed_chunk = NULL; + + if (remaining > 0) { + remaining -= bytes_read; + current_offset += bytes_read; + if (remaining <= 0 || current_offset >= offset_end) { + break; + } + } + } + + if (ferror(in_fp)) { + flb_errno(); + flb_plg_error(ctx->ins, "Error reading file during compression"); + goto cleanup; + } + + ret = 0; + +cleanup: + if (read_buffer) { + flb_free(read_buffer); + } + if (out_fp) { + fclose(out_fp); + } + if (in_fp) { + fclose(in_fp); + } + + if (ret == -1 && output_path) { + unlink(output_path); + } + + return ret; +} + +struct stream_context { + FILE *msgpack_fp; + FILE *temp_fp; + flb_sds_t temp_path; + char *read_buffer; + msgpack_unpacker unpacker; + msgpack_unpacked result; + int unpacker_initialized; + int result_initialized; +}; + +static void stream_context_init(struct stream_context *ctx) +{ + memset(ctx, 0, sizeof(struct stream_context)); +} + +static void stream_context_destroy(struct stream_context *ctx, int keep_temp_file) +{ + if (!ctx) { + return; + } + + if (ctx->result_initialized) { + msgpack_unpacked_destroy(&ctx->result); + } + if (ctx->unpacker_initialized) { + msgpack_unpacker_destroy(&ctx->unpacker); + } + if (ctx->read_buffer) { + flb_free(ctx->read_buffer); + } + if (ctx->msgpack_fp) { + fclose(ctx->msgpack_fp); + } + if (ctx->temp_fp) { + fclose(ctx->temp_fp); + } + if (ctx->temp_path) { + if (!keep_temp_file) { + unlink(ctx->temp_path); + } + flb_sds_destroy(ctx->temp_path); + } +} + +static int stream_context_setup(struct flb_s3 *s3_ctx, + const char *input_path, + const char *output_suffix, + struct stream_context *ctx) +{ + flb_sds_t tmp; + + stream_context_init(ctx); + + ctx->msgpack_fp = flb_chunk_file_open(input_path); + if (!ctx->msgpack_fp) { + flb_plg_error(s3_ctx->ins, "Failed to open msgpack file: %s", input_path); + return -1; + } + + ctx->temp_path = flb_sds_create_size(PATH_MAX); + if (!ctx->temp_path) { + flb_errno(); + flb_plg_error(s3_ctx->ins, "Failed to allocate temp path buffer"); + return -1; + } + + tmp = flb_sds_printf(&ctx->temp_path, "%s/stream_%d_%lu%s", + s3_ctx->buffer_dir, (int)getpid(), (unsigned long)time(NULL), output_suffix); + if (!tmp) { + flb_errno(); + flb_plg_error(s3_ctx->ins, "Failed to format temp path"); + return -1; + } + ctx->temp_path = tmp; + + ctx->temp_fp = fopen(ctx->temp_path, "wb"); + if (!ctx->temp_fp) { + flb_errno(); + flb_plg_error(s3_ctx->ins, "Failed to create temp file: %s", ctx->temp_path); + return -1; + } + + ctx->read_buffer = flb_malloc(S3_STREAM_READ_BUFFER_SIZE); + if (!ctx->read_buffer) { + flb_errno(); + flb_plg_error(s3_ctx->ins, "Failed to allocate read buffer"); + return -1; + } + + if (!msgpack_unpacker_init(&ctx->unpacker, S3_STREAM_READ_BUFFER_SIZE)) { + flb_plg_error(s3_ctx->ins, "Failed to initialize msgpack unpacker"); + return -1; + } + ctx->unpacker_initialized = FLB_TRUE; + + 
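For reference, stream_compress_file() above works purely on byte ranges of an on-disk chunk, which is what keeps memory usage flat regardless of chunk size. A minimal caller sketch follows (not part of this patch; compress_one_chunk() and the /tmp paths are illustrative, and a fully initialized struct flb_s3 *ctx with ctx->compression set is assumed):

#include "s3.h"
#include "s3_stream.h"

/* Sketch: compress a whole buffered chunk, then only one 5 MB segment
 * of it, as a multipart upload path might do. Values are illustrative. */
static int compress_one_chunk(struct flb_s3 *ctx)
{
    int ret;

    /* whole file: offset_start = 0, offset_end = -1 means "up to EOF" */
    ret = stream_compress_file(ctx, "/tmp/chunk.msgpack",
                               "/tmp/chunk.msgpack.gz", 0, -1);
    if (ret == -1) {
        return -1;
    }

    /* a single segment: bytes [0, 5 MB) of the same file */
    return stream_compress_file(ctx, "/tmp/chunk.msgpack",
                                "/tmp/part-0001.gz",
                                0, 5 * 1024 * 1024);
}

Each read slice is compressed as its own frame and the frames are written back to back; gzip and zstd both define such concatenations as valid streams, which is what makes the fixed-size read loop safe.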
msgpack_unpacked_init(&ctx->result); + ctx->result_initialized = FLB_TRUE; + + return 0; +} + +static int process_unpacked_records(struct flb_s3 *ctx, + struct stream_context *stream_ctx, + record_processor_fn processor, + struct stream_processor_context *proc_ctx) +{ + msgpack_unpack_return ret; + const msgpack_object *record; + + while ((ret = msgpack_unpacker_next(&stream_ctx->unpacker, + &stream_ctx->result)) != MSGPACK_UNPACK_CONTINUE) { + if (ret == MSGPACK_UNPACK_SUCCESS) { + record = &stream_ctx->result.data; + if (record->type == MSGPACK_OBJECT_ARRAY && record->via.array.size == 2) { + if (processor(ctx, record, stream_ctx->temp_fp, proc_ctx) < 0) { + return -1; + } + proc_ctx->records_processed++; + } + } + else if (ret == MSGPACK_UNPACK_PARSE_ERROR) { + flb_plg_error(ctx->ins, "Msgpack parse error"); + return -1; + } + else if (ret == MSGPACK_UNPACK_EXTRA_BYTES) { + break; + } + } + + return 0; +} + +static int process_msgpack_stream(struct flb_s3 *ctx, + struct stream_context *stream_ctx, + record_processor_fn processor, + struct stream_processor_context *proc_ctx) +{ + size_t bytes_read; + + while ((bytes_read = fread(stream_ctx->read_buffer, 1, + S3_STREAM_READ_BUFFER_SIZE, + stream_ctx->msgpack_fp)) > 0) { + + if (!msgpack_unpacker_reserve_buffer(&stream_ctx->unpacker, bytes_read)) { + flb_plg_error(ctx->ins, "msgpack unpacker buffer reserve failed"); + return -1; + } + + memcpy(msgpack_unpacker_buffer(&stream_ctx->unpacker), + stream_ctx->read_buffer, bytes_read); + msgpack_unpacker_buffer_consumed(&stream_ctx->unpacker, bytes_read); + + if (process_unpacked_records(ctx, stream_ctx, processor, proc_ctx) < 0) { + return -1; + } + } + + if (ferror(stream_ctx->msgpack_fp)) { + flb_errno(); + flb_plg_error(ctx->ins, "Error reading msgpack file"); + return -1; + } + + return 0; +} + +static int create_output_file_marker(struct flb_s3 *ctx, + const char *temp_path, + size_t file_size, + flb_sds_t *out_buf, + size_t *out_size) +{ + flb_sds_t marker; + flb_sds_t tmp; + + marker = flb_sds_create_size(strlen(temp_path) + 6); + if (!marker) { + flb_plg_error(ctx->ins, "Failed to create path marker"); + return -1; + } + + tmp = flb_sds_printf(&marker, "FILE:%s", temp_path); + if (!tmp) { + flb_plg_error(ctx->ins, "Failed to format path marker"); + flb_sds_destroy(marker); + return -1; + } + + *out_buf = tmp; + *out_size = file_size; + + return 0; +} + +int stream_process_msgpack_file(struct flb_s3 *ctx, + const char *input_path, + size_t input_size, + const char *output_suffix, + record_processor_fn processor, + void *processor_ctx, + flb_sds_t *out_buf, + size_t *out_size) +{ + struct stream_context stream_ctx; + struct stream_processor_context proc_ctx = { + .processor = processor, + .user_data = processor_ctx, + .records_processed = 0, + .bytes_written = 0 + }; + struct stat temp_stat; + int ret; + + if (input_size == 0) { + flb_plg_debug(ctx->ins, "Empty input file, skipping: %s", input_path); + *out_buf = NULL; + *out_size = 0; + return -1; + } + + ret = stream_context_setup(ctx, input_path, output_suffix, &stream_ctx); + if (ret < 0) { + stream_context_destroy(&stream_ctx, FLB_FALSE); + return -1; + } + + ret = process_msgpack_stream(ctx, &stream_ctx, processor, &proc_ctx); + if (ret < 0) { + stream_context_destroy(&stream_ctx, FLB_FALSE); + return -1; + } + + fclose(stream_ctx.temp_fp); + stream_ctx.temp_fp = NULL; + + if (stat(stream_ctx.temp_path, &temp_stat) != 0) { + flb_errno(); + flb_plg_error(ctx->ins, "Failed to stat temp file: %s", stream_ctx.temp_path); + 
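The "FILE:" prefix written by create_output_file_marker() is the contract that lets the caller stream the converted payload from disk instead of holding it in memory; out_size carries the on-disk size reported by stat(), not the marker length. A sketch of how a consumer might branch on that marker (open_payload() is a hypothetical helper, not part of this patch; the real handling lives in the S3 output path):

#include <stdio.h>
#include <string.h>
#include <fluent-bit/flb_sds.h>

/* Sketch only: open the payload behind 'buf', whether it is an inline
 * buffer of 'size' bytes or a "FILE:<path>" marker for a temp file. */
static FILE *open_payload(flb_sds_t buf, size_t size)
{
    if (strncmp(buf, "FILE:", 5) == 0) {
        /* the temp file path follows the prefix; 'size' is its size */
        return fopen(buf + 5, "rb");
    }

    /* inline case: wrap the buffer so both cases can be streamed
     * (fmemopen is POSIX; used here only to keep the sketch short) */
    return fmemopen(buf, size, "rb");
}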
stream_context_destroy(&stream_ctx, FLB_FALSE); + return -1; + } + + if (temp_stat.st_size == 0) { + flb_plg_debug(ctx->ins, "No data generated by processor"); + stream_context_destroy(&stream_ctx, FLB_FALSE); + *out_buf = NULL; + *out_size = 0; + return -1; + } + + ret = create_output_file_marker(ctx, stream_ctx.temp_path, + temp_stat.st_size, out_buf, out_size); + if (ret < 0) { + stream_context_destroy(&stream_ctx, FLB_FALSE); + return -1; + } + + flb_plg_debug(ctx->ins, "Stream processing: %zu records, %zu bytes → %lld bytes", + proc_ctx.records_processed, input_size, (long long)temp_stat.st_size); + + stream_context_destroy(&stream_ctx, FLB_TRUE); + return 0; +} + +int stream_json_processor(struct flb_s3 *ctx, + const msgpack_object *record, + FILE *output_file, + void *proc_ctx_ptr) +{ + struct stream_processor_context *proc_ctx = proc_ctx_ptr; + const msgpack_object *body = &record->via.array.ptr[1]; + char *json_str; + size_t json_len; + + json_str = flb_msgpack_to_json_str(1024, body, + ctx->ins->config->json_escape_unicode); + if (!json_str) { + flb_plg_error(ctx->ins, "Failed to convert record to JSON"); + return -1; + } + + json_len = strlen(json_str); + + if (fwrite(json_str, 1, json_len, output_file) != json_len || + fputc('\n', output_file) == EOF) { + flb_free(json_str); + return -1; + } + + proc_ctx->bytes_written += json_len + 1; + flb_free(json_str); + return 0; +} + +static const msgpack_object *find_log_key_in_map(struct flb_s3 *ctx, + const msgpack_object *map_obj) +{ + const char *key_str; + size_t key_str_size; + uint32_t i; + + if (map_obj->type != MSGPACK_OBJECT_MAP) { + return NULL; + } + + for (i = 0; i < map_obj->via.map.size; i++) { + const msgpack_object *key = &map_obj->via.map.ptr[i].key; + const msgpack_object *val = &map_obj->via.map.ptr[i].val; + + if (key->type == MSGPACK_OBJECT_STR) { + key_str = key->via.str.ptr; + key_str_size = key->via.str.size; + } + else if (key->type == MSGPACK_OBJECT_BIN) { + key_str = key->via.bin.ptr; + key_str_size = key->via.bin.size; + } + else { + continue; + } + + if (key_str_size == strlen(ctx->log_key) && + strncmp(ctx->log_key, key_str, key_str_size) == 0) { + return val; + } + } + + return NULL; +} + +static int write_string_value(FILE *output_file, + const void *data, + size_t data_size, + struct stream_processor_context *proc_ctx) +{ + if (fwrite(data, 1, data_size, output_file) != data_size || + fputc('\n', output_file) == EOF) { + return -1; + } + + proc_ctx->bytes_written += data_size + 1; + return 0; +} + +static int write_json_value(struct flb_s3 *ctx, + FILE *output_file, + const msgpack_object *val, + struct stream_processor_context *proc_ctx) +{ + char *json_str; + size_t json_len; + int ret; + + json_str = flb_msgpack_to_json_str(1024, val, + ctx->ins->config->json_escape_unicode); + if (!json_str) { + flb_plg_error(ctx->ins, "Failed to convert log_key value to JSON"); + return -1; + } + + json_len = strlen(json_str); + ret = write_string_value(output_file, json_str, json_len, proc_ctx); + flb_free(json_str); + + return ret; +} + +int stream_log_key_processor(struct flb_s3 *ctx, + const msgpack_object *record, + FILE *output_file, + void *proc_ctx_ptr) +{ + struct stream_processor_context *proc_ctx = proc_ctx_ptr; + const msgpack_object *map_obj = &record->via.array.ptr[1]; + const msgpack_object *val; + const void *data; + size_t data_size; + + val = find_log_key_in_map(ctx, map_obj); + if (!val) { + return 0; + } + + if (val->type == MSGPACK_OBJECT_STR) { + data = val->via.str.ptr; + data_size = 
val->via.str.size; + return write_string_value(output_file, data, data_size, proc_ctx); + } + else if (val->type == MSGPACK_OBJECT_BIN) { + data = val->via.bin.ptr; + data_size = val->via.bin.size; + return write_string_value(output_file, data, data_size, proc_ctx); + } + else { + return write_json_value(ctx, output_file, val, proc_ctx); + } +} diff --git a/plugins/out_s3/s3_stream.h b/plugins/out_s3/s3_stream.h new file mode 100644 index 00000000000..82f29283e04 --- /dev/null +++ b/plugins/out_s3/s3_stream.h @@ -0,0 +1,111 @@ +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ + +/* Fluent Bit + * ========== + * Copyright (C) 2015-2024 The Fluent Bit Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef FLB_OUT_S3_STREAM_H +#define FLB_OUT_S3_STREAM_H + +#include "s3.h" +#include +#include +#include + +/* Record processor callback function type */ +typedef int (*record_processor_fn)( + struct flb_s3 *ctx, + const msgpack_object *record, + FILE *output_file, + void *processor_ctx +); + +/* Stream processor context */ +struct stream_processor_context { + record_processor_fn processor; + void *user_data; + size_t records_processed; + size_t bytes_written; +}; + +/** + * Compress file or file segment using streaming approach + * + * Supports compression starting from any offset for memory-efficient + * processing of large files. + * + * @param ctx S3 context + * @param input_path Input file path + * @param output_path Output file path + * @param offset_start Start offset (0 for beginning) + * @param offset_end End offset (-1 for EOF) + * @return 0 on success, -1 on failure + */ +int stream_compress_file(struct flb_s3 *ctx, + const char *input_path, + const char *output_path, + off_t offset_start, + off_t offset_end); + +/** + * Unified msgpack streaming processor + * + * Provides a unified framework for processing msgpack data with + * format-specific callbacks. + * + * @param ctx S3 context + * @param input_path Input msgpack file path + * @param input_size Input file size + * @param output_suffix Output file suffix (e.g., ".json", ".txt") + * @param processor Format-specific processor callback + * @param processor_ctx User data for processor + * @param out_buf Output buffer (FILE: marker for temp file) + * @param out_size Output size + * @return 0 on success, -1 on failure + */ +int stream_process_msgpack_file( + struct flb_s3 *ctx, + const char *input_path, + size_t input_size, + const char *output_suffix, + record_processor_fn processor, + void *processor_ctx, + flb_sds_t *out_buf, + size_t *out_size); + +/** + * JSON record processor + * + * Converts msgpack records to JSON line format. + */ +int stream_json_processor( + struct flb_s3 *ctx, + const msgpack_object *record, + FILE *output_file, + void *proc_ctx_ptr); + +/** + * log_key record processor + * + * Extracts specified field value from records. 
+ */ +int stream_log_key_processor( + struct flb_s3 *ctx, + const msgpack_object *record, + FILE *output_file, + void *proc_ctx_ptr); + +#endif /* FLB_OUT_S3_STREAM_H */ diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index ac5fe5ef863..697335cfd69 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -154,6 +154,17 @@ if(FLB_AVRO_ENCODER) ) endif() +if(FLB_PARQUET_ENCODER) + # Parquet requires C++ implementation + set(src + ${src} + flb_parquet_impl.cpp + ) + + # Enable C++ for this target + set_source_files_properties(flb_parquet_impl.cpp PROPERTIES LANGUAGE CXX) +endif() + # Fluent Bit have TLS support if(FLB_TLS) # Register the TLS interface and functions @@ -219,6 +230,11 @@ if(FLB_KAFKA) ${src} "flb_kafka.c" ) + # Add Kafka libraries (includes SASL if enabled) + set(FLB_DEPS + ${FLB_DEPS} + ${KAFKA_LIBRARIES} + ) endif() # Link to libco @@ -384,6 +400,15 @@ set(FLB_DEPS ) endif() +# Parquet Encoding +if(FLB_PARQUET_ENCODER) + set(FLB_DEPS + ${FLB_DEPS} + ${ARROW_LIBRARIES} + ${PARQUET_LIBRARIES} + ) +endif() + # WASM runtime if(FLB_WASM) set(FLB_DEPS @@ -574,11 +599,6 @@ if(FLB_BINARY) target_link_libraries(fluent-bit-bin fluent-bit-static ${CMAKE_THREAD_LIBS_INIT}) - # KAFKA SPECIFIC - if(FLB_SASL_ENABLED) - target_link_libraries(fluent-bit-bin sasl2) - endif() - set_target_properties(fluent-bit-bin PROPERTIES OUTPUT_NAME ${FLB_OUT_NAME} diff --git a/src/aws/CMakeLists.txt b/src/aws/CMakeLists.txt index a8d1bdf7bbb..6f3c581cc61 100644 --- a/src/aws/CMakeLists.txt +++ b/src/aws/CMakeLists.txt @@ -1,5 +1,3 @@ -add_subdirectory(compression) - include_directories( ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR} @@ -44,7 +42,6 @@ endif() message(STATUS "===========================") add_library(flb-aws STATIC ${src}) -target_link_libraries(flb-aws flb-aws-compress) # Link rdkafka when MSK IAM is enabled if(DEFINED FLB_HAVE_AWS_MSK_IAM AND KAFKA_LIBRARIES) diff --git a/src/aws/compression/CMakeLists.txt b/src/aws/compression/CMakeLists.txt index 02a1ba3a6a5..e69de29bb2d 100644 --- a/src/aws/compression/CMakeLists.txt +++ b/src/aws/compression/CMakeLists.txt @@ -1,6 +0,0 @@ -add_library(flb-aws-compress INTERFACE) - -if(FLB_ARROW) - add_subdirectory(arrow EXCLUDE_FROM_ALL) - target_link_libraries(flb-aws-compress INTERFACE flb-aws-arrow) -endif() diff --git a/src/aws/flb_aws_compress.c b/src/aws/flb_aws_compress.c index 45fc1510255..9a40f61f5fb 100644 --- a/src/aws/flb_aws_compress.c +++ b/src/aws/flb_aws_compress.c @@ -23,13 +23,18 @@ #include #include +#include #include #include -#ifdef FLB_HAVE_ARROW -#include "compression/arrow/compress.h" -#endif +/* Wrapper function to adapt flb_snappy_compress signature */ +static int flb_snappy_compress_wrapper(void *in_data, size_t in_len, + void **out_data, size_t *out_len) +{ + return flb_snappy_compress((char *)in_data, in_len, + (char **)out_data, out_len); +} struct compression_option { int compression_type; @@ -38,36 +43,37 @@ struct compression_option { }; /* - * Library of compression options + * Library of compression options and format converters * AWS plugins that support compression will have these options. * Referenced function should return -1 on error and 0 on success. + * + * IMPORTANT NOTES: + * 1. True compression algorithms: none, gzip, snappy, zstd + * 2. Format converters: + * - ARROW: REMOVED - Arrow support has been removed as it was not a proper file format for S3 + * - PARQUET: Valid file format converter (deprecated: use format=parquet instead) + * 3. 
Supported S3 output formats: json (FLB_S3_FORMAT_JSON), parquet (FLB_S3_FORMAT_PARQUET) */ static const struct compression_option compression_options[] = { /* FLB_AWS_COMPRESS_NONE which is 0 is reserved for array footer */ + + /* True compression algorithms */ { FLB_AWS_COMPRESS_GZIP, "gzip", &flb_gzip_compress }, + { + FLB_AWS_COMPRESS_SNAPPY, + "snappy", + &flb_snappy_compress_wrapper + }, { FLB_AWS_COMPRESS_ZSTD, "zstd", &flb_zstd_compress }, -#ifdef FLB_HAVE_ARROW - { - FLB_AWS_COMPRESS_ARROW, - "arrow", - &out_s3_compress_arrow - }, -#endif -#ifdef FLB_HAVE_ARROW_PARQUET - { - FLB_AWS_COMPRESS_PARQUET, - "parquet", - &out_s3_compress_parquet - }, -#endif + { 0 } }; @@ -76,6 +82,10 @@ int flb_aws_compression_get_type(const char *compression_keyword) int ret; const struct compression_option *o; + if (strcmp(compression_keyword, "none") == 0) { + return FLB_AWS_COMPRESS_NONE; + } + o = compression_options; while (o->compression_type != 0) { diff --git a/src/aws/flb_aws_util.c b/src/aws/flb_aws_util.c index 135fe425ed0..50cc341401f 100644 --- a/src/aws/flb_aws_util.c +++ b/src/aws/flb_aws_util.c @@ -42,6 +42,8 @@ #define S3_KEY_SIZE 1024 #define RANDOM_STRING "$UUID" #define INDEX_STRING "$INDEX" +#define FILE_PATH_STRING "$FILE_PATH" +#define FILE_NAME_STRING "$FILE_NAME" #define AWS_USER_AGENT_NONE "none" #define AWS_USER_AGENT_ECS "ecs" #define AWS_USER_AGENT_K8S "k8s" @@ -179,6 +181,7 @@ struct flb_http_client *flb_aws_client_request(struct flb_aws_client *aws_client size_t dynamic_headers_len) { struct flb_http_client *c = NULL; + int auth_retry_done = 0; c = request_do(aws_client, method, uri, body, body_len, dynamic_headers, dynamic_headers_len); @@ -204,6 +207,23 @@ struct flb_http_client *flb_aws_client_request(struct flb_aws_client *aws_client flb_info("[aws_client] auth error, refreshing creds"); aws_client->refresh_limit = time(NULL) + FLB_AWS_CREDENTIAL_REFRESH_LIMIT; aws_client->provider->provider_vtable->refresh(aws_client->provider); + + /* + * Immediately retry with refreshed credentials. + * This handles the common case where credentials expire during normal operation + * (e.g., IAM role rotation in EKS/ECS). Without this retry, the request would + * fail and require file-level retry with additional delay. + * The auth_retry_done flag prevents infinite retry loops. + */ + if (!auth_retry_done) { + auth_retry_done = 1; + flb_http_client_destroy(c); + c = NULL; + + flb_debug("[aws_client] retrying with refreshed credentials"); + c = request_do(aws_client, method, uri, body, body_len, + dynamic_headers, dynamic_headers_len); + } } } } @@ -828,217 +848,25 @@ char* strtok_concurrent( #endif } -/* Constructs S3 object key as per the blob format. */ -flb_sds_t flb_get_s3_blob_key(const char *format, - const char *tag, - char *tag_delimiter, - const char *blob_path) -{ - int i = 0; - int ret = 0; - char *tag_token = NULL; - char *random_alphanumeric; - /* concurrent safe strtok_r requires a tracking ptr */ - char *strtok_saveptr; - flb_sds_t tmp = NULL; - flb_sds_t buf = NULL; - flb_sds_t s3_key = NULL; - flb_sds_t tmp_key = NULL; - flb_sds_t tmp_tag = NULL; - flb_sds_t sds_result = NULL; - char *valid_blob_path = NULL; - - if (strlen(format) > S3_KEY_SIZE){ - flb_warn("[s3_key] Object key length is longer than the 1024 character limit."); - } - - tmp_tag = flb_sds_create_len(tag, strlen(tag)); - if(!tmp_tag){ - goto error; - } - - s3_key = flb_sds_create_len(format, strlen(format)); - if (!s3_key) { - goto error; - } - - /* Check if delimiter(s) specifed exists in the tag. 
*/ - for (i = 0; i < strlen(tag_delimiter); i++){ - if (strchr(tag, tag_delimiter[i])){ - ret = 1; - break; - } - } - - tmp = flb_sds_create_len(TAG_PART_DESCRIPTOR, 5); - if (!tmp) { - goto error; - } - if (strstr(s3_key, tmp)){ - if(ret == 0){ - flb_warn("[s3_key] Invalid Tag delimiter: does not exist in tag. " - "tag=%s, format=%s", tag, format); - } - } - - flb_sds_destroy(tmp); - tmp = NULL; - - /* Split the string on the delimiters */ - tag_token = strtok_concurrent(tmp_tag, tag_delimiter, &strtok_saveptr); - - /* Find all occurences of $TAG[*] and - * replaces it with the right token from tag. - */ - i = 0; - while(tag_token != NULL && i < MAX_TAG_PARTS) { - buf = flb_sds_create_size(10); - if (!buf) { - goto error; - } - tmp = flb_sds_printf(&buf, TAG_PART_DESCRIPTOR, i); - if (!tmp) { - goto error; - } - - tmp_key = replace_uri_tokens(s3_key, tmp, tag_token); - if (!tmp_key) { - goto error; - } - - if(strlen(tmp_key) > S3_KEY_SIZE){ - flb_warn("[s3_key] Object key length is longer than the 1024 character limit."); - } - - if (buf != tmp) { - flb_sds_destroy(buf); - } - flb_sds_destroy(tmp); - tmp = NULL; - buf = NULL; - flb_sds_destroy(s3_key); - s3_key = tmp_key; - tmp_key = NULL; - - tag_token = strtok_concurrent(NULL, tag_delimiter, &strtok_saveptr); - i++; - } - - tmp = flb_sds_create_len(TAG_PART_DESCRIPTOR, 5); - if (!tmp) { - goto error; - } - - /* A match against "$TAG[" indicates an invalid or out of bounds tag part. */ - if (strstr(s3_key, tmp)){ - flb_warn("[s3_key] Invalid / Out of bounds tag part: At most 10 tag parts " - "($TAG[0] - $TAG[9]) can be processed. tag=%s, format=%s, delimiters=%s", - tag, format, tag_delimiter); - } - - /* Find all occurences of $TAG and replace with the entire tag. */ - tmp_key = replace_uri_tokens(s3_key, TAG_DESCRIPTOR, tag); - if (!tmp_key) { - goto error; - } - - if(strlen(tmp_key) > S3_KEY_SIZE){ - flb_warn("[s3_key] Object key length is longer than the 1024 character limit."); - } - - flb_sds_destroy(s3_key); - s3_key = tmp_key; - tmp_key = NULL; - - flb_sds_len_set(s3_key, strlen(s3_key)); - - valid_blob_path = (char *) blob_path; - - while (*valid_blob_path == '.' || - *valid_blob_path == '/') { - valid_blob_path++; - } - - /* Append the blob path. */ - sds_result = flb_sds_cat(s3_key, valid_blob_path, strlen(valid_blob_path)); - - if (!sds_result) { - goto error; - } - - s3_key = sds_result; - - if(strlen(s3_key) > S3_KEY_SIZE){ - flb_warn("[s3_key] Object key length is longer than the 1024 character limit."); - } - - /* Find all occurences of $UUID and replace with a random string. */ - random_alphanumeric = flb_sts_session_name(); - if (!random_alphanumeric) { - goto error; - } - /* only use 8 chars of the random string */ - random_alphanumeric[8] = '\0'; - tmp_key = replace_uri_tokens(s3_key, RANDOM_STRING, random_alphanumeric); - if (!tmp_key) { - flb_free(random_alphanumeric); - goto error; - } - - if(strlen(tmp_key) > S3_KEY_SIZE){ - flb_warn("[s3_key] Object key length is longer than the 1024 character limit."); - } - - flb_sds_destroy(s3_key); - s3_key = tmp_key; - tmp_key = NULL; - - flb_free(random_alphanumeric); - - flb_sds_destroy(tmp); - tmp = NULL; - - flb_sds_destroy(tmp_tag); - tmp_tag = NULL; - - return s3_key; - - error: - flb_errno(); - - if (tmp_tag){ - flb_sds_destroy(tmp_tag); - } - - if (s3_key){ - flb_sds_destroy(s3_key); - } - - if (buf && buf != tmp){ - flb_sds_destroy(buf); - } - - if (tmp){ - flb_sds_destroy(tmp); - } - - return NULL; -} - -/* Constructs S3 object key as per the format. 
*/ -flb_sds_t flb_get_s3_key(const char *format, time_t time, const char *tag, - char *tag_delimiter, uint64_t seq_index) +/* + * Common function to process S3 key template with standard variables and time formatting. + * Handles: $TAG, $TAG[0-9], $UUID, $INDEX, and strftime time formatting. + */ +static flb_sds_t process_s3_key_template(const char *format, + time_t time, + const char *tag, + char *tag_delimiter, + uint64_t seq_index) { int i = 0; int ret = 0; + int len; int seq_index_len; char *tag_token = NULL; char *key; char *random_alphanumeric; char *seq_index_str; - /* concurrent safe strtok_r requires a tracking ptr */ char *strtok_saveptr; - int len; flb_sds_t tmp = NULL; flb_sds_t buf = NULL; flb_sds_t s3_key = NULL; @@ -1046,12 +874,12 @@ flb_sds_t flb_get_s3_key(const char *format, time_t time, const char *tag, flb_sds_t tmp_tag = NULL; struct tm gmt = {0}; - if (strlen(format) > S3_KEY_SIZE){ + if (strlen(format) > S3_KEY_SIZE) { flb_warn("[s3_key] Object key length is longer than the 1024 character limit."); } tmp_tag = flb_sds_create_len(tag, strlen(tag)); - if(!tmp_tag){ + if (!tmp_tag) { goto error; } @@ -1060,9 +888,9 @@ flb_sds_t flb_get_s3_key(const char *format, time_t time, const char *tag, goto error; } - /* Check if delimiter(s) specifed exists in the tag. */ - for (i = 0; i < strlen(tag_delimiter); i++){ - if (strchr(tag, tag_delimiter[i])){ + /* Check if delimiter(s) specified exists in the tag. */ + for (i = 0; i < strlen(tag_delimiter); i++) { + if (strchr(tag, tag_delimiter[i])) { ret = 1; break; } @@ -1072,8 +900,8 @@ flb_sds_t flb_get_s3_key(const char *format, time_t time, const char *tag, if (!tmp) { goto error; } - if (strstr(s3_key, tmp)){ - if(ret == 0){ + if (strstr(s3_key, tmp)) { + if (ret == 0) { flb_warn("[s3_key] Invalid Tag delimiter: does not exist in tag. " "tag=%s, format=%s", tag, format); } @@ -1085,11 +913,9 @@ flb_sds_t flb_get_s3_key(const char *format, time_t time, const char *tag, /* Split the string on the delimiters */ tag_token = strtok_concurrent(tmp_tag, tag_delimiter, &strtok_saveptr); - /* Find all occurences of $TAG[*] and - * replaces it with the right token from tag. - */ + /* Find all occurrences of $TAG[*] and replace with the right token from tag. */ i = 0; - while(tag_token != NULL && i < MAX_TAG_PARTS) { + while (tag_token != NULL && i < MAX_TAG_PARTS) { buf = flb_sds_create_size(10); if (!buf) { goto error; @@ -1104,7 +930,7 @@ flb_sds_t flb_get_s3_key(const char *format, time_t time, const char *tag, goto error; } - if(strlen(tmp_key) > S3_KEY_SIZE){ + if (strlen(tmp_key) > S3_KEY_SIZE) { flb_warn("[s3_key] Object key length is longer than the 1024 character limit."); } @@ -1128,19 +954,19 @@ flb_sds_t flb_get_s3_key(const char *format, time_t time, const char *tag, } /* A match against "$TAG[" indicates an invalid or out of bounds tag part. */ - if (strstr(s3_key, tmp)){ + if (strstr(s3_key, tmp)) { flb_warn("[s3_key] Invalid / Out of bounds tag part: At most 10 tag parts " "($TAG[0] - $TAG[9]) can be processed. tag=%s, format=%s, delimiters=%s", tag, format, tag_delimiter); } - /* Find all occurences of $TAG and replace with the entire tag. */ + /* Find all occurrences of $TAG and replace with the entire tag. 
*/ tmp_key = replace_uri_tokens(s3_key, TAG_DESCRIPTOR, tag); if (!tmp_key) { goto error; } - if(strlen(tmp_key) > S3_KEY_SIZE){ + if (strlen(tmp_key) > S3_KEY_SIZE) { flb_warn("[s3_key] Object key length is longer than the 1024 character limit."); } @@ -1148,7 +974,7 @@ flb_sds_t flb_get_s3_key(const char *format, time_t time, const char *tag, s3_key = tmp_key; tmp_key = NULL; - /* Find all occurences of $INDEX and replace with the appropriate index. */ + /* Find all occurrences of $INDEX and replace with the appropriate index. */ if (strstr((char *) format, INDEX_STRING)) { seq_index_len = snprintf(NULL, 0, "%"PRIu64, seq_index); seq_index_str = flb_calloc(seq_index_len + 1, sizeof(char)); @@ -1156,8 +982,7 @@ flb_sds_t flb_get_s3_key(const char *format, time_t time, const char *tag, goto error; } - sprintf(seq_index_str, "%"PRIu64, seq_index); - seq_index_str[seq_index_len] = '\0'; + snprintf(seq_index_str, seq_index_len + 1, "%"PRIu64, seq_index); tmp_key = replace_uri_tokens(s3_key, INDEX_STRING, seq_index_str); if (tmp_key == NULL) { flb_free(seq_index_str); @@ -1173,7 +998,7 @@ flb_sds_t flb_get_s3_key(const char *format, time_t time, const char *tag, flb_free(seq_index_str); } - /* Find all occurences of $UUID and replace with a random string. */ + /* Find all occurrences of $UUID and replace with a random string. */ random_alphanumeric = flb_sts_session_name(); if (!random_alphanumeric) { goto error; @@ -1186,7 +1011,7 @@ flb_sds_t flb_get_s3_key(const char *format, time_t time, const char *tag, goto error; } - if(strlen(tmp_key) > S3_KEY_SIZE){ + if (strlen(tmp_key) > S3_KEY_SIZE) { flb_warn("[s3_key] Object key length is longer than the 1024 character limit."); } @@ -1195,6 +1020,7 @@ flb_sds_t flb_get_s3_key(const char *format, time_t time, const char *tag, tmp_key = NULL; flb_free(random_alphanumeric); + /* Perform time formatting using strftime */ if (!gmtime_r(&time, &gmt)) { flb_error("[s3_key] Failed to create timestamp."); goto error; @@ -1203,14 +1029,14 @@ flb_sds_t flb_get_s3_key(const char *format, time_t time, const char *tag, flb_sds_destroy(tmp); tmp = NULL; - /* A string no longer than S3_KEY_SIZE + 1 is created to store the formatted timestamp. */ + /* Create buffer for formatted timestamp */ key = flb_calloc(1, (S3_KEY_SIZE + 1) * sizeof(char)); if (!key) { goto error; } ret = strftime(key, S3_KEY_SIZE, s3_key, &gmt); - if(ret == 0){ + if (ret == 0) { flb_warn("[s3_key] Object key length is longer than the 1024 character limit."); } flb_sds_destroy(s3_key); @@ -1227,33 +1053,140 @@ flb_sds_t flb_get_s3_key(const char *format, time_t time, const char *tag, } flb_sds_destroy(tmp_tag); - tmp_tag = NULL; return s3_key; - error: - flb_errno(); - if (tmp_tag){ - flb_sds_destroy(tmp_tag); - } - if (s3_key){ +error: + flb_errno(); + if (tmp_tag) { + flb_sds_destroy(tmp_tag); + } + if (s3_key) { + flb_sds_destroy(s3_key); + } + if (buf && buf != tmp) { + flb_sds_destroy(buf); + } + if (tmp) { + flb_sds_destroy(tmp); + } + if (tmp_key) { + flb_sds_destroy(tmp_key); + } + return NULL; +} + +/* + * Constructs S3 object key as per the format. 
+ * Supports variables: $TAG, $TAG[0-9], $UUID, $INDEX, time formatters, $FILE_PATH, $FILE_NAME + * file_path: optional file path (can be NULL) - used only when format contains $FILE_PATH or $FILE_NAME + */ +flb_sds_t flb_get_s3_key(const char *format, time_t time, const char *tag, + char *tag_delimiter, uint64_t seq_index, + const char *file_path) +{ + flb_sds_t s3_key = NULL; + flb_sds_t tmp_key = NULL; + flb_sds_t result = NULL; + char *valid_file_path = NULL; + char *file_name = NULL; + int has_file_var = 0; + + /* Use common template processing for standard variables */ + s3_key = process_s3_key_template(format, time, tag, tag_delimiter, seq_index); + if (!s3_key) { + return NULL; + } + + /* If file_path is NULL, just return the processed template */ + if (file_path == NULL) { + return s3_key; + } + + /* Clean up file_path: skip leading dots and slashes */ + valid_file_path = (char *) file_path; + while (*valid_file_path == '.' || *valid_file_path == '/') { + valid_file_path++; + } + + /* Validate that file_path is not empty after stripping */ + if (*valid_file_path == '\0') { + flb_warn("[s3_key] file_path contains only dots/slashes, skipping $FILE_PATH substitution"); + /* Return the processed template without file path modifications */ + return s3_key; + } + + /* Check if format contains $FILE_PATH or $FILE_NAME */ + if (strstr(format, FILE_PATH_STRING) != NULL) { + /* Replace $FILE_PATH with the full file path */ + tmp_key = replace_uri_tokens(s3_key, FILE_PATH_STRING, valid_file_path); + if (!tmp_key) { flb_sds_destroy(s3_key); + return NULL; } - if (buf && buf != tmp){ - flb_sds_destroy(buf); + flb_sds_destroy(s3_key); + s3_key = tmp_key; + has_file_var = 1; + } + + if (strstr(format, FILE_NAME_STRING) != NULL) { + /* Extract just the filename from the path */ + file_name = strrchr(valid_file_path, '/'); + if (file_name) { + file_name++; /* skip the '/' */ + } else { + file_name = valid_file_path; /* no slash, entire string is filename */ + } + + /* Replace $FILE_NAME with just the filename */ + tmp_key = replace_uri_tokens(s3_key, FILE_NAME_STRING, file_name); + if (!tmp_key) { + flb_sds_destroy(s3_key); + return NULL; } - if (tmp){ - flb_sds_destroy(tmp); + flb_sds_destroy(s3_key); + s3_key = tmp_key; + has_file_var = 1; + } + + /* + * If no file variables used, append the cleaned file path. + * This preserves directory structure while removing absolute path prefix. + * For backward compatibility with the original implementation. + * + * IMPORTANT: Ensure proper path separator to maintain S3 key structure. + * Note: S3 keys should contain UTF-8 characters as-is (not URL-encoded). + * URL encoding is done separately when constructing HTTP requests. + */ + if (!has_file_var) { + /* Ensure there's a separator between the key format result and the file path */ + if (flb_sds_len(s3_key) > 0 && s3_key[flb_sds_len(s3_key) - 1] != '/') { + result = flb_sds_cat(s3_key, "/", 1); + if (!result) { + flb_sds_destroy(s3_key); + return NULL; + } + s3_key = result; } - if (tmp_key){ - flb_sds_destroy(tmp_key); + + result = flb_sds_cat(s3_key, valid_file_path, strlen(valid_file_path)); + if (!result) { + flb_sds_destroy(s3_key); + return NULL; } - return NULL; + s3_key = result; + } + + if (strlen(s3_key) > S3_KEY_SIZE) { + flb_warn("[s3_key] Object key length is longer than the 1024 character limit."); + } + + return s3_key; } /* - * This function is an extension to strftime which can support milliseconds with %3N, - * support nanoseconds with %9N or %L. 
The return value is the length of formatted - * time string. + * Constructs S3 object key as per the format. + * Supports variables: $TAG, $TAG[0-9], $UUID, $INDEX, time formatters, $FILE_PATH, $FILE_NAME + * file_path: optional log file path (can be NULL, e.g., nginx log file path) - used only when format contains $FILE_PATH or $FILE_NAME */ size_t flb_aws_strftime_precision(char **out_buf, const char *time_format, struct flb_time *tms) diff --git a/src/flb_blob_db.c b/src/flb_blob_db.c index fe6bbcf05ee..0b6e48f2927 100644 --- a/src/flb_blob_db.c +++ b/src/flb_blob_db.c @@ -212,6 +212,46 @@ static int prepare_stmts(struct flb_blob_db *context) return FLB_BLOB_DB_ERROR_PREPARING_STATEMENT_GET_OLDEST_FILE_WITH_PARTS; } + /* get all parts for a specific file */ + result = sqlite3_prepare_v2(context->db->handler, + SQL_GET_ALL_PARTS_FOR_FILE, -1, + &context->stmt_get_all_parts_for_file, + NULL); + + if (result != SQLITE_OK) { + return FLB_BLOB_DB_ERROR_PREPARING_STATEMENT_GET_ALL_PARTS_FOR_FILE; + } + + /* get next pending file for recovery */ + result = sqlite3_prepare_v2(context->db->handler, + SQL_GET_NEXT_PENDING_FILE, -1, + &context->stmt_get_next_pending_file, + NULL); + + if (result != SQLITE_OK) { + return FLB_BLOB_DB_ERROR_PREPARING_STATEMENT_GET_NEXT_PENDING_FILE; + } + + /* get part upload status */ + result = sqlite3_prepare_v2(context->db->handler, + SQL_GET_PART_UPLOAD_STATUS, -1, + &context->stmt_get_part_upload_status, + NULL); + + if (result != SQLITE_OK) { + return FLB_BLOB_DB_ERROR_PREPARING_STATEMENT_GET_PART_UPLOAD_STATUS; + } + + /* update file parts in progress (batch update by file_id) */ + result = sqlite3_prepare_v2(context->db->handler, + SQL_UPDATE_FILE_PARTS_IN_PROGRESS, -1, + &context->stmt_update_file_parts_in_progress, + NULL); + + if (result != SQLITE_OK) { + return FLB_BLOB_DB_ERROR_PREPARING_STATEMENT_UPDATE_FILE_PARTS_IN_PROGRESS; + } + return FLB_BLOB_DB_SUCCESS; } @@ -321,6 +361,10 @@ int flb_blob_db_close(struct flb_blob_db *context) sqlite3_finalize(context->stmt_get_next_file_part); sqlite3_finalize(context->stmt_get_oldest_file_with_parts); + sqlite3_finalize(context->stmt_get_all_parts_for_file); + sqlite3_finalize(context->stmt_get_next_pending_file); + sqlite3_finalize(context->stmt_get_part_upload_status); + sqlite3_finalize(context->stmt_update_file_parts_in_progress); flb_lock_destroy(&context->global_lock); @@ -953,6 +997,47 @@ int flb_blob_db_file_part_in_progress(struct flb_blob_db *context, return result; } +/* + * Update in_progress status for all parts of a file + * Used during recovery to mark entire file as being processed + */ +int flb_blob_db_file_parts_in_progress(struct flb_blob_db *context, + uint64_t file_id, + int status) +{ + sqlite3_stmt *statement; + int result; + + if (context == NULL || context->db == NULL) { + return FLB_BLOB_DB_ERROR_INVALID_BLOB_DB_CONTEXT; + } + + statement = context->stmt_update_file_parts_in_progress; + + flb_sqldb_lock(context->db); + + sqlite3_bind_int(statement, 1, status); + sqlite3_bind_int64(statement, 2, file_id); + + result = sqlite3_step(statement); + + if (result != SQLITE_DONE) { + context->last_error = result; + + result = FLB_BLOB_DB_ERROR_FILE_PART_IN_PROGRESS_UPDATE; + } + else { + result = FLB_BLOB_DB_SUCCESS; + } + + sqlite3_clear_bindings(statement); + sqlite3_reset(statement); + + flb_sqldb_unlock(context->db); + + return result; +} + int flb_blob_db_file_part_get_next(struct flb_blob_db *context, uint64_t *id, uint64_t *file_id, @@ -1182,7 +1267,8 @@ int 
flb_blob_db_file_fetch_oldest_ready(struct flb_blob_db *context, cfl_sds_t *source, cfl_sds_t *file_remote_id, cfl_sds_t *file_tag, - int *part_count) + int *part_count, + time_t *file_created) { sqlite3_stmt *statement; int result; @@ -1195,6 +1281,7 @@ int flb_blob_db_file_fetch_oldest_ready(struct flb_blob_db *context, *source = NULL; *file_remote_id = NULL; *file_tag = NULL; + *file_created = 0; statement = context->stmt_get_oldest_file_with_parts; @@ -1224,6 +1311,9 @@ int flb_blob_db_file_fetch_oldest_ready(struct flb_blob_db *context, tmp = (char *) sqlite3_column_text(statement, 5); *file_tag = cfl_sds_create(tmp); + /* created */ + *file_created = (time_t) sqlite3_column_int64(statement, 6); + if (*path == NULL || *part_ids == NULL || *source == NULL || @@ -1283,6 +1373,8 @@ int flb_blob_db_file_fetch_oldest_ready(struct flb_blob_db *context, *file_tag = NULL; } + + *file_created = 0; } flb_sqldb_unlock(context->db); @@ -1292,7 +1384,7 @@ int flb_blob_db_file_fetch_oldest_ready(struct flb_blob_db *context, int flb_blob_db_file_fetch_part_ids(struct flb_blob_db *context, uint64_t file_id, - cfl_sds_t *remote_id_list, + flb_sds_t *remote_id_list, size_t remote_id_list_size, int *remote_id_count) { @@ -1305,7 +1397,7 @@ int flb_blob_db_file_fetch_part_ids(struct flb_blob_db *context, flb_sqldb_lock(context->db); - memset(remote_id_list, 0, sizeof(cfl_sds_t) * remote_id_list_size); + memset(remote_id_list, 0, sizeof(flb_sds_t) * remote_id_list_size); sqlite3_bind_int64(statement, 1, file_id); @@ -1352,7 +1444,7 @@ int flb_blob_db_file_fetch_part_ids(struct flb_blob_db *context, flb_sds_destroy(remote_id_list[remote_id_index]); } - memset(remote_id_list, 0, sizeof(cfl_sds_t) * remote_id_list_size); + memset(remote_id_list, 0, sizeof(flb_sds_t) * remote_id_list_size); } else { *remote_id_count = (int) remote_id_index; @@ -1397,6 +1489,273 @@ int flb_blob_db_file_fetch_part_count(struct flb_blob_db *context, return result; } +int flb_blob_db_file_fetch_all_parts(struct flb_blob_db *context, + uint64_t file_id, + uint64_t **part_db_ids, + uint64_t **part_nums, + off_t **offset_starts, + off_t **offset_ends, + int *count) +{ + sqlite3_stmt *statement; + int total_count; + int idx = 0; + int result; + + *part_db_ids = NULL; + *part_nums = NULL; + *offset_starts = NULL; + *offset_ends = NULL; + *count = 0; + + /* Query total count first to pre-allocate exact size needed */ + total_count = flb_blob_db_file_fetch_part_count(context, file_id); + if (total_count <= 0) { + return total_count; /* 0 if no parts, or negative error code */ + } + + /* Allocate arrays with exact size needed */ + *part_db_ids = flb_calloc(total_count, sizeof(uint64_t)); + *part_nums = flb_calloc(total_count, sizeof(uint64_t)); + *offset_starts = flb_calloc(total_count, sizeof(off_t)); + *offset_ends = flb_calloc(total_count, sizeof(off_t)); + + if (!*part_db_ids || !*part_nums || !*offset_starts || !*offset_ends) { + /* Clean up any successful allocations */ + if (*part_db_ids) flb_free(*part_db_ids); + if (*part_nums) flb_free(*part_nums); + if (*offset_starts) flb_free(*offset_starts); + if (*offset_ends) flb_free(*offset_ends); + *part_db_ids = NULL; + *part_nums = NULL; + *offset_starts = NULL; + *offset_ends = NULL; + return FLB_BLOB_DB_ERROR_ALLOCATOR_FAILURE; + } + + statement = context->stmt_get_all_parts_for_file; + + flb_sqldb_lock(context->db); + + sqlite3_bind_int64(statement, 1, file_id); + + /* Fetch all rows and populate arrays */ + while ((result = sqlite3_step(statement)) == SQLITE_ROW) { + 
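Callers of flb_blob_db_file_fetch_all_parts() receive four parallel arrays sized by the part count and own them afterwards. A minimal fetch-and-release sketch (illustrative only; walk_parts() is a hypothetical helper, the include paths assume the usual fluent-bit header layout, and an opened blob DB context plus a known file_id are taken as given):

#include <sys/types.h>
#include <fluent-bit/flb_mem.h>
#include <fluent-bit/flb_blob_db.h>

/* Sketch: enumerate every registered part of one blob file. */
static int walk_parts(struct flb_blob_db *db, uint64_t file_id)
{
    uint64_t *ids = NULL;
    uint64_t *nums = NULL;
    off_t *starts = NULL;
    off_t *ends = NULL;
    int count = 0;
    int i;
    int ret;

    ret = flb_blob_db_file_fetch_all_parts(db, file_id,
                                           &ids, &nums,
                                           &starts, &ends, &count);
    if (ret <= 0) {
        /* 0: file has no parts, negative: allocation or query error */
        return ret;
    }

    for (i = 0; i < count; i++) {
        /* starts[i]..ends[i] is the byte range covered by part nums[i];
         * ids[i] is the database row id used for later status updates */
    }

    /* arrays are allocated inside the fetch call; the caller frees them */
    flb_free(ids);
    flb_free(nums);
    flb_free(starts);
    flb_free(ends);

    return 0;
}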
(*part_db_ids)[idx] = sqlite3_column_int64(statement, 0); + (*part_nums)[idx] = sqlite3_column_int64(statement, 1); + (*offset_starts)[idx] = sqlite3_column_int64(statement, 2); + (*offset_ends)[idx] = sqlite3_column_int64(statement, 3); + idx++; + } + + sqlite3_clear_bindings(statement); + sqlite3_reset(statement); + flb_sqldb_unlock(context->db); + + /* Check for query errors */ + if (result != SQLITE_DONE) { + context->last_error = result; + flb_free(*part_db_ids); + flb_free(*part_nums); + flb_free(*offset_starts); + flb_free(*offset_ends); + *part_db_ids = NULL; + *part_nums = NULL; + *offset_starts = NULL; + *offset_ends = NULL; + return -1; + } + + *count = idx; + return idx; +} + +/* + * Get next pending file for recovery + * Returns: 1 if found, 0 if no more files, -1 on error + */ +int flb_blob_db_file_get_next_pending(struct flb_blob_db *context, + uint64_t *file_id, + cfl_sds_t *path, + cfl_sds_t *destination, + cfl_sds_t *remote_id, + cfl_sds_t *tag, + int *part_count) +{ + sqlite3_stmt *statement; + char *tmp_path; + char *tmp_destination; + char *tmp_remote_id; + char *tmp_tag; + int result; + int exists; + + *path = NULL; + *destination = NULL; + *remote_id = NULL; + *tag = NULL; + *part_count = 0; + + statement = context->stmt_get_next_pending_file; + + flb_sqldb_lock(context->db); + + result = sqlite3_step(statement); + + if (result == SQLITE_ROW) { + exists = 1; + + *file_id = sqlite3_column_int64(statement, 0); + tmp_path = (char *) sqlite3_column_text(statement, 1); + tmp_destination = (char *) sqlite3_column_text(statement, 2); + tmp_remote_id = (char *) sqlite3_column_text(statement, 3); + tmp_tag = (char *) sqlite3_column_text(statement, 4); + *part_count = sqlite3_column_int(statement, 5); + + *path = cfl_sds_create(tmp_path); + if (*path == NULL) { + exists = -1; + } + else { + *destination = cfl_sds_create(tmp_destination); + if (*destination == NULL) { + exists = -1; + } + else { + *remote_id = cfl_sds_create(tmp_remote_id); + if (*remote_id == NULL) { + exists = -1; + } + else { + *tag = cfl_sds_create(tmp_tag); + if (*tag == NULL) { + exists = -1; + } + } + } + } + + /* Always reset/clear after processing row - even on success */ + sqlite3_clear_bindings(statement); + sqlite3_reset(statement); + + /* Set error context only on actual error */ + if (exists == -1) { + context->last_error = result; + } + } + else if (result == SQLITE_DONE) { + /* No more rows - reset statement for potential reuse */ + exists = 0; + sqlite3_clear_bindings(statement); + sqlite3_reset(statement); + } + else { + /* Error occurred - reset statement */ + context->last_error = result; + exists = -1; + sqlite3_clear_bindings(statement); + sqlite3_reset(statement); + } + + flb_sqldb_unlock(context->db); + + if (exists == -1) { + *file_id = 0; + *part_count = 0; + + if (*path != NULL) { + cfl_sds_destroy(*path); + *path = NULL; + } + + if (*destination != NULL) { + cfl_sds_destroy(*destination); + *destination = NULL; + } + + if (*remote_id != NULL) { + cfl_sds_destroy(*remote_id); + *remote_id = NULL; + } + + if (*tag != NULL) { + cfl_sds_destroy(*tag); + *tag = NULL; + } + } + + return exists; +} + +/* + * Check if a part is uploaded + * Returns: 0 on success, -1 on error + */ +int flb_blob_db_file_part_check_uploaded(struct flb_blob_db *context, + uint64_t part_id, + int *uploaded) +{ + sqlite3_stmt *statement; + int result; + + *uploaded = 0; + + statement = context->stmt_get_part_upload_status; + + flb_sqldb_lock(context->db); + + sqlite3_bind_int64(statement, 1, part_id); + + result = 
sqlite3_step(statement); + + if (result == SQLITE_ROW) { + *uploaded = sqlite3_column_int(statement, 0); + result = 0; + } + else if (result == SQLITE_DONE) { + result = -1; + } + else { + context->last_error = result; + result = -1; + } + + sqlite3_clear_bindings(statement); + sqlite3_reset(statement); + + flb_sqldb_unlock(context->db); + + return result; +} + +/* Reset zombie parts (in_progress=1 from crashed process) */ +int flb_blob_db_reset_zombie_parts(struct flb_blob_db *context) +{ + const char *sql; + int result; + + if (!context || !context->db) { + return FLB_BLOB_DB_ERROR_INVALID_BLOB_DB_CONTEXT; + } + + sql = "UPDATE blob_parts SET in_progress = 0 " + "WHERE uploaded = 0 AND in_progress = 1"; + + flb_sqldb_lock(context->db); + + result = sqlite3_exec(context->db->handler, sql, NULL, NULL, NULL); + + flb_sqldb_unlock(context->db); + + if (result != SQLITE_OK) { + context->last_error = result; + return -1; + } + + return FLB_BLOB_DB_SUCCESS; +} + #else int flb_blob_db_open(struct flb_blob_db *context, @@ -1512,7 +1871,8 @@ int flb_blob_db_file_part_get_next(struct flb_blob_db *context, cfl_sds_t *file_path, cfl_sds_t *destination, cfl_sds_t *remote_file_id, - cfl_sds_t *tag) + cfl_sds_t *tag, + int *part_count) { return FLB_BLOB_DB_ERROR_NO_BACKEND_AVAILABLE; } @@ -1539,14 +1899,15 @@ int flb_blob_db_file_fetch_oldest_ready(struct flb_blob_db *context, cfl_sds_t *source, cfl_sds_t *file_remote_id, cfl_sds_t *file_tag, - int *part_count) + int *part_count, + time_t *file_created) { return FLB_BLOB_DB_ERROR_NO_BACKEND_AVAILABLE; } int flb_blob_db_file_fetch_part_ids(struct flb_blob_db *context, uint64_t file_id, - cfl_sds_t *remote_id_list, + flb_sds_t *remote_id_list, size_t remote_id_list_size, int *remote_id_count) { @@ -1559,4 +1920,34 @@ int flb_blob_db_file_fetch_part_count(struct flb_blob_db *context, return FLB_BLOB_DB_ERROR_NO_BACKEND_AVAILABLE; } -#endif \ No newline at end of file +int flb_blob_db_file_get_next_pending(struct flb_blob_db *context, + uint64_t *file_id, + cfl_sds_t *path, + cfl_sds_t *destination, + cfl_sds_t *remote_id, + cfl_sds_t *tag, + int *part_count) +{ + return FLB_BLOB_DB_ERROR_NO_BACKEND_AVAILABLE; +} + +int flb_blob_db_file_part_check_uploaded(struct flb_blob_db *context, + uint64_t part_id, + int *uploaded) +{ + return FLB_BLOB_DB_ERROR_NO_BACKEND_AVAILABLE; +} + +int flb_blob_db_file_parts_in_progress(struct flb_blob_db *context, + uint64_t file_id, + int status) +{ + return FLB_BLOB_DB_ERROR_NO_BACKEND_AVAILABLE; +} + +int flb_blob_db_reset_zombie_parts(struct flb_blob_db *context) +{ + return FLB_BLOB_DB_ERROR_NO_BACKEND_AVAILABLE; +} + +#endif diff --git a/src/flb_parquet_impl.cpp b/src/flb_parquet_impl.cpp new file mode 100644 index 00000000000..2d796e4ca8d --- /dev/null +++ b/src/flb_parquet_impl.cpp @@ -0,0 +1,1158 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ + +/* Fluent Bit + * ========== + * Copyright (C) 2015-2024 The Fluent Bit Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include + +#include /* errno, ERANGE */ +#include /* memcpy, strlen, strerror */ +#include /* std::strtoll, std::strtof, std::strtod */ +#include /* std::numeric_limits */ +#include +#include +#include +#include +#include +#include +#include +#include + +/* Platform-specific headers for unlink */ +#ifdef _WIN32 + #include + #define unlink _unlink +#else + #include +#endif + +extern "C" { +#include +#include +#include +#include +#include +#include + +char* flb_msgpack_to_json_str(size_t size, const msgpack_object *obj, int escape_unicode); +FILE *flb_chunk_file_open(const char *chunk_path); +} + +/* Parquet processing constants */ +namespace { +constexpr size_t IO_BUFFER_SIZE = 1024 * 1024; /* 1MB for file I/O operations */ +constexpr size_t RECORDS_PER_BATCH = 65536; /* Arrow standard batch size (64K rows) */ +constexpr size_t FIELD_MAP_RESERVE_FACTOR = 2; /* Reserve 2x fields for hash map efficiency */ +constexpr int64_t MAX_ROW_GROUP_LENGTH = 524288; /* 512K rows per row group for S3 compatibility */ +} + +namespace { + +arrow::Result msgpack_object_to_json_string(const msgpack_object* obj) { + char *json_str = flb_msgpack_to_json_str(1024, obj, FLB_FALSE); + if (!json_str) { + return arrow::Status::Invalid("Failed to convert msgpack object to JSON string"); + } + std::string result(json_str); + flb_free(json_str); + return result; +} + +template +std::optional parse_string_to_number(const std::string& str) { + char* endptr; + errno = 0; + + if constexpr (std::is_same_v || std::is_same_v) { + long long val = std::strtoll(str.c_str(), &endptr, 10); + if (endptr == str.c_str() || *endptr != '\0' || errno == ERANGE) { + return std::nullopt; + } + + // Enforce target type bounds before casting to avoid silent overflow + const long long min_t = static_cast(std::numeric_limits::min()); + const long long max_t = static_cast(std::numeric_limits::max()); + if (val < min_t || val > max_t) { + errno = ERANGE; + return std::nullopt; + } + + return static_cast(val); + } else if constexpr (std::is_same_v) { + float val = std::strtof(str.c_str(), &endptr); + if (endptr == str.c_str() || *endptr != '\0' || errno == ERANGE) { + return std::nullopt; + } + return val; + } else if constexpr (std::is_same_v) { + double val = std::strtod(str.c_str(), &endptr); + if (endptr == str.c_str() || *endptr != '\0' || errno == ERANGE) { + return std::nullopt; + } + return val; + } + return std::nullopt; +} + +std::optional parse_string_to_bool(const std::string& str) { + std::string lower = str; + std::transform(lower.begin(), lower.end(), lower.begin(), ::tolower); + + if (lower == "true" || lower == "1" || lower == "yes" || lower == "y" || lower == "on") { + return true; + } + if (lower == "false" || lower == "0" || lower == "no" || lower == "n" || lower == "off") { + return false; + } + return std::nullopt; +} + +class MsgpackToArrowConverter { +public: + arrow::Status convert_value(const msgpack_object* obj, + arrow::ArrayBuilder* builder, + const std::shared_ptr& type) { + if (obj->type == MSGPACK_OBJECT_NIL) { + return arrow::Status::Invalid("Null value encountered"); + } + + switch (type->id()) { + case arrow::Type::BOOL: + return convert_to_bool(obj, static_cast(builder)); + case arrow::Type::INT32: + return convert_to_int32(obj, static_cast(builder)); + case arrow::Type::INT64: + return convert_to_int64(obj, static_cast(builder)); + case arrow::Type::FLOAT: + return convert_to_float(obj, static_cast(builder)); + case arrow::Type::DOUBLE: + return 
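/*
 * Sizing note derived from the constants at the top of this file (an estimate,
 * not a measured figure): records accumulate into 64K-row batches
 * (RECORDS_PER_BATCH = 65536) and the writer caps row groups at 512K rows
 * (MAX_ROW_GROUP_LENGTH = 524288), so a full row group holds at most
 * 524288 / 65536 = 8 record batches. The 70,000-record unit test added by this
 * patch therefore spans two batches (65,536 + 4,464 records) while still
 * fitting within a single row group.
 */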
convert_to_double(obj, static_cast(builder)); + case arrow::Type::STRING: + return convert_to_string(obj, static_cast(builder)); + case arrow::Type::BINARY: + return convert_to_binary(obj, static_cast(builder)); + case arrow::Type::TIMESTAMP: + return convert_to_timestamp(obj, static_cast(builder), type); + default: + return arrow::Status::NotImplemented("Unsupported Arrow type: " + type->ToString()); + } + } + +private: + arrow::Status convert_to_bool(const msgpack_object* obj, arrow::BooleanBuilder* builder) { + switch (obj->type) { + case MSGPACK_OBJECT_BOOLEAN: + return builder->Append(obj->via.boolean); + + case MSGPACK_OBJECT_POSITIVE_INTEGER: + return builder->Append(obj->via.u64 != 0); + + case MSGPACK_OBJECT_NEGATIVE_INTEGER: + return builder->Append(obj->via.i64 != 0); + + case MSGPACK_OBJECT_FLOAT32: + case MSGPACK_OBJECT_FLOAT64: + return builder->Append(obj->via.f64 != 0.0); + + case MSGPACK_OBJECT_STR: { + std::string str(obj->via.str.ptr, obj->via.str.size); + if (auto result = parse_string_to_bool(str)) { + return builder->Append(*result); + } + return arrow::Status::Invalid("Cannot parse string to bool"); + } + + default: + return arrow::Status::Invalid("Cannot convert msgpack type to bool"); + } + } + + template + arrow::Status convert_to_integer(const msgpack_object* obj, BuilderT* builder) { + constexpr T MIN_VAL = std::numeric_limits::min(); + constexpr T MAX_VAL = std::numeric_limits::max(); + + switch (obj->type) { + case MSGPACK_OBJECT_POSITIVE_INTEGER: { + if (obj->via.u64 > static_cast(MAX_VAL)) { + flb_warn("[parquet] Value %llu clamped to max %lld for integer type", + (unsigned long long)obj->via.u64, (long long)MAX_VAL); + return builder->Append(MAX_VAL); + } + return builder->Append(static_cast(obj->via.u64)); + } + + case MSGPACK_OBJECT_NEGATIVE_INTEGER: { + if (obj->via.i64 < MIN_VAL || obj->via.i64 > MAX_VAL) { + T clamped = (obj->via.i64 < MIN_VAL) ? MIN_VAL : MAX_VAL; + flb_warn("[parquet] Value %lld clamped to %lld for integer type", + (long long)obj->via.i64, (long long)clamped); + return builder->Append(clamped); + } + return builder->Append(static_cast(obj->via.i64)); + } + + case MSGPACK_OBJECT_FLOAT32: + case MSGPACK_OBJECT_FLOAT64: { + if (obj->via.f64 > MAX_VAL || obj->via.f64 < MIN_VAL) { + T clamped = (obj->via.f64 > MAX_VAL) ? MAX_VAL : MIN_VAL; + flb_warn("[parquet] Float value %f clamped to %lld for integer type", + obj->via.f64, (long long)clamped); + return builder->Append(clamped); + } + return builder->Append(static_cast(obj->via.f64)); + } + + case MSGPACK_OBJECT_STR: { + std::string str(obj->via.str.ptr, obj->via.str.size); + if (auto val = parse_string_to_number(str)) { + if (*val > MAX_VAL || *val < MIN_VAL) { + T clamped = (*val > MAX_VAL) ? MAX_VAL : MIN_VAL; + flb_warn("[parquet] Parsed string value clamped to %lld for integer type (original: %s)", + (long long)clamped, str.c_str()); + return builder->Append(clamped); + } + return builder->Append(*val); + } + return arrow::Status::Invalid("Cannot parse string to integer"); + } + + case MSGPACK_OBJECT_BOOLEAN: + return builder->Append(obj->via.boolean ? 
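/*
 * Worked example of the integer coercion above, for an int32 column. The first
 * three rows match the assertions in tests/internal/parquet.c added by this
 * patch; the remaining rows follow from the bounds checks and the nullable
 * fallback in the streaming loop below:
 *
 *     input value              int32 column result
 *     123.456   (float)   ->   123          (fractional part truncated)
 *     "999"     (string)  ->   999          (strict full-string parse)
 *     true      (boolean) ->   1
 *     5000000000 (uint)   ->   2147483647   (clamped to INT32_MAX, flb_warn logged)
 *     "abc"     (string)  ->   conversion error; a nullable field receives NULL,
 *                              a non-nullable field receives the type default (0)
 */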
1 : 0); + + default: + return arrow::Status::Invalid("Cannot convert msgpack type to integer"); + } + } + + arrow::Status convert_to_int32(const msgpack_object* obj, arrow::Int32Builder* builder) { + return convert_to_integer(obj, builder); + } + + arrow::Status convert_to_int64(const msgpack_object* obj, arrow::Int64Builder* builder) { + return convert_to_integer(obj, builder); + } + + template + arrow::Status convert_to_floating(const msgpack_object* obj, BuilderT* builder) { + switch (obj->type) { + case MSGPACK_OBJECT_FLOAT32: + case MSGPACK_OBJECT_FLOAT64: + return builder->Append(static_cast(obj->via.f64)); + + case MSGPACK_OBJECT_POSITIVE_INTEGER: + return builder->Append(static_cast(obj->via.u64)); + + case MSGPACK_OBJECT_NEGATIVE_INTEGER: + return builder->Append(static_cast(obj->via.i64)); + + case MSGPACK_OBJECT_STR: { + std::string str(obj->via.str.ptr, obj->via.str.size); + if (auto val = parse_string_to_number(str)) { + return builder->Append(*val); + } + return arrow::Status::Invalid("Cannot parse string to float"); + } + + case MSGPACK_OBJECT_BOOLEAN: + return builder->Append(obj->via.boolean ? static_cast(1.0) : static_cast(0.0)); + + default: + return arrow::Status::Invalid("Cannot convert msgpack type to float"); + } + } + + arrow::Status convert_to_float(const msgpack_object* obj, arrow::FloatBuilder* builder) { + return convert_to_floating(obj, builder); + } + + arrow::Status convert_to_double(const msgpack_object* obj, arrow::DoubleBuilder* builder) { + return convert_to_floating(obj, builder); + } + + arrow::Status convert_to_string(const msgpack_object* obj, arrow::StringBuilder* builder) { + switch (obj->type) { + case MSGPACK_OBJECT_STR: + return builder->Append(obj->via.str.ptr, obj->via.str.size); + + case MSGPACK_OBJECT_BIN: + return builder->Append(obj->via.bin.ptr, obj->via.bin.size); + + case MSGPACK_OBJECT_BOOLEAN: + return builder->Append(obj->via.boolean ? 
"true" : "false"); + + case MSGPACK_OBJECT_POSITIVE_INTEGER: + return builder->Append(std::to_string(obj->via.u64)); + + case MSGPACK_OBJECT_NEGATIVE_INTEGER: + return builder->Append(std::to_string(obj->via.i64)); + + case MSGPACK_OBJECT_FLOAT32: + case MSGPACK_OBJECT_FLOAT64: + return builder->Append(std::to_string(obj->via.f64)); + + case MSGPACK_OBJECT_MAP: + case MSGPACK_OBJECT_ARRAY: { + auto json_result = msgpack_object_to_json_string(obj); + if (!json_result.ok()) { + return json_result.status(); + } + return builder->Append(*json_result); + } + + default: + return arrow::Status::Invalid("Cannot convert msgpack type to string"); + } + } + + arrow::Status convert_to_binary(const msgpack_object* obj, arrow::BinaryBuilder* builder) { + if (obj->type == MSGPACK_OBJECT_BIN) { + return builder->Append(reinterpret_cast(obj->via.bin.ptr), + obj->via.bin.size); + } + if (obj->type == MSGPACK_OBJECT_STR) { + return builder->Append(reinterpret_cast(obj->via.str.ptr), + obj->via.str.size); + } + return arrow::Status::Invalid("Binary field only accepts BIN or STR types"); + } + + arrow::Status convert_to_timestamp(const msgpack_object* obj, + arrow::TimestampBuilder* builder, + const std::shared_ptr& type) { + /* Get the timestamp type to determine the time unit */ + auto ts_type = std::static_pointer_cast(type); + arrow::TimeUnit::type time_unit = ts_type->unit(); + + /* Scale factor: Assume input is in seconds, scale to target unit + * MILLI (ms): multiply by 1,000 + * MICRO (us): multiply by 1,000,000 + * NANO (ns): multiply by 1,000,000,000 + */ + int64_t scale_factor = 1; + switch (time_unit) { + case arrow::TimeUnit::SECOND: + scale_factor = 1; + break; + case arrow::TimeUnit::MILLI: + scale_factor = 1000LL; + break; + case arrow::TimeUnit::MICRO: + scale_factor = 1000000LL; + break; + case arrow::TimeUnit::NANO: + scale_factor = 1000000000LL; + break; + } + + switch (obj->type) { + case MSGPACK_OBJECT_POSITIVE_INTEGER: { + int64_t scaled_value = static_cast(obj->via.u64) * scale_factor; + return builder->Append(scaled_value); + } + + case MSGPACK_OBJECT_NEGATIVE_INTEGER: { + int64_t scaled_value = obj->via.i64 * scale_factor; + return builder->Append(scaled_value); + } + + case MSGPACK_OBJECT_FLOAT32: + case MSGPACK_OBJECT_FLOAT64: { + /* For floating point, apply scaling to handle fractional seconds */ + double scaled_value = obj->via.f64 * static_cast(scale_factor); + return builder->Append(static_cast(scaled_value)); + } + + case MSGPACK_OBJECT_STR: { + std::string str(obj->via.str.ptr, obj->via.str.size); + if (auto val = parse_string_to_number(str)) { + double scaled_value = (*val) * static_cast(scale_factor); + return builder->Append(static_cast(scaled_value)); + } + return arrow::Status::Invalid("Cannot parse string to timestamp"); + } + + case MSGPACK_OBJECT_BOOLEAN: { + int64_t scaled_value = (obj->via.boolean ? 
1 : 0) * scale_factor; + return builder->Append(scaled_value); + } + + default: + return arrow::Status::Invalid("Cannot convert msgpack type to timestamp"); + } + } +}; + +using TypeFactory = std::function()>; + +const std::unordered_map TYPE_FACTORY_MAP = { + {"bool", []() { return arrow::boolean(); }}, + {"boolean", []() { return arrow::boolean(); }}, + {"int32", []() { return arrow::int32(); }}, + {"int64", []() { return arrow::int64(); }}, + {"float", []() { return arrow::float32(); }}, + {"float32", []() { return arrow::float32(); }}, + {"double", []() { return arrow::float64(); }}, + {"float64", []() { return arrow::float64(); }}, + {"utf8", []() { return arrow::utf8(); }}, + {"string", []() { return arrow::utf8(); }}, + {"binary", []() { return arrow::binary(); }} +}; + +arrow::Result> parse_schema_from_json(const char* schema_str) { + yyjson_doc* doc = yyjson_read(schema_str, strlen(schema_str), 0); + if (!doc) { + flb_error("[parquet] Failed to parse JSON schema"); + return arrow::Status::Invalid("Failed to parse JSON schema"); + } + + yyjson_val* root = yyjson_doc_get_root(doc); + if (!root || !yyjson_is_obj(root)) { + yyjson_doc_free(doc); + return arrow::Status::Invalid("Schema root must be an object"); + } + + yyjson_val* fields_array = yyjson_obj_get(root, "fields"); + if (!fields_array || !yyjson_is_arr(fields_array)) { + yyjson_doc_free(doc); + return arrow::Status::Invalid("Schema must contain 'fields' array"); + } + + std::vector> arrow_fields; + yyjson_val* field_obj; + yyjson_arr_iter iter; + yyjson_arr_iter_init(fields_array, &iter); + + while ((field_obj = yyjson_arr_iter_next(&iter))) { + if (!yyjson_is_obj(field_obj)) { + continue; + } + + yyjson_val* name_val = yyjson_obj_get(field_obj, "name"); + yyjson_val* type_val = yyjson_obj_get(field_obj, "type"); + yyjson_val* nullable_val = yyjson_obj_get(field_obj, "nullable"); + + if (!name_val || !yyjson_is_str(name_val) || !type_val) { + continue; + } + + std::string field_name(yyjson_get_str(name_val)); + bool nullable = nullable_val ? yyjson_get_bool(nullable_val) : true; + std::shared_ptr data_type; + + const char* type_name_cstr = nullptr; + yyjson_val* type_params = nullptr; + + if (yyjson_is_str(type_val)) { + type_name_cstr = yyjson_get_str(type_val); + } else if (yyjson_is_obj(type_val)) { + yyjson_val* type_name_val = yyjson_obj_get(type_val, "name"); + if (type_name_val && yyjson_is_str(type_name_val)) { + type_name_cstr = yyjson_get_str(type_name_val); + type_params = type_val; + } + } + + if (!type_name_cstr) { + continue; + } + + std::string type_name(type_name_cstr); + + if (type_name == "timestamp") { + arrow::TimeUnit::type time_unit = arrow::TimeUnit::MILLI; + + if (type_params) { + yyjson_val* unit_val = yyjson_obj_get(type_params, "unit"); + if (unit_val && yyjson_is_str(unit_val)) { + std::string unit = yyjson_get_str(unit_val); + + if (unit == "s") { + time_unit = arrow::TimeUnit::SECOND; + } else if (unit == "ms") { + time_unit = arrow::TimeUnit::MILLI; + } else if (unit == "us" || unit == "μs") { + time_unit = arrow::TimeUnit::MICRO; + } else if (unit == "ns") { + time_unit = arrow::TimeUnit::NANO; + } else { + yyjson_doc_free(doc); + return arrow::Status::Invalid( + "Invalid timestamp unit '" + unit + "'. Supported units: s, ms, us, ns"); + } + } + } + data_type = arrow::timestamp(time_unit); + } else { + auto it = TYPE_FACTORY_MAP.find(type_name); + data_type = (it != TYPE_FACTORY_MAP.end()) ? 
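/*
 * Example schema string accepted by the parser above (illustrative; the field
 * names are placeholders, but the type names, the object form for timestamps
 * and the defaults follow this parser: "nullable" defaults to true, unknown
 * type names fall back to utf8, and timestamps default to millisecond unit):
 *
 *     const char *schema_json =
 *         "{\"fields\": ["
 *         "  {\"name\": \"log\",  \"type\": \"utf8\"},"
 *         "  {\"name\": \"code\", \"type\": \"int32\", \"nullable\": false},"
 *         "  {\"name\": \"ts\",   \"type\": {\"name\": \"timestamp\", \"unit\": \"ms\"}}"
 *         "]}";
 */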
it->second() : arrow::utf8(); + } + + arrow_fields.push_back(arrow::field(field_name, data_type, nullable)); + } + + yyjson_doc_free(doc); + + if (arrow_fields.empty()) { + return arrow::Status::Invalid("No valid fields found in schema"); + } + + return arrow::schema(arrow_fields); +} + +bool append_default_value(arrow::ArrayBuilder* builder, + const std::shared_ptr& type) { + switch (type->id()) { + case arrow::Type::BOOL: + return static_cast(builder)->Append(false).ok(); + case arrow::Type::INT32: + return static_cast(builder)->Append(0).ok(); + case arrow::Type::INT64: + return static_cast(builder)->Append(0).ok(); + case arrow::Type::FLOAT: + return static_cast(builder)->Append(0.0f).ok(); + case arrow::Type::DOUBLE: + return static_cast(builder)->Append(0.0).ok(); + case arrow::Type::STRING: + return static_cast(builder)->Append("").ok(); + case arrow::Type::BINARY: + return static_cast(builder)->AppendEmptyValue().ok(); + case arrow::Type::TIMESTAMP: + return static_cast(builder)->Append(0).ok(); + default: + return false; + } +} + +} // anonymous namespace + +extern "C" { + +/* Validate parquet file: record count, field type and/or field values + * expected_records: -1 to skip, field_name: NULL to skip field check + * expected_type: NULL to skip type check (e.g., "int32", "string", "bool") + */ +int validate_parquet_file(const char *file_path, + int expected_records, + const char *field_name, + const char *expected_type, + const char *expected_value, + int row_index) +{ + if (!file_path) { + flb_error("[parquet] NULL file path for validation"); + return -1; + } + + try { + /* Open parquet file */ + auto file_result = arrow::io::ReadableFile::Open(file_path); + if (!file_result.ok()) { + flb_error("[parquet] Failed to open parquet file for validation: %s", + file_result.status().ToString().c_str()); + return -1; + } + auto infile = std::move(file_result).ValueOrDie(); + + /* Open parquet reader using the file path directly */ + std::unique_ptr reader; + auto parquet_reader = parquet::ParquetFileReader::OpenFile(file_path); + auto reader_result = parquet::arrow::FileReader::Make( + arrow::default_memory_pool(), std::move(parquet_reader), &reader); + if (!reader_result.ok()) { + flb_error("[parquet] Failed to create parquet reader: %s", + reader_result.ToString().c_str()); + return -1; + } + + /* Get file metadata */ + auto metadata = reader->parquet_reader()->metadata(); + int64_t total_rows = metadata->num_rows(); + + /* Validate record count if expected value provided */ + if (expected_records >= 0) { + if (total_rows != expected_records) { + flb_error("[parquet] Record count mismatch: expected=%d, got=%lld", + expected_records, (long long)total_rows); + return -1; + } + } + + /* Read all data to validate it can be read */ + std::shared_ptr table; + auto read_result = reader->ReadTable(&table); + if (!read_result.ok()) { + flb_error("[parquet] Failed to read table: %s", + read_result.ToString().c_str()); + return -1; + } + + /* Validate table has data */ + if (expected_records > 0 && table->num_rows() != total_rows) { + flb_error("[parquet] Table row count mismatch: metadata=%lld, table=%lld", + (long long)total_rows, (long long)table->num_rows()); + return -1; + } + + /* Validate schema and data types */ + auto schema = table->schema(); + + /* If field_name is provided, validate field type and/or value */ + if (field_name) { + int col_index = schema->GetFieldIndex(field_name); + if (col_index < 0) { + flb_error("[parquet] Field '%s' not found in schema", field_name); + return -1; 
+ } + + auto field_type = schema->field(col_index)->type(); + + /* Validate type if expected_type is provided */ + if (expected_type) { + std::string actual_type = field_type->ToString(); + if (actual_type != expected_type) { + flb_error("[parquet] Type mismatch for field '%s': expected='%s', actual='%s'", + field_name, expected_type, actual_type.c_str()); + return -1; + } + } + + /* Skip value validation if expected_value is NULL */ + if (!expected_value) { + return 0; + } + + /* Validate row index */ + if (row_index >= table->num_rows()) { + flb_error("[parquet] Row index %d out of range (total rows: %lld)", + row_index, (long long)table->num_rows()); + return -1; + } + + /* Get the column and combine all chunks for simpler access (test utility) */ + auto column = table->column(col_index); + + /* Combine all chunks into a single array */ + std::shared_ptr chunk; + if (column->num_chunks() == 1) { + chunk = column->chunk(0); + } else { + arrow::ArrayVector chunks_to_concat; + for (int i = 0; i < column->num_chunks(); i++) { + chunks_to_concat.push_back(column->chunk(i)); + } + auto concat_result = arrow::Concatenate(chunks_to_concat); + if (!concat_result.ok()) { + flb_error("[parquet] Failed to concatenate column chunks: %s", + concat_result.status().ToString().c_str()); + return -1; + } + chunk = std::move(concat_result).ValueOrDie(); + } + + /* Convert value to string for comparison */ + std::string actual_value; + + switch (field_type->id()) { + case arrow::Type::STRING: { + auto string_array = std::static_pointer_cast(chunk); + if (!string_array->IsNull(row_index)) { + actual_value = string_array->GetString(row_index); + } + break; + } + case arrow::Type::INT32: { + auto int_array = std::static_pointer_cast(chunk); + if (!int_array->IsNull(row_index)) { + actual_value = std::to_string(int_array->Value(row_index)); + } + break; + } + case arrow::Type::INT64: { + auto int_array = std::static_pointer_cast(chunk); + if (!int_array->IsNull(row_index)) { + actual_value = std::to_string(int_array->Value(row_index)); + } + break; + } + case arrow::Type::BOOL: { + auto bool_array = std::static_pointer_cast(chunk); + if (!bool_array->IsNull(row_index)) { + actual_value = bool_array->Value(row_index) ? 
"true" : "false"; + } + break; + } + case arrow::Type::FLOAT: { + auto float_array = std::static_pointer_cast(chunk); + if (!float_array->IsNull(row_index)) { + actual_value = std::to_string(float_array->Value(row_index)); + } + break; + } + case arrow::Type::DOUBLE: { + auto double_array = std::static_pointer_cast(chunk); + if (!double_array->IsNull(row_index)) { + actual_value = std::to_string(double_array->Value(row_index)); + } + break; + } + case arrow::Type::TIMESTAMP: { + auto ts_array = std::static_pointer_cast(chunk); + if (!ts_array->IsNull(row_index)) { + actual_value = std::to_string(ts_array->Value(row_index)); + } + break; + } + case arrow::Type::BINARY: { + auto binary_array = std::static_pointer_cast(chunk); + if (!binary_array->IsNull(row_index)) { + /* Convert binary data to hex string for comparison */ + auto binary_view = binary_array->GetView(row_index); + actual_value = ""; + for (int i = 0; i < binary_view.length(); i++) { + char hex[3]; + snprintf(hex, sizeof(hex), "%02x", (unsigned char)binary_view.data()[i]); + actual_value += hex; + } + } + break; + } + default: + flb_warn("[parquet] Unsupported type for validation: %s", + field_type->ToString().c_str()); + return 0; /* Skip validation for unsupported types */ + } + + /* Compare values if expected value is provided */ + if (expected_value) { + if (actual_value != expected_value) { + flb_error("[parquet] Value mismatch for field '%s' at row %d: expected='%s', actual='%s'", + field_name, row_index, expected_value, actual_value.c_str()); + return -1; + } + } + } else { + /* No field validation requested, show first row values for all columns */ + if (total_rows > 0) { + for (int col = 0; col < table->num_columns(); col++) { + auto column = table->column(col); + auto chunk = column->chunk(0); + + /* Print first value as string for verification */ + if (chunk->length() > 0) { + auto value_str = chunk->ToString(); + /* Truncate if too long */ + if (value_str.length() > 100) { + value_str = value_str.substr(0, 100) + "..."; + } + } + } + } + } + + return 0; + } + catch (const parquet::ParquetException& e) { + flb_error("[parquet] Validation parquet exception: %s", e.what()); + return -1; + } + catch (const std::exception& e) { + flb_error("[parquet] Validation exception: %s", e.what()); + return -1; + } + catch (...) 
{ + flb_error("[parquet] Unknown validation exception"); + return -1; + } +} + +int flb_parquet_validate_schema(const char *schema_str, + char *error_msg, + size_t error_msg_size) +{ + /* Reuse parse_schema_from_json to avoid code duplication */ + auto result = parse_schema_from_json(schema_str); + + if (!result.ok()) { + if (error_msg && error_msg_size > 0) { + snprintf(error_msg, error_msg_size, "%s", + result.status().ToString().c_str()); + } + return -1; + } + + return 0; +} + +flb_parquet_schema *flb_parquet_schema_create(const char *schema_str, + char *error_msg, + size_t error_msg_size) +{ + if (!schema_str) { + if (error_msg && error_msg_size > 0) { + snprintf(error_msg, error_msg_size, "NULL schema_str"); + } + return NULL; + } + + auto result = parse_schema_from_json(schema_str); + + if (!result.ok()) { + if (error_msg && error_msg_size > 0) { + snprintf(error_msg, error_msg_size, "%s", + result.status().ToString().c_str()); + } + return NULL; + } + + /* Store as heap-allocated shared_ptr to pass through C API */ + auto schema = result.ValueOrDie(); + auto cached = new std::shared_ptr(schema); + return reinterpret_cast(cached); +} + +void flb_parquet_schema_destroy(flb_parquet_schema *schema) +{ + if (schema) { + auto schema_ptr = reinterpret_cast*>(schema); + delete schema_ptr; + } +} + +int flb_msgpack_to_parquet_streaming(const char *msgpack_file_path, + flb_parquet_schema *schema, + int compression, + const char *output_file, + size_t *out_file_size, + size_t total_file_size __attribute__((unused))) +{ + FILE *msgpack_fp = NULL; + char *read_buffer = NULL; + msgpack_unpacker unpacker; + msgpack_unpacked result; + bool unpacker_ready = false; + bool result_ready = false; + + if (!msgpack_file_path || !out_file_size || !schema || !output_file) { + flb_error("[parquet] NULL parameter"); + return -1; + } + + auto schema_ptr = reinterpret_cast*>(schema); + auto arrow_schema = *schema_ptr; + + msgpack_fp = flb_chunk_file_open(msgpack_file_path); + if (!msgpack_fp) { + flb_error("[parquet] Failed to open msgpack file: %s", msgpack_file_path); + return -1; + } + + parquet::Compression::type parquet_compression; + switch (compression) { + case 1: parquet_compression = parquet::Compression::GZIP; break; + case 2: parquet_compression = parquet::Compression::SNAPPY; break; + case 3: parquet_compression = parquet::Compression::ZSTD; break; + default: parquet_compression = parquet::Compression::UNCOMPRESSED; break; + } + + try { + read_buffer = (char *)flb_malloc(IO_BUFFER_SIZE); + if (!read_buffer) { + flb_error("[parquet] Failed to allocate read buffer"); + fclose(msgpack_fp); + return -1; + } + + if (!msgpack_unpacker_init(&unpacker, IO_BUFFER_SIZE)) { + flb_error("[parquet] Failed to initialize msgpack unpacker"); + flb_free(read_buffer); + fclose(msgpack_fp); + return -1; + } + unpacker_ready = true; + + msgpack_unpacked_init(&result); + result_ready = true; + + std::vector> field_builders; + for (int i = 0; i < arrow_schema->num_fields(); i++) { + auto field = arrow_schema->field(i); + auto builder_result = arrow::MakeBuilder(field->type(), arrow::default_memory_pool()); + if (!builder_result.ok()) { + flb_error("[parquet] Failed to create builder for field '%s': %s", + field->name().c_str(), builder_result.status().ToString().c_str()); + throw std::runtime_error("Failed to create builder"); + } + field_builders.push_back(std::move(builder_result).ValueOrDie()); + } + + MsgpackToArrowConverter converter; + + std::unordered_map field_map; + field_map.reserve(arrow_schema->num_fields() 
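/*
 * Usage sketch of the public API implemented in this file (it mirrors the test
 * helper in tests/internal/parquet.c added by this patch; the file paths and
 * schema_json are placeholders):
 *
 *     char err[512];
 *     size_t out_size = 0;
 *     flb_parquet_schema *schema;
 *
 *     schema = flb_parquet_schema_create(schema_json, err, sizeof(err));
 *     if (schema == NULL) {
 *         flb_error("invalid schema: %s", err);
 *         return -1;
 *     }
 *
 *     // compression: 0 = none, 1 = GZIP, 2 = SNAPPY, 3 = ZSTD
 *     if (flb_msgpack_to_parquet_streaming("/tmp/chunk.msgpack", schema, 3,
 *                                          "/tmp/chunk.parquet",
 *                                          &out_size, 0) != 0) {
 *         flb_parquet_schema_destroy(schema);
 *         return -1;
 *     }
 *
 *     flb_parquet_schema_destroy(schema);
 */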
* FIELD_MAP_RESERVE_FACTOR); + + auto output_stream_result = arrow::io::FileOutputStream::Open(output_file); + if (!output_stream_result.ok()) { + flb_error("[parquet] Failed to open output file: %s", + output_stream_result.status().ToString().c_str()); + throw std::runtime_error("Failed to open output file"); + } + auto output_stream = std::move(output_stream_result).ValueOrDie(); + + parquet::WriterProperties::Builder props_builder; + props_builder.compression(parquet_compression); + props_builder.max_row_group_length(MAX_ROW_GROUP_LENGTH); + auto writer_properties = props_builder.build(); + + auto writer_result = parquet::arrow::FileWriter::Open( + *arrow_schema, arrow::default_memory_pool(), output_stream, writer_properties); + if (!writer_result.ok()) { + flb_error("[parquet] Failed to create parquet writer: %s", + writer_result.status().ToString().c_str()); + throw std::runtime_error("Failed to create parquet writer"); + } + auto writer = std::move(writer_result).ValueOrDie(); + + size_t bytes_read; + size_t records_processed = 0; + size_t batch_records = 0; + + while ((bytes_read = fread(read_buffer, 1, IO_BUFFER_SIZE, msgpack_fp)) > 0) { + if (!msgpack_unpacker_reserve_buffer(&unpacker, bytes_read)) { + throw std::runtime_error("msgpack unpacker buffer reserve failed"); + } + + memcpy(msgpack_unpacker_buffer(&unpacker), read_buffer, bytes_read); + msgpack_unpacker_buffer_consumed(&unpacker, bytes_read); + + msgpack_unpack_return ret; + while ((ret = msgpack_unpacker_next(&unpacker, &result)) != MSGPACK_UNPACK_CONTINUE) { + if (ret == MSGPACK_UNPACK_SUCCESS) { + const msgpack_object* record = &result.data; + if (record->type != MSGPACK_OBJECT_ARRAY || record->via.array.size != 2) { + continue; + } + + const msgpack_object* map_obj = &record->via.array.ptr[1]; + if (map_obj->type != MSGPACK_OBJECT_MAP) { + continue; + } + + field_map.clear(); + for (uint32_t i = 0; i < map_obj->via.map.size; i++) { + const msgpack_object_kv* kv = &map_obj->via.map.ptr[i]; + if (kv->key.type == MSGPACK_OBJECT_STR) { + std::string_view key(kv->key.via.str.ptr, kv->key.via.str.size); + field_map[key] = &kv->val; + } + } + + for (int i = 0; i < arrow_schema->num_fields(); i++) { + auto field = arrow_schema->field(i); + auto it = field_map.find(field->name()); + + if (it != field_map.end() && it->second->type != MSGPACK_OBJECT_NIL) { + auto status = converter.convert_value(it->second, field_builders[i].get(), + field->type()); + if (!status.ok()) { + if (field->nullable()) { + (void)field_builders[i]->AppendNull(); + } else { + append_default_value(field_builders[i].get(), field->type()); + } + } + } else { + if (field->nullable()) { + (void)field_builders[i]->AppendNull(); + } else { + append_default_value(field_builders[i].get(), field->type()); + } + } + } + + batch_records++; + records_processed++; + + if (batch_records >= RECORDS_PER_BATCH) { + std::vector> arrays; + for (size_t i = 0; i < field_builders.size(); i++) { + auto array_result = field_builders[i]->Finish(); + if (!array_result.ok()) { + flb_error("[parquet] Failed to finish array %zu: %s", + i, array_result.status().ToString().c_str()); + throw std::runtime_error("Failed to finish array"); + } + arrays.push_back(std::move(array_result).ValueOrDie()); + } + + /* Create and write batch */ + auto batch = arrow::RecordBatch::Make(arrow_schema, batch_records, arrays); + if (!batch) { + throw std::runtime_error("Failed to create RecordBatch"); + } + + auto write_status = writer->WriteRecordBatch(*batch); + if (!write_status.ok()) { + 
flb_error("[parquet] Failed to write batch: %s", + write_status.ToString().c_str()); + throw std::runtime_error("Failed to write batch"); + } + + /* Reset builders */ + field_builders.clear(); + for (int i = 0; i < arrow_schema->num_fields(); i++) { + auto field = arrow_schema->field(i); + auto builder_result = arrow::MakeBuilder(field->type(), + arrow::default_memory_pool()); + if (!builder_result.ok()) { + throw std::runtime_error("Failed to recreate builder"); + } + field_builders.push_back(std::move(builder_result).ValueOrDie()); + } + batch_records = 0; + } + } else if (ret == MSGPACK_UNPACK_PARSE_ERROR) { + throw std::runtime_error("Msgpack parse error"); + } else if (ret == MSGPACK_UNPACK_EXTRA_BYTES) { + break; + } + } + } + + /* Flush remaining records */ + if (batch_records > 0) { + std::vector> arrays; + for (size_t i = 0; i < field_builders.size(); i++) { + auto array_result = field_builders[i]->Finish(); + if (!array_result.ok()) { + throw std::runtime_error("Failed to finish final array"); + } + arrays.push_back(std::move(array_result).ValueOrDie()); + } + + auto batch = arrow::RecordBatch::Make(arrow_schema, batch_records, arrays); + if (!batch) { + throw std::runtime_error("Failed to create final RecordBatch"); + } + + auto write_status = writer->WriteRecordBatch(*batch); + if (!write_status.ok()) { + throw std::runtime_error("Failed to write final batch"); + } + } + + /* Check for read errors */ + if (ferror(msgpack_fp)) { + throw std::runtime_error("Error reading msgpack file"); + } + + /* Cleanup C resources - update flags/pointers immediately after each release */ + msgpack_unpacked_destroy(&result); + result_ready = false; + + msgpack_unpacker_destroy(&unpacker); + unpacker_ready = false; + + flb_free(read_buffer); + read_buffer = NULL; + + fclose(msgpack_fp); + msgpack_fp = NULL; + + if (records_processed == 0) { + flb_error("[parquet] No records processed"); + auto close_status = writer->Close(); + if (!close_status.ok()) { + flb_error("[parquet] Failed to close writer: %s", close_status.ToString().c_str()); + } + auto stream_status = output_stream->Close(); + if (!stream_status.ok()) { + flb_error("[parquet] Failed to close stream: %s", stream_status.ToString().c_str()); + } + unlink(output_file); + return -1; + } + + /* Close the Parquet writer */ + auto close_status = writer->Close(); + if (!close_status.ok()) { + flb_error("[parquet] Failed to close parquet writer: %s", + close_status.ToString().c_str()); + auto stream_status = output_stream->Close(); + if (!stream_status.ok()) { + flb_error("[parquet] Failed to close stream: %s", stream_status.ToString().c_str()); + } + unlink(output_file); + return -1; + } + + auto stream_close_status = output_stream->Close(); + if (!stream_close_status.ok()) { + flb_error("[parquet] Failed to close output stream: %s", + stream_close_status.ToString().c_str()); + unlink(output_file); + return -1; + } + + /* Get output file size */ + struct stat st; + if (stat(output_file, &st) != 0) { + flb_error("[parquet] Failed to stat output file %s: %s", + output_file, strerror(errno)); + unlink(output_file); + return -1; + } + *out_file_size = st.st_size; + + return 0; + } + catch (const parquet::ParquetException& e) { + flb_error("[parquet] Parquet exception: %s", e.what()); + } + catch (const std::exception& e) { + flb_error("[parquet] Exception: %s", e.what()); + } + catch (...) 
{ + flb_error("[parquet] Unknown exception"); + } + + /* Cleanup on error - update flags/pointers immediately after each release */ + if (result_ready) { + msgpack_unpacked_destroy(&result); + result_ready = false; + } + if (unpacker_ready) { + msgpack_unpacker_destroy(&unpacker); + unpacker_ready = false; + } + if (read_buffer) { + flb_free(read_buffer); + read_buffer = NULL; + } + if (msgpack_fp) { + fclose(msgpack_fp); + msgpack_fp = NULL; + } + unlink(output_file); + return -1; +} + +} // extern "C" diff --git a/src/flb_signv4.c b/src/flb_signv4.c index 14f9b04a076..d805792d34c 100644 --- a/src/flb_signv4.c +++ b/src/flb_signv4.c @@ -644,9 +644,9 @@ static flb_sds_t flb_signv4_canonical_request(struct flb_http_client *c, } /* - * URI normalization is required by certain AWS service, for hence the caller - * plugin is responsible to enable/disable this flag. If set the URI in the - * canonical request will be normalized. + * URI encoding is handled differently based on the service: + * - normalize_uri=TRUE: Normalize path then encode (most AWS services) + * - normalize_uri=FALSE: Use pre-encoded URI as-is (S3) */ if (normalize_uri == FLB_TRUE) { tmp = flb_signv4_uri_normalize_path((char *) c->uri, len); @@ -656,20 +656,24 @@ static flb_sds_t flb_signv4_canonical_request(struct flb_http_client *c, return NULL; } len = flb_sds_len(tmp); - } - else { - tmp = (char *) c->uri; - } - /* Do URI encoding (rfc3986) */ - uri = uri_encode(tmp, len); - if (tmp != c->uri) { + uri = uri_encode(tmp, len); flb_sds_destroy(tmp); + + if (!uri) { + flb_error("[signv4] error encoding URI"); + flb_sds_destroy(cr); + return NULL; + } } - if (!uri) { - /* error composing outgoing buffer */ - flb_sds_destroy(cr); - return NULL; + else { + /* URI is pre-encoded by caller, use as-is */ + uri = flb_sds_create_len(c->uri, len); + if (!uri) { + flb_error("[signv4] error creating URI buffer"); + flb_sds_destroy(cr); + return NULL; + } } tmp = flb_sds_cat(cr, uri, flb_sds_len(uri)); diff --git a/tests/internal/CMakeLists.txt b/tests/internal/CMakeLists.txt index a9ab28649f0..0186d7e089c 100644 --- a/tests/internal/CMakeLists.txt +++ b/tests/internal/CMakeLists.txt @@ -133,6 +133,13 @@ if(FLB_AVRO_ENCODER) ) endif() +if(FLB_PARQUET_ENCODER) + set(UNIT_TESTS_FILES + ${UNIT_TESTS_FILES} + parquet.c + ) +endif() + if(FLB_AWS) set(UNIT_TESTS_FILES ${UNIT_TESTS_FILES} @@ -228,6 +235,10 @@ function(prepare_unit_tests TEST_PREFIX SOURCEFILES) target_link_libraries(${source_file_we} avro-static jansson) endif() + if(FLB_PARQUET_ENCODER) + target_link_libraries(${source_file_we} ${ARROW_LIBRARIES} ${PARQUET_LIBRARIES}) + endif() + add_test(NAME ${source_file_we} COMMAND ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${source_file_we} WORKING_DIRECTORY ${CMAKE_HOME_DIRECTORY}/build) diff --git a/tests/internal/aws_util.c b/tests/internal/aws_util.c index 6f6fbdb1aca..e444ced33fc 100644 --- a/tests/internal/aws_util.c +++ b/tests/internal/aws_util.c @@ -44,6 +44,28 @@ #define S3_OBJECT_KEY_MIXED_TIMESTAMP "logs/2020/m/08/d/15/%q" #endif +#define S3_KEY_FORMAT_FILE_PATH "logs/$TAG/$FILE_PATH" +#define S3_OBJECT_KEY_FILE_PATH "logs/aa.bb.ccc/var/log/nginx/access.log.1.gz" + +#define S3_KEY_FORMAT_FILE_NAME "logs/$TAG/%Y/%m/%d/$FILE_NAME" +#define S3_OBJECT_KEY_FILE_NAME "logs/aa.bb.ccc/2020/08/15/access.log.1.gz" + +#define S3_KEY_FORMAT_FILE_BOTH "logs/$FILE_PATH-$FILE_NAME" +#define S3_OBJECT_KEY_FILE_BOTH "logs/var/log/nginx/access.log.1.gz-access.log.1.gz" + +/* Edge case tests for relative paths and dotted prefixes */ +#define 
S3_KEY_FORMAT_FILE_PATH_EDGE "logs/$FILE_PATH" +#define FILE_PATH_RELATIVE "./logs/file.log" +#define S3_OBJECT_KEY_RELATIVE "logs/logs/file.log" +#define FILE_PATH_TRIPLE_DOT "...hidden/file.log" +#define S3_OBJECT_KEY_TRIPLE_DOT "logs/hidden/file.log" +#define FILE_PATH_PARENT "../../../etc/passwd" +#define S3_OBJECT_KEY_PARENT "logs/etc/passwd" +#define FILE_PATH_HIDDEN ".hidden" +#define S3_OBJECT_KEY_HIDDEN "logs/hidden" +#define FILE_PATH_MIXED_DOTS ".///logs/./file.log" +#define S3_OBJECT_KEY_MIXED_DOTS "logs/logs/./file.log" + #define NO_TAG "" #define TAG "aa.bb.ccc" #define MULTI_DELIMITER_TAG "aa.bb-ccc" @@ -51,6 +73,7 @@ #define TAG_DELIMITERS ".-" #define INVALID_TAG_DELIMITERS ",/" #define VALID_SEQ_INDEX 0 +#define FILE_PATH "/var/log/nginx/access.log.1.gz" /* Example: standard nginx rotated log (logrotate format) */ static void initialization_crutch() { @@ -174,7 +197,7 @@ static void test_flb_get_s3_key_multi_tag_exists() initialization_crutch(); mktime_utc(&day, &t); - s3_key_format = flb_get_s3_key(S3_KEY_FORMAT_TAG_PART, t, TAG, TAG_DELIMITER, 0); + s3_key_format = flb_get_s3_key(S3_KEY_FORMAT_TAG_PART, t, TAG, TAG_DELIMITER, 0, NULL); TEST_CHECK(strcmp(s3_key_format, S3_OBJECT_KEY_TAG_PART) == 0); flb_sds_destroy(s3_key_format); @@ -189,7 +212,7 @@ static void test_flb_get_s3_key_full_tag() initialization_crutch(); mktime_utc(&day, &t); - s3_key_format = flb_get_s3_key(S3_KEY_FORMAT_FULL_TAG, t, TAG, TAG_DELIMITER, 0); + s3_key_format = flb_get_s3_key(S3_KEY_FORMAT_FULL_TAG, t, TAG, TAG_DELIMITER, 0, NULL); TEST_CHECK(strcmp(s3_key_format, S3_OBJECT_KEY_FULL_TAG) == 0); flb_sds_destroy(s3_key_format); @@ -205,7 +228,7 @@ static void test_flb_get_s3_key_tag_special_characters() mktime_utc(&day, &t); s3_key_format = flb_get_s3_key(S3_KEY_FORMAT_SPECIAL_CHARCATERS_TAG, t, TAG, - TAG_DELIMITER, 0); + TAG_DELIMITER, 0, NULL); TEST_CHECK(strcmp(s3_key_format, S3_OBJECT_KEY_SPECIAL_CHARCATERS_TAG) == 0); flb_sds_destroy(s3_key_format); @@ -221,7 +244,7 @@ static void test_flb_get_s3_key_multi_tag_delimiter() mktime_utc(&day, &t); s3_key_format = flb_get_s3_key(S3_KEY_FORMAT_TAG_PART, t, MULTI_DELIMITER_TAG, - TAG_DELIMITERS, 0); + TAG_DELIMITERS, 0, NULL); TEST_CHECK(strcmp(s3_key_format, S3_OBJECT_KEY_TAG_PART) == 0); flb_sds_destroy(s3_key_format); @@ -237,7 +260,7 @@ static void test_flb_get_s3_key_invalid_tag_delimiter() mktime_utc(&day, &t); s3_key_format = flb_get_s3_key(S3_KEY_FORMAT_TAG_PART, t, MULTI_DELIMITER_TAG, - INVALID_TAG_DELIMITERS, 0); + INVALID_TAG_DELIMITERS, 0, NULL); TEST_CHECK(strcmp(s3_key_format, S3_OBJECT_KEY_INVALID_DELIMITER) == 0); flb_sds_destroy(s3_key_format); @@ -252,7 +275,7 @@ static void test_flb_get_s3_key_invalid_tag_index() initialization_crutch(); mktime_utc(&day, &t); - s3_key_format = flb_get_s3_key(S3_KEY_FORMAT_INVALID_TAG, t, TAG, TAG_DELIMITER, 0); + s3_key_format = flb_get_s3_key(S3_KEY_FORMAT_INVALID_TAG, t, TAG, TAG_DELIMITER, 0, NULL); TEST_CHECK(strcmp(s3_key_format, S3_OBJECY_KEY_INVALID_TAG) == 0); flb_sds_destroy(s3_key_format); @@ -275,7 +298,7 @@ static void test_flb_get_s3_key_invalid_key_length() time_t t; mktime_utc(&day, &t); - s3_key_format = flb_get_s3_key(buf, t, TAG, TAG_DELIMITER, 0); + s3_key_format = flb_get_s3_key(buf, t, TAG, TAG_DELIMITER, 0, NULL); TEST_CHECK(strlen(s3_key_format) <= 1024); flb_sds_destroy(s3_key_format); @@ -291,7 +314,7 @@ static void test_flb_get_s3_key_static_string() mktime_utc(&day, &t); s3_key_format = flb_get_s3_key(S3_KEY_FORMAT_STATIC_STRING, t, NO_TAG, - TAG_DELIMITER, 0); + 
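/*
 * Note on the extended flb_get_s3_key() signature exercised in these tests: the
 * new trailing argument carries the originating file path consumed by the
 * $FILE_PATH and $FILE_NAME placeholders, and existing callers pass NULL. The
 * edge-case expectations above also indicate that leading dots, "./" and "../"
 * segments are stripped from $FILE_PATH (for example "../../../etc/passwd"
 * with format "logs/$FILE_PATH" renders as "logs/etc/passwd"). A minimal call,
 * mirroring test_flb_get_s3_key_file_name below, with t set to 2020-08-15 as
 * in these tests:
 *
 *     flb_sds_t key;
 *
 *     key = flb_get_s3_key("logs/$TAG/%Y/%m/%d/$FILE_NAME", t, "aa.bb.ccc",
 *                          TAG_DELIMITER, 0, "/var/log/nginx/access.log.1.gz");
 *     // key == "logs/aa.bb.ccc/2020/08/15/access.log.1.gz"
 *     flb_sds_destroy(key);
 */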
TAG_DELIMITER, 0, NULL); TEST_CHECK(strcmp(s3_key_format, S3_KEY_FORMAT_STATIC_STRING) == 0); flb_sds_destroy(s3_key_format); @@ -307,7 +330,7 @@ static void test_flb_get_s3_key_valid_index() mktime_utc(&day, &t); s3_key_format = flb_get_s3_key(S3_KEY_FORMAT_VALID_INDEX, t, NO_TAG, - TAG_DELIMITER, 12); + TAG_DELIMITER, 12, NULL); TEST_CHECK(strcmp(s3_key_format, S3_OBJECT_KEY_VALID_INDEX) == 0); flb_sds_destroy(s3_key_format); @@ -323,14 +346,14 @@ static void test_flb_get_s3_key_increment_index() mktime_utc(&day, &t); s3_key_format = flb_get_s3_key(S3_KEY_FORMAT_VALID_INDEX, t, NO_TAG, - TAG_DELIMITER, 5); + TAG_DELIMITER, 5, NULL); TEST_CHECK(strcmp(s3_key_format, "logs/a-5-b-c") == 0); flb_sds_destroy(s3_key_format); s3_key_format = flb_get_s3_key(S3_KEY_FORMAT_VALID_INDEX, t, NO_TAG, - TAG_DELIMITER, 10); + TAG_DELIMITER, 10, NULL); TEST_CHECK(strcmp(s3_key_format, "logs/a-10-b-c") == 0); @@ -348,13 +371,13 @@ static void test_flb_get_s3_key_index_overflow() mktime_utc(&day, &t); s3_key_format = flb_get_s3_key(S3_KEY_FORMAT_VALID_INDEX, t, NO_TAG, - TAG_DELIMITER, index); + TAG_DELIMITER, index, NULL); TEST_CHECK(strcmp(s3_key_format, S3_OBJECT_KEY_PRE_OVERFLOW_INDEX) == 0); flb_sds_destroy(s3_key_format); index++; s3_key_format = flb_get_s3_key(S3_KEY_FORMAT_VALID_INDEX, t, NO_TAG, - TAG_DELIMITER, index); + TAG_DELIMITER, index, NULL); TEST_CHECK(strcmp(s3_key_format, S3_OBJECT_KEY_POST_OVERFLOW_INDEX) == 0); flb_sds_destroy(s3_key_format); @@ -370,12 +393,145 @@ static void test_flb_get_s3_key_mixed_timestamp() mktime_utc(&day, &t); s3_key_format = flb_get_s3_key(S3_KEY_FORMAT_MIXED_TIMESTAMP, t, NO_TAG, - TAG_DELIMITER, 12); + TAG_DELIMITER, 12, NULL); TEST_CHECK(strcmp(s3_key_format, S3_OBJECT_KEY_MIXED_TIMESTAMP) == 0); flb_sds_destroy(s3_key_format); } +static void test_flb_get_s3_key_file_path() +{ + flb_sds_t s3_key_format = NULL; + struct tm day = { 0, 0, 0, 15, 7, 120}; + time_t t; + + initialization_crutch(); + + mktime_utc(&day, &t); + s3_key_format = flb_get_s3_key(S3_KEY_FORMAT_FILE_PATH, t, TAG, + TAG_DELIMITER, 0, FILE_PATH); + TEST_CHECK(strcmp(s3_key_format, S3_OBJECT_KEY_FILE_PATH) == 0); + + flb_sds_destroy(s3_key_format); +} + +static void test_flb_get_s3_key_file_name() +{ + flb_sds_t s3_key_format = NULL; + struct tm day = { 0, 0, 0, 15, 7, 120}; + time_t t; + + initialization_crutch(); + + mktime_utc(&day, &t); + s3_key_format = flb_get_s3_key(S3_KEY_FORMAT_FILE_NAME, t, TAG, + TAG_DELIMITER, 0, FILE_PATH); + TEST_CHECK(strcmp(s3_key_format, S3_OBJECT_KEY_FILE_NAME) == 0); + + flb_sds_destroy(s3_key_format); +} + +static void test_flb_get_s3_key_file_both() +{ + flb_sds_t s3_key_format = NULL; + struct tm day = { 0, 0, 0, 15, 7, 120}; + time_t t; + + initialization_crutch(); + + mktime_utc(&day, &t); + s3_key_format = flb_get_s3_key(S3_KEY_FORMAT_FILE_BOTH, t, NO_TAG, + TAG_DELIMITER, 0, FILE_PATH); + TEST_CHECK(strcmp(s3_key_format, S3_OBJECT_KEY_FILE_BOTH) == 0); + + flb_sds_destroy(s3_key_format); +} + +/* Edge case: relative path with ./ prefix */ +static void test_flb_get_s3_key_relative_path() +{ + flb_sds_t s3_key_format = NULL; + struct tm day = { 0, 0, 0, 15, 7, 120}; + time_t t; + + initialization_crutch(); + + mktime_utc(&day, &t); + s3_key_format = flb_get_s3_key(S3_KEY_FORMAT_FILE_PATH_EDGE, t, NO_TAG, + TAG_DELIMITER, 0, FILE_PATH_RELATIVE); + TEST_CHECK(strcmp(s3_key_format, S3_OBJECT_KEY_RELATIVE) == 0); + + flb_sds_destroy(s3_key_format); +} + +/* Edge case: triple dot prefix */ +static void test_flb_get_s3_key_triple_dot() +{ + flb_sds_t 
s3_key_format = NULL; + struct tm day = { 0, 0, 0, 15, 7, 120}; + time_t t; + + initialization_crutch(); + + mktime_utc(&day, &t); + s3_key_format = flb_get_s3_key(S3_KEY_FORMAT_FILE_PATH_EDGE, t, NO_TAG, + TAG_DELIMITER, 0, FILE_PATH_TRIPLE_DOT); + TEST_CHECK(strcmp(s3_key_format, S3_OBJECT_KEY_TRIPLE_DOT) == 0); + + flb_sds_destroy(s3_key_format); +} + +/* Edge case: parent directory traversal */ +static void test_flb_get_s3_key_parent_traversal() +{ + flb_sds_t s3_key_format = NULL; + struct tm day = { 0, 0, 0, 15, 7, 120}; + time_t t; + + initialization_crutch(); + + mktime_utc(&day, &t); + s3_key_format = flb_get_s3_key(S3_KEY_FORMAT_FILE_PATH_EDGE, t, NO_TAG, + TAG_DELIMITER, 0, FILE_PATH_PARENT); + TEST_CHECK(strcmp(s3_key_format, S3_OBJECT_KEY_PARENT) == 0); + + flb_sds_destroy(s3_key_format); +} + +/* Edge case: hidden file (single dot prefix) */ +static void test_flb_get_s3_key_hidden_file() +{ + flb_sds_t s3_key_format = NULL; + struct tm day = { 0, 0, 0, 15, 7, 120}; + time_t t; + + initialization_crutch(); + + mktime_utc(&day, &t); + s3_key_format = flb_get_s3_key(S3_KEY_FORMAT_FILE_PATH_EDGE, t, NO_TAG, + TAG_DELIMITER, 0, FILE_PATH_HIDDEN); + TEST_CHECK(strcmp(s3_key_format, S3_OBJECT_KEY_HIDDEN) == 0); + + flb_sds_destroy(s3_key_format); +} + +/* Edge case: mixed dots and slashes */ +static void test_flb_get_s3_key_mixed_dots_slashes() +{ + flb_sds_t s3_key_format = NULL; + struct tm day = { 0, 0, 0, 15, 7, 120}; + time_t t; + + initialization_crutch(); + + mktime_utc(&day, &t); + s3_key_format = flb_get_s3_key(S3_KEY_FORMAT_FILE_PATH_EDGE, t, NO_TAG, + TAG_DELIMITER, 0, FILE_PATH_MIXED_DOTS); + TEST_CHECK(strcmp(s3_key_format, S3_OBJECT_KEY_MIXED_DOTS) == 0); + + flb_sds_destroy(s3_key_format); +} + TEST_LIST = { { "parse_api_error" , test_flb_aws_error}, { "flb_aws_endpoint" , test_flb_aws_endpoint}, @@ -391,5 +547,13 @@ TEST_LIST = { {"flb_get_s3_key_increment_index", test_flb_get_s3_key_increment_index}, {"flb_get_s3_key_index_overflow", test_flb_get_s3_key_index_overflow}, {"flb_get_s3_key_mixed_timestamp", test_flb_get_s3_key_mixed_timestamp}, + {"flb_get_s3_key_file_path", test_flb_get_s3_key_file_path}, + {"flb_get_s3_key_file_name", test_flb_get_s3_key_file_name}, + {"flb_get_s3_key_file_both", test_flb_get_s3_key_file_both}, + {"flb_get_s3_key_relative_path", test_flb_get_s3_key_relative_path}, + {"flb_get_s3_key_triple_dot", test_flb_get_s3_key_triple_dot}, + {"flb_get_s3_key_parent_traversal", test_flb_get_s3_key_parent_traversal}, + {"flb_get_s3_key_hidden_file", test_flb_get_s3_key_hidden_file}, + {"flb_get_s3_key_mixed_dots_slashes", test_flb_get_s3_key_mixed_dots_slashes}, { 0 } }; diff --git a/tests/internal/fuzzers/aws_util_fuzzer.c b/tests/internal/fuzzers/aws_util_fuzzer.c index dcaa2b9ec68..f21a5f2f2ad 100644 --- a/tests/internal/fuzzers/aws_util_fuzzer.c +++ b/tests/internal/fuzzers/aws_util_fuzzer.c @@ -70,7 +70,7 @@ int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) if (format && tag && tag_delimiter) { if (!initialization_crutch()) { flb_sds_t s3_key_format = NULL; - s3_key_format = flb_get_s3_key(format, t, tag, tag_delimiter, 0); + s3_key_format = flb_get_s3_key(format, t, tag, tag_delimiter, 0, NULL); if (s3_key_format) { flb_sds_destroy(s3_key_format); } diff --git a/tests/internal/parquet.c b/tests/internal/parquet.c new file mode 100644 index 00000000000..4089c4d3991 --- /dev/null +++ b/tests/internal/parquet.c @@ -0,0 +1,2117 @@ +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ + +/* Fluent Bit + * 
========== + * Copyright (C) 2015-2024 The Fluent Bit Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "flb_tests_internal.h" + +/* Parquet validation function - implemented in C++ */ +#ifdef __cplusplus +extern "C" { +#endif + +int validate_parquet_file(const char *file_path, + int expected_records, + const char *field_name, + const char *expected_type, + const char *expected_value, + int row_index); + +#ifdef __cplusplus +} +#endif + +/* Compression types */ +#define FLB_AWS_COMPRESS_NONE 0 +#define FLB_AWS_COMPRESS_GZIP 1 +#define FLB_AWS_COMPRESS_SNAPPY 2 +#define FLB_AWS_COMPRESS_ZSTD 3 + +/* Helper: Write msgpack data to file in chunk format + * Chunk format: [24-byte header][metadata][msgpack data] + * Header: bytes 0-1 = 0xC1 0x00 (magic), bytes 22-23 = metadata length (big-endian) + */ +static int write_msgpack_to_chunk_file(const char *file_path, const char *data, size_t size) +{ + FILE *fp = fopen(file_path, "wb"); + if (!fp) { + return -1; + } + + /* Prepare chunk header (24 bytes) */ + unsigned char header[24]; + memset(header, 0, sizeof(header)); + + /* Magic bytes */ + header[0] = 0xC1; + header[1] = 0x00; + + /* Metadata: tag name "test" */ + const char *tag = "test"; + size_t tag_len = strlen(tag); + uint16_t metadata_len = (uint16_t)tag_len; + + /* Store metadata length in bytes 22-23 (big-endian) */ + header[22] = (metadata_len >> 8) & 0xFF; + header[23] = metadata_len & 0xFF; + + /* Write header */ + if (fwrite(header, 1, sizeof(header), fp) != sizeof(header)) { + fclose(fp); + return -1; + } + + /* Write metadata (tag name) */ + if (fwrite(tag, 1, tag_len, fp) != tag_len) { + fclose(fp); + return -1; + } + + /* Write msgpack data */ + size_t written = fwrite(data, 1, size, fp); + fclose(fp); + + return (written == size) ? 
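/*
 * Layout written by this helper, with a matching reader sketch (illustrative;
 * the offsets come from the comment above, the variable names are
 * placeholders, and bytes 2-21 are simply zeroed by this helper):
 *
 *     offset 0..1    magic bytes 0xC1 0x00
 *     offset 2..21   zeroed
 *     offset 22..23  metadata length, big-endian
 *     offset 24..    metadata (tag), followed by raw msgpack records
 *
 *     unsigned char hdr[24];
 *     uint16_t meta_len;
 *
 *     fread(hdr, 1, sizeof(hdr), fp);
 *     meta_len = (uint16_t) ((hdr[22] << 8) | hdr[23]);
 *     fseek(fp, 24 + meta_len, SEEK_SET);   // msgpack payload starts here
 */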
0 : -1; +} + +/* Helper: Check if parquet file exists and has content */ +static int check_parquet_file(const char *file_path, size_t *out_size) +{ + struct stat st; + if (stat(file_path, &st) != 0) { + return -1; + } + if (st.st_size == 0) { + return -1; + } + if (out_size) { + *out_size = st.st_size; + } + return 0; +} + +/* Helper: Pack Fluent Bit format [timestamp, map] */ +static void pack_fluent_bit_record(msgpack_packer *packer, int64_t ts) +{ + msgpack_pack_array(packer, 2); + msgpack_pack_int64(packer, ts); +} + +/* Helper: Pack JSON data into msgpack format [timestamp, map] */ +static int pack_json_record(msgpack_sbuffer *sbuf, msgpack_packer *packer, + int64_t timestamp, const char *json_data) +{ + char *msgpack_buf = NULL; + size_t msgpack_size = 0; + int root_type; + size_t consumed; + int ret; + + ret = flb_pack_json(json_data, strlen(json_data), + &msgpack_buf, &msgpack_size, + &root_type, &consumed); + if (ret != 0) { + return -1; + } + + msgpack_pack_array(packer, 2); + msgpack_pack_int64(packer, timestamp); + msgpack_sbuffer_write(sbuf, msgpack_buf, msgpack_size); + flb_free(msgpack_buf); + + return 0; +} + +/* Helper: Validate parquet with expected data */ +typedef struct { + const char *field_name; + const char *expected_type; + const char *expected_value; + int row_index; +} field_expectation; + +static int validate_parquet_data(const char *parquet_file, int expected_records, + const field_expectation *expectations, int num_expectations) +{ + int i, ret; + + /* Validate record count */ + if (expected_records > 0) { + ret = validate_parquet_file(parquet_file, expected_records, NULL, NULL, NULL, 0); + if (ret != 0) return ret; + } + + /* Validate each field expectation flexibly: + * - If both type and value provided: validate both in one call + * - If only type provided: validate type only + * - If only value provided: validate value only + */ + for (i = 0; i < num_expectations; i++) { + const field_expectation *exp = &expectations[i]; + + if (exp->expected_type && exp->expected_value) { + /* Validate both type and value together - most strict */ + ret = validate_parquet_file(parquet_file, -1, exp->field_name, + exp->expected_type, exp->expected_value, exp->row_index); + if (ret != 0) return ret; + } else if (exp->expected_type) { + /* Validate type only */ + ret = validate_parquet_file(parquet_file, -1, exp->field_name, + exp->expected_type, NULL, exp->row_index); + if (ret != 0) return ret; + } else if (exp->expected_value) { + /* Validate value only */ + ret = validate_parquet_file(parquet_file, -1, exp->field_name, + NULL, exp->expected_value, exp->row_index); + if (ret != 0) return ret; + } + } + + return 0; +} + +/* ============================================================================ + * TEST CONTEXT FRAMEWORK - Reduces boilerplate code + * ============================================================================ */ + +/* Test context structure with resource management */ +typedef struct { + msgpack_sbuffer sbuf; + msgpack_packer packer; + char msgpack_file[256]; + char parquet_file[256]; + size_t parquet_size; + const char *schema; + int compression; + flb_parquet_schema *cached_schema; /* Cached schema for new API */ +} test_context; + +/* Helper wrapper for new cached API - manages schema lifecycle automatically */ +static int flb_msgpack_raw_to_parquet_file_streaming(const char *msgpack_file_path, + const char *schema_str, + int compression, + const char *output_file, + size_t *out_file_size, + size_t total_file_size) +{ + char error_msg[512]; + 
flb_parquet_schema *cached_schema = NULL; + int ret; + + if (!schema_str) { + return -1; + } + + /* Parse and cache schema */ + cached_schema = flb_parquet_schema_create(schema_str, error_msg, sizeof(error_msg)); + if (!cached_schema) { + return -1; + } + + /* Call cached version */ + ret = flb_msgpack_to_parquet_streaming( + msgpack_file_path, + cached_schema, + compression, + output_file, + out_file_size, + total_file_size + ); + + /* Cleanup */ + flb_parquet_schema_destroy(cached_schema); + + return ret; +} + +/* Initialize test context */ +static int init_test_context(test_context *ctx, const char *test_name) +{ + msgpack_sbuffer_init(&ctx->sbuf); + msgpack_packer_init(&ctx->packer, &ctx->sbuf, msgpack_sbuffer_write); + + snprintf(ctx->msgpack_file, sizeof(ctx->msgpack_file), + "/tmp/flb_test_%s.msgpack", test_name); + snprintf(ctx->parquet_file, sizeof(ctx->parquet_file), + "/tmp/flb_test_%s.parquet", test_name); + + ctx->parquet_size = 0; + ctx->schema = NULL; + ctx->compression = FLB_AWS_COMPRESS_NONE; + + return 0; +} + +/* Run standard conversion: write msgpack -> convert to parquet -> validate file exists */ +static int run_conversion(test_context *ctx) +{ + int ret; + + ret = write_msgpack_to_chunk_file(ctx->msgpack_file, + ctx->sbuf.data, + ctx->sbuf.size); + if (ret != 0) return ret; + + ret = flb_msgpack_raw_to_parquet_file_streaming(ctx->msgpack_file, + ctx->schema, + ctx->compression, + ctx->parquet_file, + &ctx->parquet_size, + 0); + if (ret != 0) return ret; + + return check_parquet_file(ctx->parquet_file, NULL); +} + +/* Cleanup test context */ +static void cleanup_test_context(test_context *ctx) +{ + msgpack_sbuffer_destroy(&ctx->sbuf); + unlink(ctx->msgpack_file); + unlink(ctx->parquet_file); +} + +/* Pack JSON record with context (convenience wrapper) */ +static int ctx_pack_json(test_context *ctx, int64_t timestamp, const char *json_data) +{ + return pack_json_record(&ctx->sbuf, &ctx->packer, timestamp, json_data); +} + +/* Validate parquet data with context (convenience wrapper) */ +static int ctx_validate(test_context *ctx, int expected_records, + const field_expectation *expectations, int num_expectations) +{ + return validate_parquet_data(ctx->parquet_file, expected_records, + expectations, num_expectations); +} + + +/* Single record basic conversion */ +static void test_basic_conversion(void) +{ + test_context ctx; + int ret; + + init_test_context(&ctx, "parquet_basic"); + ctx.schema = "{\"fields\":[{\"name\":\"message\",\"type\":\"utf8\"},{\"name\":\"level\",\"type\":\"int32\"}]}"; + + ret = ctx_pack_json(&ctx, 1609459200, "{\"message\":\"hello world\",\"level\":1}"); + TEST_CHECK(ret == 0); + + TEST_CHECK(run_conversion(&ctx) == 0); + + ret = validate_parquet_file(ctx.parquet_file, 1, "message", "string", "hello world", 0); + TEST_CHECK(ret == 0); + ret = validate_parquet_file(ctx.parquet_file, -1, "level", "int32", "1", 0); + TEST_CHECK(ret == 0); + + cleanup_test_context(&ctx); +} + +/* Multiple records */ +static void test_multiple_records(void) +{ + test_context ctx; + int i, ret; + + init_test_context(&ctx, "parquet_multi"); + ctx.schema = "{\"fields\":[{\"name\":\"id\",\"type\":\"int32\"},{\"name\":\"message\",\"type\":\"utf8\"}]}"; + + /* Pack 100 records */ + for (i = 0; i < 100; i++) { + char json_buf[256]; + snprintf(json_buf, sizeof(json_buf), "{\"id\":%d,\"message\":\"test\"}", i); + ret = ctx_pack_json(&ctx, 1609459200 + i, json_buf); + TEST_CHECK(ret == 0); + } + + TEST_CHECK(run_conversion(&ctx) == 0); + + /* Define key validation points */ + 
field_expectation expectations[] = { + {"id", "int32", "0", 0}, /* First record */ + {"message", "string", "test", 0}, + {"id", "int32", "50", 50}, /* Middle record */ + {"message", "string", "test", 50}, + {"id", "int32", "99", 99}, /* Last record */ + {"message", "string", "test", 99} + }; + + ret = ctx_validate(&ctx, 100, expectations, 6); + TEST_CHECK(ret == 0); + + cleanup_test_context(&ctx); +} + +/* Large record count (trigger multiple batches) */ +static void test_large_record_count(void) +{ + test_context ctx; + int i, ret; + const int record_count = 70000; + + init_test_context(&ctx, "parquet_large"); + ctx.schema = "{\"fields\":[{\"name\":\"id\",\"type\":\"int32\"}]}"; + + /* Pack 70000 records to trigger multiple batches */ + for (i = 0; i < record_count; i++) { + char json_buf[64]; + snprintf(json_buf, sizeof(json_buf), "{\"id\":%d}", i); + ret = ctx_pack_json(&ctx, 1609459200 + i, json_buf); + TEST_CHECK(ret == 0); + } + + TEST_CHECK(run_conversion(&ctx) == 0); + + /* Define key validation points across batches */ + field_expectation expectations[] = { + {"id", "int32", "0", 0}, /* First record */ + {"id", "int32", "35000", 35000}, /* Middle record */ + {"id", "int32", "65535", 65535}, /* Last of first batch */ + {"id", "int32", "65536", 65536}, /* First of second batch */ + {"id", "int32", "69999", 69999} /* Last record */ + }; + + ret = ctx_validate(&ctx, record_count, expectations, 5); + TEST_CHECK(ret == 0); + + cleanup_test_context(&ctx); +} + +/* Boolean type - all conversion paths */ +static void test_bool_conversions(void) +{ + test_context ctx; + int ret; + + init_test_context(&ctx, "parquet_bool"); + ctx.schema = "{\"fields\":[" + "{\"name\":\"bool_val\",\"type\":\"bool\"}," + "{\"name\":\"int_to_bool\",\"type\":\"bool\"}," + "{\"name\":\"float_to_bool\",\"type\":\"bool\"}," + "{\"name\":\"str_to_bool\",\"type\":\"bool\"}" + "]}"; + + ret = ctx_pack_json(&ctx, 1609459200, + "{\"bool_val\":true,\"int_to_bool\":1,\"float_to_bool\":1.0,\"str_to_bool\":\"true\"}"); + TEST_CHECK(ret == 0); + + ret = ctx_pack_json(&ctx, 1609459201, + "{\"bool_val\":false,\"int_to_bool\":0,\"float_to_bool\":0.0,\"str_to_bool\":\"no\"}"); + TEST_CHECK(ret == 0); + + TEST_CHECK(run_conversion(&ctx) == 0); + + /* Define expected conversions - verify ALL conversion paths */ + field_expectation expectations[] = { + /* Record 0 - all true conversions */ + {"bool_val", "bool", "true", 0}, + {"int_to_bool", "bool", "true", 0}, /* int 1 -> true */ + {"float_to_bool", "bool", "true", 0}, /* float 1.0 -> true */ + {"str_to_bool", "bool", "true", 0}, /* string "true" -> true */ + /* Record 1 - all false conversions */ + {"bool_val", "bool", "false", 1}, + {"int_to_bool", "bool", "false", 1}, /* int 0 -> false */ + {"float_to_bool", "bool", "false", 1}, /* float 0.0 -> false */ + {"str_to_bool", "bool", "false", 1} /* string "no" -> false */ + }; + + ret = ctx_validate(&ctx, 2, expectations, 8); + TEST_CHECK(ret == 0); + + cleanup_test_context(&ctx); +} + +/* Integer conversions with overflow/underflow */ +static void test_integer_conversions(void) +{ + test_context ctx; + int ret; + + init_test_context(&ctx, "parquet_int"); + ctx.schema = "{\"fields\":[" + "{\"name\":\"int32_normal\",\"type\":\"int32\"}," + "{\"name\":\"int32_from_float\",\"type\":\"int32\"}," + "{\"name\":\"int32_from_string\",\"type\":\"int32\"}," + "{\"name\":\"int32_from_bool\",\"type\":\"int32\"}," + "{\"name\":\"int64_val\",\"type\":\"int64\"}" + "]}"; + + ret = ctx_pack_json(&ctx, 1609459200, + 
"{\"int32_normal\":42,\"int32_from_float\":123.456,\"int32_from_string\":\"999\",\"int32_from_bool\":true,\"int64_val\":9223372036854775807}"); + TEST_CHECK(ret == 0); + + TEST_CHECK(run_conversion(&ctx) == 0); + + /* Define expected conversions */ + field_expectation expectations[] = { + {"int32_normal", "int32", "42", 0}, + {"int32_from_float", "int32", "123", 0}, /* float 123.456 -> int 123 (truncate) */ + {"int32_from_string", "int32", "999", 0}, /* string "999" -> int 999 */ + {"int32_from_bool", "int32", "1", 0}, /* bool true -> int 1 */ + {"int64_val", "int64", NULL, 0} /* Type check only */ + }; + + ret = ctx_validate(&ctx, 1, expectations, 5); + TEST_CHECK(ret == 0); + + cleanup_test_context(&ctx); +} + +/* Float conversions */ +static void test_float_conversions(void) +{ + test_context ctx; + int ret; + + init_test_context(&ctx, "parquet_float"); + ctx.schema = "{\"fields\":[" + "{\"name\":\"float_val\",\"type\":\"float\"}," + "{\"name\":\"float_from_int\",\"type\":\"float\"}," + "{\"name\":\"float_from_string\",\"type\":\"float\"}," + "{\"name\":\"float_from_bool\",\"type\":\"float\"}," + "{\"name\":\"double_val\",\"type\":\"double\"}" + "]}"; + + ret = ctx_pack_json(&ctx, 1609459200, + "{\"float_val\":3.14,\"float_from_int\":42,\"float_from_string\":\"2.71\",\"float_from_bool\":true,\"double_val\":2.718281828}"); + TEST_CHECK(ret == 0); + + TEST_CHECK(run_conversion(&ctx) == 0); + + /* Define expected conversions - type validation */ + field_expectation expectations[] = { + {"float_val", "float", NULL, 0}, /* Type check only */ + {"float_from_int", "float", NULL, 0}, /* int 42 -> float */ + {"float_from_string", "float", NULL, 0}, /* string "2.71" -> float */ + {"float_from_bool", "float", NULL, 0}, /* bool true -> float */ + {"double_val", "double", NULL, 0} /* Type check only */ + }; + + ret = ctx_validate(&ctx, 1, expectations, 5); + TEST_CHECK(ret == 0); + + cleanup_test_context(&ctx); +} + +/* String conversions */ +static void test_string_conversions(void) +{ + test_context ctx; + int ret; + + init_test_context(&ctx, "parquet_string"); + ctx.schema = "{\"fields\":[" + "{\"name\":\"str_val\",\"type\":\"utf8\"}," + "{\"name\":\"str_from_int\",\"type\":\"utf8\"}," + "{\"name\":\"str_from_float\",\"type\":\"utf8\"}," + "{\"name\":\"str_from_bool\",\"type\":\"utf8\"}," + "{\"name\":\"str_from_obj\",\"type\":\"utf8\"}," /* Object -> JSON string */ + "{\"name\":\"str_from_array\",\"type\":\"utf8\"}" /* Array -> JSON string */ + "]}"; + + ret = ctx_pack_json(&ctx, 1609459200, + "{\"str_val\":\"test\",\"str_from_int\":42,\"str_from_float\":3.14,\"str_from_bool\":true,\"str_from_obj\":{\"key\":\"val\"},\"str_from_array\":[1,2,3]}"); + TEST_CHECK(ret == 0); + + TEST_CHECK(run_conversion(&ctx) == 0); + + /* Define expected conversions */ + field_expectation expectations[] = { + {"str_val", "string", "test", 0}, /* string -> string */ + {"str_from_int", "string", "42", 0}, /* int 42 -> string "42" */ + {"str_from_float", "string", "3.140000", 0}, /* float 3.14 -> string "3.140000" */ + {"str_from_bool", "string", "true", 0}, /* bool true -> string "true" */ + {"str_from_obj", "string", "{\"key\":\"val\"}", 0}, /* object -> JSON string */ + {"str_from_array", "string", "[1,2,3]", 0} /* array -> JSON string */ + }; + + ret = ctx_validate(&ctx, 1, expectations, 6); + TEST_CHECK(ret == 0); + + cleanup_test_context(&ctx); +} + +/* Binary type */ +static void test_binary_type(void) +{ + test_context ctx; + const char binary_data[] = {0x00, 0x01, 0x02, 0xFF}; + int ret; + + 
init_test_context(&ctx, "parquet_binary"); + ctx.schema = "{\"fields\":[{\"name\":\"data\",\"type\":\"binary\"}]}"; + + pack_fluent_bit_record(&ctx.packer, 1609459200); + msgpack_pack_map(&ctx.packer, 1); + msgpack_pack_str(&ctx.packer, 4); + msgpack_pack_str_body(&ctx.packer, "data", 4); + msgpack_pack_bin(&ctx.packer, sizeof(binary_data)); + msgpack_pack_bin_body(&ctx.packer, binary_data, sizeof(binary_data)); + + TEST_CHECK(run_conversion(&ctx) == 0); + + ret = validate_parquet_file(ctx.parquet_file, 1, "data", "binary", NULL, 0); + TEST_CHECK(ret == 0); + + cleanup_test_context(&ctx); +} + +/* Timestamp type with conversions */ +static void test_timestamp_type(void) +{ + test_context ctx; + int ret; + + init_test_context(&ctx, "parquet_timestamp"); + ctx.schema = "{\"fields\":[" + "{\"name\":\"ts_int\",\"type\":{\"name\":\"timestamp\",\"unit\":\"s\"}}," + "{\"name\":\"ts_float\",\"type\":{\"name\":\"timestamp\",\"unit\":\"ms\"}}," + "{\"name\":\"ts_string\",\"type\":{\"name\":\"timestamp\",\"unit\":\"us\"}}," + "{\"name\":\"ts_bool\",\"type\":{\"name\":\"timestamp\",\"unit\":\"ns\"}}" + "]}"; + + pack_fluent_bit_record(&ctx.packer, 1609459200); + msgpack_pack_map(&ctx.packer, 4); + + msgpack_pack_str(&ctx.packer, 6); + msgpack_pack_str_body(&ctx.packer, "ts_int", 6); + msgpack_pack_int64(&ctx.packer, 1609459200LL); + + msgpack_pack_str(&ctx.packer, 8); + msgpack_pack_str_body(&ctx.packer, "ts_float", 8); + msgpack_pack_double(&ctx.packer, 1609459200000.0); + + msgpack_pack_str(&ctx.packer, 9); + msgpack_pack_str_body(&ctx.packer, "ts_string", 9); + msgpack_pack_str(&ctx.packer, 16); + msgpack_pack_str_body(&ctx.packer, "1609459200000000", 16); + + msgpack_pack_str(&ctx.packer, 7); + msgpack_pack_str_body(&ctx.packer, "ts_bool", 7); + msgpack_pack_true(&ctx.packer); + + TEST_CHECK(run_conversion(&ctx) == 0); + + /* Validate timestamp types with different units + * Note: Parquet format does not support second-precision timestamps. + * Arrow automatically converts timestamp[s] to timestamp[ms]. 
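+     * For example, "ts_int" is declared with unit "s" in the schema above but is expected to read back as timestamp[ms] below.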
+ */ + ret = validate_parquet_file(ctx.parquet_file, -1, "ts_int", "timestamp[ms]", NULL, 0); + TEST_CHECK(ret == 0); + ret = validate_parquet_file(ctx.parquet_file, -1, "ts_float", "timestamp[ms]", NULL, 0); + TEST_CHECK(ret == 0); + ret = validate_parquet_file(ctx.parquet_file, -1, "ts_string", "timestamp[us]", NULL, 0); + TEST_CHECK(ret == 0); + ret = validate_parquet_file(ctx.parquet_file, -1, "ts_bool", "timestamp[ns]", NULL, 0); + TEST_CHECK(ret == 0); + + cleanup_test_context(&ctx); +} + +/* Nullable fields with NULL values */ +static void test_nullable_fields(void) +{ + test_context ctx; + int ret; + + init_test_context(&ctx, "parquet_nullable"); + ctx.schema = "{\"fields\":[" + "{\"name\":\"required_field\",\"type\":\"utf8\",\"nullable\":false}," + "{\"name\":\"optional_field\",\"type\":\"utf8\",\"nullable\":true}" + "]}"; + + /* Record with only required field */ + pack_fluent_bit_record(&ctx.packer, 1609459200); + msgpack_pack_map(&ctx.packer, 1); + msgpack_pack_str(&ctx.packer, 14); + msgpack_pack_str_body(&ctx.packer, "required_field", 14); + msgpack_pack_str(&ctx.packer, 4); + msgpack_pack_str_body(&ctx.packer, "test", 4); + + /* Record with NIL in optional field */ + pack_fluent_bit_record(&ctx.packer, 1609459201); + msgpack_pack_map(&ctx.packer, 2); + msgpack_pack_str(&ctx.packer, 14); + msgpack_pack_str_body(&ctx.packer, "required_field", 14); + msgpack_pack_str(&ctx.packer, 4); + msgpack_pack_str_body(&ctx.packer, "test", 4); + msgpack_pack_str(&ctx.packer, 14); + msgpack_pack_str_body(&ctx.packer, "optional_field", 14); + msgpack_pack_nil(&ctx.packer); + + TEST_CHECK(run_conversion(&ctx) == 0); + + cleanup_test_context(&ctx); +} + +/* Non-nullable field with default value */ +static void test_default_values(void) +{ + test_context ctx; + + init_test_context(&ctx, "parquet_defaults"); + ctx.schema = "{\"fields\":[" + "{\"name\":\"int_field\",\"type\":\"int32\",\"nullable\":false}," + "{\"name\":\"str_field\",\"type\":\"utf8\",\"nullable\":false}" + "]}"; + + /* Record missing str_field - should get default empty string */ + pack_fluent_bit_record(&ctx.packer, 1609459200); + msgpack_pack_map(&ctx.packer, 1); + msgpack_pack_str(&ctx.packer, 9); + msgpack_pack_str_body(&ctx.packer, "int_field", 9); + msgpack_pack_int(&ctx.packer, 42); + + TEST_CHECK(run_conversion(&ctx) == 0); + + cleanup_test_context(&ctx); +} + +/* All compression types - tests GZIP, Snappy, ZSTD, and None */ +static void test_all_compression_types(void) +{ + const int compressions[] = { + FLB_AWS_COMPRESS_NONE, + FLB_AWS_COMPRESS_GZIP, + FLB_AWS_COMPRESS_SNAPPY, + FLB_AWS_COMPRESS_ZSTD + }; + const char *names[] = {"none", "gzip", "snappy", "zstd"}; + const char *test_data[] = { + "{\"data\":\"uncompressed\"}", + "{\"data\":\"gzip compressed\"}", + "{\"data\":\"snappy compressed\"}", + "{\"data\":\"zstd compressed\"}" + }; + const char *schema = "{\"fields\":[{\"name\":\"data\",\"type\":\"utf8\"}]}"; + int i; + + for (i = 0; i < 4; i++) { + msgpack_sbuffer sbuf; + msgpack_packer packer; + char msgpack_file[256], parquet_file[256]; + size_t parquet_size = 0; + int ret; + + snprintf(msgpack_file, sizeof(msgpack_file), + "/tmp/flb_test_compress_%s.msgpack", names[i]); + snprintf(parquet_file, sizeof(parquet_file), + "/tmp/flb_test_compress_%s.parquet", names[i]); + + msgpack_sbuffer_init(&sbuf); + msgpack_packer_init(&packer, &sbuf, msgpack_sbuffer_write); + + ret = pack_json_record(&sbuf, &packer, 1609459200, test_data[i]); + TEST_CHECK(ret == 0); + + ret = write_msgpack_to_chunk_file(msgpack_file, 
sbuf.data, sbuf.size); + TEST_CHECK(ret == 0); + + ret = flb_msgpack_raw_to_parquet_file_streaming(msgpack_file, schema, + compressions[i], + parquet_file, &parquet_size, 0); + TEST_CHECK(ret == 0); + TEST_CHECK(check_parquet_file(parquet_file, NULL) == 0); + + /* Validate data can be read back correctly */ + ret = validate_parquet_file(parquet_file, 1, "data", "string", NULL, 0); + TEST_CHECK(ret == 0); + + msgpack_sbuffer_destroy(&sbuf); + unlink(msgpack_file); + unlink(parquet_file); + } +} + +/* Empty strings and binary data */ +static void test_boundary_empty_data(void) +{ + test_context ctx; + int ret; + + init_test_context(&ctx, "parquet_empty"); + ctx.schema = "{\"fields\":[" + "{\"name\":\"empty_str\",\"type\":\"utf8\"}," + "{\"name\":\"empty_bin\",\"type\":\"binary\"}" + "]}"; + + pack_fluent_bit_record(&ctx.packer, 1609459200); + msgpack_pack_map(&ctx.packer, 2); + + msgpack_pack_str(&ctx.packer, 9); + msgpack_pack_str_body(&ctx.packer, "empty_str", 9); + msgpack_pack_str(&ctx.packer, 0); + + msgpack_pack_str(&ctx.packer, 9); + msgpack_pack_str_body(&ctx.packer, "empty_bin", 9); + msgpack_pack_bin(&ctx.packer, 0); + + TEST_CHECK(run_conversion(&ctx) == 0); + + ret = validate_parquet_file(ctx.parquet_file, 1, "empty_str", "string", "", 0); + TEST_CHECK(ret == 0); + + cleanup_test_context(&ctx); +} + +/* Extreme integer values */ +static void test_boundary_extreme_integers(void) +{ + test_context ctx; + int ret; + + init_test_context(&ctx, "parquet_extreme_int"); + ctx.schema = "{\"fields\":[" + "{\"name\":\"int32_min\",\"type\":\"int32\"}," + "{\"name\":\"int32_max\",\"type\":\"int32\"}," + "{\"name\":\"int64_min\",\"type\":\"int64\"}," + "{\"name\":\"int64_max\",\"type\":\"int64\"}" + "]}"; + + pack_fluent_bit_record(&ctx.packer, 1609459200); + msgpack_pack_map(&ctx.packer, 4); + + msgpack_pack_str(&ctx.packer, 9); + msgpack_pack_str_body(&ctx.packer, "int32_min", 9); + msgpack_pack_int64(&ctx.packer, -2147483648LL); /* INT32_MIN */ + + msgpack_pack_str(&ctx.packer, 9); + msgpack_pack_str_body(&ctx.packer, "int32_max", 9); + msgpack_pack_int64(&ctx.packer, 2147483647LL); /* INT32_MAX */ + + msgpack_pack_str(&ctx.packer, 9); + msgpack_pack_str_body(&ctx.packer, "int64_min", 9); + msgpack_pack_int64(&ctx.packer, -9223372036854775807LL - 1); /* INT64_MIN */ + + msgpack_pack_str(&ctx.packer, 9); + msgpack_pack_str_body(&ctx.packer, "int64_max", 9); + msgpack_pack_int64(&ctx.packer, 9223372036854775807LL); /* INT64_MAX */ + + TEST_CHECK(run_conversion(&ctx) == 0); + + /* Validate extreme integer values */ + ret = validate_parquet_file(ctx.parquet_file, 1, "int32_min", "int32", "-2147483648", 0); + TEST_CHECK(ret == 0); + ret = validate_parquet_file(ctx.parquet_file, -1, "int32_max", "int32", "2147483647", 0); + TEST_CHECK(ret == 0); + + cleanup_test_context(&ctx); +} + +/* Special floating point values */ +static void test_boundary_special_floats(void) +{ + test_context ctx; + int ret; + + init_test_context(&ctx, "parquet_special_float"); + ctx.schema = "{\"fields\":[" + "{\"name\":\"zero\",\"type\":\"double\"}," + "{\"name\":\"neg_zero\",\"type\":\"double\"}," + "{\"name\":\"very_small\",\"type\":\"double\"}," + "{\"name\":\"very_large\",\"type\":\"double\"}" + "]}"; + + pack_fluent_bit_record(&ctx.packer, 1609459200); + msgpack_pack_map(&ctx.packer, 4); + + msgpack_pack_str(&ctx.packer, 4); + msgpack_pack_str_body(&ctx.packer, "zero", 4); + msgpack_pack_double(&ctx.packer, 0.0); + + msgpack_pack_str(&ctx.packer, 8); + msgpack_pack_str_body(&ctx.packer, "neg_zero", 8); + 
msgpack_pack_double(&ctx.packer, -0.0); + + msgpack_pack_str(&ctx.packer, 10); + msgpack_pack_str_body(&ctx.packer, "very_small", 10); + msgpack_pack_double(&ctx.packer, 1.0e-308); /* Near DBL_MIN */ + + msgpack_pack_str(&ctx.packer, 10); + msgpack_pack_str_body(&ctx.packer, "very_large", 10); + msgpack_pack_double(&ctx.packer, 1.0e308); /* Near DBL_MAX */ + + TEST_CHECK(run_conversion(&ctx) == 0); + + /* Validate zero values */ + ret = validate_parquet_file(ctx.parquet_file, 1, "zero", "double", "0.000000", 0); + TEST_CHECK(ret == 0); + + cleanup_test_context(&ctx); +} + +/* Very long strings */ +static void test_boundary_long_string(void) +{ + test_context ctx; + int ret, i; + const size_t long_str_size = 100000; /* 100KB string */ + char *long_str = (char *)malloc(long_str_size); + + if (!long_str) { + TEST_CHECK(0); /* Memory allocation failed */ + return; + } + + /* Fill with repeating pattern */ + for (i = 0; i < long_str_size; i++) { + long_str[i] = 'A' + (i % 26); + } + + init_test_context(&ctx, "parquet_long_str"); + ctx.schema = "{\"fields\":[{\"name\":\"long_text\",\"type\":\"utf8\"}]}"; + + pack_fluent_bit_record(&ctx.packer, 1609459200); + msgpack_pack_map(&ctx.packer, 1); + msgpack_pack_str(&ctx.packer, 9); + msgpack_pack_str_body(&ctx.packer, "long_text", 9); + msgpack_pack_str(&ctx.packer, long_str_size); + msgpack_pack_str_body(&ctx.packer, long_str, long_str_size); + + TEST_CHECK(run_conversion(&ctx) == 0); + + /* Validate record count for long string */ + ret = validate_parquet_file(ctx.parquet_file, 1, NULL, NULL, NULL, 0); + TEST_CHECK(ret == 0); + + free(long_str); + cleanup_test_context(&ctx); +} + +/* Empty map (no fields) */ +static void test_boundary_empty_map(void) +{ + test_context ctx; + int ret; + + init_test_context(&ctx, "parquet_empty_map"); + ctx.schema = "{\"fields\":[{\"name\":\"field1\",\"type\":\"utf8\",\"nullable\":true}]}"; + + pack_fluent_bit_record(&ctx.packer, 1609459200); + msgpack_pack_map(&ctx.packer, 0); + + TEST_CHECK(run_conversion(&ctx) == 0); + + ret = validate_parquet_file(ctx.parquet_file, 1, NULL, NULL, NULL, 0); + TEST_CHECK(ret == 0); + + cleanup_test_context(&ctx); +} + +/* Zero value boundary for all numeric types */ +static void test_boundary_zero_values(void) +{ + test_context ctx; + int ret; + + init_test_context(&ctx, "parquet_zeros"); + ctx.schema = "{\"fields\":[" + "{\"name\":\"int32_zero\",\"type\":\"int32\"}," + "{\"name\":\"int64_zero\",\"type\":\"int64\"}," + "{\"name\":\"float_zero\",\"type\":\"float\"}," + "{\"name\":\"double_zero\",\"type\":\"double\"}" + "]}"; + + pack_fluent_bit_record(&ctx.packer, 1609459200); + msgpack_pack_map(&ctx.packer, 4); + + msgpack_pack_str(&ctx.packer, 10); + msgpack_pack_str_body(&ctx.packer, "int32_zero", 10); + msgpack_pack_int(&ctx.packer, 0); + + msgpack_pack_str(&ctx.packer, 10); + msgpack_pack_str_body(&ctx.packer, "int64_zero", 10); + msgpack_pack_int64(&ctx.packer, 0LL); + + msgpack_pack_str(&ctx.packer, 10); + msgpack_pack_str_body(&ctx.packer, "float_zero", 10); + msgpack_pack_float(&ctx.packer, 0.0f); + + msgpack_pack_str(&ctx.packer, 11); + msgpack_pack_str_body(&ctx.packer, "double_zero", 11); + msgpack_pack_double(&ctx.packer, 0.0); + + TEST_CHECK(run_conversion(&ctx) == 0); + + ret = validate_parquet_file(ctx.parquet_file, 1, "int32_zero", "int32", "0", 0); + TEST_CHECK(ret == 0); + ret = validate_parquet_file(ctx.parquet_file, -1, "int64_zero", "int64", "0", 0); + TEST_CHECK(ret == 0); + + cleanup_test_context(&ctx); +} + +/* Truncated/corrupted msgpack data */ +static 
void test_destructive_truncated_data(void) +{ + test_context ctx; + int ret; + + init_test_context(&ctx, "parquet_truncated"); + ctx.schema = "{\"fields\":[{\"name\":\"message\",\"type\":\"utf8\"}]}"; + + pack_fluent_bit_record(&ctx.packer, 1609459200); + msgpack_pack_map(&ctx.packer, 1); + msgpack_pack_str(&ctx.packer, 7); + msgpack_pack_str_body(&ctx.packer, "message", 7); + msgpack_pack_str(&ctx.packer, 10); + msgpack_pack_str_body(&ctx.packer, "test", 4); /* Only write 4 bytes but claim 10 */ + + /* Write truncated data */ + ret = write_msgpack_to_chunk_file(ctx.msgpack_file, ctx.sbuf.data, ctx.sbuf.size / 2); + TEST_CHECK(ret == 0); + + ret = flb_msgpack_raw_to_parquet_file_streaming(ctx.msgpack_file, ctx.schema, + FLB_AWS_COMPRESS_NONE, + ctx.parquet_file, &ctx.parquet_size, 0); + TEST_CHECK(ret == -1); /* Should fail */ + + cleanup_test_context(&ctx); +} + +/* Invalid JSON schema */ +static void test_destructive_invalid_schema_json(void) +{ + test_context ctx; + int ret; + const char *bad_schema = "{\"fields\":[{\"name\":\"test\",\"type\":\"utf8\""; /* Missing closing braces */ + + init_test_context(&ctx, "parquet_bad_schema"); + + pack_fluent_bit_record(&ctx.packer, 1609459200); + msgpack_pack_map(&ctx.packer, 1); + msgpack_pack_str(&ctx.packer, 4); + msgpack_pack_str_body(&ctx.packer, "test", 4); + msgpack_pack_str(&ctx.packer, 4); + msgpack_pack_str_body(&ctx.packer, "data", 4); + + ret = write_msgpack_to_chunk_file(ctx.msgpack_file, ctx.sbuf.data, ctx.sbuf.size); + TEST_CHECK(ret == 0); + + ret = flb_msgpack_raw_to_parquet_file_streaming(ctx.msgpack_file, bad_schema, + FLB_AWS_COMPRESS_NONE, + ctx.parquet_file, &ctx.parquet_size, 0); + TEST_CHECK(ret == -1); /* Should fail */ + + cleanup_test_context(&ctx); +} + +/* Empty schema (no fields) */ +static void test_destructive_empty_schema(void) +{ + test_context ctx; + int ret; + const char *empty_schema = "{\"fields\":[]}"; /* No fields */ + + init_test_context(&ctx, "parquet_empty_schema"); + + pack_fluent_bit_record(&ctx.packer, 1609459200); + msgpack_pack_map(&ctx.packer, 1); + msgpack_pack_str(&ctx.packer, 4); + msgpack_pack_str_body(&ctx.packer, "test", 4); + msgpack_pack_str(&ctx.packer, 4); + msgpack_pack_str_body(&ctx.packer, "data", 4); + + ret = write_msgpack_to_chunk_file(ctx.msgpack_file, ctx.sbuf.data, ctx.sbuf.size); + TEST_CHECK(ret == 0); + + ret = flb_msgpack_raw_to_parquet_file_streaming(ctx.msgpack_file, empty_schema, + FLB_AWS_COMPRESS_NONE, + ctx.parquet_file, &ctx.parquet_size, 0); + TEST_CHECK(ret == -1); /* Should fail */ + + cleanup_test_context(&ctx); +} + +/* Schema with unsupported type */ +static void test_destructive_unsupported_type(void) +{ + test_context ctx; + int ret; + + init_test_context(&ctx, "parquet_unsupported"); + ctx.schema = "{\"fields\":[{\"name\":\"test\",\"type\":\"unknown_type\"}]}"; + + pack_fluent_bit_record(&ctx.packer, 1609459200); + msgpack_pack_map(&ctx.packer, 1); + msgpack_pack_str(&ctx.packer, 4); + msgpack_pack_str_body(&ctx.packer, "test", 4); + msgpack_pack_str(&ctx.packer, 4); + msgpack_pack_str_body(&ctx.packer, "data", 4); + + /* Should fall back to utf8 type and succeed */ + TEST_CHECK(run_conversion(&ctx) == 0); + + cleanup_test_context(&ctx); +} + +/* Invalid compression type */ +static void test_destructive_invalid_compression(void) +{ + test_context ctx; + int ret; + + init_test_context(&ctx, "parquet_bad_compress"); + ctx.schema = "{\"fields\":[{\"name\":\"test\",\"type\":\"utf8\"}]}"; + ctx.compression = 999; /* Invalid compression type */ + + 
pack_fluent_bit_record(&ctx.packer, 1609459200); + msgpack_pack_map(&ctx.packer, 1); + msgpack_pack_str(&ctx.packer, 4); + msgpack_pack_str_body(&ctx.packer, "test", 4); + msgpack_pack_str(&ctx.packer, 4); + msgpack_pack_str_body(&ctx.packer, "data", 4); + + /* Use invalid compression type (should default to UNCOMPRESSED) */ + TEST_CHECK(run_conversion(&ctx) == 0); /* Should succeed with default compression */ + + cleanup_test_context(&ctx); +} + +/* Type conversion failure - unparseable string */ +static void test_destructive_unparseable_conversion(void) +{ + test_context ctx; + int ret; + + init_test_context(&ctx, "parquet_bad_convert"); + ctx.schema = "{\"fields\":[" + "{\"name\":\"int_field\",\"type\":\"int32\",\"nullable\":false}," + "{\"name\":\"float_field\",\"type\":\"float\",\"nullable\":false}," + "{\"name\":\"bool_field\",\"type\":\"bool\",\"nullable\":false}" + "]}"; + + pack_fluent_bit_record(&ctx.packer, 1609459200); + msgpack_pack_map(&ctx.packer, 3); + + /* Strings that cannot be parsed to respective types */ + msgpack_pack_str(&ctx.packer, 9); + msgpack_pack_str_body(&ctx.packer, "int_field", 9); + msgpack_pack_str(&ctx.packer, 12); + msgpack_pack_str_body(&ctx.packer, "not_a_number", 12); + + msgpack_pack_str(&ctx.packer, 11); + msgpack_pack_str_body(&ctx.packer, "float_field", 11); + msgpack_pack_str(&ctx.packer, 6); + msgpack_pack_str_body(&ctx.packer, "xyz123", 6); + + msgpack_pack_str(&ctx.packer, 10); + msgpack_pack_str_body(&ctx.packer, "bool_field", 10); + msgpack_pack_str(&ctx.packer, 7); + msgpack_pack_str_body(&ctx.packer, "invalid", 7); + + /* Should use default values (0, 0.0, false) */ + TEST_CHECK(run_conversion(&ctx) == 0); + + /* Validate default values are used for unparseable conversions */ + ret = validate_parquet_file(ctx.parquet_file, -1, "int_field", "int32", "0", 0); + TEST_CHECK(ret == 0); + ret = validate_parquet_file(ctx.parquet_file, -1, "float_field", "float", "0.000000", 0); + TEST_CHECK(ret == 0); + ret = validate_parquet_file(ctx.parquet_file, -1, "bool_field", "bool", "false", 0); + TEST_CHECK(ret == 0); + + cleanup_test_context(&ctx); +} + +/* NULL input parameters */ +static void test_error_null_input(void) +{ + const char *parquet_file = "/tmp/flb_test_parquet_error.parquet"; + size_t parquet_size = 0; + int ret; + const char *schema = "{\"fields\":[{\"name\":\"test\",\"type\":\"utf8\"}]}"; + + ret = flb_msgpack_raw_to_parquet_file_streaming(NULL, schema, + FLB_AWS_COMPRESS_NONE, + parquet_file, &parquet_size, 0); + TEST_CHECK(ret == -1); +} + +/* NULL schema */ +static void test_error_null_schema(void) +{ + msgpack_sbuffer sbuf; + msgpack_packer packer; + const char *msgpack_file = "/tmp/flb_test_parquet_noschema.msgpack"; + const char *parquet_file = "/tmp/flb_test_parquet_noschema.parquet"; + size_t parquet_size = 0; + int ret; + + msgpack_sbuffer_init(&sbuf); + msgpack_packer_init(&packer, &sbuf, msgpack_sbuffer_write); + + pack_fluent_bit_record(&packer, 1609459200); + msgpack_pack_map(&packer, 1); + msgpack_pack_str(&packer, 4); + msgpack_pack_str_body(&packer, "test", 4); + msgpack_pack_str(&packer, 4); + msgpack_pack_str_body(&packer, "data", 4); + + ret = write_msgpack_to_chunk_file(msgpack_file, sbuf.data, sbuf.size); + TEST_CHECK(ret == 0); + + ret = flb_msgpack_raw_to_parquet_file_streaming(msgpack_file, NULL, + FLB_AWS_COMPRESS_NONE, + parquet_file, &parquet_size, 0); + TEST_CHECK(ret == -1); + + msgpack_sbuffer_destroy(&sbuf); + unlink(msgpack_file); +} + +/* Nonexistent input file */ +static void 
test_error_missing_file(void) +{ + const char *msgpack_file = "/tmp/flb_test_parquet_nonexistent.msgpack"; + const char *parquet_file = "/tmp/flb_test_parquet_nonexistent.parquet"; + size_t parquet_size = 0; + int ret; + const char *schema = "{\"fields\":[{\"name\":\"test\",\"type\":\"utf8\"}]}"; + + ret = flb_msgpack_raw_to_parquet_file_streaming(msgpack_file, schema, + FLB_AWS_COMPRESS_NONE, + parquet_file, &parquet_size, 0); + TEST_CHECK(ret == -1); +} + +/* Invalid record format (not array) */ +static void test_error_invalid_format(void) +{ + test_context ctx; + int ret; + + init_test_context(&ctx, "parquet_invalid"); + ctx.schema = "{\"fields\":[{\"name\":\"test\",\"type\":\"utf8\"}]}"; + + /* Pack just a map, not [timestamp, map] */ + msgpack_pack_map(&ctx.packer, 1); + msgpack_pack_str(&ctx.packer, 4); + msgpack_pack_str_body(&ctx.packer, "test", 4); + msgpack_pack_str(&ctx.packer, 4); + msgpack_pack_str_body(&ctx.packer, "data", 4); + + ret = write_msgpack_to_chunk_file(ctx.msgpack_file, ctx.sbuf.data, ctx.sbuf.size); + TEST_CHECK(ret == 0); + + /* Should skip invalid records, resulting in no records */ + ret = flb_msgpack_raw_to_parquet_file_streaming(ctx.msgpack_file, ctx.schema, + FLB_AWS_COMPRESS_NONE, + ctx.parquet_file, &ctx.parquet_size, 0); + TEST_CHECK(ret == -1); /* No records processed */ + + cleanup_test_context(&ctx); +} + +/* Schema has MORE fields than data - Critical for crash fix validation */ +static void test_edge_schema_more_fields(void) +{ + test_context ctx; + int ret; + + init_test_context(&ctx, "edge_more_schema"); + /* Schema has 5 fields */ + ctx.schema = "{\"fields\":[" + "{\"name\":\"field1\",\"type\":\"utf8\",\"nullable\":false}," + "{\"name\":\"field2\",\"type\":\"int32\",\"nullable\":false}," + "{\"name\":\"field3\",\"type\":\"utf8\",\"nullable\":false}," + "{\"name\":\"field4\",\"type\":\"int32\",\"nullable\":false}," + "{\"name\":\"field5\",\"type\":\"utf8\",\"nullable\":false}" + "]}"; + + /* Data only has 2 fields - field3, field4, field5 missing */ + pack_fluent_bit_record(&ctx.packer, 1609459200); + msgpack_pack_map(&ctx.packer, 2); + msgpack_pack_str(&ctx.packer, 6); + msgpack_pack_str_body(&ctx.packer, "field1", 6); + msgpack_pack_str(&ctx.packer, 4); + msgpack_pack_str_body(&ctx.packer, "val1", 4); + msgpack_pack_str(&ctx.packer, 6); + msgpack_pack_str_body(&ctx.packer, "field2", 6); + msgpack_pack_int(&ctx.packer, 42); + + /* Should succeed - missing fields get default values */ + TEST_CHECK(run_conversion(&ctx) == 0); + + /* Validate present fields */ + ret = validate_parquet_file(ctx.parquet_file, 1, "field2", "int32", "42", 0); + TEST_CHECK(ret == 0); + + /* Validate missing fields got default values */ + ret = validate_parquet_file(ctx.parquet_file, -1, "field3", "string", "", 0); + TEST_CHECK(ret == 0); + ret = validate_parquet_file(ctx.parquet_file, -1, "field4", "int32", "0", 0); + TEST_CHECK(ret == 0); + + cleanup_test_context(&ctx); +} + +/* Schema has LESS fields than data */ +static void test_edge_schema_less_fields(void) +{ + test_context ctx; + int ret; + + init_test_context(&ctx, "edge_less_schema"); + /* Schema only has 2 fields */ + ctx.schema = "{\"fields\":[" + "{\"name\":\"field1\",\"type\":\"utf8\"}," + "{\"name\":\"field2\",\"type\":\"int32\"}" + "]}"; + + /* Data has 5 fields - extra fields should be ignored */ + pack_fluent_bit_record(&ctx.packer, 1609459200); + msgpack_pack_map(&ctx.packer, 5); + msgpack_pack_str(&ctx.packer, 6); + msgpack_pack_str_body(&ctx.packer, "field1", 6); + msgpack_pack_str(&ctx.packer, 4); + 
msgpack_pack_str_body(&ctx.packer, "val1", 4); + msgpack_pack_str(&ctx.packer, 6); + msgpack_pack_str_body(&ctx.packer, "field2", 6); + msgpack_pack_int(&ctx.packer, 42); + msgpack_pack_str(&ctx.packer, 6); + msgpack_pack_str_body(&ctx.packer, "field3", 6); + msgpack_pack_str(&ctx.packer, 4); + msgpack_pack_str_body(&ctx.packer, "val3", 4); + msgpack_pack_str(&ctx.packer, 6); + msgpack_pack_str_body(&ctx.packer, "field4", 6); + msgpack_pack_int(&ctx.packer, 99); + msgpack_pack_str(&ctx.packer, 6); + msgpack_pack_str_body(&ctx.packer, "field5", 6); + msgpack_pack_str(&ctx.packer, 4); + msgpack_pack_str_body(&ctx.packer, "val5", 4); + + /* Should succeed - extra data fields ignored */ + TEST_CHECK(run_conversion(&ctx) == 0); + + /* Validate only schema fields are present */ + ret = validate_parquet_file(ctx.parquet_file, 1, "field1", "string", "val1", 0); + TEST_CHECK(ret == 0); + ret = validate_parquet_file(ctx.parquet_file, -1, "field2", "int32", "42", 0); + TEST_CHECK(ret == 0); + + cleanup_test_context(&ctx); +} + +/* Field name mismatch */ +static void test_edge_field_name_mismatch(void) +{ + test_context ctx; + int ret; + + init_test_context(&ctx, "edge_name_mismatch"); + ctx.schema = "{\"fields\":[" + "{\"name\":\"expected_field\",\"type\":\"utf8\",\"nullable\":false}" + "]}"; + + /* Data has different field name */ + pack_fluent_bit_record(&ctx.packer, 1609459200); + msgpack_pack_map(&ctx.packer, 1); + msgpack_pack_str(&ctx.packer, 12); + msgpack_pack_str_body(&ctx.packer, "actual_field", 12); + msgpack_pack_str(&ctx.packer, 4); + msgpack_pack_str_body(&ctx.packer, "data", 4); + + /* Should succeed - missing field gets default */ + TEST_CHECK(run_conversion(&ctx) == 0); + + /* Validate default value used */ + ret = validate_parquet_file(ctx.parquet_file, 1, "expected_field", "string", "", 0); + TEST_CHECK(ret == 0); + + cleanup_test_context(&ctx); +} + +/* ALL fields missing from data */ +static void test_edge_all_fields_missing(void) +{ + test_context ctx; + int ret; + + init_test_context(&ctx, "edge_all_missing"); + ctx.schema = "{\"fields\":[" + "{\"name\":\"field1\",\"type\":\"utf8\",\"nullable\":false}," + "{\"name\":\"field2\",\"type\":\"int32\",\"nullable\":false}," + "{\"name\":\"field3\",\"type\":\"bool\",\"nullable\":false}" + "]}"; + + /* Data has completely different fields */ + pack_fluent_bit_record(&ctx.packer, 1609459200); + msgpack_pack_map(&ctx.packer, 2); + msgpack_pack_str(&ctx.packer, 10); + msgpack_pack_str_body(&ctx.packer, "unrelated1", 10); + msgpack_pack_str(&ctx.packer, 3); + msgpack_pack_str_body(&ctx.packer, "xyz", 3); + msgpack_pack_str(&ctx.packer, 10); + msgpack_pack_str_body(&ctx.packer, "unrelated2", 10); + msgpack_pack_int(&ctx.packer, 999); + + /* Should succeed - all fields get defaults */ + TEST_CHECK(run_conversion(&ctx) == 0); + + /* Validate all default values */ + ret = validate_parquet_file(ctx.parquet_file, 1, "field1", "string", "", 0); + TEST_CHECK(ret == 0); + ret = validate_parquet_file(ctx.parquet_file, -1, "field2", "int32", "0", 0); + TEST_CHECK(ret == 0); + ret = validate_parquet_file(ctx.parquet_file, -1, "field3", "bool", "false", 0); + TEST_CHECK(ret == 0); + + cleanup_test_context(&ctx); +} + +/* Mixed present and missing fields */ +static void test_edge_mixed_present_missing(void) +{ + msgpack_sbuffer sbuf; + msgpack_packer packer; + const char *msgpack_file = "/tmp/flb_test_edge_mixed.msgpack"; + const char *parquet_file = "/tmp/flb_test_edge_mixed.parquet"; + size_t parquet_size = 0; + int ret, i; + const char *schema = 
"{\"fields\":[" + "{\"name\":\"id\",\"type\":\"int32\",\"nullable\":false}," + "{\"name\":\"name\",\"type\":\"utf8\",\"nullable\":false}," + "{\"name\":\"score\",\"type\":\"int32\",\"nullable\":false}," + "{\"name\":\"status\",\"type\":\"utf8\",\"nullable\":false}" + "]}"; + + msgpack_sbuffer_init(&sbuf); + msgpack_packer_init(&packer, &sbuf, msgpack_sbuffer_write); + + /* Multiple records with different field patterns */ + for (i = 0; i < 10; i++) { + pack_fluent_bit_record(&packer, 1609459200 + i); + + if (i % 3 == 0) { + /* All fields present */ + msgpack_pack_map(&packer, 4); + msgpack_pack_str(&packer, 2); + msgpack_pack_str_body(&packer, "id", 2); + msgpack_pack_int(&packer, i); + msgpack_pack_str(&packer, 4); + msgpack_pack_str_body(&packer, "name", 4); + msgpack_pack_str(&packer, 4); + msgpack_pack_str_body(&packer, "test", 4); + msgpack_pack_str(&packer, 5); + msgpack_pack_str_body(&packer, "score", 5); + msgpack_pack_int(&packer, i * 10); + msgpack_pack_str(&packer, 6); + msgpack_pack_str_body(&packer, "status", 6); + msgpack_pack_str(&packer, 2); + msgpack_pack_str_body(&packer, "ok", 2); + } else if (i % 3 == 1) { + /* Only id and name */ + msgpack_pack_map(&packer, 2); + msgpack_pack_str(&packer, 2); + msgpack_pack_str_body(&packer, "id", 2); + msgpack_pack_int(&packer, i); + msgpack_pack_str(&packer, 4); + msgpack_pack_str_body(&packer, "name", 4); + msgpack_pack_str(&packer, 7); + msgpack_pack_str_body(&packer, "partial", 7); + } else { + /* Only id */ + msgpack_pack_map(&packer, 1); + msgpack_pack_str(&packer, 2); + msgpack_pack_str_body(&packer, "id", 2); + msgpack_pack_int(&packer, i); + } + } + + ret = write_msgpack_to_chunk_file(msgpack_file, sbuf.data, sbuf.size); + TEST_CHECK(ret == 0); + + /* Should succeed */ + ret = flb_msgpack_raw_to_parquet_file_streaming(msgpack_file, schema, + FLB_AWS_COMPRESS_NONE, + parquet_file, &parquet_size, 0); + TEST_CHECK(ret == 0); + TEST_CHECK(check_parquet_file(parquet_file, NULL) == 0); + + /* Validate record count */ + ret = validate_parquet_file(parquet_file, 10, NULL, NULL, NULL, 0); + TEST_CHECK(ret == 0); + + /* Validate some values */ + ret = validate_parquet_file(parquet_file, -1, "id", "int32", "0", 0); + TEST_CHECK(ret == 0); + ret = validate_parquet_file(parquet_file, -1, "name", "string", "partial", 1); + TEST_CHECK(ret == 0); + + msgpack_sbuffer_destroy(&sbuf); + unlink(msgpack_file); + unlink(parquet_file); +} + +/* Schema with many fields (50+) */ +static void test_boundary_many_schema_fields(void) +{ + msgpack_sbuffer sbuf; + msgpack_packer packer; + const char *msgpack_file = "/tmp/flb_test_many_schema_fields.msgpack"; + const char *parquet_file = "/tmp/flb_test_many_schema_fields.parquet"; + size_t parquet_size = 0; + int ret, i; + char schema_buf[8192]; + int offset = 0; + + /* Build schema with 50 fields */ + offset += snprintf(schema_buf + offset, sizeof(schema_buf) - offset, "{\"fields\":["); + for (i = 0; i < 50; i++) { + if (i > 0) offset += snprintf(schema_buf + offset, sizeof(schema_buf) - offset, ","); + offset += snprintf(schema_buf + offset, sizeof(schema_buf) - offset, + "{\"name\":\"field%d\",\"type\":\"int32\",\"nullable\":false}", i); + } + offset += snprintf(schema_buf + offset, sizeof(schema_buf) - offset, "]}"); + + msgpack_sbuffer_init(&sbuf); + msgpack_packer_init(&packer, &sbuf, msgpack_sbuffer_write); + + /* Data only has first 10 fields */ + pack_fluent_bit_record(&packer, 1609459200); + msgpack_pack_map(&packer, 10); + for (i = 0; i < 10; i++) { + char field_name[20]; + snprintf(field_name, 
sizeof(field_name), "field%d", i); + msgpack_pack_str(&packer, strlen(field_name)); + msgpack_pack_str_body(&packer, field_name, strlen(field_name)); + msgpack_pack_int(&packer, i * 10); + } + + ret = write_msgpack_to_chunk_file(msgpack_file, sbuf.data, sbuf.size); + TEST_CHECK(ret == 0); + + /* Should succeed - missing 40 fields get defaults */ + ret = flb_msgpack_raw_to_parquet_file_streaming(msgpack_file, schema_buf, + FLB_AWS_COMPRESS_NONE, + parquet_file, &parquet_size, 0); + TEST_CHECK(ret == 0); + + msgpack_sbuffer_destroy(&sbuf); + unlink(msgpack_file); + unlink(parquet_file); +} + +/* Data with many fields (100+) but schema only has few */ +static void test_boundary_many_data_fields(void) +{ + msgpack_sbuffer sbuf; + msgpack_packer packer; + const char *msgpack_file = "/tmp/flb_test_many_data_fields.msgpack"; + const char *parquet_file = "/tmp/flb_test_many_data_fields.parquet"; + size_t parquet_size = 0; + int ret, i; + const char *schema = "{\"fields\":[" + "{\"name\":\"field0\",\"type\":\"int32\"}," + "{\"name\":\"field50\",\"type\":\"int32\"}" + "]}"; + + msgpack_sbuffer_init(&sbuf); + msgpack_packer_init(&packer, &sbuf, msgpack_sbuffer_write); + + /* Data has 100 fields */ + pack_fluent_bit_record(&packer, 1609459200); + msgpack_pack_map(&packer, 100); + for (i = 0; i < 100; i++) { + char field_name[20]; + snprintf(field_name, sizeof(field_name), "field%d", i); + msgpack_pack_str(&packer, strlen(field_name)); + msgpack_pack_str_body(&packer, field_name, strlen(field_name)); + msgpack_pack_int(&packer, i); + } + + ret = write_msgpack_to_chunk_file(msgpack_file, sbuf.data, sbuf.size); + TEST_CHECK(ret == 0); + + /* Should succeed - only field0 and field50 extracted */ + ret = flb_msgpack_raw_to_parquet_file_streaming(msgpack_file, schema, + FLB_AWS_COMPRESS_NONE, + parquet_file, &parquet_size, 0); + TEST_CHECK(ret == 0); + + /* Validate correct fields extracted */ + ret = validate_parquet_file(parquet_file, 1, "field0", "int32", "0", 0); + TEST_CHECK(ret == 0); + ret = validate_parquet_file(parquet_file, -1, "field50", "int32", "50", 0); + TEST_CHECK(ret == 0); + + msgpack_sbuffer_destroy(&sbuf); + unlink(msgpack_file); + unlink(parquet_file); +} + +/* Single field with many records */ +static void test_boundary_single_field_many_records(void) +{ + msgpack_sbuffer sbuf; + msgpack_packer packer; + const char *msgpack_file = "/tmp/flb_test_single_field.msgpack"; + const char *parquet_file = "/tmp/flb_test_single_field.parquet"; + size_t parquet_size = 0; + int ret, i; + const char *schema = "{\"fields\":[{\"name\":\"value\",\"type\":\"int32\",\"nullable\":false}]}"; + + msgpack_sbuffer_init(&sbuf); + msgpack_packer_init(&packer, &sbuf, msgpack_sbuffer_write); + + /* 1000 records with single field */ + for (i = 0; i < 1000; i++) { + pack_fluent_bit_record(&packer, 1609459200 + i); + msgpack_pack_map(&packer, 1); + msgpack_pack_str(&packer, 5); + msgpack_pack_str_body(&packer, "value", 5); + msgpack_pack_int(&packer, i); + } + + ret = write_msgpack_to_chunk_file(msgpack_file, sbuf.data, sbuf.size); + TEST_CHECK(ret == 0); + + ret = flb_msgpack_raw_to_parquet_file_streaming(msgpack_file, schema, + FLB_AWS_COMPRESS_NONE, + parquet_file, &parquet_size, 0); + TEST_CHECK(ret == 0); + + /* Validate count and some values */ + ret = validate_parquet_file(parquet_file, 1000, NULL, NULL, NULL, 0); + TEST_CHECK(ret == 0); + ret = validate_parquet_file(parquet_file, -1, "value", "int32", "0", 0); + TEST_CHECK(ret == 0); + ret = validate_parquet_file(parquet_file, -1, "value", "int32", "999", 
999); + TEST_CHECK(ret == 0); + + msgpack_sbuffer_destroy(&sbuf); + unlink(msgpack_file); + unlink(parquet_file); +} + +/* Schema evolution (new schema with old data) */ +static void test_realworld_schema_evolution(void) +{ + msgpack_sbuffer sbuf; + msgpack_packer packer; + const char *msgpack_file = "/tmp/flb_test_schema_evolution.msgpack"; + const char *parquet_file = "/tmp/flb_test_schema_evolution.parquet"; + size_t parquet_size = 0; + int ret; + /* New schema with added fields */ + const char *schema = "{\"fields\":[" + "{\"name\":\"message\",\"type\":\"utf8\",\"nullable\":false}," + "{\"name\":\"level\",\"type\":\"int32\",\"nullable\":false}," + "{\"name\":\"timestamp\",\"type\":\"int64\",\"nullable\":false}," /* New field */ + "{\"name\":\"source\",\"type\":\"utf8\",\"nullable\":false}" /* New field */ + "]}"; + + msgpack_sbuffer_init(&sbuf); + msgpack_packer_init(&packer, &sbuf, msgpack_sbuffer_write); + + /* Old data format (only message and level) */ + pack_fluent_bit_record(&packer, 1609459200); + msgpack_pack_map(&packer, 2); + msgpack_pack_str(&packer, 7); + msgpack_pack_str_body(&packer, "message", 7); + msgpack_pack_str(&packer, 8); + msgpack_pack_str_body(&packer, "old data", 8); + msgpack_pack_str(&packer, 5); + msgpack_pack_str_body(&packer, "level", 5); + msgpack_pack_int(&packer, 1); + + ret = write_msgpack_to_chunk_file(msgpack_file, sbuf.data, sbuf.size); + TEST_CHECK(ret == 0); + + /* Should succeed with defaults for new fields */ + ret = flb_msgpack_raw_to_parquet_file_streaming(msgpack_file, schema, + FLB_AWS_COMPRESS_NONE, + parquet_file, &parquet_size, 0); + TEST_CHECK(ret == 0); + + /* Validate old fields preserved */ + ret = validate_parquet_file(parquet_file, 1, "message", "string", "old data", 0); + TEST_CHECK(ret == 0); + ret = validate_parquet_file(parquet_file, -1, "level", "int32", "1", 0); + TEST_CHECK(ret == 0); + + /* Validate new fields have defaults */ + ret = validate_parquet_file(parquet_file, -1, "timestamp", "int64", "0", 0); + TEST_CHECK(ret == 0); + ret = validate_parquet_file(parquet_file, -1, "source", "string", "", 0); + TEST_CHECK(ret == 0); + + msgpack_sbuffer_destroy(&sbuf); + unlink(msgpack_file); + unlink(parquet_file); +} + +/* Partial record (simulates crashed fluent-bit) */ +static void test_realworld_partial_record(void) +{ + msgpack_sbuffer sbuf; + msgpack_packer packer; + const char *msgpack_file = "/tmp/flb_test_partial.msgpack"; + const char *parquet_file = "/tmp/flb_test_partial.parquet"; + size_t parquet_size = 0; + int ret; + const char *schema = "{\"fields\":[" + "{\"name\":\"field1\",\"type\":\"utf8\",\"nullable\":false}," + "{\"name\":\"field2\",\"type\":\"utf8\",\"nullable\":false}," + "{\"name\":\"field3\",\"type\":\"utf8\",\"nullable\":false}" + "]}"; + + msgpack_sbuffer_init(&sbuf); + msgpack_packer_init(&packer, &sbuf, msgpack_sbuffer_write); + + /* Complete record */ + pack_fluent_bit_record(&packer, 1609459200); + msgpack_pack_map(&packer, 3); + msgpack_pack_str(&packer, 6); + msgpack_pack_str_body(&packer, "field1", 6); + msgpack_pack_str(&packer, 4); + msgpack_pack_str_body(&packer, "val1", 4); + msgpack_pack_str(&packer, 6); + msgpack_pack_str_body(&packer, "field2", 6); + msgpack_pack_str(&packer, 4); + msgpack_pack_str_body(&packer, "val2", 4); + msgpack_pack_str(&packer, 6); + msgpack_pack_str_body(&packer, "field3", 6); + msgpack_pack_str(&packer, 4); + msgpack_pack_str_body(&packer, "val3", 4); + + /* Partial record (missing field3) */ + pack_fluent_bit_record(&packer, 1609459201); + 
msgpack_pack_map(&packer, 2); + msgpack_pack_str(&packer, 6); + msgpack_pack_str_body(&packer, "field1", 6); + msgpack_pack_str(&packer, 8); + msgpack_pack_str_body(&packer, "partial1", 8); + msgpack_pack_str(&packer, 6); + msgpack_pack_str_body(&packer, "field2", 6); + msgpack_pack_str(&packer, 8); + msgpack_pack_str_body(&packer, "partial2", 8); + + ret = write_msgpack_to_chunk_file(msgpack_file, sbuf.data, sbuf.size); + TEST_CHECK(ret == 0); + + /* Should succeed */ + ret = flb_msgpack_raw_to_parquet_file_streaming(msgpack_file, schema, + FLB_AWS_COMPRESS_NONE, + parquet_file, &parquet_size, 0); + TEST_CHECK(ret == 0); + + /* Validate both records */ + ret = validate_parquet_file(parquet_file, 2, NULL, NULL, NULL, 0); + TEST_CHECK(ret == 0); + ret = validate_parquet_file(parquet_file, -1, "field1", "string", "val1", 0); + TEST_CHECK(ret == 0); + ret = validate_parquet_file(parquet_file, -1, "field1", "string", "partial1", 1); + TEST_CHECK(ret == 0); + ret = validate_parquet_file(parquet_file, -1, "field3", "string", "", 1); /* Default */ + TEST_CHECK(ret == 0); + + msgpack_sbuffer_destroy(&sbuf); + unlink(msgpack_file); + unlink(parquet_file); +} + +/* Extra data fields not in schema */ +static void test_realworld_extra_data_fields(void) +{ + msgpack_sbuffer sbuf; + msgpack_packer packer; + const char *msgpack_file = "/tmp/flb_test_extra_fields.msgpack"; + const char *parquet_file = "/tmp/flb_test_extra_fields.parquet"; + size_t parquet_size = 0; + int ret, i; + /* Schema only defines 3 important fields */ + const char *schema = "{\"fields\":[" + "{\"name\":\"log_level\",\"type\":\"int32\",\"nullable\":false}," + "{\"name\":\"message\",\"type\":\"utf8\",\"nullable\":false}," + "{\"name\":\"timestamp\",\"type\":\"int64\",\"nullable\":false}" + "]}"; + + msgpack_sbuffer_init(&sbuf); + msgpack_packer_init(&packer, &sbuf, msgpack_sbuffer_write); + + /* Multiple records with many extra fields */ + for (i = 0; i < 5; i++) { + pack_fluent_bit_record(&packer, 1609459200 + i); + msgpack_pack_map(&packer, 10); /* 10 fields but schema only has 3 */ + + /* Schema fields */ + msgpack_pack_str(&packer, 9); + msgpack_pack_str_body(&packer, "log_level", 9); + msgpack_pack_int(&packer, i % 3); + + msgpack_pack_str(&packer, 7); + msgpack_pack_str_body(&packer, "message", 7); + msgpack_pack_str(&packer, 8); + msgpack_pack_str_body(&packer, "test msg", 8); + + msgpack_pack_str(&packer, 9); + msgpack_pack_str_body(&packer, "timestamp", 9); + msgpack_pack_int64(&packer, 1609459200LL + i); + + /* Extra fields not in schema */ + msgpack_pack_str(&packer, 8); + msgpack_pack_str_body(&packer, "hostname", 8); + msgpack_pack_str(&packer, 7); + msgpack_pack_str_body(&packer, "server1", 7); + + msgpack_pack_str(&packer, 3); + msgpack_pack_str_body(&packer, "pid", 3); + msgpack_pack_int(&packer, 1234); + + msgpack_pack_str(&packer, 4); + msgpack_pack_str_body(&packer, "user", 4); + msgpack_pack_str(&packer, 4); + msgpack_pack_str_body(&packer, "root", 4); + + msgpack_pack_str(&packer, 7); + msgpack_pack_str_body(&packer, "service", 7); + msgpack_pack_str(&packer, 3); + msgpack_pack_str_body(&packer, "web", 3); + + msgpack_pack_str(&packer, 7); + msgpack_pack_str_body(&packer, "version", 7); + msgpack_pack_str(&packer, 5); + msgpack_pack_str_body(&packer, "1.0.0", 5); + + msgpack_pack_str(&packer, 11); + msgpack_pack_str_body(&packer, "environment", 11); + msgpack_pack_str(&packer, 4); + msgpack_pack_str_body(&packer, "prod", 4); + + msgpack_pack_str(&packer, 6); + msgpack_pack_str_body(&packer, "region", 6); + 
msgpack_pack_str(&packer, 7); + msgpack_pack_str_body(&packer, "us-west", 7); + } + + ret = write_msgpack_to_chunk_file(msgpack_file, sbuf.data, sbuf.size); + TEST_CHECK(ret == 0); + + /* Should succeed - only schema fields extracted */ + ret = flb_msgpack_raw_to_parquet_file_streaming(msgpack_file, schema, + FLB_AWS_COMPRESS_NONE, + parquet_file, &parquet_size, 0); + TEST_CHECK(ret == 0); + + /* Validate schema fields present */ + ret = validate_parquet_file(parquet_file, 5, NULL, NULL, NULL, 0); + TEST_CHECK(ret == 0); + ret = validate_parquet_file(parquet_file, -1, "log_level", "int32", "0", 0); + TEST_CHECK(ret == 0); + ret = validate_parquet_file(parquet_file, -1, "log_level", "int32", "1", 1); + TEST_CHECK(ret == 0); + + msgpack_sbuffer_destroy(&sbuf); + unlink(msgpack_file); + unlink(parquet_file); +} + +/* Batch boundary tests - tests 65536 boundary and multiple batches */ +static void test_batch_boundaries(void) +{ + const int test_counts[] = {65535, 65536, 65537, 131072}; + const char *names[] = {"below", "exact", "above", "double"}; + const char *schema = "{\"fields\":[{\"name\":\"id\",\"type\":\"int32\"}]}"; + int t, i, ret; + + for (t = 0; t < 4; t++) { + msgpack_sbuffer sbuf; + msgpack_packer packer; + char msgpack_file[256], parquet_file[256]; + size_t parquet_size = 0; + const int record_count = test_counts[t]; + + snprintf(msgpack_file, sizeof(msgpack_file), + "/tmp/flb_test_batch_%s.msgpack", names[t]); + snprintf(parquet_file, sizeof(parquet_file), + "/tmp/flb_test_batch_%s.parquet", names[t]); + + msgpack_sbuffer_init(&sbuf); + msgpack_packer_init(&packer, &sbuf, msgpack_sbuffer_write); + + for (i = 0; i < record_count; i++) { + pack_fluent_bit_record(&packer, 1609459200 + i); + msgpack_pack_map(&packer, 1); + msgpack_pack_str(&packer, 2); + msgpack_pack_str_body(&packer, "id", 2); + msgpack_pack_int(&packer, i); + } + + ret = write_msgpack_to_chunk_file(msgpack_file, sbuf.data, sbuf.size); + TEST_CHECK(ret == 0); + + ret = flb_msgpack_raw_to_parquet_file_streaming(msgpack_file, schema, + FLB_AWS_COMPRESS_NONE, + parquet_file, &parquet_size, 0); + TEST_CHECK(ret == 0); + TEST_CHECK(check_parquet_file(parquet_file, NULL) == 0); + + /* Validate count */ + ret = validate_parquet_file(parquet_file, record_count, NULL, NULL, NULL, 0); + TEST_CHECK(ret == 0); + + /* Validate boundary records */ + ret = validate_parquet_file(parquet_file, -1, "id", "int32", "0", 0); + TEST_CHECK(ret == 0); + + if (record_count > 65535) { + ret = validate_parquet_file(parquet_file, -1, "id", "int32", "65535", 65535); + TEST_CHECK(ret == 0); + } + + if (record_count > 65536) { + ret = validate_parquet_file(parquet_file, -1, "id", "int32", "65536", 65536); + TEST_CHECK(ret == 0); + } + + char last_val[20]; + snprintf(last_val, sizeof(last_val), "%d", record_count - 1); + ret = validate_parquet_file(parquet_file, -1, "id", "int32", last_val, record_count - 1); + TEST_CHECK(ret == 0); + + msgpack_sbuffer_destroy(&sbuf); + unlink(msgpack_file); + unlink(parquet_file); + } +} + +/* Special floating point values - NaN, +Infinity, -Infinity */ +static void test_special_float_values(void) +{ + msgpack_sbuffer sbuf; + msgpack_packer packer; + const char *msgpack_file = "/tmp/flb_test_special_floats.msgpack"; + const char *parquet_file = "/tmp/flb_test_special_floats.parquet"; + size_t parquet_size = 0; + int ret; + const char *schema = "{\"fields\":[" + "{\"name\":\"float_nan\",\"type\":\"float\"}," + "{\"name\":\"float_inf\",\"type\":\"float\"}," + "{\"name\":\"float_neg_inf\",\"type\":\"float\"}," + 
"{\"name\":\"double_nan\",\"type\":\"double\"}," + "{\"name\":\"double_inf\",\"type\":\"double\"}," + "{\"name\":\"double_neg_inf\",\"type\":\"double\"}" + "]}"; + + const double nan_val = 0.0 / 0.0; + const double inf_val = 1.0 / 0.0; + const double neg_inf_val = -1.0 / 0.0; + + msgpack_sbuffer_init(&sbuf); + msgpack_packer_init(&packer, &sbuf, msgpack_sbuffer_write); + + pack_fluent_bit_record(&packer, 1609459200); + msgpack_pack_map(&packer, 6); + + msgpack_pack_str(&packer, 9); + msgpack_pack_str_body(&packer, "float_nan", 9); + msgpack_pack_float(&packer, (float)nan_val); + + msgpack_pack_str(&packer, 9); + msgpack_pack_str_body(&packer, "float_inf", 9); + msgpack_pack_float(&packer, (float)inf_val); + + msgpack_pack_str(&packer, 13); + msgpack_pack_str_body(&packer, "float_neg_inf", 13); + msgpack_pack_float(&packer, (float)neg_inf_val); + + msgpack_pack_str(&packer, 10); + msgpack_pack_str_body(&packer, "double_nan", 10); + msgpack_pack_double(&packer, nan_val); + + msgpack_pack_str(&packer, 10); + msgpack_pack_str_body(&packer, "double_inf", 10); + msgpack_pack_double(&packer, inf_val); + + msgpack_pack_str(&packer, 14); + msgpack_pack_str_body(&packer, "double_neg_inf", 14); + msgpack_pack_double(&packer, neg_inf_val); + + ret = write_msgpack_to_chunk_file(msgpack_file, sbuf.data, sbuf.size); + TEST_CHECK(ret == 0); + + /* Should succeed - NaN and Infinity are valid IEEE 754 */ + ret = flb_msgpack_raw_to_parquet_file_streaming(msgpack_file, schema, + FLB_AWS_COMPRESS_NONE, + parquet_file, &parquet_size, 0); + TEST_CHECK(ret == 0); + TEST_CHECK(check_parquet_file(parquet_file, NULL) == 0); + + /* Validate all special values are present (type check only) */ + ret = validate_parquet_file(parquet_file, 1, "float_nan", "float", NULL, 0); + TEST_CHECK(ret == 0); + ret = validate_parquet_file(parquet_file, -1, "float_inf", "float", NULL, 0); + TEST_CHECK(ret == 0); + ret = validate_parquet_file(parquet_file, -1, "double_nan", "double", NULL, 0); + TEST_CHECK(ret == 0); + + msgpack_sbuffer_destroy(&sbuf); + unlink(msgpack_file); + unlink(parquet_file); +} + +/* Negative timestamp */ +static void test_timestamp_negative(void) +{ + test_context ctx; + + init_test_context(&ctx, "ts_negative"); + ctx.schema = "{\"fields\":[" + "{\"name\":\"ts_negative\",\"type\":{\"name\":\"timestamp\",\"unit\":\"s\"}}" + "]}"; + + pack_fluent_bit_record(&ctx.packer, 1609459200); + msgpack_pack_map(&ctx.packer, 1); + msgpack_pack_str(&ctx.packer, 11); + msgpack_pack_str_body(&ctx.packer, "ts_negative", 11); + msgpack_pack_int64(&ctx.packer, -1609459200LL); + + TEST_CHECK(run_conversion(&ctx) == 0); + + cleanup_test_context(&ctx); +} + +/* Zero timestamp (Unix epoch) */ +static void test_timestamp_zero(void) +{ + test_context ctx; + + init_test_context(&ctx, "ts_zero"); + ctx.schema = "{\"fields\":[" + "{\"name\":\"ts_zero\",\"type\":{\"name\":\"timestamp\",\"unit\":\"ms\"}}" + "]}"; + + pack_fluent_bit_record(&ctx.packer, 1609459200); + msgpack_pack_map(&ctx.packer, 1); + msgpack_pack_str(&ctx.packer, 7); + msgpack_pack_str_body(&ctx.packer, "ts_zero", 7); + msgpack_pack_int64(&ctx.packer, 0LL); + + TEST_CHECK(run_conversion(&ctx) == 0); + + cleanup_test_context(&ctx); +} + +/* Maximum int64 timestamp */ +static void test_timestamp_max_int64(void) +{ + test_context ctx; + + init_test_context(&ctx, "ts_max"); + ctx.schema = "{\"fields\":[" + "{\"name\":\"ts_max\",\"type\":{\"name\":\"timestamp\",\"unit\":\"ns\"}}" + "]}"; + + pack_fluent_bit_record(&ctx.packer, 1609459200); + msgpack_pack_map(&ctx.packer, 1); 
+ msgpack_pack_str(&ctx.packer, 6); + msgpack_pack_str_body(&ctx.packer, "ts_max", 6); + msgpack_pack_int64(&ctx.packer, 9223372036854775807LL); + + TEST_CHECK(run_conversion(&ctx) == 0); + + cleanup_test_context(&ctx); +} + +TEST_LIST = { + /* ========================================================================= + * CATEGORY 1: FUNCTIONAL TESTS (9 tests) + * Basic functionality and type conversion tests + * ========================================================================= */ + {"basic_conversion", test_basic_conversion}, + {"multiple_records", test_multiple_records}, + {"large_record_count", test_large_record_count}, + {"bool_conversions", test_bool_conversions}, + {"integer_conversions", test_integer_conversions}, + {"float_conversions", test_float_conversions}, + {"string_conversions", test_string_conversions}, + {"binary_type", test_binary_type}, + {"timestamp_type", test_timestamp_type}, + + /* ========================================================================= + * CATEGORY 2: DATA QUALITY TESTS (12 tests) + * NULL handling, schema mismatches, and data integrity + * ========================================================================= */ + {"nullable_fields", test_nullable_fields}, + {"default_values", test_default_values}, + {"edge_schema_more_fields", test_edge_schema_more_fields}, + {"edge_schema_less_fields", test_edge_schema_less_fields}, + {"edge_field_name_mismatch", test_edge_field_name_mismatch}, + {"edge_all_fields_missing", test_edge_all_fields_missing}, + {"edge_mixed_present_missing", test_edge_mixed_present_missing}, + {"boundary_many_schema_fields", test_boundary_many_schema_fields}, + {"boundary_many_data_fields", test_boundary_many_data_fields}, + {"boundary_single_field_many_records", test_boundary_single_field_many_records}, + {"boundary_empty_data", test_boundary_empty_data}, + {"boundary_empty_map", test_boundary_empty_map}, + + /* ========================================================================= + * CATEGORY 3: SCALE & BOUNDARY TESTS (9 tests - OPTIMIZED) + * Batch processing, extreme values, and boundary conditions + * ========================================================================= */ + {"batch_boundaries", test_batch_boundaries}, /* Tests 65535, 65536, 65537, 131072 */ + {"boundary_extreme_integers", test_boundary_extreme_integers}, + {"boundary_special_floats", test_boundary_special_floats}, + {"boundary_zero_values", test_boundary_zero_values}, + {"boundary_long_string", test_boundary_long_string}, + {"special_float_values", test_special_float_values}, /* Tests NaN, +Inf, -Inf */ + {"timestamp_negative", test_timestamp_negative}, + {"timestamp_zero", test_timestamp_zero}, + {"timestamp_max_int64", test_timestamp_max_int64}, + + /* ========================================================================= + * CATEGORY 4: COMPRESSION TESTS (1 test - OPTIMIZED) + * ========================================================================= */ + {"all_compression_types", test_all_compression_types}, /* Tests NONE, GZIP, Snappy, ZSTD */ + + /* ========================================================================= + * CATEGORY 5: NEGATIVE TESTS (10 tests) + * Destructive tests and error handling + * ========================================================================= */ + {"destructive_truncated_data", test_destructive_truncated_data}, + {"destructive_invalid_schema_json", test_destructive_invalid_schema_json}, + {"destructive_empty_schema", test_destructive_empty_schema}, + {"destructive_unsupported_type", 
test_destructive_unsupported_type}, + {"destructive_invalid_compression", test_destructive_invalid_compression}, + {"destructive_unparseable_conversion", test_destructive_unparseable_conversion}, + {"error_null_input", test_error_null_input}, + {"error_null_schema", test_error_null_schema}, + {"error_missing_file", test_error_missing_file}, + {"error_invalid_format", test_error_invalid_format}, + + /* ========================================================================= + * CATEGORY 6: REAL-WORLD SCENARIOS (3 tests) + * Production-like scenarios and integration patterns + * ========================================================================= */ + {"realworld_schema_evolution", test_realworld_schema_evolution}, + {"realworld_partial_record", test_realworld_partial_record}, + {"realworld_extra_data_fields", test_realworld_extra_data_fields}, + + {NULL, NULL} +}; diff --git a/tests/runtime/CMakeLists.txt b/tests/runtime/CMakeLists.txt index dd76c16faee..3c5915ac6bf 100644 --- a/tests/runtime/CMakeLists.txt +++ b/tests/runtime/CMakeLists.txt @@ -235,7 +235,14 @@ if(FLB_IN_LIB) if(NOT FLB_SYSTEM_WINDOWS) FLB_RT_TEST(FLB_OUT_FILE "out_file.c") endif() - FLB_RT_TEST(FLB_OUT_S3 "out_s3.c") + FLB_RT_TEST(FLB_OUT_S3 "out_s3_config.c") + FLB_RT_TEST(FLB_OUT_S3 "out_s3_multipart.c") + FLB_RT_TEST(FLB_OUT_S3 "out_s3_format.c") + FLB_RT_TEST(FLB_OUT_S3 "out_s3_error_handling.c") + FLB_RT_TEST(FLB_OUT_S3 "out_s3_edge_cases.c") + FLB_RT_TEST(FLB_OUT_S3 "out_s3_special_features.c") + FLB_RT_TEST(FLB_OUT_S3 "out_s3_queue.c") + FLB_RT_TEST(FLB_OUT_S3 "out_s3_recovery.c") FLB_RT_TEST(FLB_OUT_TD "out_td.c") FLB_RT_TEST(FLB_OUT_INFLUXDB "out_influxdb.c") FLB_RT_TEST(FLB_OUT_CHRONICLE "out_chronicle.c") diff --git a/tests/runtime/out_s3.c b/tests/runtime/out_s3.c index 5968ff12a50..c3f37f91a44 100644 --- a/tests/runtime/out_s3.c +++ b/tests/runtime/out_s3.c @@ -46,6 +46,7 @@ void flb_test_s3_multipart_success(void) sleep(2); flb_stop(ctx); flb_destroy(ctx); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); } void flb_test_s3_putobject_success(void) @@ -82,6 +83,7 @@ void flb_test_s3_putobject_success(void) sleep(2); flb_stop(ctx); flb_destroy(ctx); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); } void flb_test_s3_putobject_error(void) @@ -120,6 +122,7 @@ void flb_test_s3_putobject_error(void) flb_stop(ctx); flb_destroy(ctx); unsetenv("TEST_PUT_OBJECT_ERROR"); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); } @@ -156,6 +159,7 @@ void flb_test_s3_create_upload_error(void) flb_stop(ctx); flb_destroy(ctx); unsetenv("TEST_CREATE_MULTIPART_UPLOAD_ERROR"); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); } void flb_test_s3_upload_part_error(void) @@ -191,6 +195,7 @@ void flb_test_s3_upload_part_error(void) flb_stop(ctx); flb_destroy(ctx); unsetenv("TEST_UPLOAD_PART_ERROR"); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); } void flb_test_s3_complete_upload_error(void) @@ -226,6 +231,7 @@ void flb_test_s3_complete_upload_error(void) flb_stop(ctx); flb_destroy(ctx); unsetenv("TEST_COMPLETE_MULTIPART_UPLOAD_ERROR"); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); } diff --git a/tests/runtime/out_s3_config.c b/tests/runtime/out_s3_config.c new file mode 100644 index 00000000000..655cf2c38ef --- /dev/null +++ b/tests/runtime/out_s3_config.c @@ -0,0 +1,809 @@ +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ + +/* Fluent Bit + * ========== + * Copyright (C) 2015-2024 The Fluent Bit Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include "flb_tests_runtime.h" + +/* Test data */ +#include "data/td/json_td.h" /* JSON_TD */ + +/* Test: s3_key_format with TAG expansion */ +void flb_test_s3_key_format_tag_expansion(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + + /* mocks calls - signals that we are in test mode */ + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "app.production.service1", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "s3_key_format", "/logs/$TAG/%Y/%m/%d", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1s", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(2); + + flb_stop(ctx); + flb_destroy(ctx); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} + +/* Test: s3_key_format with TAG[n] parts */ +void flb_test_s3_key_format_tag_parts(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "app.production.service1", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "s3_key_format", "/logs/$TAG[0]/$TAG[1]/%H%M%S", NULL); + flb_output_set(ctx, out_ffd, "s3_key_format_tag_delimiters", ".", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1s", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(2); + + flb_stop(ctx); + flb_destroy(ctx); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} + +/* Test: s3_key_format with $INDEX sequence */ +void flb_test_s3_key_format_index_sequence(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, 
"bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "s3_key_format", "/logs/$TAG-$INDEX", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1s", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + /* Push data multiple times to trigger index increment */ + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(2); + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(2); + + flb_stop(ctx); + flb_destroy(ctx); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} + +/* Test: s3_key_format with $UUID generation */ +void flb_test_s3_key_format_uuid_generation(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "s3_key_format", "/logs/$TAG-$UUID", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1s", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(2); + + flb_stop(ctx); + flb_destroy(ctx); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} + +/* Test: s3_key_format with time formatters */ +void flb_test_s3_key_format_time_formatters(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "s3_key_format", "/logs/%Y/%m/%d/%H/%M/%S", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1s", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(2); + + flb_stop(ctx); + flb_destroy(ctx); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} + +/* Test: s3_key_format with mixed variables */ +void flb_test_s3_key_format_mixed_variables(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "app.production.service1", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "s3_key_format", 
"/$TAG[0]/%Y%m%d-$INDEX-$UUID", NULL); + flb_output_set(ctx, out_ffd, "s3_key_format_tag_delimiters", ".", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1s", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(2); + + flb_stop(ctx); + flb_destroy(ctx); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} + +/* Test: endpoint HTTP vs HTTPS */ +void flb_test_endpoint_http_vs_https(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + + /* Test HTTP endpoint */ + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "endpoint", "http://s3.example.com", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + flb_stop(ctx); + flb_destroy(ctx); + + /* Test HTTPS endpoint */ + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "endpoint", "https://s3.example.com", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + flb_stop(ctx); + flb_destroy(ctx); + + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} + +/* Test: endpoint with custom port */ +void flb_test_endpoint_custom_port(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + + /* Test with port 9000 (MinIO default) */ + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "endpoint", "http://localhost:9000", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + flb_stop(ctx); + flb_destroy(ctx); + + /* Test with custom HTTPS port */ + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, 
"bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "endpoint", "https://s3.example.com:8443", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + flb_stop(ctx); + flb_destroy(ctx); + + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} + +/* Test: storage_class variations */ +void flb_test_storage_class_variations(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + const char *classes[] = { + "STANDARD", + "STANDARD_IA", + "GLACIER", + "INTELLIGENT_TIERING" + }; + int i; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + + for (i = 0; i < 4; i++) { + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "storage_class", classes[i], NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1s", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(2); + + flb_stop(ctx); + flb_destroy(ctx); + } + + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} + +/* Test: canned_acl options */ +void flb_test_canned_acl_options(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + const char *acls[] = { + "private", + "public-read", + "bucket-owner-full-control" + }; + int i; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + + for (i = 0; i < 3; i++) { + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "canned_acl", acls[i], NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1s", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(2); + + flb_stop(ctx); + flb_destroy(ctx); + } + + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} + +/* Test: content_type setting */ +void flb_test_content_type_setting(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + + /* Test with application/json */ + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "content_type", "application/json", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL); + 
flb_output_set(ctx, out_ffd, "upload_timeout", "1s", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(2); + + flb_stop(ctx); + flb_destroy(ctx); + + /* Test with text/plain */ + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "content_type", "text/plain", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1s", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(2); + + flb_stop(ctx); + flb_destroy(ctx); + + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} + +/* Test: send_content_md5 flag */ +void flb_test_send_content_md5_flag(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + + /* Test with MD5 enabled */ + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "send_content_md5", "true", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1s", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(2); + + flb_stop(ctx); + flb_destroy(ctx); + + /* Test with MD5 disabled */ + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "send_content_md5", "false", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1s", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(2); + + flb_stop(ctx); + flb_destroy(ctx); + + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} + +/* Test: store_dir_limit_size enforcement */ +void flb_test_store_dir_limit_enforcement(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = 
flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "store_dir_limit_size", "10M", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1s", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(2); + + flb_stop(ctx); + flb_destroy(ctx); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} + +/* Test: chunk_size auto adjustment */ +void flb_test_chunk_size_auto_adjustment(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + + /* Test: upload_chunk_size > total_file_size should auto-adjust */ + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "5M", NULL); + flb_output_set(ctx, out_ffd, "upload_chunk_size", "10M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1s", NULL); + + /* Should auto-adjust upload_chunk_size to total_file_size */ + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + flb_stop(ctx); + flb_destroy(ctx); + + /* Test: very large total_file_size requiring chunk adjustment */ + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "1000G", NULL); + flb_output_set(ctx, out_ffd, "upload_chunk_size", "5M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1s", NULL); + + /* Should auto-adjust to prevent exceeding 10000 parts limit */ + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + flb_stop(ctx); + flb_destroy(ctx); + + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} + +/* Test: invalid parameter combinations */ +void flb_test_invalid_parameter_combinations(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + + /* Test: invalid compression type */ + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "compression", "invalid", NULL); + + /* Should fail initialization */ + ret = flb_start(ctx); + TEST_CHECK(ret == -1); + + flb_destroy(ctx); 
+ unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); + + /* Test: invalid format type */ + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "format", "unknown", NULL); + + /* Should fail initialization */ + ret = flb_start(ctx); + TEST_CHECK(ret == -1); + + flb_destroy(ctx); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); + +#ifdef FLB_HAVE_PARQUET_ENCODER + /* Test: format=parquet without schema_str */ + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "format", "parquet", NULL); + /* Missing schema_str */ + + /* Should fail initialization */ + ret = flb_start(ctx); + TEST_CHECK(ret == -1); + + flb_destroy(ctx); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +#endif + + /* Test: total_file_size > 5TB (AWS max) */ + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "6TB", NULL); + + /* Should fail initialization - exceeds AWS S3 limit */ + ret = flb_start(ctx); + TEST_CHECK(ret == -1); + + flb_destroy(ctx); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} + +/* Test list */ +TEST_LIST = { + {"s3_key_format_tag_expansion", flb_test_s3_key_format_tag_expansion}, + {"s3_key_format_tag_parts", flb_test_s3_key_format_tag_parts}, + {"s3_key_format_index_sequence", flb_test_s3_key_format_index_sequence}, + {"s3_key_format_uuid_generation", flb_test_s3_key_format_uuid_generation}, + {"s3_key_format_time_formatters", flb_test_s3_key_format_time_formatters}, + {"s3_key_format_mixed_variables", flb_test_s3_key_format_mixed_variables}, + {"endpoint_http_vs_https", flb_test_endpoint_http_vs_https}, + {"endpoint_custom_port", flb_test_endpoint_custom_port}, + {"storage_class_variations", flb_test_storage_class_variations}, + {"canned_acl_options", flb_test_canned_acl_options}, + {"content_type_setting", flb_test_content_type_setting}, + {"send_content_md5_flag", flb_test_send_content_md5_flag}, + {"store_dir_limit_enforcement", flb_test_store_dir_limit_enforcement}, + {"chunk_size_auto_adjustment", flb_test_chunk_size_auto_adjustment}, + {"invalid_parameter_combinations", flb_test_invalid_parameter_combinations}, + {NULL, NULL} +}; diff --git a/tests/runtime/out_s3_edge_cases.c b/tests/runtime/out_s3_edge_cases.c new file mode 100644 index 00000000000..17a307a75ea --- /dev/null +++ 
b/tests/runtime/out_s3_edge_cases.c @@ -0,0 +1,578 @@ +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ + +/* Fluent Bit + * ========== + * Copyright (C) 2015-2024 The Fluent Bit Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include "flb_tests_runtime.h" + +/* Test data */ +#include "data/td/json_td.h" /* JSON_TD */ + +#define TEST_STORE_DIR "/tmp/flb-s3-test-edge" + +/* Test: Empty data upload */ +void flb_test_empty_data_upload(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + + /* mocks calls - signals that we are in test mode */ + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1m", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + /* Do not push any data - test empty flush */ + sleep(2); + + flb_stop(ctx); + flb_destroy(ctx); + + /* Verify no crash occurred - the fact we reached here is the test */ + TEST_CHECK(ret == 0); + + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} + +/* Test: Large file chunking */ +void flb_test_large_file_chunking(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + int i; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "100K", NULL); + flb_output_set(ctx, out_ffd, "upload_chunk_size", "5M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1m", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + /* Push multiple chunks to trigger file rotation */ + for (i = 0; i < 50; i++) { + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + } + sleep(3); + + flb_stop(ctx); + flb_destroy(ctx); + + /* Verify rotated or queued behavior occurred */ + TEST_CHECK(ret == 0); + + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} + +/* Test: Maximum concurrent uploads */ +void flb_test_max_concurrent_uploads(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + + ctx = flb_create(); + 
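/*
 * [Editorial sketch -- not part of this patch]
 * flb_test_large_file_chunking above pushes JSON_TD 50 times and relies on
 * that comfortably exceeding the 100K total_file_size. Deriving the number
 * of pushes from the payload size keeps the rotation trigger intact even if
 * the sample record ever changes; a hypothetical helper (assumes this
 * file's includes):
 */
#include <stddef.h>

static void editorial_push_until(flb_ctx_t *ctx, int in_ffd, size_t target_bytes)
{
    size_t sent = 0;

    while (sent < target_bytes) {
        flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1);
        sent += sizeof(JSON_TD) - 1;
    }
}

/* usage (hypothetical): editorial_push_until(ctx, in_ffd, 100 * 1024); */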
flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "workers", "10", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1m", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(2); + + flb_stop(ctx); + flb_destroy(ctx); + + /* Verify upload was queued */ + TEST_CHECK(ret == 0); + + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} + +/* Test: Minimal timeout settings */ +void flb_test_minimal_timeout(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1s", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(2); + + flb_stop(ctx); + flb_destroy(ctx); + + /* Verify upload completed with minimal timeout */ + TEST_CHECK(ret == 0); + + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} + +/* Test: Timeout trigger priority over file size */ +void flb_test_timeout_trigger_priority(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + + /* + * Test that upload_timeout can trigger upload before total_file_size is reached. + * With total_file_size=100M and upload_timeout=1s, the timeout should trigger first + * since we only push a small amount of data. This verifies that the timeout mechanism + * works independently and has priority over file size conditions. 
+ */ + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1s", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "100M", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + /* Push small amount of data - timeout should trigger before reaching 100M */ + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(2); + + flb_stop(ctx); + flb_destroy(ctx); + + /* Verify timeout triggered upload */ + TEST_CHECK(ret == 0); + + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} + +/* Test: Invalid S3 key format characters */ +void flb_test_invalid_s3_key_chars(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test/special", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "s3_key_format", "/logs/$TAG[0]/$TAG[1]/%Y/%m/%d/data.log", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1m", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(2); + + flb_stop(ctx); + flb_destroy(ctx); + + /* Verify S3 key format handled special characters */ + TEST_CHECK(ret == 0); + + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} + +/* Test: Duplicate tag/index combinations */ +void flb_test_duplicate_tag_index(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test.app.logs", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "s3_key_format", "/$TAG[0]/$TAG[1]/$TAG[2]/%Y%m%d.log", NULL); + flb_output_set(ctx, out_ffd, "s3_key_format_tag_delimiters", ".", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1m", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + /* Push same data multiple times */ + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(2); + + flb_stop(ctx); + flb_destroy(ctx); + + /* Verify duplicate tags handled correctly */ + TEST_CHECK(ret == 0); + + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} + +/* 
Test: Storage path permissions */ +void flb_test_storage_path_permissions(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "store_dir", TEST_STORE_DIR, NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1m", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(2); + + flb_stop(ctx); + flb_destroy(ctx); + + /* Verify storage path was created and used */ + TEST_CHECK(ret == 0); + + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} + +/* Test: Configuration boundary values */ +void flb_test_config_boundary_values(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + + /* Test minimum values */ + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL); + flb_output_set(ctx, out_ffd, "upload_chunk_size", "5M", NULL); + flb_output_set(ctx, out_ffd, "workers", "1", NULL); + flb_output_set(ctx, out_ffd, "retry_limit", "1", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1s", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(2); + + flb_stop(ctx); + flb_destroy(ctx); + + /* Verify minimum config values accepted */ + TEST_CHECK(ret == 0); + + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); + + /* Test maximum values */ + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "100M", NULL); + flb_output_set(ctx, out_ffd, "upload_chunk_size", "50M", NULL); + flb_output_set(ctx, out_ffd, "workers", "100", NULL); + flb_output_set(ctx, out_ffd, "retry_limit", "10", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "10m", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(2); + + flb_stop(ctx); + flb_destroy(ctx); + + /* Verify maximum config values accepted */ + TEST_CHECK(ret == 0); + + 
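/*
 * [Editorial sketch -- not part of this patch]
 * flb_test_storage_path_permissions above only asserts that startup
 * succeeded; its "Verify storage path was created and used" comment could be
 * backed by an explicit POSIX check along these lines (this assumes the
 * plugin creates store_dir during initialization):
 */
#include <sys/stat.h>

static int editorial_dir_exists(const char *path)
{
    struct stat st;

    if (stat(path, &st) != 0) {
        return 0;                      /* missing or not accessible */
    }
    return S_ISDIR(st.st_mode) ? 1 : 0;
}

/* usage (hypothetical): TEST_CHECK(editorial_dir_exists(TEST_STORE_DIR) == 1); */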
unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} + +/* Test: Compression boundary */ +void flb_test_compression_boundary(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "compression", "gzip", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "1K", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1m", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + /* Push very small data to test compression edge case */ + flb_lib_push(ctx, in_ffd, (char *) "{\"msg\":\"x\"}", 12); + sleep(2); + + flb_stop(ctx); + flb_destroy(ctx); + + /* Verify compression handled small data */ + TEST_CHECK(ret == 0); + + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} + +/* Test: Rapid configuration changes */ +void flb_test_rapid_config_changes(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + + /* First configuration */ + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket-1", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1m", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(1); + + flb_stop(ctx); + flb_destroy(ctx); + + /* Verify first configuration worked */ + TEST_CHECK(ret == 0); + + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); + + /* Second configuration - different bucket */ + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket-2", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1m", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(1); + + flb_stop(ctx); + flb_destroy(ctx); + + /* Verify second configuration worked */ + TEST_CHECK(ret == 0); + + /* mocks calls - signals that we are in test mode */ + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} + +/* Test list */ +TEST_LIST = { + {"empty_data_upload", flb_test_empty_data_upload}, + {"large_file_chunking", flb_test_large_file_chunking}, + {"max_concurrent_uploads", 
flb_test_max_concurrent_uploads},
+    {"minimal_timeout", flb_test_minimal_timeout},
+    {"timeout_trigger_priority", flb_test_timeout_trigger_priority},
+    {"invalid_s3_key_chars", flb_test_invalid_s3_key_chars},
+    {"duplicate_tag_index", flb_test_duplicate_tag_index},
+    {"storage_path_permissions", flb_test_storage_path_permissions},
+    {"config_boundary_values", flb_test_config_boundary_values},
+    {"compression_boundary", flb_test_compression_boundary},
+    {"rapid_config_changes", flb_test_rapid_config_changes},
+    {NULL, NULL}
+};
diff --git a/tests/runtime/out_s3_error_handling.c b/tests/runtime/out_s3_error_handling.c
new file mode 100644
index 00000000000..2e10e32ed97
--- /dev/null
+++ b/tests/runtime/out_s3_error_handling.c
@@ -0,0 +1,556 @@
+/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+
+/* Fluent Bit
+ * ==========
+ * Copyright (C) 2015-2024 The Fluent Bit Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <fluent-bit.h>
+#include "flb_tests_runtime.h"
+
+/* Test data */
+#include "data/td/json_td.h"    /* JSON_TD */
+
+/* AWS Error Response Templates */
+#define AWS_ERROR_ACCESS_DENIED "\
+<Error><Code>AccessDenied</Code><Message>Access Denied</Message></Error>"
+
+#define AWS_ERROR_NO_SUCH_BUCKET "\
+<Error><Code>NoSuchBucket</Code><Message>The specified bucket does not exist</Message></Error>"
+
+#define AWS_ERROR_NO_SUCH_UPLOAD "\
+<Error><Code>NoSuchUpload</Code><Message>The specified upload does not exist</Message></Error>"
+
+#define AWS_ERROR_INVALID_ACCESS_KEY "\
+<Error><Code>InvalidAccessKeyId</Code><Message>The AWS access key ID you provided does not exist</Message></Error>"
+
+#define AWS_ERROR_SIGNATURE_MISMATCH "\
+<Error><Code>SignatureDoesNotMatch</Code><Message>The request signature we calculated does not match</Message></Error>"
+
+#define AWS_ERROR_SLOW_DOWN "\
+<Error><Code>SlowDown</Code><Message>Please reduce your request rate</Message></Error>"
+
+/* Test: AccessDenied error handling */
+void flb_test_error_access_denied(void)
+{
+    int ret;
+    flb_ctx_t *ctx;
+    int in_ffd;
+    int out_ffd;
+
+    /* mocks calls - signals that we are in test mode */
+    setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1);
+    setenv("TEST_CREATE_MULTIPART_UPLOAD_ERROR", AWS_ERROR_ACCESS_DENIED, 1);
+
+    ctx = flb_create();
+    flb_service_set(ctx, "flush", "1", "grace", "1", NULL);
+
+    in_ffd = flb_input(ctx, (char *) "lib", NULL);
+    TEST_CHECK(in_ffd >= 0);
+    flb_input_set(ctx, in_ffd, "tag", "test", NULL);
+
+    out_ffd = flb_output(ctx, (char *) "s3", NULL);
+    TEST_CHECK(out_ffd >= 0);
+    flb_output_set(ctx, out_ffd, "match", "*", NULL);
+    flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL);
+    flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL);
+    flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL);
+    flb_output_set(ctx, out_ffd, "upload_timeout", "1s", NULL);
+    flb_output_set(ctx, out_ffd, "retry_limit", "1", NULL);
+
+    ret = flb_start(ctx);
+    TEST_CHECK(ret == 0);
+
+    flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1);
+    sleep(2);
+
+    flb_stop(ctx);
+    flb_destroy(ctx);
+    unsetenv("TEST_CREATE_MULTIPART_UPLOAD_ERROR");
+    unsetenv("FLB_S3_PLUGIN_UNDER_TEST");
+}
+
+/* Test: NoSuchBucket error handling */
+void flb_test_error_no_such_bucket(void)
+{
+    int ret;
+    flb_ctx_t *ctx;
+    int
in_ffd; + int out_ffd; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + setenv("TEST_CREATE_MULTIPART_UPLOAD_ERROR", AWS_ERROR_NO_SUCH_BUCKET, 1); + + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "nonexistent-bucket", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1s", NULL); + flb_output_set(ctx, out_ffd, "retry_limit", "1", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(2); + + flb_stop(ctx); + flb_destroy(ctx); + unsetenv("TEST_CREATE_MULTIPART_UPLOAD_ERROR"); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} + +/* Test: NoSuchUpload error handling */ +void flb_test_error_no_such_upload(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + setenv("TEST_COMPLETE_MULTIPART_UPLOAD_ERROR", AWS_ERROR_NO_SUCH_UPLOAD, 1); + + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1s", NULL); + flb_output_set(ctx, out_ffd, "retry_limit", "1", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(2); + + flb_stop(ctx); + flb_destroy(ctx); + unsetenv("TEST_COMPLETE_MULTIPART_UPLOAD_ERROR"); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} + +/* Test: InvalidAccessKeyId error handling */ +void flb_test_error_invalid_access_key(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + setenv("TEST_CREATE_MULTIPART_UPLOAD_ERROR", AWS_ERROR_INVALID_ACCESS_KEY, 1); + + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1s", NULL); + flb_output_set(ctx, out_ffd, "retry_limit", "2", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(2); + + flb_stop(ctx); + flb_destroy(ctx); + unsetenv("TEST_CREATE_MULTIPART_UPLOAD_ERROR"); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} + +/* Test: SignatureDoesNotMatch error handling */ +void 
flb_test_error_signature_mismatch(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + setenv("TEST_CREATE_MULTIPART_UPLOAD_ERROR", AWS_ERROR_SIGNATURE_MISMATCH, 1); + + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1s", NULL); + flb_output_set(ctx, out_ffd, "retry_limit", "2", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(2); + + flb_stop(ctx); + flb_destroy(ctx); + unsetenv("TEST_CREATE_MULTIPART_UPLOAD_ERROR"); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} + +/* Test: SlowDown throttling error handling */ +void flb_test_error_slow_down_throttling(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + setenv("TEST_UPLOAD_PART_ERROR", AWS_ERROR_SLOW_DOWN, 1); + + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1s", NULL); + flb_output_set(ctx, out_ffd, "retry_limit", "3", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(3); + + flb_stop(ctx); + flb_destroy(ctx); + unsetenv("TEST_UPLOAD_PART_ERROR"); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} + +/* Test: Network connection timeout */ +void flb_test_network_connection_timeout(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + setenv("TEST_NETWORK_TIMEOUT", "connection", 1); + + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1s", NULL); + flb_output_set(ctx, out_ffd, "auto_retry_requests", "true", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(2); + + flb_stop(ctx); + flb_destroy(ctx); + unsetenv("TEST_NETWORK_TIMEOUT"); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} + +/* Test: Network read timeout */ +void 
flb_test_network_read_timeout(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + setenv("TEST_NETWORK_TIMEOUT", "read", 1); + + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1s", NULL); + flb_output_set(ctx, out_ffd, "auto_retry_requests", "true", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(2); + + flb_stop(ctx); + flb_destroy(ctx); + unsetenv("TEST_NETWORK_TIMEOUT"); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} + +/* Test: auto_retry_requests layer */ +void flb_test_retry_layer_auto_retry(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + + /* Test with auto_retry enabled */ + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "auto_retry_requests", "true", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1s", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(2); + + flb_stop(ctx); + flb_destroy(ctx); + + /* Test with auto_retry disabled */ + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "auto_retry_requests", "false", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1s", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(2); + + flb_stop(ctx); + flb_destroy(ctx); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} + +/* Test: part_delivery_attempt_limit layer */ +void flb_test_retry_layer_part_delivery(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + setenv("TEST_UPLOAD_PART_ERROR", AWS_ERROR_SLOW_DOWN, 1); + + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, 
in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "part_delivery_attempt_limit", "5", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1s", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(3); + + flb_stop(ctx); + flb_destroy(ctx); + unsetenv("TEST_UPLOAD_PART_ERROR"); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} + +/* Test: file_delivery_attempt_limit layer */ +void flb_test_retry_layer_file_delivery(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + setenv("TEST_CREATE_MULTIPART_UPLOAD_ERROR", AWS_ERROR_SLOW_DOWN, 1); + + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "file_delivery_attempt_limit", "3", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1s", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(3); + + flb_stop(ctx); + flb_destroy(ctx); + unsetenv("TEST_CREATE_MULTIPART_UPLOAD_ERROR"); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} + +/* Test: Failure cleanup and abort */ +void flb_test_failure_cleanup_and_abort(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + /* Simulate permanent failure */ + setenv("TEST_UPLOAD_PART_ERROR", AWS_ERROR_ACCESS_DENIED, 1); + + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "retry_limit", "2", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1s", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(3); + + /* Verify cleanup occurred */ + flb_stop(ctx); + flb_destroy(ctx); + unsetenv("TEST_UPLOAD_PART_ERROR"); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} + +/* Test list */ +TEST_LIST = { + {"error_access_denied", flb_test_error_access_denied}, + {"error_no_such_bucket", flb_test_error_no_such_bucket}, + {"error_no_such_upload", flb_test_error_no_such_upload}, + {"error_invalid_access_key", flb_test_error_invalid_access_key}, + {"error_signature_mismatch", flb_test_error_signature_mismatch}, + 
{"error_slow_down_throttling", flb_test_error_slow_down_throttling}, + {"network_connection_timeout", flb_test_network_connection_timeout}, + {"network_read_timeout", flb_test_network_read_timeout}, + {"retry_layer_auto_retry", flb_test_retry_layer_auto_retry}, + {"retry_layer_part_delivery", flb_test_retry_layer_part_delivery}, + {"retry_layer_file_delivery", flb_test_retry_layer_file_delivery}, + {"failure_cleanup_and_abort", flb_test_failure_cleanup_and_abort}, + {NULL, NULL} +}; diff --git a/tests/runtime/out_s3_format.c b/tests/runtime/out_s3_format.c new file mode 100644 index 00000000000..113744ed668 --- /dev/null +++ b/tests/runtime/out_s3_format.c @@ -0,0 +1,547 @@ +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ + +/* Fluent Bit + * ========== + * Copyright (C) 2015-2024 The Fluent Bit Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include "flb_tests_runtime.h" + +/* Test data */ +#include "data/td/json_td.h" /* JSON_TD */ + +/* Test: Basic JSON format conversion */ +void flb_test_json_format_basic(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + + /* mocks calls - signals that we are in test mode */ + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "format", "json", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1s", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(2); + + flb_stop(ctx); + flb_destroy(ctx); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} + +/* Test: JSON with date key formatting */ +void flb_test_json_with_date_key(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "format", "json", NULL); + flb_output_set(ctx, out_ffd, "json_date_format", "iso8601", NULL); + flb_output_set(ctx, out_ffd, "json_date_key", "timestamp", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "1M", 
NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1s", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(2); + + flb_stop(ctx); + flb_destroy(ctx); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} + +/* Test: JSON streaming conversion for large files */ +void flb_test_json_streaming_conversion(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + int i; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "format", "json", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "10M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "2s", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + /* Push data multiple times to simulate streaming */ + for (i = 0; i < 5; i++) { + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + } + sleep(3); + + flb_stop(ctx); + flb_destroy(ctx); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} + +#ifdef FLB_HAVE_PARQUET_ENCODER +/* Test: Parquet schema validation */ +void flb_test_parquet_schema_validation(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + const char *valid_schema = "{\"fields\":[{\"name\":\"message\",\"type\":{\"name\":\"utf8\"}}]}"; + const char *invalid_schema = "{\"invalid_json"; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + + /* Test with valid schema */ + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "format", "parquet", NULL); + flb_output_set(ctx, out_ffd, "schema_str", valid_schema, NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1s", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(2); + + flb_stop(ctx); + flb_destroy(ctx); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); + + /* Test with invalid schema - should fail initialization */ + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "format", "parquet", NULL); + flb_output_set(ctx, out_ffd, "schema_str", invalid_schema, NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL); + + /* Should fail 
to start with invalid schema */ + ret = flb_start(ctx); + TEST_CHECK(ret == -1); + + flb_destroy(ctx); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} + +/* Test: Parquet empty data handling */ +void flb_test_parquet_empty_data_handling(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + const char *schema = "{\"fields\":[{\"name\":\"message\",\"type\":{\"name\":\"utf8\"}}]}"; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "format", "parquet", NULL); + flb_output_set(ctx, out_ffd, "schema_str", schema, NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1s", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + /* Don't push any data - test empty data handling */ + sleep(2); + + flb_stop(ctx); + flb_destroy(ctx); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} + +/* Test: Parquet schema mismatch handling */ +void flb_test_parquet_schema_mismatch(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + const char *schema = "{\"fields\":[{\"name\":\"nonexistent_field\",\"type\":{\"name\":\"utf8\"}}]}"; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "format", "parquet", NULL); + /* Schema doesn't match the actual data fields */ + flb_output_set(ctx, out_ffd, "schema_str", schema, NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1s", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + /* Push data with fields that don't match schema */ + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(2); + + flb_stop(ctx); + flb_destroy(ctx); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} +#endif /* FLB_HAVE_PARQUET_ENCODER */ + +/* Test: GZIP compression integration */ +void flb_test_compression_gzip_integration(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "format", "json", NULL); + flb_output_set(ctx, out_ffd, "compression", "gzip", NULL); + flb_output_set(ctx, 
out_ffd, "total_file_size", "1M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1s", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(2); + + flb_stop(ctx); + flb_destroy(ctx); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} + +/* Test: ZSTD compression integration */ +void flb_test_compression_zstd_integration(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "format", "json", NULL); + flb_output_set(ctx, out_ffd, "compression", "zstd", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1s", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(2); + + flb_stop(ctx); + flb_destroy(ctx); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} + +/* Test: log_key extraction */ +void flb_test_log_key_extraction(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "log_key", "log", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1s", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(2); + + flb_stop(ctx); + flb_destroy(ctx); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} + +/* Test: Various format and compression combinations */ +void flb_test_format_compression_combinations(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + + /* Test 1: JSON + GZIP */ + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "format", "json", NULL); + flb_output_set(ctx, out_ffd, "compression", "gzip", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1s", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + flb_lib_push(ctx, in_ffd, 
(char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(2); + + flb_stop(ctx); + flb_destroy(ctx); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); + + /* Test 2: JSON + ZSTD */ + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "format", "json", NULL); + flb_output_set(ctx, out_ffd, "compression", "zstd", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1s", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(2); + + flb_stop(ctx); + flb_destroy(ctx); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); + +#ifdef FLB_HAVE_PARQUET_ENCODER + /* Test 3: Parquet + GZIP (Parquet has internal compression) */ + const char *parquet_schema = "{\"fields\":[{\"name\":\"message\",\"type\":{\"name\":\"utf8\"}}]}"; + + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "format", "parquet", NULL); + flb_output_set(ctx, out_ffd, "schema_str", parquet_schema, NULL); + flb_output_set(ctx, out_ffd, "compression", "gzip", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1s", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(2); + + flb_stop(ctx); + flb_destroy(ctx); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +#endif + + /* Test 4: log_key + GZIP */ + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "log_key", "log", NULL); + flb_output_set(ctx, out_ffd, "compression", "gzip", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1s", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(2); + + flb_stop(ctx); + flb_destroy(ctx); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} + +/* Test list */ +TEST_LIST = { + {"json_format_basic", flb_test_json_format_basic}, + {"json_with_date_key", flb_test_json_with_date_key}, + {"json_streaming_conversion", flb_test_json_streaming_conversion}, +#ifdef FLB_HAVE_PARQUET_ENCODER + {"parquet_schema_validation", 
flb_test_parquet_schema_validation}, + {"parquet_empty_data_handling", flb_test_parquet_empty_data_handling}, + {"parquet_schema_mismatch", flb_test_parquet_schema_mismatch}, +#endif + {"compression_gzip_integration", flb_test_compression_gzip_integration}, + {"compression_zstd_integration", flb_test_compression_zstd_integration}, + {"log_key_extraction", flb_test_log_key_extraction}, + {"format_compression_combinations", flb_test_format_compression_combinations}, + {NULL, NULL} +}; diff --git a/tests/runtime/out_s3_multipart.c b/tests/runtime/out_s3_multipart.c new file mode 100644 index 00000000000..2e7c3639e68 --- /dev/null +++ b/tests/runtime/out_s3_multipart.c @@ -0,0 +1,571 @@ +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ + +/* Fluent Bit + * ========== + * Copyright (C) 2015-2024 The Fluent Bit Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include "flb_tests_runtime.h" + +/* Test data */ +#include "data/td/json_td.h" /* JSON_TD */ + +/* Test: Basic CreateMultipartUpload */ +void flb_test_create_multipart_basic(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + + /* mocks calls - signals that we are in test mode */ + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "10M", NULL); + flb_output_set(ctx, out_ffd, "upload_chunk_size", "5M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1s", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + /* Push data to trigger multipart upload creation */ + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(2); + + flb_stop(ctx); + flb_destroy(ctx); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} + +/* Test: CreateMultipartUpload with metadata (ACL, storage class, etc.) 
*/ +void flb_test_create_multipart_with_metadata(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "canned_acl", "bucket-owner-full-control", NULL); + flb_output_set(ctx, out_ffd, "storage_class", "STANDARD_IA", NULL); + flb_output_set(ctx, out_ffd, "content_type", "application/json", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "10M", NULL); + flb_output_set(ctx, out_ffd, "upload_chunk_size", "5M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1s", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(2); + + flb_stop(ctx); + flb_destroy(ctx); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} + +/* Test: Upload single part */ +void flb_test_upload_part_success(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "10M", NULL); + flb_output_set(ctx, out_ffd, "upload_chunk_size", "5M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1s", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + /* Push enough data to trigger part upload */ + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(2); + + flb_stop(ctx); + flb_destroy(ctx); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} + +/* Test: ETag collection from multiple parts */ +void flb_test_upload_part_etag_collection(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + int i; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "15M", NULL); + flb_output_set(ctx, out_ffd, "upload_chunk_size", "5M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "2s", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + /* Push data multiple times to create multiple parts */ + for (i = 0; i < 3; i++) { + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(1); + } + sleep(3); + + flb_stop(ctx); + flb_destroy(ctx); + 
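For background on the ETag-collection test above: S3's CompleteMultipartUpload API requires the part number and ETag returned by every UploadPart call to be echoed back in an XML body, which is why the test pushes several chunks so more than one part gets tracked. The snippet below is a self-contained sketch of that assembly in plain C; the struct and function names are illustrative and are not taken from the plugin.

#include <stdio.h>

/* Illustrative record of one uploaded part (not the plugin's data model) */
struct part_record {
    int part_number;     /* 1-based and ascending in the final XML */
    char etag[64];       /* ETag returned by the (mocked) UploadPart call */
};

/* Assemble a CompleteMultipartUpload request body from the collected parts */
static int build_complete_body(const struct part_record *parts, int count,
                               char *out, size_t size)
{
    size_t off = 0;
    int i;
    int n;

    n = snprintf(out, size, "<CompleteMultipartUpload>");
    if (n < 0 || (size_t) n >= size) {
        return -1;
    }
    off = (size_t) n;

    for (i = 0; i < count; i++) {
        n = snprintf(out + off, size - off,
                     "<Part><PartNumber>%d</PartNumber><ETag>%s</ETag></Part>",
                     parts[i].part_number, parts[i].etag);
        if (n < 0 || (size_t) n >= size - off) {
            return -1;
        }
        off += (size_t) n;
    }

    n = snprintf(out + off, size - off, "</CompleteMultipartUpload>");
    if (n < 0 || (size_t) n >= size - off) {
        return -1;
    }
    return 0;
}

int main(void)
{
    struct part_record parts[] = {
        {1, "\"etag-0001\""},
        {2, "\"etag-0002\""},
        {3, "\"etag-0003\""},
    };
    char body[1024];

    if (build_complete_body(parts, 3, body, sizeof(body)) == 0) {
        printf("%s\n", body);
    }
    return 0;
}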
unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} + +/* Test: Part number ordering verification */ +void flb_test_upload_part_order_verification(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "20M", NULL); + flb_output_set(ctx, out_ffd, "upload_chunk_size", "5M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "2s", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + /* Push data to trigger multiple parts */ + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(1); + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(1); + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(3); + + flb_stop(ctx); + flb_destroy(ctx); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} + +/* Test: CompleteMultipartUpload success */ +void flb_test_complete_multipart_success(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "10M", NULL); + flb_output_set(ctx, out_ffd, "upload_chunk_size", "5M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "2s", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + /* Push enough data to complete multipart upload */ + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(3); + + flb_stop(ctx); + flb_destroy(ctx); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} + +/* Test: CompleteMultipartUpload with retry */ +void flb_test_complete_multipart_retry(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + /* Mock: First CompleteMultipartUpload fails, second succeeds */ + setenv("TEST_COMPLETE_MULTIPART_RETRY", "1", 1); + + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "10M", NULL); + flb_output_set(ctx, out_ffd, "upload_chunk_size", "5M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "2s", NULL); + flb_output_set(ctx, out_ffd, "retry_limit", "3", NULL); + + ret = flb_start(ctx); 
+ TEST_CHECK(ret == 0); + + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(4); + + flb_stop(ctx); + flb_destroy(ctx); + unsetenv("TEST_COMPLETE_MULTIPART_RETRY"); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} + +/* Test: CompleteMultipartUpload final failure */ +void flb_test_complete_multipart_final_failure(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + /* Mock: CompleteMultipartUpload always fails */ + setenv("TEST_COMPLETE_MULTIPART_FAIL", "1", 1); + + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "10M", NULL); + flb_output_set(ctx, out_ffd, "upload_chunk_size", "5M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "2s", NULL); + flb_output_set(ctx, out_ffd, "retry_limit", "2", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(4); + + flb_stop(ctx); + flb_destroy(ctx); + unsetenv("TEST_COMPLETE_MULTIPART_FAIL"); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} + +/* Test: AbortMultipartUpload on error */ +void flb_test_abort_multipart_on_error(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + /* Mock: UploadPart consistently fails to trigger abort */ + setenv("TEST_UPLOAD_PART_FAIL", "1", 1); + + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "10M", NULL); + flb_output_set(ctx, out_ffd, "upload_chunk_size", "5M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1s", NULL); + flb_output_set(ctx, out_ffd, "retry_limit", "2", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(3); + + flb_stop(ctx); + flb_destroy(ctx); + unsetenv("TEST_UPLOAD_PART_FAIL"); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} + +/* Test: AbortMultipartUpload cleanup */ +void flb_test_abort_multipart_cleanup(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + setenv("TEST_UPLOAD_PART_FAIL", "1", 1); + + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, 
out_ffd, "total_file_size", "10M", NULL); + flb_output_set(ctx, out_ffd, "upload_chunk_size", "5M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1s", NULL); + flb_output_set(ctx, out_ffd, "retry_limit", "1", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(3); + + /* Verify cleanup - temporary files and database records should be removed */ + flb_stop(ctx); + flb_destroy(ctx); + unsetenv("TEST_UPLOAD_PART_FAIL"); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} + +/* Test: upload_id persistence and recovery */ +void flb_test_upload_id_persistence(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + + /* First session: Create multipart upload */ + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "blob_database_file", "/tmp/test_s3_multipart_persistence.db", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "10M", NULL); + flb_output_set(ctx, out_ffd, "upload_chunk_size", "5M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1s", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(2); + + flb_stop(ctx); + flb_destroy(ctx); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); + + /* Second session: Verify upload_id recovery from database */ + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "blob_database_file", "/tmp/test_s3_multipart_persistence.db", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "10M", NULL); + flb_output_set(ctx, out_ffd, "upload_chunk_size", "5M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1s", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + sleep(2); + + flb_stop(ctx); + flb_destroy(ctx); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} + +/* Test: Multipart part size calculation */ +void flb_test_multipart_part_size_calculation(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + + /* Test 1: Small file - part_size equals total_file_size */ + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", 
"50M", NULL); + flb_output_set(ctx, out_ffd, "upload_chunk_size", "10M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1s", NULL); + + /* Should calculate ~5 parts */ + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + flb_stop(ctx); + flb_destroy(ctx); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); + + /* Test 2: Large file requiring auto-adjustment to avoid >10000 parts */ + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "100G", NULL); + flb_output_set(ctx, out_ffd, "upload_chunk_size", "5M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1s", NULL); + + /* Should auto-adjust part_size to avoid exceeding 10000 parts */ + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + flb_stop(ctx); + flb_destroy(ctx); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} + +/* Test list */ +TEST_LIST = { + {"create_multipart_basic", flb_test_create_multipart_basic}, + {"create_multipart_with_metadata", flb_test_create_multipart_with_metadata}, + {"upload_part_success", flb_test_upload_part_success}, + {"upload_part_etag_collection", flb_test_upload_part_etag_collection}, + {"upload_part_order_verification", flb_test_upload_part_order_verification}, + {"complete_multipart_success", flb_test_complete_multipart_success}, + {"complete_multipart_retry", flb_test_complete_multipart_retry}, + {"complete_multipart_final_failure", flb_test_complete_multipart_final_failure}, + {"abort_multipart_on_error", flb_test_abort_multipart_on_error}, + {"abort_multipart_cleanup", flb_test_abort_multipart_cleanup}, + {"upload_id_persistence", flb_test_upload_id_persistence}, + {"multipart_part_size_calculation", flb_test_multipart_part_size_calculation}, + {NULL, NULL} +}; diff --git a/tests/runtime/out_s3_queue.c b/tests/runtime/out_s3_queue.c new file mode 100644 index 00000000000..aa2e6110910 --- /dev/null +++ b/tests/runtime/out_s3_queue.c @@ -0,0 +1,266 @@ +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ + +/* Fluent Bit + * ========== + * Copyright (C) 2015-2024 The Fluent Bit Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include "flb_tests_runtime.h" + +/* Test data */ +#include "data/td/json_td.h" /* JSON_TD */ + +/* Test: Basic queue operations (add/remove) */ +void flb_test_queue_basic_operations(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + + /* mocks calls - signals that we are in test mode */ + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1m", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + /* Push data to trigger queue operations */ + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(2); + + flb_stop(ctx); + flb_destroy(ctx); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} + +/* Test: Queue with multiple workers */ +void flb_test_queue_multiple_workers(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "workers", "5", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1m", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + /* Push multiple chunks to test worker distribution */ + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(2); + + flb_stop(ctx); + flb_destroy(ctx); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} + +/* Test: Queue entry retry mechanism */ +void flb_test_queue_retry_mechanism(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "retry_limit", "3", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1m", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(2); + + flb_stop(ctx); + flb_destroy(ctx); + 
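The retry_limit option exercised here bounds how many delivery attempts a failed entry gets before it is dropped. The following is a generic illustration of that bookkeeping, using a hypothetical queue_entry struct rather than the plugin's internal queue type.

#include <stdio.h>

/* Hypothetical queue entry with per-entry retry accounting */
struct queue_entry {
    int attempts;       /* delivery attempts made so far */
    int retry_limit;    /* maximum attempts before giving up */
};

/* Returns 1 if the entry may be retried, 0 if it should be discarded */
static int entry_can_retry(const struct queue_entry *e)
{
    return e->attempts < e->retry_limit;
}

int main(void)
{
    struct queue_entry e = {0, 3};   /* mirrors retry_limit "3" in the test */

    while (entry_can_retry(&e)) {
        e.attempts++;
        printf("attempt %d failed, %s\n", e.attempts,
               entry_can_retry(&e) ? "will retry" : "giving up");
    }
    return 0;
}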
unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} + +/* Test: Queue entry timeout handling */ +void flb_test_queue_timeout_handling(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "2s", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "100M", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + /* Push data and wait for timeout */ + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(3); + + flb_stop(ctx); + flb_destroy(ctx); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} + +/* Test: Queue with concurrent file uploads */ +void flb_test_queue_concurrent_uploads(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + int i; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "workers", "10", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "100K", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1m", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + /* Push multiple chunks to trigger concurrent uploads */ + for (i = 0; i < 20; i++) { + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + } + sleep(3); + + flb_stop(ctx); + flb_destroy(ctx); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} + +/* Test: Queue cleanup on shutdown */ +void flb_test_queue_cleanup_on_shutdown(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "2", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "10m", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + /* Push data but shutdown before upload */ + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(1); + + /* Graceful shutdown should handle pending queue entries */ + flb_stop(ctx); + flb_destroy(ctx); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} + +/* Test list */ +TEST_LIST = { + {"queue_basic_operations", flb_test_queue_basic_operations}, + {"queue_multiple_workers", flb_test_queue_multiple_workers}, + 
{"queue_retry_mechanism", flb_test_queue_retry_mechanism}, + {"queue_timeout_handling", flb_test_queue_timeout_handling}, + {"queue_concurrent_uploads", flb_test_queue_concurrent_uploads}, + {"queue_cleanup_on_shutdown", flb_test_queue_cleanup_on_shutdown}, + {NULL, NULL} +}; diff --git a/tests/runtime/out_s3_recovery.c b/tests/runtime/out_s3_recovery.c new file mode 100644 index 00000000000..8e2dd3a3d87 --- /dev/null +++ b/tests/runtime/out_s3_recovery.c @@ -0,0 +1,439 @@ +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ + +/* Fluent Bit + * ========== + * Copyright (C) 2015-2024 The Fluent Bit Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include "flb_tests_runtime.h" + +/* Test data */ +#include "data/td/json_td.h" /* JSON_TD */ + +#define TEST_STORE_DIR "/tmp/flb-s3-recovery-test" + +/* Test: Basic restart recovery with blob database */ +void flb_test_recovery_with_blob_db(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + char db_path[256]; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + + /* Create unique database path */ + snprintf(db_path, sizeof(db_path), "/tmp/s3_recovery_test_%d.db", (int)time(NULL)); + + /* First run: Initialize and buffer data */ + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "blob_database_file", db_path, NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "10m", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(1); + + /* Shutdown without completing upload */ + flb_stop(ctx); + flb_destroy(ctx); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); + + /* Second run: Recovery should pick up buffered data */ + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "blob_database_file", db_path, NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1m", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + /* Wait for recovery to complete */ + sleep(2); + + flb_stop(ctx); + 
flb_destroy(ctx); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); + + /* Cleanup database file */ + unlink(db_path); +} + +/* Test: Recovery without blob database (fstore only) */ +void flb_test_recovery_without_db(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + + /* First run */ + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "store_dir", TEST_STORE_DIR, NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "10m", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(1); + + flb_stop(ctx); + flb_destroy(ctx); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); + + /* Second run - recovery */ + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "store_dir", TEST_STORE_DIR, NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1m", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + sleep(2); + + flb_stop(ctx); + flb_destroy(ctx); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); +} + +/* Test: Multipart upload resume after crash */ +void flb_test_recovery_multipart_resume(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + char db_path[256]; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + + snprintf(db_path, sizeof(db_path), "/tmp/s3_multipart_recovery_%d.db", (int)time(NULL)); + + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "blob_database_file", db_path, NULL); + flb_output_set(ctx, out_ffd, "upload_chunk_size", "5M", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "10M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1m", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + /* Push enough data to trigger multipart */ + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(2); + + flb_stop(ctx); + flb_destroy(ctx); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); + + /* Cleanup */ + unlink(db_path); +} + +/* Test: Recovery with part-level tracking */ +void flb_test_recovery_part_level_tracking(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; 
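Part-level tracking implies the buffered file is split into fixed-size pieces that are delivered and recorded independently, and the multipart test file earlier in this change expects roughly total_file_size / upload_chunk_size parts with an adjustment to stay under S3's 10,000-part limit. The arithmetic below is a rough sketch under those assumptions; the doubling strategy is illustrative, not the plugin's exact algorithm.

#include <stdio.h>

#define S3_MAX_PARTS 10000LL   /* S3 permits at most 10,000 parts per upload */

/* Parts needed for total bytes at the given part size, rounding up */
static long long count_parts(long long total, long long part_size)
{
    return (total + part_size - 1) / part_size;
}

/* Grow the part size until the part count fits under S3_MAX_PARTS
 * (doubling is only one possible strategy) */
static long long adjust_part_size(long long total, long long part_size)
{
    while (count_parts(total, part_size) > S3_MAX_PARTS) {
        part_size *= 2;
    }
    return part_size;
}

int main(void)
{
    long long mb = 1024LL * 1024;
    long long gb = 1024 * mb;

    /* 50M at 10M per part -> 5 parts, matching the "~5 parts" note in the
     * multipart test */
    printf("parts: %lld\n", count_parts(50 * mb, 10 * mb));

    /* 100G at 5M would need 20480 parts; doubling twice lands on 20M per
     * part, i.e. 5120 parts, which is under the limit */
    printf("adjusted part size: %lld\n", adjust_part_size(100 * gb, 5 * mb));
    return 0;
}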
+ int out_ffd; + char db_path[256]; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + + snprintf(db_path, sizeof(db_path), "/tmp/s3_part_tracking_%d.db", (int)time(NULL)); + + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "blob_database_file", db_path, NULL); + flb_output_set(ctx, out_ffd, "part_size", "1000", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1m", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(2); + + flb_stop(ctx); + flb_destroy(ctx); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); + + unlink(db_path); +} + +/* Test: Recovery with failed uploads cleanup */ +void flb_test_recovery_failed_uploads_cleanup(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + char db_path[256]; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + + snprintf(db_path, sizeof(db_path), "/tmp/s3_failed_cleanup_%d.db", (int)time(NULL)); + + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "blob_database_file", db_path, NULL); + flb_output_set(ctx, out_ffd, "retry_limit", "2", NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1m", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(2); + + flb_stop(ctx); + flb_destroy(ctx); + unsetenv("FLB_S3_PLUGIN_UNDER_TEST"); + + unlink(db_path); +} + +/* Test: Recovery with stale uploads */ +void flb_test_recovery_stale_uploads(void) +{ + int ret; + flb_ctx_t *ctx; + int in_ffd; + int out_ffd; + char db_path[256]; + + setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1); + + snprintf(db_path, sizeof(db_path), "/tmp/s3_stale_uploads_%d.db", (int)time(NULL)); + + ctx = flb_create(); + flb_service_set(ctx, "flush", "1", "grace", "1", NULL); + + in_ffd = flb_input(ctx, (char *) "lib", NULL); + TEST_CHECK(in_ffd >= 0); + flb_input_set(ctx, in_ffd, "tag", "test", NULL); + + out_ffd = flb_output(ctx, (char *) "s3", NULL); + TEST_CHECK(out_ffd >= 0); + flb_output_set(ctx, out_ffd, "match", "*", NULL); + flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL); + flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL); + flb_output_set(ctx, out_ffd, "blob_database_file", db_path, NULL); + flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL); + flb_output_set(ctx, out_ffd, "upload_timeout", "1m", NULL); + + ret = flb_start(ctx); + TEST_CHECK(ret == 0); + + flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1); + sleep(2); + + flb_stop(ctx); + 
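Stale-upload recovery is, at bottom, an age check: an in-progress upload restored from the database whose last activity is older than some cutoff is aborted or re-driven rather than trusted. A generic sketch of that decision is shown below, with hypothetical struct and field names that are not the plugin's schema.

#include <stdio.h>
#include <time.h>

/* Hypothetical record of an in-progress upload as it might be restored
 * from the blob database on startup */
struct pending_upload {
    time_t last_activity;   /* when a part was last uploaded */
};

/* Returns 1 if the upload should be treated as stale */
static int upload_is_stale(const struct pending_upload *u, time_t now,
                           double max_age_seconds)
{
    return difftime(now, u->last_activity) > max_age_seconds;
}

int main(void)
{
    struct pending_upload u = { time(NULL) - 7200 };   /* two hours old */

    /* With a one hour cutoff this upload counts as stale */
    printf("stale: %d\n", upload_is_stale(&u, time(NULL), 3600.0));
    return 0;
}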
+    flb_destroy(ctx);
+    unsetenv("FLB_S3_PLUGIN_UNDER_TEST");
+
+    unlink(db_path);
+}
+
+/* Test: Blob database file operations */
+void flb_test_blob_db_file_operations(void)
+{
+    int ret;
+    flb_ctx_t *ctx;
+    int in_ffd;
+    int out_ffd;
+    char db_path[256];
+
+    setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1);
+
+    snprintf(db_path, sizeof(db_path), "/tmp/s3_blob_ops_%d.db", (int)time(NULL));
+
+    ctx = flb_create();
+    flb_service_set(ctx, "flush", "1", "grace", "1", NULL);
+
+    in_ffd = flb_input(ctx, (char *) "lib", NULL);
+    TEST_CHECK(in_ffd >= 0);
+    flb_input_set(ctx, in_ffd, "tag", "test", NULL);
+
+    out_ffd = flb_output(ctx, (char *) "s3", NULL);
+    TEST_CHECK(out_ffd >= 0);
+    flb_output_set(ctx, out_ffd, "match", "*", NULL);
+    flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL);
+    flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL);
+    flb_output_set(ctx, out_ffd, "blob_database_file", db_path, NULL);
+    flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL);
+    flb_output_set(ctx, out_ffd, "upload_timeout", "1m", NULL);
+
+    ret = flb_start(ctx);
+    TEST_CHECK(ret == 0);
+
+    /* Test basic file operations */
+    flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1);
+    sleep(2);
+
+    flb_stop(ctx);
+    flb_destroy(ctx);
+    unsetenv("FLB_S3_PLUGIN_UNDER_TEST");
+
+    unlink(db_path);
+}
+
+/* Test: Blob database concurrent access */
+void flb_test_blob_db_concurrent_access(void)
+{
+    int ret;
+    flb_ctx_t *ctx;
+    int in_ffd;
+    int out_ffd;
+    char db_path[256];
+    int i;
+
+    setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1);
+
+    snprintf(db_path, sizeof(db_path), "/tmp/s3_blob_concurrent_%d.db", (int)time(NULL));
+
+    ctx = flb_create();
+    flb_service_set(ctx, "flush", "1", "grace", "1", NULL);
+
+    in_ffd = flb_input(ctx, (char *) "lib", NULL);
+    TEST_CHECK(in_ffd >= 0);
+    flb_input_set(ctx, in_ffd, "tag", "test", NULL);
+
+    out_ffd = flb_output(ctx, (char *) "s3", NULL);
+    TEST_CHECK(out_ffd >= 0);
+    flb_output_set(ctx, out_ffd, "match", "*", NULL);
+    flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL);
+    flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL);
+    flb_output_set(ctx, out_ffd, "blob_database_file", db_path, NULL);
+    flb_output_set(ctx, out_ffd, "workers", "5", NULL);
+    flb_output_set(ctx, out_ffd, "total_file_size", "100K", NULL);
+    flb_output_set(ctx, out_ffd, "upload_timeout", "1m", NULL);
+
+    ret = flb_start(ctx);
+    TEST_CHECK(ret == 0);
+
+    /* Push multiple chunks to trigger concurrent DB access */
+    for (i = 0; i < 10; i++) {
+        flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1);
+    }
+    sleep(2);
+
+    flb_stop(ctx);
+    flb_destroy(ctx);
+    unsetenv("FLB_S3_PLUGIN_UNDER_TEST");
+
+    unlink(db_path);
+}
+
+/* Test list */
+TEST_LIST = {
+    {"recovery_with_blob_db", flb_test_recovery_with_blob_db},
+    {"recovery_without_db", flb_test_recovery_without_db},
+    {"recovery_multipart_resume", flb_test_recovery_multipart_resume},
+    {"recovery_part_level_tracking", flb_test_recovery_part_level_tracking},
+    {"recovery_failed_uploads_cleanup", flb_test_recovery_failed_uploads_cleanup},
+    {"recovery_stale_uploads", flb_test_recovery_stale_uploads},
+    {"blob_db_file_operations", flb_test_blob_db_file_operations},
+    {"blob_db_concurrent_access", flb_test_blob_db_concurrent_access},
+    {NULL, NULL}
+};
diff --git a/tests/runtime/out_s3_special_features.c b/tests/runtime/out_s3_special_features.c
new file mode 100644
index 00000000000..0ed7856226f
--- /dev/null
+++ b/tests/runtime/out_s3_special_features.c
@@ -0,0 +1,520 @@
+/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+
+/* Fluent Bit
+ * ==========
+ * Copyright (C) 2015-2024 The Fluent Bit Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <fluent-bit.h>
+#include "flb_tests_runtime.h"
+
+/* Test data */
+#include "data/td/json_td.h" /* JSON_TD */
+
+/* Test: Snappy compression */
+void flb_test_snappy_compression(void)
+{
+    int ret;
+    flb_ctx_t *ctx;
+    int in_ffd;
+    int out_ffd;
+
+    /* mock calls - signals that we are in test mode */
+    setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1);
+
+    ctx = flb_create();
+    flb_service_set(ctx, "flush", "1", "grace", "1", NULL);
+
+    in_ffd = flb_input(ctx, (char *) "lib", NULL);
+    TEST_CHECK(in_ffd >= 0);
+    flb_input_set(ctx, in_ffd, "tag", "test", NULL);
+
+    out_ffd = flb_output(ctx, (char *) "s3", NULL);
+    TEST_CHECK(out_ffd >= 0);
+    flb_output_set(ctx, out_ffd, "match", "*", NULL);
+    flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL);
+    flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL);
+    flb_output_set(ctx, out_ffd, "compression", "snappy", NULL);
+    flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL);
+    flb_output_set(ctx, out_ffd, "upload_timeout", "1m", NULL);
+
+    ret = flb_start(ctx);
+    TEST_CHECK(ret == 0);
+
+    flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1);
+    sleep(2);
+
+    flb_stop(ctx);
+    flb_destroy(ctx);
+    unsetenv("FLB_S3_PLUGIN_UNDER_TEST");
+}
+
+/* Test: Snappy compression with JSON format */
+void flb_test_snappy_with_json(void)
+{
+    int ret;
+    flb_ctx_t *ctx;
+    int in_ffd;
+    int out_ffd;
+
+    setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1);
+
+    ctx = flb_create();
+    flb_service_set(ctx, "flush", "1", "grace", "1", NULL);
+
+    in_ffd = flb_input(ctx, (char *) "lib", NULL);
+    TEST_CHECK(in_ffd >= 0);
+    flb_input_set(ctx, in_ffd, "tag", "test", NULL);
+
+    out_ffd = flb_output(ctx, (char *) "s3", NULL);
+    TEST_CHECK(out_ffd >= 0);
+    flb_output_set(ctx, out_ffd, "match", "*", NULL);
+    flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL);
+    flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL);
+    flb_output_set(ctx, out_ffd, "format", "json", NULL);
+    flb_output_set(ctx, out_ffd, "compression", "snappy", NULL);
+    flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL);
+    flb_output_set(ctx, out_ffd, "upload_timeout", "1m", NULL);
+
+    ret = flb_start(ctx);
+    TEST_CHECK(ret == 0);
+
+    flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1);
+    sleep(2);
+
+    flb_stop(ctx);
+    flb_destroy(ctx);
+    unsetenv("FLB_S3_PLUGIN_UNDER_TEST");
+}
+
+/* Test: preserve_data_ordering enabled */
+void flb_test_preserve_ordering_enabled(void)
+{
+    int ret;
+    flb_ctx_t *ctx;
+    int in_ffd;
+    int out_ffd;
+
+    setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1);
+
+    ctx = flb_create();
+    flb_service_set(ctx, "flush", "1", "grace", "1", NULL);
+
+    in_ffd = flb_input(ctx, (char *) "lib", NULL);
+    TEST_CHECK(in_ffd >= 0);
+    flb_input_set(ctx, in_ffd, "tag", "test", NULL);
+
+    out_ffd = flb_output(ctx, (char *) "s3", NULL);
+    TEST_CHECK(out_ffd >= 0);
+    flb_output_set(ctx, out_ffd, "match", "*", NULL);
+    flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL);
+    flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL);
+    flb_output_set(ctx, out_ffd, "preserve_data_ordering", "true", NULL);
+    flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL);
+    flb_output_set(ctx, out_ffd, "upload_timeout", "1m", NULL);
+
+    ret = flb_start(ctx);
+    TEST_CHECK(ret == 0);
+
+    flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1);
+    sleep(2);
+
+    flb_stop(ctx);
+    flb_destroy(ctx);
+    unsetenv("FLB_S3_PLUGIN_UNDER_TEST");
+}
+
+/* Test: preserve_data_ordering disabled */
+void flb_test_preserve_ordering_disabled(void)
+{
+    int ret;
+    flb_ctx_t *ctx;
+    int in_ffd;
+    int out_ffd;
+
+    setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1);
+
+    ctx = flb_create();
+    flb_service_set(ctx, "flush", "1", "grace", "1", NULL);
+
+    in_ffd = flb_input(ctx, (char *) "lib", NULL);
+    TEST_CHECK(in_ffd >= 0);
+    flb_input_set(ctx, in_ffd, "tag", "test", NULL);
+
+    out_ffd = flb_output(ctx, (char *) "s3", NULL);
+    TEST_CHECK(out_ffd >= 0);
+    flb_output_set(ctx, out_ffd, "match", "*", NULL);
+    flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL);
+    flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL);
+    flb_output_set(ctx, out_ffd, "preserve_data_ordering", "false", NULL);
+    flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL);
+    flb_output_set(ctx, out_ffd, "upload_timeout", "1m", NULL);
+
+    ret = flb_start(ctx);
+    TEST_CHECK(ret == 0);
+
+    flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1);
+    sleep(2);
+
+    flb_stop(ctx);
+    flb_destroy(ctx);
+    unsetenv("FLB_S3_PLUGIN_UNDER_TEST");
+}
+
+/* Test: Blob database configuration */
+void flb_test_blob_database_config(void)
+{
+    int ret;
+    flb_ctx_t *ctx;
+    int in_ffd;
+    int out_ffd;
+    char db_path[256];
+
+    setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1);
+
+    /* Create unique database path */
+    snprintf(db_path, sizeof(db_path), "/tmp/s3_blob_test_%d.db", (int)time(NULL));
+
+    ctx = flb_create();
+    flb_service_set(ctx, "flush", "1", "grace", "1", NULL);
+
+    in_ffd = flb_input(ctx, (char *) "lib", NULL);
+    TEST_CHECK(in_ffd >= 0);
+    flb_input_set(ctx, in_ffd, "tag", "test", NULL);
+
+    out_ffd = flb_output(ctx, (char *) "s3", NULL);
+    TEST_CHECK(out_ffd >= 0);
+    flb_output_set(ctx, out_ffd, "match", "*", NULL);
+    flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL);
+    flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL);
+    flb_output_set(ctx, out_ffd, "blob_database_file", db_path, NULL);
+    flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL);
+    flb_output_set(ctx, out_ffd, "upload_timeout", "1m", NULL);
+
+    ret = flb_start(ctx);
+    TEST_CHECK(ret == 0);
+
+    flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1);
+    sleep(2);
+
+    flb_stop(ctx);
+    flb_destroy(ctx);
+    unsetenv("FLB_S3_PLUGIN_UNDER_TEST");
+
+    /* Cleanup database file */
+    unlink(db_path);
+}
+
+/* Test: Blob with custom part_size */
+void flb_test_blob_custom_part_size(void)
+{
+    int ret;
+    flb_ctx_t *ctx;
+    int in_ffd;
+    int out_ffd;
+    char db_path[256];
+
+    setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1);
+
+    snprintf(db_path, sizeof(db_path), "/tmp/s3_blob_test_%d.db", (int)time(NULL));
+
+    ctx = flb_create();
+    flb_service_set(ctx, "flush", "1", "grace", "1", NULL);
+
+    in_ffd = flb_input(ctx, (char *) "lib", NULL);
+    TEST_CHECK(in_ffd >= 0);
+    flb_input_set(ctx, in_ffd, "tag", "test", NULL);
+
+    out_ffd = flb_output(ctx, (char *) "s3", NULL);
+    TEST_CHECK(out_ffd >= 0);
+    flb_output_set(ctx, out_ffd, "match", "*", NULL);
+    flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL);
+    flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL);
+    flb_output_set(ctx, out_ffd, "blob_database_file", db_path, NULL);
+    flb_output_set(ctx, out_ffd, "part_size", "5000", NULL);
+    flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL);
+    flb_output_set(ctx, out_ffd, "upload_timeout", "1m", NULL);
+
+    ret = flb_start(ctx);
+    TEST_CHECK(ret == 0);
+
+    flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1);
+    sleep(2);
+
+    flb_stop(ctx);
+    flb_destroy(ctx);
+    unsetenv("FLB_S3_PLUGIN_UNDER_TEST");
+
+    /* Cleanup database file */
+    unlink(db_path);
+}
+
+/* Test: Invalid compression type configuration */
+void flb_test_invalid_compression_type(void)
+{
+    int ret;
+    flb_ctx_t *ctx;
+    int in_ffd;
+    int out_ffd;
+
+    setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1);
+
+    ctx = flb_create();
+    flb_service_set(ctx, "flush", "1", "grace", "1", NULL);
+
+    in_ffd = flb_input(ctx, (char *) "lib", NULL);
+    TEST_CHECK(in_ffd >= 0);
+    flb_input_set(ctx, in_ffd, "tag", "test", NULL);
+
+    out_ffd = flb_output(ctx, (char *) "s3", NULL);
+    TEST_CHECK(out_ffd >= 0);
+    flb_output_set(ctx, out_ffd, "match", "*", NULL);
+    flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL);
+    flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL);
+    flb_output_set(ctx, out_ffd, "compression", "invalid_compression", NULL);
+
+    /* Should fail due to invalid compression type */
+    ret = flb_start(ctx);
+    TEST_CHECK(ret == -1);
+
+    if (ret == 0) {
+        flb_stop(ctx);
+    }
+    flb_destroy(ctx);
+    unsetenv("FLB_S3_PLUGIN_UNDER_TEST");
+}
+
+/* Test: Auto-adjust chunk_size > total_file_size */
+void flb_test_auto_adjust_chunk_size(void)
+{
+    int ret;
+    flb_ctx_t *ctx;
+    int in_ffd;
+    int out_ffd;
+
+    setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1);
+
+    ctx = flb_create();
+    flb_service_set(ctx, "flush", "1", "grace", "1", NULL);
+
+    in_ffd = flb_input(ctx, (char *) "lib", NULL);
+    TEST_CHECK(in_ffd >= 0);
+    flb_input_set(ctx, in_ffd, "tag", "test", NULL);
+
+    out_ffd = flb_output(ctx, (char *) "s3", NULL);
+    TEST_CHECK(out_ffd >= 0);
+    flb_output_set(ctx, out_ffd, "match", "*", NULL);
+    flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL);
+    flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL);
+    flb_output_set(ctx, out_ffd, "total_file_size", "5M", NULL);
+    flb_output_set(ctx, out_ffd, "upload_chunk_size", "10M", NULL);
+    flb_output_set(ctx, out_ffd, "upload_timeout", "1m", NULL);
+
+    /* S3 plugin should auto-adjust upload_chunk_size to match total_file_size */
+    ret = flb_start(ctx);
+    TEST_CHECK(ret == 0);
+
+    if (ret == 0) {
+        flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1);
+        sleep(2);
+        flb_stop(ctx);
+    }
+    flb_destroy(ctx);
+    unsetenv("FLB_S3_PLUGIN_UNDER_TEST");
+}
+
+/* Test: chunk_size equals total_file_size (boundary condition) */
+void flb_test_chunk_size_equals_file_size(void)
+{
+    int ret;
+    flb_ctx_t *ctx;
+    int in_ffd;
+    int out_ffd;
+
+    setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1);
+
+    /*
+     * Test the boundary condition where upload_chunk_size exactly equals total_file_size.
+     * This verifies that no unnecessary adjustment logic is triggered and the plugin
+     * handles this equality case correctly.
+     */
+    ctx = flb_create();
+    flb_service_set(ctx, "flush", "1", "grace", "1", NULL);
+
+    in_ffd = flb_input(ctx, (char *) "lib", NULL);
+    TEST_CHECK(in_ffd >= 0);
+    flb_input_set(ctx, in_ffd, "tag", "test", NULL);
+
+    out_ffd = flb_output(ctx, (char *) "s3", NULL);
+    TEST_CHECK(out_ffd >= 0);
+    flb_output_set(ctx, out_ffd, "match", "*", NULL);
+    flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL);
+    flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL);
+    flb_output_set(ctx, out_ffd, "total_file_size", "5M", NULL);
+    flb_output_set(ctx, out_ffd, "upload_chunk_size", "5M", NULL);
+    flb_output_set(ctx, out_ffd, "upload_timeout", "1m", NULL);
+
+    ret = flb_start(ctx);
+    TEST_CHECK(ret == 0);
+
+    if (ret == 0) {
+        flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1);
+        sleep(2);
+        flb_stop(ctx);
+    }
+    flb_destroy(ctx);
+    unsetenv("FLB_S3_PLUGIN_UNDER_TEST");
+}
+
+/* Test: Credential expiration with retry */
+void flb_test_credential_expiration_retry(void)
+{
+    int ret;
+    flb_ctx_t *ctx;
+    int in_ffd;
+    int out_ffd;
+
+    setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1);
+
+    /*
+     * Test credential expiration scenario with file_delivery_attempt_limit > 1.
+     * This simulates the case where credentials expire during upload.
+     * With the credential refresh retry logic, the request should be retried
+     * after credentials are refreshed.
+     */
+    ctx = flb_create();
+    flb_service_set(ctx, "flush", "1", "grace", "1", NULL);
+
+    in_ffd = flb_input(ctx, (char *) "lib", NULL);
+    TEST_CHECK(in_ffd >= 0);
+    flb_input_set(ctx, in_ffd, "tag", "test", NULL);
+
+    out_ffd = flb_output(ctx, (char *) "s3", NULL);
+    TEST_CHECK(out_ffd >= 0);
+    flb_output_set(ctx, out_ffd, "match", "*", NULL);
+    flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL);
+    flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL);
+    flb_output_set(ctx, out_ffd, "file_delivery_attempt_limit", "3", NULL);
+    flb_output_set(ctx, out_ffd, "part_delivery_attempt_limit", "5", NULL);
+    flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL);
+    flb_output_set(ctx, out_ffd, "upload_timeout", "1m", NULL);
+
+    ret = flb_start(ctx);
+    TEST_CHECK(ret == 0);
+
+    flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1);
+    sleep(2);
+
+    flb_stop(ctx);
+    flb_destroy(ctx);
+    unsetenv("FLB_S3_PLUGIN_UNDER_TEST");
+}
+
+/* Test: Credential expiration with strict mode (file_delivery_attempt_limit=1) */
+void flb_test_credential_expiration_strict(void)
+{
+    int ret;
+    flb_ctx_t *ctx;
+    int in_ffd;
+    int out_ffd;
+
+    setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1);
+
+    /*
+     * Test that with file_delivery_attempt_limit=1, credential expiration
+     * still allows one immediate retry after credential refresh.
+     * This tests the credential refresh retry logic.
+     */
+    ctx = flb_create();
+    flb_service_set(ctx, "flush", "1", "grace", "1", NULL);
+
+    in_ffd = flb_input(ctx, (char *) "lib", NULL);
+    TEST_CHECK(in_ffd >= 0);
+    flb_input_set(ctx, in_ffd, "tag", "test", NULL);
+
+    out_ffd = flb_output(ctx, (char *) "s3", NULL);
+    TEST_CHECK(out_ffd >= 0);
+    flb_output_set(ctx, out_ffd, "match", "*", NULL);
+    flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL);
+    flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL);
+    flb_output_set(ctx, out_ffd, "file_delivery_attempt_limit", "1", NULL);
+    flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL);
+    flb_output_set(ctx, out_ffd, "upload_timeout", "1m", NULL);
+
+    ret = flb_start(ctx);
+    TEST_CHECK(ret == 0);
+
+    flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1);
+    sleep(2);
+
+    flb_stop(ctx);
+    flb_destroy(ctx);
+    unsetenv("FLB_S3_PLUGIN_UNDER_TEST");
+}
+
+/* Test: preserve_data_ordering with multiple workers */
+void flb_test_preserve_ordering_with_workers(void)
+{
+    int ret;
+    flb_ctx_t *ctx;
+    int in_ffd;
+    int out_ffd;
+
+    setenv("FLB_S3_PLUGIN_UNDER_TEST", "true", 1);
+
+    ctx = flb_create();
+    flb_service_set(ctx, "flush", "1", "grace", "1", NULL);
+
+    in_ffd = flb_input(ctx, (char *) "lib", NULL);
+    TEST_CHECK(in_ffd >= 0);
+    flb_input_set(ctx, in_ffd, "tag", "test", NULL);
+
+    out_ffd = flb_output(ctx, (char *) "s3", NULL);
+    TEST_CHECK(out_ffd >= 0);
+    flb_output_set(ctx, out_ffd, "match", "*", NULL);
+    flb_output_set(ctx, out_ffd, "region", "us-west-2", NULL);
+    flb_output_set(ctx, out_ffd, "bucket", "test-bucket", NULL);
+    flb_output_set(ctx, out_ffd, "preserve_data_ordering", "true", NULL);
+    flb_output_set(ctx, out_ffd, "workers", "3", NULL);
+    flb_output_set(ctx, out_ffd, "total_file_size", "1M", NULL);
+    flb_output_set(ctx, out_ffd, "upload_timeout", "1m", NULL);
+
+    ret = flb_start(ctx);
+    TEST_CHECK(ret == 0);
+
+    flb_lib_push(ctx, in_ffd, (char *) JSON_TD, (int) sizeof(JSON_TD) - 1);
+    sleep(2);
+
+    flb_stop(ctx);
+    flb_destroy(ctx);
+    unsetenv("FLB_S3_PLUGIN_UNDER_TEST");
+}
+
+/* Test list */
+TEST_LIST = {
+    {"snappy_compression", flb_test_snappy_compression},
+    {"snappy_with_json", flb_test_snappy_with_json},
+    {"preserve_ordering_enabled", flb_test_preserve_ordering_enabled},
+    {"preserve_ordering_disabled", flb_test_preserve_ordering_disabled},
+    {"blob_database_config", flb_test_blob_database_config},
+    {"blob_custom_part_size", flb_test_blob_custom_part_size},
+    {"invalid_compression_type", flb_test_invalid_compression_type},
+    {"auto_adjust_chunk_size", flb_test_auto_adjust_chunk_size},
+    {"chunk_size_equals_file_size", flb_test_chunk_size_equals_file_size},
+    {"credential_expiration_retry", flb_test_credential_expiration_retry},
+    {"credential_expiration_strict", flb_test_credential_expiration_strict},
+    {"preserve_ordering_with_workers", flb_test_preserve_ordering_with_workers},
+    {NULL, NULL}
+};