From f28273296adcf42418cbb2df8c3d1d98749d516e Mon Sep 17 00:00:00 2001 From: Sagar Sumit Date: Fri, 13 Dec 2024 10:03:57 +0530 Subject: [PATCH 1/2] [DOCS] Cut new docs version 1.0.0 --- website/docusaurus.config.js | 10 +- .../version-1.0.0/azure_hoodie.md | 52 + .../version-1.0.0/bos_hoodie.md | 58 + .../version-1.0.0/cloud.md | 28 + .../version-1.0.0/comparison.md | 48 + .../version-1.0.0/concepts.md | 154 ++ .../version-1.0.0/configurations.md | 604 +++++ .../version-1.0.0/cos_hoodie.md | 73 + .../version-1.0.0/deployment.md | 433 ++++ .../version-1.0.0/docker_demo.md | 1122 ++++++++ .../version-1.0.0/docs-versions.md | 12 + .../version-1.0.0/flink-quick-start-guide.md | 530 ++++ .../version-1.0.0/gcs_hoodie.md | 61 + .../version-1.0.0/ibm_cos_hoodie.md | 78 + .../version-1.0.0/migration_guide.md | 60 + .../version-1.0.0/oss_hoodie.md | 71 + .../version-1.0.0/overview.md | 155 ++ .../version-1.0.0/performance.md | 62 + .../version-1.0.0/powered_by.md | 88 + .../version-1.0.0/privacy.md | 23 + .../version-1.0.0/querying_data.md | 224 ++ .../version-1.0.0/s3_hoodie.md | 81 + .../version-1.0.0/spark_quick-start-guide.md | 449 ++++ .../version-1.0.0/use_cases.md | 67 + .../version-1.0.0/writing_data.md | 222 ++ website/releases/download.md | 4 + .../src/components/HomepageHeader/index.js | 2 +- .../version-1.0.0/azure_hoodie.md | 50 + .../version-1.0.0/basic_configurations.md | 797 ++++++ .../version-1.0.0/bos_hoodie.md | 57 + .../versioned_docs/version-1.0.0/cleaning.md | 155 ++ website/versioned_docs/version-1.0.0/cli.md | 755 ++++++ website/versioned_docs/version-1.0.0/cloud.md | 38 + .../version-1.0.0/clustering.md | 346 +++ .../version-1.0.0/compaction.md | 228 ++ .../version-1.0.0/comparison.md | 56 + .../versioned_docs/version-1.0.0/concepts.md | 172 ++ .../version-1.0.0/concurrency_control.md | 339 +++ .../version-1.0.0/configurations.md | 2287 +++++++++++++++++ .../version-1.0.0/cos_hoodie.md | 71 + .../version-1.0.0/deployment.md | 284 ++ 
.../version-1.0.0/disaster_recovery.md | 309 +++ .../version-1.0.0/docker_demo.md | 1526 +++++++++++ .../version-1.0.0/encryption.md | 73 + website/versioned_docs/version-1.0.0/faq.md | 15 + .../version-1.0.0/faq_design_and_concepts.md | 61 + .../version-1.0.0/faq_general.md | 98 + .../version-1.0.0/faq_integrations.md | 68 + .../version-1.0.0/faq_reading_tables.md | 31 + .../version-1.0.0/faq_storage.md | 193 ++ .../version-1.0.0/faq_table_services.md | 55 + .../version-1.0.0/faq_writing_tables.md | 194 ++ .../version-1.0.0/file_sizing.md | 177 ++ .../version-1.0.0/flink-quick-start-guide.md | 474 ++++ .../version-1.0.0/flink_tuning.md | 117 + .../version-1.0.0/gcp_bigquery.md | 102 + .../version-1.0.0/gcs_hoodie.md | 60 + .../hoodie_streaming_ingestion.md | 638 +++++ .../version-1.0.0/hudi_stack.md | 174 ++ .../version-1.0.0/ibm_cos_hoodie.md | 77 + .../versioned_docs/version-1.0.0/indexes.md | 226 ++ .../version-1.0.0/ingestion_flink.md | 179 ++ .../version-1.0.0/ingestion_kafka_connect.md | 51 + website/versioned_docs/version-1.0.0/intro.md | 47 + .../version-1.0.0/jfs_hoodie.md | 96 + .../version-1.0.0/key_generation.md | 215 ++ .../versioned_docs/version-1.0.0/markers.md | 91 + .../versioned_docs/version-1.0.0/metadata.md | 134 + .../version-1.0.0/metadata_indexing.md | 318 +++ .../versioned_docs/version-1.0.0/metrics.md | 232 ++ .../version-1.0.0/migration_guide.md | 118 + .../version-1.0.0/oci_hoodie.md | 80 + .../version-1.0.0/oss_hoodie.md | 70 + .../versioned_docs/version-1.0.0/overview.mdx | 78 + .../version-1.0.0/performance.md | 133 + .../platform_services_post_commit_callback.md | 58 + .../version-1.0.0/precommit_validator.md | 101 + .../versioned_docs/version-1.0.0/privacy.md | 22 + .../version-1.0.0/procedures.md | 2001 ++++++++++++++ .../python-rust-quick-start-guide.md | 119 + .../version-1.0.0/querying_data.md | 99 + .../version-1.0.0/quick-start-guide.md | 1315 ++++++++++ .../reading_tables_batch_reads.md | 35 + 
.../reading_tables_streaming_reads.md | 99 + .../version-1.0.0/record_merger.md | 253 ++ .../versioned_docs/version-1.0.0/rollbacks.md | 72 + .../versioned_docs/version-1.0.0/s3_hoodie.md | 94 + .../version-1.0.0/schema_evolution.md | 319 +++ .../version-1.0.0/snapshot_exporter.md | 135 + .../versioned_docs/version-1.0.0/sql_ddl.md | 969 +++++++ .../versioned_docs/version-1.0.0/sql_dml.md | 539 ++++ .../version-1.0.0/sql_queries.md | 714 +++++ .../version-1.0.0/storage_layouts.md | 69 + .../versioned_docs/version-1.0.0/structure.md | 20 + .../syncing_aws_glue_data_catalog.md | 72 + .../version-1.0.0/syncing_datahub.md | 50 + .../version-1.0.0/syncing_metastore.md | 299 +++ .../version-1.0.0/syncing_xtable.md | 54 + .../version-1.0.0/table_types.md | 207 ++ .../versioned_docs/version-1.0.0/timeline.md | 153 ++ .../version-1.0.0/troubleshooting.md | 222 ++ .../version-1.0.0/tuning-guide.md | 106 + .../versioned_docs/version-1.0.0/use_cases.md | 103 + .../version-1.0.0/write_operations.md | 145 ++ .../version-1.0.0/writing_data.md | 444 ++++ .../writing_tables_streaming_writes.md | 95 + .../version-1.0.0-sidebars.json | 163 ++ website/versions.json | 1 + 108 files changed, 26057 insertions(+), 6 deletions(-) create mode 100644 website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/azure_hoodie.md create mode 100644 website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/bos_hoodie.md create mode 100644 website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/cloud.md create mode 100644 website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/comparison.md create mode 100644 website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/concepts.md create mode 100644 website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/configurations.md create mode 100644 website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/cos_hoodie.md create mode 100644 website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/deployment.md create mode 100644 
website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/docker_demo.md create mode 100644 website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/docs-versions.md create mode 100644 website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/flink-quick-start-guide.md create mode 100644 website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/gcs_hoodie.md create mode 100644 website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/ibm_cos_hoodie.md create mode 100644 website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/migration_guide.md create mode 100644 website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/oss_hoodie.md create mode 100644 website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/overview.md create mode 100644 website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/performance.md create mode 100644 website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/powered_by.md create mode 100644 website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/privacy.md create mode 100644 website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/querying_data.md create mode 100644 website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/s3_hoodie.md create mode 100644 website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/spark_quick-start-guide.md create mode 100644 website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/use_cases.md create mode 100644 website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/writing_data.md create mode 100644 website/versioned_docs/version-1.0.0/azure_hoodie.md create mode 100644 website/versioned_docs/version-1.0.0/basic_configurations.md create mode 100644 website/versioned_docs/version-1.0.0/bos_hoodie.md create mode 100644 website/versioned_docs/version-1.0.0/cleaning.md create mode 100644 website/versioned_docs/version-1.0.0/cli.md create mode 100644 website/versioned_docs/version-1.0.0/cloud.md create mode 100644 
website/versioned_docs/version-1.0.0/clustering.md create mode 100644 website/versioned_docs/version-1.0.0/compaction.md create mode 100644 website/versioned_docs/version-1.0.0/comparison.md create mode 100644 website/versioned_docs/version-1.0.0/concepts.md create mode 100644 website/versioned_docs/version-1.0.0/concurrency_control.md create mode 100644 website/versioned_docs/version-1.0.0/configurations.md create mode 100644 website/versioned_docs/version-1.0.0/cos_hoodie.md create mode 100644 website/versioned_docs/version-1.0.0/deployment.md create mode 100644 website/versioned_docs/version-1.0.0/disaster_recovery.md create mode 100644 website/versioned_docs/version-1.0.0/docker_demo.md create mode 100644 website/versioned_docs/version-1.0.0/encryption.md create mode 100644 website/versioned_docs/version-1.0.0/faq.md create mode 100644 website/versioned_docs/version-1.0.0/faq_design_and_concepts.md create mode 100644 website/versioned_docs/version-1.0.0/faq_general.md create mode 100644 website/versioned_docs/version-1.0.0/faq_integrations.md create mode 100644 website/versioned_docs/version-1.0.0/faq_reading_tables.md create mode 100644 website/versioned_docs/version-1.0.0/faq_storage.md create mode 100644 website/versioned_docs/version-1.0.0/faq_table_services.md create mode 100644 website/versioned_docs/version-1.0.0/faq_writing_tables.md create mode 100644 website/versioned_docs/version-1.0.0/file_sizing.md create mode 100644 website/versioned_docs/version-1.0.0/flink-quick-start-guide.md create mode 100644 website/versioned_docs/version-1.0.0/flink_tuning.md create mode 100644 website/versioned_docs/version-1.0.0/gcp_bigquery.md create mode 100644 website/versioned_docs/version-1.0.0/gcs_hoodie.md create mode 100644 website/versioned_docs/version-1.0.0/hoodie_streaming_ingestion.md create mode 100644 website/versioned_docs/version-1.0.0/hudi_stack.md create mode 100644 website/versioned_docs/version-1.0.0/ibm_cos_hoodie.md create mode 100644 
website/versioned_docs/version-1.0.0/indexes.md create mode 100644 website/versioned_docs/version-1.0.0/ingestion_flink.md create mode 100644 website/versioned_docs/version-1.0.0/ingestion_kafka_connect.md create mode 100644 website/versioned_docs/version-1.0.0/intro.md create mode 100644 website/versioned_docs/version-1.0.0/jfs_hoodie.md create mode 100644 website/versioned_docs/version-1.0.0/key_generation.md create mode 100644 website/versioned_docs/version-1.0.0/markers.md create mode 100644 website/versioned_docs/version-1.0.0/metadata.md create mode 100644 website/versioned_docs/version-1.0.0/metadata_indexing.md create mode 100644 website/versioned_docs/version-1.0.0/metrics.md create mode 100644 website/versioned_docs/version-1.0.0/migration_guide.md create mode 100644 website/versioned_docs/version-1.0.0/oci_hoodie.md create mode 100644 website/versioned_docs/version-1.0.0/oss_hoodie.md create mode 100644 website/versioned_docs/version-1.0.0/overview.mdx create mode 100644 website/versioned_docs/version-1.0.0/performance.md create mode 100644 website/versioned_docs/version-1.0.0/platform_services_post_commit_callback.md create mode 100644 website/versioned_docs/version-1.0.0/precommit_validator.md create mode 100644 website/versioned_docs/version-1.0.0/privacy.md create mode 100644 website/versioned_docs/version-1.0.0/procedures.md create mode 100644 website/versioned_docs/version-1.0.0/python-rust-quick-start-guide.md create mode 100644 website/versioned_docs/version-1.0.0/querying_data.md create mode 100644 website/versioned_docs/version-1.0.0/quick-start-guide.md create mode 100644 website/versioned_docs/version-1.0.0/reading_tables_batch_reads.md create mode 100644 website/versioned_docs/version-1.0.0/reading_tables_streaming_reads.md create mode 100644 website/versioned_docs/version-1.0.0/record_merger.md create mode 100644 website/versioned_docs/version-1.0.0/rollbacks.md create mode 100644 website/versioned_docs/version-1.0.0/s3_hoodie.md create 
mode 100755 website/versioned_docs/version-1.0.0/schema_evolution.md create mode 100644 website/versioned_docs/version-1.0.0/snapshot_exporter.md create mode 100644 website/versioned_docs/version-1.0.0/sql_ddl.md create mode 100644 website/versioned_docs/version-1.0.0/sql_dml.md create mode 100644 website/versioned_docs/version-1.0.0/sql_queries.md create mode 100644 website/versioned_docs/version-1.0.0/storage_layouts.md create mode 100644 website/versioned_docs/version-1.0.0/structure.md create mode 100644 website/versioned_docs/version-1.0.0/syncing_aws_glue_data_catalog.md create mode 100644 website/versioned_docs/version-1.0.0/syncing_datahub.md create mode 100644 website/versioned_docs/version-1.0.0/syncing_metastore.md create mode 100644 website/versioned_docs/version-1.0.0/syncing_xtable.md create mode 100644 website/versioned_docs/version-1.0.0/table_types.md create mode 100644 website/versioned_docs/version-1.0.0/timeline.md create mode 100644 website/versioned_docs/version-1.0.0/troubleshooting.md create mode 100644 website/versioned_docs/version-1.0.0/tuning-guide.md create mode 100644 website/versioned_docs/version-1.0.0/use_cases.md create mode 100644 website/versioned_docs/version-1.0.0/write_operations.md create mode 100644 website/versioned_docs/version-1.0.0/writing_data.md create mode 100644 website/versioned_docs/version-1.0.0/writing_tables_streaming_writes.md create mode 100644 website/versioned_sidebars/version-1.0.0-sidebars.json diff --git a/website/docusaurus.config.js b/website/docusaurus.config.js index 874a8fb1f80b2..1d5c45afc0240 100644 --- a/website/docusaurus.config.js +++ b/website/docusaurus.config.js @@ -140,11 +140,11 @@ module.exports = { }, { from: ["/docs/releases", "/docs/next/releases"], - to: "/releases/release-0.15.0", + to: "/releases/release-1.0.0", }, { from: ["/releases"], - to: "/releases/release-0.15.0", + to: "/releases/release-1.0.0", }, ], }, @@ -327,7 +327,7 @@ module.exports = { }, { label: "Releases", - to: 
"/releases/release-0.15.0", + to: "/releases/release-1.0.0", }, { label: "Download", @@ -509,8 +509,8 @@ module.exports = { path: "next", banner: "unreleased", }, - "0.15.0": { - label: "0.15.0", + "1.0.0": { + label: "1.0.0", path: "", }, }, diff --git a/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/azure_hoodie.md b/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/azure_hoodie.md new file mode 100644 index 0000000000000..f7ccb8424a127 --- /dev/null +++ b/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/azure_hoodie.md @@ -0,0 +1,52 @@ +--- +title: Azure 文件系统 +keywords: [ hudi, hive, azure, spark, presto] +summary: 在本页中,我们讨论如何在 Azure 文件系统中配置 Hudi 。 +last_modified_at: 2020-05-25T19:00:57-04:00 +language: cn +--- +在本页中,我们解释如何在 Microsoft Azure 上使用 Hudi 。 + +## 声明 + +本页面由 Hudi 社区维护。 +如果信息不准确,或者你有信息要补充,请尽管创建 JIRA ticket。 +对此贡献高度赞赏。 + +## 支持的存储系统 + +Hudi 支持两种存储系统。 + +- Azure Blob 存储 +- Azure Data Lake Gen 2 + +## 经过验证的 Spark 与存储系统的组合 + +#### Azure Data Lake Storage Gen 2 上的 HDInsight Spark 2.4 +This combination works out of the box. No extra config needed. 
+这种组合开箱即用,不需要额外的配置。 + +#### Azure Data Lake Storage Gen 2 上的 Databricks Spark 2.4 +- 将 Hudi jar 包导入到 databricks 工作区 。 + +- 将文件系统挂载到 dbutils 。 + ```scala + dbutils.fs.mount( + source = "abfss://xxx@xxx.dfs.core.windows.net", + mountPoint = "/mountpoint", + extraConfigs = configs) + ``` +- 当写入 Hudi 数据集时,使用 abfss URL + ```scala + inputDF.write + .format("org.apache.hudi") + .options(opts) + .mode(SaveMode.Append) + .save("abfss://<>.dfs.core.windows.net/hudi-tables/customer") + ``` +- 当读取 Hudi 数据集时,使用挂载点 + ```scala + spark.read + .format("org.apache.hudi") + .load("/mountpoint/hudi-tables/customer") + ``` diff --git a/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/bos_hoodie.md b/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/bos_hoodie.md new file mode 100644 index 0000000000000..6cb9f582c25de --- /dev/null +++ b/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/bos_hoodie.md @@ -0,0 +1,58 @@ +--- +title: BOS Filesystem +keywords: [ hudi, hive, baidu, bos, spark, presto] +summary: In this page, we go over how to configure Hudi with BOS filesystem. +last_modified_at: 2021-06-09T11:38:24-10:00 +language: cn +--- +这个页面描述了如何让你的Hudi任务使用Baidu BOS存储。 + +## Baidu BOS 部署 + +为了让Hudi使用BOS,需要增加两部分的配置: + +- 为Hudi增加Baidu BOS的相关配置 +- 增加Jar包到classpath + +### Baidu BOS 相关的配置 + +新增下面的配置到你的Hudi能访问的core-site.xml文件。使用你的BOS bucket name替换掉`fs.defaultFS`,使用BOS endpoint地址替换`fs.bos.endpoint`,使用BOS的key和secret分别替换`fs.bos.access.key`和`fs.bos.secret.access.key`,这样Hudi就能读写相应的bucket。 + +```xml +<property> + <name>fs.defaultFS</name> + <value>bos://bucketname/</value> +</property> + +<property> + <name>fs.bos.endpoint</name> + <value>bos-endpoint-address</value> + <description>Baidu bos endpoint to connect to,for example : http://bj.bcebos.com</description> +</property> + +<property> + <name>fs.bos.access.key</name> + <value>bos-key</value> + <description>Baidu access key</description> +</property> + +<property> + <name>fs.bos.secret.access.key</name> + <value>bos-secret-key</value> + <description>Baidu secret key.</description> +</property> + +<property> + <name>fs.bos.impl</name> + <value>org.apache.hadoop.fs.bos.BaiduBosFileSystem</value> +</property> +``` + +### Baidu BOS Libs + +新增Baidu hadoop的jar包添加到classpath.
+ +- com.baidubce:bce-java-sdk:0.10.165 +- bos-hdfs-sdk-1.0.2-community.jar + +可以从[这里](https://sdk.bce.baidu.com/console-sdk/bos-hdfs-sdk-1.0.2-community.jar.zip) 下载bos-hdfs-sdk jar包,然后解压。 diff --git a/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/cloud.md b/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/cloud.md new file mode 100644 index 0000000000000..ae4d2728f9818 --- /dev/null +++ b/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/cloud.md @@ -0,0 +1,28 @@ +--- +title: 云储存 +keywords: [ hudi, aws, gcp, oss, azure, cloud] +summary: "In this page, we introduce how Hudi work with different Cloud providers." +toc: true +last_modified_at: 2019-06-16T21:59:57-04:00 +language: cn +--- + +## 与云存储连接 + +无论使用RDD/WriteClient API还是数据源,以下信息都有助于配置对云存储的访问。 + + * [AWS S3](/cn/docs/s3_hoodie)
+ S3和Hudi协同工作所需的配置。 + * [Google Cloud Storage](/cn/docs/gcs_hoodie)
+ GCS和Hudi协同工作所需的配置。 + * [Alibaba Cloud OSS](/cn/docs/oss_hoodie)
+ 阿里云和Hudi协同工作所需的配置。 + * [Microsoft Azure](/cn/docs/azure_hoodie)
+ Azure和Hudi协同工作所需的配置。 + * [Tencent Cloud Object Storage](/cn/docs/cos_hoodie)
+ COS和Hudi协同工作所需的配置。 + * [IBM Cloud Object Storage](/cn/docs/ibm_cos_hoodie)
+ IBM Cloud Object Storage和Hudi协同工作所需的配置。 + * [Baidu Cloud Object Storage](/cn/docs/bos_hoodie)
+ 百度BOS和Hudi协同工作所需的配置。 + diff --git a/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/comparison.md b/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/comparison.md new file mode 100644 index 0000000000000..050dd530a940d --- /dev/null +++ b/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/comparison.md @@ -0,0 +1,48 @@ +--- +title: 对比 +keywords: [ apache, hudi, kafka, kudu, hive, hbase, stream processing] +last_modified_at: 2019-12-30T15:59:57-04:00 +language: cn +--- + +Apache Hudi填补了在DFS上处理数据的巨大空白,并可以和这些技术很好地共存。然而, +通过将Hudi与一些相关系统进行对比,来了解Hudi如何适应当前的大数据生态系统,并知晓这些系统在设计中做的不同权衡仍将非常有用。 + +## Kudu + +[Apache Kudu](https://kudu.apache.org)是一个与Hudi具有相似目标的存储系统,该系统通过对`upserts`支持来对PB级数据进行实时分析。 +一个关键的区别是Kudu还试图充当OLTP工作负载的数据存储,而Hudi并不希望这样做。 +因此,Kudu不支持增量拉取(截至2017年初),而Hudi支持以便进行增量处理。 + +Kudu与分布式文件系统抽象和HDFS完全不同,它自己的一组存储服务器通过RAFT相互通信。 +与之不同的是,Hudi旨在与底层Hadoop兼容的文件系统(HDFS,S3或Ceph)一起使用,并且没有自己的存储服务器群,而是依靠Apache Spark来完成繁重的工作。 +因此,Hudi可以像其他Spark作业一样轻松扩展,而Kudu则需要硬件和运营支持,特别是HBase或Vertica等数据存储系统。 +到目前为止,我们还没有做任何直接的基准测试来比较Kudu和Hudi(鉴于RTTable正在进行中)。 +但是,如果我们要使用[CERN](https://db-blog.web.cern.ch/blog/zbigniew-baranowski/2017-01-performance-comparison-different-file-formats-and-storage-engines), +我们预期Hudi在摄取parquet上有更卓越的性能。 + +## Hive事务 + +[Hive事务/ACID](https://cwiki.apache.org/confluence/display/Hive/Hive+Transactions)是另一项类似的工作,它试图实现在ORC文件格式之上的存储`读取时合并`。 +可以理解,此功能与Hive以及[LLAP](https://cwiki.apache.org/confluence/display/Hive/LLAP)之类的其他工作紧密相关。 +Hive事务不提供Hudi提供的读取优化存储选项或增量拉取。 +在实现选择方面,Hudi充分利用了类似Spark的处理框架的功能,而Hive事务特性则在用户或Hive Metastore启动的Hive任务/查询的下实现。 +根据我们的生产经验,与其他方法相比,将Hudi作为库嵌入到现有的Spark管道中要容易得多,并且操作不会太繁琐。 +Hudi还设计用于与Presto/Spark等非Hive引擎合作,并计划引入除parquet以外的文件格式。 + +## HBase + +尽管[HBase](https://hbase.apache.org)最终是OLTP工作负载的键值存储层,但由于与Hadoop的相似性,用户通常倾向于将HBase与分析相关联。 +鉴于HBase经过严格的写优化,它支持开箱即用的亚秒级更新,Hive-on-HBase允许用户查询该数据。 但是,就分析工作负载的实际性能而言,Parquet/ORC之类的混合列式存储格式可以轻松击败HBase,因为这些工作负载主要是读取繁重的工作。 +Hudi弥补了更快的数据与分析存储格式之间的差距。从运营的角度来看,与管理分析使用的HBase 
region服务器集群相比,为用户提供可更快给出数据的库更具可扩展性。 +最终,HBase不像Hudi这样重点支持`提交时间`、`增量拉取`之类的增量处理原语。 + +## 流式处理 + +一个普遍的问题:"Hudi与流处理系统有何关系?",我们将在这里尝试回答。简而言之,Hudi可以与当今的批处理(`写时复制存储`)和流处理(`读时合并存储`)作业集成,以将计算结果存储在Hadoop中。 +对于Spark应用程序,这可以通过将Hudi库与Spark/Spark流式DAG直接集成来实现。在非Spark处理系统(例如Flink、Hive)情况下,可以在相应的系统中进行处理,然后通过Kafka主题/DFS中间文件将其发送到Hudi表中。从概念上讲,数据处理 +管道仅由三个部分组成:`输入`,`处理`,`输出`,用户最终针对输出运行查询以便使用管道的结果。Hudi可以充当将数据存储在DFS上的输入或输出。Hudi在给定流处理管道上的适用性最终归结为你的查询在Presto/SparkSQL/Hive的适用性。 + +更高级的用例围绕[增量处理](https://www.oreilly.com/ideas/ubers-case-for-incremental-processing-on-hadoop)的概念展开, +甚至在`处理`引擎内部也使用Hudi来加速典型的批处理管道。例如:Hudi可用作DAG内的状态存储(类似Flink使用的[rocksDB](https://ci.apache.org/projects/flink/flink-docs-release-1.2/ops/state_backends#the-rocksdbstatebackend))。 +这是路线图上的一个项目并将最终以[Beam Runner](https://issues.apache.org/jira/browse/HUDI-60)的形式呈现。 \ No newline at end of file diff --git a/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/concepts.md b/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/concepts.md new file mode 100644 index 0000000000000..822b1ccb369ce --- /dev/null +++ b/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/concepts.md @@ -0,0 +1,154 @@ +--- +title: 概念 +keywords: [ hudi, design, storage, views, timeline] +summary: "Here we introduce some basic concepts & give a broad technical overview of Hudi" +toc: true +last_modified_at: 2019-12-30T15:59:57-04:00 +language: cn +--- + +Apache Hudi(发音为“Hudi”)在DFS的数据集上提供以下流原语 + + * 插入更新 (如何改变数据集?) + * 增量拉取 (如何获取变更的数据?)
+ +在本节中,我们将讨论重要的概念和术语,这些概念和术语有助于理解并有效使用这些原语。 + +## 时间轴 +在它的核心,Hudi维护一条包含在不同的`即时`时间所有对数据集操作的`时间轴`,从而提供,从不同时间点出发得到不同的视图下的数据集。Hudi即时包含以下组件 + + * `操作类型` : 对数据集执行的操作类型 + * `即时时间` : 即时时间通常是一个时间戳(例如:20190117010349),该时间戳按操作开始时间的顺序单调增加。 + * `状态` : 即时的状态 + +Hudi保证在时间轴上执行的操作的原子性和基于即时时间的时间轴一致性。 + +执行的关键操作包括 + + * `COMMITS` - 一次提交表示将一组记录**原子写入**到数据集中。 + * `CLEANS` - 删除数据集中不再需要的旧文件版本的后台活动。 + * `DELTA_COMMIT` - 增量提交是指将一批记录**原子写入**到MergeOnRead存储类型的数据集中,其中一些/所有数据都可以只写到增量日志中。 + * `COMPACTION` - 协调Hudi中差异数据结构的后台活动,例如:将更新从基于行的日志文件变成列格式。在内部,压缩表现为时间轴上的特殊提交。 + * `ROLLBACK` - 表示提交/增量提交不成功且已回滚,删除在写入过程中产生的所有部分文件。 + * `SAVEPOINT` - 将某些文件组标记为"已保存",以便清理程序不会将其删除。在发生灾难/数据恢复的情况下,它有助于将数据集还原到时间轴上的某个点。 + +任何给定的即时都可以处于以下状态之一 + + * `REQUESTED` - 表示已调度但尚未启动的操作。 + * `INFLIGHT` - 表示当前正在执行该操作。 + * `COMPLETED` - 表示在时间轴上完成了该操作。 + +
+ hudi_timeline.png +
+ +上面的示例显示了在Hudi数据集上大约10:00到10:20之间发生的更新事件,大约每5分钟一次,将提交元数据以及其他后台清理/压缩保留在Hudi时间轴上。 +观察的关键点是:提交时间指示数据的`到达时间`(上午10:20),而实际数据组织则反映了实际时间或`事件时间`,即数据所反映的(从07:00开始的每小时时段)。在权衡数据延迟和完整性时,这是两个关键概念。 + +如果有延迟到达的数据(事件时间为9:00的数据在10:20达到,延迟 >1 小时),我们可以看到upsert将新数据生成到更旧的时间段/文件夹中。 +在时间轴的帮助下,增量查询可以只提取10:00以后成功提交的新数据,并非常高效地只消费更改过的文件,且无需扫描更大的文件范围,例如07:00后的所有时间段。 + +## 文件组织 +Hudi将DFS上的数据集组织到`基本路径`下的目录结构中。数据集分为多个分区,这些分区是包含该分区的数据文件的文件夹,这与Hive表非常相似。 +每个分区被相对于基本路径的特定`分区路径`区分开来。 + +在每个分区内,文件被组织为`文件组`,由`文件id`唯一标识。 +每个文件组包含多个`文件切片`,其中每个切片包含在某个提交/压缩即时时间生成的基本列文件(`*.parquet`)以及一组日志文件(`*.log*`),该文件包含自生成基本文件以来对基本文件的插入/更新。 +Hudi采用MVCC设计,其中压缩操作将日志和基本文件合并以产生新的文件片,而清理操作则将未使用的/较旧的文件片删除以回收DFS上的空间。 + +Hudi通过索引机制将给定的hoodie键(记录键+分区路径)映射到文件组,从而提供了高效的Upsert。 +一旦将记录的第一个版本写入文件,记录键和文件组/文件id之间的映射就永远不会改变。 简而言之,映射的文件组包含一组记录的所有版本。 + +## 存储类型和视图 +Hudi存储类型定义了如何在DFS上对数据进行索引和布局以及如何在这种组织之上实现上述原语和时间轴活动(即如何写入数据)。 +反过来,`视图`定义了基础数据如何暴露给查询(即如何读取数据)。 + +| 存储类型 | 支持的视图 | +|-------------- |----------------| +| 写时复制 | 近实时 + 增量 | +| 读时合并 | 近实时 + 增量 + 读优化 | + +### 存储类型 +Hudi支持以下存储类型。 + + - [写时复制](#copy-on-write-storage) : 仅使用列文件格式(例如parquet)存储数据。通过在写入过程中执行同步合并以更新版本并重写文件。 + + - [读时合并](#merge-on-read-storage) : 使用列式(例如parquet)+ 基于行(例如avro)的文件格式组合来存储数据。 更新记录到增量文件中,然后进行同步或异步压缩以生成列文件的新版本。 + +下表总结了这两种存储类型之间的权衡 + +| 权衡 | 写时复制 | 读时合并 | +|-------------- |------------------| ------------------| +| 数据延迟 | 更高 | 更低 | +| 更新代价(I/O) | 更高(重写整个parquet文件) | 更低(追加到增量日志) | +| Parquet文件大小 | 更小(高更新代价(I/o)) | 更大(低更新代价) | +| 写放大 | 更高 | 更低(取决于压缩策略) | + + +### 视图 +Hudi支持以下存储数据的视图 + + - **读优化视图** : 在此视图上的查询将查看给定提交或压缩操作中数据集的最新快照。 + 该视图仅将最新文件切片中的基本/列文件暴露给查询,并保证与非Hudi列式数据集相比,具有相同的列式查询性能。 + - **增量视图** : 对该视图的查询只能看到从某个提交/压缩后写入数据集的新数据。该视图有效地提供了更改流,来支持增量数据管道。 + - **实时视图** : 在此视图上的查询将查看某个增量提交操作中数据集的最新快照。该视图通过动态合并最新的基本文件(例如parquet)和增量文件(例如avro)来提供近实时数据集(几分钟的延迟)。 + + +下表总结了不同视图之间的权衡。 + +| 权衡 | 读优化 | 实时 | +|-------------- |------------------| ------------------| +| 数据延迟 | 更高 | 更低 | +| 查询延迟 | 更低(原始列式性能)| 更高(合并列式 + 基于行的增量) | + + +## 写时复制存储 {#copy-on-write-storage} + 
+写时复制存储中的文件片仅包含基本/列文件,并且每次提交都会生成新版本的基本文件。 +换句话说,我们压缩每个提交,从而所有的数据都是以列数据的形式储存。在这种情况下,写入数据非常昂贵(我们需要重写整个列数据文件,即使只有一个字节的新数据被提交),而读取数据的成本则没有增加。 +这种视图有利于读取繁重的分析工作。 + +以下内容说明了将数据写入写时复制存储并在其上运行两个查询时,它是如何工作的。 + +
+ hudi_cow.png +
+ + +随着数据的写入,对现有文件组的更新将为该文件组生成一个带有提交即时时间标记的新切片,而插入分配一个新文件组并写入该文件组的第一个切片。 +这些文件切片及其提交即时时间在上面用颜色编码。 +针对这样的数据集运行SQL查询(例如:`select count(*)`统计该分区中的记录数目),首先检查时间轴上的最新提交并过滤每个文件组中除最新文件片以外的所有文件片。 +如您所见,旧查询不会看到以粉红色标记的当前进行中的提交的文件,但是在该提交后的新查询会获取新数据。因此,查询不受任何写入失败/部分写入的影响,仅运行在已提交数据上。 + +写时复制存储的目的是从根本上改善当前管理数据集的方式,通过以下方法来实现 + + - 优先支持在文件级原子更新数据,而无需重写整个表/分区 + - 能够只读取更新的部分,而不是进行低效的扫描或搜索 + - 严格控制文件大小来保持出色的查询性能(小的文件会严重损害查询性能)。 + +## 读时合并存储 {#merge-on-read-storage} + +读时合并存储是写时复制的升级版,从某种意义上说,它仍然可以通过读优化表提供数据集的读取优化视图(写时复制的功能)。 +此外,它将每个文件组的更新插入存储到基于行的增量日志中,通过文件id,将增量日志和最新版本的基本文件进行合并,从而提供近实时的数据查询。因此,此存储类型智能地平衡了读和写的成本,以提供近乎实时的查询。 +这里最重要的一点是压缩器,它现在可以仔细挑选需要压缩到其列式基础文件中的增量日志(根据增量日志的文件大小),以保持查询性能(较大的增量日志将会提升近实时的查询时间,并同时需要更长的合并时间)。 + +以下内容说明了存储的工作方式,并显示了对近实时表和读优化表的查询。 + +
+ hudi_mor.png +
+ +此示例中发生了很多有趣的事情,这些带出了该方法的微妙之处。 + + - 现在,我们每1分钟左右就有一次提交,这是其他存储类型无法做到的。 + - 现在,在每个文件id组中,都有一个增量日志,其中包含对基础列文件中记录的更新。 + 在示例中,增量日志包含10:05至10:10的所有数据。与以前一样,基本列式文件仍使用提交进行版本控制。 + 因此,如果只看一眼基本文件,那么存储布局看起来就像是写时复制表的副本。 + - 定期压缩过程会从增量日志中合并这些更改,并生成基础文件的新版本,就像示例中10:05发生的情况一样。 + - 有两种查询同一存储的方式:读优化(RO)表和近实时(RT)表,具体取决于我们选择查询性能还是数据新鲜度。 + - 对于RO表来说,提交数据在何时可用于查询将有些许不同。 请注意,以10:10运行的(在RO表上的)此类查询将不会看到10:05之后的数据,而在RT表上的查询总会看到最新的数据。 + - 何时触发压缩以及压缩什么是解决这些难题的关键。 + 通过实施压缩策略,在该策略中,与较旧的分区相比,我们会积极地压缩最新的分区,从而确保RO表能够以一致的方式看到几分钟内发布的数据。 + +读时合并存储上的目的是直接在DFS上启用近实时处理,而不是将数据复制到专用系统,后者可能无法处理大数据量。 +该存储还有一些其他方面的好处,例如通过避免数据的同步合并来减少写放大,即批量数据中每1字节数据需要的写入数据量。 diff --git a/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/configurations.md b/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/configurations.md new file mode 100644 index 0000000000000..4c3a3415d3eb0 --- /dev/null +++ b/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/configurations.md @@ -0,0 +1,604 @@ +--- +title: 配置 +keywords: [ garbage collection, hudi, jvm, configs, tuning] +summary: "Here we list all possible configurations and what they mean" +toc: true +last_modified_at: 2019-12-30T15:59:57-04:00 +language: cn +--- + +该页面介绍了几种配置写入或读取Hudi数据集的作业的方法。 +简而言之,您可以在几个级别上控制行为。 + +- **[Spark数据源配置](#spark-datasource)** : 这些配置控制Hudi Spark数据源,提供如下功能: + 定义键和分区、选择写操作、指定如何合并记录或选择要读取的视图类型。 +- **[WriteClient 配置](#writeclient-configs)** : 在内部,Hudi数据源使用基于RDD的`HoodieWriteClient` API + 真正执行对存储的写入。 这些配置可对文件大小、压缩(compression)、并行度、压缩(compaction)、写入模式、清理等底层方面进行完全控制。 + 尽管Hudi提供了合理的默认设置,但在不同情形下,可能需要对这些配置进行调整以针对特定的工作负载进行优化。 +- **[RecordPayload 配置](#PAYLOAD_CLASS_OPT_KEY)** : 这是Hudi提供的最底层的定制。 + RecordPayload定义了如何根据传入的新记录和存储的旧记录来产生新值以进行插入更新。 + Hudi提供了诸如`OverwriteWithLatestAvroPayload`的默认实现,该实现仅使用最新或最后写入的记录来更新存储。 + 在数据源和WriteClient级别,都可以将其重写为扩展`HoodieRecordPayload`类的自定义类。 + + +## Spark数据源配置 + +可以通过将以下选项传递到`option(k,v)`方法中来配置使用数据源的Spark作业。 +实际的数据源级别配置在下面列出。 + +### 写选项 + +另外,您可以使用`options()`或`option(k,v)`方法直接传递任何WriteClient级别的配置。 + 
+```java +inputDF.write() +.format("org.apache.hudi") +.options(clientOpts) // 任何Hudi客户端选项都可以传入 +.option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY(), "_row_key") +.option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), "partition") +.option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY(), "timestamp") +.option(HoodieWriteConfig.TABLE_NAME, tableName) +.mode(SaveMode.Append) +.save(basePath); +``` + +用于通过`write.format.option(...)`写入数据集的选项 + +#### TABLE_NAME_OPT_KEY + 属性:`hoodie.datasource.write.table.name` [必须]
+ Hive表名,用于将数据集注册到其中。 + +#### OPERATION_OPT_KEY + 属性:`hoodie.datasource.write.operation`, 默认值:`upsert`
+ 是否为写操作进行插入更新、插入或批量插入。使用`bulk_insert`将新数据加载到表中,之后使用`upsert`或`insert`。 + 批量插入使用基于磁盘的写入路径来扩展以加载大量输入,而无需对其进行缓存。 + +#### STORAGE_TYPE_OPT_KEY + 属性:`hoodie.datasource.write.storage.type`, 默认值:`COPY_ON_WRITE`
+ 此写入的基础数据的存储类型。两次写入之间不能改变。 + +#### PRECOMBINE_FIELD_OPT_KEY + 属性:`hoodie.datasource.write.precombine.field`, 默认值:`ts`
+ 实际写入之前在preCombining中使用的字段。 + 当两个记录具有相同的键值时,我们将使用Object.compareTo(..)从precombine字段中选择一个值最大的记录。 + +#### PAYLOAD_CLASS_OPT_KEY + 属性:`hoodie.datasource.write.payload.class`, 默认值:`org.apache.hudi.OverwriteWithLatestAvroPayload`
+ 使用的有效载荷类。如果您想在插入更新或插入时使用自己的合并逻辑,请覆盖此配置项。 + 这将使得`PRECOMBINE_FIELD_OPT_VAL`设置的任何值无效 + +#### RECORDKEY_FIELD_OPT_KEY + 属性:`hoodie.datasource.write.recordkey.field`, 默认值:`uuid`
+ 记录键字段。用作`HoodieKey`中`recordKey`部分的值。 + 实际值将通过在字段值上调用.toString()来获得。可以使用点符号指定嵌套字段,例如:`a.b.c` + +#### PARTITIONPATH_FIELD_OPT_KEY + 属性:`hoodie.datasource.write.partitionpath.field`, 默认值:`partitionpath`
+ 分区路径字段。用作`HoodieKey`中`partitionPath`部分的值。 + 通过调用.toString()获得实际的值 + +#### HIVE_STYLE_PARTITIONING_OPT_KEY + 属性:`hoodie.datasource.write.hive_style_partitioning`, 默认值:`false`
+ 如果设置为true,则生成基于Hive格式的partition目录:`partition_column_name=partition_value` + +#### KEYGENERATOR_CLASS_OPT_KEY + 属性:`hoodie.datasource.write.keygenerator.class`
+ 键生成器类,实现从输入的`Row`对象中提取键。该配置优先级大于 `hoodie.datasource.write.keygenerator.type`, 用于使用用户自定义键生成器 + +#### KEYGENERATOR_TYPE_OPT_KEY + 属性: `hoodie.datasource.write.keygenerator.type`, 默认值: `SIMPLE`
+键生成器类型,默认 `SIMPLE` 类型,该配置优先级低于 `hoodie.datasource.write.keygenerator.class`, 是推荐使用的配置方式 + +#### COMMIT_METADATA_KEYPREFIX_OPT_KEY + 属性:`hoodie.datasource.write.commitmeta.key.prefix`, 默认值:`_`
+ 以该前缀开头的选项键会自动添加到提交/增量提交的元数据中。 + 这对于与hudi时间轴一致的方式存储检查点信息很有用 + +#### INSERT_DROP_DUPS_OPT_KEY + 属性:`hoodie.datasource.write.insert.drop.duplicates`, 默认值:`false`
+ 如果设置为true,则在插入操作期间从传入DataFrame中过滤掉所有重复记录。 + +#### HIVE_SYNC_ENABLED_OPT_KEY + 属性:`hoodie.datasource.hive_sync.enable`, 默认值:`false`
+ 设置为true时,将数据集注册并同步到Apache Hive Metastore + +#### HIVE_DATABASE_OPT_KEY + 属性:`hoodie.datasource.hive_sync.database`, 默认值:`default`
+ 要同步到的数据库 + +#### HIVE_TABLE_OPT_KEY + 属性:`hoodie.datasource.hive_sync.table`, [必须]
+ 要同步到的表 + +#### HIVE_USER_OPT_KEY + 属性:`hoodie.datasource.hive_sync.username`, 默认值:`hive`
+ 要使用的Hive用户名 + +#### HIVE_PASS_OPT_KEY + 属性:`hoodie.datasource.hive_sync.password`, 默认值:`hive`
+ 要使用的Hive密码 + +#### HIVE_URL_OPT_KEY + 属性:`hoodie.datasource.hive_sync.jdbcurl`, 默认值:`jdbc:hive2://localhost:10000`
+ Hive metastore url + +#### HIVE_PARTITION_FIELDS_OPT_KEY + 属性:`hoodie.datasource.hive_sync.partition_fields`, 默认值:` `
+ 数据集中用于确定Hive分区的字段。 + +#### HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY + 属性:`hoodie.datasource.hive_sync.partition_extractor_class`, 默认值:`org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor`
+ 用于将分区字段值提取到Hive分区列中的类。 + +#### HIVE_ASSUME_DATE_PARTITION_OPT_KEY + 属性:`hoodie.datasource.hive_sync.assume_date_partitioning`, 默认值:`false`
+ 假设分区格式是yyyy/mm/dd + +### 读选项 + +用于通过`read.format.option(...)`读取数据集的选项 + +#### VIEW_TYPE_OPT_KEY +属性:`hoodie.datasource.view.type`, 默认值:`read_optimized`
+是否需要以某种模式读取数据,增量模式(自InstantTime以来的新数据) +(或)读优化模式(基于列数据获取最新视图) +(或)实时模式(基于行和列数据获取最新视图) + +#### BEGIN_INSTANTTIME_OPT_KEY +属性:`hoodie.datasource.read.begin.instanttime`, [在增量模式下必须]
+开始增量提取数据的即时时间。这里的instanttime不必一定与时间轴上的即时相对应。 +取出以`instant_time > BEGIN_INSTANTTIME`写入的新数据。 +例如:'20170901080000'将获取2017年9月1日08:00 AM之后写入的所有新数据。 + +#### END_INSTANTTIME_OPT_KEY +属性:`hoodie.datasource.read.end.instanttime`, 默认值:最新即时(即从开始即时获取所有新数据)
+限制增量提取的数据的即时时间。取出以`instant_time <= END_INSTANTTIME`写入的新数据。 + + +## WriteClient 配置 + +直接使用RDD级别api进行编程的Jobs可以构建一个`HoodieWriteConfig`对象,并将其传递给`HoodieWriteClient`构造函数。 +HoodieWriteConfig可以使用以下构建器模式构建。 + +```java +HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder() + .withPath(basePath) + .forTable(tableName) + .withSchema(schemaStr) + .withProps(props) // 从属性文件传递原始k、v对。 + .withCompactionConfig(HoodieCompactionConfig.newBuilder().withXXX(...).build()) + .withIndexConfig(HoodieIndexConfig.newBuilder().withXXX(...).build()) + ... + .build(); +``` + +以下各节介绍了写配置的不同方面,并解释了最重要的配置及其属性名称和默认值。 + +#### withPath(hoodie_base_path) +属性:`hoodie.base.path` [必须]
+创建所有数据分区所依据的基本DFS路径。 +始终在前缀中明确指明存储方式(例如hdfs://,s3://等)。 +Hudi将有关提交、保存点、清理审核日志等的所有主要元数据存储在基本目录下的.hoodie目录中。 + +#### withSchema(schema_str) +属性:`hoodie.avro.schema` [必须]
+这是数据集的当前读取器的avro模式(schema)。 +这是整个模式的字符串。HoodieWriteClient使用此模式传递到HoodieRecordPayload的实现,以从源格式转换为avro记录。 +在更新过程中重写记录时也使用此模式。 + +#### forTable(table_name) +属性:`hoodie.table.name` [必须]
+ 数据集的表名,将用于在Hive中注册。每次运行需要相同。 + +#### withBulkInsertParallelism(bulk_insert_parallelism = 1500) +属性:`hoodie.bulkinsert.shuffle.parallelism`
+批量插入旨在用于较大的初始导入,而此处的并行度决定了数据集中文件的初始数量。 +调整此值以达到在初始导入期间所需的最佳尺寸。 + +#### withParallelism(insert_shuffle_parallelism = 1500, upsert_shuffle_parallelism = 1500) +属性:`hoodie.insert.shuffle.parallelism`, `hoodie.upsert.shuffle.parallelism`
+最初导入数据后,此并行度将控制用于读取输入记录的初始并行度。 +确保此值足够高,例如:1个分区用于1 GB的输入数据 + +#### combineInput(on_insert = false, on_update=true) +属性:`hoodie.combine.before.insert`, `hoodie.combine.before.upsert`
+在DFS中插入或更新之前先组合输入RDD并将多个部分记录合并为单个记录的标志 + +#### withWriteStatusStorageLevel(level = MEMORY_AND_DISK_SER) +属性:`hoodie.write.status.storage.level`
+HoodieWriteClient.insert和HoodieWriteClient.upsert返回一个持久的RDD[WriteStatus], +这是因为客户端可以选择检查WriteStatus并根据失败选择是否提交。这是此RDD的存储级别的配置 + +#### withAutoCommit(autoCommit = true) +属性:`hoodie.auto.commit`
+插入和插入更新后,HoodieWriteClient是否应该自动提交。 +客户端可以选择关闭自动提交,并在"定义的成功条件"下提交 + +#### withAssumeDatePartitioning(assumeDatePartitioning = false) +属性:`hoodie.assume.date.partitioning`
+HoodieWriteClient是否应该假设数据按日期划分,即从基本路径划分为三个级别。 +这是支持0.3.1之前版本创建的表的一个补丁。最终将被删除 + +#### withConsistencyCheckEnabled(enabled = false) +属性:`hoodie.consistency.check.enabled`
+HoodieWriteClient是否应该执行其他检查,以确保写入的文件在基础文件系统/存储上可列出。 +将其设置为true可以解决S3的最终一致性模型,并确保作为提交的一部分写入的所有数据均能准确地用于查询。 + +### 索引配置 +以下配置控制索引行为,该行为将传入记录标记为对较旧记录的插入或更新。 + +[withIndexConfig](#withIndexConfig) (HoodieIndexConfig)
+可插入以具有外部索引(HBase)或使用存储在Parquet文件中的默认布隆过滤器(bloom filter) + +#### withIndexClass(indexClass = "x.y.z.UserDefinedIndex") +属性:`hoodie.index.class`
+用户自定义索引的全路径名,索引类必须为HoodieIndex的子类,当指定该配置时,其会优先于`hoodie.index.type`配置 + +#### withIndexType(indexType = BLOOM) +属性:`hoodie.index.type`
+要使用的索引类型。默认为布隆过滤器。可能的选项是[BLOOM | HBASE | INMEMORY]。 +布隆过滤器消除了对外部系统的依赖,并存储在Parquet数据文件的页脚中 + +#### bloomFilterNumEntries(numEntries = 60000) +属性:`hoodie.index.bloom.num_entries`
+仅在索引类型为BLOOM时适用。
这是要存储在布隆过滤器中的条目数。 +我们假设maxParquetFileSize为128MB,averageRecordSize为1024B,因此,一个文件中的记录总数约为130K。 +默认值(60000)大约是此近似值的一半。[HUDI-56](https://issues.apache.org/jira/browse/HUDI-56) +描述了如何动态地对此进行计算。 +警告:将此值设置得太低,将产生很多误报,并且索引查找将必须扫描比其所需的更多的文件;如果将其设置得非常高,将线性增加每个数据文件的大小(每50000个条目大约4KB)。
+ +#### bloomFilterFPP(fpp = 0.000000001) +属性:`hoodie.index.bloom.fpp`
+仅在索引类型为BLOOM时适用。
根据条目数允许的错误率。 +这用于计算应为布隆过滤器分配多少位以及哈希函数的数量。通常将此值设置得很低(默认值:0.000000001),我们希望在磁盘空间上进行权衡以降低误报率
+ +#### bloomIndexPruneByRanges(pruneRanges = true) +属性:`hoodie.bloom.index.prune.by.ranges`
+仅在索引类型为BLOOM时适用。
为true时,利用文件中的键范围信息来加快索引查找的速度。 如果键具有单调递增的前缀,例如时间戳,则特别有用。
+ +#### bloomIndexUseCaching(useCaching = true) +属性:`hoodie.bloom.index.use.caching`
+仅在索引类型为BLOOM时适用。
为true时,将缓存输入的RDD,通过减少计算并行度或受影响分区时的IO来加快索引查找
+ +#### bloomIndexTreebasedFilter(useTreeFilter = true) +属性:`hoodie.bloom.index.use.treebased.filter`
+仅在索引类型为BLOOM时适用。
为true时,启用基于间隔树的文件过滤优化。与暴力模式相比,此模式可根据键范围加快文件过滤速度
+ +#### bloomIndexBucketizedChecking(bucketizedChecking = true) +属性:`hoodie.bloom.index.bucketized.checking`
+仅在索引类型为BLOOM时适用。
为true时,启用了桶式布隆过滤。这减少了在基于排序的布隆索引查找中看到的偏差
+ +#### bloomIndexKeysPerBucket(keysPerBucket = 10000000) +属性:`hoodie.bloom.index.keys.per.bucket`
+仅在启用bloomIndexBucketizedChecking并且索引类型为BLOOM的情况下适用。
+此配置控制“存储桶”的大小,该大小可跟踪对单个文件进行的记录键检查的次数,并且是分配给执行布隆过滤器查找的每个分区的工作单位。 +较高的值将分摊将布隆过滤器读取到内存的固定成本。
+ +#### bloomIndexParallelism(0) +属性:`hoodie.bloom.index.parallelism`
+仅在索引类型为BLOOM时适用。
这是索引查找的并行度,其中涉及Spark Shuffle。 默认情况下,这是根据输入的工作负载特征自动计算的
+ +#### hbaseZkQuorum(zkString) [必须] +属性:`hoodie.index.hbase.zkquorum`
+仅在索引类型为HBASE时适用。要连接的HBase ZK Quorum URL。 + +#### hbaseZkPort(port) [必须] +属性:`hoodie.index.hbase.zkport`
+仅在索引类型为HBASE时适用。要连接的HBase ZK Quorum端口。 + +#### hbaseZkZnodeParent(zkZnodeParent) [必须] +属性:`hoodie.index.hbase.zknode.path`
+仅在索引类型为HBASE时适用。这是根znode,它将包含HBase创建及使用的所有znode。 + +#### hbaseTableName(tableName) [必须] +属性:`hoodie.index.hbase.table`
+仅在索引类型为HBASE时适用。HBase表名称,用作索引。Hudi将row_key和[partition_path, fileID, commitTime]映射存储在表中。 + +#### bloomIndexUpdatePartitionPath(updatePartitionPath = false) +属性:`hoodie.bloom.index.update.partition.path`
+仅在索引类型为GLOBAL_BLOOM时适用。
为true时,当对一个已有记录执行包含分区路径的更新操作时,将会导致把新记录插入到新分区,而把原有记录从旧分区里删除。为false时,只对旧分区的原有记录进行更新。
+ + +### 存储选项 +控制有关调整parquet和日志文件大小的方面。 + +[withStorageConfig](#withStorageConfig) (HoodieStorageConfig)
+ +#### limitFileSize (size = 120MB) +属性:`hoodie.parquet.max.file.size`
+Hudi写阶段生成的parquet文件的目标大小。对于DFS,这需要与基础文件系统块大小保持一致,以实现最佳性能。 + +#### parquetBlockSize(rowgroupsize = 120MB) +属性:`hoodie.parquet.block.size`
+Parquet行组大小。最好与文件大小相同,以便将文件中的单个列连续存储在磁盘上 + +#### parquetPageSize(pagesize = 1MB) +属性:`hoodie.parquet.page.size`
+Parquet页面大小。页面是parquet文件中的读取单位。 在一个块内,页面被分别压缩。 + +#### parquetCompressionRatio(parquetCompressionRatio = 0.1) +属性:`hoodie.parquet.compression.ratio`
+当Hudi尝试调整新parquet文件的大小时,预期对parquet数据进行压缩的比例。 +如果bulk_insert生成的文件小于预期大小,请增加此值 + +#### parquetCompressionCodec(parquetCompressionCodec = gzip) +属性:`hoodie.parquet.compression.codec`
+Parquet压缩编解码方式名称。默认值为gzip。可能的选项是[gzip | snappy | uncompressed | lzo] + +#### logFileMaxSize(logFileSize = 1GB) +属性:`hoodie.logfile.max.size`
+LogFile的最大大小。这是在将日志文件移到下一个版本之前允许的最大大小。 + +#### logFileDataBlockMaxSize(dataBlockSize = 256MB) +属性:`hoodie.logfile.data.block.max.size`
+LogFile数据块的最大大小。这是允许将单个数据块附加到日志文件的最大大小。 +这有助于确保附加到日志文件的数据被分解为可调整大小的块,以防止发生OOM错误。此大小应大于JVM内存。 + +#### logFileToParquetCompressionRatio(logFileToParquetCompressionRatio = 0.35) +属性:`hoodie.logfile.to.parquet.compression.ratio`
+随着记录从日志文件移动到parquet,预期会进行额外压缩的比例。 +用于merge_on_read存储,以将插入内容发送到日志文件中并控制压缩parquet文件的大小。 + +#### parquetCompressionCodec(parquetCompressionCodec = gzip) +属性:`hoodie.parquet.compression.codec`
+Parquet文件的压缩编解码方式 + +### 压缩配置 +压缩配置用于控制压缩(将日志文件合并到新的parquet基本文件中)、清理(回收较旧及未使用的文件组)。 +[withCompactionConfig](#withCompactionConfig) (HoodieCompactionConfig)
+ +#### withCleanerPolicy(policy = KEEP_LATEST_COMMITS) +属性:`hoodie.cleaner.policy`
+要使用的清理政策。Hudi将删除旧版本的parquet文件以回收空间。 +任何引用此版本文件的查询和计算都将失败。最好确保数据保留的时间超过最大查询执行时间。 + +#### retainCommits(no_of_commits_to_retain = 24) +属性:`hoodie.cleaner.commits.retained`
+保留的提交数。因此,数据将保留为num_of_commits * time_between_commits(计划的)。 +这也直接决定了您可以对此数据集进行增量提取的时间跨度 + +#### archiveCommitsWith(minCommits = 96, maxCommits = 128) +属性:`hoodie.keep.min.commits`, `hoodie.keep.max.commits`
+每个提交都是`.hoodie`目录中的一个小文件。由于DFS通常不支持大量小文件,因此Hudi将较早的提交归档到顺序日志中。 +提交通过重命名提交文件以原子方式发布。 + +#### withCommitsArchivalBatchSize(batch = 10) +属性:`hoodie.commits.archival.batch`
+这控制着批量读取并一起归档的提交即时的数量。 + +#### compactionSmallFileSize(size = 0) +属性:`hoodie.parquet.small.file.limit`
+该值应小于maxFileSize,如果将其设置为0,会关闭此功能。 +由于批处理中分区中插入记录的数量众多,总会出现小文件。 +Hudi提供了一个选项,可以通过将对该分区中的插入作为对现有小文件的更新来解决小文件的问题。 +此处的大小是被视为“小文件大小”的最小文件大小。 + +#### insertSplitSize(size = 500000) +属性:`hoodie.copyonwrite.insert.split.size`
+插入写入并行度。为单个分区的总共插入次数。 +写出100MB的文件,至少1kb大小的记录,意味着每个文件有100K记录。默认值是超额配置为500K。 +为了改善插入延迟,请对其进行调整以匹配单个文件中的记录数。 +将此值设置为较小的值将导致文件变小(尤其是当compactionSmallFileSize为0时) + +#### autoTuneInsertSplits(true) +属性:`hoodie.copyonwrite.insert.auto.split`
+Hudi是否应该基于最后24个提交的元数据动态计算insertSplitSize。默认开启。 + +#### approxRecordSize(size = 1024) +属性:`hoodie.copyonwrite.record.size.estimate`
+平均记录大小。如果指定,hudi将使用它,并且不会基于最后24个提交的元数据动态地计算。 +没有默认值设置。这对于计算插入并行度以及将插入打包到小文件中至关重要。如上所述。 + +#### withInlineCompaction(inlineCompaction = false) +属性:`hoodie.compact.inline`
+当设置为true时,紧接在插入或插入更新或批量插入的提交或增量提交操作之后由摄取本身触发压缩 + +#### withMaxNumDeltaCommitsBeforeCompaction(maxNumDeltaCommitsBeforeCompaction = 10) +属性:`hoodie.compact.inline.max.delta.commits`
+触发内联压缩之前要保留的最大增量提交数 + +#### withCompactionLazyBlockReadEnabled(true) +属性:`hoodie.compaction.lazy.block.read`
+当CompactedLogScanner合并所有日志文件时,此配置有助于选择是否应延迟读取日志块。 +选择true以使用I/O密集型延迟块读取(低内存使用),或者为false来使用内存密集型立即块读取(高内存使用) + +#### withCompactionReverseLogReadEnabled(false) +属性:`hoodie.compaction.reverse.log.read`
+HoodieLogFormatReader会从pos=0到pos=file_length向前读取日志文件。 +如果此配置设置为true,则Reader会从pos=file_length到pos=0反向读取日志文件 + +#### withCleanerParallelism(cleanerParallelism = 200) +属性:`hoodie.cleaner.parallelism`
+如果清理变慢,请增加此值。 + +#### withCompactionStrategy(compactionStrategy = org.apache.hudi.io.compact.strategy.LogFileSizeBasedCompactionStrategy) +属性:`hoodie.compaction.strategy`
+用来决定在每次压缩运行期间选择要压缩的文件组的压缩策略。 +默认情况下,Hudi选择具有累积最多未合并数据的日志文件 + +#### withTargetIOPerCompactionInMB(targetIOPerCompactionInMB = 500000) +属性:`hoodie.compaction.target.io`
+LogFileSizeBasedCompactionStrategy的压缩运行期间要花费的MB量。当压缩以内联模式运行时,此值有助于限制摄取延迟。 + +#### withTargetPartitionsPerDayBasedCompaction(targetPartitionsPerCompaction = 10) +属性:`hoodie.compaction.daybased.target`
+由org.apache.hudi.io.compact.strategy.DayBasedCompactionStrategy使用,表示在压缩运行期间要压缩的最新分区数。 + +#### withPayloadClass(payloadClassName = org.apache.hudi.common.model.HoodieAvroPayload) +属性:`hoodie.compaction.payload.class`
+这需要与插入/插入更新过程中使用的类相同。 +就像写入一样,压缩也使用记录有效负载类将日志中的记录彼此合并,再次与基本文件合并,并生成压缩后要写入的最终记录。 + + + +### 指标配置 +配置Hudi指标报告。 +[withMetricsConfig](#withMetricsConfig) (HoodieMetricsConfig)
+Hudi会发布有关每次提交、清理、回滚等的指标。 + +#### GRAPHITE + +##### on(metricsOn = false) +属性:`hoodie.metrics.on`
+打开或关闭发送指标。默认情况下处于关闭状态。 + +##### withReporterType(reporterType = GRAPHITE) +属性:`hoodie.metrics.reporter.type`
+指标报告者的类型。默认使用graphite。 + +##### toGraphiteHost(host = localhost) +属性:`hoodie.metrics.graphite.host`
+要连接的graphite主机 + +##### onGraphitePort(port = 4756) +属性:`hoodie.metrics.graphite.port`
+要连接的graphite端口 + +##### usePrefix(prefix = "") +属性:`hoodie.metrics.graphite.metric.prefix`
+适用于所有指标的标准前缀。这有助于添加如数据中心、环境等信息 + +#### JMX + +##### on(metricsOn = false) +属性:`hoodie.metrics.on`
+打开或关闭发送指标。默认情况下处于关闭状态。 + +##### withReporterType(reporterType = JMX) +属性:`hoodie.metrics.reporter.type`
+指标报告者的类型。 + +##### toJmxHost(host = localhost) +属性:`hoodie.metrics.jmx.host`
+要连接的Jmx主机 + +##### onJmxPort(port = 1000-5000) +属性:`hoodie.metrics.jmx.port`
+要连接的Jmx端口 + +#### DATADOG + +##### on(metricsOn = false) +属性:`hoodie.metrics.on`
+打开或关闭发送指标。默认情况下处于关闭状态。 + +##### withReporterType(reporterType = DATADOG) +属性: `hoodie.metrics.reporter.type`
+指标报告者的类型。 + +##### withDatadogReportPeriodSeconds(period = 30) +属性: `hoodie.metrics.datadog.report.period.seconds`
+Datadog报告周期,单位为秒,默认30秒。 + +##### withDatadogApiSite(apiSite) +属性: `hoodie.metrics.datadog.api.site`
+Datadog API站点:EU 或者 US + +##### withDatadogApiKey(apiKey) +属性: `hoodie.metrics.datadog.api.key`
+Datadog API密钥 + +##### withDatadogApiKeySkipValidation(skip = false) +属性: `hoodie.metrics.datadog.api.key.skip.validation`
+在通过Datadog API发送指标前,选择是否跳过验证API密钥。默认不跳过。 + +##### withDatadogApiKeySupplier(apiKeySupplier) +属性: `hoodie.metrics.datadog.api.key.supplier`
+Datadog API 密钥提供者,用来在运行时提供密钥。只有当`hoodie.metrics.datadog.api.key`未设定的情况下才有效。 + +##### withDatadogApiTimeoutSeconds(timeout = 3) +属性: `hoodie.metrics.datadog.api.timeout.seconds`
+Datadog API超时时长,单位为秒,默认3秒。 + +##### withDatadogPrefix(prefix) +属性: `hoodie.metrics.datadog.metric.prefix`
+Datadog指标前缀。将被加在所有指标名称前,以点间隔。例如:如果设成`foo`,`foo.`将被用作实际前缀。 + +##### withDatadogHost(host) +属性: `hoodie.metrics.datadog.metric.host`
+Datadog指标主机,将和指标数据一并发送。 + +##### withDatadogTags(tags) +属性: `hoodie.metrics.datadog.metric.tags`
+Datadog指标标签(逗号分隔),将和指标数据一并发送。 + +#### 用户自定义发送器 + +##### on(metricsOn = false) +属性: `hoodie.metrics.on`
+打开或关闭发送指标。默认情况下处于关闭状态。 + +##### withReporterClass(className = "") +属性: `hoodie.metrics.reporter.class`
+用于处理发送指标的用户自定义类,必须是AbstractUserDefinedMetricsReporter类的子类. + +### 内存配置 +控制由Hudi内部执行的压缩和合并的内存使用情况 +[withMemoryConfig](#withMemoryConfig) (HoodieMemoryConfig)
+内存相关配置 + +#### withMaxMemoryFractionPerPartitionMerge(maxMemoryFractionPerPartitionMerge = 0.6) +属性:`hoodie.memory.merge.fraction`
+该比例乘以用户内存比例(1-spark.memory.fraction)以获得合并期间要使用的堆空间的最终比例 + +#### withMaxMemorySizePerCompactionInBytes(maxMemorySizePerCompactionInBytes = 1GB) +属性:`hoodie.memory.compaction.fraction`
+HoodieCompactedLogScanner读取日志块,将记录转换为HoodieRecords,然后合并这些日志块和记录。 +在任何时候,日志块中的条目数可以小于或等于相应的parquet文件中的条目数。这可能导致Scanner出现OOM。 +因此,可溢出的映射有助于减轻内存压力。使用此配置来设置可溢出映射的最大允许inMemory占用空间。 + +#### withWriteStatusFailureFraction(failureFraction = 0.1) +属性:`hoodie.memory.writestatus.failure.fraction`
+此属性控制报告给驱动程序的失败记录和异常的比例 + +### 写提交回调配置 +控制写提交的回调。 如果用户启用了回调并且回调过程发生了错误,则会抛出异常。 当前支持HTTP, Kafka 两种回调方式。 +[withCallbackConfig](#withCallbackConfig) (HoodieWriteCommitCallbackConfig)
+写提交回调相关配置 + +##### writeCommitCallbackOn(callbackOn = false) +Property: `hoodie.write.commit.callback.on`
+打开或关闭回调功能. 默认关闭. + +##### withCallbackClass(callbackClass) +Property: `hoodie.write.commit.callback.class`
+回调类的完全限定名,必须实现HoodieWriteCommitCallback接口。默认 org.apache.hudi.callback.impl.HoodieWriteCommitHttpCallback + +#### HTTP CALLBACK +通过 HTTP 发送写提交回调信息. 这是默认的实现方式,用户不需要显式指定。 + +##### withCallbackHttpUrl(url) +Property: `hoodie.write.commit.callback.http.url`
+Http回调主机,回调信息将会发送到该主机 + +##### withCallbackHttpTimeoutSeconds(timeoutSeconds = 3) +Property: `hoodie.write.commit.callback.http.timeout.seconds`
+Http回调超时时间(单位秒),默认3秒 + +##### withCallbackHttpApiKey(apiKey) +Property: `hoodie.write.commit.callback.http.api.key`
+Http 回调密钥。默认 hudi_write_commit_http_callback + +#### KAFKA CALLBACK +使用Kafka发送写提交回调信息, 用户需要配置 `hoodie.write.commit.callback.class` = `org.apache.hudi.utilities.callback.kafka.HoodieWriteCommitKafkaCallback` + +##### CALLBACK_KAFKA_BOOTSTRAP_SERVERS +Property: `hoodie.write.commit.callback.kafka.bootstrap.servers`
+Kafka 集群地址 + +##### CALLBACK_KAFKA_TOPIC +Property: `hoodie.write.commit.callback.kafka.topic`
+发送回调信息的topic + +##### CALLBACK_KAFKA_PARTITION +Property: `hoodie.write.commit.callback.kafka.partition`
+指定发送的分区, 默认 0 + +##### CALLBACK_KAFKA_ACKS +Property: `hoodie.write.commit.callback.kafka.acks`
+Acks 级别, 默认 `all` + +##### CALLBACK_KAFKA_RETRIES +Property: `hoodie.write.commit.callback.kafka.retries`
+Kafka 发送数据失败重试次数. 默认 3 次 + diff --git a/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/cos_hoodie.md b/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/cos_hoodie.md new file mode 100644 index 0000000000000..043e3768619e7 --- /dev/null +++ b/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/cos_hoodie.md @@ -0,0 +1,73 @@ +--- +title: COS Filesystem +keywords: [ hudi, hive, tencent, cos, spark, presto] +summary: In this page, we go over how to configure Hudi with COS filesystem. +last_modified_at: 2020-04-21T12:50:50-10:00 +language: cn +--- +这个页面描述了如何让你的Hudi spark任务使用Tencent Cloud COS存储。 + +## Tencent Cloud COS 部署 + +为了让Hudi使用COS,需要增加两部分的配置: + +- 为Hudi增加Tencent Cloud COS的相关配置 +- 增加Jar包的MVN依赖 + +### Tencent Cloud COS 相关的配置 + +新增下面的配置到你的Hudi能访问的core-site.xml文件。使用你的COS bucket name替换掉`fs.defaultFS`,使用COS的key和secret分别替换`fs.cosn.userinfo.secretKey`和`fs.cosn.userinfo.secretId`。这样Hudi就能读写相应的bucket。 + + +```xml + + fs.defaultFS + cosn://bucketname + COS bucket name + + + + fs.cosn.userinfo.secretId + cos-secretId + Tencent Cloud Secret Id + + + + fs.cosn.userinfo.secretKey + cos-secretkey + Tencent Cloud Secret Key + + + + fs.cosn.bucket.region + ap-region + The region where the bucket is located. + + + + fs.cosn.bucket.endpoint_suffix + cos.endpoint.suffix + + COS endpoint to connect to. + For public cloud users, it is recommended not to set this option, and only the correct area field is required. + + + + + fs.cosn.impl + org.apache.hadoop.fs.CosFileSystem + The implementation class of the CosN Filesystem. + + + + fs.AbstractFileSystem.cosn.impl + org.apache.hadoop.fs.CosN + The implementation class of the CosN AbstractFileSystem. 
+ + +``` + +### Tencent Cloud COS Libs +添加COS依赖jar包到classpath + +- org.apache.hadoop:hadoop-cos:2.8.5 diff --git a/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/deployment.md b/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/deployment.md new file mode 100644 index 0000000000000..76c79cf4c8e60 --- /dev/null +++ b/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/deployment.md @@ -0,0 +1,433 @@ +--- +title: 管理 Hudi Pipelines +keywords: [ hudi, administration, operation, devops] +summary: This section offers an overview of tools available to operate an ecosystem of Hudi datasets +toc: true +last_modified_at: 2019-12-30T15:59:57-04:00 +language: cn +--- + +管理员/运维人员可以通过以下方式了解Hudi数据集/管道 + + - [通过Admin CLI进行管理](#admin-cli) + - [Graphite指标](#metrics) + - [Hudi应用程序的Spark UI](#spark-ui) + +本节简要介绍了每一种方法,并提供了有关[故障排除](#troubleshooting)的一些常规指南 + +## Admin CLI {#admin-cli} + +一旦构建了hudi,就可以通过`cd hudi-cli && ./hudi-cli.sh`启动shell。 +一个hudi数据集位于DFS上的**basePath**位置,我们需要该位置才能连接到Hudi数据集。 +Hudi库使用.hoodie子文件夹跟踪所有元数据,从而有效地在内部管理该数据集。 + +初始化hudi表,可使用如下命令。 + +```java +18/09/06 15:56:52 INFO annotation.AutowiredAnnotationBeanPostProcessor: JSR-330 'javax.inject.Inject' annotation found and supported for autowiring +============================================ +* * +* _ _ _ _ * +* | | | | | | (_) * +* | |__| | __| | - * +* | __ || | / _` | || * +* | | | || || (_| | || * +* |_| |_|\___/ \____/ || * +* * +============================================ + +Welcome to Hoodie CLI. Please type help if you are looking for help. +hudi->create --path /user/hive/warehouse/table1 --tableName hoodie_table_1 --tableType COPY_ON_WRITE +..... +18/09/06 15:57:15 INFO table.HoodieTableMetaClient: Finished Loading Table of type COPY_ON_WRITE from ... 
+``` + +使用desc命令可以查看hudi表的描述信息: + +```java +hoodie:hoodie_table_1->desc +18/09/06 15:57:19 INFO timeline.HoodieActiveTimeline: Loaded instants [] + _________________________________________________________ + | Property | Value | + |========================================================| + | basePath | ... | + | metaPath | ... | + | fileSystem | hdfs | + | hoodie.table.name | hoodie_table_1 | + | hoodie.table.type | COPY_ON_WRITE | + | hoodie.archivelog.folder| | +``` + +以下是连接到包含uber trips的Hudi数据集的示例命令。 + +```java +hoodie:trips->connect --path /app/uber/trips + +16/10/05 23:20:37 INFO model.HoodieTableMetadata: Attempting to load the commits under /app/uber/trips/.hoodie with suffix .commit +16/10/05 23:20:37 INFO model.HoodieTableMetadata: Attempting to load the commits under /app/uber/trips/.hoodie with suffix .inflight +16/10/05 23:20:37 INFO model.HoodieTableMetadata: All commits :HoodieCommits{commitList=[20161002045850, 20161002052915, 20161002055918, 20161002065317, 20161002075932, 20161002082904, 20161002085949, 20161002092936, 20161002105903, 20161002112938, 20161002123005, 20161002133002, 20161002155940, 20161002165924, 20161002172907, 20161002175905, 20161002190016, 20161002192954, 20161002195925, 20161002205935, 20161002215928, 20161002222938, 20161002225915, 20161002232906, 20161003003028, 20161003005958, 20161003012936, 20161003022924, 20161003025859, 20161003032854, 20161003042930, 20161003052911, 20161003055907, 20161003062946, 20161003065927, 20161003075924, 20161003082926, 20161003085925, 20161003092909, 20161003100010, 20161003102913, 20161003105850, 20161003112910, 20161003115851, 20161003122929, 20161003132931, 20161003142952, 20161003145856, 20161003152953, 20161003155912, 20161003162922, 20161003165852, 20161003172923, 20161003175923, 20161003195931, 20161003210118, 20161003212919, 20161003215928, 20161003223000, 20161003225858, 20161004003042, 20161004011345, 20161004015235, 20161004022234, 20161004063001, 20161004072402, 20161004074436, 
20161004080224, 20161004082928, 20161004085857, 20161004105922, 20161004122927, 20161004142929, 20161004163026, 20161004175925, 20161004194411, 20161004203202, 20161004211210, 20161004214115, 20161004220437, 20161004223020, 20161004225321, 20161004231431, 20161004233643, 20161005010227, 20161005015927, 20161005022911, 20161005032958, 20161005035939, 20161005052904, 20161005070028, 20161005074429, 20161005081318, 20161005083455, 20161005085921, 20161005092901, 20161005095936, 20161005120158, 20161005123418, 20161005125911, 20161005133107, 20161005155908, 20161005163517, 20161005165855, 20161005180127, 20161005184226, 20161005191051, 20161005193234, 20161005203112, 20161005205920, 20161005212949, 20161005223034, 20161005225920]} +Metadata for table trips loaded +hoodie:trips-> +``` + +连接到数据集后,便可使用许多其他命令。该shell程序具有上下文自动完成帮助(按TAB键),下面是所有命令的列表,本节中对其中的一些命令进行了详细示例。 + + +```java +hoodie:trips->help +* ! - Allows execution of operating system (OS) commands +* // - Inline comment markers (start of line only) +* ; - Inline comment markers (start of line only) +* addpartitionmeta - Add partition metadata to a dataset, if not present +* clear - Clears the console +* cls - Clears the console +* commit rollback - Rollback a commit +* commits compare - Compare commits with another Hoodie dataset +* commit showfiles - Show file level details of a commit +* commit showpartitions - Show partition level details of a commit +* commits refresh - Refresh the commits +* commits show - Show the commits +* commits sync - Compare commits with another Hoodie dataset +* connect - Connect to a hoodie dataset +* date - Displays the local date and time +* exit - Exits the shell +* help - List all commands usage +* quit - Exits the shell +* records deduplicate - De-duplicate a partition path contains duplicates & produce repaired files to replace with +* script - Parses the specified resource file and executes its commands +* stats filesizes - File Sizes. 
Display summary stats on sizes of files +* stats wa - Write Amplification. Ratio of how many records were upserted to how many records were actually written +* sync validate - Validate the sync by counting the number of records +* system properties - Shows the shell's properties +* utils loadClass - Load a class +* version - Displays shell version + +hoodie:trips-> +``` + + +### 检查提交 + +在Hudi中,更新或插入一批记录的任务被称为**提交**。提交可提供基本的原子性保证,即只有提交的数据可用于查询。 +每个提交都有一个单调递增的字符串/数字,称为**提交编号**。通常,这是我们开始提交的时间。 + +查看有关最近10次提交的一些基本信息, + + +```java +hoodie:trips->commits show --sortBy "Total Bytes Written" --desc true --limit 10 + ________________________________________________________________________________________________________________________________________________________________________ + | CommitTime | Total Bytes Written| Total Files Added| Total Files Updated| Total Partitions Written| Total Records Written| Total Update Records Written| Total Errors| + |=======================================================================================================================================================================| + .... + .... + .... 
+hoodie:trips-> +``` + +在每次写入开始时,Hudi还将.inflight提交写入.hoodie文件夹。您可以使用那里的时间戳来估计正在进行的提交已经花费的时间 + +```java +$ hdfs dfs -ls /app/uber/trips/.hoodie/*.inflight +-rw-r--r-- 3 vinoth supergroup 321984 2016-10-05 23:18 /app/uber/trips/.hoodie/20161005225920.inflight +``` + + +### 深入到特定的提交 + +了解写入如何分散到特定分区, + + +```java +hoodie:trips->commit showpartitions --commit 20161005165855 --sortBy "Total Bytes Written" --desc true --limit 10 + __________________________________________________________________________________________________________________________________________ + | Partition Path| Total Files Added| Total Files Updated| Total Records Inserted| Total Records Updated| Total Bytes Written| Total Errors| + |=========================================================================================================================================| + .... + .... +``` + +如果您需要文件级粒度,我们可以执行以下操作 + +```java +hoodie:trips->commit showfiles --commit 20161005165855 --sortBy "Partition Path" + ________________________________________________________________________________________________________________________________________________________ + | Partition Path| File ID | Previous Commit| Total Records Updated| Total Records Written| Total Bytes Written| Total Errors| + |=======================================================================================================================================================| + .... + .... +``` + + +### 文件系统视图 + +Hudi将每个分区视为文件组的集合,每个文件组包含按提交顺序排列的文件切片列表(请参阅概念)。以下命令允许用户查看数据集的文件切片。 + +```java + hoodie:stock_ticks_mor->show fsview all + .... 
+ _______________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________ + | Partition | FileId | Base-Instant | Data-File | Data-File Size| Num Delta Files| Total Delta File Size| Delta Files | + |==============================================================================================================================================================================================================================================================================================================================================================================================================| + | 2018/08/31| 111415c3-f26d-4639-86c8-f9956f245ac3| 20181002180759| hdfs://namenode:8020/user/hive/warehouse/stock_ticks_mor/2018/08/31/111415c3-f26d-4639-86c8-f9956f245ac3_0_20181002180759.parquet| 432.5 KB | 1 | 20.8 KB | [HoodieLogFile {hdfs://namenode:8020/user/hive/warehouse/stock_ticks_mor/2018/08/31/.111415c3-f26d-4639-86c8-f9956f245ac3_20181002180759.log.1}]| + + + + hoodie:stock_ticks_mor->show fsview latest --partitionPath| Partition | FileId | Base-Instant | Data-File | Data-File Size| Num Delta Files| Total Delta Size| Delta Size - compaction scheduled| Delta Size - compaction unscheduled| Delta To Base Ratio - compaction scheduled| Delta To Base Ratio - compaction unscheduled| Delta Files - compaction scheduled | Delta Files - compaction unscheduled| + 
|=================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================| + | 2018/08/31| 111415c3-f26d-4639-86c8-f9956f245ac3| 20181002180759| hdfs://namenode:8020/user/hive/warehouse/stock_ticks_mor/2018/08/31/111415c3-f26d-4639-86c8-f9956f245ac3_0_20181002180759.parquet| 432.5 KB | 1 | 20.8 KB | 20.8 KB | 0.0 B | 0.0 B | 0.0 B | [HoodieLogFile {hdfs://namenode:8020/user/hive/warehouse/stock_ticks_mor/2018/08/31/.111415c3-f26d-4639-86c8-f9956f245ac3_20181002180759.log.1}]| [] | + + hoodie:stock_ticks_mor-> +``` + + +### 统计信息 + +由于Hudi直接管理DFS数据集的文件大小,这些信息会帮助你全面了解Hudi的运行状况 + + +```java +hoodie:trips->stats filesizes --partitionPath 2016/09/01 --sortBy "95th" --desc true --limit 10 + ________________________________________________________________________________________________ + | CommitTime | Min | 10th | 50th | avg | 95th | Max | NumFiles| StdDev | + |===============================================================================================| + | | 93.9 MB | 93.9 MB | 93.9 MB | 93.9 MB | 93.9 MB | 93.9 MB | 2 | 2.3 KB | + .... + .... +``` + +如果Hudi写入花费的时间更长,那么可以通过观察写放大指标来发现任何异常 + +```java +hoodie:trips->stats wa + __________________________________________________________________________ + | CommitTime | Total Upserted| Total Written| Write Amplifiation Factor| + |=========================================================================| + .... + .... 
+``` + + +### 归档的提交 + +为了限制DFS上.commit文件的增长量,Hudi将较旧的.commit文件(适当考虑清理策略)归档到commits.archived文件中。 +这是一个序列文件,其包含commitNumber => json的映射,及有关提交的原始信息(上面已很好地汇总了相同的信息)。 + +### 压缩 + +要了解压缩和写程序之间的时滞,请使用以下命令列出所有待处理的压缩。 + +```java +hoodie:trips->compactions show all + ___________________________________________________________________ + | Compaction Instant Time| State | Total FileIds to be Compacted| + |==================================================================| + | | REQUESTED| 35 | + | | INFLIGHT | 27 | +``` + +要检查特定的压缩计划,请使用 + +```java +hoodie:trips->compaction show --instant + _________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________ + | Partition Path| File Id | Base Instant | Data File Path | Total Delta Files| getMetrics | + |================================================================================================================================================================================================================================================ + | 2018/07/17 | | | viewfs://ns-default/.../../UUID_.parquet | 1 | {TOTAL_LOG_FILES=1.0, TOTAL_IO_READ_MB=1230.0, TOTAL_LOG_FILES_SIZE=2.51255751E8, TOTAL_IO_WRITE_MB=991.0, TOTAL_IO_MB=2221.0}| + +``` + +要手动调度或运行压缩,请使用以下命令。该命令使用spark启动器执行压缩操作。 +注意:确保没有其他应用程序正在同时调度此数据集的压缩 + +```java +hoodie:trips->help compaction schedule +Keyword: compaction schedule +Description: Schedule Compaction + Keyword: sparkMemory + Help: Spark executor memory + Mandatory: false + Default if specified: '__NULL__' + Default if unspecified: '1G' + +* compaction schedule - Schedule Compaction +``` + +```java +hoodie:trips->help compaction run +Keyword: compaction run +Description: Run Compaction for given instant time + Keyword: tableName + Help: Table name + Mandatory: true + Default if specified: '__NULL__' + Default if unspecified: '__NULL__' + 
+ Keyword: parallelism + Help: Parallelism for hoodie compaction + Mandatory: true + Default if specified: '__NULL__' + Default if unspecified: '__NULL__' + + Keyword: schemaFilePath + Help: Path for Avro schema file + Mandatory: true + Default if specified: '__NULL__' + Default if unspecified: '__NULL__' + + Keyword: sparkMemory + Help: Spark executor memory + Mandatory: true + Default if specified: '__NULL__' + Default if unspecified: '__NULL__' + + Keyword: retry + Help: Number of retries + Mandatory: true + Default if specified: '__NULL__' + Default if unspecified: '__NULL__' + + Keyword: compactionInstant + Help: Base path for the target hoodie dataset + Mandatory: true + Default if specified: '__NULL__' + Default if unspecified: '__NULL__' + +* compaction run - Run Compaction for given instant time +``` + +### 验证压缩 + +验证压缩计划:检查压缩所需的所有文件是否都存在且有效 + +```java +hoodie:stock_ticks_mor->compaction validate --instant 20181005222611 +... + + COMPACTION PLAN VALID + + ___________________________________________________________________________________________________________________________________________________________________________________________________________________________ + | File Id | Base Instant Time| Base Data File | Num Delta Files| Valid| Error| + |==========================================================================================================================================================================================================================| + | 05320e98-9a57-4c38-b809-a6beaaeb36bd| 20181005222445 | hdfs://namenode:8020/user/hive/warehouse/stock_ticks_mor/2018/08/31/05320e98-9a57-4c38-b809-a6beaaeb36bd_0_20181005222445.parquet| 1 | true | | + + + +hoodie:stock_ticks_mor->compaction validate --instant 20181005222601 + + COMPACTION PLAN INVALID + + 
_______________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________ + | File Id | Base Instant Time| Base Data File | Num Delta Files| Valid| Error | + |=====================================================================================================================================================================================================================================================================================================| + | 05320e98-9a57-4c38-b809-a6beaaeb36bd| 20181005222445 | hdfs://namenode:8020/user/hive/warehouse/stock_ticks_mor/2018/08/31/05320e98-9a57-4c38-b809-a6beaaeb36bd_0_20181005222445.parquet| 1 | false| All log files specified in compaction operation is not present. Missing .... | +``` + +### 注意 + +必须在其他写入/摄取程序没有运行的情况下执行以下命令。 + +有时,有必要从压缩计划中删除fileId以便加快或取消压缩操作。 +压缩计划之后在此文件上发生的所有新日志文件都将被安全地重命名以便进行保留。Hudi提供以下CLI来支持 + + +### 取消调度压缩 + +```java +hoodie:trips->compaction unscheduleFileId --fileId +.... +No File renames needed to unschedule file from pending compaction. Operation successful. +``` + +在其他情况下,需要撤销整个压缩计划。以下CLI支持此功能 + +```java +hoodie:trips->compaction unschedule --instant +..... +No File renames needed to unschedule pending compaction. Operation successful. +``` + +### 修复压缩 + +上面的压缩取消调度操作有时可能会部分失败(例如:DFS暂时不可用)。 +如果发生部分故障,则压缩操作可能与文件切片的状态不一致。 +当您运行`压缩验证`时,您会注意到无效的压缩操作(如果有的话)。 +在这种情况下,修复命令将立即执行,它将重新排列文件切片,以使文件不丢失,并且文件切片与压缩计划一致 + +```java +hoodie:stock_ticks_mor->compaction repair --instant 20181005222611 +...... +Compaction successfully repaired +..... 
+``` + + +## [指标](/cn/docs/configurations#指标配置) {#metrics} + +为Hudi Client配置正确的数据集名称和指标环境后,它将生成以下graphite指标,以帮助调试hudi数据集 + + - **提交持续时间** - 这是成功提交一批记录所花费的时间 + - **回滚持续时间** - 同样,撤消失败的提交所剩余的部分数据所花费的时间(每次写入失败后都会自动发生) + - **文件级别指标** - 显示每次提交中新增、版本、删除(清除)的文件数量 + - **记录级别指标** - 每次提交插入/更新的记录总数 + - **分区级别指标** - 更新的分区数量(对于了解提交持续时间的突然峰值非常有用) + +然后可以将这些指标绘制在grafana等标准工具上。以下是提交持续时间图表示例。 + +
+ hudi_commit_duration.png +
+ + +## 故障排除 {#troubleshooting} + +以下部分通常有助于调试Hudi故障。以下元数据已被添加到每条记录中,可以通过标准Hadoop SQL引擎(Hive/Presto/Spark)检索,来更容易地诊断问题的严重性。 + + - **_hoodie_record_key** - 作为每个DFS分区内的主键,是所有更新/插入的基础 + - **_hoodie_commit_time** - 该记录上次的提交 + - **_hoodie_file_name** - 包含记录的实际文件名(对检查重复非常有用) + - **_hoodie_partition_path** - basePath的路径,该路径标识包含此记录的分区 + +请注意,到目前为止,Hudi假定应用程序为给定的recordKey传递相同的确定性分区路径。即仅在每个分区内保证recordKey(主键)的唯一性。 + +### 缺失记录 + +请在可能写入记录的窗口中,使用上面的admin命令检查是否存在任何写入错误。 +如果确实发现错误,那么记录实际上不是由Hudi写入的,而是交还给应用程序来决定如何处理。 + +### 重复 + +首先,请确保访问Hudi数据集的查询是[没有问题的](querying_data),并之后确认的确有重复。 + + - 如果确认,请使用上面的元数据字段来标识包含记录的物理文件和分区文件。 + - 如果重复的记录存在于不同分区路径下的文件,则意味着您的应用程序正在为同一recordKey生成不同的分区路径,请修复您的应用程序。 + - 如果重复的记录存在于同一分区路径下的多个文件,请使用邮件列表汇报这个问题。这不应该发生。您可以使用`records deduplicate`命令修复数据。 + +### Spark故障 {#spark-ui} + +典型的upsert() DAG如下所示。请注意,Hudi客户端会缓存中间的RDD,以智能地分析工作负载并调整文件大小和Spark并行度。 +另外,由于还显示了探针作业,Spark UI显示了两次sortByKey,但它只是一个排序。 +
+ hudi_upsert_dag.png +
+ + +概括地说,有两个步骤 + +**索引查找以标识要更改的文件** + + - Job 1 : 触发输入数据读取,转换为HoodieRecord对象,然后根据输入记录拿到目标分区路径。 + - Job 2 : 加载我们需要检查的文件名集。 + - Job 3 & 4 : 通过联合上面1和2中的RDD,智能调整spark join并行度,然后进行实际查找。 + - Job 5 : 生成带有位置的recordKeys作为标记的RDD。 + +**执行数据的实际写入** + + - Job 6 : 将记录与recordKey(位置)进行懒惰连接,以提供最终的HoodieRecord集,现在它包含每条记录的文件/分区路径信息(如果插入,则为null)。然后还要再次分析工作负载以确定文件的大小。 + - Job 7 : 实际写入数据(更新 + 插入 + 插入转为更新以保持文件大小) + +根据异常源(Hudi/Spark),上述关于DAG的信息可用于查明实际问题。最常遇到的故障是由YARN/DFS临时故障引起的。 +将来,将在项目中添加更复杂的调试/管理UI,以帮助自动进行某些调试。 diff --git a/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/docker_demo.md b/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/docker_demo.md new file mode 100644 index 0000000000000..eea0e884d0863 --- /dev/null +++ b/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/docker_demo.md @@ -0,0 +1,1122 @@ +--- +title: Docker Demo +keywords: [ hudi, docker, demo] +toc: true +last_modified_at: 2019-12-30T15:59:57-04:00 +language: cn +--- + +## 一个使用 Docker 容器的 Demo + +我们来使用一个真实世界的案例,来看看 Hudi 是如何闭环运转的。 为了这个目的,在你的计算机中的本地 Docker 集群中组建了一个自包含的数据基础设施。 + +以下步骤已经在一台 Mac 笔记本电脑上测试过了。 + +### 前提条件 + + * Docker 安装 : 对于 Mac ,请依照 [https://docs.docker.com/v17.12/docker-for-mac/install/] 当中定义的步骤。 为了运行 Spark-SQL 查询,请确保至少分配给 Docker 6 GB 和 4 个 CPU 。(参见 Docker -> Preferences -> Advanced)。否则,Spark-SQL 查询可能因为内存问题而被杀停。 + * kafkacat : 一个用于发布/消费 Kafka Topic 的命令行工具集。使用 `brew install kafkacat` 来安装 kafkacat 。 + * /etc/hosts : Demo 通过主机名引用了多个运行在容器中的服务。将下列设置添加到 /etc/hosts : + + +```java + 127.0.0.1 adhoc-1 + 127.0.0.1 adhoc-2 + 127.0.0.1 namenode + 127.0.0.1 datanode1 + 127.0.0.1 hiveserver + 127.0.0.1 hivemetastore + 127.0.0.1 kafkabroker + 127.0.0.1 sparkmaster + 127.0.0.1 zookeeper +``` + +此外,这未在其它一些环境中进行测试,例如 Windows 上的 Docker 。 + + +## 设置 Docker 集群 + + +### 构建 Hudi + +构建 Hudi 的第一步: +```java +cd +mvn package -DskipTests +``` + +### 组建 Demo 集群 + +下一步是运行 Docker 安装脚本并设置配置项以便组建集群。 +这需要从 Docker 镜像库拉取 Docker 镜像,并设置 Docker 集群。 + +```java +cd docker +./setup_demo.sh +.... +....
+.... +Stopping spark-worker-1 ... done +Stopping hiveserver ... done +Stopping hivemetastore ... done +Stopping historyserver ... done +....... +...... +Creating network "hudi_demo" with the default driver +Creating hive-metastore-postgresql ... done +Creating namenode ... done +Creating zookeeper ... done +Creating kafkabroker ... done +Creating hivemetastore ... done +Creating historyserver ... done +Creating hiveserver ... done +Creating datanode1 ... done +Creating presto-coordinator-1 ... done +Creating sparkmaster ... done +Creating presto-worker-1 ... done +Creating adhoc-1 ... done +Creating adhoc-2 ... done +Creating spark-worker-1 ... done +Copying spark default config and setting up configs +Copying spark default config and setting up configs +Copying spark default config and setting up configs +$ docker ps +``` + +至此, Docker 集群将会启动并运行。 Demo 集群提供了下列服务: + + * HDFS 服务( NameNode, DataNode ) + * Spark Master 和 Worker + * Hive 服务( Metastore, HiveServer2 以及 PostgresDB ) + * Kafka Broker 和一个 Zookeeper Node ( Kafka 将被用来当做 Demo 的上游数据源 ) + * 用来运行 Hudi/Hive CLI 命令的 Adhoc 容器 + +## Demo + +Stock Tracker 数据将用来展示不同的 Hudi 视图以及压缩带来的影响。 + +看一下 `docker/demo/data` 目录。那里有 2 批股票数据——都是 1 分钟粒度的。 +第 1 批数据包含一些股票代码在交易窗口(9:30 a.m 至 10:30 a.m)的第一个小时里的行情数据。第 2 批包含接下来 30 分钟(10:30 - 11 a.m)的交易数据。 Hudi 将被用来将两个批次的数据采集到一个数据集中,这个数据集将会包含最新的小时级股票行情数据。 +两个批次被有意地按窗口切分,这样在第 2 批数据中包含了一些针对第 1 批数据条目的更新数据。 + +### Step 1 : 将第 1 批数据发布到 Kafka + +将第 1 批数据上传到 Kafka 的 Topic “stock_ticks” 中 `cat docker/demo/data/batch_1.json | kafkacat -b kafkabroker -t stock_ticks -P` + +为了检查新的 Topic 是否出现,使用 +```java +kafkacat -b kafkabroker -L -J | jq .
+{ + "originating_broker": { + "id": 1001, + "name": "kafkabroker:9092/1001" + }, + "query": { + "topic": "*" + }, + "brokers": [ + { + "id": 1001, + "name": "kafkabroker:9092" + } + ], + "topics": [ + { + "topic": "stock_ticks", + "partitions": [ + { + "partition": 0, + "leader": 1001, + "replicas": [ + { + "id": 1001 + } + ], + "isrs": [ + { + "id": 1001 + } + ] + } + ] + } + ] +} + +``` + +### Step 2: 从 Kafka Topic 中增量采集数据 + +Hudi 自带一个名为 DeltaStreamer 的工具。 这个工具能连接多种数据源(包括 Kafka),以便拉取变更,并通过 upsert/insert 操作应用到 Hudi 数据集。此处,我们将使用这个工具从 Kafka Topic 下载 JSON 数据,并采集到前面步骤中初始化的 COW 和 MOR 表中。如果数据集不存在,这个工具将自动初始化数据集到文件系统中。 + +```java +docker exec -it adhoc-2 /bin/bash + +# Run the following spark-submit command to execute the delta-streamer and ingest to stock_ticks_cow dataset in HDFS +spark-submit --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer $HUDI_UTILITIES_BUNDLE --storage-type COPY_ON_WRITE --source-class org.apache.hudi.utilities.sources.JsonKafkaSource --source-ordering-field ts --target-base-path /user/hive/warehouse/stock_ticks_cow --target-table stock_ticks_cow --props /var/demo/config/kafka-source.properties --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider + + +# Run the following spark-submit command to execute the delta-streamer and ingest to stock_ticks_mor dataset in HDFS +spark-submit --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer $HUDI_UTILITIES_BUNDLE --storage-type MERGE_ON_READ --source-class org.apache.hudi.utilities.sources.JsonKafkaSource --source-ordering-field ts --target-base-path /user/hive/warehouse/stock_ticks_mor --target-table stock_ticks_mor --props /var/demo/config/kafka-source.properties --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider --disable-compaction + + +# As part of the setup (Look at setup_demo.sh), the configs needed for DeltaStreamer is uploaded to HDFS. 
The configs +# contain mostly Kafka connectivity settings, the avro-schema to be used for ingesting along with key and partitioning fields. + +exit +``` + +你可以使用 HDFS 的 Web 浏览器来查看数据集 +`http://namenode:50070/explorer#/user/hive/warehouse/stock_ticks_cow`. + +你可以浏览在数据集中新创建的分区文件夹,同时还有一个在 .hoodie 目录下的 deltacommit 文件。 + +在 MOR 数据集中也有类似的设置 +`http://namenode:50070/explorer#/user/hive/warehouse/stock_ticks_mor` + + +### Step 3: 与 Hive 同步 + +到了这一步,数据集在 HDFS 中可用。我们需要与 Hive 同步来创建新 Hive 表并添加分区,以便在那些数据集上执行 Hive 查询。 + +```java +docker exec -it adhoc-2 /bin/bash + +# This command takes in HiveServer URL and COW Hudi Dataset location in HDFS and syncs the HDFS state to Hive +/var/hoodie/ws/hudi-sync/hudi-hive-sync/run_sync_tool.sh --jdbc-url jdbc:hive2://hiveserver:10000 --user hive --pass hive --partitioned-by dt --base-path /user/hive/warehouse/stock_ticks_cow --database default --table stock_ticks_cow +..... +2018-09-24 22:22:45,568 INFO [main] hive.HiveSyncTool (HiveSyncTool.java:syncHoodieTable(112)) - Sync complete for stock_ticks_cow +..... + +# Now run hive-sync for the second data-set in HDFS using Merge-On-Read (MOR storage) +/var/hoodie/ws/hudi-sync/hudi-hive-sync/run_sync_tool.sh --jdbc-url jdbc:hive2://hiveserver:10000 --user hive --pass hive --partitioned-by dt --base-path /user/hive/warehouse/stock_ticks_mor --database default --table stock_ticks_mor +... +2018-09-24 22:23:09,171 INFO [main] hive.HiveSyncTool (HiveSyncTool.java:syncHoodieTable(112)) - Sync complete for stock_ticks_mor +... +2018-09-24 22:23:09,559 INFO [main] hive.HiveSyncTool (HiveSyncTool.java:syncHoodieTable(112)) - Sync complete for stock_ticks_mor_rt +.... +exit +``` +执行了以上命令后,你会发现: + +1. 一个名为 `stock_ticks_cow` 的 Hive 表被创建,它为写时复制数据集提供了读优化视图。 +2.
两个新表 `stock_ticks_mor` 和 `stock_ticks_mor_rt` 被创建用于读时合并数据集。 前者为 Hudi 数据集提供了读优化视图,而后者为数据集提供了实时视图。 + + +### Step 4 (a): 运行 Hive 查询 + +执行一个 Hive 查询来为股票 GOOG 找到采集到的最新时间戳。你会注意到读优化视图( COW 和 MOR 数据集都是如此)和实时视图(仅对 MOR 数据集)给出了相同的值 “10:29 a.m”,这是因为 Hudi 为每个批次的数据创建了一个 Parquet 文件。 + +```java +docker exec -it adhoc-2 /bin/bash +beeline -u jdbc:hive2://hiveserver:10000 --hiveconf hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat --hiveconf hive.stats.autogather=false +# List Tables +0: jdbc:hive2://hiveserver:10000> show tables; ++---------------------+--+ +| tab_name | ++---------------------+--+ +| stock_ticks_cow | +| stock_ticks_mor | +| stock_ticks_mor_rt | ++---------------------+--+ +2 rows selected (0.801 seconds) +0: jdbc:hive2://hiveserver:10000> + + +# Look at partitions that were added +0: jdbc:hive2://hiveserver:10000> show partitions stock_ticks_mor_rt; ++----------------+--+ +| partition | ++----------------+--+ +| dt=2018-08-31 | ++----------------+--+ +1 row selected (0.24 seconds) + + +# COPY-ON-WRITE Queries: +========================= + + +0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_cow group by symbol HAVING symbol = 'GOOG'; ++---------+----------------------+--+ +| symbol | _c1 | ++---------+----------------------+--+ +| GOOG | 2018-08-31 10:29:00 | ++---------+----------------------+--+ + +Now, run a projection query: + +0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_cow where symbol = 'GOOG'; ++----------------------+---------+----------------------+---------+------------+-----------+--+ +| _hoodie_commit_time | symbol | ts | volume | open | close | ++----------------------+---------+----------------------+---------+------------+-----------+--+ +| 20180924221953 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 | +| 20180924221953 | GOOG | 2018-08-31 10:29:00 | 3391 | 1230.1899 | 1230.085 | 
++----------------------+---------+----------------------+---------+------------+-----------+--+ + + +# Merge-On-Read Queries: +========================== + +Lets run similar queries against M-O-R dataset. Lets look at both +ReadOptimized and Realtime views supported by M-O-R dataset + +# Run against ReadOptimized View. Notice that the latest timestamp is 10:29 +0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_mor group by symbol HAVING symbol = 'GOOG'; +WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases. ++---------+----------------------+--+ +| symbol | _c1 | ++---------+----------------------+--+ +| GOOG | 2018-08-31 10:29:00 | ++---------+----------------------+--+ +1 row selected (6.326 seconds) + + +# Run against Realtime View. Notice that the latest timestamp is again 10:29 + +0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_mor_rt group by symbol HAVING symbol = 'GOOG'; +WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases. 
++---------+----------------------+--+ +| symbol | _c1 | ++---------+----------------------+--+ +| GOOG | 2018-08-31 10:29:00 | ++---------+----------------------+--+ +1 row selected (1.606 seconds) + + +# Run projection query against Read Optimized and Realtime tables + +0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor where symbol = 'GOOG'; ++----------------------+---------+----------------------+---------+------------+-----------+--+ +| _hoodie_commit_time | symbol | ts | volume | open | close | ++----------------------+---------+----------------------+---------+------------+-----------+--+ +| 20180924222155 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 | +| 20180924222155 | GOOG | 2018-08-31 10:29:00 | 3391 | 1230.1899 | 1230.085 | ++----------------------+---------+----------------------+---------+------------+-----------+--+ + +0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_rt where symbol = 'GOOG'; ++----------------------+---------+----------------------+---------+------------+-----------+--+ +| _hoodie_commit_time | symbol | ts | volume | open | close | ++----------------------+---------+----------------------+---------+------------+-----------+--+ +| 20180924222155 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 | +| 20180924222155 | GOOG | 2018-08-31 10:29:00 | 3391 | 1230.1899 | 1230.085 | ++----------------------+---------+----------------------+---------+------------+-----------+--+ + +exit +exit +``` + +### Step 4 (b): 执行 Spark-SQL 查询 +Hudi 支持以 Spark 作为类似 Hive 的查询引擎。这是在 Spark-SQL 中执行与 Hive 相同的查询 + +```java +docker exec -it adhoc-1 /bin/bash +$SPARK_INSTALL/bin/spark-shell --jars $HUDI_SPARK_BUNDLE --master local[2] --driver-class-path $HADOOP_CONF_DIR --conf spark.sql.hive.convertMetastoreParquet=false --deploy-mode client --driver-memory 1G --executor-memory 3G --num-executors 1 --packages
com.databricks:spark-avro_2.11:4.0.0 +... + +Welcome to + ____ __ + / __/__ ___ _____/ /__ + _\ \/ _ \/ _ `/ __/ '_/ + /___/ .__/\_,_/_/ /_/\_\ version 2.3.1 + /_/ + +Using Scala version 2.11.8 (Java HotSpot(TM) 64-Bit Server VM, Java 1.8.0_181) +Type in expressions to have them evaluated. +Type :help for more information. + +scala> +scala> spark.sql("show tables").show(100, false) ++--------+------------------+-----------+ +|database|tableName |isTemporary| ++--------+------------------+-----------+ +|default |stock_ticks_cow |false | +|default |stock_ticks_mor |false | +|default |stock_ticks_mor_rt|false | ++--------+------------------+-----------+ + +# Copy-On-Write Table + +## Run max timestamp query against COW table + +scala> spark.sql("select symbol, max(ts) from stock_ticks_cow group by symbol HAVING symbol = 'GOOG'").show(100, false) +[Stage 0:> (0 + 1) / 1]SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder". +SLF4J: Defaulting to no-operation (NOP) logger implementation +SLF4J: See http://www.slf4j.org/codes#StaticLoggerBinder for further details. ++------+-------------------+ +|symbol|max(ts) | ++------+-------------------+ +|GOOG |2018-08-31 10:29:00| ++------+-------------------+ + +## Projection Query + +scala> spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_cow where symbol = 'GOOG'").show(100, false) ++-------------------+------+-------------------+------+---------+--------+ +|_hoodie_commit_time|symbol|ts |volume|open |close | ++-------------------+------+-------------------+------+---------+--------+ +|20180924221953 |GOOG |2018-08-31 09:59:00|6330 |1230.5 |1230.02 | +|20180924221953 |GOOG |2018-08-31 10:29:00|3391 |1230.1899|1230.085| ++-------------------+------+-------------------+------+---------+--------+ + +# Merge-On-Read Queries: +========================== + +Lets run similar queries against M-O-R dataset. 
Lets look at both +ReadOptimized and Realtime views supported by M-O-R dataset + +# Run against ReadOptimized View. Notice that the latest timestamp is 10:29 +scala> spark.sql("select symbol, max(ts) from stock_ticks_mor group by symbol HAVING symbol = 'GOOG'").show(100, false) ++------+-------------------+ +|symbol|max(ts) | ++------+-------------------+ +|GOOG |2018-08-31 10:29:00| ++------+-------------------+ + + +# Run against Realtime View. Notice that the latest timestamp is again 10:29 + +scala> spark.sql("select symbol, max(ts) from stock_ticks_mor_rt group by symbol HAVING symbol = 'GOOG'").show(100, false) ++------+-------------------+ +|symbol|max(ts) | ++------+-------------------+ +|GOOG |2018-08-31 10:29:00| ++------+-------------------+ + +# Run projection query against Read Optimized and Realtime tables + +scala> spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor where symbol = 'GOOG'").show(100, false) ++-------------------+------+-------------------+------+---------+--------+ +|_hoodie_commit_time|symbol|ts |volume|open |close | ++-------------------+------+-------------------+------+---------+--------+ +|20180924222155 |GOOG |2018-08-31 09:59:00|6330 |1230.5 |1230.02 | +|20180924222155 |GOOG |2018-08-31 10:29:00|3391 |1230.1899|1230.085| ++-------------------+------+-------------------+------+---------+--------+ + +scala> spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_rt where symbol = 'GOOG'").show(100, false) ++-------------------+------+-------------------+------+---------+--------+ +|_hoodie_commit_time|symbol|ts |volume|open |close | ++-------------------+------+-------------------+------+---------+--------+ +|20180924222155 |GOOG |2018-08-31 09:59:00|6330 |1230.5 |1230.02 | +|20180924222155 |GOOG |2018-08-31 10:29:00|3391 |1230.1899|1230.085| ++-------------------+------+-------------------+------+---------+--------+ + +``` + +### Step 4 (c): 执行 
Presto 查询 + +这是 Presto 查询,它们与 Hive 和 Spark 的查询类似。目前 Hudi 的实时视图不支持 Presto 。 + +```java +docker exec -it presto-worker-1 presto --server presto-coordinator-1:8090 +presto> show catalogs; + Catalog +----------- + hive + jmx + localfile + system +(4 rows) + +Query 20190817_134851_00000_j8rcz, FINISHED, 1 node +Splits: 19 total, 19 done (100.00%) +0:04 [0 rows, 0B] [0 rows/s, 0B/s] + +presto> use hive.default; +USE +presto:default> show tables; + Table +-------------------- + stock_ticks_cow + stock_ticks_mor + stock_ticks_mor_rt +(3 rows) + +Query 20190822_181000_00001_segyw, FINISHED, 2 nodes +Splits: 19 total, 19 done (100.00%) +0:05 [3 rows, 99B] [0 rows/s, 18B/s] + + +# COPY-ON-WRITE Queries: +========================= + + +presto:default> select symbol, max(ts) from stock_ticks_cow group by symbol HAVING symbol = 'GOOG'; + symbol | _col1 +--------+--------------------- + GOOG | 2018-08-31 10:29:00 +(1 row) + +Query 20190822_181011_00002_segyw, FINISHED, 1 node +Splits: 49 total, 49 done (100.00%) +0:12 [197 rows, 613B] [16 rows/s, 50B/s] + +presto:default> select "_hoodie_commit_time", symbol, ts, volume, open, close from stock_ticks_cow where symbol = 'GOOG'; + _hoodie_commit_time | symbol | ts | volume | open | close +---------------------+--------+---------------------+--------+-----------+---------- + 20190822180221 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 + 20190822180221 | GOOG | 2018-08-31 10:29:00 | 3391 | 1230.1899 | 1230.085 +(2 rows) + +Query 20190822_181141_00003_segyw, FINISHED, 1 node +Splits: 17 total, 17 done (100.00%) +0:02 [197 rows, 613B] [109 rows/s, 341B/s] + + +# Merge-On-Read Queries: +========================== + +Lets run similar queries against M-O-R dataset. + +# Run against ReadOptimized View. 
Notice that the latest timestamp is 10:29 +presto:default> select symbol, max(ts) from stock_ticks_mor group by symbol HAVING symbol = 'GOOG'; + symbol | _col1 +--------+--------------------- + GOOG | 2018-08-31 10:29:00 +(1 row) + +Query 20190822_181158_00004_segyw, FINISHED, 1 node +Splits: 49 total, 49 done (100.00%) +0:02 [197 rows, 613B] [110 rows/s, 343B/s] + + +presto:default> select "_hoodie_commit_time", symbol, ts, volume, open, close from stock_ticks_mor where symbol = 'GOOG'; + _hoodie_commit_time | symbol | ts | volume | open | close +---------------------+--------+---------------------+--------+-----------+---------- + 20190822180250 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 + 20190822180250 | GOOG | 2018-08-31 10:29:00 | 3391 | 1230.1899 | 1230.085 +(2 rows) + +Query 20190822_181256_00006_segyw, FINISHED, 1 node +Splits: 17 total, 17 done (100.00%) +0:02 [197 rows, 613B] [92 rows/s, 286B/s] + +presto:default> exit +``` + +### Step 5: 将第 2 批次上传到 Kafka 并运行 DeltaStreamer 进行采集 + +上传第 2 批次数据,并使用 DeltaStreamer 采集。由于这个批次不会引入任何新分区,因此不需要执行 Hive 同步。 + +```java +cat docker/demo/data/batch_2.json | kafkacat -b kafkabroker -t stock_ticks -P + +# Within Docker container, run the ingestion command +docker exec -it adhoc-2 /bin/bash + +# Run the following spark-submit command to execute the delta-streamer and ingest to stock_ticks_cow dataset in HDFS +spark-submit --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer $HUDI_UTILITIES_BUNDLE --storage-type COPY_ON_WRITE --source-class org.apache.hudi.utilities.sources.JsonKafkaSource --source-ordering-field ts --target-base-path /user/hive/warehouse/stock_ticks_cow --target-table stock_ticks_cow --props /var/demo/config/kafka-source.properties --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider + + +# Run the following spark-submit command to execute the delta-streamer and ingest to stock_ticks_mor dataset in HDFS +spark-submit --class 
org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer $HUDI_UTILITIES_BUNDLE --storage-type MERGE_ON_READ --source-class org.apache.hudi.utilities.sources.JsonKafkaSource --source-ordering-field ts --target-base-path /user/hive/warehouse/stock_ticks_mor --target-table stock_ticks_mor --props /var/demo/config/kafka-source.properties --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider --disable-compaction + +exit +``` + +使用写时复制表, DeltaStreamer 的第 2 批数据采集将导致 Parquet 文件创建一个新版本。 +参考: `http://namenode:50070/explorer#/user/hive/warehouse/stock_ticks_cow/2018/08/31` + +使用读时合并表, 第 2 批数据采集仅仅将数据追加到没有合并的 delta (日志) 文件中。看一下 HDFS 文件系统来了解这一点: `http://namenode:50070/explorer#/user/hive/warehouse/stock_ticks_mor/2018/08/31` + +### Step 6 (a): 执行 Hive 查询 + +使用写时复制表,在每一个批次被提交采集并创建新版本的 Parquet 文件时,读优化视图会立即发现变更,这些变更被当作第 2 批次的一部分。 + +使用读时合并表,第 2 批数据采集仅仅将数据追加到没有合并的 delta (日志) 文件中。 +此时,读优化视图和实时视图将提供不同的结果。读优化视图仍会返回“10:29 am”,因为它只会从 Parquet 文件中读取。实时视图会做即时合并并返回最新提交的数据,即“10:59 a.m”。 + +```java +docker exec -it adhoc-2 /bin/bash +beeline -u jdbc:hive2://hiveserver:10000 --hiveconf hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat --hiveconf hive.stats.autogather=false + +# Copy On Write Table: + +0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_cow group by symbol HAVING symbol = 'GOOG'; +WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases.
++---------+----------------------+--+ +| symbol | _c1 | ++---------+----------------------+--+ +| GOOG | 2018-08-31 10:59:00 | ++---------+----------------------+--+ +1 row selected (1.932 seconds) + +0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_cow where symbol = 'GOOG'; ++----------------------+---------+----------------------+---------+------------+-----------+--+ +| _hoodie_commit_time | symbol | ts | volume | open | close | ++----------------------+---------+----------------------+---------+------------+-----------+--+ +| 20180924221953 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 | +| 20180924224524 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 | ++----------------------+---------+----------------------+---------+------------+-----------+--+ + +As you can notice, the above queries now reflect the changes that came as part of ingesting second batch. + + +# Merge On Read Table: + +# Read Optimized View +0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_mor group by symbol HAVING symbol = 'GOOG'; +WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases. 
++---------+----------------------+--+ +| symbol | _c1 | ++---------+----------------------+--+ +| GOOG | 2018-08-31 10:29:00 | ++---------+----------------------+--+ +1 row selected (1.6 seconds) + +0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor where symbol = 'GOOG'; ++----------------------+---------+----------------------+---------+------------+-----------+--+ +| _hoodie_commit_time | symbol | ts | volume | open | close | ++----------------------+---------+----------------------+---------+------------+-----------+--+ +| 20180924222155 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 | +| 20180924222155 | GOOG | 2018-08-31 10:29:00 | 3391 | 1230.1899 | 1230.085 | ++----------------------+---------+----------------------+---------+------------+-----------+--+ + +# Realtime View +0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_mor_rt group by symbol HAVING symbol = 'GOOG'; +WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases. 
++---------+----------------------+--+ +| symbol | _c1 | ++---------+----------------------+--+ +| GOOG | 2018-08-31 10:59:00 | ++---------+----------------------+--+ + +0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_rt where symbol = 'GOOG'; ++----------------------+---------+----------------------+---------+------------+-----------+--+ +| _hoodie_commit_time | symbol | ts | volume | open | close | ++----------------------+---------+----------------------+---------+------------+-----------+--+ +| 20180924222155 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 | +| 20180924224537 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 | ++----------------------+---------+----------------------+---------+------------+-----------+--+ + +exit +exit +``` + +### Step 6 (b): 执行 Spark SQL 查询 + +以 Spark SQL 执行类似的查询: + +```java +docker exec -it adhoc-1 /bin/bash +bash-4.4# $SPARK_INSTALL/bin/spark-shell --jars $HUDI_SPARK_BUNDLE --driver-class-path $HADOOP_CONF_DIR --conf spark.sql.hive.convertMetastoreParquet=false --deploy-mode client --driver-memory 1G --master local[2] --executor-memory 3G --num-executors 1 --packages com.databricks:spark-avro_2.11:4.0.0 + +# Copy On Write Table: + +scala> spark.sql("select symbol, max(ts) from stock_ticks_cow group by symbol HAVING symbol = 'GOOG'").show(100, false) ++------+-------------------+ +|symbol|max(ts) | ++------+-------------------+ +|GOOG |2018-08-31 10:59:00| ++------+-------------------+ + +scala> spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_cow where symbol = 'GOOG'").show(100, false) + ++----------------------+---------+----------------------+---------+------------+-----------+--+ +| _hoodie_commit_time | symbol | ts | volume | open | close | ++----------------------+---------+----------------------+---------+------------+-----------+--+ +| 20180924221953 | GOOG | 2018-08-31 09:59:00 | 6330 | 
1230.5 | 1230.02 | +| 20180924224524 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 | ++----------------------+---------+----------------------+---------+------------+-----------+--+ + +As you can notice, the above queries now reflect the changes that came as part of ingesting second batch. + + +# Merge On Read Table: + +# Read Optimized View +scala> spark.sql("select symbol, max(ts) from stock_ticks_mor group by symbol HAVING symbol = 'GOOG'").show(100, false) ++---------+----------------------+--+ +| symbol | _c1 | ++---------+----------------------+--+ +| GOOG | 2018-08-31 10:29:00 | ++---------+----------------------+--+ +1 row selected (1.6 seconds) + +scala> spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor where symbol = 'GOOG'").show(100, false) ++----------------------+---------+----------------------+---------+------------+-----------+--+ +| _hoodie_commit_time | symbol | ts | volume | open | close | ++----------------------+---------+----------------------+---------+------------+-----------+--+ +| 20180924222155 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 | +| 20180924222155 | GOOG | 2018-08-31 10:29:00 | 3391 | 1230.1899 | 1230.085 | ++----------------------+---------+----------------------+---------+------------+-----------+--+ + +# Realtime View +scala> spark.sql("select symbol, max(ts) from stock_ticks_mor_rt group by symbol HAVING symbol = 'GOOG'").show(100, false) ++---------+----------------------+--+ +| symbol | _c1 | ++---------+----------------------+--+ +| GOOG | 2018-08-31 10:59:00 | ++---------+----------------------+--+ + +scala> spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_rt where symbol = 'GOOG'").show(100, false) ++----------------------+---------+----------------------+---------+------------+-----------+--+ +| _hoodie_commit_time | symbol | ts | volume | open | close | 
++----------------------+---------+----------------------+---------+------------+-----------+--+ +| 20180924222155 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 | +| 20180924224537 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 | ++----------------------+---------+----------------------+---------+------------+-----------+--+ + +exit +exit +``` + +### Step 6 (c): 执行 Presto 查询 + +在 Presto 中为读优化视图执行类似的查询: + + +```java +docker exec -it presto-worker-1 presto --server presto-coordinator-1:8090 +presto> use hive.default; +USE + +# Copy On Write Table: + +presto:default>select symbol, max(ts) from stock_ticks_cow group by symbol HAVING symbol = 'GOOG'; + symbol | _col1 +--------+--------------------- + GOOG | 2018-08-31 10:59:00 +(1 row) + +Query 20190822_181530_00007_segyw, FINISHED, 1 node +Splits: 49 total, 49 done (100.00%) +0:02 [197 rows, 613B] [125 rows/s, 389B/s] + +presto:default>select "_hoodie_commit_time", symbol, ts, volume, open, close from stock_ticks_cow where symbol = 'GOOG'; + _hoodie_commit_time | symbol | ts | volume | open | close +---------------------+--------+---------------------+--------+-----------+---------- + 20190822180221 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 + 20190822181433 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 +(2 rows) + +Query 20190822_181545_00008_segyw, FINISHED, 1 node +Splits: 17 total, 17 done (100.00%) +0:02 [197 rows, 613B] [106 rows/s, 332B/s] + +As you can notice, the above queries now reflect the changes that came as part of ingesting second batch. 
+ + +# Merge On Read Table: + +# Read Optimized View +presto:default> select symbol, max(ts) from stock_ticks_mor group by symbol HAVING symbol = 'GOOG'; + symbol | _col1 +--------+--------------------- + GOOG | 2018-08-31 10:29:00 +(1 row) + +Query 20190822_181602_00009_segyw, FINISHED, 1 node +Splits: 49 total, 49 done (100.00%) +0:01 [197 rows, 613B] [139 rows/s, 435B/s] + +presto:default>select "_hoodie_commit_time", symbol, ts, volume, open, close from stock_ticks_mor where symbol = 'GOOG'; + _hoodie_commit_time | symbol | ts | volume | open | close +---------------------+--------+---------------------+--------+-----------+---------- + 20190822180250 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 + 20190822180250 | GOOG | 2018-08-31 10:29:00 | 3391 | 1230.1899 | 1230.085 +(2 rows) + +Query 20190822_181615_00010_segyw, FINISHED, 1 node +Splits: 17 total, 17 done (100.00%) +0:01 [197 rows, 613B] [154 rows/s, 480B/s] + +presto:default> exit +``` + + +### Step 7 : 写时复制表的增量查询 + +使用采集的两个批次的数据,我们展示 Hudi 写时复制数据集中支持的增量查询。 + +我们使用类似的工程查询样例: + +```java +docker exec -it adhoc-2 /bin/bash +beeline -u jdbc:hive2://hiveserver:10000 --hiveconf hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat --hiveconf hive.stats.autogather=false + +0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_cow where symbol = 'GOOG'; ++----------------------+---------+----------------------+---------+------------+-----------+--+ +| _hoodie_commit_time | symbol | ts | volume | open | close | ++----------------------+---------+----------------------+---------+------------+-----------+--+ +| 20180924064621 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 | +| 20180924065039 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 | ++----------------------+---------+----------------------+---------+------------+-----------+--+ +``` + +正如你在上面的查询中看到的,有两个提交——按时间线排列是 20180924064621 和 20180924065039 。 
+当你按照这些步骤执行后,你的提交会得到不同的时间戳。将它们替换到上面时间戳的位置。 + +为了展示增量查询的影响,我们假设有一位读者已经在第 1 批数据中一部分看到了变化。那么,为了让读者看到第 2 批数据的影响,他/她需要保留第 1 批次提交时间中的开始时间( 20180924064621 )并执行增量查询: + +Hudi 的增量模式为增量查询提供了高效的扫描,通过 Hudi 管理的元数据,过滤掉了那些不包含候选记录的文件。 + +```java +docker exec -it adhoc-2 /bin/bash +beeline -u jdbc:hive2://hiveserver:10000 --hiveconf hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat --hiveconf hive.stats.autogather=false +0: jdbc:hive2://hiveserver:10000> set hoodie.stock_ticks_cow.consume.mode=INCREMENTAL; +No rows affected (0.009 seconds) +0: jdbc:hive2://hiveserver:10000> set hoodie.stock_ticks_cow.consume.max.commits=3; +No rows affected (0.009 seconds) +0: jdbc:hive2://hiveserver:10000> set hoodie.stock_ticks_cow.consume.start.timestamp=20180924064621; +``` + +使用上面的设置,那些在提交 20180924065039 之后没有任何更新的文件ID将被过滤掉,不进行扫描。 +以下是增量查询: + +```java +0: jdbc:hive2://hiveserver:10000> +0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_cow where symbol = 'GOOG' and `_hoodie_commit_time` > '20180924064621'; ++----------------------+---------+----------------------+---------+------------+-----------+--+ +| _hoodie_commit_time | symbol | ts | volume | open | close | ++----------------------+---------+----------------------+---------+------------+-----------+--+ +| 20180924065039 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 | ++----------------------+---------+----------------------+---------+------------+-----------+--+ +1 row selected (0.83 seconds) +0: jdbc:hive2://hiveserver:10000> +``` + +### 使用 Spark SQL 做增量查询 +```java +docker exec -it adhoc-1 /bin/bash +bash-4.4# $SPARK_INSTALL/bin/spark-shell --jars $HUDI_SPARK_BUNDLE --driver-class-path $HADOOP_CONF_DIR --conf spark.sql.hive.convertMetastoreParquet=false --deploy-mode client --driver-memory 1G --master local[2] --executor-memory 3G --num-executors 1 --packages com.databricks:spark-avro_2.11:4.0.0 +Welcome to + ____ __ + / __/__ ___ _____/ /__ + _\ \/ _ 
\/ _ `/ __/ '_/ + /___/ .__/\_,_/_/ /_/\_\ version 2.3.1 + /_/ + +Using Scala version 2.11.8 (Java HotSpot(TM) 64-Bit Server VM, Java 1.8.0_181) +Type in expressions to have them evaluated. +Type :help for more information. + +scala> import org.apache.hudi.DataSourceReadOptions +import org.apache.hudi.DataSourceReadOptions + +# In the below query, 20180925045257 is the first commit's timestamp +scala> val hoodieIncViewDF = spark.read.format("org.apache.hudi").option(DataSourceReadOptions.VIEW_TYPE_OPT_KEY, DataSourceReadOptions.VIEW_TYPE_INCREMENTAL_OPT_VAL).option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY, "20180924064621").load("/user/hive/warehouse/stock_ticks_cow") +SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder". +SLF4J: Defaulting to no-operation (NOP) logger implementation +SLF4J: See http://www.slf4j.org/codes#StaticLoggerBinder for further details. +hoodieIncViewDF: org.apache.spark.sql.DataFrame = [_hoodie_commit_time: string, _hoodie_commit_seqno: string ... 
15 more fields] + +scala> hoodieIncViewDF.registerTempTable("stock_ticks_cow_incr_tmp1") +warning: there was one deprecation warning; re-run with -deprecation for details + +scala> spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_cow_incr_tmp1 where symbol = 'GOOG'").show(100, false); ++----------------------+---------+----------------------+---------+------------+-----------+--+ +| _hoodie_commit_time | symbol | ts | volume | open | close | ++----------------------+---------+----------------------+---------+------------+-----------+--+ +| 20180924065039 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 | ++----------------------+---------+----------------------+---------+------------+-----------+--+ + +``` + + +### Step 8: 为读时合并数据集的调度并执行压缩 + +我们来调度并运行一个压缩来创建一个新版本的列式文件,以便读优化读取器能看到新数据。 +再次强调,你可以使用 Hudi CLI 来人工调度并执行压缩。 + +```java +docker exec -it adhoc-1 /bin/bash +root@adhoc-1:/opt# /var/hoodie/ws/hudi-cli/hudi-cli.sh +============================================ +* * +* _ _ _ _ * +* | | | | | | (_) * +* | |__| | __| | - * +* | __ || | / _` | || * +* | | | || || (_| | || * +* |_| |_|\___/ \____/ || * +* * +============================================ + +Welcome to Hoodie CLI. Please type help if you are looking for help. +hudi->connect --path /user/hive/warehouse/stock_ticks_mor +18/09/24 06:59:34 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... 
using builtin-java classes where applicable +18/09/24 06:59:35 INFO table.HoodieTableMetaClient: Loading HoodieTableMetaClient from /user/hive/warehouse/stock_ticks_mor +18/09/24 06:59:35 INFO util.FSUtils: Hadoop Configuration: fs.defaultFS: [hdfs://namenode:8020], Config:[Configuration: core-default.xml, core-site.xml, mapred-default.xml, mapred-site.xml, yarn-default.xml, yarn-site.xml, hdfs-default.xml, hdfs-site.xml], FileSystem: [DFS[DFSClient[clientName=DFSClient_NONMAPREDUCE_-1261652683_11, ugi=root (auth:SIMPLE)]]] +18/09/24 06:59:35 INFO table.HoodieTableConfig: Loading dataset properties from /user/hive/warehouse/stock_ticks_mor/.hoodie/hoodie.properties +18/09/24 06:59:36 INFO table.HoodieTableMetaClient: Finished Loading Table of type MERGE_ON_READ from /user/hive/warehouse/stock_ticks_mor +Metadata for table stock_ticks_mor loaded + +# Ensure no compactions are present + +hoodie:stock_ticks_mor->compactions show all +18/09/24 06:59:54 INFO timeline.HoodieActiveTimeline: Loaded instants [[20180924064636__clean__COMPLETED], [20180924064636__deltacommit__COMPLETED], [20180924065057__clean__COMPLETED], [20180924065057__deltacommit__COMPLETED]] + ___________________________________________________________________ + | Compaction Instant Time| State | Total FileIds to be Compacted| + |==================================================================| + + + + +# Schedule a compaction. This will use Spark Launcher to schedule compaction +hoodie:stock_ticks_mor->compaction schedule +.... +Compaction successfully completed for 20180924070031 + +# Now refresh and check again. 
You will see that there is a new compaction requested + +hoodie:stock_ticks->connect --path /user/hive/warehouse/stock_ticks_mor +18/09/24 07:01:16 INFO table.HoodieTableMetaClient: Loading HoodieTableMetaClient from /user/hive/warehouse/stock_ticks_mor +18/09/24 07:01:16 INFO util.FSUtils: Hadoop Configuration: fs.defaultFS: [hdfs://namenode:8020], Config:[Configuration: core-default.xml, core-site.xml, mapred-default.xml, mapred-site.xml, yarn-default.xml, yarn-site.xml, hdfs-default.xml, hdfs-site.xml], FileSystem: [DFS[DFSClient[clientName=DFSClient_NONMAPREDUCE_-1261652683_11, ugi=root (auth:SIMPLE)]]] +18/09/24 07:01:16 INFO table.HoodieTableConfig: Loading dataset properties from /user/hive/warehouse/stock_ticks_mor/.hoodie/hoodie.properties +18/09/24 07:01:16 INFO table.HoodieTableMetaClient: Finished Loading Table of type MERGE_ON_READ from /user/hive/warehouse/stock_ticks_mor +Metadata for table stock_ticks_mor loaded + + + +hoodie:stock_ticks_mor->compactions show all +18/09/24 06:34:12 INFO timeline.HoodieActiveTimeline: Loaded instants [[20180924041125__clean__COMPLETED], [20180924041125__deltacommit__COMPLETED], [20180924042735__clean__COMPLETED], [20180924042735__deltacommit__COMPLETED], [==>20180924063245__compaction__REQUESTED]] + ___________________________________________________________________ + | Compaction Instant Time| State | Total FileIds to be Compacted| + |==================================================================| + | 20180924070031 | REQUESTED| 1 | + + + + +# Execute the compaction. The compaction instant value passed below must be the one displayed in the above "compactions show all" query +hoodie:stock_ticks_mor->compaction run --compactionInstant 20180924070031 --parallelism 2 --sparkMemory 1G --schemaFilePath /var/demo/config/schema.avsc --retry 1 +.... 
+Compaction successfully completed for 20180924070031 + + +## Now check if compaction is completed + +hoodie:stock_ticks_mor->connect --path /user/hive/warehouse/stock_ticks_mor +18/09/24 07:03:00 INFO table.HoodieTableMetaClient: Loading HoodieTableMetaClient from /user/hive/warehouse/stock_ticks_mor +18/09/24 07:03:00 INFO util.FSUtils: Hadoop Configuration: fs.defaultFS: [hdfs://namenode:8020], Config:[Configuration: core-default.xml, core-site.xml, mapred-default.xml, mapred-site.xml, yarn-default.xml, yarn-site.xml, hdfs-default.xml, hdfs-site.xml], FileSystem: [DFS[DFSClient[clientName=DFSClient_NONMAPREDUCE_-1261652683_11, ugi=root (auth:SIMPLE)]]] +18/09/24 07:03:00 INFO table.HoodieTableConfig: Loading dataset properties from /user/hive/warehouse/stock_ticks_mor/.hoodie/hoodie.properties +18/09/24 07:03:00 INFO table.HoodieTableMetaClient: Finished Loading Table of type MERGE_ON_READ from /user/hive/warehouse/stock_ticks_mor +Metadata for table stock_ticks_mor loaded + + + +hoodie:stock_ticks->compactions show all +18/09/24 07:03:15 INFO timeline.HoodieActiveTimeline: Loaded instants [[20180924064636__clean__COMPLETED], [20180924064636__deltacommit__COMPLETED], [20180924065057__clean__COMPLETED], [20180924065057__deltacommit__COMPLETED], [20180924070031__commit__COMPLETED]] + ___________________________________________________________________ + | Compaction Instant Time| State | Total FileIds to be Compacted| + |==================================================================| + | 20180924070031 | COMPLETED| 1 | + +``` + +### Step 9: 执行包含增量查询的 Hive 查询 + +你将看到读优化视图和实时视图都会展示最新提交的数据。 +让我们也对 MOR 表执行增量查询。 +通过查看下方的查询输出,能够明确 MOR 表的第一次提交时间是 20180924064636 而第二次提交时间是 20180924070031 。 + +```java +docker exec -it adhoc-2 /bin/bash +beeline -u jdbc:hive2://hiveserver:10000 --hiveconf hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat --hiveconf hive.stats.autogather=false + +# Read Optimized View +0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) 
from stock_ticks_mor group by symbol HAVING symbol = 'GOOG'; +WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases. ++---------+----------------------+--+ +| symbol | _c1 | ++---------+----------------------+--+ +| GOOG | 2018-08-31 10:59:00 | ++---------+----------------------+--+ +1 row selected (1.6 seconds) + +0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor where symbol = 'GOOG'; ++----------------------+---------+----------------------+---------+------------+-----------+--+ +| _hoodie_commit_time | symbol | ts | volume | open | close | ++----------------------+---------+----------------------+---------+------------+-----------+--+ +| 20180924064636 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 | +| 20180924070031 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 | ++----------------------+---------+----------------------+---------+------------+-----------+--+ + +# Realtime View +0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_mor_rt group by symbol HAVING symbol = 'GOOG'; +WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases. 
++---------+----------------------+--+ +| symbol | _c1 | ++---------+----------------------+--+ +| GOOG | 2018-08-31 10:59:00 | ++---------+----------------------+--+ + +0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_rt where symbol = 'GOOG'; ++----------------------+---------+----------------------+---------+------------+-----------+--+ +| _hoodie_commit_time | symbol | ts | volume | open | close | ++----------------------+---------+----------------------+---------+------------+-----------+--+ +| 20180924064636 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 | +| 20180924070031 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 | ++----------------------+---------+----------------------+---------+------------+-----------+--+ + +# Incremental View: + +0: jdbc:hive2://hiveserver:10000> set hoodie.stock_ticks_mor.consume.mode=INCREMENTAL; +No rows affected (0.008 seconds) +# Max-Commits covers both second batch and compaction commit +0: jdbc:hive2://hiveserver:10000> set hoodie.stock_ticks_mor.consume.max.commits=3; +No rows affected (0.007 seconds) +0: jdbc:hive2://hiveserver:10000> set hoodie.stock_ticks_mor.consume.start.timestamp=20180924064636; +No rows affected (0.013 seconds) +# Query: +0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor where symbol = 'GOOG' and `_hoodie_commit_time` > '20180924064636'; ++----------------------+---------+----------------------+---------+------------+-----------+--+ +| _hoodie_commit_time | symbol | ts | volume | open | close | ++----------------------+---------+----------------------+---------+------------+-----------+--+ +| 20180924070031 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 | ++----------------------+---------+----------------------+---------+------------+-----------+--+ +exit +exit +``` + +### Step 10: 压缩后在 MOR 的读优化视图与实时视图上使用 Spark-SQL + +```java +docker 
exec -it adhoc-1 /bin/bash +bash-4.4# $SPARK_INSTALL/bin/spark-shell --jars $HUDI_SPARK_BUNDLE --driver-class-path $HADOOP_CONF_DIR --conf spark.sql.hive.convertMetastoreParquet=false --deploy-mode client --driver-memory 1G --master local[2] --executor-memory 3G --num-executors 1 --packages com.databricks:spark-avro_2.11:4.0.0 + +# Read Optimized View +scala> spark.sql("select symbol, max(ts) from stock_ticks_mor group by symbol HAVING symbol = 'GOOG'").show(100, false) ++---------+----------------------+--+ +| symbol | _c1 | ++---------+----------------------+--+ +| GOOG | 2018-08-31 10:59:00 | ++---------+----------------------+--+ +1 row selected (1.6 seconds) + +scala> spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor where symbol = 'GOOG'").show(100, false) ++----------------------+---------+----------------------+---------+------------+-----------+--+ +| _hoodie_commit_time | symbol | ts | volume | open | close | ++----------------------+---------+----------------------+---------+------------+-----------+--+ +| 20180924064636 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 | +| 20180924070031 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 | ++----------------------+---------+----------------------+---------+------------+-----------+--+ + +# Realtime View +scala> spark.sql("select symbol, max(ts) from stock_ticks_mor_rt group by symbol HAVING symbol = 'GOOG'").show(100, false) ++---------+----------------------+--+ +| symbol | _c1 | ++---------+----------------------+--+ +| GOOG | 2018-08-31 10:59:00 | ++---------+----------------------+--+ + +scala> spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_rt where symbol = 'GOOG'").show(100, false) ++----------------------+---------+----------------------+---------+------------+-----------+--+ +| _hoodie_commit_time | symbol | ts | volume | open | close | 
++----------------------+---------+----------------------+---------+------------+-----------+--+ +| 20180924064636 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 | +| 20180924070031 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 | ++----------------------+---------+----------------------+---------+------------+-----------+--+ +``` + +### Step 11: 压缩后在 MOR 数据集的读优化视图上进行 Presto 查询 + +```java +docker exec -it presto-worker-1 presto --server presto-coordinator-1:8090 +presto> use hive.default; +USE + +# Read Optimized View +presto:default> select symbol, max(ts) from stock_ticks_mor group by symbol HAVING symbol = 'GOOG'; + symbol | _col1 +--------+--------------------- + GOOG | 2018-08-31 10:59:00 +(1 row) + +Query 20190822_182319_00011_segyw, FINISHED, 1 node +Splits: 49 total, 49 done (100.00%) +0:01 [197 rows, 613B] [133 rows/s, 414B/s] + +presto:default> select "_hoodie_commit_time", symbol, ts, volume, open, close from stock_ticks_mor where symbol = 'GOOG'; + _hoodie_commit_time | symbol | ts | volume | open | close +---------------------+--------+---------------------+--------+-----------+---------- + 20190822180250 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 + 20190822181944 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 +(2 rows) + +Query 20190822_182333_00012_segyw, FINISHED, 1 node +Splits: 17 total, 17 done (100.00%) +0:02 [197 rows, 613B] [98 rows/s, 307B/s] + +presto:default> + +``` + + +Demo 到此结束。 + +## 在本地 Docker 环境中测试 Hudi + +你可以组建一个包含 Hadoop 、 Hive 和 Spark 服务的 Hadoop Docker 环境,并支持 Hudi 。 +```java +$ mvn pre-integration-test -DskipTests +``` +上面的命令为所有的服务构建了 Docker 镜像,它带有当前安装在 /var/hoodie/ws 的 Hudi 源,并使用一个部署文件引入了这些服务。我们当前在 Docker 镜像中使用 Hadoop (v2.8.4)、 Hive (v2.3.3)和 Spark (v2.3.1)。 + +要销毁容器: +```java +$ cd hudi-integ-test +$ mvn docker-compose:down +``` + +如果你想要组建 Docker 容器,使用: +```java +$ cd hudi-integ-test +$ mvn docker-compose:up -DdetachedMode=true +``` + +Hudi 是一个在包含 Hadoop 、 Hive 和 Spark 
的海量数据分析/采集环境中使用的库。与这些系统的互用性是我们的一个关键目标。 我们在积极地向 __hudi-integ-test/src/test/java__ 添加集成测试,这些测试利用了这个 Docker 环境(参考: __hudi-integ-test/src/test/java/org/apache/hudi/integ/ITTestHoodieSanity.java__ )。 + + +### 构建本地 Docker 容器: + +Demo 和执行集成测试所需要的 Docker 镜像已经在 Docker 源中。 Docker 镜像和部署脚本经过了谨慎的实现以便服务与多种目的: + +1. Docker 镜像有内建的 Hudi jar 包,它包含一些指向其他 jar 包的环境变量( HUDI_HADOOP_BUNDLE 等) +2. 为了执行集成测试,我们需要使用本地生成的 jar 包在 Docker 中运行服务。 Docker 部署脚本(参考 `docker/compose/docker-compose_hadoop284_hive233_spark231.yml`)能确保本地 jar 包通过挂载 Docker 地址上挂载本地 Hudi 工作空间,从而覆盖了内建的 jar 包。 +3. 当这些 Docker 容器挂载到本地 Hudi 工作空间之后,任何发生在工作空间中的变更将会自动反映到容器中。这对于开发者来说是一种开发和验证 Hudi 的简便方法,这些开发者没有分布式的环境。要注意的是,这是集成测试的执行方式。 + +这避免了维护分离的 Docker 镜像,也避免了本地构建 Docker 镜像的各个步骤的消耗。 +但是如果用户想要在有更低网络带宽的地方测试 Hudi ,他们仍可以构建本地镜像。 +在执行 `docker/setup_demo.sh` 之前执行脚本 `docker/build_local_docker_images.sh` 来构建本地 Docker 镜像。 + +以下是执行的命令: + +```java +cd docker +./build_local_docker_images.sh +..... + +[INFO] Reactor Summary: +[INFO] +[INFO] hoodie ............................................. SUCCESS [ 1.709 s] +[INFO] hudi-common ...................................... SUCCESS [ 9.015 s] +[INFO] hudi-hadoop-mr ................................... SUCCESS [ 1.108 s] +[INFO] hudi-client ...................................... SUCCESS [ 4.409 s] +[INFO] hudi-hive ........................................ SUCCESS [ 0.976 s] +[INFO] hudi-spark ....................................... SUCCESS [ 26.522 s] +[INFO] hudi-utilities ................................... SUCCESS [ 16.256 s] +[INFO] hudi-cli ......................................... SUCCESS [ 11.341 s] +[INFO] hudi-hadoop-mr-bundle ............................ SUCCESS [ 1.893 s] +[INFO] hudi-hive-bundle ................................. SUCCESS [ 14.099 s] +[INFO] hudi-spark-bundle ................................ SUCCESS [ 58.252 s] +[INFO] hudi-hadoop-docker ............................... SUCCESS [ 0.612 s] +[INFO] hudi-hadoop-base-docker .......................... 
SUCCESS [04:04 min] +[INFO] hudi-hadoop-namenode-docker ...................... SUCCESS [ 6.142 s] +[INFO] hudi-hadoop-datanode-docker ...................... SUCCESS [ 7.763 s] +[INFO] hudi-hadoop-history-docker ....................... SUCCESS [ 5.922 s] +[INFO] hudi-hadoop-hive-docker .......................... SUCCESS [ 56.152 s] +[INFO] hudi-hadoop-sparkbase-docker ..................... SUCCESS [01:18 min] +[INFO] hudi-hadoop-sparkmaster-docker ................... SUCCESS [ 2.964 s] +[INFO] hudi-hadoop-sparkworker-docker ................... SUCCESS [ 3.032 s] +[INFO] hudi-hadoop-sparkadhoc-docker .................... SUCCESS [ 2.764 s] +[INFO] hudi-integ-test .................................. SUCCESS [ 1.785 s] +[INFO] ------------------------------------------------------------------------ +[INFO] BUILD SUCCESS +[INFO] ------------------------------------------------------------------------ +[INFO] Total time: 09:15 min +[INFO] Finished at: 2018-09-10T17:47:37-07:00 +[INFO] Final Memory: 236M/1848M +[INFO] ------------------------------------------------------------------------ +``` diff --git a/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/docs-versions.md b/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/docs-versions.md new file mode 100644 index 0000000000000..942403ea2fbe4 --- /dev/null +++ b/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/docs-versions.md @@ -0,0 +1,12 @@ +--- +title: 文档版本 +keywords: [ hudi, privacy] +last_modified_at: 2019-12-30T15:59:57-04:00 +language: cn +--- + + + + +
+ diff --git a/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/flink-quick-start-guide.md b/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/flink-quick-start-guide.md new file mode 100644 index 0000000000000..0a811ed810519 --- /dev/null +++ b/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/flink-quick-start-guide.md @@ -0,0 +1,530 @@ +--- +title: "Flink 指南" +toc: true +last_modified_at: 2021-08-16T12:31:57-04:00 +language: cn +--- + +本指南提供了使用 Flink SQL 操作 Hudi 的文档。阅读本指南,您可以学习如何快速开始使用 Flink 读写 Hudi,同时对配置和任务优化有更深入的了解: + +- **快速开始** :通过阅读 [快速开始](#快速开始),你可以快速开始使用 Flink sql client 去读写 Hudi +- **配置** :对于 [Flink 配置](#flink-配置),使用 `$FLINK_HOME/conf/flink-conf.yaml` 来配置。 对于任意一个作业的配置,通过[表参数](#表参数)来设置 +- **写功能** :Flink 支持多种写功能用例,例如 [离线批量导入](#离线批量导入),[全量接增量](#全量接增量),[Changelog 模式](#changelog-模式),[Insert 模式](#insert-模式) 和 [离线 Compaction](#离线-compaction) +- **查询功能** :Flink 支持多种查询功能用例,例如 [Hive 查询](#hive-查询), [Presto 查询](#presto-查询) +- **优化** :针对 Flink 读写 Hudi 的操作,本指南提供了一些优化建议,例如 [内存优化](#内存优化) 和 [写入限流](#写入限流) + +## 快速开始 + +### 安装 + +我们推荐使用 [Flink Sql Client](https://ci.apache.org/projects/flink/flink-docs-release-1.13/docs/dev/table/sqlclient/) 来读写 Hudi,因为 Flink +sql client 对于 SQL 用户来说更容易上手。 + +#### 步骤1 下载 Flink jar +我们推荐使用 Flink-1.12.x 来读写 Hudi。 你可以按照 [Flink 安装文档](https://flink.apache.org/downloads) 的指导来安装 Flink。 `hudi-flink-bundle.jar` +使用的是 scala 2.11,所以我们推荐 Flink-1.12.x 配合 scala 2.11 来使用。 + +#### 步骤2 启动 Flink 集群 +在 Hadoop 环境下启动 standalone 的 Flink 集群。 +在你启动 Flink 集群前,我们推荐先配置如下参数: + +- 在 `$FLINK_HOME/conf/flink-conf.yaml` 中添加配置:`taskmanager.numberOfTaskSlots: 4` +- 在 `$FLINK_HOME/conf/flink-conf.yaml` 中,根据数据量大小和集群大小来添加其他的 [Flink 配置](#flink-配置) +- 在 `$FLINK_HOME/conf/workers` 中添加4核 `localhost` 来保证我们本地集群中有4个 workers + +启动集群: + +```bash +# HADOOP_HOME 是 Hadoop 的根目录。 +export HADOOP_CLASSPATH=`$HADOOP_HOME/bin/hadoop classpath` + +# 启动 Flink standalone 集群 +./bin/start-cluster.sh +``` +#### 步骤3 启动 Flink SQL client + +Hudi 将 Flink 板块单独打包为 
`hudi-flink-bundle.jar`,该 Jar 包需要在启动的时候加载。 +你可以在 `hudi-source-dir/packaging/hudi-flink-bundle` 下手动的打包这个 Jar 包,或者从 [Apache Official Repository](https://repo.maven.apache.org/maven2/org/apache/hudi/hudi-flink-bundle_2.11/) +中下载。 + +启动 Flink SQL Client: + +```bash +# HADOOP_HOME 是 Hadoop 的根目录。 +export HADOOP_CLASSPATH=`$HADOOP_HOME/bin/hadoop classpath` + +./bin/sql-client.sh embedded -j .../hudi-flink-bundle_2.1?-*.*.*.jar shell +``` + +
+

小提示:

+
    +
  • 为了兼容大部分的对象存储,我们推荐使用 Hadoop 2.9.x+ 版本
  • +
  • flink-parquet 和 flink-avro 格式已经打包在 hudi-flink-bundle.jar 中了
  • +
+
+ +根据下面的不同功能来设置表名,存储路径和操作类型。 +Flink SQL Client 是逐行执行 SQL 的。 + +### 插入数据 + +先创建一个 Flink Hudi 表,然后再通过下面的 `VALUES` 语句往该表中插入数据。 + +```sql +-- 为了更直观的显示结果,推荐把 CLI 的输出结果模式设置为 tableau。 +set execution.result-mode=tableau; + +CREATE TABLE t1( + uuid VARCHAR(20), + name VARCHAR(10), + age INT, + ts TIMESTAMP(3), + `partition` VARCHAR(20) +) +PARTITIONED BY (`partition`) +WITH ( + 'connector' = 'hudi', + 'path' = 'schema://base-path', + 'table.type' = 'MERGE_ON_READ' -- 创建一个 MERGE_ON_READ类型的表,默认是 COPY_ON_WRITE +); + +-- 使用 values 语句插入数据 +INSERT INTO t1 VALUES + ('id1','Danny',23,TIMESTAMP '1970-01-01 00:00:01','par1'), + ('id2','Stephen',33,TIMESTAMP '1970-01-01 00:00:02','par1'), + ('id3','Julian',53,TIMESTAMP '1970-01-01 00:00:03','par2'), + ('id4','Fabian',31,TIMESTAMP '1970-01-01 00:00:04','par2'), + ('id5','Sophia',18,TIMESTAMP '1970-01-01 00:00:05','par3'), + ('id6','Emma',20,TIMESTAMP '1970-01-01 00:00:06','par3'), + ('id7','Bob',44,TIMESTAMP '1970-01-01 00:00:07','par4'), + ('id8','Han',56,TIMESTAMP '1970-01-01 00:00:08','par4'); +``` + +### 查询数据 + +```sql +-- 从 Hudi 表中查询 +select * from t1; +``` +该查询语句提供的是 快照读(Snapshot Querying)。 +如果想了解更多关于表类型和查询类型的介绍,可以参考文档 [表类型和查询类型](/docs/concepts#table-types--queries) + +### 更新数据 + +数据的更新和插入数据类似: + +```sql +-- 这条语句会更新 key 为 'id1' 的记录 +insert into t1 values + ('id1','Danny',27,TIMESTAMP '1970-01-01 00:00:01','par1'); +``` + +需要注意的是:现在使用的存储类型为 `Append`。通常我们都是使用 append 模式,除非你是第一次创建这个表。 再次 [查询数据](#查询数据) 就会显示更新后的结果。 +每一次的插入操作都会在时间轴上生成一个带时间戳的新的 [commit](/docs/concepts),在元数据字段 `_hoodie_commit_time` 和同一 `_hoodie_record_key` 的 `age`字段中查看更新。 + +### 流式查询 + +Hudi Flink 也有能力来查询从指定时间戳开始的流式记录集合。 该功能可以使用Hudi的流式查询,只需要提供一个查询开始的时间戳就可以完成查询。如果我们需要的 +是指定时间戳后的所有数据,我们就不需要指定结束时间。 + +```sql +CREATE TABLE t1( + uuid VARCHAR(20), + name VARCHAR(10), + age INT, + ts TIMESTAMP(3), + `partition` VARCHAR(20) +) +PARTITIONED BY (`partition`) +WITH ( + 'connector' = 'hudi', + 'path' = 'oss://vvr-daily/hudi/t1', + 'table.type' = 'MERGE_ON_READ', + 
'read.streaming.enabled' = 'true', -- 该参数开启流式查询 + 'read.streaming.start-commit' = '20210316134557', -- 指定开始的时间戳 + 'read.streaming.check-interval' = '4' -- 指定检查新的commit的周期,默认是60秒 +); + +-- 开启流式的查询 +select * from t1; +``` +上述的查询会查询出 `read.streaming.start-commit` 时间戳后的所有数据。该功能的特殊在于可以同时在流和批的 pipeline 上执行。 + +### 删除数据 {#deletes} + +在 [流式查询](#流式查询) 中使用数据时,Hudi Flink 源还可以接受来自底层数据源的更改日志,然后可以按行级别应用更新和删除。所以,你可以在 Hudi 上同步各种 RDBMS 的近实时快照。 + +## Flink 配置 + +在使用 Flink 前,你需要在 `$FLINK_HOME/conf/flink-conf.yaml` 中设置一些全局的 Flink 配置。 + +### 并行度 + +| 名称 | 默认值 | 类型 | 描述 | +| ----------- | ------- | ------- | ------- | +| `taskmanager.numberOfTaskSlots` | `1` | `Integer` | 单个 TaskManager 可以运行的并行 task 数。我们建议将该值设置为 > 4,实际值需根据数据量进行设置 | +| `parallelism.default` | `1` | `Integer` | 当用户未指定算子并行度时,会使用这个并行度(默认值是1)。例如 [`write.bucket_assign.tasks`](#并行度-1) 没有设置,就会使用这个默认值 | + +### 内存 + +| 名称 | 默认值 | 类型 | 描述 | +| ----------- | ------- | ------- | ------- | +| `jobmanager.memory.process.size` | `(none)` | `MemorySize` | JobManager 的总进程内存大小。代表 JobManager JVM 进程消耗的所有内存,包括总的 Flink 内存、JVM 元空间和 JVM 开销 | +| `taskmanager.memory.task.heap.size` | `(none)` | `MemorySize` | TaskExecutor 的堆内存大小。这是为写缓存保留的 JVM 堆内存大小 | +| `taskmanager.memory.managed.size` | `(none)` | `MemorySize` | 这是内存管理器管理的堆外内存的大小,用于排序和 RocksDB 状态后端。如果选择 RocksDB 作为状态后端,则需要设置此内存 | + +### Checkpoint + +| 名称 | 默认值 | 类型 | 描述 | +| ----------- | ------- | ------- | ------- | +| `execution.checkpointing.interval` | `(none)` | `Duration` | 设置该值的方式为 `execution.checkpointing.interval = 150000ms`,其中150000ms = 2.5min。 设置这个参数等同于开启了 Checkpoint | +| `state.backend` | `(none)` | `String` | 保存状态信息的状态后端。 我们推荐设置状态后端为 `rocksdb` :`state.backend: rocksdb` | +| `state.backend.rocksdb.localdir` | `(none)` | `String` | RocksDB 存储状态信息的路径 | +| `state.checkpoints.dir` | `(none)` | `String` | Checkpoint 的默认路径,用于在支持 Flink 的文件系统中存储检查点的数据文件和元数据。存储路径必须可从所有参与进程/节点(即所有 TaskManager 和 JobManager)访问,如 HDFS 和 OSS 路径 | +| `state.backend.incremental` | `false` | `Boolean` | 
选项状态后端是否创建增量检查点。对于增量检查点,只存储与前一个检查点的差异,而不是完整的检查点状态。如果存储状态设置为 `rocksdb`,建议打开这个选项 | + +## 表参数 + +对于单个作业的配置,我们可以在 Flink SQL 语句的 [`WITH`](#表参数)中设置。 +所以,作业级别的配置如下: + +### 内存 + +:::note +我们在内存调优的时候需要先关注 TaskManager 的数量和 [内存](#内存) 配置,以及 write task 的并发(`write.tasks: 4` 的值),确认每个 write task 能够分配到足够的内存, +再考虑以下相关的内存参数设置。 +::: + +| 名称 | 说明 | 默认值 | 备注 | +| ----------- | ------- | ------- | ------- | +| `write.task.max.size` | 一个 write task 的最大可用内存。 默认为 `1024MB` | `1024D` | 当前预留给 write buffer 的内存为 `write.task.max.size` - `compaction.max_memory`。 当 write task 的内存 buffer 达到该阈值后会将内存里最大的 buffer flush | +| `write.batch.size` | Flink 的 write task 为了提高写数据效率,会按照写 bucket 提前缓存数据,每个 bucket 的数据在内存达到阈值之前会一直缓存在内存中,当阈值达到会把数据 buffer 传递给 Hudi 的 writer 执行写操作。默认为 `64MB` | `64D` | 推荐使用默认值 | +| `write.log_block.size` | Hudi 的 log writer 在收到 write task 的数据后不会马上 flush 数据,writer 是以 LogBlock 为单位往磁盘刷数据的,在 LogBlock 攒够之前 records 会以序列化字节的形式缓存在 writer 内部。默认为 `128MB` | `128` | 推荐使用默认值 | +| `write.merge.max_memory` | 当表的类型为 `COPY_ON_WRITE`,Hudi 会合并增量数据和 base file 中的数据。增量的数据会缓存在内存的 map 结构里,这个 map 是可溢写的,这个参数控制了 map 可以使用的堆内存大小。 默认为 `100MB` | `100` | 推荐使用默认值 | +| `compaction.max_memory` | 同 `write.merge.max_memory` 类似,只是发生在压缩时。默认为 `100MB` | `100` | 如果是在线 compaction,资源充足时可以开大些,比如 `1024MB` | + +### 并行度 + +| 名称 | 说明 | 默认值 | 备注 | +| ----------- | ------- | ------- | ------- | +| `write.tasks` | writer 的并发,每个 writer 顺序写 `1`~`N` 个 buckets。默认为 `4` | `4` | 增加这个并发,对小文件个数没影响 | +| `write.bucket_assign.tasks` | bucket assigner 的并发,默认使用 Flink 默认并发 `parallelism.default` | [`parallelism.default`](#并行度) | 增加并发同时增加了并发写的 bucekt 数,也就变相增加了小文件(小 bucket)数 | +| `write.index_boostrap.tasks` | Index bootstrap 算子的并发,增加并发可以加快 bootstrap 阶段的效率,bootstrap 阶段会阻塞 Checkpoint,因此需要设置多一些的 Checkpoint 失败容忍次数。 默认使用 Flink 默认并发 `parallelism.default` | [`parallelism.default`](#并行度) | 只在 `index.bootsrap.enabled` 为 `true` 时生效 | +| `read.tasks` | 读算子的并发(batch 和 stream。默认为 `4` | `4` | | +| `compaction.tasks` | 在线 compaction 的并发。默认为 `10` | `10` | 在线 compaction 
比较耗费资源,建议使用 [`离线 compaction`](#离线-compaction) | + +### Compaction + +:::note +这些参数都只服务于在线 compaction。 +::: + +:::note +通过设置`compaction.async.enabled` = `false` 来关闭在线 compaction,但是 `compaction.schedule.enable` 仍然建议开启。之后通过[`离线 compaction`](#离线-compaction) 直接执行在线 compaction 产生的 compaction plan。 +::: + +| 名称 | 说明 | 默认值 | 备注 | +| ----------- | ------- | ------- | ------- | +| `compaction.schedule.enabled` | 是否阶段性生成 compaction plan | `true` | 建议开启,即使 `compaction.async.enabled` = `false` | +| `compaction.async.enabled` | 是否开启异步压缩,MOR 表时默认开启 | `true` | 通过关闭此参数来关闭 `在线 compaction` | +| `compaction.trigger.strategy` | 压缩策略 | `num_commits` | 可选择的策略有 `num_commits`:达到 `N` 个 delta commits 时触发 compaction; `time_elapsed`:距离上次 compaction 超过 `N` 秒触发 compaction ; `num_and_time`:`NUM_COMMITS` 和 `TIME_ELAPSED` 同时满足; `num_or_time`:`NUM_COMMITS` 或者 `TIME_ELAPSED` 中一个满足 | +| `compaction.delta_commits` | 默认策略,`5` 个 delta commits 触发一次压缩 | `5` | -- | +| `compaction.delta_seconds` | 默认 `1` 小时触发一次压缩 | `3600` | -- | +| `compaction.max_memory` | compaction 的 hashMap 可用内存。 默认值为 `100MB` | `100` | 资源够用的话,建议调整到 `1024MB` | +| `compaction.target_io` | 每个 compaction plan 的 IO 内存上限 (读和写)。 默认值为 `5GB`| `5120` | 离线 compaction 的默认值是 `500GB` | + +## 内存优化 + +### MOR + +1. [把 Flink 的状态后端设置为 `rocksdb`](#checkpoint) (默认的 `in memory` 状态后端非常的消耗内存) +2. 如果内存足够,`compaction.max_memory` 可以设置得更大些(默认为 `100MB`,可以调大到 `1024MB`) +3. 关注 taskManager 分配给每个 write task 的内存,保证每个 write task 能够分配到 `write.task.max.size` 所配置的内存大小。 比如 taskManager 的内存是 `4GB`, +运行了 `2` 个 `StreamWriteFunction`,那每个 write function 能分到 `2GB`,尽量预留一些缓存。因为网络缓存,taskManager 上其他类型的 task (比如 `BucketAssignFunction`)也会消耗一些内存 +4. 需要关注 compaction 的内存变化。 `compaction.max_memory` 控制了每个 compaction task 读 log 时可以利用的内存大小。`compaction.tasks` 控制了 compaction task 的并发 + +### COW + +1. [把 Flink 的状态后端设置为 `rocksdb`](#checkpoint) (默认的 `in memory` 状态后端非常的消耗内存) +2. 同时调大 `write.task.max.size` 和 `write.merge.max_memory` (默认值分别是 `1024MB` 和 `100MB`,可以调整为 `2014MB` 和 `1024MB`) +3. 
关注 taskManager 分配给每个 write task 的内存,保证每个 write task 能够分配到 `write.task.max.size` 所配置的内存大小。 比如 taskManager 的内存是 `4GB`, + 运行了 `2` 个 `StreamWriteFunction`,那每个 write function 能分到 `2GB`,尽量预留一些缓存。因为网络缓存,taskManager 上其他类型的 task (比如 `BucketAssignFunction`)也会消耗一些内存 + +## 离线批量导入 + +针对存量数据导入的需求,如果存量数据来源于其他数据源,可以使用离线批量导入功能(`bulk_insert`),快速将存量数据导入 Hudi。 + + +:::note +`bulk_insert` 省去了 avro 的序列化以及数据的 merge 过程,后续也不会再有去重操作。所以,数据的唯一性需要自己来保证。 +::: + +:::note +`bulk_insert` 在 `batch execution mode` 模式下执行更加高效。 `batch execution mode` 模式默认会按照 partition path 排序输入消息再写入 Hudi, +避免 file handle 频繁切换导致性能下降。 +::: + +:::note +`bulk_insert` 的 write tasks 的并发是通过参数 `write.tasks` 来指定,并发的数量会影响到小文件的数量,理论上,`bulk_insert` 的 write tasks 的并发数就是划分的 bucket 数, +当然每个 bucket 在写到文件大小上限(parquet 120 MB)的时候会回滚到新的文件句柄,所以最后:写文件数量 >= [`write.bucket_assign.tasks`](#并行度)。 +::: + +### 参数 + +| 名称 | Required | 默认值 | 备注 | +| ----------- | ------- | ------- | ------- | +| `write.operation` | `true` | `upsert` | 开启 `bulk_insert` 功能 | +| `write.tasks` | `false` | `4` | `bulk_insert` write tasks 的并发,最后的文件数 >= [`write.bucket_assign.tasks`](#并行度) | +| `write.bulk_insert.shuffle_by_partition` | `false` | `true` | 是否将数据按照 partition 字段 shuffle 后,再通过 write task 写入,开启该参数将减少小文件的数量,但是有数据倾斜的风险 | +| `write.bulk_insert.sort_by_partition` | `false` | `true` | 是否将数据线按照 partition 字段排序后,再通过 write task 写入,当一个 write task 写多个 partition时,开启可以减少小文件数量 | +| `write.sort.memory` | `false` | `128` | sort 算子的可用 managed memory(单位 MB)。默认为 `128` MB | + +## 全量接增量 + +针对全量数据导入后,接增量的需求。如果已经有全量的离线 Hudi 表,需要接上实时写入,并且保证数据不重复,可以开启 全量接增量(`index bootstrap`)功能。 + +:::note +如果觉得流程冗长,可以在写入全量数据的时候资源调大直接走流模式写,全量走完接新数据再将资源调小(或者开启 [写入限流](#写入限流) )。 +::: + +### 参数 + +| 名称 | Required | 默认值 | 备注 | +| ----------- | ------- | ------- | ------- | +| `index.bootstrap.enabled` | `true` | `false` | 开启 index bootstrap 索引加载后,会将已存在的 Hudi 表的数据一次性加载到 state 中 | +| `index.partition.regex` | `false` | `*` | 设置正则表达式进行分区筛选,默认为加载全部分区 | + +### 使用流程 + +1. 
`CREATE TABLE` 创建和 Hudi 表对应的语句,注意 `table.type` 必须正确 +2. 设置 `index.bootstrap.enabled` = `true` 开启索引加载功能 +3. 在 `flink-conf.yaml` 中设置 Checkpoint 失败容忍 :`execution.checkpointing.tolerable-failed-checkpoints = n`(取决于checkpoint 调度次数) +4. 等待第一次 Checkpoint 完成,表示索引加载完成 +5. 索引加载完成后可以退出并保存 savepoint(也可以直接用 externalized checkpoint) +6. 重启任务,将 `index.bootstrap.enabled` 设置为 `false`,参数配置到合适的大小 + +:::note +1. 索引加载是阻塞式,所以在索引加载过程中 Checkpoint 无法完成 +2. 索引加载由数据流触发,需要确保每个 partition 都至少有1条数据,即上游 source 有数据进来 +3. 索引加载为并发加载,根据数据量大小加载时间不同,可以在log中搜索 `finish loading the index under partition` 和 `Load record form file` 日志内容来观察索引加载的进度 +4. 第一次 Checkpoint 成功就表示索引已经加载完成,后续从 Checkpoint 恢复时无需再次加载索引 +::: + +## Changelog 模式 + +针对使用 Hudi 保留消息的所有变更(I / -U / U / D),之后接上 Flink 引擎的有状态计算实现全链路近实时数仓生产(增量计算)的需求,Hudi 的 MOR 表 +通过行存原生支持保留消息的所有变更(format 层面的集成),通过流读 MOR 表可以消费到所有的变更记录。 + +### 参数 + +| 名称 | Required | 默认值 | 备注 | +| ----------- | ------- | ------- | ------- | +| `changelog.enabled` | `false` | `false` | 默认是关闭状态,即 `UPSERT` 语义,所有的消息仅保证最后一条合并消息,中间的变更可能会被 merge 掉;改成 `true` 支持消费所有变更 | + +:::note +批(快照)读仍然会合并所有的中间结果,不管 format 是否已存储中间状态。 +::: + +:::note +设置 `changelog.enabled` 为 `true` 后,中间的变更也只是 `best effort`:异步的压缩任务会将中间变更合并成 `1` 条,所以如果流读消费不够及时,被压缩后 +只能读到最后一条记录。当然,通过调整压缩的缓存时间可以预留一定的时间缓冲给 reader,比如调整压缩的两个参数:[`compaction.delta_commits`](#compaction) 和 [`compaction.delta_seconds`](#compaction)。 +::: + + +## Insert 模式 + +当前 Hudi 对于 `Insert 模式` 默认会采用小文件策略:MOR 会追加写 avro log 文件,COW 会不断合并之前的 parquet 文件(并且增量的数据会去重),这样会导致性能下降。 + + +如果想关闭文件合并,可以设置 `write.insert.deduplicate` 为 `false`。 关闭后,不会有任何的去重行为,每次 flush 都是直接写独立的 parquet(MOR 表也会直接写 parquet)。 + +### 参数 +| 名称 | Required | 默认值 | 备注 | +| ----------- | ------- | ------- | ------- | +| `write.insert.deduplicate` | `false` | `true` | 默认 `Insert 模式` 会去重,关闭后每次 flush 都会直接写独立的 parquet | + +## Hive 查询 + +### 打包 + +第一步是打包 `hudi-flink-bundle_2.11-0.9.0.jar`。 `hudi-flink-bundle` module pom.xml 默认将 Hive 相关的依赖 scope 设置为 provided, +如果想打入 Hive 的依赖,需要显式指定 Profile 为 
`flink-bundle-shade-hive`。执行以下命令打入 Hive 依赖: + +```bash +# Maven 打包命令 +mvn install -DskipTests -Drat.skip=true -Pflink-bundle-shade-hive2 + +# 如果是 hive3 需要使用 profile -Pflink-bundle-shade-hive3 +# 如果是 hive1 需要使用 profile -Pflink-bundle-shade-hive1 +``` + +:::note +Hive1.x 现在只能实现同步 metadata 到 Hive,而无法使用 Hive 查询,如需查询可使用 Spark 查询 Hive 外表的方法查询。 +::: + +:::note +使用 -Pflink-bundle-shade-hive x,需要修改 Profile 中 Hive 的版本为集群对应版本(只需修改 Profile 里的 Hive 版本)。修改位置为 `packaging/hudi-flink-bundle/pom.xml` +最下面的对应 Profile 段,找到后修改 Profile 中的 Hive 版本为对应版本即可。 +::: + +### Hive 环境准备 + +1. 第一步是将 `hudi-hadoop-mr-bundle.jar` 放到 Hive中。 在 Hive 的根目录下创建 `auxlib/` 文件夹,把 `hudi-hadoop-mr-bundle-0.x.x-SNAPSHOT.jar` 移入到 `auxlib`。 +`hudi-hadoop-mr-bundle-0.x.x-SNAPSHOT.jar` 可以在 `packaging/hudi-hadoop-mr-bundle/target` 目录下拷贝。 + +2. 第二步是开启 Hive 相关的服务。Flink SQL Client 远程连接 Hive 的时候,要求 Hive 的 `hive metastore` 和 `hiveserver2` 两个服务都开启且需要记住端口号。 +服务开启的方法如下: + +```bash +# 启动 Hive metastore 和 hiveserver2 +nohup ./bin/hive --service metastore & +nohup ./bin/hive --service hiveserver2 & + +# 每次更新了 /auxlib 下的 jar 包都需要重启上述两个服务 +``` + +### Hive 配置模版 + +Flink hive sync 现在支持两种 hive sync mode,分别是 `hms` 和 `jdbc`。其中 `hms` 模式只需要配置 Metastore uris;而 `jdbc` 模式需要同时 +配置 JDBC 属性 和 Metastore uris,具体配置模版如下: + +```sql +-- hms mode 配置模版 +CREATE TABLE t1( + uuid VARCHAR(20), + name VARCHAR(10), + age INT, + ts TIMESTAMP(3), + `partition` VARCHAR(20) +) +PARTITIONED BY (`partition`) +WITH ( + 'connector' = 'hudi', + 'path' = 'oss://vvr-daily/hudi/t1', + 'table.type' = 'COPY_ON_WRITE', --MERGE_ON_READ 方式在没生成 parquet 文件前,Hive 不会有输出 + 'hive_sync.enable' = 'true', -- Required。开启 Hive 同步功能 + 'hive_sync.mode' = 'hms', -- Required。将 hive sync mode 设置为 hms, 默认 jdbc + 'hive_sync.metastore.uris' = 'thrift://ip:9083' -- Required。metastore 的端口 +); + + +-- jdbc mode 配置模版 +CREATE TABLE t1( + uuid VARCHAR(20), + name VARCHAR(10), + age INT, + ts TIMESTAMP(3), + `partition` VARCHAR(20) +) +PARTITIONED BY (`partition`) +WITH ( + 'connector' = 'hudi', + 
'path' = 'oss://vvr-daily/hudi/t1', + 'table.type' = 'COPY_ON_WRITE', --MERGE_ON_READ 方式在没生成 parquet 文件前,Hive 不会有输出 + 'hive_sync.enable' = 'true', -- Required。开启 Hive 同步功能 + 'hive_sync.mode' = 'jdbc', -- Required。将 hive sync mode 设置为 jdbc(默认值即为 jdbc) + 'hive_sync.metastore.uris' = 'thrift://ip:9083', -- Required。metastore 的端口 + 'hive_sync.jdbc_url'='jdbc:hive2://ip:10000', -- required。hiveServer 的端口 + 'hive_sync.table'='t1', -- required。hive 新建的表名 + 'hive_sync.db'='testDB', -- required。hive 新建的数据库名 + 'hive_sync.username'='root', -- required。HMS 用户名 + 'hive_sync.password'='your password' -- required。HMS 密码 +); +``` + +### Hive 查询 + +使用 beeline 查询时,需要手动设置: +```bash +set hive.input.format = org.apache.hudi.hadoop.hive.HoodieCombineHiveInputFormat; +``` + +### 与其他包冲突 + +当 Flink lib/ 下有 `flink-sql-connector-hive-xxx.jar` 时,会出现hive包冲突,解决方法是在 Install 时,另外再指定一个 Profile:`-Pinclude-flink-sql-connector-hive`, +同时删除 Flink lib/ 下的 `flink-sql-connector-hive-xxx.jar`,新的 Install 的命令如下: + +```bash +# Maven 打包命令 +mvn install -DskipTests -Drat.skip=true -Pflink-bundle-shade-hive2 -Pinclude-flink-sql-connector-hive +``` + +## Presto 查询 + +### Hive 同步 +首先,参考 [Hive 查询](#hive-查询) 过程,完成 Hive 元数据的同步。 + +### Presto 环境配置 +1. 根据 [Presto 配置文档](https://prestodb.io/docs/current/installation/deployment.html) 来配置 Presto。 +2. 根据下面的配置,在 ` /presto-server-0.2xxx/etc/catalog/hive.properties` 中配置 Hive catalog: + +```properties +connector.name=hive-hadoop2 +hive.metastore.uri=thrift://xxx.xxx.xxx.xxx:9083 +hive.config.resources=.../hadoop-2.x/etc/hadoop/core-site.xml,.../hadoop-2.x/etc/hadoop/hdfs-site.xml +``` + +### Presto 查询 + +通过 presto-cli 连接 Hive metastore 开启查询。 presto-cli 的设置参考 presto-cli 配置: + +```bash +# 连接 presto server 的命令 +./presto --server xxx.xxx.xxx.xxx:9999 --catalog hive --schema default +``` + +:::note +1. `Presto-server-0.2445` 版本较低,在查 MOR 表的 `rt` 表时,会出现包冲突,正在解决中 +2. 
当 `Presto-server-xxx` 的版本 < 0.233 时,`hudi-presto-bundle.jar` 需要手动导入到 `{presto_install_dir}/plugin/hive-hadoop2/` 中。 +::: + + +## 离线 Compaction + +MERGE_ON_READ 表的 compaction 默认是打开的,策略是 `5` 个 delta commits 执行一次压缩。因为压缩操作比较耗费内存,和写流程放在同一个 pipeline,在数据量比较大 +的时候(10w+/s),容易干扰写流程,此时采用离线定时任务的方式执行 compaction 任务更稳定。 + +:::note +一个 compaction 任务的执行包括两部分:1. 生成 compaction plan; 2. 执行对应的 compaction plan。其中,第一步生成 compaction plan 的过程推荐由写任务定时触发, + 写参数 `compaction.schedule.enabled` 默认开启。 +::: + +离线 compaction 需要手动执行 Flink 任务,程序入口为: `hudi-flink-bundle_2.11-0.9.0-SNAPSHOT.jar` : `org.apache.hudi.sink.compact.HoodieFlinkCompactor` + +```bash +# 命令行执行方式 +./bin/flink run -c org.apache.hudi.sink.compact.HoodieFlinkCompactor lib/hudi-flink-bundle_2.11-0.9.0.jar --path hdfs://xxx:9000/table +``` + +### 参数 + +| 名称 | Required | 默认值 | 备注 | +| ----------- | ------- | ------- | ------- | +| `--path` | `true` | `--` | 目标 Hudi 表的路径 | +| `--compaction-max-memory` | `false` | `100` | 压缩时 log 数据的索引 HashMap 的内存大小,默认 `100MB`,内存足够时,可以调大 | +| `--schedule` | `false` | `false` | 是否要执行 schedule compaction 的操作,当写流程还在持续写入表数据的时候,开启这个参数有丢失查询数据的风险,所以开启该参数一定要保证当前没有任务往表里写数据,写任务的 compaction plan 默认是一直 schedule 的,除非手动关闭(默认 `5` 个 delta commits 一次 compaction)| +| `--seq` | `false` | `LIFO` | 执行压缩任务的顺序,默认是从最新的 compaction plan 开始执行,可选值:`LIFO` :从最新的 plan 开始执行; `FIFO`:从最老的 plan 开始执行 | + +## 写入限流 + +针对将全量数据(百亿数量级)和增量先同步到 Kafka,再通过 Flink 流式消费的方式将库表数据直接导成 Hudi 表的需求,因为直接消费全量数据:量大 +(吞吐高)、乱序严重(写入的 partition 随机),会导致写入性能退化,出现吞吐毛刺等情况,这时候可以开启限速参数,保证流量平稳写入。 + +### 参数 + +| 名称 | Required | 默认值 | 备注 | +| ----------- | ------- | ------- | ------- | +| `write.rate.limit` | `false` | `0` | 默认关闭限速 | + +## 从这开始下一步? 
+ +您也可以通过[自己构建hudi](https://github.com/apache/hudi#building-apache-hudi-from-source)来快速开始, +并在spark-shell命令中使用`--jars /packaging/hudi-spark-bundle/target/hudi-spark-bundle_2.1?-*.*.*-SNAPSHOT.jar`, +而不是`--packages org.apache.hudi:hudi-spark3-bundle_2.12:0.8.0` + + +这里我们使用Spark演示了Hudi的功能。但是,Hudi可以支持多种存储类型/视图,并且可以从Hive,Spark,Presto等查询引擎中查询Hudi数据集。 +我们制作了一个基于Docker设置、所有依赖系统都在本地运行的[演示视频](https://www.youtube.com/watch?v=VhNgUsxdrD0), +我们建议您复制相同的设置然后按照[这里](/cn/docs/docker_demo)的步骤自己运行这个演示。 +另外,如果您正在寻找将现有数据迁移到Hudi的方法,请参考[迁移指南](/cn/docs/migration_guide)。 diff --git a/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/gcs_hoodie.md b/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/gcs_hoodie.md new file mode 100644 index 0000000000000..7906b74c150b3 --- /dev/null +++ b/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/gcs_hoodie.md @@ -0,0 +1,61 @@ +--- +title: GCS 文件系统 +keywords: [ hudi, hive, google cloud, storage, spark, presto, 存储 ] +summary: 在本页中,我们探讨如何在 Google Cloud Storage 中配置 Hudi。 +last_modified_at: 2019-12-30T15:59:57-04:00 +language: cn +--- +对于存储在 GCS 上的 Hudi , **区域** Bucket 提供了带有强一致性的 DFS API 。 + +## GCS 配置 + +Hudi 的 GCS 适配需要两项配置: + +- 为 Hudi 添加 GCS 凭证 +- 将需要的 jar 包添加到类路径 + +### GCS 凭证 + +在你的 core-site.xml 文件中添加必要的配置,Hudi 将从那里获取这些配置。 用你的 GCS 分区名称替换掉 `fs.defaultFS` ,以便 Hudi 能够在 Bucket 中读取/写入。 + +```xml + + fs.defaultFS + gs://hudi-bucket + + + + fs.gs.impl + com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem + The FileSystem for gs: (GCS) uris. + + + + fs.AbstractFileSystem.gs.impl + com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS + The AbstractFileSystem for gs: (GCS) uris. 
+ + + + fs.gs.project.id + GCS_PROJECT_ID + + + google.cloud.auth.service.account.enable + true + + + google.cloud.auth.service.account.email + GCS_SERVICE_ACCOUNT_EMAIL + + + google.cloud.auth.service.account.keyfile + GCS_SERVICE_ACCOUNT_KEYFILE + +``` + +### GCS 库 + +将 GCS Hadoop 库添加到我们的类路径 + +- com.google.cloud.bigdataoss:gcs-connector:1.6.0-hadoop2 diff --git a/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/ibm_cos_hoodie.md b/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/ibm_cos_hoodie.md new file mode 100644 index 0000000000000..7cd03ab914208 --- /dev/null +++ b/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/ibm_cos_hoodie.md @@ -0,0 +1,78 @@ +--- +title: IBM Cloud Object Storage 文件系统 +keywords: [ hudi, hive, ibm, cos, spark, presto] +summary: 在本页中,我们讨论在 IBM Cloud Object Storage 文件系统中配置 Hudi 。 +last_modified_at: 2020-10-01T11:38:24-10:00 +language: cn +--- +在本页中,我们解释如何将你的 Hudi Spark 作业存储到 IBM Cloud Object Storage 当中。 + +## IBM COS 配置 + +Hudi 适配 IBM Cloud Object Storage 需要两项配置: + +- 为 Hudi 添加 IBM COS 凭证 +- 添加需要的 jar 包到类路径 + +### IBM Cloud Object Storage 凭证 + +在 IBM Cloud Object Storage 上使用 Hudi 的最简单的办法,就是使用 [Stocator](https://github.com/CODAIT/stocator) 的 Spark 存储连接器为 `SparkSession` 或 `SparkContext` 配置 IBM Cloud Object Storage 凭证。 Hudi 将自动拾取配置并告知 IBM Cloud Object Storage 。 + +或者,向你的 core-site.xml 文件中添加必要的配置,Hudi 可以从那里获取这些配置。用你的 IBM Cloud Object Storage 的 Bucket 名称替换 `fs.defaultFS` 以便 Hudi 能够在 Bucket 中读取/写入。 + +例如,使用 HMAC 密钥以及服务名 `myCOS` : +```xml + + fs.defaultFS + cos://myBucket.myCOS + + + + fs.cos.flat.list + true + + + + fs.stocator.scheme.list + cos + + + + fs.cos.impl + com.ibm.stocator.fs.ObjectStoreFileSystem + + + + fs.stocator.cos.impl + com.ibm.stocator.fs.cos.COSAPIClient + + + + fs.stocator.cos.scheme + cos + + + + fs.cos.myCos.access.key + ACCESS KEY + + + + fs.cos.myCos.endpoint + http://s3-api.us-geo.objectstorage.softlayer.net + + + + fs.cos.myCos.secret.key + SECRET KEY + + +``` + +更多信息请参考 Stocator 
[文档](https://github.com/CODAIT/stocator/blob/master/README.md) 。 + +### IBM Cloud Object Storage 库 + +将 IBM Cloud Object Storage Hadoop 库添加到我们的类路径中: + + - com.ibm.stocator:stocator:1.1.3 diff --git a/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/migration_guide.md b/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/migration_guide.md new file mode 100644 index 0000000000000..c7b61cab4ff5b --- /dev/null +++ b/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/migration_guide.md @@ -0,0 +1,60 @@ +--- +title: 迁移指南 +keywords: [ hudi, migration, use case, 迁移, 用例] +summary: 在本页中,我们将讨论有效的工具,他们能将你的现有数据集迁移到 Hudi 数据集。 +last_modified_at: 2019-12-30T15:59:57-04:00 +language: cn +--- + +Hudi 维护了元数据,包括提交的时间线和索引,来管理一个数据集。提交的时间线帮助理解一个数据集上发生的操作,以及数据集的当前状态。索引则被 Hudi 用来维护记录键到文件 ID 的映射,它能高效地定位一条记录。目前, Hudi 仅支持写 Parquet 列式格式 。 + +为了在你的现有数据集上开始使用 Hudi ,你需要将你的现有数据集迁移到 Hudi 管理的数据集中。以下有多种方法实现这个目的。 + + +## 方法 + + +### 将 Hudi 仅用于新分区 + +Hudi 可以被用来在不影响/改变数据集历史数据的情况下管理一个现有的数据集。 Hudi 已经实现兼容这样的数据集,需要注意的是,单个 Hive 分区要么完全由 Hudi 管理,要么不由 Hudi 管理。因此, Hudi 管理一个数据集的最低粒度是一个 Hive 分区。使用数据源 API 或 WriteClient 来写入数据集,并确保你开始写入的是一个新分区,或者将过去的 N 个分区而非整张表转换为 Hudi 。需要注意的是,由于历史分区不是由 Hudi 管理的, Hudi 提供的任何操作在那些分区上都不生效。更具体地说,无法在这些非 Hudi 管理的旧分区上进行插入更新或增量拉取。 + +如果你的数据集是追加型的数据集,并且你不指望在已经存在的(或者非 Hudi 管理的)分区上进行更新操作,就使用这个方法。 + +### 将现有的数据集转换为 Hudi + +将你的现有数据集导入到一个 Hudi 管理的数据集。由于全部数据都是 Hudi 管理的,方法 1 的任何限制在这里都不适用。跨分区的更新可以被应用到这个数据集,而 Hudi 会高效地让这些更新对查询可用。值得注意的是,你不仅可以在这个数据集上使用所有 Hudi 提供的操作,这样做还有额外的好处。 Hudi 会自动管理受管数据集的文件大小。你可以在转换数据集的时候设置期望的文件大小, Hudi 将确保它写出的文件符合这个配置。Hudi 还会确保小文件在后续被修正,这个过程是通过将新的插入引导到这些小文件而不是写入新的小文件来实现的,这样能维持你的集群的健康度。 + +选择这个方法后,有几种选择。 + +**选择 1** +使用 HDFSParquetImporter 工具。正如名字表明的那样,这仅仅适用于你的现有数据集是 Parquet 文件格式的。 +这个工具本质上是启动一个 Spark 作业来读取现有的 Parquet 数据集,并通过重写全部记录的方式将它转换为 HUDI 管理的数据集。 + +**选择 2** +对于大数据集,这可以简单地: +```java +for partition in [list of partitions in source dataset] { + val inputDF = spark.read.format("any_input_format").load("partition_path") + 
inputDF.write.format("org.apache.hudi").option()....save("basePath") +} +``` + +**选择 3** +写下你自定义的逻辑来定义如何将现有数据集加载到一个 Hudi 管理的数据集中。请在 [这里](/cn/docs/quick-start-guide) 阅读 RDD API 的相关资料。使用 HDFSParquetImporter 工具。一旦 Hudi 通过 `mvn clean install -DskipTests` 被构建了, Shell 将被 `cd hudi-cli && ./hudi-cli.sh` 调启。 + +```java +hudi->hdfsparquetimport + --upsert false + --srcPath /user/parquet/dataset/basepath + --targetPath + /user/hoodie/dataset/basepath + --tableName hoodie_table + --tableType COPY_ON_WRITE + --rowKeyField _row_key + --partitionPathField partitionStr + --parallelism 1500 + --schemaFilePath /user/table/schema + --format parquet + --sparkMemory 6g + --retry 2 +``` diff --git a/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/oss_hoodie.md b/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/oss_hoodie.md new file mode 100644 index 0000000000000..2863d3351c170 --- /dev/null +++ b/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/oss_hoodie.md @@ -0,0 +1,71 @@ +--- +title: OSS Filesystem +keywords: [ hudi, hive, aliyun, oss, spark, presto] +summary: In this page, we go over how to configure Hudi with OSS filesystem. +last_modified_at: 2020-04-21T12:50:50-10:00 +language: cn +--- +这个页面描述了如何让你的Hudi spark任务使用Aliyun OSS存储。 + +## Aliyun OSS 部署 + +为了让Hudi使用OSS,需要增加两部分的配置: + +- 为Hudi增加Aliyun OSS的相关配置 +- 增加Jar包的MVN依赖 + +### Aliyun OSS 相关的配置 + +新增下面的配置到你的Hudi能访问的core-site.xml文件。使用你的OSS bucket name替换掉`fs.defaultFS`,使用OSS endpoint地址替换`fs.oss.endpoint`,使用OSS的key和secret分别替换`fs.oss.accessKeyId`和`fs.oss.accessKeySecret`。这样Hudi就能读写相应的bucket。 + +```xml + + fs.defaultFS + oss://bucketname/ + + + + fs.oss.endpoint + oss-endpoint-address + Aliyun OSS endpoint to connect to. 
+ + + + fs.oss.accessKeyId + oss_key + Aliyun access key ID + + + + fs.oss.accessKeySecret + oss-secret + Aliyun access key secret + + + + fs.oss.impl + org.apache.hadoop.fs.aliyun.oss.AliyunOSSFileSystem + +``` + +### Aliyun OSS Libs + +新增Aliyun hadoop的jar包的MVN依赖到pom.xml文件。由于hadoop-aliyun依赖hadoop 2.9.1+,因此你需要使用hadoop 2.9.1或更新的版本。 + +```xml + + org.apache.hadoop + hadoop-aliyun + 3.2.1 + + + com.aliyun.oss + aliyun-sdk-oss + 3.8.1 + + + org.jdom + jdom + 1.1 + +``` diff --git a/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/overview.md b/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/overview.md new file mode 100644 index 0000000000000..2ae90fe37bd6f --- /dev/null +++ b/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/overview.md @@ -0,0 +1,155 @@ +--- +version: 0.8.0 +title: 概念 +keywords: [ hudi, design, storage, views, timeline] +summary: "Here we introduce some basic concepts & give a broad technical overview of Hudi" +toc: true +last_modified_at: 2019-12-30T15:59:57-04:00 +language: cn +--- + +Apache Hudi(发音为“Hudi”)在DFS的数据集上提供以下流原语 + + * 插入更新 (如何改变数据集?) + * 增量拉取 (如何获取变更的数据?) + +在本节中,我们将讨论重要的概念和术语,这些概念和术语有助于理解并有效使用这些原语。 + +## 时间轴 +在它的核心,Hudi维护一条包含在不同的`即时`时间所有对数据集操作的`时间轴`,从而提供,从不同时间点出发得到不同的视图下的数据集。Hudi即时包含以下组件 + + * `操作类型` : 对数据集执行的操作类型 + * `即时时间` : 即时时间通常是一个时间戳(例如:20190117010349),该时间戳按操作开始时间的顺序单调增加。 + * `状态` : 即时的状态 + +Hudi保证在时间轴上执行的操作的原子性和基于即时时间的时间轴一致性。 + +执行的关键操作包括 + + * `COMMITS` - 一次提交表示将一组记录**原子写入**到数据集中。 + * `CLEANS` - 删除数据集中不再需要的旧文件版本的后台活动。 + * `DELTA_COMMIT` - 增量提交是指将一批记录**原子写入**到MergeOnRead存储类型的数据集中,其中一些/所有数据都可以只写到增量日志中。 + * `COMPACTION` - 协调Hudi中差异数据结构的后台活动,例如:将更新从基于行的日志文件变成列格式。在内部,压缩表现为时间轴上的特殊提交。 + * `ROLLBACK` - 表示提交/增量提交不成功且已回滚,删除在写入过程中产生的所有部分文件。 + * `SAVEPOINT` - 将某些文件组标记为"已保存",以便清理程序不会将其删除。在发生灾难/数据恢复的情况下,它有助于将数据集还原到时间轴上的某个点。 + +任何给定的即时都可以处于以下状态之一 + + * `REQUESTED` - 表示已调度但尚未启动的操作。 + * `INFLIGHT` - 表示当前正在执行该操作。 + * `COMPLETED` - 表示在时间轴上完成了该操作。 + +
+ hudi_timeline.png +
+ +上面的示例显示了在Hudi数据集上大约10:00到10:20之间发生的更新事件,大约每5分钟一次,将提交元数据以及其他后台清理/压缩保留在Hudi时间轴上。 +观察的关键点是:提交时间指示数据的`到达时间`(上午10:20),而实际数据组织则反映了实际时间或`事件时间`,即数据所反映的(从07:00开始的每小时时段)。在权衡数据延迟和完整性时,这是两个关键概念。 + +如果有延迟到达的数据(事件时间为9:00的数据在10:20达到,延迟 >1 小时),我们可以看到upsert将新数据生成到更旧的时间段/文件夹中。 +在时间轴的帮助下,增量查询可以只提取10:00以后成功提交的新数据,并非常高效地只消费更改过的文件,且无需扫描更大的文件范围,例如07:00后的所有时间段。 + +## 文件组织 +Hudi将DFS上的数据集组织到`基本路径`下的目录结构中。数据集分为多个分区,这些分区是包含该分区的数据文件的文件夹,这与Hive表非常相似。 +每个分区被相对于基本路径的特定`分区路径`区分开来。 + +在每个分区内,文件被组织为`文件组`,由`文件id`唯一标识。 +每个文件组包含多个`文件切片`,其中每个切片包含在某个提交/压缩即时时间生成的基本列文件(`*.parquet`)以及一组日志文件(`*.log*`),该文件包含自生成基本文件以来对基本文件的插入/更新。 +Hudi采用MVCC设计,其中压缩操作将日志和基本文件合并以产生新的文件片,而清理操作则将未使用的/较旧的文件片删除以回收DFS上的空间。 + +Hudi通过索引机制将给定的hoodie键(记录键+分区路径)映射到文件组,从而提供了高效的Upsert。 +一旦将记录的第一个版本写入文件,记录键和文件组/文件id之间的映射就永远不会改变。 简而言之,映射的文件组包含一组记录的所有版本。 + +## 存储类型和视图 +Hudi存储类型定义了如何在DFS上对数据进行索引和布局以及如何在这种组织之上实现上述原语和时间轴活动(即如何写入数据)。 +反过来,`视图`定义了基础数据如何暴露给查询(即如何读取数据)。 + +| 存储类型 | 支持的视图 | +|-------------- |------------------| +| 写时复制 | 近实时 + 增量 | +| 读时合并 | 近实时 + 增量 + 读优化 | + +### 存储类型 +Hudi支持以下存储类型。 + + - [写时复制](#copy-on-write-storage) : 仅使用列文件格式(例如parquet)存储数据。通过在写入过程中执行同步合并以更新版本并重写文件。 + + - [读时合并](#merge-on-read-storage) : 使用列式(例如parquet)+ 基于行(例如avro)的文件格式组合来存储数据。 更新记录到增量文件中,然后进行同步或异步压缩以生成列文件的新版本。 + +下表总结了这两种存储类型之间的权衡 + +| 权衡 | 写时复制 | 读时合并 | +|-------------- |------------------| ------------------| +| 数据延迟 | 更高 | 更低 | +| 更新代价(I/O) | 更高(重写整个parquet文件) | 更低(追加到增量日志) | +| Parquet文件大小 | 更小(高更新代价(I/o)) | 更大(低更新代价) | +| 写放大 | 更高 | 更低(取决于压缩策略) | + + +### 视图 +Hudi支持以下存储数据的视图 + + - **读优化视图** : 在此视图上的查询将查看给定提交或压缩操作中数据集的最新快照。 + 该视图仅将最新文件切片中的基本/列文件暴露给查询,并保证与非Hudi列式数据集相比,具有相同的列式查询性能。 + - **增量视图** : 对该视图的查询只能看到从某个提交/压缩后写入数据集的新数据。该视图有效地提供了更改流,来支持增量数据管道。 + - **实时视图** : 在此视图上的查询将查看某个增量提交操作中数据集的最新快照。该视图通过动态合并最新的基本文件(例如parquet)和增量文件(例如avro)来提供近实时数据集(几分钟的延迟)。 + + +下表总结了不同视图之间的权衡。 + +| 权衡 | 读优化 | 实时 | +|-------------- |------------------| ------------------| +| 数据延迟 | 更高 | 更低 | +| 查询延迟 | 更低(原始列式性能)| 更高(合并列式 + 基于行的增量) | + + +## 写时复制存储 {#copy-on-write-storage} + 
+写时复制存储中的文件片仅包含基本/列文件,并且每次提交都会生成新版本的基本文件。 +换句话说,我们压缩每个提交,从而所有的数据都是以列数据的形式储存。在这种情况下,写入数据非常昂贵(我们需要重写整个列数据文件,即使只有一个字节的新数据被提交),而读取数据的成本则没有增加。 +这种视图有利于读取繁重的分析工作。 + +以下内容说明了将数据写入写时复制存储并在其上运行两个查询时,它是如何工作的。 + +
+ hudi_cow.png +
+ + +随着数据的写入,对现有文件组的更新将为该文件组生成一个带有提交即时时间标记的新切片,而插入分配一个新文件组并写入该文件组的第一个切片。 +这些文件切片及其提交即时时间在上面用颜色编码。 +针对这样的数据集运行SQL查询(例如:`select count(*)`统计该分区中的记录数目),首先检查时间轴上的最新提交并过滤每个文件组中除最新文件片以外的所有文件片。 +如您所见,旧查询不会看到以粉红色标记的当前进行中的提交的文件,但是在该提交后的新查询会获取新数据。因此,查询不受任何写入失败/部分写入的影响,仅运行在已提交数据上。 + +写时复制存储的目的是从根本上改善当前管理数据集的方式,通过以下方法来实现 + + - 优先支持在文件级原子更新数据,而无需重写整个表/分区 + - 能够只读取更新的部分,而不是进行低效的扫描或搜索 + - 严格控制文件大小来保持出色的查询性能(小的文件会严重损害查询性能)。 + +## 读时合并存储 {#merge-on-read-storage} + +读时合并存储是写时复制的升级版,从某种意义上说,它仍然可以通过读优化表提供数据集的读取优化视图(写时复制的功能)。 +此外,它将每个文件组的更新插入存储到基于行的增量日志中,通过文件id,将增量日志和最新版本的基本文件进行合并,从而提供近实时的数据查询。因此,此存储类型智能地平衡了读和写的成本,以提供近乎实时的查询。 +这里最重要的一点是压缩器,它现在可以仔细挑选需要压缩到其列式基础文件中的增量日志(根据增量日志的文件大小),以保持查询性能(较大的增量日志将会提升近实时的查询时间,并同时需要更长的合并时间)。 + +以下内容说明了存储的工作方式,并显示了对近实时表和读优化表的查询。 + +
+ hudi_mor.png +
+ +此示例中发生了很多有趣的事情,这些带出了该方法的微妙之处。 + + - 现在,我们每1分钟左右就有一次提交,这是其他存储类型无法做到的。 + - 现在,在每个文件id组中,都有一个增量日志,其中包含对基础列文件中记录的更新。 + 在示例中,增量日志包含10:05至10:10的所有数据。与以前一样,基本列式文件仍使用提交进行版本控制。 + 因此,如果只看一眼基本文件,那么存储布局看起来就像是写时复制表的副本。 + - 定期压缩过程会从增量日志中合并这些更改,并生成基础文件的新版本,就像示例中10:05发生的情况一样。 + - 有两种查询同一存储的方式:读优化(RO)表和近实时(RT)表,具体取决于我们选择查询性能还是数据新鲜度。 + - 对于RO表来说,提交数据在何时可用于查询将有些许不同。 请注意,以10:10运行的(在RO表上的)此类查询将不会看到10:05之后的数据,而在RT表上的查询总会看到最新的数据。 + - 何时触发压缩以及压缩什么是解决这些难题的关键。 + 通过实施压缩策略,在该策略中,与较旧的分区相比,我们会积极地压缩最新的分区,从而确保RO表能够以一致的方式看到几分钟内发布的数据。 + +读时合并存储上的目的是直接在DFS上启用近实时处理,而不是将数据复制到专用系统,后者可能无法处理大数据量。 +该存储还有一些其他方面的好处,例如通过避免数据的同步合并来减少写放大,即批量数据中每1字节数据需要的写入数据量。 diff --git a/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/performance.md b/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/performance.md new file mode 100644 index 0000000000000..25a1c7589ef3d --- /dev/null +++ b/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/performance.md @@ -0,0 +1,62 @@ +--- +title: 性能 +keywords: [ hudi, index, storage, compaction, cleaning, implementation] +toc: false +last_modified_at: 2019-12-30T15:59:57-04:00 +language: cn +--- + +在本节中,我们将介绍一些有关Hudi插入更新、增量提取的实际性能数据,并将其与实现这些任务的其它传统工具进行比较。 + +## 插入更新 + +下面显示了从NoSQL数据库摄取获得的速度提升,这些速度提升数据是通过在写入时复制存储上的Hudi数据集上插入更新而获得的, +数据集包括5个从小到大的表(相对于批量加载表)。 + +
+ hudi_upsert_perf1.png +
+ +由于Hudi可以通过增量构建数据集,它也为更频繁地调度摄取提供了可能性,从而减少了延迟,并显著节省了总体计算成本。 + +
+ hudi_upsert_perf2.png +
+ +Hudi插入更新在t1表的一次提交中就进行了高达4TB的压力测试。 +有关一些调优技巧,请参见[这里](https://cwiki.apache.org/confluence/display/HUDI/Tuning+Guide)。 + +## 索引 + +为了有效地插入更新数据,Hudi需要将要写入的批量数据中的记录分类为插入和更新(并标记它所属的文件组)。 +为了加快此操作的速度,Hudi采用了可插拔索引机制,该机制存储了recordKey和它所属的文件组ID之间的映射。 +默认情况下,Hudi使用内置索引,该索引使用文件范围和布隆过滤器来完成此任务,相比于Spark Join,其速度最高可提高10倍。 + +当您将recordKey建模为单调递增时(例如时间戳前缀),Hudi提供了最佳的索引性能,从而进行范围过滤来避免与许多文件进行比较。 +即使对于基于UUID的键,也有[已知技术](https://www.percona.com/blog/2014/12/19/store-uuid-optimized-way/)来达到同样目的。 +例如,在具有80B键、3个分区、11416个文件、10TB数据的事件表上使用100M个时间戳前缀的键(5%的更新,95%的插入)时, +相比于原始Spark Join,Hudi索引速度的提升**约为7倍(440秒相比于2880秒)**。 +即使对于具有挑战性的工作负载,如使用300个核对3.25B UUID键、30个分区、6180个文件的“100%更新”的数据库摄取工作负载,Hudi索引也可以提供**80-100%的加速**。 + +## 读优化查询 + +读优化视图的主要设计目标是在不影响查询的情况下实现上一节中提到的延迟减少和效率提高。 +下图比较了对Hudi和非Hudi数据集的Hive、Presto、Spark查询,并对此进行说明。 + +**Hive** + +
+ hudi_query_perf_hive.png +
+ +**Spark** + +
+ hudi_query_perf_spark.png +
+ +**Presto** + +
+ hudi_query_perf_presto.png +
diff --git a/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/powered_by.md b/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/powered_by.md new file mode 100644 index 0000000000000..3d0c3c3a653d4 --- /dev/null +++ b/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/powered_by.md @@ -0,0 +1,88 @@ +--- +title: 演讲 & Hudi 用户 +keywords: [ hudi, talks, presentation] +last_modified_at: 2019-12-31T15:59:57-04:00 +language: cn +--- + +## 已使用 + +### Uber + +Hudi最初由[Uber](https://uber.com)开发,用于实现[低延迟、高效率的数据库摄取](http://www.slideshare.net/vinothchandar/hadoop-strata-talk-uber-your-hadoop-has-arrived/32)。 +Hudi自2016年8月开始在生产环境上线,在Hadoop上驱动约100个非常关键的业务表,支撑约几百TB的数据规模(前10名包括行程、乘客、司机)。 +Hudi还支持几个增量的Hive ETL管道,并且目前已集成到Uber的数据分发系统中。 + +### EMIS Health + +[EMIS Health](https://www.emishealth.com/)是英国最大的初级保健IT软件提供商,其数据集包括超过5000亿的医疗保健记录。HUDI用于管理生产中的分析数据集,并使其与上游源保持同步。Presto用于查询以HUDI格式写入的数据。 + +### Yields.io + +[Yields.io](https://www.yields.io/Blog/Apache-Hudi-at-Yields)是第一个使用AI在企业范围内进行自动模型验证和实时监控的金融科技平台。他们的数据湖由Hudi管理,他们还积极使用Hudi为增量式、跨语言/平台机器学习构建基础架构。 + +### Yotpo + +Hudi在Yotpo有不少用途。首先,在他们的[开源ETL框架](https://github.com/YotpoLtd/metorikku)中集成了Hudi作为CDC管道的输出写入程序,即从数据库binlog生成的事件流到Kafka然后再写入S3。 + +## 演讲 & 报告 + +1. ["Hoodie: Incremental processing on Hadoop at Uber"](https://conferences.oreilly.com/strata/strata-ca/public/schedule/detail/56511) - By Vinoth Chandar & Prasanna Rajaperumal + Mar 2017, Strata + Hadoop World, San Jose, CA + +2. ["Hoodie: An Open Source Incremental Processing Framework From Uber"](http://www.dataengconf.com/hoodie-an-open-source-incremental-processing-framework-from-uber) - By Vinoth Chandar. + Apr 2017, DataEngConf, San Francisco, CA [Slides](https://www.slideshare.net/vinothchandar/hoodie-dataengconf-2017) [Video](https://www.youtube.com/watch?v=7Wudjc-v7CA) + +3. 
["Incremental Processing on Large Analytical Datasets"](https://spark-summit.org/2017/events/incremental-processing-on-large-analytical-datasets/) - By Prasanna Rajaperumal + June 2017, Spark Summit 2017, San Francisco, CA. [Slides](https://www.slideshare.net/databricks/incremental-processing-on-large-analytical-datasets-with-prasanna-rajaperumal-and-vinoth-chandar) [Video](https://www.youtube.com/watch?v=3HS0lQX-cgo&feature=youtu.be) + +4. ["Hudi: Unifying storage and serving for batch and near-real-time analytics"](https://conferences.oreilly.com/strata/strata-ny/public/schedule/detail/70937) - By Nishith Agarwal & Balaji Vardarajan + September 2018, Strata Data Conference, New York, NY + +5. ["Hudi: Large-Scale, Near Real-Time Pipelines at Uber"](https://databricks.com/session/hudi-near-real-time-spark-pipelines-at-petabyte-scale) - By Vinoth Chandar & Nishith Agarwal + October 2018, Spark+AI Summit Europe, London, UK + +6. ["Powering Uber's global network analytics pipelines in real-time with Apache Hudi"](https://www.youtube.com/watch?v=1w3IpavhSWA) - By Ethan Guo & Nishith Agarwal, April 2019, Data Council SF19, San Francisco, CA. + +7. ["Building highly efficient data lakes using Apache Hudi (Incubating)"](https://www.slideshare.net/ChesterChen/sf-big-analytics-20190612-building-highly-efficient-data-lakes-using-apache-hudi) - By Vinoth Chandar + June 2019, SF Big Analytics Meetup, San Mateo, CA + +8. ["Apache Hudi (Incubating) - The Past, Present and Future Of Efficient Data Lake Architectures"](https://docs.google.com/presentation/d/1FHhsvh70ZP6xXlHdVsAI0g__B_6Mpto5KQFlZ0b8-mM) - By Vinoth Chandar & Balaji Varadarajan + September 2019, ApacheCon NA 19, Las Vegas, NV, USA + +9. 
["Insert, upsert, and delete data in Amazon S3 using Amazon EMR"](https://www.portal.reinvent.awsevents.com/connect/sessionDetail.ww?SESSION_ID=98662&csrftkn=YS67-AG7B-QIAV-ZZBK-E6TT-MD4Q-1HEP-747P) - By Paul Codding & Vinoth Chandar + December 2019, AWS re:Invent 2019, Las Vegas, NV, USA + +10. ["Building Robust CDC Pipeline With Apache Hudi And Debezium"](https://www.slideshare.net/SyedKather/building-robust-cdc-pipeline-with-apache-hudi-and-debezium) - By Pratyaksh, Purushotham, Syed and Shaik December 2019, Hadoop Summit Bangalore, India + +11. ["Using Apache Hudi to build the next-generation data lake and its application in medical big data"](https://drive.google.com/open?id=1dmH2kWJF69PNdifPp37QBgjivOHaSLDn) - By JingHuang & Leesf March 2020, Apache Hudi & Apache Kylin Online Meetup, China + +12. ["Building a near real-time, high-performance data warehouse based on Apache Hudi and Apache Kylin"](https://drive.google.com/open?id=1Pk_WdFxfEZxMMfAOn0R8-m3ALkcN6G9e) - By ShaoFeng Shi March 2020, Apache Hudi & Apache Kylin Online Meetup, China + +13. ["Building large scale, transactional data lakes using Apache Hudi"](https://berlinbuzzwords.de/session/building-large-scale-transactional-data-lakes-using-apache-hudi) - By Nishith Agarwal, June 2020, Berlin Buzzwords 2020. + +14. ["Apache Hudi - Design/Code Walkthrough Session for Contributors"](https://www.youtube.com/watch?v=N2eDfU_rQ_U) - By Vinoth Chandar, July 2020, Hudi community. + +15. ["PrestoDB and Apache Hudi"](https://youtu.be/nA3rwOdmm3A) - By Bhavani Sudha Saktheeswaran and Brandon Scheller, Aug 2020, PrestoDB Community Meetup. + +16. ["Panel Discussion on Presto Ecosystem"](https://www.youtube.com/watch?v=lsFSM2Z4kPs) - By Vinoth Chandar, Sep 2020, PrestoCon ["panel"](https://prestocon2020.sched.com/event/dgyw). + +17. 
["Next Generation Data lakes using Apache Hudi"](https://docs.google.com/presentation/d/1y-ryRwCdTbqQHGr_bn3lxM_B8L1L5nsZOIXlJsDl_wU/edit?usp=sharing) - By Balaji Varadarajan and Sivabalan Narayanan, Sep 2020, ["ApacheCon"](https://www.apachecon.com/) + +18. ["Landing practice of Apache Hudi in T3go"](https://drive.google.com/file/d/1ULVPkjynaw-07wsutLcZm-4rVXf8E8N8/view?usp=sharing) - By VinoYang and XianghuWang, November 2020, Qcon. +## 文章 + +You can check out [our blog pages](https://hudi.apache.org/blog) for content written by our committers/contributors. + +1. ["The Case for incremental processing on Hadoop"](https://www.oreilly.com/ideas/ubers-case-for-incremental-processing-on-hadoop) - O'reilly Ideas article by Vinoth Chandar +2. ["Hoodie: Uber Engineering's Incremental Processing Framework on Hadoop"](https://eng.uber.com/hoodie/) - Engineering Blog By Prasanna Rajaperumal +3. ["New – Insert, Update, Delete Data on S3 with Amazon EMR and Apache Hudi"](https://aws.amazon.com/blogs/aws/new-insert-update-delete-data-on-s3-with-amazon-emr-and-apache-hudi/) - AWS Blog by Danilo Poccia +4. ["The Apache Software Foundation Announces Apache® Hudi™ as a Top-Level Project"](https://blogs.apache.org/foundation/entry/the-apache-software-foundation-announces64) - ASF Graduation announcement +5. ["Apache Hudi grows cloud data lake maturity"](https://searchdatamanagement.techtarget.com/news/252484740/Apache-Hudi-grows-cloud-data-lake-maturity) +6. ["Building a Large-scale Transactional Data Lake at Uber Using Apache Hudi"](https://eng.uber.com/apache-hudi-graduation/) - Uber eng blog by Nishith Agarwal +7. ["Hudi On Hops"](https://www.diva-portal.org/smash/get/diva2:1413103/FULLTEXT01.pdf) - By NETSANET GEBRETSADKAN KIDANE +8. ["开源数据湖存储框架 Apache Hudi 如何玩转增量处理"](https://www.infoq.cn/article/CAgIDpfJBVcJHKJLSbhe) - InfoQ CN article by Yanghua +9. 
["Origins of Data Lake at Grofers"](https://lambda.grofers.com/origins-of-data-lake-at-grofers-6c011f94b86c) - by Akshay Agarwal +10. ["Data Lake Change Capture using Apache Hudi & Amazon AMS/EMR"](https://towardsdatascience.com/data-lake-change-data-capture-cdc-using-apache-hudi-on-amazon-emr-part-2-process-65e4662d7b4b) - Towards DataScience article, Oct 20 +11. ["How nClouds Helps Accelerate Data Delivery with Apache Hudi on Amazon EMR"](https://aws.amazon.com/blogs/apn/how-nclouds-helps-accelerate-data-delivery-with-apache-hudi-on-amazon-emr/) - published by nClouds in partnership with AWS +12. ["Apply record level changes from relational databases to Amazon S3 data lake using Apache Hudi on Amazon EMR and AWS Database Migration Service"](https://aws.amazon.com/blogs/big-data/apply-record-level-changes-from-relational-databases-to-amazon-s3-data-lake-using-apache-hudi-on-amazon-emr-and-aws-database-migration-service/) - AWS blog diff --git a/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/privacy.md b/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/privacy.md new file mode 100644 index 0000000000000..afa167d37ab74 --- /dev/null +++ b/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/privacy.md @@ -0,0 +1,23 @@ +--- +title: 隐私协议 +keywords: [ hudi, privacy, 隐私] +last_modified_at: 2019-12-30T15:59:57-04:00 +language: cn +--- + +关于你使用本网站的信息,将通过服务器访问日志和 Cookie 跟踪被收集。 +收集的信息由以下内容构成: + +* 你访问网站使用的 IP 地址; +* 你访问我们的网站时使用的浏览器和操作系统; +* 你访问我们的网站的日期和时间; +* 你浏览的页面; +* 引导你链接到我们的网站的页面地址; + +这些信息中的一部分将使用由 [Google Analytics](http://www.google.com/analytics) 服务设置的 Cookie 跟踪进行收集,并由 Google 按照在他们的 [隐私协议](http://www.google.com/privacy) 中描述的方式进行处理。如果你不希望与 Google 分享这些数据,请参考你的浏览器文档中关于如何禁用 Cookie 的说明。 + +我们使用收集的数据来帮助让我们的网站对访问者更有用,并更好地了解我们的网站是如何、在何时被使用的。我们不跟踪也不收集个人隐私信息,同时也不与任何收集包含个人隐私数据的数据源合作。 + +使用本网站,即代表你许可以上述的方式和目的收集这些数据。 + +Hudi 开发者社区欢迎你提出关于本隐私协议的问题或评论。请将他们发送至 dev@hudi.apache.org 。 diff --git 
a/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/querying_data.md b/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/querying_data.md new file mode 100644 index 0000000000000..cd78c33155823 --- /dev/null +++ b/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/querying_data.md @@ -0,0 +1,224 @@ +--- +title: 查询 Hudi 数据集 +keywords: [ hudi, hive, spark, sql, presto] +summary: In this page, we go over how to enable SQL queries on Hudi built tables. +toc: true +last_modified_at: 2019-12-30T15:59:57-04:00 +language: cn +--- + +从概念上讲,Hudi物理存储一次数据到DFS上,同时在其上提供三个逻辑视图,如[之前](/cn/docs/concepts#views)所述。 +数据集同步到Hive Metastore后,它将提供由Hudi的自定义输入格式支持的Hive外部表。一旦提供了适当的Hudi捆绑包, +就可以通过Hive、Spark和Presto之类的常用查询引擎来查询数据集。 + +具体来说,在写入过程中传递了两个由[table name](/cn/docs/configurations#TABLE_NAME_OPT_KEY)命名的Hive表。 +例如,如果`table name = hudi_tbl`,我们得到 + + - `hudi_tbl` 实现了由 `HoodieParquetInputFormat` 支持的数据集的读优化视图,从而提供了纯列式数据。 + - `hudi_tbl_rt` 实现了由 `HoodieParquetRealtimeInputFormat` 支持的数据集的实时视图,从而提供了基础数据和日志数据的合并视图。 + +如概念部分所述,[增量处理](https://www.oreilly.com/ideas/ubers-case-for-incremental-processing-on-hadoop)所需要的 +一个关键原语是`增量拉取`(以从数据集中获取更改流/日志)。您可以增量提取Hudi数据集,这意味着自指定的即时时间起, +您可以只获得全部更新和新行。 这与插入更新一起使用,对于构建某些数据管道尤其有用,包括将1个或多个源Hudi表(数据流/事实)以增量方式拉出(流/事实) +并与其他表(数据集/维度)结合以[写出增量](/cn/docs/writing_data)到目标Hudi数据集。增量视图是通过查询上表之一实现的,并具有特殊配置, +该特殊配置指示查询计划仅需要从数据集中获取增量数据。 + + +## 查询引擎支持列表 + +下面的表格展示了各查询引擎是否支持Hudi格式 + +### 读优化表 + +|查询引擎|实时视图|增量拉取| +|------------|--------|-----------| +|**Hive**|Y|Y| +|**Spark SQL**|Y|Y| +|**Spark Datasource**|Y|Y| +|**Flink SQL**|Y|N| +|**PrestoDB**|Y|N| +|**Impala**|Y|N| + + +### 实时表 + +|查询引擎|实时视图|增量拉取|读优化表| +|------------|--------|-----------|--------------| +|**Hive**|Y|Y|Y| +|**Spark SQL**|Y|Y|Y| +|**Spark Datasource**|Y|Y|Y| +|**Flink SQL**|Y|Y|Y| +|**PrestoDB**|Y|N|Y| +|**Impala**|N|N|Y| + + +接下来,我们将详细讨论在每个查询引擎上如何访问所有三个视图。 + +## Hive + +为了使Hive能够识别Hudi数据集并正确查询, 
+HiveServer2需要在其[辅助jars路径](https://www.cloudera.com/documentation/enterprise/5-6-x/topics/cm_mc_hive_udf#concept_nc3_mms_lr)中提供`hudi-hadoop-mr-bundle-x.y.z-SNAPSHOT.jar`。 +这将确保输入格式类及其依赖项可用于查询计划和执行。 + +### 读优化表 {#hive-ro-view} +除了上述设置之外,对于beeline cli访问,还需要将`hive.input.format`变量设置为`org.apache.hudi.hadoop.HoodieParquetInputFormat`输入格式的完全限定路径名。 +对于Tez,还需要将`hive.tez.input.format`设置为`org.apache.hadoop.hive.ql.io.HiveInputFormat`。 + +### 实时表 {#hive-rt-view} +除了在HiveServer2上安装Hive捆绑jars之外,还需要将其放在整个集群的hadoop/hive安装中,这样查询也可以使用自定义RecordReader。 + +### 增量拉取 {#hive-incr-pull} + +`HiveIncrementalPuller`允许通过HiveQL从大型事实/维表中增量提取更改, +结合了Hive(可靠地处理复杂的SQL查询)和增量原语的好处(通过增量拉取而不是完全扫描来加快查询速度)。 +该工具使用Hive JDBC运行hive查询并将其结果保存在临时表中,这个表可以被插入更新。 +Upsert实用程序(`HoodieDeltaStreamer`)具有目录结构所需的所有状态,以了解目标表上的提交时间应为多少。 +例如:`/app/incremental-hql/intermediate/{source_table_name}_temp/{last_commit_included}`。 +已注册的Delta Hive表的格式为`{tmpdb}.{source_table}_{last_commit_included}`。 + +以下是HiveIncrementalPuller的配置选项 + +| **配置** | **描述** | **默认值** | +|-------|--------|--------| +|hiveUrl| 要连接的Hive Server 2的URL | | +|hiveUser| Hive Server 2 用户名 | | +|hivePass| Hive Server 2 密码 | | +|queue| YARN 队列名称 | | +|tmp| DFS中存储临时增量数据的目录。目录结构将遵循约定。请参阅以下部分。 | | +|extractSQLFile| 在源表上要执行的提取数据的SQL。提取的数据将是自特定时间点以来已更改的所有行。 | | +|sourceTable| 源表名称。在Hive环境属性中需要设置。 | | +|targetTable| 目标表名称。中间存储目录结构需要。 | | +|sourceDataPath| 源DFS基本路径。这是读取Hudi元数据的地方。 | | +|targetDataPath| 目标DFS基本路径。 这是计算fromCommitTime所必需的。 如果显式指定了fromCommitTime,则不需要设置这个参数。 | | +|tmpdb| 用来创建中间临时增量表的数据库 | hoodie_temp | +|fromCommitTime| 这是最重要的参数。 这是从中提取更改的记录的时间点。 | | +|maxCommits| 要包含在拉取中的提交数。将此设置为-1将包括从fromCommitTime开始的所有提交。将此设置为大于0的值,将包括在fromCommitTime之后仅更改指定提交次数的记录。如果您需要一次赶上两次提交,则可能需要这样做。| 3 | +|help| 实用程序帮助 | | + + +设置fromCommitTime=0和maxCommits=-1将提取整个源数据集,可用于启动Backfill。 +如果目标数据集是Hudi数据集,则该实用程序可以确定目标数据集是否没有提交或延迟超过24小时(这是可配置的), +它将自动使用Backfill配置,因为增量应用最近24小时的更改会比Backfill花费更多的时间。 +该工具当前的局限性在于缺乏在混合模式(正常模式和增量模式)下自联接同一表的支持。 + +**关于使用Fetch任务执行的Hive查询的说明:** 
+由于Fetch任务为每个分区调用InputFormat.listStatus(),每个listStatus()调用都会列出Hoodie元数据。 +为了避免这种情况,如下操作可能是有用的,即使用Hive session属性对增量查询禁用Fetch任务: +`set hive.fetch.task.conversion = none;`。这将确保Hive查询使用Map Reduce执行, +合并分区(用逗号分隔),并且对所有这些分区仅调用一次InputFormat.listStatus()。 + +## Spark + +Spark可将Hudi jars和捆绑包轻松部署和管理到作业/笔记本中。简而言之,通过Spark有两种方法可以访问Hudi数据集。 + + - **Hudi DataSource**:支持实时视图,读取优化和增量拉取,类似于标准数据源(例如:`spark.read.parquet`)的工作方式。 + - **以Hive表读取**:支持所有三个视图,包括实时视图,依赖于自定义的Hudi输入格式(再次类似Hive)。 + +通常,您的spark作业需要依赖`hudi-spark`或`hudi-spark-bundle-x.y.z.jar`, +它们必须位于驱动程序和执行程序的类路径上(提示:使用`--jars`参数)。 + +### 读优化表 {#spark-ro-view} + +要使用SparkSQL将RO表读取为Hive表,只需按如下所示将路径过滤器推入sparkContext。 +对于Hudi表,该方法保留了Spark内置的读取Parquet文件的优化功能,例如进行矢量化读取。 + +```scala +spark.sparkContext.hadoopConfiguration.setClass("mapreduce.input.pathFilter.class", classOf[org.apache.hudi.hadoop.HoodieROTablePathFilter], classOf[org.apache.hadoop.fs.PathFilter]); +``` + +如果您希望通过数据源在DFS上使用全局路径,则只需执行以下类似操作即可得到Spark DataFrame。 + +```scala +Dataset hoodieROViewDF = spark.read().format("org.apache.hudi") +// pass any path glob, can include hudi & non-hudi datasets +.load("/glob/path/pattern"); +``` + +### 实时表 {#spark-rt-view} +将实时表在Spark中作为Hive表进行查询,设置`spark.sql.hive.convertMetastoreParquet = false`, +迫使Spark回退到使用Hive Serde读取数据(计划/执行仍然是Spark)。 + +```scala +$ spark-shell --jars hudi-spark-bundle-x.y.z-SNAPSHOT.jar --driver-class-path /etc/hive/conf --packages com.databricks:spark-avro_2.11:4.0.0 --conf spark.sql.hive.convertMetastoreParquet=false --num-executors 10 --driver-memory 7g --executor-memory 2g --master yarn-client + +scala> sqlContext.sql("select count(*) from hudi_rt where datestr = '2016-10-02'").show() +``` + +如果您希望通过数据源在DFS上使用全局路径,则只需执行以下类似操作即可得到Spark DataFrame。 + +```scala +Dataset hoodieRealtimeViewDF = spark.read().format("org.apache.hudi") +// pass any path glob, can include hudi & non-hudi datasets +.load("/glob/path/pattern"); +``` + +如果您希望只查询实时表的读优化视图 + +```scala +Dataset hoodieRealtimeViewDF = 
spark.read().format("org.apache.hudi") +.option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY, DataSourceReadOptions.QUERY_TYPE_READ_OPTIMIZED_OPT_VAL) +// pass any path glob, can include hudi & non-hudi datasets +.load("/glob/path/pattern"); +``` + +### 增量拉取 {#spark-incr-pull} +`hudi-spark`模块提供了DataSource API,这是一种从Hudi数据集中提取数据并通过Spark处理数据的更优雅的方法。 +如下所示是一个示例增量拉取,它将获取自`beginInstantTime`以来写入的所有记录。 + +```java + Dataset hoodieIncViewDF = spark.read() + .format("org.apache.hudi") + .option(DataSourceReadOptions.VIEW_TYPE_OPT_KEY(), + DataSourceReadOptions.VIEW_TYPE_INCREMENTAL_OPT_VAL()) + .option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY(), + <beginInstantTime>) + .option(DataSourceReadOptions.INCR_PATH_GLOB_OPT_KEY(), + "/year=2020/month=*/day=*") // 可选,从指定的分区增量拉取 + .load(tablePath); // 用数据集的最底层路径 +``` + +请参阅[设置](/cn/docs/configurations#spark-datasource)部分,以查看所有数据源选项。 + +另外,`HoodieReadClient`通过Hudi的隐式索引提供了以下功能。 + +| **API** | **描述** | +|-------|--------| +| read(keys) | 使用Hudi自己的索引通过快速查找将与键对应的数据作为DataFrame读出 | +| filterExists() | 从提供的RDD[HoodieRecord]中过滤出已经存在的记录。对删除重复数据有用 | +| checkExists(keys) | 检查提供的键是否存在于Hudi数据集中 | + + +## PrestoDB + +PrestoDB是一种常用的查询引擎,可提供交互式查询性能。 Hudi RO表可以在Presto中无缝查询。 +这需要在整个安装过程中将`hudi-presto-bundle` jar放入`<presto_install>/plugin/hive-hadoop2/`中。 + +## Impala (3.4 or later) + +### 读优化表 + +Impala可以在HDFS上查询Hudi读优化表,作为一种 [EXTERNAL TABLE](https://docs.cloudera.com/documentation/enterprise/6/6.3/topics/impala_tables#external_tables) 的形式。 +可以通过以下方式在Impala上建立Hudi读优化表: +``` +CREATE EXTERNAL TABLE database.table_name +LIKE PARQUET '/path/to/load/xxx.parquet' +STORED AS HUDIPARQUET +LOCATION '/path/to/load'; +``` +Impala可以利用合理的文件分区来提高查询的效率。 +如果想要建立分区的表,文件夹命名需要根据此种方式`year=2020/month=1`. +Impala使用`=`来区分分区名和分区值. 
+可以通过以下方式在Impala上建立分区Hudi读优化表: +``` +CREATE EXTERNAL TABLE database.table_name +LIKE PARQUET '/path/to/load/xxx.parquet' +PARTITIONED BY (year int, month int, day int) +STORED AS HUDIPARQUET +LOCATION '/path/to/load'; +ALTER TABLE database.table_name RECOVER PARTITIONS; +``` +在Hudi成功写入一个新的提交后, 刷新Impala表来得到最新的结果. +``` +REFRESH database.table_name +``` + diff --git a/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/s3_hoodie.md b/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/s3_hoodie.md new file mode 100644 index 0000000000000..1e0b3293c2ea0 --- /dev/null +++ b/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/s3_hoodie.md @@ -0,0 +1,81 @@ +--- +title: S3 文件系统 +keywords: [ hudi, hive, aws, s3, spark, presto] +summary: 在本页中,我们将讨论如何在 S3 文件系统中配置 Hudi 。 +last_modified_at: 2019-12-30T15:59:57-04:00 +language: cn +--- +在本页中,我们将解释如何让你的 Hudi Spark 作业存储到 AWS S3 。 + +## AWS 配置 + + +Hudi 与 S3 的适配需要两项配置: + +- 为 Hudi 加 AWS 凭证 +- 将需要的 jar 包添加到类路径 + +### AWS 凭证 + +在 S3 上使用 Hudi 的最简单的办法,是为你的 `SparkSession` 或 `SparkContext` 设置 S3 凭证。 Hudi 将自动拾取并通知 S3 。 + +或者,将需要的配置添加到你的 core-site.xml 文件中, Hudi 可以从那里获取它们。用你的 S3 Bucket 名称替换 `fs.defaultFS` ,之后 Hudi 应该能够从 Bucket 中读取/写入. 
+ +```xml + <property> + <name>fs.defaultFS</name> + <value>s3://ysharma</value> + </property> + + <property> + <name>fs.s3.impl</name> + <value>org.apache.hadoop.fs.s3native.NativeS3FileSystem</value> + </property> + + <property> + <name>fs.s3.awsAccessKeyId</name> + <value>AWS_KEY</value> + </property> + + <property> + <name>fs.s3.awsSecretAccessKey</name> + <value>AWS_SECRET</value> + </property> + + <property> + <name>fs.s3n.awsAccessKeyId</name> + <value>AWS_KEY</value> + </property> + + <property> + <name>fs.s3n.awsSecretAccessKey</name> + <value>AWS_SECRET</value> + </property> +``` + + +`hudi-cli` 或 DeltaStreamer 这些工具集能通过 `HOODIE_ENV_` 前缀的环境变量拾取。以下是一个作为示例的基础代码片段,它设置了这些变量并让 CLI 能够在保存在 S3 上的数据集上工作。 + +```java +export HOODIE_ENV_fs_DOT_s3a_DOT_access_DOT_key=$accessKey +export HOODIE_ENV_fs_DOT_s3a_DOT_secret_DOT_key=$secretKey +export HOODIE_ENV_fs_DOT_s3_DOT_awsAccessKeyId=$accessKey +export HOODIE_ENV_fs_DOT_s3_DOT_awsSecretAccessKey=$secretKey +export HOODIE_ENV_fs_DOT_s3n_DOT_awsAccessKeyId=$accessKey +export HOODIE_ENV_fs_DOT_s3n_DOT_awsSecretAccessKey=$secretKey +export HOODIE_ENV_fs_DOT_s3n_DOT_impl=org.apache.hadoop.fs.s3a.S3AFileSystem +``` + + + +### AWS 库 + +将 AWS Hadoop 库添加到我们的类路径。 + + - com.amazonaws:aws-java-sdk:1.10.34 + - org.apache.hadoop:hadoop-aws:2.7.3 + +如果使用了 AWS Glue 的数据,则需要 AWS Glue 库。 + + - com.amazonaws.glue:aws-glue-datacatalog-hive2-client:1.11.0 + - com.amazonaws:aws-java-sdk-glue:1.11.475 \ No newline at end of file diff --git a/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/spark_quick-start-guide.md b/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/spark_quick-start-guide.md new file mode 100644 index 0000000000000..7ced36bf08cbc --- /dev/null +++ b/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/spark_quick-start-guide.md @@ -0,0 +1,449 @@ +--- +title: "Spark Guide" +toc: true +last_modified_at: 2019-12-30T15:59:57-04:00 +language: cn +--- + +本指南通过使用spark-shell简要介绍了Hudi功能。使用Spark数据源,我们将通过代码段展示如何插入和更新Hudi的默认存储类型数据集: +[写时复制](/cn/docs/concepts#copy-on-write-storage)。每次写操作之后,我们还将展示如何读取快照和增量数据。 + +## 设置spark-shell +Hudi适用于Spark-2.4.3+ & Spark 3.x 版本。您可以按照[此处](https://spark.apache.org/downloads)的说明设置spark。 +在提取的目录中,使用spark-shell运行Hudi: + +```scala +// spark-shell for spark 3 +spark-shell \ + 
--packages org.apache.hudi:hudi-spark3-bundle_2.12:0.8.0,org.apache.spark:spark-avro_2.12:3.0.1 \ + --conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' +// spark-shell for spark 2 with scala 2.12 +spark-shell \ + --packages org.apache.hudi:hudi-spark-bundle_2.12:0.8.0,org.apache.spark:spark-avro_2.12:2.4.4 \ + --conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' +// spark-shell for spark 2 with scala 2.11 +spark-shell \ + --packages org.apache.hudi:hudi-spark-bundle_2.11:0.8.0,org.apache.spark:spark-avro_2.11:2.4.4 \ + --conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' +``` + + +设置表名、基本路径和数据生成器来为本指南生成记录。 + +```scala +import org.apache.hudi.QuickstartUtils._ +import scala.collection.JavaConversions._ +import org.apache.spark.sql.SaveMode._ +import org.apache.hudi.DataSourceReadOptions._ +import org.apache.hudi.DataSourceWriteOptions._ +import org.apache.hudi.config.HoodieWriteConfig._ + +val tableName = "hudi_cow_table" +val basePath = "file:///tmp/hudi_cow_table" +val dataGen = new DataGenerator +``` + +[数据生成器](https://github.com/apache/hudi/blob/master/hudi-spark/src/main/java/org/apache/hudi/QuickstartUtils.java#L50) +可以基于[行程样本模式](https://github.com/apache/hudi/blob/master/hudi-spark/src/main/java/org/apache/hudi/QuickstartUtils.java#L57) +生成插入和更新的样本。 + +## 插入数据 {#inserts} +生成一些新的行程样本,将其加载到DataFrame中,然后将DataFrame写入Hudi数据集中,如下所示。 + +```scala +val inserts = convertToStringList(dataGen.generateInserts(10)) +val df = spark.read.json(spark.sparkContext.parallelize(inserts, 2)) +df.write.format("org.apache.hudi"). + options(getQuickstartWriteConfigs). + option(PRECOMBINE_FIELD_OPT_KEY, "ts"). + option(RECORDKEY_FIELD_OPT_KEY, "uuid"). + option(PARTITIONPATH_FIELD_OPT_KEY, "partitionpath"). + option(TABLE_NAME, tableName). + mode(Overwrite). 
+ save(basePath); +``` + +`mode(Overwrite)`覆盖并重新创建数据集(如果已经存在)。 +您可以检查在`/tmp/hudi_cow_table////`下生成的数据。我们提供了一个记录键 +([schema](#sample-schema)中的`uuid`),分区字段(`region/country/city`)和组合逻辑([schema](#sample-schema)中的`ts`) +以确保行程记录在每个分区中都是唯一的。更多信息请参阅 +[对Hudi中的数据进行建模](https://cwiki.apache.org/confluence/pages/viewpage.action?pageId=113709185#FAQ-HowdoImodelthedatastoredinHudi), +有关将数据提取到Hudi中的方法的信息,请参阅[写入Hudi数据集](/cn/docs/writing_data)。 +这里我们使用默认的写操作:`插入更新`。 如果您的工作负载没有`更新`,也可以使用更快的`插入`或`批量插入`操作。 +想了解更多信息,请参阅[写操作](/cn/docs/writing_data#write-operations) + +## 查询数据 {#query} + +将数据文件加载到DataFrame中。 + +```scala +val roViewDF = spark. + read. + format("org.apache.hudi"). + load(basePath + "/*/*/*/*") + //load(basePath) 如果使用 "/partitionKey=partitionValue" 文件夹命名格式,Spark将自动识别分区信息 + +roViewDF.registerTempTable("hudi_ro_table") +spark.sql("select fare, begin_lon, begin_lat, ts from hudi_ro_table where fare > 20.0").show() +spark.sql("select _hoodie_commit_time, _hoodie_record_key, _hoodie_partition_path, rider, driver, fare from hudi_ro_table").show() +``` + +该查询提供已提取数据的读取优化视图。由于我们的分区路径(`region/country/city`)是嵌套的3个级别 +从基本路径开始,我们使用了`load(basePath + "/*/*/*/*")`。 +有关支持的所有存储类型和视图的更多信息,请参考[存储类型和视图](/cn/docs/concepts#storage-types--views)。 + +## 更新数据 {#updates} + +这类似于插入新数据。使用数据生成器生成对现有行程的更新,加载到DataFrame中并将DataFrame写入hudi数据集。 + +```scala +val updates = convertToStringList(dataGen.generateUpdates(10)) +val df = spark.read.json(spark.sparkContext.parallelize(updates, 2)); +df.write.format("org.apache.hudi"). + options(getQuickstartWriteConfigs). + option(PRECOMBINE_FIELD_OPT_KEY, "ts"). + option(RECORDKEY_FIELD_OPT_KEY, "uuid"). + option(PARTITIONPATH_FIELD_OPT_KEY, "partitionpath"). + option(TABLE_NAME, tableName). + mode(Append). 
+ save(basePath); +``` + +注意,保存模式现在为`追加`。通常,除非您是第一次尝试创建数据集,否则请始终使用追加模式。 +[查询](#query)现在再次查询数据将显示更新的行程。每个写操作都会生成一个新的由时间戳表示的[commit](/cn/docs/concepts) +。在之前提交的相同的`_hoodie_record_key`中寻找`_hoodie_commit_time`, `rider`, `driver`字段变更。 + +## 增量查询 + +Hudi还提供了获取给定提交时间戳以来已更改的记录流的功能。 +这可以通过使用Hudi的增量视图并提供所需更改的开始时间来实现。 +如果我们需要给定提交之后的所有更改(这是常见的情况),则无需指定结束时间。 + +```scala +// reload data +spark. + read. + format("org.apache.hudi"). + load(basePath + "/*/*/*/*"). + createOrReplaceTempView("hudi_ro_table") + +val commits = spark.sql("select distinct(_hoodie_commit_time) as commitTime from hudi_ro_table order by commitTime").map(k => k.getString(0)).take(50) +val beginTime = commits(commits.length - 2) // commit time we are interested in + +// 增量查询数据 +val incViewDF = spark. + read. + format("org.apache.hudi"). + option(VIEW_TYPE_OPT_KEY, VIEW_TYPE_INCREMENTAL_OPT_VAL). + option(BEGIN_INSTANTTIME_OPT_KEY, beginTime). + load(basePath); +incViewDF.registerTempTable("hudi_incr_table") +spark.sql("select `_hoodie_commit_time`, fare, begin_lon, begin_lat, ts from hudi_incr_table where fare > 20.0").show() +``` + +这将提供在开始时间提交之后发生的所有更改,其中包含票价大于20.0的过滤器。关于此功能的独特之处在于,它现在使您可以在批量数据上创作流式管道。 + +## 特定时间点查询 + +让我们看一下如何查询特定时间的数据。可以通过将结束时间指向特定的提交时间,将开始时间指向"000"(表示最早的提交时间)来表示特定时间。 + +```scala +val beginTime = "000" // Represents all commits > this time. +val endTime = commits(commits.length - 2) // commit time we are interested in + +// 增量查询数据 +val incViewDF = spark.read.format("org.apache.hudi"). + option(VIEW_TYPE_OPT_KEY, VIEW_TYPE_INCREMENTAL_OPT_VAL). + option(BEGIN_INSTANTTIME_OPT_KEY, beginTime). + option(END_INSTANTTIME_OPT_KEY, endTime). 
+ load(basePath); +incViewDF.registerTempTable("hudi_incr_table") +spark.sql("select `_hoodie_commit_time`, fare, begin_lon, begin_lat, ts from hudi_incr_table where fare > 20.0").show() +``` + +## 删除数据 {#deletes} +删除传入的 HoodieKeys 的记录。 + +```scala +// spark-shell +// 获取记录总数 +spark.sql("select uuid, partitionpath from hudi_trips_snapshot").count() +// 拿到两条将要删除的数据 +val ds = spark.sql("select uuid, partitionpath from hudi_trips_snapshot").limit(2) + +// 执行删除 +val deletes = dataGen.generateDeletes(ds.collectAsList()) +val df = spark.read.json(spark.sparkContext.parallelize(deletes, 2)) + +df.write.format("hudi"). + options(getQuickstartWriteConfigs). + option(OPERATION_OPT_KEY,"delete"). + option(PRECOMBINE_FIELD_OPT_KEY, "ts"). + option(RECORDKEY_FIELD_OPT_KEY, "uuid"). + option(PARTITIONPATH_FIELD_OPT_KEY, "partitionpath"). + option(TABLE_NAME, tableName). + mode(Append). + save(basePath) + +// 像之前一样运行查询 +val roAfterDeleteViewDF = spark. + read. + format("hudi"). + load(basePath + "/*/*/*/*") + +roAfterDeleteViewDF.registerTempTable("hudi_trips_snapshot") +// 应返回 (total - 2) 条记录 +spark.sql("select uuid, partitionpath from hudi_trips_snapshot").count() +``` +注意: 删除操作只支持 `Append` 模式。 + + + +请查阅写数据页的[删除部分](/cn/docs/writing_data#deletes) 查看更多信息. 
+ + +# Pyspark 示例 +## 设置spark-shell + +Hudi适用于Spark-2.4.3+ & Spark 3.x 版本。您可以按照[此处](https://spark.apache.org/downloads)的说明设置spark。 +在提取的目录中,使用spark-shell运行Hudi: + +```python +# pyspark +export PYSPARK_PYTHON=$(which python3) +# for spark3 +pyspark \ + --packages org.apache.hudi:hudi-spark3-bundle_2.12:0.8.0,org.apache.spark:spark-avro_2.12:3.0.1 \ + --conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' +# for spark2 with scala 2.12 +pyspark \ + --packages org.apache.hudi:hudi-spark-bundle_2.12:0.8.0,org.apache.spark:spark-avro_2.12:2.4.4 \ + --conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' +# for spark2 with scala 2.11 +pyspark \ + --packages org.apache.hudi:hudi-spark-bundle_2.11:0.8.0,org.apache.spark:spark-avro_2.11:2.4.4 \ + --conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' +``` + +
<div className="notice--info">
+ <h4>请注意以下事项:</h4>
+<ul>
+ <li>需要通过 --packages 指定 spark-avro, 因为默认情况下 spark-shell 不包含该模块</li>
+ <li>spark-avro 和 spark 的版本必须匹配 (上面两个我们都使用了2.4.4)</li>
+ <li>我们使用了基于 scala 2.11 构建的 hudi-spark-bundle, 因为使用的 spark-avro 也是基于 scala 2.11的. + 如果使用了 spark-avro_2.12, 相应的, 需要使用 hudi-spark-bundle_2.12.</li>
+</ul>
+</div>
+ +设置表名、基本路径和数据生成器来为本指南生成记录。 + +```python +# pyspark +tableName = "hudi_trips_cow" +basePath = "file:///tmp/hudi_trips_cow" +dataGen = sc._jvm.org.apache.hudi.QuickstartUtils.DataGenerator() +``` + +[数据生成器](https://github.com/apache/hudi/blob/master/hudi-spark/src/main/java/org/apache/hudi/QuickstartUtils.java#L50) +可以基于[行程样本模式](https://github.com/apache/hudi/blob/master/hudi-spark/src/main/java/org/apache/hudi/QuickstartUtils.java#L57) +生成插入和更新的样本。 +\{: .notice--info} + + +## 插入数据 {#inserts} + +生成一些新的行程样本,将其加载到DataFrame中,然后将DataFrame写入Hudi数据集中,如下所示。 + +```python +# pyspark +inserts = sc._jvm.org.apache.hudi.QuickstartUtils.convertToStringList(dataGen.generateInserts(10)) +df = spark.read.json(spark.sparkContext.parallelize(inserts, 2)) + +hudi_options = { + 'hoodie.table.name': tableName, + 'hoodie.datasource.write.recordkey.field': 'uuid', + 'hoodie.datasource.write.partitionpath.field': 'partitionpath', + 'hoodie.datasource.write.table.name': tableName, + 'hoodie.datasource.write.operation': 'insert', + 'hoodie.datasource.write.precombine.field': 'ts', + 'hoodie.upsert.shuffle.parallelism': 2, + 'hoodie.insert.shuffle.parallelism': 2 +} + +df.write.format("hudi"). \ + options(**hudi_options). \ + mode("overwrite"). \ + save(basePath) +``` + +`mode(Overwrite)` 覆盖并重新创建数据集(如果已经存在)。 +您可以检查在`/tmp/hudi_cow_table////`下生成的数据。我们提供了一个记录键 +([schema](#sample-schema)中的`uuid`),分区字段(`region/country/city`)和组合逻辑([schema](#sample-schema)中的`ts`) +以确保行程记录在每个分区中都是唯一的。更多信息请参阅 +[对Hudi中的数据进行建模](https://cwiki.apache.org/confluence/pages/viewpage.action?pageId=113709185#FAQ-HowdoImodelthedatastoredinHudi), +有关将数据提取到Hudi中的方法的信息,请参阅[写入Hudi数据集](/cn/docs/writing_data)。 +这里我们使用默认的写操作:`插入更新`。 如果您的工作负载没有`更新`,也可以使用更快的`插入`或`批量插入`操作。 +想了解更多信息,请参阅[写操作](/cn/docs/writing_data#write-operations) +\{: .notice--info} + +## 查询数据 {#query} + +将数据文件加载到DataFrame中。 + +```python +# pyspark +tripsSnapshotDF = spark. \ + read. \ + format("hudi"). 
\ + load(basePath + "/*/*/*/*") +# load(basePath) use "/partitionKey=partitionValue" folder structure for Spark auto partition discovery + +tripsSnapshotDF.createOrReplaceTempView("hudi_trips_snapshot") + +spark.sql("select fare, begin_lon, begin_lat, ts from hudi_trips_snapshot where fare > 20.0").show() +spark.sql("select _hoodie_commit_time, _hoodie_record_key, _hoodie_partition_path, rider, driver, fare from hudi_trips_snapshot").show() +``` + +该查询提供已提取数据的读取优化视图。由于我们的分区路径(`region/country/city`)是嵌套的3个级别 +从基本路径开始,我们使用了`load(basePath + "/*/*/*/*")`。 +有关支持的所有存储类型和视图的更多信息,请参考[存储类型和视图](/cn/docs/concepts#storage-types--views)。 +\{: .notice--info} + +## 更新数据 {#updates} + +这类似于插入新数据。使用数据生成器生成对现有行程的更新,加载到DataFrame中并将DataFrame写入hudi数据集。 + +```python +# pyspark +updates = sc._jvm.org.apache.hudi.QuickstartUtils.convertToStringList(dataGen.generateUpdates(10)) +df = spark.read.json(spark.sparkContext.parallelize(updates, 2)) +df.write.format("hudi"). \ + options(**hudi_options). \ + mode("append"). \ + save(basePath) +``` + +Notice that the save mode is now `Append`. In general, always use append mode unless you are trying to create the table for the first time. +[Querying](#query-data) the data again will now show updated trips. Each write operation generates a new [commit](/docs/concepts) +denoted by the timestamp. Look for changes in `_hoodie_commit_time`, `rider`, `driver` fields for the same `_hoodie_record_key`s in previous commit. + +注意,保存模式现在为`追加`。通常,除非您是第一次尝试创建数据集,否则请始终使用追加模式。 +[查询](#query)现在再次查询数据将显示更新的行程。每个写操作都会生成一个新的由时间戳表示的[commit](/cn/docs/concepts) +。在之前提交的相同的`_hoodie_record_key`中寻找`_hoodie_commit_time`, `rider`, `driver`字段变更。 +\{: .notice--info} + +## 增量查询 + +Hudi还提供了获取给定提交时间戳以来已更改的记录流的功能。 +这可以通过使用Hudi的增量视图并提供所需更改的开始时间来实现。 +如果我们需要给定提交之后的所有更改(这是常见的情况),则无需指定结束时间。 + +```python +# pyspark +# 加载数据 +spark. \ + read. \ + format("hudi"). \ + load(basePath + "/*/*/*/*"). 
\ + createOrReplaceTempView("hudi_trips_snapshot") + +commits = list(map(lambda row: row[0], spark.sql("select distinct(_hoodie_commit_time) as commitTime from hudi_trips_snapshot order by commitTime").limit(50).collect())) +beginTime = commits[len(commits) - 2] # commit time we are interested in + +# 增量的查询数据 +incremental_read_options = { + 'hoodie.datasource.query.type': 'incremental', + 'hoodie.datasource.read.begin.instanttime': beginTime, +} + +tripsIncrementalDF = spark.read.format("hudi"). \ + options(**incremental_read_options). \ + load(basePath) +tripsIncrementalDF.createOrReplaceTempView("hudi_trips_incremental") + +spark.sql("select `_hoodie_commit_time`, fare, begin_lon, begin_lat, ts from hudi_trips_incremental where fare > 20.0").show() +``` + +这将提供在开始时间提交之后发生的所有更改,其中包含票价大于20.0的过滤器。关于此功能的独特之处在于,它现在使您可以在批量数据上创作流式管道。 +\{: .notice--info} + +## 特定时间点查询 + +让我们看一下如何查询特定时间的数据。可以通过将结束时间指向特定的提交时间,将开始时间指向"000"(表示最早的提交时间)来表示特定时间。 + +```python +# pyspark +beginTime = "000" # 代表所有大于该时间的 commits. +endTime = commits[len(commits) - 2] + +# 特定时间查询 +point_in_time_read_options = { + 'hoodie.datasource.query.type': 'incremental', + 'hoodie.datasource.read.end.instanttime': endTime, + 'hoodie.datasource.read.begin.instanttime': beginTime +} + +tripsPointInTimeDF = spark.read.format("hudi"). \ + options(**point_in_time_read_options). 
\ + load(basePath) + +tripsPointInTimeDF.createOrReplaceTempView("hudi_trips_point_in_time") +spark.sql("select `_hoodie_commit_time`, fare, begin_lon, begin_lat, ts from hudi_trips_point_in_time where fare > 20.0").show() +``` + +## 删除数据 {#deletes} +删除传入的 HoodieKeys 的记录。 + +注意: 删除操作只支持 `Append` 模式。 + +```python +# pyspark +# 获取记录总数 +spark.sql("select uuid, partitionpath from hudi_trips_snapshot").count() +# 拿到两条将被删除的记录 +ds = spark.sql("select uuid, partitionpath from hudi_trips_snapshot").limit(2) + +# 执行删除 +hudi_delete_options = { + 'hoodie.table.name': tableName, + 'hoodie.datasource.write.recordkey.field': 'uuid', + 'hoodie.datasource.write.partitionpath.field': 'partitionpath', + 'hoodie.datasource.write.table.name': tableName, + 'hoodie.datasource.write.operation': 'delete', + 'hoodie.datasource.write.precombine.field': 'ts', + 'hoodie.upsert.shuffle.parallelism': 2, + 'hoodie.insert.shuffle.parallelism': 2 +} + +from pyspark.sql.functions import lit +deletes = list(map(lambda row: (row[0], row[1]), ds.collect())) +df = spark.sparkContext.parallelize(deletes).toDF(['uuid', 'partitionpath']).withColumn('ts', lit(0.0)) +df.write.format("hudi"). \ + options(**hudi_delete_options). \ + mode("append"). \ + save(basePath) + +# 像之前一样运行查询 +roAfterDeleteViewDF = spark. \ + read. \ + format("hudi"). \ + load(basePath + "/*/*/*/*") +roAfterDeleteViewDF.registerTempTable("hudi_trips_snapshot") +# 应返回 (total - 2) 条记录 +spark.sql("select uuid, partitionpath from hudi_trips_snapshot").count() +``` + +请查阅写数据页的[删除部分](/cn/docs/writing_data#deletes) 查看更多信息. + + +## 从这开始下一步? 
+ +您也可以通过[自己构建hudi](https://github.com/apache/hudi#building-apache-hudi-from-source)来快速开始, +并在spark-shell命令中使用`--jars /packaging/hudi-spark-bundle/target/hudi-spark-bundle_2.1?-*.*.*-SNAPSHOT.jar`, +而不是`--packages org.apache.hudi:hudi-spark3-bundle_2.12:0.8.0` + + +这里我们使用Spark演示了Hudi的功能。但是,Hudi可以支持多种存储类型/视图,并且可以从Hive,Spark,Presto等查询引擎中查询Hudi数据集。 +我们制作了一个基于Docker设置、所有依赖系统都在本地运行的[演示视频](https://www.youtube.com/watch?v=VhNgUsxdrD0), +我们建议您复制相同的设置然后按照[这里](/cn/docs/docker_demo)的步骤自己运行这个演示。 +另外,如果您正在寻找将现有数据迁移到Hudi的方法,请参考[迁移指南](/cn/docs/migration_guide)。 diff --git a/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/use_cases.md b/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/use_cases.md new file mode 100644 index 0000000000000..8ddfc5cbf9d65 --- /dev/null +++ b/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/use_cases.md @@ -0,0 +1,67 @@ +--- +title: 使用案例 +keywords: [ hudi, data ingestion, etl, real time, use cases] +summary: "Following are some sample use-cases for Hudi, which illustrate the benefits in terms of faster processing & increased efficiency" +toc: true +last_modified_at: 2019-12-30T15:59:57-04:00 +language: cn +--- + +以下是一些使用Hudi的示例,说明了加快处理速度和提高效率的好处 + +## 近实时摄取 + +将外部源(如事件日志、数据库、外部源)的数据摄取到[Hadoop数据湖](http://martinfowler.com/bliki/DataLake)是一个众所周知的问题。 +尽管这些数据对整个组织来说是最有价值的,但不幸的是,在大多数(如果不是全部)Hadoop部署中都使用零散的方式解决,即使用多个不同的摄取工具。 + + +对于RDBMS摄取,Hudi提供 __通过更新插入达到更快加载__,而不是昂贵且低效的批量加载。例如,您可以读取MySQL BIN日志或[Sqoop增量导入](https://sqoop.apache.org/docs/1.4.2/SqoopUserGuide#_incremental_imports)并将其应用于 +DFS上的等效Hudi表。这比[批量合并任务](https://sqoop.apache.org/docs/1.4.0-incubating/SqoopUserGuide#id1770457)及[复杂的手工合并工作流](http://hortonworks.com/blog/four-step-strategy-incremental-updates-hive/)更快/更有效率。 + + +对于NoSQL数据存储,如[Cassandra](http://cassandra.apache.org/) / [Voldemort](http://www.project-voldemort.com/voldemort/) / [HBase](https://hbase.apache.org/),即使是中等规模大小也会存储数十亿行。 +毫无疑问, __全量加载不可行__,如果摄取需要跟上较高的更新量,那么则需要更有效的方法。 + + 
+即使对于像[Kafka](http://kafka.apache.org)这样的不可变数据源,Hudi也可以 __强制在HDFS上使用最小文件大小__, 这采取了综合方式解决[HDFS小文件问题](https://blog.cloudera.com/blog/2009/02/the-small-files-problem/)来改善NameNode的健康状况。这对事件流来说更为重要,因为它通常具有较高容量(例如:点击流),如果管理不当,可能会对Hadoop集群造成严重损害。 + +在所有源中,通过`commits`这一概念,Hudi增加了以原子方式向消费者发布新数据的功能,这种功能十分必要。 + +## 近实时分析 + +通常,实时[数据集市](https://en.wikipedia.org/wiki/Data_mart)由专业(实时)数据分析存储提供支持,例如[Druid](http://druid.io/)或[Memsql](http://www.memsql.com/)或[OpenTSDB](http://opentsdb.net/)。 +这对于较小规模的数据量来说绝对是完美的([相比于这样安装Hadoop](https://blog.twitter.com/2015/hadoop-filesystem-at-twitter)),这种情况需要在亚秒级响应查询,例如系统监控或交互式实时分析。 +但是,由于Hadoop上的数据太陈旧了,通常这些系统会被滥用于非交互式查询,这导致利用率不足和硬件/许可证成本的浪费。 + +另一方面,Hadoop上的交互式SQL解决方案(如Presto和SparkSQL)表现出色,在 __几秒钟内完成查询__。 +通过将 __数据新鲜度提高到几分钟__,Hudi可以提供一个更有效的替代方案,并支持存储在DFS中的 __数量级更大的数据集__ 的实时分析。 +此外,Hudi没有外部依赖(如专用于实时分析的HBase集群),因此可以在更新的分析上实现更快的分析,而不会增加操作开销。 + + +## 增量处理管道 + +Hadoop提供的一个基本能力是构建一系列数据集,这些数据集通过表示为工作流的DAG相互派生。 +工作流通常取决于多个上游工作流输出的新数据,新数据的可用性传统上由新的DFS文件夹/Hive分区指示。 +让我们举一个具体的例子来说明这点。上游工作流`U`可以每小时创建一个Hive分区,在每小时结束时(processing_time)使用该小时的数据(event_time),提供1小时的有效新鲜度。 +然后,下游工作流`D`在`U`结束后立即启动,并在下一个小时内自行处理,将有效延迟时间增加到2小时。 + +上面的示例忽略了迟到的数据,即`processing_time`和`event_time`分开时。 +不幸的是,在今天的后移动和前物联网世界中,__来自间歇性连接的移动设备和传感器的延迟数据是常态,而不是异常__。 +在这种情况下,保证正确性的唯一补救措施是[重新处理最后几个小时](https://falcon.apache.org/FalconDocumentation#Handling_late_input_data)的数据, +每小时一遍又一遍,这可能会严重影响整个生态系统的效率。例如; 试想一下,在数百个工作流中每小时重新处理TB数据。 + +Hudi通过以单个记录为粒度的方式(而不是文件夹/分区)从上游 Hudi数据集`HU`消费新数据(包括迟到数据),来解决上面的问题。 +应用处理逻辑,并使用下游Hudi数据集`HD`高效更新/协调迟到数据。在这里,`HU`和`HD`可以以更频繁的时间被连续调度 +比如15分钟,并且`HD`提供端到端30分钟的延迟。 + +为实现这一目标,Hudi采用了类似于[Spark Streaming](https://spark.apache.org/docs/latest/streaming-programming-guide#join-operations)、发布/订阅系统等流处理框架,以及像[Kafka](http://kafka.apache.org/documentation/#theconsumer) +或[Oracle XStream](https://docs.oracle.com/cd/E11882_01/server.112/e16545/xstrm_cncpt.htm#XSTRM187)等数据库复制技术的类似概念。 
+如果感兴趣,可以在[这里](https://www.oreilly.com/ideas/ubers-case-for-incremental-processing-on-hadoop)找到有关增量处理(相比于流处理和批处理)好处的更详细解释。 + +## DFS的数据分发 + +一个常用场景是先在Hadoop上处理数据,然后将其分发回在线服务存储层,以供应用程序使用。 +例如,一个Spark管道可以[确定Hadoop上的紧急制动事件](https://eng.uber.com/telematics/)并将它们加载到服务存储层(如ElasticSearch)中,供Uber应用程序使用以增加安全驾驶。这种用例中,通常架构会在Hadoop和服务存储之间引入`队列`,以防止目标服务存储被压垮。 +对于队列的选择,一种流行的选择是Kafka,这个模型经常导致 __在DFS上存储相同数据的冗余(用于计算结果的离线分析)和Kafka(用于分发)__ + +通过将每次运行的Spark管道更新插入的输出转换为Hudi数据集,Hudi可以再次有效地解决这个问题,然后可以以增量方式获取尾部数据(就像Kafka topic一样)然后写入服务存储层。 diff --git a/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/writing_data.md b/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/writing_data.md new file mode 100644 index 0000000000000..f9c84bc1066e4 --- /dev/null +++ b/website/i18n/cn/docusaurus-plugin-content-docs/version-1.0.0/writing_data.md @@ -0,0 +1,222 @@ +--- +title: 写入 Hudi 数据集 +keywords: [ hudi, incremental, batch, stream, processing, Hive, ETL, Spark SQL] +summary: In this page, we will discuss some available tools for incrementally ingesting & storing data. 
+toc: true +last_modified_at: 2019-12-30T15:59:57-04:00 +language: cn +--- + +这一节我们将介绍使用[DeltaStreamer](#deltastreamer)工具从外部源甚至其他Hudi数据集摄取新更改的方法, +以及通过使用[Hudi数据源](#datasource-writer)的upserts加快大型Spark作业的方法。 +对于此类数据集,我们可以使用各种查询引擎[查询](/cn/docs/querying_data)它们。 + +## 写操作 + +在此之前,了解Hudi数据源及delta streamer工具提供的三种不同的写操作以及如何最佳利用它们可能会有所帮助。 +这些操作可以在针对数据集发出的每个提交/增量提交中进行选择/更改。 + + - **UPSERT(插入更新)** :这是默认操作,在该操作中,通过查找索引,首先将输入记录标记为插入或更新。 + 在运行启发式方法以确定如何最好地将这些记录放到存储上,如优化文件大小之后,这些记录最终会被写入。 + 对于诸如数据库更改捕获之类的用例,建议该操作,因为输入几乎肯定包含更新。 + - **INSERT(插入)** :就使用启发式方法确定文件大小而言,此操作与插入更新(UPSERT)非常相似,但此操作完全跳过了索引查找步骤。 + 因此,对于日志重复数据删除等用例(结合下面提到的过滤重复项的选项),它可以比插入更新快得多。 + 插入也适用于这种用例,这种情况数据集可以允许重复项,但只需要Hudi的事务写/增量提取/存储管理功能。 + - **BULK_INSERT(批插入)** :插入更新和插入操作都将输入记录保存在内存中,以加快存储优化启发式计算的速度(以及其它未提及的方面)。 + 所以对Hudi数据集进行初始加载/引导时这两种操作会很低效。批量插入提供与插入相同的语义,但同时实现了基于排序的数据写入算法, + 该算法可以很好地扩展数百TB的初始负载。但是,相比于插入和插入更新能保证文件大小,批插入在调整文件大小上只能尽力而为。 + +## DeltaStreamer + +`HoodieDeltaStreamer`实用工具 (hudi-utilities-bundle中的一部分) 提供了从DFS或Kafka等不同来源进行摄取的方式,并具有以下功能。 + + - 从Kafka单次摄取新事件,从Sqoop、HiveIncrementalPuller输出或DFS文件夹中的多个文件 + [增量导入](https://sqoop.apache.org/docs/1.4.2/SqoopUserGuide#_incremental_imports) + - 支持json、avro或自定义记录类型的传入数据 + - 管理检查点,回滚和恢复 + - 利用DFS或Confluent [schema注册表](https://github.com/confluentinc/schema-registry)的Avro模式。 + - 支持自定义转换操作 + +命令行选项更详细地描述了这些功能: + +```java +[hoodie]$ spark-submit --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer `ls packaging/hudi-utilities-bundle/target/hudi-utilities-bundle-*.jar` --help +Usage:
[options] + Options: + --commit-on-errors + Commit even when some records failed to be written + Default: false + --enable-hive-sync + Enable syncing to hive + Default: false + --filter-dupes + Should duplicate records from source be dropped/filtered outbefore + insert/bulk-insert + Default: false + --help, -h + --hudi-conf + Any configuration that can be set in the properties file (using the CLI + parameter "--propsFilePath") can also be passed command line using this + parameter + Default: [] + --op + Takes one of these values : UPSERT (default), INSERT (use when input is + purely new data/inserts to gain speed) + Default: UPSERT + Possible Values: [UPSERT, INSERT, BULK_INSERT] + --payload-class + subclass of HoodieRecordPayload, that works off a GenericRecord. + Implement your own, if you want to do something other than overwriting + existing value + Default: org.apache.hudi.OverwriteWithLatestAvroPayload + --props + path to properties file on localfs or dfs, with configurations for + Hudi client, schema provider, key generator and data source. For + Hudi client props, sane defaults are used, but recommend use to + provide basic things like metrics endpoints, hive configs etc. For + sources, referto individual classes, for supported properties. + Default: file:///Users/vinoth/bin/hoodie/src/test/resources/delta-streamer-config/dfs-source.properties + --schemaprovider-class + subclass of org.apache.hudi.utilities.schema.SchemaProvider to attach + schemas to input & target table data, built in options: + FilebasedSchemaProvider + Default: org.apache.hudi.utilities.schema.FilebasedSchemaProvider + --source-class + Subclass of org.apache.hudi.utilities.sources to read data. Built-in + options: org.apache.hudi.utilities.sources.{JsonDFSSource (default), + AvroDFSSource, JsonKafkaSource, AvroKafkaSource, HiveIncrPullSource} + Default: org.apache.hudi.utilities.sources.JsonDFSSource + --source-limit + Maximum amount of data to read from source. 
Default: No limit For e.g: + DFSSource => max bytes to read, KafkaSource => max events to read + Default: 9223372036854775807 + --source-ordering-field + Field within source record to decide how to break ties between records + with same key in input data. Default: 'ts' holding unix timestamp of + record + Default: ts + --spark-master + spark master to use. + Default: local[2] + * --target-base-path + base path for the target Hudi dataset. (Will be created if did not + exist first time around. If exists, expected to be a Hudi dataset) + * --target-table + name of the target table in Hive + --transformer-class + subclass of org.apache.hudi.utilities.transform.Transformer. UDF to + transform raw source dataset to a target dataset (conforming to target + schema) before writing. Default : Not set. E:g - + org.apache.hudi.utilities.transform.SqlQueryBasedTransformer (which + allows a SQL query template to be passed as a transformation function) +``` + +该工具采用层次结构组成的属性文件,并具有可插拔的接口,用于提取数据、生成密钥和提供模式。 +从Kafka和DFS摄取数据的示例配置在这里:`hudi-utilities/src/test/resources/delta-streamer-config`。 + +例如:当您让Confluent Kafka、Schema注册表启动并运行后,可以用这个命令产生一些测试数据 +([impressions.avro](https://docs.confluent.io/current/ksql/docs/tutorials/generate-custom-test-data), +由schema-registry代码库提供) + +```java +[confluent-5.0.0]$ bin/ksql-datagen schema=../impressions.avro format=avro topic=impressions key=impressionid +``` + +然后用如下命令摄取这些数据。 + +```java +[hoodie]$ spark-submit --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer `ls packaging/hudi-utilities-bundle/target/hudi-utilities-bundle-*.jar` \ + --props file://${PWD}/hudi-utilities/src/test/resources/delta-streamer-config/kafka-source.properties \ + --schemaprovider-class org.apache.hudi.utilities.schema.SchemaRegistryProvider \ + --source-class org.apache.hudi.utilities.sources.AvroKafkaSource \ + --source-ordering-field impresssiontime \ + --target-base-path file:///tmp/hudi-deltastreamer-op --target-table uber.impressions \ + --op 
BULK_INSERT +``` + +在某些情况下,您可能需要预先将现有数据集迁移到Hudi。 请参考[迁移指南](/cn/docs/migration_guide)。 + +## Datasource Writer + +`hudi-spark`模块提供了DataSource API,可以将任何DataFrame写入(也可以读取)到Hudi数据集中。 +以下是在指定需要使用的字段名称的之后,如何插入更新DataFrame的方法,这些字段包括 +`recordKey => _row_key`、`partitionPath => partition`和`precombineKey => timestamp` + +```java +inputDF.write() + .format("org.apache.hudi") + .options(clientOpts) // 可以传入任何Hudi客户端参数 + .option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY(), "_row_key") + .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), "partition") + .option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY(), "timestamp") + .option(HoodieWriteConfig.TABLE_NAME, tableName) + .mode(SaveMode.Append) + .save(basePath); +``` + +## 与Hive同步 + +上面的两个工具都支持将数据集的最新模式同步到Hive Metastore,以便查询新的列和分区。 +如果需要从命令行或在独立的JVM中运行它,Hudi提供了一个`HiveSyncTool`, +在构建了hudi-hive模块之后,可以按以下方式调用它。 + +```java +cd hudi-hive +./run_sync_tool.sh + [hudi-hive]$ ./run_sync_tool.sh --help +Usage:
[options] + Options: + * --base-path + Basepath of Hudi dataset to sync + * --database + name of the target database in Hive + --help, -h + Default: false + * --jdbc-url + Hive jdbc connect url + * --pass + Hive password + * --table + name of the target table in Hive + * --user + Hive username +``` + +## 删除数据 + +通过允许用户指定不同的数据记录负载实现,Hudi支持对存储在Hudi数据集中的数据执行两种类型的删除。 + + - **Soft Deletes(软删除)** :使用软删除时,用户希望保留键,但仅使所有其他字段的值都为空。 + 通过确保适当的字段在数据集模式中可以为空,并在将这些字段设置为null之后直接向数据集插入更新这些记录,即可轻松实现这一点。 + - **Hard Deletes(硬删除)** :这种更强形式的删除是从数据集中彻底删除记录在存储上的任何痕迹。 + 这可以通过触发一个带有自定义负载实现的插入更新来实现,这种实现可以使用总是返回Optional.Empty作为组合值的DataSource或DeltaStreamer。 + Hudi附带了一个内置的`org.apache.hudi.EmptyHoodieRecordPayload`类,它就是实现了这一功能。 + +```java + deleteDF // 仅包含要删除的记录的DataFrame + .write().format("org.apache.hudi") + .option(...) // 根据设置需要添加HUDI参数,例如记录键、分区路径和其他参数 + // 指定record_key,partition_key,precombine_fieldkey和常规参数 + .option(DataSourceWriteOptions.PAYLOAD_CLASS_OPT_KEY, "org.apache.hudi.EmptyHoodieRecordPayload") + +``` + +## 存储管理 + +Hudi还对存储在Hudi数据集中的数据执行几个关键的存储管理功能。在DFS上存储数据的关键方面是管理文件大小和数量以及回收存储空间。 +例如,HDFS在处理小文件上性能很差,这会对Name Node的内存及RPC施加很大的压力,并可能破坏整个集群的稳定性。 +通常,查询引擎可在较大的列文件上提供更好的性能,因为它们可以有效地摊销获得列统计信息等的成本。 +即使在某些云数据存储上,列出具有大量小文件的目录也常常比较慢。 + +以下是一些有效管理Hudi数据集存储的方法。 + + - Hudi中的[小文件处理功能](/cn/docs/configurations#compactionSmallFileSize),可以分析传入的工作负载并将插入内容分配到现有文件组中, + 而不是创建新文件组。新文件组会生成小文件。 + - 可以[配置](/cn/docs/configurations#retainCommits)Cleaner来清理较旧的文件片,清理的程度可以调整, + 具体取决于查询所需的最长时间和增量拉取所需的回溯。 + - 用户还可以调整[基础/parquet文件](/cn/docs/configurations#limitFileSize)、[日志文件](/cn/docs/configurations#logFileMaxSize)的大小 + 和预期的[压缩率](/cn/docs/configurations#parquetCompressionRatio),使足够数量的插入被分到同一个文件组中,最终产生大小合适的基础文件。 + - 智能调整[批插入并行度](/cn/docs/configurations#withBulkInsertParallelism),可以产生大小合适的初始文件组。 + 实际上,正确执行此操作非常关键,因为文件组一旦创建后就不能删除,只能如前所述对其进行扩展。 + - 对于具有大量更新的工作负载,[读取时合并存储](/cn/docs/concepts#merge-on-read-storage)提供了一种很好的机制, + 可以快速将其摄取到较小的文件中,之后通过压缩将它们合并为较大的基础文件。 diff --git a/website/releases/download.md 
b/website/releases/download.md index 6259ffaa587bf..f93d0e93e001f 100644 --- a/website/releases/download.md +++ b/website/releases/download.md @@ -6,6 +6,10 @@ toc: true last_modified_at: 2022-12-27T15:59:57-04:00 --- +### Release 1.0.0 +* Source Release : [Apache Hudi 1.0.0 Source Release](https://downloads.apache.org/hudi/1.0.0/hudi-1.0.0.src.tgz) ([asc](https://downloads.apache.org/hudi/1.0.0/hudi-1.0.0.src.tgz.asc), [sha512](https://downloads.apache.org/hudi/1.0.0/hudi-1.0.0.src.tgz.sha512)) +* Release Note : ([Release Note for Apache Hudi 1.0.0](/releases/release-1.0.0)) + ### Release 1.0.0-beta2 * Source Release : [Apache Hudi 1.0.0-beta2 Source Release](https://downloads.apache.org/hudi/1.0.0-beta2/hudi-1.0.0-beta2.src.tgz) ([asc](https://downloads.apache.org/hudi/1.0.0-beta2/hudi-1.0.0-beta2.src.tgz.asc), [sha512](https://downloads.apache.org/hudi/1.0.0-beta2/hudi-1.0.0-beta2.src.tgz.sha512)) * Release Note : ([Release Note for Apache Hudi 1.0.0-beta2](/releases/release-1.0.0-beta2)) diff --git a/website/src/components/HomepageHeader/index.js b/website/src/components/HomepageHeader/index.js index fba104fed57c3..3100dc17cf5bb 100644 --- a/website/src/components/HomepageHeader/index.js +++ b/website/src/components/HomepageHeader/index.js @@ -19,7 +19,7 @@ function HomepageHeader() {
- + Latest releases diff --git a/website/versioned_docs/version-1.0.0/azure_hoodie.md b/website/versioned_docs/version-1.0.0/azure_hoodie.md new file mode 100644 index 0000000000000..f28ec609c70d8 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/azure_hoodie.md @@ -0,0 +1,50 @@ +--- +title: Microsoft Azure +keywords: [ hudi, hive, azure, spark, presto] +summary: In this page, we go over how to configure Hudi with Azure filesystem. +last_modified_at: 2020-05-25T19:00:57-04:00 +--- +On this page, we explain how to use Hudi on Microsoft Azure. + +## Disclaimer + +This page is maintained by the Hudi community. +If the information is inaccurate or you have additional information to add, +please feel free to create a JIRA ticket. Contributions are highly appreciated. + +## Supported Storage System + +Two storage systems support Hudi: + +- Azure Blob Storage +- Azure Data Lake Gen 2 + +## Verified Combination of Spark and storage system + +#### HDInsight Spark2.4 on Azure Data Lake Storage Gen 2 +This combination works out of the box. No extra config needed. + +#### Databricks Spark2.4 on Azure Data Lake Storage Gen 2 +- Import Hudi jar to databricks workspace + +- Mount the file system to dbutils. 
+ ```scala + dbutils.fs.mount( + source = "abfss://xxx@xxx.dfs.core.windows.net", + mountPoint = "/mountpoint", + extraConfigs = configs) + ``` +- When writing Hudi dataset, use abfss URL + ```scala + inputDF.write + .format("org.apache.hudi") + .options(opts) + .mode(SaveMode.Append) + .save("abfss://<>.dfs.core.windows.net/hudi-tables/customer") + ``` +- When reading Hudi dataset, use the mounting point + ```scala + spark.read + .format("org.apache.hudi") + .load("/mountpoint/hudi-tables/customer") + ``` diff --git a/website/versioned_docs/version-1.0.0/basic_configurations.md b/website/versioned_docs/version-1.0.0/basic_configurations.md new file mode 100644 index 0000000000000..84f589d84db0e --- /dev/null +++ b/website/versioned_docs/version-1.0.0/basic_configurations.md @@ -0,0 +1,797 @@ +--- +title: Basic Configurations +summary: This page covers the basic configurations you may use to write/read Hudi tables. This page only features a subset of the most frequently used configurations. For a full list of all configs, please visit the [All Configurations](/docs/configurations) page. +last_modified_at: 2024-12-06T17:38:05.854 +--- + + +This page covers the basic configurations you may use to write/read Hudi tables. This page only features a subset of the most frequently used configurations. For a full list of all configs, please visit the [All Configurations](/docs/configurations) page. + +- [**Hudi Table Config**](#TABLE_CONFIG): Basic Hudi Table configuration parameters. +- [**Spark Datasource Configs**](#SPARK_DATASOURCE): These configs control the Hudi Spark Datasource, providing ability to define keys/partitioning, pick out the write operation, specify how to merge records or choosing query type to read. 
+- [**Flink Sql Configs**](#FLINK_SQL): These configs control the Hudi Flink SQL source/sink connectors, providing ability to define record keys, pick out the write operation, specify how to merge records, enable/disable asynchronous compaction or choosing query type to read. +- [**Write Client Configs**](#WRITE_CLIENT): Internally, the Hudi datasource uses a RDD based HoodieWriteClient API to actually perform writes to storage. These configs provide deep control over lower level aspects like file sizing, compression, parallelism, compaction, write schema, cleaning etc. Although Hudi provides sane defaults, from time to time these configs may need to be tweaked to optimize for specific workloads. +- [**Metastore and Catalog Sync Configs**](#META_SYNC): Configurations used by Hudi to sync metadata to external metastores and catalogs. +- [**Metrics Configs**](#METRICS): This set of configs is used to enable monitoring and reporting of key Hudi stats and metrics. +- [**Kafka Connect Configs**](#KAFKA_CONNECT): This set of configs is used for the Kafka Connect Sink Connector for writing Hudi Tables. +- [**Hudi Streamer Configs**](#HUDI_STREAMER): This set of configs is used for the Hudi Streamer utility, which provides a way to ingest from different sources such as DFS or Kafka. + +:::note +In the tables below **(N/A)** means there is no default value set +::: + +## Hudi Table Config {#TABLE_CONFIG} +Basic Hudi Table configuration parameters. + + +### Hudi Table Basic Configs {#Hudi-Table-Basic-Configs} +Configurations of the Hudi Table like type of ingestion, storage formats, hive table name etc. Configurations are loaded from hoodie.properties; these properties are usually set when initializing a path as the hoodie base path and never change during the lifetime of a hoodie table. 
+ + + + +[**Basic Configs**](#Hudi-Table-Basic-Configs-basic-configs) + + +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------------------ | ----------------------------------------------------------------- | ----------- | +| [hoodie.bootstrap.base.path](#hoodiebootstrapbasepath) | (N/A) | Base path of the dataset that needs to be bootstrapped as a Hudi table
`Config Param: BOOTSTRAP_BASE_PATH` | +| [hoodie.compaction.payload.class](#hoodiecompactionpayloadclass) | (N/A) | Payload class to use for performing merges, compactions, i.e merge delta logs with current base file and then produce a new base file.
`Config Param: PAYLOAD_CLASS_NAME` | +| [hoodie.database.name](#hoodiedatabasename) | (N/A) | Database name. If different databases have the same table name during incremental query, we can set it to limit the table name under a specific database
`Config Param: DATABASE_NAME` | +| [hoodie.record.merge.strategy.id](#hoodierecordmergestrategyid) | (N/A) | Id of merger strategy. Hudi will pick HoodieRecordMerger implementations in `hoodie.write.record.merge.custom.implementation.classes` which has the same merger strategy id
`Config Param: RECORD_MERGE_STRATEGY_ID`
`Since Version: 0.13.0` | +| [hoodie.table.checksum](#hoodietablechecksum) | (N/A) | Table checksum is used to guard against partial writes in HDFS. It is added as the last entry in hoodie.properties and then used to validate while reading table config.
`Config Param: TABLE_CHECKSUM`
`Since Version: 0.11.0` | +| [hoodie.table.create.schema](#hoodietablecreateschema) | (N/A) | Schema used when creating the table
`Config Param: CREATE_SCHEMA` | +| [hoodie.table.index.defs.path](#hoodietableindexdefspath) | (N/A) | Relative path to table base path where the index definitions are stored
`Config Param: RELATIVE_INDEX_DEFINITION_PATH`
`Since Version: 1.0.0` | +| [hoodie.table.keygenerator.class](#hoodietablekeygeneratorclass) | (N/A) | Key Generator class property for the hoodie table
`Config Param: KEY_GENERATOR_CLASS_NAME` | +| [hoodie.table.keygenerator.type](#hoodietablekeygeneratortype) | (N/A) | Key Generator type to determine key generator class
`Config Param: KEY_GENERATOR_TYPE`
`Since Version: 1.0.0` | +| [hoodie.table.metadata.partitions](#hoodietablemetadatapartitions) | (N/A) | Comma-separated list of metadata partitions that have been completely built and in-sync with data table. These partitions are ready for use by the readers
`Config Param: TABLE_METADATA_PARTITIONS`
`Since Version: 0.11.0` | +| [hoodie.table.metadata.partitions.inflight](#hoodietablemetadatapartitionsinflight) | (N/A) | Comma-separated list of metadata partitions whose building is in progress. These partitions are not yet ready for use by the readers.
`Config Param: TABLE_METADATA_PARTITIONS_INFLIGHT`
`Since Version: 0.11.0` | +| [hoodie.table.name](#hoodietablename) | (N/A) | Table name that will be used for registering with Hive. Needs to be same across runs.
`Config Param: NAME` | +| [hoodie.table.partition.fields](#hoodietablepartitionfields) | (N/A) | Comma separated field names used to partition the table. These field names also include the partition type which is used by custom key generators
`Config Param: PARTITION_FIELDS` | +| [hoodie.table.precombine.field](#hoodietableprecombinefield) | (N/A) | Field used in preCombining before actual write. By default, when two records have the same key value, the largest value for the precombine field determined by Object.compareTo(..), is picked.
`Config Param: PRECOMBINE_FIELD` | +| [hoodie.table.recordkey.fields](#hoodietablerecordkeyfields) | (N/A) | Columns used to uniquely identify the table. Concatenated values of these fields are used as the record key component of HoodieKey.
`Config Param: RECORDKEY_FIELDS` | +| [hoodie.table.secondary.indexes.metadata](#hoodietablesecondaryindexesmetadata) | (N/A) | The metadata of secondary indexes
`Config Param: SECONDARY_INDEXES_METADATA`
`Since Version: 0.13.0` | +| [hoodie.timeline.layout.version](#hoodietimelinelayoutversion) | (N/A) | Version of timeline used, by the table.
`Config Param: TIMELINE_LAYOUT_VERSION` | +| [hoodie.archivelog.folder](#hoodiearchivelogfolder) | archived | path under the meta folder, to store archived timeline instants at.
`Config Param: ARCHIVELOG_FOLDER` | +| [hoodie.bootstrap.index.class](#hoodiebootstrapindexclass) | org.apache.hudi.common.bootstrap.index.hfile.HFileBootstrapIndex | Implementation to use, for mapping base files to bootstrap base file, that contain actual data.
`Config Param: BOOTSTRAP_INDEX_CLASS_NAME` | +| [hoodie.bootstrap.index.enable](#hoodiebootstrapindexenable) | true | Whether or not this is a bootstrapped table, with bootstrap base data and a mapping index defined, default true.
`Config Param: BOOTSTRAP_INDEX_ENABLE` | +| [hoodie.bootstrap.index.type](#hoodiebootstrapindextype) | HFILE | Bootstrap index type determines which implementation to use, for mapping base files to bootstrap base file, that contain actual data.
`Config Param: BOOTSTRAP_INDEX_TYPE`
`Since Version: 1.0.0` | +| [hoodie.datasource.write.hive_style_partitioning](#hoodiedatasourcewritehive_style_partitioning) | false | Flag to indicate whether to use Hive style partitioning. If set true, the names of partition folders follow <partition_column_name>=<partition_value> format. By default false (the names of partition folders are only partition values)
`Config Param: HIVE_STYLE_PARTITIONING_ENABLE` | +| [hoodie.partition.metafile.use.base.format](#hoodiepartitionmetafileusebaseformat) | false | If true, partition metafiles are saved in the same format as base-files for this dataset (e.g. Parquet / ORC). If false (default) partition metafiles are saved as properties files.
`Config Param: PARTITION_METAFILE_USE_BASE_FORMAT` | +| [hoodie.populate.meta.fields](#hoodiepopulatemetafields) | true | When enabled, populates all meta fields. When disabled, no meta fields are populated and incremental queries will not be functional. This is only meant to be used for append only/immutable data for batch processing
`Config Param: POPULATE_META_FIELDS` | +| [hoodie.record.merge.mode](#hoodierecordmergemode) | EVENT_TIME_ORDERING | org.apache.hudi.common.config.RecordMergeMode: Determines the logic of merging updates COMMIT_TIME_ORDERING: Using transaction time to merge records, i.e., the record from later transaction overwrites the earlier record with the same key. EVENT_TIME_ORDERING(default): Using event time as the ordering to merge records, i.e., the record with the larger event time overwrites the record with the smaller event time on the same key, regardless of transaction time. The event time or preCombine field needs to be specified by the user. CUSTOM: Using custom merging logic specified by the user.
`Config Param: RECORD_MERGE_MODE`
`Since Version: 1.0.0` | +| [hoodie.table.base.file.format](#hoodietablebasefileformat) | PARQUET | Base file format to store all the base file data.
`Config Param: BASE_FILE_FORMAT` | +| [hoodie.table.cdc.enabled](#hoodietablecdcenabled) | false | When enabled, persists the change data if necessary, so that the table can be queried in CDC query mode.
`Config Param: CDC_ENABLED`
`Since Version: 0.13.0` | +| [hoodie.table.cdc.supplemental.logging.mode](#hoodietablecdcsupplementalloggingmode) | DATA_BEFORE_AFTER | org.apache.hudi.common.table.cdc.HoodieCDCSupplementalLoggingMode: Change log capture supplemental logging mode. The supplemental log is used for accelerating the generation of change log details. OP_KEY_ONLY: Only keeping record keys in the supplemental logs, so the reader needs to figure out the update before image and after image. DATA_BEFORE: Keeping the before images in the supplemental logs, so the reader needs to figure out the update after images. DATA_BEFORE_AFTER(default): Keeping the before and after images in the supplemental logs, so the reader can generate the details directly from the logs.
`Config Param: CDC_SUPPLEMENTAL_LOGGING_MODE`
`Since Version: 0.13.0` | +| [hoodie.table.initial.version](#hoodietableinitialversion) | EIGHT | Initial Version of table when the table was created. Used for upgrade/downgrade to identify what upgrade/downgrade paths happened on the table. This is only configured when the table is initially setup.
`Config Param: INITIAL_VERSION`
`Since Version: 1.0.0` | +| [hoodie.table.log.file.format](#hoodietablelogfileformat) | HOODIE_LOG | Log format used for the delta logs.
`Config Param: LOG_FILE_FORMAT` | +| [hoodie.table.multiple.base.file.formats.enable](#hoodietablemultiplebasefileformatsenable) | false | When set to true, the table can support reading and writing multiple base file formats.
`Config Param: MULTIPLE_BASE_FILE_FORMATS_ENABLE`
`Since Version: 1.0.0` | +| [hoodie.table.timeline.timezone](#hoodietabletimelinetimezone) | LOCAL | User can set hoodie commit timeline timezone, such as utc, local and so on. local is default
`Config Param: TIMELINE_TIMEZONE` | +| [hoodie.table.type](#hoodietabletype) | COPY_ON_WRITE | The table type for the underlying data.
`Config Param: TYPE` | +| [hoodie.table.version](#hoodietableversion) | EIGHT | Version of table, used for running upgrade/downgrade steps between releases with potentially breaking/backwards compatible changes.
`Config Param: VERSION` | +| [hoodie.timeline.history.path](#hoodietimelinehistorypath) | history | path under the meta folder, to store timeline history at.
`Config Param: TIMELINE_HISTORY_PATH` | +| [hoodie.timeline.path](#hoodietimelinepath) | timeline | path under the meta folder, to store timeline instants at.
`Config Param: TIMELINE_PATH` | +--- + +## Spark Datasource Configs {#SPARK_DATASOURCE} +These configs control the Hudi Spark Datasource, providing ability to define keys/partitioning, pick out the write operation, specify how to merge records or choosing query type to read. + + +### Read Options {#Read-Options} +Options useful for reading tables via `read.format.option(...)` + + + + + +[**Basic Configs**](#Read-Options-basic-configs) + + +| Config Name | Default | Description | +| -------------------------------------------------------------------------------------------- | --------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| [hoodie.datasource.read.begin.instanttime](#hoodiedatasourcereadbegininstanttime) | (N/A) | Required when `hoodie.datasource.query.type` is set to `incremental`. Represents the completion time to start incrementally pulling data from. The completion time here need not necessarily correspond to an instant on the timeline. New data written with completion_time >= START_COMMIT are fetched out. For e.g: ‘20170901080000’ will get all new data written on or after Sep 1, 2017 08:00AM.
`Config Param: START_COMMIT` | +| [hoodie.datasource.read.end.instanttime](#hoodiedatasourcereadendinstanttime) | (N/A) | Used when `hoodie.datasource.query.type` is set to `incremental`. Represents the completion time to limit incrementally fetched data to. When not specified latest commit completion time from timeline is assumed by default. When specified, new data written with completion_time <= END_COMMIT are fetched out. Point in time type queries make more sense with begin and end completion times specified.
`Config Param: END_COMMIT` | +| [hoodie.datasource.read.incr.table.version](#hoodiedatasourcereadincrtableversion) | (N/A) | The table version assumed for incremental read
`Config Param: INCREMENTAL_READ_TABLE_VERSION` | +| [hoodie.datasource.read.streaming.table.version](#hoodiedatasourcereadstreamingtableversion) | (N/A) | The table version assumed for streaming read
`Config Param: STREAMING_READ_TABLE_VERSION` | +| [hoodie.datasource.query.type](#hoodiedatasourcequerytype) | snapshot | Whether data needs to be read, in `incremental` mode (new data since an instantTime) (or) `read_optimized` mode (obtain latest view, based on base files) (or) `snapshot` mode (obtain latest view, by merging base and (if any) log files)
`Config Param: QUERY_TYPE` | +| [hoodie.datasource.write.precombine.field](#hoodiedatasourcewriteprecombinefield) | ts | Field used in preCombining before actual write. When two records have the same key value, we will pick the one with the largest value for the precombine field, determined by Object.compareTo(..)
`Config Param: READ_PRE_COMBINE_FIELD` | +--- + + +### Write Options {#Write-Options} +You can pass down any of the WriteClient level configs directly using `options()` or `option(k,v)` methods. + +```java +inputDF.write() +.format("org.apache.hudi") +.options(clientOpts) // any of the Hudi client opts can be passed in as well +.option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY(), "_row_key") +.option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), "partition") +.option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY(), "timestamp") +.option(HoodieWriteConfig.TABLE_NAME, tableName) +.mode(SaveMode.Append) +.save(basePath); +``` + +Options useful for writing tables via `write.format.option(...)` + + + + + +[**Basic Configs**](#Write-Options-basic-configs) + + +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------------------ | ----------------------------- | ----------- | +| [hoodie.datasource.hive_sync.mode](#hoodiedatasourcehive_syncmode) | (N/A) | Mode to choose for Hive ops. Valid values are hms, jdbc and hiveql.
`Config Param: HIVE_SYNC_MODE` | +| [hoodie.datasource.write.partitionpath.field](#hoodiedatasourcewritepartitionpathfield) | (N/A) | Partition path field. Value to be used as the partitionPath component of HoodieKey. Actual value obtained by invoking .toString()
`Config Param: PARTITIONPATH_FIELD` | +| [hoodie.datasource.write.recordkey.field](#hoodiedatasourcewriterecordkeyfield) | (N/A) | Record key field. Value to be used as the `recordKey` component of `HoodieKey`. Actual value will be obtained by invoking .toString() on the field value. Nested fields can be specified using the dot notation eg: `a.b.c`
`Config Param: RECORDKEY_FIELD` | +| [hoodie.datasource.write.secondarykey.column](#hoodiedatasourcewritesecondarykeycolumn) | (N/A) | Columns that constitute the secondary key component. Actual value will be obtained by invoking .toString() on the field value. Nested fields can be specified using the dot notation eg: `a.b.c`
`Config Param: SECONDARYKEY_COLUMN_NAME` | +| [hoodie.clustering.async.enabled](#hoodieclusteringasyncenabled) | false | Enable running of clustering service, asynchronously as inserts happen on the table.
`Config Param: ASYNC_CLUSTERING_ENABLE`
`Since Version: 0.7.0` | +| [hoodie.clustering.inline](#hoodieclusteringinline) | false | Turn on inline clustering - clustering will be run after each write operation is complete
`Config Param: INLINE_CLUSTERING_ENABLE`
`Since Version: 0.7.0` | +| [hoodie.datasource.hive_sync.enable](#hoodiedatasourcehive_syncenable) | false | When set to true, register/sync the table to Apache Hive metastore.
`Config Param: HIVE_SYNC_ENABLED` | +| [hoodie.datasource.hive_sync.jdbcurl](#hoodiedatasourcehive_syncjdbcurl) | jdbc:hive2://localhost:10000 | Hive JDBC url
`Config Param: HIVE_URL` | +| [hoodie.datasource.hive_sync.metastore.uris](#hoodiedatasourcehive_syncmetastoreuris) | thrift://localhost:9083 | Hive metastore url
`Config Param: METASTORE_URIS` | +| [hoodie.datasource.meta.sync.enable](#hoodiedatasourcemetasyncenable) | false | Enable Syncing the Hudi Table with an external meta store or data catalog.
`Config Param: META_SYNC_ENABLED` | +| [hoodie.datasource.write.hive_style_partitioning](#hoodiedatasourcewritehive_style_partitioning) | false | Flag to indicate whether to use Hive style partitioning. If set true, the names of partition folders follow <partition_column_name>=<partition_value> format. By default false (the names of partition folders are only partition values)
`Config Param: HIVE_STYLE_PARTITIONING` | +| [hoodie.datasource.write.operation](#hoodiedatasourcewriteoperation) | upsert | Whether to do upsert, insert or bulk_insert for the write operation. Use bulk_insert to load new data into a table, and from then on use upsert/insert. bulk_insert uses a disk based write path to scale to large inputs without needing to cache them.
`Config Param: OPERATION` | +| [hoodie.datasource.write.precombine.field](#hoodiedatasourcewriteprecombinefield) | ts | Field used in preCombining before actual write. When two records have the same key value, we will pick the one with the largest value for the precombine field, determined by Object.compareTo(..)
`Config Param: PRECOMBINE_FIELD` | +| [hoodie.datasource.write.table.type](#hoodiedatasourcewritetabletype) | COPY_ON_WRITE | The table type for the underlying data, for this write. This can’t change between writes.
`Config Param: TABLE_TYPE` | +| [hoodie.write.record.merge.mode](#hoodiewriterecordmergemode) | EVENT_TIME_ORDERING | org.apache.hudi.common.config.RecordMergeMode: Determines the logic of merging updates COMMIT_TIME_ORDERING: Using transaction time to merge records, i.e., the record from later transaction overwrites the earlier record with the same key. EVENT_TIME_ORDERING(default): Using event time as the ordering to merge records, i.e., the record with the larger event time overwrites the record with the smaller event time on the same key, regardless of transaction time. The event time or preCombine field needs to be specified by the user. CUSTOM: Using custom merging logic specified by the user.
`Config Param: RECORD_MERGE_MODE`
`Since Version: 1.0.0` | +--- + +## Flink Sql Configs {#FLINK_SQL} +These configs control the Hudi Flink SQL source/sink connectors, providing ability to define record keys, pick out the write operation, specify how to merge records, enable/disable asynchronous compaction or choosing query type to read. + + +### Flink Options {#Flink-Options} +Flink jobs using the SQL can be configured through the options in WITH clause. The actual datasource level configs are listed below. + + + + +[**Basic Configs**](#Flink-Options-basic-configs) + + +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------------------ | ----------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.database.name](#hoodiedatabasename) | (N/A) | Database name to register to Hive metastore
`Config Param: DATABASE_NAME` | +| [hoodie.table.name](#hoodietablename) | (N/A) | Table name to register to Hive metastore
`Config Param: TABLE_NAME` | +| [path](#path) | (N/A) | Base path for the target hoodie table. The path will be created if it does not exist; otherwise it is expected to already contain a successfully initialized Hoodie table
`Config Param: PATH` | +| [read.commits.limit](#readcommitslimit) | (N/A) | The maximum number of commits allowed to read in each instant check, if it is streaming read, the avg read instants number per-second would be 'read.commits.limit'/'read.streaming.check-interval', by default no limit
`Config Param: READ_COMMITS_LIMIT` | +| [read.end-commit](#readend-commit) | (N/A) | End commit instant for reading, the commit time format should be 'yyyyMMddHHmmss'
`Config Param: READ_END_COMMIT` | +| [read.start-commit](#readstart-commit) | (N/A) | Start commit instant for reading, the commit time format should be 'yyyyMMddHHmmss', by default reading from the latest instant for streaming read
`Config Param: READ_START_COMMIT` | +| [archive.max_commits](#archivemax_commits) | 50 | Max number of commits to keep before archiving older commits into a sequential log, default 50
`Config Param: ARCHIVE_MAX_COMMITS` | +| [archive.min_commits](#archivemin_commits) | 40 | Min number of commits to keep before archiving older commits into a sequential log, default 40
`Config Param: ARCHIVE_MIN_COMMITS` | +| [cdc.enabled](#cdcenabled) | false | When enabled, persists the change data if necessary, so that the table can be queried in CDC query mode
`Config Param: CDC_ENABLED` | +| [cdc.supplemental.logging.mode](#cdcsupplementalloggingmode) | DATA_BEFORE_AFTER | Setting 'op_key_only' persists the 'op' and the record key only, setting 'data_before' persists the additional 'before' image, and setting 'data_before_after' persists the additional 'before' and 'after' images.
`Config Param: SUPPLEMENTAL_LOGGING_MODE` | +| [changelog.enabled](#changelogenabled) | false | Whether to keep all the intermediate changes. When enabled, we try to keep all the changes of a record: 1). The sink accepts the UPDATE_BEFORE message; 2). The source tries to emit every change of a record. The semantics are best effort because the compaction job would finally merge all changes of a record into one. Default false to have UPSERT semantics
`Config Param: CHANGELOG_ENABLED` | +| [clean.async.enabled](#cleanasyncenabled) | true | Whether to cleanup the old commits immediately on new commits, enabled by default
`Config Param: CLEAN_ASYNC_ENABLED` | +| [clean.retain_commits](#cleanretain_commits) | 30 | Number of commits to retain. So data will be retained for num_of_commits * time_between_commits (scheduled). This also directly translates into how much you can incrementally pull on this table, default 30
`Config Param: CLEAN_RETAIN_COMMITS` | +| [clustering.async.enabled](#clusteringasyncenabled) | false | Async Clustering, default false
`Config Param: CLUSTERING_ASYNC_ENABLED` | +| [clustering.plan.strategy.small.file.limit](#clusteringplanstrategysmallfilelimit) | 600 | Files smaller than the size specified here are candidates for clustering, default 600 MB
`Config Param: CLUSTERING_PLAN_STRATEGY_SMALL_FILE_LIMIT` | +| [clustering.plan.strategy.target.file.max.bytes](#clusteringplanstrategytargetfilemaxbytes) | 1073741824 | Each group can produce 'N' (CLUSTERING_MAX_GROUP_SIZE/CLUSTERING_TARGET_FILE_SIZE) output file groups, default 1 GB
`Config Param: CLUSTERING_PLAN_STRATEGY_TARGET_FILE_MAX_BYTES` | +| [compaction.async.enabled](#compactionasyncenabled) | true | Async Compaction, enabled by default for MOR
`Config Param: COMPACTION_ASYNC_ENABLED` | +| [compaction.delta_commits](#compactiondelta_commits) | 5 | Max delta commits needed to trigger compaction, default 5 commits
`Config Param: COMPACTION_DELTA_COMMITS` | +| [hive_sync.enabled](#hive_syncenabled) | false | Asynchronously sync Hive meta to HMS, default false
`Config Param: HIVE_SYNC_ENABLED` | +| [hive_sync.jdbc_url](#hive_syncjdbc_url) | jdbc:hive2://localhost:10000 | Jdbc URL for hive sync, default 'jdbc:hive2://localhost:10000'
`Config Param: HIVE_SYNC_JDBC_URL` | +| [hive_sync.metastore.uris](#hive_syncmetastoreuris) | | Metastore uris for hive sync, default ''
`Config Param: HIVE_SYNC_METASTORE_URIS` | +| [hive_sync.mode](#hive_syncmode) | HMS | Mode to choose for Hive ops. Valid values are hms, jdbc and hiveql, default 'hms'
`Config Param: HIVE_SYNC_MODE` | +| [hoodie.datasource.query.type](#hoodiedatasourcequerytype) | snapshot | Decides how data files need to be read, in 1) Snapshot mode (obtain latest view, based on row & columnar data); 2) Incremental mode (new data since an instantTime); 3) Read Optimized mode (obtain latest view, based on columnar data). Default: snapshot
`Config Param: QUERY_TYPE` | +| [hoodie.datasource.write.hive_style_partitioning](#hoodiedatasourcewritehive_style_partitioning) | false | Whether to use Hive style partitioning. If set true, the names of partition folders follow <partition_column_name>=<partition_value> format. By default false (the names of partition folders are only partition values)
`Config Param: HIVE_STYLE_PARTITIONING` | +| [hoodie.datasource.write.partitionpath.field](#hoodiedatasourcewritepartitionpathfield) | | Partition path field. Value to be used as the `partitionPath` component of `HoodieKey`. Actual value is obtained by invoking .toString(), default ''
`Config Param: PARTITION_PATH_FIELD` | +| [hoodie.datasource.write.recordkey.field](#hoodiedatasourcewriterecordkeyfield) | uuid | Record key field. Value to be used as the `recordKey` component of `HoodieKey`. Actual value will be obtained by invoking .toString() on the field value. Nested fields can be specified using the dot notation eg: `a.b.c`
`Config Param: RECORD_KEY_FIELD` | +| [index.type](#indextype) | FLINK_STATE | Index type of Flink write job, default is using state backed index.
`Config Param: INDEX_TYPE` | +| [lookup.join.cache.ttl](#lookupjoincachettl) | PT1H | The cache TTL (e.g. 10min) for the build table in lookup join.
`Config Param: LOOKUP_JOIN_CACHE_TTL` | +| [metadata.compaction.delta_commits](#metadatacompactiondelta_commits) | 10 | Max delta commits for metadata table to trigger compaction, default 10
`Config Param: METADATA_COMPACTION_DELTA_COMMITS` | +| [metadata.enabled](#metadataenabled) | true | Enable the internal metadata table which serves table metadata like level file listings, default enabled
`Config Param: METADATA_ENABLED` | +| [precombine.field](#precombinefield) | ts | Field used in preCombining before actual write. When two records have the same key value, we will pick the one with the largest value for the precombine field, determined by Object.compareTo(..)
`Config Param: PRECOMBINE_FIELD` | +| [read.streaming.enabled](#readstreamingenabled) | false | Whether to read as streaming source, default false
`Config Param: READ_AS_STREAMING` | +| [read.streaming.skip_insertoverwrite](#readstreamingskip_insertoverwrite) | false | Whether to skip insert overwrite instants to avoid reading base files of insert overwrite operations for streaming read. In streaming scenarios, insert overwrite is usually used to repair data, here you can control the visibility of downstream streaming read.
`Config Param: READ_STREAMING_SKIP_INSERT_OVERWRITE` | +| [table.type](#tabletype) | COPY_ON_WRITE | Type of table to write. COPY_ON_WRITE (or) MERGE_ON_READ
`Config Param: TABLE_TYPE` | +| [write.operation](#writeoperation) | upsert | The write operation, that this write should do
`Config Param: OPERATION` | +| [write.parquet.max.file.size](#writeparquetmaxfilesize) | 120 | Target size for parquet files produced by Hudi write phases. For DFS, this needs to be aligned with the underlying filesystem block size for optimal performance.
`Config Param: WRITE_PARQUET_MAX_FILE_SIZE` | +--- + +## Write Client Configs {#WRITE_CLIENT} +Internally, the Hudi datasource uses a RDD based HoodieWriteClient API to actually perform writes to storage. These configs provide deep control over lower level aspects like file sizing, compression, parallelism, compaction, write schema, cleaning etc. Although Hudi provides sane defaults, from time-time these configs may need to be tweaked to optimize for specific workloads. + + +### Common Configurations {#Common-Configurations} +The following set of configurations are common across Hudi. + + + + +[**Basic Configs**](#Common-Configurations-basic-configs) + + +| Config Name | Default | Description | +| ----------------------------------- | ------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.base.path](#hoodiebasepath) | (N/A) | Base path on lake storage, under which all the table data is stored. Always prefix it explicitly with the storage scheme (e.g hdfs://, s3:// etc). Hudi stores all the main meta-data about commits, savepoints, cleaning audit logs etc in .hoodie directory under this base path directory.
`Config Param: BASE_PATH` | +--- + + +### Metadata Configs {#Metadata-Configs} +Configurations used by the Hudi Metadata Table. This table maintains the metadata about a given Hudi table (e.g file listings) to avoid overhead of accessing cloud storage, during queries. + + + + +[**Basic Configs**](#Metadata-Configs-basic-configs) + + +| Config Name | Default | Description | +| ---------------------------------------------------------------------------------------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.metadata.enable](#hoodiemetadataenable) | true | Enable the internal metadata table which serves table metadata like level file listings
`Config Param: ENABLE`
`Since Version: 0.7.0` | +| [hoodie.metadata.index.bloom.filter.enable](#hoodiemetadataindexbloomfilterenable) | false | Enable indexing bloom filters of user data files under metadata table. When enabled, metadata table will have a partition to store the bloom filter index and will be used during the index lookups.
`Config Param: ENABLE_METADATA_INDEX_BLOOM_FILTER`
`Since Version: 0.11.0` | +| [hoodie.metadata.index.column.stats.enable](#hoodiemetadataindexcolumnstatsenable) | false | Enable indexing column ranges of user data files under metadata table key lookups. When enabled, metadata table will have a partition to store the column ranges and will be used for pruning files during the index lookups.
`Config Param: ENABLE_METADATA_INDEX_COLUMN_STATS`
`Since Version: 0.11.0` | +| [hoodie.metadata.index.expression.enable](#hoodiemetadataindexexpressionenable) | false | Enable expression index within the metadata table. When this configuration property is enabled (`true`), the Hudi writer automatically keeps all expression indexes consistent with the data table. When disabled (`false`), all expression indexes are deleted. Note that individual expression index can only be created through a `CREATE INDEX` and deleted through a `DROP INDEX` statement in Spark SQL.
`Config Param: EXPRESSION_INDEX_ENABLE_PROP`
`Since Version: 1.0.0` | +| [hoodie.metadata.index.partition.stats.enable](#hoodiemetadataindexpartitionstatsenable) | false | Enable aggregating stats for each column at the storage partition level.
`Config Param: ENABLE_METADATA_INDEX_PARTITION_STATS`
`Since Version: 1.0.0` | +| [hoodie.metadata.index.secondary.enable](#hoodiemetadataindexsecondaryenable) | true | Enable secondary index within the metadata table. When this configuration property is enabled (`true`), the Hudi writer automatically keeps all secondary indexes consistent with the data table. When disabled (`false`), all secondary indexes are deleted. Note that individual secondary index can only be created through a `CREATE INDEX` and deleted through a `DROP INDEX` statement in Spark SQL.
`Config Param: SECONDARY_INDEX_ENABLE_PROP`
`Since Version: 1.0.0` | +--- + + +### Storage Configs {#Storage-Configs} +Configurations that control aspects around writing, sizing, reading base and log files. + + + + +[**Basic Configs**](#Storage-Configs-basic-configs) + + +| Config Name | Default | Description | +| ------------------------------------------------------------------ | ---------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.parquet.compression.codec](#hoodieparquetcompressioncodec) | gzip | Compression Codec for parquet files
`Config Param: PARQUET_COMPRESSION_CODEC_NAME` | +| [hoodie.parquet.max.file.size](#hoodieparquetmaxfilesize) | 125829120 | Target size in bytes for parquet files produced by Hudi write phases. For DFS, this needs to be aligned with the underlying filesystem block size for optimal performance.
`Config Param: PARQUET_MAX_FILE_SIZE` | +--- + + +### Archival Configs {#Archival-Configs} +Configurations that control archival. + + + + +[**Basic Configs**](#Archival-Configs-basic-configs) + + +| Config Name | Default | Description | +| ------------------------------------------------ | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| [hoodie.keep.max.commits](#hoodiekeepmaxcommits) | 30 | Archiving service moves older entries from timeline into an archived log after each write, to keep the metadata overhead constant, even as the table size grows. This config controls the maximum number of instants to retain in the active timeline.
`Config Param: MAX_COMMITS_TO_KEEP` | +| [hoodie.keep.min.commits](#hoodiekeepmincommits) | 20 | Similar to hoodie.keep.max.commits, but controls the minimum number of instants to retain in the active timeline.
`Config Param: MIN_COMMITS_TO_KEEP` | +--- + + +### Bootstrap Configs {#Bootstrap-Configs} +Configurations that control how you want to bootstrap your existing tables for the first time into hudi. The bootstrap operation can flexibly avoid copying data over before you can use Hudi and support running the existing writers and new hudi writers in parallel, to validate the migration. + + + + +[**Basic Configs**](#Bootstrap-Configs-basic-configs) + + +| Config Name | Default | Description | +| ------------------------------------------------------ | ------- | --------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.bootstrap.base.path](#hoodiebootstrapbasepath) | (N/A) | Base path of the dataset that needs to be bootstrapped as a Hudi table
`Config Param: BASE_PATH`
`Since Version: 0.6.0` | +--- + + +### Clean Configs {#Clean-Configs} +Cleaning (reclamation of older/unused file groups/slices). + + + + +[**Basic Configs**](#Clean-Configs-basic-configs) + + +| Config Name | Default | Description | +| ------------------------------------------------------------ | ------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.clean.async.enabled](#hoodiecleanasyncenabled) | false | Only applies when hoodie.clean.automatic is turned on. When turned on runs cleaner async with writing, which can speed up overall write performance.
`Config Param: ASYNC_CLEAN` | +| [hoodie.clean.commits.retained](#hoodiecleancommitsretained) | 10 | When KEEP_LATEST_COMMITS cleaning policy is used, the number of commits to retain, without cleaning. This will be retained for num_of_commits * time_between_commits (scheduled). This also directly translates into how much data retention the table supports for incremental queries.
`Config Param: CLEANER_COMMITS_RETAINED` | +--- + + +### Clustering Configs {#Clustering-Configs} +Configurations that control the clustering table service in hudi, which optimizes the storage layout for better query performance by sorting and sizing data files. + + + + +[**Basic Configs**](#Clustering-Configs-basic-configs) + + +| Config Name | Default | Description | +| -------------------------------------------------------------------------------------------------------- | ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| [hoodie.clustering.async.enabled](#hoodieclusteringasyncenabled) | false | Enable running of clustering service, asynchronously as inserts happen on the table.
`Config Param: ASYNC_CLUSTERING_ENABLE`
`Since Version: 0.7.0` | +| [hoodie.clustering.inline](#hoodieclusteringinline) | false | Turn on inline clustering - clustering will be run after each write operation is complete
`Config Param: INLINE_CLUSTERING`
`Since Version: 0.7.0` | +| [hoodie.clustering.plan.strategy.small.file.limit](#hoodieclusteringplanstrategysmallfilelimit) | 314572800 | Files smaller than the size in bytes specified here are candidates for clustering
`Config Param: PLAN_STRATEGY_SMALL_FILE_LIMIT`
`Since Version: 0.7.0` | +| [hoodie.clustering.plan.strategy.target.file.max.bytes](#hoodieclusteringplanstrategytargetfilemaxbytes) | 1073741824 | Each group can produce 'N' (CLUSTERING_MAX_GROUP_SIZE/CLUSTERING_TARGET_FILE_SIZE) output file groups
`Config Param: PLAN_STRATEGY_TARGET_FILE_MAX_BYTES`
`Since Version: 0.7.0` | +--- + + +### Compaction Configs {#Compaction-Configs} +Configurations that control compaction (merging of log files onto a new base files). + + + + +[**Basic Configs**](#Compaction-Configs-basic-configs) + + +| Config Name | Default | Description | +| ------------------------------------------------------------------------------ | ------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.compact.inline](#hoodiecompactinline) | false | When set to true, compaction service is triggered after each write. While being simpler operationally, this adds extra latency on the write path.
`Config Param: INLINE_COMPACT` | +| [hoodie.compact.inline.max.delta.commits](#hoodiecompactinlinemaxdeltacommits) | 5 | Number of delta commits after the last compaction, before scheduling of a new compaction is attempted. This config takes effect only for the compaction triggering strategy based on the number of commits, i.e., NUM_COMMITS, NUM_COMMITS_AFTER_LAST_REQUEST, NUM_AND_TIME, and NUM_OR_TIME.
`Config Param: INLINE_COMPACT_NUM_DELTA_COMMITS` | +--- + + +### Error table Configs {#Error-table-Configs} +Configurations that are required for Error table configs + + + + +[**Basic Configs**](#Error-table-Configs-basic-configs) + + +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------------------- | ---------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.errortable.base.path](#hoodieerrortablebasepath) | (N/A) | Base path for error table under which all error records would be stored.
`Config Param: ERROR_TABLE_BASE_PATH` | +| [hoodie.errortable.target.table.name](#hoodieerrortabletargettablename) | (N/A) | Table name to be used for the error table
`Config Param: ERROR_TARGET_TABLE` | +| [hoodie.errortable.write.class](#hoodieerrortablewriteclass) | (N/A) | Class which handles the error table writes. This config is used to configure a custom implementation for Error Table Writer. Specify the full class name of the custom error table writer as a value for this config
`Config Param: ERROR_TABLE_WRITE_CLASS` | +| [hoodie.errortable.enable](#hoodieerrortableenable) | false | Config to enable error table. If the config is enabled, all the records with processing error in DeltaStreamer are transferred to error table.
`Config Param: ERROR_TABLE_ENABLED` | +| [hoodie.errortable.insert.shuffle.parallelism](#hoodieerrortableinsertshuffleparallelism) | 200 | Config to set insert shuffle parallelism. The config is similar to hoodie.insert.shuffle.parallelism config but applies to the error table.
`Config Param: ERROR_TABLE_INSERT_PARALLELISM_VALUE` | +| [hoodie.errortable.upsert.shuffle.parallelism](#hoodieerrortableupsertshuffleparallelism) | 200 | Config to set upsert shuffle parallelism. The config is similar to hoodie.upsert.shuffle.parallelism config but applies to the error table.
`Config Param: ERROR_TABLE_UPSERT_PARALLELISM_VALUE` | +| [hoodie.errortable.validate.recordcreation.enable](#hoodieerrortablevalidaterecordcreationenable) | true | Records that fail to be created due to keygeneration failure or other issues will be sent to the Error Table
`Config Param: ERROR_ENABLE_VALIDATE_RECORD_CREATION`
`Since Version: 0.15.0` | +| [hoodie.errortable.validate.targetschema.enable](#hoodieerrortablevalidatetargetschemaenable) | false | Records with schema mismatch with Target Schema are sent to Error Table.
`Config Param: ERROR_ENABLE_VALIDATE_TARGET_SCHEMA` | +| [hoodie.errortable.write.failure.strategy](#hoodieerrortablewritefailurestrategy) | ROLLBACK_COMMIT | The config specifies the failure strategy if error table write fails. Use one of - [ROLLBACK_COMMIT (Rollback the corresponding base table write commit for which the error events were triggered) , LOG_ERROR (Error is logged but the base table write succeeds) ]
`Config Param: ERROR_TABLE_WRITE_FAILURE_STRATEGY` | +--- + + +### Write Configurations {#Write-Configurations} +Configurations that control write behavior on Hudi tables. These can be directly passed down from even higher level frameworks (e.g Spark datasources, Flink sink) and utilities (e.g Hudi Streamer). + + + + +[**Basic Configs**](#Write-Configurations-basic-configs) + + +| Config Name | Default | Description | +| ---------------------------------------------------------------------------------------------------------------------------------------------- | -------------------- | --- | +| [hoodie.base.path](#hoodiebasepath) | (N/A) | Base path on lake storage, under which all the table data is stored. Always prefix it explicitly with the storage scheme (e.g hdfs://, s3:// etc). Hudi stores all the main meta-data about commits, savepoints, cleaning audit logs etc in .hoodie directory under this base path directory.
`Config Param: BASE_PATH` | +| [hoodie.table.name](#hoodietablename) | (N/A) | Table name that will be used for registering with metastores like HMS. Needs to be same across runs.
`Config Param: TBL_NAME` | +| [hoodie.datasource.write.precombine.field](#hoodiedatasourcewriteprecombinefield) | ts | Field used in preCombining before actual write. When two records have the same key value, we will pick the one with the largest value for the precombine field, determined by Object.compareTo(..)
`Config Param: PRECOMBINE_FIELD_NAME` | +| [hoodie.instant_state.timeline_server_based.enabled](#hoodieinstant_statetimeline_server_basedenabled) | false | If enabled, writers get instant state from timeline server rather than requesting DFS directly
`Config Param: INSTANT_STATE_TIMELINE_SERVER_BASED`
`Since Version: 1.0.0` | +| [hoodie.instant_state.timeline_server_based.force_refresh.request.number](#hoodieinstant_statetimeline_server_basedforce_refreshrequestnumber) | 100 | Number of requests to trigger instant state cache refreshing
`Config Param: INSTANT_STATE_TIMELINE_SERVER_BASED_FORCE_REFRESH_REQUEST_NUMBER`
`Since Version: 1.0.0` | +| [hoodie.write.auto.upgrade](#hoodiewriteautoupgrade) | true | If enabled, writers automatically migrate the table to the specified write table version if the current table version is lower.
`Config Param: AUTO_UPGRADE_VERSION`
`Since Version: 1.0.0` | +| [hoodie.write.concurrency.mode](#hoodiewriteconcurrencymode) | SINGLE_WRITER | org.apache.hudi.common.model.WriteConcurrencyMode: Concurrency modes for write operations. SINGLE_WRITER(default): Only one active writer to the table. Maximizes throughput. OPTIMISTIC_CONCURRENCY_CONTROL: Multiple writers can operate on the table with lazy conflict resolution using locks. This means that only one writer succeeds if multiple writers write to the same file group. NON_BLOCKING_CONCURRENCY_CONTROL: Multiple writers can operate on the table with non-blocking conflict resolution. The writers can write into the same file group with the conflicts resolved automatically by the query reader and the compactor.
`Config Param: WRITE_CONCURRENCY_MODE` | +| [hoodie.write.record.merge.mode](#hoodiewriterecordmergemode) | EVENT_TIME_ORDERING | org.apache.hudi.common.config.RecordMergeMode: Determines the logic of merging updates COMMIT_TIME_ORDERING: Using transaction time to merge records, i.e., the record from later transaction overwrites the earlier record with the same key. EVENT_TIME_ORDERING(default): Using event time as the ordering to merge records, i.e., the record with the larger event time overwrites the record with the smaller event time on the same key, regardless of transaction time. The event time or preCombine field needs to be specified by the user. CUSTOM: Using custom merging logic specified by the user.
`Config Param: RECORD_MERGE_MODE`
`Since Version: 1.0.0` | +| [hoodie.write.table.version](#hoodiewritetableversion) | 8 | The table version this writer is storing the table in. This should match the current table version.
`Config Param: WRITE_TABLE_VERSION`
`Since Version: 1.0.0` | +--- + + +### Lock Configs {#LOCK} +Configurations that control locking mechanisms required for concurrency control between writers to a Hudi table. Concurrency between Hudi's own table services is auto-managed internally. + + +#### Common Lock Configurations {#Common-Lock-Configurations} + + + + + +[**Basic Configs**](#Common-Lock-Configurations-basic-configs) + + +| Config Name | Default | Description | +| -------------------------------------------------------------------------------- | ------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.write.lock.heartbeat_interval_ms](#hoodiewritelockheartbeat_interval_ms) | 60000 | Heartbeat interval in ms, to send a heartbeat to indicate that the hive client is holding locks.
`Config Param: LOCK_HEARTBEAT_INTERVAL_MS`
`Since Version: 0.15.0` | +--- + + +### Key Generator Configs {#KEY_GENERATOR} +Hudi maintains keys (record key + partition path) for uniquely identifying a particular record. These configs allow developers to setup the Key generator class that extracts these out of incoming records. + + +#### Key Generator Options {#Key-Generator-Options} + + + + + +[**Basic Configs**](#Key-Generator-Options-basic-configs) + + +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------------------ | ------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.datasource.write.partitionpath.field](#hoodiedatasourcewritepartitionpathfield) | (N/A) | Partition path field. Value to be used at the partitionPath component of HoodieKey. Actual value obtained by invoking .toString()
`Config Param: PARTITIONPATH_FIELD_NAME` | +| [hoodie.datasource.write.recordkey.field](#hoodiedatasourcewriterecordkeyfield) | (N/A) | Record key field. Value to be used as the `recordKey` component of `HoodieKey`. Actual value will be obtained by invoking .toString() on the field value. Nested fields can be specified using the dot notation eg: `a.b.c`
`Config Param: RECORDKEY_FIELD_NAME` | +| [hoodie.datasource.write.secondarykey.column](#hoodiedatasourcewritesecondarykeycolumn) | (N/A) | Columns that constitute the secondary key component. Actual value will be obtained by invoking .toString() on the field value. Nested fields can be specified using the dot notation eg: `a.b.c`
`Config Param: SECONDARYKEY_COLUMN_NAME` | +| [hoodie.datasource.write.hive_style_partitioning](#hoodiedatasourcewritehive_style_partitioning) | false | Flag to indicate whether to use Hive style partitioning. If set true, the names of partition folders follow <partition_column_name>=<partition_value> format. By default false (the names of partition folders are only partition values)
`Config Param: HIVE_STYLE_PARTITIONING_ENABLE` | +--- + + +### Index Configs {#INDEX} +Configurations that control indexing behavior, which tags incoming records as either inserts or updates to older records. + + +#### Common Index Configs {#Common-Index-Configs} + + + + + +[**Basic Configs**](#Common-Index-Configs-basic-configs) + + +| Config Name | Default | Description | +| ------------------------------------------------------------------ | ------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.expression.index.function](#hoodieexpressionindexfunction) | (N/A) | Function to be used for building the expression index.
`Config Param: INDEX_FUNCTION`
`Since Version: 1.0.0` | +| [hoodie.expression.index.name](#hoodieexpressionindexname) | (N/A) | Name of the expression index. This is also used for the partition name in the metadata table.
`Config Param: INDEX_NAME`
`Since Version: 1.0.0` | +| [hoodie.table.checksum](#hoodietablechecksum) | (N/A) | Index definition checksum is used to guard against partial writes in HDFS. It is added as the last entry in index.properties and then used to validate while reading table config.
`Config Param: INDEX_DEFINITION_CHECKSUM`
`Since Version: 1.0.0` | +| [hoodie.expression.index.type](#hoodieexpressionindextype) | COLUMN_STATS | Type of the expression index. Default is `column_stats` if there are no functions and expressions in the command. Valid options could be BITMAP, COLUMN_STATS, LUCENE, etc. If index_type is not provided, and there are functions or expressions in the command then a expression index using column stats will be created.
`Config Param: INDEX_TYPE`
`Since Version: 1.0.0` | +--- + + +#### Common Index Configs {#Common-Index-Configs} + + + + + +[**Basic Configs**](#Common-Index-Configs-basic-configs) + + +| Config Name | Default | Description | +| ------------------------------------------------------------------- | ------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------- | +| [hoodie.index.type](#hoodieindextype) | (N/A) | org.apache.hudi.index.HoodieIndex$IndexType: Determines how input records are indexed, i.e., looked up based on the key for the location in the existing table. Default is SIMPLE on Spark engine, and INMEMORY on Flink and Java engines. HBASE: uses an external managed Apache HBase table to store record key to location mapping. HBase index is a global index, enforcing key uniqueness across all partitions in the table. INMEMORY: Uses in-memory hashmap in Spark and Java engine and Flink in-memory state in Flink for indexing. BLOOM: Employs bloom filters built out of the record keys, optionally also pruning candidate files using record key ranges. Key uniqueness is enforced inside partitions. GLOBAL_BLOOM: Employs bloom filters built out of the record keys, optionally also pruning candidate files using record key ranges. Key uniqueness is enforced across all partitions in the table. SIMPLE: Performs a lean join of the incoming update/delete records against keys extracted from the table on storage. Key uniqueness is enforced inside partitions. GLOBAL_SIMPLE: Performs a lean join of the incoming update/delete records against keys extracted from the table on storage. Key uniqueness is enforced across all partitions in the table. BUCKET: locates the file group containing the record fast by using bucket hashing, particularly beneficial in large scale. Use `hoodie.index.bucket.engine` to choose bucket engine type, i.e., how buckets are generated. FLINK_STATE: Internal Config for indexing based on Flink state. RECORD_INDEX: Index which saves the record key to location mappings in the HUDI Metadata Table. Record index is a global index, enforcing key uniqueness across all partitions in the table. Supports sharding to achieve very high scale.
`Config Param: INDEX_TYPE` | +| [hoodie.bucket.index.query.pruning](#hoodiebucketindexquerypruning) | true | Controls whether queries on tables with a bucket index use bucket pruning or not
`Config Param: BUCKET_QUERY_INDEX` | +--- + +## Metastore and Catalog Sync Configs {#META_SYNC} +Configurations used by the Hudi to sync metadata to external metastores and catalogs. + + +### Common Metadata Sync Configs {#Common-Metadata-Sync-Configs} + + + + + +[**Basic Configs**](#Common-Metadata-Sync-Configs-basic-configs) + + +| Config Name | Default | Description | +| --------------------------------------------------------------------- | ------- | ----------------------------------------------------------------------------------------------------------------- | +| [hoodie.datasource.meta.sync.enable](#hoodiedatasourcemetasyncenable) | false | Enable Syncing the Hudi Table with an external meta store or data catalog.
`Config Param: META_SYNC_ENABLED` | +--- + + +### Glue catalog sync based client Configurations {#Glue-catalog-sync-based-client-Configurations} +Configs that control Glue catalog sync based client. + + + + +[**Basic Configs**](#Glue-catalog-sync-based-client-Configurations-basic-configs) + + +| Config Name | Default | Description | +| --------------------------------------------------------------------------------------------------------------------------- | ------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.datasource.meta.sync.glue.partition_index_fields](#hoodiedatasourcemetasyncgluepartition_index_fields) | | Specify the partitions fields to index on aws glue. Separate the fields by semicolon. By default, when the feature is enabled, all the partition will be indexed. You can create up to three indexes, separate them by comma. Eg: col1;col2;col3,col2,col3
`Config Param: META_SYNC_PARTITION_INDEX_FIELDS`
`Since Version: 0.15.0` | +| [hoodie.datasource.meta.sync.glue.partition_index_fields.enable](#hoodiedatasourcemetasyncgluepartition_index_fieldsenable) | false | Enable aws glue partition index feature, to speedup partition based query pattern
`Config Param: META_SYNC_PARTITION_INDEX_FIELDS_ENABLE`
`Since Version: 0.15.0` | +--- + + +### BigQuery Sync Configs {#BigQuery-Sync-Configs} +Configurations used by the Hudi to sync metadata to Google BigQuery. + + + + +[**Basic Configs**](#BigQuery-Sync-Configs-basic-configs) + + +| Config Name | Default | Description | +| --------------------------------------------------------------------- | ------- | ----------------------------------------------------------------------------------------------------------------- | +| [hoodie.datasource.meta.sync.enable](#hoodiedatasourcemetasyncenable) | false | Enable Syncing the Hudi Table with an external meta store or data catalog.
`Config Param: META_SYNC_ENABLED` | +--- + + +### Hive Sync Configs {#Hive-Sync-Configs} +Configurations used by the Hudi to sync metadata to Hive Metastore. + + + + +[**Basic Configs**](#Hive-Sync-Configs-basic-configs) + + +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------- | ----------------------------- | ----------------------------------------------------------------------------------------------------------------- | +| [hoodie.datasource.hive_sync.mode](#hoodiedatasourcehive_syncmode) | (N/A) | Mode to choose for Hive ops. Valid values are hms, jdbc and hiveql.
`Config Param: HIVE_SYNC_MODE` | +| [hoodie.datasource.hive_sync.enable](#hoodiedatasourcehive_syncenable) | false | When set to true, register/sync the table to Apache Hive metastore.
`Config Param: HIVE_SYNC_ENABLED` | +| [hoodie.datasource.hive_sync.jdbcurl](#hoodiedatasourcehive_syncjdbcurl) | jdbc:hive2://localhost:10000 | Hive JDBC url (HiveServer2)
`Config Param: HIVE_URL` | +| [hoodie.datasource.hive_sync.metastore.uris](#hoodiedatasourcehive_syncmetastoreuris) | thrift://localhost:9083 | Hive metastore url
`Config Param: METASTORE_URIS` | +| [hoodie.datasource.meta.sync.enable](#hoodiedatasourcemetasyncenable) | false | Enable Syncing the Hudi Table with an external meta store or data catalog.
`Config Param: META_SYNC_ENABLED` | +--- + + +### Global Hive Sync Configs {#Global-Hive-Sync-Configs} +Global replication configurations used by the Hudi to sync metadata to Hive Metastore. + + + + +[**Basic Configs**](#Global-Hive-Sync-Configs-basic-configs) + + +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------- | ----------------------------- | ----------------------------------------------------------------------------------------------------------------- | +| [hoodie.datasource.hive_sync.mode](#hoodiedatasourcehive_syncmode) | (N/A) | Mode to choose for Hive ops. Valid values are hms, jdbc and hiveql.
`Config Param: HIVE_SYNC_MODE` | +| [hoodie.datasource.hive_sync.enable](#hoodiedatasourcehive_syncenable) | false | When set to true, register/sync the table to Apache Hive metastore.
`Config Param: HIVE_SYNC_ENABLED` | +| [hoodie.datasource.hive_sync.jdbcurl](#hoodiedatasourcehive_syncjdbcurl) | jdbc:hive2://localhost:10000 | Hive JDBC url (HiveServer2)
`Config Param: HIVE_URL` | +| [hoodie.datasource.hive_sync.metastore.uris](#hoodiedatasourcehive_syncmetastoreuris) | thrift://localhost:9083 | Hive metastore url
`Config Param: METASTORE_URIS` | +| [hoodie.datasource.meta.sync.enable](#hoodiedatasourcemetasyncenable) | false | Enable Syncing the Hudi Table with an external meta store or data catalog.
`Config Param: META_SYNC_ENABLED` | +--- + + +### DataHub Sync Configs {#DataHub-Sync-Configs} +Configurations used by the Hudi to sync metadata to DataHub. + + + + +[**Basic Configs**](#DataHub-Sync-Configs-basic-configs) + + +| Config Name | Default | Description | +| --------------------------------------------------------------------- | ------- | ----------------------------------------------------------------------------------------------------------------- | +| [hoodie.datasource.meta.sync.enable](#hoodiedatasourcemetasyncenable) | false | Enable Syncing the Hudi Table with an external meta store or data catalog.
`Config Param: META_SYNC_ENABLED` | +--- + +## Metrics Configs {#METRICS} +These set of configs are used to enable monitoring and reporting of key Hudi stats and metrics. + + +### Metrics Configurations {#Metrics-Configurations} +Enables reporting on Hudi metrics. Hudi publishes metrics on every commit, clean, rollback etc. The following sections list the supported reporters. + + + + +[**Basic Configs**](#Metrics-Configurations-basic-configs) + + +| Config Name | Default | Description | +| ----------------------------------------------------------------------------- | --------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.metrics.on](#hoodiemetricson) | false | Turn on/off metrics reporting. off by default.
`Config Param: TURN_METRICS_ON`
`Since Version: 0.5.0` | +| [hoodie.metrics.reporter.type](#hoodiemetricsreportertype) | GRAPHITE | Type of metrics reporter.
`Config Param: METRICS_REPORTER_TYPE_VALUE`
`Since Version: 0.5.0` | +| [hoodie.metricscompaction.log.blocks.on](#hoodiemetricscompactionlogblockson) | false | Turn on/off metrics reporting for log blocks with compaction commit. off by default.
`Config Param: TURN_METRICS_COMPACTION_LOG_BLOCKS_ON`
`Since Version: 0.14.0` | +--- + + +### Metrics Configurations for M3 {#Metrics-Configurations-for-M3} +Enables reporting on Hudi metrics using M3. Hudi publishes metrics on every commit, clean, rollback etc. + + + + +[**Basic Configs**](#Metrics-Configurations-for-M3-basic-configs) + + +| Config Name | Default | Description | +| ---------------------------------------------------- | ----------- | --------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.metrics.m3.env](#hoodiemetricsm3env) | production | M3 tag to label the environment (defaults to 'production'), applied to all metrics.
`Config Param: M3_ENV`
`Since Version: 0.15.0` | +| [hoodie.metrics.m3.host](#hoodiemetricsm3host) | localhost | M3 host to connect to.
`Config Param: M3_SERVER_HOST_NAME`
`Since Version: 0.15.0` | +| [hoodie.metrics.m3.port](#hoodiemetricsm3port) | 9052 | M3 port to connect to.
`Config Param: M3_SERVER_PORT_NUM`
`Since Version: 0.15.0` | +| [hoodie.metrics.m3.service](#hoodiemetricsm3service) | hoodie | M3 tag to label the service name (defaults to 'hoodie'), applied to all metrics.
`Config Param: M3_SERVICE`
`Since Version: 0.15.0` | +| [hoodie.metrics.m3.tags](#hoodiemetricsm3tags) | | Optional M3 tags applied to all metrics.
`Config Param: M3_TAGS`
`Since Version: 0.15.0` | +--- + +## Kafka Connect Configs {#KAFKA_CONNECT} +These set of configs are used for Kafka Connect Sink Connector for writing Hudi Tables + + +### Kafka Sink Connect Configurations {#Kafka-Sink-Connect-Configurations} +Configurations for Kafka Connect Sink Connector for Hudi. + + + + +[**Basic Configs**](#Kafka-Sink-Connect-Configurations-basic-configs) + + +| Config Name | Default | Description | +| -------------------------------------- | --------------- | ----------------------------------------------------------------------------------------- | +| [bootstrap.servers](#bootstrapservers) | localhost:9092 | The bootstrap servers for the Kafka Cluster.
`Config Param: KAFKA_BOOTSTRAP_SERVERS` | +--- + +## Hudi Streamer Configs {#HUDI_STREAMER} +These set of configs are used for Hudi Streamer utility which provides the way to ingest from different sources such as DFS or Kafka. + + +### Hudi Streamer Configs {#Hudi-Streamer-Configs} + + + + + +[**Basic Configs**](#Hudi-Streamer-Configs-basic-configs) + + +| Config Name | Default | Description | +| --------------------------------------------------------------------- | ------- | ----------------------------------------------------------------------------------------------------- | +| [hoodie.streamer.source.kafka.topic](#hoodiestreamersourcekafkatopic) | (N/A) | Kafka topic name. The config is specific to HoodieMultiTableStreamer
`Config Param: KAFKA_TOPIC` | +--- + + +### Hudi Streamer SQL Transformer Configs {#Hudi-Streamer-SQL-Transformer-Configs} +Configurations controlling the behavior of SQL transformer in Hudi Streamer. + + + + +[**Basic Configs**](#Hudi-Streamer-SQL-Transformer-Configs-basic-configs) + + +| Config Name | Default | Description | +| ------------------------------------------------------------------------- | ------- | -------------------------------------------------------------------------------------------- | +| [hoodie.streamer.transformer.sql](#hoodiestreamertransformersql) | (N/A) | SQL Query to be executed during write
`Config Param: TRANSFORMER_SQL` | +| [hoodie.streamer.transformer.sql.file](#hoodiestreamertransformersqlfile) | (N/A) | File with a SQL script to be executed during write
`Config Param: TRANSFORMER_SQL_FILE` | +--- + + +### Hudi Streamer Source Configs {#DELTA_STREAMER_SOURCE} +Configurations controlling the behavior of reading source data. + + +#### DFS Path Selector Configs {#DFS-Path-Selector-Configs} +Configurations controlling the behavior of path selector for DFS source in Hudi Streamer. + + + + +[**Basic Configs**](#DFS-Path-Selector-Configs-basic-configs) + + +| Config Name | Default | Description | +| --------------------------------------------------------------- | ------- | ------------------------------------------------------------------- | +| [hoodie.streamer.source.dfs.root](#hoodiestreamersourcedfsroot) | (N/A) | Root path of the source on DFS
`Config Param: ROOT_INPUT_PATH` | +--- + + +#### Hudi Incremental Source Configs {#Hudi-Incremental-Source-Configs} +Configurations controlling the behavior of incremental pulling from a Hudi table as a source in Hudi Streamer. + + + + +[**Basic Configs**](#Hudi-Incremental-Source-Configs-basic-configs) + + +| Config Name | Default | Description | +| ----------------------------------------------------------------------------- | ------- | ----------------------------------------------------------------------------- | +| [hoodie.streamer.source.hoodieincr.path](#hoodiestreamersourcehoodieincrpath) | (N/A) | Base-path for the source Hudi table
`Config Param: HOODIE_SRC_BASE_PATH` | +--- + + +#### Kafka Source Configs {#Kafka-Source-Configs} +Configurations controlling the behavior of Kafka source in Hudi Streamer. + + + + +[**Basic Configs**](#Kafka-Source-Configs-basic-configs) + + +| Config Name | Default | Description | +| -------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.streamer.source.kafka.topic](#hoodiestreamersourcekafkatopic) | (N/A) | Kafka topic name.
`Config Param: KAFKA_TOPIC_NAME` | +| [hoodie.streamer.source.kafka.proto.value.deserializer.class](#hoodiestreamersourcekafkaprotovaluedeserializerclass) | org.apache.kafka.common.serialization.ByteArrayDeserializer | Kafka Proto Payload Deserializer Class
`Config Param: KAFKA_PROTO_VALUE_DESERIALIZER_CLASS`
`Since Version: 0.15.0` | +--- + + +#### Pulsar Source Configs {#Pulsar-Source-Configs} +Configurations controlling the behavior of Pulsar source in Hudi Streamer. + + + + +[**Basic Configs**](#Pulsar-Source-Configs-basic-configs) + + +| Config Name | Default | Description | +| --------------------------------------------------------------------------------------------------- | ------------------------ | --------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.streamer.source.pulsar.topic](#hoodiestreamersourcepulsartopic) | (N/A) | Name of the target Pulsar topic to source data from
`Config Param: PULSAR_SOURCE_TOPIC_NAME` | +| [hoodie.streamer.source.pulsar.endpoint.admin.url](#hoodiestreamersourcepulsarendpointadminurl) | http://localhost:8080 | URL of the target Pulsar admin endpoint (of the form 'http://host:port')
`Config Param: PULSAR_SOURCE_ADMIN_ENDPOINT_URL` | +| [hoodie.streamer.source.pulsar.endpoint.service.url](#hoodiestreamersourcepulsarendpointserviceurl) | pulsar://localhost:6650 | URL of the target Pulsar service endpoint (of the form 'pulsar://host:port')
`Config Param: PULSAR_SOURCE_SERVICE_ENDPOINT_URL` | +--- + + +#### S3 Source Configs {#S3-Source-Configs} +Configurations controlling the behavior of S3 source in Hudi Streamer. + + + + +[**Basic Configs**](#S3-Source-Configs-basic-configs) + + +| Config Name | Default | Description | +| ---------------------------------------------------------------------- | ------- | -------------------------------------------------------------------------- | +| [hoodie.streamer.s3.source.queue.url](#hoodiestreamers3sourcequeueurl) | (N/A) | Queue url for cloud object events
`Config Param: S3_SOURCE_QUEUE_URL` | +--- + + +#### File-based SQL Source Configs {#File-based-SQL-Source-Configs} +Configurations controlling the behavior of File-based SQL Source in Hudi Streamer. + + + + +[**Basic Configs**](#File-based-SQL-Source-Configs-basic-configs) + + +| Config Name | Default | Description | +| --------------------------------------------------------------- | ------- | ----------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.streamer.source.sql.file](#hoodiestreamersourcesqlfile) | (N/A) | SQL file path containing the SQL query to read source data.
`Config Param: SOURCE_SQL_FILE`
`Since Version: 0.14.0` | +--- + + +#### SQL Source Configs {#SQL-Source-Configs} +Configurations controlling the behavior of SQL source in Hudi Streamer. + + + + +[**Basic Configs**](#SQL-Source-Configs-basic-configs) + + +| Config Name | Default | Description | +| ------------------------------------------------------------------------ | ------- | ------------------------------------------------------------------- | +| [hoodie.streamer.source.sql.sql.query](#hoodiestreamersourcesqlsqlquery) | (N/A) | SQL query for fetching source data.
`Config Param: SOURCE_SQL` | +--- + + +### Hudi Streamer Schema Provider Configs {#SCHEMA_PROVIDER} +Configurations that control the schema provider for Hudi Streamer. + + +#### Hudi Streamer Schema Provider Configs {#Hudi-Streamer-Schema-Provider-Configs} + + + + + +[**Basic Configs**](#Hudi-Streamer-Schema-Provider-Configs-basic-configs) + + +| Config Name | Default | Description | +| --------------------------------------------------------------------------------------------------- | ------- | ----------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.streamer.schemaprovider.registry.targetUrl](#hoodiestreamerschemaproviderregistrytargetUrl) | (N/A) | The schema of the target you are writing to e.g. https://foo:bar@schemaregistry.org
`Config Param: TARGET_SCHEMA_REGISTRY_URL` | +| [hoodie.streamer.schemaprovider.registry.url](#hoodiestreamerschemaproviderregistryurl) | (N/A) | The schema of the source you are reading from e.g. https://foo:bar@schemaregistry.org
`Config Param: SRC_SCHEMA_REGISTRY_URL` | +--- + + +#### File-based Schema Provider Configs {#File-based-Schema-Provider-Configs} +Configurations for file-based schema provider. + + + + +[**Basic Configs**](#File-based-Schema-Provider-Configs-basic-configs) + + +| Config Name | Default | Description | +| -------------------------------------------------------------------------------------------------- | ------- | ------------------------------------------------------------------------------------- | +| [hoodie.streamer.schemaprovider.source.schema.file](#hoodiestreamerschemaprovidersourceschemafile) | (N/A) | The schema of the source you are reading from
`Config Param: SOURCE_SCHEMA_FILE` | +| [hoodie.streamer.schemaprovider.target.schema.file](#hoodiestreamerschemaprovidertargetschemafile) | (N/A) | The schema of the target you are writing to
`Config Param: TARGET_SCHEMA_FILE` | +--- + diff --git a/website/versioned_docs/version-1.0.0/bos_hoodie.md b/website/versioned_docs/version-1.0.0/bos_hoodie.md new file mode 100644 index 0000000000000..2a6cde81c8684 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/bos_hoodie.md @@ -0,0 +1,57 @@ +--- +title: Baidu Cloud +keywords: [ hudi, hive, baidu, bos, spark, presto] +summary: In this page, we go over how to configure Hudi with bos filesystem. +last_modified_at: 2021-06-09T11:38:24-10:00 +--- +In this page, we explain how to get your Hudi job to store into Baidu BOS. + +## Baidu BOS configs + +There are two configurations required for Hudi-BOS compatibility: + +- Adding Baidu BOS Credentials for Hudi +- Adding required Jars to classpath + +### Baidu BOS Credentials + +Add the required configs in your core-site.xml from where Hudi can fetch them. Replace the `fs.defaultFS` with your BOS bucket name, replace `fs.bos.endpoint` with your bos endpoint, replace `fs.bos.access.key` with your bos key, replace `fs.bos.secret.access.key` with your bos secret key. Hudi should be able to read/write from the bucket. + +```xml +<property> + <name>fs.defaultFS</name> + <value>bos://bucketname/</value> +</property> + +<property> + <name>fs.bos.endpoint</name> + <value>bos-endpoint-address</value> + <description>Baidu bos endpoint to connect to, for example: http://bj.bcebos.com</description> +</property> + +<property> + <name>fs.bos.access.key</name> + <value>bos-key</value> + <description>Baidu access key</description> +</property> + +<property> + <name>fs.bos.secret.access.key</name> + <value>bos-secret-key</value> + <description>Baidu secret key.</description> +</property> + +<property> + <name>fs.bos.impl</name> + <value>org.apache.hadoop.fs.bos.BaiduBosFileSystem</value> +</property> +``` + +### Baidu bos Libs + +Baidu Hadoop library jars to add to the classpath: + +- com.baidubce:bce-java-sdk:0.10.165 +- bos-hdfs-sdk-1.0.2-community.jar + +You can download the bos-hdfs-sdk jar from [here](https://sdk.bce.baidu.com/console-sdk/bos-hdfs-sdk-1.0.2-community.jar.zip), and then unzip it. 
\ No newline at end of file diff --git a/website/versioned_docs/version-1.0.0/cleaning.md b/website/versioned_docs/version-1.0.0/cleaning.md new file mode 100644 index 0000000000000..5f6ea4b369728 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/cleaning.md @@ -0,0 +1,155 @@ +--- +title: Cleaning +toc: true +toc_min_heading_level: 2 +toc_max_heading_level: 4 +--- +## Background +Cleaning is a table service employed by Hudi to reclaim space occupied by older versions of data and keep storage costs +in check. Apache Hudi provides snapshot isolation between writers and readers by managing multiple versioned files with **MVCC** +concurrency. These file versions provide history and enable time travel and rollbacks, but it is important to manage +how much history you keep to balance your costs. Cleaning service plays a crucial role in managing the tradeoff between +retaining a long history of data and the associated storage costs. + +Hudi enables [Automatic Hudi cleaning](/docs/configurations/#hoodiecleanautomatic) by default. Cleaning is invoked +immediately after each commit, to delete older file slices. It's recommended to leave this enabled to ensure metadata +and data storage growth is bounded. Cleaner can also be scheduled after every few commits instead of after every commit by +configuring [hoodie.clean.max.commits](https://hudi.apache.org/docs/configurations#hoodiecleanmaxcommits). + +### Cleaning Retention Policies +When cleaning old files, you should be careful not to remove files that are being actively used by long running queries. + +For spark based: + +| Config Name | Default | Description | +|----------------------------------------------------|--------------------------------|-----------------------------------------------------------------------------------------------------------------------------| +| hoodie.cleaner.policy | KEEP_LATEST_COMMITS (Optional) | org.apache.hudi.common.model.HoodieCleaningPolicy: Cleaning policy to be used.

`Config Param: CLEANER_POLICY` | + +The corresponding config for Flink based engine is [`clean.policy`](https://hudi.apache.org/docs/configurations/#cleanpolicy). + +Hudi cleaner currently supports the below cleaning policies to keep a certain number of commits or file versions: + +- **KEEP_LATEST_COMMITS**: This is the default policy. This is a temporal cleaning policy that ensures the effect of + having lookback into all the changes that happened in the last X commits. Suppose a writer is ingesting data + into a Hudi dataset every 30 minutes and the longest running query can take 5 hours to finish, then the user should + retain at least the last 10 commits. With such a configuration, we ensure that the oldest version of a file is kept on + disk for at least 5 hours, thereby preventing the longest running query from failing at any point in time. Incremental + cleaning is also possible using this policy. + Number of commits to retain can be configured by [`hoodie.cleaner.commits.retained`](https://hudi.apache.org/docs/configurations/#hoodiecleanercommitsretained). + The corresponding Flink related config is [`clean.retain_commits`](https://hudi.apache.org/docs/configurations/#cleanretain_commits). + +- **KEEP_LATEST_FILE_VERSIONS**: This policy has the effect of keeping N number of file versions irrespective of time. + This policy is useful when the maximum number of versions of a file that one wants to keep at any given time is known. + To achieve the same behaviour as before of preventing long running queries from failing, one should do their calculations + based on data patterns. Alternatively, this policy is also useful if a user just wants to maintain 1 latest version of the file. + Number of file versions to retain can be configured by [`hoodie.cleaner.fileversions.retained`](https://hudi.apache.org/docs/configurations/#hoodiecleanerfileversionsretained). 
+ The corresponding Flink related config is [`clean.retain_file_versions`](https://hudi.apache.org/docs/configurations/#cleanretain_file_versions). + +- **KEEP_LATEST_BY_HOURS**: This policy clean up based on hours.It is simple and useful when knowing that you want to + keep files at any given time. Corresponding to commits with commit times older than the configured number of hours to + be retained are cleaned. Currently you can configure by parameter [`hoodie.cleaner.hours.retained`](https://hudi.apache.org/docs/configurations/#hoodiecleanerhoursretained). + The corresponding Flink related config is [`clean.retain_hours`](https://hudi.apache.org/docs/configurations/#cleanretain_hours). + +### Configs +For details about all possible configurations and their default values see the [configuration docs](https://hudi.apache.org/docs/next/configurations/#Clean-Configs). +For Flink related configs refer [here](https://hudi.apache.org/docs/next/configurations/#FLINK_SQL). + +### Ways to trigger Cleaning + +#### Inline + +By default, in Spark based writing, cleaning is run inline after every commit using the default policy of `KEEP_LATEST_COMMITS`. It's recommended +to keep this enabled, to ensure metadata and data storage growth is bounded. To enable this, users do not have to set any configs. Following are the relevant basic configs. + +| Config Name | Default | Description | +|----------------------------------| -----------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| hoodie.clean.automatic | true (Optional) | When enabled, the cleaner table service is invoked immediately after each commit, to delete older file slices. It's recommended to enable this, to ensure metadata and data storage growth is bounded.

`Config Param: AUTO_CLEAN` | +| hoodie.cleaner.commits.retained | 10 (Optional) | Number of commits to retain, without cleaning. This will be retained for num_of_commits * time_between_commits (scheduled). This also directly translates into how much data retention the table supports for incremental queries.

`Config Param: CLEANER_COMMITS_RETAINED` | + + +#### Async +In case you wish to run the cleaner service asynchronously along with writing, please enable the [`hoodie.clean.async`](https://hudi.apache.org/docs/configurations#hoodiecleanasync) as shown below: +```java +hoodie.clean.automatic=true +hoodie.clean.async=true +``` + +For Flink based writing, this is the default mode of cleaning. Please refer to [`clean.async.enabled`](https://hudi.apache.org/docs/configurations/#cleanasyncenabled) for details. + +#### Run independently +Hoodie Cleaner can also be run as a separate process. Following is the command for running the cleaner independently: +``` +spark-submit --master local \ + --packages org.apache.hudi:hudi-utilities-slim-bundle_2.12:1.0.0,org.apache.hudi:hudi-spark3.5-bundle_2.12:1.0.0 \ + --class org.apache.hudi.utilities.HoodieCleaner `ls packaging/hudi-utilities-slim-bundle/target/hudi-utilities-slim-bundle-*.jar` --help + Usage:
<main class> [options] + Options: + --help, -h + + --hoodie-conf + Any configuration that can be set in the properties file (using the CLI + parameter "--props") can also be passed command line using this + parameter. This can be repeated + Default: [] + --props + path to properties file on localfs or dfs, with configurations for + hoodie client for cleaning + --spark-master + spark master to use. + Default: local[2] + * --target-base-path + base path for the hoodie table to be cleaner. +``` +Some examples to run the cleaner. +Keep the latest 10 commits +``` +spark-submit --master local \ + --packages org.apache.hudi:hudi-utilities-slim-bundle_2.12:1.0.0,org.apache.hudi:hudi-spark3.5-bundle_2.12:1.0.0 \ + --class org.apache.hudi.utilities.HoodieCleaner `ls packaging/hudi-utilities-slim-bundle/target/hudi-utilities-slim-bundle-*.jar` \ + --target-base-path /path/to/hoodie_table \ + --hoodie-conf hoodie.cleaner.policy=KEEP_LATEST_COMMITS \ + --hoodie-conf hoodie.cleaner.commits.retained=10 \ + --hoodie-conf hoodie.cleaner.parallelism=200 +``` +Keep the latest 3 file versions +``` +spark-submit --master local \ + --packages org.apache.hudi:hudi-utilities-slim-bundle_2.12:1.0.0,org.apache.hudi:hudi-spark3.5-bundle_2.12:1.0.0 \ + --class org.apache.hudi.utilities.HoodieCleaner `ls packaging/hudi-utilities-slim-bundle/target/hudi-utilities-slim-bundle-*.jar` \ + --target-base-path /path/to/hoodie_table --hoodie-conf hoodie.cleaner.policy=KEEP_LATEST_FILE_VERSIONS \ + --hoodie-conf hoodie.cleaner.fileversions.retained=3 \ + --hoodie-conf hoodie.cleaner.parallelism=200 +``` +Clean commits older than 24 hours +``` +spark-submit --master local \ + --packages org.apache.hudi:hudi-utilities-slim-bundle_2.12:1.0.0,org.apache.hudi:hudi-spark3.5-bundle_2.12:1.0.0 \ + --class org.apache.hudi.utilities.HoodieCleaner `ls packaging/hudi-utilities-slim-bundle/target/hudi-utilities-slim-bundle-*.jar` \ + --target-base-path /path/to/hoodie_table \ + --hoodie-conf hoodie.cleaner.policy=KEEP_LATEST_BY_HOURS \ + --hoodie-conf 
hoodie.cleaner.hours.retained=24 \ + --hoodie-conf hoodie.cleaner.parallelism=200 +``` +Note: The parallelism takes the min value of number of partitions to clean and `hoodie.cleaner.parallelism`. + +#### CLI +You can also use [Hudi CLI](/docs/cli) to run Hoodie Cleaner. + +CLI provides the below commands for cleaner service: +- `cleans show` +- `clean showpartitions` +- `cleans run` + +Example of cleaner keeping the latest 3 commits +``` +cleans run --sparkMaster local --hoodieConfigs hoodie.cleaner.policy=KEEP_LATEST_COMMITS hoodie.cleaner.commits.retained=3 hoodie.cleaner.parallelism=200 +``` + +You can find more details and the relevant code for these commands in [`org.apache.hudi.cli.commands.CleansCommand`](https://github.com/apache/hudi/blob/master/hudi-cli/src/main/java/org/apache/hudi/cli/commands/CleansCommand.java) class. + +## Related Resources +

<h3>Videos</h3>

+ +* [Cleaner Service: Save up to 40% on data lake storage costs | Hudi Labs](https://youtu.be/mUvRhJDoO3w) +* [Efficient Data Lake Management with Apache Hudi Cleaner: Benefits of Scheduling Data Cleaning #1](https://www.youtube.com/watch?v=CEzgFtmVjx4) +* [Efficient Data Lake Management with Apache Hudi Cleaner: Benefits of Scheduling Data Cleaning #2](https://www.youtube.com/watch?v=RbBF9Ys2GqM) \ No newline at end of file diff --git a/website/versioned_docs/version-1.0.0/cli.md b/website/versioned_docs/version-1.0.0/cli.md new file mode 100644 index 0000000000000..def32b11a8e3a --- /dev/null +++ b/website/versioned_docs/version-1.0.0/cli.md @@ -0,0 +1,755 @@ +--- +title: CLI +keywords: [hudi, cli] +last_modified_at: 2021-08-18T15:59:57-04:00 +--- + +### Local set up +Once hudi has been built, the shell can be fired up via `cd hudi-cli && ./hudi-cli.sh`. + +### Hudi CLI Bundle setup +In release `0.13.0` we have now added another way of launching the `hudi cli`, which is using the `hudi-cli-bundle`. + +There are a couple of requirements when using this approach such as having `spark` installed locally on your machine. +It is required to use a spark distribution with hadoop dependencies packaged such as `spark-3.3.1-bin-hadoop2.tgz` from https://archive.apache.org/dist/spark/. +We also recommend you set an env variable `$SPARK_HOME` to the path of where spark is installed on your machine. +One important thing to note is that the `hudi-spark-bundle` should also be present when using the `hudi-cli-bundle`. +To provide the locations of these bundle jars you can set them in your shell like so: +`export CLI_BUNDLE_JAR=<path-to-cli-bundle-jar-to-use>` , `export SPARK_BUNDLE_JAR=<path-to-spark-bundle-jar-to-use>`. + +Follow the steps below if you are not compiling the project and are downloading the jars instead: + +1. Create an empty folder as a new directory +2. Copy the hudi-cli-bundle jars and hudi-spark*-bundle jars to this directory +3. 
Copy the following script and folder to this directory +``` +packaging/hudi-cli-bundle/hudi-cli-with-bundle.sh +packaging/hudi-cli-bundle/conf . the `conf` folder should be in this directory. +``` + +4. Start Hudi CLI shell with environment variables set +``` +export SPARK_HOME= +export CLI_BUNDLE_JAR= +export SPARK_BUNDLE_JAR= + +./hudi-cli-with-bundle.sh + +``` + +### Base path +A hudi table resides on DFS, in a location referred to as the `basePath` and +we would need this location in order to connect to a Hudi table. Hudi library effectively manages this table internally, using `.hoodie` subfolder to track all metadata. + + + + +### Using Hudi-cli in S3 +If you are using hudi that comes packaged with AWS EMR, you can find instructions to use hudi-cli [here](https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-hudi-cli.html). +If you are not using EMR, or would like to use latest hudi-cli from master, you can follow the below steps to access S3 dataset in your local environment (laptop). + +Build Hudi with corresponding Spark version, for eg, -Dspark3.1.x + +Set the following environment variables. +``` +export AWS_REGION=us-east-2 +export AWS_ACCESS_KEY_ID= +export AWS_SECRET_ACCESS_KEY= +export SPARK_HOME= +``` +Ensure you set the SPARK_HOME to your local spark home compatible to compiled hudi spark version above. + +Apart from these, we might need to add aws jars to class path so that accessing S3 is feasible from local. +We need two jars, namely, aws-java-sdk-bundle jar and hadoop-aws jar which you can find online. 
+For eg: +``` +wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.2.0/hadoop-aws-3.2.0.jar -o /lib/spark-3.2.0-bin-hadoop3.2/jars/hadoop-aws-3.2.0.jar +wget https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.11.375/aws-java-sdk-bundle-1.11.375.jar -o /lib/spark-3.2.0-bin-hadoop3.2/jars/aws-java-sdk-bundle-1.11.375.jar +``` + +#### Note: These AWS jar versions below are specific to Spark 3.2.0 +``` +export CLIENT_JAR=/lib/spark-3.2.0-bin-hadoop3.2/jars/aws-java-sdk-bundle-1.12.48.jar:/lib/spark-3.2.0-bin-hadoop3.2/jars/hadoop-aws-3.3.1.jar +``` +Once these are set, you are good to launch hudi-cli and access S3 dataset. +``` +./hudi-cli/hudi-cli.sh +``` +### Using hudi-cli on Google Dataproc +[Dataproc](https://cloud.google.com/dataproc) is Google's managed service for running Apache Hadoop, Apache Spark, +Apache Flink, Presto and many other frameworks, including Hudi. If you want to run the Hudi CLI on a Dataproc node +which has not been launched with Hudi support enabled, you can use the steps below: + +These steps use Hudi version 0.13.0. If you want to use a different version you will have to edit the below commands +appropriately: +1. Once you've started the Dataproc cluster, you can ssh into it as follows: +``` +$ gcloud compute ssh --zone "YOUR_ZONE" "HOSTNAME_OF_MASTER_NODE" --project "YOUR_PROJECT" +``` + +2. Download the Hudi CLI bundle +``` +wget https://repo1.maven.org/maven2/org/apache/hudi/hudi-cli-bundle_2.12/0.13.0/hudi-cli-bundle_2.12-0.13.0.jar +``` + +3. Download the Hudi Spark bundle +``` +wget https://repo1.maven.org/maven2/org/apache/hudi/hudi-spark-bundle_2.12/0.13.0/hudi-spark-bundle_2.12-0.13.0.jar +``` + +4. Download the shell script that launches Hudi CLI bundle +``` +wget https://raw.githubusercontent.com/apache/hudi/release-0.13.0/packaging/hudi-cli-bundle/hudi-cli-with-bundle.sh +``` + +5. 
Launch Hudi CLI bundle with appropriate environment variables as follows: +``` +CLIENT_JAR=$DATAPROC_DIR/lib/gcs-connector.jar CLI_BUNDLE_JAR=hudi-cli-bundle_2.12-0.13.0.jar SPARK_BUNDLE_JAR=hudi-spark-bundle_2.12-0.13.0.jar ./hudi-cli-with-bundle.sh +``` + +6. hudi->connect --path gs://path_to_some_table +Metadata for table some_table loaded + +7. hudi:some_table->commits show --limit 5 +This command should show the recent commits, if the above steps work correctly. + +## Connect to a Kerberized cluster + +Before connecting to a Kerberized cluster, you can use **kerberos kinit** command. Following is the usage of this command. + +```shell +hudi->help kerberos kinit +NAME + kerberos kinit - Perform Kerberos authentication + +SYNOPSIS + kerberos kinit --krb5conf String [--principal String] [--keytab String] + +OPTIONS + --krb5conf String + Path to krb5.conf + [Optional, default = /etc/krb5.conf] + + --principal String + Kerberos principal + [Mandatory] + + --keytab String + Path to keytab + [Mandatory] +``` + +For example: + +```shell +hudi->kerberos kinit --principal user/host@DOMAIN --keytab /etc/security/keytabs/user.keytab +Perform Kerberos authentication +Parameters: +--krb5conf: /etc/krb5.conf +--principal: user/host@DOMAIN +--keytab: /etc/security/keytabs/user.keytab +Kerberos current user: user/host@DOMAIN (auth:KERBEROS) +Kerberos login user: user/host@DOMAIN (auth:KERBEROS) +Kerberos authentication success +``` + +If you see "Kerberos authentication success" in the command output, it means Kerberos authentication has been successful. + + +## Using hudi-cli + +To initialize a hudi table, use the following command. 
+ +```java +=================================================================== +* ___ ___ * +* /\__\ ___ /\ \ ___ * +* / / / /\__\ / \ \ /\ \ * +* / /__/ / / / / /\ \ \ \ \ \ * +* / \ \ ___ / / / / / \ \__\ / \__\ * +* / /\ \ /\__\ / /__/ ___ / /__/ \ |__| / /\/__/ * +* \/ \ \/ / / \ \ \ /\__\ \ \ \ / / / /\/ / / * +* \ / / \ \ / / / \ \ / / / \ /__/ * +* / / / \ \/ / / \ \/ / / \ \__\ * +* / / / \ / / \ / / \/__/ * +* \/__/ \/__/ \/__/ Apache Hudi CLI * +* * +=================================================================== + +hudi->create --path /user/hive/warehouse/table1 --tableName hoodie_table_1 --tableType COPY_ON_WRITE +..... +``` + +To see the description of hudi table, use the command: + +```java +hudi:hoodie_table_1->desc +18/09/06 15:57:19 INFO timeline.HoodieActiveTimeline: Loaded instants [] + _________________________________________________________ + | Property | Value | + |========================================================| + | basePath | ... | + | metaPath | ... | + | fileSystem | hdfs | + | hoodie.table.name | hoodie_table_1 | + | hoodie.table.type | COPY_ON_WRITE | + | hoodie.archivelog.folder| | +``` + +Following is a sample command to connect to a Hudi table contains uber trips. 
+ +```java +hudi:trips->connect --path /app/uber/trips + +16/10/05 23:20:37 INFO model.HoodieTableMetadata: All commits :HoodieCommits{commitList=[20161002045850, 20161002052915, 20161002055918, 20161002065317, 20161002075932, 20161002082904, 20161002085949, 20161002092936, 20161002105903, 20161002112938, 20161002123005, 20161002133002, 20161002155940, 20161002165924, 20161002172907, 20161002175905, 20161002190016, 20161002192954, 20161002195925, 20161002205935, 20161002215928, 20161002222938, 20161002225915, 20161002232906, 20161003003028, 20161003005958, 20161003012936, 20161003022924, 20161003025859, 20161003032854, 20161003042930, 20161003052911, 20161003055907, 20161003062946, 20161003065927, 20161003075924, 20161003082926, 20161003085925, 20161003092909, 20161003100010, 20161003102913, 20161003105850, 20161003112910, 20161003115851, 20161003122929, 20161003132931, 20161003142952, 20161003145856, 20161003152953, 20161003155912, 20161003162922, 20161003165852, 20161003172923, 20161003175923, 20161003195931, 20161003210118, 20161003212919, 20161003215928, 20161003223000, 20161003225858, 20161004003042, 20161004011345, 20161004015235, 20161004022234, 20161004063001, 20161004072402, 20161004074436, 20161004080224, 20161004082928, 20161004085857, 20161004105922, 20161004122927, 20161004142929, 20161004163026, 20161004175925, 20161004194411, 20161004203202, 20161004211210, 20161004214115, 20161004220437, 20161004223020, 20161004225321, 20161004231431, 20161004233643, 20161005010227, 20161005015927, 20161005022911, 20161005032958, 20161005035939, 20161005052904, 20161005070028, 20161005074429, 20161005081318, 20161005083455, 20161005085921, 20161005092901, 20161005095936, 20161005120158, 20161005123418, 20161005125911, 20161005133107, 20161005155908, 20161005163517, 20161005165855, 20161005180127, 20161005184226, 20161005191051, 20161005193234, 20161005203112, 20161005205920, 20161005212949, 20161005223034, 20161005225920]} +Metadata for table trips loaded +``` + 
+Once connected to the table, a lot of other commands become available. The shell has contextual autocomplete help (press TAB) and below is a list of all commands, few of which are reviewed in this section + +```shell +hudi:trips->help +* ! - Allows execution of operating system (OS) commands +* // - Inline comment markers (start of line only) +* ; - Inline comment markers (start of line only) +* bootstrap index showmapping - Show bootstrap index mapping +* bootstrap index showpartitions - Show bootstrap indexed partitions +* bootstrap run - Run a bootstrap action for current Hudi table +* clean showpartitions - Show partition level details of a clean +* cleans refresh - Refresh table metadata +* cleans run - run clean +* cleans show - Show the cleans +* clear - Clears the console +* cls - Clears the console +* clustering run - Run Clustering +* clustering schedule - Schedule Clustering +* clustering scheduleAndExecute - Run Clustering. Make a cluster plan first and execute that plan immediately +* commit rollback - Rollback a commit +* commits compare - Compare commits with another Hoodie table +* commit show_write_stats - Show write stats of a commit +* commit showfiles - Show file level details of a commit +* commit showpartitions - Show partition level details of a commit +* commits refresh - Refresh table metadata +* commits show - Show the commits +* commits showarchived - Show the archived commits +* commits sync - Sync commits with another Hoodie table +* compaction repair - Renames the files to make them consistent with the timeline as dictated by Hoodie metadata. Use when compaction unschedule fails partially. 
+* compaction run - Run Compaction for given instant time +* compaction schedule - Schedule Compaction +* compaction scheduleAndExecute - Schedule compaction plan and execute this plan +* compaction show - Shows compaction details for a specific compaction instant +* compaction showarchived - Shows compaction details for a specific compaction instant +* compactions show all - Shows all compactions that are in active timeline +* compactions showarchived - Shows compaction details for specified time window +* compaction unschedule - Unschedule Compaction +* compaction unscheduleFileId - UnSchedule Compaction for a fileId +* compaction validate - Validate Compaction +* connect - Connect to a hoodie table +* create - Create a hoodie table if not present +* date - Displays the local date and time +* desc - Describe Hoodie Table properties +* downgrade table - Downgrades a table +* exit - Exits the shell +* export instants - Export Instants and their metadata from the Timeline +* fetch table schema - Fetches latest table schema +* hdfsparquetimport - Imports Parquet table to a hoodie table +* help - List all commands usage +* marker delete - Delete the marker +* metadata create - Create the Metadata Table if it does not exist +* metadata delete - Remove the Metadata Table +* metadata init - Update the metadata table from commits since the creation +* metadata list-files - Print a list of all files in a partition from the metadata +* metadata list-partitions - List all partitions from metadata +* metadata refresh - Refresh table metadata +* metadata set - Set options for Metadata Table +* metadata stats - Print stats about the metadata +* metadata validate-files - Validate all files in all partitions from the metadata +* quit - Exits the shell +* refresh - Refresh table metadata +* repair addpartitionmeta - Add partition metadata to a table, if not present +* repair corrupted clean files - repair corrupted clean files +* repair deduplicate - De-duplicate a partition path 
contains duplicates & produce repaired files to replace with +* repair migrate-partition-meta - Migrate all partition meta file currently stored in text format to be stored in base file format. See HoodieTableConfig#PARTITION_METAFILE_USE_DATA_FORMAT. +* repair overwrite-hoodie-props - Overwrite hoodie.properties with provided file. Risky operation. Proceed with caution! +* savepoint create - Savepoint a commit +* savepoint delete - Delete the savepoint +* savepoint rollback - Savepoint a commit +* savepoints refresh - Refresh table metadata +* savepoints show - Show the savepoints +* script - Parses the specified resource file and executes its commands +* set - Set spark launcher env to cli +* show archived commits - Read commits from archived files and show details +* show archived commit stats - Read commits from archived files and show details +* show env - Show spark launcher env by key +* show envs all - Show spark launcher envs +* show fsview all - Show entire file-system view +* show fsview latest - Show latest file-system view +* show logfile metadata - Read commit metadata from log files +* show logfile records - Read records from log files +* show rollback - Show details of a rollback instant +* show rollbacks - List all rollback instants +* stats filesizes - File Sizes. Display summary stats on sizes of files +* stats wa - Write Amplification. Ratio of how many records were upserted to how many records were actually written +* sync validate - Validate the sync by counting the number of records +* system properties - Shows the shell's properties +* table delete-configs - Delete the supplied table configs from the table. +* table recover-configs - Recover table configs, from update/delete that failed midway. +* table update-configs - Update the table configs with configs with provided file. 
+* temp_delete - Delete view name +* temp_query - query against created temp view +* temp delete - Delete view name +* temp query - query against created temp view +* temps_show - Show all views name +* temps show - Show all views name +* upgrade table - Upgrades a table +* utils loadClass - Load a class +* version - Displays shell version + +hudi:trips-> +``` + + +### Inspecting Commits + +The task of upserting or inserting a batch of incoming records is known as a **commit** in Hudi. A commit provides basic atomicity guarantees such that only committed data is available for querying. +Each commit has a monotonically increasing string/number called the **commit number**. Typically, this is the time at which we started the commit. + +To view some basic information about the last 10 commits, + + +```java +hudi:trips->commits show --sortBy "Total Bytes Written" --desc true --limit 10 + ________________________________________________________________________________________________________________________________________________________________________ + | CommitTime | Total Bytes Written| Total Files Added| Total Files Updated| Total Partitions Written| Total Records Written| Total Update Records Written| Total Errors| + |=======================================================================================================================================================================| + .... + .... + .... +``` + +At the start of each write, Hudi also writes a .inflight commit to the .hoodie folder. 
You can use the timestamp there to estimate how long the commit has been inflight + + +```java +$ hdfs dfs -ls /app/uber/trips/.hoodie/*.inflight +-rw-r--r-- 3 vinoth supergroup 321984 2016-10-05 23:18 /app/uber/trips/.hoodie/20161005225920.inflight +``` + + +### Drilling Down to a specific Commit + +To understand how the writes spread across specific partiions, + + +```java +hudi:trips->commit showpartitions --commit 20161005165855 --sortBy "Total Bytes Written" --desc true --limit 10 + __________________________________________________________________________________________________________________________________________ + | Partition Path| Total Files Added| Total Files Updated| Total Records Inserted| Total Records Updated| Total Bytes Written| Total Errors| + |=========================================================================================================================================| + .... + .... +``` + +If you need file level granularity , we can do the following + + +```java +hudi:trips->commit showfiles --commit 20161005165855 --sortBy "Partition Path" + ________________________________________________________________________________________________________________________________________________________ + | Partition Path| File ID | Previous Commit| Total Records Updated| Total Records Written| Total Bytes Written| Total Errors| + |=======================================================================================================================================================| + .... + .... +``` + + +### FileSystem View + +Hudi views each partition as a collection of file-groups with each file-group containing a list of file-slices in commit order (See concepts). +The below commands allow users to view the file-slices for a data-set. + +```java +hudi:stock_ticks_mor->show fsview all + .... 
+ _______________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________ + | Partition | FileId | Base-Instant | Data-File | Data-File Size| Num Delta Files| Total Delta File Size| Delta Files | + |==============================================================================================================================================================================================================================================================================================================================================================================================================| + | 2018/08/31| 111415c3-f26d-4639-86c8-f9956f245ac3| 20181002180759| hdfs://namenode:8020/user/hive/warehouse/stock_ticks_mor/2018/08/31/111415c3-f26d-4639-86c8-f9956f245ac3_0_20181002180759.parquet| 432.5 KB | 1 | 20.8 KB | [HoodieLogFile {hdfs://namenode:8020/user/hive/warehouse/stock_ticks_mor/2018/08/31/.111415c3-f26d-4639-86c8-f9956f245ac3_20181002180759.log.1}]| + + + +hudi:stock_ticks_mor->show fsview latest --partitionPath| Partition | FileId | Base-Instant | Data-File | Data-File Size| Num Delta Files| Total Delta Size| Delta Size - compaction scheduled| Delta Size - compaction unscheduled| Delta To Base Ratio - compaction scheduled| Delta To Base Ratio - compaction unscheduled| Delta Files - compaction scheduled | Delta Files - compaction unscheduled| + || + | 2018/08/31| 111415c3-f26d-4639-86c8-f9956f245ac3| 20181002180759| hdfs://namenode:8020/user/hive/warehouse/stock_ticks_mor/2018/08/31/111415c3-f26d-4639-86c8-f9956f245ac3_0_20181002180759.parquet| 432.5 KB | 1 | 20.8 KB | 20.8 KB | 0.0 B | 0.0 B | 0.0 B | [HoodieLogFile 
{hdfs://namenode:8020/user/hive/warehouse/stock_ticks_mor/2018/08/31/.111415c3-f26d-4639-86c8-f9956f245ac3_20181002180759.log.1}]| [] | + +``` + + +### Statistics + +Since Hudi directly manages file sizes for DFS table, it might be good to get an overall picture + + +```java +hudi:trips->stats filesizes --partitionPath 2016/09/01 --sortBy "95th" --desc true --limit 10 + ________________________________________________________________________________________________ + | CommitTime | Min | 10th | 50th | avg | 95th | Max | NumFiles| StdDev | + |===============================================================================================| + | | 93.9 MB | 93.9 MB | 93.9 MB | 93.9 MB | 93.9 MB | 93.9 MB | 2 | 2.3 KB | + .... + .... +``` + +In case of Hudi write taking much longer, it might be good to see the write amplification for any sudden increases + + +```java +hudi:trips->stats wa + __________________________________________________________________________ + | CommitTime | Total Upserted| Total Written| Write Amplifiation Factor| + |=========================================================================| + .... + .... +``` + + +### Archived Commits + +In order to limit the amount of growth of .commit files on DFS, Hudi archives older .commit files (with due respect to the cleaner policy) into a commits.archived file. +This is a sequence file that contains a mapping from commitNumber => json with raw information about the commit (same that is nicely rolled up above). + + +### Compactions + +To get an idea of the lag between compaction and writer applications, use the below command to list down all +pending compactions. 
+ +```java +hudi:trips->compactions show all + ___________________________________________________________________ + | Compaction Instant Time| State | Total FileIds to be Compacted| + |==================================================================| + | | REQUESTED| 35 | + | | INFLIGHT | 27 | +``` + +To inspect a specific compaction plan, use + +```java +hudi:trips->compaction show --instant + _________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________ + | Partition Path| File Id | Base Instant | Data File Path | Total Delta Files| getMetrics | + |================================================================================================================================================================================================================================================ + | 2018/07/17 | | | viewfs://ns-default/.../../UUID_.parquet | 1 | {TOTAL_LOG_FILES=1.0, TOTAL_IO_READ_MB=1230.0, TOTAL_LOG_FILES_SIZE=2.51255751E8, TOTAL_IO_WRITE_MB=991.0, TOTAL_IO_MB=2221.0}| + +``` + +To manually schedule or run a compaction, use the below command. This command uses spark launcher to perform compaction +operations. 
+ +**NOTE:** Make sure no other application is scheduling compaction for this table concurrently +\{: .notice--info} + +```java +hudi:trips->help compaction schedule +Keyword: compaction schedule +Description: Schedule Compaction + Keyword: sparkMemory + Help: Spark executor memory + Mandatory: false + Default if specified: '__NULL__' + Default if unspecified: '1G' + +* compaction schedule - Schedule Compaction +``` + +```java +hudi:trips->help compaction run +Keyword: compaction run +Description: Run Compaction for given instant time + Keyword: tableName + Help: Table name + Mandatory: true + Default if specified: '__NULL__' + Default if unspecified: '__NULL__' + + Keyword: parallelism + Help: Parallelism for hoodie compaction + Mandatory: true + Default if specified: '__NULL__' + Default if unspecified: '__NULL__' + + Keyword: schemaFilePath + Help: Path for Avro schema file + Mandatory: true + Default if specified: '__NULL__' + Default if unspecified: '__NULL__' + + Keyword: sparkMemory + Help: Spark executor memory + Mandatory: true + Default if specified: '__NULL__' + Default if unspecified: '__NULL__' + + Keyword: retry + Help: Number of retries + Mandatory: true + Default if specified: '__NULL__' + Default if unspecified: '__NULL__' + + Keyword: compactionInstant + Help: Base path for the target hoodie table + Mandatory: true + Default if specified: '__NULL__' + Default if unspecified: '__NULL__' + +* compaction run - Run Compaction for given instant time +``` + +### Validate Compaction + +Validating a compaction plan : Check if all the files necessary for compactions are present and are valid + +```java +hudi:stock_ticks_mor->compaction validate --instant 20181005222611 +... 
+ + COMPACTION PLAN VALID + + ___________________________________________________________________________________________________________________________________________________________________________________________________________________________ + | File Id | Base Instant Time| Base Data File | Num Delta Files| Valid| Error| + |==========================================================================================================================================================================================================================| + | 05320e98-9a57-4c38-b809-a6beaaeb36bd| 20181005222445 | hdfs://namenode:8020/user/hive/warehouse/stock_ticks_mor/2018/08/31/05320e98-9a57-4c38-b809-a6beaaeb36bd_0_20181005222445.parquet| 1 | true | | + + + +hudi:stock_ticks_mor->compaction validate --instant 20181005222601 + + COMPACTION PLAN INVALID + + _______________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________ + | File Id | Base Instant Time| Base Data File | Num Delta Files| Valid| Error | + |=====================================================================================================================================================================================================================================================================================================| + | 05320e98-9a57-4c38-b809-a6beaaeb36bd| 20181005222445 | hdfs://namenode:8020/user/hive/warehouse/stock_ticks_mor/2018/08/31/05320e98-9a57-4c38-b809-a6beaaeb36bd_0_20181005222445.parquet| 1 | false| All log files specified in compaction operation is not present. Missing .... | +``` + +**NOTE:** The following commands must be executed without any other writer/ingestion application running. 
+\{: .notice--warning} + +Sometimes, it becomes necessary to remove a fileId from a compaction-plan inorder to speed-up or unblock compaction +operation. Any new log-files that happened on this file after the compaction got scheduled will be safely renamed +so that are preserved. Hudi provides the following CLI to support it + + +### Unscheduling Compaction + +```java +hudi:trips->compaction unscheduleFileId --fileId +.... +No File renames needed to unschedule file from pending compaction. Operation successful. +``` + +In other cases, an entire compaction plan needs to be reverted. This is supported by the following CLI + +```java +hudi:trips->compaction unschedule --instant +..... +No File renames needed to unschedule pending compaction. Operation successful. +``` + +### Repair Compaction + +The above compaction unscheduling operations could sometimes fail partially (e:g -> DFS temporarily unavailable). With +partial failures, the compaction operation could become inconsistent with the state of file-slices. When you run +`compaction validate`, you can notice invalid compaction operations if there is one. In these cases, the repair +command comes to the rescue, it will rearrange the file-slices so that there is no loss and the file-slices are +consistent with the compaction plan + +```java +hudi:stock_ticks_mor->compaction repair --instant 20181005222611 +...... +Compaction successfully repaired +..... +``` + +### Savepoint and Restore +As the name suggest, "savepoint" saves the table as of the commit time, so that it lets you restore the table to this +savepoint at a later point in time if need be. 
You can read more about savepoints and restore [here](disaster_recovery) + +To trigger savepoint for a hudi table +```java +connect --path /tmp/hudi_trips_cow/ +commits show +set --conf SPARK_HOME= +savepoint create --commit 20220128160245447 --sparkMaster local[2] +``` + +To restore the table to one of the savepointed commit: + +```java +connect --path /tmp/hudi_trips_cow/ +commits show +set --conf SPARK_HOME= +savepoints show +╔═══════════════════╗ +║ SavepointTime ║ +╠═══════════════════╣ +║ 20220128160245447 ║ +╚═══════════════════╝ +savepoint rollback --savepoint 20220128160245447 --sparkMaster local[2] +``` + +### Upgrade and Downgrade Table +In case the user needs to downgrade the version of Hudi library used, the Hudi table needs to be manually downgraded +on the newer version of Hudi CLI before library downgrade. To downgrade a Hudi table through CLI, user needs to specify +the target Hudi table version as follows: + +```shell +connect --path +downgrade table --toVersion +``` + +The following table shows the Hudi table versions corresponding to the Hudi release versions: + +| Hudi Table Version | Hudi Release Version(s) | +|:-------------------|:------------------------| +| `FIVE` or `5` | 0.12.x | +| `FOUR` or `4` | 0.11.x | +| `THREE` or `3` | 0.10.x | +| `TWO` or `2` | 0.9.x | +| `ONE` or `1` | 0.6.x - 0.8.x | +| `ZERO` or `0` | 0.5.x and below | + +For example, to downgrade a table from version `FIVE`(`5`) (current version) to `TWO`(`2`), you should run (use proper Spark master based +on your environment) + +```shell +downgrade table --toVersion TWO --sparkMaster local[2] +``` + +or + +```shell +downgrade table --toVersion 2 --sparkMaster local[2] +``` + +You can verify the table version by looking at the `hoodie.table.version` property in `.hoodie/hoodie.properties` under +the table path: + +```properties +hoodie.table.version=2 +``` + +Hudi CLI also provides the ability to manually upgrade a Hudi table. 
To upgrade a Hudi table through CLI: + +```shell +upgrade table --toVersion +``` + +:::note +Table upgrade is automatically handled by the Hudi write client in different deployment modes such as DeltaStreamer +after upgrading the Hudi library so that the user does not have to do manual upgrade. Such automatic table upgrade +is the **recommended** way in general, instead of using `upgrade` CLI command. + +Table upgrade from table version ONE to TWO requires key generator related configs such as +"hoodie.datasource.write.recordkey.field", which is only available when user configures the write job. So the table +upgrade from version ONE to TWO through CLI is not supported, and user should rely on the automatic upgrade in the write +client instead. +::: + +You may also run the upgrade command without specifying the target version. In such a case, the latest table version +corresponding to the library release version is used: + +```shell +upgrade table +``` + +### Change Hudi Table Type +There are cases we want to change the hudi table type. For example, change COW table to MOR for more efficient and +lower latency ingestion; change MOR to COW for better read performance and compatibility with downstream engines. +So we offer the table command to perform this modification conveniently. + +Changing **COW to MOR**, we can simply modify the `hoodie.table.type` in `hoodie.properties` to MERGE_ON_READ. + +While changing **MOR to COW**, we must make sure all the log files are compacted before modifying the table type, +or it will cause data loss. + +```shell +connect --path +table change-table-type +``` + +The parameter `target_table_type` candidates are below: + +| target table type | comment | +|:------------------|| +| MOR | Change COW table to MERGE_ON_READ. | +| COW | Change MOR table to COPY_ON_WRITE.
By default, changing to COW will **execute all pending compactions** and **perform a full compaction** if any log file left. Set `--enable-compaction=false` will disable the default compaction.
There are params can be set for the compaction operation:
`--parallelism`: Default `3`. Parallelism for hoodie compaction
`--sparkMaster`: Default `local`. Spark Master
`--sparkMemory`: Default `4G`. Spark executor memory
`--retry`: Default `1`. Number of retries
`--propsFilePath`: Default ` `. path to properties file on localfs or dfs with configurations for hoodie client for compacting
`--hoodieConfigs`: Default ` `. Any configuration that can be set in the properties file can be passed here in the form of an array | + + +Example below is changing MOR table to COW: +```shell +connect --path /var/dataset/test_table_mor2cow +desc +╔════════════════════════════════════════════════╤═════════════════════════════════════════╗ +║ Property │ Value ║ +╠════════════════════════════════════════════════╪═════════════════════════════════════════╣ +║ basePath │ /var/dataset/test_table_mor2cow ║ +╟────────────────────────────────────────────────┼─────────────────────────────────────────╢ +║ metaPath │ /var/dataset/test_table_mor2cow/.hoodie ║ +╟────────────────────────────────────────────────┼─────────────────────────────────────────╢ +║ fileSystem │ file ║ +╟────────────────────────────────────────────────┼─────────────────────────────────────────╢ +║ hoodie.table.name │ test_table ║ +╟────────────────────────────────────────────────┼─────────────────────────────────────────╢ +║ hoodie.compaction.record.merger.strategy │ eeb8d96f-b1e4-49fd-bbf8-28ac514178e5 ║ +╟────────────────────────────────────────────────┼─────────────────────────────────────────╢ +║ hoodie.table.metadata.partitions │ files ║ +╟────────────────────────────────────────────────┼─────────────────────────────────────────╢ +║ hoodie.table.type │ MERGE_ON_READ ║ +╟────────────────────────────────────────────────┼─────────────────────────────────────────╢ +║ hoodie.table.metadata.partitions.inflight │ ║ +╟────────────────────────────────────────────────┼─────────────────────────────────────────╢ +║ hoodie.archivelog.folder │ archived ║ +╟────────────────────────────────────────────────┼─────────────────────────────────────────╢ +║ hoodie.timeline.layout.version │ 1 ║ +╟────────────────────────────────────────────────┼─────────────────────────────────────────╢ +║ hoodie.table.checksum │ 2702201862 ║ +╟────────────────────────────────────────────────┼─────────────────────────────────────────╢ +║ 
hoodie.compaction.payload.type │ HOODIE_AVRO ║ +╟────────────────────────────────────────────────┼─────────────────────────────────────────╢ +║ hoodie.table.version │ 6 ║ +╟────────────────────────────────────────────────┼─────────────────────────────────────────╢ +║ hoodie.datasource.write.drop.partition.columns │ false ║ +╚════════════════════════════════════════════════╧═════════════════════════════════════════╝ + +table change-table-type COW +╔════════════════════════════════════════════════╤══════════════════════════════════════╤══════════════════════════════════════╗ +║ Property │ Old Value │ New Value ║ +╠════════════════════════════════════════════════╪══════════════════════════════════════╪══════════════════════════════════════╣ +║ hoodie.archivelog.folder │ archived │ archived ║ +╟────────────────────────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────╢ +║ hoodie.compaction.payload.type │ HOODIE_AVRO │ HOODIE_AVRO ║ +╟────────────────────────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────╢ +║ hoodie.compaction.record.merger.strategy │ eeb8d96f-b1e4-49fd-bbf8-28ac514178e5 │ eeb8d96f-b1e4-49fd-bbf8-28ac514178e5 ║ +╟────────────────────────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────╢ +║ hoodie.datasource.write.drop.partition.columns │ false │ false ║ +╟────────────────────────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────╢ +║ hoodie.table.checksum │ 2702201862 │ 2702201862 ║ +╟────────────────────────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────╢ +║ hoodie.table.metadata.partitions │ files │ files ║ +╟────────────────────────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────╢ +║ hoodie.table.metadata.partitions.inflight │ │ ║ 
+╟────────────────────────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────╢ +║ hoodie.table.name │ test_table │ test_table ║ +╟────────────────────────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────╢ +║ hoodie.table.type │ MERGE_ON_READ │ COPY_ON_WRITE ║ +╟────────────────────────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────╢ +║ hoodie.table.version │ 6 │ 6 ║ +╟────────────────────────────────────────────────┼──────────────────────────────────────┼──────────────────────────────────────╢ +║ hoodie.timeline.layout.version │ 1 │ 1 ║ +╚════════════════════════════════════════════════╧══════════════════════════════════════╧══════════════════════════════════════╝ +``` diff --git a/website/versioned_docs/version-1.0.0/cloud.md b/website/versioned_docs/version-1.0.0/cloud.md new file mode 100644 index 0000000000000..123abd5e6bea4 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/cloud.md @@ -0,0 +1,38 @@ +--- +title: Cloud Storage +keywords: [hudi, aws, gcp, oss, azure, cloud, juicefs] +summary: "In this page, we introduce how Hudi work with different Cloud providers." +toc: true +last_modified_at: 2021-10-12T10:50:00+08:00 +--- + +## Talking to Cloud Storage + +Immaterial of whether RDD/WriteClient APIs or Datasource is used, the following information helps configure access +to cloud stores. + +* [AWS S3](s3_hoodie)
+ Configurations required for S3 and Hudi co-operability. +* [Google Cloud Storage](gcs_hoodie)
+ Configurations required for GCS and Hudi co-operability. +* [Alibaba Cloud OSS](oss_hoodie)
+ Configurations required for OSS and Hudi co-operability. +* [Microsoft Azure](azure_hoodie)
+ Configurations required for Azure and Hudi co-operability. +* [Tencent Cloud Object Storage](cos_hoodie)
+ Configurations required for COS and Hudi co-operability. +* [IBM Cloud Object Storage](ibm_cos_hoodie)
+ Configurations required for IBM Cloud Object Storage and Hudi co-operability. +* [Baidu Cloud Object Storage](bos_hoodie)
+ Configurations required for BOS and Hudi co-operability. +* [JuiceFS](jfs_hoodie)
+ Configurations required for JuiceFS and Hudi co-operability. +* [Oracle Cloud Infrastructure](oci_hoodie)
+ Configurations required for OCI and Hudi co-operability. + +:::note +Many cloud object storage systems like [Amazon S3](https://docs.aws.amazon.com/s3/) allow you to set +lifecycle policies, such as [S3 Lifecycle](https://docs.aws.amazon.com/AmazonS3/latest/userguide/object-lifecycle-mgmt.html), +to manage objects. One of the policies is related to object expiration. If your organisation has configured such policies, +then please ensure to exclude (or have a longer expiry period) for Hudi tables. +::: \ No newline at end of file diff --git a/website/versioned_docs/version-1.0.0/clustering.md b/website/versioned_docs/version-1.0.0/clustering.md new file mode 100644 index 0000000000000..0bbbad9781a91 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/clustering.md @@ -0,0 +1,346 @@ +--- +title: Clustering +summary: "In this page, we describe async compaction in Hudi." +toc: true +last_modified_at: +--- + +## Background + +Apache Hudi brings stream processing to big data, providing fresh data while being an order of magnitude efficient over traditional batch processing. In a data lake/warehouse, one of the key trade-offs is between ingestion speed and query performance. Data ingestion typically prefers small files to improve parallelism and make data available to queries as soon as possible. However, query performance degrades poorly with a lot of small files. Also, during ingestion, data is typically co-located based on arrival time. However, the query engines perform better when the data frequently queried is co-located together. In most architectures each of these systems tend to add optimizations independently to improve performance which hits limitations due to un-optimized data layouts. 
This doc introduces a new kind of table service called clustering [[RFC-19]](https://cwiki.apache.org/confluence/display/HUDI/RFC+-+19+Clustering+data+for+freshness+and+query+performance) to reorganize data for improved query performance without compromising on ingestion speed. + + +## How is compaction different from clustering? + +Hudi is modeled like a log-structured storage engine with multiple versions of the data. +Particularly, [Merge-On-Read](/docs/table_types#merge-on-read-table) +tables in Hudi store data using a combination of base file in columnar format and row-based delta logs that contain +updates. Compaction is a way to merge the delta logs with base files to produce the latest file slices with the most +recent snapshot of data. Compaction helps to keep the query performance in check (larger delta log files would incur +longer merge times on query side). On the other hand, clustering is a data layout optimization technique. One can stitch +together small files into larger files using clustering. Additionally, data can be clustered by sort key so that queries +can take advantage of data locality. + +## Clustering Architecture + +At a high level, Hudi provides different operations such as insert/upsert/bulk_insert through it’s write client API to be able to write data to a Hudi table. To be able to choose a trade-off between file size and ingestion speed, Hudi provides a knob `hoodie.parquet.small.file.limit` to be able to configure the smallest allowable file size. Users are able to configure the small file [soft limit](https://hudi.apache.org/docs/configurations/#hoodieparquetsmallfilelimit) to `0` to force new data to go into a new set of filegroups or set it to a higher value to ensure new data gets “padded” to existing files until it meets that limit that adds to ingestion latencies. 
+ + + +To be able to support an architecture that allows for fast ingestion without compromising query performance, we have introduced a ‘clustering’ service to rewrite the data to optimize Hudi data lake file layout. + +Clustering table service can run asynchronously or synchronously adding a new action type called “REPLACE”, that will mark the clustering action in the Hudi metadata timeline. + + + +### Overall, there are 2 steps to clustering + +1. Scheduling clustering: Create a clustering plan using a pluggable clustering strategy. +2. Execute clustering: Process the plan using an execution strategy to create new files and replace old files. + + +### Schedule clustering + +Following steps are followed to schedule clustering. + +1. Identify files that are eligible for clustering: Depending on the clustering strategy chosen, the scheduling logic will identify the files eligible for clustering. +2. Group files that are eligible for clustering based on specific criteria. Each group is expected to have data size in multiples of ‘targetFileSize’. Grouping is done as part of ‘strategy’ defined in the plan. Additionally, there is an option to put a cap on group size to improve parallelism and avoid shuffling large amounts of data. +3. Finally, the clustering plan is saved to the timeline in an avro [metadata format](https://github.com/apache/hudi/blob/master/hudi-common/src/main/avro/HoodieClusteringPlan.avsc). + + +### Execute clustering + +1. Read the clustering plan and get the ‘clusteringGroups’ that mark the file groups that need to be clustered. +2. For each group, we instantiate appropriate strategy class with strategyParams (example: sortColumns) and apply that strategy to rewrite the data. +3. Create a “REPLACE” commit and update the metadata in [HoodieReplaceCommitMetadata](https://github.com/apache/hudi/blob/master/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieReplaceCommitMetadata.java). 
+ + +Clustering Service builds on Hudi’s MVCC based design to allow for writers to continue to insert new data while clustering action runs in the background to reformat data layout, ensuring snapshot isolation between concurrent readers and writers. + +NOTE: Clustering can only be scheduled for tables / partitions not receiving any concurrent updates. In the future, concurrent updates use-case will be supported as well. + +![Clustering example](/assets/images/blog/clustering/clustering1_new.png) +_Figure: Illustrating query performance improvements by clustering_ + +## Clustering Usecases + +### Batching small files + +As mentioned in the intro, streaming ingestion generally results in smaller files in your data lake. But having a lot of +such small files could lead to higher query latency. From our experience supporting community users, there are quite a +few users who are using Hudi just for small file handling capabilities. So, you could employ clustering to batch a lot +of such small files into larger ones. + +![Batching small files](/assets/images/blog/clustering/clustering2_new.png) + +### Cluster by sort key + +Another classic problem in data lake is the arrival time vs event time problem. Generally you write data based on +arrival time, while query predicates do not sit well with it. With clustering, you can re-write your data by sorting +based on query predicates and so, your data skipping will be very efficient and your query can ignore scanning a lot of +unnecessary data. + +![Batching small files](/assets/images/blog/clustering/clustering_3.png) + +## Clustering Strategies + +On a high level, clustering creates a plan based on a configurable strategy, groups eligible files based on specific +criteria and then executes the plan. As mentioned before, clustering plan as well as execution depends on configurable +strategy. These strategies can be broadly classified into three types: clustering plan strategy, execution strategy and +update strategy. 
+ +### Plan Strategy + +This strategy comes into play while creating clustering plan. It helps to decide what file groups should be clustered +and how many output file groups should the clustering produce. Note that these strategies are easily pluggable using the +config [hoodie.clustering.plan.strategy.class](/docs/configurations#hoodieclusteringplanstrategyclass). + +Different plan strategies are as follows: + +#### Size-based clustering strategies + +This strategy creates clustering groups based on max size allowed per group. Also, it excludes files that are greater +than the small file limit from the clustering plan. Available strategies depending on write client +are: `SparkSizeBasedClusteringPlanStrategy`, `FlinkSizeBasedClusteringPlanStrategy` +and `JavaSizeBasedClusteringPlanStrategy`. Furthermore, Hudi provides flexibility to include or exclude partitions for +clustering, tune the file size limits, maximum number of output groups. Please refer to [hoodie.clustering.plan.strategy.small.file.limit](https://hudi.apache.org/docs/next/configurations/#hoodieclusteringplanstrategysmallfilelimit) +, [hoodie.clustering.plan.strategy.max.num.groups](https://hudi.apache.org/docs/next/configurations/#hoodieclusteringplanstrategymaxnumgroups), [hoodie.clustering.plan.strategy.max.bytes.per.group](https://hudi.apache.org/docs/next/configurations/#hoodieclusteringplanstrategymaxbytespergroup) +, [hoodie.clustering.plan.strategy.target.file.max.bytes](https://hudi.apache.org/docs/next/configurations/#hoodieclusteringplanstrategytargetfilemaxbytes) for more details. + +| Config Name | Default | Description | +|---------------------------------------------------------| -------------------|| +| hoodie.clustering.plan.strategy.partition.selected | N/A **(Required)** | Comma separated list of partitions to run clustering

`Config Param: PARTITION_SELECTED`
`Since Version: 0.11.0` | +| hoodie.clustering.plan.strategy.partition.regex.pattern | N/A **(Required)** | Filter clustering partitions that matched regex pattern

`Config Param: PARTITION_REGEX_PATTERN`
`Since Version: 0.11.0` | +| hoodie.clustering.plan.partition.filter.mode | NONE (Optional) | Partition filter mode used in the creation of clustering plan. Possible values:
  • `NONE`: Do not filter partitions. The clustering plan will include all partitions that have clustering candidates.
  • `RECENT_DAYS`: This filter assumes that your data is partitioned by date. The clustering plan will only include partitions from K days ago to N days ago, where K >= N. K is determined by `hoodie.clustering.plan.strategy.daybased.lookback.partitions` and N is determined by `hoodie.clustering.plan.strategy.daybased.skipfromlatest.partitions`.
  • `SELECTED_PARTITIONS`: The clustering plan will include only partition paths with names that sort within the inclusive range [`hoodie.clustering.plan.strategy.cluster.begin.partition`, `hoodie.clustering.plan.strategy.cluster.end.partition`].
  • `DAY_ROLLING`: To determine the partitions in the clustering plan, the eligible partitions will be sorted in ascending order. Each partition will have an index i in that list. The clustering plan will only contain partitions such that i mod 24 = H, where H is the current hour of the day (from 0 to 23).

`Config Param: PLAN_PARTITION_FILTER_MODE_NAME`
`Since Version: 0.11.0` | + + +#### SparkSingleFileSortPlanStrategy + +In this strategy, clustering group for each partition is built in the same way as `SparkSizeBasedClusteringPlanStrategy` +. The difference is that the output group is 1 and file group id remains the same, +while `SparkSizeBasedClusteringPlanStrategy` can create multiple file groups with newer fileIds. + +#### SparkConsistentBucketClusteringPlanStrategy + +This strategy is specifically used for consistent bucket index. This will be leveraged to expand your bucket index (from +static partitioning to dynamic). Typically, users don’t need to use this strategy. Hudi internally uses this for +dynamically expanding the buckets for bucket index datasets. + +:::note The latter two strategies are applicable only for the Spark engine. +::: + +### Execution Strategy + +After building the clustering groups in the planning phase, Hudi applies execution strategy, for each group, primarily +based on sort columns and size. The strategy can be specified using the +config [hoodie.clustering.execution.strategy.class](/docs/configurations/#hoodieclusteringexecutionstrategyclass). By +default, Hudi sorts the file groups in the plan by the specified columns, while meeting the configured target file +sizes. 
+ +| Config Name | Default | Description | +| --------------------------------------------| ----------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| hoodie.clustering.execution.strategy.class | org.apache.hudi.client.clustering.run.strategy.SparkSortAndSizeExecutionStrategy (Optional) | Config to provide a strategy class (subclass of RunClusteringStrategy) to define how the clustering plan is executed. 
By default, we sort the file groups in th plan by the specified columns, while meeting the configured target file sizes.

`Config Param: EXECUTION_STRATEGY_CLASS_NAME`
`Since Version: 0.7.0` | + +The available strategies are as follows: + +1. `SPARK_SORT_AND_SIZE_EXECUTION_STRATEGY`: Uses bulk_insert to re-write data from input file groups. + 1. Set `hoodie.clustering.execution.strategy.class` + to `org.apache.hudi.client.clustering.run.strategy.SparkSortAndSizeExecutionStrategy`. + 2. `hoodie.clustering.plan.strategy.sort.columns`: Columns to sort the data while clustering. This goes in + conjunction with layout optimization strategies depending on your query predicates. One can set comma separated + list of columns that needs to be sorted in this config. +2. `JAVA_SORT_AND_SIZE_EXECUTION_STRATEGY`: Similar to `SPARK_SORT_AND_SIZE_EXECUTION_STRATEGY`, for the Java and Flink + engines. Set `hoodie.clustering.execution.strategy.class` + to `org.apache.hudi.client.clustering.run.strategy.JavaSortAndSizeExecutionStrategy`. +3. `SPARK_CONSISTENT_BUCKET_EXECUTION_STRATEGY`: As the name implies, this is applicable to dynamically expand + consistent bucket index and only applicable to the Spark engine. Set `hoodie.clustering.execution.strategy.class` + to `org.apache.hudi.client.clustering.run.strategy.SparkConsistentBucketClusteringExecutionStrategy`. + +### Update Strategy + +Currently, clustering can only be scheduled for tables/partitions not receiving any concurrent updates. By default, +the config for update strategy - [`hoodie.clustering.updates.strategy`](/docs/configurations/#hoodieclusteringupdatesstrategy) is set to ***SparkRejectUpdateStrategy***. If some file group has updates during clustering then it will reject updates and throw an +exception. However, in some use-cases updates are very sparse and do not touch most file groups. The default strategy to +simply reject updates does not seem fair. In such use-cases, users can set the config to ***SparkAllowUpdateStrategy***. + +We discussed the critical strategy configurations. 
All other configurations related to clustering are +listed [here](/docs/configurations/#Clustering-Configs). Out of this list, a few configurations that will be very useful +for inline or async clustering are shown below with code samples. + +## Inline clustering + +Inline clustering happens synchronously with the regular ingestion writer or as part of the data ingestion pipeline. This means the next round of ingestion cannot proceed until the clustering is complete With inline clustering, Hudi will schedule, plan clustering operations after each commit is completed and execute the clustering plans after it’s created. This is the simplest deployment model to run because it’s easier to manage than running different asynchronous Spark jobs. This mode is supported on Spark Datasource, Flink, Spark-SQL and DeltaStreamer in a sync-once mode. + +For this deployment mode, please enable and set: `hoodie.clustering.inline` + +To choose how often clustering is triggered, also set: `hoodie.clustering.inline.max.commits`. + +Inline clustering can be setup easily using spark dataframe options. +See sample below: + +```scala +import org.apache.hudi.QuickstartUtils._ +import scala.collection.JavaConversions._ +import org.apache.spark.sql.SaveMode._ +import org.apache.hudi.DataSourceReadOptions._ +import org.apache.hudi.DataSourceWriteOptions._ +import org.apache.hudi.config.HoodieWriteConfig._ + + +val df = //generate data frame +df.write.format("org.apache.hudi"). + options(getQuickstartWriteConfigs). + option("hoodie.datasource.write.precombine.field", "ts"). + option("hoodie.datasource.write.recordkey.field", "uuid"). + option("hoodie.datasource.write.partitionpath.field", "partitionpath"). + option("hoodie.table.name", "tableName"). + option("hoodie.parquet.small.file.limit", "0"). + option("hoodie.clustering.inline", "true"). + option("hoodie.clustering.inline.max.commits", "4"). + option("hoodie.clustering.plan.strategy.target.file.max.bytes", "1073741824"). 
+ option("hoodie.clustering.plan.strategy.small.file.limit", "629145600"). + option("hoodie.clustering.plan.strategy.sort.columns", "column1,column2"). //optional, if sorting is needed as part of rewriting data + mode(Append). + save("dfs://location"); +``` + +## Async Clustering + +Async clustering runs the clustering table service in the background without blocking the regular ingestions writers. There are three different ways to deploy an asynchronous clustering process: + +- **Asynchronous execution within the same process**: In this deployment mode, Hudi will schedule and plan the clustering operations after each commit is completed as part of the ingestion pipeline. Separately, Hudi spins up another thread within the same job and executes the clustering table service. This is supported by Spark Streaming, Flink and DeltaStreamer in continuous mode. For this deployment mode, please enable `hoodie.clustering.async.enabled` and `hoodie.clustering.async.max.commits​`. +- **Asynchronous scheduling and execution by a separate process**: In this deployment mode, the application will write data to a Hudi table as part of the ingestion pipeline. A separate clustering job will schedule, plan and execute the clustering operation. By running a different job for the clustering operation, it rebalances how Hudi uses compute resources: fewer compute resources are needed for the ingestion, which makes ingestion latency stable, and an independent set of compute resources are reserved for the clustering process. Please configure the lock providers for the concurrency control among all jobs (both writer and table service jobs). In general, configure lock providers when there are two different jobs or two different processes occurring. All writers support this deployment model. For this deployment mode, no clustering configs should be set for the ingestion writer. 
+- **Scheduling inline and executing async**: In this deployment mode, the application ingests data and schedules the clustering in one job; in another, the application executes the clustering plan. The supported writers (see below) won’t be blocked from ingesting data. If the metadata table is enabled, a lock provider is not needed. However, if the metadata table is enabled, please ensure all jobs have the lock providers configured for concurrency control. All writers support this deployment option. For this deployment mode, please enable, `hoodie.clustering.schedule.inline` and `hoodie.clustering.async.enabled`. + +Hudi supports [multi-writers](https://hudi.apache.org/docs/concurrency_control#enabling-multi-writing) which provides +snapshot isolation between multiple table services, thus allowing writers to continue with ingestion while clustering +runs in the background. + +| Config Name | Default | Description | +| --------------------------------------------------------------------------------------------------- | --------------------------------------- || +| hoodie.clustering.async.enabled | false (Optional) | Enable running of clustering service, asynchronously as inserts happen on the table.

`Config Param: ASYNC_CLUSTERING_ENABLE`
`Since Version: 0.7.0` | +| hoodie.clustering.async.max.commits | 4 (Optional) | Config to control frequency of async clustering

`Config Param: ASYNC_CLUSTERING_MAX_COMMITS`
`Since Version: 0.9.0` | + +## Setup Asynchronous Clustering +Users can leverage [HoodieClusteringJob](https://cwiki.apache.org/confluence/display/HUDI/RFC+-+19+Clustering+data+for+freshness+and+query+performance#RFC19Clusteringdataforfreshnessandqueryperformance-SetupforAsyncclusteringJob) +to setup 2-step asynchronous clustering. + +### HoodieClusteringJob +By specifying the `scheduleAndExecute` mode both schedule as well as clustering can be achieved in the same step. +The appropriate mode can be specified using `-mode` or `-m` option. There are three modes: + +1. `schedule`: Make a clustering plan. This gives an instant which can be passed in execute mode. +2. `execute`: Execute a clustering plan at a particular instant. If no instant-time is specified, HoodieClusteringJob will execute for the earliest instant on the Hudi timeline. +3. `scheduleAndExecute`: Make a clustering plan first and execute that plan immediately. + +Note that to run this job while the original writer is still running, please enable multi-writing: +``` +hoodie.write.concurrency.mode=optimistic_concurrency_control +hoodie.write.lock.provider=org.apache.hudi.client.transaction.lock.ZookeeperBasedLockProvider +``` + +A sample spark-submit command to setup HoodieClusteringJob is as below: + +```bash +spark-submit \ +--jars "packaging/hudi-utilities-slim-bundle/target/hudi-utilities-slim-bundle_2.12-1.0.0.jar,packaging/hudi-spark-bundle/target/hudi-spark3.5-bundle_2.12-1.0.0.jar" \ +--class org.apache.hudi.utilities.HoodieClusteringJob \ +/path/to/hudi-utilities-slim-bundle/target/hudi-utilities-slim-bundle_2.12-1.0.0.jar \ +--props /path/to/config/clusteringjob.properties \ +--mode scheduleAndExecute \ +--base-path /path/to/hudi_table/basePath \ +--table-name hudi_table_schedule_clustering \ +--spark-memory 1g +``` +A sample `clusteringjob.properties` file: +``` +hoodie.clustering.async.enabled=true +hoodie.clustering.async.max.commits=4 
+hoodie.clustering.plan.strategy.target.file.max.bytes=1073741824 +hoodie.clustering.plan.strategy.small.file.limit=629145600 +hoodie.clustering.execution.strategy.class=org.apache.hudi.client.clustering.run.strategy.SparkSortAndSizeExecutionStrategy +hoodie.clustering.plan.strategy.sort.columns=column1,column2 +``` + +### HoodieStreamer + +This brings us to our users' favorite utility in Hudi. Now, we can trigger asynchronous clustering with Hudi Streamer. +Just set the `hoodie.clustering.async.enabled` config to true and specify other clustering config in properties file +whose location can be pased as `—props` when starting the Hudi Streamer (just like in the case of HoodieClusteringJob). + +A sample spark-submit command to setup HoodieStreamer is as below: + + +```bash +spark-submit \ +--jars "packaging/hudi-utilities-slim-bundle/target/hudi-utilities-slim-bundle_2.12-1.0.0.jar,packaging/hudi-spark-bundle/target/hudi-spark3.5-bundle_2.12-1.0.0.jar" \ +--class org.apache.hudi.utilities.streamer.HoodieStreamer \ +/path/to/hudi-utilities-slim-bundle/target/hudi-utilities-slim-bundle_2.12-1.0.0.jar \ +--props /path/to/config/clustering_kafka.properties \ +--schemaprovider-class org.apache.hudi.utilities.schema.SchemaRegistryProvider \ +--source-class org.apache.hudi.utilities.sources.AvroKafkaSource \ +--source-ordering-field impresssiontime \ +--table-type COPY_ON_WRITE \ +--target-base-path /path/to/hudi_table/basePath \ +--target-table impressions_cow_cluster \ +--op INSERT \ +--hoodie-conf hoodie.clustering.async.enabled=true \ +--continuous +``` + +### Spark Structured Streaming + +We can also enable asynchronous clustering with Spark structured streaming sink as shown below. 
+```scala +val commonOpts = Map( + "hoodie.insert.shuffle.parallelism" -> "4", + "hoodie.upsert.shuffle.parallelism" -> "4", + "hoodie.datasource.write.recordkey.field" -> "_row_key", + "hoodie.datasource.write.partitionpath.field" -> "partition", + "hoodie.datasource.write.precombine.field" -> "timestamp", + "hoodie.table.name" -> "hoodie_test" +) + +def getAsyncClusteringOpts(isAsyncClustering: String, + clusteringNumCommit: String, + executionStrategy: String):Map[String, String] = { + commonOpts + (DataSourceWriteOptions.ASYNC_CLUSTERING_ENABLE.key -> isAsyncClustering, + HoodieClusteringConfig.ASYNC_CLUSTERING_MAX_COMMITS.key -> clusteringNumCommit, + HoodieClusteringConfig.EXECUTION_STRATEGY_CLASS_NAME.key -> executionStrategy + ) +} + +def initStreamingWriteFuture(hudiOptions: Map[String, String]): Future[Unit] = { + val streamingInput = // define the source of streaming + Future { + println("streaming starting") + streamingInput + .writeStream + .format("org.apache.hudi") + .options(hudiOptions) + .option("checkpointLocation", basePath + "/checkpoint") + .mode(Append) + .start() + .awaitTermination(10000) + println("streaming ends") + } +} + +def structuredStreamingWithClustering(): Unit = { + val df = //generate data frame + val hudiOptions = getClusteringOpts("true", "1", "org.apache.hudi.client.clustering.run.strategy.SparkSortAndSizeExecutionStrategy") + val f1 = initStreamingWriteFuture(hudiOptions) + Await.result(f1, Duration.Inf) +} +``` + +## Java Client + +Clustering is also supported via Java client. Plan strategy `org.apache.hudi.client.clustering.plan.strategy.JavaSizeBasedClusteringPlanStrategy` +and execution strategy `org.apache.hudi.client.clustering.run.strategy.JavaSortAndSizeExecutionStrategy` are supported +out-of-the-box. Note that as of now only linear sort is supported in Java execution strategy. + +## Related Resources +

Videos

+ +* [Understanding Clustering in Apache Hudi and the Benefits of Asynchronous Clustering](https://www.youtube.com/watch?v=R_sm4wlGXuE) diff --git a/website/versioned_docs/version-1.0.0/compaction.md b/website/versioned_docs/version-1.0.0/compaction.md new file mode 100644 index 0000000000000..7859030052aa6 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/compaction.md @@ -0,0 +1,228 @@ +--- +title: Compaction +summary: "In this page, we describe async compaction in Hudi." +toc: true +toc_min_heading_level: 2 +toc_max_heading_level: 4 +last_modified_at: +--- +## Background +Compaction is a table service employed by Hudi specifically in Merge On Read(MOR) tables to merge updates from row-based log +files to the corresponding columnar-based base file periodically to produce a new version of the base file. Compaction is +not applicable to Copy On Write(COW) tables and only applies to MOR tables. + +### Why MOR tables need compaction? +To understand the significance of compaction in MOR tables, it is helpful to understand the MOR table layout first. In Hudi, +data is organized in terms of [file groups](https://hudi.apache.org/docs/file_layouts/). Each file group in a MOR table +consists of a base file and one or more log files. Typically, during writes, inserts are stored in the base file, and updates +are appended to log files. + +![mor_table_file_layout](/assets/images/hudi_mor_file_layout.jpg) +_Figure: MOR table file layout showing different file groups with base data file and log files_ + +During the compaction process, updates from the log files are merged with the base file to form a new version of the +base file as shown below. Since MOR is designed to be write-optimized, on new writes, after index tagging is complete, +Hudi appends the records pertaining to each file groups as log blocks in log files. There is no synchronous merge +happening during write, resulting in a lower write amplification and better write latency. 
In contrast, on new writes to a +COW table, Hudi combines the new writes with the older base file to produce a new version of the base file resulting in +a higher write amplification and higher write latencies. + +![mor_table_file_layout](/assets/images/hudi_mor_file_layout_post_compaction.jpg) +_Figure: Compaction on a given file group_ + +While serving the read query(snapshot read), for each file group, records in base file and all its corresponding log +files are merged together and served. And hence the read latency for MOR snapshot query might be higher compared to +COW table since there is no merge involved in case of COW at read time. Compaction takes care of merging the updates from +log files with the base file at regular intervals to bound the growth of log files and to ensure the read latencies do not +spike up. + +## Compaction Architecture +There are two steps to compaction. +- ***Compaction Scheduling***: In this step, Hudi scans the partitions and selects file slices to be compacted. A compaction + plan is finally written to Hudi timeline. +- ***Compaction Execution***: In this step the compaction plan is read and file slices are compacted. + +### Strategies in Compaction Scheduling +There are two strategies involved in scheduling the compaction: +- Trigger Strategy: Determines how often to trigger scheduling of the compaction. +- Compaction Strategy: Determines which file groups to compact. + +Hudi provides various options for both these strategies as discussed below. + +#### Trigger Strategies + +| Config Name | Default | Description | +|----------------------------------------------------|-------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------| +| hoodie.compact.inline.trigger.strategy | NUM_COMMITS (Optional) | org.apache.hudi.table.action.compact.CompactionTriggerStrategy: Controls when compaction is scheduled.
`Config Param: INLINE_COMPACT_TRIGGER_STRATEGY`
+
  • `NUM_COMMITS`: triggers compaction when there are at least N delta commits after last completed compaction.
  • `NUM_COMMITS_AFTER_LAST_REQUEST`: triggers compaction when there are at least N delta commits after last completed or requested compaction.
  • `TIME_ELAPSED`: triggers compaction after N seconds since last compaction.
  • `NUM_AND_TIME`: triggers compaction when there are at least N delta commits and N seconds have elapsed (both conditions must be satisfied) after last completed compaction.
  • `NUM_OR_TIME`: triggers compaction when there are at least N delta commits or N seconds have elapsed (either condition is satisfied) after last completed compaction.
| + +#### Compaction Strategies +| Config Name | Default | Description | +|----------------------------------------------------|-------------------------|| +| hoodie.compaction.strategy | org.apache.hudi.table.action.compact.strategy.LogFileSizeBasedCompactionStrategy (Optional) | Compaction strategy decides which file groups are picked up for compaction during each compaction run. By default, Hudi picks the log file with most accumulated unmerged data.

`Config Param: COMPACTION_STRATEGY` | + +Available Strategies (Provide the full package name when using the strategy):
  • `LogFileNumBasedCompactionStrategy`: +orders the compactions based on the total log files count, filters the file group with log files count greater than the +threshold and limits the compactions within a configured IO bound.
  • `LogFileSizeBasedCompactionStrategy`: orders +the compactions based on the total log files size, filters the file group whose log files size is greater than the +threshold and limits the compactions within a configured IO bound.
  • `BoundedIOCompactionStrategy`: CompactionStrategy +which looks at total IO to be done for the compaction (read + write) and limits the list of compactions to be under a +configured limit on the IO.
  • `BoundedPartitionAwareCompactionStrategy`: This strategy ensures that the last N partitions +are picked up even if there are later partitions created for the table. lastNPartitions is defined as the N partitions before +the currentDate. For example, suppose currentDay = 2018/01/01 and the table has partitions for 2018/02/02 and 2018/03/03 beyond the currentDay. This +strategy will pick up the following partitions for compaction: (2018/01/01, allPartitionsInRange[(2018/01/01 - lastNPartitions) +to 2018/01/01), 2018/02/02, 2018/03/03)
  • `DayBasedCompactionStrategy`:This strategy orders compactions in reverse +order of creation of Hive Partitions. It helps to compact data in latest partitions first and then older capped at the +Total_IO allowed.
  • `UnBoundedCompactionStrategy`: UnBoundedCompactionStrategy will not change ordering or filter +any compaction. It is a pass-through and will compact all the base files which have a log file. This usually means +no-intelligence on compaction.
  • `UnBoundedPartitionAwareCompactionStrategy`: UnBoundedPartitionAwareCompactionStrategy is a custom UnBounded Strategy. This will filter out all the partitions that +are eligible to be compacted by a `BoundedPartitionAwareCompactionStrategy` and return the result. This is done +so that a long running UnBoundedPartitionAwareCompactionStrategy does not step over partitions in a shorter running +BoundedPartitionAwareCompactionStrategy. Essentially, this is an inverse of the partitions chosen in +BoundedPartitionAwareCompactionStrategy.
+ +:::note +Please refer to [advanced configs](https://hudi.apache.org/docs/next/configurations#Compaction-Configs) for more details. +::: + +## Ways to trigger Compaction + +### Inline +By default, compaction is run asynchronously. + +If latency of ingesting records is important for you, you are most likely using Merge-On-Read tables. +Merge-On-Read tables store data using a combination of columnar (e.g parquet) + row based (e.g avro) file formats. +Updates are logged to delta files & later compacted to produce new versions of columnar files. +To improve ingestion latency, Async Compaction is the default configuration. + +If immediate read performance of a new commit is important for you, or you want simplicity of not managing separate compaction jobs, +you may want synchronous inline compaction, which means that as a commit is written it is also compacted by the same job. + +For this deployment mode, please use `hoodie.compact.inline = true` for Spark Datasource and Spark SQL writers. For +HoodieStreamer sync once mode inline compaction can be achieved by passing the flag `--disable-compaction` (Meaning to +disable async compaction). Further in HoodieStreamer when both +ingestion and compaction is running in the same spark context, you can use resource allocation configuration +in Hudi Streamer CLI such as (`--delta-sync-scheduling-weight`, +`--compact-scheduling-weight`, `--delta-sync-scheduling-minshare`, and `--compact-scheduling-minshare`) +to control executor allocation between ingestion and compaction. + + +### Async & Offline Compaction models + +There are a couple of ways here to trigger compaction . + +#### Async execution within the same process +In streaming ingestion write models like HoodieStreamer +continuous mode, Flink and Spark Streaming, async compaction is enabled by default and runs alongside without blocking +regular ingestion. 
+ +##### Spark Structured Streaming + +Compactions are scheduled and executed asynchronously inside the +streaming job. Here is an example snippet in Java: + +```java +import org.apache.hudi.DataSourceWriteOptions; +import org.apache.hudi.HoodieDataSourceHelpers; +import org.apache.hudi.config.HoodieCompactionConfig; +import org.apache.hudi.config.HoodieWriteConfig; + +import org.apache.spark.sql.streaming.OutputMode; +import org.apache.spark.sql.streaming.ProcessingTime; + + + DataStreamWriter writer = streamingInput.writeStream().format("org.apache.hudi") + .option("hoodie.datasource.write.operation", operationType) + .option("hoodie.datasource.write.table.type", tableType) + .option("hoodie.datasource.write.recordkey.field", "_row_key") + .option("hoodie.datasource.write.partitionpath.field", "partition") + .option("hoodie.datasource.write.precombine.field", "timestamp") + .option("hoodie.compact.inline.max.delta.commits", "10") + .option("hoodie.datasource.compaction.async.enable", "true") + .option("hoodie.table.name", tableName).option("checkpointLocation", checkpointLocation) + .outputMode(OutputMode.Append()); + writer.trigger(new ProcessingTime(30000)).start(tablePath); +``` + +##### Hudi Streamer Continuous Mode +Hudi Streamer provides continuous ingestion mode where a single long running Spark application +ingests data to a Hudi table continuously from upstream sources. In this mode, Hudi supports managing asynchronous +compactions.
Here is an example snippet for running in continuous mode with async compactions + +```properties +spark-submit --packages org.apache.hudi:hudi-utilities-slim-bundle_2.12:1.0.0,org.apache.hudi:hudi-spark3.5-bundle_2.12:1.0.0 \ +--class org.apache.hudi.utilities.streamer.HoodieStreamer \ +--table-type MERGE_ON_READ \ +--target-base-path \ +--target-table \ +--source-class org.apache.hudi.utilities.sources.JsonDFSSource \ +--source-ordering-field ts \ +--props /path/to/source.properties \ +--continuous +``` + +#### Scheduling and Execution by a separate process +For some use cases with long running table services, instead of having the regular writes block, users have the option to run +both steps of the compaction ([scheduling and execution](#compaction-architecture)) offline in a separate process altogether. +This allows for regular writers to not bother about these compaction steps and allows users to provide more resources for +the compaction job as needed. + +:::note +This model needs a lock provider configured for all jobs - the regular writer as well as the offline compaction job. +::: + +#### Scheduling inline and executing async + +In this model, it is possible for a Spark Datasource writer or a Flink job to just schedule the compaction inline (that +will serialize the compaction plan in the timeline but will not execute it). And then a separate utility like +HoodieCompactor or HoodieFlinkCompactor can take care of periodically executing the compaction plan. + +:::note +This model may need a lock provider **if** metadata table is enabled. +::: + +#### Hudi Compactor Utility +Hudi provides a standalone tool to execute specific compactions asynchronously. Below is an example and you can read more in the [deployment guide](/docs/cli#compactions). +The compactor utility allows both scheduling and execution of compaction.
+ +Example: +```properties +spark-submit --packages org.apache.hudi:hudi-utilities-slim-bundle_2.12:1.0.0,org.apache.hudi:hudi-spark3.5-bundle_2.12:1.0.0 \ +--class org.apache.hudi.utilities.HoodieCompactor \ +--base-path \ +--table-name \ +--schema-file \ +--instant-time +``` + +Note, the `instant-time` parameter is now optional for the Hudi Compactor Utility. If using the utility without `--instant time`, +the spark-submit will execute the earliest scheduled compaction on the Hudi timeline. + +#### Hudi CLI +Hudi CLI is yet another way to execute specific compactions asynchronously. Here is an example and you can read more in the [deployment guide](/docs/cli#compactions) + +Example: +```properties +hudi:trips->compaction run --tableName --parallelism --compactionInstant +... +``` + +#### Flink Offline Compaction +Offline compaction needs to submit the Flink task on the command line. The program entry is as follows: `hudi-flink-bundle_2.11-0.9.0-SNAPSHOT.jar` : +`org.apache.hudi.sink.compact.HoodieFlinkCompactor` + +```bash +# Command line +./bin/flink run -c org.apache.hudi.sink.compact.HoodieFlinkCompactor lib/hudi-flink-bundle_2.11-0.9.0.jar --path hdfs://xxx:9000/table +``` + +#### Options + +| Option Name | Default | Description | +| ----------- |-------------------------------------------------------------------------------------------------------------------------------| ------- | +| `--path` | `n/a **(Required)**` | The path where the target table is stored on Hudi | +| `--compaction-max-memory` | `100` (Optional) | The index map size of log data during compaction, 100 MB by default. If you have enough memory, you can turn up this parameter | +| `--schedule` | `false` (Optional) | whether to execute the operation of scheduling compaction plan. When the write process is still writing, turning on this parameter have a risk of losing data. 
Therefore, it must be ensured that there are no write tasks currently writing data to this table when this parameter is turned on | +| `--seq` | `LIFO` (Optional) | The order in which compaction tasks are executed. Executing from the latest compaction plan by default. `LIFO`: executing from the latest plan. `FIFO`: executing from the oldest plan. | +| `--service` | `false` (Optional) | Whether to start a monitoring service that checks and schedules new compaction task in configured interval. | +| `--min-compaction-interval-seconds` | `600(s)` (optional) | The checking interval for service mode, by default 10 minutes. | diff --git a/website/versioned_docs/version-1.0.0/comparison.md b/website/versioned_docs/version-1.0.0/comparison.md new file mode 100644 index 0000000000000..681b359a4de8f --- /dev/null +++ b/website/versioned_docs/version-1.0.0/comparison.md @@ -0,0 +1,56 @@ +--- +title: "Comparison" +keywords: [ apache, hudi, kafka, kudu, hive, hbase, stream processing] +last_modified_at: 2019-12-30T15:59:57-04:00 +--- + +Apache Hudi fills a big void for processing data on top of DFS, and thus mostly co-exists nicely with these technologies. However, +it would be useful to understand how Hudi fits into the current big data ecosystem, contrasting it with a few related systems +and bring out the different tradeoffs these systems have accepted in their design. + +## Kudu + +[Apache Kudu](https://kudu.apache.org) is a storage system that has similar goals as Hudi, which is to bring real-time analytics on petabytes of data via first +class support for `upserts`. A key differentiator is that Kudu also attempts to serve as a datastore for OLTP workloads, something that Hudi does not aspire to be. +Consequently, Kudu does not support incremental pulling (as of early 2017), something Hudi does to enable incremental processing use cases. 
+ + +Kudu diverges from a distributed file system abstraction and HDFS altogether, with its own set of storage servers talking to each other via RAFT. +Hudi, on the other hand, is designed to work with an underlying Hadoop compatible filesystem (HDFS, S3 or Ceph) and does not have its own fleet of storage servers, +instead relying on Apache Spark to do the heavy-lifting. Thus, Hudi can be scaled easily, just like other Spark jobs, while Kudu would require hardware +& operational support, typical to datastores like HBase or Vertica. We have not, at this point, done any head to head benchmarks against Kudu (given RTTable is WIP). +But, if we were to go with results shared by [CERN](https://db-blog.web.cern.ch/blog/zbigniew-baranowski/2017-01-performance-comparison-different-file-formats-and-storage-engines), +we expect Hudi to be positioned as something that ingests parquet with superior performance. + + +## Hive Transactions + +[Hive Transactions/ACID](https://cwiki.apache.org/confluence/display/Hive/Hive+Transactions) is another similar effort, which tries to implement storage like +`merge-on-read`, on top of ORC file format. Understandably, this feature is heavily tied to Hive and other efforts like [LLAP](https://cwiki.apache.org/confluence/display/Hive/LLAP). +Hive transactions do not offer the read-optimized storage option or the incremental pulling, that Hudi does. In terms of implementation choices, Hudi leverages +the full power of a processing framework like Spark, while Hive transactions feature is implemented underneath by Hive tasks/queries kicked off by user or the Hive metastore. +Based on our production experience, embedding Hudi as a library into existing Spark pipelines was much easier and less operationally heavy, compared with the other approach. +Hudi is also designed to work with non-hive engines like PrestoDB/Spark and will incorporate file formats other than parquet over time.
+ +## HBase + +Even though [HBase](https://hbase.apache.org) is ultimately a key-value store for OLTP workloads, users often tend to associate HBase with analytics given the proximity to Hadoop. +Given HBase is heavily write-optimized, it supports sub-second upserts out-of-box and Hive-on-HBase lets users query that data. However, in terms of actual performance for analytical workloads, +hybrid columnar storage formats like Parquet/ORC handily beat HBase, since these workloads are predominantly read-heavy. Hudi bridges this gap between faster data and having +analytical storage formats. From an operational perspective, arming users with a library that provides faster data, is more scalable, than managing a big farm of HBase region servers, +just for analytics. Finally, HBase does not support incremental processing primitives like `commit times`, `incremental pull` as first class citizens like Hudi. + +## Stream Processing + +A popular question, we get is : "How does Hudi relate to stream processing systems?", which we will try to answer here. Simply put, Hudi can integrate with +batch (`copy-on-write table`) and streaming (`merge-on-read table`) jobs of today, to store the computed results in Hadoop. For Spark apps, this can happen via direct +integration of Hudi library with Spark/Spark streaming DAGs. In case of Non-Spark processing systems (eg: Flink, Hive), the processing can be done in the respective systems +and later sent into a Hudi table via a Kafka topic/DFS intermediate file. In more conceptual level, data processing +pipelines just consist of three components : `source`, `processing`, `sink`, with users ultimately running queries against the sink to use the results of the pipeline. +Hudi can act as either a source or sink, that stores data on DFS. Applicability of Hudi to a given stream processing pipeline ultimately boils down to suitability +of PrestoDB/SparkSQL/Hive for your queries. 
+ +More advanced use cases revolve around the concepts of [incremental processing](https://www.oreilly.com/ideas/ubers-case-for-incremental-processing-on-hadoop), which effectively +uses Hudi even inside the `processing` engine to speed up typical batch pipelines. For e.g: Hudi can be used as a state store inside a processing DAG (similar +to how [rocksDB](https://ci.apache.org/projects/flink/flink-docs-release-1.2/ops/state_backends#the-rocksdbstatebackend) is used by Flink). This is an item on the roadmap +and will eventually happen as a [Beam Runner](https://issues.apache.org/jira/browse/HUDI-60) diff --git a/website/versioned_docs/version-1.0.0/concepts.md b/website/versioned_docs/version-1.0.0/concepts.md new file mode 100644 index 0000000000000..8d0adf8dd5a1b --- /dev/null +++ b/website/versioned_docs/version-1.0.0/concepts.md @@ -0,0 +1,172 @@ +--- +version: 0.6.0 +title: "Concepts" +keywords: [ hudi, design, table, queries, timeline] +summary: "Here we introduce some basic concepts & give a broad technical overview of Hudi" +toc: true +last_modified_at: 2019-12-30T15:59:57-04:00 +--- + +Apache Hudi (pronounced “Hudi”) provides the following streaming primitives over hadoop compatible storages + + * Update/Delete Records (how do I change records in a table?) + * Change Streams (how do I fetch records that changed?) + +In this section, we will discuss key concepts & terminologies that are important to understand, to be able to effectively use these primitives. + +## Timeline +At its core, Hudi maintains a `timeline` of all actions performed on the table at different `instants` of time that helps provide instantaneous views of the table, +while also efficiently supporting retrieval of data in the order of arrival. 
A Hudi instant consists of the following components + + * `Instant action` : Type of action performed on the table + * `Instant time` : Instant time is typically a timestamp (e.g: 20190117010349), which monotonically increases in the order of action's begin time. + * `state` : current state of the instant + +Hudi guarantees that the actions performed on the timeline are atomic & timeline consistent based on the instant time. + +Key actions performed include + + * `COMMITS` - A commit denotes an **atomic write** of a batch of records into a table. + * `CLEANS` - Background activity that gets rid of older versions of files in the table, that are no longer needed. + * `DELTA_COMMIT` - A delta commit refers to an **atomic write** of a batch of records into a MergeOnRead type table, where some/all of the data could be just written to delta logs. + * `COMPACTION` - Background activity to reconcile differential data structures within Hudi e.g: moving updates from row based log files to columnar formats. Internally, compaction manifests as a special commit on the timeline + * `ROLLBACK` - Indicates that a commit/delta commit was unsuccessful & rolled back, removing any partial files produced during such a write + * `SAVEPOINT` - Marks certain file groups as "saved", such that cleaner will not delete them. It helps restore the table to a point on the timeline, in case of disaster/data recovery scenarios. + +Any given instant can be +in one of the following states + + * `REQUESTED` - Denotes an action has been scheduled, but has not initiated + * `INFLIGHT` - Denotes that the action is currently being performed + * `COMPLETED` - Denotes completion of an action on the timeline + +
+ hudi_timeline.png +
+ +Example above shows upserts happenings between 10:00 and 10:20 on a Hudi table, roughly every 5 mins, leaving commit metadata on the Hudi timeline, along +with other background cleaning/compactions. One key observation to make is that the commit time indicates the `arrival time` of the data (10:20AM), while the actual data +organization reflects the actual time or `event time`, the data was intended for (hourly buckets from 07:00). These are two key concepts when reasoning about tradeoffs between latency and completeness of data. + +When there is late arriving data (data intended for 9:00 arriving >1 hr late at 10:20), we can see the upsert producing new data into even older time buckets/folders. +With the help of the timeline, an incremental query attempting to get all new data that was committed successfully since 10:00 hours, is able to very efficiently consume +only the changed files without say scanning all the time buckets > 07:00. + +## File management +Hudi organizes a table into a directory structure under a `basepath` on DFS. Table is broken up into partitions, which are folders containing data files for that partition, +very similar to Hive tables. Each partition is uniquely identified by its `partitionpath`, which is relative to the basepath. + +Within each partition, files are organized into `file groups`, uniquely identified by a `file id`. Each file group contains several +`file slices`, where each slice contains a base file (`*.parquet`) produced at a certain commit/compaction instant time, + along with set of log files (`*.log.*`) that contain inserts/updates to the base file since the base file was produced. +Hudi adopts a MVCC design, where compaction action merges logs and base files to produce new file slices and cleaning action gets rid of +unused/older file slices to reclaim space on DFS. 
+ +## Index +Hudi provides efficient upserts, by mapping a given hoodie key (record key + partition path) consistently to a file id, via an indexing mechanism. +This mapping between record key and file group/file id, never changes once the first version of a record has been written to a file. In short, the +mapped file group contains all versions of a group of records. + +## Table Types & Queries +Hudi table types define how data is indexed & laid out on the DFS and how the above primitives and timeline activities are implemented on top of such organization (i.e how data is written). +In turn, `query types` define how the underlying data is exposed to the queries (i.e how data is read). + +| Table Type | Supported Query types | +|-------------- |------------------| +| Copy On Write | Snapshot Queries + Incremental Queries | +| Merge On Read | Snapshot Queries + Incremental Queries + Read Optimized Queries | + +### Table Types +Hudi supports the following table types. + + - [Copy On Write](#copy-on-write-table) : Stores data using exclusively columnar file formats (e.g parquet). Updates simply version & rewrite the files by performing a synchronous merge during write. + - [Merge On Read](#merge-on-read-table) : Stores data using a combination of columnar (e.g parquet) + row based (e.g avro) file formats. Updates are logged to delta files & later compacted to produce new versions of columnar files synchronously or asynchronously. 
+ +Following table summarizes the trade-offs between these two table types + +| Trade-off | CopyOnWrite | MergeOnRead | +|-------------- |------------------| ------------------| +| Data Latency | Higher | Lower | +| Update cost (I/O) | Higher (rewrite entire parquet) | Lower (append to delta log) | +| Parquet File Size | Smaller (high update(I/0) cost) | Larger (low update cost) | +| Write Amplification | Higher | Lower (depending on compaction strategy) | + + +### Query types +Hudi supports the following query types + + - **Snapshot Queries** : Queries see the latest snapshot of the table as of a given commit or compaction action. In case of merge on read table, it exposes near-real time data(few mins) by merging + the base and delta files of the latest file slice on-the-fly. For copy on write table, it provides a drop-in replacement for existing parquet tables, while providing upsert/delete and other write side features. + - **Incremental Queries** : Queries only see new data written to the table, since a given commit/compaction. This effectively provides change streams to enable incremental data pipelines. + - **Read Optimized Queries** : Queries see the latest snapshot of table as of a given commit/compaction action. Exposes only the base/columnar files in latest file slices and guarantees the + same columnar query performance compared to a non-hudi columnar table. + +Following table summarizes the trade-offs between the different query types. + +| Trade-off | Snapshot | Read Optimized | +|-------------- |-------------| ------------------| +| Data Latency | Lower | Higher +| Query Latency | Higher (merge base / columnar file + row based delta / log files) | Lower (raw base / columnar file performance) + + +## Copy On Write Table + +File slices in Copy-On-Write table only contain the base/columnar file and each commit produces new versions of base files. +In other words, we implicitly compact on every commit, such that only columnar data exists. 
As a result, the write amplification +(number of bytes written for 1 byte of incoming data) is much higher, whereas read amplification is zero. +This is a much desired property for analytical workloads, which are predominantly read-heavy. + +The following illustrates how this works conceptually, when data is written into a copy-on-write table and two queries are running on top of it. + + +
+ hudi_cow.png +
+ + +As data gets written, updates to existing file groups produce a new slice for that file group stamped with the commit instant time, +while inserts allocate a new file group and write its first slice for that file group. These file slices and their commit instant times are color coded above. +SQL queries running against such a table (eg: `select count(*)` counting the total records in that partition), first checks the timeline for the latest commit +and filters all but latest file slices of each file group. As you can see, an old query does not see the current inflight commit's files color coded in pink, +but a new query starting after the commit picks up the new data. Thus queries are immune to any write failures/partial writes and only run on committed data. + +The intention of copy on write table, is to fundamentally improve how tables are managed today through + + - First class support for atomically updating data at file-level, instead of rewriting whole tables/partitions + - Ability to incremental consume changes, as opposed to wasteful scans or fumbling with heuristics + - Tight control of file sizes to keep query performance excellent (small files hurt query performance considerably). + + +## Merge On Read Table + +Merge on read table is a superset of copy on write, in the sense it still supports read optimized queries of the table by exposing only the base/columnar files in latest file slices. +Additionally, it stores incoming upserts for each file group, onto a row based delta log, to support snapshot queries by applying the delta log, +onto the latest version of each file id on-the-fly during query time. Thus, this table type attempts to balance read and write amplification intelligently, to provide near real-time data. 
+The most significant change here, would be to the compactor, which now carefully chooses which delta log files need to be compacted onto +their columnar base file, to keep the query performance in check (larger delta log files would incur longer merge times with merge data on query side) + +Following illustrates how the table works, and shows two types of queries - snapshot query and read optimized query. + +
+ hudi_mor.png +
+ +There are lot of interesting things happening in this example, which bring out the subtleties in the approach. + + - We now have commits every 1 minute or so, something we could not do in the other table type. + - Within each file id group, now there is an delta log file, which holds incoming updates to the records already present in the base columnar files. In the example, the delta log files hold + all the data from 10:05 to 10:10. The base columnar files are still versioned with the commit, as before. + Thus, if one were to simply look at base files alone, then the table layout looks exactly like a copy on write table. + - A periodic compaction process reconciles these changes from the delta log and produces a new version of base file, just like what happened at 10:05 in the example. + - There are two ways of querying the same underlying table: Read Optimized query and Snapshot query, depending on whether we chose query performance or freshness of data. + - The semantics around when data from a commit is available to a query changes in a subtle way for a read optimized query. Note, that such a query + running at 10:10, wont see data after 10:05 above, while a snapshot query always sees the freshest data. + - When we trigger compaction & what it decides to compact hold all the key to solving these hard problems. By implementing a compacting + strategy, where we aggressively compact the latest partitions compared to older partitions, we could ensure the read optimized queries see data + published within X minutes in a consistent fashion. + +The intention of merge on read table is to enable near real-time processing directly on top of DFS, as opposed to copying +data out to specialized systems, which may not be able to handle the data volume. 
There are also a few secondary side benefits to +this table such as reduced write amplification by avoiding synchronous merge of data, i.e, the amount of data written per 1 bytes of data in a batch + + diff --git a/website/versioned_docs/version-1.0.0/concurrency_control.md b/website/versioned_docs/version-1.0.0/concurrency_control.md new file mode 100644 index 0000000000000..549f1ddd17eb1 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/concurrency_control.md @@ -0,0 +1,339 @@ +--- +title: "Concurrency Control" +summary: In this page, we will discuss how to perform concurrent writes to Hudi Tables. +toc: true +toc_min_heading_level: 2 +toc_max_heading_level: 4 +last_modified_at: 2021-03-19T15:59:57-04:00 +--- +Concurrency control defines how different writers/readers/table services coordinate access to a Hudi table. Hudi ensures atomic writes, by way of publishing commits atomically to the timeline, +stamped with an instant time that denotes the time at which the action is deemed to have occurred. Unlike general purpose file version control, Hudi draws clear distinction between +writer processes that issue [write operations](write_operations) and table services that (re)write data/metadata to optimize/perform bookkeeping and +readers (that execute queries and read data). + +Hudi provides +* **Snapshot isolation** between all three types of processes, meaning they all operate on a consistent snapshot of the table. +* **Optimistic concurrency control (OCC)** between writers to provide standard relational database semantics. +* **Multiversion Concurrency Control (MVCC)** based concurrency control between writers and table-services and between different table services. +* **Non-blocking Concurrency Control (NBCC)** between writers, to provide streaming semantics and avoiding live-locks/starvation between writers. 
+ +In this section, we will discuss the different concurrency controls supported by Hudi and how they are leveraged to provide flexible deployment models for single and multiple writer scenarios. +We’ll also describe ways to ingest data into a Hudi Table from multiple writers using different writers, like Hudi Streamer, Hudi datasource, Spark Structured Streaming and Spark SQL. + +:::note +If there is only one process performing writing AND async/inline table services on the table, you can +avoid the overhead of a distributed lock requirement by configuring the in process lock provider. + +```properties +hoodie.write.lock.provider=org.apache.hudi.client.transaction.lock.InProcessLockProvider +``` +::: + +## Distributed Locking +A pre-requisite for distributed co-ordination in Hudi, like many other distributed database systems is a distributed lock provider, that different processes can use to plan, schedule and +execute actions on the Hudi timeline in a concurrent fashion. Locks are also used to [generate TrueTime](timeline#truetime-generation), as discussed before. + +External locking is typically used in conjunction with optimistic concurrency control +because it provides a way to prevent conflicts that might occur when two or more transactions (commits in our case) attempt to modify the same resource concurrently. +When a transaction attempts to modify a resource that is currently locked by another transaction, it must wait until the lock is released before proceeding. + +In case of multi-writing in Hudi, the locks are acquired on the Hudi table for a very short duration during specific phases (such as just before committing the writes or before scheduling table services) instead of locking for the entire span of time. This approach allows multiple writers to work on the same table simultaneously, increasing concurrency and avoids conflicts. + +There are 4 different lock providers that require different configurations to be set. 
Please refer to comprehensive locking configs [here](https://hudi.apache.org/docs/next/configurations#LOCK). + +#### Zookeeper based +``` +hoodie.write.lock.provider=org.apache.hudi.client.transaction.lock.ZookeeperBasedLockProvider +``` +Following are the basic configs required to setup this lock provider: + +| Config Name| Default| Description | +| ---------------------------------------------------------------------------- | ------------------------ |------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| hoodie.write.lock.zookeeper.base_path | N/A **(Required)** | The base path on Zookeeper under which to create lock related ZNodes. This should be same for all concurrent writers to the same table

`Config Param: ZK_BASE_PATH`
`Since Version: 0.8.0` | +| hoodie.write.lock.zookeeper.port | N/A **(Required)** | Zookeeper port to connect to.

`Config Param: ZK_PORT`
`Since Version: 0.8.0` | +| hoodie.write.lock.zookeeper.url | N/A **(Required)** | Zookeeper URL to connect to.

`Config Param: ZK_CONNECT_URL`
`Since Version: 0.8.0` | + +#### HiveMetastore based + +``` +hoodie.write.lock.provider=org.apache.hudi.hive.transaction.lock.HiveMetastoreBasedLockProvider +``` +Following are the basic configs required to setup this lock provider: + +| Config Name| Default| Description | +| ----------------------------------------------------------------------- | ------------------------ |------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| hoodie.write.lock.hivemetastore.database | N/A **(Required)** | For Hive based lock provider, the Hive database to acquire lock against

`Config Param: HIVE_DATABASE_NAME`
`Since Version: 0.8.0` | +| hoodie.write.lock.hivemetastore.table | N/A **(Required)** | For Hive based lock provider, the Hive table to acquire lock against

`Config Param: HIVE_TABLE_NAME`
`Since Version: 0.8.0` | + +`The HiveMetastore URI's are picked up from the hadoop configuration file loaded during runtime.` + +#### Amazon DynamoDB based +``` +hoodie.write.lock.provider=org.apache.hudi.aws.transaction.lock.DynamoDBBasedLockProvider +``` +Amazon DynamoDB based lock provides a simple way to support multi writing across different clusters. You can refer to the +[DynamoDB based Locks Configurations](https://hudi.apache.org/docs/configurations#DynamoDB-based-Locks-Configurations) +section for the details of each related configuration knob. Following are the basic configs required to setup this lock provider: + +| Config Name| Default| Description | +| ----------------------------------------------------------------------- | ------------------------ |------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| hoodie.write.lock.dynamodb.endpoint_url| N/A **(Required)** | For DynamoDB based lock provider, the url endpoint used for Amazon DynamoDB service. Useful for development with a local dynamodb instance.

`Config Param: DYNAMODB_ENDPOINT_URL`
`Since Version: 0.10.1`| + +For advanced configs refer [here](https://hudi.apache.org/docs/next/configurations#DynamoDB-based-Locks-Configurations) + + +When using the DynamoDB-based lock provider, the name of the DynamoDB table acting as the lock table for Hudi is +specified by the config `hoodie.write.lock.dynamodb.table`. This DynamoDB table is automatically created by Hudi, so you +don't have to create the table yourself. If you want to use an existing DynamoDB table, make sure that an attribute with +the name `key` is present in the table. The `key` attribute should be the partition key of the DynamoDB table. The +config `hoodie.write.lock.dynamodb.partition_key` specifies the value to put for the `key` attribute (not the attribute +name), which is used for the lock on the same table. By default, `hoodie.write.lock.dynamodb.partition_key` is set to +the table name, so that multiple writers writing to the same table share the same lock. If you customize the name, make +sure it's the same across multiple writers. + +Also, to set up the credentials for accessing AWS resources, customers can pass the following props to Hudi jobs: +``` +hoodie.aws.access.key +hoodie.aws.secret.key +hoodie.aws.session.token +``` +If not configured, Hudi falls back to use [DefaultAWSCredentialsProviderChain](https://docs.aws.amazon.com/AWSJavaSDK/latest/javadoc/com/amazonaws/auth/DefaultAWSCredentialsProviderChain.html). 
+ +IAM policy for your service instance will need to add the following permissions: + +```json +{ + "Sid":"DynamoDBLocksTable", + "Effect": "Allow", + "Action": [ + "dynamodb:CreateTable", + "dynamodb:DeleteItem", + "dynamodb:DescribeTable", + "dynamodb:GetItem", + "dynamodb:PutItem", + "dynamodb:Scan", + "dynamodb:UpdateItem" + ], + "Resource": "arn:${Partition}:dynamodb:${Region}:${Account}:table/${TableName}" +} +``` +- `TableName` : same as `hoodie.write.lock.dynamodb.table` +- `Region`: same as `hoodie.write.lock.dynamodb.region` + +AWS SDK dependencies are not bundled with Hudi from v0.10.x and will need to be added to your classpath. +Add the following Maven packages (check the latest versions at time of install): +``` +com.amazonaws:dynamodb-lock-client +com.amazonaws:aws-java-sdk-dynamodb +com.amazonaws:aws-java-sdk-core +``` + +#### FileSystem based (not for production use) + +FileSystem based lock provider supports multiple writers across different jobs/applications based on atomic create/delete operations of the underlying filesystem. + +``` +hoodie.write.lock.provider=org.apache.hudi.client.transaction.lock.FileSystemBasedLockProvider +``` + +When using the FileSystem based lock provider, by default, the lock file will be stored in `hoodie.base.path`+`/.hoodie/lock`. You may use a custom folder to store the lock file by specifying `hoodie.write.lock.filesystem.path`. + +In case the lock cannot be released during a job crash, you can set `hoodie.write.lock.filesystem.expire` (lock will never expire by default) to a desired expire time in minutes. You may also delete the lock file manually in such a situation. +:::note +FileSystem based lock provider is not supported with cloud storage like S3 or GCS. +::: + + +## Simple Single writer + table services + +Data lakehouse pipelines tend to be predominantly single writer, with the most common need for distributed co-ordination on a table coming from table management. For e.g.
an Apache Flink +job producing fast writes into a table, requiring regular file-size management or cleaning. Hudi's storage engine and platform tools provide a lot of support for such common scenarios. + +### Inline table services + +This is the simplest form of concurrency, meaning there is no concurrency at all in the write processes. In this model, Hudi eliminates the need for concurrency control and maximizes throughput by supporting these table services out-of-box and running inline after every write to the table. Execution plans are idempotent, persisted to the timeline and auto-recover from failures. For most simple use-cases, this means just writing is sufficient to get a well-managed table that needs no concurrency control. + +There is no actual concurrent writing in this model. **MVCC** is leveraged to provide snapshot isolation guarantees between ingestion writer and multiple readers and also between multiple table service writers and readers. Writes to the table either from ingestion or from table services produce versioned data that are available to readers only after the writes are committed. Until then, readers can access only the previous version of the data. + +A single writer with all table services such as cleaning, clustering, compaction, etc., can be configured to be inline (such as Hudi Streamer sync-once mode and Spark Datasource with default configs) without any additional configs. + +### Async table services + +Hudi provides the option of running the table services in an async fashion, where most of the heavy lifting (e.g., actually rewriting the columnar data by compaction service) is done asynchronously. In this model, the async deployment eliminates any repeated wasteful retries and optimizes the table using clustering techniques while a single writer consumes the writes to the table without having to be blocked by such table services.
This model avoids the need for taking an [external lock](#distributed-locking) to control concurrency and avoids the need to separately orchestrate and monitor offline table services jobs. + +A single writer along with async table services runs in the same process. For example, you can have a Hudi Streamer in continuous mode write to a MOR table using async compaction; you can use Spark Streaming (where [compaction](https://hudi.apache.org/docs/compaction) is async by default), and you can use Flink streaming or your own job setup and enable async table services inside the same writer. + +Hudi leverages **MVCC** in this model to support running any number of table service jobs concurrently, without any concurrency conflict. This is made possible by ensuring Hudi's ingestion writer and async table services coordinate among themselves to ensure no conflicts and avoid race conditions. The same single writer guarantees described for the inline table services model above can be achieved in this model as well. +With this model users don't need to spin up different spark jobs and manage the orchestration among them. For larger deployments, this model can ease the operational burden significantly while getting the table services running without blocking the writers. + +**Single Writer Guarantees** + +In this model, the following are the guarantees on [write operations](https://hudi.apache.org/docs/write_operations/) to expect: + +- *UPSERT Guarantee*: The target table will NEVER show duplicates. +- *INSERT Guarantee*: The target table will NEVER have duplicates if dedup: [`hoodie.datasource.write.insert.drop.duplicates`](https://hudi.apache.org/docs/configurations#hoodiedatasourcewriteinsertdropduplicates) & [`hoodie.combine.before.insert`](https://hudi.apache.org/docs/configurations/#hoodiecombinebeforeinsert), is enabled.
+- *BULK_INSERT Guarantee*: The target table will NEVER have duplicates if dedup: [`hoodie.datasource.write.insert.drop.duplicates`](https://hudi.apache.org/docs/configurations#hoodiedatasourcewriteinsertdropduplicates) & [`hoodie.combine.before.insert`](https://hudi.apache.org/docs/configurations/#hoodiecombinebeforeinsert), is enabled. +- *INCREMENTAL QUERY Guarantee*: Data consumption and checkpoints are NEVER out of order. + +## Full-on Multi-writer + Async table services + +Hudi has introduced a new concurrency mode `NON_BLOCKING_CONCURRENCY_CONTROL`, where unlike OCC, multiple writers can +operate on the table with non-blocking conflict resolution. The writers can write into the same file group with the +conflicts resolved automatically by the query reader and the compactor. The new concurrency mode is currently +available for preview in version 1.0.0-beta only. You can read more about it under section [Non-Blocking Concurrency Control](#non-blocking-concurrency-control). + +It is not always possible to serialize all write operations to a table (such as UPSERT, INSERT or DELETE) into the same write process and therefore, multi-writing capability may be required. +In multi-writing, disparate distributed processes run in parallel or overlapping time windows to write to the same table. In such cases, an external locking mechanism is a must to safely +coordinate concurrent accesses. Here are a few different scenarios that would all fall under multi-writing: + +- Multiple ingestion writers to the same table: For instance, two Spark Datasource writers working on different sets of partitions from a source Kafka topic. +- Multiple ingestion writers to the same table, including one writer with async table services: For example, a Hudi Streamer with async compaction for regular ingestion & a Spark Datasource writer for backfilling.
+- A single ingestion writer and a separate compaction (HoodieCompactor) or clustering (HoodieClusteringJob) job apart from the ingestion writer: This is considered as multi-writing as they are not running in the same process. + +Hudi's concurrency model intelligently differentiates actual writing to the table from table services that manage or optimize the table. Hudi offers similar **optimistic concurrency control across multiple writers**, but **table services can still execute completely lock-free and async** as long as they run in the same process as one of the writers. +For multi-writing, Hudi leverages file level optimistic concurrency control(OCC). For example, when two writers write to non overlapping files, both writes are allowed to succeed. However, when the writes from different writers overlap (touch the same set of files), only one of them will succeed. Please note that this feature is currently experimental and requires external lock providers to acquire locks briefly at critical sections during the write. More on lock providers below. + +#### Multi Writer Guarantees + +With multiple writers using OCC, these are the write guarantees to expect: + +- *UPSERT Guarantee*: The target table will NEVER show duplicates. +- *INSERT Guarantee*: The target table MIGHT have duplicates even if dedup is enabled. +- *BULK_INSERT Guarantee*: The target table MIGHT have duplicates even if dedup is enabled. +- *INCREMENTAL PULL Guarantee*: Data consumption and checkpoints are NEVER out of order. If there are inflight commits + (due to multi-writing), incremental queries will not expose the completed commits following the inflight commits. + +## Non-Blocking Concurrency Control + +`NON_BLOCKING_CONCURRENCY_CONTROL`, offers the same set of guarantees as mentioned in the case of OCC but without +explicit locks for serializing the writes. Lock is only needed for writing the commit metadata to the Hudi timeline. 
The +completion time for the commits reflects the serialization order and file slicing is done based on completion time. +Multiple writers can operate on the table with non-blocking conflict resolution. The writers can write into the same +file group with the conflicts resolved automatically by the query reader and the compactor. The new concurrency mode is +currently available for preview in version 1.0.0-beta only with the caveat that conflict resolution is not supported yet +between clustering and ingestion. It works for compaction and ingestion, and we can see an example of that with Flink +writers [here](sql_dml#non-blocking-concurrency-control-experimental). + +:::note +`NON_BLOCKING_CONCURRENCY_CONTROL` between ingestion writer and table service writer is not yet supported for clustering. +Please use `OPTIMISTIC_CONCURRENCY_CONTROL` for clustering. +::: + +## Early conflict Detection + +Multi writing using OCC allows multiple writers to concurrently write and atomically commit to the Hudi table if there is no overlapping data file to be written, to guarantee data consistency, integrity and correctness. Prior to the 0.13.0 release, as the OCC (optimistic concurrency control) name suggests, each writer will optimistically proceed with ingestion and towards the end, just before committing will go about conflict resolution flow to deduce overlapping writes and abort one if need be. But this could result in a lot of compute waste, since the aborted commit will have to retry from beginning. With 0.13.0, Hudi introduced early conflict detection leveraging markers in Hudi to deduce the conflicts eagerly and abort early in the write lifecycle instead of doing it in the end. For large scale deployments, this might avoid wasting a lot of compute resources if there could be overlapping concurrent writers.
+ +To improve the concurrency control, the [0.13.0 release](https://hudi.apache.org/releases/release-0.13.0#early-conflict-detection-for-multi-writer) introduced a new feature, early conflict detection in OCC, to detect the conflict during the data writing phase and abort the writing early on once a conflict is detected, using Hudi's marker mechanism. Hudi can now stop a conflicting writer much earlier because of the early conflict detection and release computing resources necessary to cluster, improving resource utilization. + +By default, this feature is turned off. To try this out, a user needs to set `hoodie.write.concurrency.early.conflict.detection.enable` to true, when using OCC for concurrency control (Refer [configs](https://hudi.apache.org/docs/next/configurations#Write-Configurations-advanced-configs) page for all relevant configs). +:::note +Early conflict Detection in OCC is an **EXPERIMENTAL** feature +::: + + +## Enabling Multi Writing + +The following properties are needed to be set appropriately to turn on optimistic concurrency control to achieve multi writing. + +``` +hoodie.write.concurrency.mode=optimistic_concurrency_control +hoodie.write.lock.provider= +hoodie.cleaner.policy.failed.writes=LAZY +``` + +| Config Name | Default | Description | +|-------------------------------------|-------------------------------------------------------------------------------|| +| hoodie.write.concurrency.mode | SINGLE_WRITER (Optional) | [Concurrency modes](https://github.com/apache/hudi/blob/00ece7bce0a4a8d0019721a28049723821e01842/hudi-common/src/main/java/org/apache/hudi/common/model/WriteConcurrencyMode.java) for write operations.
Possible values:
  • `SINGLE_WRITER`: Only one active writer to the table. Maximizes throughput.
  • `OPTIMISTIC_CONCURRENCY_CONTROL`: Multiple writers can operate on the table with lazy conflict resolution using locks. This means that only one writer succeeds if multiple writers write to the same file group.
  • `NON_BLOCKING_CONCURRENCY_CONTROL`: Multiple writers can operate on the table with non-blocking conflict resolution. The writers can write into the same file group with the conflicts resolved automatically by the query reader and the compactor.

`Config Param: WRITE_CONCURRENCY_MODE` | +| hoodie.write.lock.provider | org.apache.hudi.client.transaction.lock.ZookeeperBasedLockProvider (Optional) | Lock provider class name, user can provide their own implementation of LockProvider which should be subclass of org.apache.hudi.common.lock.LockProvider

`Config Param: LOCK_PROVIDER_CLASS_NAME`
`Since Version: 0.8.0` | +| hoodie.cleaner.policy.failed.writes | EAGER (Optional) | org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy: Policy that controls how to clean up failed writes. Hudi will delete any files written by failed writes to re-claim space. EAGER(default): Clean failed writes inline after every write operation. LAZY: Clean failed writes lazily after heartbeat timeout when the cleaning service runs. This policy is required when multi-writers are enabled. NEVER: Never clean failed writes.

`Config Param: FAILED_WRITES_CLEANER_POLICY` | + + +### Multi Writing via Hudi Streamer + +The `HoodieStreamer` utility (part of hudi-utilities-slim-bundle) provides ways to ingest from different sources such as DFS or Kafka, with the following capabilities. + +Using optimistic_concurrency_control via Hudi Streamer requires adding the above configs to the properties file that can be passed to the +job. For example below, adding the configs to kafka-source.properties file and passing them to Hudi Streamer will enable optimistic concurrency. +A Hudi Streamer job can then be triggered as follows: + +```java +[hoodie]$ spark-submit \ + --jars "packaging/hudi-utilities-slim-bundle/target/hudi-utilities-slim-bundle_2.12-1.0.0.jar,packaging/hudi-spark-bundle/target/hudi-spark3.5-bundle_2.12-1.0.0.jar" \ + --class org.apache.hudi.utilities.streamer.HoodieStreamer `ls packaging/hudi-utilities-slim-bundle/target/hudi-utilities-slim-bundle-*.jar` \ + --props file://${PWD}/hudi-utilities/src/test/resources/streamer-config/kafka-source.properties \ + --schemaprovider-class org.apache.hudi.utilities.schema.SchemaRegistryProvider \ + --source-class org.apache.hudi.utilities.sources.AvroKafkaSource \ + --source-ordering-field impresssiontime \ + --target-base-path file:\/\/\/tmp/hudi-streamer-op \ + --target-table tableName \ + --op BULK_INSERT +``` + +### Multi Writing via Spark Datasource Writer + +The `hudi-spark` module offers the DataSource API to write (and read) a Spark DataFrame into a Hudi table. 
+ +Following is an example of how to use optimistic_concurrency_control via spark datasource + +```java +inputDF.write.format("hudi") + .options(getQuickstartWriteConfigs) + .option("hoodie.datasource.write.precombine.field", "ts") + .option("hoodie.cleaner.policy.failed.writes", "LAZY") + .option("hoodie.write.concurrency.mode", "optimistic_concurrency_control") + .option("hoodie.write.lock.zookeeper.url", "zookeeper") + .option("hoodie.write.lock.zookeeper.port", "2181") + .option("hoodie.write.lock.zookeeper.base_path", "/test") + .option("hoodie.datasource.write.recordkey.field", "uuid") + .option("hoodie.datasource.write.partitionpath.field", "partitionpath") + .option("hoodie.table.name", tableName) + .mode(Overwrite) + .save(basePath) +``` + +## Disabling Multi Writing + +Remove the following settings that were used to enable multi-writer or override with default values. + +``` +hoodie.write.concurrency.mode=single_writer +hoodie.cleaner.policy.failed.writes=EAGER +``` + +## OCC Best Practices + +Concurrent Writing to Hudi tables requires acquiring a lock with one of the lock providers mentioned above. Due to several reasons you might want to configure retries to allow your application to acquire the lock. +1. Network connectivity or excessive load on servers increasing time for lock acquisition resulting in timeouts +2. Running a large number of concurrent jobs that are writing to the same Hudi table can result in contention during lock acquisition, which can cause timeouts +3. In some scenarios of conflict resolution, Hudi commit operations might take up to tens of seconds while the lock is being held. This can result in timeouts for other jobs waiting to acquire a lock. + +Set the correct native lock provider client retries. +:::note +Please note that sometimes these settings are set on the server once and all clients inherit the same configs. Please check your settings before enabling optimistic concurrency.
+::: + +``` +hoodie.write.lock.wait_time_ms +hoodie.write.lock.num_retries +``` + +Set the correct hudi client retries for Zookeeper & HiveMetastore. This is useful in cases when native client retry settings cannot be changed. Please note that these retries will happen in addition to any native client retries that you may have set. + +``` +hoodie.write.lock.client.wait_time_ms +hoodie.write.lock.client.num_retries +``` + +*Setting the right values for these depends on a case by case basis; some defaults have been provided for general cases.* + + +## Caveats + +If you are using the `WriteClient` API, please note that multiple writes to the table need to be initiated from 2 different instances of the write client. +It is **NOT** recommended to use the same instance of the write client to perform multi writing. + +## Related Resources +

Videos

+ +* [Hands on Lab with using DynamoDB as lock table for Apache Hudi Data Lakes](https://youtu.be/JP0orl9_0yQ) +* [Non Blocking Concurrency Control Flink Demo](/blog/2024/12/06/non-blocking-concurrency-control) \ No newline at end of file diff --git a/website/versioned_docs/version-1.0.0/configurations.md b/website/versioned_docs/version-1.0.0/configurations.md new file mode 100644 index 0000000000000..0758147e0df0b --- /dev/null +++ b/website/versioned_docs/version-1.0.0/configurations.md @@ -0,0 +1,2287 @@ +--- +title: All Configurations +keywords: [ configurations, default, flink options, spark, configs, parameters ] +permalink: /docs/configurations.html +summary: This page covers the different ways of configuring your job to write/read Hudi tables. At a high level, you can control behaviour at few levels. +toc_min_heading_level: 2 +toc_max_heading_level: 4 +last_modified_at: 2024-12-06T17:38:05.818 +--- + + +This page covers the different ways of configuring your job to write/read Hudi tables. At a high level, you can control behaviour at few levels. + +- [**Hudi Table Config**](#TABLE_CONFIG): Basic Hudi Table configuration parameters. +- [**Environment Config**](#ENVIRONMENT_CONFIG): Hudi supports passing configurations via a configuration file `hudi-defaults.conf` in which each line consists of a key and a value separated by whitespace or = sign. For example: +``` +hoodie.datasource.hive_sync.mode jdbc +hoodie.datasource.hive_sync.jdbcurl jdbc:hive2://localhost:10000 +hoodie.datasource.hive_sync.support_timestamp false +``` +It helps to have a central configuration file for your common cross job configurations/tunings, so all the jobs on your cluster can utilize it. It also works with Spark SQL DML/DDL, and helps avoid having to pass configs inside the SQL statements. + +Hudi always loads the configuration file under default directory `file:/etc/hudi/conf`, if exists, to set the default configs. 
Besides, you can specify another configuration directory location by setting the `HUDI_CONF_DIR` environment variable. The configs stored in `HUDI_CONF_DIR/hudi-defaults.conf` are loaded, overriding any configs already set by the config file in the default directory. +- [**Spark Datasource Configs**](#SPARK_DATASOURCE): These configs control the Hudi Spark Datasource, providing ability to define keys/partitioning, pick out the write operation, specify how to merge records or choose query type to read. +- [**Flink Sql Configs**](#FLINK_SQL): These configs control the Hudi Flink SQL source/sink connectors, providing ability to define record keys, pick out the write operation, specify how to merge records, enable/disable asynchronous compaction or choose query type to read. +- [**Write Client Configs**](#WRITE_CLIENT): Internally, the Hudi datasource uses a RDD based HoodieWriteClient API to actually perform writes to storage. These configs provide deep control over lower level aspects like file sizing, compression, parallelism, compaction, write schema, cleaning etc. Although Hudi provides sane defaults, from time to time these configs may need to be tweaked to optimize for specific workloads. +- [**Reader Configs**](#READER): Please fill in the description for Config Group Name: Reader Configs +- [**Metastore and Catalog Sync Configs**](#META_SYNC): Configurations used by Hudi to sync metadata to external metastores and catalogs. +- [**Metrics Configs**](#METRICS): This set of configs is used to enable monitoring and reporting of key Hudi stats and metrics. +- [**Record Payload Config**](#RECORD_PAYLOAD): This is the lowest level of customization offered by Hudi. Record payloads define how to produce new values to upsert based on incoming new record and stored old record. Hudi provides default implementations such as OverwriteWithLatestAvroPayload which simply updates the table with the latest/last-written record.
This can be overridden to a custom class extending HoodieRecordPayload class, on both datasource and WriteClient levels. +- [**Kafka Connect Configs**](#KAFKA_CONNECT): These set of configs are used for Kafka Connect Sink Connector for writing Hudi Tables +- [**Amazon Web Services Configs**](#AWS): Configurations specific to Amazon Web Services. +- [**Hudi Streamer Configs**](#HUDI_STREAMER): These set of configs are used for Hudi Streamer utility which provides the way to ingest from different sources such as DFS or Kafka. + +:::note +In the tables below **(N/A)** means there is no default value set +::: + +## Externalized Config File +Instead of directly passing configuration settings to every Hudi job, you can also centrally set them in a configuration +file `hudi-defaults.conf`. By default, Hudi would load the configuration file under `/etc/hudi/conf` directory. You can +specify a different configuration directory location by setting the `HUDI_CONF_DIR` environment variable. This can be +useful for uniformly enforcing repeated configs (like Hive sync or write/index tuning), across your entire data lake. + +## Hudi Table Config {#TABLE_CONFIG} +Basic Hudi Table configuration parameters. + + +### Hudi Table Basic Configs {#Hudi-Table-Basic-Configs} +Configurations of the Hudi Table like type of ingestion, storage formats, hive table name etc. Configurations are loaded from hoodie.properties, these properties are usually set during initializing a path as hoodie base path and never changes during the lifetime of a hoodie table. + + + +[**Basic Configs**](#Hudi-Table-Basic-Configs-basic-configs) + + +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------------------ | ----------------------------------------------------------------- || +| [hoodie.bootstrap.base.path](#hoodiebootstrapbasepath) | (N/A) | Base path of the dataset that needs to be bootstrapped as a Hudi table
`Config Param: BOOTSTRAP_BASE_PATH` | +| [hoodie.compaction.payload.class](#hoodiecompactionpayloadclass) | (N/A) | Payload class to use for performing merges, compactions, i.e merge delta logs with current base file and then produce a new base file.
`Config Param: PAYLOAD_CLASS_NAME` | +| [hoodie.database.name](#hoodiedatabasename) | (N/A) | Database name. If different databases have the same table name during incremental query, we can set it to limit the table name under a specific database
`Config Param: DATABASE_NAME` | +| [hoodie.record.merge.strategy.id](#hoodierecordmergestrategyid) | (N/A) | Id of merger strategy. Hudi will pick HoodieRecordMerger implementations in `hoodie.write.record.merge.custom.implementation.classes` which has the same merger strategy id
`Config Param: RECORD_MERGE_STRATEGY_ID`
`Since Version: 0.13.0` | +| [hoodie.table.checksum](#hoodietablechecksum) | (N/A) | Table checksum is used to guard against partial writes in HDFS. It is added as the last entry in hoodie.properties and then used to validate while reading table config.
`Config Param: TABLE_CHECKSUM`
`Since Version: 0.11.0` | +| [hoodie.table.create.schema](#hoodietablecreateschema) | (N/A) | Schema used when creating the table
`Config Param: CREATE_SCHEMA` | +| [hoodie.table.index.defs.path](#hoodietableindexdefspath) | (N/A) | Relative path to table base path where the index definitions are stored
`Config Param: RELATIVE_INDEX_DEFINITION_PATH`
`Since Version: 1.0.0` | +| [hoodie.table.keygenerator.class](#hoodietablekeygeneratorclass) | (N/A) | Key Generator class property for the hoodie table
`Config Param: KEY_GENERATOR_CLASS_NAME` | +| [hoodie.table.keygenerator.type](#hoodietablekeygeneratortype) | (N/A) | Key Generator type to determine key generator class
`Config Param: KEY_GENERATOR_TYPE`
`Since Version: 1.0.0` | +| [hoodie.table.metadata.partitions](#hoodietablemetadatapartitions) | (N/A) | Comma-separated list of metadata partitions that have been completely built and in-sync with data table. These partitions are ready for use by the readers
`Config Param: TABLE_METADATA_PARTITIONS`
`Since Version: 0.11.0` | +| [hoodie.table.metadata.partitions.inflight](#hoodietablemetadatapartitionsinflight) | (N/A) | Comma-separated list of metadata partitions whose building is in progress. These partitions are not yet ready for use by the readers.
`Config Param: TABLE_METADATA_PARTITIONS_INFLIGHT`
`Since Version: 0.11.0` | +| [hoodie.table.name](#hoodietablename) | (N/A) | Table name that will be used for registering with Hive. Needs to be same across runs.
`Config Param: NAME` | +| [hoodie.table.partition.fields](#hoodietablepartitionfields) | (N/A) | Comma separated field names used to partition the table. These field names also include the partition type which is used by custom key generators
`Config Param: PARTITION_FIELDS` | +| [hoodie.table.precombine.field](#hoodietableprecombinefield) | (N/A) | Field used in preCombining before actual write. By default, when two records have the same key value, the largest value for the precombine field determined by Object.compareTo(..), is picked.
`Config Param: PRECOMBINE_FIELD` | +| [hoodie.table.recordkey.fields](#hoodietablerecordkeyfields) | (N/A) | Columns used to uniquely identify the table. Concatenated values of these fields are used as the record key component of HoodieKey.
`Config Param: RECORDKEY_FIELDS` | +| [hoodie.table.secondary.indexes.metadata](#hoodietablesecondaryindexesmetadata) | (N/A) | The metadata of secondary indexes
`Config Param: SECONDARY_INDEXES_METADATA`
`Since Version: 0.13.0` | +| [hoodie.timeline.layout.version](#hoodietimelinelayoutversion) | (N/A) | Version of timeline used, by the table.
`Config Param: TIMELINE_LAYOUT_VERSION` | +| [hoodie.archivelog.folder](#hoodiearchivelogfolder) | archived | path under the meta folder, to store archived timeline instants at.
`Config Param: ARCHIVELOG_FOLDER` | +| [hoodie.bootstrap.index.class](#hoodiebootstrapindexclass) | org.apache.hudi.common.bootstrap.index.hfile.HFileBootstrapIndex | Implementation to use, for mapping base files to bootstrap base file, that contain actual data.
`Config Param: BOOTSTRAP_INDEX_CLASS_NAME` | +| [hoodie.bootstrap.index.enable](#hoodiebootstrapindexenable) | true | Whether or not, this is a bootstrapped table, with bootstrap base data and a mapping index defined, default true.
`Config Param: BOOTSTRAP_INDEX_ENABLE` | +| [hoodie.bootstrap.index.type](#hoodiebootstrapindextype) | HFILE | Bootstrap index type determines which implementation to use, for mapping base files to bootstrap base file, that contain actual data.
`Config Param: BOOTSTRAP_INDEX_TYPE`
`Since Version: 1.0.0` | +| [hoodie.datasource.write.hive_style_partitioning](#hoodiedatasourcewritehive_style_partitioning) | false | Flag to indicate whether to use Hive style partitioning. If set true, the names of partition folders follow <partition_column_name>=<partition_value> format. By default false (the names of partition folders are only partition values)
`Config Param: HIVE_STYLE_PARTITIONING_ENABLE` | +| [hoodie.partition.metafile.use.base.format](#hoodiepartitionmetafileusebaseformat) | false | If true, partition metafiles are saved in the same format as base-files for this dataset (e.g. Parquet / ORC). If false (default) partition metafiles are saved as properties files.
`Config Param: PARTITION_METAFILE_USE_BASE_FORMAT` | +| [hoodie.populate.meta.fields](#hoodiepopulatemetafields) | true | When enabled, populates all meta fields. When disabled, no meta fields are populated and incremental queries will not be functional. This is only meant to be used for append only/immutable data for batch processing
`Config Param: POPULATE_META_FIELDS` | +| [hoodie.record.merge.mode](#hoodierecordmergemode) | EVENT_TIME_ORDERING | org.apache.hudi.common.config.RecordMergeMode: Determines the logic of merging updates COMMIT_TIME_ORDERING: Using transaction time to merge records, i.e., the record from later transaction overwrites the earlier record with the same key. EVENT_TIME_ORDERING(default): Using event time as the ordering to merge records, i.e., the record with the larger event time overwrites the record with the smaller event time on the same key, regardless of transaction time. The event time or preCombine field needs to be specified by the user. CUSTOM: Using custom merging logic specified by the user.
`Config Param: RECORD_MERGE_MODE`
`Since Version: 1.0.0` | +| [hoodie.table.base.file.format](#hoodietablebasefileformat) | PARQUET | Base file format to store all the base file data.
`Config Param: BASE_FILE_FORMAT` | +| [hoodie.table.cdc.enabled](#hoodietablecdcenabled) | false | When enabled, persists the change data if necessary, so that the table can be queried in CDC query mode.
`Config Param: CDC_ENABLED`
`Since Version: 0.13.0` | +| [hoodie.table.cdc.supplemental.logging.mode](#hoodietablecdcsupplementalloggingmode) | DATA_BEFORE_AFTER | org.apache.hudi.common.table.cdc.HoodieCDCSupplementalLoggingMode: Change log capture supplemental logging mode. The supplemental log is used for accelerating the generation of change log details. OP_KEY_ONLY: Only keeping record keys in the supplemental logs, so the reader needs to figure out the update before image and after image. DATA_BEFORE: Keeping the before images in the supplemental logs, so the reader needs to figure out the update after images. DATA_BEFORE_AFTER(default): Keeping the before and after images in the supplemental logs, so the reader can generate the details directly from the logs.
`Config Param: CDC_SUPPLEMENTAL_LOGGING_MODE`
`Since Version: 0.13.0` | +| [hoodie.table.initial.version](#hoodietableinitialversion) | EIGHT | Initial Version of table when the table was created. Used for upgrade/downgrade to identify what upgrade/downgrade paths happened on the table. This is only configured when the table is initially setup.
`Config Param: INITIAL_VERSION`
`Since Version: 1.0.0` | +| [hoodie.table.log.file.format](#hoodietablelogfileformat) | HOODIE_LOG | Log format used for the delta logs.
`Config Param: LOG_FILE_FORMAT` | +| [hoodie.table.multiple.base.file.formats.enable](#hoodietablemultiplebasefileformatsenable) | false | When set to true, the table can support reading and writing multiple base file formats.
`Config Param: MULTIPLE_BASE_FILE_FORMATS_ENABLE`
`Since Version: 1.0.0` | +| [hoodie.table.timeline.timezone](#hoodietabletimelinetimezone) | LOCAL | User can set hoodie commit timeline timezone, such as utc, local and so on. local is default
`Config Param: TIMELINE_TIMEZONE` | +| [hoodie.table.type](#hoodietabletype) | COPY_ON_WRITE | The table type for the underlying data.
`Config Param: TYPE` | +| [hoodie.table.version](#hoodietableversion) | EIGHT | Version of table, used for running upgrade/downgrade steps between releases with potentially breaking/backwards compatible changes.
`Config Param: VERSION` | +| [hoodie.timeline.history.path](#hoodietimelinehistorypath) | history | path under the meta folder, to store timeline history at.
`Config Param: TIMELINE_HISTORY_PATH` | +| [hoodie.timeline.path](#hoodietimelinepath) | timeline | path under the meta folder, to store timeline instants at.
`Config Param: TIMELINE_PATH` | + +[**Advanced Configs**](#Hudi-Table-Basic-Configs-advanced-configs) + + +| Config Name | Default | Description | +| ----------------------------------------------------------------------------------------------- | ------- | --------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.datasource.write.drop.partition.columns](#hoodiedatasourcewritedroppartitioncolumns) | false | When set to true, will not write the partition columns into hudi. By default, false.
`Config Param: DROP_PARTITION_COLUMNS` | +| [hoodie.datasource.write.partitionpath.urlencode](#hoodiedatasourcewritepartitionpathurlencode) | false | Should we url encode the partition path value, before creating the folder structure.
`Config Param: URL_ENCODE_PARTITIONING` | +--- + +## Spark Datasource Configs {#SPARK_DATASOURCE} +These configs control the Hudi Spark Datasource, providing ability to define keys/partitioning, pick out the write operation, specify how to merge records or choosing query type to read. + + +### Read Options {#Read-Options} +Options useful for reading tables via `read.format.option(...)` + + + + +[**Basic Configs**](#Read-Options-basic-configs) + + +| Config Name | Default | Description | +| -------------------------------------------------------------------------------------------- | --------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| [hoodie.datasource.read.begin.instanttime](#hoodiedatasourcereadbegininstanttime) | (N/A) | Required when `hoodie.datasource.query.type` is set to `incremental`. Represents the completion time to start incrementally pulling data from. The completion time here need not necessarily correspond to an instant on the timeline. New data written with completion_time >= START_COMMIT are fetched out. For e.g: ‘20170901080000’ will get all new data written on or after Sep 1, 2017 08:00AM.
`Config Param: START_COMMIT` | +| [hoodie.datasource.read.end.instanttime](#hoodiedatasourcereadendinstanttime) | (N/A) | Used when `hoodie.datasource.query.type` is set to `incremental`. Represents the completion time to limit incrementally fetched data to. When not specified latest commit completion time from timeline is assumed by default. When specified, new data written with completion_time <= END_COMMIT are fetched out. Point in time type queries make more sense with begin and end completion times specified.
`Config Param: END_COMMIT` | +| [hoodie.datasource.read.incr.table.version](#hoodiedatasourcereadincrtableversion) | (N/A) | The table version assumed for incremental read
`Config Param: INCREMENTAL_READ_TABLE_VERSION` | +| [hoodie.datasource.read.streaming.table.version](#hoodiedatasourcereadstreamingtableversion) | (N/A) | The table version assumed for streaming read
`Config Param: STREAMING_READ_TABLE_VERSION` | +| [hoodie.datasource.query.type](#hoodiedatasourcequerytype) | snapshot | Whether data needs to be read, in `incremental` mode (new data since an instantTime) (or) `read_optimized` mode (obtain latest view, based on base files) (or) `snapshot` mode (obtain latest view, by merging base and (if any) log files)
`Config Param: QUERY_TYPE` | +| [hoodie.datasource.write.precombine.field](#hoodiedatasourcewriteprecombinefield) | ts | Field used in preCombining before actual write. When two records have the same key value, we will pick the one with the largest value for the precombine field, determined by Object.compareTo(..)
`Config Param: READ_PRE_COMBINE_FIELD` | + +[**Advanced Configs**](#Read-Options-advanced-configs) + + +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------- || +| [as.of.instant](#asofinstant) | (N/A) | The query instant for time travel. Without specified this option, we query the latest snapshot.
`Config Param: TIME_TRAVEL_AS_OF_INSTANT` | +| [hoodie.datasource.read.paths](#hoodiedatasourcereadpaths) | (N/A) | Comma separated list of file paths to read within a Hudi table.
`Config Param: READ_PATHS` | +| [hoodie.datasource.merge.type](#hoodiedatasourcemergetype) | payload_combine | For Snapshot query on merge on read table. Use this key to define how the payloads are merged, in 1) skip_merge: read the base file records plus the log file records without merging; 2) payload_combine: read the base file records first, for each record in base file, checks whether the key is in the log file records (combines the two records with same key for base and log file records), then read the left log file records
`Config Param: REALTIME_MERGE` | +| [hoodie.datasource.query.incremental.format](#hoodiedatasourcequeryincrementalformat) | latest_state | This config is used along with the 'incremental' query type. When set to 'latest_state', it returns the latest records' values. When set to 'cdc', it returns the cdc data.
`Config Param: INCREMENTAL_FORMAT`
`Since Version: 0.13.0` | +| [hoodie.datasource.read.create.filesystem.relation](#hoodiedatasourcereadcreatefilesystemrelation) | false | When this is set, the relation created by DefaultSource is for a view representing the result set of the table valued function hudi_filesystem_view(...)
`Config Param: CREATE_FILESYSTEM_RELATION`
`Since Version: 1.0.0` | +| [hoodie.datasource.read.extract.partition.values.from.path](#hoodiedatasourcereadextractpartitionvaluesfrompath) | false | When set to true, values for partition columns (partition values) will be extracted from physical partition path (default Spark behavior). When set to false partition values will be read from the data file (in Hudi partition columns are persisted by default). This config is a fallback allowing to preserve existing behavior, and should not be used otherwise.
`Config Param: EXTRACT_PARTITION_VALUES_FROM_PARTITION_PATH`
`Since Version: 0.11.0` | +| [hoodie.datasource.read.file.index.listing.mode](#hoodiedatasourcereadfileindexlistingmode) | lazy | Overrides Hudi's file-index implementation's file listing mode: when set to 'eager', file-index will list all partition paths and corresponding file slices w/in them eagerly, during initialization, prior to partition-pruning kicking in, meaning that all partitions will be listed including ones that might be subsequently pruned out; when set to 'lazy', partitions and file-slices w/in them will be listed lazily (ie when they actually accessed, instead of when file-index is initialized) allowing partition pruning to occur before that, only listing partitions that has already been pruned. Please note that, this config is provided purely to allow to fallback to behavior existing prior to 0.13.0 release, and will be deprecated soon after.
`Config Param: FILE_INDEX_LISTING_MODE_OVERRIDE`
`Since Version: 0.13.0` | +| [hoodie.datasource.read.file.index.listing.partition-path-prefix.analysis.enabled](#hoodiedatasourcereadfileindexlistingpartition-path-prefixanalysisenabled) | true | Controls whether partition-path prefix analysis is enabled w/in the file-index, allowing to avoid necessity to recursively list deep folder structures of partitioned tables w/ multiple partition columns, by carefully analyzing provided partition-column predicates and deducing corresponding partition-path prefix from them (if possible).
`Config Param: FILE_INDEX_LISTING_PARTITION_PATH_PREFIX_ANALYSIS_ENABLED`
`Since Version: 0.13.0` | +| [hoodie.datasource.read.incr.fallback.fulltablescan.enable](#hoodiedatasourcereadincrfallbackfulltablescanenable) | false | When doing an incremental query whether we should fall back to full table scans if file does not exist.
`Config Param: INCREMENTAL_FALLBACK_TO_FULL_TABLE_SCAN` | +| [hoodie.datasource.read.incr.filters](#hoodiedatasourcereadincrfilters) | | For use-cases like DeltaStreamer which reads from Hoodie Incremental table and applies opaque map functions, filters appearing late in the sequence of transformations cannot be automatically pushed down. This option allows setting filters directly on Hoodie Source.
`Config Param: PUSH_DOWN_INCR_FILTERS` | +| [hoodie.datasource.read.incr.path.glob](#hoodiedatasourcereadincrpathglob) | | For the use-cases like users only want to incremental pull from certain partitions instead of the full table. This option allows using glob pattern to directly filter on path.
`Config Param: INCR_PATH_GLOB` | +| [hoodie.datasource.read.schema.use.end.instanttime](#hoodiedatasourcereadschemauseendinstanttime) | false | Uses the end instant's schema when incrementally fetching data. Default: uses the latest instant's schema.
`Config Param: INCREMENTAL_READ_SCHEMA_USE_END_INSTANTTIME` | +| [hoodie.datasource.read.table.valued.function.filesystem.relation.subpath](#hoodiedatasourcereadtablevaluedfunctionfilesystemrelationsubpath) | | A regex under the table's base path to get file system view information
`Config Param: FILESYSTEM_RELATION_ARG_SUBPATH`
`Since Version: 1.0.0` | +| [hoodie.datasource.read.table.valued.function.timeline.relation](#hoodiedatasourcereadtablevaluedfunctiontimelinerelation) | false | When this is set, the relation created by DefaultSource is for a view representing the result set of the table valued function hudi_query_timeline(...)
`Config Param: CREATE_TIMELINE_RELATION`
`Since Version: 1.0.0` | +| [hoodie.datasource.read.table.valued.function.timeline.relation.archived](#hoodiedatasourcereadtablevaluedfunctiontimelinerelationarchived) | false | When this is set, the result set of the table valued function hudi_query_timeline(...) will include archived timeline
`Config Param: TIMELINE_RELATION_ARG_ARCHIVED_TIMELINE`
`Since Version: 1.0.0` | +| [hoodie.datasource.streaming.startOffset](#hoodiedatasourcestreamingstartOffset) | earliest | Start offset to pull data from hoodie streaming source. Allowed values are earliest, latest, or a specific start instant time
`Config Param: START_OFFSET`
`Since Version: 0.13.0` | +| [hoodie.enable.data.skipping](#hoodieenabledataskipping) | true | Enables data-skipping allowing queries to leverage indexes to reduce the search space by skipping over files
`Config Param: ENABLE_DATA_SKIPPING`
`Since Version: 0.10.0` | +| [hoodie.file.index.enable](#hoodiefileindexenable) | true | Enables use of the spark file index implementation for Hudi, that speeds up listing of large tables.
`Config Param: ENABLE_HOODIE_FILE_INDEX` | +| [hoodie.read.timeline.holes.resolution.policy](#hoodiereadtimelineholesresolutionpolicy) | FAIL | When doing incremental queries, there could be hollow commits (requested or inflight commits that are not the latest) that are produced by concurrent writers and could lead to potential data loss. This config allows users to have different ways of handling this situation. The valid values are [FAIL, BLOCK, USE_TRANSITION_TIME]: Use `FAIL` to throw an exception when hollow commit is detected. This is helpful when hollow commits are not expected. Use `BLOCK` to block processing commits from going beyond the hollow ones. This fits the case where waiting for hollow commits to finish is acceptable. Use `USE_TRANSITION_TIME` (experimental) to query commits in range by state transition time (completion time), instead of commit time (start time). Using this mode will result in `begin.instanttime` and `end.instanttime` using `stateTransitionTime` instead of the instant's commit time.
`Config Param: INCREMENTAL_READ_HANDLE_HOLLOW_COMMIT`
`Since Version: 0.14.0` | +| [hoodie.schema.on.read.enable](#hoodieschemaonreadenable) | false | Enables support for Schema Evolution feature
`Config Param: SCHEMA_EVOLUTION_ENABLED` | +--- + + +### Write Options {#Write-Options} +You can pass down any of the WriteClient level configs directly using `options()` or `option(k,v)` methods. + +```java +inputDF.write() +.format("org.apache.hudi") +.options(clientOpts) // any of the Hudi client opts can be passed in as well +.option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY(), "_row_key") +.option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), "partition") +.option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY(), "timestamp") +.option(HoodieWriteConfig.TABLE_NAME, tableName) +.mode(SaveMode.Append) +.save(basePath); +``` + +Options useful for writing tables via `write.format.option(...)` + + + + +[**Basic Configs**](#Write-Options-basic-configs) + + +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------------------ | ----------------------------- || +| [hoodie.datasource.hive_sync.mode](#hoodiedatasourcehive_syncmode) | (N/A) | Mode to choose for Hive ops. Valid values are hms, jdbc and hiveql.
`Config Param: HIVE_SYNC_MODE` | +| [hoodie.datasource.write.partitionpath.field](#hoodiedatasourcewritepartitionpathfield) | (N/A) | Partition path field. Value to be used at the partitionPath component of HoodieKey. Actual value obtained by invoking .toString()
`Config Param: PARTITIONPATH_FIELD` | +| [hoodie.datasource.write.recordkey.field](#hoodiedatasourcewriterecordkeyfield) | (N/A) | Record key field. Value to be used as the `recordKey` component of `HoodieKey`. Actual value will be obtained by invoking .toString() on the field value. Nested fields can be specified using the dot notation eg: `a.b.c`
`Config Param: RECORDKEY_FIELD` | +| [hoodie.datasource.write.secondarykey.column](#hoodiedatasourcewritesecondarykeycolumn) | (N/A) | Columns that constitute the secondary key component. Actual value will be obtained by invoking .toString() on the field value. Nested fields can be specified using the dot notation eg: `a.b.c`
`Config Param: SECONDARYKEY_COLUMN_NAME` | +| [hoodie.clustering.async.enabled](#hoodieclusteringasyncenabled) | false | Enable running of clustering service, asynchronously as inserts happen on the table.
`Config Param: ASYNC_CLUSTERING_ENABLE`
`Since Version: 0.7.0` | +| [hoodie.clustering.inline](#hoodieclusteringinline) | false | Turn on inline clustering - clustering will be run after each write operation is complete
`Config Param: INLINE_CLUSTERING_ENABLE`
`Since Version: 0.7.0` | +| [hoodie.datasource.hive_sync.enable](#hoodiedatasourcehive_syncenable) | false | When set to true, register/sync the table to Apache Hive metastore.
`Config Param: HIVE_SYNC_ENABLED` | +| [hoodie.datasource.hive_sync.jdbcurl](#hoodiedatasourcehive_syncjdbcurl) | jdbc:hive2://localhost:10000 | Hive server (HiveServer2) JDBC url
`Config Param: HIVE_URL` | +| [hoodie.datasource.hive_sync.metastore.uris](#hoodiedatasourcehive_syncmetastoreuris) | thrift://localhost:9083 | Hive metastore url
`Config Param: METASTORE_URIS` | +| [hoodie.datasource.meta.sync.enable](#hoodiedatasourcemetasyncenable) | false | Enable Syncing the Hudi Table with an external meta store or data catalog.
`Config Param: META_SYNC_ENABLED` | +| [hoodie.datasource.write.hive_style_partitioning](#hoodiedatasourcewritehive_style_partitioning) | false | Flag to indicate whether to use Hive style partitioning. If set true, the names of partition folders follow <partition_column_name>=<partition_value> format. By default false (the names of partition folders are only partition values)
`Config Param: HIVE_STYLE_PARTITIONING` | +| [hoodie.datasource.write.operation](#hoodiedatasourcewriteoperation) | upsert | Whether to do upsert, insert or bulk_insert for the write operation. Use bulk_insert to load new data into a table, and there on use upsert/insert. bulk insert uses a disk based write path to scale to load large inputs without need to cache it.
`Config Param: OPERATION` | +| [hoodie.datasource.write.precombine.field](#hoodiedatasourcewriteprecombinefield) | ts | Field used in preCombining before actual write. When two records have the same key value, we will pick the one with the largest value for the precombine field, determined by Object.compareTo(..)
`Config Param: PRECOMBINE_FIELD` | +| [hoodie.datasource.write.table.type](#hoodiedatasourcewritetabletype) | COPY_ON_WRITE | The table type for the underlying data, for this write. This can’t change between writes.
`Config Param: TABLE_TYPE` | +| [hoodie.write.record.merge.mode](#hoodiewriterecordmergemode) | EVENT_TIME_ORDERING | org.apache.hudi.common.config.RecordMergeMode: Determines the logic of merging updates COMMIT_TIME_ORDERING: Using transaction time to merge records, i.e., the record from later transaction overwrites the earlier record with the same key. EVENT_TIME_ORDERING(default): Using event time as the ordering to merge records, i.e., the record with the larger event time overwrites the record with the smaller event time on the same key, regardless of transaction time. The event time or preCombine field needs to be specified by the user. CUSTOM: Using custom merging logic specified by the user.
`Config Param: RECORD_MERGE_MODE`
`Since Version: 1.0.0` | + +[**Advanced Configs**](#Write-Options-advanced-configs) + + +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------------------------------------------------------------------ | ----------------------------------------------------- || +| [hoodie.datasource.hive_sync.serde_properties](#hoodiedatasourcehive_syncserde_properties) | (N/A) | Serde properties to hive table.
`Config Param: HIVE_TABLE_SERDE_PROPERTIES` | +| [hoodie.datasource.hive_sync.table_properties](#hoodiedatasourcehive_synctable_properties) | (N/A) | Additional properties to store with table.
`Config Param: HIVE_TABLE_PROPERTIES` | +| [hoodie.datasource.overwrite.mode](#hoodiedatasourceoverwritemode) | (N/A) | Controls whether overwrite use dynamic or static mode, if not configured, respect spark.sql.sources.partitionOverwriteMode
`Config Param: OVERWRITE_MODE`
`Since Version: 0.14.0` | +| [hoodie.datasource.write.partitions.to.delete](#hoodiedatasourcewritepartitionstodelete) | (N/A) | Comma separated list of partitions to delete. Allows use of wildcard *
`Config Param: PARTITIONS_TO_DELETE` | +| [hoodie.datasource.write.payload.class](#hoodiedatasourcewritepayloadclass) | (N/A) | Payload class used. Override this, if you like to roll your own merge logic, when upserting/inserting. This will render any value set for PRECOMBINE_FIELD_OPT_VAL ineffective
`Config Param: PAYLOAD_CLASS_NAME` | +| [hoodie.datasource.write.table.name](#hoodiedatasourcewritetablename) | (N/A) | Table name for the datasource write. Also used to register the table into meta stores.
`Config Param: TABLE_NAME` | +| [hoodie.write.record.merge.custom.implementation.classes](#hoodiewriterecordmergecustomimplementationclasses) | (N/A) | List of HoodieMerger implementations constituting Hudi's merging strategy -- based on the engine used. These record merge impls will be filtered by `hoodie.write.record.merge.strategy.id`. Hudi will pick the most efficient implementation to perform merging/combining of the records (during update, reading MOR table, etc)
`Config Param: RECORD_MERGE_IMPL_CLASSES`
`Since Version: 0.13.0` | +| [hoodie.write.record.merge.strategy.id](#hoodiewriterecordmergestrategyid) | (N/A) | ID of record merge strategy. Hudi will pick HoodieRecordMerger implementations in `hoodie.write.record.merge.custom.implementation.classes` which has the same merge strategy id
`Config Param: RECORD_MERGE_STRATEGY_ID`
`Since Version: 0.13.0` | +| [hoodie.datasource.compaction.async.enable](#hoodiedatasourcecompactionasyncenable) | true | Controls whether async compaction should be turned on for MOR table writing.
`Config Param: ASYNC_COMPACT_ENABLE` | +| [hoodie.datasource.hive_sync.auto_create_database](#hoodiedatasourcehive_syncauto_create_database) | true | Auto create hive database if it does not exist
`Config Param: HIVE_AUTO_CREATE_DATABASE` | +| [hoodie.datasource.hive_sync.base_file_format](#hoodiedatasourcehive_syncbase_file_format) | PARQUET | Base file format for the sync.
`Config Param: HIVE_BASE_FILE_FORMAT` | +| [hoodie.datasource.hive_sync.batch_num](#hoodiedatasourcehive_syncbatch_num) | 1000 | The number of partitions in one batch when synchronizing partitions to Hive.
`Config Param: HIVE_BATCH_SYNC_PARTITION_NUM` | +| [hoodie.datasource.hive_sync.bucket_sync](#hoodiedatasourcehive_syncbucket_sync) | false | Whether to sync the hive metastore bucket specification when using bucket index. The specification is 'CLUSTERED BY (trace_id) SORTED BY (trace_id ASC) INTO 65536 BUCKETS'
`Config Param: HIVE_SYNC_BUCKET_SYNC` | +| [hoodie.datasource.hive_sync.create_managed_table](#hoodiedatasourcehive_synccreate_managed_table) | false | Whether to sync the table as managed table.
`Config Param: HIVE_CREATE_MANAGED_TABLE` | +| [hoodie.datasource.hive_sync.database](#hoodiedatasourcehive_syncdatabase) | default | The name of the destination database that we should sync the hudi table to.
`Config Param: HIVE_DATABASE` | +| [hoodie.datasource.hive_sync.ignore_exceptions](#hoodiedatasourcehive_syncignore_exceptions) | false | Ignore exceptions when syncing with Hive.
`Config Param: HIVE_IGNORE_EXCEPTIONS` | +| [hoodie.datasource.hive_sync.partition_extractor_class](#hoodiedatasourcehive_syncpartition_extractor_class) | org.apache.hudi.hive.MultiPartKeysValueExtractor | Class which implements PartitionValueExtractor to extract the partition values, default 'org.apache.hudi.hive.MultiPartKeysValueExtractor'.
`Config Param: HIVE_PARTITION_EXTRACTOR_CLASS` | +| [hoodie.datasource.hive_sync.partition_fields](#hoodiedatasourcehive_syncpartition_fields) | | Field in the table to use for determining hive partition columns.
`Config Param: HIVE_PARTITION_FIELDS` | +| [hoodie.datasource.hive_sync.password](#hoodiedatasourcehive_syncpassword) | hive | hive password to use
`Config Param: HIVE_PASS` | +| [hoodie.datasource.hive_sync.skip_ro_suffix](#hoodiedatasourcehive_syncskip_ro_suffix) | false | Skip the _ro suffix for Read optimized table, when registering
`Config Param: HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE` | +| [hoodie.datasource.hive_sync.support_timestamp](#hoodiedatasourcehive_syncsupport_timestamp) | false | ‘INT64’ with original type TIMESTAMP_MICROS is converted to hive ‘timestamp’ type. Disabled by default for backward compatibility. NOTE: On Spark entrypoints, this is defaulted to TRUE
`Config Param: HIVE_SUPPORT_TIMESTAMP_TYPE` | +| [hoodie.datasource.hive_sync.sync_as_datasource](#hoodiedatasourcehive_syncsync_as_datasource) | true |
`Config Param: HIVE_SYNC_AS_DATA_SOURCE_TABLE` | +| [hoodie.datasource.hive_sync.sync_comment](#hoodiedatasourcehive_syncsync_comment) | false | Whether to sync the table column comments while syncing the table.
`Config Param: HIVE_SYNC_COMMENT` | +| [hoodie.datasource.hive_sync.table](#hoodiedatasourcehive_synctable) | unknown | The name of the destination table that we should sync the hudi table to.
`Config Param: HIVE_TABLE` | +| [hoodie.datasource.hive_sync.use_jdbc](#hoodiedatasourcehive_syncuse_jdbc) | true | Use JDBC when hive synchronization is enabled
`Config Param: HIVE_USE_JDBC` | +| [hoodie.datasource.hive_sync.use_pre_apache_input_format](#hoodiedatasourcehive_syncuse_pre_apache_input_format) | false | Flag to choose InputFormat under com.uber.hoodie package instead of org.apache.hudi package. Use this when you are in the process of migrating from com.uber.hoodie to org.apache.hudi. Stop using this after you migrated the table definition to org.apache.hudi input format
`Config Param: HIVE_USE_PRE_APACHE_INPUT_FORMAT` | +| [hoodie.datasource.hive_sync.username](#hoodiedatasourcehive_syncusername) | hive | hive user name to use
`Config Param: HIVE_USER` | +| [hoodie.datasource.insert.dup.policy](#hoodiedatasourceinsertduppolicy) | none | **Note** This is only applicable to Spark SQL writing.<br />When operation type is set to "insert", users can optionally enforce a dedup policy. This policy will be employed when records being ingested already exists in storage. Default policy is none and no action will be taken. Another option is to choose "drop", on which matching records from incoming will be dropped and the rest will be ingested. Third option is "fail" which will fail the write operation when same records are re-ingested. In other words, a given record as deduced by the key generation policy can be ingested only once to the target table of interest.
`Config Param: INSERT_DUP_POLICY`
`Since Version: 0.14.0` | +| [hoodie.datasource.meta_sync.condition.sync](#hoodiedatasourcemeta_syncconditionsync) | false | If true, only sync on conditions like schema change or partition change.
`Config Param: HIVE_CONDITIONAL_SYNC` | +| [hoodie.datasource.write.commitmeta.key.prefix](#hoodiedatasourcewritecommitmetakeyprefix) | _ | Option keys beginning with this prefix, are automatically added to the commit/deltacommit metadata. This is useful to store checkpointing information, in a consistent way with the hudi timeline
`Config Param: COMMIT_METADATA_KEYPREFIX` | +| [hoodie.datasource.write.drop.partition.columns](#hoodiedatasourcewritedroppartitioncolumns) | false | When set to true, will not write the partition columns into hudi. By default, false.
`Config Param: DROP_PARTITION_COLUMNS` | +| [hoodie.datasource.write.insert.drop.duplicates](#hoodiedatasourcewriteinsertdropduplicates) | false | If set to true, records from the incoming dataframe will not overwrite existing records with the same key during the write operation. <br /> **Note** Just for Insert operation in Spark SQL writing since 0.14.0, users can switch to the config `hoodie.datasource.insert.dup.policy` instead for a simplified duplicate handling experience. The new config will be incorporated into all other writing flows and this config will be fully deprecated in future releases.
`Config Param: INSERT_DROP_DUPS` | +| [hoodie.datasource.write.keygenerator.class](#hoodiedatasourcewritekeygeneratorclass) | org.apache.hudi.keygen.SimpleKeyGenerator | Key generator class, that implements `org.apache.hudi.keygen.KeyGenerator`
`Config Param: KEYGENERATOR_CLASS_NAME` | +| [hoodie.datasource.write.keygenerator.consistent.logical.timestamp.enabled](#hoodiedatasourcewritekeygeneratorconsistentlogicaltimestampenabled) | false | When set to true, consistent value will be generated for a logical timestamp type column, like timestamp-millis and timestamp-micros, irrespective of whether row-writer is enabled. Disabled by default so as not to break the pipeline that deploy either fully row-writer path or non row-writer path. For example, if it is kept disabled then record key of timestamp type with value `2016-12-29 09:54:00` will be written as timestamp `2016-12-29 09:54:00.0` in row-writer path, while it will be written as long value `1483023240000000` in non row-writer path. If enabled, then the timestamp value will be written in both the cases.
`Config Param: KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED`
`Since Version: 0.10.1` | +| [hoodie.datasource.write.partitionpath.urlencode](#hoodiedatasourcewritepartitionpathurlencode) | false | Should we url encode the partition path value, before creating the folder structure.
`Config Param: URL_ENCODE_PARTITIONING` | +| [hoodie.datasource.write.reconcile.schema](#hoodiedatasourcewritereconcileschema) | false | This config controls how writer's schema will be selected based on the incoming batch's schema as well as existing table's one. When schema reconciliation is DISABLED, incoming batch's schema will be picked as a writer-schema (therefore updating table's schema). When schema reconciliation is ENABLED, writer-schema will be picked such that table's schema (after txn) is either kept the same or extended, meaning that we'll always prefer the schema that either adds new columns or stays the same. This enables us, to always extend the table's schema during evolution and never lose the data (when, for ex, existing column is being dropped in a new batch)
`Config Param: RECONCILE_SCHEMA` | +| [hoodie.datasource.write.row.writer.enable](#hoodiedatasourcewriterowwriterenable) | true | When set to true, will perform write operations directly using the spark native `Row` representation, avoiding any additional conversion costs.
`Config Param: ENABLE_ROW_WRITER` | +| [hoodie.datasource.write.streaming.checkpoint.identifier](#hoodiedatasourcewritestreamingcheckpointidentifier) | default_single_writer | A stream identifier used by Hudi to fetch the right checkpoint (`batch id`, to be more specific) corresponding to this writer. Please keep the identifier unique for each writer in a multi-writer scenario. If the value is not set, the checkpoint info is kept only in memory. This could introduce a potential issue: if the job is restarted (`batch id` is lost) while the spark checkpoint write fails, spark will retry and rewrite the data.
`Config Param: STREAMING_CHECKPOINT_IDENTIFIER`
`Since Version: 0.13.0` | +| [hoodie.datasource.write.streaming.disable.compaction](#hoodiedatasourcewritestreamingdisablecompaction) | false | By default for MOR table, async compaction is enabled with spark streaming sink. By setting this config to true, we can disable it and the expectation is that, users will schedule and execute compaction in a different process/job altogether. Some users may wish to run it separately to manage resources across table services and regular ingestion pipeline and so this could be preferred on such cases.
`Config Param: STREAMING_DISABLE_COMPACTION`
`Since Version: 0.14.0` | +| [hoodie.datasource.write.streaming.ignore.failed.batch](#hoodiedatasourcewritestreamingignorefailedbatch) | false | Config to indicate whether to ignore any non exception error (e.g. writestatus error) within a streaming microbatch. Turning this on could hide the write status errors while the spark checkpoint moves ahead, so users are advised to use this with caution.
`Config Param: STREAMING_IGNORE_FAILED_BATCH` | +| [hoodie.datasource.write.streaming.retry.count](#hoodiedatasourcewritestreamingretrycount) | 3 | Config to indicate how many times streaming job should retry for a failed micro batch.
`Config Param: STREAMING_RETRY_CNT` | +| [hoodie.datasource.write.streaming.retry.interval.ms](#hoodiedatasourcewritestreamingretryintervalms) | 2000 | Config to indicate how long (in milliseconds) to wait before a retry should be issued for a failed microbatch
`Config Param: STREAMING_RETRY_INTERVAL_MS` | +| [hoodie.meta.sync.client.tool.class](#hoodiemetasyncclienttoolclass) | org.apache.hudi.hive.HiveSyncTool | Sync tool class name used to sync to metastore. Defaults to Hive.
`Config Param: META_SYNC_CLIENT_TOOL_CLASS_NAME` | +| [hoodie.spark.sql.insert.into.operation](#hoodiesparksqlinsertintooperation) | insert | Sql write operation to use with INSERT_INTO spark sql command. This comes with 3 possible values, bulk_insert, insert and upsert. bulk_insert is generally meant for initial loads and is known to be performant compared to insert. But bulk_insert may not do small file management. If you prefer hudi to automatically manage small files, then you can go with "insert". There is no precombine (if there are duplicates within the same batch being ingested, same dups will be ingested) with bulk_insert and insert and there is no index look up as well. If you may use INSERT_INTO for mutable dataset, then you may have to set this config value to "upsert". With upsert, you will get both precombine and updates to existing records on storage is also honored. If not, you may see duplicates.
`Config Param: SPARK_SQL_INSERT_INTO_OPERATION`
`Since Version: 0.14.0` | +| [hoodie.spark.sql.merge.into.partial.updates](#hoodiesparksqlmergeintopartialupdates) | true | Whether to write partial updates to the data blocks containing updates in MOR tables with Spark SQL MERGE INTO statement. The data blocks containing partial updates have a schema with a subset of fields compared to the full schema of the table.
`Config Param: ENABLE_MERGE_INTO_PARTIAL_UPDATES`
`Since Version: 1.0.0` | +| [hoodie.spark.sql.optimized.writes.enable](#hoodiesparksqloptimizedwritesenable) | true | Controls whether spark sql prepped update, delete, and merge are enabled.
`Config Param: SPARK_SQL_OPTIMIZED_WRITES`
`Since Version: 0.14.0` | +| [hoodie.sql.bulk.insert.enable](#hoodiesqlbulkinsertenable) | false | When set to true, the sql insert statement will use bulk insert. This config is deprecated as of 0.14.0. Please use hoodie.spark.sql.insert.into.operation instead.
`Config Param: SQL_ENABLE_BULK_INSERT` | +| [hoodie.sql.insert.mode](#hoodiesqlinsertmode) | upsert | Insert mode when inserting data into a pk-table. The optional modes are: upsert, strict and non-strict. For upsert mode, the insert statement does the upsert operation for the pk-table, which will update the duplicate record. For strict mode, the insert statement will keep the primary key uniqueness constraint, which does not allow duplicate records. For non-strict mode, hudi just does the insert operation for the pk-table. This config is deprecated as of 0.14.0. Please use hoodie.spark.sql.insert.into.operation and hoodie.datasource.insert.dup.policy as you see fit.
`Config Param: SQL_INSERT_MODE` | +| [hoodie.streamer.source.kafka.value.deserializer.class](#hoodiestreamersourcekafkavaluedeserializerclass) | io.confluent.kafka.serializers.KafkaAvroDeserializer | This class is used by kafka client to deserialize the records
`Config Param: KAFKA_AVRO_VALUE_DESERIALIZER_CLASS`
`Since Version: 0.9.0` | +| [hoodie.write.set.null.for.missing.columns](#hoodiewritesetnullformissingcolumns) | false | When a nullable column is missing from incoming batch during a write operation, the write operation will fail schema compatibility check. Set this option to true will make the missing column be filled with null values to successfully complete the write operation.
`Config Param: SET_NULL_FOR_MISSING_COLUMNS`
`Since Version: 0.14.1` | +--- + + +### PreCommit Validator Configurations {#PreCommit-Validator-Configurations} +The following set of configurations help validate new data before commits. + + + +[**Advanced Configs**](#PreCommit-Validator-Configurations-advanced-configs) + + +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------------------------- | ------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.precommit.validators](#hoodieprecommitvalidators) | | Comma separated list of class names that can be invoked to validate commit
`Config Param: VALIDATOR_CLASS_NAMES` | +| [hoodie.precommit.validators.equality.sql.queries](#hoodieprecommitvalidatorsequalitysqlqueries) | | Spark SQL queries to run on table before committing new data to validate state before and after commit. Multiple queries separated by ';' delimiter are supported. Example: "select count(*) from \<TABLE_NAME\>". Note: \<TABLE_NAME\> is replaced by table state before and after commit.
`Config Param: EQUALITY_SQL_QUERIES` | +| [hoodie.precommit.validators.inequality.sql.queries](#hoodieprecommitvalidatorsinequalitysqlqueries) | | Spark SQL queries to run on table before committing new data to validate state before and after commit. Multiple queries separated by ';' delimiter are supported. Example query: 'select count(*) from \<TABLE_NAME\> where col=null'. Note: \<TABLE_NAME\> variable is expected to be present in query.
`Config Param: INEQUALITY_SQL_QUERIES` | +| [hoodie.precommit.validators.single.value.sql.queries](#hoodieprecommitvalidatorssinglevaluesqlqueries) | | Spark SQL queries to run on table before committing new data to validate state after commit. Multiple queries separated by ';' delimiter are supported. Expected result is included as part of query separated by '#'. Example query: 'query1#result1;query2#result2'. Note: \<TABLE_NAME\> variable is expected to be present in query.
`Config Param: SINGLE_VALUE_SQL_QUERIES` | +--- + +## Flink Sql Configs {#FLINK_SQL} +These configs control the Hudi Flink SQL source/sink connectors, providing ability to define record keys, pick out the write operation, specify how to merge records, enable/disable asynchronous compaction or choosing query type to read. + + +### Flink Options {#Flink-Options} +Flink jobs using the SQL can be configured through the options in WITH clause. The actual datasource level configs are listed below. + + + +[**Basic Configs**](#Flink-Options-basic-configs) + + +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------------------ | ----------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.database.name](#hoodiedatabasename) | (N/A) | Database name to register to Hive metastore
`Config Param: DATABASE_NAME` | +| [hoodie.table.name](#hoodietablename) | (N/A) | Table name to register to Hive metastore
`Config Param: TABLE_NAME` | +| [path](#path) | (N/A) | Base path for the target hoodie table. The path would be created if it does not exist, otherwise a Hoodie table expects to be initialized successfully
`Config Param: PATH` | +| [read.commits.limit](#readcommitslimit) | (N/A) | The maximum number of commits allowed to read in each instant check, if it is streaming read, the avg read instants number per-second would be 'read.commits.limit'/'read.streaming.check-interval', by default no limit
`Config Param: READ_COMMITS_LIMIT` | +| [read.end-commit](#readend-commit) | (N/A) | End commit instant for reading, the commit time format should be 'yyyyMMddHHmmss'
`Config Param: READ_END_COMMIT` | +| [read.start-commit](#readstart-commit) | (N/A) | Start commit instant for reading, the commit time format should be 'yyyyMMddHHmmss', by default reading from the latest instant for streaming read
`Config Param: READ_START_COMMIT` | +| [archive.max_commits](#archivemax_commits) | 50 | Max number of commits to keep before archiving older commits into a sequential log, default 50
`Config Param: ARCHIVE_MAX_COMMITS` | +| [archive.min_commits](#archivemin_commits) | 40 | Min number of commits to keep before archiving older commits into a sequential log, default 40
`Config Param: ARCHIVE_MIN_COMMITS` | +| [cdc.enabled](#cdcenabled) | false | When enabled, persists the change data if necessary, so that the table can be queried in CDC query mode
`Config Param: CDC_ENABLED` | +| [cdc.supplemental.logging.mode](#cdcsupplementalloggingmode) | DATA_BEFORE_AFTER | Setting 'op_key_only' persists the 'op' and the record key only, setting 'data_before' persists the additional 'before' image, and setting 'data_before_after' persists the additional 'before' and 'after' images.
`Config Param: SUPPLEMENTAL_LOGGING_MODE` | +| [changelog.enabled](#changelogenabled) | false | Whether to keep all the intermediate changes. When enabled, we try to keep all the changes of a record: 1) the sink accepts the UPDATE_BEFORE message; 2) the source tries to emit every change of a record. The semantics are best effort because the compaction job would finally merge all changes of a record into one. Default false, to have UPSERT semantics
`Config Param: CHANGELOG_ENABLED` | +| [clean.async.enabled](#cleanasyncenabled) | true | Whether to cleanup the old commits immediately on new commits, enabled by default
`Config Param: CLEAN_ASYNC_ENABLED` | +| [clean.retain_commits](#cleanretain_commits) | 30 | Number of commits to retain. So data will be retained for num_of_commits * time_between_commits (scheduled). This also directly translates into how much you can incrementally pull on this table, default 30
`Config Param: CLEAN_RETAIN_COMMITS` | +| [clustering.async.enabled](#clusteringasyncenabled) | false | Async Clustering, default false
`Config Param: CLUSTERING_ASYNC_ENABLED` | +| [clustering.plan.strategy.small.file.limit](#clusteringplanstrategysmallfilelimit) | 600 | Files smaller than the size specified here are candidates for clustering, default 600 MB
`Config Param: CLUSTERING_PLAN_STRATEGY_SMALL_FILE_LIMIT` | +| [clustering.plan.strategy.target.file.max.bytes](#clusteringplanstrategytargetfilemaxbytes) | 1073741824 | Each group can produce 'N' (CLUSTERING_MAX_GROUP_SIZE/CLUSTERING_TARGET_FILE_SIZE) output file groups, default 1 GB
`Config Param: CLUSTERING_PLAN_STRATEGY_TARGET_FILE_MAX_BYTES` | +| [compaction.async.enabled](#compactionasyncenabled) | true | Async Compaction, enabled by default for MOR
`Config Param: COMPACTION_ASYNC_ENABLED` | +| [compaction.delta_commits](#compactiondelta_commits) | 5 | Max delta commits needed to trigger compaction, default 5 commits
`Config Param: COMPACTION_DELTA_COMMITS` | +| [hive_sync.enabled](#hive_syncenabled) | false | Asynchronously sync Hive meta to HMS, default false
`Config Param: HIVE_SYNC_ENABLED` | +| [hive_sync.jdbc_url](#hive_syncjdbc_url) | jdbc:hive2://localhost:10000 | Jdbc URL for hive sync, default 'jdbc:hive2://localhost:10000'
`Config Param: HIVE_SYNC_JDBC_URL` | +| [hive_sync.metastore.uris](#hive_syncmetastoreuris) | | Metastore uris for hive sync, default ''
`Config Param: HIVE_SYNC_METASTORE_URIS` | +| [hive_sync.mode](#hive_syncmode) | HMS | Mode to choose for Hive ops. Valid values are hms, jdbc and hiveql, default 'hms'
`Config Param: HIVE_SYNC_MODE` | +| [hoodie.datasource.query.type](#hoodiedatasourcequerytype) | snapshot | Decides how data files need to be read, in 1) Snapshot mode (obtain latest view, based on row & columnar data); 2) incremental mode (new data since an instantTime); 3) Read Optimized mode (obtain latest view, based on columnar data). Default: snapshot
`Config Param: QUERY_TYPE` | +| [hoodie.datasource.write.hive_style_partitioning](#hoodiedatasourcewritehive_style_partitioning) | false | Whether to use Hive style partitioning. If set true, the names of partition folders follow <partition_column_name>=<partition_value> format. By default false (the names of partition folders are only partition values)
`Config Param: HIVE_STYLE_PARTITIONING` | +| [hoodie.datasource.write.partitionpath.field](#hoodiedatasourcewritepartitionpathfield) | | Partition path field. Value to be used at the `partitionPath` component of `HoodieKey`. Actual value obtained by invoking .toString(), default ''
`Config Param: PARTITION_PATH_FIELD` | +| [hoodie.datasource.write.recordkey.field](#hoodiedatasourcewriterecordkeyfield) | uuid | Record key field. Value to be used as the `recordKey` component of `HoodieKey`. Actual value will be obtained by invoking .toString() on the field value. Nested fields can be specified using the dot notation eg: `a.b.c`
`Config Param: RECORD_KEY_FIELD` | +| [index.type](#indextype) | FLINK_STATE | Index type of Flink write job, default is using state backed index.
`Config Param: INDEX_TYPE` | +| [lookup.join.cache.ttl](#lookupjoincachettl) | PT1H | The cache TTL (e.g. 10min) for the build table in lookup join.
`Config Param: LOOKUP_JOIN_CACHE_TTL` | +| [metadata.compaction.delta_commits](#metadatacompactiondelta_commits) | 10 | Max delta commits for metadata table to trigger compaction, default 10
`Config Param: METADATA_COMPACTION_DELTA_COMMITS` | +| [metadata.enabled](#metadataenabled) | true | Enable the internal metadata table which serves table metadata like level file listings, default enabled
`Config Param: METADATA_ENABLED` | +| [precombine.field](#precombinefield) | ts | Field used in preCombining before actual write. When two records have the same key value, we will pick the one with the largest value for the precombine field, determined by Object.compareTo(..)
`Config Param: PRECOMBINE_FIELD` | +| [read.streaming.enabled](#readstreamingenabled) | false | Whether to read as streaming source, default false
`Config Param: READ_AS_STREAMING` | +| [read.streaming.skip_insertoverwrite](#readstreamingskip_insertoverwrite) | false | Whether to skip insert overwrite instants to avoid reading base files of insert overwrite operations for streaming read. In streaming scenarios, insert overwrite is usually used to repair data, here you can control the visibility of downstream streaming read.
`Config Param: READ_STREAMING_SKIP_INSERT_OVERWRITE` | +| [table.type](#tabletype) | COPY_ON_WRITE | Type of table to write. COPY_ON_WRITE (or) MERGE_ON_READ
`Config Param: TABLE_TYPE` | +| [write.operation](#writeoperation) | upsert | The write operation, that this write should do
`Config Param: OPERATION` | +| [write.parquet.max.file.size](#writeparquetmaxfilesize) | 120 | Target size for parquet files produced by Hudi write phases. For DFS, this needs to be aligned with the underlying filesystem block size for optimal performance.
`Config Param: WRITE_PARQUET_MAX_FILE_SIZE` | + +[**Advanced Configs**](#Flink-Options-advanced-configs) + + +| Config Name | Default | Description | +| ---------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------- | --- | +| [clustering.tasks](#clusteringtasks) | (N/A) | Parallelism of tasks that do actual clustering, default same as the write task parallelism
`Config Param: CLUSTERING_TASKS` | +| [compaction.tasks](#compactiontasks) | (N/A) | Parallelism of tasks that do actual compaction, default same as the write task parallelism
`Config Param: COMPACTION_TASKS` | +| [hive_sync.conf.dir](#hive_syncconfdir) | (N/A) | The hive configuration directory, where the hive-site.xml lies in, the file should be put on the client machine
`Config Param: HIVE_SYNC_CONF_DIR` | +| [hive_sync.serde_properties](#hive_syncserde_properties) | (N/A) | Serde properties to hive table, the data format is k1=v1 k2=v2
`Config Param: HIVE_SYNC_TABLE_SERDE_PROPERTIES` | +| [hive_sync.table_properties](#hive_synctable_properties) | (N/A) | Additional properties to store with table, the data format is k1=v1 k2=v2
`Config Param: HIVE_SYNC_TABLE_PROPERTIES` | +| [hoodie.datasource.write.keygenerator.class](#hoodiedatasourcewritekeygeneratorclass) | (N/A) | Key generator class that will extract the key out of the incoming record
`Config Param: KEYGEN_CLASS_NAME` | +| [read.tasks](#readtasks) | (N/A) | Parallelism of tasks that do actual read, default is the parallelism of the execution environment
`Config Param: READ_TASKS` | +| [source.avro-schema](#sourceavro-schema) | (N/A) | Source avro schema string, the parsed schema is used for deserialization
`Config Param: SOURCE_AVRO_SCHEMA` | +| [source.avro-schema.path](#sourceavro-schemapath) | (N/A) | Source avro schema file path, the parsed schema is used for deserialization
`Config Param: SOURCE_AVRO_SCHEMA_PATH` | +| [write.bucket_assign.tasks](#writebucket_assigntasks) | (N/A) | Parallelism of tasks that do bucket assign, default same as the write task parallelism
`Config Param: BUCKET_ASSIGN_TASKS` | +| [write.index_bootstrap.tasks](#writeindex_bootstraptasks) | (N/A) | Parallelism of tasks that do index bootstrap, default same as the write task parallelism
`Config Param: INDEX_BOOTSTRAP_TASKS` | +| [write.partition.format](#writepartitionformat) | (N/A) | Partition path format, only valid when 'write.datetime.partitioning' is true, default is: 1) 'yyyyMMddHH' for timestamp(3) WITHOUT TIME ZONE, LONG, FLOAT, DOUBLE, DECIMAL; 2) 'yyyyMMdd' for DATE and INT.
`Config Param: PARTITION_FORMAT` | +| [write.tasks](#writetasks) | (N/A) | Parallelism of tasks that do actual write, default is the parallelism of the execution environment
`Config Param: WRITE_TASKS` | +| [clean.policy](#cleanpolicy) | KEEP_LATEST_COMMITS | Clean policy to manage the Hudi table. Available options: KEEP_LATEST_COMMITS, KEEP_LATEST_FILE_VERSIONS, KEEP_LATEST_BY_HOURS. Default is KEEP_LATEST_COMMITS.
`Config Param: CLEAN_POLICY` | +| [clean.retain_file_versions](#cleanretain_file_versions) | 5 | Number of file versions to retain. default 5
`Config Param: CLEAN_RETAIN_FILE_VERSIONS` | +| [clean.retain_hours](#cleanretain_hours) | 24 | Number of hours for which commits need to be retained. This config provides a more flexible option as compared to number of commits retained for cleaning service. Setting this property ensures all the files, but the latest in a file group, corresponding to commits with commit times older than the configured number of hours to be retained are cleaned.
`Config Param: CLEAN_RETAIN_HOURS` | +| [clustering.delta_commits](#clusteringdelta_commits) | 4 | Max delta commits needed to trigger clustering, default 4 commits
`Config Param: CLUSTERING_DELTA_COMMITS` | +| [clustering.plan.partition.filter.mode](#clusteringplanpartitionfiltermode) | NONE | Partition filter mode used in the creation of clustering plan. Available values are - NONE: do not filter table partitions, so the clustering plan will include all partitions that have clustering candidates. RECENT_DAYS: keep a continuous range of partitions; works together with configs 'clustering.plan.strategy.daybased.lookback.partitions' and 'clustering.plan.strategy.daybased.skipfromlatest.partitions'. SELECTED_PARTITIONS: keep partitions that are in the specified range ['clustering.plan.strategy.cluster.begin.partition', 'clustering.plan.strategy.cluster.end.partition']. DAY_ROLLING: cluster partitions on a rolling basis by the hour to avoid clustering all partitions each time; this strategy sorts the partitions in ascending order and chooses the partitions whose index modulo 24 equals the current hour.
`Config Param: CLUSTERING_PLAN_PARTITION_FILTER_MODE_NAME` | +| [clustering.plan.strategy.class](#clusteringplanstrategyclass) | org.apache.hudi.client.clustering.plan.strategy.FlinkSizeBasedClusteringPlanStrategy | Config to provide a strategy class (subclass of ClusteringPlanStrategy) to create clustering plan i.e select what file groups are being clustered. Default strategy, looks at the last N (determined by clustering.plan.strategy.daybased.lookback.partitions) day based partitions picks the small file slices within those partitions.
`Config Param: CLUSTERING_PLAN_STRATEGY_CLASS` | +| [clustering.plan.strategy.cluster.begin.partition](#clusteringplanstrategyclusterbeginpartition) | | Begin partition used to filter partition (inclusive)
`Config Param: CLUSTERING_PLAN_STRATEGY_CLUSTER_BEGIN_PARTITION` | +| [clustering.plan.strategy.cluster.end.partition](#clusteringplanstrategyclusterendpartition) | | End partition used to filter partition (inclusive)
`Config Param: CLUSTERING_PLAN_STRATEGY_CLUSTER_END_PARTITION` | +| [clustering.plan.strategy.daybased.lookback.partitions](#clusteringplanstrategydaybasedlookbackpartitions) | 2 | Number of partitions to list to create ClusteringPlan, default is 2
`Config Param: CLUSTERING_TARGET_PARTITIONS` | +| [clustering.plan.strategy.daybased.skipfromlatest.partitions](#clusteringplanstrategydaybasedskipfromlatestpartitions) | 0 | Number of partitions to skip from latest when choosing partitions to create ClusteringPlan
`Config Param: CLUSTERING_PLAN_STRATEGY_SKIP_PARTITIONS_FROM_LATEST` | +| [clustering.plan.strategy.max.num.groups](#clusteringplanstrategymaxnumgroups) | 30 | Maximum number of groups to create as part of ClusteringPlan. Increasing groups will increase parallelism, default is 30
`Config Param: CLUSTERING_MAX_NUM_GROUPS` | +| [clustering.plan.strategy.partition.regex.pattern](#clusteringplanstrategypartitionregexpattern) | | Filter clustering partitions that matched regex pattern
`Config Param: CLUSTERING_PLAN_STRATEGY_PARTITION_REGEX_PATTERN` | +| [clustering.plan.strategy.partition.selected](#clusteringplanstrategypartitionselected) | | Partitions to run clustering
`Config Param: CLUSTERING_PLAN_STRATEGY_PARTITION_SELECTED` | +| [clustering.plan.strategy.sort.columns](#clusteringplanstrategysortcolumns) | | Columns to sort the data by when clustering
`Config Param: CLUSTERING_SORT_COLUMNS` | +| [clustering.schedule.enabled](#clusteringscheduleenabled) | false | Schedule the cluster plan, default false
`Config Param: CLUSTERING_SCHEDULE_ENABLED` | +| [compaction.delta_seconds](#compactiondelta_seconds) | 3600 | Max delta seconds time needed to trigger compaction, default 1 hour
`Config Param: COMPACTION_DELTA_SECONDS` | +| [compaction.max_memory](#compactionmax_memory) | 100 | Max memory in MB for compaction spillable map, default 100MB
`Config Param: COMPACTION_MAX_MEMORY` | +| [compaction.schedule.enabled](#compactionscheduleenabled) | true | Schedule the compaction plan, enabled by default for MOR
`Config Param: COMPACTION_SCHEDULE_ENABLED` | +| [compaction.target_io](#compactiontarget_io) | 512000 | Target IO in MB for per compaction (both read and write), default 500 GB
`Config Param: COMPACTION_TARGET_IO` | +| [compaction.timeout.seconds](#compactiontimeoutseconds) | 1200 | Max timeout time in seconds for online compaction to rollback, default 20 minutes
`Config Param: COMPACTION_TIMEOUT_SECONDS` | +| [compaction.trigger.strategy](#compactiontriggerstrategy) | num_commits | Strategy to trigger compaction, options are 'num_commits': trigger compaction when there are at least N delta commits after last completed compaction; 'num_commits_after_last_request': trigger compaction when there are at least N delta commits after last completed/requested compaction; 'time_elapsed': trigger compaction when time elapsed > N seconds since last compaction; 'num_and_time': trigger compaction when both NUM_COMMITS and TIME_ELAPSED are satisfied; 'num_or_time': trigger compaction when NUM_COMMITS or TIME_ELAPSED is satisfied. Default is 'num_commits'
`Config Param: COMPACTION_TRIGGER_STRATEGY` | +| [hive_sync.assume_date_partitioning](#hive_syncassume_date_partitioning) | false | Assume partitioning is yyyy/mm/dd, default false
`Config Param: HIVE_SYNC_ASSUME_DATE_PARTITION` | +| [hive_sync.auto_create_db](#hive_syncauto_create_db) | true | Auto create hive database if it does not exist, default true
`Config Param: HIVE_SYNC_AUTO_CREATE_DB` | +| [hive_sync.db](#hive_syncdb) | default | Database name for hive sync, default 'default'
`Config Param: HIVE_SYNC_DB` | +| [hive_sync.file_format](#hive_syncfile_format) | PARQUET | File format for hive sync, default 'PARQUET'
`Config Param: HIVE_SYNC_FILE_FORMAT` | +| [hive_sync.ignore_exceptions](#hive_syncignore_exceptions) | false | Ignore exceptions during hive synchronization, default false
`Config Param: HIVE_SYNC_IGNORE_EXCEPTIONS` | +| [hive_sync.partition_extractor_class](#hive_syncpartition_extractor_class) | org.apache.hudi.hive.MultiPartKeysValueExtractor | Tool to extract the partition value from HDFS path, default 'MultiPartKeysValueExtractor'
`Config Param: HIVE_SYNC_PARTITION_EXTRACTOR_CLASS_NAME` | +| [hive_sync.partition_fields](#hive_syncpartition_fields) | | Partition fields for hive sync, default ''
`Config Param: HIVE_SYNC_PARTITION_FIELDS` | +| [hive_sync.password](#hive_syncpassword) | hive | Password for hive sync, default 'hive'
`Config Param: HIVE_SYNC_PASSWORD` | +| [hive_sync.skip_ro_suffix](#hive_syncskip_ro_suffix) | false | Skip the _ro suffix for Read optimized table when registering, default false
`Config Param: HIVE_SYNC_SKIP_RO_SUFFIX` | +| [hive_sync.support_timestamp](#hive_syncsupport_timestamp) | true | INT64 with original type TIMESTAMP_MICROS is converted to hive timestamp type. Disabled by default for backward compatibility.
`Config Param: HIVE_SYNC_SUPPORT_TIMESTAMP` | +| [hive_sync.table](#hive_synctable) | unknown | Table name for hive sync, default 'unknown'
`Config Param: HIVE_SYNC_TABLE` | +| [hive_sync.table.strategy](#hive_synctablestrategy) | ALL | Hive table synchronization strategy. Available option: RO, RT, ALL.
`Config Param: HIVE_SYNC_TABLE_STRATEGY` | +| [hive_sync.use_jdbc](#hive_syncuse_jdbc) | true | Use JDBC when hive synchronization is enabled, default true
`Config Param: HIVE_SYNC_USE_JDBC` | +| [hive_sync.username](#hive_syncusername) | hive | Username for hive sync, default 'hive'
`Config Param: HIVE_SYNC_USERNAME` | +| [hoodie.bucket.index.hash.field](#hoodiebucketindexhashfield) | | Index key field. Value to be used as hashing to find the bucket ID. Should be a subset of or equal to the recordKey fields. Actual value will be obtained by invoking .toString() on the field value. Nested fields can be specified using the dot notation eg: `a.b.c`
`Config Param: INDEX_KEY_FIELD` | +| [hoodie.bucket.index.num.buckets](#hoodiebucketindexnumbuckets) | 4 | Hudi bucket number per partition. Only affected if using Hudi bucket index.
`Config Param: BUCKET_INDEX_NUM_BUCKETS` | +| [hoodie.datasource.merge.type](#hoodiedatasourcemergetype) | payload_combine | For Snapshot query on merge on read table. Use this key to define how the payloads are merged, in 1) skip_merge: read the base file records plus the log file records without merging; 2) payload_combine: read the base file records first, for each record in base file, checks whether the key is in the log file records (combines the two records with same key for base and log file records), then read the left log file records
`Config Param: MERGE_TYPE` | +| [hoodie.datasource.write.keygenerator.type](#hoodiedatasourcewritekeygeneratortype) | SIMPLE | Key generator type, whose implementation will extract the key out of the incoming record. **Note** This is being actively worked on. Please use `hoodie.datasource.write.keygenerator.class` instead.
`Config Param: KEYGEN_TYPE` | +| [hoodie.datasource.write.partitionpath.urlencode](#hoodiedatasourcewritepartitionpathurlencode) | false | Whether to encode the partition path url, default false
`Config Param: URL_ENCODE_PARTITIONING` | +| [hoodie.index.bucket.engine](#hoodieindexbucketengine) | SIMPLE | Type of bucket index engine. Available options: [SIMPLE | CONSISTENT_HASHING]
`Config Param: BUCKET_INDEX_ENGINE_TYPE` | +| [hoodie.write.table.version](#hoodiewritetableversion) | 8 | Table version produced by this writer.
`Config Param: WRITE_TABLE_VERSION` | +| [index.bootstrap.enabled](#indexbootstrapenabled) | false | Whether to bootstrap the index state from existing hoodie table, default false
`Config Param: INDEX_BOOTSTRAP_ENABLED` | +| [index.global.enabled](#indexglobalenabled) | true | Whether to update index for the old partition path if same key record with different partition path came in, default true
`Config Param: INDEX_GLOBAL_ENABLED` | +| [index.partition.regex](#indexpartitionregex) | .* | Whether to load partitions in state if the partition path matches this regex, default `.*`
`Config Param: INDEX_PARTITION_REGEX` | +| [index.state.ttl](#indexstatettl) | 0.0 | Index state ttl in days, default stores the index permanently
`Config Param: INDEX_STATE_TTL` | +| [partition.default_name](#partitiondefault_name) | __HIVE_DEFAULT_PARTITION__ | The default partition name in case the dynamic partition column value is null/empty string
`Config Param: PARTITION_DEFAULT_NAME` | +| [payload.class](#payloadclass) | org.apache.hudi.common.model.EventTimeAvroPayload | Payload class used. Override this, if you like to roll your own merge logic, when upserting/inserting. This will render any value set for the option ineffective
`Config Param: PAYLOAD_CLASS_NAME` | +| [read.cdc.from.changelog](#readcdcfromchangelog) | true | Whether to consume the delta changes only from the cdc changelog files. When CDC is enabled, i). for COW table, the changelog is generated on each file update; ii). for MOR table, the changelog is generated on compaction. By default, always read from the changelog file, once it is disabled, the reader would infer the changes based on the file slice dependencies.
`Config Param: READ_CDC_FROM_CHANGELOG` | +| [read.data.skipping.enabled](#readdataskippingenabled) | false | Enables data-skipping allowing queries to leverage indexes to reduce the search space by skipping over files
`Config Param: READ_DATA_SKIPPING_ENABLED` | +| [read.streaming.check-interval](#readstreamingcheck-interval) | 60 | Check interval for streaming read in seconds, default 1 minute
`Config Param: READ_STREAMING_CHECK_INTERVAL` | +| [read.streaming.skip_clustering](#readstreamingskip_clustering) | true | Whether to skip clustering instants to avoid reading base files of clustering operations for streaming read to improve read performance.
`Config Param: READ_STREAMING_SKIP_CLUSTERING` | +| [read.streaming.skip_compaction](#readstreamingskip_compaction) | true | Whether to skip compaction instants and avoid reading compacted base files for streaming read to improve read performance. This option can be used to avoid reading duplicates when changelog mode is enabled, it is a solution to keep data integrity
`Config Param: READ_STREAMING_SKIP_COMPACT` | +| [read.utc-timezone](#readutc-timezone) | true | Use UTC timezone or local timezone for the conversion between epoch time and LocalDateTime. Hive 0.x/1.x/2.x use local timezone, but Hive 3.x uses UTC timezone. By default true
`Config Param: READ_UTC_TIMEZONE` | +| [record.merger.impls](#recordmergerimpls) | org.apache.hudi.common.model.HoodieAvroRecordMerger | List of HoodieMerger implementations constituting Hudi's merging strategy -- based on the engine used. These merger impls will filter by record.merger.strategy. Hudi will pick most efficient implementation to perform merging/combining of the records (during update, reading MOR table, etc)
`Config Param: RECORD_MERGER_IMPLS` | +| [record.merger.strategy](#recordmergerstrategy) | eeb8d96f-b1e4-49fd-bbf8-28ac514178e5 | Id of merger strategy. Hudi will pick HoodieRecordMerger implementations in record.merger.impls which has the same merger strategy id
`Config Param: RECORD_MERGER_STRATEGY_ID` | +| [write.batch.size](#writebatchsize) | 256.0 | Batch buffer size in MB to flush data into the underneath filesystem, default 256MB
`Config Param: WRITE_BATCH_SIZE` | +| [write.bulk_insert.shuffle_input](#writebulk_insertshuffle_input) | true | Whether to shuffle the inputs by specific fields for bulk insert tasks, default true
`Config Param: WRITE_BULK_INSERT_SHUFFLE_INPUT` | +| [write.bulk_insert.sort_input](#writebulk_insertsort_input) | true | Whether to sort the inputs by specific fields for bulk insert tasks, default true
`Config Param: WRITE_BULK_INSERT_SORT_INPUT` | +| [write.bulk_insert.sort_input.by_record_key](#writebulk_insertsort_inputby_record_key) | false | Whether to sort the inputs by record keys for bulk insert tasks, default false
`Config Param: WRITE_BULK_INSERT_SORT_INPUT_BY_RECORD_KEY` | +| [write.client.id](#writeclientid) | | Unique identifier used to distinguish different writer pipelines for concurrent mode
`Config Param: WRITE_CLIENT_ID` | +| [write.commit.ack.timeout](#writecommitacktimeout) | -1 | Timeout limit for a writer task after it finishes a checkpoint and waits for the instant commit success, only for internal use
`Config Param: WRITE_COMMIT_ACK_TIMEOUT` | +| [write.ignore.failed](#writeignorefailed) | false | Flag to indicate whether to ignore any non-exception error (e.g. writestatus error) within a checkpoint batch. By default false. Turning this on could hide the write status errors while the Flink checkpoint moves ahead, so users are advised to use this with caution.
`Config Param: IGNORE_FAILED` | +| [write.insert.cluster](#writeinsertcluster) | false | Whether to merge small files for insert mode, if true, the write throughput will decrease because of the read/write of existing small files, only valid for COW table, default false
`Config Param: INSERT_CLUSTER` | +| [write.log.max.size](#writelogmaxsize) | 1024 | Maximum size allowed in MB for a log file before it is rolled over to the next version, default 1GB
`Config Param: WRITE_LOG_MAX_SIZE` | +| [write.log_block.size](#writelog_blocksize) | 128 | Max log block size in MB for log file, default 128MB
`Config Param: WRITE_LOG_BLOCK_SIZE` | +| [write.merge.max_memory](#writemergemax_memory) | 100 | Max memory in MB for merge, default 100MB
`Config Param: WRITE_MERGE_MAX_MEMORY` | +| [write.parquet.block.size](#writeparquetblocksize) | 120 | Parquet RowGroup size. It's recommended to make this large enough that scan costs can be amortized by packing enough column values into a single row group.
`Config Param: WRITE_PARQUET_BLOCK_SIZE` | +| [write.parquet.page.size](#writeparquetpagesize) | 1 | Parquet page size. Page is the unit of read within a parquet file. Within a block, pages are compressed separately.
`Config Param: WRITE_PARQUET_PAGE_SIZE` | +| [write.partition.overwrite.mode](#writepartitionoverwritemode) | STATIC | When INSERT OVERWRITE a partitioned data source table, we currently support 2 modes: static and dynamic. Static mode deletes all the partitions that match the partition specification(e.g. PARTITION(a=1,b)) in the INSERT statement, before overwriting. Dynamic mode doesn't delete partitions ahead, and only overwrite those partitions that have data written into it at runtime. By default we use static mode to keep the same behavior of previous version.
`Config Param: WRITE_PARTITION_OVERWRITE_MODE` | +| [write.precombine](#writeprecombine) | false | Flag to indicate whether to drop duplicates before insert/upsert. By default these cases will accept duplicates, to gain extra performance: 1) insert operation; 2) upsert for MOR table, the MOR table deduplicate on reading
`Config Param: PRE_COMBINE` | +| [write.rate.limit](#writeratelimit) | 0 | Write record rate limit per second to prevent traffic jitter and improve stability, default 0 (no limit)
`Config Param: WRITE_RATE_LIMIT` | +| [write.retry.interval.ms](#writeretryintervalms) | 2000 | Flag to indicate how long (in milliseconds) before a retry should be issued for a failed checkpoint batch. By default 2000 and it will be doubled by every retry
`Config Param: RETRY_INTERVAL_MS` | +| [write.retry.times](#writeretrytimes) | 3 | Flag to indicate how many times streaming job should retry for a failed checkpoint batch. By default 3
`Config Param: RETRY_TIMES` | +| [write.sort.memory](#writesortmemory) | 128 | Sort memory in MB, default 128MB
`Config Param: WRITE_SORT_MEMORY` | +| [write.task.max.size](#writetaskmaxsize) | 1024.0 | Maximum memory in MB for a write task, when the threshold hits, it flushes the max size data bucket to avoid OOM, default 1GB
`Config Param: WRITE_TASK_MAX_SIZE` | +| [write.utc-timezone](#writeutc-timezone) | true | Use UTC timezone or local timezone to the conversion between epoch time and LocalDateTime. Default value is utc timezone for forward compatibility.
`Config Param: WRITE_UTC_TIMEZONE` | +--- + +## Write Client Configs {#WRITE_CLIENT} +Internally, the Hudi datasource uses a RDD based HoodieWriteClient API to actually perform writes to storage. These configs provide deep control over lower level aspects like file sizing, compression, parallelism, compaction, write schema, cleaning etc. Although Hudi provides sane defaults, from time-time these configs may need to be tweaked to optimize for specific workloads. + + +### Common Configurations {#Common-Configurations} +The following set of configurations are common across Hudi. + + + +[**Basic Configs**](#Common-Configurations-basic-configs) + + +| Config Name | Default | Description | +| ----------------------------------- | ------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.base.path](#hoodiebasepath) | (N/A) | Base path on lake storage, under which all the table data is stored. Always prefix it explicitly with the storage scheme (e.g hdfs://, s3:// etc). Hudi stores all the main meta-data about commits, savepoints, cleaning audit logs etc in .hoodie directory under this base path directory.
`Config Param: BASE_PATH` | + +[**Advanced Configs**](#Common-Configurations-advanced-configs) + + +| Config Name | Default | Description | +| ---------------------------------------------------------------------------------------- | --------- | --- | +| [as.of.instant](#asofinstant) | (N/A) | The query instant for time travel. Without specifying this option, we query the latest snapshot.
`Config Param: TIMESTAMP_AS_OF` | +| [hoodie.memory.compaction.max.size](#hoodiememorycompactionmaxsize) | (N/A) | Maximum amount of memory used in bytes for compaction operations, before spilling to local storage.
`Config Param: MAX_MEMORY_FOR_COMPACTION` | +| [hoodie.common.diskmap.compression.enabled](#hoodiecommondiskmapcompressionenabled) | true | Turn on compression for BITCASK disk map used by the External Spillable Map
`Config Param: DISK_MAP_BITCASK_COMPRESSION_ENABLED` | +| [hoodie.common.spillable.diskmap.type](#hoodiecommonspillablediskmaptype) | BITCASK | When handling input data that cannot be held in memory, to merge with a file on storage, a spillable diskmap is employed. By default, we use a persistent hashmap based loosely on bitcask, that offers O(1) inserts, lookups. Change this to `ROCKS_DB` to prefer using rocksDB, for handling the spill.
`Config Param: SPILLABLE_DISK_MAP_TYPE` | +| [hoodie.datasource.write.reconcile.schema](#hoodiedatasourcewritereconcileschema) | false | This config controls how writer's schema will be selected based on the incoming batch's schema as well as existing table's one. When schema reconciliation is DISABLED, incoming batch's schema will be picked as a writer-schema (therefore updating table's schema). When schema reconciliation is ENABLED, writer-schema will be picked such that table's schema (after txn) is either kept the same or extended, meaning that we'll always prefer the schema that either adds new columns or stays the same. This enables us, to always extend the table's schema during evolution and never lose the data (when, for ex, existing column is being dropped in a new batch)
`Config Param: RECONCILE_SCHEMA` | +| [hoodie.fs.atomic_creation.support](#hoodiefsatomic_creationsupport) | | This config is used to specify the file systems which support atomic file creation. Atomic means that an operation either succeeds and has an effect or fails and has no effect; this feature is used by FileSystemLockProvider to guarantee that only one writer can create the lock file at a time. Since some file systems do not support atomic file creation (eg: S3), FileSystemLockProvider only supports HDFS, local FS and View FS by default. If you want to use FileSystemLockProvider with other file systems, you can set this config with the FS scheme, eg: fs1,fs2
`Config Param: HOODIE_FS_ATOMIC_CREATION_SUPPORT`
`Since Version: 0.14.0` | +| [hoodie.memory.dfs.buffer.max.size](#hoodiememorydfsbuffermaxsize) | 16777216 | Property to control the max memory in bytes for dfs input stream buffer size
`Config Param: MAX_DFS_STREAM_BUFFER_SIZE` | +| [hoodie.read.timeline.holes.resolution.policy](#hoodiereadtimelineholesresolutionpolicy) | FAIL | When doing incremental queries, there could be hollow commits (requested or inflight commits that are not the latest) that are produced by concurrent writers and could lead to potential data loss. This config allows users to have different ways of handling this situation. The valid values are [FAIL, BLOCK, USE_TRANSITION_TIME]: Use `FAIL` to throw an exception when hollow commit is detected. This is helpful when hollow commits are not expected. Use `BLOCK` to block processing commits from going beyond the hollow ones. This fits the case where waiting for hollow commits to finish is acceptable. Use `USE_TRANSITION_TIME` (experimental) to query commits in range by state transition time (completion time), instead of commit time (start time). Using this mode will result in `begin.instanttime` and `end.instanttime` using `stateTransitionTime` instead of the instant's commit time.
`Config Param: INCREMENTAL_READ_HANDLE_HOLLOW_COMMIT`
`Since Version: 0.14.0` | +| [hoodie.schema.on.read.enable](#hoodieschemaonreadenable) | false | Enables support for Schema Evolution feature
`Config Param: SCHEMA_EVOLUTION_ENABLE` | +| [hoodie.write.set.null.for.missing.columns](#hoodiewritesetnullformissingcolumns) | false | When a nullable column is missing from incoming batch during a write operation, the write operation will fail schema compatibility check. Set this option to true will make the missing column be filled with null values to successfully complete the write operation.
`Config Param: SET_NULL_FOR_MISSING_COLUMNS`
`Since Version: 0.14.1` | +--- + + +### Memory Configurations {#Memory-Configurations} +Controls memory usage for compaction and merges, performed internally by Hudi. + + + +[**Advanced Configs**](#Memory-Configurations-advanced-configs) + + +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------- | ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.memory.compaction.max.size](#hoodiememorycompactionmaxsize) | (N/A) | Maximum amount of memory used in bytes for compaction operations in bytes , before spilling to local storage.
`Config Param: MAX_MEMORY_FOR_COMPACTION` | +| [hoodie.memory.spillable.map.path](#hoodiememoryspillablemappath) | (N/A) | Default file path for spillable map
`Config Param: SPILLABLE_MAP_BASE_PATH` | +| [hoodie.memory.compaction.fraction](#hoodiememorycompactionfraction) | 0.6 | HoodieCompactedLogScanner reads logblocks, converts records to HoodieRecords and then merges these log blocks and records. At any point, the number of entries in a log block can be less than or equal to the number of entries in the corresponding parquet file. This can lead to OOM in the Scanner. Hence, a spillable map helps alleviate the memory pressure. Use this config to set the max allowable inMemory footprint of the spillable map
`Config Param: MAX_MEMORY_FRACTION_FOR_COMPACTION` | +| [hoodie.memory.dfs.buffer.max.size](#hoodiememorydfsbuffermaxsize) | 16777216 | Property to control the max memory in bytes for dfs input stream buffer size
`Config Param: MAX_DFS_STREAM_BUFFER_SIZE` | +| [hoodie.memory.merge.fraction](#hoodiememorymergefraction) | 0.6 | This fraction is multiplied with the user memory fraction (1 - spark.memory.fraction) to get a final fraction of heap space to use during merge
`Config Param: MAX_MEMORY_FRACTION_FOR_MERGE` | +| [hoodie.memory.merge.max.size](#hoodiememorymergemaxsize) | 1073741824 | Maximum amount of memory used in bytes for merge operations, before spilling to local storage.
`Config Param: MAX_MEMORY_FOR_MERGE` | +| [hoodie.memory.writestatus.failure.fraction](#hoodiememorywritestatusfailurefraction) | 0.1 | Property to control what fraction of the failed records and exceptions we report back to the driver. Default is 10%. If set to 100%, with a lot of failures, this can cause memory pressure, cause OOMs and mask actual data errors.
`Config Param: WRITESTATUS_FAILURE_FRACTION` | +--- + + +### Metadata Configs {#Metadata-Configs} +Configurations used by the Hudi Metadata Table. This table maintains the metadata about a given Hudi table (e.g file listings) to avoid overhead of accessing cloud storage, during queries. + + + +[**Basic Configs**](#Metadata-Configs-basic-configs) + + +| Config Name | Default | Description | +| ---------------------------------------------------------------------------------------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.metadata.enable](#hoodiemetadataenable) | true | Enable the internal metadata table which serves table metadata like level file listings
`Config Param: ENABLE`
`Since Version: 0.7.0` | +| [hoodie.metadata.index.bloom.filter.enable](#hoodiemetadataindexbloomfilterenable) | false | Enable indexing bloom filters of user data files under metadata table. When enabled, metadata table will have a partition to store the bloom filter index and will be used during the index lookups.
`Config Param: ENABLE_METADATA_INDEX_BLOOM_FILTER`
`Since Version: 0.11.0` | +| [hoodie.metadata.index.column.stats.enable](#hoodiemetadataindexcolumnstatsenable) | false | Enable indexing column ranges of user data files under metadata table key lookups. When enabled, metadata table will have a partition to store the column ranges and will be used for pruning files during the index lookups.
`Config Param: ENABLE_METADATA_INDEX_COLUMN_STATS`
`Since Version: 0.11.0` | +| [hoodie.metadata.index.expression.enable](#hoodiemetadataindexexpressionenable) | false | Enable expression index within the metadata table. When this configuration property is enabled (`true`), the Hudi writer automatically keeps all expression indexes consistent with the data table. When disabled (`false`), all expression indexes are deleted. Note that individual expression index can only be created through a `CREATE INDEX` and deleted through a `DROP INDEX` statement in Spark SQL.
`Config Param: EXPRESSION_INDEX_ENABLE_PROP`
`Since Version: 1.0.0` | +| [hoodie.metadata.index.partition.stats.enable](#hoodiemetadataindexpartitionstatsenable) | false | Enable aggregating stats for each column at the storage partition level.
`Config Param: ENABLE_METADATA_INDEX_PARTITION_STATS`
`Since Version: 1.0.0` | +| [hoodie.metadata.index.secondary.enable](#hoodiemetadataindexsecondaryenable) | true | Enable secondary index within the metadata table. When this configuration property is enabled (`true`), the Hudi writer automatically keeps all secondary indexes consistent with the data table. When disabled (`false`), all secondary indexes are deleted. Note that individual secondary index can only be created through a `CREATE INDEX` and deleted through a `DROP INDEX` statement in Spark SQL.
`Config Param: SECONDARY_INDEX_ENABLE_PROP`
`Since Version: 1.0.0` | + +[**Advanced Configs**](#Metadata-Configs-advanced-configs) + + +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------------------------------------------------ | ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.metadata.index.bloom.filter.column.list](#hoodiemetadataindexbloomfiltercolumnlist) | (N/A) | Comma-separated list of columns for which bloom filter index will be built. If not set, only record key will be indexed.
`Config Param: BLOOM_FILTER_INDEX_FOR_COLUMNS`
`Since Version: 0.11.0` | +| [hoodie.metadata.index.column.stats.column.list](#hoodiemetadataindexcolumnstatscolumnlist) | (N/A) | Comma-separated list of columns for which column stats index will be built. If not set, all columns will be indexed
`Config Param: COLUMN_STATS_INDEX_FOR_COLUMNS`
`Since Version: 0.11.0` | +| [hoodie.metadata.index.column.stats.processing.mode.override](#hoodiemetadataindexcolumnstatsprocessingmodeoverride) | (N/A) | By default Column Stats Index automatically determines whether it should be read and processed either 'in-memory' (within the executing process) or using Spark (on a cluster), based on factors like the size of the Index and how many columns are read. This config allows overriding this behavior.
`Config Param: COLUMN_STATS_INDEX_PROCESSING_MODE_OVERRIDE`
`Since Version: 0.12.0` | +| [_hoodie.metadata.ignore.spurious.deletes](#_hoodiemetadataignorespuriousdeletes) | true | There are cases when extra files are requested to be deleted from metadata table which are never added before. This config determines how to handle such spurious deletes
`Config Param: IGNORE_SPURIOUS_DELETES`
`Since Version: 0.10.0` | +| [hoodie.file.listing.parallelism](#hoodiefilelistingparallelism) | 200 | Parallelism to use, when listing the table on lake storage.
`Config Param: FILE_LISTING_PARALLELISM_VALUE`
`Since Version: 0.7.0` | +| [hoodie.metadata.auto.initialize](#hoodiemetadataautoinitialize) | true | Initializes the metadata table by reading from the file system when the table is first created. Enabled by default. Warning: This should only be disabled when manually constructing the metadata table outside of typical Hudi writer flows.
`Config Param: AUTO_INITIALIZE`
`Since Version: 0.14.0` | +| [hoodie.metadata.compact.max.delta.commits](#hoodiemetadatacompactmaxdeltacommits) | 10 | Controls how often the metadata table is compacted.
`Config Param: COMPACT_NUM_DELTA_COMMITS`
`Since Version: 0.7.0` | +| [hoodie.metadata.dir.filter.regex](#hoodiemetadatadirfilterregex) | | Directories matching this regex, will be filtered out when initializing metadata table from lake storage for the first time.
`Config Param: DIR_FILTER_REGEX`
`Since Version: 0.7.0` | +| [hoodie.metadata.index.async](#hoodiemetadataindexasync) | false | Enable asynchronous indexing of metadata table.
`Config Param: ASYNC_INDEX_ENABLE`
`Since Version: 0.11.0` | +| [hoodie.metadata.index.bloom.filter.file.group.count](#hoodiemetadataindexbloomfilterfilegroupcount) | 4 | Metadata bloom filter index partition file group count. This controls the size of the base and log files and read parallelism in the bloom filter index partition. The recommendation is to size the file group count such that the base files are under 1GB.
`Config Param: METADATA_INDEX_BLOOM_FILTER_FILE_GROUP_COUNT`
`Since Version: 0.11.0` | +| [hoodie.metadata.index.bloom.filter.parallelism](#hoodiemetadataindexbloomfilterparallelism) | 200 | Parallelism to use for generating bloom filter index in metadata table.
`Config Param: BLOOM_FILTER_INDEX_PARALLELISM`
`Since Version: 0.11.0` | +| [hoodie.metadata.index.check.timeout.seconds](#hoodiemetadataindexchecktimeoutseconds) | 900 | After the async indexer has finished indexing up to the base instant, it will ensure that all inflight writers reliably write index updates as well. If this timeout expires, then the indexer will abort itself safely.
`Config Param: METADATA_INDEX_CHECK_TIMEOUT_SECONDS`
`Since Version: 0.11.0` | +| [hoodie.metadata.index.column.stats.file.group.count](#hoodiemetadataindexcolumnstatsfilegroupcount) | 2 | Metadata column stats partition file group count. This controls the size of the base and log files and read parallelism in the column stats index partition. The recommendation is to size the file group count such that the base files are under 1GB.
`Config Param: METADATA_INDEX_COLUMN_STATS_FILE_GROUP_COUNT`
`Since Version: 0.11.0` | +| [hoodie.metadata.index.column.stats.inMemory.projection.threshold](#hoodiemetadataindexcolumnstatsinMemoryprojectionthreshold) | 100000 | When reading Column Stats Index, if the size of the expected resulting projection is below the in-memory threshold (counted by the # of rows), it will be attempted to be loaded "in-memory" (ie not using the execution engine like Spark, Flink, etc). If the value is above the threshold execution engine will be used to compose the projection.
`Config Param: COLUMN_STATS_INDEX_IN_MEMORY_PROJECTION_THRESHOLD`
`Since Version: 0.12.0` | +| [hoodie.metadata.index.column.stats.max.columns.to.index](#hoodiemetadataindexcolumnstatsmaxcolumnstoindex) | 32 | Maximum number of columns to generate column stats for. If the config `hoodie.metadata.index.column.stats.column.list` is set, this config will be ignored. If the config `hoodie.metadata.index.column.stats.column.list` is not set, the column stats of the first `n` columns (`n` defined by this config) in the table schema are generated.
`Config Param: COLUMN_STATS_INDEX_MAX_COLUMNS`
`Since Version: 1.0.0` | +| [hoodie.metadata.index.column.stats.parallelism](#hoodiemetadataindexcolumnstatsparallelism) | 200 | Parallelism to use, when generating column stats index.
`Config Param: COLUMN_STATS_INDEX_PARALLELISM`
`Since Version: 0.11.0` | +| [hoodie.metadata.index.expression.file.group.count](#hoodiemetadataindexexpressionfilegroupcount) | 2 | Metadata expression index partition file group count.
`Config Param: EXPRESSION_INDEX_FILE_GROUP_COUNT`
`Since Version: 1.0.0` | +| [hoodie.metadata.index.expression.parallelism](#hoodiemetadataindexexpressionparallelism) | 200 | Parallelism to use, when generating expression index.
`Config Param: EXPRESSION_INDEX_PARALLELISM`
`Since Version: 1.0.0` | +| [hoodie.metadata.index.partition.stats.file.group.count](#hoodiemetadataindexpartitionstatsfilegroupcount) | 1 | Metadata partition stats file group count. This controls the size of the base and log files and read parallelism in the partition stats index.
`Config Param: METADATA_INDEX_PARTITION_STATS_FILE_GROUP_COUNT`
`Since Version: 1.0.0` | +| [hoodie.metadata.index.partition.stats.parallelism](#hoodiemetadataindexpartitionstatsparallelism) | 200 | Parallelism to use, when generating partition stats index.
`Config Param: PARTITION_STATS_INDEX_PARALLELISM`
`Since Version: 1.0.0` | +| [hoodie.metadata.index.secondary.parallelism](#hoodiemetadataindexsecondaryparallelism) | 200 | Parallelism to use, when generating secondary index.
`Config Param: SECONDARY_INDEX_PARALLELISM`
`Since Version: 1.0.0` | +| [hoodie.metadata.log.compaction.blocks.threshold](#hoodiemetadatalogcompactionblocksthreshold) | 5 | Controls the criteria to log compacted file groups in metadata table.
`Config Param: LOG_COMPACT_BLOCKS_THRESHOLD`
`Since Version: 0.14.0` | +| [hoodie.metadata.log.compaction.enable](#hoodiemetadatalogcompactionenable) | false | This config enables log compaction for the metadata table.
`Config Param: ENABLE_LOG_COMPACTION_ON_METADATA_TABLE`
`Since Version: 0.14.0` | +| [hoodie.metadata.max.deltacommits.when_pending](#hoodiemetadatamaxdeltacommitswhen_pending) | 1000 | When there is a pending instant in data table, this config limits the allowed number of deltacommits in metadata table to prevent the metadata table's timeline from growing unboundedly as compaction won't be triggered due to the pending data table instant.
`Config Param: METADATA_MAX_NUM_DELTACOMMITS_WHEN_PENDING`
`Since Version: 0.14.0` | +| [hoodie.metadata.max.init.parallelism](#hoodiemetadatamaxinitparallelism) | 100000 | Maximum parallelism to use when initializing Record Index.
`Config Param: RECORD_INDEX_MAX_PARALLELISM`
`Since Version: 0.14.0` | +| [hoodie.metadata.max.logfile.size](#hoodiemetadatamaxlogfilesize) | 2147483648 | Maximum size in bytes of a single log file. Larger log files can contain larger log blocks thereby reducing the number of blocks to search for keys
`Config Param: MAX_LOG_FILE_SIZE_BYTES_PROP`
`Since Version: 0.14.0` | +| [hoodie.metadata.max.reader.buffer.size](#hoodiemetadatamaxreaderbuffersize) | 10485760 | Max memory to use for the reader buffer while merging log blocks
`Config Param: MAX_READER_BUFFER_SIZE_PROP`
`Since Version: 0.14.0` | +| [hoodie.metadata.max.reader.memory](#hoodiemetadatamaxreadermemory) | 1073741824 | Max memory to use for the reader to read from metadata
`Config Param: MAX_READER_MEMORY_PROP`
`Since Version: 0.14.0` | +| [hoodie.metadata.metrics.enable](#hoodiemetadatametricsenable) | false | Enable publishing of metrics around metadata table.
`Config Param: METRICS_ENABLE`
`Since Version: 0.7.0` | +| [hoodie.metadata.optimized.log.blocks.scan.enable](#hoodiemetadataoptimizedlogblocksscanenable) | false | Optimized log blocks scanner that addresses all the multi-writer use-cases while appending to log files. It also differentiates original blocks written by ingestion writers and compacted blocks written by log compaction.
`Config Param: ENABLE_OPTIMIZED_LOG_BLOCKS_SCAN`
`Since Version: 0.13.0` | +| [hoodie.metadata.record.index.enable](#hoodiemetadatarecordindexenable) | false | Create the HUDI Record Index within the Metadata Table
`Config Param: RECORD_INDEX_ENABLE_PROP`
`Since Version: 0.14.0` | +| [hoodie.metadata.record.index.growth.factor](#hoodiemetadatarecordindexgrowthfactor) | 2.0 | The current number of records is multiplied by this number when estimating the number of file groups to create automatically. This helps account for growth in the number of records in the dataset.
`Config Param: RECORD_INDEX_GROWTH_FACTOR_PROP`
`Since Version: 0.14.0` | +| [hoodie.metadata.record.index.max.filegroup.count](#hoodiemetadatarecordindexmaxfilegroupcount) | 10000 | Maximum number of file groups to use for Record Index.
`Config Param: RECORD_INDEX_MAX_FILE_GROUP_COUNT_PROP`
`Since Version: 0.14.0` | +| [hoodie.metadata.record.index.max.filegroup.size](#hoodiemetadatarecordindexmaxfilegroupsize) | 1073741824 | Maximum size in bytes of a single file group. Large file group takes longer to compact.
`Config Param: RECORD_INDEX_MAX_FILE_GROUP_SIZE_BYTES_PROP`
`Since Version: 0.14.0` | +| [hoodie.metadata.record.index.min.filegroup.count](#hoodiemetadatarecordindexminfilegroupcount) | 10 | Minimum number of file groups to use for Record Index.
`Config Param: RECORD_INDEX_MIN_FILE_GROUP_COUNT_PROP`
`Since Version: 0.14.0` | +| [hoodie.metadata.spillable.map.path](#hoodiemetadataspillablemappath) | | Path on local storage to use, when keys read from metadata are held in a spillable map.
`Config Param: SPILLABLE_MAP_DIR_PROP`
`Since Version: 0.14.0` | +--- + + +### Metaserver Configs {#Metaserver-Configs} +Configurations used by the Hudi Metaserver. + + + +[**Advanced Configs**](#Metaserver-Configs-advanced-configs) + + +| Config Name | Default | Description | +| --------------------------------------------------------------------------- | ------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| [hoodie.database.name](#hoodiedatabasename) | (N/A) | Database name. If different databases have the same table name during incremental query, we can set it to limit the table name under a specific database
`Config Param: DATABASE_NAME`
`Since Version: 0.13.0` | +| [hoodie.table.name](#hoodietablename) | (N/A) | Table name that will be used for registering with Hive. Needs to be same across runs.
`Config Param: TABLE_NAME`
`Since Version: 0.13.0` | +| [hoodie.metaserver.connect.retries](#hoodiemetaserverconnectretries) | 3 | Number of retries while opening a connection to metaserver
`Config Param: METASERVER_CONNECTION_RETRIES`
`Since Version: 0.13.0` | +| [hoodie.metaserver.connect.retry.delay](#hoodiemetaserverconnectretrydelay) | 1 | Number of seconds for the client to wait between consecutive connection attempts
`Config Param: METASERVER_CONNECTION_RETRY_DELAY`
`Since Version: 0.13.0` | +| [hoodie.metaserver.enabled](#hoodiemetaserverenabled) | false | Enable Hudi metaserver for storing Hudi tables' metadata.
`Config Param: METASERVER_ENABLE`
`Since Version: 0.13.0` | +| [hoodie.metaserver.uris](#hoodiemetaserveruris) | thrift://localhost:9090 | Metaserver server uris
`Config Param: METASERVER_URLS`
`Since Version: 0.13.0` | +--- + + +### Storage Configs {#Storage-Configs} +Configurations that control aspects around writing, sizing, reading base and log files. + + + +[**Basic Configs**](#Storage-Configs-basic-configs) + + +| Config Name | Default | Description | +| ------------------------------------------------------------------ | ---------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.parquet.compression.codec](#hoodieparquetcompressioncodec) | gzip | Compression Codec for parquet files
`Config Param: PARQUET_COMPRESSION_CODEC_NAME` | +| [hoodie.parquet.max.file.size](#hoodieparquetmaxfilesize) | 125829120 | Target size in bytes for parquet files produced by Hudi write phases. For DFS, this needs to be aligned with the underlying filesystem block size for optimal performance.
`Config Param: PARQUET_MAX_FILE_SIZE` | + +[**Advanced Configs**](#Storage-Configs-advanced-configs) + + +| Config Name | Default | Description | +| ----------------------------------------------------------------------------------------- | ------------------------------------------------------------ | --- | +| [hoodie.logfile.data.block.format](#hoodielogfiledatablockformat) | (N/A) | Format of the data block within delta logs. The following formats are currently supported: "avro", "hfile", "parquet"
`Config Param: LOGFILE_DATA_BLOCK_FORMAT` | +| [hoodie.parquet.writelegacyformat.enabled](#hoodieparquetwritelegacyformatenabled) | (N/A) | Sets spark.sql.parquet.writeLegacyFormat. If true, data will be written in a way of Spark 1.4 and earlier. For example, decimal values will be written in Parquet's fixed-length byte array format which other systems such as Apache Hive and Apache Impala use. If false, the newer format in Parquet will be used. For example, decimals will be written in int-based format.
`Config Param: PARQUET_WRITE_LEGACY_FORMAT_ENABLED` | +| [hoodie.avro.write.support.class](#hoodieavrowritesupportclass) | org.apache.hudi.avro.HoodieAvroWriteSupport | Provided write support class should extend HoodieAvroWriteSupport class and it is loaded at runtime. This is only required when trying to override the existing write context.
`Config Param: HOODIE_AVRO_WRITE_SUPPORT_CLASS`
`Since Version: 0.14.0` | +| [hoodie.bloom.index.filter.dynamic.max.entries](#hoodiebloomindexfilterdynamicmaxentries) | 100000 | The threshold for the maximum number of keys to record in a dynamic Bloom filter row. Only applies if filter type is BloomFilterTypeCode.DYNAMIC_V0.
`Config Param: BLOOM_FILTER_DYNAMIC_MAX_ENTRIES` | +| [hoodie.bloom.index.filter.type](#hoodiebloomindexfiltertype) | DYNAMIC_V0 | org.apache.hudi.common.bloom.BloomFilterTypeCode: Filter type used by Bloom filter. SIMPLE: Bloom filter that is based on the configured size. DYNAMIC_V0(default): Bloom filter that is auto sized based on number of keys.
`Config Param: BLOOM_FILTER_TYPE` | +| [hoodie.hfile.block.size](#hoodiehfileblocksize) | 1048576 | Lower values increase the size in bytes of metadata tracked within HFile, but can offer potentially faster lookup times.
`Config Param: HFILE_BLOCK_SIZE` | +| [hoodie.hfile.compression.algorithm](#hoodiehfilecompressionalgorithm) | GZ | Compression codec to use for hfile base files.
`Config Param: HFILE_COMPRESSION_ALGORITHM_NAME` | +| [hoodie.hfile.max.file.size](#hoodiehfilemaxfilesize) | 125829120 | Target file size in bytes for HFile base files.
`Config Param: HFILE_MAX_FILE_SIZE` | +| [hoodie.index.bloom.fpp](#hoodieindexbloomfpp) | 0.000000001 | Only applies if index type is BLOOM. Error rate allowed given the number of entries. This is used to calculate how many bits should be assigned for the bloom filter and the number of hash functions. This is usually set very low (default: 0.000000001), we like to tradeoff disk space for lower false positives. If the number of entries added to bloom filter exceeds the configured value (hoodie.index.bloom.num_entries), then this fpp may not be honored.
`Config Param: BLOOM_FILTER_FPP_VALUE` | +| [hoodie.index.bloom.num_entries](#hoodieindexbloomnum_entries) | 60000 | Only applies if index type is BLOOM. This is the number of entries to be stored in the bloom filter. The rationale for the default: Assume the maxParquetFileSize is 128MB and averageRecordSize is 1kb and hence we approximate a total of 130K records in a file. The default (60000) is roughly half of this approximation. Warning: Setting this very low will generate a lot of false positives, and index lookup will have to scan a lot more files than it has to; setting this to a very high number will increase the size of every base file linearly (roughly 4KB for every 50000 entries). This config is also used with the DYNAMIC bloom filter, where it determines the initial size of the bloom filter.
`Config Param: BLOOM_FILTER_NUM_ENTRIES_VALUE` | +| [hoodie.io.factory.class](#hoodieiofactoryclass) | org.apache.hudi.io.hadoop.HoodieHadoopIOFactory | The fully-qualified class name of the factory class to return readers and writers of files used by Hudi. The provided class should implement `org.apache.hudi.io.storage.HoodieIOFactory`.
`Config Param: HOODIE_IO_FACTORY_CLASS`
`Since Version: 0.15.0` | +| [hoodie.logfile.data.block.max.size](#hoodielogfiledatablockmaxsize) | 268435456 | LogFile Data block max size in bytes. This is the maximum size allowed for a single data block to be appended to a log file. This helps to make sure the data appended to the log file is broken up into sizable blocks to prevent OOM errors. This size should be smaller than the JVM memory.
`Config Param: LOGFILE_DATA_BLOCK_MAX_SIZE` | +| [hoodie.logfile.max.size](#hoodielogfilemaxsize) | 1073741824 | LogFile max size in bytes. This is the maximum size allowed for a log file before it is rolled over to the next version.
`Config Param: LOGFILE_MAX_SIZE` | +| [hoodie.logfile.to.parquet.compression.ratio](#hoodielogfiletoparquetcompressionratio) | 0.35 | Expected additional compression as records move from log files to parquet. Used for merge_on_read table to send inserts into log files & control the size of compacted parquet file.
`Config Param: LOGFILE_TO_PARQUET_COMPRESSION_RATIO_FRACTION` | +| [hoodie.orc.block.size](#hoodieorcblocksize) | 125829120 | ORC block size, recommended to be aligned with the target file size.
`Config Param: ORC_BLOCK_SIZE` | +| [hoodie.orc.compression.codec](#hoodieorccompressioncodec) | ZLIB | Compression codec to use for ORC base files.
`Config Param: ORC_COMPRESSION_CODEC_NAME` | +| [hoodie.orc.max.file.size](#hoodieorcmaxfilesize) | 125829120 | Target file size in bytes for ORC base files.
`Config Param: ORC_FILE_MAX_SIZE` | +| [hoodie.orc.stripe.size](#hoodieorcstripesize) | 67108864 | Size of the memory buffer in bytes for writing
`Config Param: ORC_STRIPE_SIZE` | +| [hoodie.parquet.block.size](#hoodieparquetblocksize) | 125829120 | Parquet RowGroup size in bytes. It's recommended to make this large enough that scan costs can be amortized by packing enough column values into a single row group.
`Config Param: PARQUET_BLOCK_SIZE` | +| [hoodie.parquet.bloom.filter.enabled](#hoodieparquetbloomfilterenabled) | true | Control whether to write bloom filter or not. Default true. We can set to false in non bloom index cases for CPU resource saving.
`Config Param: PARQUET_WITH_BLOOM_FILTER_ENABLED`
`Since Version: 0.15.0` | +| [hoodie.parquet.compression.ratio](#hoodieparquetcompressionratio) | 0.1 | Expected compression of parquet data used by Hudi, when it tries to size new parquet files. Increase this value, if bulk_insert is producing smaller than expected sized files
`Config Param: PARQUET_COMPRESSION_RATIO_FRACTION` | +| [hoodie.parquet.dictionary.enabled](#hoodieparquetdictionaryenabled) | true | Whether to use dictionary encoding
`Config Param: PARQUET_DICTIONARY_ENABLED` | +| [hoodie.parquet.field_id.write.enabled](#hoodieparquetfield_idwriteenabled) | true | Would only be effective with Spark 3.3+. Sets spark.sql.parquet.fieldId.write.enabled. If enabled, Spark will write out parquet native field ids that are stored inside StructField's metadata as parquet.field.id to parquet files.
`Config Param: PARQUET_FIELD_ID_WRITE_ENABLED`
`Since Version: 0.12.0` | +| [hoodie.parquet.outputtimestamptype](#hoodieparquetoutputtimestamptype) | TIMESTAMP_MICROS | Sets spark.sql.parquet.outputTimestampType. Parquet timestamp type to use when Spark writes data to Parquet files.
`Config Param: PARQUET_OUTPUT_TIMESTAMP_TYPE` | +| [hoodie.parquet.page.size](#hoodieparquetpagesize) | 1048576 | Parquet page size in bytes. Page is the unit of read within a parquet file. Within a block, pages are compressed separately.
`Config Param: PARQUET_PAGE_SIZE` | +| [hoodie.parquet.spark.row.write.support.class](#hoodieparquetsparkrowwritesupportclass) | org.apache.hudi.io.storage.row.HoodieRowParquetWriteSupport | Provided write support class should extend HoodieRowParquetWriteSupport class and it is loaded at runtime. This is only required when trying to override the existing write context when `hoodie.datasource.write.row.writer.enable=true`.
`Config Param: HOODIE_PARQUET_SPARK_ROW_WRITE_SUPPORT_CLASS`
`Since Version: 0.15.0` | +| [hoodie.storage.class](#hoodiestorageclass) | org.apache.hudi.storage.hadoop.HoodieHadoopStorage | The fully-qualified class name of the `HoodieStorage` implementation class to instantiate. The provided class should implement `org.apache.hudi.storage.HoodieStorage`
`Config Param: HOODIE_STORAGE_CLASS`
`Since Version: 0.15.0` | +--- + + +### Consistency Guard Configurations {#Consistency-Guard-Configurations} +The consistency guard related config options, to help talk to eventually consistent object storage.(Tip: S3 is NOT eventually consistent anymore!) + + + +[**Advanced Configs**](#Consistency-Guard-Configurations-advanced-configs) + + +| Config Name | Default | Description | +| --------------------------------------------------------------------------------------------------- | ------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [_hoodie.optimistic.consistency.guard.enable](#_hoodieoptimisticconsistencyguardenable) | false | Enable consistency guard, which optimistically assumes consistency is achieved after a certain time period.
`Config Param: OPTIMISTIC_CONSISTENCY_GUARD_ENABLE`
`Since Version: 0.6.0` | +| [hoodie.consistency.check.enabled](#hoodieconsistencycheckenabled) | false | Enable to handle S3 eventual consistency issues. This property is no longer required since S3 is now strongly consistent. Will be removed in future releases.
`Config Param: ENABLE`
`Since Version: 0.5.0`
`Deprecated since: 0.7.0` | +| [hoodie.consistency.check.initial_interval_ms](#hoodieconsistencycheckinitial_interval_ms) | 400 | Amount of time (in ms) to wait, before checking for consistency after an operation on storage.
`Config Param: INITIAL_CHECK_INTERVAL_MS`
`Since Version: 0.5.0`
`Deprecated since: 0.7.0` | +| [hoodie.consistency.check.max_checks](#hoodieconsistencycheckmax_checks) | 6 | Maximum number of consistency checks to perform, with exponential backoff.
`Config Param: MAX_CHECKS`
`Since Version: 0.5.0`
`Deprecated since: 0.7.0` | +| [hoodie.consistency.check.max_interval_ms](#hoodieconsistencycheckmax_interval_ms) | 20000 | Maximum amount of time (in ms), to wait for consistency checking.
`Config Param: MAX_CHECK_INTERVAL_MS`
`Since Version: 0.5.0`
`Deprecated since: 0.7.0` | +| [hoodie.optimistic.consistency.guard.sleep_time_ms](#hoodieoptimisticconsistencyguardsleep_time_ms) | 500 | Amount of time (in ms), to wait after which we assume storage is consistent.
`Config Param: OPTIMISTIC_CONSISTENCY_GUARD_SLEEP_TIME_MS`
`Since Version: 0.6.0` | +--- + + +### FileSystem Guard Configurations {#FileSystem-Guard-Configurations} +The filesystem retry related config options, to help deal with runtime exception like list/get/put/delete performance issues. + + + +[**Advanced Configs**](#FileSystem-Guard-Configurations-advanced-configs) + + +| Config Name | Default | Description | +| ----------------------------------------------------------------------------------------------------------- | ------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.filesystem.operation.retry.enable](#hoodiefilesystemoperationretryenable) | false | Enabled to handle list/get/delete etc file system performance issue.
`Config Param: FILESYSTEM_RETRY_ENABLE`
`Since Version: 0.11.0` | +| [hoodie.filesystem.operation.retry.exceptions](#hoodiefilesystemoperationretryexceptions) | | The class names of the exceptions that need to be retried, separated by commas. Default is empty, which means retry all the IOException and RuntimeException from FileSystem
`Config Param: RETRY_EXCEPTIONS`
`Since Version: 0.11.0` | +| [hoodie.filesystem.operation.retry.initial_interval_ms](#hoodiefilesystemoperationretryinitial_interval_ms) | 100 | Amount of time (in ms) to wait before retrying operations on storage.
`Config Param: INITIAL_RETRY_INTERVAL_MS`
`Since Version: 0.11.0` | +| [hoodie.filesystem.operation.retry.max_interval_ms](#hoodiefilesystemoperationretrymax_interval_ms) | 2000 | Maximum amount of time (in ms), to wait for next retry.
`Config Param: MAX_RETRY_INTERVAL_MS`
`Since Version: 0.11.0` | +| [hoodie.filesystem.operation.retry.max_numbers](#hoodiefilesystemoperationretrymax_numbers) | 4 | Maximum number of retry actions to perform, with exponential backoff.
`Config Param: MAX_RETRY_NUMBERS`
`Since Version: 0.11.0` | +--- + + +### File System View Storage Configurations {#File-System-View-Storage-Configurations} +Configurations that control how file metadata is stored by Hudi, for transaction processing and queries. + + + +[**Advanced Configs**](#File-System-View-Storage-Configurations-advanced-configs) + + +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------------------------------------------------- | ----------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.filesystem.remote.backup.view.enable](#hoodiefilesystemremotebackupviewenable) | true | Config to control whether backup needs to be configured if clients were not able to reach timeline service.
`Config Param: REMOTE_BACKUP_VIEW_ENABLE` | +| [hoodie.filesystem.view.incr.timeline.sync.enable](#hoodiefilesystemviewincrtimelinesyncenable) | false | Controls whether or not, the file system view is incrementally updated as new actions are performed on the timeline.
`Config Param: INCREMENTAL_TIMELINE_SYNC_ENABLE` | +| [hoodie.filesystem.view.remote.host](#hoodiefilesystemviewremotehost) | localhost | We expect this to be rarely hand configured.
`Config Param: REMOTE_HOST_NAME` | +| [hoodie.filesystem.view.remote.port](#hoodiefilesystemviewremoteport) | 26754 | Port to serve file system view queries, when remote. We expect this to be rarely hand configured.
`Config Param: REMOTE_PORT_NUM` | +| [hoodie.filesystem.view.remote.retry.enable](#hoodiefilesystemviewremoteretryenable) | false | Whether to enable API request retry for remote file system view.
`Config Param: REMOTE_RETRY_ENABLE`
`Since Version: 0.12.1` | +| [hoodie.filesystem.view.remote.retry.exceptions](#hoodiefilesystemviewremoteretryexceptions) | | The class names of the exceptions that need to be retried, separated by commas. Default is empty, which means retry all the IOException and RuntimeException from Remote Request.
`Config Param: RETRY_EXCEPTIONS`
`Since Version: 0.12.1` | +| [hoodie.filesystem.view.remote.retry.initial_interval_ms](#hoodiefilesystemviewremoteretryinitial_interval_ms) | 100 | Amount of time (in ms) to wait before retrying operations on storage.
`Config Param: REMOTE_INITIAL_RETRY_INTERVAL_MS`
`Since Version: 0.12.1` | +| [hoodie.filesystem.view.remote.retry.max_interval_ms](#hoodiefilesystemviewremoteretrymax_interval_ms) | 2000 | Maximum amount of time (in ms), to wait for next retry.
`Config Param: REMOTE_MAX_RETRY_INTERVAL_MS`
`Since Version: 0.12.1` | +| [hoodie.filesystem.view.remote.retry.max_numbers](#hoodiefilesystemviewremoteretrymax_numbers) | 3 | Maximum number of retries for API requests against a remote file system view, e.g. the timeline server.
`Config Param: REMOTE_MAX_RETRY_NUMBERS`
`Since Version: 0.12.1` | +| [hoodie.filesystem.view.remote.timeout.secs](#hoodiefilesystemviewremotetimeoutsecs) | 300 | Timeout in seconds to wait for API requests against a remote file system view, e.g. the timeline server.
`Config Param: REMOTE_TIMEOUT_SECS` | +| [hoodie.filesystem.view.rocksdb.base.path](#hoodiefilesystemviewrocksdbbasepath) | /tmp/hoodie_timeline_rocksdb | Path on local storage to use, when storing file system view in embedded kv store/rocksdb.
`Config Param: ROCKSDB_BASE_PATH` | +| [hoodie.filesystem.view.secondary.type](#hoodiefilesystemviewsecondarytype) | MEMORY | Specifies the secondary form of storage for file system view, if the primary (e.g timeline server) is unavailable.
`Config Param: SECONDARY_VIEW_TYPE` | +| [hoodie.filesystem.view.spillable.bootstrap.base.file.mem.fraction](#hoodiefilesystemviewspillablebootstrapbasefilememfraction) | 0.05 | Fraction of the file system view memory, to be used for holding mapping to bootstrap base files.
`Config Param: BOOTSTRAP_BASE_FILE_MEM_FRACTION` | +| [hoodie.filesystem.view.spillable.clustering.mem.fraction](#hoodiefilesystemviewspillableclusteringmemfraction) | 0.02 | Fraction of the file system view memory, to be used for holding clustering related metadata.
`Config Param: SPILLABLE_CLUSTERING_MEM_FRACTION` | +| [hoodie.filesystem.view.spillable.compaction.mem.fraction](#hoodiefilesystemviewspillablecompactionmemfraction) | 0.1 | Fraction of the file system view memory, to be used for holding compaction related metadata.
`Config Param: SPILLABLE_COMPACTION_MEM_FRACTION` | +| [hoodie.filesystem.view.spillable.dir](#hoodiefilesystemviewspillabledir) | /tmp/ | Path on local storage to use, when file system view is held in a spillable map.
`Config Param: SPILLABLE_DIR` | +| [hoodie.filesystem.view.spillable.log.compaction.mem.fraction](#hoodiefilesystemviewspillablelogcompactionmemfraction) | 0.02 | Fraction of the file system view memory, to be used for holding log compaction related metadata.
`Config Param: SPILLABLE_LOG_COMPACTION_MEM_FRACTION`
`Since Version: 0.13.0` | +| [hoodie.filesystem.view.spillable.mem](#hoodiefilesystemviewspillablemem) | 104857600 | Amount of memory to be used in bytes for holding file system view, before spilling to disk.
`Config Param: SPILLABLE_MEMORY` | +| [hoodie.filesystem.view.spillable.replaced.mem.fraction](#hoodiefilesystemviewspillablereplacedmemfraction) | 0.05 | Fraction of the file system view memory, to be used for holding replace commit related metadata.
`Config Param: SPILLABLE_REPLACED_MEM_FRACTION` | +| [hoodie.filesystem.view.type](#hoodiefilesystemviewtype) | MEMORY | File system view provides APIs for viewing the files on the underlying lake storage, as file groups and file slices. This config controls how such a view is held. Options include MEMORY,SPILLABLE_DISK,EMBEDDED_KV_STORE,REMOTE_ONLY,REMOTE_FIRST which provide different trade offs for memory usage and API request performance.
`Config Param: VIEW_TYPE` | +--- + + +### Archival Configs {#Archival-Configs} +Configurations that control archival. + + + +[**Basic Configs**](#Archival-Configs-basic-configs) + + +| Config Name | Default | Description | +| ------------------------------------------------ | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| [hoodie.keep.max.commits](#hoodiekeepmaxcommits) | 30 | Archiving service moves older entries from timeline into an archived log after each write, to keep the metadata overhead constant, even as the table size grows. This config controls the maximum number of instants to retain in the active timeline.
`Config Param: MAX_COMMITS_TO_KEEP` | +| [hoodie.keep.min.commits](#hoodiekeepmincommits) | 20 | Similar to hoodie.keep.max.commits, but controls the minimum number of instants to retain in the active timeline.
`Config Param: MIN_COMMITS_TO_KEEP` | + +[**Advanced Configs**](#Archival-Configs-advanced-configs) + + +| Config Name | Default | Description | +| --------------------------------------------------------------------------- | ------- | --- | +| [hoodie.archive.async](#hoodiearchiveasync) | false | Only applies when hoodie.archive.automatic is turned on. When turned on, runs the archiver asynchronously with writing, which can speed up overall write performance.
`Config Param: ASYNC_ARCHIVE`
`Since Version: 0.11.0` | +| [hoodie.archive.automatic](#hoodiearchiveautomatic) | true | When enabled, the archival table service is invoked immediately after each commit, to archive commits if we cross a maximum value of commits. It's recommended to enable this, to ensure number of active commits is bounded.
`Config Param: AUTO_ARCHIVE` | +| [hoodie.archive.beyond.savepoint](#hoodiearchivebeyondsavepoint) | false | If enabled, archival will proceed beyond savepoint, skipping savepoint commits. If disabled, archival will stop at the earliest savepoint commit.
`Config Param: ARCHIVE_BEYOND_SAVEPOINT`
`Since Version: 0.12.0` | +| [hoodie.archive.delete.parallelism](#hoodiearchivedeleteparallelism) | 100 | When performing archival operation, Hudi needs to delete the files of the archived instants in the active timeline in .hoodie folder. The file deletion also happens after merging small archived files into larger ones if enabled. This config limits the Spark parallelism for deleting files in both cases, i.e., parallelism of deleting files does not go above the configured value and the parallelism is the number of files to delete if smaller than the configured value. If you see that the file deletion in archival operation is slow because of the limited parallelism, you can increase this to tune the performance.
`Config Param: DELETE_ARCHIVED_INSTANT_PARALLELISM_VALUE` | +| [hoodie.commits.archival.batch](#hoodiecommitsarchivalbatch) | 10 | Archiving of instants is batched in best-effort manner, to pack more instants into a single archive log. This config controls such archival batch size.
`Config Param: COMMITS_ARCHIVAL_BATCH_SIZE` | +| [hoodie.timeline.compaction.batch.size](#hoodietimelinecompactionbatchsize) | 10 | The number of small files to compact at once.
`Config Param: TIMELINE_COMPACTION_BATCH_SIZE` | +--- + + +### Bootstrap Configs {#Bootstrap-Configs} +Configurations that control how you want to bootstrap your existing tables for the first time into hudi. The bootstrap operation can flexibly avoid copying data over before you can use Hudi and support running the existing writers and new hudi writers in parallel, to validate the migration. + + + +[**Basic Configs**](#Bootstrap-Configs-basic-configs) + + +| Config Name | Default | Description | +| ------------------------------------------------------ | ------- | --------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.bootstrap.base.path](#hoodiebootstrapbasepath) | (N/A) | Base path of the dataset that needs to be bootstrapped as a Hudi table
`Config Param: BASE_PATH`
`Since Version: 0.6.0` | + +[**Advanced Configs**](#Bootstrap-Configs-advanced-configs) + + +| Config Name | Default | Description | +| ----------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------- | --- | +| [hoodie.bootstrap.data.queries.only](#hoodiebootstrapdataqueriesonly) | false | Improves query performance, but queries cannot use Hudi metadata fields
`Config Param: DATA_QUERIES_ONLY`
`Since Version: 0.14.0` | +| [hoodie.bootstrap.full.input.provider](#hoodiebootstrapfullinputprovider) | org.apache.hudi.bootstrap.SparkParquetBootstrapDataProvider | Class to use for reading the bootstrap dataset partitions/files, for Bootstrap mode FULL_RECORD
`Config Param: FULL_BOOTSTRAP_INPUT_PROVIDER_CLASS_NAME`
`Since Version: 0.6.0` | +| [hoodie.bootstrap.index.class](#hoodiebootstrapindexclass) | org.apache.hudi.common.bootstrap.index.hfile.HFileBootstrapIndex | Implementation to use, for mapping a skeleton base file to a bootstrap base file.
`Config Param: INDEX_CLASS_NAME`
`Since Version: 0.6.0` | +| [hoodie.bootstrap.mode.selector](#hoodiebootstrapmodeselector) | org.apache.hudi.client.bootstrap.selector.MetadataOnlyBootstrapModeSelector | Selects the mode in which each file/partition in the bootstrapped dataset gets bootstrapped
`Config Param: MODE_SELECTOR_CLASS_NAME`
`Since Version: 0.6.0` | +| [hoodie.bootstrap.mode.selector.regex](#hoodiebootstrapmodeselectorregex) | .* | Matches each bootstrap dataset partition against this regex and applies the mode below to it.
`Config Param: PARTITION_SELECTOR_REGEX_PATTERN`
`Since Version: 0.6.0` | +| [hoodie.bootstrap.mode.selector.regex.mode](#hoodiebootstrapmodeselectorregexmode) | METADATA_ONLY | org.apache.hudi.client.bootstrap.BootstrapMode: Bootstrap mode for importing an existing table into Hudi FULL_RECORD: In this mode, the full record data is copied into hudi and metadata columns are added. A full record bootstrap is functionally equivalent to a bulk-insert. After a full record bootstrap, Hudi will function properly even if the original table is modified or deleted. METADATA_ONLY(default): In this mode, the full record data is not copied into Hudi therefore it avoids full cost of rewriting the dataset. Instead, 'skeleton' files containing just the corresponding metadata columns are added to the Hudi table. Hudi relies on the data in the original table and will face data-loss or corruption if files in the original table location are deleted or modified.
`Config Param: PARTITION_SELECTOR_REGEX_MODE`
`Since Version: 0.6.0` | +| [hoodie.bootstrap.parallelism](#hoodiebootstrapparallelism) | 1500 | For metadata-only bootstrap, Hudi parallelizes the operation so that each table partition is handled by one Spark task. This config limits the parallelism. We pick the configured parallelism if the number of table partitions is larger than this configured value. The parallelism is set to the number of table partitions if it is smaller than the configured value. For full-record bootstrap, i.e., BULK_INSERT operation of the records, this configured value is passed as the BULK_INSERT shuffle parallelism (`hoodie.bulkinsert.shuffle.parallelism`), determining the BULK_INSERT write behavior. If you see that the bootstrap is slow due to the limited parallelism, you can increase this.
`Config Param: PARALLELISM_VALUE`
`Since Version: 0.6.0` | +| [hoodie.bootstrap.partitionpath.translator.class](#hoodiebootstrappartitionpathtranslatorclass) | org.apache.hudi.client.bootstrap.translator.IdentityBootstrapPartitionPathTranslator | Translates the partition paths from the bootstrapped data into how it is laid out as a Hudi table.
`Config Param: PARTITION_PATH_TRANSLATOR_CLASS_NAME`
`Since Version: 0.6.0` | +--- + + +### Clean Configs {#Clean-Configs} +Cleaning (reclamation of older/unused file groups/slices). + + + +[**Basic Configs**](#Clean-Configs-basic-configs) + + +| Config Name | Default | Description | +| ------------------------------------------------------------ | ------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.clean.async.enabled](#hoodiecleanasyncenabled) | false | Only applies when hoodie.clean.automatic is turned on. When turned on runs cleaner async with writing, which can speed up overall write performance.
`Config Param: ASYNC_CLEAN` | +| [hoodie.clean.commits.retained](#hoodiecleancommitsretained) | 10 | When KEEP_LATEST_COMMITS cleaning policy is used, the number of commits to retain, without cleaning. This will be retained for num_of_commits * time_between_commits (scheduled). This also directly translates into how much data retention the table supports for incremental queries.
`Config Param: CLEANER_COMMITS_RETAINED` | + +[**Advanced Configs**](#Clean-Configs-advanced-configs) + + +| Config Name | Default | Description | +| ------------------------------------------------------------------------------ | -------------------- || +| [hoodie.clean.automatic](#hoodiecleanautomatic) | true | When enabled, the cleaner table service is invoked immediately after each commit, to delete older file slices. It's recommended to enable this, to ensure metadata and data storage growth is bounded.
`Config Param: AUTO_CLEAN` | +| [hoodie.clean.delete.bootstrap.base.file](#hoodiecleandeletebootstrapbasefile) | false | When set to true, cleaner also deletes the bootstrap base file when its skeleton base file is cleaned. Turn this to true, if you want to ensure the bootstrap dataset storage is reclaimed over time, as the table receives updates/deletes. Another reason to turn this on, would be to ensure data residing in bootstrap base files is also physically deleted, to comply with data privacy enforcement processes.
`Config Param: CLEANER_BOOTSTRAP_BASE_FILE_ENABLE` | +| [hoodie.clean.failed.writes.policy](#hoodiecleanfailedwritespolicy) | EAGER | org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy: Policy that controls how to clean up failed writes. Hudi will delete any files written by failed writes to re-claim space. EAGER(default): Clean failed writes inline after every write operation. LAZY: Clean failed writes lazily after heartbeat timeout when the cleaning service runs. This policy is required when multi-writers are enabled. NEVER: Never clean failed writes.
`Config Param: FAILED_WRITES_CLEANER_POLICY` | +| [hoodie.clean.fileversions.retained](#hoodiecleanfileversionsretained) | 3 | When KEEP_LATEST_FILE_VERSIONS cleaning policy is used, the minimum number of file slices to retain in each file group, during cleaning.
`Config Param: CLEANER_FILE_VERSIONS_RETAINED` | +| [hoodie.clean.hours.retained](#hoodiecleanhoursretained) | 24 | When KEEP_LATEST_BY_HOURS cleaning policy is used, the number of hours for which commits need to be retained. This config provides a more flexible option as compared to number of commits retained for cleaning service. Setting this property ensures all the files, but the latest in a file group, corresponding to commits with commit times older than the configured number of hours to be retained are cleaned.
`Config Param: CLEANER_HOURS_RETAINED` | +| [hoodie.clean.incremental.enabled](#hoodiecleanincrementalenabled) | true | When enabled, the plans for each cleaner service run is computed incrementally off the events in the timeline, since the last cleaner run. This is much more efficient than obtaining listings for the full table for each planning (even with a metadata table).
`Config Param: CLEANER_INCREMENTAL_MODE_ENABLE` | +| [hoodie.clean.multiple.enabled](#hoodiecleanmultipleenabled) | false | Allows scheduling/executing multiple cleans by enabling this config. If users prefer to strictly ensure clean requests should be mutually exclusive, i.e., a 2nd clean will not be scheduled if another clean is not yet completed, to avoid repeated cleaning of the same files, they might want to disable this config.
`Config Param: ALLOW_MULTIPLE_CLEANS`
`Since Version: 0.11.0`
`Deprecated since: 0.15.0` | +| [hoodie.clean.parallelism](#hoodiecleanparallelism) | 200 | This config controls the behavior of both the cleaning plan and cleaning execution. Deriving the cleaning plan is parallelized at the table partition level, i.e., each table partition is processed by one Spark task to figure out the files to clean. The cleaner picks the configured parallelism if the number of table partitions is larger than this configured value. The parallelism is assigned to the number of table partitions if it is smaller than the configured value. The clean execution, i.e., the file deletion, is parallelized at file level, which is the unit of Spark task distribution. Similarly, the actual parallelism cannot exceed the configured value if the number of files is larger. If cleaning plan or execution is slow due to limited parallelism, you can increase this to tune the performance.
`Config Param: CLEANER_PARALLELISM_VALUE` | +| [hoodie.clean.policy](#hoodiecleanpolicy) | KEEP_LATEST_COMMITS | org.apache.hudi.common.model.HoodieCleaningPolicy: Cleaning policy to be used. The cleaner service deletes older file slices files to re-claim space. Long running query plans may often refer to older file slices and will break if those are cleaned, before the query has had a chance to run. So, it is good to make sure that the data is retained for more than the maximum query execution time. By default, the cleaning policy is determined based on one of the following configs explicitly set by the user (at most one of them can be set; otherwise, KEEP_LATEST_COMMITS cleaning policy is used). KEEP_LATEST_FILE_VERSIONS: keeps the last N versions of the file slices written; used when "hoodie.clean.fileversions.retained" is explicitly set only. KEEP_LATEST_COMMITS(default): keeps the file slices written by the last N commits; used when "hoodie.clean.commits.retained" is explicitly set only. KEEP_LATEST_BY_HOURS: keeps the file slices written in the last N hours based on the commit time; used when "hoodie.clean.hours.retained" is explicitly set only.
`Config Param: CLEANER_POLICY` | +| [hoodie.clean.trigger.max.commits](#hoodiecleantriggermaxcommits) | 1 | Number of commits after the last clean operation, before scheduling of a new clean is attempted.
`Config Param: CLEAN_MAX_COMMITS` | +| [hoodie.clean.trigger.strategy](#hoodiecleantriggerstrategy) | NUM_COMMITS | org.apache.hudi.table.action.clean.CleaningTriggerStrategy: Controls when cleaning is scheduled. NUM_COMMITS(default): Trigger the cleaning service every N commits, determined by `hoodie.clean.trigger.max.commits`.
`Config Param: CLEAN_TRIGGER_STRATEGY` | +--- + + +### Clustering Configs {#Clustering-Configs} +Configurations that control the clustering table service in hudi, which optimizes the storage layout for better query performance by sorting and sizing data files. + + + +[**Basic Configs**](#Clustering-Configs-basic-configs) + + +| Config Name | Default | Description | +| -------------------------------------------------------------------------------------------------------- | ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| [hoodie.clustering.async.enabled](#hoodieclusteringasyncenabled) | false | Enable running of clustering service, asynchronously as inserts happen on the table.
`Config Param: ASYNC_CLUSTERING_ENABLE`
`Since Version: 0.7.0` | +| [hoodie.clustering.inline](#hoodieclusteringinline) | false | Turn on inline clustering - clustering will be run after each write operation is complete
`Config Param: INLINE_CLUSTERING`
`Since Version: 0.7.0` | +| [hoodie.clustering.plan.strategy.small.file.limit](#hoodieclusteringplanstrategysmallfilelimit) | 314572800 | Files smaller than the size in bytes specified here are candidates for clustering
`Config Param: PLAN_STRATEGY_SMALL_FILE_LIMIT`
`Since Version: 0.7.0` | +| [hoodie.clustering.plan.strategy.target.file.max.bytes](#hoodieclusteringplanstrategytargetfilemaxbytes) | 1073741824 | Each group can produce 'N' (CLUSTERING_MAX_GROUP_SIZE/CLUSTERING_TARGET_FILE_SIZE) output file groups
`Config Param: PLAN_STRATEGY_TARGET_FILE_MAX_BYTES`
`Since Version: 0.7.0` | + +[**Advanced Configs**](#Clustering-Configs-advanced-configs) + + +| Config Name | Default | Description | +| ----------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------- || +| [hoodie.clustering.plan.strategy.cluster.begin.partition](#hoodieclusteringplanstrategyclusterbeginpartition) | (N/A) | Begin partition used to filter partition (inclusive), only effective when the filter mode 'hoodie.clustering.plan.partition.filter.mode' is SELECTED_PARTITIONS
`Config Param: PARTITION_FILTER_BEGIN_PARTITION`
`Since Version: 0.11.0` | +| [hoodie.clustering.plan.strategy.cluster.end.partition](#hoodieclusteringplanstrategyclusterendpartition) | (N/A) | End partition used to filter partition (inclusive), only effective when the filter mode 'hoodie.clustering.plan.partition.filter.mode' is SELECTED_PARTITIONS
`Config Param: PARTITION_FILTER_END_PARTITION`
`Since Version: 0.11.0` | +| [hoodie.clustering.plan.strategy.partition.regex.pattern](#hoodieclusteringplanstrategypartitionregexpattern) | (N/A) | Filter clustering partitions that matched regex pattern
`Config Param: PARTITION_REGEX_PATTERN`
`Since Version: 0.11.0` | +| [hoodie.clustering.plan.strategy.partition.selected](#hoodieclusteringplanstrategypartitionselected) | (N/A) | Partitions to run clustering
`Config Param: PARTITION_SELECTED`
`Since Version: 0.11.0` | +| [hoodie.clustering.plan.strategy.sort.columns](#hoodieclusteringplanstrategysortcolumns) | (N/A) | Columns to sort the data by when clustering
`Config Param: PLAN_STRATEGY_SORT_COLUMNS`
`Since Version: 0.7.0` | +| [hoodie.clustering.async.max.commits](#hoodieclusteringasyncmaxcommits) | 4 | Config to control frequency of async clustering
`Config Param: ASYNC_CLUSTERING_MAX_COMMITS`
`Since Version: 0.9.0` | +| [hoodie.clustering.execution.strategy.class](#hoodieclusteringexecutionstrategyclass) | org.apache.hudi.client.clustering.run.strategy.SparkSortAndSizeExecutionStrategy | Config to provide a strategy class (subclass of RunClusteringStrategy) to define how the clustering plan is executed. By default, we sort the file groups in the plan by the specified columns, while meeting the configured target file sizes.
`Config Param: EXECUTION_STRATEGY_CLASS_NAME`
`Since Version: 0.7.0` | +| [hoodie.clustering.group.read.parallelism](#hoodieclusteringgroupreadparallelism) | 20 | Maximum parallelism when Spark reads records from a clustering group.
`Config Param: CLUSTERING_GROUP_READ_PARALLELISM`
`Since Version: 1.0.0` | +| [hoodie.clustering.inline.max.commits](#hoodieclusteringinlinemaxcommits) | 4 | Config to control frequency of clustering planning
`Config Param: INLINE_CLUSTERING_MAX_COMMITS`
`Since Version: 0.7.0` | +| [hoodie.clustering.max.parallelism](#hoodieclusteringmaxparallelism) | 15 | Maximum number of parallelism jobs submitted in clustering operation. If the resource is sufficient(Like Spark engine has enough idle executors), increasing this value will let the clustering job run faster, while it will give additional pressure to the execution engines to manage more concurrent running jobs.
`Config Param: CLUSTERING_MAX_PARALLELISM`
`Since Version: 0.14.0` | +| [hoodie.clustering.plan.partition.filter.mode](#hoodieclusteringplanpartitionfiltermode) | NONE | org.apache.hudi.table.action.cluster.ClusteringPlanPartitionFilterMode: Partition filter mode used in the creation of clustering plan. NONE(default): Do not filter partitions. The clustering plan will include all partitions that have clustering candidates. RECENT_DAYS: This filter assumes that your data is partitioned by date. The clustering plan will only include partitions from K days ago to N days ago, where K >= N. K is determined by `hoodie.clustering.plan.strategy.daybased.lookback.partitions` and N is determined by `hoodie.clustering.plan.strategy.daybased.skipfromlatest.partitions`. SELECTED_PARTITIONS: The clustering plan will include only partition paths with names that sort within the inclusive range [`hoodie.clustering.plan.strategy.cluster.begin.partition`, `hoodie.clustering.plan.strategy.cluster.end.partition`]. DAY_ROLLING: To determine the partitions in the clustering plan, the eligible partitions will be sorted in ascending order. Each partition will have an index i in that list. The clustering plan will only contain partitions such that i mod 24 = H, where H is the current hour of the day (from 0 to 23).
`Config Param: PLAN_PARTITION_FILTER_MODE_NAME`
`Since Version: 0.11.0` | +| [hoodie.clustering.plan.strategy.class](#hoodieclusteringplanstrategyclass) | org.apache.hudi.client.clustering.plan.strategy.SparkSizeBasedClusteringPlanStrategy | Config to provide a strategy class (subclass of ClusteringPlanStrategy) to create clustering plan i.e select what file groups are being clustered. Default strategy, looks at the clustering small file size limit (determined by hoodie.clustering.plan.strategy.small.file.limit) to pick the small file slices within partitions for clustering.
`Config Param: PLAN_STRATEGY_CLASS_NAME`
`Since Version: 0.7.0` | +| [hoodie.clustering.plan.strategy.daybased.lookback.partitions](#hoodieclusteringplanstrategydaybasedlookbackpartitions) | 2 | Number of partitions to list to create ClusteringPlan
`Config Param: DAYBASED_LOOKBACK_PARTITIONS`
`Since Version: 0.7.0` | +| [hoodie.clustering.plan.strategy.daybased.skipfromlatest.partitions](#hoodieclusteringplanstrategydaybasedskipfromlatestpartitions) | 0 | Number of partitions to skip from latest when choosing partitions to create ClusteringPlan
`Config Param: PLAN_STRATEGY_SKIP_PARTITIONS_FROM_LATEST`
`Since Version: 0.9.0` | +| [hoodie.clustering.plan.strategy.max.bytes.per.group](#hoodieclusteringplanstrategymaxbytespergroup) | 2147483648 | Each clustering operation can create multiple output file groups. Total amount of data processed by clustering operation is defined by below two properties (CLUSTERING_MAX_BYTES_PER_GROUP * CLUSTERING_MAX_NUM_GROUPS). Max amount of data to be included in one group
`Config Param: PLAN_STRATEGY_MAX_BYTES_PER_OUTPUT_FILEGROUP`
`Since Version: 0.7.0` | +| [hoodie.clustering.plan.strategy.max.num.groups](#hoodieclusteringplanstrategymaxnumgroups) | 30 | Maximum number of groups to create as part of ClusteringPlan. Increasing groups will increase parallelism
`Config Param: PLAN_STRATEGY_MAX_GROUPS`
`Since Version: 0.7.0` | +| [hoodie.clustering.plan.strategy.single.group.clustering.enabled](#hoodieclusteringplanstrategysinglegroupclusteringenabled) | true | Whether to generate clustering plan when there is only one file group involved, by default true
`Config Param: PLAN_STRATEGY_SINGLE_GROUP_CLUSTERING_ENABLED`
`Since Version: 0.14.0` | +| [hoodie.clustering.rollback.pending.replacecommit.on.conflict](#hoodieclusteringrollbackpendingreplacecommitonconflict) | false | If updates are allowed to file groups pending clustering, then set this config to rollback failed or pending clustering instants. Pending clustering will be rolled back ONLY IF there is conflict between incoming upsert and filegroup to be clustered. Please exercise caution while setting this config, especially when clustering is done very frequently. This could lead to race condition in rare scenarios, for example, when the clustering completes after instants are fetched but before rollback completed.
`Config Param: ROLLBACK_PENDING_CLUSTERING_ON_CONFLICT`
`Since Version: 0.10.0` | +| [hoodie.clustering.schedule.inline](#hoodieclusteringscheduleinline) | false | When set to true, clustering service will be attempted for inline scheduling after each write. Users have to ensure they have a separate job to run async clustering(execution) for the one scheduled by this writer. Users can choose to set both `hoodie.clustering.inline` and `hoodie.clustering.schedule.inline` to false and have both scheduling and execution triggered by any async process, on which case `hoodie.clustering.async.enabled` is expected to be set to true. But if `hoodie.clustering.inline` is set to false, and `hoodie.clustering.schedule.inline` is set to true, regular writers will schedule clustering inline, but users are expected to trigger async job for execution. If `hoodie.clustering.inline` is set to true, regular writers will do both scheduling and execution inline for clustering
`Config Param: SCHEDULE_INLINE_CLUSTERING` | +| [hoodie.clustering.updates.strategy](#hoodieclusteringupdatesstrategy) | org.apache.hudi.client.clustering.update.strategy.SparkRejectUpdateStrategy | Determines how to handle updates, deletes to file groups that are under clustering. Default strategy just rejects the update
`Config Param: UPDATES_STRATEGY`
`Since Version: 0.7.0` | +| [hoodie.layout.optimize.build.curve.sample.size](#hoodielayoutoptimizebuildcurvesamplesize) | 200000 | Determines target sample size used by the Boundary-based Interleaved Index method of building space-filling curve. Larger sample size entails better layout optimization outcomes, at the expense of higher memory footprint.
`Config Param: LAYOUT_OPTIMIZE_BUILD_CURVE_SAMPLE_SIZE`
`Since Version: 0.10.0` | +| [hoodie.layout.optimize.curve.build.method](#hoodielayoutoptimizecurvebuildmethod) | DIRECT | org.apache.hudi.config.HoodieClusteringConfig$SpatialCurveCompositionStrategyType: This configuration only has effect if `hoodie.layout.optimize.strategy` is set to either "z-order" or "hilbert" (i.e. leveraging space-filling curves). This configuration controls the type of a strategy to use for building the space-filling curves, tackling specifically how the Strings are ordered based on the curve. Since we truncate the String to 8 bytes for ordering, there are two issues: (1) it can lead to poor aggregation effect, (2) the truncation of String longer than 8 bytes loses the precision, if the Strings are different but the 8-byte prefix is the same. The boundary-based interleaved index method ("SAMPLE") has better generalization, solving the two problems above, but is slower than direct method ("DIRECT"). User should benchmark the write and query performance before tweaking this in production, if this is actually a problem. Please refer to RFC-28 for more details. DIRECT(default): This strategy builds the spatial curve in full, filling in all of the individual points corresponding to each individual record, which requires less compute. SAMPLE: This strategy leverages the boundary-based interleaved index method (described in more detail in Amazon DynamoDB blog https://aws.amazon.com/cn/blogs/database/tag/z-order/) and produces a better layout compared to DIRECT strategy. It requires more compute and is slower.
`Config Param: LAYOUT_OPTIMIZE_SPATIAL_CURVE_BUILD_METHOD`
`Since Version: 0.10.0` | +| [hoodie.layout.optimize.data.skipping.enable](#hoodielayoutoptimizedataskippingenable) | true | Enable data skipping by collecting statistics once layout optimization is complete.
`Config Param: LAYOUT_OPTIMIZE_DATA_SKIPPING_ENABLE`
`Since Version: 0.10.0`
`Deprecated since: 0.11.0` | +| [hoodie.layout.optimize.enable](#hoodielayoutoptimizeenable) | false | This setting has no effect. Please refer to clustering configuration, as well as LAYOUT_OPTIMIZE_STRATEGY config to enable advanced record layout optimization strategies
`Config Param: LAYOUT_OPTIMIZE_ENABLE`
`Since Version: 0.10.0`
`Deprecated since: 0.11.0` | +| [hoodie.layout.optimize.strategy](#hoodielayoutoptimizestrategy) | LINEAR | org.apache.hudi.config.HoodieClusteringConfig$LayoutOptimizationStrategy: Determines ordering strategy for records layout optimization. LINEAR(default): Orders records lexicographically ZORDER: Orders records along Z-order spatial-curve. HILBERT: Orders records along Hilbert's spatial-curve.
`Config Param: LAYOUT_OPTIMIZE_STRATEGY`
`Since Version: 0.10.0` | +--- + + +### Compaction Configs {#Compaction-Configs} +Configurations that control compaction (merging of log files onto a new base files). + + + +[**Basic Configs**](#Compaction-Configs-basic-configs) + + +| Config Name | Default | Description | +| ------------------------------------------------------------------------------ | ------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.compact.inline](#hoodiecompactinline) | false | When set to true, compaction service is triggered after each write. While being simpler operationally, this adds extra latency on the write path.
`Config Param: INLINE_COMPACT` | +| [hoodie.compact.inline.max.delta.commits](#hoodiecompactinlinemaxdeltacommits) | 5 | Number of delta commits after the last compaction, before scheduling of a new compaction is attempted. This config takes effect only for the compaction triggering strategy based on the number of commits, i.e., NUM_COMMITS, NUM_COMMITS_AFTER_LAST_REQUEST, NUM_AND_TIME, and NUM_OR_TIME.
`Config Param: INLINE_COMPACT_NUM_DELTA_COMMITS` | + +[**Advanced Configs**](#Compaction-Configs-advanced-configs) + + +| Config Name | Default | Description | +| ----------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------- || +| [hoodie.compaction.partition.path.regex](#hoodiecompactionpartitionpathregex) | (N/A) | Used to specify the partition path regex for compaction. Only partitions that match the regex will be compacted. Only be used when configure PartitionRegexBasedCompactionStrategy.
`Config Param: COMPACTION_SPECIFY_PARTITION_PATH_REGEX` | +| [hoodie.compact.inline.max.delta.seconds](#hoodiecompactinlinemaxdeltaseconds) | 3600 | Number of elapsed seconds after the last compaction, before scheduling a new one. This config takes effect only for the compaction triggering strategy based on the elapsed time, i.e., TIME_ELAPSED, NUM_AND_TIME, and NUM_OR_TIME.
`Config Param: INLINE_COMPACT_TIME_DELTA_SECONDS` | +| [hoodie.compact.inline.trigger.strategy](#hoodiecompactinlinetriggerstrategy) | NUM_COMMITS | org.apache.hudi.table.action.compact.CompactionTriggerStrategy: Controls when compaction is scheduled. NUM_COMMITS(default): triggers compaction when there are at least N delta commits after last completed compaction. NUM_COMMITS_AFTER_LAST_REQUEST: triggers compaction when there are at least N delta commits after last completed or requested compaction. TIME_ELAPSED: triggers compaction after N seconds since last compaction. NUM_AND_TIME: triggers compaction when both there are at least N delta commits and N seconds elapsed (both must be satisfied) after last completed compaction. NUM_OR_TIME: triggers compaction when there are at least N delta commits or N seconds elapsed (either condition is satisfied) after last completed compaction.
`Config Param: INLINE_COMPACT_TRIGGER_STRATEGY` | +| [hoodie.compact.schedule.inline](#hoodiecompactscheduleinline) | false | When set to true, compaction service will be attempted for inline scheduling after each write. Users have to ensure they have a separate job to run async compaction(execution) for the one scheduled by this writer. Users can choose to set both `hoodie.compact.inline` and `hoodie.compact.schedule.inline` to false and have both scheduling and execution triggered by any async process. But if `hoodie.compact.inline` is set to false, and `hoodie.compact.schedule.inline` is set to true, regular writers will schedule compaction inline, but users are expected to trigger async job for execution. If `hoodie.compact.inline` is set to true, regular writers will do both scheduling and execution inline for compaction
`Config Param: SCHEDULE_INLINE_COMPACT` | +| [hoodie.compaction.daybased.target.partitions](#hoodiecompactiondaybasedtargetpartitions) | 10 | Used by org.apache.hudi.io.compact.strategy.DayBasedCompactionStrategy to denote the number of latest partitions to compact during a compaction run.
`Config Param: TARGET_PARTITIONS_PER_DAYBASED_COMPACTION` | +| [hoodie.compaction.logfile.num.threshold](#hoodiecompactionlogfilenumthreshold) | 0 | Only if the log file num is greater than the threshold, the file group will be compacted.
`Config Param: COMPACTION_LOG_FILE_NUM_THRESHOLD`
`Since Version: 0.13.0` | +| [hoodie.compaction.logfile.size.threshold](#hoodiecompactionlogfilesizethreshold) | 0 | Only if the log file size is greater than the threshold in bytes, the file group will be compacted.
`Config Param: COMPACTION_LOG_FILE_SIZE_THRESHOLD` | +| [hoodie.compaction.strategy](#hoodiecompactionstrategy) | org.apache.hudi.table.action.compact.strategy.LogFileSizeBasedCompactionStrategy | Compaction strategy decides which file groups are picked up for compaction during each compaction run. By default, Hudi picks the log file with the most accumulated unmerged data. The strategy can be composed with multiple strategies by concatenating the class names with ','.
`Config Param: COMPACTION_STRATEGY` | +| [hoodie.compaction.target.io](#hoodiecompactiontargetio) | 512000 | Amount of MBs to spend during compaction run for the LogFileSizeBasedCompactionStrategy. This value helps bound ingestion latency while compaction is run in inline mode.
`Config Param: TARGET_IO_PER_COMPACTION_IN_MB` | +| [hoodie.copyonwrite.insert.auto.split](#hoodiecopyonwriteinsertautosplit) | true | Config to control whether we control insert split sizes automatically based on average record sizes. It's recommended to keep this turned on, since hand tuning is otherwise extremely cumbersome.
`Config Param: COPY_ON_WRITE_AUTO_SPLIT_INSERTS` | +| [hoodie.copyonwrite.insert.split.size](#hoodiecopyonwriteinsertsplitsize) | 500000 | Number of inserts assigned for each partition/bucket for writing. We based the default on writing out 100MB files, with at least 1kb records (100K records per file), and over provision to 500K. As long as auto-tuning of splits is turned on, this only affects the first write, where there is no history to learn record sizes from.
`Config Param: COPY_ON_WRITE_INSERT_SPLIT_SIZE` | +| [hoodie.copyonwrite.record.size.estimate](#hoodiecopyonwriterecordsizeestimate) | 1024 | The average record size. If not explicitly specified, hudi will compute the record size estimate dynamically based on commit metadata. This is critical in computing the insert parallelism and bin-packing inserts into small files.
`Config Param: COPY_ON_WRITE_RECORD_SIZE_ESTIMATE` | +| [hoodie.log.compaction.blocks.threshold](#hoodielogcompactionblocksthreshold) | 5 | Log compaction can be scheduled if the no. of log blocks crosses this threshold value. This is effective only when log compaction is enabled via hoodie.log.compaction.inline
`Config Param: LOG_COMPACTION_BLOCKS_THRESHOLD`
`Since Version: 0.13.0` | +| [hoodie.log.compaction.enable](#hoodielogcompactionenable) | false | By enabling log compaction through this config, log compaction will also get enabled for the metadata table.
`Config Param: ENABLE_LOG_COMPACTION`
`Since Version: 0.14.0` | +| [hoodie.log.compaction.inline](#hoodielogcompactioninline) | false | When set to true, logcompaction service is triggered after each write. While being simpler operationally, this adds extra latency on the write path.
`Config Param: INLINE_LOG_COMPACT`
`Since Version: 0.13.0` | +| [hoodie.parquet.small.file.limit](#hoodieparquetsmallfilelimit) | 104857600 | During upsert operation, we opportunistically expand existing small files on storage, instead of writing new files, to keep number of files to an optimum. This config sets the file size limit below which a file on storage becomes a candidate to be selected as such a `small file`. By default, treat any file <= 100MB as a small file. Also note that if this is set <= 0, Hudi will not try to get small files and will directly write new files
`Config Param: PARQUET_SMALL_FILE_LIMIT` | +| [hoodie.record.size.estimation.threshold](#hoodierecordsizeestimationthreshold) | 1.0 | We use the previous commits' metadata to calculate the estimated record size and use it to bin pack records into partitions. If the previous commit is too small to make an accurate estimation, Hudi will search commits in the reverse order, until we find a commit that has totalBytesWritten larger than (PARQUET_SMALL_FILE_LIMIT_BYTES * this_threshold)
`Config Param: RECORD_SIZE_ESTIMATION_THRESHOLD` | +--- + + +### Error table Configs {#Error-table-Configs} +Configurations that are required for Error table configs + + + +[**Basic Configs**](#Error-table-Configs-basic-configs) + + +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------------------- | ---------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.errortable.base.path](#hoodieerrortablebasepath) | (N/A) | Base path for error table under which all error records would be stored.
`Config Param: ERROR_TABLE_BASE_PATH` | +| [hoodie.errortable.target.table.name](#hoodieerrortabletargettablename) | (N/A) | Table name to be used for the error table
`Config Param: ERROR_TARGET_TABLE` | +| [hoodie.errortable.write.class](#hoodieerrortablewriteclass) | (N/A) | Class which handles the error table writes. This config is used to configure a custom implementation for Error Table Writer. Specify the full class name of the custom error table writer as a value for this config
`Config Param: ERROR_TABLE_WRITE_CLASS` | +| [hoodie.errortable.enable](#hoodieerrortableenable) | false | Config to enable error table. If the config is enabled, all the records with processing error in DeltaStreamer are transferred to error table.
`Config Param: ERROR_TABLE_ENABLED` | +| [hoodie.errortable.insert.shuffle.parallelism](#hoodieerrortableinsertshuffleparallelism) | 200 | Config to set insert shuffle parallelism. The config is similar to hoodie.insert.shuffle.parallelism config but applies to the error table.
`Config Param: ERROR_TABLE_INSERT_PARALLELISM_VALUE` | +| [hoodie.errortable.upsert.shuffle.parallelism](#hoodieerrortableupsertshuffleparallelism) | 200 | Config to set upsert shuffle parallelism. The config is similar to hoodie.upsert.shuffle.parallelism config but applies to the error table.
`Config Param: ERROR_TABLE_UPSERT_PARALLELISM_VALUE` | +| [hoodie.errortable.validate.recordcreation.enable](#hoodieerrortablevalidaterecordcreationenable) | true | Records that fail to be created due to keygeneration failure or other issues will be sent to the Error Table
`Config Param: ERROR_ENABLE_VALIDATE_RECORD_CREATION`
`Since Version: 0.15.0` | +| [hoodie.errortable.validate.targetschema.enable](#hoodieerrortablevalidatetargetschemaenable) | false | Records with schema mismatch with Target Schema are sent to Error Table.
`Config Param: ERROR_ENABLE_VALIDATE_TARGET_SCHEMA` | +| [hoodie.errortable.write.failure.strategy](#hoodieerrortablewritefailurestrategy) | ROLLBACK_COMMIT | The config specifies the failure strategy if error table write fails. Use one of - [ROLLBACK_COMMIT (Rollback the corresponding base table write commit for which the error events were triggered) , LOG_ERROR (Error is logged but the base table write succeeds) ]
`Config Param: ERROR_TABLE_WRITE_FAILURE_STRATEGY` | +--- + + +### Layout Configs {#Layout-Configs} +Configurations that control storage layout and data distribution, which defines how the files are organized within a table. + + + +[**Advanced Configs**](#Layout-Configs-advanced-configs) + + +| Config Name | Default | Description | +| ------------------------------------------------------------------------------- | -------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.storage.layout.partitioner.class](#hoodiestoragelayoutpartitionerclass) | (N/A) | Partitioner class, it is used to distribute data in a specific way.
`Config Param: LAYOUT_PARTITIONER_CLASS_NAME` | +| [hoodie.storage.layout.type](#hoodiestoragelayouttype) | DEFAULT | org.apache.hudi.table.storage.HoodieStorageLayout$LayoutType: Determines how the files are organized within a table. DEFAULT(default): Each file group contains records of a certain set of keys, without particular grouping criteria. BUCKET: Each file group contains records of a set of keys which map to a certain range of hash values, so that using the hash function can easily identify the file group a record belongs to, based on the record key.
`Config Param: LAYOUT_TYPE` | +--- + + +### TTL management Configs {#TTL-management-Configs} +Data ttl management + + + +[**Advanced Configs**](#TTL-management-Configs-advanced-configs) + + +| Config Name | Default | Description | +| ----------------------------------------------------------------------------------------------------- | ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| [hoodie.partition.ttl.strategy.class](#hoodiepartitionttlstrategyclass) | (N/A) | Config to provide a strategy class (subclass of PartitionTTLStrategy) to get the expired partitions
`Config Param: PARTITION_TTL_STRATEGY_CLASS_NAME`
`Since Version: 1.0.0` | +| [hoodie.partition.ttl.strategy.partition.selected](#hoodiepartitionttlstrategypartitionselected) | (N/A) | Partitions to manage ttl
`Config Param: PARTITION_SELECTED`
`Since Version: 1.0.0` | +| [hoodie.partition.ttl.inline](#hoodiepartitionttlinline) | false | When enabled, the partition ttl management service is invoked immediately after each commit, to delete expired partitions
`Config Param: INLINE_PARTITION_TTL`
`Since Version: 1.0.0` | +| [hoodie.partition.ttl.management.strategy.type](#hoodiepartitionttlmanagementstrategytype) | KEEP_BY_TIME | Partition ttl management strategy type to determine the strategy class
`Config Param: PARTITION_TTL_STRATEGY_TYPE`
`Since Version: 1.0.0` | +| [hoodie.partition.ttl.strategy.days.retain](#hoodiepartitionttlstrategydaysretain) | -1 | Partition ttl management KEEP_BY_TIME strategy days retain
`Config Param: DAYS_RETAIN`
`Since Version: 1.0.0` | +| [hoodie.partition.ttl.strategy.max.delete.partitions](#hoodiepartitionttlstrategymaxdeletepartitions) | 1000 | max partitions to delete in partition ttl management
`Config Param: MAX_PARTITION_TO_DELETE`
`Since Version: 1.0.0` | +--- + + +### Write Configurations {#Write-Configurations} +Configurations that control write behavior on Hudi tables. These can be directly passed down from even higher level frameworks (e.g Spark datasources, Flink sink) and utilities (e.g Hudi Streamer). + + + +[**Basic Configs**](#Write-Configurations-basic-configs) + + +| Config Name | Default | Description | +| ---------------------------------------------------------------------------------------------------------------------------------------------- | -------------------- | ----------- | +| [hoodie.base.path](#hoodiebasepath) | (N/A) | Base path on lake storage, under which all the table data is stored. Always prefix it explicitly with the storage scheme (e.g hdfs://, s3:// etc). Hudi stores all the main meta-data about commits, savepoints, cleaning audit logs etc in .hoodie directory under this base path directory.
`Config Param: BASE_PATH` | +| [hoodie.table.name](#hoodietablename) | (N/A) | Table name that will be used for registering with metastores like HMS. Needs to be same across runs.
`Config Param: TBL_NAME` | +| [hoodie.datasource.write.precombine.field](#hoodiedatasourcewriteprecombinefield) | ts | Field used in preCombining before actual write. When two records have the same key value, we will pick the one with the largest value for the precombine field, determined by Object.compareTo(..)
`Config Param: PRECOMBINE_FIELD_NAME` | +| [hoodie.instant_state.timeline_server_based.enabled](#hoodieinstant_statetimeline_server_basedenabled) | false | If enabled, writers get instant state from timeline server rather than requesting DFS directly
`Config Param: INSTANT_STATE_TIMELINE_SERVER_BASED`
`Since Version: 1.0.0` | +| [hoodie.instant_state.timeline_server_based.force_refresh.request.number](#hoodieinstant_statetimeline_server_basedforce_refreshrequestnumber) | 100 | Number of requests to trigger instant state cache refreshing
`Config Param: INSTANT_STATE_TIMELINE_SERVER_BASED_FORCE_REFRESH_REQUEST_NUMBER`
`Since Version: 1.0.0` | +| [hoodie.write.auto.upgrade](#hoodiewriteautoupgrade) | true | If enabled, writers automatically migrate the table to the specified write table version if the current table version is lower.
`Config Param: AUTO_UPGRADE_VERSION`
`Since Version: 1.0.0` | +| [hoodie.write.concurrency.mode](#hoodiewriteconcurrencymode) | SINGLE_WRITER | org.apache.hudi.common.model.WriteConcurrencyMode: Concurrency modes for write operations. SINGLE_WRITER(default): Only one active writer to the table. Maximizes throughput. OPTIMISTIC_CONCURRENCY_CONTROL: Multiple writers can operate on the table with lazy conflict resolution using locks. This means that only one writer succeeds if multiple writers write to the same file group. NON_BLOCKING_CONCURRENCY_CONTROL: Multiple writers can operate on the table with non-blocking conflict resolution. The writers can write into the same file group with the conflicts resolved automatically by the query reader and the compactor.
`Config Param: WRITE_CONCURRENCY_MODE` | +| [hoodie.write.record.merge.mode](#hoodiewriterecordmergemode) | EVENT_TIME_ORDERING | org.apache.hudi.common.config.RecordMergeMode: Determines the logic of merging updates COMMIT_TIME_ORDERING: Using transaction time to merge records, i.e., the record from later transaction overwrites the earlier record with the same key. EVENT_TIME_ORDERING(default): Using event time as the ordering to merge records, i.e., the record with the larger event time overwrites the record with the smaller event time on the same key, regardless of transaction time. The event time or preCombine field needs to be specified by the user. CUSTOM: Using custom merging logic specified by the user.
`Config Param: RECORD_MERGE_MODE`
`Since Version: 1.0.0` | +| [hoodie.write.table.version](#hoodiewritetableversion) | 8 | The table version this writer is storing the table in. This should match the current table version.
`Config Param: WRITE_TABLE_VERSION`
`Since Version: 1.0.0` | + +[**Advanced Configs**](#Write-Configurations-advanced-configs) + + +| Config Name | Default | Description | +| --------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------- | ----------- | +| [hoodie.avro.schema](#hoodieavroschema) | (N/A) | Schema string representing the current write schema of the table. Hudi passes this to implementations of HoodieRecordPayload to convert incoming records to avro. This is also used as the write schema evolving records during an update.
`Config Param: AVRO_SCHEMA_STRING` | +| [hoodie.bulkinsert.user.defined.partitioner.class](#hoodiebulkinsertuserdefinedpartitionerclass) | (N/A) | If specified, this class will be used to re-partition records before they are bulk inserted. This can be used to sort, pack, cluster data optimally for common query patterns. For now we support a built-in user defined bulkinsert partitioner org.apache.hudi.execution.bulkinsert.RDDCustomColumnsSortPartitioner which sorts based on specified column values set by hoodie.bulkinsert.user.defined.partitioner.sort.columns
`Config Param: BULKINSERT_USER_DEFINED_PARTITIONER_CLASS_NAME` | +| [hoodie.bulkinsert.user.defined.partitioner.sort.columns](#hoodiebulkinsertuserdefinedpartitionersortcolumns) | (N/A) | Columns to sort the data by when using org.apache.hudi.execution.bulkinsert.RDDCustomColumnsSortPartitioner as the user defined partitioner during bulk_insert. For example 'column1,column2'
`Config Param: BULKINSERT_USER_DEFINED_PARTITIONER_SORT_COLUMNS` | +| [hoodie.datasource.write.keygenerator.class](#hoodiedatasourcewritekeygeneratorclass) | (N/A) | Key generator class, that implements `org.apache.hudi.keygen.KeyGenerator` to extract a key out of incoming records.
`Config Param: KEYGENERATOR_CLASS_NAME` | +| [hoodie.datasource.write.payload.class](#hoodiedatasourcewritepayloadclass) | (N/A) | Payload class used. Override this, if you like to roll your own merge logic, when upserting/inserting. This will render any value set for PRECOMBINE_FIELD_OPT_VAL ineffective
`Config Param: WRITE_PAYLOAD_CLASS_NAME` | +| [hoodie.internal.schema](#hoodieinternalschema) | (N/A) | Schema string representing the latest schema of the table. Hudi passes this to implementations of evolution of schema
`Config Param: INTERNAL_SCHEMA_STRING` | +| [hoodie.write.record.merge.custom.implementation.classes](#hoodiewriterecordmergecustomimplementationclasses) | (N/A) | List of HoodieMerger implementations constituting Hudi's merging strategy -- based on the engine used. These record merge impls are filtered by `hoodie.write.record.merge.strategy.id`. Hudi will pick the most efficient implementation to perform merging/combining of the records (during update, reading MOR table, etc)
`Config Param: RECORD_MERGE_IMPL_CLASSES`
`Since Version: 0.13.0` | +| [hoodie.write.record.merge.strategy.id](#hoodiewriterecordmergestrategyid) | (N/A) | ID of record merge strategy. Hudi will pick HoodieRecordMerger implementations in `hoodie.write.record.merge.custom.implementation.classes` which has the same merge strategy id
`Config Param: RECORD_MERGE_STRATEGY_ID`
`Since Version: 0.13.0` | +| [hoodie.write.schema](#hoodiewriteschema) | (N/A) | Config allowing to override writer's schema. This might be necessary in cases when writer's schema derived from the incoming dataset might actually be different from the schema we actually want to use when writing. This, for ex, could be the case for 'partial-update' use-cases (like `MERGE INTO` Spark SQL statement for ex) where only a projection of the incoming dataset might be used to update the records in the existing table, prompting us to override the writer's schema
`Config Param: WRITE_SCHEMA_OVERRIDE` | +| [_.hoodie.allow.multi.write.on.same.instant](#_hoodieallowmultiwriteonsameinstant) | false |
`Config Param: ALLOW_MULTI_WRITE_ON_SAME_INSTANT_ENABLE` | +| [hoodie.allow.empty.commit](#hoodieallowemptycommit) | true | Whether to allow generation of empty commits, even if no data was written in the commit. It's useful in cases where extra metadata needs to be published regardless e.g tracking source offsets when ingesting data
`Config Param: ALLOW_EMPTY_COMMIT` | +| [hoodie.allow.operation.metadata.field](#hoodieallowoperationmetadatafield) | false | Whether to include '_hoodie_operation' in the metadata fields. Once enabled, all the changes of a record are persisted to the delta log directly without merge
`Config Param: ALLOW_OPERATION_METADATA_FIELD`
`Since Version: 0.9.0` | +| [hoodie.auto.adjust.lock.configs](#hoodieautoadjustlockconfigs) | false | Auto adjust lock configurations when metadata table is enabled and for async table services.
`Config Param: AUTO_ADJUST_LOCK_CONFIGS`
`Since Version: 0.11.0` | +| [hoodie.auto.commit](#hoodieautocommit) | true | Controls whether a write operation should auto commit. This can be turned off to perform inspection of the uncommitted write before deciding to commit.
`Config Param: AUTO_COMMIT_ENABLE` | +| [hoodie.avro.schema.external.transformation](#hoodieavroschemaexternaltransformation) | false | When enabled, records in older schema are rewritten into newer schema during upsert, delete and background compaction, clustering operations.
`Config Param: AVRO_EXTERNAL_SCHEMA_TRANSFORMATION_ENABLE` | +| [hoodie.avro.schema.validate](#hoodieavroschemavalidate) | false | Validate the schema used for the write against the latest schema, for backwards compatibility.
`Config Param: AVRO_SCHEMA_VALIDATE_ENABLE` | +| [hoodie.base.file.format](#hoodiebasefileformat) | PARQUET | File format to store all the base file data. org.apache.hudi.common.model.HoodieFileFormat: Hoodie file formats. PARQUET(default): Apache Parquet is an open source, column-oriented data file format designed for efficient data storage and retrieval. It provides efficient data compression and encoding schemes with enhanced performance to handle complex data in bulk. HFILE: (internal config) File format for metadata table. A file of sorted key/value pairs. Both keys and values are byte arrays. ORC: The Optimized Row Columnar (ORC) file format provides a highly efficient way to store Hive data. It was designed to overcome limitations of the other Hive file formats. Using ORC files improves performance when Hive is reading, writing, and processing data.
`Config Param: BASE_FILE_FORMAT` | +| [hoodie.bulkinsert.shuffle.parallelism](#hoodiebulkinsertshuffleparallelism) | 0 | For large initial imports using bulk_insert operation, controls the parallelism to use for sort modes or custom partitioning done before writing records to the table. Before 0.13.0 release, if users do not configure it, Hudi would use 200 as the default shuffle parallelism. From 0.13.0 onwards Hudi by default automatically uses the parallelism deduced by Spark based on the source data or the parallelism based on the logical plan for row writer. If the shuffle parallelism is explicitly configured by the user, the user-configured parallelism is used in defining the actual parallelism. If you observe small files from the bulk insert operation, we suggest configuring this shuffle parallelism explicitly, so that the parallelism is around total_input_data_size/120MB.
`Config Param: BULKINSERT_PARALLELISM_VALUE` | +| [hoodie.bulkinsert.sort.mode](#hoodiebulkinsertsortmode) | NONE | org.apache.hudi.execution.bulkinsert.BulkInsertSortMode: Modes for sorting records during bulk insert. NONE(default): No sorting. Fastest and matches `spark.write.parquet()` in number of files and overhead. GLOBAL_SORT: This ensures best file sizes, with lowest memory overhead at cost of sorting. PARTITION_SORT: Strikes a balance by only sorting within a Spark RDD partition, still keeping the memory overhead of writing low. File sizing is not as good as GLOBAL_SORT. PARTITION_PATH_REPARTITION: This ensures that the data for a single physical partition in the table is written by the same Spark executor. This should only be used when input data is evenly distributed across different partition paths. If data is skewed (most records are intended for a handful of partition paths among all) then this can cause an imbalance among Spark executors. PARTITION_PATH_REPARTITION_AND_SORT: This ensures that the data for a single physical partition in the table is written by the same Spark executor. This should only be used when input data is evenly distributed across different partition paths. Compared to PARTITION_PATH_REPARTITION, this sort mode does an additional step of sorting the records based on the partition path within a single Spark partition, given that data for multiple physical partitions can be sent to the same Spark partition and executor. If data is skewed (most records are intended for a handful of partition paths among all) then this can cause an imbalance among Spark executors.
`Config Param: BULK_INSERT_SORT_MODE` | +| [hoodie.bulkinsert.sort.suffix.record_key](#hoodiebulkinsertsortsuffixrecord_key) | false | When using user defined sort columns there can be possibility of skew because spark's RangePartitioner used in sort can reduce the number of outputSparkPartitions if the sampled dataset has a low cardinality on the provided sort columns. This can cause an increase in commit durations as we are not leveraging the original parallelism. Enabling this config suffixes the record key at the end to avoid skew. This config is used by RowCustomColumnsSortPartitioner, RDDCustomColumnsSortPartitioner and JavaCustomColumnsSortPartitioner
`Config Param: BULKINSERT_SUFFIX_RECORD_KEY_SORT_COLUMNS`
`Since Version: 1.0.0` | +| [hoodie.client.heartbeat.interval_in_ms](#hoodieclientheartbeatinterval_in_ms) | 60000 | Writers perform heartbeats to indicate liveness. Controls how often (in ms), such heartbeats are registered to lake storage.
`Config Param: CLIENT_HEARTBEAT_INTERVAL_IN_MS` | +| [hoodie.client.heartbeat.tolerable.misses](#hoodieclientheartbeattolerablemisses) | 2 | Number of heartbeat misses, before a writer is deemed not alive and all pending writes are aborted.
`Config Param: CLIENT_HEARTBEAT_NUM_TOLERABLE_MISSES` | +| [hoodie.client.init.callback.classes](#hoodieclientinitcallbackclasses) | | Fully-qualified class names of the Hudi client init callbacks to run at the initialization of the Hudi client. The class names are separated by `,`. The class must be a subclass of `org.apache.hudi.callback.HoodieClientInitCallback`. By default, no Hudi client init callback is executed.
`Config Param: CLIENT_INIT_CALLBACK_CLASS_NAMES`
`Since Version: 0.14.0` | +| [hoodie.combine.before.delete](#hoodiecombinebeforedelete) | true | During delete operations, controls whether we should combine deletes (and potentially also upserts) before writing to storage.
`Config Param: COMBINE_BEFORE_DELETE` | +| [hoodie.combine.before.insert](#hoodiecombinebeforeinsert) | false | When inserted records share same key, controls whether they should be first combined (i.e de-duplicated) before writing to storage.
`Config Param: COMBINE_BEFORE_INSERT` | +| [hoodie.combine.before.upsert](#hoodiecombinebeforeupsert) | true | When upserted records share same key, controls whether they should be first combined (i.e de-duplicated) before writing to storage. This should be turned off only if you are absolutely certain that there are no duplicates incoming, otherwise it can lead to duplicate keys and violate the uniqueness guarantees.
`Config Param: COMBINE_BEFORE_UPSERT` | +| [hoodie.consistency.check.initial_interval_ms](#hoodieconsistencycheckinitial_interval_ms) | 2000 | Initial time between successive attempts to ensure written data's metadata is consistent on storage. Grows with exponential backoff after the initial value.
`Config Param: INITIAL_CONSISTENCY_CHECK_INTERVAL_MS` | +| [hoodie.consistency.check.max_checks](#hoodieconsistencycheckmax_checks) | 7 | Maximum number of checks, for consistency of written data.
`Config Param: MAX_CONSISTENCY_CHECKS` | +| [hoodie.consistency.check.max_interval_ms](#hoodieconsistencycheckmax_interval_ms) | 300000 | Max time to wait between successive attempts at performing consistency checks
`Config Param: MAX_CONSISTENCY_CHECK_INTERVAL_MS` | +| [hoodie.datasource.write.keygenerator.type](#hoodiedatasourcewritekeygeneratortype) | SIMPLE | **Note** This is being actively worked on. Please use `hoodie.datasource.write.keygenerator.class` instead. org.apache.hudi.keygen.constant.KeyGeneratorType: Key generator type, indicating the key generator class to use, that implements `org.apache.hudi.keygen.KeyGenerator`. SIMPLE(default): Simple key generator, which takes names of fields to be used for recordKey and partitionPath as configs. SIMPLE_AVRO: Simple key generator, which takes names of fields to be used for recordKey and partitionPath as configs. COMPLEX: Complex key generator, which takes names of fields to be used for recordKey and partitionPath as configs. COMPLEX_AVRO: Complex key generator, which takes names of fields to be used for recordKey and partitionPath as configs. TIMESTAMP: Timestamp-based key generator, that relies on timestamps for partitioning field. Still picks record key by name. TIMESTAMP_AVRO: Timestamp-based key generator, that relies on timestamps for partitioning field. Still picks record key by name. CUSTOM: This is a generic implementation type of KeyGenerator where users can configure record key as a single field or a combination of fields. Similarly partition path can be configured to have multiple fields or only one field. This KeyGenerator expects value for prop "hoodie.datasource.write.partitionpath.field" in a specific format. For example: properties.put("hoodie.datasource.write.partitionpath.field", "field1:PartitionKeyType1,field2:PartitionKeyType2"). CUSTOM_AVRO: This is a generic implementation type of KeyGenerator where users can configure record key as a single field or a combination of fields. Similarly partition path can be configured to have multiple fields or only one field. This KeyGenerator expects value for prop "hoodie.datasource.write.partitionpath.field" in a specific format. 
For example: properties.put("hoodie.datasource.write.partitionpath.field", "field1:PartitionKeyType1,field2:PartitionKeyType2"). NON_PARTITION: Simple Key generator for non-partitioned tables. NON_PARTITION_AVRO: Simple Key generator for non-partitioned tables. GLOBAL_DELETE: Key generator for deletes using global indices. GLOBAL_DELETE_AVRO: Key generator for deletes using global indices. AUTO_RECORD: Automatic record key generation. AUTO_RECORD_AVRO: Automatic record key generation. HOODIE_TABLE_METADATA: Custom key generator for the Hudi table metadata. SPARK_SQL: Custom spark-sql specific KeyGenerator overriding behavior handling TimestampType partition values. SPARK_SQL_UUID: A KeyGenerator which use the uuid as the record key. SPARK_SQL_MERGE_INTO: Meant to be used internally for the spark sql MERGE INTO command. STREAMER_TEST: A test KeyGenerator for deltastreamer tests.
`Config Param: KEYGENERATOR_TYPE` | +| [hoodie.datasource.write.schema.allow.auto.evolution.column.drop](#hoodiedatasourcewriteschemaallowautoevolutioncolumndrop) | false | Controls whether table's schema is allowed to automatically evolve when incoming batch's schema can have any of the columns dropped. By default, Hudi will not allow this kind of (auto) schema evolution. Set this config to true to allow table's schema to be updated automatically when columns are dropped from the new incoming batch.
`Config Param: SCHEMA_ALLOW_AUTO_EVOLUTION_COLUMN_DROP`
`Since Version: 0.13.0` | +| [hoodie.delete.shuffle.parallelism](#hoodiedeleteshuffleparallelism) | 0 | Parallelism used for delete operation. Delete operations also performs shuffles, similar to upsert operation. Before 0.13.0 release, if users do not configure it, Hudi would use 200 as the default shuffle parallelism. From 0.13.0 onwards Hudi by default automatically uses the parallelism deduced by Spark based on the source data. If the shuffle parallelism is explicitly configured by the user, the user-configured parallelism is used in defining the actual parallelism.
`Config Param: DELETE_PARALLELISM_VALUE` | +| [hoodie.embed.timeline.server](#hoodieembedtimelineserver) | true | When true, spins up an instance of the timeline server (meta server that serves cached file listings, statistics), running on each writer's driver process, accepting requests during the write from executors.
`Config Param: EMBEDDED_TIMELINE_SERVER_ENABLE` | +| [hoodie.embed.timeline.server.async](#hoodieembedtimelineserverasync) | false | Controls whether or not, the requests to the timeline server are processed in asynchronous fashion, potentially improving throughput.
`Config Param: EMBEDDED_TIMELINE_SERVER_USE_ASYNC_ENABLE` | +| [hoodie.embed.timeline.server.gzip](#hoodieembedtimelineservergzip) | true | Controls whether gzip compression is used, for large responses from the timeline server, to improve latency.
`Config Param: EMBEDDED_TIMELINE_SERVER_COMPRESS_ENABLE` | +| [hoodie.embed.timeline.server.port](#hoodieembedtimelineserverport) | 0 | Port at which the timeline server listens for requests. When running embedded in each writer, it picks a free port and communicates to all the executors. This should rarely be changed.
`Config Param: EMBEDDED_TIMELINE_SERVER_PORT_NUM` | +| [hoodie.embed.timeline.server.reuse.enabled](#hoodieembedtimelineserverreuseenabled) | false | Controls whether the timeline server instance should be cached and reused across the tables to avoid startup costs and server overhead. This should only be used if you are running multiple writers in the same JVM.
`Config Param: EMBEDDED_TIMELINE_SERVER_REUSE_ENABLED` | +| [hoodie.embed.timeline.server.threads](#hoodieembedtimelineserverthreads) | -1 | Number of threads to serve requests in the timeline server. By default, auto configured based on the number of underlying cores.
`Config Param: EMBEDDED_TIMELINE_NUM_SERVER_THREADS` | +| [hoodie.fail.on.timeline.archiving](#hoodiefailontimelinearchiving) | true | Timeline archiving removes older instants from the timeline, after each write operation, to minimize metadata overhead. Controls whether or not, the write should be failed as well, if such archiving fails.
`Config Param: FAIL_ON_TIMELINE_ARCHIVING_ENABLE` | +| [hoodie.fail.writes.on.inline.table.service.exception](#hoodiefailwritesoninlinetableserviceexception) | true | Table services such as compaction and clustering can fail and prevent syncing to the metaclient. Set this to true to fail writes when table services fail
`Config Param: FAIL_ON_INLINE_TABLE_SERVICE_EXCEPTION`
`Since Version: 0.13.0` | +| [hoodie.fileid.prefix.provider.class](#hoodiefileidprefixproviderclass) | org.apache.hudi.table.RandomFileIdPrefixProvider | File Id Prefix provider class, that implements `org.apache.hudi.fileid.FileIdPrefixProvider`
`Config Param: FILEID_PREFIX_PROVIDER_CLASS`
`Since Version: 0.10.0` | +| [hoodie.finalize.write.parallelism](#hoodiefinalizewriteparallelism) | 200 | Parallelism for the write finalization internal operation, which involves removing any partially written files from lake storage, before committing the write. Reduce this value, if the high number of tasks incur delays for smaller tables or low latency writes.
`Config Param: FINALIZE_WRITE_PARALLELISM_VALUE` | +| [hoodie.insert.shuffle.parallelism](#hoodieinsertshuffleparallelism) | 0 | Parallelism for inserting records into the table. Inserts can shuffle data before writing to tune file sizes and optimize the storage layout. Before 0.13.0 release, if users do not configure it, Hudi would use 200 as the default shuffle parallelism. From 0.13.0 onwards Hudi by default automatically uses the parallelism deduced by Spark based on the source data. If the shuffle parallelism is explicitly configured by the user, the user-configured parallelism is used in defining the actual parallelism. If you observe small files from the insert operation, we suggest configuring this shuffle parallelism explicitly, so that the parallelism is around total_input_data_size/120MB.
`Config Param: INSERT_PARALLELISM_VALUE` | +| [hoodie.markers.delete.parallelism](#hoodiemarkersdeleteparallelism) | 100 | Determines the parallelism for deleting marker files, which are used to track all files (valid or invalid/partial) written during a write operation. Increase this value if delays are observed, with large batch writes.
`Config Param: MARKERS_DELETE_PARALLELISM_VALUE` | +| [hoodie.markers.timeline_server_based.batch.interval_ms](#hoodiemarkerstimeline_server_basedbatchinterval_ms) | 50 | The batch interval in milliseconds for marker creation batch processing
`Config Param: MARKERS_TIMELINE_SERVER_BASED_BATCH_INTERVAL_MS`
`Since Version: 0.9.0` | +| [hoodie.markers.timeline_server_based.batch.num_threads](#hoodiemarkerstimeline_server_basedbatchnum_threads) | 20 | Number of threads to use for batch processing marker creation requests at the timeline server
`Config Param: MARKERS_TIMELINE_SERVER_BASED_BATCH_NUM_THREADS`
`Since Version: 0.9.0` | +| [hoodie.merge.allow.duplicate.on.inserts](#hoodiemergeallowduplicateoninserts) | true | When enabled, we allow duplicate keys even if inserts are routed to merge with an existing file (for ensuring file sizing). This is only relevant for insert operation, since upsert, delete operations will ensure unique key constraints are maintained.
`Config Param: MERGE_ALLOW_DUPLICATE_ON_INSERTS_ENABLE` | +| [hoodie.merge.data.validation.enabled](#hoodiemergedatavalidationenabled) | false | When enabled, data validation checks are performed during merges to ensure expected number of records after merge operation.
`Config Param: MERGE_DATA_VALIDATION_CHECK_ENABLE` | +| [hoodie.merge.small.file.group.candidates.limit](#hoodiemergesmallfilegroupcandidateslimit) | 1 | Limits number of file groups, whose base file satisfies small-file limit, to consider for appending records during upsert operation. Only applicable to MOR tables
`Config Param: MERGE_SMALL_FILE_GROUP_CANDIDATES_LIMIT` | +| [hoodie.release.resource.on.completion.enable](#hoodiereleaseresourceoncompletionenable) | true | Controls whether all persisted RDDs are released when the Spark job finishes.
`Config Param: RELEASE_RESOURCE_ENABLE`
`Since Version: 0.11.0` | +| [hoodie.rollback.instant.backup.dir](#hoodierollbackinstantbackupdir) | .rollback_backup | Path where instants being rolled back are copied. If not absolute path then a directory relative to .hoodie folder is created.
`Config Param: ROLLBACK_INSTANT_BACKUP_DIRECTORY` | +| [hoodie.rollback.instant.backup.enabled](#hoodierollbackinstantbackupenabled) | false | Backup instants removed during rollback and restore (useful for debugging)
`Config Param: ROLLBACK_INSTANT_BACKUP_ENABLED` | +| [hoodie.rollback.parallelism](#hoodierollbackparallelism) | 100 | This config controls the parallelism for rollback of commits. Rollbacks perform deletion of files or logging delete blocks to file groups on storage in parallel. The configured value limits the parallelism so that the number of Spark tasks does not exceed the value. If rollback is slow due to the limited parallelism, you can increase this to tune the performance.
`Config Param: ROLLBACK_PARALLELISM_VALUE` | +| [hoodie.rollback.using.markers](#hoodierollbackusingmarkers) | true | Enables a more efficient mechanism for rollbacks based on the marker files generated during the writes. Turned on by default.
`Config Param: ROLLBACK_USING_MARKERS_ENABLE` | +| [hoodie.sensitive.config.keys](#hoodiesensitiveconfigkeys) | ssl,tls,sasl,auth,credentials | Comma separated list of filters for sensitive config keys. Hudi Streamer will not print any configuration which contains the configured filter. For example with a configured filter `ssl`, value for config `ssl.truststore.location` would be masked.
`Config Param: SENSITIVE_CONFIG_KEYS_FILTER`
`Since Version: 0.14.0` | +| [hoodie.skip.default.partition.validation](#hoodieskipdefaultpartitionvalidation) | false | When a table is upgraded from pre-0.12 to 0.12, we check for a "default" partition and fail if one is found. Users are expected to rewrite the data in those partitions. Enabling this config will bypass this validation
`Config Param: SKIP_DEFAULT_PARTITION_VALIDATION`
`Since Version: 0.12.0` | +| [hoodie.table.services.enabled](#hoodietableservicesenabled) | true | Master control to disable all table services including archive, clean, compact, cluster, etc.
`Config Param: TABLE_SERVICES_ENABLED`
`Since Version: 0.11.0` | +| [hoodie.timeline.layout.version](#hoodietimelinelayoutversion) | 2 | Controls the layout of the timeline. Version 0 relied on renames; Version 1 models the timeline as an immutable log relying only on atomic writes for object storage. The default is version 2.
`Config Param: TIMELINE_LAYOUT_VERSION_NUM`
`Since Version: 0.5.1` | +| [hoodie.upsert.shuffle.parallelism](#hoodieupsertshuffleparallelism) | 0 | Parallelism to use for upsert operation on the table. Upserts can shuffle data to perform index lookups, file sizing, bin packing records optimally into file groups. Before 0.13.0 release, if users do not configure it, Hudi would use 200 as the default shuffle parallelism. From 0.13.0 onwards Hudi by default automatically uses the parallelism deduced by Spark based on the source data. If the shuffle parallelism is explicitly configured by the user, the user-configured parallelism is used in defining the actual parallelism. If you observe small files from the upsert operation, we suggest configuring this shuffle parallelism explicitly, so that the parallelism is around total_input_data_size/120MB.
`Config Param: UPSERT_PARALLELISM_VALUE` | +| [hoodie.write.buffer.limit.bytes](#hoodiewritebufferlimitbytes) | 4194304 | Size of in-memory buffer used for parallelizing network reads and lake storage writes.
`Config Param: WRITE_BUFFER_LIMIT_BYTES_VALUE` | +| [hoodie.write.buffer.record.cache.limit](#hoodiewritebufferrecordcachelimit) | 131072 | Maximum queue size of in-memory buffer for parallelizing network reads and lake storage writes.
`Config Param: WRITE_BUFFER_RECORD_CACHE_LIMIT`
`Since Version: 0.15.0` | +| [hoodie.write.buffer.record.sampling.rate](#hoodiewritebufferrecordsamplingrate) | 64 | Sampling rate of in-memory buffer used to estimate object size. Higher values lead to lower CPU usage.
`Config Param: WRITE_BUFFER_RECORD_SAMPLING_RATE`
`Since Version: 0.15.0` | +| [hoodie.write.concurrency.async.conflict.detector.initial_delay_ms](#hoodiewriteconcurrencyasyncconflictdetectorinitial_delay_ms) | 0 | Used for timeline-server-based markers with `AsyncTimelineServerBasedDetectionStrategy`. The time in milliseconds to delay the first execution of async marker-based conflict detection.
`Config Param: ASYNC_CONFLICT_DETECTOR_INITIAL_DELAY_MS`
`Since Version: 0.13.0` | +| [hoodie.write.concurrency.async.conflict.detector.period_ms](#hoodiewriteconcurrencyasyncconflictdetectorperiod_ms) | 30000 | Used for timeline-server-based markers with `AsyncTimelineServerBasedDetectionStrategy`. The period in milliseconds between successive executions of async marker-based conflict detection.
`Config Param: ASYNC_CONFLICT_DETECTOR_PERIOD_MS`
`Since Version: 0.13.0` | +| [hoodie.write.concurrency.early.conflict.check.commit.conflict](#hoodiewriteconcurrencyearlyconflictcheckcommitconflict) | false | Whether to enable commit conflict checking or not during early conflict detection.
`Config Param: EARLY_CONFLICT_DETECTION_CHECK_COMMIT_CONFLICT`
`Since Version: 0.13.0` | +| [hoodie.write.concurrency.early.conflict.detection.enable](#hoodiewriteconcurrencyearlyconflictdetectionenable) | false | Whether to enable early conflict detection based on markers. It eagerly detects write conflicts before creating markers and fails fast if a conflict is detected, to release cluster compute resources as soon as possible.
`Config Param: EARLY_CONFLICT_DETECTION_ENABLE`
`Since Version: 0.13.0` | +| [hoodie.write.concurrency.early.conflict.detection.strategy](#hoodiewriteconcurrencyearlyconflictdetectionstrategy) | | The class name of the early conflict detection strategy to use. This should be a subclass of `org.apache.hudi.common.conflict.detection.EarlyConflictDetectionStrategy`.
`Config Param: EARLY_CONFLICT_DETECTION_STRATEGY_CLASS_NAME`
`Since Version: 0.13.0` | +| [hoodie.write.executor.disruptor.buffer.limit.bytes](#hoodiewriteexecutordisruptorbufferlimitbytes) | 1024 | The size of the Disruptor Executor ring buffer, must be a power of 2
`Config Param: WRITE_EXECUTOR_DISRUPTOR_BUFFER_LIMIT_BYTES`
`Since Version: 0.13.0` | +| [hoodie.write.executor.disruptor.wait.strategy](#hoodiewriteexecutordisruptorwaitstrategy) | BLOCKING_WAIT | org.apache.hudi.common.util.queue.DisruptorWaitStrategyType: Strategy employed for making Disruptor Executor wait on a cursor. BLOCKING_WAIT(default): The slowest of the available wait strategies. However, it is the most conservative with respect to CPU usage and will give the most consistent behaviour across the widest variety of deployment options. SLEEPING_WAIT: Like the `BLOCKING_WAIT` strategy, it attempts to be conservative with CPU usage by using a simple busy wait loop. The difference is that the `SLEEPING_WAIT` strategy uses a call to `LockSupport.parkNanos(1)` in the middle of the loop. On a typical Linux system this will pause the thread for around 60µs. YIELDING_WAIT: The `YIELDING_WAIT` strategy is one of two wait strategies that can be used in low-latency systems. It is designed for cases where there is an opportunity to burn CPU cycles with the goal of improving latency. The `YIELDING_WAIT` strategy will busy spin, waiting for the sequence to increment to the appropriate value. Inside the body of the loop `Thread#yield()` will be called allowing other queued threads to run. This is the recommended wait strategy when you need very high performance, and the number of `EventHandler` threads is lower than the total number of logical cores, such as when hyper-threading is enabled. BUSY_SPIN_WAIT: The `BUSY_SPIN_WAIT` strategy is the highest performing wait strategy. Like the `YIELDING_WAIT` strategy, it can be used in low-latency systems, but puts the highest constraints on the deployment environment.
`Config Param: WRITE_EXECUTOR_DISRUPTOR_WAIT_STRATEGY`
`Since Version: 0.13.0` | +| [hoodie.write.executor.type](#hoodiewriteexecutortype) | SIMPLE | org.apache.hudi.common.util.queue.ExecutorType: Types of executor that implements org.apache.hudi.common.util.queue.HoodieExecutor. The executor orchestrates concurrent producers and consumers communicating through a message queue. BOUNDED_IN_MEMORY: Executor which orchestrates concurrent producers and consumers communicating through a bounded in-memory message queue using LinkedBlockingQueue. This queue will use extra lock to balance producers and consumers. DISRUPTOR: Executor which orchestrates concurrent producers and consumers communicating through disruptor as a lock free message queue to gain better writing performance. Note that DisruptorExecutor is still an experimental feature. SIMPLE(default): Executor with no inner message queue and no inner lock. Consuming and writing records from iterator directly. The advantage is that there is no need for additional memory and cpu resources due to lock or multithreading. The disadvantage is that the executor is a single-write-single-read model, cannot support functions such as speed limit and cannot decouple the network read (shuffle read) and network write (writing objects/files to storage) anymore.
`Config Param: WRITE_EXECUTOR_TYPE`
`Since Version: 0.13.0` | +| [hoodie.write.markers.type](#hoodiewritemarkerstype) | TIMELINE_SERVER_BASED | org.apache.hudi.common.table.marker.MarkerType: Marker type indicating how markers are stored in the file system, used for identifying the files written and cleaning up files not committed which should be deleted. DIRECT: Individual marker file corresponding to each data file is directly created by the writer. TIMELINE_SERVER_BASED(default): Marker operations are all handled at the timeline service which serves as a proxy. New marker entries are batch processed and stored in a limited number of underlying files for efficiency. If HDFS is used or timeline server is disabled, DIRECT markers are used as fallback even if this is configured. This configuration does not take effect for Spark structured streaming; DIRECT markers are always used.
`Config Param: MARKERS_TYPE`
`Since Version: 0.9.0` | +| [hoodie.write.num.retries.on.conflict.failures](#hoodiewritenumretriesonconflictfailures) | 0 | Maximum number of times to retry a batch on conflict failure.
`Config Param: NUM_RETRIES_ON_CONFLICT_FAILURES`
`Since Version: 0.14.0` | +| [hoodie.write.partial.update.schema](#hoodiewritepartialupdateschema) | | Avro schema of the partial updates. This is automatically set by the Hudi write client and user is not expected to manually change the value.
`Config Param: WRITE_PARTIAL_UPDATE_SCHEMA`
`Since Version: 1.0.0` | +| [hoodie.write.record.positions](#hoodiewriterecordpositions) | true | Whether to write record positions to the block header for data blocks containing updates and delete blocks. The record positions can be used to improve the performance of merging records from base and log files.
`Config Param: WRITE_RECORD_POSITIONS`
`Since Version: 1.0.0` | +| [hoodie.write.status.storage.level](#hoodiewritestatusstoragelevel) | MEMORY_AND_DISK_SER | Write status objects hold metadata about a write (stats, errors), that is not yet committed to storage. This controls how that information is cached for inspection by clients. We rarely expect this to be changed.
`Config Param: WRITE_STATUS_STORAGE_LEVEL_VALUE` | +| [hoodie.write.tagged.record.storage.level](#hoodiewritetaggedrecordstoragelevel) | MEMORY_AND_DISK_SER | Determine what level of persistence is used to cache write RDDs. Refer to org.apache.spark.storage.StorageLevel for different values
`Config Param: TAGGED_RECORD_STORAGE_LEVEL_VALUE` | +| [hoodie.writestatus.class](#hoodiewritestatusclass) | org.apache.hudi.client.WriteStatus | Subclass of org.apache.hudi.client.WriteStatus to be used to collect information about a write. Can be overridden to collect additional metrics/statistics about the data if needed.
`Config Param: WRITE_STATUS_CLASS_NAME` | +--- + + +### Commit Callback Configs {#COMMIT_CALLBACK} +Configurations controlling callback behavior into HTTP endpoints, to push notifications on commits on hudi tables. + + +#### Write commit callback configs {#Write-commit-callback-configs} + + + + +[**Advanced Configs**](#Write-commit-callback-configs-advanced-configs) + + +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------------------- | ------------------------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.write.commit.callback.http.custom.headers](#hoodiewritecommitcallbackhttpcustomheaders) | (N/A) | Http callback custom headers. Format: HeaderName1:HeaderValue1;HeaderName2:HeaderValue2
`Config Param: CALLBACK_HTTP_CUSTOM_HEADERS`
`Since Version: 0.15.0` | +| [hoodie.write.commit.callback.http.url](#hoodiewritecommitcallbackhttpurl) | (N/A) | Callback host to be sent along with callback messages
`Config Param: CALLBACK_HTTP_URL`
`Since Version: 0.6.0` | +| [hoodie.write.commit.callback.class](#hoodiewritecommitcallbackclass) | org.apache.hudi.callback.impl.HoodieWriteCommitHttpCallback | Full path of callback class and must be a subclass of HoodieWriteCommitCallback class, org.apache.hudi.callback.impl.HoodieWriteCommitHttpCallback by default
`Config Param: CALLBACK_CLASS_NAME`
`Since Version: 0.6.0` | +| [hoodie.write.commit.callback.http.api.key](#hoodiewritecommitcallbackhttpapikey) | hudi_write_commit_http_callback | Http callback API key. hudi_write_commit_http_callback by default
`Config Param: CALLBACK_HTTP_API_KEY_VALUE`
`Since Version: 0.6.0` | +| [hoodie.write.commit.callback.http.timeout.seconds](#hoodiewritecommitcallbackhttptimeoutseconds) | 30 | Callback timeout in seconds.
`Config Param: CALLBACK_HTTP_TIMEOUT_IN_SECONDS`
`Since Version: 0.6.0` | +| [hoodie.write.commit.callback.on](#hoodiewritecommitcallbackon) | false | Turn commit callback on/off. off by default.
`Config Param: TURN_CALLBACK_ON`
`Since Version: 0.6.0` | +--- + + +#### Write commit Kafka callback configs {#Write-commit-Kafka-callback-configs} +Controls notifications sent to Kafka, on events happening to a hudi table. + + + +[**Advanced Configs**](#Write-commit-Kafka-callback-configs-advanced-configs) + + +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------------------------- | ------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.write.commit.callback.kafka.bootstrap.servers](#hoodiewritecommitcallbackkafkabootstrapservers) | (N/A) | Bootstrap servers of kafka cluster, to be used for publishing commit metadata.
`Config Param: BOOTSTRAP_SERVERS`
`Since Version: 0.7.0` | +| [hoodie.write.commit.callback.kafka.partition](#hoodiewritecommitcallbackkafkapartition) | (N/A) | It may be desirable to serialize all changes into a single Kafka partition for providing strict ordering. By default, Kafka messages are keyed by table name, which guarantees ordering at the table level, but not globally (or when new partitions are added)
`Config Param: PARTITION`
`Since Version: 0.7.0` | +| [hoodie.write.commit.callback.kafka.topic](#hoodiewritecommitcallbackkafkatopic) | (N/A) | Kafka topic name to publish timeline activity into.
`Config Param: TOPIC`
`Since Version: 0.7.0` | +| [hoodie.write.commit.callback.kafka.acks](#hoodiewritecommitcallbackkafkaacks) | all | kafka acks level, all by default to ensure strong durability.
`Config Param: ACKS`
`Since Version: 0.7.0` | +| [hoodie.write.commit.callback.kafka.retries](#hoodiewritecommitcallbackkafkaretries) | 3 | Times to retry the produce. 3 by default
`Config Param: RETRIES`
`Since Version: 0.7.0` | +--- + + +#### Write commit pulsar callback configs {#Write-commit-pulsar-callback-configs} +Controls notifications sent to pulsar, on events happening to a hudi table. + + + +[**Advanced Configs**](#Write-commit-pulsar-callback-configs-advanced-configs) + + +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------------------------------------------------- | -------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| [hoodie.write.commit.callback.pulsar.broker.service.url](#hoodiewritecommitcallbackpulsarbrokerserviceurl) | (N/A) | Server's url of pulsar cluster, to be used for publishing commit metadata.
`Config Param: BROKER_SERVICE_URL`
`Since Version: 0.11.0` | +| [hoodie.write.commit.callback.pulsar.topic](#hoodiewritecommitcallbackpulsartopic) | (N/A) | pulsar topic name to publish timeline activity into.
`Config Param: TOPIC`
`Since Version: 0.11.0` | +| [hoodie.write.commit.callback.pulsar.connection-timeout](#hoodiewritecommitcallbackpulsarconnection-timeout) | 10s | Duration of waiting for a connection to a broker to be established.
`Config Param: CONNECTION_TIMEOUT`
`Since Version: 0.11.0` | +| [hoodie.write.commit.callback.pulsar.keepalive-interval](#hoodiewritecommitcallbackpulsarkeepalive-interval) | 30s | Duration of keeping alive interval for each client broker connection.
`Config Param: KEEPALIVE_INTERVAL`
`Since Version: 0.11.0` | +| [hoodie.write.commit.callback.pulsar.operation-timeout](#hoodiewritecommitcallbackpulsaroperation-timeout) | 30s | Duration of waiting for completing an operation.
`Config Param: OPERATION_TIMEOUT`
`Since Version: 0.11.0` | +| [hoodie.write.commit.callback.pulsar.producer.block-if-queue-full](#hoodiewritecommitcallbackpulsarproducerblock-if-queue-full) | true | When the queue is full, the method is blocked instead of an exception being thrown.
`Config Param: PRODUCER_BLOCK_QUEUE_FULL`
`Since Version: 0.11.0` | +| [hoodie.write.commit.callback.pulsar.producer.pending-queue-size](#hoodiewritecommitcallbackpulsarproducerpending-queue-size) | 1000 | The maximum size of a queue holding pending messages.
`Config Param: PRODUCER_PENDING_QUEUE_SIZE`
`Since Version: 0.11.0` | +| [hoodie.write.commit.callback.pulsar.producer.pending-total-size](#hoodiewritecommitcallbackpulsarproducerpending-total-size) | 50000 | The maximum number of pending messages across partitions.
`Config Param: PRODUCER_PENDING_SIZE`
`Since Version: 0.11.0` | +| [hoodie.write.commit.callback.pulsar.producer.route-mode](#hoodiewritecommitcallbackpulsarproducerroute-mode) | RoundRobinPartition | Message routing logic for producers on partitioned topics.
`Config Param: PRODUCER_ROUTE_MODE`
`Since Version: 0.11.0` | +| [hoodie.write.commit.callback.pulsar.producer.send-timeout](#hoodiewritecommitcallbackpulsarproducersend-timeout) | 30s | The timeout in each sending to pulsar.
`Config Param: PRODUCER_SEND_TIMEOUT`
`Since Version: 0.11.0` | +| [hoodie.write.commit.callback.pulsar.request-timeout](#hoodiewritecommitcallbackpulsarrequest-timeout) | 60s | Duration of waiting for completing a request.
`Config Param: REQUEST_TIMEOUT`
`Since Version: 0.11.0` | +--- + + +### Lock Configs {#LOCK} +Configurations that control locking mechanisms required for concurrency control between writers to a Hudi table. Concurrency between Hudi's own table services are auto managed internally. + + +#### Common Lock Configurations {#Common-Lock-Configurations} + + + + +[**Basic Configs**](#Common-Lock-Configurations-basic-configs) + + +| Config Name | Default | Description | +| -------------------------------------------------------------------------------- | ------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.write.lock.heartbeat_interval_ms](#hoodiewritelockheartbeat_interval_ms) | 60000 | Heartbeat interval in ms, to send a heartbeat to indicate that hive client holding locks.
`Config Param: LOCK_HEARTBEAT_INTERVAL_MS`
`Since Version: 0.15.0` | + +[**Advanced Configs**](#Common-Lock-Configurations-advanced-configs) + + +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| [hoodie.write.lock.filesystem.path](#hoodiewritelockfilesystempath) | (N/A) | For DFS based lock providers, path to store the locks under. use Table's meta path as default
`Config Param: FILESYSTEM_LOCK_PATH`
`Since Version: 0.8.0` | +| [hoodie.write.lock.hivemetastore.database](#hoodiewritelockhivemetastoredatabase) | (N/A) | For Hive based lock provider, the Hive database to acquire lock against
`Config Param: HIVE_DATABASE_NAME`
`Since Version: 0.8.0` | +| [hoodie.write.lock.hivemetastore.table](#hoodiewritelockhivemetastoretable) | (N/A) | For Hive based lock provider, the Hive table to acquire lock against
`Config Param: HIVE_TABLE_NAME`
`Since Version: 0.8.0` | +| [hoodie.write.lock.hivemetastore.uris](#hoodiewritelockhivemetastoreuris) | (N/A) | For Hive based lock provider, the Hive metastore URI to acquire locks against.
`Config Param: HIVE_METASTORE_URI`
`Since Version: 0.8.0` | +| [hoodie.write.lock.zookeeper.base_path](#hoodiewritelockzookeeperbase_path) | (N/A) | The base path on Zookeeper under which to create lock related ZNodes. This should be same for all concurrent writers to the same table
`Config Param: ZK_BASE_PATH`
`Since Version: 0.8.0` | +| [hoodie.write.lock.zookeeper.port](#hoodiewritelockzookeeperport) | (N/A) | Zookeeper port to connect to.
`Config Param: ZK_PORT`
`Since Version: 0.8.0` | +| [hoodie.write.lock.zookeeper.url](#hoodiewritelockzookeeperurl) | (N/A) | Zookeeper URL to connect to.
`Config Param: ZK_CONNECT_URL`
`Since Version: 0.8.0` | +| [hoodie.write.lock.client.num_retries](#hoodiewritelockclientnum_retries) | 50 | Maximum number of times to retry to acquire lock additionally from the lock manager.
`Config Param: LOCK_ACQUIRE_CLIENT_NUM_RETRIES`
`Since Version: 0.8.0` | +| [hoodie.write.lock.client.wait_time_ms_between_retry](#hoodiewritelockclientwait_time_ms_between_retry) | 5000 | Amount of time to wait between retries on the lock provider by the lock manager
`Config Param: LOCK_ACQUIRE_CLIENT_RETRY_WAIT_TIME_IN_MILLIS`
`Since Version: 0.8.0` | +| [hoodie.write.lock.conflict.resolution.strategy](#hoodiewritelockconflictresolutionstrategy) | org.apache.hudi.client.transaction.SimpleConcurrentFileWritesConflictResolutionStrategy | Conflict resolution strategy class name, this should be a subclass of org.apache.hudi.client.transaction.ConflictResolutionStrategy
`Config Param: WRITE_CONFLICT_RESOLUTION_STRATEGY_CLASS_NAME`
`Since Version: 0.8.0` | +| [hoodie.write.lock.filesystem.expire](#hoodiewritelockfilesystemexpire) | 0 | For DFS based lock providers, expire time in minutes, must be a non-negative number, the default (0) means no expiry
`Config Param: FILESYSTEM_LOCK_EXPIRE`
`Since Version: 0.12.0` | +| [hoodie.write.lock.max_wait_time_ms_between_retry](#hoodiewritelockmax_wait_time_ms_between_retry) | 16000 | Maximum amount of time to wait between retries by lock provider client. This bounds the maximum delay from the exponential backoff. Currently used by ZK based lock provider only.
`Config Param: LOCK_ACQUIRE_RETRY_MAX_WAIT_TIME_IN_MILLIS`
`Since Version: 0.8.0` | +| [hoodie.write.lock.num_retries](#hoodiewritelocknum_retries) | 15 | Maximum number of times to retry lock acquire, at each lock provider
`Config Param: LOCK_ACQUIRE_NUM_RETRIES`
`Since Version: 0.8.0` | +| [hoodie.write.lock.provider](#hoodiewritelockprovider) | org.apache.hudi.client.transaction.lock.InProcessLockProvider | Lock provider class name, user can provide their own implementation of LockProvider which should be subclass of org.apache.hudi.common.lock.LockProvider
`Config Param: LOCK_PROVIDER_CLASS_NAME`
`Since Version: 0.8.0` | +| [hoodie.write.lock.wait_time_ms](#hoodiewritelockwait_time_ms) | 60000 | Timeout in ms, to wait on an individual lock acquire() call, at the lock provider.
`Config Param: LOCK_ACQUIRE_WAIT_TIMEOUT_MS`
`Since Version: 0.8.0` | +| [hoodie.write.lock.wait_time_ms_between_retry](#hoodiewritelockwait_time_ms_between_retry) | 1000 | Initial amount of time to wait between retries to acquire locks, subsequent retries will exponentially backoff.
`Config Param: LOCK_ACQUIRE_RETRY_WAIT_TIME_IN_MILLIS`
`Since Version: 0.8.0` | +| [hoodie.write.lock.zookeeper.connection_timeout_ms](#hoodiewritelockzookeeperconnection_timeout_ms) | 15000 | Timeout in ms, to wait for establishing connection with Zookeeper.
`Config Param: ZK_CONNECTION_TIMEOUT_MS`
`Since Version: 0.8.0` | +| [hoodie.write.lock.zookeeper.lock_key](#hoodiewritelockzookeeperlock_key) | | Key name under base_path at which to create a ZNode and acquire lock. Final path on zk will look like base_path/lock_key. If this parameter is not set, we would set it as the table name
`Config Param: ZK_LOCK_KEY`
`Since Version: 0.8.0` | +| [hoodie.write.lock.zookeeper.session_timeout_ms](#hoodiewritelockzookeepersession_timeout_ms) | 60000 | Timeout in ms, to wait after losing connection to ZooKeeper, before the session is expired
`Config Param: ZK_SESSION_TIMEOUT_MS`
`Since Version: 0.8.0` | +--- + + +#### DynamoDB based Locks Configurations {#DynamoDB-based-Locks-Configurations} +Configs that control DynamoDB based locking mechanisms required for concurrency control between writers to a Hudi table. Concurrency between Hudi's own table services are auto managed internally. + + + +[**Advanced Configs**](#DynamoDB-based-Locks-Configurations-advanced-configs) + + +| Config Name | Default | Description | +| --------------------------------------------------------------------------------------------------- | ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| [hoodie.write.lock.dynamodb.endpoint_url](#hoodiewritelockdynamodbendpoint_url) | (N/A) | For DynamoDB based lock provider, the url endpoint used for Amazon DynamoDB service. Useful for development with a local dynamodb instance.
`Config Param: DYNAMODB_ENDPOINT_URL`
`Since Version: 0.10.1` | +| [hoodie.write.lock.dynamodb.billing_mode](#hoodiewritelockdynamodbbilling_mode) | PAY_PER_REQUEST | For DynamoDB based lock provider, by default it is `PAY_PER_REQUEST` mode. Alternative is `PROVISIONED`.
`Config Param: DYNAMODB_LOCK_BILLING_MODE`
`Since Version: 0.10.0` | +| [hoodie.write.lock.dynamodb.partition_key](#hoodiewritelockdynamodbpartition_key) | | For DynamoDB based lock provider, the partition key for the DynamoDB lock table. Each Hudi dataset should have its unique key so concurrent writers could refer to the same partition key. By default we use the Hudi table name specified to be the partition key
`Config Param: DYNAMODB_LOCK_PARTITION_KEY`
`Since Version: 0.10.0` | +| [hoodie.write.lock.dynamodb.read_capacity](#hoodiewritelockdynamodbread_capacity) | 20 | For DynamoDB based lock provider, read capacity units when using PROVISIONED billing mode
`Config Param: DYNAMODB_LOCK_READ_CAPACITY`
`Since Version: 0.10.0` | +| [hoodie.write.lock.dynamodb.region](#hoodiewritelockdynamodbregion) | us-east-1 | For DynamoDB based lock provider, the region used in endpoint for Amazon DynamoDB service. Would try to first get it from AWS_REGION environment variable. If not found, us-east-1 is used by default
`Config Param: DYNAMODB_LOCK_REGION`
`Since Version: 0.10.0` | +| [hoodie.write.lock.dynamodb.table](#hoodiewritelockdynamodbtable) | hudi_locks | For DynamoDB based lock provider, the name of the DynamoDB table acting as lock table
`Config Param: DYNAMODB_LOCK_TABLE_NAME`
`Since Version: 0.10.0` | +| [hoodie.write.lock.dynamodb.table_creation_timeout](#hoodiewritelockdynamodbtable_creation_timeout) | 120000 | For DynamoDB based lock provider, the maximum number of milliseconds to wait for creating DynamoDB table
`Config Param: DYNAMODB_LOCK_TABLE_CREATION_TIMEOUT`
`Since Version: 0.10.0` | +| [hoodie.write.lock.dynamodb.write_capacity](#hoodiewritelockdynamodbwrite_capacity) | 10 | For DynamoDB based lock provider, write capacity units when using PROVISIONED billing mode
`Config Param: DYNAMODB_LOCK_WRITE_CAPACITY`
`Since Version: 0.10.0` | +| [hoodie.write.lock.wait_time_ms](#hoodiewritelockwait_time_ms) | 60000 | Lock Acquire Wait Timeout in milliseconds
`Config Param: LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY`
`Since Version: 0.10.0` | +--- + + +### Key Generator Configs {#KEY_GENERATOR} +Hudi maintains keys (record key + partition path) for uniquely identifying a particular record. These configs allow developers to setup the Key generator class that extracts these out of incoming records. + + +#### Key Generator Options {#Key-Generator-Options} + + + + +[**Basic Configs**](#Key-Generator-Options-basic-configs) + + +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------------------ | ------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.datasource.write.partitionpath.field](#hoodiedatasourcewritepartitionpathfield) | (N/A) | Partition path field. Value to be used at the partitionPath component of HoodieKey. Actual value obtained by invoking .toString()
`Config Param: PARTITIONPATH_FIELD_NAME` | +| [hoodie.datasource.write.recordkey.field](#hoodiedatasourcewriterecordkeyfield) | (N/A) | Record key field. Value to be used as the `recordKey` component of `HoodieKey`. Actual value will be obtained by invoking .toString() on the field value. Nested fields can be specified using the dot notation eg: `a.b.c`
`Config Param: RECORDKEY_FIELD_NAME` | +| [hoodie.datasource.write.secondarykey.column](#hoodiedatasourcewritesecondarykeycolumn) | (N/A) | Columns that constitute the secondary key component. Actual value will be obtained by invoking .toString() on the field value. Nested fields can be specified using the dot notation eg: `a.b.c`
`Config Param: SECONDARYKEY_COLUMN_NAME` | +| [hoodie.datasource.write.hive_style_partitioning](#hoodiedatasourcewritehive_style_partitioning) | false | Flag to indicate whether to use Hive style partitioning. If set true, the names of partition folders follow <partition_column_name>=<partition_value> format. By default false (the names of partition folders are only partition values)
`Config Param: HIVE_STYLE_PARTITIONING_ENABLE` | + +[**Advanced Configs**](#Key-Generator-Options-advanced-configs) + + +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------------------------------------------------------------------ | ------- | ----------- | +| [hoodie.datasource.write.keygenerator.consistent.logical.timestamp.enabled](#hoodiedatasourcewritekeygeneratorconsistentlogicaltimestampenabled) | false | When set to true, consistent value will be generated for a logical timestamp type column, like timestamp-millis and timestamp-micros, irrespective of whether row-writer is enabled. Disabled by default so as not to break the pipeline that deploy either fully row-writer path or non row-writer path. For example, if it is kept disabled then record key of timestamp type with value `2016-12-29 09:54:00` will be written as timestamp `2016-12-29 09:54:00.0` in row-writer path, while it will be written as long value `1483023240000000` in non row-writer path. If enabled, then the timestamp value will be written in both the cases.
`Config Param: KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED`
`Since Version: 0.10.1` | +| [hoodie.datasource.write.partitionpath.urlencode](#hoodiedatasourcewritepartitionpathurlencode) | false | Should we url encode the partition path value, before creating the folder structure.
`Config Param: URL_ENCODE_PARTITIONING` | +--- + + +#### Timestamp-based key generator configs {#Timestamp-based-key-generator-configs} +Configs used for TimestampBasedKeyGenerator which relies on timestamps for the partition field. The field values are interpreted as timestamps and not just converted to string while generating partition path value for records. Record key is same as before where it is chosen by field name. + + + +[**Advanced Configs**](#Timestamp-based-key-generator-configs-advanced-configs) + + +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------------------------------------------ | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.keygen.timebased.timestamp.type](#hoodiekeygentimebasedtimestamptype) | (N/A) | Timestamp type of the field, which should be one of the timestamp types supported: `UNIX_TIMESTAMP`, `DATE_STRING`, `MIXED`, `EPOCHMILLISECONDS`, `EPOCHMICROSECONDS`, `SCALAR`.
`Config Param: TIMESTAMP_TYPE_FIELD` | +| [hoodie.keygen.datetime.parser.class](#hoodiekeygendatetimeparserclass) | org.apache.hudi.keygen.parser.HoodieDateTimeParser | Date time parser class name.
`Config Param: DATE_TIME_PARSER` | +| [hoodie.keygen.timebased.input.dateformat](#hoodiekeygentimebasedinputdateformat) | | Input date format such as `yyyy-MM-dd'T'HH:mm:ss.SSSZ`.
`Config Param: TIMESTAMP_INPUT_DATE_FORMAT` | +| [hoodie.keygen.timebased.input.dateformat.list.delimiter.regex](#hoodiekeygentimebasedinputdateformatlistdelimiterregex) | , | The delimiter for allowed input date format list, usually `,`.
`Config Param: TIMESTAMP_INPUT_DATE_FORMAT_LIST_DELIMITER_REGEX` | +| [hoodie.keygen.timebased.input.timezone](#hoodiekeygentimebasedinputtimezone) | UTC | Timezone of the input timestamp, such as `UTC`.
`Config Param: TIMESTAMP_INPUT_TIMEZONE_FORMAT` | +| [hoodie.keygen.timebased.output.dateformat](#hoodiekeygentimebasedoutputdateformat) | | Output date format such as `yyyy-MM-dd'T'HH:mm:ss.SSSZ`.
`Config Param: TIMESTAMP_OUTPUT_DATE_FORMAT` | +| [hoodie.keygen.timebased.output.timezone](#hoodiekeygentimebasedoutputtimezone) | UTC | Timezone of the output timestamp, such as `UTC`.
`Config Param: TIMESTAMP_OUTPUT_TIMEZONE_FORMAT` | +| [hoodie.keygen.timebased.timestamp.scalar.time.unit](#hoodiekeygentimebasedtimestampscalartimeunit) | SECONDS | When timestamp type `SCALAR` is used, this specifies the time unit, with allowed unit specified by `TimeUnit` enums (`NANOSECONDS`, `MICROSECONDS`, `MILLISECONDS`, `SECONDS`, `MINUTES`, `HOURS`, `DAYS`).
`Config Param: INPUT_TIME_UNIT` | +| [hoodie.keygen.timebased.timezone](#hoodiekeygentimebasedtimezone) | UTC | Timezone of both input and output timestamp if they are the same, such as `UTC`. Please use `hoodie.keygen.timebased.input.timezone` and `hoodie.keygen.timebased.output.timezone` instead if the input and output timezones are different.
`Config Param: TIMESTAMP_TIMEZONE_FORMAT` | +--- + + +### Index Configs {#INDEX} +Configurations that control indexing behavior, which tags incoming records as either inserts or updates to older records. + + +#### Common Index Configs {#Common-Index-Configs} + + + + +[**Basic Configs**](#Common-Index-Configs-basic-configs) + + +| Config Name | Default | Description | +| ------------------------------------------------------------------ | ------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.expression.index.function](#hoodieexpressionindexfunction) | (N/A) | Function to be used for building the expression index.
`Config Param: INDEX_FUNCTION`
`Since Version: 1.0.0` | +| [hoodie.expression.index.name](#hoodieexpressionindexname) | (N/A) | Name of the expression index. This is also used for the partition name in the metadata table.
`Config Param: INDEX_NAME`
`Since Version: 1.0.0` | +| [hoodie.table.checksum](#hoodietablechecksum) | (N/A) | Index definition checksum is used to guard against partial writes in HDFS. It is added as the last entry in index.properties and then used to validate while reading table config.
`Config Param: INDEX_DEFINITION_CHECKSUM`
`Since Version: 1.0.0` | +| [hoodie.expression.index.type](#hoodieexpressionindextype) | COLUMN_STATS | Type of the expression index. Default is `column_stats` if there are no functions and expressions in the command. Valid options could be BITMAP, COLUMN_STATS, LUCENE, etc. If index_type is not provided, and there are functions or expressions in the command then an expression index using column stats will be created.
`Config Param: INDEX_TYPE`
`Since Version: 1.0.0` | +--- + + +#### Common Index Configs {#Common-Index-Configs} + + + + +[**Basic Configs**](#Common-Index-Configs-basic-configs) + + +| Config Name | Default | Description | +| ------------------------------------------------------------------- | ------- || +| [hoodie.index.type](#hoodieindextype) | (N/A) | org.apache.hudi.index.HoodieIndex$IndexType: Determines how input records are indexed, i.e., looked up based on the key for the location in the existing table. Default is SIMPLE on Spark engine, and INMEMORY on Flink and Java engines. HBASE: uses an external managed Apache HBase table to store record key to location mapping. HBase index is a global index, enforcing key uniqueness across all partitions in the table. INMEMORY: Uses in-memory hashmap in Spark and Java engine and Flink in-memory state in Flink for indexing. BLOOM: Employs bloom filters built out of the record keys, optionally also pruning candidate files using record key ranges. Key uniqueness is enforced inside partitions. GLOBAL_BLOOM: Employs bloom filters built out of the record keys, optionally also pruning candidate files using record key ranges. Key uniqueness is enforced across all partitions in the table. SIMPLE: Performs a lean join of the incoming update/delete records against keys extracted from the table on storage.Key uniqueness is enforced inside partitions. GLOBAL_SIMPLE: Performs a lean join of the incoming update/delete records against keys extracted from the table on storage.Key uniqueness is enforced across all partitions in the table. BUCKET: locates the file group containing the record fast by using bucket hashing, particularly beneficial in large scale. Use `hoodie.index.bucket.engine` to choose bucket engine type, i.e., how buckets are generated. FLINK_STATE: Internal Config for indexing based on Flink state. RECORD_INDEX: Index which saves the record key to location mappings in the HUDI Metadata Table. 
Record index is a global index, enforcing key uniqueness across all partitions in the table. Supports sharding to achieve very high scale.
`Config Param: INDEX_TYPE` | +| [hoodie.bucket.index.query.pruning](#hoodiebucketindexquerypruning) | true | Control if table with bucket index use bucket query or not
`Config Param: BUCKET_QUERY_INDEX` | + +[**Advanced Configs**](#Common-Index-Configs-advanced-configs) + + +| Config Name | Default | Description | +| ----------------------------------------------------------------------------------- | -------------------- || +| [hoodie.bucket.index.hash.field](#hoodiebucketindexhashfield) | (N/A) | Index key. It is used to index the record and find its file group. If not set, use record key field as default
`Config Param: BUCKET_INDEX_HASH_FIELD` | +| [hoodie.bucket.index.max.num.buckets](#hoodiebucketindexmaxnumbuckets) | (N/A) | Only applies if bucket index engine is consistent hashing. Determine the upper bound of the number of buckets in the hudi table. Bucket resizing cannot be done higher than this max limit.
`Config Param: BUCKET_INDEX_MAX_NUM_BUCKETS`
`Since Version: 0.13.0` | +| [hoodie.bucket.index.min.num.buckets](#hoodiebucketindexminnumbuckets) | (N/A) | Only applies if bucket index engine is consistent hashing. Determine the lower bound of the number of buckets in the hudi table. Bucket resizing cannot be done lower than this min limit.
`Config Param: BUCKET_INDEX_MIN_NUM_BUCKETS`
`Since Version: 0.13.0` | +| [hoodie.bloom.index.bucketized.checking](#hoodiebloomindexbucketizedchecking) | true | Only applies if index type is BLOOM. When true, bucketized bloom filtering is enabled. This reduces skew seen in sort based bloom index lookup
`Config Param: BLOOM_INDEX_BUCKETIZED_CHECKING` | +| [hoodie.bloom.index.input.storage.level](#hoodiebloomindexinputstoragelevel) | MEMORY_AND_DISK_SER | Only applies when #bloomIndexUseCaching is set. Determine what level of persistence is used to cache input RDDs. Refer to org.apache.spark.storage.StorageLevel for different values
`Config Param: BLOOM_INDEX_INPUT_STORAGE_LEVEL_VALUE` | +| [hoodie.bloom.index.keys.per.bucket](#hoodiebloomindexkeysperbucket) | 10000000 | Only applies if bloomIndexBucketizedChecking is enabled and index type is bloom. This configuration controls the “bucket” size which tracks the number of record-key checks made against a single file and is the unit of work allocated to each partition performing bloom filter lookup. A higher value would amortize the fixed cost of reading a bloom filter to memory.
`Config Param: BLOOM_INDEX_KEYS_PER_BUCKET` | +| [hoodie.bloom.index.parallelism](#hoodiebloomindexparallelism) | 0 | Only applies if index type is BLOOM. This is the amount of parallelism for index lookup, which involves a shuffle. By default, this is auto computed based on input workload characteristics. If the parallelism is explicitly configured by the user, the user-configured value is used in defining the actual parallelism. If the indexing stage is slow due to the limited parallelism, you can increase this to tune the performance.
`Config Param: BLOOM_INDEX_PARALLELISM` | +| [hoodie.bloom.index.prune.by.ranges](#hoodiebloomindexprunebyranges) | true | Only applies if index type is BLOOM. When true, range information from files is leveraged to speed up index lookups. Particularly helpful if the key has a monotonically increasing prefix, such as a timestamp. If the record key is completely random, it is better to turn this off, since range pruning will only add extra overhead to the index lookup.
`Config Param: BLOOM_INDEX_PRUNE_BY_RANGES` | +| [hoodie.bloom.index.update.partition.path](#hoodiebloomindexupdatepartitionpath) | true | Only applies if index type is GLOBAL_BLOOM. When set to true, an update including the partition path of a record that already exists will result in inserting the incoming record into the new partition and deleting the original record in the old partition. When set to false, the original record will only be updated in the old partition
`Config Param: BLOOM_INDEX_UPDATE_PARTITION_PATH_ENABLE` | +| [hoodie.bloom.index.use.caching](#hoodiebloomindexusecaching) | true | Only applies if index type is BLOOM. When true, the input RDD will be cached to speed up index lookup by reducing IO for computing parallelism or affected partitions
`Config Param: BLOOM_INDEX_USE_CACHING` | +| [hoodie.bloom.index.use.metadata](#hoodiebloomindexusemetadata) | false | Only applies if index type is BLOOM. When true, the index lookup uses bloom filters and column stats from metadata table when available to speed up the process.
`Config Param: BLOOM_INDEX_USE_METADATA`
`Since Version: 0.11.0` | +| [hoodie.bloom.index.use.treebased.filter](#hoodiebloomindexusetreebasedfilter) | true | Only applies if index type is BLOOM. When true, interval tree based file pruning optimization is enabled. This mode speeds-up file-pruning based on key ranges when compared with the brute-force mode
`Config Param: BLOOM_INDEX_TREE_BASED_FILTER` | +| [hoodie.bucket.index.merge.threshold](#hoodiebucketindexmergethreshold) | 0.2 | Control if buckets should be merged when using consistent hashing bucket index. Specifically, if a file slice size is smaller than `hoodie.xxxx.max.file.size` * threshold, then it will be considered as a merge candidate.
`Config Param: BUCKET_MERGE_THRESHOLD`
`Since Version: 0.13.0` | +| [hoodie.bucket.index.num.buckets](#hoodiebucketindexnumbuckets) | 256 | Only applies if index type is BUCKET. Determine the number of buckets in the hudi table, and each partition is divided to N buckets.
`Config Param: BUCKET_INDEX_NUM_BUCKETS` | +| [hoodie.bucket.index.split.threshold](#hoodiebucketindexsplitthreshold) | 2.0 | Control if the bucket should be split when using consistent hashing bucket index. Specifically, if a file slice size reaches `hoodie.xxxx.max.file.size` * threshold, then split will be carried out.
`Config Param: BUCKET_SPLIT_THRESHOLD`
`Since Version: 0.13.0` | +| [hoodie.global.index.reconcile.parallelism](#hoodieglobalindexreconcileparallelism) | 60 | Only applies if index type is GLOBAL_BLOOM or GLOBAL_SIMPLE. This controls the parallelism for deduplication during indexing where more than 1 record could be tagged due to partition update.
`Config Param: GLOBAL_INDEX_RECONCILE_PARALLELISM` | +| [hoodie.global.simple.index.parallelism](#hoodieglobalsimpleindexparallelism) | 0 | Only applies if index type is GLOBAL_SIMPLE. This limits the parallelism of fetching records from the base files of all table partitions. The index picks the configured parallelism if the number of base files is larger than this configured value; otherwise, the number of base files is used as the parallelism. If the indexing stage is slow due to the limited parallelism, you can increase this to tune the performance.
`Config Param: GLOBAL_SIMPLE_INDEX_PARALLELISM` | +| [hoodie.index.bucket.engine](#hoodieindexbucketengine) | SIMPLE | org.apache.hudi.index.HoodieIndex$BucketIndexEngineType: Determines the type of bucketing or hashing to use when `hoodie.index.type` is set to `BUCKET`. SIMPLE(default): Uses a fixed number of buckets for file groups which cannot shrink or expand. This works for both COW and MOR tables. CONSISTENT_HASHING: Supports dynamic number of buckets with bucket resizing to properly size each bucket. This solves potential data skew problem where one bucket can be significantly larger than others in SIMPLE engine type. This only works with MOR tables.
`Config Param: BUCKET_INDEX_ENGINE_TYPE`
`Since Version: 0.11.0` | +| [hoodie.index.class](#hoodieindexclass) | | Full path of user-defined index class and must be a subclass of HoodieIndex class. It will take precedence over the hoodie.index.type configuration if specified
`Config Param: INDEX_CLASS_NAME` | +| [hoodie.record.index.input.storage.level](#hoodierecordindexinputstoragelevel) | MEMORY_AND_DISK_SER | Only applies when #recordIndexUseCaching is set. Determine what level of persistence is used to cache input RDDs. Refer to org.apache.spark.storage.StorageLevel for different values
`Config Param: RECORD_INDEX_INPUT_STORAGE_LEVEL_VALUE`
`Since Version: 0.14.0` | +| [hoodie.record.index.update.partition.path](#hoodierecordindexupdatepartitionpath) | false | Similar to `hoodie.bloom.index.update.partition.path`, but for record index. When set to true, an update including the partition path of a record that already exists will result in inserting the incoming record into the new partition and deleting the original record in the old partition. When set to false, the original record will only be updated in the old partition.
`Config Param: RECORD_INDEX_UPDATE_PARTITION_PATH_ENABLE`
`Since Version: 0.14.0` | +| [hoodie.record.index.use.caching](#hoodierecordindexusecaching) | true | Only applies if index type is RECORD_INDEX. When true, the input RDD will be cached to speed up index lookup by reducing IO for computing parallelism or affected partitions
`Config Param: RECORD_INDEX_USE_CACHING`
`Since Version: 0.14.0` | +| [hoodie.simple.index.input.storage.level](#hoodiesimpleindexinputstoragelevel) | MEMORY_AND_DISK_SER | Only applies when #simpleIndexUseCaching is set. Determine what level of persistence is used to cache input RDDs. Refer to org.apache.spark.storage.StorageLevel for different values
`Config Param: SIMPLE_INDEX_INPUT_STORAGE_LEVEL_VALUE` | +| [hoodie.simple.index.parallelism](#hoodiesimpleindexparallelism) | 0 | Only applies if index type is SIMPLE. This limits the parallelism of fetching records from the base files of affected partitions. By default, this is auto computed based on input workload characteristics. If the parallelism is explicitly configured by the user, the user-configured value is used in defining the actual parallelism. If the indexing stage is slow due to the limited parallelism, you can increase this to tune the performance.
`Config Param: SIMPLE_INDEX_PARALLELISM` | +| [hoodie.simple.index.update.partition.path](#hoodiesimpleindexupdatepartitionpath) | true | Similar to `hoodie.bloom.index.update.partition.path`, but for simple index. When set to true, an update including the partition path of a record that already exists will result in inserting the incoming record into the new partition and deleting the original record in the old partition. When set to false, the original record will only be updated in the old partition.
`Config Param: SIMPLE_INDEX_UPDATE_PARTITION_PATH_ENABLE` | +| [hoodie.simple.index.use.caching](#hoodiesimpleindexusecaching) | true | Only applies if index type is SIMPLE. When true, the incoming writes will be cached to speed up index lookup by reducing IO for computing parallelism or affected partitions
`Config Param: SIMPLE_INDEX_USE_CACHING` | +--- + + +#### HBase Index Configs {#HBase-Index-Configs} +Configurations that control indexing behavior (when HBase based indexing is enabled), which tags incoming records as either inserts or updates to older records. + + + +[**Advanced Configs**](#HBase-Index-Configs-advanced-configs) + + +| Config Name | Default | Description | +| ---------------------------------------------------------------------------------------------------- | ------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.index.hbase.kerberos.user.keytab](#hoodieindexhbasekerberosuserkeytab) | (N/A) | File name of the kerberos keytab file for connecting to the hbase cluster.
`Config Param: KERBEROS_USER_KEYTAB` | +| [hoodie.index.hbase.kerberos.user.principal](#hoodieindexhbasekerberosuserprincipal) | (N/A) | The kerberos principal name for connecting to the hbase cluster.
`Config Param: KERBEROS_USER_PRINCIPAL` | +| [hoodie.index.hbase.master.kerberos.principal](#hoodieindexhbasemasterkerberosprincipal) | (N/A) | The value of hbase.master.kerberos.principal in hbase cluster.
`Config Param: MASTER_PRINCIPAL` | +| [hoodie.index.hbase.max.qps.fraction](#hoodieindexhbasemaxqpsfraction) | (N/A) | Maximum for HBASE_QPS_FRACTION_PROP to stabilize skewed write workloads
`Config Param: MAX_QPS_FRACTION` | +| [hoodie.index.hbase.min.qps.fraction](#hoodieindexhbaseminqpsfraction) | (N/A) | Minimum for HBASE_QPS_FRACTION_PROP to stabilize skewed write workloads
`Config Param: MIN_QPS_FRACTION` | +| [hoodie.index.hbase.regionserver.kerberos.principal](#hoodieindexhbaseregionserverkerberosprincipal) | (N/A) | The value of hbase.regionserver.kerberos.principal in hbase cluster.
`Config Param: REGIONSERVER_PRINCIPAL` | +| [hoodie.index.hbase.sleep.ms.for.get.batch](#hoodieindexhbasesleepmsforgetbatch) | (N/A) |
`Config Param: SLEEP_MS_FOR_GET_BATCH` | +| [hoodie.index.hbase.sleep.ms.for.put.batch](#hoodieindexhbasesleepmsforputbatch) | (N/A) |
`Config Param: SLEEP_MS_FOR_PUT_BATCH` | +| [hoodie.index.hbase.table](#hoodieindexhbasetable) | (N/A) | Only applies if index type is HBASE. HBase Table name to use as the index. Hudi stores the row_key and [partition_path, fileID, commitTime] mapping in the table
`Config Param: TABLENAME` | +| [hoodie.index.hbase.zknode.path](#hoodieindexhbasezknodepath) | (N/A) | Only applies if index type is HBASE. This is the root znode that will contain all the znodes created/used by HBase
`Config Param: ZK_NODE_PATH` | +| [hoodie.index.hbase.zkport](#hoodieindexhbasezkport) | (N/A) | Only applies if index type is HBASE. HBase ZK Quorum port to connect to
`Config Param: ZKPORT` | +| [hoodie.index.hbase.zkquorum](#hoodieindexhbasezkquorum) | (N/A) | Only applies if index type is HBASE. HBase ZK Quorum url to connect to
`Config Param: ZKQUORUM` | +| [hoodie.hbase.index.update.partition.path](#hoodiehbaseindexupdatepartitionpath) | false | Only applies if index type is HBASE. When an already existing record is upserted to a new partition compared to what is in storage, this config, when set, will delete the old record in the old partition and insert it as a new record in the new partition.
`Config Param: UPDATE_PARTITION_PATH_ENABLE` | +| [hoodie.index.hbase.bucket.number](#hoodieindexhbasebucketnumber) | 8 | Only applicable when using RebalancedSparkHoodieHBaseIndex, same as hbase regions count can get the best performance
`Config Param: BUCKET_NUMBER` | +| [hoodie.index.hbase.desired_puts_time_in_secs](#hoodieindexhbasedesired_puts_time_in_secs) | 600 |
`Config Param: DESIRED_PUTS_TIME_IN_SECONDS` | +| [hoodie.index.hbase.dynamic_qps](#hoodieindexhbasedynamic_qps) | false | Property to decide if HBASE_QPS_FRACTION_PROP is dynamically calculated based on write volume.
`Config Param: COMPUTE_QPS_DYNAMICALLY` | +| [hoodie.index.hbase.get.batch.size](#hoodieindexhbasegetbatchsize) | 100 | Controls the batch size for performing gets against HBase. Batching improves throughput, by saving round trips.
`Config Param: GET_BATCH_SIZE` | +| [hoodie.index.hbase.max.qps.per.region.server](#hoodieindexhbasemaxqpsperregionserver) | 1000 | Property to set maximum QPS allowed per Region Server. This should be same across various jobs. This is intended to limit the aggregate QPS generated across various jobs to an Hbase Region Server. It is recommended to set this value based on global indexing throughput needs and most importantly, how much the HBase installation in use is able to tolerate without Region Servers going down.
`Config Param: MAX_QPS_PER_REGION_SERVER` | +| [hoodie.index.hbase.put.batch.size](#hoodieindexhbaseputbatchsize) | 100 | Controls the batch size for performing puts against HBase. Batching improves throughput, by saving round trips.
`Config Param: PUT_BATCH_SIZE` | +| [hoodie.index.hbase.put.batch.size.autocompute](#hoodieindexhbaseputbatchsizeautocompute) | false | Property to set to enable auto computation of put batch size
`Config Param: PUT_BATCH_SIZE_AUTO_COMPUTE` | +| [hoodie.index.hbase.qps.allocator.class](#hoodieindexhbaseqpsallocatorclass) | org.apache.hudi.index.hbase.DefaultHBaseQPSResourceAllocator | Property to set which implementation of HBase QPS resource allocator to be used, which controls the batching rate dynamically.
`Config Param: QPS_ALLOCATOR_CLASS_NAME` | +| [hoodie.index.hbase.qps.fraction](#hoodieindexhbaseqpsfraction) | 0.5 | Property to set the fraction of the global share of QPS that should be allocated to this job. Let's say there are 3 jobs which have input size in terms of number of rows required for HbaseIndexing as x, 2x, 3x respectively. Then this fraction for the jobs would be 0.17 (1/6), 0.33 (2/6) and 0.5 (3/6) respectively. Default is 50%, which means a total of 2 jobs can run using HbaseIndex without overwhelming Region Servers.
`Config Param: QPS_FRACTION` | +| [hoodie.index.hbase.rollback.sync](#hoodieindexhbaserollbacksync) | false | When set to true, the rollback method will delete the last failed task index. The default value is false. Because deleting the index will add extra load on the Hbase cluster for each rollback
`Config Param: ROLLBACK_SYNC_ENABLE` | +| [hoodie.index.hbase.security.authentication](#hoodieindexhbasesecurityauthentication) | simple | Property to decide if the hbase cluster secure authentication is enabled or not. Possible values are 'simple' (no authentication), and 'kerberos'.
`Config Param: SECURITY_AUTHENTICATION` | +| [hoodie.index.hbase.zk.connection_timeout_ms](#hoodieindexhbasezkconnection_timeout_ms) | 15000 | Timeout to use for establishing connection with zookeeper, from HBase client.
`Config Param: ZK_CONNECTION_TIMEOUT_MS` | +| [hoodie.index.hbase.zk.session_timeout_ms](#hoodieindexhbasezksession_timeout_ms) | 60000 | Session timeout value to use for Zookeeper failure detection, for the HBase client. Lower this value, if you want to fail faster.
`Config Param: ZK_SESSION_TIMEOUT_MS` | +| [hoodie.index.hbase.zkpath.qps_root](#hoodieindexhbasezkpathqps_root) | /QPS_ROOT | chroot in zookeeper, to use for all qps allocation co-ordination.
`Config Param: ZKPATH_QPS_ROOT` | +--- + +## Reader Configs {#READER} +Please fill in the description for Config Group Name: Reader Configs + + +### Reader Configs {#Reader-Configs} +Configurations that control file group reading. + + + +[**Advanced Configs**](#Reader-Configs-advanced-configs) + + +| Config Name | Default | Description | +| ------------------------------------------------------------------------------ | ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [_hoodie.hfile.use.native.reader](#_hoodiehfileusenativereader) | true | When enabled, the native HFile reader is used to read HFiles. This is an internal config.
`Config Param: USE_NATIVE_HFILE_READER`
`Since Version: 1.0.0` | +| [hoodie.compaction.lazy.block.read](#hoodiecompactionlazyblockread) | true | When merging the delta log files, this config helps to choose whether the log blocks should be read lazily or not. Choose true to use lazy block reading (low memory usage, but incurs seeks to each block header) or false for immediate block read (higher memory usage)
`Config Param: COMPACTION_LAZY_BLOCK_READ_ENABLE` | +| [hoodie.compaction.reverse.log.read](#hoodiecompactionreverselogread) | false | HoodieLogFormatReader reads a logfile in the forward direction starting from pos=0 to pos=file_length. If this config is set to true, the reader reads the logfile in reverse direction, from pos=file_length to pos=0
`Config Param: COMPACTION_REVERSE_LOG_READ_ENABLE` | +| [hoodie.datasource.merge.type](#hoodiedatasourcemergetype) | payload_combine | For Snapshot query on merge on read table. Use this key to define how the payloads are merged: 1) skip_merge: read the base file records plus the log file records without merging; 2) payload_combine: read the base file records first, and for each record in the base file, check whether the key is in the log file records (combining the two records with the same key from the base and log files), then read the remaining log file records
`Config Param: MERGE_TYPE` | +| [hoodie.file.group.reader.enabled](#hoodiefilegroupreaderenabled) | true | Use engine agnostic file group reader if enabled
`Config Param: FILE_GROUP_READER_ENABLED`
`Since Version: 1.0.0` | +| [hoodie.merge.use.record.positions](#hoodiemergeuserecordpositions) | false | Whether to use positions in the block header for data blocks containing updates and delete blocks for merging.
`Config Param: MERGE_USE_RECORD_POSITIONS`
`Since Version: 1.0.0` | +| [hoodie.optimized.log.blocks.scan.enable](#hoodieoptimizedlogblocksscanenable) | false | New optimized scan for log blocks that handles all multi-writer use-cases while appending to log files. It also differentiates original blocks written by ingestion writers and compacted blocks written by log compaction.
`Config Param: ENABLE_OPTIMIZED_LOG_BLOCKS_SCAN`
`Since Version: 0.13.0` | +--- + +## Metastore and Catalog Sync Configs {#META_SYNC} +Configurations used by the Hudi to sync metadata to external metastores and catalogs. + + +### Common Metadata Sync Configs {#Common-Metadata-Sync-Configs} + + + + +[**Basic Configs**](#Common-Metadata-Sync-Configs-basic-configs) + + +| Config Name | Default | Description | +| --------------------------------------------------------------------- | ------- | ----------------------------------------------------------------------------------------------------------------- | +| [hoodie.datasource.meta.sync.enable](#hoodiedatasourcemetasyncenable) | false | Enable Syncing the Hudi Table with an external meta store or data catalog.
`Config Param: META_SYNC_ENABLED` | + +[**Advanced Configs**](#Common-Metadata-Sync-Configs-advanced-configs) + + +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------------------------------ | ------------------------------------------------- || +| [hoodie.datasource.hive_sync.base_file_format](#hoodiedatasourcehive_syncbase_file_format) | PARQUET | Base file format for the sync.
`Config Param: META_SYNC_BASE_FILE_FORMAT` | +| [hoodie.datasource.hive_sync.database](#hoodiedatasourcehive_syncdatabase) | default | The name of the destination database that we should sync the hudi table to.
`Config Param: META_SYNC_DATABASE_NAME` | +| [hoodie.datasource.hive_sync.partition_extractor_class](#hoodiedatasourcehive_syncpartition_extractor_class) | org.apache.hudi.hive.MultiPartKeysValueExtractor | Class which implements PartitionValueExtractor to extract the partition values, default 'org.apache.hudi.hive.MultiPartKeysValueExtractor'.
`Config Param: META_SYNC_PARTITION_EXTRACTOR_CLASS` | +| [hoodie.datasource.hive_sync.partition_fields](#hoodiedatasourcehive_syncpartition_fields) | | Field in the table to use for determining hive partition columns.
`Config Param: META_SYNC_PARTITION_FIELDS` | +| [hoodie.datasource.hive_sync.table](#hoodiedatasourcehive_synctable) | unknown | The name of the destination table that we should sync the hudi table to.
`Config Param: META_SYNC_TABLE_NAME` | +| [hoodie.datasource.meta.sync.base.path](#hoodiedatasourcemetasyncbasepath) | | Base path of the hoodie table to sync
`Config Param: META_SYNC_BASE_PATH` | +| [hoodie.datasource.meta_sync.condition.sync](#hoodiedatasourcemeta_syncconditionsync) | false | If true, only sync on conditions like schema change or partition change.
`Config Param: META_SYNC_CONDITIONAL_SYNC` | +| [hoodie.meta.sync.decode_partition](#hoodiemetasyncdecode_partition) | false | If true, meta sync will url-decode the partition path, as it is deemed as url-encoded. Default to false.
`Config Param: META_SYNC_DECODE_PARTITION` | +| [hoodie.meta.sync.incremental](#hoodiemetasyncincremental) | true | Whether to incrementally sync the partitions to the metastore, i.e., only added, changed, and deleted partitions based on the commit metadata. If set to `false`, the meta sync executes a full partition sync operation when partitions are lost.
`Config Param: META_SYNC_INCREMENTAL`
`Since Version: 0.14.0` | +| [hoodie.meta.sync.metadata_file_listing](#hoodiemetasyncmetadata_file_listing) | true | Enable the internal metadata table for file listing for syncing with metastores
`Config Param: META_SYNC_USE_FILE_LISTING_FROM_METADATA` | +| [hoodie.meta.sync.no_partition_metadata](#hoodiemetasyncno_partition_metadata) | false | If true, the partition metadata will not be synced to the metastore. This is useful when the partition metadata is large, and the partition info can be obtained from Hudi's internal metadata table. Note, `hoodie.metadata.enable` must be set to true.
`Config Param: META_SYNC_NO_PARTITION_METADATA`
`Since Version: 1.0.0` | +| [hoodie.meta.sync.sync_snapshot_with_table_name](#hoodiemetasyncsync_snapshot_with_table_name) | true | If enabled, sync meta info to the origin table
`Config Param: META_SYNC_SNAPSHOT_WITH_TABLE_NAME`
`Since Version: 0.14.0` | +| [hoodie.meta_sync.spark.version](#hoodiemeta_syncsparkversion) | | The spark version used when syncing with a metastore.
`Config Param: META_SYNC_SPARK_VERSION` | +--- + + +### Glue catalog sync based client Configurations {#Glue-catalog-sync-based-client-Configurations} +Configs that control Glue catalog sync based client. + + + +[**Basic Configs**](#Glue-catalog-sync-based-client-Configurations-basic-configs) + + +| Config Name | Default | Description | +| --------------------------------------------------------------------------------------------------------------------------- | ------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.datasource.meta.sync.glue.partition_index_fields](#hoodiedatasourcemetasyncgluepartition_index_fields) | | Specify the partitions fields to index on aws glue. Separate the fields by semicolon. By default, when the feature is enabled, all the partition will be indexed. You can create up to three indexes, separate them by comma. Eg: col1;col2;col3,col2,col3
`Config Param: META_SYNC_PARTITION_INDEX_FIELDS`
`Since Version: 0.15.0` | +| [hoodie.datasource.meta.sync.glue.partition_index_fields.enable](#hoodiedatasourcemetasyncgluepartition_index_fieldsenable) | false | Enable the AWS Glue partition index feature to speed up partition-based query patterns
`Config Param: META_SYNC_PARTITION_INDEX_FIELDS_ENABLE`
`Since Version: 0.15.0` | + +[**Advanced Configs**](#Glue-catalog-sync-based-client-Configurations-advanced-configs) + + +| Config Name | Default | Description | +| ---------------------------------------------------------------------------------------------------------------------------------------- | ------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.datasource.meta.sync.glue.all_partitions_read_parallelism](#hoodiedatasourcemetasyncglueall_partitions_read_parallelism) | 1 | Parallelism for listing all partitions(first time sync). Should be in interval [1, 10].
`Config Param: ALL_PARTITIONS_READ_PARALLELISM`
`Since Version: 0.15.0` | +| [hoodie.datasource.meta.sync.glue.changed_partitions_read_parallelism](#hoodiedatasourcemetasyncgluechanged_partitions_read_parallelism) | 1 | Parallelism for listing changed partitions(second and subsequent syncs).
`Config Param: CHANGED_PARTITIONS_READ_PARALLELISM`
`Since Version: 0.15.0` | +| [hoodie.datasource.meta.sync.glue.metadata_file_listing](#hoodiedatasourcemetasyncgluemetadata_file_listing) | false | Makes Athena use the metadata table to list partitions and files. Currently it won't benefit from other features such as stats indexes
`Config Param: GLUE_METADATA_FILE_LISTING`
`Since Version: 0.14.0` | +| [hoodie.datasource.meta.sync.glue.partition_change_parallelism](#hoodiedatasourcemetasyncgluepartition_change_parallelism) | 1 | Parallelism for change operations - such as create/update/delete.
`Config Param: PARTITION_CHANGE_PARALLELISM`
`Since Version: 0.15.0` | +| [hoodie.datasource.meta.sync.glue.recreate_table_on_error](#hoodiedatasourcemetasyncgluerecreate_table_on_error) | false | Glue sync may fail if the Glue table exists with partitions differing from the Hoodie table or if schema evolution is not supported by Glue. Enabling this configuration will drop and create the table to match the Hoodie config
`Config Param: RECREATE_GLUE_TABLE_ON_ERROR`
`Since Version: 0.14.0` | +| [hoodie.datasource.meta.sync.glue.skip_table_archive](#hoodiedatasourcemetasyncglueskip_table_archive) | true | Glue catalog sync based client will skip archiving the table version if this config is set to true
`Config Param: GLUE_SKIP_TABLE_ARCHIVE`
`Since Version: 0.14.0` | +--- + + +### BigQuery Sync Configs {#BigQuery-Sync-Configs} +Configurations used by Hudi to sync metadata to Google BigQuery. + + + +[**Basic Configs**](#BigQuery-Sync-Configs-basic-configs) + + +| Config Name | Default | Description | +| --------------------------------------------------------------------- | ------- | ----------------------------------------------------------------------------------------------------------------- | +| [hoodie.datasource.meta.sync.enable](#hoodiedatasourcemetasyncenable) | false | Enable Syncing the Hudi Table with an external meta store or data catalog.
`Config Param: META_SYNC_ENABLED` | + +[**Advanced Configs**](#BigQuery-Sync-Configs-advanced-configs) + + +| Config Name | Default | Description | +| --------------------------------------------------------------------------------------------------------------- | ------------------------------------------------- | --- | +| [hoodie.gcp.bigquery.sync.big_lake_connection_id](#hoodiegcpbigquerysyncbig_lake_connection_id) | (N/A) | The Big Lake connection ID to use
`Config Param: BIGQUERY_SYNC_BIG_LAKE_CONNECTION_ID`
`Since Version: 0.14.1` | +| [hoodie.gcp.bigquery.sync.billing.project.id](#hoodiegcpbigquerysyncbillingprojectid) | (N/A) | Name of the billing project id in BigQuery. By default it uses the configuration from `hoodie.gcp.bigquery.sync.project_id` if this configuration is not set. This can only be used with manifest file based approach
`Config Param: BIGQUERY_SYNC_BILLING_PROJECT_ID`
`Since Version: 1.0.0` | +| [hoodie.gcp.bigquery.sync.dataset_location](#hoodiegcpbigquerysyncdataset_location) | (N/A) | Location of the target dataset in BigQuery
`Config Param: BIGQUERY_SYNC_DATASET_LOCATION` | +| [hoodie.gcp.bigquery.sync.project_id](#hoodiegcpbigquerysyncproject_id) | (N/A) | Name of the target project in BigQuery
`Config Param: BIGQUERY_SYNC_PROJECT_ID` | +| [hoodie.gcp.bigquery.sync.source_uri](#hoodiegcpbigquerysyncsource_uri) | (N/A) | Name of the source uri gcs path of the table
`Config Param: BIGQUERY_SYNC_SOURCE_URI` | +| [hoodie.gcp.bigquery.sync.source_uri_prefix](#hoodiegcpbigquerysyncsource_uri_prefix) | (N/A) | Name of the source uri gcs path prefix of the table
`Config Param: BIGQUERY_SYNC_SOURCE_URI_PREFIX` | +| [hoodie.datasource.hive_sync.base_file_format](#hoodiedatasourcehive_syncbase_file_format) | PARQUET | Base file format for the sync.
`Config Param: META_SYNC_BASE_FILE_FORMAT` | +| [hoodie.datasource.hive_sync.database](#hoodiedatasourcehive_syncdatabase) | default | The name of the destination database that we should sync the hudi table to.
`Config Param: META_SYNC_DATABASE_NAME` | +| [hoodie.datasource.hive_sync.partition_extractor_class](#hoodiedatasourcehive_syncpartition_extractor_class) | org.apache.hudi.hive.MultiPartKeysValueExtractor | Class which implements PartitionValueExtractor to extract the partition values, default 'org.apache.hudi.hive.MultiPartKeysValueExtractor'.
`Config Param: META_SYNC_PARTITION_EXTRACTOR_CLASS` | +| [hoodie.datasource.hive_sync.partition_fields](#hoodiedatasourcehive_syncpartition_fields) | | Field in the table to use for determining hive partition columns.
`Config Param: META_SYNC_PARTITION_FIELDS` | +| [hoodie.datasource.hive_sync.table](#hoodiedatasourcehive_synctable) | unknown | The name of the destination table that we should sync the hudi table to.
`Config Param: META_SYNC_TABLE_NAME` | +| [hoodie.datasource.meta.sync.base.path](#hoodiedatasourcemetasyncbasepath) | | Base path of the hoodie table to sync
`Config Param: META_SYNC_BASE_PATH` | +| [hoodie.datasource.meta_sync.condition.sync](#hoodiedatasourcemeta_syncconditionsync) | false | If true, only sync on conditions like schema change or partition change.
`Config Param: META_SYNC_CONDITIONAL_SYNC` | +| [hoodie.gcp.bigquery.sync.dataset_name](#hoodiegcpbigquerysyncdataset_name) | | Name of the target dataset in BigQuery
`Config Param: BIGQUERY_SYNC_DATASET_NAME` | +| [hoodie.gcp.bigquery.sync.partition_fields](#hoodiegcpbigquerysyncpartition_fields) | | Comma-delimited partition fields. Default to non-partitioned.
`Config Param: BIGQUERY_SYNC_PARTITION_FIELDS` | +| [hoodie.gcp.bigquery.sync.require_partition_filter](#hoodiegcpbigquerysyncrequire_partition_filter) | false | If true, configure table to require a partition filter to be specified when querying the table
`Config Param: BIGQUERY_SYNC_REQUIRE_PARTITION_FILTER`
`Since Version: 0.14.1` | +| [hoodie.gcp.bigquery.sync.table_name](#hoodiegcpbigquerysynctable_name) | | Name of the target table in BigQuery
`Config Param: BIGQUERY_SYNC_TABLE_NAME` | +| [hoodie.gcp.bigquery.sync.use_bq_manifest_file](#hoodiegcpbigquerysyncuse_bq_manifest_file) | false | If true, generate a manifest file with data file absolute paths and use BigQuery manifest file support to directly create one external table over the Hudi table. If false (default), generate a manifest file with data file names and create two external tables and one view in BigQuery. Query the view for the same results as querying the Hudi table
`Config Param: BIGQUERY_SYNC_USE_BQ_MANIFEST_FILE`
`Since Version: 0.14.0` | +| [hoodie.gcp.bigquery.sync.use_file_listing_from_metadata](#hoodiegcpbigquerysyncuse_file_listing_from_metadata) | true | Fetch file listing from Hudi's metadata
`Config Param: BIGQUERY_SYNC_USE_FILE_LISTING_FROM_METADATA` | +| [hoodie.meta.sync.decode_partition](#hoodiemetasyncdecode_partition) | false | If true, meta sync will url-decode the partition path, as it is deemed as url-encoded. Default to false.
`Config Param: META_SYNC_DECODE_PARTITION` | +| [hoodie.meta.sync.incremental](#hoodiemetasyncincremental) | true | Whether to incrementally sync the partitions to the metastore, i.e., only added, changed, and deleted partitions based on the commit metadata. If set to `false`, the meta sync executes a full partition sync operation when partitions are lost.
`Config Param: META_SYNC_INCREMENTAL`
`Since Version: 0.14.0` | +| [hoodie.meta.sync.metadata_file_listing](#hoodiemetasyncmetadata_file_listing) | true | Enable the internal metadata table for file listing for syncing with metastores
`Config Param: META_SYNC_USE_FILE_LISTING_FROM_METADATA` | +| [hoodie.meta.sync.no_partition_metadata](#hoodiemetasyncno_partition_metadata) | false | If true, the partition metadata will not be synced to the metastore. This is useful when the partition metadata is large, and the partition info can be obtained from Hudi's internal metadata table. Note: `hoodie.metadata.enable` (default: true; enables the internal metadata table, which serves table metadata such as file listings; available since version 0.7.0) must be set to true.
`Config Param: META_SYNC_NO_PARTITION_METADATA`
`Since Version: 1.0.0` | +| [hoodie.meta.sync.sync_snapshot_with_table_name](#hoodiemetasyncsync_snapshot_with_table_name) | true | Sync meta info to the origin table if enabled
`Config Param: META_SYNC_SNAPSHOT_WITH_TABLE_NAME`
`Since Version: 0.14.0` | +| [hoodie.meta_sync.spark.version](#hoodiemeta_syncsparkversion) | | The spark version used when syncing with a metastore.
`Config Param: META_SYNC_SPARK_VERSION` | +--- + + +### Hive Sync Configs {#Hive-Sync-Configs} +Configurations used by Hudi to sync metadata to Hive Metastore. + + + +[**Basic Configs**](#Hive-Sync-Configs-basic-configs) + + +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------- | ----------------------------- | ----------------------------------------------------------------------------------------------------------------- | +| [hoodie.datasource.hive_sync.mode](#hoodiedatasourcehive_syncmode) | (N/A) | Mode to choose for Hive ops. Valid values are hms, jdbc and hiveql.
`Config Param: HIVE_SYNC_MODE` | +| [hoodie.datasource.hive_sync.enable](#hoodiedatasourcehive_syncenable) | false | When set to true, register/sync the table to Apache Hive metastore.
`Config Param: HIVE_SYNC_ENABLED` | +| [hoodie.datasource.hive_sync.jdbcurl](#hoodiedatasourcehive_syncjdbcurl) | jdbc:hive2://localhost:10000 | Hive metastore url
`Config Param: HIVE_URL` | +| [hoodie.datasource.hive_sync.metastore.uris](#hoodiedatasourcehive_syncmetastoreuris) | thrift://localhost:9083 | Hive metastore url
`Config Param: METASTORE_URIS` | +| [hoodie.datasource.meta.sync.enable](#hoodiedatasourcemetasyncenable) | false | Enable Syncing the Hudi Table with an external meta store or data catalog.
`Config Param: META_SYNC_ENABLED` | + +[**Advanced Configs**](#Hive-Sync-Configs-advanced-configs) + + +| Config Name | Default | Description | +| ---------------------------------------------------------------------------------------------------------------- | ------------------------------------------------- | --- | +| [hoodie.datasource.hive_sync.serde_properties](#hoodiedatasourcehive_syncserde_properties) | (N/A) | Serde properties to hive table.
`Config Param: HIVE_TABLE_SERDE_PROPERTIES` | +| [hoodie.datasource.hive_sync.table_properties](#hoodiedatasourcehive_synctable_properties) | (N/A) | Additional properties to store with table.
`Config Param: HIVE_TABLE_PROPERTIES` | +| [hoodie.datasource.hive_sync.auto_create_database](#hoodiedatasourcehive_syncauto_create_database) | true | Auto create the Hive database if it does not exist
`Config Param: HIVE_AUTO_CREATE_DATABASE` | +| [hoodie.datasource.hive_sync.base_file_format](#hoodiedatasourcehive_syncbase_file_format) | PARQUET | Base file format for the sync.
`Config Param: META_SYNC_BASE_FILE_FORMAT` | +| [hoodie.datasource.hive_sync.batch_num](#hoodiedatasourcehive_syncbatch_num) | 1000 | The number of partitions in one batch when syncing partitions to Hive.
`Config Param: HIVE_BATCH_SYNC_PARTITION_NUM` | +| [hoodie.datasource.hive_sync.bucket_sync](#hoodiedatasourcehive_syncbucket_sync) | false | Whether to sync the Hive metastore bucket specification when using bucket index. The specification is 'CLUSTERED BY (trace_id) SORTED BY (trace_id ASC) INTO 65536 BUCKETS'
`Config Param: HIVE_SYNC_BUCKET_SYNC` | +| [hoodie.datasource.hive_sync.bucket_sync_spec](#hoodiedatasourcehive_syncbucket_sync_spec) | | The Hive metastore bucket specification when using bucket index. The specification is 'CLUSTERED BY (trace_id) SORTED BY (trace_id ASC) INTO 65536 BUCKETS'
`Config Param: HIVE_SYNC_BUCKET_SYNC_SPEC` | +| [hoodie.datasource.hive_sync.create_managed_table](#hoodiedatasourcehive_synccreate_managed_table) | false | Whether to sync the table as managed table.
`Config Param: HIVE_CREATE_MANAGED_TABLE` | +| [hoodie.datasource.hive_sync.database](#hoodiedatasourcehive_syncdatabase) | default | The name of the destination database that we should sync the hudi table to.
`Config Param: META_SYNC_DATABASE_NAME` | +| [hoodie.datasource.hive_sync.filter_pushdown_enabled](#hoodiedatasourcehive_syncfilter_pushdown_enabled) | false | Whether to enable push down partitions by filter
`Config Param: HIVE_SYNC_FILTER_PUSHDOWN_ENABLED` | +| [hoodie.datasource.hive_sync.filter_pushdown_max_size](#hoodiedatasourcehive_syncfilter_pushdown_max_size) | 1000 | Max size limit to push down partition filters. If the estimated push down filters exceed this size, sync will directly try to fetch all partitions between the min/max. In case of the Glue metastore, this value should be reduced because it has a filter length limit.
`Config Param: HIVE_SYNC_FILTER_PUSHDOWN_MAX_SIZE` | +| [hoodie.datasource.hive_sync.ignore_exceptions](#hoodiedatasourcehive_syncignore_exceptions) | false | Ignore exceptions when syncing with Hive.
`Config Param: HIVE_IGNORE_EXCEPTIONS` | +| [hoodie.datasource.hive_sync.omit_metadata_fields](#hoodiedatasourcehive_syncomit_metadata_fields) | false | Whether to omit the hoodie metadata fields in the target table.
`Config Param: HIVE_SYNC_OMIT_METADATA_FIELDS`
`Since Version: 0.13.0` | +| [hoodie.datasource.hive_sync.partition_extractor_class](#hoodiedatasourcehive_syncpartition_extractor_class) | org.apache.hudi.hive.MultiPartKeysValueExtractor | Class which implements PartitionValueExtractor to extract the partition values, default 'org.apache.hudi.hive.MultiPartKeysValueExtractor'.
`Config Param: META_SYNC_PARTITION_EXTRACTOR_CLASS` | +| [hoodie.datasource.hive_sync.partition_fields](#hoodiedatasourcehive_syncpartition_fields) | | Field in the table to use for determining hive partition columns.
`Config Param: META_SYNC_PARTITION_FIELDS` | +| [hoodie.datasource.hive_sync.password](#hoodiedatasourcehive_syncpassword) | hive | hive password to use
`Config Param: HIVE_PASS` | +| [hoodie.datasource.hive_sync.recreate_table_on_error](#hoodiedatasourcehive_syncrecreate_table_on_error) | false | Hive sync may fail if the Hive table exists with partitions differing from the Hoodie table or if schema evolution is not supported by Hive. Enabling this configuration will drop and create the table to match the Hoodie config
`Config Param: RECREATE_HIVE_TABLE_ON_ERROR`
`Since Version: 0.14.0` | +| [hoodie.datasource.hive_sync.schema_string_length_thresh](#hoodiedatasourcehive_syncschema_string_length_thresh) | 4000 |
`Config Param: HIVE_SYNC_SCHEMA_STRING_LENGTH_THRESHOLD` | +| [hoodie.datasource.hive_sync.skip_ro_suffix](#hoodiedatasourcehive_syncskip_ro_suffix) | false | Skip the _ro suffix for Read optimized table, when registering
`Config Param: HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE` | +| [hoodie.datasource.hive_sync.support_timestamp](#hoodiedatasourcehive_syncsupport_timestamp) | false | ‘INT64’ with original type TIMESTAMP_MICROS is converted to hive ‘timestamp’ type. Disabled by default for backward compatibility. NOTE: On Spark entrypoints, this is defaulted to TRUE
`Config Param: HIVE_SUPPORT_TIMESTAMP_TYPE` | +| [hoodie.datasource.hive_sync.sync_as_datasource](#hoodiedatasourcehive_syncsync_as_datasource) | true |
`Config Param: HIVE_SYNC_AS_DATA_SOURCE_TABLE` | +| [hoodie.datasource.hive_sync.sync_comment](#hoodiedatasourcehive_syncsync_comment) | false | Whether to sync the table column comments while syncing the table.
`Config Param: HIVE_SYNC_COMMENT` | +| [hoodie.datasource.hive_sync.table](#hoodiedatasourcehive_synctable) | unknown | The name of the destination table that we should sync the hudi table to.
`Config Param: META_SYNC_TABLE_NAME` | +| [hoodie.datasource.hive_sync.table.strategy](#hoodiedatasourcehive_synctablestrategy) | ALL | Hive table synchronization strategy. Available option: RO, RT, ALL.
`Config Param: HIVE_SYNC_TABLE_STRATEGY`
`Since Version: 0.13.0` | +| [hoodie.datasource.hive_sync.use_jdbc](#hoodiedatasourcehive_syncuse_jdbc) | true | Use JDBC when hive synchronization is enabled
`Config Param: HIVE_USE_JDBC` | +| [hoodie.datasource.hive_sync.use_pre_apache_input_format](#hoodiedatasourcehive_syncuse_pre_apache_input_format) | false | Flag to choose InputFormat under com.uber.hoodie package instead of org.apache.hudi package. Use this when you are in the process of migrating from com.uber.hoodie to org.apache.hudi. Stop using this after you migrated the table definition to org.apache.hudi input format
`Config Param: HIVE_USE_PRE_APACHE_INPUT_FORMAT` | +| [hoodie.datasource.hive_sync.username](#hoodiedatasourcehive_syncusername) | hive | hive user name to use
`Config Param: HIVE_USER` | +| [hoodie.datasource.meta.sync.base.path](#hoodiedatasourcemetasyncbasepath) | | Base path of the hoodie table to sync
`Config Param: META_SYNC_BASE_PATH` | +| [hoodie.datasource.meta_sync.condition.sync](#hoodiedatasourcemeta_syncconditionsync) | false | If true, only sync on conditions like schema change or partition change.
`Config Param: META_SYNC_CONDITIONAL_SYNC` | +| [hoodie.meta.sync.decode_partition](#hoodiemetasyncdecode_partition) | false | If true, meta sync will url-decode the partition path, as it is deemed as url-encoded. Default to false.
`Config Param: META_SYNC_DECODE_PARTITION` | +| [hoodie.meta.sync.incremental](#hoodiemetasyncincremental) | true | Whether to incrementally sync the partitions to the metastore, i.e., only added, changed, and deleted partitions based on the commit metadata. If set to `false`, the meta sync executes a full partition sync operation when partitions are lost.
`Config Param: META_SYNC_INCREMENTAL`
`Since Version: 0.14.0` | +| [hoodie.meta.sync.metadata_file_listing](#hoodiemetasyncmetadata_file_listing) | true | Enable the internal metadata table for file listing for syncing with metastores
`Config Param: META_SYNC_USE_FILE_LISTING_FROM_METADATA` | +| [hoodie.meta.sync.no_partition_metadata](#hoodiemetasyncno_partition_metadata) | false | If true, the partition metadata will not be synced to the metastore. This is useful when the partition metadata is large, and the partition info can be obtained from Hudi's internal metadata table. Note: `hoodie.metadata.enable` (default: true; enables the internal metadata table, which serves table metadata such as file listings; available since version 0.7.0) must be set to true.
`Config Param: META_SYNC_NO_PARTITION_METADATA`
`Since Version: 1.0.0` | +| [hoodie.meta.sync.sync_snapshot_with_table_name](#hoodiemetasyncsync_snapshot_with_table_name) | true | Sync meta info to the origin table if enabled
`Config Param: META_SYNC_SNAPSHOT_WITH_TABLE_NAME`
`Since Version: 0.14.0` | +| [hoodie.meta_sync.spark.version](#hoodiemeta_syncsparkversion) | | The spark version used when syncing with a metastore.
`Config Param: META_SYNC_SPARK_VERSION` | +--- + + +### Global Hive Sync Configs {#Global-Hive-Sync-Configs} +Global replication configurations used by Hudi to sync metadata to Hive Metastore. + + + +[**Basic Configs**](#Global-Hive-Sync-Configs-basic-configs) + + +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------- | ----------------------------- | ----------------------------------------------------------------------------------------------------------------- | +| [hoodie.datasource.hive_sync.mode](#hoodiedatasourcehive_syncmode) | (N/A) | Mode to choose for Hive ops. Valid values are hms, jdbc and hiveql.
`Config Param: HIVE_SYNC_MODE` | +| [hoodie.datasource.hive_sync.enable](#hoodiedatasourcehive_syncenable) | false | When set to true, register/sync the table to Apache Hive metastore.
`Config Param: HIVE_SYNC_ENABLED` | +| [hoodie.datasource.hive_sync.jdbcurl](#hoodiedatasourcehive_syncjdbcurl) | jdbc:hive2://localhost:10000 | Hive metastore url
`Config Param: HIVE_URL` | +| [hoodie.datasource.hive_sync.metastore.uris](#hoodiedatasourcehive_syncmetastoreuris) | thrift://localhost:9083 | Hive metastore url
`Config Param: METASTORE_URIS` | +| [hoodie.datasource.meta.sync.enable](#hoodiedatasourcemetasyncenable) | false | Enable Syncing the Hudi Table with an external meta store or data catalog.
`Config Param: META_SYNC_ENABLED` | + +[**Advanced Configs**](#Global-Hive-Sync-Configs-advanced-configs) + + +| Config Name | Default | Description | +| ---------------------------------------------------------------------------------------------------------------- | ------------------------------------------------- | --- | +| [hoodie.datasource.hive_sync.serde_properties](#hoodiedatasourcehive_syncserde_properties) | (N/A) | Serde properties to hive table.
`Config Param: HIVE_TABLE_SERDE_PROPERTIES` | +| [hoodie.datasource.hive_sync.table_properties](#hoodiedatasourcehive_synctable_properties) | (N/A) | Additional properties to store with table.
`Config Param: HIVE_TABLE_PROPERTIES` | +| [hoodie.meta_sync.global.replicate.timestamp](#hoodiemeta_syncglobalreplicatetimestamp) | (N/A) |
`Config Param: META_SYNC_GLOBAL_REPLICATE_TIMESTAMP` | +| [hoodie.datasource.hive_sync.auto_create_database](#hoodiedatasourcehive_syncauto_create_database) | true | Auto create the Hive database if it does not exist
`Config Param: HIVE_AUTO_CREATE_DATABASE` | +| [hoodie.datasource.hive_sync.base_file_format](#hoodiedatasourcehive_syncbase_file_format) | PARQUET | Base file format for the sync.
`Config Param: META_SYNC_BASE_FILE_FORMAT` | +| [hoodie.datasource.hive_sync.batch_num](#hoodiedatasourcehive_syncbatch_num) | 1000 | The number of partitions in one batch when syncing partitions to Hive.
`Config Param: HIVE_BATCH_SYNC_PARTITION_NUM` | +| [hoodie.datasource.hive_sync.bucket_sync](#hoodiedatasourcehive_syncbucket_sync) | false | Whether to sync the Hive metastore bucket specification when using bucket index. The specification is 'CLUSTERED BY (trace_id) SORTED BY (trace_id ASC) INTO 65536 BUCKETS'
`Config Param: HIVE_SYNC_BUCKET_SYNC` | +| [hoodie.datasource.hive_sync.bucket_sync_spec](#hoodiedatasourcehive_syncbucket_sync_spec) | | The Hive metastore bucket specification when using bucket index. The specification is 'CLUSTERED BY (trace_id) SORTED BY (trace_id ASC) INTO 65536 BUCKETS'
`Config Param: HIVE_SYNC_BUCKET_SYNC_SPEC` | +| [hoodie.datasource.hive_sync.create_managed_table](#hoodiedatasourcehive_synccreate_managed_table) | false | Whether to sync the table as managed table.
`Config Param: HIVE_CREATE_MANAGED_TABLE` | +| [hoodie.datasource.hive_sync.database](#hoodiedatasourcehive_syncdatabase) | default | The name of the destination database that we should sync the hudi table to.
`Config Param: META_SYNC_DATABASE_NAME` | +| [hoodie.datasource.hive_sync.filter_pushdown_enabled](#hoodiedatasourcehive_syncfilter_pushdown_enabled) | false | Whether to enable push down partitions by filter
`Config Param: HIVE_SYNC_FILTER_PUSHDOWN_ENABLED` | +| [hoodie.datasource.hive_sync.filter_pushdown_max_size](#hoodiedatasourcehive_syncfilter_pushdown_max_size) | 1000 | Max size limit to push down partition filters. If the estimated push down filters exceed this size, sync will directly try to fetch all partitions between the min/max. In case of the Glue metastore, this value should be reduced because it has a filter length limit.
`Config Param: HIVE_SYNC_FILTER_PUSHDOWN_MAX_SIZE` | +| [hoodie.datasource.hive_sync.ignore_exceptions](#hoodiedatasourcehive_syncignore_exceptions) | false | Ignore exceptions when syncing with Hive.
`Config Param: HIVE_IGNORE_EXCEPTIONS` | +| [hoodie.datasource.hive_sync.omit_metadata_fields](#hoodiedatasourcehive_syncomit_metadata_fields) | false | Whether to omit the hoodie metadata fields in the target table.
`Config Param: HIVE_SYNC_OMIT_METADATA_FIELDS`
`Since Version: 0.13.0` | +| [hoodie.datasource.hive_sync.partition_extractor_class](#hoodiedatasourcehive_syncpartition_extractor_class) | org.apache.hudi.hive.MultiPartKeysValueExtractor | Class which implements PartitionValueExtractor to extract the partition values, default 'org.apache.hudi.hive.MultiPartKeysValueExtractor'.
`Config Param: META_SYNC_PARTITION_EXTRACTOR_CLASS` | +| [hoodie.datasource.hive_sync.partition_fields](#hoodiedatasourcehive_syncpartition_fields) | | Field in the table to use for determining hive partition columns.
`Config Param: META_SYNC_PARTITION_FIELDS` | +| [hoodie.datasource.hive_sync.password](#hoodiedatasourcehive_syncpassword) | hive | hive password to use
`Config Param: HIVE_PASS` | +| [hoodie.datasource.hive_sync.recreate_table_on_error](#hoodiedatasourcehive_syncrecreate_table_on_error) | false | Hive sync may fail if the Hive table exists with partitions differing from the Hoodie table or if schema evolution is not supported by Hive. Enabling this configuration will drop and create the table to match the Hoodie config
`Config Param: RECREATE_HIVE_TABLE_ON_ERROR`
`Since Version: 0.14.0` | +| [hoodie.datasource.hive_sync.schema_string_length_thresh](#hoodiedatasourcehive_syncschema_string_length_thresh) | 4000 |
`Config Param: HIVE_SYNC_SCHEMA_STRING_LENGTH_THRESHOLD` | +| [hoodie.datasource.hive_sync.skip_ro_suffix](#hoodiedatasourcehive_syncskip_ro_suffix) | false | Skip the _ro suffix for Read optimized table, when registering
`Config Param: HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE` | +| [hoodie.datasource.hive_sync.support_timestamp](#hoodiedatasourcehive_syncsupport_timestamp) | false | ‘INT64’ with original type TIMESTAMP_MICROS is converted to hive ‘timestamp’ type. Disabled by default for backward compatibility. NOTE: On Spark entrypoints, this is defaulted to TRUE
`Config Param: HIVE_SUPPORT_TIMESTAMP_TYPE` | +| [hoodie.datasource.hive_sync.sync_as_datasource](#hoodiedatasourcehive_syncsync_as_datasource) | true |
`Config Param: HIVE_SYNC_AS_DATA_SOURCE_TABLE` | +| [hoodie.datasource.hive_sync.sync_comment](#hoodiedatasourcehive_syncsync_comment) | false | Whether to sync the table column comments while syncing the table.
`Config Param: HIVE_SYNC_COMMENT` | +| [hoodie.datasource.hive_sync.table](#hoodiedatasourcehive_synctable) | unknown | The name of the destination table that we should sync the hudi table to.
`Config Param: META_SYNC_TABLE_NAME` | +| [hoodie.datasource.hive_sync.table.strategy](#hoodiedatasourcehive_synctablestrategy) | ALL | Hive table synchronization strategy. Available option: RO, RT, ALL.
`Config Param: HIVE_SYNC_TABLE_STRATEGY`
`Since Version: 0.13.0` | +| [hoodie.datasource.hive_sync.use_jdbc](#hoodiedatasourcehive_syncuse_jdbc) | true | Use JDBC when hive synchronization is enabled
`Config Param: HIVE_USE_JDBC` | +| [hoodie.datasource.hive_sync.use_pre_apache_input_format](#hoodiedatasourcehive_syncuse_pre_apache_input_format) | false | Flag to choose InputFormat under com.uber.hoodie package instead of org.apache.hudi package. Use this when you are in the process of migrating from com.uber.hoodie to org.apache.hudi. Stop using this after you migrated the table definition to org.apache.hudi input format
`Config Param: HIVE_USE_PRE_APACHE_INPUT_FORMAT` | +| [hoodie.datasource.hive_sync.username](#hoodiedatasourcehive_syncusername) | hive | hive user name to use
`Config Param: HIVE_USER` | +| [hoodie.datasource.meta.sync.base.path](#hoodiedatasourcemetasyncbasepath) | | Base path of the hoodie table to sync
`Config Param: META_SYNC_BASE_PATH` | +| [hoodie.datasource.meta_sync.condition.sync](#hoodiedatasourcemeta_syncconditionsync) | false | If true, only sync on conditions like schema change or partition change.
`Config Param: META_SYNC_CONDITIONAL_SYNC` | +| [hoodie.meta.sync.decode_partition](#hoodiemetasyncdecode_partition) | false | If true, meta sync will url-decode the partition path, as it is deemed as url-encoded. Default to false.
`Config Param: META_SYNC_DECODE_PARTITION` | +| [hoodie.meta.sync.incremental](#hoodiemetasyncincremental) | true | Whether to incrementally sync the partitions to the metastore, i.e., only added, changed, and deleted partitions based on the commit metadata. If set to `false`, the meta sync executes a full partition sync operation when partitions are lost.
`Config Param: META_SYNC_INCREMENTAL`
`Since Version: 0.14.0` | +| [hoodie.meta.sync.metadata_file_listing](#hoodiemetasyncmetadata_file_listing) | true | Enable the internal metadata table for file listing for syncing with metastores
`Config Param: META_SYNC_USE_FILE_LISTING_FROM_METADATA` | +| [hoodie.meta.sync.no_partition_metadata](#hoodiemetasyncno_partition_metadata) | false | If true, the partition metadata will not be synced to the metastore. This is useful when the partition metadata is large, and the partition info can be obtained from Hudi's internal metadata table. Note: `hoodie.metadata.enable` (default: true; enables the internal metadata table, which serves table metadata such as file listings; available since version 0.7.0) must be set to true.
`Config Param: META_SYNC_NO_PARTITION_METADATA`
`Since Version: 1.0.0` | +| [hoodie.meta.sync.sync_snapshot_with_table_name](#hoodiemetasyncsync_snapshot_with_table_name) | true | Sync meta info to the origin table if enabled
`Config Param: META_SYNC_SNAPSHOT_WITH_TABLE_NAME`
`Since Version: 0.14.0` | +| [hoodie.meta_sync.spark.version](#hoodiemeta_syncsparkversion) | | The spark version used when syncing with a metastore.
`Config Param: META_SYNC_SPARK_VERSION` | +--- + + +### DataHub Sync Configs {#DataHub-Sync-Configs} +Configurations used by the Hudi to sync metadata to DataHub. + + + +[**Basic Configs**](#DataHub-Sync-Configs-basic-configs) + + +| Config Name | Default | Description | +| --------------------------------------------------------------------- | ------- | ----------------------------------------------------------------------------------------------------------------- | +| [hoodie.datasource.meta.sync.enable](#hoodiedatasourcemetasyncenable) | false | Enable Syncing the Hudi Table with an external meta store or data catalog.
`Config Param: META_SYNC_ENABLED` | + +[**Advanced Configs**](#DataHub-Sync-Configs-advanced-configs) + + +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------- | ------------------------------------------------------------------- | +| [hoodie.meta.sync.datahub.emitter.server](#hoodiemetasyncdatahubemitterserver) | (N/A) | Server URL of the DataHub instance.
`Config Param: META_SYNC_DATAHUB_EMITTER_SERVER` | +| [hoodie.meta.sync.datahub.emitter.supplier.class](#hoodiemetasyncdatahubemittersupplierclass) | (N/A) | Pluggable class to supply a DataHub REST emitter to connect to the DataHub instance. This overwrites other emitter configs.
`Config Param: META_SYNC_DATAHUB_EMITTER_SUPPLIER_CLASS` | +| [hoodie.meta.sync.datahub.emitter.token](#hoodiemetasyncdatahubemittertoken) | (N/A) | Auth token to connect to the DataHub instance.
`Config Param: META_SYNC_DATAHUB_EMITTER_TOKEN` | +| [hoodie.datasource.hive_sync.base_file_format](#hoodiedatasourcehive_syncbase_file_format) | PARQUET | Base file format for the sync.
`Config Param: META_SYNC_BASE_FILE_FORMAT` | +| [hoodie.datasource.hive_sync.database](#hoodiedatasourcehive_syncdatabase) | default | The name of the destination database that we should sync the hudi table to.
`Config Param: META_SYNC_DATABASE_NAME` | +| [hoodie.datasource.hive_sync.partition_extractor_class](#hoodiedatasourcehive_syncpartition_extractor_class) | org.apache.hudi.hive.MultiPartKeysValueExtractor | Class which implements PartitionValueExtractor to extract the partition values, default 'org.apache.hudi.hive.MultiPartKeysValueExtractor'.
`Config Param: META_SYNC_PARTITION_EXTRACTOR_CLASS` | +| [hoodie.datasource.hive_sync.partition_fields](#hoodiedatasourcehive_syncpartition_fields) | | Field in the table to use for determining hive partition columns.
`Config Param: META_SYNC_PARTITION_FIELDS` | +| [hoodie.datasource.hive_sync.table](#hoodiedatasourcehive_synctable) | unknown | The name of the destination table that we should sync the hudi table to.
`Config Param: META_SYNC_TABLE_NAME` | +| [hoodie.datasource.meta.sync.base.path](#hoodiedatasourcemetasyncbasepath) | | Base path of the hoodie table to sync
`Config Param: META_SYNC_BASE_PATH` | +| [hoodie.datasource.meta_sync.condition.sync](#hoodiedatasourcemeta_syncconditionsync) | false | If true, only sync on conditions like schema change or partition change.
`Config Param: META_SYNC_CONDITIONAL_SYNC` | +| [hoodie.meta.sync.datahub.dataplatform.name](#hoodiemetasyncdatahubdataplatformname) | hudi | String used to represent Hudi when creating its corresponding DataPlatform entity within Datahub
`Config Param: META_SYNC_DATAHUB_DATAPLATFORM_NAME` | +| [hoodie.meta.sync.datahub.dataset.env](#hoodiemetasyncdatahubdatasetenv) | DEV | Environment to use when pushing entities to Datahub
`Config Param: META_SYNC_DATAHUB_DATASET_ENV` | +| [hoodie.meta.sync.datahub.dataset.identifier.class](#hoodiemetasyncdatahubdatasetidentifierclass) | org.apache.hudi.sync.datahub.config.HoodieDataHubDatasetIdentifier | Pluggable class to help provide info to identify a DataHub Dataset.
`Config Param: META_SYNC_DATAHUB_DATASET_IDENTIFIER_CLASS` | +| [hoodie.meta.sync.decode_partition](#hoodiemetasyncdecode_partition) | false | If true, meta sync will url-decode the partition path, as it is deemed as url-encoded. Default to false.
`Config Param: META_SYNC_DECODE_PARTITION` | +| [hoodie.meta.sync.incremental](#hoodiemetasyncincremental) | true | Whether to incrementally sync the partitions to the metastore, i.e., only added, changed, and deleted partitions based on the commit metadata. If set to `false`, the meta sync executes a full partition sync operation when partitions are lost.
`Config Param: META_SYNC_INCREMENTAL`
`Since Version: 0.14.0` | +| [hoodie.meta.sync.metadata_file_listing](#hoodiemetasyncmetadata_file_listing) | true | Enable the internal metadata table for file listing for syncing with metastores
`Config Param: META_SYNC_USE_FILE_LISTING_FROM_METADATA` | +| [hoodie.meta.sync.no_partition_metadata](#hoodiemetasyncno_partition_metadata) | false | If true, the partition metadata will not be synced to the metastore. This is useful when the partition metadata is large, and the partition info can be obtained from Hudi's internal metadata table. Note, `hoodie.metadata.enable` must be set to true.
`Config Param: META_SYNC_NO_PARTITION_METADATA`
`Since Version: 1.0.0` | +| [hoodie.meta.sync.sync_snapshot_with_table_name](#hoodiemetasyncsync_snapshot_with_table_name) | true | Sync meta info to origin table if enabled
`Config Param: META_SYNC_SNAPSHOT_WITH_TABLE_NAME`
`Since Version: 0.14.0` | +| [hoodie.meta_sync.spark.version](#hoodiemeta_syncsparkversion) | | The spark version used when syncing with a metastore.
`Config Param: META_SYNC_SPARK_VERSION` | +--- + +## Metrics Configs {#METRICS} +These set of configs are used to enable monitoring and reporting of key Hudi stats and metrics. + + +### Metrics Configurations for Amazon CloudWatch {#Metrics-Configurations-for-Amazon-CloudWatch} +Enables reporting on Hudi metrics using Amazon CloudWatch. Hudi publishes metrics on every commit, clean, rollback etc. + + + +[**Advanced Configs**](#Metrics-Configurations-for-Amazon-CloudWatch-advanced-configs) + + +| Config Name | Default | Description | +| ---------------------------------------------------------------------------------------------- | ------- | --------------------------------------------------------------------------------------------------------- | +| [hoodie.metrics.cloudwatch.maxDatumsPerRequest](#hoodiemetricscloudwatchmaxDatumsPerRequest) | 20 | Max number of Datums per request
`Config Param: MAX_DATUMS_PER_REQUEST`
`Since Version: 0.10.0` | +| [hoodie.metrics.cloudwatch.metric.prefix](#hoodiemetricscloudwatchmetricprefix) | | Metric prefix of reporter
`Config Param: METRIC_PREFIX`
`Since Version: 0.10.0` | +| [hoodie.metrics.cloudwatch.namespace](#hoodiemetricscloudwatchnamespace) | Hudi | Namespace of reporter
`Config Param: METRIC_NAMESPACE`
`Since Version: 0.10.0` | +| [hoodie.metrics.cloudwatch.report.period.seconds](#hoodiemetricscloudwatchreportperiodseconds) | 60 | Reporting interval in seconds
`Config Param: REPORT_PERIOD_SECONDS`
`Since Version: 0.10.0` | +--- + + +### Metrics Configurations {#Metrics-Configurations} +Enables reporting on Hudi metrics. Hudi publishes metrics on every commit, clean, rollback etc. The following sections list the supported reporters. + + + +[**Basic Configs**](#Metrics-Configurations-basic-configs) + + +| Config Name | Default | Description | +| ----------------------------------------------------------------------------- | --------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.metrics.on](#hoodiemetricson) | false | Turn on/off metrics reporting. off by default.
`Config Param: TURN_METRICS_ON`
`Since Version: 0.5.0` | +| [hoodie.metrics.reporter.type](#hoodiemetricsreportertype) | GRAPHITE | Type of metrics reporter.
`Config Param: METRICS_REPORTER_TYPE_VALUE`
`Since Version: 0.5.0` | +| [hoodie.metricscompaction.log.blocks.on](#hoodiemetricscompactionlogblockson) | false | Turn on/off metrics reporting for log blocks with compaction commit. off by default.
`Config Param: TURN_METRICS_COMPACTION_LOG_BLOCKS_ON`
`Since Version: 0.14.0` | + +[**Advanced Configs**](#Metrics-Configurations-advanced-configs) + + +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------- | ------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.metrics.executor.enable](#hoodiemetricsexecutorenable) | (N/A) |
`Config Param: EXECUTOR_METRICS_ENABLE`
`Since Version: 0.7.0` | +| [hoodie.metrics.configs.properties](#hoodiemetricsconfigsproperties) | | Comma separated list of config file paths for metric exporter configs
`Config Param: METRICS_REPORTER_FILE_BASED_CONFIGS_PATH`
`Since Version: 0.14.0` | +| [hoodie.metrics.lock.enable](#hoodiemetricslockenable) | false | Enable metrics for locking infra. Useful when operating in multiwriter mode
`Config Param: LOCK_METRICS_ENABLE`
`Since Version: 0.13.0` | +| [hoodie.metrics.reporter.class](#hoodiemetricsreporterclass) | |
`Config Param: METRICS_REPORTER_CLASS_NAME`
`Since Version: 0.6.0` | +| [hoodie.metrics.reporter.metricsname.prefix](#hoodiemetricsreportermetricsnameprefix) | | The prefix given to the metrics names.
`Config Param: METRICS_REPORTER_PREFIX`
`Since Version: 0.11.0` | +--- + + +### Metrics Configurations for Datadog reporter {#Metrics-Configurations-for-Datadog-reporter} +Enables reporting on Hudi metrics using the Datadog reporter type. Hudi publishes metrics on every commit, clean, rollback etc. + + + +[**Advanced Configs**](#Metrics-Configurations-for-Datadog-reporter-advanced-configs) + + +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------------- | ------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.metrics.datadog.api.key](#hoodiemetricsdatadogapikey) | (N/A) | Datadog API key
`Config Param: API_KEY`
`Since Version: 0.6.0` | +| [hoodie.metrics.datadog.api.key.supplier](#hoodiemetricsdatadogapikeysupplier) | (N/A) | Datadog API key supplier to supply the API key at runtime. This will take effect if hoodie.metrics.datadog.api.key is not set.
`Config Param: API_KEY_SUPPLIER`
`Since Version: 0.6.0` | +| [hoodie.metrics.datadog.api.site](#hoodiemetricsdatadogapisite) | (N/A) | Datadog API site: EU or US
`Config Param: API_SITE_VALUE`
`Since Version: 0.6.0` | +| [hoodie.metrics.datadog.metric.host](#hoodiemetricsdatadogmetrichost) | (N/A) | Datadog metric host to be sent along with metrics data.
`Config Param: METRIC_HOST_NAME`
`Since Version: 0.6.0` | +| [hoodie.metrics.datadog.metric.prefix](#hoodiemetricsdatadogmetricprefix) | (N/A) | Datadog metric prefix to be prepended to each metric name with a dot as delimiter. For example, if it is set to foo, foo. will be prepended.
`Config Param: METRIC_PREFIX_VALUE`
`Since Version: 0.6.0` | +| [hoodie.metrics.datadog.metric.tags](#hoodiemetricsdatadogmetrictags) | (N/A) | Datadog metric tags (comma-delimited) to be sent along with metrics data.
`Config Param: METRIC_TAG_VALUES`
`Since Version: 0.6.0` | +| [hoodie.metrics.datadog.api.key.skip.validation](#hoodiemetricsdatadogapikeyskipvalidation) | false | Before sending metrics via Datadog API, whether to skip validating Datadog API key or not. Default to false.
`Config Param: API_KEY_SKIP_VALIDATION`
`Since Version: 0.6.0` | +| [hoodie.metrics.datadog.api.timeout.seconds](#hoodiemetricsdatadogapitimeoutseconds) | 3 | Datadog API timeout in seconds. Default to 3.
`Config Param: API_TIMEOUT_IN_SECONDS`
`Since Version: 0.6.0` | +| [hoodie.metrics.datadog.report.period.seconds](#hoodiemetricsdatadogreportperiodseconds) | 30 | Datadog reporting period in seconds. Default to 30.
`Config Param: REPORT_PERIOD_IN_SECONDS`
`Since Version: 0.6.0` | +--- + + +### Metrics Configurations for Graphite {#Metrics-Configurations-for-Graphite} +Enables reporting on Hudi metrics using Graphite. Hudi publishes metrics on every commit, clean, rollback etc. + + + +[**Advanced Configs**](#Metrics-Configurations-for-Graphite-advanced-configs) + + +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------------ | ---------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.metrics.graphite.metric.prefix](#hoodiemetricsgraphitemetricprefix) | (N/A) | Standard prefix applied to all metrics. This helps to add datacenter, environment information for e.g
`Config Param: GRAPHITE_METRIC_PREFIX_VALUE`
`Since Version: 0.5.1` | +| [hoodie.metrics.graphite.host](#hoodiemetricsgraphitehost) | localhost | Graphite host to connect to.
`Config Param: GRAPHITE_SERVER_HOST_NAME`
`Since Version: 0.5.0` | +| [hoodie.metrics.graphite.port](#hoodiemetricsgraphiteport) | 4756 | Graphite port to connect to.
`Config Param: GRAPHITE_SERVER_PORT_NUM`
`Since Version: 0.5.0` | +| [hoodie.metrics.graphite.report.period.seconds](#hoodiemetricsgraphitereportperiodseconds) | 30 | Graphite reporting period in seconds. Default to 30.
`Config Param: GRAPHITE_REPORT_PERIOD_IN_SECONDS`
`Since Version: 0.10.0` | +--- + + +### Metrics Configurations for Jmx {#Metrics-Configurations-for-Jmx} +Enables reporting on Hudi metrics using Jmx. Hudi publishes metrics on every commit, clean, rollback etc. + + + +[**Advanced Configs**](#Metrics-Configurations-for-Jmx-advanced-configs) + + +| Config Name | Default | Description | +| ------------------------------------------------ | ---------- | ------------------------------------------------------------------------------------- | +| [hoodie.metrics.jmx.host](#hoodiemetricsjmxhost) | localhost | Jmx host to connect to
`Config Param: JMX_HOST_NAME`
`Since Version: 0.5.1` | +| [hoodie.metrics.jmx.port](#hoodiemetricsjmxport) | 9889 | Jmx port to connect to
`Config Param: JMX_PORT_NUM`
`Since Version: 0.5.1` | +--- + + +### Metrics Configurations for M3 {#Metrics-Configurations-for-M3} +Enables reporting on Hudi metrics using M3. Hudi publishes metrics on every commit, clean, rollback etc. + + + +[**Basic Configs**](#Metrics-Configurations-for-M3-basic-configs) + + +| Config Name | Default | Description | +| ---------------------------------------------------- | ----------- | --------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.metrics.m3.env](#hoodiemetricsm3env) | production | M3 tag to label the environment (defaults to 'production'), applied to all metrics.
`Config Param: M3_ENV`
`Since Version: 0.15.0` | +| [hoodie.metrics.m3.host](#hoodiemetricsm3host) | localhost | M3 host to connect to.
`Config Param: M3_SERVER_HOST_NAME`
`Since Version: 0.15.0` | +| [hoodie.metrics.m3.port](#hoodiemetricsm3port) | 9052 | M3 port to connect to.
`Config Param: M3_SERVER_PORT_NUM`
`Since Version: 0.15.0` | +| [hoodie.metrics.m3.service](#hoodiemetricsm3service) | hoodie | M3 tag to label the service name (defaults to 'hoodie'), applied to all metrics.
`Config Param: M3_SERVICE`
`Since Version: 0.15.0` | +| [hoodie.metrics.m3.tags](#hoodiemetricsm3tags) | | Optional M3 tags applied to all metrics.
`Config Param: M3_TAGS`
`Since Version: 0.15.0` | +--- + + +### Metrics Configurations for Prometheus {#Metrics-Configurations-for-Prometheus} +Enables reporting on Hudi metrics using Prometheus. Hudi publishes metrics on every commit, clean, rollback etc. + + + +[**Advanced Configs**](#Metrics-Configurations-for-Prometheus-advanced-configs) + + +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------------------- | ---------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.metrics.prometheus.port](#hoodiemetricsprometheusport) | 9090 | Port for prometheus server.
`Config Param: PROMETHEUS_PORT_NUM`
`Since Version: 0.6.0` | +| [hoodie.metrics.pushgateway.delete.on.shutdown](#hoodiemetricspushgatewaydeleteonshutdown) | true | Delete the pushgateway info or not when job shutdown, true by default.
`Config Param: PUSHGATEWAY_DELETE_ON_SHUTDOWN_ENABLE`
`Since Version: 0.6.0` | +| [hoodie.metrics.pushgateway.host](#hoodiemetricspushgatewayhost) | localhost | Hostname of the prometheus push gateway.
`Config Param: PUSHGATEWAY_HOST_NAME`
`Since Version: 0.6.0` | +| [hoodie.metrics.pushgateway.job.name](#hoodiemetricspushgatewayjobname) | | Name of the push gateway job.
`Config Param: PUSHGATEWAY_JOBNAME`
`Since Version: 0.6.0` | +| [hoodie.metrics.pushgateway.port](#hoodiemetricspushgatewayport) | 9091 | Port for the push gateway.
`Config Param: PUSHGATEWAY_PORT_NUM`
`Since Version: 0.6.0` | +| [hoodie.metrics.pushgateway.random.job.name.suffix](#hoodiemetricspushgatewayrandomjobnamesuffix) | true | Whether the pushgateway job name needs a random suffix, default true.
`Config Param: PUSHGATEWAY_RANDOM_JOBNAME_SUFFIX`
`Since Version: 0.6.0` | +| [hoodie.metrics.pushgateway.report.labels](#hoodiemetricspushgatewayreportlabels) | | Label for the metrics emitted to the Pushgateway. Labels can be specified with key:value pairs separated by commas
`Config Param: PUSHGATEWAY_LABELS`
`Since Version: 0.14.0` | +| [hoodie.metrics.pushgateway.report.period.seconds](#hoodiemetricspushgatewayreportperiodseconds) | 30 | Reporting interval in seconds.
`Config Param: PUSHGATEWAY_REPORT_PERIOD_IN_SECONDS`
`Since Version: 0.6.0` | +--- + +## Record Payload Config {#RECORD_PAYLOAD} +This is the lowest level of customization offered by Hudi. Record payloads define how to produce new values to upsert based on incoming new record and stored old record. Hudi provides default implementations such as OverwriteWithLatestAvroPayload which simply update table with the latest/last-written record. This can be overridden to a custom class extending HoodieRecordPayload class, on both datasource and WriteClient levels. + + +### Payload Configurations {#Payload-Configurations} +Payload related configs, that can be leveraged to control merges based on specific business fields in the data. + + + +[**Advanced Configs**](#Payload-Configurations-advanced-configs) + + +| Config Name | Default | Description | +| ---------------------------------------------------------------- | -------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.compaction.payload.class](#hoodiecompactionpayloadclass) | org.apache.hudi.common.model.DefaultHoodieRecordPayload | This needs to be same as class used during insert/upserts. Just like writing, compaction also uses the record payload class to merge records in the log against each other, merge again with the base file and produce the final record to be written after compaction.
`Config Param: PAYLOAD_CLASS_NAME` | +| [hoodie.payload.event.time.field](#hoodiepayloadeventtimefield) | ts | Table column/field name to derive timestamp associated with the records. This canbe useful for e.g, determining the freshness of the table.
`Config Param: EVENT_TIME_FIELD` | +| [hoodie.payload.ordering.field](#hoodiepayloadorderingfield) | ts | Table column/field name to order records that have the same key, before merging and writing to storage.
`Config Param: ORDERING_FIELD` | +--- + +## Kafka Connect Configs {#KAFKA_CONNECT} +These set of configs are used for Kafka Connect Sink Connector for writing Hudi Tables + + +### Kafka Sink Connect Configurations {#Kafka-Sink-Connect-Configurations} +Configurations for Kafka Connect Sink Connector for Hudi. + + + +[**Basic Configs**](#Kafka-Sink-Connect-Configurations-basic-configs) + + +| Config Name | Default | Description | +| -------------------------------------- | --------------- | ----------------------------------------------------------------------------------------- | +| [bootstrap.servers](#bootstrapservers) | localhost:9092 | The bootstrap servers for the Kafka Cluster.
`Config Param: KAFKA_BOOTSTRAP_SERVERS` | + +[**Advanced Configs**](#Kafka-Sink-Connect-Configurations-advanced-configs) + + +| Config Name | Default | Description | +| -------------------------------------------------------------------------------------- | ----------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hadoop.conf.dir](#hadoopconfdir) | (N/A) | The Hadoop configuration directory.
`Config Param: HADOOP_CONF_DIR` | +| [hadoop.home](#hadoophome) | (N/A) | The Hadoop home directory.
`Config Param: HADOOP_HOME` | +| [hoodie.kafka.allow.commit.on.errors](#hoodiekafkaallowcommitonerrors) | true | Commit even when some records failed to be written
`Config Param: ALLOW_COMMIT_ON_ERRORS` | +| [hoodie.kafka.commit.interval.secs](#hoodiekafkacommitintervalsecs) | 60 | The interval at which Hudi will commit the records written to the files, making them consumable on the read-side.
`Config Param: COMMIT_INTERVAL_SECS` | +| [hoodie.kafka.compaction.async.enable](#hoodiekafkacompactionasyncenable) | true | Controls whether async compaction should be turned on for MOR table writing.
`Config Param: ASYNC_COMPACT_ENABLE` | +| [hoodie.kafka.control.topic](#hoodiekafkacontroltopic) | hudi-control-topic | Kafka topic name used by the Hudi Sink Connector for sending and receiving control messages. Not used for data records.
`Config Param: CONTROL_TOPIC_NAME` | +| [hoodie.kafka.coordinator.write.timeout.secs](#hoodiekafkacoordinatorwritetimeoutsecs) | 300 | The timeout after sending an END_COMMIT until when the coordinator will wait for the write statuses from all the partitionsto ignore the current commit and start a new commit.
`Config Param: COORDINATOR_WRITE_TIMEOUT_SECS` | +| [hoodie.meta.sync.classes](#hoodiemetasyncclasses) | org.apache.hudi.hive.HiveSyncTool | Meta sync client tool, using comma to separate multi tools
`Config Param: META_SYNC_CLASSES` | +| [hoodie.meta.sync.enable](#hoodiemetasyncenable) | false | Enable Meta Sync such as Hive
`Config Param: META_SYNC_ENABLE` | +| [hoodie.schemaprovider.class](#hoodieschemaproviderclass) | org.apache.hudi.schema.FilebasedSchemaProvider | subclass of org.apache.hudi.schema.SchemaProvider to attach schemas to input & target table data, built in options: org.apache.hudi.schema.FilebasedSchemaProvider.
`Config Param: SCHEMA_PROVIDER_CLASS` | +--- + +## Amazon Web Services Configs {#AWS} +Configurations specific to Amazon Web Services. + + +### Amazon Web Services Configs {#Amazon-Web-Services-Configs} +Amazon Web Services configurations to access resources like Amazon DynamoDB (for locks), Amazon CloudWatch (metrics) and Amazon Glue (metadata). + + + +[**Advanced Configs**](#Amazon-Web-Services-Configs-advanced-configs) + + +| Config Name | Default | Description | +| --------------------------------------------------------- | ------- | ----------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.aws.access.key](#hoodieawsaccesskey) | (N/A) | AWS access key id
`Config Param: AWS_ACCESS_KEY`
`Since Version: 0.10.0` | +| [hoodie.aws.glue.endpoint](#hoodieawsglueendpoint) | (N/A) | Aws glue endpoint
`Config Param: AWS_GLUE_ENDPOINT`
`Since Version: 0.15.0` | +| [hoodie.aws.glue.region](#hoodieawsglueregion) | (N/A) | Aws glue region
`Config Param: AWS_GLUE_REGION`
`Since Version: 0.15.0` | +| [hoodie.aws.role.arn](#hoodieawsrolearn) | (N/A) | AWS Role ARN to assume
`Config Param: AWS_ASSUME_ROLE_ARN`
`Since Version: 0.15.0` | +| [hoodie.aws.role.external.id](#hoodieawsroleexternalid) | (N/A) | External ID to use when assuming the AWS Role
`Config Param: AWS_ASSUME_ROLE_EXTERNAL_ID`
`Since Version: 0.15.0` | +| [hoodie.aws.secret.key](#hoodieawssecretkey) | (N/A) | AWS secret key
`Config Param: AWS_SECRET_KEY`
`Since Version: 0.10.0` | +| [hoodie.aws.session.token](#hoodieawssessiontoken) | (N/A) | AWS session token
`Config Param: AWS_SESSION_TOKEN`
`Since Version: 0.10.0` | +| [hoodie.aws.role.session.name](#hoodieawsrolesessionname) | hoodie | Session name to use when assuming the AWS Role
`Config Param: AWS_ASSUME_ROLE_SESSION_NAME`
`Since Version: 0.15.0` | +--- + +## Hudi Streamer Configs {#HUDI_STREAMER} +These set of configs are used for Hudi Streamer utility which provides the way to ingest from different sources such as DFS or Kafka. + + +### Hudi Streamer Configs {#Hudi-Streamer-Configs} + + + + +[**Basic Configs**](#Hudi-Streamer-Configs-basic-configs) + + +| Config Name | Default | Description | +| --------------------------------------------------------------------- | ------- | ----------------------------------------------------------------------------------------------------- | +| [hoodie.streamer.source.kafka.topic](#hoodiestreamersourcekafkatopic) | (N/A) | Kafka topic name. The config is specific to HoodieMultiTableStreamer
`Config Param: KAFKA_TOPIC` | + +[**Advanced Configs**](#Hudi-Streamer-Configs-advanced-configs) + + +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------------------------------------ | ------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.streamer.checkpoint.provider.path](#hoodiestreamercheckpointproviderpath) | (N/A) | The path for providing the checkpoints.
`Config Param: CHECKPOINT_PROVIDER_PATH` | +| [hoodie.streamer.ingestion.tablesToBeIngested](#hoodiestreameringestiontablesToBeIngested) | (N/A) | Comma separated names of tables to be ingested in the format <database>.<table>, for example db1.table1,db1.table2
`Config Param: TABLES_TO_BE_INGESTED` | +| [hoodie.streamer.transformer.class](#hoodiestreamertransformerclass) | (N/A) | Names of transformer classes to apply. The config is specific to HoodieMultiTableStreamer.
`Config Param: TRANSFORMER_CLASS`
`Since Version: 0.14.0` | +| [hoodie.streamer.checkpoint.force.skip](#hoodiestreamercheckpointforceskip) | false | Config to force to skip saving checkpoint in the commit metadata. It is typically used in one-time backfill scenarios, where checkpoints are not to be persisted.
`Config Param: CHECKPOINT_FORCE_SKIP` | +| [hoodie.streamer.ingestion.targetBasePath](#hoodiestreameringestiontargetBasePath) | | The path to which a particular table is ingested. The config is specific to HoodieMultiTableStreamer and overrides path determined using option `--base-path-prefix` for a table. This config is ignored for a single table streamer
`Config Param: TARGET_BASE_PATH` | +| [hoodie.streamer.row.throw.explicit.exceptions](#hoodiestreamerrowthrowexplicitexceptions) | false | When enabled, the dataframe generated from reading source data is wrapped with an exception handler to explicitly surface exceptions.
`Config Param: ROW_THROW_EXPLICIT_EXCEPTIONS`
`Since Version: 0.15.0` | +| [hoodie.streamer.sample.writes.enabled](#hoodiestreamersamplewritesenabled) | false | Set this to true to sample from the first batch of records and write to the auxiliary path, before writing to the table. The sampled records are used to calculate the average record size. The relevant write client will have `hoodie.copyonwrite.record.size.estimate` being overwritten by the calculated result.
`Config Param: SAMPLE_WRITES_ENABLED`
`Since Version: 0.14.0` | +| [hoodie.streamer.sample.writes.size](#hoodiestreamersamplewritessize) | 5000 | Number of records to sample from the first write. To improve the estimation's accuracy, for smaller or more compressible record size, set the sample size bigger. For bigger or less compressible record size, set smaller.
`Config Param: SAMPLE_WRITES_SIZE`
`Since Version: 0.14.0` | +| [hoodie.streamer.source.kafka.append.offsets](#hoodiestreamersourcekafkaappendoffsets) | false | When enabled, appends kafka offset info like source offset(_hoodie_kafka_source_offset), partition (_hoodie_kafka_source_partition) and timestamp (_hoodie_kafka_source_timestamp) to the records. By default it is disabled and no kafka offsets are added
`Config Param: KAFKA_APPEND_OFFSETS` | +| [hoodie.streamer.source.sanitize.invalid.char.mask](#hoodiestreamersourcesanitizeinvalidcharmask) | __ | Defines the character sequence that replaces invalid characters in schema field names if hoodie.streamer.source.sanitize.invalid.schema.field.names is enabled.
`Config Param: SCHEMA_FIELD_NAME_INVALID_CHAR_MASK` | +| [hoodie.streamer.source.sanitize.invalid.schema.field.names](#hoodiestreamersourcesanitizeinvalidschemafieldnames) | false | Sanitizes names of invalid schema fields both in the data read from source and also in the schema. Replaces invalid characters with hoodie.streamer.source.sanitize.invalid.char.mask. Invalid characters are determined by the Avro naming convention (https://avro.apache.org/docs/current/spec.html#names).
`Config Param: SANITIZE_SCHEMA_FIELD_NAMES` | +--- + + +### Hudi Streamer SQL Transformer Configs {#Hudi-Streamer-SQL-Transformer-Configs} +Configurations controlling the behavior of SQL transformer in Hudi Streamer. + + + +[**Basic Configs**](#Hudi-Streamer-SQL-Transformer-Configs-basic-configs) + + +| Config Name | Default | Description | +| ------------------------------------------------------------------------- | ------- | -------------------------------------------------------------------------------------------- | +| [hoodie.streamer.transformer.sql](#hoodiestreamertransformersql) | (N/A) | SQL Query to be executed during write
`Config Param: TRANSFORMER_SQL` | +| [hoodie.streamer.transformer.sql.file](#hoodiestreamertransformersqlfile) | (N/A) | File with a SQL script to be executed during write
`Config Param: TRANSFORMER_SQL_FILE` | +--- + + +### Hudi Streamer Source Configs {#DELTA_STREAMER_SOURCE} +Configurations controlling the behavior of reading source data. + + +#### Cloud Source Configs {#Cloud-Source-Configs} +Configs that are common during ingestion across different cloud stores + + + +[**Advanced Configs**](#Cloud-Source-Configs-advanced-configs) + + +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------------------------------------------------------ | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.streamer.source.cloud.data.datasource.options](#hoodiestreamersourceclouddatadatasourceoptions) | (N/A) | A JSON string passed to the Spark DataFrameReader while loading the dataset. Example: `hoodie.streamer.gcp.spark.datasource.options={"header":"true","encoding":"UTF-8"}`
`Config Param: SPARK_DATASOURCE_OPTIONS` | +| [hoodie.streamer.source.cloud.data.ignore.relpath.prefix](#hoodiestreamersourceclouddataignorerelpathprefix) | (N/A) | Ignore objects in the bucket whose relative path starts with this prefix
`Config Param: IGNORE_RELATIVE_PATH_PREFIX` | +| [hoodie.streamer.source.cloud.data.ignore.relpath.substring](#hoodiestreamersourceclouddataignorerelpathsubstring) | (N/A) | Ignore objects in the bucket whose relative path contains this substring
`Config Param: IGNORE_RELATIVE_PATH_SUBSTR` | +| [hoodie.streamer.source.cloud.data.partition.fields.from.path](#hoodiestreamersourceclouddatapartitionfieldsfrompath) | (N/A) | A comma delimited list of path-based partition fields in the source file structure.
`Config Param: PATH_BASED_PARTITION_FIELDS`
`Since Version: 0.14.0` | +| [hoodie.streamer.source.cloud.data.partition.max.size](#hoodiestreamersourceclouddatapartitionmaxsize) | (N/A) | specify this value in bytes, to coalesce partitions of source dataset not greater than specified limit
`Config Param: SOURCE_MAX_BYTES_PER_PARTITION`
`Since Version: 0.14.1` | +| [hoodie.streamer.source.cloud.data.select.file.extension](#hoodiestreamersourceclouddataselectfileextension) | (N/A) | Only match files with this extension. By default, this is the same as hoodie.streamer.source.hoodieincr.file.format
`Config Param: CLOUD_DATAFILE_EXTENSION` | +| [hoodie.streamer.source.cloud.data.select.relative.path.regex](#hoodiestreamersourceclouddataselectrelativepathregex) | (N/A) | Only selects objects in the bucket whose relative path matches this regex. For example: When hoodie.streamer.source.cloud.data.select.relpath.prefix is set to /path/prefix, and the hoodie.streamer.source.cloud.data.select.relative.path.regex is regex/files[0-9]+, only files located in the /path/prefix/regex directory that match the pattern (e.g., file1, file2, etc.) will be ingested. If hoodie.streamer.source.cloud.data.select.relpath.prefix is not set, the ingestion process will look for files matching /regex/files[0-9]+ in the source bucket.
`Config Param: SELECT_RELATIVE_PATH_REGEX`
`Since Version: 1.0.0` | +| [hoodie.streamer.source.cloud.data.select.relpath.prefix](#hoodiestreamersourceclouddataselectrelpathprefix) | (N/A) | Only selects objects in the bucket whose relative path starts with this prefix
`Config Param: SELECT_RELATIVE_PATH_PREFIX` | +| [hoodie.streamer.source.cloud.data.check.file.exists](#hoodiestreamersourceclouddatacheckfileexists) | false | If true, checks whether file exists before attempting to pull it
`Config Param: ENABLE_EXISTS_CHECK` | +| [hoodie.streamer.source.cloud.data.datafile.format](#hoodiestreamersourceclouddatadatafileformat) | parquet | Format of the data file. By default, this will be the same as hoodie.streamer.source.hoodieincr.file.format
`Config Param: DATAFILE_FORMAT` | +| [hoodie.streamer.source.cloud.data.reader.coalesce.aliases](#hoodiestreamersourceclouddatareadercoalescealiases) | true | Boolean value to allow coalesce alias columns with actual columns while reading from source
`Config Param: SPARK_DATASOURCE_READER_COALESCE_ALIAS_COLUMNS`
`Since Version: 1.0.0` | +| [hoodie.streamer.source.cloud.data.reader.comma.separated.path.format](#hoodiestreamersourceclouddatareadercommaseparatedpathformat) | false | Boolean value for specifying path format in load args of `spark.read.format("..").load("a.xml,b.xml,c.xml")`. Set to true if the path format needs to be a comma-separated string value; if false, it's passed as an array of strings like `spark.read.format("..").load(new String[]{a.xml,b.xml,c.xml})`
`Config Param: SPARK_DATASOURCE_READER_COMMA_SEPARATED_PATH_FORMAT`
`Since Version: 0.14.1` | +| [hoodie.streamer.source.cloud.meta.ack](#hoodiestreamersourcecloudmetaack) | true | Whether to acknowledge Metadata messages during Cloud Ingestion or not. This is useful during dev and testing. In Prod this should always be true. In case of Cloud Pubsub, not acknowledging means Pubsub will keep redelivering the same messages.
`Config Param: ACK_MESSAGES` | +| [hoodie.streamer.source.cloud.meta.batch.size](#hoodiestreamersourcecloudmetabatchsize) | 10 | Number of metadata messages to pull in one API call to the cloud events queue. Multiple API calls with this batch size are sent to cloud events queue, until we consume hoodie.streamer.source.cloud.meta.max.num.messages.per.sync from the queue or hoodie.streamer.source.cloud.meta.max.fetch.time.per.sync.secs amount of time has passed or queue is empty.
`Config Param: BATCH_SIZE_CONF` | +| [hoodie.streamer.source.cloud.meta.max.fetch.time.per.sync.secs](#hoodiestreamersourcecloudmetamaxfetchtimepersyncsecs) | 60 | Max time in secs to consume hoodie.streamer.source.cloud.meta.max.num.messages.per.sync messages from cloud queue. Cloud event queues like SQS, PubSub can return empty responses even when messages are available in the queue; this config ensures we don't wait forever to consume MAX_MESSAGES_CONF messages, but time out and move on further.
`Config Param: MAX_FETCH_TIME_PER_SYNC_SECS`
`Since Version: 0.14.1` | +| [hoodie.streamer.source.cloud.meta.max.num.messages.per.sync](#hoodiestreamersourcecloudmetamaxnummessagespersync) | 1000 | Maximum number of messages to consume per sync round. Multiple rounds of hoodie.streamer.source.cloud.meta.batch.size could be invoked to reach max messages as configured by this config
`Config Param: MAX_NUM_MESSAGES_PER_SYNC`
`Since Version: 0.14.1` | +--- + + +#### DFS Path Selector Configs {#DFS-Path-Selector-Configs} +Configurations controlling the behavior of path selector for DFS source in Hudi Streamer. + + + +[**Basic Configs**](#DFS-Path-Selector-Configs-basic-configs) + + +| Config Name | Default | Description | +| --------------------------------------------------------------- | ------- | ------------------------------------------------------------------- | +| [hoodie.streamer.source.dfs.root](#hoodiestreamersourcedfsroot) | (N/A) | Root path of the source on DFS
`Config Param: ROOT_INPUT_PATH` | + +[**Advanced Configs**](#DFS-Path-Selector-Configs-advanced-configs) + + +| Config Name | Default | Description | +| --------------------------------------------------------------------------- | ------- | ---------------------------------------------------------------- | +| [hoodie.streamer.source.input.selector](#hoodiestreamersourceinputselector) | (N/A) | Source input selector
`Config Param: SOURCE_INPUT_SELECTOR` | +--- + + +#### Date Partition Path Selector Configs {#Date-Partition-Path-Selector-Configs} +Configurations controlling the behavior of date partition path selector for DFS source in Hudi Streamer. + + + +[**Advanced Configs**](#Date-Partition-Path-Selector-Configs-advanced-configs) + + +| Config Name | Default | Description | +| -------------------------------------------------------------------------------------------------------------------------------- | ----------- | ---------------------------------------------------------------------------------------------------- | +| [hoodie.streamer.source.dfs.datepartitioned.selector.currentdate](#hoodiestreamersourcedfsdatepartitionedselectorcurrentdate) | (N/A) | Current date.
`Config Param: CURRENT_DATE` | +| [hoodie.streamer.source.dfs.datepartitioned.date.format](#hoodiestreamersourcedfsdatepartitioneddateformat) | yyyy-MM-dd | Date format.
`Config Param: DATE_FORMAT` | +| [hoodie.streamer.source.dfs.datepartitioned.selector.depth](#hoodiestreamersourcedfsdatepartitionedselectordepth) | 0 | Depth of the files to scan. 0 implies no (date) partition.
`Config Param: DATE_PARTITION_DEPTH` | +| [hoodie.streamer.source.dfs.datepartitioned.selector.lookback.days](#hoodiestreamersourcedfsdatepartitionedselectorlookbackdays) | 2 | The maximum look-back days for scanning.
`Config Param: LOOKBACK_DAYS` | +| [hoodie.streamer.source.dfs.datepartitioned.selector.parallelism](#hoodiestreamersourcedfsdatepartitionedselectorparallelism) | 20 | Parallelism for listing partitions.
`Config Param: PARTITIONS_LIST_PARALLELISM` | +--- + + +#### GCS Events Source Configs {#GCS-Events-Source-Configs} +Configurations controlling the behavior of GCS Events Source in Hudi Streamer. + + + +[**Advanced Configs**](#GCS-Events-Source-Configs-advanced-configs) + + +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------ | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| [hoodie.streamer.source.gcs.project.id](#hoodiestreamersourcegcsprojectid) | (N/A) | The GCP Project Id where the Pubsub Subscription to ingest from resides. Needed to connect to the Pubsub subscription
`Config Param: GOOGLE_PROJECT_ID` | +| [hoodie.streamer.source.gcs.subscription.id](#hoodiestreamersourcegcssubscriptionid) | (N/A) | The GCP Pubsub subscription id for the GCS Notifications. Needed to connect to the Pubsub subscription
`Config Param: PUBSUB_SUBSCRIPTION_ID` | +--- + + +#### Hive Incremental Pulling Source Configs {#Hive-Incremental-Pulling-Source-Configs} +Configurations controlling the behavior of incremental pulling from a Hive table as a source in Hudi Streamer. + + + +[**Advanced Configs**](#Hive-Incremental-Pulling-Source-Configs-advanced-configs) + + +| Config Name | Default | Description | +| ------------------------------------------------------------------------- | ------- | -------------------------------------------------------------------------------------- | +| [hoodie.streamer.source.incrpull.root](#hoodiestreamersourceincrpullroot) | (N/A) | The root path of Hive incremental pulling source.
`Config Param: ROOT_INPUT_PATH` | +--- + + +#### Hudi Incremental Source Configs {#Hudi-Incremental-Source-Configs} +Configurations controlling the behavior of incremental pulling from a Hudi table as a source in Hudi Streamer. + + + +[**Basic Configs**](#Hudi-Incremental-Source-Configs-basic-configs) + + +| Config Name | Default | Description | +| ----------------------------------------------------------------------------- | ------- | ----------------------------------------------------------------------------- | +| [hoodie.streamer.source.hoodieincr.path](#hoodiestreamersourcehoodieincrpath) | (N/A) | Base-path for the source Hudi table
`Config Param: HOODIE_SRC_BASE_PATH` | + +[**Advanced Configs**](#Hudi-Incremental-Source-Configs-advanced-configs) + + +| Config Name | Default | Description | +| -------------------------------------------------------------------------------------------------------------------------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| [hoodie.streamer.source.hoodieincr.data.datasource.options](#hoodiestreamersourcehoodieincrdatadatasourceoptions) | (N/A) | A comma-separated list of Hudi options that can be passed to the spark dataframe reader of a hudi table, eg: `hoodie.metadata.enable=true,hoodie.enable.data.skipping=true`. Used only for incremental source.
`Config Param: HOODIE_INCREMENTAL_SPARK_DATASOURCE_OPTIONS`
`Since Version: 0.15.0` | +| [hoodie.streamer.source.hoodieincr.missing.checkpoint.strategy](#hoodiestreamersourcehoodieincrmissingcheckpointstrategy) | (N/A) | Allows Hudi Streamer to decide the instant to consume from when checkpoint is not set. Possible values: [READ_LATEST (Read from latest commit in hoodie source table), READ_UPTO_LATEST_COMMIT (Read everything up to latest commit)]
`Config Param: MISSING_CHECKPOINT_STRATEGY` | +| [hoodie.streamer.source.hoodieincr.partition.extractor.class](#hoodiestreamersourcehoodieincrpartitionextractorclass) | (N/A) | PartitionValueExtractor class to extract partition fields from _hoodie_partition_path
`Config Param: HOODIE_SRC_PARTITION_EXTRACTORCLASS` | +| [hoodie.streamer.source.hoodieincr.partition.fields](#hoodiestreamersourcehoodieincrpartitionfields) | (N/A) | Specifies partition fields that need to be added to source table after parsing _hoodie_partition_path.
`Config Param: HOODIE_SRC_PARTITION_FIELDS` | +| [hoodie.streamer.source.hoodieincr.drop.all.meta.fields.from.source](#hoodiestreamersourcehoodieincrdropallmetafieldsfromsource) | false | Drops all meta fields from the source hudi table while ingesting into sink hudi table.
`Config Param: HOODIE_DROP_ALL_META_FIELDS_FROM_SOURCE` | +| [hoodie.streamer.source.hoodieincr.file.format](#hoodiestreamersourcehoodieincrfileformat) | parquet | This config is passed to the reader while loading dataset. Default value is parquet.
`Config Param: SOURCE_FILE_FORMAT` | +| [hoodie.streamer.source.hoodieincr.num_instants](#hoodiestreamersourcehoodieincrnum_instants) | 5 | Max number of instants whose changes can be incrementally fetched
`Config Param: NUM_INSTANTS_PER_FETCH` | +| [hoodie.streamer.source.hoodieincr.read_latest_on_missing_ckpt](#hoodiestreamersourcehoodieincrread_latest_on_missing_ckpt) | false | If true, allows Hudi Streamer to incrementally fetch from latest committed instant when checkpoint is not provided. This config is deprecated. Please refer to hoodie.streamer.source.hoodieincr.missing.checkpoint.strategy
`Config Param: READ_LATEST_INSTANT_ON_MISSING_CKPT` | +--- + + +#### JDBC Source Configs {#JDBC-Source-Configs} +Configurations controlling the behavior of JDBC source in Hudi Streamer. + + + +[**Advanced Configs**](#JDBC-Source-Configs-advanced-configs) + + +| Config Name | Default | Description | +| ---------------------------------------------------------------------------------------------- | ------- | ------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.streamer.jdbc.driver.class](#hoodiestreamerjdbcdriverclass) | (N/A) | Driver class used for JDBC connection
`Config Param: DRIVER_CLASS` | +| [hoodie.streamer.jdbc.extra.options.](#hoodiestreamerjdbcextraoptions) | (N/A) | Used to set any extra options the user specifies for jdbc
`Config Param: EXTRA_OPTIONS` | +| [hoodie.streamer.jdbc.incr.fallback.to.full.fetch](#hoodiestreamerjdbcincrfallbacktofullfetch) | (N/A) | If set true, makes incremental fetch to fallback to full fetch in case of any error
`Config Param: FALLBACK_TO_FULL_FETCH` | +| [hoodie.streamer.jdbc.incr.pull](#hoodiestreamerjdbcincrpull) | (N/A) | Will the JDBC source do an incremental pull?
`Config Param: IS_INCREMENTAL` | +| [hoodie.streamer.jdbc.password](#hoodiestreamerjdbcpassword) | (N/A) | Password used for JDBC connection
`Config Param: PASSWORD` | +| [hoodie.streamer.jdbc.password.file](#hoodiestreamerjdbcpasswordfile) | (N/A) | Base-path for the JDBC password file.
`Config Param: PASSWORD_FILE` | +| [hoodie.streamer.jdbc.storage.level](#hoodiestreamerjdbcstoragelevel) | (N/A) | Used to control the persistence level. Default value: MEMORY_AND_DISK_SER
`Config Param: STORAGE_LEVEL` | +| [hoodie.streamer.jdbc.table.incr.column.name](#hoodiestreamerjdbctableincrcolumnname) | (N/A) | If run in incremental mode, this field is to pull new data incrementally
`Config Param: INCREMENTAL_COLUMN` | +| [hoodie.streamer.jdbc.table.name](#hoodiestreamerjdbctablename) | (N/A) | RDBMS table to pull
`Config Param: RDBMS_TABLE_NAME` | +| [hoodie.streamer.jdbc.url](#hoodiestreamerjdbcurl) | (N/A) | JDBC url for the Hoodie datasource.
`Config Param: URL` | +| [hoodie.streamer.jdbc.user](#hoodiestreamerjdbcuser) | (N/A) | Username used for JDBC connection
`Config Param: USER` | +--- + + +#### Json Kafka Post Processor Configs {#Json-Kafka-Post-Processor-Configs} +Configurations controlling the post processor of Json Kafka Source in Hudi Streamer. + + + +[**Advanced Configs**](#Json-Kafka-Post-Processor-Configs-advanced-configs) + + +| Config Name | Default | Description | +| ----------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.streamer.source.json.kafka.post.processor.maxwell.database.regex](#hoodiestreamersourcejsonkafkapostprocessormaxwelldatabaseregex) | (N/A) | Database name regex
`Config Param: DATABASE_NAME_REGEX` | +| [hoodie.streamer.source.json.kafka.post.processor.maxwell.table.regex](#hoodiestreamersourcejsonkafkapostprocessormaxwelltableregex) | (N/A) | Table name regex
`Config Param: TABLE_NAME_REGEX` | +| [hoodie.streamer.source.json.kafka.processor.class](#hoodiestreamersourcejsonkafkaprocessorclass) | (N/A) | Json kafka source post processor class name; post-processes data after consuming from source and before giving it to Hudi Streamer.
`Config Param: JSON_KAFKA_PROCESSOR_CLASS` | +| [hoodie.streamer.source.json.kafka.post.processor.maxwell.precombine.field.format](#hoodiestreamersourcejsonkafkapostprocessormaxwellprecombinefieldformat) | yyyy-MM-dd HH:mm:ss | When the preCombine field is in DATE_STRING format, the user should tell hoodie what format it is. 'yyyy-MM-dd HH:mm:ss' by default
`Config Param: PRECOMBINE_FIELD_FORMAT` | +| [hoodie.streamer.source.json.kafka.post.processor.maxwell.precombine.field.type](#hoodiestreamersourcejsonkafkapostprocessormaxwellprecombinefieldtype) | DATE_STRING | Data type of the preCombine field. Could be NON_TIMESTAMP, DATE_STRING, UNIX_TIMESTAMP or EPOCHMILLISECONDS. DATE_STRING by default
`Config Param: PRECOMBINE_FIELD_TYPE` | +--- + + +#### Kafka Source Configs {#Kafka-Source-Configs} +Configurations controlling the behavior of Kafka source in Hudi Streamer. + + + +[**Basic Configs**](#Kafka-Source-Configs-basic-configs) + + +| Config Name | Default | Description | +| -------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.streamer.source.kafka.topic](#hoodiestreamersourcekafkatopic) | (N/A) | Kafka topic name.
`Config Param: KAFKA_TOPIC_NAME` | +| [hoodie.streamer.source.kafka.proto.value.deserializer.class](#hoodiestreamersourcekafkaprotovaluedeserializerclass) | org.apache.kafka.common.serialization.ByteArrayDeserializer | Kafka Proto Payload Deserializer Class
`Config Param: KAFKA_PROTO_VALUE_DESERIALIZER_CLASS`
`Since Version: 0.15.0` | + +[**Advanced Configs**](#Kafka-Source-Configs-advanced-configs) + + +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------------------------------ | ----------------------------------------------------- | ----------- | +| [hoodie.streamer.source.kafka.value.deserializer.schema](#hoodiestreamersourcekafkavaluedeserializerschema) | (N/A) | Schema to deserialize the records.
`Config Param: KAFKA_VALUE_DESERIALIZER_SCHEMA` | +| [auto.offset.reset](#autooffsetreset) | LATEST | Kafka consumer strategy for reading data.
`Config Param: KAFKA_AUTO_OFFSET_RESET` | +| [hoodie.streamer.kafka.source.maxEvents](#hoodiestreamerkafkasourcemaxEvents) | 5000000 | Maximum number of records obtained in each batch.
`Config Param: MAX_EVENTS_FROM_KAFKA_SOURCE` | +| [hoodie.streamer.source.kafka.checkpoint.type](#hoodiestreamersourcekafkacheckpointtype) | string | Kafka checkpoint type. Value must be one of the following: string, timestamp, single_offset. Default type is string. For type string, checkpoint should be provided as: topicName,0:offset0,1:offset1,2:offset2. For type timestamp, checkpoint should be provided as long value of desired timestamp. For type single_offset, we assume that topic consists of a single partition, so checkpoint should be provided as long value of desired offset.
`Config Param: KAFKA_CHECKPOINT_TYPE` | +| [hoodie.streamer.source.kafka.enable.commit.offset](#hoodiestreamersourcekafkaenablecommitoffset) | false | Automatically submits offset to kafka.
`Config Param: ENABLE_KAFKA_COMMIT_OFFSET` | +| [hoodie.streamer.source.kafka.enable.failOnDataLoss](#hoodiestreamersourcekafkaenablefailOnDataLoss) | false | Fail when checkpoint goes out of bounds instead of seeking to earliest offsets.
`Config Param: ENABLE_FAIL_ON_DATA_LOSS` | +| [hoodie.streamer.source.kafka.fetch_partition.time.out](#hoodiestreamersourcekafkafetch_partitiontimeout) | 300000 | Time out for fetching partitions. 5min by default
`Config Param: KAFKA_FETCH_PARTITION_TIME_OUT` | +| [hoodie.streamer.source.kafka.minPartitions](#hoodiestreamersourcekafkaminPartitions) | 0 | Desired minimum number of partitions to read from Kafka. By default, Hudi has a 1-1 mapping of topicPartitions to Hudi partitions consuming from Kafka. If this option is set to a value greater than topicPartitions, Hudi will divvy up large Kafka partitions into smaller pieces. Please note that this configuration is like a hint: the number of input tasks will be approximately minPartitions. It can be less or more depending on rounding errors or Kafka partitions that didn't receive any new data.
`Config Param: KAFKA_SOURCE_MIN_PARTITIONS`
`Since Version: 0.14.0` | +| [hoodie.streamer.source.kafka.retry.exceptions](#hoodiestreamersourcekafkaretryexceptions) | | The class name of the Exception that needs to be retried, separated by commas. Default is empty which means retry all the IOException and RuntimeException from KafkaConsumer
`Config Param: RETRY_EXCEPTIONS`
`Since Version: 1.1.0` | +| [hoodie.streamer.source.kafka.retry.initial_interval_ms](#hoodiestreamersourcekafkaretryinitial_interval_ms) | 100 | Amount of time (in ms) to wait, before retry to do operations on KafkaConsumer.
`Config Param: INITIAL_RETRY_INTERVAL_MS`
`Since Version: 1.1.0` | +| [hoodie.streamer.source.kafka.retry.max_count](#hoodiestreamersourcekafkaretrymax_count) | 4 | Maximum number of retry actions to perform, with exponential backoff.
`Config Param: MAX_RETRY_COUNT`
`Since Version: 1.1.0` | +| [hoodie.streamer.source.kafka.retry.max_interval_ms](#hoodiestreamersourcekafkaretrymax_interval_ms) | 2000 | Maximum amount of time (in ms), to wait for next retry.
`Config Param: MAX_RETRY_INTERVAL_MS`
`Since Version: 1.1.0` | +| [hoodie.streamer.source.kafka.value.deserializer.class](#hoodiestreamersourcekafkavaluedeserializerclass) | io.confluent.kafka.serializers.KafkaAvroDeserializer | This class is used by kafka client to deserialize the records.
`Config Param: KAFKA_AVRO_VALUE_DESERIALIZER_CLASS`
`Since Version: 0.9.0` | +--- + + +#### Parquet DFS Source Configs {#Parquet-DFS-Source-Configs} +Configurations controlling the behavior of Parquet DFS source in Hudi Streamer. + + + +[**Advanced Configs**](#Parquet-DFS-Source-Configs-advanced-configs) + + +| Config Name | Default | Description | +| ----------------------------------------------------------------------------------------------------------- | ------- | ---------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.streamer.source.parquet.dfs.merge_schema.enable](#hoodiestreamersourceparquetdfsmerge_schemaenable) | false | Merge schema across parquet files within a single write
`Config Param: PARQUET_DFS_MERGE_SCHEMA`
`Since Version: 0.15.0` | +--- + + +#### Pulsar Source Configs {#Pulsar-Source-Configs} +Configurations controlling the behavior of Pulsar source in Hudi Streamer. + + + +[**Basic Configs**](#Pulsar-Source-Configs-basic-configs) + + +| Config Name | Default | Description | +| --------------------------------------------------------------------------------------------------- | ------------------------ | --------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.streamer.source.pulsar.topic](#hoodiestreamersourcepulsartopic) | (N/A) | Name of the target Pulsar topic to source data from
`Config Param: PULSAR_SOURCE_TOPIC_NAME` | +| [hoodie.streamer.source.pulsar.endpoint.admin.url](#hoodiestreamersourcepulsarendpointadminurl) | http://localhost:8080 | URL of the target Pulsar endpoint (of the form 'pulsar://host:port')
`Config Param: PULSAR_SOURCE_ADMIN_ENDPOINT_URL` | +| [hoodie.streamer.source.pulsar.endpoint.service.url](#hoodiestreamersourcepulsarendpointserviceurl) | pulsar://localhost:6650 | URL of the target Pulsar endpoint (of the form 'pulsar://host:port')
`Config Param: PULSAR_SOURCE_SERVICE_ENDPOINT_URL` | + +[**Advanced Configs**](#Pulsar-Source-Configs-advanced-configs) + + +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------------------------------ | -------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.streamer.source.pulsar.maxRecords](#hoodiestreamersourcepulsarmaxRecords) | 5000000 | Max number of records obtained in a single batch
`Config Param: PULSAR_SOURCE_MAX_RECORDS_PER_BATCH_THRESHOLD` | +| [hoodie.streamer.source.pulsar.offset.autoResetStrategy](#hoodiestreamersourcepulsaroffsetautoResetStrategy) | LATEST | Policy determining how offsets shall be automatically reset in case there's no checkpoint information present
`Config Param: PULSAR_SOURCE_OFFSET_AUTO_RESET_STRATEGY` | +--- + + +#### S3 Event-based Hudi Incremental Source Configs {#S3-Event-based-Hudi-Incremental-Source-Configs} +Configurations controlling the behavior of incremental pulling from S3 events meta information from Hudi table as a source in Hudi Streamer. + + + +[**Advanced Configs**](#S3-Event-based-Hudi-Incremental-Source-Configs-advanced-configs) + + +| Config Name | Default | Description | +| ----------------------------------------------------------------------------------------------------------- | ------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.streamer.source.s3incr.ignore.key.prefix](#hoodiestreamersources3incrignorekeyprefix) | (N/A) | Control whether to ignore the s3 objects starting with this prefix
`Config Param: S3_IGNORE_KEY_PREFIX` | +| [hoodie.streamer.source.s3incr.ignore.key.substring](#hoodiestreamersources3incrignorekeysubstring) | (N/A) | Control whether to ignore the s3 objects with this substring
`Config Param: S3_IGNORE_KEY_SUBSTRING` | +| [hoodie.streamer.source.s3incr.key.prefix](#hoodiestreamersources3incrkeyprefix) | (N/A) | Control whether to filter the s3 objects starting with this prefix
`Config Param: S3_KEY_PREFIX` | +| [hoodie.streamer.source.s3incr.spark.datasource.options](#hoodiestreamersources3incrsparkdatasourceoptions) | (N/A) | Json string, passed to the reader while loading dataset. Example Hudi Streamer conf `--hoodie-conf hoodie.streamer.source.s3incr.spark.datasource.options={"header":"true","encoding":"UTF-8"}`
`Config Param: SPARK_DATASOURCE_OPTIONS` | +| [hoodie.streamer.source.s3incr.check.file.exists](#hoodiestreamersources3incrcheckfileexists) | false | Control whether we do existence check for files before consuming them
`Config Param: S3_INCR_ENABLE_EXISTS_CHECK` | +| [hoodie.streamer.source.s3incr.fs.prefix](#hoodiestreamersources3incrfsprefix) | s3 | The file system prefix.
`Config Param: S3_FS_PREFIX` | +--- + + +#### S3 Source Configs {#S3-Source-Configs} +Configurations controlling the behavior of S3 source in Hudi Streamer. + + + +[**Basic Configs**](#S3-Source-Configs-basic-configs) + + +| Config Name | Default | Description | +| ---------------------------------------------------------------------- | ------- | -------------------------------------------------------------------------- | +| [hoodie.streamer.s3.source.queue.url](#hoodiestreamers3sourcequeueurl) | (N/A) | Queue url for cloud object events
`Config Param: S3_SOURCE_QUEUE_URL` | + +[**Advanced Configs**](#S3-Source-Configs-advanced-configs) + + +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------------------------------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.streamer.s3.source.queue.region](#hoodiestreamers3sourcequeueregion) | (N/A) | Case-sensitive region name of the cloud provider for the queue. For example, "us-east-1".
`Config Param: S3_SOURCE_QUEUE_REGION` | +| [hoodie.streamer.s3.source.queue.fs](#hoodiestreamers3sourcequeuefs) | s3 | File system corresponding to queue. For example, for AWS SQS it is s3/s3a.
`Config Param: S3_SOURCE_QUEUE_FS` | +| [hoodie.streamer.s3.source.queue.long.poll.wait](#hoodiestreamers3sourcequeuelongpollwait) | 20 | Long poll wait time in seconds. If set to 0, the client will fetch on a short-poll basis.
`Config Param: S3_QUEUE_LONG_POLL_WAIT` | +| [hoodie.streamer.s3.source.queue.max.messages.per.batch](#hoodiestreamers3sourcequeuemaxmessagesperbatch) | 5 | Max messages for each batch of Hudi Streamer run. Source will process at most this number of messages at a time.
`Config Param: S3_SOURCE_QUEUE_MAX_MESSAGES_PER_BATCH` | +| [hoodie.streamer.s3.source.queue.max.messages.per.request](#hoodiestreamers3sourcequeuemaxmessagesperrequest) | 10 | Max messages for each request
`Config Param: S3_SOURCE_QUEUE_MAX_MESSAGES_PER_REQUEST` | +| [hoodie.streamer.s3.source.queue.visibility.timeout](#hoodiestreamers3sourcequeuevisibilitytimeout) | 30 | Visibility timeout for messages in queue. After we consume the message, queue will move the consumed messages to in-flight state, these messages can't be consumed again by source for this timeout period.
`Config Param: S3_SOURCE_QUEUE_VISIBILITY_TIMEOUT` | +--- + + +#### File-based SQL Source Configs {#File-based-SQL-Source-Configs} +Configurations controlling the behavior of File-based SQL Source in Hudi Streamer. + + + +[**Basic Configs**](#File-based-SQL-Source-Configs-basic-configs) + + +| Config Name | Default | Description | +| --------------------------------------------------------------- | ------- | ----------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.streamer.source.sql.file](#hoodiestreamersourcesqlfile) | (N/A) | SQL file path containing the SQL query to read source data.
`Config Param: SOURCE_SQL_FILE`
`Since Version: 0.14.0` | + +[**Advanced Configs**](#File-based-SQL-Source-Configs-advanced-configs) + + +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------ | ------- | ------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.streamer.source.sql.checkpoint.emit](#hoodiestreamersourcesqlcheckpointemit) | false | Whether to emit the current epoch as the streamer checkpoint.
`Config Param: EMIT_EPOCH_CHECKPOINT`
`Since Version: 0.14.0` | +--- + + +#### SQL Source Configs {#SQL-Source-Configs} +Configurations controlling the behavior of SQL source in Hudi Streamer. + + + +[**Basic Configs**](#SQL-Source-Configs-basic-configs) + + +| Config Name | Default | Description | +| ------------------------------------------------------------------------ | ------- | ------------------------------------------------------------------- | +| [hoodie.streamer.source.sql.sql.query](#hoodiestreamersourcesqlsqlquery) | (N/A) | SQL query for fetching source data.
`Config Param: SOURCE_SQL` | +--- + + +### Hudi Streamer Schema Provider Configs {#SCHEMA_PROVIDER} +Configurations that control the schema provider for Hudi Streamer. + + +#### Hudi Streamer Schema Provider Configs {#Hudi-Streamer-Schema-Provider-Configs} + + + + +[**Basic Configs**](#Hudi-Streamer-Schema-Provider-Configs-basic-configs) + + +| Config Name | Default | Description | +| --------------------------------------------------------------------------------------------------- | ------- | ----------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.streamer.schemaprovider.registry.targetUrl](#hoodiestreamerschemaproviderregistrytargetUrl) | (N/A) | The schema of the target you are writing to e.g. https://foo:bar@schemaregistry.org
`Config Param: TARGET_SCHEMA_REGISTRY_URL` | +| [hoodie.streamer.schemaprovider.registry.url](#hoodiestreamerschemaproviderregistryurl) | (N/A) | The schema of the source you are reading from e.g. https://foo:bar@schemaregistry.org
`Config Param: SRC_SCHEMA_REGISTRY_URL` | + +[**Advanced Configs**](#Hudi-Streamer-Schema-Provider-Configs-advanced-configs) + + +| Config Name | Default | Description | +| --------------------------------------------------------------------------------------------------------------- | ------- | ---------------------------------------------------------------------------------------------- | +| [hoodie.streamer.schemaprovider.registry.baseUrl](#hoodiestreamerschemaproviderregistrybaseUrl) | (N/A) | The base URL of the schema registry.
`Config Param: SCHEMA_REGISTRY_BASE_URL` | +| [hoodie.streamer.schemaprovider.registry.schemaconverter](#hoodiestreamerschemaproviderregistryschemaconverter) | (N/A) | The class name of the custom schema converter to use.
`Config Param: SCHEMA_CONVERTER` | +| [hoodie.streamer.schemaprovider.registry.sourceUrlSuffix](#hoodiestreamerschemaproviderregistrysourceUrlSuffix) | (N/A) | The source URL suffix.
`Config Param: SCHEMA_REGISTRY_SOURCE_URL_SUFFIX` | +| [hoodie.streamer.schemaprovider.registry.targetUrlSuffix](#hoodiestreamerschemaproviderregistrytargetUrlSuffix) | (N/A) | The target URL suffix.
`Config Param: SCHEMA_REGISTRY_TARGET_URL_SUFFIX` | +| [hoodie.streamer.schemaprovider.registry.urlSuffix](#hoodiestreamerschemaproviderregistryurlSuffix) | (N/A) | The suffix of the URL for the schema registry.
`Config Param: SCHEMA_REGISTRY_URL_SUFFIX` | +--- + + +#### File-based Schema Provider Configs {#File-based-Schema-Provider-Configs} +Configurations for file-based schema provider. + + + +[**Basic Configs**](#File-based-Schema-Provider-Configs-basic-configs) + + +| Config Name | Default | Description | +| -------------------------------------------------------------------------------------------------- | ------- | ------------------------------------------------------------------------------------- | +| [hoodie.streamer.schemaprovider.source.schema.file](#hoodiestreamerschemaprovidersourceschemafile) | (N/A) | The schema of the source you are reading from
`Config Param: SOURCE_SCHEMA_FILE` | +| [hoodie.streamer.schemaprovider.target.schema.file](#hoodiestreamerschemaprovidertargetschemafile) | (N/A) | The schema of the target you are writing to
`Config Param: TARGET_SCHEMA_FILE` | +--- + + +#### Hive Schema Provider Configs {#Hive-Schema-Provider-Configs} +Configurations for Hive schema provider. + + + +[**Advanced Configs**](#Hive-Schema-Provider-Configs-advanced-configs) + + +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------------------------------------- | -------- | ------------------------------------------------------------------------------------------------- | +| [hoodie.streamer.schemaprovider.source.schema.hive.table](#hoodiestreamerschemaprovidersourceschemahivetable) | (N/A) | Hive table from where source schema can be fetched
`Config Param: SOURCE_SCHEMA_TABLE` | +| [hoodie.streamer.schemaprovider.target.schema.hive.table](#hoodiestreamerschemaprovidertargetschemahivetable) | (N/A) | Hive table from where target schema can be fetched
`Config Param: TARGET_SCHEMA_TABLE` | +| [hoodie.streamer.schemaprovider.source.schema.hive.database](#hoodiestreamerschemaprovidersourceschemahivedatabase) | default | Hive database from where source schema can be fetched
`Config Param: SOURCE_SCHEMA_DATABASE` | +| [hoodie.streamer.schemaprovider.target.schema.hive.database](#hoodiestreamerschemaprovidertargetschemahivedatabase) | default | Hive database from where target schema can be fetched
`Config Param: TARGET_SCHEMA_DATABASE` | +--- + + +#### JDBC-based Schema Provider Configs {#JDBC-based-Schema-Provider-Configs} +Configurations for JDBC-based schema provider. + + + +[**Advanced Configs**](#JDBC-based-Schema-Provider-Configs-advanced-configs) + + +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------------------------------------------------ | ------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.streamer.schemaprovider.source.schema.jdbc.connection.url](#hoodiestreamerschemaprovidersourceschemajdbcconnectionurl) | (N/A) | The JDBC URL to connect to. The source-specific connection properties may be specified in the URL. e.g., jdbc:postgresql://localhost/test?user=fred&password=secret
`Config Param: SOURCE_SCHEMA_JDBC_CONNECTION_URL` | +| [hoodie.streamer.schemaprovider.source.schema.jdbc.dbtable](#hoodiestreamerschemaprovidersourceschemajdbcdbtable) | (N/A) | The table with the schema to reference e.g. test_database.test1_table or test1_table
`Config Param: SOURCE_SCHEMA_JDBC_DBTABLE` | +| [hoodie.streamer.schemaprovider.source.schema.jdbc.driver.type](#hoodiestreamerschemaprovidersourceschemajdbcdrivertype) | (N/A) | The class name of the JDBC driver to use to connect to this URL. e.g. org.h2.Driver
`Config Param: SOURCE_SCHEMA_JDBC_DRIVER_TYPE` | +| [hoodie.streamer.schemaprovider.source.schema.jdbc.nullable](#hoodiestreamerschemaprovidersourceschemajdbcnullable) | (N/A) | If true, all the columns are nullable.
`Config Param: SOURCE_SCHEMA_JDBC_NULLABLE` | +| [hoodie.streamer.schemaprovider.source.schema.jdbc.password](#hoodiestreamerschemaprovidersourceschemajdbcpassword) | (N/A) | Password for the connection e.g. secret
`Config Param: SOURCE_SCHEMA_JDBC_PASSWORD` | +| [hoodie.streamer.schemaprovider.source.schema.jdbc.timeout](#hoodiestreamerschemaprovidersourceschemajdbctimeout) | (N/A) | The number of seconds the driver will wait for a Statement object to execute. Zero means there is no limit. In the write path, this option depends on how JDBC drivers implement the API setQueryTimeout, e.g., the h2 JDBC driver checks the timeout of each query instead of an entire JDBC batch. It defaults to 0.
`Config Param: SOURCE_SCHEMA_JDBC_TIMEOUT` | +| [hoodie.streamer.schemaprovider.source.schema.jdbc.username](#hoodiestreamerschemaprovidersourceschemajdbcusername) | (N/A) | Username for the connection e.g. fred
`Config Param: SOURCE_SCHEMA_JDBC_USERNAME` | +--- + + +#### Proto Schema Provider Configs {#Proto-Schema-Provider-Configs} +Configurations for Proto schema provider. + + + +[**Advanced Configs**](#Proto-Schema-Provider-Configs-advanced-configs) + + +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------------------------------------- | ------- | ----------- | +| [hoodie.streamer.schemaprovider.proto.class.name](#hoodiestreamerschemaproviderprotoclassname) | (N/A) | The Protobuf Message class used as the source for the schema.
`Config Param: PROTO_SCHEMA_CLASS_NAME`
`Since Version: 0.13.0` | +| [hoodie.streamer.schemaprovider.proto.flatten.wrappers](#hoodiestreamerschemaproviderprotoflattenwrappers) | false | When set to true wrapped primitives like Int64Value are translated to a record with a single 'value' field. By default, the value is false and the wrapped primitives are treated as a nullable value
`Config Param: PROTO_SCHEMA_WRAPPED_PRIMITIVES_AS_RECORDS`
`Since Version: 0.13.0` | +| [hoodie.streamer.schemaprovider.proto.max.recursion.depth](#hoodiestreamerschemaproviderprotomaxrecursiondepth) | 5 | The max depth to unravel the Proto schema when translating into an Avro schema. Setting this depth allows the user to convert a schema that is recursive in proto into something that can be represented in their lake format like Parquet. After a given class has been seen N times within a single branch, the schema provider will create a record with a byte array to hold the remaining proto data and a string to hold the message descriptor's name for context.
`Config Param: PROTO_SCHEMA_MAX_RECURSION_DEPTH`
`Since Version: 0.13.0` | +| [hoodie.streamer.schemaprovider.proto.timestamps.as.records](#hoodiestreamerschemaproviderprototimestampsasrecords) | false | When set to true Timestamp fields are translated to a record with a seconds and nanos field. By default, the value is false and the timestamp is converted to a long with the timestamp-micros logical type
`Config Param: PROTO_SCHEMA_TIMESTAMPS_AS_RECORDS`
`Since Version: 0.13.0` | +--- + + +#### Schema Post Processor Config Configs {#Schema-Post-Processor-Config-Configs} +Configurations for Schema Post Processor + + + +[**Advanced Configs**](#Schema-Post-Processor-Config-Configs-advanced-configs) + + +| Config Name | Default | Description | +| ----------------------------------------------------------------------------------------------------------------------------------------------- | ------- | -------------------------------------------------------------------------------------------------------- | +| [hoodie.streamer.schemaprovider.schema_post_processor](#hoodiestreamerschemaproviderschema_post_processor) | (N/A) | The class name of the schema post processor.
`Config Param: SCHEMA_POST_PROCESSOR` | +| [hoodie.streamer.schemaprovider.schema_post_processor.add.column.default](#hoodiestreamerschemaproviderschema_post_processoraddcolumndefault) | (N/A) | New column's default value
`Config Param: SCHEMA_POST_PROCESSOR_ADD_COLUMN_DEFAULT_PROP` | +| [hoodie.streamer.schemaprovider.schema_post_processor.add.column.doc](#hoodiestreamerschemaproviderschema_post_processoraddcolumndoc) | (N/A) | Docs about new column
`Config Param: SCHEMA_POST_PROCESSOR_ADD_COLUMN_DOC_PROP` | +| [hoodie.streamer.schemaprovider.schema_post_processor.add.column.name](#hoodiestreamerschemaproviderschema_post_processoraddcolumnname) | (N/A) | New column's name
`Config Param: SCHEMA_POST_PROCESSOR_ADD_COLUMN_NAME_PROP` | +| [hoodie.streamer.schemaprovider.schema_post_processor.add.column.type](#hoodiestreamerschemaproviderschema_post_processoraddcolumntype) | (N/A) | New column's type
`Config Param: SCHEMA_POST_PROCESSOR_ADD_COLUMN_TYPE_PROP` | +| [hoodie.streamer.schemaprovider.schema_post_processor.delete.columns](#hoodiestreamerschemaproviderschema_post_processordeletecolumns) | (N/A) | Columns to delete in the schema post processor.
`Config Param: DELETE_COLUMN_POST_PROCESSOR_COLUMN` | +| [hoodie.streamer.schemaprovider.schema_post_processor.add.column.nullable](#hoodiestreamerschemaproviderschema_post_processoraddcolumnnullable) | true | New column's nullable
`Config Param: SCHEMA_POST_PROCESSOR_ADD_COLUMN_NULLABLE_PROP` | +--- + diff --git a/website/versioned_docs/version-1.0.0/cos_hoodie.md b/website/versioned_docs/version-1.0.0/cos_hoodie.md new file mode 100644 index 0000000000000..dfde6e8cff0a0 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/cos_hoodie.md @@ -0,0 +1,71 @@ +--- +title: Tencent Cloud +keywords: [ hudi, hive, tencent, cos, spark, presto] +summary: In this page, we go over how to configure Hudi with COS filesystem. +last_modified_at: 2020-04-21T11:38:24-10:00 +--- +In this page, we explain how to get your Hudi spark job to store into Tencent Cloud COS. + +## Tencent Cloud COS configs + +There are two configurations required for Hudi-COS compatibility: + +- Adding Tencent Cloud COS Credentials for Hudi +- Adding required Jars to classpath + +### Tencent Cloud COS Credentials + +Add the required configs in your core-site.xml from where Hudi can fetch them. Replace the `fs.defaultFS` with your COS bucket name, replace `fs.cosn.userinfo.secretId` with your COS secret Id, replace `fs.cosn.userinfo.secretKey` with your COS key. Hudi should be able to read/write from the bucket. + +```xml +<property> + <name>fs.defaultFS</name> + <value>cosn://bucketname</value> + <description>COS bucket name</description> +</property> + +<property> + <name>fs.cosn.userinfo.secretId</name> + <value>cos-secretId</value> + <description>Tencent Cloud Secret Id</description> +</property> + +<property> + <name>fs.cosn.userinfo.secretKey</name> + <value>cos-secretkey</value> + <description>Tencent Cloud Secret Key</description> +</property> + +<property> + <name>fs.cosn.bucket.region</name> + <value>ap-region</value> + <description>The region where the bucket is located.</description> +</property> + +<property> + <name>fs.cosn.bucket.endpoint_suffix</name> + <value>cos.endpoint.suffix</value> + <description> + COS endpoint to connect to. + For public cloud users, it is recommended not to set this option, and only the correct area field is required. + </description> +</property> + +<property> + <name>fs.cosn.impl</name> + <value>org.apache.hadoop.fs.CosFileSystem</value> + <description>The implementation class of the CosN Filesystem.</description> +</property> + +<property> + <name>fs.AbstractFileSystem.cosn.impl</name> + <value>org.apache.hadoop.fs.CosN</value> + <description>The implementation class of the CosN AbstractFileSystem.</description>
+</property> + +``` + +### Tencent Cloud COS Libs +COS hadoop libraries to add to our classpath + +- org.apache.hadoop:hadoop-cos:2.8.5 diff --git a/website/versioned_docs/version-1.0.0/deployment.md b/website/versioned_docs/version-1.0.0/deployment.md new file mode 100644 index 0000000000000..1c5a41a0acb59 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/deployment.md @@ -0,0 +1,284 @@ +--- +title: Deployment +keywords: [ hudi, administration, operation, devops, deployment] +summary: This section offers an overview of tools available to operate an ecosystem of Hudi +toc: true +last_modified_at: 2019-12-30T15:59:57-04:00 +--- + +This section provides all the help you need to deploy and operate Hudi tables at scale. +Specifically, we will cover the following aspects. + + - [Deployment Model](#deploying) : How various Hudi components are deployed and managed. + - [Upgrading Versions](#upgrading) : Picking up new releases of Hudi, guidelines and general best-practices. + - [Downgrading Versions](#downgrading) : Reverting back to an older version of Hudi + - [Migrating to Hudi](#migrating) : How to migrate your existing tables to Apache Hudi. + +## Deploying + +All in all, Hudi deploys with no long running servers or additional infrastructure cost to your data lake. In fact, Hudi pioneered this model of building a transactional distributed storage layer +using existing infrastructure and it's heartening to see other systems adopting similar approaches as well. Hudi writing is done via Spark jobs (Hudi Streamer or custom Spark datasource jobs), deployed per standard Apache Spark [recommendations](https://spark.apache.org/docs/latest/cluster-overview). +Querying Hudi tables happens via libraries installed into Apache Hive, Apache Spark or PrestoDB and hence no additional infrastructure is necessary. + +A typical Hudi data ingestion can be achieved in 2 modes. In a single run mode, Hudi ingestion reads the next batch of data, ingests it into the Hudi table, and exits.
In continuous mode, Hudi ingestion runs as a long-running service executing ingestion in a loop. + +With Merge_On_Read Table, Hudi ingestion needs to also take care of compacting delta files. Again, compaction can be performed in an asynchronous-mode by letting compaction run concurrently with ingestion or in a serial fashion with one after another. + +### Hudi Streamer + +[Hudi Streamer](/docs/hoodie_streaming_ingestion#hudi-streamer) is the standalone utility to incrementally pull upstream changes +from varied sources such as DFS, Kafka and DB Changelogs and ingest them to hudi tables. It runs as a spark application in two modes. + +To use Hudi Streamer in Spark, the `hudi-utilities-slim-bundle` and Hudi Spark bundle are required, by adding +`--packages org.apache.hudi:hudi-utilities-slim-bundle_2.12:1.0.0,org.apache.hudi:hudi-spark3.5-bundle_2.12:1.0.0` to the `spark-submit` command. + + - **Run Once Mode** : In this mode, Hudi Streamer performs one ingestion round which includes incrementally pulling events from upstream sources and ingesting them to hudi table. Background operations like cleaning old file versions and archiving hoodie timeline are automatically executed as part of the run. For Merge-On-Read tables, Compaction is also run inline as part of ingestion unless disabled by passing the flag "--disable-compaction". By default, Compaction is run inline for every ingestion run and this can be changed by setting the property "hoodie.compact.inline.max.delta.commits". You can either manually run this spark application or use any cron trigger or workflow orchestrator (most common deployment strategy) such as Apache Airflow to spawn this application. See command line options in [this section](/docs/hoodie_streaming_ingestion#hudi-streamer) for running the spark application. + +Here is an example invocation for reading from kafka topic in a single-run mode and writing to Merge On Read table type in a yarn cluster. 
+ +```java +[hoodie]$ spark-submit \ + --packages org.apache.hudi:hudi-utilities-slim-bundle_2.12:1.0.0,org.apache.hudi:hudi-spark3.5-bundle_2.12:1.0.0 \ + --master yarn \ + --deploy-mode cluster \ + --num-executors 10 \ + --executor-memory 3g \ + --driver-memory 6g \ + --conf spark.driver.extraJavaOptions="-XX:+PrintGCApplicationStoppedTime -XX:+PrintGCApplicationConcurrentTime -XX:+PrintGCTimeStamps -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/tmp/varadarb_ds_driver.hprof" \ + --conf spark.executor.extraJavaOptions="-XX:+PrintGCApplicationStoppedTime -XX:+PrintGCApplicationConcurrentTime -XX:+PrintGCTimeStamps -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/tmp/varadarb_ds_executor.hprof" \ + --queue hadoop-platform-queue \ + --conf spark.scheduler.mode=FAIR \ + --conf spark.yarn.executor.memoryOverhead=1072 \ + --conf spark.yarn.driver.memoryOverhead=2048 \ + --conf spark.task.cpus=1 \ + --conf spark.executor.cores=1 \ + --conf spark.task.maxFailures=10 \ + --conf spark.memory.fraction=0.4 \ + --conf spark.rdd.compress=true \ + --conf spark.kryoserializer.buffer.max=200m \ + --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \ + --conf spark.memory.storageFraction=0.1 \ + --conf spark.shuffle.service.enabled=true \ + --conf spark.sql.hive.convertMetastoreParquet=false \ + --conf spark.ui.port=5555 \ + --conf spark.driver.maxResultSize=3g \ + --conf spark.executor.heartbeatInterval=120s \ + --conf spark.network.timeout=600s \ + --conf spark.eventLog.overwrite=true \ + --conf spark.eventLog.enabled=true \ + --conf spark.eventLog.dir=hdfs:///user/spark/applicationHistory \ + --conf spark.yarn.max.executor.failures=10 \ + --conf spark.sql.catalogImplementation=hive \ + --conf spark.sql.shuffle.partitions=100 \ + --driver-class-path $HADOOP_CONF_DIR \ + --class org.apache.hudi.utilities.streamer.HoodieStreamer \ + --table-type MERGE_ON_READ \ + --source-class org.apache.hudi.utilities.sources.JsonKafkaSource \ + --source-ordering-field ts \ + 
--target-base-path /user/hive/warehouse/stock_ticks_mor \ + --target-table stock_ticks_mor \ + --props /var/demo/config/kafka-source.properties \ + --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider +``` + + - **Continuous Mode** : Here, Hudi Streamer runs in an infinite loop, with each iteration performing one ingestion round as described in **Run Once Mode**. The frequency of data ingestion can be controlled by the configuration "--min-sync-interval-seconds". For Merge-On-Read tables, Compaction is run in asynchronous fashion concurrently with ingestion unless disabled by passing the flag "--disable-compaction". Every ingestion run triggers a compaction request asynchronously and this frequency can be changed by setting the property "hoodie.compact.inline.max.delta.commits". As both ingestion and compaction are running in the same spark context, you can use resource allocation configuration in Hudi Streamer CLI such as ("--delta-sync-scheduling-weight", "--compact-scheduling-weight", "--delta-sync-scheduling-minshare", and "--compact-scheduling-minshare") to control executor allocation between ingestion and compaction. + +Here is an example invocation for reading from kafka topic in a continuous mode and writing to Merge On Read table type in a yarn cluster.
+ +```java +[hoodie]$ spark-submit \ + --packages org.apache.hudi:hudi-utilities-slim-bundle_2.12:1.0.0,org.apache.hudi:hudi-spark3.5-bundle_2.12:1.0.0 \ + --master yarn \ + --deploy-mode cluster \ + --num-executors 10 \ + --executor-memory 3g \ + --driver-memory 6g \ + --conf spark.driver.extraJavaOptions="-XX:+PrintGCApplicationStoppedTime -XX:+PrintGCApplicationConcurrentTime -XX:+PrintGCTimeStamps -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/tmp/varadarb_ds_driver.hprof" \ + --conf spark.executor.extraJavaOptions="-XX:+PrintGCApplicationStoppedTime -XX:+PrintGCApplicationConcurrentTime -XX:+PrintGCTimeStamps -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/tmp/varadarb_ds_executor.hprof" \ + --queue hadoop-platform-queue \ + --conf spark.scheduler.mode=FAIR \ + --conf spark.yarn.executor.memoryOverhead=1072 \ + --conf spark.yarn.driver.memoryOverhead=2048 \ + --conf spark.task.cpus=1 \ + --conf spark.executor.cores=1 \ + --conf spark.task.maxFailures=10 \ + --conf spark.memory.fraction=0.4 \ + --conf spark.rdd.compress=true \ + --conf spark.kryoserializer.buffer.max=200m \ + --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \ + --conf spark.memory.storageFraction=0.1 \ + --conf spark.shuffle.service.enabled=true \ + --conf spark.sql.hive.convertMetastoreParquet=false \ + --conf spark.ui.port=5555 \ + --conf spark.driver.maxResultSize=3g \ + --conf spark.executor.heartbeatInterval=120s \ + --conf spark.network.timeout=600s \ + --conf spark.eventLog.overwrite=true \ + --conf spark.eventLog.enabled=true \ + --conf spark.eventLog.dir=hdfs:///user/spark/applicationHistory \ + --conf spark.yarn.max.executor.failures=10 \ + --conf spark.sql.catalogImplementation=hive \ + --conf spark.sql.shuffle.partitions=100 \ + --driver-class-path $HADOOP_CONF_DIR \ + --class org.apache.hudi.utilities.streamer.HoodieStreamer \ + --table-type MERGE_ON_READ \ + --source-class org.apache.hudi.utilities.sources.JsonKafkaSource \ + --source-ordering-field ts \ + 
--target-base-path /user/hive/warehouse/stock_ticks_mor \ + --target-table stock_ticks_mor \ + --props /var/demo/config/kafka-source.properties \ + --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider \ + --continuous +``` + +### Spark Datasource Writer Jobs + +As described in [Batch Writes](writing_data#spark-datasource-api), you can use spark datasource to ingest to hudi table. This mechanism allows you to ingest any spark dataframe in Hudi format. Hudi Spark DataSource also supports spark streaming to ingest a streaming source to Hudi table. For Merge On Read table types, inline compaction is turned on by default which runs after every ingestion run. The compaction frequency can be changed by setting the property "hoodie.compact.inline.max.delta.commits". + +Here is an example invocation using spark datasource: + +```java +inputDF.write() + .format("org.apache.hudi") + .options(clientOpts) // any of the Hudi client opts can be passed in as well + .option("hoodie.datasource.write.recordkey.field", "_row_key") + .option("hoodie.datasource.write.partitionpath.field", "partition") + .option("hoodie.datasource.write.precombine.field", "timestamp") + .option("hoodie.table.name", tableName) + .mode(SaveMode.Append) + .save(basePath); +``` + +## Upgrading + +New Hudi releases are listed on the [releases page](/releases/download), with detailed notes which list all the changes, with highlights in each release. +At the end of the day, Hudi is a storage system and with that comes a lot of responsibilities, which we take seriously. + +As general guidelines, + + - We strive to keep all changes backwards compatible (i.e., new code can read old data/timeline files) and when we cannot, we will provide upgrade/downgrade tools via the CLI + - We cannot always guarantee forward compatibility (i.e., old code being able to read data/timeline files written by a greater version). This is generally the norm, since no new features can be built otherwise.
+ However, any such large changes will be turned off by default, for a smooth transition to the newer release. After a few releases and once enough users deem the feature stable in production, we will flip the defaults in a subsequent release. + - Always upgrade the query bundles (mr-bundle, presto-bundle, spark-bundle) first and then upgrade the writers (Hudi Streamer, spark jobs using datasource). This often provides the best experience and it's easy to fix + any issues by rolling forward/back the writer code (which typically you might have more control over) + - With large, feature rich releases we recommend migrating slowly, by first testing in staging environments and running your own tests. Upgrading Hudi is no different than upgrading any database system. + +Note that release notes can override this information with specific instructions, applicable on a case-by-case basis. + +### Upgrading to 1.0.0 + +1.0.0 is a major release with significant format changes. To ensure a smooth migration experience, we recommend the +following steps: + +1. Stop any async table services in 0.x completely. +2. Upgrade writers to 1.x with table version (tv) 6, `autoUpgrade` and metadata disabled (this won't auto-upgrade anything); + 0.x readers will continue to work; writers can also be readers and will continue to read tv=6. + a. Set `hoodie.write.auto.upgrade` to false. + b. Set `hoodie.metadata.enable` to false. +3. Upgrade table services to 1.x with tv=6, and resume operations. +4. Upgrade all remaining readers to 1.x, with tv=6. +5. Redeploy writers with tv=8; table services and readers will adapt/pick up tv=8 on the fly. +6. Once all readers and writers are in 1.x, we are good to enable any new features, including metadata, with 1.x tables. + +During the upgrade, the metadata table will not be updated and it will be behind the data table. It is important to note +that the metadata table will be updated only when the writer is upgraded to tv=8.
So, even the readers should keep metadata +disabled during rolling upgrade until all writers are upgraded to tv=8. + +:::caution +Most things are seamlessly handled by the auto upgrade process, but there are some limitations. Please read through the +limitations of the upgrade/downgrade process before proceeding to migrate. Please +check [RFC-78](https://github.com/apache/hudi/blob/master/rfc/rfc-78/rfc-78.md#support-matrix-for-different-readers-and-writers) +for more details. +::: + +## Downgrading + +Upgrade is automatic whenever a new Hudi version is used whereas downgrade is a manual step. We need to use the Hudi +CLI to downgrade a table from a higher version to a lower version. Let's consider an example where we create a table using +0.11.0, upgrade it to 0.13.0 and then downgrade it via Hudi CLI. + +Launch spark shell with Hudi 0.11.0 version. +```shell +spark-shell \ + --packages org.apache.hudi:hudi-spark3.2-bundle_2.12:0.11.0 \ + --conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' \ + --conf 'spark.sql.catalog.spark_catalog=org.apache.spark.sql.hudi.catalog.HoodieCatalog' \ + --conf 'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension' +``` + +Create a hudi table by using the scala script below.
+```scala +import org.apache.hudi.QuickstartUtils._ +import scala.collection.JavaConversions._ +import org.apache.spark.sql.SaveMode._ +import org.apache.hudi.DataSourceReadOptions._ +import org.apache.hudi.DataSourceWriteOptions._ +import org.apache.hudi.config.HoodieWriteConfig._ +import org.apache.hudi.common.model.HoodieRecord +import org.apache.hudi.common.table.timeline.HoodieTimeline +import org.apache.hudi.common.fs.FSUtils +import org.apache.hudi.HoodieDataSourceHelpers + +val dataGen = new DataGenerator +val tableType = MOR_TABLE_TYPE_OPT_VAL +val basePath = "file:///tmp/hudi_table" +val tableName = "hudi_table" + +val inserts = convertToStringList(dataGen.generateInserts(100)).toList +val insertDf = spark.read.json(spark.sparkContext.parallelize(inserts, 2)) +insertDf.write.format("hudi"). + options(getQuickstartWriteConfigs). + option("hoodie.datasource.write.precombine.field", "ts"). + option("hoodie.datasource.write.recordkey.field", "uuid"). + option("hoodie.datasource.write.partitionpath.field", "partitionpath"). + option("hoodie.table.name", tableName). + option("hoodie.datasource.write.operation", "insert"). + mode(Append). + save(basePath) +``` + +You will see an entry for table version in hoodie.properties which states the table version is 4. +```shell +bash$ cat /tmp/hudi_table/.hoodie/hoodie.properties | grep hoodie.table.version +hoodie.table.version=4 +``` + +Launch a new spark shell using version 0.13.0 and append to the same table using the script above. Note the upgrade +happens automatically with the new version. +```shell +spark-shell \ + --packages org.apache.hudi:hudi-spark3.2-bundle_2.12:0.13.1 \ + --conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' \ + --conf 'spark.sql.catalog.spark_catalog=org.apache.spark.sql.hudi.catalog.HoodieCatalog' \ + --conf 'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension' +``` + +After upgrade, the table version is updated to 5. 
+```shell +bash$ cat /tmp/hudi_table/.hoodie/hoodie.properties | grep hoodie.table.version +hoodie.table.version=5 +``` + +Let's try downgrading the table back to version 4. For downgrading we will need to use Hudi CLI and execute downgrade. +For more details on downgrade, please refer to the documentation [here](cli#upgrade-and-downgrade-table). +```shell +connect --path /tmp/hudi_table +downgrade table --toVersion 4 +``` + +After downgrade, the table version is updated to 4. +```shell +bash$ cat /tmp/hudi_table/.hoodie/hoodie.properties | grep hoodie.table.version +hoodie.table.version=4 +``` + +## Migrating + +Currently migrating to Hudi can be done using two approaches: + +- **Convert newer partitions to Hudi** : This model is suitable for large event tables (e.g., click streams, ad impressions), which also typically receive writes for the last few days alone. You can convert the last + N partitions to Hudi and proceed writing as if it were a Hudi table to begin with. The Hudi query side code is able to correctly handle both hudi and non-hudi data partitions. +- **Full conversion to Hudi** : This model is suitable if you are currently bulk/full loading the table a few times a day (e.g., database ingestion). The full conversion to Hudi is simply a one-time step (akin to 1 run of your existing job), + which moves all of the data into the Hudi format and provides the ability to incrementally update for future writes. + +For more details, refer to the detailed [migration guide](/docs/migration_guide). In the future, we will be supporting seamless zero-copy bootstrap of existing tables with all the upsert/incremental query capabilities fully supported.
diff --git a/website/versioned_docs/version-1.0.0/disaster_recovery.md b/website/versioned_docs/version-1.0.0/disaster_recovery.md new file mode 100644 index 0000000000000..a264b7d361555 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/disaster_recovery.md @@ -0,0 +1,309 @@ +--- +title: Disaster Recovery +toc: true +--- + +Disaster Recovery is very much mission-critical for any software. Especially when it comes to data systems, the impact could be very serious +leading to delays in business decisions or even wrong business decisions at times. Apache Hudi has two operations to assist you in recovering +data from a previous state: `savepoint` and `restore`. + +## Savepoint + +As the name suggests, `savepoint` saves the table as of the commit time, so that it lets you restore the table to this +savepoint at a later point in time if need be. Care is taken to ensure cleaner will not clean up any files that are savepointed. +On similar lines, savepoint cannot be triggered on a commit that is already cleaned up. In simpler terms, this is synonymous +to taking a backup, just that we don't make a new copy of the table, but just save the state of the table elegantly so that +we can restore it later when in need. + +## Restore + +This operation lets you restore your table to one of the savepoint commits. This operation cannot be undone (or reversed) and so care +should be taken before doing a restore. Hudi will delete all data files and commit files (timeline files) greater than the +savepoint commit to which the table is being restored. You should pause all writes to the table when performing +a restore since they are likely to fail while the restore is in progress. Reads could also fail, since snapshot queries +will be hitting the latest files, which have a high possibility of being deleted by the restore. + +## Runbook + +Savepoint and restore can only be triggered from `hudi-cli`.
Let's walk through an example of how one can take savepoint +and later restore the state of the table. + +Let's create a hudi table via `spark-shell` and trigger a batch of inserts. + +```scala +import org.apache.hudi.QuickstartUtils._ +import scala.collection.JavaConversions._ +import org.apache.spark.sql.SaveMode._ +import org.apache.hudi.DataSourceReadOptions._ +import org.apache.hudi.DataSourceWriteOptions._ +import org.apache.hudi.config.HoodieWriteConfig._ + +val tableName = "hudi_trips_cow" +val basePath = "file:///tmp/hudi_trips_cow" +val dataGen = new DataGenerator + +val inserts = convertToStringList(dataGen.generateInserts(10)) +val df = spark.read.json(spark.sparkContext.parallelize(inserts, 2)) +df.write.format("hudi"). + options(getQuickstartWriteConfigs). + option("hoodie.datasource.write.precombine.field", "ts"). + option("hoodie.datasource.write.recordkey.field", "uuid"). + option("hoodie.datasource.write.partitionpath.field", "partitionpath"). + option("hoodie.table.name", tableName). + mode(Overwrite). + save(basePath) +``` + +Let's add four more batches of inserts. +```scala +for (_ <- 1 to 4) { + val inserts = convertToStringList(dataGen.generateInserts(10)) + val df = spark.read.json(spark.sparkContext.parallelize(inserts, 2)) + df.write.format("hudi"). + options(getQuickstartWriteConfigs). + option("hoodie.datasource.write.precombine.field", "ts"). + option("hoodie.datasource.write.recordkey.field", "uuid"). + option("hoodie.datasource.write.partitionpath.field", "partitionpath"). + option("hoodie.table.name", tableName). + mode(Append). + save(basePath) +} +``` + +Total record count should be 50. +```scala +val tripsSnapshotDF = spark. + read. + format("hudi"). 
+ load(basePath) +tripsSnapshotDF.createOrReplaceTempView("hudi_trips_snapshot") + +spark.sql("select count(partitionpath, uuid) from hudi_trips_snapshot").show() + ++--------------------------+ +|count(partitionpath, uuid)| ++--------------------------+ +| 50| ++--------------------------+ +``` +Let's take a look at the timeline after 5 batches of inserts. +```shell +ls -ltr /tmp/hudi_trips_cow/.hoodie +total 128 +drwxr-xr-x 2 nsb wheel 64 Jan 28 16:00 archived +-rw-r--r-- 1 nsb wheel 546 Jan 28 16:00 hoodie.properties +-rw-r--r-- 1 nsb wheel 0 Jan 28 16:00 20220128160040171.commit.requested +-rw-r--r-- 1 nsb wheel 2594 Jan 28 16:00 20220128160040171.inflight +-rw-r--r-- 1 nsb wheel 4374 Jan 28 16:00 20220128160040171.commit +-rw-r--r-- 1 nsb wheel 0 Jan 28 16:01 20220128160124637.commit.requested +-rw-r--r-- 1 nsb wheel 2594 Jan 28 16:01 20220128160124637.inflight +-rw-r--r-- 1 nsb wheel 4414 Jan 28 16:01 20220128160124637.commit +-rw-r--r-- 1 nsb wheel 0 Jan 28 16:02 20220128160226172.commit.requested +-rw-r--r-- 1 nsb wheel 2594 Jan 28 16:02 20220128160226172.inflight +-rw-r--r-- 1 nsb wheel 4427 Jan 28 16:02 20220128160226172.commit +-rw-r--r-- 1 nsb wheel 0 Jan 28 16:02 20220128160229636.commit.requested +-rw-r--r-- 1 nsb wheel 2594 Jan 28 16:02 20220128160229636.inflight +-rw-r--r-- 1 nsb wheel 4428 Jan 28 16:02 20220128160229636.commit +-rw-r--r-- 1 nsb wheel 0 Jan 28 16:02 20220128160245447.commit.requested +-rw-r--r-- 1 nsb wheel 2594 Jan 28 16:02 20220128160245447.inflight +-rw-r--r-- 1 nsb wheel 4428 Jan 28 16:02 20220128160245447.commit +``` + +Let's trigger a savepoint as of the latest commit. Savepoint can only be done via `hudi-cli`. + +```sh +./hudi-cli.sh + +connect --path /tmp/hudi_trips_cow/ +commits show +set --conf SPARK_HOME= +savepoint create --commit 20220128160245447 --sparkMaster local[2] +``` + +:::note NOTE: +Make sure you replace 20220128160245447 with the latest commit in your table. +::: + +Let's check the timeline after savepoint. 
+```shell +ls -ltr /tmp/hudi_trips_cow/.hoodie +total 136 +drwxr-xr-x 2 nsb wheel 64 Jan 28 16:00 archived +-rw-r--r-- 1 nsb wheel 546 Jan 28 16:00 hoodie.properties +-rw-r--r-- 1 nsb wheel 0 Jan 28 16:00 20220128160040171.commit.requested +-rw-r--r-- 1 nsb wheel 2594 Jan 28 16:00 20220128160040171.inflight +-rw-r--r-- 1 nsb wheel 4374 Jan 28 16:00 20220128160040171.commit +-rw-r--r-- 1 nsb wheel 0 Jan 28 16:01 20220128160124637.commit.requested +-rw-r--r-- 1 nsb wheel 2594 Jan 28 16:01 20220128160124637.inflight +-rw-r--r-- 1 nsb wheel 4414 Jan 28 16:01 20220128160124637.commit +-rw-r--r-- 1 nsb wheel 0 Jan 28 16:02 20220128160226172.commit.requested +-rw-r--r-- 1 nsb wheel 2594 Jan 28 16:02 20220128160226172.inflight +-rw-r--r-- 1 nsb wheel 4427 Jan 28 16:02 20220128160226172.commit +-rw-r--r-- 1 nsb wheel 0 Jan 28 16:02 20220128160229636.commit.requested +-rw-r--r-- 1 nsb wheel 2594 Jan 28 16:02 20220128160229636.inflight +-rw-r--r-- 1 nsb wheel 4428 Jan 28 16:02 20220128160229636.commit +-rw-r--r-- 1 nsb wheel 0 Jan 28 16:02 20220128160245447.commit.requested +-rw-r--r-- 1 nsb wheel 2594 Jan 28 16:02 20220128160245447.inflight +-rw-r--r-- 1 nsb wheel 4428 Jan 28 16:02 20220128160245447.commit +-rw-r--r-- 1 nsb wheel 0 Jan 28 16:05 20220128160245447.savepoint.inflight +-rw-r--r-- 1 nsb wheel 1168 Jan 28 16:05 20220128160245447.savepoint +``` + +You could notice that savepoint meta files are added which keeps track of the files that are part of the latest table snapshot. + +Now, let's continue adding three more batches of inserts. + +```scala +for (_ <- 1 to 3) { + val inserts = convertToStringList(dataGen.generateInserts(10)) + val df = spark.read.json(spark.sparkContext.parallelize(inserts, 2)) + df.write.format("hudi"). + options(getQuickstartWriteConfigs). + option("hoodie.datasource.write.precombine.field", "ts"). + option("hoodie.datasource.write.recordkey.field", "uuid"). + option("hoodie.datasource.write.partitionpath.field", "partitionpath"). 
+ option("hoodie.table.name", tableName). + mode(Append). + save(basePath) +} +``` + +Total record count will be 80 since we have done 8 batches in total. (5 until savepoint and 3 after savepoint) +```scala +val tripsSnapshotDF = spark. + read. + format("hudi"). + load(basePath) +tripsSnapshotDF.createOrReplaceTempView("hudi_trips_snapshot") + +spark.sql("select count(partitionpath, uuid) from hudi_trips_snapshot").show() ++--------------------------+ +|count(partitionpath, uuid)| ++--------------------------+ +| 80| ++--------------------------+ +``` + +Let's say something bad happened, and you want to restore your table to an older snapshot. As we called out earlier, we can +trigger restore only from `hudi-cli`. And do remember to bring down all of your writer processes while doing a restore. + +Let's checkout timeline once, before we trigger the restore. +```shell +ls -ltr /tmp/hudi_trips_cow/.hoodie +total 208 +drwxr-xr-x 2 nsb wheel 64 Jan 28 16:00 archived +-rw-r--r-- 1 nsb wheel 546 Jan 28 16:00 hoodie.properties +-rw-r--r-- 1 nsb wheel 0 Jan 28 16:00 20220128160040171.commit.requested +-rw-r--r-- 1 nsb wheel 2594 Jan 28 16:00 20220128160040171.inflight +-rw-r--r-- 1 nsb wheel 4374 Jan 28 16:00 20220128160040171.commit +-rw-r--r-- 1 nsb wheel 0 Jan 28 16:01 20220128160124637.commit.requested +-rw-r--r-- 1 nsb wheel 2594 Jan 28 16:01 20220128160124637.inflight +-rw-r--r-- 1 nsb wheel 4414 Jan 28 16:01 20220128160124637.commit +-rw-r--r-- 1 nsb wheel 0 Jan 28 16:02 20220128160226172.commit.requested +-rw-r--r-- 1 nsb wheel 2594 Jan 28 16:02 20220128160226172.inflight +-rw-r--r-- 1 nsb wheel 4427 Jan 28 16:02 20220128160226172.commit +-rw-r--r-- 1 nsb wheel 0 Jan 28 16:02 20220128160229636.commit.requested +-rw-r--r-- 1 nsb wheel 2594 Jan 28 16:02 20220128160229636.inflight +-rw-r--r-- 1 nsb wheel 4428 Jan 28 16:02 20220128160229636.commit +-rw-r--r-- 1 nsb wheel 0 Jan 28 16:02 20220128160245447.commit.requested +-rw-r--r-- 1 nsb wheel 2594 Jan 28 16:02 
20220128160245447.inflight +-rw-r--r-- 1 nsb wheel 4428 Jan 28 16:02 20220128160245447.commit +-rw-r--r-- 1 nsb wheel 0 Jan 28 16:05 20220128160245447.savepoint.inflight +-rw-r--r-- 1 nsb wheel 1168 Jan 28 16:05 20220128160245447.savepoint +-rw-r--r-- 1 nsb wheel 0 Jan 28 16:06 20220128160620557.commit.requested +-rw-r--r-- 1 nsb wheel 2594 Jan 28 16:06 20220128160620557.inflight +-rw-r--r-- 1 nsb wheel 4428 Jan 28 16:06 20220128160620557.commit +-rw-r--r-- 1 nsb wheel 0 Jan 28 16:06 20220128160627501.commit.requested +-rw-r--r-- 1 nsb wheel 2594 Jan 28 16:06 20220128160627501.inflight +-rw-r--r-- 1 nsb wheel 4428 Jan 28 16:06 20220128160627501.commit +-rw-r--r-- 1 nsb wheel 0 Jan 28 16:06 20220128160630785.commit.requested +-rw-r--r-- 1 nsb wheel 2594 Jan 28 16:06 20220128160630785.inflight +-rw-r--r-- 1 nsb wheel 4428 Jan 28 16:06 20220128160630785.commit +``` + +If you are continuing in the same `hudi-cli` session, you can just execute `refresh` so that table state gets refreshed to +its latest state. If not, connect to the table again. + +```shell +./hudi-cli.sh + +connect --path /tmp/hudi_trips_cow/ +commits show +set --conf SPARK_HOME= +savepoints show +╔═══════════════════╗ +║ SavepointTime ║ +╠═══════════════════╣ +║ 20220128160245447 ║ +╚═══════════════════╝ +savepoint rollback --savepoint 20220128160245447 --sparkMaster local[2] +``` + +:::note NOTE: +Make sure you replace 20220128160245447 with the latest savepoint in your table. +::: + +Hudi table should have been restored to the savepointed commit 20220128160245447. Both data files and timeline files should have +been deleted. 
+```shell +ls -ltr /tmp/hudi_trips_cow/.hoodie +total 152 +drwxr-xr-x 2 nsb wheel 64 Jan 28 16:00 archived +-rw-r--r-- 1 nsb wheel 546 Jan 28 16:00 hoodie.properties +-rw-r--r-- 1 nsb wheel 0 Jan 28 16:00 20220128160040171.commit.requested +-rw-r--r-- 1 nsb wheel 2594 Jan 28 16:00 20220128160040171.inflight +-rw-r--r-- 1 nsb wheel 4374 Jan 28 16:00 20220128160040171.commit +-rw-r--r-- 1 nsb wheel 0 Jan 28 16:01 20220128160124637.commit.requested +-rw-r--r-- 1 nsb wheel 2594 Jan 28 16:01 20220128160124637.inflight +-rw-r--r-- 1 nsb wheel 4414 Jan 28 16:01 20220128160124637.commit +-rw-r--r-- 1 nsb wheel 0 Jan 28 16:02 20220128160226172.commit.requested +-rw-r--r-- 1 nsb wheel 2594 Jan 28 16:02 20220128160226172.inflight +-rw-r--r-- 1 nsb wheel 4427 Jan 28 16:02 20220128160226172.commit +-rw-r--r-- 1 nsb wheel 0 Jan 28 16:02 20220128160229636.commit.requested +-rw-r--r-- 1 nsb wheel 2594 Jan 28 16:02 20220128160229636.inflight +-rw-r--r-- 1 nsb wheel 4428 Jan 28 16:02 20220128160229636.commit +-rw-r--r-- 1 nsb wheel 0 Jan 28 16:02 20220128160245447.commit.requested +-rw-r--r-- 1 nsb wheel 2594 Jan 28 16:02 20220128160245447.inflight +-rw-r--r-- 1 nsb wheel 4428 Jan 28 16:02 20220128160245447.commit +-rw-r--r-- 1 nsb wheel 0 Jan 28 16:05 20220128160245447.savepoint.inflight +-rw-r--r-- 1 nsb wheel 1168 Jan 28 16:05 20220128160245447.savepoint +-rw-r--r-- 1 nsb wheel 0 Jan 28 16:07 20220128160732437.restore.inflight +-rw-r--r-- 1 nsb wheel 4152 Jan 28 16:07 20220128160732437.restore +``` + +Let's check the total record count in the table. Should match the records we had, just before we triggered the savepoint. +```scala +val tripsSnapshotDF = spark. + read. + format("hudi"). 
+ load(basePath) +tripsSnapshotDF.createOrReplaceTempView("hudi_trips_snapshot") + +spark.sql("select count(partitionpath, uuid) from hudi_trips_snapshot").show() ++--------------------------+ +|count(partitionpath, uuid)| ++--------------------------+ +| 50| ++--------------------------+ +``` + +As you can see, the entire table state is restored back to the commit which was savepointed. Users can choose to trigger savepoints +at a regular cadence and keep deleting older savepoints when new ones are created. `hudi-cli` has a command `savepoint delete` +to assist in deleting a savepoint. Please remember that the cleaner will not clean up files that are savepointed, so users +should ensure they delete savepoints from time to time. Otherwise, storage reclamation may not happen. + +Note: Savepoint and restore for MOR tables are available only from 0.11 onwards. + +## Related Resources +

Videos

+ +* [Use Glue 4.0 to take regular save points for your Hudi tables for backup or disaster Recovery](https://www.youtube.com/watch?v=VgIMPSK7rFAa) +* [How to Rollback to Previous Checkpoint during Disaster in Apache Hudi using Glue 4.0 Demo](https://www.youtube.com/watch?v=Vi25q4vzogs) + + + + + + + + diff --git a/website/versioned_docs/version-1.0.0/docker_demo.md b/website/versioned_docs/version-1.0.0/docker_demo.md new file mode 100644 index 0000000000000..0564bce20a7c6 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/docker_demo.md @@ -0,0 +1,1526 @@ +--- +title: Docker Demo +keywords: [ hudi, docker, demo] +toc: true +last_modified_at: 2019-12-30T15:59:57-04:00 +--- +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +## A Demo using Docker containers + +Let's use a real-world example to see how Hudi works end to end. For this purpose, a self-contained +data infrastructure is brought up in a local Docker cluster within your computer. It requires the +Hudi repo to have been cloned locally. + +The steps have been tested on a Mac laptop. + +### Prerequisites + + * Clone the [Hudi repository](https://github.com/apache/hudi) to your local machine. + * Docker Setup : For Mac, please follow the steps as defined in [Install Docker Desktop on Mac](https://docs.docker.com/desktop/install/mac-install/). For running Spark-SQL queries, please ensure at least 6 GB and 4 CPUs are allocated to Docker (See Docker -> Preferences -> Advanced). Otherwise, Spark-SQL queries could be killed because of memory issues. + * kcat : A command-line utility to publish/consume from Kafka topics. Use `brew install kcat` to install kcat. + * /etc/hosts : The demo references many services running in containers by their hostnames. 
Add the following settings to /etc/hosts + + ```java + 127.0.0.1 adhoc-1 + 127.0.0.1 adhoc-2 + 127.0.0.1 namenode + 127.0.0.1 datanode1 + 127.0.0.1 hiveserver + 127.0.0.1 hivemetastore + 127.0.0.1 kafkabroker + 127.0.0.1 sparkmaster + 127.0.0.1 zookeeper + ``` + * Java : Java SE Development Kit 8. + * Maven : A build automation tool for Java projects. + * jq : A lightweight and flexible command-line JSON processor. Use `brew install jq` to install jq. + +Also, this has not been tested on some environments like Docker on Windows. + + +## Setting up Docker Cluster + + +### Build Hudi + +The first step is to build Hudi. **Note** This step builds Hudi on default supported scala version - 2.11. + +NOTE: Make sure you've cloned the [Hudi repository](https://github.com/apache/hudi) first. + +```java +cd +mvn clean package -Pintegration-tests -DskipTests +``` + +### Bringing up Demo Cluster + +The next step is to run the Docker compose script and setup configs for bringing up the cluster. These files are in the [Hudi repository](https://github.com/apache/hudi) which you should already have locally on your machine from the previous steps. + +This should pull the Docker images from Docker hub and setup the Docker cluster. + + + + +```java +cd docker +./setup_demo.sh +.... +.... +.... +[+] Running 10/13 +⠿ Container zookeeper Removed 8.6s +⠿ Container datanode1 Removed 18.3s +⠿ Container trino-worker-1 Removed 50.7s +⠿ Container spark-worker-1 Removed 16.7s +⠿ Container adhoc-2 Removed 16.9s +⠿ Container graphite Removed 16.9s +⠿ Container kafkabroker Removed 14.1s +⠿ Container adhoc-1 Removed 14.1s +⠿ Container presto-worker-1 Removed 11.9s +⠿ Container presto-coordinator-1 Removed 34.6s +....... +...... 
+[+] Running 17/17 +⠿ adhoc-1 Pulled 2.9s +⠿ graphite Pulled 2.8s +⠿ spark-worker-1 Pulled 3.0s +⠿ kafka Pulled 2.9s +⠿ datanode1 Pulled 2.9s +⠿ hivemetastore Pulled 2.9s +⠿ hiveserver Pulled 3.0s +⠿ hive-metastore-postgresql Pulled 2.8s +⠿ presto-coordinator-1 Pulled 2.9s +⠿ namenode Pulled 2.9s +⠿ trino-worker-1 Pulled 2.9s +⠿ sparkmaster Pulled 2.9s +⠿ presto-worker-1 Pulled 2.9s +⠿ zookeeper Pulled 2.8s +⠿ adhoc-2 Pulled 2.9s +⠿ historyserver Pulled 2.9s +⠿ trino-coordinator-1 Pulled 2.9s +[+] Running 17/17 +⠿ Container zookeeper Started 41.0s +⠿ Container kafkabroker Started 41.7s +⠿ Container graphite Started 41.5s +⠿ Container hive-metastore-postgresql Running 0.0s +⠿ Container namenode Running 0.0s +⠿ Container hivemetastore Running 0.0s +⠿ Container trino-coordinator-1 Runni... 0.0s +⠿ Container presto-coordinator-1 Star... 42.1s +⠿ Container historyserver Started 41.0s +⠿ Container datanode1 Started 49.9s +⠿ Container hiveserver Running 0.0s +⠿ Container trino-worker-1 Started 42.1s +⠿ Container sparkmaster Started 41.9s +⠿ Container spark-worker-1 Started 50.2s +⠿ Container adhoc-2 Started 38.5s +⠿ Container adhoc-1 Started 38.5s +⠿ Container presto-worker-1 Started 38.4s +Copying spark default config and setting up configs +Copying spark default config and setting up configs +$ docker ps +``` + + + + +:::note Please note the following for Mac AArch64 users +
    +
  • The demo must be built and run using the master branch. We currently plan to include support starting with the + 0.13.0 release.
  • +
  • Presto and Trino are not currently supported in the demo.
  • +
+::: + +```java +cd docker +./setup_demo.sh --mac-aarch64 +....... +...... +[+] Running 12/12 +⠿ adhoc-1 Pulled 2.9s +⠿ spark-worker-1 Pulled 3.0s +⠿ kafka Pulled 2.9s +⠿ datanode1 Pulled 2.9s +⠿ hivemetastore Pulled 2.9s +⠿ hiveserver Pulled 3.0s +⠿ hive-metastore-postgresql Pulled 2.8s +⠿ namenode Pulled 2.9s +⠿ sparkmaster Pulled 2.9s +⠿ zookeeper Pulled 2.8s +⠿ adhoc-2 Pulled 2.9s +⠿ historyserver Pulled 2.9s +[+] Running 12/12 +⠿ Container zookeeper Started 41.0s +⠿ Container kafkabroker Started 41.7s +⠿ Container hive-metastore-postgresql Running 0.0s +⠿ Container namenode Running 0.0s +⠿ Container hivemetastore Running 0.0s +⠿ Container historyserver Started 41.0s +⠿ Container datanode1 Started 49.9s +⠿ Container hiveserver Running 0.0s +⠿ Container sparkmaster Started 41.9s +⠿ Container spark-worker-1 Started 50.2s +⠿ Container adhoc-2 Started 38.5s +⠿ Container adhoc-1 Started 38.5s +Copying spark default config and setting up configs +Copying spark default config and setting up configs +$ docker ps +``` +
+ +
+ +At this point, the Docker cluster will be up and running. The demo cluster brings up the following services + + * HDFS Services (NameNode, DataNode) + * Spark Master and Worker + * Hive Services (Metastore, HiveServer2 along with PostgresDB) + * Kafka Broker and a Zookeeper Node (Kafka will be used as upstream source for the demo) + * Containers for Presto setup (Presto coordinator and worker) + * Containers for Trino setup (Trino coordinator and worker) + * Adhoc containers to run Hudi/Hive CLI commands + +## Demo + +Stock Tracker data will be used to showcase different Hudi query types and the effects of Compaction. + +Take a look at the directory `docker/demo/data`. There are 2 batches of stock data - each at 1 minute granularity. +The first batch contains stocker tracker data for some stock symbols during the first hour of trading window +(9:30 a.m to 10:30 a.m). The second batch contains tracker data for next 30 mins (10:30 - 11 a.m). Hudi will +be used to ingest these batches to a table which will contain the latest stock tracker data at hour level granularity. +The batches are windowed intentionally so that the second batch contains updates to some of the rows in the first batch. + +### Step 1 : Publish the first batch to Kafka + +Upload the first batch to Kafka topic 'stock ticks' + +`cat docker/demo/data/batch_1.json | kcat -b kafkabroker -t stock_ticks -P` + +To check if the new topic shows up, use +```java +kcat -b kafkabroker -L -J | jq . +{ + "originating_broker": { + "id": 1001, + "name": "kafkabroker:9092/1001" + }, + "query": { + "topic": "*" + }, + "brokers": [ + { + "id": 1001, + "name": "kafkabroker:9092" + } + ], + "topics": [ + { + "topic": "stock_ticks", + "partitions": [ + { + "partition": 0, + "leader": 1001, + "replicas": [ + { + "id": 1001 + } + ], + "isrs": [ + { + "id": 1001 + } + ] + } + ] + } + ] +} +``` + +### Step 2: Incrementally ingest data from Kafka topic + +Hudi comes with a tool named Hudi Streamer. 
This tool can connect to variety of data sources (including Kafka) to +pull changes and apply to Hudi table using upsert/insert primitives. Here, we will use the tool to download +json data from kafka topic and ingest to both COW and MOR tables we initialized in the previous step. This tool +automatically initializes the tables in the file-system if they do not exist yet. + +```java +docker exec -it adhoc-2 /bin/bash + +# Run the following spark-submit command to execute the Hudi Streamer and ingest to stock_ticks_cow table in HDFS +spark-submit \ + --class org.apache.hudi.utilities.streamer.HoodieStreamer $HUDI_UTILITIES_BUNDLE \ + --table-type COPY_ON_WRITE \ + --source-class org.apache.hudi.utilities.sources.JsonKafkaSource \ + --source-ordering-field ts \ + --target-base-path /user/hive/warehouse/stock_ticks_cow \ + --target-table stock_ticks_cow --props /var/demo/config/kafka-source.properties \ + --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider + +# Run the following spark-submit command to execute the Hudi Streamer and ingest to stock_ticks_mor table in HDFS +spark-submit \ + --class org.apache.hudi.utilities.streamer.HoodieStreamer $HUDI_UTILITIES_BUNDLE \ + --table-type MERGE_ON_READ \ + --source-class org.apache.hudi.utilities.sources.JsonKafkaSource \ + --source-ordering-field ts \ + --target-base-path /user/hive/warehouse/stock_ticks_mor \ + --target-table stock_ticks_mor \ + --props /var/demo/config/kafka-source.properties \ + --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider \ + --disable-compaction + +# As part of the setup (Look at setup_demo.sh), the configs needed for Hudi Streamer is uploaded to HDFS. The configs +# contain mostly Kafa connectivity settings, the avro-schema to be used for ingesting along with key and partitioning fields. + +exit +``` + +You can use HDFS web-browser to look at the tables +`http://namenode:50070/explorer.html#/user/hive/warehouse/stock_ticks_cow`. 
+ +You can explore the new partition folder created in the table along with a "commit" / "deltacommit" +file under .hoodie which signals a successful commit. + +There will be a similar setup when you browse the MOR table +`http://namenode:50070/explorer.html#/user/hive/warehouse/stock_ticks_mor` + + +### Step 3: Sync with Hive + +At this step, the tables are available in HDFS. We need to sync with Hive to create new Hive tables and add partitions +in order to run Hive queries against those tables. + +```java +docker exec -it adhoc-2 /bin/bash + +# This command takes in the HiveServer URL and the COW Hudi table location in HDFS and syncs the HDFS state to Hive +/var/hoodie/ws/hudi-sync/hudi-hive-sync/run_sync_tool.sh \ + --jdbc-url jdbc:hive2://hiveserver:10000 \ + --user hive \ + --pass hive \ + --partitioned-by dt \ + --base-path /user/hive/warehouse/stock_ticks_cow \ + --database default \ + --table stock_ticks_cow \ + --partition-value-extractor org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor +..... +2020-01-25 19:51:28,953 INFO [main] hive.HiveSyncTool (HiveSyncTool.java:syncHoodieTable(129)) - Sync complete for stock_ticks_cow +..... + +# Now run hive-sync for the second data-set in HDFS using Merge-On-Read (MOR table type) +/var/hoodie/ws/hudi-sync/hudi-hive-sync/run_sync_tool.sh \ + --jdbc-url jdbc:hive2://hiveserver:10000 \ + --user hive \ + --pass hive \ + --partitioned-by dt \ + --base-path /user/hive/warehouse/stock_ticks_mor \ + --database default \ + --table stock_ticks_mor \ + --partition-value-extractor org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor +... +2020-01-25 19:51:51,066 INFO [main] hive.HiveSyncTool (HiveSyncTool.java:syncHoodieTable(129)) - Sync complete for stock_ticks_mor_ro +... +2020-01-25 19:51:51,569 INFO [main] hive.HiveSyncTool (HiveSyncTool.java:syncHoodieTable(129)) - Sync complete for stock_ticks_mor_rt +.... + +exit +``` +After executing the above command, you will notice: + +1. 
A hive table named `stock_ticks_cow` created which supports Snapshot and Incremental queries on Copy On Write table. +2. Two new tables `stock_ticks_mor_rt` and `stock_ticks_mor_ro` created for the Merge On Read table. The former +supports Snapshot and Incremental queries (providing near-real time data) while the later supports ReadOptimized queries. + + +### Step 4 (a): Run Hive Queries + +Run a hive query to find the latest timestamp ingested for stock symbol 'GOOG'. You will notice that both snapshot +(for both COW and MOR _rt table) and read-optimized queries (for MOR _ro table) give the same value "10:29 a.m" as Hudi create a +parquet file for the first batch of data. + +```java +docker exec -it adhoc-2 /bin/bash +beeline -u jdbc:hive2://hiveserver:10000 \ + --hiveconf hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat \ + --hiveconf hive.stats.autogather=false + +# List Tables +0: jdbc:hive2://hiveserver:10000> show tables; ++---------------------+--+ +| tab_name | ++---------------------+--+ +| stock_ticks_cow | +| stock_ticks_mor_ro | +| stock_ticks_mor_rt | ++---------------------+--+ +3 rows selected (1.199 seconds) +0: jdbc:hive2://hiveserver:10000> + + +# Look at partitions that were added +0: jdbc:hive2://hiveserver:10000> show partitions stock_ticks_mor_rt; ++----------------+--+ +| partition | ++----------------+--+ +| dt=2018-08-31 | ++----------------+--+ +1 row selected (0.24 seconds) + + +# COPY-ON-WRITE Queries: +========================= + + +0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_cow group by symbol HAVING symbol = 'GOOG'; ++---------+----------------------+--+ +| symbol | _c1 | ++---------+----------------------+--+ +| GOOG | 2018-08-31 10:29:00 | ++---------+----------------------+--+ + +Now, run a projection query: + +0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_cow where symbol = 'GOOG'; 
++----------------------+---------+----------------------+---------+------------+-----------+--+ +| _hoodie_commit_time | symbol | ts | volume | open | close | ++----------------------+---------+----------------------+---------+------------+-----------+--+ +| 20180924221953 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 | +| 20180924221953 | GOOG | 2018-08-31 10:29:00 | 3391 | 1230.1899 | 1230.085 | ++----------------------+---------+----------------------+---------+------------+-----------+--+ + + +# Merge-On-Read Queries: +========================== + +Lets run similar queries against M-O-R table. Lets look at both +ReadOptimized and Snapshot(realtime data) queries supported by M-O-R table + +# Run ReadOptimized Query. Notice that the latest timestamp is 10:29 +0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_mor_ro group by symbol HAVING symbol = 'GOOG'; +WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases. ++---------+----------------------+--+ +| symbol | _c1 | ++---------+----------------------+--+ +| GOOG | 2018-08-31 10:29:00 | ++---------+----------------------+--+ +1 row selected (6.326 seconds) + + +# Run Snapshot Query. Notice that the latest timestamp is again 10:29 + +0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_mor_rt group by symbol HAVING symbol = 'GOOG'; +WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases. 
++---------+----------------------+--+ +| symbol | _c1 | ++---------+----------------------+--+ +| GOOG | 2018-08-31 10:29:00 | ++---------+----------------------+--+ +1 row selected (1.606 seconds) + + +# Run Read Optimized and Snapshot project queries + +0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_ro where symbol = 'GOOG'; ++----------------------+---------+----------------------+---------+------------+-----------+--+ +| _hoodie_commit_time | symbol | ts | volume | open | close | ++----------------------+---------+----------------------+---------+------------+-----------+--+ +| 20180924222155 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 | +| 20180924222155 | GOOG | 2018-08-31 10:29:00 | 3391 | 1230.1899 | 1230.085 | ++----------------------+---------+----------------------+---------+------------+-----------+--+ + +0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_rt where symbol = 'GOOG'; ++----------------------+---------+----------------------+---------+------------+-----------+--+ +| _hoodie_commit_time | symbol | ts | volume | open | close | ++----------------------+---------+----------------------+---------+------------+-----------+--+ +| 20180924222155 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 | +| 20180924222155 | GOOG | 2018-08-31 10:29:00 | 3391 | 1230.1899 | 1230.085 | ++----------------------+---------+----------------------+---------+------------+-----------+--+ + +exit +``` + +### Step 4 (b): Run Spark-SQL Queries +Hudi support Spark as query processor just like Hive. 
Here are the same hive queries +running in spark-sql + +```java +docker exec -it adhoc-1 /bin/bash +$SPARK_INSTALL/bin/spark-shell \ + --jars $HUDI_SPARK_BUNDLE \ + --master local[2] \ + --driver-class-path $HADOOP_CONF_DIR \ + --conf spark.sql.hive.convertMetastoreParquet=false \ + --deploy-mode client \ + --driver-memory 1G \ + --executor-memory 3G \ + --num-executors 1 +... + +Welcome to + ____ __ + / __/__ ___ _____/ /__ + _\ \/ _ \/ _ `/ __/ '_/ + /___/ .__/\_,_/_/ /_/\_\ version 2.4.4 + /_/ + +Using Scala version 2.11.12 (OpenJDK 64-Bit Server VM, Java 1.8.0_212) +Type in expressions to have them evaluated. +Type :help for more information. + +scala> spark.sql("show tables").show(100, false) ++--------+------------------+-----------+ +|database|tableName |isTemporary| ++--------+------------------+-----------+ +|default |stock_ticks_cow |false | +|default |stock_ticks_mor_ro|false | +|default |stock_ticks_mor_rt|false | ++--------+------------------+-----------+ + +# Copy-On-Write Table + +## Run max timestamp query against COW table + +scala> spark.sql("select symbol, max(ts) from stock_ticks_cow group by symbol HAVING symbol = 'GOOG'").show(100, false) +[Stage 0:> (0 + 1) / 1]SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder". +SLF4J: Defaulting to no-operation (NOP) logger implementation +SLF4J: See http://www.slf4j.org/codes#StaticLoggerBinder for further details. 
++------+-------------------+ +|symbol|max(ts) | ++------+-------------------+ +|GOOG |2018-08-31 10:29:00| ++------+-------------------+ + +## Projection Query + +scala> spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_cow where symbol = 'GOOG'").show(100, false) ++-------------------+------+-------------------+------+---------+--------+ +|_hoodie_commit_time|symbol|ts |volume|open |close | ++-------------------+------+-------------------+------+---------+--------+ +|20180924221953 |GOOG |2018-08-31 09:59:00|6330 |1230.5 |1230.02 | +|20180924221953 |GOOG |2018-08-31 10:29:00|3391 |1230.1899|1230.085| ++-------------------+------+-------------------+------+---------+--------+ + +# Merge-On-Read Queries: +========================== + +Lets run similar queries against M-O-R table. Lets look at both +ReadOptimized and Snapshot queries supported by M-O-R table + +# Run ReadOptimized Query. Notice that the latest timestamp is 10:29 +scala> spark.sql("select symbol, max(ts) from stock_ticks_mor_ro group by symbol HAVING symbol = 'GOOG'").show(100, false) ++------+-------------------+ +|symbol|max(ts) | ++------+-------------------+ +|GOOG |2018-08-31 10:29:00| ++------+-------------------+ + + +# Run Snapshot Query. 
Notice that the latest timestamp is again 10:29 + +scala> spark.sql("select symbol, max(ts) from stock_ticks_mor_rt group by symbol HAVING symbol = 'GOOG'").show(100, false) ++------+-------------------+ +|symbol|max(ts) | ++------+-------------------+ +|GOOG |2018-08-31 10:29:00| ++------+-------------------+ + +# Run Read Optimized and Snapshot project queries + +scala> spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_ro where symbol = 'GOOG'").show(100, false) ++-------------------+------+-------------------+------+---------+--------+ +|_hoodie_commit_time|symbol|ts |volume|open |close | ++-------------------+------+-------------------+------+---------+--------+ +|20180924222155 |GOOG |2018-08-31 09:59:00|6330 |1230.5 |1230.02 | +|20180924222155 |GOOG |2018-08-31 10:29:00|3391 |1230.1899|1230.085| ++-------------------+------+-------------------+------+---------+--------+ + +scala> spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_rt where symbol = 'GOOG'").show(100, false) ++-------------------+------+-------------------+------+---------+--------+ +|_hoodie_commit_time|symbol|ts |volume|open |close | ++-------------------+------+-------------------+------+---------+--------+ +|20180924222155 |GOOG |2018-08-31 09:59:00|6330 |1230.5 |1230.02 | +|20180924222155 |GOOG |2018-08-31 10:29:00|3391 |1230.1899|1230.085| ++-------------------+------+-------------------+------+---------+--------+ +``` + +### Step 4 (c): Run Presto Queries + +Here are the Presto queries for similar Hive and Spark queries. + +:::note +
    +
  • Currently, Presto does not support snapshot or incremental queries on Hudi tables.
  • +
  • This section of the demo is not supported for Mac AArch64 users at this time.
  • +
+::: + +```java +docker exec -it presto-worker-1 presto --server presto-coordinator-1:8090 +presto> show catalogs; + Catalog +----------- + hive + jmx + localfile + system +(4 rows) + +Query 20190817_134851_00000_j8rcz, FINISHED, 1 node +Splits: 19 total, 19 done (100.00%) +0:04 [0 rows, 0B] [0 rows/s, 0B/s] + +presto> use hive.default; +USE +presto:default> show tables; + Table +-------------------- + stock_ticks_cow + stock_ticks_mor_ro + stock_ticks_mor_rt +(3 rows) + +Query 20190822_181000_00001_segyw, FINISHED, 2 nodes +Splits: 19 total, 19 done (100.00%) +0:05 [3 rows, 99B] [0 rows/s, 18B/s] + + +# COPY-ON-WRITE Queries: +========================= + + +presto:default> select symbol, max(ts) from stock_ticks_cow group by symbol HAVING symbol = 'GOOG'; + symbol | _col1 +--------+--------------------- + GOOG | 2018-08-31 10:29:00 +(1 row) + +Query 20190822_181011_00002_segyw, FINISHED, 1 node +Splits: 49 total, 49 done (100.00%) +0:12 [197 rows, 613B] [16 rows/s, 50B/s] + +presto:default> select "_hoodie_commit_time", symbol, ts, volume, open, close from stock_ticks_cow where symbol = 'GOOG'; + _hoodie_commit_time | symbol | ts | volume | open | close +---------------------+--------+---------------------+--------+-----------+---------- + 20190822180221 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 + 20190822180221 | GOOG | 2018-08-31 10:29:00 | 3391 | 1230.1899 | 1230.085 +(2 rows) + +Query 20190822_181141_00003_segyw, FINISHED, 1 node +Splits: 17 total, 17 done (100.00%) +0:02 [197 rows, 613B] [109 rows/s, 341B/s] + + +# Merge-On-Read Queries: +========================== + +Lets run similar queries against M-O-R table. + +# Run ReadOptimized Query. 
Notice that the latest timestamp is 10:29 + presto:default> select symbol, max(ts) from stock_ticks_mor_ro group by symbol HAVING symbol = 'GOOG'; + symbol | _col1 +--------+--------------------- + GOOG | 2018-08-31 10:29:00 +(1 row) + +Query 20190822_181158_00004_segyw, FINISHED, 1 node +Splits: 49 total, 49 done (100.00%) +0:02 [197 rows, 613B] [110 rows/s, 343B/s] + + +presto:default> select "_hoodie_commit_time", symbol, ts, volume, open, close from stock_ticks_mor_ro where symbol = 'GOOG'; + _hoodie_commit_time | symbol | ts | volume | open | close +---------------------+--------+---------------------+--------+-----------+---------- + 20190822180250 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 + 20190822180250 | GOOG | 2018-08-31 10:29:00 | 3391 | 1230.1899 | 1230.085 +(2 rows) + +Query 20190822_181256_00006_segyw, FINISHED, 1 node +Splits: 17 total, 17 done (100.00%) +0:02 [197 rows, 613B] [92 rows/s, 286B/s] + +presto:default> exit +``` + +### Step 4 (d): Run Trino Queries + +Here are the similar queries with Trino. +:::note +
    +
  • Currently, Trino does not support snapshot or incremental queries on Hudi tables.
  • +
  • This section of the demo is not supported for Mac AArch64 users at this time.
  • +
+::: + +```java +docker exec -it adhoc-2 trino --server trino-coordinator-1:8091 +trino> show catalogs; + Catalog +--------- + hive + system +(2 rows) + +Query 20220112_055038_00000_sac73, FINISHED, 1 node +Splits: 19 total, 19 done (100.00%) +3.74 [0 rows, 0B] [0 rows/s, 0B/s] + +trino> use hive.default; +USE +trino:default> show tables; + Table +-------------------- + stock_ticks_cow + stock_ticks_mor_ro + stock_ticks_mor_rt +(3 rows) + +Query 20220112_055050_00003_sac73, FINISHED, 2 nodes +Splits: 19 total, 19 done (100.00%) +1.84 [3 rows, 102B] [1 rows/s, 55B/s] + +# COPY-ON-WRITE Queries: +========================= + +trino:default> select symbol, max(ts) from stock_ticks_cow group by symbol HAVING symbol = 'GOOG'; + symbol | _col1 +--------+--------------------- + GOOG | 2018-08-31 10:29:00 +(1 row) + +Query 20220112_055101_00005_sac73, FINISHED, 1 node +Splits: 49 total, 49 done (100.00%) +4.08 [197 rows, 442KB] [48 rows/s, 108KB/s] + +trino:default> select "_hoodie_commit_time", symbol, ts, volume, open, close from stock_ticks_cow where symbol = 'GOOG'; + _hoodie_commit_time | symbol | ts | volume | open | close +---------------------+--------+---------------------+--------+-----------+---------- + 20220112054822108 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 + 20220112054822108 | GOOG | 2018-08-31 10:29:00 | 3391 | 1230.1899 | 1230.085 +(2 rows) + +Query 20220112_055113_00006_sac73, FINISHED, 1 node +Splits: 17 total, 17 done (100.00%) +0.40 [197 rows, 450KB] [487 rows/s, 1.09MB/s] + +# Merge-On-Read Queries: +========================== + +Lets run similar queries against MOR table. + +# Run ReadOptimized Query. 
Notice that the latest timestamp is 10:29 + +trino:default> select symbol, max(ts) from stock_ticks_mor_ro group by symbol HAVING symbol = 'GOOG'; + symbol | _col1 +--------+--------------------- + GOOG | 2018-08-31 10:29:00 +(1 row) + +Query 20220112_055125_00007_sac73, FINISHED, 1 node +Splits: 49 total, 49 done (100.00%) +0.50 [197 rows, 442KB] [395 rows/s, 888KB/s] + +trino:default> select "_hoodie_commit_time", symbol, ts, volume, open, close from stock_ticks_mor_ro where symbol = 'GOOG'; + _hoodie_commit_time | symbol | ts | volume | open | close +---------------------+--------+---------------------+--------+-----------+---------- + 20220112054844841 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 + 20220112054844841 | GOOG | 2018-08-31 10:29:00 | 3391 | 1230.1899 | 1230.085 +(2 rows) + +Query 20220112_055136_00008_sac73, FINISHED, 1 node +Splits: 17 total, 17 done (100.00%) +0.49 [197 rows, 450KB] [404 rows/s, 924KB/s] + +trino:default> exit +``` + +### Step 5: Upload second batch to Kafka and run Hudi Streamer to ingest + +Upload the second batch of data and ingest this batch using Hudi Streamer. 
As this batch does not bring in any new +partitions, there is no need to run hive-sync. + +```java +cat docker/demo/data/batch_2.json | kcat -b kafkabroker -t stock_ticks -P + +# Within Docker container, run the ingestion command +docker exec -it adhoc-2 /bin/bash + +# Run the following spark-submit command to execute the Hudi Streamer and ingest to stock_ticks_cow table in HDFS +spark-submit \ + --class org.apache.hudi.utilities.streamer.HoodieStreamer $HUDI_UTILITIES_BUNDLE \ + --table-type COPY_ON_WRITE \ + --source-class org.apache.hudi.utilities.sources.JsonKafkaSource \ + --source-ordering-field ts \ + --target-base-path /user/hive/warehouse/stock_ticks_cow \ + --target-table stock_ticks_cow \ + --props /var/demo/config/kafka-source.properties \ + --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider + +# Run the following spark-submit command to execute the Hudi Streamer and ingest to stock_ticks_mor table in HDFS +spark-submit \ + --class org.apache.hudi.utilities.streamer.HoodieStreamer $HUDI_UTILITIES_BUNDLE \ + --table-type MERGE_ON_READ \ + --source-class org.apache.hudi.utilities.sources.JsonKafkaSource \ + --source-ordering-field ts \ + --target-base-path /user/hive/warehouse/stock_ticks_mor \ + --target-table stock_ticks_mor \ + --props /var/demo/config/kafka-source.properties \ + --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider \ + --disable-compaction + +exit +``` + +With the Copy-On-Write table, the second ingestion by Hudi Streamer resulted in a new version of the Parquet file being created. +See `http://namenode:50070/explorer.html#/user/hive/warehouse/stock_ticks_cow/2018/08/31` + +With the Merge-On-Read table, the second ingestion merely appended the batch to an unmerged delta (log) file. 
+Take a look at the HDFS filesystem to get an idea: `http://namenode:50070/explorer.html#/user/hive/warehouse/stock_ticks_mor/2018/08/31` + +### Step 6 (a): Run Hive Queries + +With the Copy-On-Write table, the Snapshot query immediately sees the changes from the second batch once the batch +is committed, as each ingestion creates newer versions of the Parquet files. + +With the Merge-On-Read table, the second ingestion merely appended the batch to an unmerged delta (log) file. +This is the point at which ReadOptimized and Snapshot queries provide different results. The ReadOptimized query will still +return "10:29 am" as it only reads from the Parquet file. The Snapshot query will do an on-the-fly merge and return +the latest committed data, which is "10:59 am". + +```java +docker exec -it adhoc-2 /bin/bash +beeline -u jdbc:hive2://hiveserver:10000 \ + --hiveconf hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat \ + --hiveconf hive.stats.autogather=false + +# Copy On Write Table: + +0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_cow group by symbol HAVING symbol = 'GOOG'; +WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases. 
++---------+----------------------+--+ +| symbol | _c1 | ++---------+----------------------+--+ +| GOOG | 2018-08-31 10:59:00 | ++---------+----------------------+--+ +1 row selected (1.932 seconds) + +0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_cow where symbol = 'GOOG'; ++----------------------+---------+----------------------+---------+------------+-----------+--+ +| _hoodie_commit_time | symbol | ts | volume | open | close | ++----------------------+---------+----------------------+---------+------------+-----------+--+ +| 20180924221953 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 | +| 20180924224524 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 | ++----------------------+---------+----------------------+---------+------------+-----------+--+ + +As you can notice, the above queries now reflect the changes that came as part of ingesting second batch. + + +# Merge On Read Table: + +# Read Optimized Query +0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_mor_ro group by symbol HAVING symbol = 'GOOG'; +WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases. 
++---------+----------------------+--+ +| symbol | _c1 | ++---------+----------------------+--+ +| GOOG | 2018-08-31 10:29:00 | ++---------+----------------------+--+ +1 row selected (1.6 seconds) + +0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_ro where symbol = 'GOOG'; ++----------------------+---------+----------------------+---------+------------+-----------+--+ +| _hoodie_commit_time | symbol | ts | volume | open | close | ++----------------------+---------+----------------------+---------+------------+-----------+--+ +| 20180924222155 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 | +| 20180924222155 | GOOG | 2018-08-31 10:29:00 | 3391 | 1230.1899 | 1230.085 | ++----------------------+---------+----------------------+---------+------------+-----------+--+ + +# Snapshot Query +0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_mor_rt group by symbol HAVING symbol = 'GOOG'; +WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases. 
++---------+----------------------+--+ +| symbol | _c1 | ++---------+----------------------+--+ +| GOOG | 2018-08-31 10:59:00 | ++---------+----------------------+--+ + +0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_rt where symbol = 'GOOG'; ++----------------------+---------+----------------------+---------+------------+-----------+--+ +| _hoodie_commit_time | symbol | ts | volume | open | close | ++----------------------+---------+----------------------+---------+------------+-----------+--+ +| 20180924222155 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 | +| 20180924224537 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 | ++----------------------+---------+----------------------+---------+------------+-----------+--+ + +exit +``` + +### Step 6 (b): Run Spark SQL Queries + +Running the same queries in Spark-SQL: + +```java +docker exec -it adhoc-1 /bin/bash +$SPARK_INSTALL/bin/spark-shell \ + --jars $HUDI_SPARK_BUNDLE \ + --driver-class-path $HADOOP_CONF_DIR \ + --conf spark.sql.hive.convertMetastoreParquet=false \ + --deploy-mode client \ + --driver-memory 1G \ + --master local[2] \ + --executor-memory 3G \ + --num-executors 1 + +# Copy On Write Table: + +scala> spark.sql("select symbol, max(ts) from stock_ticks_cow group by symbol HAVING symbol = 'GOOG'").show(100, false) ++------+-------------------+ +|symbol|max(ts) | ++------+-------------------+ +|GOOG |2018-08-31 10:59:00| ++------+-------------------+ + +scala> spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_cow where symbol = 'GOOG'").show(100, false) + ++----------------------+---------+----------------------+---------+------------+-----------+--+ +| _hoodie_commit_time | symbol | ts | volume | open | close | ++----------------------+---------+----------------------+---------+------------+-----------+--+ +| 20180924221953 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 
1230.02 | +| 20180924224524 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 | ++----------------------+---------+----------------------+---------+------------+-----------+--+ + +As you can notice, the above queries now reflect the changes that came as part of ingesting second batch. + + +# Merge On Read Table: + +# Read Optimized Query +scala> spark.sql("select symbol, max(ts) from stock_ticks_mor_ro group by symbol HAVING symbol = 'GOOG'").show(100, false) ++---------+----------------------+ +| symbol | _c1 | ++---------+----------------------+ +| GOOG | 2018-08-31 10:29:00 | ++---------+----------------------+ +1 row selected (1.6 seconds) + +scala> spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_ro where symbol = 'GOOG'").show(100, false) ++----------------------+---------+----------------------+---------+------------+-----------+ +| _hoodie_commit_time | symbol | ts | volume | open | close | ++----------------------+---------+----------------------+---------+------------+-----------+ +| 20180924222155 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 | +| 20180924222155 | GOOG | 2018-08-31 10:29:00 | 3391 | 1230.1899 | 1230.085 | ++----------------------+---------+----------------------+---------+------------+-----------+ + +# Snapshot Query +scala> spark.sql("select symbol, max(ts) from stock_ticks_mor_rt group by symbol HAVING symbol = 'GOOG'").show(100, false) ++---------+----------------------+ +| symbol | _c1 | ++---------+----------------------+ +| GOOG | 2018-08-31 10:59:00 | ++---------+----------------------+ + +scala> spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_rt where symbol = 'GOOG'").show(100, false) ++----------------------+---------+----------------------+---------+------------+-----------+ +| _hoodie_commit_time | symbol | ts | volume | open | close | 
++----------------------+---------+----------------------+---------+------------+-----------+ +| 20180924222155 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 | +| 20180924224537 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 | ++----------------------+---------+----------------------+---------+------------+-----------+ + +exit +``` + +### Step 6 (c): Run Presto Queries + +Running the same queries on Presto for ReadOptimized queries. + +:::note +This section of the demo is not supported for Mac AArch64 users at this time. +::: + +```java +docker exec -it presto-worker-1 presto --server presto-coordinator-1:8090 +presto> use hive.default; +USE + +# Copy On Write Table: + +presto:default>select symbol, max(ts) from stock_ticks_cow group by symbol HAVING symbol = 'GOOG'; + symbol | _col1 +--------+--------------------- + GOOG | 2018-08-31 10:59:00 +(1 row) + +Query 20190822_181530_00007_segyw, FINISHED, 1 node +Splits: 49 total, 49 done (100.00%) +0:02 [197 rows, 613B] [125 rows/s, 389B/s] + +presto:default>select "_hoodie_commit_time", symbol, ts, volume, open, close from stock_ticks_cow where symbol = 'GOOG'; + _hoodie_commit_time | symbol | ts | volume | open | close +---------------------+--------+---------------------+--------+-----------+---------- + 20190822180221 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 + 20190822181433 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 +(2 rows) + +Query 20190822_181545_00008_segyw, FINISHED, 1 node +Splits: 17 total, 17 done (100.00%) +0:02 [197 rows, 613B] [106 rows/s, 332B/s] + +As you can notice, the above queries now reflect the changes that came as part of ingesting second batch. 
+ + +# Merge On Read Table: + +# Read Optimized Query +presto:default> select symbol, max(ts) from stock_ticks_mor_ro group by symbol HAVING symbol = 'GOOG'; + symbol | _col1 +--------+--------------------- + GOOG | 2018-08-31 10:29:00 +(1 row) + +Query 20190822_181602_00009_segyw, FINISHED, 1 node +Splits: 49 total, 49 done (100.00%) +0:01 [197 rows, 613B] [139 rows/s, 435B/s] + +presto:default>select "_hoodie_commit_time", symbol, ts, volume, open, close from stock_ticks_mor_ro where symbol = 'GOOG'; + _hoodie_commit_time | symbol | ts | volume | open | close +---------------------+--------+---------------------+--------+-----------+---------- + 20190822180250 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 + 20190822180250 | GOOG | 2018-08-31 10:29:00 | 3391 | 1230.1899 | 1230.085 +(2 rows) + +Query 20190822_181615_00010_segyw, FINISHED, 1 node +Splits: 17 total, 17 done (100.00%) +0:01 [197 rows, 613B] [154 rows/s, 480B/s] + +presto:default> exit +``` + +### Step 6 (d): Run Trino Queries + +Running the same queries on Trino for Read-Optimized queries. + +:::note +This section of the demo is not supported for Mac AArch64 users at this time. 
+::: + +```java +docker exec -it adhoc-2 trino --server trino-coordinator-1:8091 +trino> use hive.default; +USE + +# Copy On Write Table: + +trino:default> select symbol, max(ts) from stock_ticks_cow group by symbol HAVING symbol = 'GOOG'; + symbol | _col1 +--------+--------------------- + GOOG | 2018-08-31 10:59:00 +(1 row) + +Query 20220112_055443_00012_sac73, FINISHED, 1 node +Splits: 49 total, 49 done (100.00%) +0.63 [197 rows, 442KB] [310 rows/s, 697KB/s] + +trino:default> select "_hoodie_commit_time", symbol, ts, volume, open, close from stock_ticks_cow where symbol = 'GOOG'; + _hoodie_commit_time | symbol | ts | volume | open | close +---------------------+--------+---------------------+--------+-----------+---------- + 20220112054822108 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 + 20220112055352654 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 +(2 rows) + +Query 20220112_055450_00013_sac73, FINISHED, 1 node +Splits: 17 total, 17 done (100.00%) +0.65 [197 rows, 450KB] [303 rows/s, 692KB/s] + +As you can notice, the above queries now reflect the changes that came as part of ingesting second batch. 
+ +# Merge On Read Table: +# Read Optimized Query + +trino:default> select symbol, max(ts) from stock_ticks_mor_ro group by symbol HAVING symbol = 'GOOG'; + symbol | _col1 +--------+--------------------- + GOOG | 2018-08-31 10:29:00 +(1 row) + +Query 20220112_055500_00014_sac73, FINISHED, 1 node +Splits: 49 total, 49 done (100.00%) +0.59 [197 rows, 442KB] [336 rows/s, 756KB/s] + +trino:default> select "_hoodie_commit_time", symbol, ts, volume, open, close from stock_ticks_mor_ro where symbol = 'GOOG'; + _hoodie_commit_time | symbol | ts | volume | open | close +---------------------+--------+---------------------+--------+-----------+---------- + 20220112054844841 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 + 20220112054844841 | GOOG | 2018-08-31 10:29:00 | 3391 | 1230.1899 | 1230.085 +(2 rows) + +Query 20220112_055506_00015_sac73, FINISHED, 1 node +Splits: 17 total, 17 done (100.00%) +0.35 [197 rows, 450KB] [556 rows/s, 1.24MB/s] + +trino:default> exit +``` + +### Step 7 (a): Incremental Query for COPY-ON-WRITE Table + +With 2 batches of data ingested, lets showcase the support for incremental queries in Hudi Copy-On-Write tables + +Lets take the same projection query example + +```java +docker exec -it adhoc-2 /bin/bash +beeline -u jdbc:hive2://hiveserver:10000 \ + --hiveconf hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat \ + --hiveconf hive.stats.autogather=false + +0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_cow where symbol = 'GOOG'; ++----------------------+---------+----------------------+---------+------------+-----------+--+ +| _hoodie_commit_time | symbol | ts | volume | open | close | ++----------------------+---------+----------------------+---------+------------+-----------+--+ +| 20180924064621 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 | +| 20180924065039 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 | 
++----------------------+---------+----------------------+---------+------------+-----------+--+ +``` + +As you notice from the above queries, there are 2 commits - 20180924064621 and 20180924065039 in timeline order. +When you follow the steps, you will be getting different timestamps for commits. Substitute them +in place of the above timestamps. + +To show the effects of an incremental query, let us assume that a reader has already seen the changes as part of +ingesting the first batch. Now, for the reader to see the effect of the second batch, they have to set the start timestamp to +the commit time of the first batch (20180924064621) and run an incremental query. + +Hudi incremental mode provides efficient scanning for incremental queries by filtering out files that do not have any +candidate rows using hudi-managed metadata. + +```java +docker exec -it adhoc-2 /bin/bash +beeline -u jdbc:hive2://hiveserver:10000 \ + --hiveconf hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat \ + --hiveconf hive.stats.autogather=false + +0: jdbc:hive2://hiveserver:10000> set hoodie.stock_ticks_cow.consume.mode=INCREMENTAL; +No rows affected (0.009 seconds) +0: jdbc:hive2://hiveserver:10000> set hoodie.stock_ticks_cow.consume.max.commits=3; +No rows affected (0.009 seconds) +0: jdbc:hive2://hiveserver:10000> set hoodie.stock_ticks_cow.consume.start.timestamp=20180924064621; +``` + +With the above setting, file-ids that do not have any updates from the commit 20180924065039 are filtered out without scanning. 
+Here is the incremental query: + +```java +0: jdbc:hive2://hiveserver:10000> +0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_cow where symbol = 'GOOG' and `_hoodie_commit_time` > '20180924064621'; ++----------------------+---------+----------------------+---------+------------+-----------+--+ +| _hoodie_commit_time | symbol | ts | volume | open | close | ++----------------------+---------+----------------------+---------+------------+-----------+--+ +| 20180924065039 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 | ++----------------------+---------+----------------------+---------+------------+-----------+--+ +1 row selected (0.83 seconds) +0: jdbc:hive2://hiveserver:10000> +``` + +### Step 7 (b): Incremental Query with Spark SQL: + +```java +docker exec -it adhoc-1 /bin/bash +$SPARK_INSTALL/bin/spark-shell \ + --jars $HUDI_SPARK_BUNDLE \ + --driver-class-path $HADOOP_CONF_DIR \ + --conf spark.sql.hive.convertMetastoreParquet=false \ + --deploy-mode client \ + --driver-memory 1G \ + --master local[2] \ + --executor-memory 3G \ + --num-executors 1 + +Welcome to + ____ __ + / __/__ ___ _____/ /__ + _\ \/ _ \/ _ `/ __/ '_/ + /___/ .__/\_,_/_/ /_/\_\ version 2.4.4 + /_/ + +Using Scala version 2.11.12 (OpenJDK 64-Bit Server VM, Java 1.8.0_212) +Type in expressions to have them evaluated. +Type :help for more information. + +scala> import org.apache.hudi.DataSourceReadOptions +import org.apache.hudi.DataSourceReadOptions + +# In the below query, 20180924064621 is the first commit's timestamp +scala> val hoodieIncViewDF = spark.read.format("org.apache.hudi").option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL).option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY, "20180924064621").load("/user/hive/warehouse/stock_ticks_cow") +SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder". 
+SLF4J: Defaulting to no-operation (NOP) logger implementation +SLF4J: See http://www.slf4j.org/codes#StaticLoggerBinder for further details. +hoodieIncViewDF: org.apache.spark.sql.DataFrame = [_hoodie_commit_time: string, _hoodie_commit_seqno: string ... 15 more fields] + +scala> hoodieIncViewDF.registerTempTable("stock_ticks_cow_incr_tmp1") +warning: there was one deprecation warning; re-run with -deprecation for details + +scala> spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_cow_incr_tmp1 where symbol = 'GOOG'").show(100, false); ++----------------------+---------+----------------------+---------+------------+-----------+ +| _hoodie_commit_time | symbol | ts | volume | open | close | ++----------------------+---------+----------------------+---------+------------+-----------+ +| 20180924065039 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 | ++----------------------+---------+----------------------+---------+------------+-----------+ +``` + +### Step 8: Schedule and Run Compaction for Merge-On-Read table + +Lets schedule and run a compaction to create a new version of columnar file so that read-optimized readers will see fresher data. +Again, You can use Hudi CLI to manually schedule and run compaction + +```java +docker exec -it adhoc-1 /bin/bash +root@adhoc-1:/opt# /var/hoodie/ws/hudi-cli/hudi-cli.sh +... +Table command getting loaded +HoodieSplashScreen loaded +=================================================================== +* ___ ___ * +* /\__\ ___ /\ \ ___ * +* / / / /\__\ / \ \ /\ \ * +* / /__/ / / / / /\ \ \ \ \ \ * +* / \ \ ___ / / / / / \ \__\ / \__\ * +* / /\ \ /\__\ / /__/ ___ / /__/ \ |__| / /\/__/ * +* \/ \ \/ / / \ \ \ /\__\ \ \ \ / / / /\/ / / * +* \ / / \ \ / / / \ \ / / / \ /__/ * +* / / / \ \/ / / \ \/ / / \ \__\ * +* / / / \ / / \ / / \/__/ * +* \/__/ \/__/ \/__/ Apache Hudi CLI * +* * +=================================================================== + +Welcome to Apache Hudi CLI. 
Please type help if you are looking for help. +hudi->connect --path /user/hive/warehouse/stock_ticks_mor +18/09/24 06:59:34 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable +18/09/24 06:59:35 INFO table.HoodieTableMetaClient: Loading HoodieTableMetaClient from /user/hive/warehouse/stock_ticks_mor +18/09/24 06:59:35 INFO util.FSUtils: Hadoop Configuration: fs.defaultFS: [hdfs://namenode:8020], Config:[Configuration: core-default.xml, core-site.xml, mapred-default.xml, mapred-site.xml, yarn-default.xml, yarn-site.xml, hdfs-default.xml, hdfs-site.xml], FileSystem: [DFS[DFSClient[clientName=DFSClient_NONMAPREDUCE_-1261652683_11, ugi=root (auth:SIMPLE)]]] +18/09/24 06:59:35 INFO table.HoodieTableConfig: Loading table properties from /user/hive/warehouse/stock_ticks_mor/.hoodie/hoodie.properties +18/09/24 06:59:36 INFO table.HoodieTableMetaClient: Finished Loading Table of type MERGE_ON_READ(version=1) from /user/hive/warehouse/stock_ticks_mor +Metadata for table stock_ticks_mor loaded +hoodie:stock_ticks_mor->compactions show all +20/02/10 03:41:32 INFO timeline.HoodieActiveTimeline: Loaded instants [[20200210015059__clean__COMPLETED], [20200210015059__deltacommit__COMPLETED], [20200210022758__clean__COMPLETED], [20200210022758__deltacommit__COMPLETED], [==>20200210023843__compaction__REQUESTED]] +___________________________________________________________________ +| Compaction Instant Time| State | Total FileIds to be Compacted| +|==================================================================| + +# Schedule a compaction. This will use Spark Launcher to schedule compaction +hoodie:stock_ticks_mor->compaction schedule --hoodieConfigs hoodie.compact.inline.max.delta.commits=1 +.... +Compaction successfully completed for 20180924070031 + +# Now refresh and check again. 
You will see that there is a new compaction requested + +hoodie:stock_ticks_mor->refresh +18/09/24 07:01:16 INFO table.HoodieTableMetaClient: Loading HoodieTableMetaClient from /user/hive/warehouse/stock_ticks_mor +18/09/24 07:01:16 INFO table.HoodieTableConfig: Loading table properties from /user/hive/warehouse/stock_ticks_mor/.hoodie/hoodie.properties +18/09/24 07:01:16 INFO table.HoodieTableMetaClient: Finished Loading Table of type MERGE_ON_READ(version=1) from /user/hive/warehouse/stock_ticks_mor +Metadata for table stock_ticks_mor loaded + +hoodie:stock_ticks_mor->compactions show all +18/09/24 06:34:12 INFO timeline.HoodieActiveTimeline: Loaded instants [[20180924041125__clean__COMPLETED], [20180924041125__deltacommit__COMPLETED], [20180924042735__clean__COMPLETED], [20180924042735__deltacommit__COMPLETED], [==>20180924063245__compaction__REQUESTED]] +___________________________________________________________________ +| Compaction Instant Time| State | Total FileIds to be Compacted| +|==================================================================| +| 20180924070031 | REQUESTED| 1 | + +# Execute the compaction. The compaction instant value passed below must be the one displayed in the above "compactions show all" query +hoodie:stock_ticks_mor->compaction run --compactionInstant 20180924070031 --parallelism 2 --sparkMemory 1G --schemaFilePath /var/demo/config/schema.avsc --retry 1 +.... 
+Compaction successfully completed for 20180924070031 + +## Now check if compaction is completed + +hoodie:stock_ticks_mor->refresh +18/09/24 07:03:00 INFO table.HoodieTableMetaClient: Loading HoodieTableMetaClient from /user/hive/warehouse/stock_ticks_mor +18/09/24 07:03:00 INFO table.HoodieTableConfig: Loading table properties from /user/hive/warehouse/stock_ticks_mor/.hoodie/hoodie.properties +18/09/24 07:03:00 INFO table.HoodieTableMetaClient: Finished Loading Table of type MERGE_ON_READ(version=1) from /user/hive/warehouse/stock_ticks_mor +Metadata for table stock_ticks_mor loaded + +hoodie:stock_ticks_mor->compactions show all +18/09/24 07:03:15 INFO timeline.HoodieActiveTimeline: Loaded instants [[20180924064636__clean__COMPLETED], [20180924064636__deltacommit__COMPLETED], [20180924065057__clean__COMPLETED], [20180924065057__deltacommit__COMPLETED], [20180924070031__commit__COMPLETED]] +___________________________________________________________________ +| Compaction Instant Time| State | Total FileIds to be Compacted| +|==================================================================| +| 20180924070031 | COMPLETED| 1 | + +``` + +### Step 9: Run Hive Queries including incremental queries + +You will see that both ReadOptimized and Snapshot queries will show the latest committed data. +Let's also run the incremental query for the MOR table. +From looking at the below query output, it will be clear that the first commit time for the MOR table is 20180924064636 +and the second commit time is 20180924070031. + +```java +docker exec -it adhoc-2 /bin/bash +beeline -u jdbc:hive2://hiveserver:10000 \ + --hiveconf hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat \ + --hiveconf hive.stats.autogather=false + +# Read Optimized Query +0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_mor_ro group by symbol HAVING symbol = 'GOOG'; +WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. 
Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases. ++---------+----------------------+--+ +| symbol | _c1 | ++---------+----------------------+--+ +| GOOG | 2018-08-31 10:59:00 | ++---------+----------------------+--+ +1 row selected (1.6 seconds) + +0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_ro where symbol = 'GOOG'; ++----------------------+---------+----------------------+---------+------------+-----------+--+ +| _hoodie_commit_time | symbol | ts | volume | open | close | ++----------------------+---------+----------------------+---------+------------+-----------+--+ +| 20180924064636 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 | +| 20180924070031 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 | ++----------------------+---------+----------------------+---------+------------+-----------+--+ + +# Snapshot Query +0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_mor_rt group by symbol HAVING symbol = 'GOOG'; +WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases. 
++---------+----------------------+--+ +| symbol | _c1 | ++---------+----------------------+--+ +| GOOG | 2018-08-31 10:59:00 | ++---------+----------------------+--+ + +0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_rt where symbol = 'GOOG'; ++----------------------+---------+----------------------+---------+------------+-----------+--+ +| _hoodie_commit_time | symbol | ts | volume | open | close | ++----------------------+---------+----------------------+---------+------------+-----------+--+ +| 20180924064636 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 | +| 20180924070031 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 | ++----------------------+---------+----------------------+---------+------------+-----------+--+ + +# Incremental Query: + +0: jdbc:hive2://hiveserver:10000> set hoodie.stock_ticks_mor.consume.mode=INCREMENTAL; +No rows affected (0.008 seconds) +# Max-Commits covers both second batch and compaction commit +0: jdbc:hive2://hiveserver:10000> set hoodie.stock_ticks_mor.consume.max.commits=3; +No rows affected (0.007 seconds) +0: jdbc:hive2://hiveserver:10000> set hoodie.stock_ticks_mor.consume.start.timestamp=20180924064636; +No rows affected (0.013 seconds) +# Query: +0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_ro where symbol = 'GOOG' and `_hoodie_commit_time` > '20180924064636'; ++----------------------+---------+----------------------+---------+------------+-----------+--+ +| _hoodie_commit_time | symbol | ts | volume | open | close | ++----------------------+---------+----------------------+---------+------------+-----------+--+ +| 20180924070031 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 | ++----------------------+---------+----------------------+---------+------------+-----------+--+ + +exit +``` + +### Step 10: Read Optimized and Snapshot queries for MOR with 
Spark-SQL after compaction + +```java +docker exec -it adhoc-1 /bin/bash +$SPARK_INSTALL/bin/spark-shell \ + --jars $HUDI_SPARK_BUNDLE \ + --driver-class-path $HADOOP_CONF_DIR \ + --conf spark.sql.hive.convertMetastoreParquet=false \ + --deploy-mode client \ + --driver-memory 1G \ + --master local[2] \ + --executor-memory 3G \ + --num-executors 1 + +# Read Optimized Query +scala> spark.sql("select symbol, max(ts) from stock_ticks_mor_ro group by symbol HAVING symbol = 'GOOG'").show(100, false) ++---------+----------------------+ +| symbol | max(ts) | ++---------+----------------------+ +| GOOG | 2018-08-31 10:59:00 | ++---------+----------------------+ +1 row selected (1.6 seconds) + +scala> spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_ro where symbol = 'GOOG'").show(100, false) ++----------------------+---------+----------------------+---------+------------+-----------+ +| _hoodie_commit_time | symbol | ts | volume | open | close | ++----------------------+---------+----------------------+---------+------------+-----------+ +| 20180924064636 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 | +| 20180924070031 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 | ++----------------------+---------+----------------------+---------+------------+-----------+ + +# Snapshot Query +scala> spark.sql("select symbol, max(ts) from stock_ticks_mor_rt group by symbol HAVING symbol = 'GOOG'").show(100, false) ++---------+----------------------+ +| symbol | max(ts) | ++---------+----------------------+ +| GOOG | 2018-08-31 10:59:00 | ++---------+----------------------+ + +scala> spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_rt where symbol = 'GOOG'").show(100, false) ++----------------------+---------+----------------------+---------+------------+-----------+ +| _hoodie_commit_time | symbol | ts | volume | open | close | 
++----------------------+---------+----------------------+---------+------------+-----------+ +| 20180924064636 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 | +| 20180924070031 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 | ++----------------------+---------+----------------------+---------+------------+-----------+ +``` + +### Step 11: Presto Read Optimized queries on MOR table after compaction +:::note +This section of the demo is not supported for Mac AArch64 users at this time. +::: + +```java +docker exec -it presto-worker-1 presto --server presto-coordinator-1:8090 +presto> use hive.default; +USE + +# Read Optimized Query +presto:default> select symbol, max(ts) from stock_ticks_mor_ro group by symbol HAVING symbol = 'GOOG'; + symbol | _col1 +--------+--------------------- + GOOG | 2018-08-31 10:59:00 +(1 row) + +Query 20190822_182319_00011_segyw, FINISHED, 1 node +Splits: 49 total, 49 done (100.00%) +0:01 [197 rows, 613B] [133 rows/s, 414B/s] + +presto:default> select "_hoodie_commit_time", symbol, ts, volume, open, close from stock_ticks_mor_ro where symbol = 'GOOG'; + _hoodie_commit_time | symbol | ts | volume | open | close +---------------------+--------+---------------------+--------+-----------+---------- + 20190822180250 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 + 20190822181944 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 +(2 rows) + +Query 20190822_182333_00012_segyw, FINISHED, 1 node +Splits: 17 total, 17 done (100.00%) +0:02 [197 rows, 613B] [98 rows/s, 307B/s] + +presto:default> +``` + + +This brings the demo to an end. + +## Testing Hudi in Local Docker environment + +You can bring up a Hadoop Docker environment containing Hadoop, Hive and Spark services with support for Hudi. +```java +$ mvn pre-integration-test -DskipTests +``` +The above command builds Docker images for all the services with +current Hudi source installed at /var/hoodie/ws and also brings up the services using a compose file.
We +currently use Hadoop (v2.8.4), Hive (v2.3.3) and Spark (v2.4.4) in Docker images. + +To bring down the containers +```java +$ cd hudi-integ-test +$ mvn docker-compose:down +``` + +If you want to bring up the Docker containers, use +```java +$ cd hudi-integ-test +$ mvn docker-compose:up -DdetachedMode=true +``` + +Hudi is a library that is operated in a broader data analytics/ingestion environment +involving Hadoop, Hive and Spark. Interoperability with all these systems is a key objective for us. We are +actively adding integration-tests under __hudi-integ-test/src/test/java__ that makes use of this +docker environment (See __hudi-integ-test/src/test/java/org/apache/hudi/integ/ITTestHoodieSanity.java__ ) + + +### Building Local Docker Containers: + +The Docker images required for demo and running integration test are already in docker-hub. The Docker images +and compose scripts are carefully implemented so that they serve dual-purpose + +1. The Docker images have inbuilt Hudi jar files with environment variable pointing to those jars (HUDI_HADOOP_BUNDLE, ...) +2. For running integration-tests, we need the jars generated locally to be used for running services within docker. The + docker-compose scripts (see `docker/compose/docker-compose_hadoop284_hive233_spark244.yml`) ensures local jars override + inbuilt jars by mounting local Hudi workspace over the Docker location +3. As these Docker containers have mounted local Hudi workspace, any changes that happen in the workspace would automatically + reflect in the containers. This is a convenient way for developing and verifying Hudi for + developers who do not own a distributed environment. Note that this is how integration tests are run. + +This helps avoid maintaining separate Docker images and avoids the costly step of building Hudi Docker images locally. 
+But if users want to test Hudi from locations with lower network bandwidth, they can still build local images +run the script +`docker/build_local_docker_images.sh` to build local Docker images before running `docker/setup_demo.sh` + +Here are the commands: + +```java +cd docker +./build_local_docker_images.sh +..... + +[INFO] Reactor Summary: +[INFO] +[INFO] Hudi ............................................... SUCCESS [ 2.507 s] +[INFO] hudi-common ........................................ SUCCESS [ 15.181 s] +[INFO] hudi-aws ........................................... SUCCESS [ 2.621 s] +[INFO] hudi-timeline-service .............................. SUCCESS [ 1.811 s] +[INFO] hudi-client ........................................ SUCCESS [ 0.065 s] +[INFO] hudi-client-common ................................. SUCCESS [ 8.308 s] +[INFO] hudi-hadoop-mr ..................................... SUCCESS [ 3.733 s] +[INFO] hudi-spark-client .................................. SUCCESS [ 18.567 s] +[INFO] hudi-sync-common ................................... SUCCESS [ 0.794 s] +[INFO] hudi-hive-sync ..................................... SUCCESS [ 3.691 s] +[INFO] hudi-spark-datasource .............................. SUCCESS [ 0.121 s] +[INFO] hudi-spark-common_2.11 ............................. SUCCESS [ 12.979 s] +[INFO] hudi-spark2_2.11 ................................... SUCCESS [ 12.516 s] +[INFO] hudi-spark_2.11 .................................... SUCCESS [ 35.649 s] +[INFO] hudi-utilities_2.11 ................................ SUCCESS [ 5.881 s] +[INFO] hudi-utilities-bundle_2.11 ......................... SUCCESS [ 12.661 s] +[INFO] hudi-cli ........................................... SUCCESS [ 19.858 s] +[INFO] hudi-java-client ................................... SUCCESS [ 3.221 s] +[INFO] hudi-flink-client .................................. SUCCESS [ 5.731 s] +[INFO] hudi-spark3_2.12 ................................... 
SUCCESS [ 8.627 s] +[INFO] hudi-dla-sync ...................................... SUCCESS [ 1.459 s] +[INFO] hudi-sync .......................................... SUCCESS [ 0.053 s] +[INFO] hudi-hadoop-mr-bundle .............................. SUCCESS [ 5.652 s] +[INFO] hudi-hive-sync-bundle .............................. SUCCESS [ 1.623 s] +[INFO] hudi-spark-bundle_2.11 ............................. SUCCESS [ 10.930 s] +[INFO] hudi-presto-bundle ................................. SUCCESS [ 3.652 s] +[INFO] hudi-timeline-server-bundle ........................ SUCCESS [ 4.804 s] +[INFO] hudi-trino-bundle .................................. SUCCESS [ 5.991 s] +[INFO] hudi-hadoop-docker ................................. SUCCESS [ 2.061 s] +[INFO] hudi-hadoop-base-docker ............................ SUCCESS [ 53.372 s] +[INFO] hudi-hadoop-base-java11-docker ..................... SUCCESS [ 48.545 s] +[INFO] hudi-hadoop-namenode-docker ........................ SUCCESS [ 6.098 s] +[INFO] hudi-hadoop-datanode-docker ........................ SUCCESS [ 4.825 s] +[INFO] hudi-hadoop-history-docker ......................... SUCCESS [ 3.829 s] +[INFO] hudi-hadoop-hive-docker ............................ SUCCESS [ 52.660 s] +[INFO] hudi-hadoop-sparkbase-docker ....................... SUCCESS [01:02 min] +[INFO] hudi-hadoop-sparkmaster-docker ..................... SUCCESS [ 12.661 s] +[INFO] hudi-hadoop-sparkworker-docker ..................... SUCCESS [ 4.350 s] +[INFO] hudi-hadoop-sparkadhoc-docker ...................... SUCCESS [ 59.083 s] +[INFO] hudi-hadoop-presto-docker .......................... SUCCESS [01:31 min] +[INFO] hudi-hadoop-trinobase-docker ....................... SUCCESS [02:40 min] +[INFO] hudi-hadoop-trinocoordinator-docker ................ SUCCESS [ 14.003 s] +[INFO] hudi-hadoop-trinoworker-docker ..................... SUCCESS [ 12.100 s] +[INFO] hudi-integ-test .................................... 
SUCCESS [ 13.581 s] +[INFO] hudi-integ-test-bundle ............................. SUCCESS [ 27.212 s] +[INFO] hudi-examples ...................................... SUCCESS [ 8.090 s] +[INFO] hudi-flink_2.11 .................................... SUCCESS [ 4.217 s] +[INFO] hudi-kafka-connect ................................. SUCCESS [ 2.966 s] +[INFO] hudi-flink-bundle_2.11 ............................. SUCCESS [ 11.155 s] +[INFO] hudi-kafka-connect-bundle .......................... SUCCESS [ 12.369 s] +[INFO] ------------------------------------------------------------------------ +[INFO] BUILD SUCCESS +[INFO] ------------------------------------------------------------------------ +[INFO] Total time: 14:35 min +[INFO] Finished at: 2022-01-12T18:41:27-08:00 +[INFO] ------------------------------------------------------------------------ +``` diff --git a/website/versioned_docs/version-1.0.0/encryption.md b/website/versioned_docs/version-1.0.0/encryption.md new file mode 100644 index 0000000000000..9bce5d646a48d --- /dev/null +++ b/website/versioned_docs/version-1.0.0/encryption.md @@ -0,0 +1,73 @@ +--- +title: Encryption +keywords: [ hudi, security ] +summary: This section offers an overview of encryption feature in Hudi +toc: true +last_modified_at: 2022-02-14T15:59:57-04:00 +--- + +Since Hudi 0.11.0, Spark 3.2 support has been added and accompanying that, Parquet 1.12 has been included, which brings encryption feature to Hudi. In this section, we will show a guide on how to enable encryption in Hudi tables. + +## Encrypt Copy-on-Write tables + +First, make sure Hudi Spark 3.2 bundle jar is used. + +Then, set the following Parquet configurations to make data written to Hudi COW tables encrypted. 
+ +```java +// Activate Parquet encryption, driven by Hadoop properties +jsc.hadoopConfiguration().set("parquet.crypto.factory.class", "org.apache.parquet.crypto.keytools.PropertiesDrivenCryptoFactory") +// Explicit master keys (base64 encoded) - required only for mock InMemoryKMS +jsc.hadoopConfiguration().set("parquet.encryption.kms.client.class" , "org.apache.parquet.crypto.keytools.mocks.InMemoryKMS") +jsc.hadoopConfiguration().set("parquet.encryption.key.list", "k1:AAECAwQFBgcICQoLDA0ODw==, k2:AAECAAECAAECAAECAAECAA==") +// Write encrypted dataframe files. +// Column "rider" will be protected with master key "k2". +// Parquet file footers will be protected with master key "k1". +jsc.hadoopConfiguration().set("parquet.encryption.footer.key", "k1") +jsc.hadoopConfiguration().set("parquet.encryption.column.keys", "k2:rider") + +spark.read().format("org.apache.hudi").load("path").show(); +``` + +Here is an example. + +```java +JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext()); +jsc.hadoopConfiguration().set("parquet.crypto.factory.class", "org.apache.parquet.crypto.keytools.PropertiesDrivenCryptoFactory"); +jsc.hadoopConfiguration().set("parquet.encryption.kms.client.class" , "org.apache.parquet.crypto.keytools.mocks.InMemoryKMS"); +jsc.hadoopConfiguration().set("parquet.encryption.footer.key", "k1"); +jsc.hadoopConfiguration().set("parquet.encryption.column.keys", "k2:rider"); +jsc.hadoopConfiguration().set("parquet.encryption.key.list", "k1:AAECAwQFBgcICQoLDA0ODw==, k2:AAECAAECAAECAAECAAECAA=="); + +QuickstartUtils.DataGenerator dataGen = new QuickstartUtils.DataGenerator(); +List inserts = convertToStringList(dataGen.generateInserts(3)); +Dataset inputDF1 = spark.read().json(jsc.parallelize(inserts, 1)); +inputDF1.write().format("org.apache.hudi") + .option("hoodie.table.name", "encryption_table") + .option("hoodie.upsert.shuffle.parallelism","2") + .option("hoodie.insert.shuffle.parallelism","2") +
.option("hoodie.delete.shuffle.parallelism","2") + .option("hoodie.bulkinsert.shuffle.parallelism","2") + .mode(SaveMode.Overwrite) + .save("path"); + +spark.read().format("org.apache.hudi").load("path").select("rider").show(); +``` + +Reading the table works if configured correctly + +``` ++---------+ +|rider | ++---------+ +|rider-213| +|rider-213| +|rider-213| ++---------+ +``` + +Read more from [Spark docs](https://spark.apache.org/docs/latest/sql-data-sources-parquet.html#columnar-encryption) and [Parquet docs](https://github.com/apache/parquet-format/blob/master/Encryption.md). + +### Note + +This feature is currently only available for COW tables due to only Parquet base files present there. diff --git a/website/versioned_docs/version-1.0.0/faq.md b/website/versioned_docs/version-1.0.0/faq.md new file mode 100644 index 0000000000000..26c3eb50d2147 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/faq.md @@ -0,0 +1,15 @@ +--- +title: Overview +keywords: [hudi, writing, reading] +--- +# Overview + +The FAQs are split into following pages. Please refer to the specific pages for more info. + +- [General](faq_general) +- [Design & Concepts](faq_design_and_concepts) +- [Writing Tables](faq_writing_tables) +- [Reading Tables](faq_reading_tables) +- [Table Services](faq_table_services) +- [Storage](faq_storage) +- [Integrations](faq_integrations) diff --git a/website/versioned_docs/version-1.0.0/faq_design_and_concepts.md b/website/versioned_docs/version-1.0.0/faq_design_and_concepts.md new file mode 100644 index 0000000000000..c0fd9d105b38d --- /dev/null +++ b/website/versioned_docs/version-1.0.0/faq_design_and_concepts.md @@ -0,0 +1,61 @@ +--- +title: Design & Concepts +keywords: [hudi, writing, reading] +--- +# Design & Concepts FAQ + +### How does Hudi ensure atomicity? 
+ +Hudi writers atomically move an inflight write operation to a "completed" state by writing an object/file to the [timeline](timeline) folder, identifying the write operation with an instant time that denotes the time the action is deemed to have occurred. This is achieved on the underlying DFS (in the case of S3/Cloud Storage, by an atomic PUT operation) and can be observed by files of the pattern `<instant>.<action>.<state>` in Hudi’s timeline. + +### Does Hudi extend the Hive table layout? + +Hudi is very different from Hive in important aspects described below. However, based on practical considerations, it chooses to be compatible with Hive table layout by adopting partitioning, schema evolution and being queryable through Hive query engine. Here are the key aspects where Hudi differs: + +* Unlike Hive, Hudi does not remove the partition columns from the data files. Hudi in fact adds record level [meta fields](/tech-specs#meta-fields) including instant time, primary record key, and partition path to the data to support efficient upserts and [incremental queries/ETL](/learn/use_cases/#incremental-processing-pipelines). Hudi tables can be non-partitioned and the Hudi metadata table adds rich indexes on Hudi tables which are beyond simple Hive extensions. +* Hive advocates partitioning as the main remedy for most performance-based issues. Features like partition evolution and hidden partitioning are primarily based on this Hive based principle of partitioning and aim to tackle the metadata problem partially. Whereas, Hudi biases to coarse-grained partitioning and emphasizes [clustering](/docs/clustering) for more fine-grained partitioning. Further, users can strategize and evolve the clustering asynchronously which “actually” helps users experiencing performance issues with too granular partitions.
+* Hudi considers partition evolution as an anti-pattern and avoids such schemes due to the inconsistent performance of queries that goes to depend on which part of the table is being queried. Hudi’s design favors consistent performance and is aware of the need to redesign to partitioning/tables to achieve the same. + +### What concurrency control approaches does Hudi adopt? + +Hudi provides snapshot isolation between all three types of processes - writers, readers, and table services, meaning they all operate on a consistent snapshot of the table. Hudi provides optimistic concurrency control (OCC) between writers, while providing lock-free, non-blocking MVCC-based concurrency control between writers and table-services and between different table services. Widely accepted database literature like “[Architecture of a database system, pg 81](https://dsf.berkeley.edu/papers/fntdb07-architecture.pdf)” clearly lays out 2Phase Locking, OCC and MVCC as the different concurrency control approaches. Purely OCC-based approaches assume conflicts rarely occur and suffer from significant retries and penalties for any continuous/incremental workloads which are normal for modern lake based workloads. Hudi has been cognizant about this, and has a less enthusiastic view on [OCC](/blog/2021/12/16/lakehouse-concurrency-control-are-we-too-optimistic/), built out things like MVCC-based non-blocking async compaction (the commit time decision significantly aids this), that can have writers working non-stop with table services like compactions running in the background. + +### Hudi’s commits are based on transaction start time instead of completed time. Does this cause data loss or inconsistency in case of incremental and time travel queries? 
+ +Let’s take a closer look at the scenario here: two commits C1 and C2 (with C2 starting later than C1) start with a later commit (C2) finishing first leaving the inflight transaction of the earlier commit (C1) +before the completed write of the later transaction (C2) in Hudi’s timeline. This is not an uncommon scenario, especially with various ingestions needs such as backfilling, deleting, bootstrapping, etc +alongside regular writes. When/Whether the first job would commit will depend on factors such as conflicts between concurrent commits, inflight compactions, other actions on the table’s timeline etc. +If the first job fails for some reason, Hudi will abort the earlier commit inflight (c1) and the writer has to retry next time with a new instant time > c2 much similar to other OCC implementations. +Firstly, for snapshot queries the order of commits should not matter at all, since any incomplete writes on the active timeline is ignored by queries and cause no side-effects. + +In these scenarios, it might be tempting to think of data inconsistencies/data loss when using Hudi’s incremental queries. However, Hudi takes special handling +(examples [1](https://github.com/apache/hudi/blob/aea5bb6f0ab824247f5e3498762ad94f643a2cb6/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/helpers/IncrSourceHelper.java#L76), +[2](https://github.com/apache/hudi/blame/7a6543958368540d221ddc18e0c12b8d526b6859/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieInputFormatUtils.java#L173)) in incremental queries to ensure that no data +is served beyond the point there is an inflight instant in its timeline, so no data loss or drop happens. This detection is made possible because Hudi writes first request a transaction on the timeline, before planning/executing +the write, as explained in the [timeline](/docs/timeline#states) section. 
+ +In this case, on seeing C1’s inflight commit (publish to timeline is atomic), C2 data (which is > C1 in the timeline) is not served until C1 inflight transitions to a terminal state such as completed or marked as failed. +This [test](https://github.com/apache/hudi/blob/master/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestHoodieIncrSource.java#L137) demonstrates how Hudi incremental source stops proceeding until C1 completes. +Hudi favors [safety and sacrifices liveness](https://en.wikipedia.org/wiki/Safety_and_liveness_properties), in such a case. For a single writer, the start times of the transactions are the same as the order of completion of transactions, and both incremental and time-travel queries work as expected. +In the case of multi-writer, incremental queries still work as expected but time travel queries don't. Since most time travel queries are on historical snapshots with a stable continuous timeline, this has not been implemented up to Hudi 0.13. +However, an approach similar to the one above can be easily applied to failing time travel queries as well in this window. + +### How does Hudi plan to address the liveness issue above for incremental queries? + +Hudi 0.14 improves the liveness aspects by enabling change streams, incremental query and time-travel based on the file/object's timestamp (similar to [Delta Lake](https://docs.delta.io/latest/delta-batch.html#query-an-older-snapshot-of-a-table-time-travel)). + +To expand more on the long term approach, Hudi has had a proposal to streamline/improve this experience by adding a transition-time to our timeline, which will remove the [liveness sacrifice](https://en.wikipedia.org/wiki/Safety_and_liveness_properties) and make it easier to understand. +This has been delayed for a few reasons: + +- Large hosted query engines and users not upgrading fast enough.
+- The issues brought up - \[[1](faq_design_and_concepts#does-hudis-use-of-wall-clock-timestamp-for-instants-pose-any-clock-skew-issues),[2](faq_design_and_concepts#hudis-commits-are-based-on-transaction-start-time-instead-of-completed-time-does-this-cause-data-loss-or-inconsistency-in-case-of-incremental-and-time-travel-queries)\], +relevant to this are not practically very important to users beyond good pedantic discussions, +- Wanting to do it alongside [non-blocking concurrency control](https://github.com/apache/hudi/pull/7907) in Hudi version 1.x. + +It's planned to be addressed in the first 1.x release. + +### Does Hudi’s use of wall clock timestamp for instants pose any clock skew issues? + +Theoretically speaking, a clock skew between two writers can result in different notions of time, and order the timeline differently. But, the current NTP implementations and regions standardizing on UTC make this very impractical to happen in practice. Even many popular OLTP-based systems such as DynamoDB and Cassandra use timestamps for record level conflict detection, cloud providers/OSS NTP are moving towards atomic/synchronized clocks all the time \[[1](https://aws.amazon.com/about-aws/whats-new/2017/11/introducing-the-amazon-time-sync-service/),[2](https://engineering.fb.com/2020/03/18/production-engineering/ntp-service/)\]. We haven't had these as practical issues raised over the last several years, across several large scale data lakes. + +Further - Hudi’s commit time can be a logical time and need not strictly be a timestamp. If there are still uniqueness concerns over clock skew, it is easy for Hudi to further extend the timestamp implementation with salts or employ [TrueTime](https://www.cockroachlabs.com/blog/living-without-atomic-clocks/) approaches that have been proven at planet scale. 
In short, this is not a design issue, but more of a pragmatic implementation choice, that allows us to implement unique features like async compaction in the face of updates to the same file group, by scheduling actions on discrete timestamp space. diff --git a/website/versioned_docs/version-1.0.0/faq_general.md b/website/versioned_docs/version-1.0.0/faq_general.md new file mode 100644 index 0000000000000..9f0a6c7d5153a --- /dev/null +++ b/website/versioned_docs/version-1.0.0/faq_general.md @@ -0,0 +1,98 @@ +--- +title: General +keywords: [hudi, writing, reading] +--- +# General FAQ + +### When is Hudi useful for me or my organization? + +If you are looking to quickly ingest data onto HDFS or cloud storage, Hudi provides you [tools](/docs/hoodie_streaming_ingestion). Also, if you have ETL/hive/spark jobs which are slow/taking up a lot of resources, Hudi can potentially help by providing an incremental approach to reading and writing data. + +As an organization, Hudi can help you build an [efficient data lake](https://docs.google.com/presentation/d/1FHhsvh70ZP6xXlHdVsAI0g__B_6Mpto5KQFlZ0b8-mM/edit#slide=id.p), solving some of the most complex, low-level storage management problems, while putting data into hands of your data analysts, engineers and scientists much quicker. + +### What are some non-goals for Hudi? + +Hudi is not designed for any OLTP use-cases, where typically you are using existing NoSQL/RDBMS data stores. Hudi cannot replace your in-memory analytical database (at least not yet!). Hudi supports near-real time ingestion on the order of a few minutes, trading off latency for efficient batching. If you truly desire sub-minute processing delays, then stick with your favorite stream processing solution. + +### What is incremental processing? Why does Hudi docs/talks keep talking about it?
+ +Incremental processing was first introduced by Vinoth Chandar, in the O'reilly [blog](https://www.oreilly.com/content/ubers-case-for-incremental-processing-on-hadoop/), that set off most of this effort. In purely technical terms, incremental processing merely refers to writing mini-batch programs in streaming processing style. Typical batch jobs consume **all input** and recompute **all output**, every few hours. Typical stream processing jobs consume some **new input** and recompute **new/changes to output**, continuously/every few seconds. While recomputing all output in batch fashion can be simpler, it's wasteful and resource expensive. Hudi brings ability to author the same batch pipelines in streaming fashion, run every few minutes. + +While we can merely refer to this as stream processing, we call it _incremental processing_, to distinguish from purely stream processing pipelines built using Apache Flink or Apache Kafka Streams. + +### How is Hudi optimized for CDC and streaming use cases? + +One of the core use-cases for Apache Hudi is enabling seamless, efficient database ingestion to your lake, and change data capture is a direct application of that. Hudi’s core design primitives support fast upserts and deletes of data that are suitable for CDC and streaming use cases. Here is a glimpse of some of the challenges accompanying streaming and cdc workloads that Hudi handles efficiently out of the box. + +* **_Processing of deletes:_** Deletes are treated no differently than updates and are logged with the same filegroups where the corresponding keys exist. This helps process deletes faster same like regular inserts and updates and Hudi processes deletes at file group level using compaction in MOR tables. 
This can be very expensive in other open source systems that store deletes as files separate from data files and incur N(Data files)*N(Delete files) merge cost to process deletes every time, soon landing in a complex graph problem to solve whose planning itself is expensive. This gets worse with volume, especially when dealing with CDC style workloads that stream changes to records frequently. +* **_Operational overhead of merging deletes at scale:_** When deletes are stored as separate files without any notion of data locality, the merging of data and deletes can become a runaway job that cannot complete in time due to various reasons (Spark retries, executor failure, OOM, etc.). As more data files and delete files are added, the merge becomes even more expensive and complex later on, making it hard to manage in practice and causing operational overhead. Hudi removes this complexity from users by treating deletes similarly to any other write operation. +* **_File sizing with updates:_** Other open source systems process updates by generating new data files for inserting the new records after deletion, where both data files and delete files get introduced for every batch of updates. This leads to the small file problem and requires file sizing. Whereas, Hudi embraces mutations to the data, and manages the table automatically by keeping file sizes in check without passing the burden of file sizing to users as manual maintenance. +* **_Support for partial updates and payload ordering:_** Hudi supports partial updates, where an already existing record can be updated for specific fields that are non null from newer records (with newer timestamps). Similarly, Hudi supports payload ordering with timestamp through specific payload implementation where late-arriving data with older timestamps will be ignored or dropped. Users can even implement custom logic and plug in to handle what they want. + +### How do I choose a storage type for my workload?
+ +A key goal of Hudi is to provide **upsert functionality** that is orders of magnitude faster than rewriting entire tables or partitions. + +Choose Copy-on-write storage if : + +* You are looking for a simple alternative, that replaces your existing parquet tables without any need for real-time data. +* Your current job is rewriting entire table/partition to deal with updates, while only a few files actually change in each partition. +* You are happy keeping things operationally simpler (no compaction etc), with the ingestion/write performance bound by the [parquet file size](/docs/configurations#hoodieparquetmaxfilesize) and the number of such files affected/dirtied by updates +* Your workload is fairly well-understood and does not have sudden bursts of large amount of update or inserts to older partitions. COW absorbs all the merging cost on the writer side and thus these sudden changes can clog up your ingestion and interfere with meeting normal mode ingest latency targets. + +Choose merge-on-read storage if : + +* You want the data to be ingested as quickly & queryable as much as possible. +* Your workload can have sudden spikes/changes in pattern (e.g bulk updates to older transactions in upstream database causing lots of updates to old partitions on DFS). Asynchronous compaction helps amortize the write amplification caused by such scenarios, while normal ingestion keeps up with incoming stream of changes. + +Immaterial of what you choose, Hudi provides + +* Snapshot isolation and atomic write of batch of records +* Incremental pulls +* Ability to de-duplicate data + +Find more [here](/docs/concepts/). + +### Is Hudi an analytical database? + +A typical database has a bunch of long running storage servers always running, which takes writes and reads. Hudi's architecture is very different and for good reasons. It's highly decoupled where writes and queries/reads can be scaled independently to be able to handle the scale challenges. 
So, it may not always seem like a database. + +Nonetheless, Hudi is designed very much like a database and provides similar functionality (upserts, change capture) and semantics (transactional writes, snapshot isolated reads). + +### How do I model the data stored in Hudi? + +When writing data into Hudi, you model the records like how you would on a key-value store - specify a key field (unique for a single partition/across table), a partition field (denotes partition to place key into) and preCombine/combine logic that specifies how to handle duplicates in a batch of records written. This model enables Hudi to enforce primary key constraints like you would get on a database table. See [here](writing_data) for an example. + +When querying/reading data, Hudi just presents itself as a json-like hierarchical table that everyone is used to querying using Hive/Spark/Presto over Parquet/Json/Avro. + +### Why does Hudi require a key field to be configured? + +Hudi was designed to support fast record level Upserts and thus requires a key to identify whether an incoming record is +an insert or update or delete, and process accordingly. Additionally, Hudi automatically maintains indexes on this primary +key and for many use-cases like CDC, ensuring such primary key constraints is crucial to ensure data quality. In this context, +pre combine key helps reconcile multiple records with same key in a single batch of input records. Even for append-only data +streams, Hudi supports key based de-duplication before inserting records. For example, you may have at-least-once data integration +systems like Kafka MirrorMaker that can introduce duplicates during failures. Even for plain old batch pipelines, keys +help eliminate duplication that could be caused by backfill pipelines, where commonly it's unclear what set of records +need to be re-written.
We are actively working on making keys easier by only requiring them for Upsert and/or automatically +generating the key internally (much like RDBMS row_ids). + +### How does Hudi actually store data inside a table? + +At a high level, Hudi is based on MVCC design that writes data to versioned parquet/base files and log files that contain changes to the base file. All the files are stored under a partitioning scheme for the table, which closely resembles how Apache Hive tables are laid out on DFS. Please refer [here](/docs/concepts/) for more details. + +### How Hudi handles partition evolution requirements ? + +Hudi recommends keeping coarse grained top level partition paths e.g date(ts) and within each such partition do clustering in a flexible way to z-order, sort data based on interested columns. This provides excellent performance by minimizing the number of files in each partition, while still packing data that will be queried together physically closer (what partitioning aims to achieve). + +Let's take an example of a table, where we store log_events with two fields `ts` (time at which event was produced) and `cust_id` (user for which event was produced) and a common option is to partition by both date(ts) and cust_id. +Some users may want to start granular with hour(ts) and then later evolve to new partitioning scheme say date(ts). But this means, the number of partitions in the table could be very high - 365 days x 1K customers = at-least 365K potentially small parquet files, that can significantly slow down queries, facing throttling issues on the actual S3/DFS reads. + +For the aforementioned reasons, we don't recommend mixing different partitioning schemes within the same table, since it adds operational complexity, and unpredictable performance. +Old data stays in old partitions and only new data gets into newer evolved partitions. If you want to tidy up the table, one has to rewrite all partition/data anyway!
This is where we suggest starting with coarse grained partitions +and leaning on clustering techniques to optimize for query performance. + +We find that most datasets have at-least one high fidelity field, that can be used as a coarse partition. Clustering strategies in Hudi provide a lot of power - you can alter which partitions to cluster, and which fields to cluster each by etc. +Unlike Hive partitioning, Hudi does not remove the partition field from the data files i.e if you write new partition paths, it does not mean old partitions need to be rewritten. +Partitioning by itself is a relic of the Hive era; Hudi is working on replacing partitioning with database like indexing schemes/functions, +for even more flexibility and to get away from the Hive-style partition evolution route. diff --git a/website/versioned_docs/version-1.0.0/faq_integrations.md b/website/versioned_docs/version-1.0.0/faq_integrations.md new file mode 100644 index 0000000000000..614b1b45b62a1 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/faq_integrations.md @@ -0,0 +1,68 @@ +--- +title: Integrations +keywords: [hudi, writing, reading] +--- +# Integrations FAQ + +### Does AWS GLUE support Hudi ? + +AWS Glue jobs can write, read and update Glue Data Catalog for hudi tables. In order to successfully integrate with Glue Data Catalog, you need to subscribe to one of the AWS provided Glue connectors named "AWS Glue Connector for Apache Hudi". Glue job needs to have "Use Glue data catalog as the Hive metastore" option ticked. Detailed steps with sample scripts are available in this article provided by AWS - [https://aws.amazon.com/blogs/big-data/writing-to-apache-hudi-tables-using-aws-glue-connector/](https://aws.amazon.com/blogs/big-data/writing-to-apache-hudi-tables-using-aws-glue-connector/). + +If you are using either notebooks or Zeppelin through Glue dev-endpoints, your script might not be able to integrate with Glue DataCatalog when writing to hudi tables.
+ +### How to override Hudi jars in EMR? + +If you are looking to override Hudi jars in your EMR clusters one way to achieve this is by providing the Hudi jars through a bootstrap script. + +Here are the example steps for overriding Hudi version 0.7.0 in EMR 6.2.0. + +**Build Hudi Jars:** + +```bash +# Git clone +git clone https://github.com/apache/hudi.git && cd hudi + +# Get version 0.7.0 +git checkout --track origin/release-0.7.0 + +# Build jars with spark 3.0.0 and scala 2.12 (since emr 6.2.0 uses spark 3 which requires scala 2.12): +mvn clean package -DskipTests -Dspark3 -Dscala-2.12 -T 30 +``` + +**Copy jars to s3:** + +These are the jars we are interested in after build completes. Copy them to a temp location first. + +```bash +mkdir -p ~/Downloads/hudi-jars +cp packaging/hudi-hadoop-mr-bundle/target/hudi-hadoop-mr-bundle-0.7.0.jar ~/Downloads/hudi-jars/ +cp packaging/hudi-hive-sync-bundle/target/hudi-hive-sync-bundle-0.7.0.jar ~/Downloads/hudi-jars/ +cp packaging/hudi-spark-bundle/target/hudi-spark-bundle_2.12-0.7.0.jar ~/Downloads/hudi-jars/ +cp packaging/hudi-timeline-server-bundle/target/hudi-timeline-server-bundle-0.7.0.jar ~/Downloads/hudi-jars/ +cp packaging/hudi-utilities-bundle/target/hudi-utilities-bundle_2.12-0.7.0.jar ~/Downloads/hudi-jars/ +``` + +Upload all jars from ~/Downloads/hudi-jars/ to the s3 location s3://xxx/yyy/hudi-jars + +**Include Hudi jars as part of the emr bootstrap script:** + +The script below downloads the Hudi jars from the above s3 location. Use this script as part of `bootstrap-actions` when launching the EMR cluster to install the jars in each node.
+ +```bash +#!/bin/bash +sudo mkdir -p /mnt1/hudi-jars + +sudo aws s3 cp s3://xxx/yyy/hudi-jars /mnt1/hudi-jars --recursive + +# create symlinks +cd /mnt1/hudi-jars +sudo ln -sf hudi-hadoop-mr-bundle-0.7.0.jar hudi-hadoop-mr-bundle.jar +sudo ln -sf hudi-hive-sync-bundle-0.7.0.jar hudi-hive-sync-bundle.jar +sudo ln -sf hudi-spark-bundle_2.12-0.7.0.jar hudi-spark-bundle.jar +sudo ln -sf hudi-timeline-server-bundle-0.7.0.jar hudi-timeline-server-bundle.jar +sudo ln -sf hudi-utilities-bundle_2.12-0.7.0.jar hudi-utilities-bundle.jar +``` + +**Using the overriden jar in Deltastreamer:** + +When invoking DeltaStreamer specify the above jar location as part of spark-submit command. diff --git a/website/versioned_docs/version-1.0.0/faq_reading_tables.md b/website/versioned_docs/version-1.0.0/faq_reading_tables.md new file mode 100644 index 0000000000000..207d90c487b82 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/faq_reading_tables.md @@ -0,0 +1,31 @@ +--- +title: Reading Tables +keywords: [hudi, writing, reading] +--- +# Reading Tables FAQ + +### Does deleted records appear in Hudi's incremental query results? + +Soft Deletes (unlike hard deletes) do appear in the incremental pull query results. So, if you need a mechanism to propagate deletes to downstream tables, you can use Soft deletes. + +### How do I pass hudi configurations to my beeline Hive queries? + +If Hudi's input format is not picked the returned results may be incorrect. To ensure correct inputformat is picked, please use `org.apache.hadoop.hive.ql.io.HiveInputFormat` or `org.apache.hudi.hadoop.hive.HoodieCombineHiveInputFormat` for `hive.input.format` config. This can be set like shown below: + +```plain +set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat +``` + +or + +```plain +set hive.input.format=org.apache.hudi.hadoop.hive.HoodieCombineHiveInputFormat +``` + +### Does Hudi guarantee consistent reads? How to think about read optimized queries? 
+ +Hudi does offer consistent reads. To read the latest snapshot of a MOR table, a user should use snapshot query. The [read-optimized queries](/docs/table_types#query-types) (targeted for the MOR table ONLY) are an add on benefit to provides users with a practical tradeoff of decoupling writer performance vs query performance, leveraging the fact that most queries query say the most recent data in the table. + +Hudi’s read-optimized query is targeted for the MOR table only, with guidance around how compaction should be run to achieve predictable results. In the MOR table, the compaction, which runs every few commits (or “deltacommit” to be exact for the MOR table) by default, merges the base (parquet) file and corresponding change log files to a new base file within each file group, so that the snapshot query serving the latest data immediately after compaction reads the base files only.  Similarly, the read-optimized query always reads the base files only as of the latest compaction commit, usually a few commits before the latest commit, which is still a valid table state. + +Users must use snapshot queries to read the latest snapshot of a MOR table.  Popular engines including Spark, Presto, and Hive already support snapshot queries on MOR table and the snapshot query support in Trino is in progress (the [PR](https://github.com/trinodb/trino/pull/14786) is under review).  Note that the read-optimized query does not apply to the COW table. diff --git a/website/versioned_docs/version-1.0.0/faq_storage.md b/website/versioned_docs/version-1.0.0/faq_storage.md new file mode 100644 index 0000000000000..fcce76aa46e1c --- /dev/null +++ b/website/versioned_docs/version-1.0.0/faq_storage.md @@ -0,0 +1,193 @@ +--- +title: Storage +keywords: [hudi, writing, reading] +--- +# Storage FAQ + +### Does Hudi support cloud storage/object stores? + +Yes. 
Generally speaking, Hudi is able to provide its functionality on any Hadoop FileSystem implementation and thus can read and write tables on [Cloud stores](/docs/cloud) (Amazon S3 or Microsoft Azure or Google Cloud Storage). Over time, Hudi has also incorporated specific design aspects that make building Hudi tables on the cloud easy, such as [consistency checks for s3](/docs/configurations#hoodieconsistencycheckenabled), Zero moves/renames involved for data files. + +### What is the difference between copy-on-write (COW) vs merge-on-read (MOR) table types? + +**Copy On Write** - This storage type enables clients to ingest data on columnar file formats, currently parquet. Any new data that is written to the Hudi table using COW storage type, will write new parquet files. Updating an existing set of rows will result in a rewrite of the entire parquet files that collectively contain the affected rows being updated. Hence, all writes to such tables are limited by parquet writing performance, the larger the parquet file, the higher is the time taken to ingest the data. + +**Merge On Read** - This storage type enables clients to ingest data quickly onto row based data format such as avro. Any new data that is written to the Hudi table using MOR table type, will write new log/delta files that internally store the data as avro encoded bytes. A compaction process (configured as inline or asynchronous) will convert log file format to columnar file format (parquet). Two different InputFormats expose 2 different views of this data, Read Optimized view exposes columnar parquet reading performance while Realtime View exposes columnar and/or log reading performance respectively. Updating an existing set of rows will result in either a) a companion log/delta file for an existing base parquet file generated from a previous compaction or b) an update written to a log/delta file in case no compaction ever happened for it. 
Hence, all writes to such tables are limited by avro/log file writing performance, much faster than parquet. However, there is a higher cost to pay to read log/delta files vs columnar (parquet) files. + +More details can be found [here](/docs/concepts/) and also [Design And Architecture](https://cwiki.apache.org/confluence/display/HUDI/Design+And+Architecture). + +### How do I migrate my data to Hudi? + +Hudi provides built in support for rewriting your entire table into Hudi one-time using the HDFSParquetImporter tool available from the hudi-cli. You could also do this via a simple read and write of the dataset using the Spark datasource APIs. Once migrated, writes can be performed using normal means discussed [here](faq_writing_tables#what-are-some-ways-to-write-a-hudi-table). This topic is discussed in detail [here](/docs/migration_guide/), including ways of doing partial migrations. + +### How to convert an existing COW table to MOR? + +All you need to do is to edit the table type property in hoodie.properties (located at hudi_table_path/.hoodie/hoodie.properties). + +But manually changing it will result in checksum errors. So, we have to go via hudi-cli. + +1. Copy existing hoodie.properties to a new location. +2. Edit table type to MERGE_ON_READ +3. launch hudi-cli + 1. connect --path hudi_table_path + 2. repair overwrite-hoodie-props --new-props-file new_hoodie.properties + +### How can I find the average record size in a commit? + +The `commit showpartitions` command in [HUDI CLI](/docs/cli) will show both "bytes written" and + +"records inserted." Divide the bytes written by records inserted to find the average size. Note that this answer assumes + +metadata overhead is negligible. For a small table (such as 5 columns, 100 records) this will not be the case. + +### How does the Hudi indexing work & what are its benefits? + +The indexing component is a key part of the Hudi writing and it maps a given recordKey to a fileGroup inside Hudi consistently.
This enables faster identification of the file groups that are affected/dirtied by a given write operation. + +Hudi supports a few options for indexing as below + +* _HoodieBloomIndex_ : Uses a bloom filter and ranges information placed in the footer of parquet/base files (and soon log files as well) +* _HoodieGlobalBloomIndex_ : The non global indexing only enforces uniqueness of a key inside a single partition i.e the user is expected to know the partition under which a given record key is stored. This helps the indexing scale very well for even [very large datasets](https://eng.uber.com/uber-big-data-platform/). However, in some cases, it might be necessary instead to do the de-duping/enforce uniqueness across all partitions and the global bloom index does exactly that. If this is used, incoming records are compared to files across the entire table and ensure a recordKey is only present in one partition. +* _HBaseIndex_ : Apache HBase is a key value store, typically found in close proximity to HDFS. You can also store the index inside HBase, which could be handy if you are already operating HBase. +* _HoodieSimpleIndex (default)_ : A simple index which reads interested fields (record key and partition path) from base files and joins with incoming records to find the tagged location. +* _HoodieGlobalSimpleIndex_ : Global version of Simple Index, where in uniqueness is on record key across entire table. +* _HoodieBucketIndex_ : Each partition has statically defined buckets to which records are tagged with. Since locations are tagged via hashing mechanism, this index lookup will be very efficient. +* _HoodieSparkConsistentBucketIndex_ : This is also similar to Bucket Index. Only difference is that, data skews can be tackled by dynamically changing the bucket number. + +You can implement your own index if you'd like, by subclassing the `HoodieIndex` class and configuring the index class name in configs. 
+ +### Can I switch from one index type to another without having to rewrite the entire table? + +It should be okay to switch between Bloom index and Simple index as long as they are not global. + +Moving from global to non-global and vice versa may not work. Also switching between HBase (global index) and regular bloom might not work. + +### I have an existing dataset and want to evaluate Hudi using portion of that data ? + +You can bulk import a portion of that data to a new hudi table. For example, if you want to try on a month of data - + +```scala +spark.read.parquet("your_data_set/path/to/month") + .write.format("org.apache.hudi") + .option("hoodie.datasource.write.operation", "bulk_insert") + .option("hoodie.datasource.write.storage.type", "storage_type") // COPY_ON_WRITE or MERGE_ON_READ + .option("hoodie.datasource.write.recordkey.field", ""). + .option("hoodie.datasource.write.partitionpath.field", "") + ... + .mode(SaveMode.Append) + .save(basePath); +``` + +Once you have the initial copy, you can simply run upsert operations on this by selecting some sample of data every round + +```scala +spark.read.parquet("your_data_set/path/to/month").limit(n) // Limit n records + .write.format("org.apache.hudi") + .option("hoodie.datasource.write.operation", "upsert") + .option("hoodie.datasource.write.recordkey.field", ""). + .option("hoodie.datasource.write.partitionpath.field", "") + ... + .mode(SaveMode.Append) + .save(basePath); +``` + +For a merge on read table, you may want to also try scheduling and running compaction jobs. You can run compaction directly using spark submit on org.apache.hudi.utilities.HoodieCompactor or by using [HUDI CLI](/docs/cli). + +### Why does maintain record level commit metadata? Isn't tracking table version at file level good enough?  + +By generating a commit time ahead of time, Hudi is able to stamp each record with what is effectively a transaction id marking it as part of that commit, enabling record level change tracking.
This means that even if that file is compacted/clustered ([they mean different things in Hudi](/docs/clustering#how-is-compaction-different-from-clustering)) many times, in between incremental queries, we are able to [preserve history of the records](/blog/2023/05/19/hudi-metafields-demystified). Furthermore, Hudi is able to leverage compaction to amortize the cost of "catching up" for incremental readers by handing over the latest state of a record after a point in time - which is orders of magnitude more efficient than processing each record. Other similar systems lack such decoupling of change streams from physical files the records were part of and core table management services being aware of the history of records. Similar approaches of record level metadata fields for efficient incremental processing have also been applied in other leading industry [data warehouses](https://twitter.com/apachehudi/status/1676021143697002496?s=20). + +### Why partition fields are also stored in parquet files in addition to the partition path ? + +Hudi supports customizable partition values which could be a derived value of another field. Also, storing the partition value only as part of the partition path results in losing type information when queried by various query engines. + +### How do I configure Bloom filter (when Bloom/Global_Bloom index is used)? + +Bloom filters are used in bloom indexes to look up the location of record keys in write path. Bloom filters are used only when the index type is chosen as “BLOOM” or “GLOBAL_BLOOM”. Hudi has a few config knobs that users can use to tune their bloom filters. + +On a high level, hudi has two types of blooms: Simple and Dynamic. + +Simple, as the name suggests, is simple. Size is statically allocated based on a few configs. + +`hoodie.bloom.index.filter.type`: SIMPLE + +`hoodie.index.bloom.num_entries` refers to the total number of entries per bloom filter, which refers to one file slice. Default value is 60000.
+ +`hoodie.index.bloom.fpp` refers to the false positive probability with the bloom filter. Default value: 1*10^-9. + +Size of the bloom filter depends on these two values. This is statically allocated and here is the formula that determines the size of bloom. As long as the total number of entries added to the bloom is within the configured `hoodie.index.bloom.num_entries` value, the fpp will be honored. For example, with the default values of 60k and 1*10^-9, bloom filter serialized size = 430kb. But if more entries are added, then the false positive probability will not be honored. More false positives are likely to be returned if you add more entries than the configured value. So, users are expected to set the right values for both num_entries and fpp. + +Hudi suggests having files of roughly 100 to 120 MB for better query performance. So, based on the record size, one could determine how many records could fit into one data file. + +Let's say your data file max size is 128MB and default avg record size is 1024 bytes. Hence, roughly this translates to 130k entries per data file. For this config, you should set num_entries to ~130k. + +Dynamic bloom filter: + +`hoodie.bloom.index.filter.type` : DYNAMIC + +This is an advanced version of the bloom filter which grows dynamically as the number of entries grows. So, users are expected to set two values wrt num_entries. `hoodie.index.bloom.num_entries` will determine the starting size of the bloom. `hoodie.bloom.index.filter.dynamic.max.entries` will determine the max size to which the bloom can grow. And fpp needs to be set similar to “Simple” bloom filter. Bloom size will be allotted based on the first config `hoodie.index.bloom.num_entries`. Once the number of entries reaches this value, bloom will dynamically grow its size to 2X. This will go on until the size reaches a max of `hoodie.bloom.index.filter.dynamic.max.entries` value. Until the size reaches this max value, fpp will be honored.
If the entries added exceeds the max value, then the fpp may not be honored. + +### How do I verify datasource schema reconciliation in Hudi? + +With Hudi you can reconcile schema, meaning you can apply target table schema on your incoming data, so if there's a missing field in your batch it'll be injected null value. You can enable schema reconciliation using [hoodie.datasource.write.reconcile.schema](/docs/configurations/#hoodiedatasourcewritereconcileschema) config. + +Example how schema reconciliation works with Spark: + +```scala +hudi_options = { + 'hoodie.table.name': "test_recon1", + 'hoodie.datasource.write.recordkey.field': 'uuid', + 'hoodie.datasource.write.table.name': "test_recon1", + 'hoodie.datasource.write.precombine.field': 'ts', + 'hoodie.upsert.shuffle.parallelism': 2, + 'hoodie.insert.shuffle.parallelism': 2, + "hoodie.datasource.write.hive_style_partitioning":"true", + "hoodie.datasource.write.reconcile.schema": "true", + "hoodie.datasource.hive_sync.jdbcurl":"thrift://localhost:9083", + "hoodie.datasource.hive_sync.database":"hudi", + "hoodie.datasource.hive_sync.table":"test_recon1", + "hoodie.datasource.hive_sync.enable":"true", + "hoodie.datasource.hive_sync.mode": "hms" +} + +some_json = '{"uuid":1,"ts":1,"Url":"hudi.apache.com"}' +df = spark.read.json(sc.parallelize([some_json])) + +df.write.format("hudi").mode("append").options(**hudi_options).save(base_path) + +spark.sql("select * from hudi.test_recon1;").show() + +missing_field_json = '{"uuid":2,"ts":1}' +df = spark.read.json(sc.parallelize([missing_field_json])) + +df.write.format("hudi").mode("append").options(**hudi_options).save(base_path) + +spark.sql("select * from hudi.test_recon1;").show() +``` + +After first write: + +| _hoodie_commit_time | _hoodie_commit_seqno | _hoodie_record_key | _hoodie_partition_path | _hoodie_file_name | Url | ts | uuid | +| ---| ---| ---| ---| ---| ---| ---| --- | +| 20220622204044318 | 20220622204044318... | 1 | | 890aafc0-d897-44d... 
| [hudi.apache.com](http://hudi.apache.com) | 1 | 1 | + +After the second write: + +| _hoodie_commit_time | _hoodie_commit_seqno | _hoodie_record_key | _hoodie_partition_path | _hoodie_file_name | Url | ts | uuid | +| ---| ---| ---| ---| ---| ---| ---| --- | +| 20220622204044318 | 20220622204044318... | 1 | | 890aafc0-d897-44d... | [hudi.apache.com](http://hudi.apache.com) | 1 | 1 | +| 20220622204208997 | 20220622204208997... | 2 | | 890aafc0-d897-44d... | null | 1 | 2 | + +### Can I change keygenerator for an existing table? + +No. There is a small set of properties that cannot change once chosen. KeyGenerator is one among them. [Here](https://github.com/apache/hudi/blob/3f37d4fb08169c95930f9cc32389abf4e5cd5551/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieWriterUtils.scala#L128) is a code reference where we + +validate the properties. + +### Is Hudi JVM dependent? Does Hudi leverage Java specific serialization? + +Hudi was not originally designed as a database layer that would fit under the various big data query engines, that were painfully hard to integrate with (Spark did not have DataSet/DataSource APIs, Trino was still Presto, Presto SPI was still budding, Hive storage handlers were just out). Popular engines including Spark, Flink, Presto, Trino, and Athena do not have issues integrating with Hudi as they are all based on JVM, and access to the Timeline and Metadata table is well-abstracted by Hudi APIs. Even non-jvm engines like Redshift have successfully integrated with Hudi. + +Since it was not thought of as a "format", the focus on the APIs for such lower level integrations and documenting the serialized bytes has been historically inadequate. However, with some understanding of the serialization, looking beyond the APIs used and focusing on what the serialized bytes are, it's possible to integrate Hudi from outside the JVM.
For e.g Bloom filters are serialized as hex strings, from byte arrays/primitive types, and should be **readable cross language**. The Hudi Log Format bytes and layout are clearly defined as well, the header/footers are also binary serialized only with primitive types/byte arrays. So with the right endianity information and documentation of these bytes, **cross jvm clients can read this**. The Hudi metadata table uses [HFile format](https://hbase.apache.org/book.html#_hfile_format_2) as the base file format, which while being a well-documented open file format with clear protobuf specifications, does not have native readers. Community has taken efforts towards improving the docs on [tech specs](/tech-specs). Going forward, Hudi community plans on improving the [table APIs](https://github.com/apache/hudi/pull/7080) to facilitate faster engine integrations, including native language support, as a big part of the [Hudi 1.0](https://github.com/apache/hudi/blob/master/rfc/rfc-69/rfc-69.md) format changes to generalize Hudi more. + +**_Note_**: _In a recent release the delete block keys were unintentionally serialized as kryo, and is being fixed in the 0.14 release. Thankfully, since Hudi’s log blocks and format are versioned, when the file slice is compacted things return to normal._ diff --git a/website/versioned_docs/version-1.0.0/faq_table_services.md b/website/versioned_docs/version-1.0.0/faq_table_services.md new file mode 100644 index 0000000000000..7ff398687e392 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/faq_table_services.md @@ -0,0 +1,55 @@ +--- +title: Table Services +keywords: [hudi, writing, reading] +--- +# Table Services FAQ + +### What does the Hudi cleaner do? + +The Hudi cleaner process often runs right after a commit and deltacommit and goes about deleting old files that are no longer needed. 
If you are using the incremental pull feature, then ensure you configure the cleaner to [retain sufficient amount of last commits](/docs/configurations#hoodiecleanercommitsretained) to rewind. Another consideration is to provide sufficient time for your long running jobs to finish running. Otherwise, the cleaner could delete a file that is being or could be read by the job and will fail the job. Typically, the default configuration of 10 allows for an ingestion running every 30 mins to retain up-to 5 hours worth of data. If you run ingestion more frequently or if you want to give more running time for a query, consider increasing the value for the config : `hoodie.cleaner.commits.retained` + +### How do I run compaction for a MOR table? + +Simplest way to run compaction on MOR table is to run the [compaction inline](/docs/configurations#hoodiecompactinline), at the cost of spending more time ingesting; This could be particularly useful, in common cases where you have small amount of late arriving data trickling into older partitions. In such a scenario, you may want to just aggressively compact the last N partitions while waiting for enough logs to accumulate for older partitions. The net effect is that you have converted most of the recent data, that is more likely to be queried to optimized columnar format. + +That said, for obvious reasons of not blocking ingesting for compaction, you may want to run it asynchronously as well. This can be done either via a separate [compaction job](https://github.com/apache/hudi/blob/master/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieCompactor.java) that is scheduled by your workflow scheduler/notebook independently. 
If you are using delta streamer, then you can run in [continuous mode](https://github.com/apache/hudi/blob/d3edac4612bde2fa9deca9536801dbc48961fb95/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/HoodieDeltaStreamer.java#L241) where the ingestion and compaction are both managed concurrently in a single spark run time. + +### What options do I have for asynchronous/offline compactions on MOR table? + +There are a couple of options depending on how you write to Hudi. But first let us understand briefly what is involved. There are two parts to compaction + +* Scheduling: In this step, Hudi scans the partitions and selects file slices to be compacted. A compaction plan is finally written to Hudi timeline. Scheduling needs tighter coordination with other writers (regular ingestion is considered one of the writers). If scheduling is done inline with the ingestion job, this coordination is automatically taken care of. Else when scheduling happens asynchronously a lock provider needs to be configured for this coordination among multiple writers. +* Execution: In this step the compaction plan is read and file slices are compacted. Execution doesnt need the same level of coordination with other writers as Scheduling step and can be decoupled from ingestion job easily. + +Depending on how you write to Hudi these are the possible options currently. + +* DeltaStreamer: + * In Continuous mode, asynchronous compaction is achieved by default. Here scheduling is done by the ingestion job inline and compaction execution is achieved asynchronously by a separate parallel thread. + * In non continuous mode, only inline compaction is possible. + * Please note in either mode, by passing --disable-compaction compaction is completely disabled +* Spark datasource: + * Async scheduling and async execution can be achieved by periodically running an offline Hudi Compactor Utility or Hudi CLI. However this needs a lock provider to be configured. 
+ * Alternately, from 0.11.0, to avoid dependency on lock providers, scheduling alone can be done inline by regular writer using the config `hoodie.compact.schedule.inline` . And compaction execution can be done offline by periodically triggering the Hudi Compactor Utility or Hudi CLI. +* Spark structured streaming: + * Compactions are scheduled and executed asynchronously inside the streaming job. Async Compactions are enabled by default for structured streaming jobs on Merge-On-Read table. + * Please note it is not possible to disable async compaction for MOR table with spark structured streaming. +* Flink: + * Async compaction is enabled by default for Merge-On-Read table. + * Offline compaction can be achieved by setting `compaction.async.enabled` to `false` and periodically running [Flink offline Compactor](compaction/#flink-offline-compaction). When running the offline compactor, one needs to ensure there are no active writes to the table. + * Third option (highly recommended over the second one) is to schedule the compactions from the regular ingestion job and executing the compaction plans from an offline job. To achieve this set `compaction.async.enabled` to `false`, `compaction.schedule.enabled` to `true` and then run the [Flink offline Compactor](compaction/#flink-offline-compaction) periodically to execute the plans. + +### How to disable all table services in case of multiple writers? + +[hoodie.table.services.enabled](/docs/configurations#hoodietableservicesenabled) is an umbrella config that can be used to turn off all table services at once without having to individually disable them. This is handy in use cases where there are multiple writers doing ingestion. While one of the main pipelines can take care of the table services, other ingestion pipelines can disable them to avoid frequent trigger of cleaning/clustering etc. This does not apply to single-writer scenarios.
+ +### Why does Hudi retain at-least one previous commit even after setting hoodie.cleaner.commits.retained': 1 ? + +Hudi runs cleaner to remove old file versions as part of writing data either in inline or in asynchronous mode (0.6.0 onwards). Hudi Cleaner retains at-least one previous commit when cleaning old file versions. This is to prevent the case when concurrently running queries which are reading the latest file versions suddenly see those files getting deleted by cleaner because a new file version got added . In other words, retaining at-least one previous commit is needed for ensuring snapshot isolation for readers. + +### Can I get notified when new commits happen in my Hudi table? + +Yes. Hudi provides the ability to post a callback notification about a write commit. You can use a http hook or choose to + +be notified via a Kafka/pulsar topic or plug in your own implementation to get notified. Please refer [here](platform_services_post_commit_callback) + +for details diff --git a/website/versioned_docs/version-1.0.0/faq_writing_tables.md b/website/versioned_docs/version-1.0.0/faq_writing_tables.md new file mode 100644 index 0000000000000..2374006d95533 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/faq_writing_tables.md @@ -0,0 +1,194 @@ +--- +title: Writing Tables +keywords: [hudi, writing, reading] +--- +# Writing Tables FAQ + +### What are some ways to write a Hudi table? + +Typically, you obtain a set of partial updates/inserts from your source and issue [write operations](/docs/write_operations/) against a Hudi table. If you ingesting data from any of the standard sources like Kafka, or tailing DFS, the [delta streamer](/docs/hoodie_streaming_ingestion#hudi-streamer) tool is invaluable and provides an easy, self-managed solution to getting data written into Hudi. 
You can also write your own code to capture data from a custom source using the Spark datasource API and use a [Hudi datasource](writing_data#spark-datasource-api) to write into Hudi. + +### How is a Hudi writer job deployed? + +The nice thing about Hudi writing is that it just runs like any other spark job would on a YARN/Mesos or even a K8S cluster. So you could simply use the Spark UI to get visibility into write operations. + +### Can I implement my own logic for how input records are merged with record on storage? + +Here is the payload interface that is used in Hudi to represent any hudi record. + +```java +public interface HoodieRecordPayload extends Serializable { + /** + * When more than one HoodieRecord have the same HoodieKey, this function combines them before attempting to insert/upsert by taking in a property map. + * Implementation can leverage the property to decide their business logic to do preCombine. + * @param another instance of another {@link HoodieRecordPayload} to be combined with. + * @param properties Payload related properties. For example pass the ordering field(s) name to extract from value in storage. + * @return the combined value + */ + default T preCombine(T another, Properties properties); +/** + * This methods lets you write custom merging/combining logic to produce new values as a function of current value on storage and whats contained + * in this object. Implementations can leverage properties if required. + *

+ * eg: + * 1) You are updating counters, you may want to add counts to currentValue and write back updated counts + * 2) You may be reading DB redo logs, and merge them with current image for a database row on storage + *

+ * + * @param currentValue Current value in storage, to merge/combine this payload with + * @param schema Schema used for record + * @param properties Payload related properties. For example pass the ordering field(s) name to extract from value in storage. + * @return new combined/merged value to be written back to storage. EMPTY to skip writing this record. + */ + default Option<IndexedRecord> combineAndGetUpdateValue(IndexedRecord currentValue, Schema schema, Properties properties) throws IOException; + +/** + * Generates an avro record out of the given HoodieRecordPayload, to be written out to storage. Called when writing a new value for the given + * HoodieKey, wherein there is no existing record in storage to be combined against. (i.e insert) Return EMPTY to skip writing this record. + * Implementations can leverage properties if required. + * @param schema Schema used for record + * @param properties Payload related properties. For example pass the ordering field(s) name to extract from value in storage. + * @return the {@link IndexedRecord} to be inserted. + */ + @PublicAPIMethod(maturity = ApiMaturityLevel.STABLE) + default Option<IndexedRecord> getInsertValue(Schema schema, Properties properties) throws IOException; +/** + * This method can be used to extract some metadata from HoodieRecordPayload. The metadata is passed to {@code WriteStatus.markSuccess()} and + * {@code WriteStatus.markFailure()} in order to compute some aggregate metrics using the metadata in the context of a write success or failure. + * @return the metadata in the form of Map<String, String> if any.
+ */ + @PublicAPIMethod(maturity = ApiMaturityLevel.STABLE) + default Option> getMetadata() { + return Option.empty(); + } +} +``` + +As you could see, ([combineAndGetUpdateValue(), getInsertValue()](https://github.com/apache/hudi/blob/master/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRecordPayload.java)) that control how the record on storage is combined with the incoming update/insert to generate the final value to be written back to storage. preCombine() is used to merge records within the same incoming batch. + +### How do I delete records in the dataset using Hudi? + +GDPR has made deletes a must-have tool in everyone's data management toolbox. Hudi supports both soft and hard deletes. For details on how to actually perform them, see [here](writing_data#deletes). + +### Should I need to worry about deleting all copies of the records in case of duplicates? + +No. Hudi removes all the copies of a record key when deletes are issued. Here is the long form explanation - Sometimes accidental user errors can lead to duplicates introduced into a Hudi table by either [concurrent inserts](faq_writing_tables#can-concurrent-inserts-cause-duplicates) or by [not deduping the input records](faq_writing_tables#can-single-writer-inserts-have-duplicates) for an insert operation. However, using the right index (e.g., in the default [Simple Index](https://github.com/apache/hudi/blob/master/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/simple/HoodieSimpleIndex.java#L116) and [Bloom Index](https://github.com/apache/hudi/blob/master/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/HoodieBloomIndex.java#L309)), any subsequent updates and deletes are applied to all copies of the same primary key. This is because the indexing phase identifies records of a primary key in all locations.  So deletes in Hudi remove all copies of the same primary key, i.e., duplicates, and comply with GDPR or CCPA requirements.  
Here are two examples [1](https://gist.github.com/yihua/6eb11ce3f888a71935dbf21c77199a48), [2](https://gist.github.com/yihua/e3afe0f34400e60f81f6da925560118e) demonstrating that duplicates are properly deleted from a Hudi table. Hudi is adding [auto key generation](https://github.com/apache/hudi/pull/8107), which will remove the burden of key generation from the user for insert workloads. + +### How does Hudi handle duplicate record keys in an input? + +When issuing an `upsert` operation on a table and the batch of records provided contains multiple entries for a given key, then all of them are reduced into a single final value by repeatedly calling payload class's [preCombine()](https://github.com/apache/hudi/blob/d3edac4612bde2fa9deca9536801dbc48961fb95/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRecordPayload.java#L40) method . By default, we pick the record with the greatest value (determined by calling .compareTo()) giving latest-write-wins style semantics. [This FAQ entry](faq_writing_tables#can-i-implement-my-own-logic-for-how-input-records-are-merged-with-record-on-storage) shows the interface for HoodieRecordPayload if you are interested. + +For an insert or bulk_insert operation, no such pre-combining is performed. Thus, if your input contains duplicates, the table would also contain duplicates. If you don't want duplicate records either issue an **upsert** or consider specifying option to de-duplicate input in either datasource using [`hoodie.datasource.write.insert.drop.duplicates`](/docs/configurations#hoodiedatasourcewriteinsertdropduplicates) & [`hoodie.combine.before.insert`](/docs/configurations/#hoodiecombinebeforeinsert) or in deltastreamer using [`--filter-dupes`](https://github.com/apache/hudi/blob/d3edac4612bde2fa9deca9536801dbc48961fb95/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/HoodieDeltaStreamer.java#L229). + +### How can I pass hudi configurations to my spark writer job? 
+ +Hudi configuration options covering the datasource and low level Hudi write client (which both deltastreamer & datasource internally call) are [here](/docs/configurations/). Invoking _--help_ on any tool such as DeltaStreamer would print all the usage options. A lot of the options that control upsert, file sizing behavior are defined at the write client level and below is how we pass them to different options available for writing data. + +* For Spark DataSource, you can use the "options" API of DataFrameWriter to pass in these configs. + +```scala +inputDF.write().format("org.apache.hudi") + .options(clientOpts) // any of the Hudi client opts can be passed in as well + .option("hoodie.datasource.write.recordkey.field", "_row_key") + ... +``` + +* When using `HoodieWriteClient` directly, you can simply construct HoodieWriteConfig object with the configs in the link you mentioned. +* When using HoodieDeltaStreamer tool to ingest, you can set the configs in properties file and pass the file as the cmdline argument "_--props_" + +### How to create Hive style partition folder structure? + +By default Hudi creates the partition folders with just the partition values, but if would like to create partition folders similar to the way Hive will generate the structure, with paths that contain key value pairs, like country=us/… or datestr=2021-04-20. This is Hive style (or format) partitioning. The paths include both the names of the partition keys and the values that each path represents. + +To enable hive style partitioning, you need to add this hoodie config when you write your data: + +```plain +hoodie.datasource.write.hive_style_partitioning: true +``` + +### Can I register my Hudi table with Apache Hive metastore? + +Yes. 
This can be performed either via the standalone [Hive Sync tool](/docs/syncing_metastore#hive-sync-tool) or using options in [Hudi Streamer](https://github.com/apache/hudi/blob/d3edac4612bde2fa9deca9536801dbc48961fb95/docker/demo/sparksql-incremental.commands#L50) tool or [datasource](/docs/configurations#hoodiedatasourcehive_syncenable). + +### What's Hudi's schema evolution story? + +Hudi uses Avro as the internal canonical representation for records, primarily due to its nice [schema compatibility & evolution](https://docs.confluent.io/platform/current/schema-registry/avro.html) properties. This is a key aspect of having reliability in your ingestion or ETL pipelines. As long as the schema passed to Hudi (either explicitly in Hudi Streamer schema provider configs or implicitly by Spark Datasource's Dataset schemas) is backwards compatible (e.g no field deletes, only appending new fields to schema), Hudi will seamlessly handle read/write of old and new data and also keep the Hive schema up-to date. + +Starting 0.11.0, Spark SQL DDL support (experimental) was added for Spark 3.1.x and Spark 3.2.1 via ALTER TABLE syntax. Please refer to the [schema evolution guide](/docs/schema_evolution) for more details on Schema-on-read for Spark.. + +### What performance/ingest latency can I expect for Hudi writing? + +The speed at which you can write into Hudi depends on the [write operation](/docs/write_operations) and some trade-offs you make along the way like file sizing. Just like how databases incur overhead over direct/raw file I/O on disks, Hudi operations may have overhead from supporting database like features compared to reading/writing raw DFS files. That said, Hudi implements advanced techniques from database literature to keep these minimal. User is encouraged to have this perspective when trying to reason about Hudi performance. 
As the saying goes : there is no free lunch (not yet atleast) + +| Storage Type | Type of workload | Performance | Tips | +| ---| ---| ---| --- | +| copy on write | bulk_insert | Should match vanilla spark writing + an additional sort to properly size files | properly size [bulk insert parallelism](/docs/configurations#hoodiebulkinsertshuffleparallelism) to get right number of files. Use insert if you want this auto tuned. Configure [hoodie.bulkinsert.sort.mode](/docs/configurations#hoodiebulkinsertsortmode) for better file sizes at the cost of memory. The default value `NONE` offers the fastest performance and matches `spark.write.parquet()` in terms of number of files, overheads. | +| copy on write | insert | Similar to bulk insert, except the file sizes are auto tuned requiring input to be cached into memory and custom partitioned. | Performance would be bound by how parallel you can write the ingested data. Tune [this limit](/docs/configurations#hoodieinsertshuffleparallelism) up, if you see that writes are happening from only a few executors. | +| copy on write | upsert/ de-duplicate & insert | Both of these would involve index lookup. Compared to naively using Spark (or similar framework)'s JOIN to identify the affected records, Hudi indexing is often 7-10x faster as long as you have ordered keys (discussed below) or less than 50% updates. Compared to naively overwriting entire partitions, Hudi write can be several magnitudes faster depending on how many files in a given partition is actually updated. For example, if a partition has 1000 files out of which only 100 is dirtied every ingestion run, then Hudi would only read/merge a total of 100 files and thus 10x faster than naively rewriting entire partition. | Ultimately performance would be bound by how quickly we can read and write a parquet file and that depends on the size of the parquet file, configured [here](/docs/configurations#hoodieparquetmaxfilesize). 
Also be sure to properly tune your [bloom filters](/docs/configurations#INDEX). [HUDI-56](https://issues.apache.org/jira/browse/HUDI-56) will auto-tune this. | +| merge on read | bulk insert | Currently new data only goes to parquet files and thus performance here should be similar to copy on write bulk insert. This has the nice side-effect of getting data into parquet directly for query performance. [HUDI-86](https://issues.apache.org/jira/browse/HUDI-86) will add support for logging inserts directly and speed this up drastically. | | +| merge on read | insert | Similar to above | | +| merge on read | upsert/ de-duplicate & insert | Indexing performance would remain the same as copy-on-write, while ingest latency for updates (costliest I/O operation in copy on write) are sent to log files and thus with asynchronous compaction provides very good ingest performance with low write amplification. | | + +Like many typical systems that manage time-series data, Hudi performs much better if your keys have a timestamp prefix or are monotonically increasing/decreasing. You can almost always achieve this. Even if you have UUID keys, you can follow tricks like [this](https://www.percona.com/blog/2014/12/19/store-uuid-optimized-way/) to get keys that are ordered. See also [Tuning Guide](/docs/tuning-guide) for more tips on JVM and other configurations. + +### What performance can I expect for Hudi reading/queries? + +* For ReadOptimized views, you can expect the same best in-class columnar query performance as a standard parquet table in Hive/Spark/Presto +* For incremental views, you can expect speed up relative to how much data usually changes in a given time window and how much time your entire scan takes. For example, if only 100 files changed in the last hour in a partition of 1000 files, then you can expect a speed-up of 10x using incremental pull in Hudi compared to full scanning the partition to find out new data.
+* For real time views, you can expect performance similar to the same avro backed table in Hive/Spark/Presto + +### How do I to avoid creating tons of small files? + +A key design decision in Hudi was to avoid creating small files and always write properly sized files. + +There are 2 ways to avoid creating tons of small files in Hudi and both of them have different trade-offs: + +a) **Auto Size small files during ingestion**: This solution trades ingest/writing time to keep queries always efficient. Common approaches to writing very small files and then later stitching them together only solve for system scalability issues posed by small files and also let queries slow down by exposing small files to them anyway. + +Hudi has the ability to maintain a configured target file size, when performing **upsert/insert** operations. (Note: **bulk_insert** operation does not provide this functionality and is designed as a simpler replacement for normal `spark.write.parquet` ) + +For **copy-on-write**, this is as simple as configuring the [maximum size for a base/parquet file](/docs/configurations#hoodieparquetmaxfilesize) and the [soft limit](/docs/configurations#hoodieparquetsmallfilelimit) below which a file should be considered a small file. For the initial bootstrap to Hudi table, tuning record size estimate is also important to ensure sufficient records are bin-packed in a parquet file. For subsequent writes, Hudi automatically uses average record size based on previous commit. Hudi will try to add enough records to a small file at write time to get it to the configured maximum limit. For example, with `hoodie.parquet.small.file.limit=100MB` and `hoodie.parquet.max.file.size=120MB`, Hudi will pick all files < 100MB and try to get them up to 120MB. + +For **merge-on-read**, there are few more configs to set. MergeOnRead works differently for different INDEX choices. + +* Indexes with **canIndexLogFiles = true** : Inserts of new data go directly to log files.
In this case, you can configure the [maximum log size](/docs/configurations#hoodielogfilemaxsize) and a [factor](/docs/configurations#hoodielogfiletoparquetcompressionratio) that denotes reduction in size when data moves from avro to parquet files. +* Indexes with **canIndexLogFiles = false** : Inserts of new data go only to parquet files. In this case, the same configurations as above for the COPY_ON_WRITE case applies. + +NOTE : In either case, small files will be auto sized only if there is no PENDING compaction or associated log file for that particular file slice. For example, for case 1: If you had a log file and a compaction C1 was scheduled to convert that log file to parquet, no more inserts can go into that log file. For case 2: If you had a parquet file and an update ended up creating an associated delta log file, no more inserts can go into that parquet file. Only after the compaction has been performed and there are NO log files associated with the base parquet file, can new inserts be sent to auto size that parquet file. + +b) [**Clustering**](/blog/2021/01/27/hudi-clustering-intro) : This is a feature in Hudi to group small files into larger ones either synchronously or asynchronously. Since first solution of auto-sizing small files has a tradeoff on ingestion speed (since the small files are sized during ingestion), if your use-case is very sensitive to ingestion latency where you don't want to compromise on ingestion speed which may end up creating a lot of small files, clustering comes to the rescue. Clustering can be scheduled through the ingestion job and an asynchronus job can stitch small files together in the background to generate larger files. NOTE that during this, ingestion can continue to run concurrently. + +_Please note that Hudi always creates immutable files on disk. To be able to do auto-sizing or clustering, Hudi will always create a newer version of the smaller file, resulting in 2 versions of the same file. 
The cleaner service will later kick in and delete the older version of the small file and keep the latest one._ + +### How do I use DeltaStreamer or Spark DataSource API to write to a Non-partitioned Hudi table ? + +Hudi supports writing to non-partitioned tables. For writing to a non-partitioned Hudi table and performing hive table syncing, you need to set the below configurations in the properties passed: + +```plain +hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.NonpartitionedKeyGenerator +hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.NonPartitionedExtractor +``` + +### How can I reduce table versions created by Hudi in AWS Glue Data Catalog/ metastore? + +With each commit, Hudi creates a new table version in the metastore. This can be reduced by setting the option + +[hoodie.datasource.meta_sync.condition.sync](/docs/configurations#hoodiedatasourcemeta_syncconditionsync) to true. + +This will ensure that hive sync is triggered on schema or partitions changes. + +### If there are failed writes in my timeline, do I see duplicates? + +No, Hudi does not expose uncommitted files/blocks to the readers. Further, Hudi strives to automatically manage the table for the user, by actively cleaning up files created from failed/aborted writes. See [marker mechanism](/blog/2021/08/18/improving-marker-mechanism/). + +### How are conflicts detected in Hudi between multiple writers? + +Hudi employs [optimistic concurrency control](concurrency_control) between writers, while implementing MVCC based concurrency control between writers and the table services. Concurrent writers to the same table need to be configured with the same lock provider configuration, to safely perform writes.
By default (implemented in “[SimpleConcurrentFileWritesConflictResolutionStrategy](https://github.com/apache/hudi/blob/master/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/transaction/SimpleConcurrentFileWritesConflictResolutionStrategy.java)”), Hudi allows multiple writers to concurrently write data and commit to the timeline if there is no conflicting writes to the same underlying file group IDs. This is achieved by holding a lock, checking for changes that modified the same file IDs. Hudi then supports a pluggable interface “[ConflictResolutionStrategy](https://github.com/apache/hudi/blob/master/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/transaction/ConflictResolutionStrategy.java)” that determines how conflicts are handled. By default, the later conflicting write is aborted. Hudi also support eager conflict detection to help speed up conflict detection and release cluster resources back early to reduce costs. + +### Can single-writer inserts have duplicates? + +By default, Hudi turns off key based de-duplication for INSERT/BULK_INSERT operations and thus the table could contain duplicates. If users believe, they have duplicates in inserts, they can either issue UPSERT or consider specifying the option to de-duplicate input in either datasource using [`hoodie.datasource.write.insert.drop.duplicates`](/docs/configurations#hoodiedatasourcewriteinsertdropduplicates) & [`hoodie.combine.before.insert`](/docs/configurations/#hoodiecombinebeforeinsert) or in deltastreamer using [`--filter-dupes`](https://github.com/apache/hudi/blob/d3edac4612bde2fa9deca9536801dbc48961fb95/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/HoodieDeltaStreamer.java#L229). + +### Can concurrent inserts cause duplicates? + +Yes. As mentioned before, the default conflict detection strategy only check for conflicting updates to the same file group IDs. 
In the case of concurrent inserts, inserted records end up creating new file groups and thus can go undetected. Most common workload patterns use multi-writer capability in the case of running ingestion of new data and concurrently backfilling/deleting older data, with NO overlap in the primary keys of the records. However, this can be implemented (or better yet contributed) by a new “[ConflictResolutionStrategy](https://github.com/apache/hudi/blob/master/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/transaction/ConflictResolutionStrategy.java)”, that reads out keys of new conflicting operations, to check the uncommitted data against other concurrent writes and then decide whether or not to commit/abort. This is rather a fine tradeoff between saving the additional cost of reading keys on most common workloads. Historically, users have preferred to take this into their control to save costs e.g we turned off de-duplication for inserts due to the same feedback. Hudi supports a pre-commit validator mechanism already where such tests can be authored as well. diff --git a/website/versioned_docs/version-1.0.0/file_sizing.md b/website/versioned_docs/version-1.0.0/file_sizing.md new file mode 100644 index 0000000000000..62ad0f7a43208 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/file_sizing.md @@ -0,0 +1,177 @@ +--- +title: "File Sizing" +toc: true +toc_min_heading_level: 2 +toc_max_heading_level: 4 +--- + +Solving the [small file problem](https://hudi.apache.org/blog/2021/03/01/hudi-file-sizing/) is fundamental to ensuring +great experience on the data lake. If you don’t size the files appropriately, you can slow down the queries and the pipelines. +Some of the issues you may encounter with small files include the following: + +- **Queries slow down**: You’ll have to scan through many small files to retrieve data for a query. It’s a very inefficient + way of accessing and utilizing the data. 
Also, cloud storage, like S3, enforces a rate-limit on how many requests can + be processed per second per prefix in a bucket. A higher number of files, i.e., at least one request per file regardless + of the file size, increases the chance of encountering a rate-limit, as well as additional fixed costs for opening/closing + them. All of these cause queries to slow down. + +- **Pipelines slow down**: You can slow down your Spark, Flink or Hive jobs due to excessive scheduling overhead or memory + requirements; the more files you have, the more tasks you create. + +- **Storage inefficiencies**: When working with many small files, you can be inefficient in using your storage. For example, + many small files can yield a lower compression ratio, increasing storage costs. If you’re indexing the data, that also + takes up more storage space to store additional metadata, such as column statistics. If you’re working with a smaller + amount of data, you might not see a significant impact with storage. However, when dealing with petabyte and exabyte + data, you’ll need to be efficient in managing storage resources. + +A critical design decision in the Hudi architecture is to avoid small file creation. Hudi is uniquely designed to write +appropriately sized files automatically. This page will show you how Apache Hudi overcomes the dreaded small files problem. +There are two ways to manage small files in Hudi: + +- [Auto-size during writes](#auto-sizing-during-writes) +- [Clustering after writes](#auto-sizing-with-clustering) + +Below, we will describe the advantages and trade-offs of each. + +:::note +The bulk_insert write operation does not have auto-sizing capabilities during ingestion. +::: + +## Auto-sizing during writes + +You can manage file sizes through Hudi’s auto-sizing capability during ingestion. The default targeted file size for +Parquet base files is 120MB, which can be configured by `hoodie.parquet.max.file.size`.
Auto-sizing may add some write +latency, but it ensures that the queries are always efficient when a write transaction is committed. It’s important to +note that if you don’t manage file sizing as you write and, instead, try to run clustering to fix your file sizing +periodically, your queries might be slow until the point when the clustering finishes. This is supported for +**append** use cases only; **mutable** use cases are not supported at the moment. Please refer to the +[clustering documentation](https://hudi.apache.org/docs/clustering) for more details. + + + +If you need to control the file sizing, i.e., increase the target file size or change how small files are identified, +follow the instructions below for Copy-On-Write and Merge-On-Read tables. + +### File sizing for Copy-On-Write (COW) and Merge-On-Read (MOR) tables +To tune the file sizing for both COW and MOR tables, you can set the small file limit and the maximum Parquet file size. +Hudi will try to add enough records to a small file at write time to get it to the configured maximum limit. + + - For example, if `hoodie.parquet.small.file.limit=104857600` (100MB) and `hoodie.parquet.max.file.size=125829120` (120MB), + Hudi will pick all files < 100MB and try to get them up to 120MB. + +For creating a Hudi table initially, setting an accurate record size estimate is vital to ensure Hudi can adequately +estimate how many records need to be bin-packed in a Parquet file for the first ingestion batch. Then, Hudi automatically +uses the average record size for subsequent writes based on previous commits. + + +### More details about file sizing for Merge-On-Read(MOR) tables +As a MOR table aims to reduce the write amplification, compared to a COW table, when writing to a MOR table, Hudi limits +the number of Parquet base files to one for auto file sizing during insert and upsert operation. This limits the number +of rewritten files.
This can be configured through `hoodie.merge.small.file.group.candidates.limit`. + +For storage systems that support append operation, in addition to file sizing Parquet base files for a MOR table, you +can also tune the log files file-sizing with `hoodie.logfile.max.size`. + +MergeOnRead works differently for different INDEX choices so there are a few more configs to set: + +- Indexes with **canIndexLogFiles = true** : Inserts of new data go directly to log files. In this case, you can configure + the [maximum log size](https://hudi.apache.org/docs/configurations#hoodielogfilemaxsize) and a + [factor](https://hudi.apache.org/docs/configurations#hoodielogfiletoparquetcompressionratio) that denotes reduction + in size when data moves from avro to parquet files. +- Indexes with **canIndexLogFiles = false** : Inserts of new data go only to parquet files. In this case, the same configurations + as above for the COPY_ON_WRITE case apply. +**NOTE** : In either case, small files will be auto sized only if there is no PENDING compaction or associated log file + for that particular file slice. For example, for case 1: If you had a log file and a compaction C1 was scheduled to + convert that log file to parquet, no more inserts can go into that log file. For case 2: If you had a parquet file and + an update ended up creating an associated delta log file, no more inserts can go into that parquet file. Only after the + compaction has been performed and there are NO log files associated with the base parquet file, can new inserts be sent + to auto size that parquet file. + +### Configs +Here are the essential configurations for **COW tables**.
+ +**Spark based configs:** + +| Config Name | Default | Description | +|-----------------------------------------|----------------------|-------------| +| hoodie.parquet.small.file.limit | 104857600 (Optional) | During an insert and upsert operation, we opportunistically expand existing small files on storage, instead of writing new files, to keep number of files to an optimum. This config sets the file size limit below which a file on storage becomes a candidate to be selected as such a `small file`. By default, treat any file <= 100MB as a small file. Also note that if this is set <= 0, Hudi will not try to get small files and will directly write new files

`Config Param: PARQUET_SMALL_FILE_LIMIT` | +| hoodie.parquet.max.file.size | 125829120 (Optional) | Target size in bytes for parquet files produced by Hudi write phases. For DFS, this needs to be aligned with the underlying filesystem block size for optimal performance.

`Config Param: PARQUET_MAX_FILE_SIZE` | +| hoodie.copyonwrite.record.size.estimate | 1024 (Optional) | The average record size. If not explicitly specified, Hudi will compute the record size estimate dynamically based on commit metadata. This is critical in computing the insert parallelism and bin-packing inserts into small files.

`Config Param: COPY_ON_WRITE_RECORD_SIZE_ESTIMATE` | + +**Flink based configs:** + +| Config Name | Default | Description | +|-----------------------------------------|----------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| write.parquet.max.file.size | 120 (Optional) | Target size for parquet files produced by Hudi write phases. For DFS, this needs to be aligned with the underlying filesystem block size for optimal performance.

`Config Param: WRITE_PARQUET_MAX_FILE_SIZE` | + +Here are the essential configurations for **MOR tables**: + +**Spark based configs:** + +| Config Name | Default | Description | +|------------------------------------------------|-----------------------|-------------| +| hoodie.parquet.small.file.limit | 104857600 (Optional) | During an insert and upsert operation, we opportunistically expand existing small files on storage, instead of writing new files, to keep number of files to an optimum. This config sets the file size limit below which a file on storage becomes a candidate to be selected as such a `small file`. By default, treat any file <= 100MB as a small file. Also note that if this is set <= 0, Hudi will not try to get small files and will directly write new files

`Config Param: PARQUET_SMALL_FILE_LIMIT` | +| hoodie.parquet.max.file.size | 125829120 (Optional) | Target size in bytes for parquet files produced by Hudi write phases. For DFS, this needs to be aligned with the underlying filesystem block size for optimal performance.

`Config Param: PARQUET_MAX_FILE_SIZE` | +| hoodie.merge.small.file.group.candidates.limit | 1 (Optional) | Limits number of file groups, whose base file satisfies small-file limit, to consider for appending records during upsert operation. Only applicable to MOR tables

`Config Param: MERGE_SMALL_FILE_GROUP_CANDIDATES_LIMIT` | +| hoodie.logfile.max.size | 1073741824 (Optional) | LogFile max size in bytes. This is the maximum size allowed for a log file before it is rolled over to the next version. This log rollover limit only works on storage systems that support append operation. Please note that on cloud storage like S3/GCS, this may not be respected

`Config Param: LOGFILE_MAX_SIZE` | +| hoodie.logfile.to.parquet.compression.ratio | 0.35 (Optional) | Expected additional compression as records move from log files to parquet. Used for merge_on_read table to send inserts into log files & control the size of compacted parquet file.

`Config Param: LOGFILE_TO_PARQUET_COMPRESSION_RATIO_FRACTION` | + + +**Flink based configs:** + +| Config Name | Default | Description | +|-----------------------------------------|----------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| write.parquet.max.file.size | 120 (Optional) | Target size for parquet files produced by Hudi write phases. For DFS, this needs to be aligned with the underlying filesystem block size for optimal performance.

`Config Param: WRITE_PARQUET_MAX_FILE_SIZE` | +| write.log.max.size | 1024 (Optional) | Maximum size allowed in MB for a log file before it is rolled over to the next version, default 1GB

`Config Param: WRITE_LOG_MAX_SIZE` | + + +## Auto-Sizing With Clustering +Clustering is a service that allows you to combine small files into larger ones while at the same time (optionally) changing +the data layout by sorting or applying [space-filling curves](https://hudi.apache.org/blog/2021/12/29/hudi-zorder-and-hilbert-space-filling-curves/) +like Z-order or Hilbert curve. We won’t go into all the details about clustering here, but please refer to the +[clustering section](https://hudi.apache.org/docs/clustering) for more details. + +Clustering is one way to achieve file sizing, so you can have faster queries. When you ingest data, you may still have a +lot of small files (depending on your configurations and the data size from ingestion i.e., input batch). In this case, +you will want to cluster all the small files to larger files to improve query performance. Clustering can be performed +in different ways. Please check out the [clustering documentation](https://hudi.apache.org/docs/clustering) for more details. + +An example where clustering might be very useful is when a user has a Hudi table with many small files. For example, if +you're using BULK_INSERT without any sort modes, or you want a different file layout, you can use the clustering service +to fix all the file sizes without ingesting any new data. + +:::note +Clustering in Hudi is not a blocking operation, and writes can continue concurrently as long as no files need to be +updated while the clustering service is running. The writes will fail if there are updates to the data being clustered +while the clustering service runs. +::: + +:::note +Hudi always creates immutable files on storage. To be able to do auto-sizing or clustering, Hudi will always create a +newer version of the smaller file, resulting in 2 versions of the same file. The [cleaner service](cleaning) +will later kick in and delete the older version small file and keep the latest one. 
+::: + +Here are the critical file sizing configurations: + +### Configs + +**Spark based configs:** + +| Config Name | Default | Description | +|-------------------------------------------------------|----------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| hoodie.clustering.plan.strategy.small.file.limit | 314572800 (Optional) | Files smaller than the size in bytes specified here are candidates for clustering

`Config Param: PLAN_STRATEGY_SMALL_FILE_LIMIT`
`Since Version: 0.7.0` | +| hoodie.clustering.plan.strategy.target.file.max.bytes | 1073741824 (Optional) | Each group can produce 'N' (CLUSTERING_MAX_GROUP_SIZE/CLUSTERING_TARGET_FILE_SIZE) output file groups

`Config Param: PLAN_STRATEGY_TARGET_FILE_MAX_BYTES`
`Since Version: 0.7.0` | + +**Flink based configs:** + +| Config Name | Default | Description | +|-------------------------------------------------|----------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| clustering.plan.strategy.small.file.limit | 600 (Optional) | Files smaller than the size specified here are candidates for clustering, default 600 MB

`Config Param: CLUSTERING_PLAN_STRATEGY_SMALL_FILE_LIMIT` | +| clustering.plan.strategy.target.file.max.bytes | 1073741824 (Optional)| Each group can produce 'N' (CLUSTERING_MAX_GROUP_SIZE/CLUSTERING_TARGET_FILE_SIZE) output file groups, default 1 GB

`Config Param: CLUSTERING_PLAN_STRATEGY_TARGET_FILE_MAX_BYTES` | + +## Related Resources +

Videos

+ +* [Mastering File Sizing in Hudi: Boosting Performance and Efficiency](https://www.youtube.com/watch?v=qg-2aYyvfts) +* ["How do I Ingest Extremely Small Files into Hudi Data lake with Glue Incremental data processing](https://www.youtube.com/watch?v=BvoLVeidd-0) \ No newline at end of file diff --git a/website/versioned_docs/version-1.0.0/flink-quick-start-guide.md b/website/versioned_docs/version-1.0.0/flink-quick-start-guide.md new file mode 100644 index 0000000000000..1cfda067c71c5 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/flink-quick-start-guide.md @@ -0,0 +1,474 @@ +--- +title: "Flink Quick Start" +toc: true +last_modified_at: 2023-08-16T12:53:57+08:00 +--- +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +This page introduces Flink-Hudi integration. We can feel the unique charm of how Flink brings in the power of streaming into Hudi. + +## Setup + + +### Flink Support Matrix + + +| Hudi | Supported Flink version | +|:-------|:-----------------------------------------------------------------------| +| 1.0.x | 1.14.x, 1.15.x, 1.16.x, 1.17.x, 1.18.x, 1.19.x, 1.20.x (default build) | +| 0.15.x | 1.14.x, 1.15.x, 1.16.x, 1.17.x, 1.18.x | +| 0.14.x | 1.13.x, 1.14.x, 1.15.x, 1.16.x, 1.17.x | +| 0.13.x | 1.13.x, 1.14.x, 1.15.x, 1.16.x | +| 0.12.x | 1.13.x, 1.14.x, 1.15.x | +| 0.11.x | 1.13.x, 1.14.x | + + +### Download Flink and Start Flink cluster + +Hudi works with Flink 1.13 (up to Hudi 0.14.x release), Flink 1.14, Flink 1.15, Flink 1.16, Flink 1.17, and Flink 1.18. +You can follow the instructions [here](https://flink.apache.org/downloads) for setting up Flink. Then, start a standalone Flink cluster +within hadoop environment. In case we are trying on local setup, then we could download hadoop binaries and set HADOOP_HOME. + +```bash +# HADOOP_HOME is your hadoop root directory after unpack the binary package. 
+export HADOOP_CLASSPATH=`$HADOOP_HOME/bin/hadoop classpath` + +# Start the Flink standalone cluster +./bin/start-cluster.sh +``` +
+

Please note the following:

+
    +
  • We suggest hadoop 2.9.x+ version because some of the object storage has filesystem implementation only after that
  • +
  • The flink-parquet and flink-avro formats are already packaged into the hudi-flink-bundle jar
  • +
+
+ + + + +We use the [Flink Sql Client](https://ci.apache.org/projects/flink/flink-docs-release-1.13/docs/dev/table/sqlclient/) because it's a good +quick start tool for SQL users. + +### Start Flink SQL client + +Hudi supports packaged bundle jar for Flink, which should be loaded in the Flink SQL Client when it starts up. +You can build the jar manually under path `hudi-source-dir/packaging/hudi-flink-bundle`(see [Build Flink Bundle Jar](/docs/syncing_metastore#install)), or download it from the +[Apache Official Repository](https://repo.maven.apache.org/maven2/org/apache/hudi/). + +Now start the SQL CLI: + +```bash +# For Flink versions: 1.13 - 1.18 +export FLINK_VERSION=1.17 +export HUDI_VERSION=0.15.0 +wget https://repo1.maven.org/maven2/org/apache/hudi/hudi-flink${FLINK_VERSION}-bundle/${HUDI_VERSION}/hudi-flink${FLINK_VERSION}-bundle-${HUDI_VERSION}.jar -P $FLINK_HOME/lib/ +./bin/sql-client.sh embedded -j lib/hudi-flink${FLINK_VERSION}-bundle-${HUDI_VERSION}.jar shell +``` + +Setup table name, base path and operate using SQL for this guide. +The SQL CLI only executes the SQL line by line. + + + + +Hudi works with Flink 1.13 (up to Hudi 0.14.x release), Flink 1.14, Flink 1.15, Flink 1.16, Flink 1.17, and Flink 1.18. +Please add the desired dependency to your project: +```xml + + + 1.17.0 + 1.17 + 0.15.0 + + + org.apache.hudi + hudi-flink${flink.binary.version}-bundle + ${hudi.version} + +``` + + + + + +## Create Table + +First, let's create a Hudi table. Here, we use a partitioned table for illustration, but Hudi also supports non-partitioned tables. + + + + + +Here is an example of creating a flink Hudi table. 
+ +```sql +-- sets up the result mode to tableau to show the results directly in the CLI +set sql-client.execution.result-mode = tableau; +DROP TABLE hudi_table; +CREATE TABLE hudi_table( + ts BIGINT, + uuid VARCHAR(40) PRIMARY KEY NOT ENFORCED, + rider VARCHAR(20), + driver VARCHAR(20), + fare DOUBLE, + city VARCHAR(20) +) +PARTITIONED BY (`city`) +WITH ( + 'connector' = 'hudi', + 'path' = 'file:///tmp/hudi_table', + 'table.type' = 'MERGE_ON_READ' +); +``` + + + + + +```java +// Java +// First commit will auto-initialize the table, if it did not exist in the specified base path. +``` + + + + + + +## Insert Data + + + + + +Insert data into the Hudi table using SQL `VALUES`. + +```sql +-- insert data using values +INSERT INTO hudi_table +VALUES +(1695159649087,'334e26e9-8355-45cc-97c6-c31daf0df330','rider-A','driver-K',19.10,'san_francisco'), +(1695091554788,'e96c4396-3fad-413a-a942-4cb36106d721','rider-C','driver-M',27.70 ,'san_francisco'), +(1695046462179,'9909a8b1-2d15-4d3d-8ec9-efc48c536a00','rider-D','driver-L',33.90 ,'san_francisco'), +(1695332066204,'1dced545-862b-4ceb-8b43-d2a568f6616b','rider-E','driver-O',93.50,'san_francisco'), +(1695516137016,'e3cf430c-889d-4015-bc98-59bdce1e530c','rider-F','driver-P',34.15,'sao_paulo'), +(1695376420876,'7a84095f-737f-40bc-b62f-6b69664712d2','rider-G','driver-Q',43.40 ,'sao_paulo'), +(1695173887231,'3eeb61f7-c2b0-4636-99bd-5d7a5a1d2c04','rider-I','driver-S',41.06 ,'chennai'), +(1695115999911,'c8abbe79-8d89-47ea-b4ce-4d224bae5bfa','rider-J','driver-T',17.85,'chennai'); +``` + + + + +Add some streaming source to flink and load the data in hudi table. Since, this is the first write, it will also auto-create the table. 
+ +```java +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.data.RowData; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.util.HoodiePipeline; + +StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); +String targetTable = "hudi_table"; +String basePath = "file:///tmp/hudi_table"; + +Map options = new HashMap<>(); +options.put("path", basePath); +options.put("table.type", HoodieTableType.MERGE_ON_READ.name()); +options.put("precombine.field", "ts"); + +DataStream dataStream = env.addSource(...); +HoodiePipeline.Builder builder = HoodiePipeline.builder(targetTable) + .column("uuid VARCHAR(20)") + .column("name VARCHAR(10)") + .column("age INT") + .column("ts TIMESTAMP(3)") + .column("`partition` VARCHAR(20)") + .pk("uuid") + .partition("partition") + .options(options); + +builder.sink(dataStream, false); // The second parameter indicating whether the input data stream is bounded +env.execute("Api_Sink"); +``` +Refer Full Quickstart Example [here](https://github.com/ad1happy2go/hudi-examples/blob/main/flink/src/main/java/com/hudi/flink/quickstart/HudiDataStreamWriter.java) + + + + + +## Query Data + + + + + +```sql +-- query from the Hudi table +select * from hudi_table; +``` + + + + +```java +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.data.RowData; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.util.HoodiePipeline; + +StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); +String targetTable = "hudi_table"; +String basePath = "file:///tmp/hudi_table"; + +Map options = new HashMap<>(); 
+options.put("path", basePath); +options.put("table.type", HoodieTableType.MERGE_ON_READ.name()); +options.put("read.streaming.enabled", "true"); // this option enable the streaming read +options.put("read.start-commit", "20210316134557"); // specifies the start commit instant time + +HoodiePipeline.Builder builder = HoodiePipeline.builder(targetTable) + .column("uuid VARCHAR(20)") + .column("name VARCHAR(10)") + .column("age INT") + .column("ts TIMESTAMP(3)") + .column("`partition` VARCHAR(20)") + .pk("uuid") + .partition("partition") + .options(options); + +DataStream rowDataDataStream = builder.source(env); +rowDataDataStream.print(); +env.execute("Api_Source"); +``` +Refer Full Streaming Reader Example [here](https://github.com/ad1happy2go/hudi-examples/blob/main/flink/src/main/java/com/hudi/flink/quickstart/HudiDataStreamReader.java) + + + + + +This statement queries snapshot view of the dataset. +Refer to [Table types and queries](/docs/concepts#table-types--queries) for more info on all table types and query types supported. + +## Update Data + +This is similar to inserting new data. + + + + + +Hudi tables can be updated by either inserting records with same primary key or using a standard UPDATE statement shown as below. + +```sql +-- Update Queries only works with batch execution mode +SET 'execution.runtime-mode' = 'batch'; +UPDATE hudi_table SET fare = 25.0 WHERE uuid = '334e26e9-8355-45cc-97c6-c31daf0df330'; +``` + +:::note +The `UPDATE` statement is supported since Flink 1.17, so only Hudi Flink bundle compiled with Flink 1.17+ supplies this functionality. +Only **batch** queries on Hudi table with primary key work correctly. +::: + + + + +Add some streaming source to flink and load the data in hudi table using DataStream API as [above](#insert-data). +When new rows with the same primary key arrive in the stream, they will be updated. +In the insert example incoming row with same record id will be updated.
+ +Refer Update Example [here](https://github.com/ad1happy2go/hudi-examples/blob/main/flink/src/main/java/com/hudi/flink/quickstart/HudiDataStreamWriter.java) + + + + + +[Querying](#query-data) the data again will now show updated records. Each write operation generates a new [commit](/docs/concepts) +denoted by the timestamp. + + +## Delete Data {#deletes} + + + + +### Row-level Delete + +When consuming data in streaming query, Hudi Flink source can also accept the change logs from the upstream data source if the `RowKind` is set up per-row. +It can then apply the UPDATE and DELETE in row level. You can then sync a NEAR-REAL-TIME snapshot on Hudi for all kinds +of RDBMS. + +### Batch Delete + +```sql +-- delete all the records with age greater than 23 +-- NOTE: only works for batch sql queries +SET 'execution.runtime-mode' = 'batch'; +DELETE FROM t1 WHERE age > 23; +``` + +:::note +The `DELETE` statement is supported since Flink 1.17, so only Hudi Flink bundle compiled with Flink 1.17+ supplies this functionality. +Only **batch** queries on Hudi table with primary key work correctly. +::: + + + + + +Create a Flink Hudi table first and insert data into the Hudi table using DataStream API as below. +When new rows with the same primary key and Row Kind as Delete arrive in the stream, they will be deleted. + +Refer Delete Example [here](https://github.com/ad1happy2go/hudi-examples/blob/main/flink/src/main/java/com/hudi/flink/quickstart/HudiDataStreamWriter.java) + + + + + +## Streaming Query + +Hudi Flink also provides capability to obtain a stream of records that changed since given commit timestamp. +This can be achieved using Hudi's streaming querying and providing a start time from which changes need to be streamed. +We do not need to specify endTime, if we want all changes after the given commit (as is the common case).
+ +```sql +CREATE TABLE t1( + uuid VARCHAR(20) PRIMARY KEY NOT ENFORCED, + name VARCHAR(10), + age INT, + ts TIMESTAMP(3), + `partition` VARCHAR(20) +) +PARTITIONED BY (`partition`) +WITH ( + 'connector' = 'hudi', + 'path' = '${path}', + 'table.type' = 'MERGE_ON_READ', + 'read.streaming.enabled' = 'true', -- this option enable the streaming read + 'read.start-commit' = '20210316134557', -- specifies the start commit instant time + 'read.streaming.check-interval' = '4' -- specifies the check interval for finding new source commits, default 60s. +); + +-- Then query the table in stream mode +select * from t1; +``` + +## Change Data Capture Query + +Hudi Flink also provides capability to obtain a stream of records with Change Data Capture. +CDC queries are useful for applications that need to obtain all the changes, along with before/after images of records. + +```sql +set sql-client.execution.result-mode = tableau; + +CREATE TABLE hudi_table( + ts BIGINT, + uuid VARCHAR(40) PRIMARY KEY NOT ENFORCED, + rider VARCHAR(20), + driver VARCHAR(20), + fare DOUBLE, + city VARCHAR(20) +) +PARTITIONED BY (`city`) +WITH ( + 'connector' = 'hudi', + 'path' = 'file:///tmp/hudi_table', + 'table.type' = 'COPY_ON_WRITE', + 'cdc.enabled' = 'true' -- this option enable the cdc log enabled +); +-- insert data using values +INSERT INTO hudi_table +VALUES +(1695159649087,'334e26e9-8355-45cc-97c6-c31daf0df330','rider-A','driver-K',19.10,'san_francisco'), +(1695091554788,'e96c4396-3fad-413a-a942-4cb36106d721','rider-C','driver-M',27.70 ,'san_francisco'), +(1695046462179,'9909a8b1-2d15-4d3d-8ec9-efc48c536a00','rider-D','driver-L',33.90 ,'san_francisco'), +(1695332066204,'1dced545-862b-4ceb-8b43-d2a568f6616b','rider-E','driver-O',93.50,'san_francisco'), +(1695516137016,'e3cf430c-889d-4015-bc98-59bdce1e530c','rider-F','driver-P',34.15,'sao_paulo'), +(1695376420876,'7a84095f-737f-40bc-b62f-6b69664712d2','rider-G','driver-Q',43.40 ,'sao_paulo'), 
+(1695173887231,'3eeb61f7-c2b0-4636-99bd-5d7a5a1d2c04','rider-I','driver-S',41.06 ,'chennai'), +(1695115999911,'c8abbe79-8d89-47ea-b4ce-4d224bae5bfa','rider-J','driver-T',17.85,'chennai'); +SET 'execution.runtime-mode' = 'batch'; +UPDATE hudi_table SET fare = 25.0 WHERE uuid = '334e26e9-8355-45cc-97c6-c31daf0df330'; +-- Query the table in stream mode in another shell to see change logs +SET 'execution.runtime-mode' = 'streaming'; +select * from hudi_table/*+ OPTIONS('read.streaming.enabled'='true')*/; +``` + +This will give all changes that happened after the `read.start-commit` commit. The unique thing about this +feature is that it now lets you author streaming pipelines on streaming or batch data source. + +## Where To Go From Here? +- **Quick Start** : Read [Quick Start](#quick-start) to get started quickly Flink sql client to write to(read from) Hudi. +- **Configuration** : For [Global Configuration](flink_tuning#global-configurations), sets up through `$FLINK_HOME/conf/flink-conf.yaml`. For per job configuration, sets up through [Table Option](flink_tuning#table-options). +- **Writing Data** : Flink supports different modes for writing, such as [CDC Ingestion](ingestion_flink#cdc-ingestion), [Bulk Insert](ingestion_flink#bulk-insert), [Index Bootstrap](ingestion_flink#index-bootstrap), [Changelog Mode](ingestion_flink#changelog-mode) and [Append Mode](ingestion_flink#append-mode). Flink also supports multiple streaming writers with [non-blocking concurrency control](sql_dml#non-blocking-concurrency-control-experimental). +- **Reading Data** : Flink supports different modes for reading, such as [Streaming Query](sql_queries#streaming-query) and [Incremental Query](/docs/sql_queries#incremental-query). +- **Tuning** : For write/read tasks, this guide gives some tuning suggestions, such as [Memory Optimization](flink_tuning#memory-optimization) and [Write Rate Limit](flink_tuning#write-rate-limit). 
+- **Optimization**: Offline compaction is supported [Offline Compaction](/docs/compaction#flink-offline-compaction). +- **Query Engines**: Besides Flink, many other engines are integrated: [Hive Query](/docs/syncing_metastore#flink-setup), [Presto Query](sql_queries#presto). +- **Catalog**: A Hudi specific catalog is supported: [Hudi Catalog](/docs/sql_ddl/#create-catalog). + +If you are relatively new to Apache Hudi, it is important to be familiar with a few core concepts: + - [Hudi Timeline](timeline) – How Hudi manages transactions and other table services + - [Hudi Storage Layout](storage_layouts) - How the files are laid out on storage + - [Hudi Table Types](table_types) – `COPY_ON_WRITE` and `MERGE_ON_READ` + - [Hudi Query Types](table_types#query-types) – Snapshot Queries, Incremental Queries, Read-Optimized Queries + +See more in the "Concepts" section of the docs. + +Take a look at recent [blog posts](/blog) that go in depth on certain topics or use cases. + +Hudi tables can be queried from query engines like Hive, Spark, Flink, Presto and much more. We have put together a +[demo video](https://www.youtube.com/watch?v=VhNgUsxdrD0) that show cases all of this on a docker based setup with all +dependent systems running locally. We recommend you replicate the same setup and run the demo yourself, by following +steps [here](/docs/docker_demo) to get a taste for it. Also, if you are looking for ways to migrate your existing data +to Hudi, refer to [migration guide](/docs/migration_guide). 
diff --git a/website/versioned_docs/version-1.0.0/flink_tuning.md b/website/versioned_docs/version-1.0.0/flink_tuning.md new file mode 100644 index 0000000000000..e3cc7f70735c7 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/flink_tuning.md @@ -0,0 +1,117 @@ +--- +title: Flink Tuning Guide +toc: true +--- + +## Global Configurations +When using Flink, you can set some global configurations in `$FLINK_HOME/conf/flink-conf.yaml` + +### Parallelism + +| Option Name | Default | Type | Description | +| ----------- | ------- | ------- | ------- | +| `taskmanager.numberOfTaskSlots` | `1` | `Integer` | The number of parallel operator or user function instances that a single TaskManager can run. We recommend setting this value > 4, and the actual value needs to be set according to the amount of data | +| `parallelism.default` | `1` | `Integer` | The default parallelism used when no parallelism is specified anywhere (default: 1). For example, If the value of [`write.bucket_assign.tasks`](#parallelism-1) is not set, this value will be used | + +### Memory + +| Option Name | Default | Type | Description | +| ----------- | ------- | ------- | ------- | +| `jobmanager.memory.process.size` | `(none)` | `MemorySize` | Total Process Memory size for the JobManager. This includes all the memory that a JobManager JVM process consumes, consisting of Total Flink Memory, JVM Metaspace, and JVM Overhead | +| `taskmanager.memory.task.heap.size` | `(none)` | `MemorySize` | Task Heap Memory size for TaskExecutors. This is the size of JVM heap memory reserved for write cache | +| `taskmanager.memory.managed.size` | `(none)` | `MemorySize` | Managed Memory size for TaskExecutors. This is the size of off-heap memory managed by the memory manager, reserved for sorting and RocksDB state backend. 
If you choose RocksDB as the state backend, you need to set this memory | + +### Checkpoint + +| Option Name | Default | Type | Description | +| ----------- | ------- | ------- | ------- | +| `execution.checkpointing.interval` | `(none)` | `Duration` | Setting this value as `execution.checkpointing.interval = 150000ms`, 150000ms = 2.5min. Configuring this parameter is equivalent to enabling the checkpoint | +| `state.backend` | `(none)` | `String` | The state backend to be used to store state. We recommend setting the state backend to `rocksdb` : `state.backend: rocksdb` | +| `state.backend.rocksdb.localdir` | `(none)` | `String` | The local directory (on the TaskManager) where RocksDB puts its files | +| `state.checkpoints.dir` | `(none)` | `String` | The default directory used for storing the data files and meta data of checkpoints in a Flink supported filesystem. The storage path must be accessible from all participating processes/nodes(i.e. all TaskManagers and JobManagers), like hdfs and oss path | +| `state.backend.incremental` | `false` | `Boolean` | Option whether the state backend should create incremental checkpoints, if possible. For an incremental checkpoint, only a diff from the previous checkpoint is stored, rather than the complete checkpoint state. If the state backend is set to `rocksdb`, we recommend turning this on | + +## Table Options + +Flink SQL jobs can be configured through options in the `WITH` clause. +The actual datasource level configs are listed below. + +### Memory + +:::note +When optimizing memory, we need to pay attention to the memory configuration +and the number of taskManagers, parallelism of write tasks (write.tasks : 4) first. After confirming that each write task is +allocated enough memory, we can try to set these memory options.
+::: + +| Option Name | Description | Default | Remarks | +| ----------- | ------- | ------- | ------- | +| `write.task.max.size` | Maximum memory in MB for a write task, when the threshold hits, it flushes the max size data bucket to avoid OOM. Default `1024MB` | `1024D` | The memory reserved for write buffer is `write.task.max.size` - `compaction.max_memory`. When total buffer of write tasks reach the threshold, the largest buffer in the memory will be flushed | +| `write.batch.size` | In order to improve the efficiency of writing, Flink write task will cache data in buffer according to the write bucket until the memory reaches the threshold. When reached threshold, the data buffer would be flushed out. Default `64MB` | `64D` | Recommend to use the default settings | +| `write.log_block.size` | The log writer of Hudi will not flush the data immediately after receiving data. The writer flush data to the disk in the unit of `LogBlock`. Before `LogBlock` reached threshold, records will be buffered in the writer in form of serialized bytes. Default `128MB` | `128` | Recommend to use the default settings | +| `write.merge.max_memory` | If write type is `COPY_ON_WRITE`, Hudi will merge the incremental data and base file data. The incremental data will be cached and spilled to disk. this threshold controls the max heap size that can be used. Default `100MB` | `100` | Recommend to use the default settings | +| `compaction.max_memory` | Same as `write.merge.max_memory`, but occurs during compaction. Default `100MB` | `100` | If it is online compaction, it can be turned up when resources are sufficient, such as setting as `1024MB` | + +### Parallelism + +| Option Name | Description | Default | Remarks | +| ----------- | ------- | ------- | ------- | +| `write.tasks` | The parallelism of writer tasks. Each write task writes 1 to `N` buckets in sequence. 
Default `4` | `4` | Increasing the parallelism has no effect on the number of small files | +| `write.bucket_assign.tasks` | The parallelism of bucket assigner operators. No default value, using Flink `parallelism.default` | [`parallelism.default`](#parallelism) | Increasing the parallelism also increases the number of buckets, thus the number of small files (small buckets) | +| `write.index_bootstrap.tasks` | The parallelism of index bootstrap. Increasing parallelism can speed up the efficiency of the bootstrap stage. The bootstrap stage will block checkpointing. Therefore, it is necessary to set more checkpoint failure tolerance times. Default using Flink `parallelism.default` | [`parallelism.default`](#parallelism) | It only takes effect when `index.bootstrap.enabled` is `true` | +| `read.tasks` | The parallelism of read operators (batch and stream). Default `4` | `4` | | +| `compaction.tasks` | The parallelism of online compaction. Default `4` | `4` | `Online compaction` will occupy the resources of the write task. It is recommended to use [`offline compaction`](/docs/compaction/#flink-offline-compaction) | + +### Compaction + +:::note +These are options only for `online compaction`. +::: + +:::note +Turn off online compaction by setting `compaction.async.enabled` = `false`, but we still recommend turning on `compaction.schedule.enabled` for the writing job. You can then execute the compaction plan by [`offline compaction`](#offline-compaction).
+::: + +| Option Name | Description | Default | Remarks | +| ----------- | ------- | ------- | ------- | +| `compaction.schedule.enabled` | Whether to generate compaction plan periodically | `true` | Recommend to turn it on, even if `compaction.async.enabled` = `false` | +| `compaction.async.enabled` | Async Compaction, enabled by default for MOR | `true` | Turn off `online compaction` by turning off this option | +| `compaction.trigger.strategy` | Strategy to trigger compaction | `num_commits` | Options are `num_commits`: trigger compaction when reaching N delta commits; `time_elapsed`: trigger compaction when time elapsed > N seconds since last compaction; `num_and_time`: trigger compaction when both `NUM_COMMITS` and `TIME_ELAPSED` are satisfied; `num_or_time`: trigger compaction when `NUM_COMMITS` or `TIME_ELAPSED` is satisfied. | +| `compaction.delta_commits` | Max delta commits needed to trigger compaction, default `5` commits | `5` | -- | +| `compaction.delta_seconds` | Max delta seconds time needed to trigger compaction, default `1` hour | `3600` | -- | +| `compaction.max_memory` | Max memory in MB for compaction spillable map, default `100MB` | `100` | If you have sufficient resources, we recommend adjusting it to `1024MB` | +| `compaction.target_io` | Target IO per compaction (both read and write), default `500GB`| `512000` | -- | + +## Memory Optimization + +### MOR + +1. [Setting Flink state backend to `rocksdb`](#checkpoint) (the default `in memory` state backend is very memory intensive). +2. If there is enough memory, `compaction.max_memory` can be set larger (`100MB` by default, and can be adjusted to `1024MB`). +3. Pay attention to the memory allocated to each write task by taskManager to ensure that each write task can be allocated to the + desired memory size `write.task.max.size`. For example, taskManager has `4GB` of memory running two streamWriteFunction, so each write task + can be allocated with `2GB` memory.
Please reserve some buffers because the network buffer and other types of tasks on taskManager (such as bucketAssignFunction) will also consume memory. +4. Pay attention to the memory changes of compaction. `compaction.max_memory` controls the maximum memory that each task can use when compaction tasks read + logs. `compaction.tasks` controls the parallelism of compaction tasks. + +### COW + +1. [Setting Flink state backend to `rocksdb`](#checkpoint) (the default `in memory` state backend is very memory intensive). +2. Increase both `write.task.max.size` and `write.merge.max_memory` (`1024MB` and `100MB` by default, adjust to `2048MB` and `1024MB`). +3. Pay attention to the memory allocated to each write task by taskManager to ensure that each write task can be allocated to the + desired memory size `write.task.max.size`. For example, taskManager has `4GB` of memory running two write tasks, so each write task + can be allocated with `2GB` memory. Please reserve some buffers because the network buffer and other types of tasks on taskManager (such as `BucketAssignFunction`) will also consume memory. + + +## Write Rate Limit + +In the existing data synchronization, `snapshot data` and `incremental data` are sent to Kafka first, and then streamed +to Hudi by Flink. Direct consumption of `snapshot data` leads to problems such as high throughput and serious +disorder (writing partitions randomly), which in turn cause write performance degradation and throughput glitches. In this case, +the `write.rate.limit` option can be turned on to ensure smooth writing.
+ +### Options + +| Option Name | Required | Default | Remarks | +| ----------- | ------- | ------- | ------- | +| `write.rate.limit` | `false` | `0` | Turn off by default | \ No newline at end of file diff --git a/website/versioned_docs/version-1.0.0/gcp_bigquery.md b/website/versioned_docs/version-1.0.0/gcp_bigquery.md new file mode 100644 index 0000000000000..59f6e678f62d9 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/gcp_bigquery.md @@ -0,0 +1,102 @@ +--- +title: Google BigQuery +keywords: [ hudi, gcp, bigquery ] +summary: Introduce BigQuery integration in Hudi. +--- + +Hudi tables can be queried from [Google Cloud BigQuery](https://cloud.google.com/bigquery) as external tables. As of +now, the Hudi-BigQuery integration only works for hive-style partitioned Copy-On-Write and Read-Optimized Merge-On-Read tables. + +## Sync Modes +### Manifest File +As of version 0.14.0, the `BigQuerySyncTool` supports syncing table to BigQuery using [manifests](https://cloud.google.com/blog/products/data-analytics/bigquery-manifest-file-support-for-open-table-format-queries). On the first run, the tool will create a manifest file representing the current base files in the table and a table in BigQuery based on the provided configurations. The tool produces a new manifest file on each subsequent run and will update the schema of the table in BigQuery if the schema changes in your Hudi table. +#### Benefits of using the new manifest approach: +
  1. Only the files in the manifest can be scanned leading to less cost and better performance for your queries
+  2. The schema is now synced from the Hudi commit metadata allowing for proper schema evolution
+  3. Lists no longer have unnecessary nesting when querying in BigQuery as list inference is enabled by default
+  4. Partition column no longer needs to be dropped from the files due to new schema handling improvements
+ +To enable this feature, set `hoodie.gcp.bigquery.sync.use_bq_manifest_file` to true. + +### View Over Files (Legacy) +This is the current default behavior to preserve compatibility as users upgrade to 0.14.0 and beyond. +After run, the sync tool will create 2 tables and 1 view in the target dataset in BigQuery. The tables and the view +share the same name prefix, which is taken from the Hudi table name. Query the view for the same results as querying the +Copy-on-Write Hudi table. +**NOTE:** The view can scan all of the parquet files under your table's base path so it is recommended to upgrade to the manifest based approach for improved cost and performance. + +## Configurations + +Hudi uses `org.apache.hudi.gcp.bigquery.BigQuerySyncTool` to sync tables. It works with `HoodieStreamer` via +setting sync tool class. A few BigQuery-specific configurations are required. + +| Config | Notes | +|:---------------------------------------------|:----------------------------------------------------------------------------------------------------------------| +| `hoodie.gcp.bigquery.sync.project_id` | The target Google Cloud project | +| `hoodie.gcp.bigquery.sync.dataset_name` | BigQuery dataset name; create before running the sync tool | +| `hoodie.gcp.bigquery.sync.dataset_location` | Region info of the dataset; same as the GCS bucket that stores the Hudi table | +| `hoodie.gcp.bigquery.sync.source_uri` | A wildcard path pattern pointing to the first level partition; partition key can be specified or auto-inferred. Only required for partitioned tables | +| `hoodie.gcp.bigquery.sync.source_uri_prefix` | The common prefix of the `source_uri`, usually it's the path to the Hudi table, trailing slash does not matter. | +| `hoodie.gcp.bigquery.sync.base_path` | The usual basepath config for Hudi table. 
| +| `hoodie.gcp.bigquery.sync.use_bq_manifest_file` | Set to true to enable the manifest based sync | +| `hoodie.gcp.bigquery.sync.require_partition_filter` | Introduced in Hudi version 0.14.1, this configuration accepts a BOOLEAN value, with the default being false. When enabled (set to true), you must create a partition filter (a WHERE clause) for all queries, targeting the partitioning column of a partitioned table. Queries lacking such a filter will result in an error. | + + +Refer to `org.apache.hudi.gcp.bigquery.BigQuerySyncConfig` for the complete configuration list. +### Partition Handling +In addition to the BigQuery-specific configs, you will need to use hive style partitioning for partition pruning in BigQuery. On top of that, the value in partition path will be the value returned for that field in your query. For example if you partition on a time-millis field, `time`, with an output format of `time=yyyy-MM-dd`, the query will return `time` values with day level granularity instead of the original milliseconds so keep this in mind while setting up your tables. + +``` +hoodie.datasource.write.hive_style_partitioning = 'true' +``` + +For the view based sync you must also specify the following configurations: +``` +hoodie.datasource.write.drop.partition.columns = 'true' +hoodie.partition.metafile.use.base.format = 'true' +``` + +## Example + +Below shows an example for running `BigQuerySyncTool` with `HoodieStreamer`. + +```shell +spark-submit --master yarn \ +--packages com.google.cloud:google-cloud-bigquery:2.10.4 \ +--jars "/opt/hudi-gcp-bundle-0.13.0.jar,/opt/hudi-utilities-slim-bundle_2.12-1.0.0.jar,/opt/hudi-spark3.5-bundle_2.12-1.0.0.jar" \ +--class org.apache.hudi.utilities.streamer.HoodieStreamer \ +/opt/hudi-utilities-slim-bundle_2.12-1.0.0.jar \ +--target-base-path gs://my-hoodie-table/path \ +--target-table mytable \ +--table-type COPY_ON_WRITE \ +--base-file-format PARQUET \ +# ... 
other Hudi Streamer options +--enable-sync \ +--sync-tool-classes org.apache.hudi.gcp.bigquery.BigQuerySyncTool \ +--hoodie-conf hoodie.streamer.source.dfs.root=gs://my-source-data/path \ +--hoodie-conf hoodie.gcp.bigquery.sync.project_id=hudi-bq \ +--hoodie-conf hoodie.gcp.bigquery.sync.dataset_name=rxusandbox \ +--hoodie-conf hoodie.gcp.bigquery.sync.dataset_location=asia-southeast1 \ +--hoodie-conf hoodie.gcp.bigquery.sync.table_name=mytable \ +--hoodie-conf hoodie.gcp.bigquery.sync.base_path=gs://rxusandbox/testcases/stocks/data/target/${NOW} \ +--hoodie-conf hoodie.gcp.bigquery.sync.partition_fields=year,month,day \ +--hoodie-conf hoodie.gcp.bigquery.sync.source_uri=gs://my-hoodie-table/path/year=* \ +--hoodie-conf hoodie.gcp.bigquery.sync.source_uri_prefix=gs://my-hoodie-table/path/ \ +--hoodie-conf hoodie.gcp.bigquery.sync.use_file_listing_from_metadata=true \ +--hoodie-conf hoodie.gcp.bigquery.sync.assume_date_partitioning=false \ +--hoodie-conf hoodie.datasource.hive_sync.mode=jdbc \ +--hoodie-conf hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://localhost:10000 \ +--hoodie-conf hoodie.datasource.hive_sync.skip_ro_suffix=true \ +--hoodie-conf hoodie.datasource.hive_sync.ignore_exceptions=false \ +--hoodie-conf hoodie.datasource.hive_sync.database=mydataset \ +--hoodie-conf hoodie.datasource.hive_sync.table=mytable \ +--hoodie-conf hoodie.datasource.write.recordkey.field=mykey \ +--hoodie-conf hoodie.datasource.write.partitionpath.field=year,month,day \ +--hoodie-conf hoodie.datasource.write.precombine.field=ts \ +--hoodie-conf hoodie.datasource.write.keygenerator.type=COMPLEX \ +--hoodie-conf hoodie.datasource.write.hive_style_partitioning=true \ +--hoodie-conf hoodie.datasource.write.drop.partition.columns=true \ +--hoodie-conf hoodie.partition.metafile.use.base.format=true \ +``` diff --git a/website/versioned_docs/version-1.0.0/gcs_hoodie.md b/website/versioned_docs/version-1.0.0/gcs_hoodie.md new file mode 100644 index 0000000000000..f0171aff16b5c --- 
/dev/null +++ b/website/versioned_docs/version-1.0.0/gcs_hoodie.md @@ -0,0 +1,60 @@ +--- +title: Google Cloud +keywords: [ hudi, hive, google cloud, storage, spark, presto] +summary: In this page, we go over how to configure hudi with Google Cloud Storage. +last_modified_at: 2019-12-30T15:59:57-04:00 +--- +For Hudi storage on GCS, **regional** buckets provide a DFS API with strong consistency. + +## GCS Configs + +There are two configurations required for Hudi GCS compatibility: + +- Adding GCS Credentials for Hudi +- Adding required jars to classpath + +### GCS Credentials + +Add the required configs in your core-site.xml from where Hudi can fetch them. Replace the `fs.defaultFS` with your GCS bucket name and Hudi should be able to read/write from the bucket. + +```xml + <property> + <name>fs.defaultFS</name> + <value>gs://hudi-bucket</value> + </property> + + <property> + <name>fs.gs.impl</name> + <value>com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem</value> + <description>The FileSystem for gs: (GCS) uris.</description> + </property> + + <property> + <name>fs.AbstractFileSystem.gs.impl</name> + <value>com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS</value> + <description>The AbstractFileSystem for gs: (GCS) uris.</description>
+ </property> + + <property> + <name>fs.gs.project.id</name> + <value>GCS_PROJECT_ID</value> + </property> + <property> + <name>google.cloud.auth.service.account.enable</name> + <value>true</value> + </property> + <property> + <name>google.cloud.auth.service.account.email</name> + <value>GCS_SERVICE_ACCOUNT_EMAIL</value> + </property> + <property> + <name>google.cloud.auth.service.account.keyfile</name> + <value>GCS_SERVICE_ACCOUNT_KEYFILE</value> + </property> +``` + +### GCS Libs + +GCS hadoop libraries to add to our classpath + +- com.google.cloud.bigdataoss:gcs-connector:1.6.0-hadoop2 diff --git a/website/versioned_docs/version-1.0.0/hoodie_streaming_ingestion.md b/website/versioned_docs/version-1.0.0/hoodie_streaming_ingestion.md new file mode 100644 index 0000000000000..60586cbfc469c --- /dev/null +++ b/website/versioned_docs/version-1.0.0/hoodie_streaming_ingestion.md @@ -0,0 +1,638 @@ +--- +title: Using Spark +keywords: [hudi, streamer, hoodiestreamer, spark_streaming] +--- +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +## Hudi Streamer + +The `HoodieStreamer` utility (part of `hudi-utilities-slim-bundle` and `hudi-utilities-bundle`) provides ways to ingest +from different sources such as DFS or Kafka, with the following capabilities. + +- Exactly once ingestion of new events from + Kafka, [incremental imports](https://sqoop.apache.org/docs/1.4.2/SqoopUserGuide#_incremental_imports) from Sqoop or + output of `HiveIncrementalPuller` or files under a DFS folder +- Support json, avro or custom record types for the incoming data +- Manage checkpoints, rollback & recovery +- Leverage Avro schemas from DFS or Confluent [schema registry](https://github.com/confluentinc/schema-registry). +- Support for plugging in transformations + +:::danger Important + +The following classes were renamed and relocated to `org.apache.hudi.utilities.streamer` package.
+- `DeltastreamerMultiWriterCkptUpdateFunc` is renamed to `StreamerMultiWriterCkptUpdateFunc` +- `DeltaSync` is renamed to `StreamSync` +- `HoodieDeltaStreamer` is renamed to `HoodieStreamer` +- `HoodieDeltaStreamerMetrics` is renamed to `HoodieStreamerMetrics` +- `HoodieMultiTableDeltaStreamer` is renamed to `HoodieMultiTableStreamer` + +To maintain backward compatibility, the original classes are still present in the `org.apache.hudi.utilities.deltastreamer` +package, but have been deprecated. + +::: + +### Options +
+ + +Expand this to see HoodieStreamer's "--help" output describing its capabilities in more detail. + + +```shell +[hoodie]$ spark-submit \ + --packages org.apache.hudi:hudi-utilities-slim-bundle_2.12:1.0.0,org.apache.hudi:hudi-spark3.5-bundle_2.12:1.0.0 \ + --class org.apache.hudi.utilities.streamer.HoodieStreamer `ls packaging/hudi-utilities-slim-bundle/target/hudi-utilities-slim-bundle-*.jar` --help +Usage: <main class>
[options] + Options: + --allow-commit-on-no-checkpoint-change + allow commits even if checkpoint has not changed before and after fetch + datafrom source. This might be useful in sources like SqlSource where + there is not checkpoint. And is not recommended to enable in continuous + mode. + Default: false + --base-file-format + File format for the base files. PARQUET (or) HFILE + Default: PARQUET + --bootstrap-index-class + subclass of BootstrapIndex + Default: org.apache.hudi.common.bootstrap.index.HFileBootstrapIndex + --bootstrap-overwrite + Overwrite existing target table, default false + Default: false + --checkpoint + Resume Hudi Streamer from this checkpoint. + --cluster-scheduling-minshare + Minshare for clustering as defined in + https://spark.apache.org/docs/latest/job-scheduling.html + Default: 0 + --cluster-scheduling-weight + Scheduling weight for clustering as defined in + https://spark.apache.org/docs/latest/job-scheduling.html + Default: 1 + --commit-on-errors + Commit even when some records failed to be written + Default: false + --compact-scheduling-minshare + Minshare for compaction as defined in + https://spark.apache.org/docs/latest/job-scheduling.html + Default: 0 + --compact-scheduling-weight + Scheduling weight for compaction as defined in + https://spark.apache.org/docs/latest/job-scheduling.html + Default: 1 + --config-hot-update-strategy-class + Configuration hot update in continuous mode + Default: + --continuous + Hudi Streamer runs in continuous mode running source-fetch -> Transform + -> Hudi Write in loop + Default: false + --delta-sync-scheduling-minshare + Minshare for delta sync as defined in + https://spark.apache.org/docs/latest/job-scheduling.html + Default: 0 + --delta-sync-scheduling-weight + Scheduling weight for delta sync as defined in + https://spark.apache.org/docs/latest/job-scheduling.html + Default: 1 + --disable-compaction + Compaction is enabled for MoR table by default. 
This flag disables it + Default: false + --enable-hive-sync + Enable syncing to hive + Default: false + --enable-sync + Enable syncing meta + Default: false + --filter-dupes + Should duplicate records from source be dropped/filtered out before + insert/bulk-insert + Default: false + --force-empty-sync + Force syncing meta even on empty commit + Default: false + --help, -h + + --hoodie-conf + Any configuration that can be set in the properties file (using the CLI + parameter "--props") can also be passed command line using this + parameter. This can be repeated + Default: [] + --ingestion-metrics-class + Ingestion metrics class for reporting metrics during ingestion + lifecycles. + Default: org.apache.hudi.utilities.streamer.HoodieStreamerMetrics + --initial-checkpoint-provider + subclass of + org.apache.hudi.utilities.checkpointing.InitialCheckpointProvider. + Generate check point for Hudi Streamer for the first run. This field + will override the checkpoint of last commit using the checkpoint field. + Use this field only when switching source, for example, from DFS source + to Kafka Source. + --max-pending-clustering + Maximum number of outstanding inflight/requested clustering. Delta Sync + will not happen unlessoutstanding clustering is less than this number + Default: 5 + --max-pending-compactions + Maximum number of outstanding inflight/requested compactions. 
Delta Sync + will not happen unlessoutstanding compactions is less than this number + Default: 5 + --max-retry-count + the max retry count if --retry-on-source-failures is enabled + Default: 3 + --min-sync-interval-seconds + the min sync interval of each sync in continuous mode + Default: 0 + --op + Takes one of these values : UPSERT (default), INSERT, BULK_INSERT, + INSERT_OVERWRITE, INSERT_OVERWRITE_TABLE, DELETE_PARTITION + Default: UPSERT + Possible Values: [INSERT, INSERT_PREPPED, UPSERT, UPSERT_PREPPED, BULK_INSERT, BULK_INSERT_PREPPED, DELETE, DELETE_PREPPED, BOOTSTRAP, INSERT_OVERWRITE, CLUSTER, DELETE_PARTITION, INSERT_OVERWRITE_TABLE, COMPACT, INDEX, ALTER_SCHEMA, LOG_COMPACT, UNKNOWN] + --payload-class + subclass of HoodieRecordPayload, that works off a GenericRecord. + Implement your own, if you want to do something other than overwriting + existing value + Default: org.apache.hudi.common.model.OverwriteWithLatestAvroPayload + --post-write-termination-strategy-class + Post writer termination strategy class to gracefully shutdown + deltastreamer in continuous mode + Default: + --props + path to properties file on localfs or dfs, with configurations for + hoodie client, schema provider, key generator and data source. For + hoodie client props, sane defaults are used, but recommend use to + provide basic things like metrics endpoints, hive configs etc. For + sources, referto individual classes, for supported properties. + Properties in this file can be overridden by "--hoodie-conf" + Default: file:///Users/shiyanxu/src/test/resources/streamer-config/dfs-source.properties + --retry-interval-seconds + the retry interval for source failures if --retry-on-source-failures is + enabled + Default: 30 + --retry-last-pending-inline-clustering, -rc + Retry last pending inline clustering plan before writing to sink. + Default: false + --retry-last-pending-inline-compaction + Retry last pending inline compaction plan before writing to sink. 
+ Default: false + --retry-on-source-failures + Retry on any source failures + Default: false + --run-bootstrap + Run bootstrap if bootstrap index is not found + Default: false + --schemaprovider-class + subclass of org.apache.hudi.utilities.schema.SchemaProvider to attach + schemas to input & target table data, built in options: + org.apache.hudi.utilities.schema.FilebasedSchemaProvider.Source (See + org.apache.hudi.utilities.sources.Source) implementation can implement + their own SchemaProvider. For Sources that return Dataset, the + schema is obtained implicitly. However, this CLI option allows + overriding the schemaprovider returned by Source. + --source-class + Subclass of org.apache.hudi.utilities.sources to read data. Built-in + options: org.apache.hudi.utilities.sources.{JsonDFSSource (default), + AvroDFSSource, JsonKafkaSource, AvroKafkaSource, HiveIncrPullSource} + Default: org.apache.hudi.utilities.sources.JsonDFSSource + --source-limit + Maximum amount of data to read from source. Default: No limit, e.g: + DFS-Source => max bytes to read, Kafka-Source => max events to read + Default: 9223372036854775807 + --source-ordering-field + Field within source record to decide how to break ties between records + with same key in input data. Default: 'ts' holding unix timestamp of + record + Default: ts + --spark-master + spark master to use, if not defined inherits from your environment + taking into account Spark Configuration priority rules (e.g. not using + spark-submit command). + Default: + --sync-tool-classes + Meta sync client tool, using comma to separate multi tools + Default: org.apache.hudi.hive.HiveSyncTool + * --table-type + Type of table. COPY_ON_WRITE (or) MERGE_ON_READ + * --target-base-path + base path for the target hoodie table. (Will be created if did not exist + first time around. 
If exists, expected to be a hoodie table) + * --target-table + name of the target table + --transformer-class + A subclass or a list of subclasses of + org.apache.hudi.utilities.transform.Transformer. Allows transforming raw + source Dataset to a target Dataset (conforming to target schema) before + writing. Default : Not set. E.g. - + org.apache.hudi.utilities.transform.SqlQueryBasedTransformer (which + allows a SQL query templated to be passed as a transformation function). + Pass a comma-separated list of subclass names to chain the + transformations. If there are two or more transformers using the same + config keys and expect different values for those keys, then transformer + can include an identifier. E.g. - + tr1:org.apache.hudi.utilities.transform.SqlQueryBasedTransformer. Here + the identifier tr1 can be used along with property key like + `hoodie.streamer.transformer.sql.tr1` to identify properties related to + the transformer. So effective value for + `hoodie.streamer.transformer.sql` is determined by key + `hoodie.streamer.transformer.sql.tr1` for this transformer. If + identifier is used, it should be specified for all the transformers. + Further the order in which transformer is applied is determined by the + occurrence of transformer irrespective of the identifier used for the + transformer. For example: In the configured value below tr2:org.apache.hudi.utilities.transform.SqlQueryBasedTransformer,tr1:org.apache.hudi.utilities.transform.SqlQueryBasedTransformer + , tr2 is applied before tr1 based on order of occurrence. +``` +
+ +The tool takes a hierarchically composed property file and has pluggable interfaces for extracting data, key generation and providing schema. Sample configs for ingesting from kafka and dfs are +provided under `hudi-utilities/src/test/resources/streamer-config`. + +For example, once you have Confluent Kafka and Schema registry up & running, produce some test data using ([impressions.avro](https://docs.confluent.io/current/ksql/docs/tutorials/generate-custom-test-data) provided by schema-registry repo) + +```java +[confluent-5.0.0]$ bin/ksql-datagen schema=../impressions.avro format=avro topic=impressions key=impressionid +``` + +and then ingest it as follows. + +```java +[hoodie]$ spark-submit \ + --packages org.apache.hudi:hudi-utilities-slim-bundle_2.12:1.0.0,org.apache.hudi:hudi-spark3.5-bundle_2.12:1.0.0 \ + --class org.apache.hudi.utilities.streamer.HoodieStreamer `ls packaging/hudi-utilities-slim-bundle/target/hudi-utilities-slim-bundle-*.jar` \ + --props file://${PWD}/hudi-utilities/src/test/resources/streamer-config/kafka-source.properties \ + --schemaprovider-class org.apache.hudi.utilities.schema.SchemaRegistryProvider \ + --source-class org.apache.hudi.utilities.sources.AvroKafkaSource \ + --source-ordering-field impresssiontime \ + --target-base-path file:///tmp/hudi-streamer-op \ + --target-table uber.impressions \ + --op BULK_INSERT +``` + +In some cases, you may want to migrate your existing table into Hudi beforehand. Please refer to [migration guide](/docs/migration_guide). + +### Using `hudi-utilities-slim-bundle` bundle jar + +It is recommended to use `hudi-utilities-slim-bundle`, which should be used along with a Hudi Spark bundle +corresponding to the Spark version used to make utilities work with Spark, e.g., +`--packages org.apache.hudi:hudi-utilities-slim-bundle_2.12:1.0.0,org.apache.hudi:hudi-spark3.5-bundle_2.12:1.0.0`.
+ +### Concurrency Control + +Using optimistic concurrency control (OCC) via Hudi Streamer requires adding the configs below to the properties file that can be passed to the +job. + +```properties +hoodie.write.concurrency.mode=optimistic_concurrency_control +hoodie.write.lock.provider=<lock-provider-classname> +hoodie.cleaner.policy.failed.writes=LAZY +``` + +As an example, adding the configs to `kafka-source.properties` file and passing them to Hudi Streamer will enable OCC. +A Hudi Streamer job can then be triggered as follows: + +```java +[hoodie]$ spark-submit \ + --packages org.apache.hudi:hudi-utilities-slim-bundle_2.12:1.0.0,org.apache.hudi:hudi-spark3.5-bundle_2.12:1.0.0 \ + --class org.apache.hudi.utilities.streamer.HoodieStreamer `ls packaging/hudi-utilities-slim-bundle/target/hudi-utilities-slim-bundle-*.jar` \ + --props file://${PWD}/hudi-utilities/src/test/resources/streamer-config/kafka-source.properties \ + --schemaprovider-class org.apache.hudi.utilities.schema.SchemaRegistryProvider \ + --source-class org.apache.hudi.utilities.sources.AvroKafkaSource \ + --source-ordering-field impresssiontime \ + --target-base-path file:///tmp/hudi-streamer-op \ + --target-table uber.impressions \ + --op BULK_INSERT +``` + +Read more in depth about concurrency control in the [concurrency control concepts](/docs/concurrency_control) section. + +### Checkpointing + +`HoodieStreamer` uses checkpoints to keep track of what data has been read already so it can resume without needing to reprocess all data. +When using a Kafka source, the checkpoint is the [Kafka Offset](https://cwiki.apache.org/confluence/display/KAFKA/Offset+Management). +When using a DFS source, the checkpoint is the 'last modified' timestamp of the latest file read. +Checkpoints are saved in the .hoodie commit file as `streamer.checkpoint.key`.
+ +If you need to change the checkpoints for reprocessing or replaying data you can use the following options: + +- `--checkpoint` will set `streamer.checkpoint.reset_key` in the commit file to overwrite the current checkpoint. Format of checkpoint depends on [KAFKA_CHECKPOINT_TYPE](/docs/configurations#hoodiestreamersourcekafkacheckpointtype). By default (for type `string`), checkpoint should be provided as: `topicName,0:offset0,1:offset1,2:offset2`. For type `timestamp`, checkpoint should be provided as long value of desired timestamp. For type `single_offset`, we assume that topic consists of a single partition, so checkpoint should be provided as long value of desired offset. +- `--source-limit` will set a maximum amount of data to read from the source. For DFS sources, this is max # of bytes read. +For Kafka, this is the max # of events to read. + +### Transformers + +`HoodieStreamer` supports custom transformation on records before writing to storage. This is done by supplying +implementation of `org.apache.hudi.utilities.transform.Transformer` via `--transformer-class` option. + +#### SQL Query Transformer +You can pass a SQL Query to be executed during write. + +```scala +--transformer-class org.apache.hudi.utilities.transform.SqlQueryBasedTransformer +--hoodie-conf hoodie.streamer.transformer.sql=SELECT a.col1, a.col3, a.col4 FROM <SRC> a +``` + +#### SQL File Transformer +You can specify a File with a SQL script to be executed during write. The SQL file is configured with this hoodie property: +hoodie.streamer.transformer.sql.file + +The query should reference the source as a table named "\<SRC\>" + +The final sql statement result is used as the write payload. + +Example Spark SQL Query: +```sql +CACHE TABLE tmp_personal_trips AS +SELECT * FROM <SRC> WHERE trip_type='personal_trips'; + +SELECT * FROM tmp_personal_trips; +``` + +#### Flattening Transformer +This transformer can flatten nested objects.
It flattens the nested fields in the incoming records by prefixing +inner-fields with outer-field and _ in a nested fashion. Currently flattening of arrays is not supported. + +An example schema may look something like the below where name is a nested field of StructType in the original source +```scala +age as intColumn,address as stringColumn,name.first as name_first,name.last as name_last, name.middle as name_middle +``` + +Set the config as: +```scala +--transformer-class org.apache.hudi.utilities.transform.FlatteningTransformer +``` + +#### Chained Transformer +If you wish to use multiple transformers together, you can use the Chained transformer to pass multiple transformers to be executed sequentially. + +Example below first flattens the incoming records and then does sql projection based on the query specified: +```scala +--transformer-class org.apache.hudi.utilities.transform.FlatteningTransformer,org.apache.hudi.utilities.transform.SqlQueryBasedTransformer +--hoodie-conf hoodie.streamer.transformer.sql=SELECT a.col1, a.col3, a.col4 FROM <SRC> a +``` + +#### AWS DMS Transformer +This transformer is specific for AWS DMS data. It adds `Op` field with value `I` if the field is not present. + +Set the config as: +```scala +--transformer-class org.apache.hudi.utilities.transform.AWSDmsTransformer +``` + +#### Custom Transformer Implementation +You can write your own custom transformer by extending [this class](https://github.com/apache/hudi/tree/master/hudi-utilities/src/main/java/org/apache/hudi/utilities/transform) + +#### Related Resources + +* [Learn about Apache Hudi Transformers with Hands on Lab](https://www.youtube.com/watch?v=AprlZ8hGdJo) +* [Apache Hudi with DBT Hands on Lab.Transform Raw Hudi tables with DBT and Glue Interactive Session](https://youtu.be/DH3LEaPG6ss) + +### Schema Providers + +By default, Spark will infer the schema of the source and use that inferred schema when writing to a table.
If you need +to explicitly define the schema you can use one of the following Schema Providers below. + +#### Schema Registry Provider + +You can obtain the latest schema from an online registry. You pass a URL to the registry and if needed, you can also +pass userinfo and credentials in the url like: `https://foo:bar@schemaregistry.org` The credentials are then extracted +and are set on the request as an Authorization Header. + +When fetching schemas from a registry, you can specify both the source schema and the target schema separately. + +| Config | Description | Example | +|---------------------------------------------------|------------------------------------------------|------------------------------------| +| hoodie.streamer.schemaprovider.registry.url | The schema of the source you are reading from | https://foo:bar@schemaregistry.org | +| hoodie.streamer.schemaprovider.registry.targetUrl | The schema of the target you are writing to | https://foo:bar@schemaregistry.org | + +The above configs are passed to Hudi Streamer spark-submit command like: + +```shell +--hoodie-conf hoodie.streamer.schemaprovider.registry.url=https://foo:bar@schemaregistry.org +``` + +There are other optional configs to work with schema registry provider such as SSL-store related configs, and supporting +custom transformation of schema returned by schema registry, e.g., converting the original json schema to avro schema +via `org.apache.hudi.utilities.schema.converter.JsonToAvroSchemaConverter`. 
+ +| Config | Description | Example | +|---------------------------------------------------------|------------------------------------------------------|------------------------------------------------------------------------| +| hoodie.streamer.schemaprovider.registry.schemaconverter | The class name of the custom schema converter to use | `org.apache.hudi.utilities.schema.converter.JsonToAvroSchemaConverter` | +| schema.registry.ssl.keystore.location | SSL key store location | | +| schema.registry.ssl.keystore.password | SSL key store password | | +| schema.registry.ssl.truststore.location | SSL trust store location | | +| schema.registry.ssl.truststore.password | SSL trust store password | | +| schema.registry.ssl.key.password | SSL key password | | + +#### JDBC Schema Provider + +You can obtain the latest schema through a JDBC connection. + +| Config | Description | Example | +|------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------| +| hoodie.streamer.schemaprovider.source.schema.jdbc.connection.url | The JDBC URL to connect to. 
You can specify source specific connection properties in the URL | jdbc:postgresql://localhost/test?user=fred&password=secret | +| hoodie.streamer.schemaprovider.source.schema.jdbc.driver.type | The class name of the JDBC driver to use to connect to this URL | org.h2.Driver | +| hoodie.streamer.schemaprovider.source.schema.jdbc.username | username for the connection | fred | +| hoodie.streamer.schemaprovider.source.schema.jdbc.password | password for the connection | secret | +| hoodie.streamer.schemaprovider.source.schema.jdbc.dbtable | The table with the schema to reference | test_database.test1_table or test1_table | +| hoodie.streamer.schemaprovider.source.schema.jdbc.timeout | The number of seconds the driver will wait for a Statement object to execute. Zero means there is no limit. In the write path, this option depends on how JDBC drivers implement the API setQueryTimeout, e.g., the h2 JDBC driver checks the timeout of each query instead of an entire JDBC batch. It defaults to 0. | 0 | +| hoodie.streamer.schemaprovider.source.schema.jdbc.nullable | If true, all columns are nullable | true | + +The above configs are passed to Hudi Streamer spark-submit command like: +```--hoodie-conf hoodie.streamer.schemaprovider.source.schema.jdbc.connection.url=jdbc:postgresql://localhost/test?user=fred&password=secret``` + +#### File Based Schema Provider + +You can use a .avsc file to define your schema. You can then point to this file on DFS as a schema provider.
+ +| Config | Description | Example | +|---------------------------------------------------|-----------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------| +| hoodie.streamer.schemaprovider.source.schema.file | The schema of the source you are reading from | [example schema file](https://github.com/apache/hudi/blob/a8fb69656f522648233f0310ca3756188d954281/docker/demo/config/test-suite/source.avsc) | +| hoodie.streamer.schemaprovider.target.schema.file | The schema of the target you are writing to | [example schema file](https://github.com/apache/hudi/blob/a8fb69656f522648233f0310ca3756188d954281/docker/demo/config/test-suite/target.avsc) | + +#### Hive Schema Provider + +You can use hive tables to fetch source and target schema. + +| Config | Description | +|------------------------------------------------------------|--------------------------------------------------------| +| hoodie.streamer.schemaprovider.source.schema.hive.database | Hive database from where source schema can be fetched | +| hoodie.streamer.schemaprovider.source.schema.hive.table | Hive table from where source schema can be fetched | +| hoodie.streamer.schemaprovider.target.schema.hive.database | Hive database from where target schema can be fetched | +| hoodie.streamer.schemaprovider.target.schema.hive.table | Hive table from where target schema can be fetched | + + +#### Schema Provider with Post Processor +The SchemaProviderWithPostProcessor, will extract the schema from one of the previously mentioned Schema Providers and +then will apply a post processor to change the schema before it is used. You can write your own post processor by extending +this class: https://github.com/apache/hudi/blob/master/hudi-utilities/src/main/java/org/apache/hudi/utilities/schema/SchemaPostProcessor.java + +### Sources + +Hoodie Streamer can read data from a wide variety of sources. 
The following is a list of supported sources: + +#### Distributed File System (DFS) +See the storage configurations page to see some examples of DFS applications Hudi can read from. The following are the +supported file formats Hudi can read/write with on DFS Sources. (Note: you can still use Spark/Flink readers to read from +other formats and then write data as Hudi format.) + +- CSV +- AVRO +- JSON +- PARQUET +- ORC +- HUDI + +For DFS sources the following behaviors are expected: + +- For JSON DFS source, you always need to set a schema. If the target Hudi table follows the same schema as from the source file, you just need to set the source schema. If not, you need to set schemas for both source and target. +- `HoodieStreamer` reads the files under the source base path (`hoodie.streamer.source.dfs.root`) directly, and it won't use the partition paths under this base path as fields of the dataset. Detailed examples can be found [here](https://github.com/apache/hudi/issues/5485). + +#### Kafka +Hudi can read directly from Kafka clusters. See more details on `HoodieStreamer` to learn how to set up streaming +ingestion with exactly once semantics, checkpointing, and plugin transformations. The following formats are supported +when reading data from Kafka: + +- AVRO: `org.apache.hudi.utilities.sources.AvroKafkaSource` +- JSON: `org.apache.hudi.utilities.sources.JsonKafkaSource` +- Proto: `org.apache.hudi.utilities.sources.ProtoKafkaSource` + +Check out [Kafka source config](https://hudi.apache.org/docs/configurations#Kafka-Source-Configs) for more details. + +#### Pulsar + +`HoodieStreamer` also supports ingesting from Apache Pulsar via `org.apache.hudi.utilities.sources.PulsarSource`. +Check out [Pulsar source config](https://hudi.apache.org/docs/configurations#Pulsar-Source-Configs) for more details.
+ +#### Cloud storage event sources +AWS S3 storage provides an event notification service which will post notifications when certain events happen in your S3 bucket: +https://docs.aws.amazon.com/AmazonS3/latest/userguide/NotificationHowTo.html +AWS will put these events in a Simple Queue Service (SQS). Apache Hudi provides `S3EventsSource` +and `S3EventsHoodieIncrSource` that can read from SQS to trigger/processing of new or changed data as soon as it is +available on S3. Check out [S3 source configs](https://hudi.apache.org/docs/configurations#S3-Source-Configs) for more details. + +Similar to S3 event source, Google Cloud Storage (GCS) event source is also supported via `GcsEventsSource` and +`GcsEventsHoodieIncrSource`. Check out [GCS events source configs](https://hudi.apache.org/docs/configurations#GCS-Events-Source-Configs) for more details. + +##### AWS Setup +1. Enable S3 Event Notifications https://docs.aws.amazon.com/AmazonS3/latest/userguide/NotificationHowTo.html +2. Download the aws-java-sdk-sqs jar. +3. Find the queue URL and Region to set these configurations: + 1. hoodie.streamer.s3.source.queue.url=https://sqs.us-west-2.amazonaws.com/queue/url + 2. hoodie.streamer.s3.source.queue.region=us-west-2 +4. Start the `S3EventsSource` and `S3EventsHoodieIncrSource` using the `HoodieStreamer` utility as shown in sample commands below: + +Insert code sample from this blog: https://hudi.apache.org/blog/2021/08/23/s3-events-source/#configuration-and-setup + +#### JDBC Source +Hudi can read from a JDBC source with a full fetch of a table, or Hudi can even read incrementally with checkpointing from a JDBC source. 
+ +| Config | Description | Example | +|--------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------| +| hoodie.streamer.jdbc.url | URL of the JDBC connection | jdbc:postgresql://localhost/test | +| hoodie.streamer.jdbc.user | User to use for authentication of the JDBC connection | fred | +| hoodie.streamer.jdbc.password | Password to use for authentication of the JDBC connection | secret | +| hoodie.streamer.jdbc.password.file | If you prefer to use a password file for the connection | | +| hoodie.streamer.jdbc.driver.class | Driver class to use for the JDBC connection | | +| hoodie.streamer.jdbc.table.name | | my_table | +| hoodie.streamer.jdbc.table.incr.column.name | If run in incremental mode, this field will be used to pull new data incrementally | | +| hoodie.streamer.jdbc.incr.pull | Will the JDBC connection perform an incremental pull? | | +| hoodie.streamer.jdbc.extra.options. | How you pass extra configurations that would normally be specified as spark.read.option() | hoodie.streamer.jdbc.extra.options.fetchSize=100 hoodie.streamer.jdbc.extra.options.upperBound=1 hoodie.streamer.jdbc.extra.options.lowerBound=100 | +| hoodie.streamer.jdbc.storage.level | Used to control the persistence level | Default = MEMORY_AND_DISK_SER | +| hoodie.streamer.jdbc.incr.fallback.to.full.fetch | Boolean which if set true makes an incremental fetch fall back to a full fetch if there is any error in the incremental read | FALSE | + +#### SQL Sources + +SQL Source `org.apache.hudi.utilities.sources.SqlSource` reads from any table, used mainly for backfill jobs which will process specific partition dates.
+This won't update the streamer.checkpoint.key to the processed commit; instead, it will fetch the latest successful +checkpoint key and set that value as this backfill commit's checkpoint so that it won't interrupt the regular incremental +processing. To fetch and use the latest incremental checkpoint, you need to also set this hoodie_conf for Hudi Streamer +jobs: `hoodie.write.meta.key.prefixes = 'streamer.checkpoint.key'` + +Spark SQL should be configured using this hoodie config: +`hoodie.streamer.source.sql.sql.query = 'select * from source_table'` + +Using `org.apache.hudi.utilities.sources.SqlFileBasedSource` allows setting the SQL queries in a file to read from any +table. SQL file path should be configured using this hoodie config: +`hoodie.streamer.source.sql.file = 'hdfs://xxx/source.sql'` + +### Error Table + +`HoodieStreamer` supports segregating error records into a separate table called "Error table" alongside the +target data table. This allows easy integration with dead-letter queues (DLQ). Error Table is supported with +user-provided subclass of `org.apache.hudi.utilities.streamer.BaseErrorTableWriter` supplied via +config `hoodie.errortable.write.class`. Check out more in `org.apache.hudi.config.HoodieErrorTableConfig`. + +### Termination Strategy + +Users can configure a post-write termination strategy under `continuous` mode if need be. For instance, +users can configure graceful shutdown if there is no new data from the configured source for 5 consecutive times. +Here is the interface for the termination strategy. + +```java +/** + * Post write termination strategy for deltastreamer in continuous mode. + */ +public interface PostWriteTerminationStrategy { + + /** + * Returns whether HoodieStreamer needs to be shutdown. + * @param scheduledCompactionInstantAndWriteStatuses optional pair of scheduled compaction instant and write statuses. + * @return true if HoodieStreamer has to be shutdown. false otherwise.
+ */ + boolean shouldShutdown(Option<Pair<Option<String>, JavaRDD<WriteStatus>>> scheduledCompactionInstantAndWriteStatuses); + +} +``` + +Also, this might help in bootstrapping a new table. Instead of doing one bulk load or bulk_insert leveraging a large +cluster for a large input of data, one could start `HoodieStreamer` on the `continuous` mode and add a shutdown strategy +to terminate, once all data has been bootstrapped. This way, each batch could be smaller and may not need a large +cluster to bootstrap data. There is a concrete implementation provided out-of-the-box: [NoNewDataTerminationStrategy](https://github.com/apache/hudi/blob/0d0a4152cfd362185066519ae926ac4513c7a152/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/NoNewDataTerminationStrategy.java). +Users can feel free to implement their own strategy as they see fit. + +### Dynamic configuration updates + +When Hoodie Streamer is running in `continuous` mode, the properties can be refreshed/updated before each sync call. +Interested users can implement `org.apache.hudi.utilities.deltastreamer.ConfigurationHotUpdateStrategy` to leverage this. + +## MultiTableStreamer + +`HoodieMultiTableStreamer`, an extension of `HoodieStreamer`, facilitates the simultaneous ingestion of multiple tables into Hudi datasets. At present, it supports the sequential ingestion of tables and accommodates both COPY_ON_WRITE and MERGE_ON_READ storage types. The command line parameters for `HoodieMultiTableStreamer` largely mirror those of `HoodieStreamer`, with the notable difference being the necessity to supply table-specific configurations in separate files in a dedicated config folder. New command line options have been introduced to support this functionality: + +```java + * --config-folder + the path to the folder which contains all the table wise config files + --base-path-prefix + this is added to enable users to create all the hudi datasets for related tables under one path in FS.
The datasets are then created under the path - <base_path_prefix>/<database>/<table_to_be_ingested>. However you can override the paths for every table by setting the property hoodie.streamer.ingestion.targetBasePath +``` + +The following properties need to be set properly to ingest data using `HoodieMultiTableStreamer`. + +```java +hoodie.streamer.ingestion.tablesToBeIngested + comma separated names of tables to be ingested in the format <database>.<table>, for example db1.table1,db1.table2 +hoodie.streamer.ingestion.targetBasePath + if you wish to ingest a particular table in a separate path, you can mention that path here +hoodie.streamer.ingestion.<database>.<table>.configFile + path to the config file in dedicated config folder which contains table overridden properties for the particular table to be ingested. +``` + +Sample config files for table wise overridden properties can be found +under `hudi-utilities/src/test/resources/streamer-config`. The command to run `HoodieMultiTableStreamer` is also similar +to how you run `HoodieStreamer`. + +```java +[hoodie]$ spark-submit \ + --packages org.apache.hudi:hudi-utilities-slim-bundle_2.12:1.0.0,org.apache.hudi:hudi-spark3.5-bundle_2.12:1.0.0 \ + --class org.apache.hudi.utilities.streamer.HoodieMultiTableStreamer `ls packaging/hudi-utilities-slim-bundle/target/hudi-utilities-slim-bundle-*.jar` \ + --props file://${PWD}/hudi-utilities/src/test/resources/streamer-config/kafka-source.properties \ + --config-folder file://tmp/hudi-ingestion-config \ + --schemaprovider-class org.apache.hudi.utilities.schema.SchemaRegistryProvider \ + --source-class org.apache.hudi.utilities.sources.AvroKafkaSource \ + --source-ordering-field impresssiontime \ + --base-path-prefix file:\/\/\/tmp/hudi-streamer-op \ + --target-table uber.impressions \ + --op BULK_INSERT +``` + +For detailed information on how to configure and use `HoodieMultiTableStreamer`, please refer to the [blog section](/blog/2020/08/22/ingest-multiple-tables-using-hudi).
diff --git a/website/versioned_docs/version-1.0.0/hudi_stack.md b/website/versioned_docs/version-1.0.0/hudi_stack.md new file mode 100644 index 0000000000000..59517ede41dac --- /dev/null +++ b/website/versioned_docs/version-1.0.0/hudi_stack.md @@ -0,0 +1,174 @@ +--- +title: Apache Hudi Stack +summary: "Explains the various layers of software components that make up Hudi" +toc: true +toc_min_heading_level: 2 +toc_max_heading_level: 3 +last_modified_at: +--- + +Apache Hudi adds core warehouse and database functionality directly to a data lake (more recently known as the data lakehouse architecture) elevating it from a collection of +objects/files to well-managed tables. Hudi adds table abstraction over open file formats like Apache Parquet/ORC using a table format layer, that is optimized for frequent writes, +large-scale queries on a table snapshot as well as efficient incremental scans. To understand the Hudi stack, we can simply translate the components to the seminal paper +on "[Architecture of a Database System](https://dsf.berkeley.edu/papers/fntdb07-architecture.pdf)", with modernized names. + +On top of this foundation, Hudi adds [storage engine](https://en.wikipedia.org/wiki/Database_engine) functionality found in many databases ("transactional storage manager" in the paper), +enabling transactional capabilities such as concurrency control, indexing, change capture and updates/deletes. The storage engine also consists of essential table services +to manage/maintain the tables, that are tightly integrated with the underlying storage layer and executed automatically by upper-layer writers or platform components +like an independent table management service. + +Hudi then defines clear read/write APIs that help interact with the tables, from a variety of SQL engines and code written in many programming languages using their popular data +processing frameworks.
Hudi also comes with several platform services that help tune performance, operate tables, monitor tables, ingest data, import/export data, and more. + +Thus, all things considered, the Hudi stack expands beyond being just a 'table format' into a comprehensive and robust [data lakehouse](https://hudi.apache.org/blog/2024/07/11/what-is-a-data-lakehouse/) platform. In this section, +we will explore the Hudi stack and deconstruct the layers of software components that constitute Hudi. The features marked with an asterisk (*) represent work in progress, and +the dotted boxes indicate planned future work. These components collectively aim to fulfill the [vision](https://github.com/apache/hudi/blob/master/rfc/rfc-69/rfc-69.md) for the project. + +![Hudi Stack](/assets/images/hudi-stack-1-x.png) +

Figure: Apache Hudi Database Architecture

+ +## Lake Storage +The storage layer is where the data files/objects (such as Parquet) as well as all table format metadata are stored. Hudi interacts with the storage layer through Cloud native and [Hadoop FileSystem API](https://hadoop.apache.org/docs/stable/api/org/apache/hadoop/fs/FileSystem.html), enabling compatibility +with various systems including HDFS for fast appends, and various cloud stores such as Amazon S3, Google Cloud Storage (GCS), and Azure Blob Storage. Additionally, Hudi offers its own storage APIs that can rely on Hadoop-independent file system +implementation to simplify the integration of various file systems. Hudi adds a custom wrapper filesystem that lays out the foundation for improved storage optimizations. + +## File Formats +![File Format](/assets/images/blog/hudistack/file_format_2.png) +

Figure: File format structure in Hudi

+ +File formats hold the actual data and are physically stored on the lake storage. Hudi operates on logical structures of File Groups and File Slices, which consist of Base File and Log Files. +Log files store updates/deletes/inserts on top of records stored in base files, and periodically log files are compacted into a small set of log files (log compaction) or base files (compaction). +Future updates aim to integrate diverse formats like unstructured data (e.g., JSON, images), and compatibility with different storage layers in event-streaming, OLAP engines, and warehouses. +Hudi's layout scheme encodes all changes to a Log File as a sequence of blocks (data, delete, rollback). By making data available in open file formats (such as Parquet/Avro), Hudi enables users to +bring any compute engine for specific workloads. + +## Table Format +![Table Format](/assets/images/blog/hudistack/table_format_1.png) +

Figure: Apache Hudi's Table format

+ +Drawing an analogy to file formats, a table format is simply concerned with how files are distributed within the table, partitioning schemes, schema and metadata tracking changes. Hudi organizes files within a table or partition into +File Groups. Updates are captured in log files tied to these File Groups, ensuring efficient merges. There are three major components related to Hudi’s table format. + +- **Timeline** : Hudi's [timeline](./timeline), stored in the `/.hoodie/timeline` folder, is a crucial event log recording all table actions in an ordered manner, + with events kept for a specified period. Hudi uniquely designs each File Group as a self-contained log, enabling record state reconstruction through delta logs, even after archival of historical actions. This approach effectively limits metadata size based on table activity frequency, essential for managing tables with frequent updates. + +- **File Group and File Slice** : Within each partition the data is physically stored as base and Log Files and organized into logical concepts as [File groups](https://hudi.apache.org/tech-specs-1point0/#storage-layout) and +File Slices. File groups contain multiple versions of File Slices and are split into multiple File Slices. A File Slice comprises the Base and Log File. Each File Slice within +the file-group is uniquely identified by the write that created its base file or the first log file, which helps order the File Slices. + +- **Metadata Table** : Implemented as another merge-on-read Hudi table, the [metadata table](./metadata) efficiently handles quick updates with low write amplification. +It leverages an [SSTable](https://cassandra.apache.org/doc/stable/cassandra/architecture/storage_engine.html#sstables) based file format for quick, indexed key lookups, +storing vital information like file paths, column statistics and schema. This approach streamlines operations by reducing the necessity for expensive cloud file listings.
+ +Hudi’s approach of recording updates into Log Files is more efficient and involves lower merge overhead than systems like Hive ACID, where merging all delta records against +all Base Files is required. Read more about the various table types in Hudi [here](./table_types). + + +## Storage Engine +The storage layer of Hudi comprises the core components that are responsible for the fundamental operations and services that enable Hudi to store, retrieve, and manage data +efficiently on [data lakehouse](https://hudi.apache.org/blog/2024/07/11/what-is-a-data-lakehouse/) storages. This functionality is comparable to the role played by storage engines in popular databases like PostgreSQL, MySQL, MongoDB, +Cassandra and Clickhouse. + + +### Indexes +![Indexes](/assets/images/hudi-stack-indexes.png) +

Figure: Indexes in Hudi

+ +[Indexes](./indexes) in Hudi enhance query planning, minimizing I/O, speeding up response times and providing faster writes with low merge costs. The [metadata table](./metadata/#metadata-table-indices) acts +as an additional [indexing system](./metadata#supporting-multi-modal-index-in-hudi) and brings the benefits of indexes generally to both the readers and writers. Compute engines can leverage various indexes in the metadata +table, like file listings, column statistics, bloom filters, record-level indexes, and [expression indexes](https://github.com/apache/hudi/blob/master/rfc/rfc-63/rfc-63.md) to quickly generate optimized query plans and improve read +performance. In addition to the metadata table indexes, Hudi supports simple join based indexing, bloom filters stored in base file footers, external key-value stores like HBase, +and optimized storage techniques like bucketing, to efficiently locate File Groups containing specific record keys. Hudi also provides reader indexes such as [expression](https://github.com/apache/hudi/blob/master/rfc/rfc-63/rfc-63.md) and +secondary indexes to boost reads. The table partitioning scheme in Hudi is consciously exploited for implementing global and non-global indexing strategies that limit the scope of a record's +uniqueness to a given partition or apply it globally across all partitions. + +### Table Services +![Table Services](/assets/images/blog/hudistack/table_services_2.png) +

Figure: Table services in Hudi

+ +Apache Hudi offers various table services to help keep the table storage layout and metadata management performant. Hudi was designed with built-in table services that can be +run in inline, semi-asynchronous or full-asynchronous modes. Furthermore, Spark and Flink streaming writers can run in continuous mode, and invoke table services +asynchronously sharing the underlying executors intelligently with writers. Let’s take a look at these services. + +#### Clustering +The [clustering](./clustering) service, akin to features in cloud data warehouses, allows users to group frequently queried records using sort keys or merge smaller Base Files into +larger ones for optimal file size management. It's fully integrated with other timeline actions like cleaning and compaction, enabling smart optimizations such as avoiding +compaction for File Groups undergoing clustering, thereby saving on I/O. + +#### Compaction +Hudi's [compaction](./compaction) service, featuring strategies like date partitioning and I/O bounding, merges Base Files with delta logs to create updated Base Files. It allows +concurrent writes to the same File Group, enabled by Hudi's file grouping and flexible log merging. This facilitates non-blocking execution of deletes even during concurrent +record updates. + +#### Cleaning +The [Cleaner](http://hudi.apache.org/blog/2021/06/10/employing-right-configurations-for-hudi-cleaner) service works off the timeline incrementally, removing File Slices that are past the configured retention period for incremental queries, +while also allowing sufficient time for long running batch jobs (e.g., Hive ETLs) to finish running. This allows users to reclaim storage space, thereby saving on costs. + +#### Indexing +Hudi's scalable metadata table contains auxiliary data about the table. This subsystem encompasses various indices, including files, column_stats, and bloom_filters, +facilitating efficient record location and data skipping.
Balancing write throughput with index updates presents a fundamental challenge, as traditional indexing methods, +like locking out writes during indexing, are impractical for large tables due to lengthy processing times. Hudi addresses this with its innovative asynchronous [metadata indexing](./metadata_indexing), +enabling the creation of various indices without impeding writes. This approach not only improves write latency but also minimizes resource waste by reducing contention between writing and indexing activities. + +### Concurrency Control +[Concurrency control](./concurrency_control) defines how different writers/readers/table services coordinate access to the table. Hudi uses monotonically increasing time to sequence and order various +changes to table state. Much like databases, Hudi takes the approach of clearly differentiating between writers (responsible for upserts/deletes), table services +(focusing on storage optimization and bookkeeping), and readers (for query execution). Hudi provides snapshot isolation, offering a consistent view of the table across +these different operations. It employs lock-free, non-blocking MVCC for concurrency between writers and table-services, as well as between different table services, and +optimistic concurrency control (OCC) for multi-writers with early conflict detection. With [Hudi 1.0](https://github.com/apache/hudi/blob/master/rfc/rfc-69/rfc-69.md), non-blocking concurrency control ([NBCC](https://github.com/apache/hudi/blob/master/rfc/rfc-66/rfc-66.md)) +is introduced, allowing multiple writers to concurrently operate on the table with non-blocking conflict resolution. + +### Lake Cache* +![Lake Cache](/assets/images/blog/hudistack/lake_cache_3.png) +

Figure: Proposed Lake Cache in Hudi

+ +Data lakes today face a tradeoff between fast data writing and optimal query performance. Writing smaller files or logging deltas enhances writing speed, but superior query performance typically requires opening fewer files and pre-materializing merges. Most databases use a buffer pool to reduce storage access costs. Hudi’s design supports creating a multi-tenant caching tier that can store pre-merged File Slices. Hudi’s timeline can then be used to simply communicate caching policies. Traditionally, caching is near query engines or in-memory file systems. Integrating a [caching layer](https://issues.apache.org/jira/browse/HUDI-6489) with Hudi's transactional storage enables shared caching across query engines, supporting updates and deletions, and reducing costs. The goal is to build a buffer pool for lakes, compatible with all major engines, with the contributions from the rest of the community. + + +## Programming APIs + +### Writers +Hudi tables can be used as sinks for Spark/Flink pipelines and the Hudi writing path provides several enhanced capabilities over file writing done by vanilla parquet/avro sinks. It categorizes write operations into incremental (`insert`, `upsert`, `delete`) and batch/bulk (`insert_overwrite`, `delete_partition`, `bulk_insert`) with specific functionalities. `upsert` and `delete` efficiently merge records with identical keys and integrate with the file sizing mechanism, while `insert` operations smartly bypass certain steps like pre-combining, maintaining pipeline benefits. Similarly, `bulk_insert` operation offers control over file sizes for data imports. Batch operations integrate MVCC for seamless transitions between incremental and batch processing. Additionally, the write pipeline includes optimizations like handling large merges via rocksDB and concurrent I/O, enhancing write performance. 
+ +### Readers +Hudi provides snapshot isolation for writers and readers, enabling consistent table snapshot queries across major query engines (Spark, Hive, Flink, Presto, Trino, Impala) and cloud warehouses. It optimizes query performance by utilizing lightweight processes, especially for base columnar file reads, and integrates engine-specific vectorized readers like in Presto and Trino. This scalable model surpasses the need for separate readers and taps into each engine's unique optimizations, such as Presto and Trino's data/metadata caches. For queries merging Base and Log Files, Hudi employs mechanisms such as spillable maps and lazy reading to boost merge performance. Additionally, Hudi offers a read-optimized query option, trading off data freshness for improved query speed. There are also recently added features such as positional merge, encoding partial Log File to only changed columns and support for Parquet as the Log File format to improve MoR snapshot query performance. + +## User Access + +### SQL Engines +Apache Hudi is compatible with a wide array of SQL query engines, catering to various analytical needs. For distributed ETL batch processing, Apache Spark is frequently utilized, +leveraging its efficient handling of large-scale data. In the realm of streaming use cases, compute engines such as Apache Flink and Apache Spark's Structured Streaming provide +robust support when paired with Hudi. Moreover, Hudi supports modern data lake query engines such as Trino and Presto, as well as modern analytical databases such as ClickHouse +and StarRocks. This diverse support of compute engines positions Apache Hudi as a flexible and adaptable platform for a broad spectrum of use cases. 
+ +### Code Frameworks +While SQL still rules the roost when it comes to data engineering, an equally important and widespread data engineering/data science practice is to write code in different +languages like Java, Scala, Python and R, to analyze data using sophisticated algorithms with full expressiveness of the language. To this end, Hudi supports several +popular data processing frameworks like Apache Spark and Apache Flink, as well as python based distributed frameworks like Daft, Ray and native bindings in Rust for easy +integration with engines written in C/C++. + +... + +## Platform Services +![Platform Services](/assets/images/blog/hudistack/platform_2.png) +

Figure: Various platform services in Hudi

+ +Platform services offer functionality that is specific to data and workloads, and they sit directly on top of the table services, interfacing with writers and readers. +Services, like [Hudi Streamer](./hoodie_streaming_ingestion#hudi-streamer) (or its Flink counterpart), are specialized in handling data and workloads, seamlessly integrating with Kafka streams and various +formats to build data lakes. They support functionalities like automatic checkpoint management, integration with major schema registries (including Confluent), and +deduplication of data. Hudi Streamer also offers features for backfills, one-off runs, and continuous mode operation with Spark/Flink streaming writers. Additionally, +Hudi provides tools for [snapshotting](./snapshot_exporter) and incrementally [exporting](./snapshot_exporter#examples) Hudi tables, importing new tables, and [post-commit callback](platform_services_post_commit_callback) for analytics or +workflow management, enhancing the deployment of production-grade incremental pipelines. Apart from these services, Hudi also provides broad support for different +catalogs such as [Hive Metastore](./syncing_metastore), [AWS Glue](./syncing_aws_glue_data_catalog/), [Google BigQuery](./gcp_bigquery), [DataHub](./syncing_datahub), etc. that allows syncing of Hudi tables to be queried by +interactive engines such as Trino and Presto. + +### Metaserver* +![Metaserver](/assets/images/blog/hudistack/metaserver_2.png) +

Figure: Proposed Metaserver in Hudi

+ +Storing table metadata on lake storage, while scalable, is less efficient than RPCs to a scalable meta server. Hudi addresses this with its metadata server, called "metaserver," +an efficient alternative for managing table metadata for a large number of tables. Currently, the timeline server, embedded in Hudi's writer processes, uses a local rocksDB store and [Javalin](https://javalin.io/) REST API to serve file listings, reducing cloud storage listings. +Since version 0.6.0, there's a trend towards standalone timeline servers, aimed at horizontal scaling and improved security. These developments are set to create a more efficient lake [metastore](https://issues.apache.org/jira/browse/HUDI-3345) +for future needs. + + diff --git a/website/versioned_docs/version-1.0.0/ibm_cos_hoodie.md b/website/versioned_docs/version-1.0.0/ibm_cos_hoodie.md new file mode 100644 index 0000000000000..d4e8971535766 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/ibm_cos_hoodie.md @@ -0,0 +1,77 @@ +--- +title: IBM Cloud +keywords: [ hudi, hive, ibm, cos, spark, presto] +summary: In this page, we go over how to configure Hudi with IBM Cloud Object Storage filesystem. +last_modified_at: 2020-10-01T11:38:24-10:00 +--- +In this page, we explain how to get your Hudi spark job to store into IBM Cloud Object Storage. + +## IBM COS configs + +There are two configurations required for Hudi-IBM Cloud Object Storage compatibility: + +- Adding IBM COS Credentials for Hudi +- Adding required Jars to classpath + +### IBM Cloud Object Storage Credentials + +Simplest way to use Hudi with IBM Cloud Object Storage, is to configure your `SparkSession` or `SparkContext` with IBM Cloud Object Storage credentials using [Stocator](https://github.com/CODAIT/stocator) storage connector for Spark. Hudi will automatically pick this up and talk to IBM Cloud Object Storage. + +Alternatively, add the required configs in your `core-site.xml` from where Hudi can fetch them. 
Replace the `fs.defaultFS` with your IBM Cloud Object Storage bucket name and Hudi should be able to read/write from the bucket. + +For example, using HMAC keys and service name `myCOS`: +```xml + + fs.defaultFS + cos://myBucket.myCOS + + + + fs.cos.flat.list + true + + + + fs.stocator.scheme.list + cos + + + + fs.cos.impl + com.ibm.stocator.fs.ObjectStoreFileSystem + + + + fs.stocator.cos.impl + com.ibm.stocator.fs.cos.COSAPIClient + + + + fs.stocator.cos.scheme + cos + + + + fs.cos.myCOS.access.key + ACCESS KEY + + + + fs.cos.myCOS.endpoint + http://s3-api.us-geo.objectstorage.softlayer.net + + + + fs.cos.myCOS.secret.key + SECRET KEY + + +``` + +For more options see Stocator [documentation](https://github.com/CODAIT/stocator/blob/master/README.md). + +### IBM Cloud Object Storage Libs + +IBM Cloud Object Storage hadoop libraries to add to our classpath + + - com.ibm.stocator:stocator:1.1.3 diff --git a/website/versioned_docs/version-1.0.0/indexes.md b/website/versioned_docs/version-1.0.0/indexes.md new file mode 100644 index 0000000000000..73310e431b686 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/indexes.md @@ -0,0 +1,226 @@ +--- +title: Indexes +toc: true +toc_min_heading_level: 2 +toc_max_heading_level: 4 +--- + +In databases, indexes are auxiliary data structures maintained to quickly locate records needed, without reading unnecessary data +from storage. Given that Hudi’s design has been heavily optimized for handling mutable change streams, with different +write patterns, Hudi considers [indexing](#indexing) as an integral part of its design and has uniquely supported +[indexing capabilities](https://hudi.apache.org/blog/2020/11/11/hudi-indexing-mechanisms/) from its inception, to speed +up writes on the [data lakehouse](https://hudi.apache.org/blog/2024/07/11/what-is-a-data-lakehouse/), while still providing +columnar query performance. 
+ +## Mapping keys to file groups +The most foundational index mechanism in Hudi tracks a mapping from a given key (record key + optionally partition path) consistently to a file id. Other types of indexes like secondary indexes, +build on this foundation. This mapping between record key and file group/file id rarely changes once the first version of a record has been written to a file group. +Only clustering or cross-partition updates that are implemented as deletes + inserts remap the record key to a different file group. Even then, a given record key is associated with exactly one +file group at any completed instant on the timeline. + +## Need for indexing +For [Copy-On-Write tables](table_types#copy-on-write-table), indexing enables fast upsert/delete operations, by avoiding the need to join against the entire dataset to determine which files to rewrite. +For [Merge-On-Read tables](table_types#merge-on-read-table), indexing allows Hudi to bound the amount of change records any given base file needs to be merged against. Specifically, a given base file needs to be merged +only against updates for records that are part of that base file. + +![Fact table](/assets/images/blog/hudi-indexes/with_without_index.png) +

Figure: Comparison of merge cost for updates (dark blue blocks) against base files (light blue blocks)

+ +In contrast, +- Designs without an indexing component (e.g: [Apache Hive/Apache Iceberg](https://cwiki.apache.org/confluence/display/Hive/Hive+Transactions)) end up having to merge all the base files against all incoming updates/delete records + (10-100x more [read amplification](table_types#comparison)). +- Designs that implement heavily write-optimized OLTP data structures like LSM trees do not require an indexing component. But they perform poorly on scan-heavy workloads + against cloud storage making them unsuitable for serving analytical queries. + +Hudi shines by achieving both great write performance and read performance, at the extra storage costs of an index, which can however unlock a lot more, as we explore below. + +## Multi-modal Indexing +[Multi-modal indexing](https://www.onehouse.ai/blog/introducing-multi-modal-index-for-the-lakehouse-in-apache-hudi), +introduced in [0.11.0 Hudi release](https://hudi.apache.org/releases/release-0.11.0/#multi-modal-index), +is a re-imagination of what a general purpose indexing subsystem should look like for the lake. Multi-modal indexing is +implemented by enhancing the metadata table with the flexibility to extend to new index types as new partitions, +along with an [asynchronous index](https://hudi.apache.org/docs/metadata_indexing/#setup-async-indexing) building mechanism. + +Hudi supports a multi-modal index by augmenting the metadata table with the capability to incorporate new types of indexes, complemented by an +asynchronous mechanism for [index construction](metadata_indexing). This enhancement supports a range of indexes within +the [metadata table](metadata#metadata-table), significantly improving the efficiency of both writing to and reading from the table. + +![Indexes](/assets/images/hudi-stack-indexes.png) +

Figure: Indexes in Hudi

+ +### Bloom Filters + + [Bloom filter](https://github.com/apache/hudi/blob/46f41d186c6c84a6af2c54a907ff2736b6013e15/rfc/rfc-37/rfc-37.md) indexes as *bloom_filter* partition in the metadata table. + This index employs range-based pruning on the minimum and maximum values of the record keys and bloom-filter-based lookups to tag incoming records. For large tables, this + involves reading the footers of all matching data files for bloom filters, which can be expensive in the case of random + updates across the entire dataset. This index stores bloom filters of all data files centrally to avoid scanning the + footers directly from all data files. + +### Record Indexes + + [Record indexes](https://cwiki.apache.org/confluence/display/HUDI/RFC-08++Record+level+indexing+mechanisms+for+Hudi+datasets) as *record_index* partition in the metadata table. + Contains the mapping of the record key to location. Record index is a global index, enforcing key uniqueness across all partitions in the table. This index aids in locating records faster than + other existing indexes and can provide a speedup orders of magnitude faster in large deployments where index lookup dominates write latencies. To accommodate very high scales, it utilizes hash-based + sharding of the key space. Additionally, when it comes to reading data, the index allows for point lookups significantly speeding up index mapping retrieval process. + + +### Expression Index + An [expression index](https://github.com/apache/hudi/blob/3789840be3d041cbcfc6b24786740210e4e6d6ac/rfc/rfc-63/rfc-63.md) is an index on a function of a column. If a query has a predicate on a function of a column, the expression index can + be used to speed up the query. Expression index is stored in *expr_index_* prefixed partitions (one for each + expression index) under metadata table. Expression index can be created using SQL syntax. Please checkout SQL DDL + docs [here](sql_ddl#create-expression-index) for more details. 
+ +### Secondary Index + + Secondary indexes allow users to create indexes on columns that are not part of record key columns in Hudi tables (for + record key fields, Hudi supports [Record-level Index](/blog/2023/11/01/record-level-index)). Secondary indexes + can be used to speed up queries with predicate on columns other than record key columns. + +Following are configurations that control enabling index building and maintenance on the writer. + +| Config Name | Default | Description | +|----------------------------------------------|-------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| hoodie.metadata.index.bloom.filter.enable | false (Optional) | Enable indexing bloom filters of user data files under metadata table. When enabled, metadata table will have a partition to store the bloom filter index and will be used during the index lookups.

`Config Param: ENABLE_METADATA_INDEX_BLOOM_FILTER`
`Since Version: 0.11.0` | +| hoodie.metadata.record.index.enable | false (Optional) | Create the record Index within the metadata table

`Config Param: RECORD_INDEX_ENABLE_PROP`
`Since Version: 0.14.0`. This is a pre-requisite for secondary indexes or expression indexes on them. | + +## Additional writer-side indexes + +All the indexes discussed above are available to both readers and writers through integration with the metadata table. There are also indexing mechanisms +implemented by the storage engine, by efficiently reading/joining/processing incoming input records against information stored in base/log +files themselves (e.g. bloom filters stored in parquet file footers) or intelligent data layout (e.g. bucket index). + +Currently, Hudi supports the following index types. Default is SIMPLE on Spark engine, and INMEMORY on Flink and Java +engines. Writers can pick one of these options using `hoodie.index.type` config option. + +- **SIMPLE (default for Spark engines)**: This is the standard index type for the Spark engine. It executes an efficient join of incoming records with keys retrieved from the table stored on disk. It requires keys to be partition-level unique so it can function correctly. + +- **RECORD_INDEX** : Use the record index from section above as the writer side index. + +- **BLOOM**: Uses bloom filters generated from record keys, with the option to further narrow down candidate files based on the ranges of the record keys. It requires keys to be partition-level unique so it can function correctly. + +- **GLOBAL_BLOOM**: Utilizes bloom filters created from record keys, and may also refine the selection of candidate files by using the ranges of record keys. It requires keys to be table/global-level unique so it can function correctly. + +- **GLOBAL_SIMPLE**: Performs a lean join of the incoming records against keys extracted from the table on storage. It requires keys to be table/global-level unique so it can function correctly. + +- **HBASE**: Manages the index mapping through an external table in Apache HBase. 
+ +- **INMEMORY (default for Flink and Java)**: Uses in-memory hashmap in Spark and Java engine and Flink in-memory state in Flink for indexing. + +- **BUCKET**: Utilizes bucket hashing to identify the file group that houses the records, which proves to be particularly advantageous on a large scale. To select the type of bucket engine—that is, the method by which buckets are created—use the `hoodie.index.bucket.engine` configuration option. + - **SIMPLE(default)**: This index employs a fixed number of buckets for file groups within each partition, which do not have the capacity to decrease or increase in size. It is applicable to both COW and MOR tables. Due to the unchangeable number of buckets and the design principle of mapping each bucket to a single file group, this indexing method may not be ideal for partitions with significant data skew. + + - **CONSISTENT_HASHING**: This index accommodates a dynamic number of buckets, with the capability for bucket resizing to ensure each bucket is sized appropriately. This addresses the issue of data skew in partitions with a high volume of data by allowing these partitions to be dynamically resized. As a result, partitions can have multiple reasonably sized buckets, unlike the fixed bucket count per partition seen in the SIMPLE bucket engine type. This feature is exclusively compatible with MOR tables. + +- **Bring your own implementation:** You can extend this [public API](https://github.com/apache/hudi/blob/master/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/HoodieIndex.java) + and supply a subclass of `SparkHoodieIndex` (for Apache Spark writers) using `hoodie.index.class` to implement custom indexing. + +### Global and Non-Global Indexes + +Another key aspect worth understanding is the difference between global and non-global indexes. Both bloom and simple index have +global options - `hoodie.index.type=GLOBAL_BLOOM` and `hoodie.index.type=GLOBAL_SIMPLE` - respectively. 
Record index and +HBase index are by nature a global index. + +- **Global index:** Global indexes enforce uniqueness of keys across all partitions of a table i.e guarantees that exactly + one record exists in the table for a given record key. Global indexes offer stronger guarantees, but the update/delete + cost can still grow with size of the table `O(size of table)`, since the record could belong to any partition in storage. + In case of non-global index, lookup involves file groups only for the matching partitions from the incoming records and + so it's not impacted by the total size of the table. These global indexes(GLOBAL_SIMPLE or GLOBAL_BLOOM), might be + acceptable for decent sized tables, but for large tables, a newly added index (0.14.0) called Record Level Index (RLI), + can offer pretty good index lookup performance compared to other global indexes(GLOBAL_SIMPLE or GLOBAL_BLOOM) or + Hbase and also avoids the operational overhead of maintaining external systems. +- **Non Global index:** On the other hand, the default index implementations enforce this constraint only within a specific partition. + As one might imagine, non-global indexes depend on the writer to provide the same consistent partition path for a given record key during update/delete, + but can deliver much better performance since the index lookup operation becomes `O(number of records updated/deleted)` and + scales well with write volume. + +### Configs + +#### Spark based configs + +For Spark DataSource, Spark SQL, DeltaStreamer and Structured Streaming following are the key configs that control +indexing behavior. Please refer to [Advanced Configs](https://hudi.apache.org/docs/next/configurations#Common-Index-Configs-advanced-configs) +for more details. All of these support the index types mentioned [above](#additional-writer-side-indexes). 
+ +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------ | ----------------------------------------------------------------------------------------------- || +| hoodie.index.type| N/A **(Required)** | org.apache.hudi.index.HoodieIndex$IndexType: Determines how input records are indexed, i.e., looked up based on the key for the location in the existing table. Default is SIMPLE on Spark engine, and INMEMORY on Flink and Java engines. Possible Values:
  • BLOOM
  • GLOBAL_BLOOM
  • SIMPLE
  • GLOBAL_SIMPLE
  • HBASE
  • INMEMORY
  • FLINK_STATE
  • BUCKET
  • RECORD_INDEX

`Config Param: INDEX_TYPE` | +| hoodie.index.bucket.engine | SIMPLE (Optional) | org.apache.hudi.index.HoodieIndex$BucketIndexEngineType: Determines the type of bucketing or hashing to use when `hoodie.index.type` is set to `BUCKET`. Possible Values:
  • SIMPLE
  • CONSISTENT_HASHING

`Config Param: BUCKET_INDEX_ENGINE_TYPE`
`Since Version: 0.11.0` | +| hoodie.index.class | (Optional) | Full path of user-defined index class and must be a subclass of HoodieIndex class. It will take precedence over the hoodie.index.type configuration if specified

`Config Param: INDEX_CLASS_NAME` | +| hoodie.bloom.index.update.partition.path | true (Optional) | Only applies if index type is GLOBAL_BLOOM. When set to true, an update including the partition path of a record that already exists will result in inserting the incoming record into the new partition and deleting the original record in the old partition. When set to false, the original record will only be updated in the old partition, ignoring the new incoming partition if there is a mis-match between partition value for an incoming record with what's in storage.

`Config Param: BLOOM_INDEX_UPDATE_PARTITION_PATH_ENABLE` | +| hoodie.record.index.update.partition.path | false (Optional) | Similar to Key: 'hoodie.bloom.index.update.partition.path' , Only applies if index type is RECORD_INDEX. When set to true, an update including the partition path of a record that already exists will result in inserting the incoming record into the new partition and deleting the original record in the old partition. When set to false, the original record will only be updated in the old partition, ignoring the new incoming partition if there is a mis-match between partition value for an incoming record with what's in storage.

`Config Param: RECORD_INDEX_UPDATE_PARTITION_PATH_ENABLE` | +| hoodie.simple.index.update.partition.path | true (Optional) | Similar to Key: 'hoodie.bloom.index.update.partition.path' , Only applies if index type is GLOBAL_SIMPLE. When set to true, an update including the partition path of a record that already exists will result in inserting the incoming record into the new partition and deleting the original record in the old partition. When set to false, the original record will only be updated in the old partition, ignoring the new incoming partition if there is a mis-match between partition value for an incoming record with what's in storage.

`Config Param: SIMPLE_INDEX_UPDATE_PARTITION_PATH_ENABLE` | +| hoodie.hbase.index.update.partition.path | false (Optional) | Only applies if index type is HBASE. When an already existing record is upserted to a new partition compared to what's in storage, this config when set, will delete old record in old partition and will insert it as new record in new partition.

`Config Param: UPDATE_PARTITION_PATH_ENABLE` | + +#### Flink based configs + +Flink DataStream and Flink SQL only support the Bucket Index and an in-memory index backed by the internal Flink state store. +Following are the basic configs that control the indexing behavior. Please refer [here](https://hudi.apache.org/docs/next/configurations#Flink-Options-advanced-configs) +for advanced configs. + +| Config Name | Default | Description | +| ----------------------------------------------------------------------------------| ----------------------------------------------------------------------------------------------- |----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| index.type | FLINK_STATE (Optional) | Index type of Flink write job, default is using state backed index. Possible values:
  • FLINK_STATE
  • BUCKET

`Config Param: INDEX_TYPE` | +| hoodie.index.bucket.engine | SIMPLE (Optional) | org.apache.hudi.index.HoodieIndex$BucketIndexEngineType: Determines the type of bucketing or hashing to use when `hoodie.index.type` is set to `BUCKET`. Possible Values:
  • SIMPLE
  • CONSISTENT_HASHING
| + + + + +### Picking Indexing Strategies + +Since data comes in at different volumes, velocity and has different access patterns, different indexes could be used for different workload types. +Let’s walk through some typical workload types and see how to leverage the right Hudi index for such use-cases. +This is based on our experience and you should diligently decide if the same strategies are best for your workloads. + +#### Workload 1: Late arriving updates to fact tables +Many companies store large volumes of transactional data in NoSQL data stores. For eg, trip tables in case of ride-sharing, buying and selling of shares, +orders in an e-commerce site. These tables are usually ever growing with random updates on most recent data with long tail updates going to older data, either +due to transactions settling at a later date/data corrections. In other words, most updates go into the latest partitions with few updates going to older ones. + +![Fact table](/assets/images/blog/hudi-indexes/nosql.png) +

Figure: Typical update pattern for Fact tables

+ +For such workloads, the `BLOOM` index performs well, since index look-up will prune a lot of data files based on a well-sized bloom filter. +Additionally, if the keys can be constructed such that they have a certain ordering, the number of files to be compared is further reduced by range pruning. +Hudi constructs an interval tree with all the file key ranges and efficiently filters out the files that don't match any key ranges in the updates/deleted records. + +In order to efficiently compare incoming record keys against bloom filters i.e with minimal number of bloom filter reads and uniform distribution of work across +the executors, Hudi leverages caching of input records and employs a custom partitioner that can iron out data skews using statistics. At times, if the bloom filter +false positive ratio is high, it could increase the amount of data shuffled to perform the lookup. Hudi supports dynamic bloom filters +(enabled using `hoodie.bloom.index.filter.type=DYNAMIC_V0`), which adjusts its size based on the number of records stored in a given file to deliver the +configured false positive ratio. + +#### Workload 2: De-Duplication in event tables +Event Streaming is everywhere. Events coming from Apache Kafka or similar message bus are typically 10-100x the size of fact tables and often treat "time" (event's arrival time/processing +time) as a first class citizen. For eg, IoT event stream, click stream data, ad impressions etc. Inserts and updates only span the last few partitions as these are mostly append only data. +Given duplicate events can be introduced anywhere in the end-end pipeline, de-duplication before storing on the data lake is a common requirement. + +![Event table](/assets/images/blog/hudi-indexes/event_bus.png) +

Figure showing the spread of updates for Event table.

+ +In general, this is a very challenging problem to solve at lower cost. Although we could even employ a key value store to perform this de-duplication with HBASE index, the index storage +costs would grow linearly with the number of events and thus can be prohibitively expensive. In fact, `BLOOM` index with range pruning is the optimal solution here. One can leverage the fact +that time is often a first class citizen and construct a key such as `event_ts + event_id` such that the inserted records have monotonically increasing keys. This yields great returns +by pruning large amounts of files even within the latest table partitions. + +#### Workload 3: Random updates/deletes to a dimension table +These types of tables usually contain high dimensional data and hold reference data e.g user profile, merchant information. These are high fidelity tables where the updates are often small but also spread +across a lot of partitions and data files ranging across the dataset from old to new. Often times, these tables are also un-partitioned, since there is also not a good way to partition these tables. + +![Dimensions table](/assets/images/blog/hudi-indexes/dimension.png) +

Figure showing the spread of updates for Dimensions table.

+ +As discussed before, the `BLOOM` index may not yield benefits if a good number of files cannot be pruned out by comparing ranges/filters. In such a random write workload, updates end up touching +most files within the table and thus bloom filters will typically indicate a true positive for all files based on some incoming update. Consequently, we would end up comparing ranges/filter, only +to finally check the incoming updates against all files. The `SIMPLE` Index will be a better fit as it does not do any upfront pruning, but directly joins with the fields of interest from every data file. +`HBASE` index can be employed, if the operational overhead is acceptable and would provide much better lookup times for these tables. + +When using a global index, users should also consider setting `hoodie.bloom.index.update.partition.path=true` or `hoodie.simple.index.update.partition.path=true` to deal with cases where the +partition path value could change due to an update e.g users table partitioned by home city; user relocates to a different city. These tables are also excellent candidates for the Merge-On-Read table type. + + +## Related Resources +

Videos

+ +* [Global Bloom Index: Remove duplicates & guarantee uniquness - Hudi Labs](https://youtu.be/XlRvMFJ7g9c) +* [Multi-Modal Index for the Lakehouse in Apache Hudi](https://www.onehouse.ai/blog/introducing-multi-modal-index-for-the-lakehouse-in-apache-hudi) + diff --git a/website/versioned_docs/version-1.0.0/ingestion_flink.md b/website/versioned_docs/version-1.0.0/ingestion_flink.md new file mode 100644 index 0000000000000..e941002137402 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/ingestion_flink.md @@ -0,0 +1,179 @@ +--- +title: Using Flink +keywords: [hudi, flink, streamer, ingestion] +--- + +### CDC Ingestion +CDC (change data capture) keeps track of the data changes evolving in a source system so a downstream process or system can action that change. +We recommend two ways for syncing CDC data into Hudi: + +![slide1 title](/assets/images/cdc-2-hudi.png) + +1. Using the Ververica [flink-cdc-connectors](https://github.com/ververica/flink-cdc-connectors) directly connect to DB Server to sync the binlog data into Hudi. + The advantage is that it does not rely on message queues, but the disadvantage is that it puts pressure on the db server; +2. Consume data from a message queue (for e.g, the Kafka) using the flink cdc format, the advantage is that it is highly scalable, + but the disadvantage is that it relies on message queues. + +:::note +- If the upstream data cannot guarantee the order, you need to specify option `write.precombine.field` explicitly; +::: + +### Bulk Insert + +For the demand of snapshot data import. If the snapshot data comes from other data sources, use the `bulk_insert` mode to quickly +import the snapshot data into Hudi. + + +:::note +`bulk_insert` eliminates the serialization and data merging. The data deduplication is skipped, so the user needs to guarantee the uniqueness of the data. +::: + +:::note +`bulk_insert` is more efficient in the `batch execution mode`. 
By default, the `batch execution mode` sorts the input records +by the partition path and writes these records to Hudi, which can avoid write performance degradation caused by +frequent `file handle` switching. +::: + +:::note +The parallelism of `bulk_insert` is specified by `write.tasks`. The parallelism will affect the number of small files. +In theory, the parallelism of `bulk_insert` is the number of `bucket`s (In particular, when each bucket writes to maximum file size, it +will rollover to the new file handle. Finally, `the number of files` >= [`write.bucket_assign.tasks`](/docs/configurations#writebucket_assigntasks). +::: + +#### Options + +| Option Name | Required | Default | Remarks | +| ----------- | ------- | ------- | ------- | +| `write.operation` | `true` | `upsert` | Setting as `bulk_insert` to open this function | +| `write.tasks` | `false` | `4` | The parallelism of `bulk_insert`, `the number of files` >= [`write.bucket_assign.tasks`](/docs/configurations#writebucket_assigntasks) | +| `write.bulk_insert.shuffle_input` | `false` | `true` | Whether to shuffle data according to the input field before writing. Enabling this option will reduce the number of small files, but there may be a risk of data skew | +| `write.bulk_insert.sort_input` | `false` | `true` | Whether to sort data according to the input field before writing. Enabling this option will reduce the number of small files when a write task writes multiple partitions | +| `write.sort.memory` | `false` | `128` | Available managed memory of sort operator. default `128` MB | + +### Index Bootstrap + +For the demand of `snapshot data` + `incremental data` import. If the `snapshot data` already insert into Hudi by [bulk insert](#bulk-insert). +User can insert `incremental data` in real time and ensure the data is not repeated by using the index bootstrap function. 
+ +:::note +If you think this process is very time-consuming, you can add resources to write in streaming mode while writing `snapshot data`, +and then reduce the resources to write `incremental data` (or open the rate limit function). +::: + +#### Options + +| Option Name | Required | Default | Remarks | +| ----------- | ------- | ------- | ------- | +| `index.bootstrap.enabled` | `true` | `false` | When index bootstrap is enabled, the remaining records in the Hudi table will be loaded into the Flink state at one time | +| `index.partition.regex` | `false` | `*` | Optimize option. Setting regular expressions to filter partitions. By default, all partitions are loaded into flink state | + +#### How To Use + +1. `CREATE TABLE` creates a statement corresponding to the Hudi table. Note that the `table.type` must be correct. +2. Set `index.bootstrap.enabled` = `true` to enable the index bootstrap function. +3. Set the Flink checkpoint failure tolerance in `flink-conf.yaml` : `execution.checkpointing.tolerable-failed-checkpoints = n` (depending on Flink checkpoint scheduling times). +4. Wait until the first checkpoint succeeds, indicating that the index bootstrap completed. +5. After the index bootstrap has completed, the user can exit and save the savepoint (or directly use the externalized checkpoint). +6. Restart the job, setting `index.bootstrap.enabled` to `false`. + +:::note +1. Index bootstrap is blocking, so checkpoint cannot be completed during index bootstrap. +2. Index bootstrap is triggered by the input data. Users need to ensure that there is at least one record in each partition. +3. Index bootstrap executes concurrently. Users can search the log for `finish loading the index under partition` and `Load record form file` to observe the progress of index bootstrap. +4. The first successful checkpoint indicates that the index bootstrap completed. There is no need to load the index again when recovering from the checkpoint. 
+::: + +### Changelog Mode +Hudi can keep all the intermediate changes (I / -U / U / D) of messages, which can then be consumed through stateful computing in Flink to build a near-real-time +data warehouse ETL pipeline (incremental computing). Hudi MOR table stores messages in the form of rows, which supports the retention of all change logs (integration at the format level). +All changelog records can be consumed with the Flink streaming reader. + +#### Options + +| Option Name | Required | Default | Remarks | +| ----------- | ------- | ------- | ------- | +| `changelog.enabled` | `false` | `false` | It is turned off by default, to have the `upsert` semantics, only the merged messages are ensured to be kept, intermediate changes may be merged. Set to true to support consumption of all changes | + +:::note +Batch (Snapshot) reads still merge all the intermediate changes, regardless of whether the format has stored the intermediate changelog messages. +::: + +:::note +After setting `changelog.enabled` to `true`, the retention of changelog records is only best effort: the asynchronous compaction task will merge the changelog records into one record, so if the +stream source does not consume in a timely manner, only the merged record for each key can be read after compaction. The solution is to reserve some buffer time for the reader by adjusting the compaction strategy, such as +the compaction options: [`compaction.delta_commits`](#compaction) and [`compaction.delta_seconds`](#compaction). 
+::: + + +### Append Mode + +For `INSERT` mode write operation, the current work flow is: + +- For Merge_On_Read table, the small file strategies are by default applied: tries to append to the small avro log files first +- For Copy_On_Write table, write new parquet files directly, no small file strategies are applied + +Hudi supports rich clustering strategies to optimize the files layout for `INSERT` mode: + +#### Inline Clustering + +:::note +Only Copy_On_Write table is supported. 
+::: + +| Option Name | Required | Default | Remarks | +| ----------- | ------- | ------- | ------- | +| `write.insert.cluster` | `false` | `false` | Whether to merge small files while ingesting, for COW table, open the option to enable the small file merging strategy(no deduplication for keys but the throughput will be affected) | + +#### Async Clustering + +| Option Name | Required | Default | Remarks | +| ----------- | ------- | ------- | ------- | +| `clustering.schedule.enabled` | `false` | `false` | Whether to schedule clustering plan during write process, by default false | +| `clustering.delta_commits` | `false` | `4` | Delta commits to schedule the clustering plan, only valid when `clustering.schedule.enabled` is true | +| `clustering.async.enabled` | `false` | `false` | Whether to execute clustering plan asynchronously, by default false | +| `clustering.tasks` | `false` | `4` | Parallelism of the clustering tasks | +| `clustering.plan.strategy.target.file.max.bytes` | `false` | `1024*1024*1024` | The target file size for clustering group, by default 1GB | +| `clustering.plan.strategy.small.file.limit` | `false` | `600` | The file that has less size than the threshold (unit MB) are candidates for clustering | +| `clustering.plan.strategy.sort.columns` | `false` | `N/A` | The columns to sort by when clustering | + +#### Clustering Plan Strategy + +Custom clustering strategy is supported. 
+ +| Option Name | Required | Default | Remarks | +| ----------- | ------- | ------- | ------- | +| `clustering.plan.partition.filter.mode` | `false` | `NONE` | Valid options 1) `NONE`: no limit; 2) `RECENT_DAYS`: choose partitions that represent recent days; 3) `SELECTED_PARTITIONS`: specific partitions | +| `clustering.plan.strategy.daybased.lookback.partitions` | `false` | `2` | Valid for `RECENT_DAYS` mode | +| `clustering.plan.strategy.cluster.begin.partition` | `false` | `N/A` | Valid for `SELECTED_PARTITIONS` mode, specify the partition to begin with(inclusive) | +| `clustering.plan.strategy.cluster.end.partition` | `false` | `N/A` | Valid for `SELECTED_PARTITIONS` mode, specify the partition to end with(inclusive) | +| `clustering.plan.strategy.partition.regex.pattern` | `false` | `N/A` | The regex to filter the partitions | +| `clustering.plan.strategy.partition.selected` | `false` | `N/A` | Specific partitions separated by comma `,` | + +### Bucket Index + +By default, flink uses the state-backend to keep the file index: the mapping from primary key to fileId. When the input data set is large, +there is possibility the cost of the state be a bottleneck, the bucket index use deterministic hash algorithm for shuffling the records into +buckets, thus can avoid the storage and query overhead of indexes. 
+ +#### Options + +| Option Name | Required | Default | Remarks | +| ----------- | ------- | ------- | ------- | +| `index.type` | `false` | `FLINK_STATE` | Set up as `BUCKET` to use bucket index | +| `hoodie.bucket.index.hash.field` | `false` | Primary key | Can be a subset of the primary key | +| `hoodie.bucket.index.num.buckets` | `false` | `4` | The number of buckets per-partition, it is immutable once set up | + +Comparing to state index: + +- Bucket index has no computing and storage cost of state-backend index, thus has better performance +- Bucket index can not expand the buckets dynamically, the state-backend index can expand the buckets dynamically based on current file layout +- Bucket index can not handle changes among partitions(no limit if the input itself is CDC stream), state-backend index has no limit + +### Rate Limit +There are many use cases that user put the full history data set onto the message queue together with the realtime incremental data. Then they consume the data from the queue into the hudi from the earliest offset using flink. Consuming history data set has these characteristics: +1). The instant throughput is huge 2). It has serious disorder (with random writing partitions). It will lead to degradation of writing performance and throughput glitches. For this case, the speed limit parameter can be turned on to ensure smooth writing of the flow. 
+ +#### Options +| Option Name | Required | Default | Remarks | +| ----------- | ------- | ------- | ------- | +| `write.rate.limit` | `false` | `0` | Default disable the rate limit | diff --git a/website/versioned_docs/version-1.0.0/ingestion_kafka_connect.md b/website/versioned_docs/version-1.0.0/ingestion_kafka_connect.md new file mode 100644 index 0000000000000..a474904c44c83 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/ingestion_kafka_connect.md @@ -0,0 +1,51 @@ +--- +title: Using Kafka Connect +keywords: [hudi, kafka, connector, ingestion] +--- + + +[Kafka Connect](https://kafka.apache.org/documentation/#connect) is a popularly used framework for integrating and moving streaming data between various systems. +Hudi provides a sink for Kafka Connect, that can ingest/stream records from Apache Kafka to Hudi Tables. To do so, while providing the same transactional features +the sink implements transaction co-ordination across the tasks and workers in the Kafka Connect framework. + +See [readme](https://github.com/apache/hudi/tree/master/hudi-kafka-connect) for a full demo, build instructions and configurations. + +## Design + +At a high level, the sink treats the connect task/worker owning partition 0 of the source topic as the transaction coordinator. +The transaction coordinator implements a safe two-phase commit protocol that periodically commits data into the table. Transaction +co-ordination between the coordinator and workers reading messages from source topic partitions and writing to Hudi file groups +happens via a special kafka control topic, that all processes are listening to. + + +![Txn Coordination](/assets/images/kafka-connect-txn.png) +

Figure: Transaction Coordinator State Machine

+ +This distributed coordination helps the sink achieve high throughput, low-latency while still limiting the number of write actions +on the timeline to just 1 every commit interval. This helps scale table metadata even in the face large volume of writes, compared to +approaches where each worker commits a separate action independently leading to 10s-100s of commits per interval. + +The Hudi Kafka Connect sink uses `Merge-On-Read` by default to reduce memory pressure of writing columnar/base files (typical scaling/operational problem with the +Kafka Connect parquet sink) and inserts/appends the kafka records directly to the log file(s). Asynchronously, compaction service can be executed to merge the log files +into base file (Parquet format). Alternatively, users have the option to reconfigure the table type to `COPY_ON_WRITE` in config-sink.json if desired. + +## Configs + +To use the Hudi sink, use `connector.class=org.apache.hudi.connect.HudiSinkConnector` in Kafka Connect. Below lists additional configurations for the sink. + +| Config Name | Default | Description | +|--------------------|-------------------------------|-----------------------------------------------------------------| +| target.base.path | **Required** | base path of the Hudi table written. | +| target.table.name | **Required** | name of the table | +| hoodie.kafka.control.topic | hudi-control-topic (optional) | topic used for transaction co-ordination | +| hoodie.kafka.commit.interval.secs | 60 (optional) | The frequency at which the Sink will commit data into the table | + +See [RFC](https://cwiki.apache.org/confluence/display/HUDI/RFC-32+Kafka+Connect+Sink+for+Hudi) for more details. + + +## Current Limitations + + * Only append-only or insert operations are supported at this time. + + * Limited support for metadata table (file listings) with no support for advanced indexing during write operations. 
+ diff --git a/website/versioned_docs/version-1.0.0/intro.md b/website/versioned_docs/version-1.0.0/intro.md new file mode 100644 index 0000000000000..45e8604c8bf8f --- /dev/null +++ b/website/versioned_docs/version-1.0.0/intro.md @@ -0,0 +1,47 @@ +--- +sidebar_position: 1 +--- + +# Tutorial Intro + +Let's discover **Docusaurus in less than 5 minutes**. + +## Getting Started + +Get started by **creating a new site**. + +Or **try Docusaurus immediately** with **[docusaurus.new](https://docusaurus.new)**. + +### What you'll need + +- [Node.js](https://nodejs.org/en/download/) version 18.0 or above: + - When installing Node.js, you are recommended to check all checkboxes related to dependencies. + +## Generate a new site + +Generate a new Docusaurus site using the **classic template**. + +The classic template will automatically be added to your project after you run the command: + +```bash +npm init docusaurus@latest my-website classic +``` + +You can type this command into Command Prompt, Powershell, Terminal, or any other integrated terminal of your code editor. + +The command also installs all necessary dependencies you need to run Docusaurus. + +## Start your site + +Run the development server: + +```bash +cd my-website +npm run start +``` + +The `cd` command changes the directory you're working with. In order to work with your newly created Docusaurus site, you'll need to navigate the terminal there. + +The `npm run start` command builds your website locally and serves it through a development server, ready for you to view at http://localhost:3000/. + +Open `docs/intro.md` (this page) and edit some lines: the site **reloads automatically** and displays your changes. 
diff --git a/website/versioned_docs/version-1.0.0/jfs_hoodie.md b/website/versioned_docs/version-1.0.0/jfs_hoodie.md new file mode 100644 index 0000000000000..94bf6e6ea26a0 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/jfs_hoodie.md @@ -0,0 +1,96 @@ +--- +title: JuiceFS +keywords: [ hudi, hive, juicefs, jfs, spark, flink ] +summary: In this page, we go over how to configure Hudi with JuiceFS file system. +last_modified_at: 2021-10-12T10:50:00+08:00 +--- + +In this page, we explain how to use Hudi with JuiceFS. + +## JuiceFS configs + +[JuiceFS](https://github.com/juicedata/juicefs) is a high-performance distributed file system. Any data stored into JuiceFS, the data itself will be persisted in object storage (e.g. Amazon S3), and the metadata corresponding to the data can be persisted in various database engines such as Redis, MySQL, and TiKV according to the needs of the scene. + +There are three configurations required for Hudi-JuiceFS compatibility: + +1. Creating JuiceFS file system +2. Adding JuiceFS configuration for Hudi +3. Adding required JAR to `classpath` + +### Creating JuiceFS file system + +JuiceFS supports multiple [metadata engines](https://juicefs.com/docs/community/databases_for_metadata) such as Redis, MySQL, SQLite, and TiKV. And supports almost all [object storage](https://juicefs.com/docs/community/how_to_setup_object_storage#supported-object-storage) as data storage, e.g. Amazon S3, Google Cloud Storage, Azure Blob Storage. + +The following example uses Redis as "Metadata Engine" and Amazon S3 as "Data Storage" in Linux environment. 
+ +#### Download JuiceFS client + +```shell +$ JFS_LATEST_TAG=$(curl -s https://api.github.com/repos/juicedata/juicefs/releases/latest | grep 'tag_name' | cut -d '"' -f 4 | tr -d 'v') +$ wget "https://github.com/juicedata/juicefs/releases/download/v${JFS_LATEST_TAG}/juicefs-${JFS_LATEST_TAG}-linux-amd64.tar.gz" +``` + +#### Install JuiceFS client + +```shell +$ mkdir juice && tar -zxvf "juicefs-${JFS_LATEST_TAG}-linux-amd64.tar.gz" -C juice +$ sudo install juice/juicefs /usr/local/bin +``` + +#### Format a JuiceFS file system + +```shell +$ juicefs format \ + --storage s3 \ + --bucket https://<bucket>.s3.<region>.amazonaws.com \ + --access-key <access-key-id> \ + --secret-key <access-key-secret> \ + redis://:<password>@<redis-server-url>:6379/1 \ + myjfs +``` + +For more information, please refer to ["JuiceFS Quick Start Guide"](https://juicefs.com/docs/community/quick_start_guide). + +### Adding JuiceFS configuration for Hudi + +Add the required configurations in your `core-site.xml` from where Hudi can fetch them. 
+ +```xml +<property> + <name>fs.defaultFS</name> + <value>jfs://myjfs</value> + <description>Optional, you can also specify full path "jfs://myjfs/path-to-dir" with location to use JuiceFS</description> +</property> +<property> + <name>fs.jfs.impl</name> + <value>io.juicefs.JuiceFileSystem</value> +</property> +<property> + <name>fs.AbstractFileSystem.jfs.impl</name> + <value>io.juicefs.JuiceFS</value> +</property> +<property> + <name>juicefs.meta</name> + <value>redis://:<password>@<redis-server-url>:6379/1</value> +</property> +<property> + <name>juicefs.cache-dir</name> + <value>/path-to-your-disk</value> +</property> +<property> + <name>juicefs.cache-size</name> + <value>1024</value> +</property> +<property> + <name>juicefs.access-log</name> + <value>/tmp/juicefs.access.log</value> +</property> +``` + +You can visit [here](https://juicefs.com/docs/community/hadoop_java_sdk#client-configurations) for more configuration information. + +### Adding JuiceFS Hadoop Java SDK + +You can download the latest JuiceFS Hadoop Java SDK from [here](http://github.com/juicedata/juicefs/releases/latest) (download the file named like `juicefs-hadoop-X.Y.Z-linux-amd64.jar`), and place it on the `classpath`. You can also [compile](https://juicefs.com/docs/community/hadoop_java_sdk#client-compilation) it yourself. + +For example, if you use Hudi in Spark, please put the JAR in `$SPARK_HOME/jars`. 
diff --git a/website/versioned_docs/version-1.0.0/key_generation.md b/website/versioned_docs/version-1.0.0/key_generation.md new file mode 100644 index 0000000000000..2e4fa4263876a --- /dev/null +++ b/website/versioned_docs/version-1.0.0/key_generation.md @@ -0,0 +1,215 @@ +--- +title: Key Generation +summary: "In this page, we describe key generation in Hudi." +toc: true +last_modified_at: +--- + +Hudi needs some way to point to records in the table, so that base/log files can be merged efficiently for updates/deletes, +index entries can reference these rows and records can move around within the table from clustering without side effects. +In fact, most databases adopt similar techniques. Every record in Hudi is uniquely identified by a pair of a record key and an optional +partition path that can limit the scope of the key's uniqueness (non-global indexing). For tables with a global index, records are +identified by just the record key such that uniqueness is applied across partitions. + +Using keys, Hudi can impose partition/table level uniqueness integrity constraints as well as enable fast updates and deletes on records. Record keys are materialized in a +special `_hoodie_record_key` field in the table, to ensure key uniqueness is maintained even when the record generation is changed +during the table's lifetime. Without materialization, there are no guarantees that the past data written for a new key is unique across the table. + +Hudi offers many ways to generate record keys from the input data during writes. + + * For Java client/Spark/Flink writers, Hudi provides built-in key generator classes (described below) as well as an [interface](https://github.com/apache/hudi/blob/master/hudi-common/src/main/java/org/apache/hudi/keygen/KeyGenerator.java) + to write custom implementations. + + * SQL engines offer options to pass in key fields and use `PARTITIONED BY` clauses to control partitioning. 
+ +By default, Hudi auto-generates keys for INSERT, BULK_INSERT write operations, that are efficient +for compute, storage and read to meet the uniqueness requirements of the primary key. Auto generated keys are highly +compressible compared to UUIDs costing about $0.023 per GB in cloud storage and 3-10x computationally lighter to generate +than base64/uuid encoded keys. + +## Key Generators + +Hudi provides several key generators out of the box for JVM users can use based on their need, while having a pluggable +interface for users to implement and use their own. + +Before diving into different types of key generators, let’s go over some of the common configs relevant to key generators. + +| Config Name | Default | Description | +|-------------------------------------------------|------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| hoodie.datasource.write.recordkey.field | N/A (Optional) | Record key field. Value to be used as the `recordKey` component of `HoodieKey`.
  • When configured, actual value will be obtained by invoking .toString() on the field value. Nested fields can be specified using the dot notation eg: `a.b.c`.
  • When not configured record key will be automatically generated by Hudi. This feature is handy for use cases like log ingestion that do not have a naturally present record key.

`Config Param: RECORDKEY_FIELD_NAME` | +| hoodie.datasource.write.partitionpath.field | N/A (Optional) | Partition path field. Value to be used at the partitionPath component of HoodieKey. This needs to be specified if a partitioned table is desired. Actual value obtained by invoking .toString()
`Config Param: PARTITIONPATH_FIELD_NAME` | +| hoodie.datasource.write.keygenerator.type | SIMPLE | String representing key generator type

`Config Param: KEYGENERATOR_TYPE` | +| hoodie.datasource.write.keygenerator.class | N/A (Optional) | Key generator class, that implements `org.apache.hudi.keygen.KeyGenerator` extract a key out of incoming records.
  • When set, the configured value takes precedence to be in effect and automatic inference is not triggered.
  • When not configured, if `hoodie.datasource.write.keygenerator.type` is set, the configured value is used else automatic inference is triggered.
  • In case of auto-generated record keys, if neither the key generator class nor the type is configured, Hudi will also auto-infer the partitioning. For example, if the partition field is not configured, Hudi will assume the table is non-partitioned.

`Config Param: KEYGENERATOR_CLASS_NAME` | +| hoodie.datasource.write.hive_style_partitioning | false (Optional) | Flag to indicate whether to use Hive style partitioning. If set true, the names of partition folders follow <partition_column_name>=<partition_value> format. By default false (the names of partition folders are only partition values)

`Config Param: HIVE_STYLE_PARTITIONING_ENABLE` | +| hoodie.datasource.write.partitionpath.urlencode | false (Optional) | Should we url encode the partition path value, before creating the folder structure.

`Config Param: URL_ENCODE_PARTITIONING` | + +For all advanced configs refer [here](https://hudi.apache.org/docs/next/configurations#KEY_GENERATOR). + +### [SIMPLE](https://github.com/apache/hudi/blob/master/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/SimpleKeyGenerator.java) + +This is the most commonly used option. The key is generated from two fields of the schema: one for the record key and one for the partition path. Values are interpreted as is from the dataframe and converted to string. + +### [COMPLEX](https://github.com/apache/hudi/blob/master/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/ComplexKeyGenerator.java) +Both record key and partition paths comprise one or more than one field by name (combination of multiple fields). Fields +are expected to be comma separated in the config value. For example ```"hoodie.datasource.write.recordkey.field" : "col1,col4"``` + +### [NON_PARTITION](https://github.com/apache/hudi/blob/master/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/NonpartitionedKeyGenerator.java) +If your hudi dataset is not partitioned, you could use this “NonpartitionedKeyGenerator” which will return an empty +partition for all records. In other words, all records go to the same partition (which is empty “”). + + +### [CUSTOM](https://github.com/apache/hudi/blob/master/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/CustomKeyGenerator.java) +This is a generic implementation of KeyGenerator where users are able to leverage the benefits of SimpleKeyGenerator, +ComplexKeyGenerator and TimestampBasedKeyGenerator all at the same time. One can configure record key and partition +paths as a single field or a combination of fields. 
+ +```java +hoodie.datasource.write.recordkey.field +hoodie.datasource.write.partitionpath.field +hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.CustomKeyGenerator +``` + +This keyGenerator is particularly useful if you want to define +complex partition paths involving regular fields and timestamp based fields. It expects value for prop ```"hoodie.datasource.write.partitionpath.field"``` +in a specific format. The format should be "field1:PartitionKeyType1,field2:PartitionKeyType2..." + +The complete partition path is created as +```<value for field1 basis PartitionKeyType1>/<value for field2 basis PartitionKeyType2>``` +and so on. Each partition key type could either be SIMPLE or TIMESTAMP. + +Example config value: ```“field_3:simple,field_5:timestamp”``` + +RecordKey config value is either a single field in case of SimpleKeyGenerator or comma-separated field names if referring to ComplexKeyGenerator. +Example: +```java +hoodie.datasource.write.recordkey.field=field1,field2 +``` +This will create your record key in the format `field1:value1,field2:value2` and so on, otherwise you can specify only one field in case of simple record keys. `CustomKeyGenerator` class defines an enum `PartitionKeyType` for configuring partition paths. It can take two possible values - SIMPLE and TIMESTAMP. +The value for `hoodie.datasource.write.partitionpath.field` property in case of partitioned tables needs to be provided in the format `field1:PartitionKeyType1,field2:PartitionKeyType2` and so on. 
For example, if you want to create partition path using 2 fields `country` and `date` where the latter has timestamp based values and needs to be customised in a given format, you can specify the following + +```java +hoodie.datasource.write.partitionpath.field=country:SIMPLE,date:TIMESTAMP +``` +This will create the partition path in the format `<country_name>/<date>` or `country=<country_name>/date=<date>` depending on whether you want hive style partitioning or not. 
+ +### [TIMESTAMP](https://github.com/apache/hudi/blob/master/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/TimestampBasedKeyGenerator.java) +This key generator relies on timestamps for the partition field. The field values are interpreted as timestamps +and not just converted to string while generating partition path value for records. Record key is same as before where it is chosen by +field name. Users are expected to set few more configs to use this KeyGenerator. + + +Configs to be set: + +| Config Name | Default | Description | +|-------------------------------------------|--------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| hoodie.keygen.timebased.timestamp.type | N/A **(Required)** | Required only when the key generator is TimestampBasedKeyGenerator. One of the timestamp types supported(UNIX_TIMESTAMP, DATE_STRING, MIXED, EPOCHMILLISECONDS, SCALAR) | +| hoodie.keygen.timebased.output.dateformat | "" (Optional) | Output date format such as `yyyy-MM-dd'T'HH:mm:ss.SSSZ` | +| hoodie.keygen.timebased.timezone | "UTC" (Optional) | Timezone of both input and output timestamp if they are the same, such as `UTC`. Please use `hoodie.keygen.timebased.input.timezone` and `hoodie.keygen.timebased.output.timezone` instead if the input and output timezones are different. | +| hoodie.keygen.timebased.input.dateformat | "" (Optional) | Input date format such as `yyyy-MM-dd'T'HH:mm:ss.SSSZ`. | + +Let's go over some example values for TimestampBasedKeyGenerator. 
+ +#### Timestamp is GMT + +| Config Name | Value | +|----------------------------------------------------------| -------------| +| ```hoodie.streamer.keygen.timebased.timestamp.type``` | "EPOCHMILLISECONDS"| +| ```hoodie.streamer.keygen.timebased.output.dateformat``` | "yyyy-MM-dd hh" | +| ```hoodie.streamer.keygen.timebased.timezone``` | "GMT+8:00" | + +Input Field value: “1578283932000L”
+Partition path generated from key generator: “2020-01-06 12” + +If input field value is null for some rows.
+Partition path generated from key generator: “1970-01-01 08” + +#### Timestamp is DATE_STRING + +| Config Name | Value | +|----------------------------------------------------------| -------------| +| ```hoodie.streamer.keygen.timebased.timestamp.type``` | "DATE_STRING" | +| ```hoodie.streamer.keygen.timebased.output.dateformat``` | "yyyy-MM-dd hh" | +| ```hoodie.streamer.keygen.timebased.timezone``` | "GMT+8:00" | +| ```hoodie.streamer.keygen.timebased.input.dateformat``` | "yyyy-MM-dd hh:mm:ss" | + +Input field value: “2020-01-06 12:12:12”
+Partition path generated from key generator: “2020-01-06 12” + +If input field value is null for some rows.
+Partition path generated from key generator: “1970-01-01 12:00:00” +
+ +#### Scalar examples + +| Config Name | Value | +|-------------------------------------------------------------------| -------------| +| ```hoodie.streamer.keygen.timebased.timestamp.type``` | "SCALAR"| +| ```hoodie.streamer.keygen.timebased.output.dateformat``` | "yyyy-MM-dd hh" | +| ```hoodie.streamer.keygen.timebased.timezone``` | "GMT" | +| ```hoodie.streamer.keygen.timebased.timestamp.scalar.time.unit``` | "days" | + +Input field value: “20000L”
+Partition path generated from key generator: “2024-10-04 12” + +If input field value is null.
+Partition path generated from key generator: “1970-01-02 12” + +#### ISO8601WithMsZ with Single Input format + +| Config Name | Value | +|------------------------------------------------------------------------------| -------------| +| ```hoodie.streamer.keygen.timebased.timestamp.type``` | "DATE_STRING"| +| ```hoodie.streamer.keygen.timebased.input.dateformat``` | "yyyy-MM-dd'T'HH:mm:ss.SSSZ" | +| ```hoodie.streamer.keygen.timebased.input.dateformat.list.delimiter.regex``` | "" | +| ```hoodie.streamer.keygen.timebased.input.timezone``` | "" | +| ```hoodie.streamer.keygen.timebased.output.dateformat``` | "yyyyMMddHH" | +| ```hoodie.streamer.keygen.timebased.output.timezone``` | "GMT" | + +Input field value: "2020-04-01T13:01:33.428Z"
+Partition path generated from key generator: "2020040113" + +#### ISO8601WithMsZ with Multiple Input formats + +| Config Name | Value | +|------------------------------------------------------------------------------| -------------| +| ```hoodie.streamer.keygen.timebased.timestamp.type``` | "DATE_STRING"| +| ```hoodie.streamer.keygen.timebased.input.dateformat``` | "yyyy-MM-dd'T'HH:mm:ssZ,yyyy-MM-dd'T'HH:mm:ss.SSSZ" | +| ```hoodie.streamer.keygen.timebased.input.dateformat.list.delimiter.regex``` | "" | +| ```hoodie.streamer.keygen.timebased.input.timezone``` | "" | +| ```hoodie.streamer.keygen.timebased.output.dateformat``` | "yyyyMMddHH" | +| ```hoodie.streamer.keygen.timebased.output.timezone``` | "UTC" | + +Input field value: "2020-04-01T13:01:33.428Z"
+Partition path generated from key generator: "2020040113" + +#### ISO8601NoMs with offset using multiple input formats + +| Config Name | Value | +|------------------------------------------------------------------------------| -------------| +| ```hoodie.streamer.keygen.timebased.timestamp.type``` | "DATE_STRING"| +| ```hoodie.streamer.keygen.timebased.input.dateformat``` | "yyyy-MM-dd'T'HH:mm:ssZ,yyyy-MM-dd'T'HH:mm:ss.SSSZ" | +| ```hoodie.streamer.keygen.timebased.input.dateformat.list.delimiter.regex``` | "" | +| ```hoodie.streamer.keygen.timebased.input.timezone``` | "" | +| ```hoodie.streamer.keygen.timebased.output.dateformat``` | "yyyyMMddHH" | +| ```hoodie.streamer.keygen.timebased.output.timezone``` | "UTC" | + +Input field value: "2020-04-01T13:01:33-**05:00**"
+Partition path generated from key generator: "2020040118" + +#### Input as short date string and expect date in date format + +| Config Name | Value | +|------------------------------------------------------------------------------| -------------| +| ```hoodie.streamer.keygen.timebased.timestamp.type``` | "DATE_STRING"| +| ```hoodie.streamer.keygen.timebased.input.dateformat``` | "yyyy-MM-dd'T'HH:mm:ssZ,yyyy-MM-dd'T'HH:mm:ss.SSSZ,yyyyMMdd" | +| ```hoodie.streamer.keygen.timebased.input.dateformat.list.delimiter.regex``` | "" | +| ```hoodie.streamer.keygen.timebased.input.timezone``` | "UTC" | +| ```hoodie.streamer.keygen.timebased.output.dateformat``` | "MM/dd/yyyy" | +| ```hoodie.streamer.keygen.timebased.output.timezone``` | "UTC" | + +Input field value: "20200401"
+Partition path generated from key generator: "04/01/2020" + +## Related Resources + +* [Hudi metafields demystified](https://www.onehouse.ai/blog/hudi-metafields-demystified) \ No newline at end of file diff --git a/website/versioned_docs/version-1.0.0/markers.md b/website/versioned_docs/version-1.0.0/markers.md new file mode 100644 index 0000000000000..71321d70c1910 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/markers.md @@ -0,0 +1,91 @@ +--- +title: Marker Mechanism +toc: true +--- + +## Purpose of Markers +A write operation can fail before it completes, leaving partial or corrupt data files on storage. Markers are used to track +and cleanup any partial or failed write operations. As a write operation begins, a marker is created indicating +that a file write is in progress. When the write commit succeeds, the marker is deleted. If a write operation fails part +way through, a marker is left behind which indicates that the file is incomplete. Two important operations that use markers include: + +- **Removing duplicate/partial data files**: + - In Spark, the Hudi write client delegates the data file writing to multiple executors. One executor can fail the task, + leaving partial data files written, and Spark retries the task in this case until it succeeds. + - When speculative execution is enabled, there can also be multiple successful attempts at writing out the same data + into different files, only one of which is finally handed to the Spark driver process for committing. + The markers help efficiently identify the partial data files written, which contain duplicate data compared to the data + files written by the successful trial later, and these duplicate data files are cleaned up when the commit is finalized. +- **Rolling back failed commits**: If a write operation fails, the next write client will roll back the failed commit before proceeding with the new write. 
The rollback is done with the help of markers to identify the data files written as part of the failed commit. + +If we did not have markers to track the per-commit data files, we would have to list all files in the file system, +correlate that with the files seen in timeline and then delete the ones that belong to partial write failures. +As you could imagine, this would be very costly in a very large installation of a datalake. + +## Marker structure +Each marker entry is composed of three parts, the data file name, +the marker extension (`.marker`), and the I/O operation created the file (`CREATE` - inserts, `MERGE` - updates/deletes, +or `APPEND` - either). For example, the marker `91245ce3-bb82-4f9f-969e-343364159174-0_140-579-0_20210820173605.parquet.marker.CREATE` indicates +that the corresponding data file is `91245ce3-bb82-4f9f-969e-343364159174-0_140-579-0_20210820173605.parquet` and the I/O type is `CREATE`. + +## Marker Writing Options +There are two ways to write Markers: + +- Directly writing markers to storage, which is a legacy configuration. +- Writing markers to the Timeline Server which batches marker requests before writing them to storage (Default). This option improves write performance of large files as described below. + +### Direct Write Markers +Directly writing to storage creates a new marker file corresponding to each data file, with the marker filename as described above. +The marker file does not have any content, i.e., empty. Each marker file is written to storage in the same directory +hierarchy, i.e., commit instant and partition path, under a temporary folder `.hoodie/.temp` under the base path of the Hudi table. +For example, the figure below shows one example of the marker files created and the corresponding data files when writing +data to the Hudi table. When getting or deleting all the marker file paths, the mechanism first lists all the paths +under the temporary folder, `.hoodie/.temp/`, and then does the operation. 
+ +![An example of marker and data files in direct marker file mechanism](/assets/images/blog/marker-mechanism/direct-marker-file-mechanism.png) + +While it's much more efficient than scanning the entire table for uncommitted data files, as the number of data files to write +increases, so does the number of marker files to create. For large writes which need to write a significant number of data +files, e.g., 10K or more, this can create performance bottlenecks for cloud storage such as AWS S3. In AWS S3, each +file create and delete call triggers an HTTP request and there is [rate-limiting](https://docs.aws.amazon.com/AmazonS3/latest/userguide/optimizing-performance.html) +on how many requests can be processed per second per prefix in a bucket. When the number of data files to write concurrently +and the number of marker files is huge, the marker file operations could take up non-trivial time during the write operation, +sometimes on the order of a few minutes or more. + +### Timeline Server Markers (Default) +To address the performance bottleneck due to rate-limiting of AWS S3 explained above, we introduce a new marker mechanism +leveraging the timeline server, which optimizes the marker-related latency for storage with non-trivial file I/O latency. +In the diagram below you can see the timeline-server-based marker mechanism delegates the marker creation and other marker-related +operations from individual executors to the timeline server for centralized processing. The timeline server batches the +marker creation requests and writes the markers to a bounded set of files in the file system at configurable batch intervals (default 50ms). +In this way, the number of actual file operations and latency related to markers can be significantly reduced even with +a huge number of data files, leading to improved performance of large writes. 
+ +![Timeline-server-based marker mechanism](/assets/images/blog/marker-mechanism/timeline-server-based-marker-mechanism.png) + +Each marker creation request is handled asynchronously in the Javalin timeline server and queued before processing. +For every batch interval, the timeline server pulls the pending marker creation requests from the queue and +writes all markers to the next file in a round robin fashion. Inside the timeline server, such batch processing is +multi-threaded, designed and implemented to guarantee consistency and correctness. Both the batch interval and the batch +concurrency can be configured through the write options. + +![Batched processing of marker creation requests](/assets/images/blog/marker-mechanism/batched-marker-creation.png) + +Note that the worker thread always checks whether the marker has already been created by comparing the marker name from +the request with the memory copy of all markers maintained at the timeline server. The underlying files storing the +markers are only read upon the first marker request (lazy loading). The responses of requests are only sent back once the +new markers are flushed to the files, so that in the case of the timeline server failure, the timeline server can recover +the already created markers. These ensure consistency between storage and the in-memory copy, and improve the performance +of processing marker requests. + +**NOTE:** Timeline based markers are not yet supported for HDFS, however, users may barely notice performance challenges +with direct markers because the file system metadata is efficiently cached in memory and doesn't face the same rate-limiting as S3. + +## Marker Configuration Parameters + +| Property Name | Default | Meaning | +| ------------- | ----------- | :-------------:| +| `hoodie.write.markers.type` | timeline_server_based | Marker type to use. 
Two modes are supported: (1) `direct`: individual marker file corresponding to each data file is directly created by the executor; (2) `timeline_server_based`: marker operations are all handled at the timeline service which serves as a proxy. New marker entries are batch processed and stored in a limited number of underlying files for efficiency. | +| `hoodie.markers.timeline_server_based.batch.num_threads` | 20 | Number of threads to use for batch processing marker creation requests at the timeline server. | +| `hoodie.markers.timeline_server_based.batch.interval_ms` | 50 | The batch interval in milliseconds for marker creation batch processing. | + diff --git a/website/versioned_docs/version-1.0.0/metadata.md b/website/versioned_docs/version-1.0.0/metadata.md new file mode 100644 index 0000000000000..47661f314114d --- /dev/null +++ b/website/versioned_docs/version-1.0.0/metadata.md @@ -0,0 +1,134 @@ +--- +title: Table Metadata +keywords: [ hudi, metadata, S3, GCS, file listings, statistics] +--- + +Hudi tracks metadata about a table to remove bottlenecks in achieving great read/write performance, specifically on cloud storage. + +- **Avoid list operations to obtain set of files in a table**: A fundamental need for any engine that wants to read or write Hudi tables is + to know all the files/objects that are part of the table, by performing listing of table partitions/folders. Unlike many distributed file systems, + such operation scales poorly on cloud storage taking few seconds or even many minutes on large tables. This is particularly amplified when tables + are large and partitioned multiple levels deep. Hudi tracks the file listings so they are readily available for readers/writers without listing the folders + containing the data files. 
+ +- **Expose columns statistics for better query planning and faster queries**: Query engines rely on techniques such as partitioning and data skipping + to cut down on the amount of irrelevant data scanned for query planning and execution. During query planning phase, file footer statistics like column value ranges, + null counts are read from all data files to determine if a particular file needs to be read to satisfy the query. This approach is expensive since reading + footers from all files can increase cloud storage API costs and even be subject to throttling issues for larger tables. Hudi enables relevant query predicates to + be efficiently evaluated on operate on column statistics without incurring these costs. + +## Metadata Table + +Hudi employs a special **_metadata table_**, within each table to provide these capabilities. The metadata table implemented as a single +internal Hudi Merge-On-Read table that hosts different types of table metadata in each partition. This is similar to common practices in databases where metadata +is tracked using internal tables. This approach provides the following advantages. + +- **Scalable**: The table metadata must scale to large sizes as well (see [Big Metadata paper](https://vldb.org/pvldb/vol14/p3083-edara.pdf) from Google). + Different types of indexes should be easily integrated to support various use cases with consistent management of metadata. By implementing metadata using the + same storage format and engine used for data, Hudi is able to scale to even TBs of metadata with built-in table services for managing metadata. + +- **Flexible**: The foundational framework for multi-modal indexing is built to enable and disable new indexes as needed. The + [async indexing](https://www.onehouse.ai/blog/asynchronous-indexing-using-hudi) protocol index building alongside regular writers without impacting the write latency. 
+ +- **Transactional updates**: Table data, metadata and indexes must always be up-to-date and consistent with each other as writes happen or table services are performed. + The data and metadata table's timelines share a parent-child relationship, to ensure they are always in sync with each other. Furthermore, the MoR table storage helps absorb fast changes to metadata from streaming writes without requiring + rewriting of all table metadata on each write. + +- **Fast lookups**: By employing a SSTable like base file format (HFile) in the metadata table, query engines are able to efficiently perform lookup scans for only specific parts of + metadata needed. For example, a query accessing only 10 out of 100 columns in a table can read stats about only the 10 columns it's interested in, driving down planning time and costs. + Further, these metadata can also be served via a centralized/embedded timeline server which caches the metadata, further reducing the latency of the lookup from executors. + + +![Metadata Table Mechanics](/assets/images/metadata_table_anim.gif) +

Figure: Mechanics for Metadata Table in Hudi

+ +## Types of table metadata + +Following are the different types of metadata currently supported. + +- ***[files listings](https://cwiki.apache.org/confluence/display/HUDI/RFC+-+15%3A+HUDI+File+Listing+Improvements)***: + Stored as *files* partition in the metadata table. Contains file information such as file name, size, and active state + for each partition in the data table, along with list of all partitions in the table. Improves the files listing performance + by avoiding direct storage calls such as *exists, listStatus* and *listFiles* on the data table. + +- ***[column statistics](https://github.com/apache/hudi/blob/master/rfc/rfc-27/rfc-27.md)***: Stored as *column_stats* + partition in the metadata table. Contains the statistics for a set of tracked columns, such as min and max values, total values, + null counts, size, etc., for all data files and are used while serving queries with predicates matching interested + columns. This is heavily used by techniques like [data skipping](https://www.onehouse.ai/blog/hudis-column-stats-index-and-data-skipping-feature-help-speed-up-queries-by-an-orders-of-magnitude) to speed up queries by orders of magnitude, by skipping + irrelevant files. + +- ***Partition Statistics***: Partition stats index aggregates statistics at the partition level for the columns tracked by + the column statistics for which it is enabled. This helps in efficient partition pruning by skipping entire folders very quickly, + even without examining column statistics at the file level. The partition stats index is stored in *partition_stats* partition in the metadata table. + Partition stats index can be enabled using the following configs (note it is required to specify the columns for which stats should be aggregated). + +To try out these features, refer to the [SQL guide](sql_ddl#create-partition-stats-index). + +## Metadata Tracking on Writers + +Following are based basic configs that are needed to enable metadata tracking. 
For advanced configs please refer +[here](configurations#Metadata-Configs). + +| Config Name | Default | Description | +|----------------------------------------------|-------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| hoodie.metadata.enable | true (Optional) Enabled on the write side | Enable the internal metadata table serving file listings. For 0.10.1 and prior releases, metadata table is disabled by default and needs to be explicitly enabled.

`Config Param: ENABLE`
`Since Version: 0.7.0` | +| hoodie.metadata.index.column.stats.enable | false (Optional) | Enable column statistics tracking of files under metadata table. When enabled, metadata table will have a partition to store the column ranges and will be used for pruning files during data skipping.

`Config Param: ENABLE_METADATA_INDEX_COLUMN_STATS`
`Since Version: 0.11.0` | +| hoodie.metadata.index.column.stats.columns | all columns in the table | Comma separated list of columns to track column statistics on. | +| hoodie.metadata.index.partition.stats.enable | false (Optional) | Enable the partition stats tracking, on the same columns tracked by column stats metadata. | + +For Flink, following are the basic configs of interest to enable metadata tracking. Please refer +[here](https://hudi.apache.org/docs/next/configurations#Flink-Options) for advanced configs + +| Config Name | Default | Description | +|-------------------------------------------|------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| metadata.enabled | true (Optional) | Enable the internal metadata table which serves table metadata like level file listings, default enabled

`Config Param: METADATA_ENABLED` | + + +:::note +If you turn off the metadata table after enabling, be sure to wait for a few commits so that the metadata table is fully +cleaned up, before re-enabling the metadata table again. +::: + +## Leveraging metadata during queries + +### files index +Metadata based listing using *files_index* can be leveraged on the read side by setting appropriate configs/session properties +from different engines as shown below: + +| Readers | Config | Description | +|--------------------------------------------------|------------------------|-------------------------------------------------------------------------------------------------------------------------------| +| Spark DataSource, Spark SQL, Strucured Streaming | hoodie.metadata.enable | When set to `true` enables use of the spark file index implementation for Hudi, that speeds up listing of large tables.
| +| Flink DataStream, Flink SQL | metadata.enabled | When set to `true` from DDL uses the internal metadata table to serves table metadata like level file listings | +| Presto | [hudi.metadata-table-enabled](https://prestodb.io/docs/current/connector/hudi.html) | When set to `true` fetches the list of file names and sizes from Hudi’s metadata table rather than storage. | +| Trino | N/A | Support for reading from the metadata table [has been dropped in Trino 419](https://issues.apache.org/jira/browse/HUDI-7020). | +| Athena | [hudi.metadata-listing-enabled](https://docs.aws.amazon.com/athena/latest/ug/querying-hudi.html) | When this table property is set to `TRUE` enables the Hudi metadata table and the related file listing functionality | + +### column_stats index and data skipping +Enabling metadata table and column stats index is a prerequisite to enabling data skipping capabilities. Following are the +corresponding configs across Spark and Flink readers. + +| Readers | Config | Description | +|------------------------------------------------------------|----------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| Spark DataSource, Spark SQL, Strucured Streaming |
  • `hoodie.metadata.enable`
  • `hoodie.enable.data.skipping`
|
  • When set to `true` enables use of the spark file index implementation for Hudi, that speeds up listing of large tables.
  • When set to `true` enables data-skipping allowing queries to leverage indexes to reduce the search space by skipping over files
    `Config Param: ENABLE_DATA_SKIPPING`
    `Since Version: 0.10.0`
| +| Flink DataStream, Flink SQL |
  • `metadata.enabled`
  • `read.data.skipping.enabled`
|
  • When set to `true` from DDL uses the internal metadata table to serve table metadata like level file listings
  • When set to `true` enables data-skipping allowing queries to leverage indexes to reduce the search space by skipping over files
| + + +## Concurrency Control for Metadata Table + +To ensure that metadata table stays up to date and table metadata is tracked safely across concurrent write and +table operations, there are some additional considerations. If async table services are enabled for the table (i.e. running a separate compaction (`HoodieCompactor`) or +clustering (`HoodieClusteringJob`) job), even with just a single writer, lock providers +must be configured. Please refer to [concurrency control](concurrency_control) for more details. + +Before enabling metadata table for the first time, all writers on the same table must and table services must be stopped. +If your current deployment model is [multi-writer](concurrency_control#full-on-multi-writer--async-table-services) along with a lock +provider and other required configs set for every writer as follows, there is no additional configuration required. You +can bring up the writers sequentially after stopping the writers for enabling metadata table. Applying the proper +configurations to only a subset of writers or table services is unsafe and can lead to loss of data. So, please ensure you enable +metadata table across all writers. + +## Related Resources +

Blogs

+ +* [Table service deployment models in Apache Hudi](https://medium.com/@simpsons/table-service-deployment-models-in-apache-hudi-9cfa5a44addf) +* [Multi Modal Indexing for the Data Lakehouse](https://www.onehouse.ai/blog/introducing-multi-modal-index-for-the-lakehouse-in-apache-hudi) diff --git a/website/versioned_docs/version-1.0.0/metadata_indexing.md b/website/versioned_docs/version-1.0.0/metadata_indexing.md new file mode 100644 index 0000000000000..560d51cafed73 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/metadata_indexing.md @@ -0,0 +1,318 @@ +--- +title: Indexing +summary: "In this page, we describe how to run metadata indexing asynchronously." +toc: true +last_modified_at: +--- + +Hudi maintains a scalable [metadata](/docs/metadata) that has some auxiliary data about the table. +The [pluggable indexing subsystem](https://www.onehouse.ai/blog/introducing-multi-modal-index-for-the-lakehouse-in-apache-hudi) +of Hudi depends on the metadata table. Different types of index, from `files` index for locating records efficiently +to `column_stats` index for data skipping, are part of the metadata table. A fundamental tradeoff in any data system +that supports indices is to balance the write throughput with index updates. A brute-force way is to lock out the writes +while indexing. Hudi supports index creation using SQL, Datasource as well as async indexing. However, very large tables +can take hours to index. This is where Hudi's novel concurrent indexing comes into play. + +## Concurrent Indexing + +Indexes in Hudi are created in two phases and uses a mix of optimistic concurrency control and multi-version concurrency control techniques. The two +phase approach ensures that the other writers are unblocked. + +- **Scheduling & Planning** : This is the first phase which schedules an indexing plan and is protected by a lock. Indexing plan considers all the completed commits upto indexing instant. 
+- **Execution** : This phase creates the index files as mentioned in the index plan. At the end of the phase Hudi ensures the completed commits after indexing instant used already created index plan to add corresponding index metadata. This check is protected by a metadata table lock and in case of failures indexing is aborted. + +We can now create different indexes and metadata, including `bloom_filters`, `column_stats`, `partition_stats`, `record_index`, `secondary_index` +and `expression_index` asynchronously in Hudi. Being able to index without blocking writing ensures write performance is unaffected and no +additional manual maintenance is necessary to add/remove indexes. It also reduces resource wastage by avoiding contention between writing and indexing. + +Please refer section [Setup Async Indexing](#setup-async-indexing) to get more details on how to setup +asynchronous indexing. To learn more about the design of asynchronous indexing feature, please check out [this blog](https://www.onehouse.ai/blog/asynchronous-indexing-using-hudi). + +## Index Creation Using SQL + +Currently indexes like secondary index, expression index and record index can be created using SQL create index command. +For more information on these indexes please refer [metadata section](metadata/#types-of-table-metadata) + +:::note +Please note in order to create secondary index: +1. The table must have a primary key and merge mode should be [COMMIT_TIME_ORDERING](/docs/next/record_merger#commit_time_ordering). +2. Record index must be enabled. This can be done by setting `hoodie.metadata.record.index.enable=true` and then creating `record_index`. Please note the example below. +::: + +**Examples** +```sql +-- Create record index on primary key - uuid +CREATE INDEX record_index ON hudi_indexed_table (uuid); + +-- Create secondary index on rider column. 
+CREATE INDEX idx_rider ON hudi_indexed_table (rider); + +-- Create expression index by performing transformation on ts and driver column +-- The index is created on the transformed column. Here column stats index is created on ts column +-- and bloom filters index is created on driver column. +CREATE INDEX idx_column_ts ON hudi_indexed_table USING column_stats(ts) OPTIONS(expr='from_unixtime', format = 'yyyy-MM-dd'); +CREATE INDEX idx_bloom_driver ON hudi_indexed_table USING bloom_filters(driver) OPTIONS(expr='identity'); +``` + +For more information on index creation using SQL refer [SQL DDL](sql_ddl#create-index) + +## Index Creation Using Datasource + +Indexes like `bloom_filters`, `column_stats`, `partition_stats` and `record_index` can be created using Datasource. +Below we list the various configs which are needed to create the indexes mentioned. + +```sql +-- [Required Configs] Partition stats +hoodie.metadata.index.partition.stats.enable=true +hoodie.metadata.index.column.stats.enable=true +-- [Optional Configs] - list of columns to index on. By default all columns are indexed +hoodie.metadata.index.column.stats.column.list=col1,col2,... + +-- [Required Configs] Column stats +hoodie.metadata.index.column.stats.enable=true +-- [Optional Configs] - list of columns to index on. By default all columns are indexed +hoodie.metadata.index.column.stats.column.list=col1,col2,... + +-- [Required Configs] Record Level Index +hoodie.metadata.record.index.enable=true + +-- [Required Configs] Bloom filter Index +hoodie.metadata.index.bloom.filter.enable=true +``` + +Here is an example which shows how to create indexes for a table created using Datasource API. 
+ +**Examples** +```scala +import scala.collection.JavaConversions._ +import org.apache.spark.sql.SaveMode._ +import org.apache.hudi.DataSourceReadOptions._ +import org.apache.hudi.DataSourceWriteOptions._ +import org.apache.hudi.common.table.HoodieTableConfig._ +import org.apache.hudi.config.HoodieWriteConfig._ +import org.apache.hudi.keygen.constant.KeyGeneratorOptions._ +import org.apache.hudi.common.model.HoodieRecord +import spark.implicits._ + +val tableName = "trips_table_index" +val basePath = "file:///tmp/trips_table_index" + +val columns = Seq("ts","uuid","rider","driver","fare","city") +val data = + Seq((1695159649087L,"334e26e9-8355-45cc-97c6-c31daf0df330","rider-A","driver-K",19.10,"san_francisco"), + (1695091554788L,"e96c4396-3fad-413a-a942-4cb36106d721","rider-C","driver-M",27.70 ,"san_francisco"), + (1695046462179L,"9909a8b1-2d15-4d3d-8ec9-efc48c536a00","rider-D","driver-L",33.90 ,"san_francisco"), + (1695516137016L,"e3cf430c-889d-4015-bc98-59bdce1e530c","rider-F","driver-P",34.15,"sao_paulo" ), + (1695115999911L,"c8abbe79-8d89-47ea-b4ce-4d224bae5bfa","rider-J","driver-T",17.85,"chennai")); + +var inserts = spark.createDataFrame(data).toDF(columns:_*) +inserts.write.format("hudi"). + option("hoodie.datasource.write.partitionpath.field", "city"). + option("hoodie.table.name", tableName). + option("hoodie.write.record.merge.mode", "COMMIT_TIME_ORDERING"). + option(RECORDKEY_FIELD_OPT_KEY, "uuid"). + mode(Overwrite). 
+ save(basePath) + +// Create record index and secondary index for the table +spark.sql(s"CREATE TABLE test_table_external USING hudi LOCATION '$basePath'") +spark.sql(s"SET hoodie.metadata.record.index.enable=true") +spark.sql(s"CREATE INDEX record_index ON test_table_external (uuid)") +spark.sql(s"CREATE INDEX idx_rider ON test_table_external (rider)") +spark.sql(s"SHOW INDEXES FROM test_table_external").show(false) +spark.sql(s"SELECT * FROM test_table_external WHERE rider = 'rider-E'").show(false) +``` + +## Setup Async Indexing + +In the example we will have continuous writing using Hudi Streamer and also create index in parallel. The index creation +in the example is done using HoodieIndexer so that schedule and execute phases are clearly visible for indexing. The asynchronous +configurations can be used with Datasource and SQL based configs to create index as well. + +First, we will generate a continuous workload. In the below example, we are going to start a [Hudi Streamer](/docs/hoodie_streaming_ingestion#hudi-streamer) which will continuously write data +from raw parquet to a Hudi table. We used the widely available [NY Taxi dataset](https://registry.opendata.aws/nyc-tlc-trip-records-pds/), whose setup details are as below: +
+ Ingestion write config +

+ +```bash +hoodie.datasource.write.recordkey.field=VendorID +hoodie.datasource.write.partitionpath.field=tpep_dropoff_datetime +hoodie.datasource.write.precombine.field=tpep_dropoff_datetime +hoodie.streamer.source.dfs.root=/Users/home/path/to/data/parquet_files/ +hoodie.streamer.schemaprovider.target.schema.file=/Users/home/path/to/schema/schema.avsc +hoodie.streamer.schemaprovider.source.schema.file=/Users/home/path/to/schema/schema.avsc +// set lock provider configs +hoodie.write.lock.provider=org.apache.hudi.client.transaction.lock.ZookeeperBasedLockProvider +hoodie.write.lock.zookeeper.url= +hoodie.write.lock.zookeeper.port= +hoodie.write.lock.zookeeper.lock_key= +hoodie.write.lock.zookeeper.base_path= +``` + +

+
+ +
+ Run Hudi Streamer +

+ +```bash +spark-submit \ +--jars "packaging/hudi-utilities-slim-bundle/target/hudi-utilities-slim-bundle_2.12-1.0.0.jar,packaging/hudi-spark-bundle/target/hudi-spark3.5-bundle_2.12-1.0.0.jar" \ +--class org.apache.hudi.utilities.streamer.HoodieStreamer `ls /Users/home/path/to/hudi-utilities-slim-bundle/target/hudi-utilities-slim-bundle_2.12-1.0.0.jar` \ +--props `ls /Users/home/path/to/write/config.properties` \ +--source-class org.apache.hudi.utilities.sources.ParquetDFSSource --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider \ +--source-ordering-field tpep_dropoff_datetime \ +--table-type COPY_ON_WRITE \ +--target-base-path file:///tmp/hudi-ny-taxi/ \ +--target-table ny_hudi_tbl \ +--op UPSERT \ +--continuous \ +--source-limit 5000000 \ +--min-sync-interval-seconds 60 +``` + +

+
+ +Hudi metadata table is enabled by default and the files index will be automatically created. While the Hudi Streamer is running in continuous mode, let +us schedule the indexing for COLUMN_STATS index. First we need to define a properties file for the indexer. + +### Configurations + +As mentioned before, metadata indices are pluggable. One can add any index at any point in time depending on changing +business requirements. Some configurations to enable particular indices are listed below. Currently, available indices under +metadata table can be explored [here](indexes#multi-modal-indexing) along with [configs](metadata#metadata-tracking-on-writers) +to enable them. The full set of metadata configurations can be explored [here](configurations/#Metadata-Configs). + +:::note +Enabling the metadata table and configuring a lock provider are the prerequisites for using async indexer. Checkout a sample +configuration below. +::: + +``` +# ensure that async indexing is enabled +hoodie.metadata.index.async=true +# enable column_stats index config +hoodie.metadata.index.column.stats.enable=true +# set concurrency mode and lock configs as this is a multi-writer scenario +# check https://hudi.apache.org/docs/concurrency_control/ for differnt lock provider configs +hoodie.write.concurrency.mode=optimistic_concurrency_control +hoodie.write.lock.provider=org.apache.hudi.client.transaction.lock.ZookeeperBasedLockProvider +hoodie.write.lock.zookeeper.url= +hoodie.write.lock.zookeeper.port= +hoodie.write.lock.zookeeper.lock_key= +hoodie.write.lock.zookeeper.base_path= +``` + +### Schedule indexing + +Now, we can schedule indexing using `HoodieIndexer` in `schedule` mode as follows: + +``` +spark-submit \ +--jars "packaging/hudi-utilities-slim-bundle/target/hudi-utilities-slim-bundle_2.12-1.0.0.jar,packaging/hudi-spark-bundle/target/hudi-spark3.5-bundle_2.12-1.0.0.jar" \ +--class org.apache.hudi.utilities.HoodieIndexer \ 
+/Users/home/path/to/hudi-utilities-slim-bundle/target/hudi-utilities-slim-bundle_2.12-1.0.0.jar \ +--props /Users/home/path/to/indexer.properties \ +--mode schedule \ +--base-path /tmp/hudi-ny-taxi \ +--table-name ny_hudi_tbl \ +--index-types COLUMN_STATS \ +--parallelism 1 \ +--spark-memory 1g +``` + +This will write an `indexing.requested` instant to the timeline. + +### Execute Indexing + +To execute indexing, run the indexer in `execute` mode as below. + +``` +spark-submit \ +--jars "packaging/hudi-utilities-slim-bundle/target/hudi-utilities-slim-bundle_2.12-1.0.0.jar,packaging/hudi-spark-bundle/target/hudi-spark3.5-bundle_2.12-1.0.0.jar" \ +--class org.apache.hudi.utilities.HoodieIndexer \ +/Users/home/path/to/hudi-utilities-slim-bundle/target/hudi-utilities-slim-bundle_2.12-1.0.0.jar \ +--props /Users/home/path/to/indexer.properties \ +--mode execute \ +--base-path /tmp/hudi-ny-taxi \ +--table-name ny_hudi_tbl \ +--index-types COLUMN_STATS \ +--parallelism 1 \ +--spark-memory 1g +``` + +We can also run the indexer in `scheduleAndExecute` mode to do the above two steps in one shot. Doing it separately gives us better control over when we want to execute. + +Let's look at the data timeline. 
+ +``` +ls -lrt /tmp/hudi-ny-taxi/.hoodie +total 1816 +-rw-r--r-- 1 sagars wheel 0 Apr 14 19:53 20220414195327683.commit.requested +-rw-r--r-- 1 sagars wheel 153423 Apr 14 19:54 20220414195327683.inflight +-rw-r--r-- 1 sagars wheel 207061 Apr 14 19:54 20220414195327683.commit +-rw-r--r-- 1 sagars wheel 0 Apr 14 19:54 20220414195423420.commit.requested +-rw-r--r-- 1 sagars wheel 659 Apr 14 19:54 20220414195437837.indexing.requested +-rw-r--r-- 1 sagars wheel 323950 Apr 14 19:54 20220414195423420.inflight +-rw-r--r-- 1 sagars wheel 0 Apr 14 19:55 20220414195437837.indexing.inflight +-rw-r--r-- 1 sagars wheel 222920 Apr 14 19:55 20220414195423420.commit +-rw-r--r-- 1 sagars wheel 734 Apr 14 19:55 hoodie.properties +-rw-r--r-- 1 sagars wheel 979 Apr 14 19:55 20220414195437837.indexing +``` + +In the data timeline, we can see that indexing was scheduled after one commit completed (`20220414195327683.commit`) and another was requested +(`20220414195423420.commit.requested`). This would have picked `20220414195327683` as the base instant. Indexing was inflight with an inflight writer as well. If we parse the +indexer logs, we would find that it indeed caught up with instant `20220414195423420` after indexing upto the base instant. + +``` +22/04/14 19:55:22 INFO HoodieTableMetaClient: Finished Loading Table of type MERGE_ON_READ(version=1, baseFileFormat=HFILE) from /tmp/hudi-ny-taxi/.hoodie/metadata +22/04/14 19:55:22 INFO RunIndexActionExecutor: Starting Index Building with base instant: 20220414195327683 +22/04/14 19:55:22 INFO HoodieBackedTableMetadataWriter: Creating a new metadata index for partition 'column_stats' under path /tmp/hudi-ny-taxi/.hoodie/metadata upto instant 20220414195327683 +... +... 
+22/04/14 19:55:38 INFO RunIndexActionExecutor: Total remaining instants to index: 1 +22/04/14 19:55:38 INFO HoodieTableMetaClient: Loading HoodieTableMetaClient from /tmp/hudi-ny-taxi/.hoodie/metadata +22/04/14 19:55:38 INFO HoodieTableConfig: Loading table properties from /tmp/hudi-ny-taxi/.hoodie/metadata/.hoodie/hoodie.properties +22/04/14 19:55:38 INFO HoodieTableMetaClient: Finished Loading Table of type MERGE_ON_READ(version=1, baseFileFormat=HFILE) from /tmp/hudi-ny-taxi/.hoodie/metadata +22/04/14 19:55:38 INFO HoodieActiveTimeline: Loaded instants upto : Option{val=[20220414195423420__deltacommit__COMPLETED]} +22/04/14 19:55:38 INFO RunIndexActionExecutor: Starting index catchup task +... +``` + +### Drop Index + +To drop an index, just run the indexer in `dropindex` mode. + +``` +spark-submit \ +--jars "packaging/hudi-utilities-slim-bundle/target/hudi-utilities-slim-bundle_2.12-1.0.0.jar,packaging/hudi-spark-bundle/target/hudi-spark3.5-bundle_2.12-1.0.0.jar" \ +--class org.apache.hudi.utilities.HoodieIndexer \ +/Users/home/path/to/hudi-utilities-slim-bundle/target/hudi-utilities-slim-bundle_2.12-1.0.0.jar \ +--props /Users/home/path/to/indexer.properties \ +--mode dropindex \ +--base-path /tmp/hudi-ny-taxi \ +--table-name ny_hudi_tbl \ +--index-types COLUMN_STATS \ +--parallelism 1 \ +--spark-memory 2g +``` + +## Caveats + +The asynchronous indexing feature is still evolving. A few points to note from a deployment perspective while running the indexer: + +- Files index is created by default as long as the metadata table is enabled. +- Trigger indexing for one metadata partition (or index type) at a time. +- If an index is enabled via async indexing, then ensure that index is also enabled in configs corresponding to regular ingestion writers. Otherwise, metadata writer will + think that particular index was disabled and clean up the metadata partition. + +Some of these limitations will be removed in the upcoming releases.
Please +follow [HUDI-2488](https://issues.apache.org/jira/browse/HUDI-2488) for developments on this feature. + +## Related Resources +

Videos

+ +* [Advantages of Metadata Indexing and Asynchronous Indexing in Hudi Hands on Lab](https://www.youtube.com/watch?v=TSphQCsY4pY) \ No newline at end of file diff --git a/website/versioned_docs/version-1.0.0/metrics.md b/website/versioned_docs/version-1.0.0/metrics.md new file mode 100644 index 0000000000000..c6f2833f5ada2 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/metrics.md @@ -0,0 +1,232 @@ +--- +title: Metrics +keywords: [ hudi, administration, operation, devops, metrics] +summary: This section offers an overview of metrics in Hudi +toc: true +last_modified_at: 2020-06-20T15:59:57-04:00 +--- + +In this section, we will introduce the `MetricsReporter` and `HoodieMetrics` in Hudi. You can view the metrics-related configurations [here](configurations#METRICS). + +## MetricsReporter + +MetricsReporter provides APIs for reporting `HoodieMetrics` to user-specified backends. Currently, the implementations include InMemoryMetricsReporter, JmxMetricsReporter, MetricsGraphiteReporter and DatadogMetricsReporter. Since InMemoryMetricsReporter is only used for testing, we will introduce the other three implementations. + +### JmxMetricsReporter + +JmxMetricsReporter is an implementation of JMX reporter, which is used to report JMX metrics. + +#### Configurations +The following is an example of `JmxMetricsReporter`. More detailed configurations can be referenced [here](configurations#Metrics-Configurations-for-Jmx). + + ```properties + hoodie.metrics.on=true + hoodie.metrics.reporter.type=JMX + hoodie.metrics.jmx.host=192.168.0.106 + hoodie.metrics.jmx.port=4001 + ``` + +#### Demo +As configured above, JmxMetricsReporter will start a JMX server on port 4001. We can start a jconsole to connect to 192.168.0.106:4001. Below is an illustration of monitoring Hudi JMX metrics through jconsole. +
+ hudi_jxm_metrics.png +
+ +### MetricsGraphiteReporter + +MetricsGraphiteReporter is an implementation of Graphite reporter, which connects to a Graphite server, and sends `HoodieMetrics` to it. + +#### Configurations +The following is an example of `MetricsGraphiteReporter`. More detailed configurations can be referenced [here](configurations#Metrics-Configurations-for-Graphite). + + ```properties + hoodie.metrics.on=true + hoodie.metrics.reporter.type=GRAPHITE + hoodie.metrics.graphite.host=192.168.0.106 + hoodie.metrics.graphite.port=2003 + hoodie.metrics.graphite.metric.prefix= + ``` +#### Demo +As configured above, assuming a Graphite server is running on host 192.168.0.106 and port 2003, a running Hudi job will connect and report metrics data to it. Below is an illustration of monitoring hudi metrics through Graphite. +
+ hudi_graphite_metrics.png +
+ +### DatadogMetricsReporter + +DatadogMetricsReporter is an implementation of Datadog reporter. +A reporter which publishes metric values to Datadog monitoring service via Datadog HTTP API. + +#### Configurations +The following is an example of `DatadogMetricsReporter`. More detailed configurations can be referenced [here](configurations#Metrics-Configurations-for-Datadog-reporter). + +```properties +hoodie.metrics.on=true +hoodie.metrics.reporter.type=DATADOG +hoodie.metrics.datadog.api.site=EU # or US +hoodie.metrics.datadog.api.key= +hoodie.metrics.datadog.metric.prefix= +``` + + * `hoodie.metrics.datadog.api.site` will set the Datadog API site, which determines whether the requests will be sent to api.datadoghq.eu (EU) or api.datadoghq.com (US). Set this according to your Datadog account settings. + * `hoodie.metrics.datadog.api.key` will set the api key. + * `hoodie.metrics.datadog.metric.prefix` will help segregate metrics by setting different prefixes for different jobs. Note that it will use `.` to delimit the prefix and the metric name. For example, if the prefix is set to `foo`, then `foo.` will be prepended to the metric name. + +#### Demo +In this demo, we ran a `HoodieStreamer` job with `HoodieMetrics` turned on and other configurations set properly. + +
+ hudi_datadog_metrics.png +
+ + As shown above, we were able to collect Hudi's action-related metrics like + + * `<prefix>.<table name>.commit.totalScanTime` + * `<prefix>.<table name>.clean.duration` + * `<prefix>.<table name>.index.lookup.duration` + + as well as `HoodieStreamer`-specific metrics + + * `<prefix>.<table name>.deltastreamer.duration` + * `<prefix>.<table name>.deltastreamer.hiveSyncDuration` + +### PrometheusMetricsReporter +[Prometheus](https://prometheus.io/) is an open source systems monitoring and alerting toolkit. +Prometheus has a [PushGateway](https://prometheus.io/docs/practices/pushing/) that Apache Hudi can leverage for metrics reporting. +Follow [Prometheus documentation](https://prometheus.io/docs/introduction/first_steps/) for basic setup instructions. + +Similar to other supported reporters, the following attributes are required to enable pushgateway reporters: + +```scala +hoodie.metrics.on=true +hoodie.metrics.reporter.type=PROMETHEUS_PUSHGATEWAY +``` + +The following properties are used to configure the address and port number of pushgateway. The default address is +localhost, and the default port is 9091 + +```scala +hoodie.metrics.pushgateway.host=xxxx +hoodie.metrics.pushgateway.port=9091 +``` + +You can configure whether to delete the monitoring information from pushgateway at the end of the task, the default is true + +```scala +hoodie.metrics.pushgateway.delete.on.shutdown=false +``` + +You can configure the task name prefix and whether a random suffix is required. The default is true + +```scala +hoodie.metrics.pushgateway.job.name=xxxx +hoodie.metrics.pushgateway.random.job.name.suffix=false +``` + +### AWS CloudWatchReporter +Hudi supports publishing metrics to Amazon CloudWatch. It can be configured by setting [`hoodie.metrics.reporter.type`](https://hudi.apache.org/docs/next/configurations#hoodiemetricsreportertype) +to “CLOUDWATCH”. Static AWS credentials to be used can be configured using +[`hoodie.aws.access.key`](https://hudi.apache.org/docs/next/configurations#hoodieawsaccesskey), +[`hoodie.aws.secret.key`](https://hudi.apache.org/docs/next/configurations#hoodieawssecretkey), +[`hoodie.aws.session.token`](https://hudi.apache.org/docs/next/configurations#hoodieawssessiontoken) properties.
+In the absence of static AWS credentials being configured, `DefaultAWSCredentialsProviderChain` will be used to get +credentials by checking environment properties. Additional Amazon CloudWatch reporter specific properties that can be +tuned are in the `HoodieMetricsCloudWatchConfig` class. + +### UserDefinedMetricsReporter + +Allows users to define a custom metrics reporter. + +#### Configurations +The following is an example of `UserDefinedMetricsReporter`. More detailed configurations can be referenced [here](configurations#Metrics-Configurations). + +```properties +hoodie.metrics.on=true +hoodie.metrics.reporter.class=test.TestUserDefinedMetricsReporter +``` + +#### Demo +In this simple demo, TestMetricsReporter will print all gauges every 10 seconds + +```java +public static class TestUserDefinedMetricsReporter + extends AbstractUserDefinedMetricsReporter { + private static final Logger log = LogManager.getLogger(DummyMetricsReporter.class); + + private ScheduledExecutorService exec = Executors.newScheduledThreadPool(1, r -> { + Thread t = Executors.defaultThreadFactory().newThread(r); + t.setDaemon(true); + return t; + }); + + public TestUserDefinedMetricsReporter(Properties props, MetricRegistry registry) { + super(props, registry); + } + + @Override + public void start() { + exec.schedule(this::report, 10, TimeUnit.SECONDS); + } + + @Override + public void report() { + this.getRegistry().getGauges().forEach((key, value) -> + log.info("key: " + key + " value: " + value.getValue().toString())); + } + + @Override + public Closeable getReporter() { + return null; + } + + @Override + public void stop() { + exec.shutdown(); + } +} +``` + +## HoodieMetrics + +Once the Hudi writer is configured with the right table and environment for `HoodieMetrics`, it produces the following `HoodieMetrics`, that aid in debugging hudi tables + + - **Commit Duration** - The amount of time it took to successfully commit a batch of records + - **Rollback Duration** - Similarly, the 
amount of time taken to undo partial data left over by a failed commit (rollback happens automatically after a failing write) + - **File Level metrics** - Shows the amount of new files added, versions, deleted (cleaned) in each commit + - **Record Level Metrics** - Total records inserted/updated etc per commit + - **Partition Level metrics** - number of partitions upserted (super useful to understand sudden spikes in commit duration) + +These `HoodieMetrics` can then be plotted on a standard tool like grafana. Below is a sample commit duration chart. + +
+ hudi_commit_duration.png +
+ +## List of metrics: + +The below metrics are available in all timeline operations that involve a commit such as deltacommit, compaction, clustering and rollback. + +Name | Description +--- | --- +commitFreshnessInMs | Milliseconds from the commit end time and the maximum event time of the incoming records +commitLatencyInMs | Milliseconds from the commit end time and the minimum event time of incoming records +commitTime | Time of commit in epoch milliseconds +duration | Total time taken for the commit/rollback in milliseconds +numFilesDeleted | Number of files deleted during a clean/rollback +numFilesFinalized | Number of files finalized in a write +totalBytesWritten | Bytes written in a HoodieCommit +totalCompactedRecordsUpdated | Number of records updated in a compaction operation +totalCreateTime | Time taken for file creation during a Hoodie Insert operation +totalFilesInsert | Number of newly written files in a HoodieCommit +totalFilesUpdate | Number of files updated in a HoodieCommit +totalInsertRecordsWritten | Number of records inserted or converted to updates (for small file handling) in a HoodieCommit +totalLogFilesCompacted | Number of log files under a base file in a file group compacted +totalLogFilesSize | Total size in bytes of all log files under a base file in a file group +totalPartitionsWritten | Number of partitions that took writes in a HoodieCommit +totalRecordsWritten | Number of records written in a HoodieCommit. For inserts, it is the total number of records inserted. And for updates, it is the total number of records in the file.
+totalScanTime | Time taken for reading and merging logblocks in a log file +totalUpdateRecordsWritten | Number of records that got changed in a HoodieCommit +totalUpsertTime | Time taken for Hoodie Merge + +These metrics can be found at org.apache.hudi.metrics.HoodieMetrics and referenced from +org.apache.hudi.common.model.HoodieCommitMetadata and org.apache.hudi.common.model.HoodieWriteStat diff --git a/website/versioned_docs/version-1.0.0/migration_guide.md b/website/versioned_docs/version-1.0.0/migration_guide.md new file mode 100644 index 0000000000000..c8839a2005f56 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/migration_guide.md @@ -0,0 +1,118 @@ +--- +title: Bootstrapping +keywords: [ hudi, migration, use case] +summary: In this page, we will discuss some available tools for migrating your existing table into a Hudi table +last_modified_at: 2019-12-30T15:59:57-04:00 +toc: true +toc_min_heading_level: 2 +toc_max_heading_level: 4 +--- + +Hudi maintains metadata such as commit timeline and indexes to manage a table. The commit timeline helps to understand the actions happening on a table as well as the current state of a table. Indexes are used by Hudi to maintain a record key to file id mapping to efficiently locate a record. At the moment, Hudi supports writing only parquet columnar formats. +To be able to start using Hudi for your existing table, you will need to migrate your existing table into a Hudi managed table. There are a couple of ways to achieve this. + + +## Approaches + + +### Use Hudi for new partitions alone + +Hudi can be used to manage an existing table without affecting/altering the historical data already present in the +table. Hudi has been implemented to be compatible with such a mixed table with a caveat that either the complete +Hive partition is Hudi managed or not. Thus the lowest granularity at which Hudi manages a table is a Hive +partition.
Start using the datasource API or the WriteClient to write to the table and make sure you start writing +to a new partition or convert your last N partitions into Hudi instead of the entire table. Note, since the historical + partitions are not managed by HUDI, none of the primitives provided by HUDI work on the data in those partitions. More concretely, one cannot perform upserts or incremental pull on such older partitions not managed by the HUDI table. +Take this approach if your table is an append only type of table and you do not expect to perform any updates to existing (or non Hudi managed) partitions. + + +### Convert existing table to Hudi + +Import your existing table into a Hudi managed table. Since all the data is Hudi managed, none of the limitations + of Approach 1 apply here. Updates spanning any partitions can be applied to this table and Hudi will efficiently + make the update available to queries. Note that not only do you get to use all Hudi primitives on this table, + there are other additional advantages of doing this. Hudi automatically manages file sizes of a Hudi managed table + . You can define the desired file size when converting this table and Hudi will ensure it writes out files + adhering to the config. It will also ensure that smaller files later get corrected by routing some new inserts into + small files rather than writing new small ones thus maintaining the health of your cluster. + +There are a few options when choosing this approach. + +#### Using Hudi Streamer + +Use the [Hudi Streamer](/docs/hoodie_streaming_ingestion#hudi-streamer) tool. HoodieStreamer supports bootstrap with +--run-bootstrap command line option. There are two types of bootstrap, METADATA_ONLY and FULL_RECORD. METADATA_ONLY will +generate just skeleton base files with keys/footers, avoiding full cost of rewriting the dataset. FULL_RECORD will +perform a full copy/rewrite of the data as a Hudi table. 
Additionally, one can choose selective partitions using regex +patterns to apply one of the above bootstrap modes. + +Here is an example for running FULL_RECORD bootstrap on all partitions that match the regex pattern `.*` and keeping +hive style partition with HoodieStreamer. This example configures +[hoodie.bootstrap.mode.selector](https://hudi.apache.org/docs/configurations#hoodiebootstrapmodeselector) to +`org.apache.hudi.client.bootstrap.selector.BootstrapRegexModeSelector` which allows applying `FULL_RECORD` bootstrap +mode to selective partitions based on the regex pattern [hoodie.bootstrap.mode.selector.regex](https://hudi.apache.org/docs/configurations#hoodiebootstrapmodeselectorregex) + +``` +spark-submit --master local \ +--jars "packaging/hudi-utilities-slim-bundle/target/hudi-utilities-slim-bundle_2.12-1.0.0.jar,packaging/hudi-spark-bundle/target/hudi-spark3.5-bundle_2.12-1.0.0.jar" \ +--conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' \ +--class org.apache.hudi.utilities.streamer.HoodieStreamer `ls packaging/hudi-utilities-slim-bundle/target/hudi-utilities-slim-bundle-*.jar` \ +--run-bootstrap \ +--target-base-path /tmp/hoodie/bootstrap_table \ +--target-table bootstrap_table \ +--table-type COPY_ON_WRITE \ +--hoodie-conf hoodie.bootstrap.base.path=/tmp/source_table \ +--hoodie-conf hoodie.datasource.write.recordkey.field=${KEY_FIELD} \ +--hoodie-conf hoodie.datasource.write.partitionpath.field=${PARTITION_FIELD} \ +--hoodie-conf hoodie.datasource.write.precombine.field=${PRECOMBINE_FILED} \ +--hoodie-conf hoodie.bootstrap.keygen.class=org.apache.hudi.keygen.SimpleKeyGenerator \ +--hoodie-conf hoodie.bootstrap.mode.selector=org.apache.hudi.client.bootstrap.selector.BootstrapRegexModeSelector \ +--hoodie-conf hoodie.bootstrap.mode.selector.regex='.*' \ +--hoodie-conf hoodie.bootstrap.mode.selector.regex.mode=FULL_RECORD \ +--hoodie-conf hoodie.datasource.write.hive_style_partitioning=true +``` + +#### Using Spark Datasource Writer +
+For huge tables, this could be as simple as: +```java +for partition in [list of partitions in source table] { + val inputDF = spark.read.format("any_input_format").load("partition_path") + inputDF.write.format("org.apache.hudi").option()....save("basePath") +} +``` + +#### Using Spark SQL CALL Procedure + +Refer to [Bootstrap procedure](https://hudi.apache.org/docs/next/procedures#bootstrap) for more details. + +#### Using Hudi CLI + +Write your own custom logic of how to load an existing table into a Hudi managed one. Please read about the RDD API +[here](/docs/quick-start-guide). Alternatively, use the `bootstrap run` command of the Hudi CLI. Once hudi has been built via `mvn clean install -DskipTests`, the shell can be +started via `cd hudi-cli && ./hudi-cli.sh`. + +```java +hudi->bootstrap run --srcPath /tmp/source_table --targetPath /tmp/hoodie/bootstrap_table --tableName bootstrap_table --tableType COPY_ON_WRITE --rowKeyField ${KEY_FIELD} --partitionPathField ${PARTITION_FIELD} --sparkMaster local --hoodieConfigs hoodie.datasource.write.hive_style_partitioning=true --selectorClass org.apache.hudi.client.bootstrap.selector.FullRecordBootstrapModeSelector +``` +Unlike Hudi Streamer, FULL_RECORD or METADATA_ONLY is set with --selectorClass, see details with help "bootstrap run". + + +## Configs + +Here are the basic configs that control bootstrapping.
+ +| Config Name | Default | Description | +| --------------------------------------------------- | ------------------ |---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| hoodie.bootstrap.base.path | N/A **(Required)** | Base path of the dataset that needs to be bootstrapped as a Hudi table

`Config Param: BASE_PATH`
`Since Version: 0.6.0` | +| hoodie.bootstrap.mode.selector | org.apache.hudi.client.bootstrap.selector.MetadataOnlyBootstrapModeSelector (Optional) | Selects the mode in which each file/partition in the bootstrapped dataset gets bootstrapped
Possible values:
  • `org.apache.hudi.client.bootstrap.selector.MetadataOnlyBootstrapModeSelector`: In this mode, the full record data is not copied into Hudi therefore it avoids full cost of rewriting the dataset. Instead, 'skeleton' files containing just the corresponding metadata columns are added to the Hudi table. Hudi relies on the data in the original table and will face data-loss or corruption if files in the original table location are deleted or modified.
  • `org.apache.hudi.client.bootstrap.selector.FullRecordBootstrapModeSelector`: In this mode, the full record data is copied into hudi and metadata columns are added. A full record bootstrap is functionally equivalent to a bulk-insert. After a full record bootstrap, Hudi will function properly even if the original table is modified or deleted.
  • `org.apache.hudi.client.bootstrap.selector.BootstrapRegexModeSelector`: A bootstrap selector which employs bootstrap mode by specified partitions.

`Config Param: MODE_SELECTOR_CLASS_NAME`
`Since Version: 0.6.0` | +| hoodie.bootstrap.mode.selector.regex | .* (Optional) | Matches each bootstrap dataset partition against this regex and applies the mode below to it. This is **applicable only when** `hoodie.bootstrap.mode.selector` equals `org.apache.hudi.client.bootstrap.selector.BootstrapRegexModeSelector`

`Config Param: PARTITION_SELECTOR_REGEX_PATTERN`
`Since Version: 0.6.0` | +| hoodie.bootstrap.mode.selector.regex.mode | METADATA_ONLY (Optional) | When specified, applies one of the possible [Bootstrap Modes](https://github.com/apache/hudi/blob/bc583b4158684c23f35d787de5afda13c2865ad4/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/bootstrap/BootstrapMode.java) to the partitions that match the regex provided as part of the `hoodie.bootstrap.mode.selector.regex`. For unmatched partitions the other Bootstrap Mode is applied. This is **applicable only when** `hoodie.bootstrap.mode.selector` equals `org.apache.hudi.client.bootstrap.selector.BootstrapRegexModeSelector`.
Possible values:
  • [FULL_RECORD](https://github.com/apache/hudi/blob/bc583b4158684c23f35d787de5afda13c2865ad4/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/bootstrap/BootstrapMode.java#L36C5-L36C5)
  • [METADATA_ONLY](https://github.com/apache/hudi/blob/bc583b4158684c23f35d787de5afda13c2865ad4/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/bootstrap/BootstrapMode.java#L44C4-L44C4)

`Config Param: PARTITION_SELECTOR_REGEX_MODE`
`Since Version: 0.6.0` | + +By default, with only `hoodie.bootstrap.base.path` being provided METADATA_ONLY mode is selected. For other options, please refer [bootstrap configs](https://hudi.apache.org/docs/next/configurations#Bootstrap-Configs) for more details. + +## Related Resources +

Videos

+ +* [Bootstrapping in Apache Hudi on EMR Serverless with Lab](https://www.youtube.com/watch?v=iTNLqbW3YYA) + \ No newline at end of file diff --git a/website/versioned_docs/version-1.0.0/oci_hoodie.md b/website/versioned_docs/version-1.0.0/oci_hoodie.md new file mode 100644 index 0000000000000..872b2656405eb --- /dev/null +++ b/website/versioned_docs/version-1.0.0/oci_hoodie.md @@ -0,0 +1,80 @@ +--- +title: Oracle Cloud Infrastructure +keywords: [ hudi, hive, oracle cloud, storage, spark ] +summary: In this page, we go over how to configure hudi with Oracle Cloud Infrastructure Object Storage. +last_modified_at: 2022-03-03T16:57:05-08:00 +--- +The [Oracle Object Storage](https://docs.oracle.com/en-us/iaas/Content/Object/Concepts/objectstorageoverview.htm) system provides strongly-consistent operations on all buckets in all regions. OCI Object Storage provides an [HDFS Connector](https://docs.oracle.com/en-us/iaas/Content/API/SDKDocs/hdfsconnector.htm) your Application will need to access data. + +## OCI Configs + +To use HUDI on OCI Object Storage you must: + +- Configure the HDFS Connector using an API key +- Include the HDFS Connector and dependencies in your application +- Construct an OCI HDFS URI + +### Configuring the HDFS Connector + +The OCI HDFS Connector requires configurations from an API key to authenticate and select the correct region. Start by [generating an API key](https://docs.oracle.com/en-us/iaas/Content/API/Concepts/apisigningkey.htm). 
+ +If you are using Hadoop, include these in your core-site.xml: + +```xml + + fs.oci.client.auth.tenantId + ocid1.tenancy.oc1..[tenant] + The OCID of your OCI tenancy + + + + fs.oci.client.auth.userId + ocid1.user.oc1..[user] + The OCID of your OCI user + + + + fs.oci.client.auth.fingerprint + XX::XX + Your 32-digit hexadecimal public key fingerprint + + + + fs.oci.client.auth.pemfilepath + /path/to/file + Local path to your private key file + + + + fs.oci.client.auth.hostname + https://objectstorage.[region].oraclecloud.com + HTTPS endpoint of your regional object store + +``` + +If you are using Spark outside of Hadoop, set these configurations in your Spark Session: + +| Key | Description | +| ------------------------------------------- | ------------------------------------------------ | +| spark.hadoop.fs.oci.client.auth.tenantId | The OCID of your OCI tenancy | +| spark.hadoop.fs.oci.client.auth.userId | The OCID of your OCI user | +| spark.hadoop.fs.oci.client.auth.fingerprint | Your 32-digit hexadecimal public key fingerprint | +| spark.hadoop.fs.oci.client.auth.pemfilepath | Local path to your private key file | +| spark.hadoop.fs.oci.client.hostname | HTTPS endpoint of your regional object store | + +If you are running Spark in OCI Data Flow you do not need to configure these settings; object storage access is configured for you. + +### Libraries + +These libraries need to be added to your application. The versions below are a reference; the libraries are continuously updated and you should check for later releases in Maven Central. + +- com.oracle.oci.sdk:oci-java-sdk-core:2.18.0 +- com.oracle.oci.sdk:oci-hdfs-connector:3.3.0.5 + +### Construct an OCI HDFS URI + +OCI HDFS URIs have the form of: + +`oci://<bucket>@<namespace>/<path>` + +The HDFS connector allows you to treat these locations similar to an `HDFS` location on Hadoop. Your tenancy has a unique Object Storage namespace.
If you're not sure what your namespace is you can find it by installing the [OCI CLI](https://docs.oracle.com/en-us/iaas/Content/API/SDKDocs/cliinstall.htm) and running `oci os ns get`. \ No newline at end of file diff --git a/website/versioned_docs/version-1.0.0/oss_hoodie.md b/website/versioned_docs/version-1.0.0/oss_hoodie.md new file mode 100644 index 0000000000000..894bbcff237b7 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/oss_hoodie.md @@ -0,0 +1,70 @@ +--- +title: Alibaba Cloud +keywords: [ hudi, hive, aliyun, oss, spark, presto] +summary: In this page, we go over how to configure Hudi with OSS filesystem. +last_modified_at: 2020-04-21T11:38:24-10:00 +--- +In this page, we explain how to get your Hudi spark job to store into Aliyun OSS. + +## Aliyun OSS configs + +There are two configurations required for Hudi-OSS compatibility: + +- Adding Aliyun OSS Credentials for Hudi +- Adding required Jars to classpath + +### Aliyun OSS Credentials + +Add the required configs in your core-site.xml from where Hudi can fetch them. Replace the `fs.defaultFS` with your OSS bucket name, replace `fs.oss.endpoint` with your OSS endpoint, replace `fs.oss.accessKeyId` with your OSS key, replace `fs.oss.accessKeySecret` with your OSS secret. Hudi should be able to read/write from the bucket. + +```xml + + fs.defaultFS + oss://bucketname/ + + + + fs.oss.endpoint + oss-endpoint-address + Aliyun OSS endpoint to connect to. + + + + fs.oss.accessKeyId + oss_key + Aliyun access key ID + + + + fs.oss.accessKeySecret + oss-secret + Aliyun access key secret + + + + fs.oss.impl + org.apache.hadoop.fs.aliyun.oss.AliyunOSSFileSystem + +``` + +### Aliyun OSS Libs + +Aliyun hadoop libraries jars to add to our pom.xml. Since hadoop-aliyun depends on the version of hadoop 2.9.1+, you need to use the version of hadoop 2.9.1 or later. 
+ +```xml +<dependency> + <groupId>org.apache.hadoop</groupId> + <artifactId>hadoop-aliyun</artifactId> + <version>3.2.1</version> +</dependency> +<dependency> + <groupId>com.aliyun.oss</groupId> + <artifactId>aliyun-sdk-oss</artifactId> + <version>3.8.1</version> +</dependency> +<dependency> + <groupId>org.jdom</groupId> + <artifactId>jdom</artifactId> + <version>1.1</version> +</dependency> +``` diff --git a/website/versioned_docs/version-1.0.0/overview.mdx b/website/versioned_docs/version-1.0.0/overview.mdx new file mode 100644 index 0000000000000..bb8910f9c7eda --- /dev/null +++ b/website/versioned_docs/version-1.0.0/overview.mdx @@ -0,0 +1,78 @@ +--- +title: "Overview" +keywords: [ hudi, design, table, queries, timeline] +summary: "Here we introduce some basic concepts & give a broad technical overview of Hudi" +toc: true +last_modified_at: 2019-12-30T15:59:57-04:00 +--- + +import SlackCommunity from '@site/src/components/SlackCommunity'; + +Hello there! This overview will provide a high level summary of what Apache Hudi is and will orient you on +how to learn more to get started. + +## What is Apache Hudi + +Apache Hudi (pronounced "hoodie") pioneered the concept of "[transactional data lakes](https://www.uber.com/blog/hoodie/)", which is more popularly known today as +the data lakehouse architecture. Today, Hudi has grown into an [open data lakehouse platform](/blog/2021/07/21/streaming-data-lake-platform), with an open table format purpose-built for high performance writes on +incremental data pipelines and fast query performance due to comprehensive table optimizations. + +Hudi brings core database functionality directly to a data lake - [tables](/docs/next/sql_ddl), +[transactions](/docs/next/timeline), [efficient upserts/deletes](/docs/next/write_operations), [advanced indexes](/docs/next/indexes), +[ingestion services](/docs/hoodie_streaming_ingestion), data [clustering](/docs/next/clustering)/[compaction](/docs/next/compaction) optimizations, +and [concurrency control](/docs/next/concurrency_control) all while keeping your data in open file formats. Not only is Apache Hudi great for streaming workloads, +but it also allows you to create efficient incremental batch pipelines.
Apache Hudi can easily be used on any [cloud storage platform](/docs/cloud). +Hudi’s advanced performance optimizations make analytical queries/pipelines faster with any of the popular query engines, including Apache Spark, Flink, Presto, Trino, Hive, etc. + +Read the docs for more [use case descriptions](/docs/use_cases) and check out [who's using Hudi](/powered-by), to see how some of the +largest data lakes in the world including [Uber](https://eng.uber.com/uber-big-data-platform/), [Amazon](https://aws.amazon.com/blogs/big-data/how-amazon-transportation-service-enabled-near-real-time-event-analytics-at-petabyte-scale-using-aws-glue-with-apache-hudi/), +[ByteDance](http://hudi.apache.org/blog/2021/09/01/building-eb-level-data-lake-using-hudi-at-bytedance), +[Robinhood](https://s.apache.org/hudi-robinhood-talk) and more are transforming their production data lakes with Hudi. + +[Hudi-rs](https://github.com/apache/hudi-rs) is the native Rust implementation for Apache Hudi, which also provides bindings to Python. It +expands the use of Apache Hudi for a diverse range of use cases in the non-JVM ecosystems. + +## Core Concepts to Learn + +If you are relatively new to Apache Hudi, it is important to be familiar with a few core concepts: +- [Hudi Timeline](/docs/next/timeline) – How Hudi manages transactions and other table services +- [Hudi File Layout](/docs/next/storage_layouts) - How the files are laid out on storage +- [Hudi Table Types](/docs/next/table_types) – `COPY_ON_WRITE` and `MERGE_ON_READ` +- [Hudi Query Types](/docs/next/table_types#query-types) – Snapshot Queries, Incremental Queries, Read-Optimized Queries + +See more in the "Design & Concepts" section of the docs. + +Take a look at recent [blog posts](/blog) that go in depth on certain topics or use cases. + +## Getting Started + +Sometimes the fastest way to learn is by doing.
Try out these Quick Start resources to get up and running in minutes: + +- [Spark Quick Start Guide](/docs/quick-start-guide) – if you primarily use Apache Spark +- [Flink Quick Start Guide](/docs/flink-quick-start-guide) – if you primarily use Apache Flink +- [Python/Rust Quick Start Guide (Hudi-rs)](/docs/python-rust-quick-start-guide) - if you primarily use Python or Rust + +If you want to experience Apache Hudi integrated into an end to end demo with Kafka, Spark, Hive, Presto, etc, try out the [Docker Demo](/docs/docker_demo) + +## Connect With The Community +Apache Hudi is community-focused and community-led and welcomes new-comers with open arms. Leverage the following +resources to learn more, engage, and get help as you get started. + +### Join in on discussions +See all the ways to [engage with the community here](/community/get-involved). Two most popular methods include: +- +- [Hudi mailing list](mailto:users-subscribe@hudi.apache.org) - (send any msg to subscribe) + +### Come to Office Hours for help +Weekly office hours are [posted here](/community/office_hours) + +### Community Calls +Attend [monthly community calls](/community/syncs#monthly-community-call) to learn best practices and see what others are building. + +## Contribute +Apache Hudi welcomes you to join in on the fun and make a lasting impact on the industry as a whole. See our +[contributor guide](/contribute/how-to-contribute) to learn more, and don’t hesitate to directly reach out to any of the +current committers to learn more. + +Have an idea, an ask, or feedback about a pain-point, but don’t have time to contribute? Join the +and share! 
diff --git a/website/versioned_docs/version-1.0.0/performance.md b/website/versioned_docs/version-1.0.0/performance.md new file mode 100644 index 0000000000000..0663535c07d7d --- /dev/null +++ b/website/versioned_docs/version-1.0.0/performance.md @@ -0,0 +1,133 @@ +--- +title: Performance +keywords: [ hudi, index, storage, compaction, cleaning, implementation] +toc: false +last_modified_at: 2019-12-30T15:59:57-04:00 +--- + +## Optimized DFS Access + +Hudi also performs several key storage management functions on the data stored in a Hudi table. A key aspect of storing data on DFS is managing file sizes and counts +and reclaiming storage space. For e.g HDFS is infamous for its handling of small files, which exerts memory/RPC pressure on the Name Node and can potentially destabilize +the entire cluster. In general, query engines provide much better performance on adequately sized columnar files, since they can effectively amortize cost of obtaining +column statistics etc. Even on some cloud data stores, there is often cost to listing directories with large number of small files. + +Here are some ways to efficiently manage the storage of your Hudi tables. + +- The [small file handling feature](/docs/configurations/#hoodieparquetsmallfilelimit) in Hudi, profiles incoming workload + and distributes inserts to existing file groups instead of creating new file groups, which can lead to small files. 
+- Cleaner can be [configured](/docs/configurations#hoodiecleanercommitsretained) to clean up older file slices, more or less aggressively depending on maximum time for queries to run & lookback needed for incremental pull +- User can also tune the size of the [base/parquet file](/docs/configurations#hoodieparquetmaxfilesize), [log files](/docs/configurations#hoodielogfilemaxsize) & expected [compression ratio](/docs/configurations#hoodieparquetcompressionratio), + such that sufficient number of inserts are grouped into the same file group, resulting in well sized base files ultimately. +- Intelligently tuning the [bulk insert parallelism](/docs/configurations#hoodiebulkinsertshuffleparallelism) can again result in nicely sized initial file groups. It is in fact critical to get this right, since the file groups + once created cannot be changed without re-clustering the table. Writes will simply expand given file groups with new updates/inserts as explained before. +- For workloads with heavy updates, the [merge-on-read table](/docs/concepts#merge-on-read-table) provides a nice mechanism for ingesting quickly into smaller files and then later merging them into larger base files via compaction. + +## Performance Optimizations + +In this section, we go over some real world performance numbers for Hudi upserts, incremental pull and compare them against +the conventional alternatives for achieving these tasks. + +### Write Path + +#### Bulk Insert + +Write configurations in Hudi are optimized for incremental upserts by default. In fact, the default write operation type is UPSERT as well.
+For simple append-only use case to bulk load the data, following set of configurations are recommended for optimal writing: +``` +-- Use “bulk-insert” write-operation instead of default “upsert” +hoodie.datasource.write.operation = BULK_INSERT +-- Disable populating meta columns and metadata, and enable virtual keys +hoodie.populate.meta.fields = false +hoodie.metadata.enable = false +-- Enable snappy compression codec for lesser CPU cycles (but more storage overhead) +hoodie.parquet.compression.codec = snappy +``` + +For ingesting via spark-sql +``` +-- Use “bulk-insert” write-operation instead of default “upsert” +hoodie.sql.insert.mode = non-strict, +hoodie.sql.bulk.insert.enable = true, +-- Disable populating meta columns and metadata, and enable virtual keys +hoodie.populate.meta.fields = false +hoodie.metadata.enable = false +-- Enable snappy compression codec for lesser CPU cycles (but more storage overhead) +hoodie.parquet.compression.codec = snappy +``` + +We recently benchmarked Hudi against TPC-DS workload. +Please check out [our blog](/blog/2022/06/29/Apache-Hudi-vs-Delta-Lake-transparent-tpc-ds-lakehouse-performance-benchmarks) for more details. + +#### Upserts + +Following shows the speed up obtained for NoSQL database ingestion, from incrementally upserting on a Hudi table on the copy-on-write storage, +on 5 tables ranging from small to huge (as opposed to bulk loading the tables) + +
+ hudi_upsert_perf1.png +
+ +Given Hudi can build the table incrementally, it opens doors for also scheduling ingesting more frequently thus reducing latency, with +significant savings on the overall compute cost. + +
+ hudi_upsert_perf2.png +
+ +Hudi upserts have been stress tested up to 4TB in a single commit across the t1 table. +See [here](https://cwiki.apache.org/confluence/display/HUDI/Tuning+Guide) for some tuning tips. + +#### Indexing + +In order to efficiently upsert data, Hudi needs to classify records in a write batch into inserts & updates (tagged with the file group +it belongs to). In order to speed this operation, Hudi employs a pluggable index mechanism that stores a mapping between recordKey and +the file group id it belongs to. By default, Hudi uses a built in index that uses file ranges and bloom filters to accomplish this, with +up to 10x speed up over a spark join to do the same. + +Hudi provides best indexing performance when you model the recordKey to be monotonically increasing (e.g., timestamp prefix), leading to range pruning filtering +out a lot of files for comparison. Even for UUID based keys, there are [known techniques](https://www.percona.com/blog/2014/12/19/store-uuid-optimized-way/) to achieve this. +For example, with 100M timestamp prefixed keys (5% updates, 95% inserts) on an event table with 80B keys/3 partitions/11416 files/10TB data, Hudi index achieves a +**~7X (2880 secs vs 440 secs) speed up** over vanilla spark join. Even for a challenging workload like a '100% update' database ingestion workload spanning +3.25B UUID keys/30 partitions/6180 files using 300 cores, Hudi indexing offers an **80-100% speedup**. + + +### Read Path + +#### Data Skipping + + +Data Skipping is a technique (originally introduced in Hudi 0.10) that leverages metadata to very effectively prune the search space of a query, +by eliminating files that cannot possibly contain data matching the query's filters. By maintaining this metadata in the internal Hudi metadata table, +Hudi avoids reading file footers to obtain this information, which can be costly for queries spanning tens of thousands of files.
+ +Data Skipping leverages metadata table's `col_stats` partition bearing column-level statistics (such as min-value, max-value, count of null-values in the column, etc) +for every file of the Hudi table. This then allows Hudi for every incoming query instead of enumerating every file in the table and reading its corresponding metadata +(for ex, Parquet footers) for analysis whether it could contain any data matching the query filters, to simply do a query against a Column Stats Index +in the Metadata Table (which in turn is a Hudi table itself) and within seconds (even for TBs scale tables, with 10s of thousands of files) obtain the list +of _all the files that might potentially contain the data_ matching query's filters with crucial property that files that could be ruled out as not containing such data +(based on their column-level statistics) will be stripped out. See [RFC-27](https://github.com/apache/hudi/blob/master/rfc/rfc-27/rfc-27.md) for detailed design. + +Partitioning can be considered a coarse form of indexing and data skipping using the col_stats partition can be thought of as a range index, that databases use to identify potential +blocks of data interesting to a query. Unlike partition pruning for tables using physical partitioning where records in the dataset are organized into a folder structure based +on some column's value, data skipping using col_stats delivers a logical/virtual partitioning. + +For very large tables (1Tb+, 10s of 1000s of files), Data skipping could + +1. Substantially improve query execution runtime **10x** as compared to the same query on the same dataset but w/o Data Skipping enabled. +2. Help avoid hitting Cloud Storages throttling limits (for issuing too many requests, for ex, AWS limits # of requests / sec that could be issued based on the object's prefix which considerably complicates things for partitioned tables) + +To unlock the power of Data Skipping you will need to + +1. 
Enable Metadata Table along with Column Stats Index on the _write path_ (See [Metadata Indexing](/docs/metadata_indexing)), using `hoodie.metadata.enable=true` (to enable Metadata Table on the write path, enabled by default) and `hoodie.metadata.index.column.stats.enable=true` (to enable Column Stats Index being populated on the write path, disabled by default) +2. Enable Data Skipping in your queries, using the read-path properties listed below + +:::note +If you're planning on enabling Column Stats Index for already existing table, please check out the [Metadata Indexing](/docs/metadata_indexing) guide on how to build Metadata Table Indices (such as Column Stats Index) for existing tables. +::: + +To enable Data Skipping in your queries make sure to set the following properties to `true` (on the read path): + + - `hoodie.enable.data.skipping` (to control data skipping, enabled by default) + - `hoodie.metadata.enable` (to enable metadata table use on the read path, enabled by default) + - `hoodie.metadata.index.column.stats.enable` (to enable column stats index use on the read path) diff --git a/website/versioned_docs/version-1.0.0/platform_services_post_commit_callback.md b/website/versioned_docs/version-1.0.0/platform_services_post_commit_callback.md new file mode 100644 index 0000000000000..e02f0d01146a1 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/platform_services_post_commit_callback.md @@ -0,0 +1,58 @@ +--- +title: Post-commit Callback +keywords: [hudi, platform, commit, callback] +--- + +Apache Hudi provides the ability to post a callback notification about a write commit. This may be valuable if you need +an event notification stream to take actions with other services after a Hudi write commit. +You can push a write commit callback notification into HTTP endpoints or to a Kafka server. + +## HTTP Endpoints +You can push a commit notification to an HTTP URL and can specify custom values by extending a callback class defined below.
+ +| Config | Description | Required | Default | +| ----------- | ------- | ------- | ------ | +| TURN_CALLBACK_ON | Turn commit callback on/off | optional | false (*callbacks off*) | +| CALLBACK_HTTP_URL | Callback host to be sent along with callback messages | required | N/A | +| CALLBACK_HTTP_TIMEOUT_IN_SECONDS | Callback timeout in seconds | optional | 3 | +| CALLBACK_CLASS_NAME | Full path of callback class and must be a subclass of HoodieWriteCommitCallback class, org.apache.hudi.callback.impl.HoodieWriteCommitHttpCallback by default | optional | org.apache.hudi.callback.impl.HoodieWriteCommitHttpCallback | +| CALLBACK_HTTP_API_KEY_VALUE | Http callback API key | optional | hudi_write_commit_http_callback | +| CALLBACK_HTTP_CUSTOM_HEADERS | Http callback custom headers. Format: HeaderName1:HeaderValue1;HeaderName2:HeaderValue2 | optional | N/A | +| | | | | + +## Kafka Endpoints +You can push a commit notification to a Kafka topic so it can be used by other real time systems. + +| Config | Description | Required | Default | +| ----------- | ------- | ------- | ------ | +| TOPIC | Kafka topic name to publish timeline activity into. | required | N/A | +| PARTITION | It may be desirable to serialize all changes into a single Kafka partition for providing strict ordering. By default, Kafka messages are keyed by table name, which guarantees ordering at the table level, but not globally (or when new partitions are added) | required | N/A | +| RETRIES | Times to retry the produce | optional | 3 | +| ACKS | kafka acks level, all by default to ensure strong durability | optional | all | +| BOOTSTRAP_SERVERS | Bootstrap servers of kafka cluster, to be used for publishing commit metadata | required | N/A | +| | | | | + +## Pulsar Endpoints +You can push a commit notification to a Pulsar topic so it can be used by other real time systems. 
+ +| Config | Description | Required | Default | +| ----------- |-----------------------------------------------------------------------------| ------- |--------| +| hoodie.write.commit.callback.pulsar.broker.service.url | Server's Url of pulsar cluster to use to publish commit metadata. | required | N/A | +| hoodie.write.commit.callback.pulsar.topic | Pulsar topic name to publish timeline activity into | required | N/A | +| hoodie.write.commit.callback.pulsar.producer.route-mode | Message routing logic for producers on partitioned topics. | optional | RoundRobinPartition | +| hoodie.write.commit.callback.pulsar.producer.pending-queue-size | The maximum size of a queue holding pending messages. | optional | 1000 | +| hoodie.write.commit.callback.pulsar.producer.pending-total-size | The maximum number of pending messages across partitions. | required | 50000 | +| hoodie.write.commit.callback.pulsar.producer.block-if-queue-full | When the queue is full, the method is blocked instead of an exception is thrown. | optional | true | +| hoodie.write.commit.callback.pulsar.producer.send-timeout | The timeout in each sending to pulsar. | optional | 30s | +| hoodie.write.commit.callback.pulsar.operation-timeout | Duration of waiting for completing an operation. | optional | 30s | +| hoodie.write.commit.callback.pulsar.connection-timeout | Duration of waiting for a connection to a broker to be established. | optional | 10s | +| hoodie.write.commit.callback.pulsar.request-timeout | Duration of waiting for completing a request. | optional | 60s | +| hoodie.write.commit.callback.pulsar.keepalive-interval | Duration of keeping alive interval for each client broker connection. | optional | 30s | +| | | | | + +## Bring your own implementation +You can extend the HoodieWriteCommitCallback class to implement your own way to asynchronously handle the callback +of a successful write. 
Use this public API: + +https://github.com/apache/hudi/blob/master/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/callback/HoodieWriteCommitCallback.java + diff --git a/website/versioned_docs/version-1.0.0/precommit_validator.md b/website/versioned_docs/version-1.0.0/precommit_validator.md new file mode 100644 index 0000000000000..d5faf61057dee --- /dev/null +++ b/website/versioned_docs/version-1.0.0/precommit_validator.md @@ -0,0 +1,101 @@ +--- +title: Data Quality +keywords: [ hudi, quality, expectations, pre-commit validator] +--- + +Data quality refers to the overall accuracy, completeness, consistency, and validity of data. Ensuring data quality is vital for accurate analysis and reporting, as well as for compliance with regulations and maintaining trust in your organization's data infrastructure. + +Hudi offers **Pre-Commit Validators** that allow you to ensure that your data meets certain data quality expectations as you are writing with Hudi Streamer or Spark Datasource writers. + +To configure pre-commit validators, use this setting `hoodie.precommit.validators=`. + +Example: +```scala +spark.write.format("hudi") + .option("hoodie.precommit.validators", "org.apache.hudi.client.validator.SqlQueryEqualityPreCommitValidator") +``` + +Today you can use any of these validators and even have the flexibility to extend your own: + +## SQL Query Single Result +[org.apache.hudi.client.validator.SqlQuerySingleResultPreCommitValidator](https://github.com/apache/hudi/blob/bf5a52e51bbeaa089995335a0a4c55884792e505/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/validator/SqlQuerySingleResultPreCommitValidator.java) + +The SQL Query Single Result validator can be used to validate that a query on the table results in a specific value. This validator allows you to run a SQL query and abort the commit if it does not match the expected output. + +Multiple queries can be separated by `;` delimiter. 
Include the expected result as part of the query separated by `#`. + +Syntax: `query1#result1;query2#result2` + +Example: +```scala +// In this example, we set up a validator that expects there is no row with `col` column as `null` + +import org.apache.hudi.config.HoodiePreCommitValidatorConfig._ + +df.write.format("hudi").mode(Overwrite). + option("hoodie.table.name", tableName). + option("hoodie.precommit.validators", "org.apache.hudi.client.validator.SqlQuerySingleResultPreCommitValidator"). + option("hoodie.precommit.validators.single.value.sql.queries", "select count(*) from <TABLE_NAME> where col is null#0"). + save(basePath) +``` + +## SQL Query Equality +[org.apache.hudi.client.validator.SqlQueryEqualityPreCommitValidator](https://github.com/apache/hudi/blob/bf5a52e51bbeaa089995335a0a4c55884792e505/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/validator/SqlQueryEqualityPreCommitValidator.java) + +The SQL Query Equality validator runs a query before ingesting the data, then runs the same query after ingesting the data and confirms that both outputs match. This allows you to validate for equality of rows before and after the commit. + +This validator is useful when you want to verify that your query does not change a specific subset of the data. Some examples: +- Validate that the number of null fields is the same before and after your query +- Validate that there are no duplicate records after your query runs +- Validate that you are only updating the data, and no inserts slip through + +Example: +```scala +// In this example, we set up a validator that expects no change of null rows with the new commit + +import org.apache.hudi.config.HoodiePreCommitValidatorConfig._ + +df.write.format("hudi").mode(Overwrite). + option("hoodie.table.name", tableName). + option("hoodie.precommit.validators", "org.apache.hudi.client.validator.SqlQueryEqualityPreCommitValidator").
+ option("hoodie.precommit.validators.equality.sql.queries", "select count(*) from <TABLE_NAME> where col is null"). + save(basePath) +``` + +## SQL Query Inequality +[org.apache.hudi.client.validator.SqlQueryInequalityPreCommitValidator](https://github.com/apache/hudi/blob/bf5a52e51bbeaa089995335a0a4c55884792e505/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/validator/SqlQueryInequalityPreCommitValidator.java) + +The SQL Query Inequality validator runs a query before ingesting the data, then runs the same query after ingesting the data and confirms that both outputs DO NOT match. This allows you to confirm changes in the rows before and after the commit. + +Example: +```scala +// In this example, we set up a validator that expects a change of null rows with the new commit + +import org.apache.hudi.config.HoodiePreCommitValidatorConfig._ + +df.write.format("hudi").mode(Overwrite). + option("hoodie.table.name", tableName). + option("hoodie.precommit.validators", "org.apache.hudi.client.validator.SqlQueryInequalityPreCommitValidator"). + option("hoodie.precommit.validators.inequality.sql.queries", "select count(*) from <TABLE_NAME> where col is null"). + save(basePath) +``` + +## Extend Custom Validator +Users can also provide their own implementations by extending the abstract class [SparkPreCommitValidator](https://github.com/apache/hudi/blob/bf5a52e51bbeaa089995335a0a4c55884792e505/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/validator/SparkPreCommitValidator.java) +and overriding this method + +```java +void validateRecordsBeforeAndAfter(Dataset<Row> before, + Dataset<Row> after, + Set<String> partitionsAffected) +``` + +## Additional Monitoring with Notifications +Hudi offers a [commit notification service](platform_services_post_commit_callback) that can be configured to trigger notifications about write commits. + +The commit notification service can be combined with pre-commit validators to send a notification when a commit fails a validation.
This is possible by passing details about the validation as a custom value to the HTTP endpoint. + +## Related Resources +

Videos

+ +* [Learn About Apache Hudi Pre Commit Validator with Hands on Lab](https://www.youtube.com/watch?v=KNzs9dj_Btc) diff --git a/website/versioned_docs/version-1.0.0/privacy.md b/website/versioned_docs/version-1.0.0/privacy.md new file mode 100644 index 0000000000000..dd8f78c0c98c0 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/privacy.md @@ -0,0 +1,22 @@ +--- +title: Privacy Policy +keywords: [ hudi, privacy] +last_modified_at: 2019-12-30T15:59:57-04:00 +--- + +Information about your use of this website is collected using server access logs and a tracking cookie. +The collected information consists of the following: + +* The IP address from which you access the website; +* The type of browser and operating system you use to access our site; +* The date and time you access our site; +* The pages you visit; +* The addresses of pages from where you followed a link to our site. + +Part of this information is gathered using a tracking cookie set by the [Google Analytics](http://www.google.com/analytics) service and handled by Google as described in their [privacy policy](http://www.google.com/privacy). See your browser documentation for instructions on how to disable the cookie if you prefer not to share this data with Google. + +We use the gathered information to help us make our site more useful to visitors and to better understand how and when our site is used. We do not track or collect personally identifiable information or associate gathered data with any personally identifying information from other sources. + +By using this website, you consent to the collection of this data in the manner and for the purpose described above. + +The Hudi development community welcomes your questions or comments regarding this Privacy Policy. 
Send them to dev@hudi.apache.org diff --git a/website/versioned_docs/version-1.0.0/procedures.md b/website/versioned_docs/version-1.0.0/procedures.md new file mode 100644 index 0000000000000..19d6566801117 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/procedures.md @@ -0,0 +1,2001 @@ +--- +title: SQL Procedures +summary: "In this page, we introduce how to use SQL procedures with Hudi." +toc: true +last_modified_at: +--- +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +Stored procedures are available when using the Hudi SparkSQL extensions, in all Spark versions. + +## Usage +`CALL` supports passing arguments by name (recommended) or by position. Mixing positional and named arguments is also supported. + +#### Named arguments +All procedure arguments are named. When passing arguments by name, arguments can be in any order and any optional argument can be omitted. +``` +CALL system.procedure_name(arg_name_2 => arg_2, arg_name_1 => arg_1, ... arg_name_n => arg_n); +``` +#### Positional arguments +When passing arguments by position, the arguments may be omitted if they are optional. +``` +CALL system.procedure_name(arg_1, arg_2, ... arg_n); +``` +*note:* The system here has no practical meaning; the complete procedure name is system.procedure_name. + +### help + +Show parameters and outputTypes of a procedure.
+ +**Input** + +| Parameter Name | Type | Required | Default Value | Description | +|----------------|--------|----------|---------------|---------------------| +| cmd | String | N | None | name of a procedure | + +**Output** + +| Output Name | Type | +|--------------|--------| +| result | String | + +**Example** + +``` +call help(cmd => 'show_commits'); +``` + +| result | +|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| parameters:
param type_name default_value required
table string None true
limit integer 10 false
outputType:
name type_name nullable metadata
commit_time string true \{}
action string true \{}
total_bytes_written long true \{}
total_files_added long true \{}
total_files_updated long true \{}
total_partitions_written long true \{}
total_records_written long true \{}
total_update_records_written long true \{}
total_errors long true \{} | + +## Commit management + +### show_commits + +Show commits' info. + +**Input** + +| Parameter Name | Type | Required | Default Value | Description | +|----------------|--------|----------|---------------|--------------------------------------| +| table | String | Y | None | Hudi table name | +| limit | Int | N | 10 | Max number of records to be returned | + +**Output** + +| Output Name | Type | +|------------------------------|--------| +| commit_time | String | +| total_bytes_written | Long | +| total_files_added | Long | +| total_files_updated | Long | +| total_partitions_written | Long | +| total_records_written | Long | +| total_update_records_written | Long | +| total_errors | Long | + +**Example** + +``` +call show_commits(table => 'test_hudi_table', limit => 10); +``` + +| commit_time | total_bytes_written | total_files_added | total_files_updated | total_partitions_written | total_records_written | total_update_records_written | total_errors | +|-------------------|--------------------------|-------------------|---------------------|--------------------------|-----------------------|------------------------------|--------------| +| 20220216171049652 | 432653 | 0 | 1 | 1 | 0 | 0 | 0 | +| 20220216171027021 | 435346 | 1 | 0 | 1 | 1 | 0 | 0 | +| 20220216171019361 | 435349 | 1 | 0 | 1 | 1 | 0 | 0 | + +### show_commits_metadata + +Show commits' metadata. 
+ +**Input** + +| Parameter Name | Type | Required | Default Value | Description | +|----------------|--------|----------|---------------|--------------------------------------| +| table | String | Y | None | Hudi table name | +| limit | Int | N | 10 | Max number of records to be returned | + +**Output** + +| Output Name | Type | +|---------------------------------|--------| +| commit_time | String | +| action | String | +| partition | String | +| file_id | String | +| previous_commit | String | +| num_writes | Long | +| num_inserts | Long | +| num_deletes | Long | +| num_update_writes | String | +| total_errors | Long | +| total_log_blocks | Long | +| total_corrupt_logblocks | Long | +| total_rollback_blocks | Long | +| total_log_records | Long | +| total_updated_records_compacted | Long | +| total_bytes_written | Long | + +**Example** + +``` +call show_commits_metadata(table => 'test_hudi_table'); +``` + +| commit_time | action | partition | file_id | previous_commit | num_writes | num_inserts | num_deletes | num_update_writes | total_errors | total_log_blocks | total_corrupt_logblocks | total_rollback_blocks | total_log_records | total_updated_records_compacted | total_bytes_written| +|----------------- |---------|---------------|----------------------------------------|-------------------|------------|-------------|-------------|-------------------|--------------|------------------|-------------------------|-----------------------|-------------------|---------------------------------|------------------- | +|20220109225319449 | commit | dt=2021-05-03 | d0073a12-085d-4f49-83e9-402947e7e90a-0 | null | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 435349 | +|20220109225311742 | commit | dt=2021-05-02 | b3b32bac-8a44-4c4d-b433-0cb1bf620f23-0 | 20220109214830592 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 435340 | +|20220109225301429 | commit | dt=2021-05-01 | 0d7298b3-6b55-4cff-8d7d-b0772358b78a-0 | 20220109214830592 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 435340 | 
+|20220109214830592 | commit | dt=2021-05-01 | 0d7298b3-6b55-4cff-8d7d-b0772358b78a-0 | 20220109191631015 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 432653 | +|20220109214830592 | commit | dt=2021-05-02 | b3b32bac-8a44-4c4d-b433-0cb1bf620f23-0 | 20220109191648181 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 432653 | +|20220109191648181 | commit | dt=2021-05-02 | b3b32bac-8a44-4c4d-b433-0cb1bf620f23-0 | null | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 435341 | +|20220109191631015 | commit | dt=2021-05-01 | 0d7298b3-6b55-4cff-8d7d-b0772358b78a-0 | null | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 435341 | + +### show_commit_extra_metadata + +Show commits' extra metadata. + +**Input** + +| Parameter Name | Type | Required | Default Value | Description | +|----------------|--------|----------|---------------|--------------------------------------| +| table | String | Y | None | Hudi table name | +| limit | Int | N | 10 | Max number of records to be returned | +| instant_time | String | N | None | Instant time | +| metadata_key | String | N | None | Key of metadata | + +**Output** + +| Output Name | Type | +|----------------|--------| +| instant_time | String | +| action | String | +| metadata_key | String | +| metadata_value | String | + +**Example** + +``` +call show_commit_extra_metadata(table => 'test_hudi_table'); +``` + +| instant_time | action | metadata_key | metadata_value | +|-------------------|-------------|---------------|----------------| +| 20230206174349556 | deltacommit | schema | 
\{"type":"record","name":"hudi_mor_tbl","fields":[\{"name":"_hoodie_commit_time","type":["null","string"],"doc":"","default":null},\{"name":"_hoodie_commit_seqno","type":["null","string"],"doc":"","default":null},\{"name":"_hoodie_record_key","type":["null","string"],"doc":"","default":null},\{"name":"_hoodie_partition_path","type":["null","string"],"doc":"","default":null},\{"name":"_hoodie_file_name","type":["null","string"],"doc":"","default":null},\{"name":"id","type":"int"},\{"name":"ts","type":"long"}]} | +| 20230206174349556 | deltacommit | latest_schema | \{"max_column_id":8,"version_id":20230206174349556,"type":"record","fields":[\{"id":0,"name":"_hoodie_commit_time","optional":true,"type":"string","doc":""},\{"id":1,"name":"_hoodie_commit_seqno","optional":true,"type":"string","doc":""},\{"id":2,"name":"_hoodie_record_key","optional":true,"type":"string","doc":""},\{"id":3,"name":"_hoodie_partition_path","optional":true,"type":"string","doc":""},\{"id":4,"name":"_hoodie_file_name","optional":true,"type":"string","doc":""},\{"id":5,"name":"id","optional":false,"type":"int"},\{"id":8,"name":"ts","optional":false,"type":"long"}]} | + +### show_archived_commits + +Show archived commits. 
+ +**Input** + +| Parameter Name | Type | Required | Default Value | Description | +|----------------|--------|----------|---------------|------------------------------------------------| +| table | String | Y | None | Hudi table name | +| limit | Int | N | 10 | Max number of records to be returned | +| start_ts | String | N | "" | Start time for commits, default: now - 10 days | +| end_ts | String | N | "" | End time for commits, default: now - 1 day | + +**Output** + +| Output Name | Type | +|------------------------------|--------| +| commit_time | String | +| total_bytes_written | Long | +| total_files_added | Long | +| total_files_updated | Long | +| total_partitions_written | Long | +| total_records_written | Long | +| total_update_records_written | Long | +| total_errors | Long | + +**Example** + +``` +call show_archived_commits(table => 'test_hudi_table'); +``` + +| commit_time | total_bytes_written | total_files_added | total_files_updated | total_partitions_written | total_records_written | total_update_records_written | total_errors | +|-------------------|--------------------------|-------------------|---------------------|--------------------------|-----------------------|------------------------------|--------------| +| 20220216171049652 | 432653 | 0 | 1 | 1 | 0 | 0 | 0 | +| 20220216171027021 | 435346 | 1 | 0 | 1 | 1 | 0 | 0 | +| 20220216171019361 | 435349 | 1 | 0 | 1 | 1 | 0 | 0 | + + +### show_archived_commits_metadata + +Show archived commits' metadata. 
+ +**Input** + +| Parameter Name | Type | Required | Default Value | Description | +|----------------|--------|----------|---------------|------------------------------------------------| +| table | String | Y | None | Hudi table name | +| limit | Int | N | 10 | Max number of records to be returned | +| start_ts | String | N | "" | Start time for commits, default: now - 10 days | +| end_ts | String | N | "" | End time for commits, default: now - 1 day | + +**Output** + +| Output Name | Type | +|---------------------------------|--------| +| commit_time | String | +| action | String | +| partition | String | +| file_id | String | +| previous_commit | String | +| num_writes | Long | +| num_inserts | Long | +| num_deletes | Long | +| num_update_writes | String | +| total_errors | Long | +| total_log_blocks | Long | +| total_corrupt_logblocks | Long | +| total_rollback_blocks | Long | +| total_log_records | Long | +| total_updated_records_compacted | Long | +| total_bytes_written | Long | + +**Example** + +``` +call show_archived_commits_metadata(table => 'test_hudi_table'); +``` + +| commit_time | action | partition | file_id | previous_commit | num_writes | num_inserts | num_deletes | num_update_writes | total_errors | total_log_blocks | total_corrupt_logblocks | total_rollback_blocks | total_log_records | total_updated_records_compacted | total_bytes_written| +|----------------- |---------|---------------|----------------------------------------|-------------------|------------|-------------|-------------|-------------------|--------------|------------------|-------------------------|-----------------------|-------------------|---------------------------------|------------------- | +|20220109225319449 | commit | dt=2021-05-03 | d0073a12-085d-4f49-83e9-402947e7e90a-0 | null | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 435349 | +|20220109225311742 | commit | dt=2021-05-02 | b3b32bac-8a44-4c4d-b433-0cb1bf620f23-0 | 20220109214830592 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 
| 435340 | +|20220109225301429 | commit | dt=2021-05-01 | 0d7298b3-6b55-4cff-8d7d-b0772358b78a-0 | 20220109214830592 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 435340 | +|20220109214830592 | commit | dt=2021-05-01 | 0d7298b3-6b55-4cff-8d7d-b0772358b78a-0 | 20220109191631015 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 432653 | +|20220109214830592 | commit | dt=2021-05-02 | b3b32bac-8a44-4c4d-b433-0cb1bf620f23-0 | 20220109191648181 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 432653 | +|20220109191648181 | commit | dt=2021-05-02 | b3b32bac-8a44-4c4d-b433-0cb1bf620f23-0 | null | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 435341 | +|20220109191631015 | commit | dt=2021-05-01 | 0d7298b3-6b55-4cff-8d7d-b0772358b78a-0 | null | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 435341 | + + +``` +call show_archived_commits(table => 'test_hudi_table'); +``` + +| commit_time | total_bytes_written | total_files_added | total_files_updated | total_partitions_written | total_records_written | total_update_records_written | total_errors | +|-------------------|--------------------------|-------------------|---------------------|--------------------------|-----------------------|------------------------------|--------------| +| 20220216171049652 | 432653 | 0 | 1 | 1 | 0 | 0 | 0 | +| 20220216171027021 | 435346 | 1 | 0 | 1 | 1 | 0 | 0 | +| 20220216171019361 | 435349 | 1 | 0 | 1 | 1 | 0 | 0 | + +### show_commit_files + +Show files of a commit. 
+ +**Input** + +| Parameter Name | Type | Required | Default Value | Description | +|----------------|--------|----------|---------------|--------------------------------------| +| table | String | Y | None | Hudi table name | +| limit | Int | N | 10 | Max number of records to be returned | +| instant_time | String | Y | None | Instant time | + +**Output** + +| Output Name | Type | +|-----------------------|--------| +| action | String | +| partition_path | String | +| file_id | String | +| previous_commit | String | +| total_records_updated | Long | +| total_records_written | Long | +| total_bytes_written | Long | +| total_errors | Long | +| file_size | Long | + +**Example** + +``` +call show_commit_files(table => 'test_hudi_table', instant_time => '20230206174349556'); +``` + +| action | partition_path | file_id | previous_commit | total_records_updated | total_records_written | total_bytes_written | total_errors | file_size | +|-------------|-----------------|----------------------------------------|-----------------|-----------------------|-----------------------|---------------------|--------------|-----------| +| deltacommit | dt=2021-05-03 | 7fb52523-c7f6-41aa-84a6-629041477aeb-0 | null | 0 | 1 | 434768 | 0 | 434768 | + +### show_commit_partitions + +Show partitions of a commit. 
+ +**Input** + +| Parameter Name | Type | Required | Default Value | Description | +|----------------|--------|----------|---------------|--------------------------------------| +| table | String | Y | None | Hudi table name | +| limit | Int | N | 10 | Max number of records to be returned | +| instant_time | String | Y | None | Instant time | + +**Output** + +| Output Name | Type | +|------------------------|--------| +| action | String | +| partition_path | String | +| total_files_added | Long | +| total_files_updated | Long | +| total_records_inserted | Long | +| total_records_updated | Long | +| total_bytes_written | Long | +| total_errors | Long | + +**Example** + +``` +call show_commit_partitions(table => 'test_hudi_table', instant_time => '20230206174349556'); +``` + +| action | partition_path | total_files_added | total_files_updated | total_records_inserted | total_records_updated | total_bytes_written | total_errors | +|-------------|-----------------|----------------------------------------|---------------------|------------------------|-----------------------|---------------------|--------------| +| deltacommit | dt=2021-05-03 | 7fb52523-c7f6-41aa-84a6-629041477aeb-0 | 0 | 1 | 434768 | 0 | 0 | + +### show_commit_write_stats + +Show write statistics of a commit. 
+ +**Input** + +| Parameter Name | Type | Required | Default Value | Description | +|----------------|--------|----------|---------------|--------------------------------------| +| table | String | Y | None | Hudi table name | +| limit | Int | N | 10 | Max number of records to be returned | +| instant_time | String | Y | None | Instant time | + +**Output** + +| Output Name | Type | +|-----------------------|--------| +| action | String | +| total_bytes_written | Long | +| total_records_written | Long | +| avg_record_size | Long | + +**Example** + +``` +call show_commit_write_stats(table => 'test_hudi_table', instant_time => '20230206174349556'); +``` + +| action | total_bytes_written | total_records_written | avg_record_size | +|-------------|---------------------|-----------------------|-----------------| +| deltacommit | 434768 | 1 | 434768 | + +### show_rollbacks + +Show rollback commits. + +**Input** + +| Parameter Name | Type | Required | Default Value | Description | +|----------------|--------|----------|---------------|--------------------------------------| +| table | String | Y | None | Hudi table name | +| limit | Int | N | 10 | Max number of records to be returned | + +**Output** + +| Output Name | Type | +|----------------------|--------| +| instant | String | +| rollback_instant | String | +| total_files_deleted | Int | +| time_taken_in_millis | Long | +| total_partitions | Int | + +**Example** + +``` +call show_rollbacks(table => 'test_hudi_table'); +``` + +| instant | rollback_instant | total_files_deleted | time_taken_in_millis | total_partitions | +|-------------|------------------|---------------------|----------------------|------------------| +| deltacommit | 434768 | 1 | 434768 | 2 | + + +### show_rollback_detail + +Show details of a rollback commit. 
+ +**Input** + +| Parameter Name | Type | Required | Default Value | Description | +|----------------|--------|----------|---------------|--------------------------------------| +| table | String | Y | None | Hudi table name | +| limit | Int | N | 10 | Max number of records to be returned | +| instant_time | String | Y | None | Instant time | + +**Output** + +| Output Name | Type | +|------------------|--------| +| instant | String | +| rollback_instant | String | +| partition | String | +| deleted_file | String | +| succeeded | Int | + +**Example** + +``` +call show_rollback_detail(table => 'test_hudi_table', instant_time => '20230206174349556'); +``` + +| instant | rollback_instant | partition | deleted_file | succeeded | +|-------------|------------------|-----------|--------------|-----------| +| deltacommit | 434768 | 1 | 434768 | 2 | + +### commits_compare + +Compare commit with another path. + +**Input** + +| Parameter Name | Type | Required | Default Value | Description | +|----------------|--------|----------|---------------|-----------------| +| table | String | Y | None | Hudi table name | +| path | String | Y | None | Path of table | + +**Output** + +| Output Name | Type | +|----------------|--------| +| compare_detail | String | + +**Example** + +``` +call commits_compare(table => 'test_hudi_table', path => 'hdfs://ns1/hive/warehouse/hudi.db/test_hudi_table'); +``` + +| compare_detail | +|------------------------------------------------------------------------| +| Source test_hudi_table is ahead by 0 commits. Commits to catch up - [] | + +### archive_commits + +archive commits. 
+ +**Input** + +| Parameter Name | Type | Required | Default Value | Description | +|------------------------------------------------------------------------|---------|----------|---------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| table | String | N | None | Hudi table name | +| path | String | N | None | Path of table | +| [min_commits](configurations#hoodiekeepmincommits) | Int | N | 20 | Similar to hoodie.keep.max.commits, but controls the minimum number of instants to retain in the active timeline. | +| [max_commits](configurations#hoodiekeepmaxcommits) | Int | N | 30 | Archiving service moves older entries from timeline into an archived log after each write, to keep the metadata overhead constant, even as the table size grows. This config controls the maximum number of instants to retain in the active timeline. | +| [retain_commits](configurations#hoodiecommitsarchivalbatch) | Int | N | 10 | Archiving of instants is batched in best-effort manner, to pack more instants into a single archive log. This config controls such archival batch size. | +| [enable_metadata](configurations#hoodiemetadataenable) | Boolean | N | false | Enable the internal metadata table | + +**Output** + +| Output Name | Type | +|-------------|------| +| result | Int | + +**Example** + +``` +call archive_commits(table => 'test_hudi_table'); +``` + +| result | +|--------| +| 0 | + +### export_instants + +extract instants to local folder. 
+ +**Input** + +| Parameter Name | Type | Required | Default Value | Description | +|----------------|---------|----------|-----------------------------------------------------|-----------------------------------| +| table | String | Y | None | Hudi table name | +| local_folder | String | Y | None | Local folder | +| limit | Int | N | -1 | Number of instants to be exported | +| actions | String | N | clean,commit,deltacommit,rollback,savepoint,restore | Commit action | +| desc | Boolean | N | false | Descending order | + +**Output** + +| Output Name | Type | +|---------------|--------| +| export_detail | String | + +**Example** + +``` +call export_instants(table => 'test_hudi_table', local_folder => '/tmp/folder'); +``` + +| export_detail | +|:-----------------------------------| +| Exported 6 Instants to /tmp/folder | + +### rollback_to_instant + +Roll back a table to the commit that was current at some time. + +**Input** + +| Parameter Name | Type | Required | Default Value | Description | +|----------------|--------|----------|---------------|-----------------| +| table | String | Y | None | Hudi table name | +| instant_time | String | Y | None | Instant time | + +**Output** + +| Output Name | Type | +|-----------------|---------| +| rollback_result | Boolean | + +**Example** + +Roll back test_hudi_table to one instant +``` +call rollback_to_instant(table => 'test_hudi_table', instant_time => '20220109225319449'); +``` + +| rollback_result | +|:----------------| +| true | + +### create_savepoint + +Create a savepoint on a Hudi table. 
+ +**Input** + +| Parameter Name | Type | Required | Default Value | Description | +|----------------|--------|----------|---------------|-----------------| +| table | String | Y | None | Hudi table name | +| commit_time | String | Y | None | Commit time | +| user | String | N | "" | User name | +| comments | String | N | "" | Comments | + +**Output** + +| Output Name | Type | +|-------------------------|---------| +| create_savepoint_result | Boolean | + +**Example** + +``` +call create_savepoint(table => 'test_hudi_table', commit_time => '20220109225319449'); +``` + +| create_savepoint_result | +|:------------------------| +| true | + +### show_savepoints + +Show savepoints. + +**Input** + +| Parameter Name | Type | Required | Default Value | Description | +|----------------|--------|----------|---------------|-----------------| +| table | String | Y | None | Hudi table name | + +**Output** + +| Output Name | Type | +|----------------|--------| +| savepoint_time | String | + +**Example** + +``` +call show_savepoints(table => 'test_hudi_table'); +``` + +| savepoint_time | +|:------------------| +| 20220109225319449 | +| 20220109225311742 | +| 20220109225301429 | + +### delete_savepoint + +Delete a savepoint of a Hudi table. + +**Input** + +| Parameter Name | Type | Required | Default Value | Description | +|----------------|--------|----------|---------------|-----------------| +| table | String | Y | None | Hudi table name | +| instant_time | String | Y | None | Instant time | + +**Output** + +| Output Name | Type | +|-------------------------|---------| +| delete_savepoint_result | Boolean | + +**Example** + +Delete a savepoint from test_hudi_table +``` +call delete_savepoint(table => 'test_hudi_table', instant_time => '20220109225319449'); +``` + +| delete_savepoint_result | +|:------------------------| +| true | + +### rollback_to_savepoint + +Roll back a table to a previously created savepoint. 
+ +**Input** + +| Parameter Name | Type | Required | Default Value | Description | +|----------------|--------|----------|---------------|-----------------| +| table | String | Y | None | Hudi table name | +| instant_time | String | Y | None | Instant time | + +**Output** + +| Output Name | Type | +|---------------------------|---------| +| rollback_savepoint_result | Boolean | + +**Example** + +Rollback test_hudi_table to one savepoint +``` +call rollback_to_savepoint(table => 'test_hudi_table', instant_time => '20220109225319449'); +``` + +| rollback_savepoint_result | +|:--------------------------| +| true | + +### copy_to_temp_view + +copy table to a temporary view. + +**Input** + +| Parameter Name | Type | Required | Default Value | Description | +|-------------------------------------------------------------------|---------|----------|---------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| table | String | Y | None | Hudi table name | +| [query_type](configurations#hoodiedatasourcequerytype) | String | N | "snapshot" | Whether data needs to be read, in `incremental` mode (new data since an instantTime) (or) `read_optimized` mode (obtain latest view, based on base files) (or) `snapshot` mode (obtain latest view, by merging base and (if any) log files) | +| view_name | String | Y | None | Name of view | +| begin_instance_time | String | N | "" | Begin instance time | +| end_instance_time | String | N | "" | End instance time | +| as_of_instant | String | N | "" | As of instant time | +| replace | Boolean | N | false | Replace an existed view | +| global | Boolean | N | false | Global view | + +**Output** + +| Output Name | Type | +|-------------|---------| +| status | Boolean | + +**Example** + +``` +call copy_to_temp_view(table => 'test_hudi_table', view_name => 
'copy_view_test_hudi_table'); +``` + +| status | +|--------| +| 0 | + +### copy_to_table + +copy table to a new table. + +**Input** + +| Parameter Name | Type | Required | Default Value | Description | +|-------------------------------------------------------------------|--------|----------|---------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| table | String | Y | None | Hudi table name | +| [query_type](configurations#hoodiedatasourcequerytype) | String | N | "snapshot" | Whether data needs to be read, in `incremental` mode (new data since an instantTime) (or) `read_optimized` mode (obtain latest view, based on base files) (or) `snapshot` mode (obtain latest view, by merging base and (if any) log files) | +| new_table | String | Y | None | Name of new table | +| begin_instance_time | String | N | "" | Begin instance time | +| end_instance_time | String | N | "" | End instance time | +| as_of_instant | String | N | "" | As of instant time | +| save_mode | String | N | "overwrite" | Save mode | +| columns | String | N | "" | Columns of source table which should copy to new table | + + +**Output** + +| Output Name | Type | +|-------------|---------| +| status | Boolean | + +**Example** + +``` +call copy_to_table(table => 'test_hudi_table', new_table => 'copy_table_test_hudi_table'); +``` + +| status | +|--------| +| 0 | + +## Metadata Table management + +### create_metadata_table + +Create metadata table of a hudi table. 
+ +**Input** + +| Parameter Name | Type | Required | Default Value | Description | +|----------------|--------|----------|---------------|-----------------| +| table | String | Y | None | Hudi table name | + +**Output** + +| Output Name | Type | +|-------------|--------| +| result | String | + +**Example** + +``` +call create_metadata_table(table => 'test_hudi_table'); +``` + +| result | +|:------------------------------------------------------------------------------------------------------------------| +| Created Metadata Table in hdfs://ns1/hive/warehouse/hudi.db/test_hudi_table/.hoodie/metadata (duration=2.777secs) | + +### init_metadata_table + +Init metadata table of a hudi table. + +**Input** + +| Parameter Name | Type | Required | Default Value | Description | +|----------------|---------|----------|---------------|-----------------| +| table | String | Y | None | Hudi table name | +| read_only | Boolean | N | false | Read only | + +**Output** + +| Output Name | Type | +|-------------|---------| +| result | String | + +**Example** + +``` +call init_metadata_table(table => 'test_hudi_table'); +``` + +| result | +|:---------------------------------------------------------------------------------------------------------------------| +| Initialized Metadata Table in hdfs://ns1/hive/warehouse/hudi.db/test_hudi_table/.hoodie/metadata (duration=0.023sec) | + +### delete_metadata_table + +Delete metadata table of a hudi table. 
+ +**Input** + +| Parameter Name | Type | Required | Default Value | Description | +|----------------|---------|----------|---------------|-----------------| +| table | String | Y | None | Hudi table name | + +**Output** + +| Output Name | Type | +|-------------|---------| +| result | String | + +**Example** + +``` +call delete_metadata_table(table => 'test_hudi_table'); +``` + +| result | +|:-----------------------------------------------------------------------------------------------| +| Removed Metadata Table from hdfs://ns1/hive/warehouse/hudi.db/test_hudi_table/.hoodie/metadata | + +### show_metadata_table_partitions + +Show partitions of a Hudi table. + +**Input** + +| Parameter Name | Type | Required | Default Value | Description | +|----------------|--------|----------|---------------|-----------------| +| table | String | Y | None | Hudi table name | + +**Output** + +| Output Name | Type | +|-------------|---------| +| partition | String | + +**Example** + +``` +call show_metadata_table_partitions(table => 'test_hudi_table'); +``` + +| partition | +|:--------------| +| dt=2021-05-01 | +| dt=2021-05-02 | +| dt=2021-05-03 | + +### show_metadata_table_files + +Show files of a hudi table. + +**Input** + +| Parameter Name | Type | Required | Default Value | Description | +|----------------|--------|----------|---------------|-----------------| +| table | String | Y | None | Hudi table name | +| partition | String | N | "" | Partition name | +| limit | Int | N | 100 | Limit number | + +**Output** + +| Output Name | Type | +|-------------|---------| +| file_path | String | + +**Example** + +Show files of a hudi table under one partition. 
+``` +call show_metadata_table_files(table => 'test_hudi_table', partition => 'dt=20230220'); +``` + +| file_path | +|:--------------------------------------------------------------------------| +| .d3cdf6ff-250a-4cee-9af4-ab179fdb9bfb-0_20230220190948086.log.1_0-111-123 | +| d3cdf6ff-250a-4cee-9af4-ab179fdb9bfb-0_0-78-81_20230220190948086.parquet | + +### show_metadata_table_stats + +Show metadata table stats of a hudi table. + +**Input** + +| Parameter Name | Type | Required | Default Value | Description | +|----------------|--------|----------|---------------|-----------------| +| table | String | Y | None | Hudi table name | + +**Output** + +| Output Name | Type | +|-------------|--------| +| stat_key | String | +| stat_value | String | + +**Example** + +``` +call show_metadata_table_stats(table => 'test_hudi_table'); +``` + +| stat_key | stat_value | +|----------------------------------------|------------| +| dt=2021-05-03.totalBaseFileSizeInBytes | 23142 | + +### validate_metadata_table_files + +Validate metadata table files of a hudi table. 
+ +**Input** + +| Parameter Name | Type | Required | Default Value | Description | +|----------------|---------|----------|---------------|----------------------------| +| table | String | Y | None | Hudi table name | +| verbose | Boolean | N | False | If verbose print all files | + + +**Output** + +| Output Name | Type | +|------------------------|---------| +| partition | String | +| file_name | String | +| is_present_in_fs | Boolean | +| is_present_in_metadata | Boolean | +| fs_size | Long | +| metadata_size | Long | + +**Example** + +``` +call validate_metadata_table_files(table => 'test_hudi_table'); +``` + +| partition | file_name | is_present_in_fs | is_present_in_metadata | fs_size | metadata_size | +|---------------|---------------------------------------------------------------------|------------------|------------------------|---------|---------------| +| dt=2021-05-03 | ad1e5a3f-532f-4a13-9f60-223676798bf3-0_0-4-4_00000000000002.parquet | true | true | 43523 | 43523 | + +## Table information + +### show_table_properties + +Show hudi properties of a table. + +**Input** + +| Parameter Name | Type | Required | Default Value | Description | +|----------------|--------|----------|---------------|--------------------------------------| +| table | String | Y | None | Hudi table name | +| path | String | N | None | Path of table | +| limit | Int | N | 10 | Max number of records to be returned | + +**Output** + +| Output Name | Type | +|-------------|--------| +| key | String | +| value | String | + +**Example** + +``` +call show_table_properties(table => 'test_hudi_table', limit => 10); +``` + +| key | value | +|-------------------------------|-------| +| hoodie.table.precombine.field | ts | +| hoodie.table.partition.fields | dt | + +### show_fs_path_detail + +Show detail of a path. 
+ +**Input** + +| Parameter Name | Type | Required | Default Value | Description | +|----------------|---------|----------|---------------|------------------------| +| path | String | Y | None | Hudi table name | +| is_sub | Boolean | N | false | Whether to list files | +| sort | Boolean | N | true | Sorted by storage_size | +| limit | Int | N | 100 | Limit number | + +**Output** + +| Output Name | Type | +|--------------------|--------| +| path_num | Long | +| file_num | Long | +| storage_size | Long | +| storage_size(unit) | String | +| storage_path | String | +| space_consumed | Long | +| quota | Long | +| space_quota | Long | + +**Example** + +``` +call show_fs_path_detail(path => 'hdfs://ns1/hive/warehouse/hudi.db/test_hudi_table'); +``` + +| path_num | file_num | storage_size | storage_size(unit) | storage_path | space_consumed | quota | space_quota | +|----------|----------|--------------|--------------------|---------------------------------------------------|----------------|---------|-------------| +| 22 | 58 | 2065612 | 1.97MB | hdfs://ns1/hive/warehouse/hudi.db/test_hudi_table | -1 | 6196836 | -1 | + +### stats_file_sizes + +Show file sizes of a table. 
+ +**Input** + +| Parameter Name | Type | Required | Default Value | Description | +|----------------|--------|----------|---------------|--------------------------------------| +| table | String | Y | None | Hudi table name | +| partition_path | String | N | "" | Partition path | +| limit | Int | N | 10 | Max number of records to be returned | + +**Output** + +| Output Name | Type | +|-------------|--------| +| commit_time | String | +| min | Long | +| 10th | Double | +| 50th | Double | +| avg | Double | +| 95th | Double | +| max | Long | +| num_files | Int | +| std_dev | Double | + +**Example** + +``` +call stats_file_sizes(table => 'test_hudi_table'); +``` + +| commit_time | min | 10th | 50th | avg | 95th | max | num_files | std_dev | +|-------------------|--------|----------|----------|----------|----------|--------|-----------|---------| +| 20230205134149455 | 435000 | 435000.0 | 435000.0 | 435000.0 | 435000.0 | 435000 | 1 | 0.0 | + +### stats_wa + +Show write stats and amplification of a table. + +**Input** + +| Parameter Name | Type | Required | Default Value | Description | +|----------------|--------|----------|---------------|--------------------------------------| +| table | String | Y | None | Hudi table name | +| limit | Int | N | 10 | Max number of records to be returned | + +**Output** + +| Output Name | Type | +|----------------------------|--------| +| commit_time | String | +| total_upserted | Long | +| total_written | Long | +| write_amplification_factor | String | + +**Example** + +``` +call stats_wa(table => 'test_hudi_table'); +``` + +| commit_time | total_upserted | total_written | write_amplification_factor | +|-------------|----------------|---------------|----------------------------| +| Total | 0 | 0 | 0 | + +### show_logfile_records + +Show records in logfile of a table. 
+ +**Input** + +| Parameter Name | Type | Required | Default Value | Description | +|-----------------------|---------|----------|---------------|--------------------------------------| +| table | String | Y | None | Hudi table name | +| log_file_path_pattern | String | Y | None | Pattern of logfile | +| merge | Boolean | N | false | Merge results | +| limit | Int | N | 10 | Max number of records to be returned | + +**Output** + +| Output Name | Type | +|-------------|--------| +| records | String | + +**Example** + +``` +call show_logfile_records(table => 'test_hudi_table', log_file_path_pattern => 'hdfs://ns1/hive/warehouse/hudi.db/test_hudi_table/*.log*'); +``` + +| records | +|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| \{"_hoodie_commit_time": "20230205133427059", "_hoodie_commit_seqno": "20230205133427059_0_10", "_hoodie_record_key": "1", "_hoodie_partition_path": "", "_hoodie_file_name": "3438e233-7b50-4eff-adbb-70b1cd76f518-0", "id": 1, "name": "a1", "price": 40.0, "ts": 1111} | + +### show_logfile_metadata + +Show metadata in logfile of a table.
+ +**Input** + +| Parameter Name | Type | Required | Default Value | Description | +|-----------------------|---------|----------|---------------|--------------------------------------| +| table | String | Y | None | Hudi table name | +| log_file_path_pattern | String | Y | None | Pattern of logfile | +| merge | Boolean | N | false | Merge results | +| limit | Int | N | 10 | Max number of records to be returned | + +**Output** + +| Output Name | Type | +|-----------------|--------| +| instant_time | String | +| record_count | Int | +| block_type | String | +| header_metadata | String | +| footer_metadata | String | + +**Example** + +``` +call show_logfile_metadata(table => 'hudi_mor_tbl', log_file_path_pattern => 'hdfs://ns1/hive/warehouse/hudi.db/hudi_mor_tbl/*.log*'); +``` + +| instant_time | record_count | block_type | header_metadata | footer_metadata | +|-------------------|--------------|-----------------|-----------------|-----------------| +| 20230205133427059 | 1 | AVRO_DATA_BLOCK | \{"INSTANT_TIME":"20230205133427059","SCHEMA":"\{"type":"record","name":"hudi_mor_tbl_record","namespace":"hoodie.hudi_mor_tbl","fields":[\{"name":"_hoodie_commit_time","type":["null","string"],"doc":"","default":null},\{"name":"_hoodie_commit_seqno","type":["null","string"],"doc":"","default":null},\{"name":"_hoodie_record_key","type":["null","string"],"doc":"","default":null},\{"name":"_hoodie_partition_path","type":["null","string"],"doc":"","default":null},\{"name":"_hoodie_file_name","type":["null","string"],"doc":"","default":null},\{"name":"id","type":"int"},\{"name":"name","type":"string"},\{"name":"price","type":"double"},\{"name":"ts","type":"long"}]}"} | {} | + +### show_invalid_parquet + +Show invalid parquet files of a table.
+ +**Input** + +| Parameter Name | Type | Required | Default Value | Description | +|----------------|---------|----------|---------------|--------------------------------------| +| Path | String | Y | None | Hudi table name | +| limit | Int | N | 100 | Limit number | +| needDelete | Boolean | N | false | should delete | + +**Output** + +| Output Name | Type | +|-------------|--------| +| Path | String | + +**Example** + +``` +call show_invalid_parquet(path => 'hdfs://ns1/hive/warehouse/hudi.db/test_hudi_table'); +``` + +| Path | +|----------------------------------------------------------------------------------------------------------------------------| +| hdfs://ns1/hive/warehouse/hudi.db/test_hudi_table/7fb52523-c7f6-41aa-84a6-629041477aeb-0_0-92-99_20230205133532199.parquet | + +### show_fsview_all + +Show file system views of a table. + +**Input** + +| Parameter Name | Type | Required | Default Value | Description | +|--------------------|---------|----------|------------------|--------------------------------------| +| table | String | Y | None | Hudi table name | +| max_instant | String | N | "" | Max instant time | +| include_max | Boolean | N | false | Include max instant | +| include_in_flight | Boolean | N | false | Include in flight | +| exclude_compaction | Boolean | N | false | Exclude compaction | +| limit | Int | N | 10 | Max number of records to be returned | +| path_regex | String | N | "ALL_PARTITIONS" | Pattern of path | + +**Output** + +| Output Name | Type | +|-----------------------|--------| +| partition | String | +| file_id | String | +| base_instant | String | +| data_file | String | +| data_file_size | Long | +| num_delta_files | Long | +| total_delta_file_size | Long | +| delta_files | String | + +**Example** + +``` +call show_fsview_all(table => 'test_hudi_table'); +``` + +| partition | file_id | base_instant | data_file | data_file_size | num_delta_files | total_delta_file_size | delta_files | 
+|---------------|----------------------------------------|-------------------|--------------------------------------------------------------------------|----------------|-----------------|-----------------------|-------------------------------------------------------------------------| +| dt=2021-05-03 | d0073a12-085d-4f49-83e9-402947e7e90a-0 | 20220109225319449 | 7fb52523-c7f6-41aa-84a6-629041477aeb-0_0-92-99_20220109225319449.parquet | 5319449 | 1 | 213193 | .7fb52523-c7f6-41aa-84a6-629041477aeb-0_20230205133217210.log.1_0-60-63 | + +### show_fsview_latest + +Show latest file system view of a table. + +**Input** + +| Parameter Name | Type | Required | Default Value | Description | +|--------------------|---------|----------|------------------|--------------------------------------| +| table | String | Y | None | Hudi table name | +| max_instant | String | N | "" | Max instant time | +| include_max | Boolean | N | false | Include max instant | +| include_in_flight | Boolean | N | false | Include in flight | +| exclude_compaction | Boolean | N | false | Exclude compaction | +| path_regex | String | N | "ALL_PARTITIONS" | Pattern of path | +| partition_path | String | N | "ALL_PARTITIONS" | Partition path | +| merge | Boolean | N | false | Merge results | + +**Output** + +| Output Name | Type | +|--------------------------------------------|--------| +| partition | String | +| file_id | String | +| base_instant | String | +| data_file | String | +| data_file_size | Long | +| num_delta_files | Long | +| total_delta_file_size | Long | +| delta_size_compaction_scheduled | Long | +| delta_size_compaction_unscheduled | Long | +| delta_to_base_radio_compaction_scheduled | Double | +| delta_to_base_radio_compaction_unscheduled | Double | +| delta_files_compaction_scheduled | String | +| delta_files_compaction_unscheduled | String | + +**Example** + +``` +call show_fsview_latest(table => 'test_hudi_table', partition => 'dt=2021-05-03'); +``` + +| partition | file_id | 
base_instant | data_file | data_file_size | num_delta_files | total_delta_file_size | delta_files | +|---------------|----------------------------------------|-------------------|--------------------------------------------------------------------------|----------------|-----------------|-----------------------|-------------------------------------------------------------------------| +| dt=2021-05-03 | d0073a12-085d-4f49-83e9-402947e7e90a-0 | 20220109225319449 | 7fb52523-c7f6-41aa-84a6-629041477aeb-0_0-92-99_20220109225319449.parquet | 5319449 | 1 | 213193 | .7fb52523-c7f6-41aa-84a6-629041477aeb-0_20230205133217210.log.1_0-60-63 | + +## Table services + +### run_clustering + +Trigger clustering on a hoodie table. By using partition predicates, clustering table can be run +with specified partitions, and you can also specify the order columns to sort data. + +:::note +Newly clustering instant will be generated every call, or some pending clustering instants are executed. +When calling this procedure, one of parameters ``table`` and ``path`` must be specified at least. If both +parameters are given, ``table`` will take effect. 
+ +::: + + +**Input** + +| Parameter Name | Type | Required | Default Value | Description | +|-------------------------|---------|----------|---------------|----------------------------------------------------------------| +| table | String | N | None | Name of table to be clustered | +| path | String | N | None | Path of table to be clustered | +| predicate | String | N | None | Predicate to filter partition | +| order | String | N | None | Order column split by `,` | +| show_involved_partition | Boolean | N | false | Show involved partition in the output | +| op | String | N | None | Operation type, `EXECUTE` or `SCHEDULE` | +| order_strategy | String | N | None | Records layout optimization, `linear/z-order/hilbert` | +| options | String | N | None | Customize hudi configs in the format "key1=value1,key2=value2" | +| instants | String | N | None | Specified instants by `,` | +| selected_partitions | String | N | None | Partitions to run clustering by `,` | +| limit | Int | N | None | Max number of plans to be executed | + +**Output** + +The output is as follows: + +| Parameter Name | Type | Required | Default Value | Description | +|---------------------|--------|----------|---------------|------------------------------------------| +| timestamp | String | N | None | Instant name | +| input_group_size | Int | N | None | The input group sizes for each plan | +| state | String | N | None | The instant final state | +| involved_partitions | String | N | * | Show involved partitions, default is `*` | + +**Example** + +Clustering test_hudi_table with table name +``` +call run_clustering(table => 'test_hudi_table'); +``` + +Clustering test_hudi_table with table path +``` +call run_clustering(path => '/tmp/hoodie/test_hudi_table'); +``` + +Clustering test_hudi_table with table name, predicate and order column +``` +call run_clustering(table => 'test_hudi_table', predicate => 'ts <= 20220408L', order => 'ts'); +``` + +Clustering test_hudi_table with table name,
show_involved_partition +``` +call run_clustering(table => 'test_hudi_table', show_involved_partition => true); +``` + +Clustering test_hudi_table with table name, op +``` +call run_clustering(table => 'test_hudi_table', op => 'schedule'); +``` + +Clustering test_hudi_table with table name, order_strategy +``` +call run_clustering(table => 'test_hudi_table', order_strategy => 'z-order'); +``` + +Clustering test_hudi_table with table name, op, options +``` +call run_clustering(table => 'test_hudi_table', op => 'schedule', options => ' +hoodie.clustering.plan.strategy.target.file.max.bytes=1024*1024*1024, +hoodie.clustering.plan.strategy.max.bytes.per.group=2*1024*1024*1024'); +``` + +Clustering test_hudi_table with table name, op, instants +``` +call run_clustering(table => 'test_hudi_table', op => 'execute', instants => 'ts1,ts2'); +``` + +Clustering test_hudi_table with table name, op, selected_partitions +``` +call run_clustering(table => 'test_hudi_table', op => 'execute', selected_partitions => 'par1,par2'); +``` + +Clustering test_hudi_table with table name, op, limit +``` +call run_clustering(table => 'test_hudi_table', op => 'execute', limit => 10); +``` +:::note +Limit parameter is valid only when op is execute. + +::: + +### show_clustering + +Show pending clusterings on a hoodie table. + +:::note +When calling this procedure, one of parameters ``table`` and ``path`` must be specified at least. +If both parameters are given, ``table`` will take effect. 
+ +::: + + +**Input** + +| Parameter Name | Type | Required | Default Value | Description | +|----------------|--------|----------|---------------|--------------------------------------| +| table | String | N | None | Name of table to be clustered | +| path | String | N | None | Path of table to be clustered | +| limit | Int | N | None | Max number of records to be returned | + +**Output** + +| Parameter Name | Type | Required | Default Value | Description | +|----------------|--------|----------|---------------|---------------------------------------| +| timestamp | String | N | None | Instant time | +| groups | Int | N | None | Number of file groups to be processed | + +**Example** + +Show pending clusterings with table name +``` +call show_clustering(table => 'test_hudi_table'); +``` +| timestamp | groups | +|-------------------|--------| +| 20220408153707928 | 2 | +| 20220408153636963 | 3 | + +Show pending clusterings with table path +``` +call show_clustering(path => '/tmp/hoodie/test_hudi_table'); +``` +| timestamp | groups | +|-------------------|--------| +| 20220408153707928 | 2 | +| 20220408153636963 | 3 | + +Show pending clusterings with table name and limit +``` +call show_clustering(table => 'test_hudi_table', limit => 1); +``` +| timestamp | groups | +|-------------------|--------| +| 20220408153707928 | 2 | + +### run_compaction + +Schedule or run compaction on a hoodie table. + +:::note +For scheduling compaction, if `timestamp` is specified, new scheduled compaction will use given +timestamp as instant time. Otherwise, compaction will be scheduled by using current system time. + +For running compaction, the given ``timestamp`` must be a pending compaction instant time that +already exists; if it is not, an exception will be thrown. Meanwhile, if ``timestamp`` is not specified +and there are pending compactions, all pending compactions will be executed without new compaction +instant generated.
+ +When calling this procedure, one of parameters ``table`` and ``path`` must be specified at least. +If both parameters are given, ``table`` will take effect. +::: + +**Input** + +| Parameter Name | Type | Required | Default Value | Description | +|----------------|--------|----------|---------------|----------------------------------------------------------------------------------------------------| +| op | String | N | None | Operation type, `RUN` or `SCHEDULE` | +| table | String | N | None | Name of table to be compacted | +| path | String | N | None | Path of table to be compacted | +| timestamp | String | N | None | Instant time | +| options | String | N | None | comma separated list of Hudi configs for compaction in the format "config1=value1,config2=value2" | + +**Output** + +The output of the `RUN` operation is `EMPTY`; the output of `SCHEDULE` is as follows: + +| Parameter Name | Type | Required | Default Value | Description | +|----------------|--------|-----------|---------------|--------------| +| instant | String | N | None | Instant name | + +**Example** + +Run compaction with table name +``` +call run_compaction(op => 'run', table => 'test_hudi_table'); +``` + +Run compaction with table path +``` +call run_compaction(op => 'run', path => '/tmp/hoodie/test_hudi_table'); +``` + +Run compaction with table path and timestamp +``` +call run_compaction(op => 'run', path => '/tmp/hoodie/test_hudi_table', timestamp => '20220408153658568'); +``` +Run compaction with options +``` +call run_compaction(op => 'run', table => 'test_hudi_table', options => 'hoodie.compaction.strategy=org.apache.hudi.table.action.compact.strategy.LogFileNumBasedCompactionStrategy,hoodie.compaction.logfile.num.threshold=3'); +``` + +Schedule compaction with table name +``` +call run_compaction(op => 'schedule', table => 'test_hudi_table'); +``` +| instant | +|-------------------| +| 20220408153650834 | + +Schedule compaction with table path +``` +call run_compaction(op => 'schedule', path =>
'/tmp/hoodie/test_hudi_table'); +``` +| instant | +|-------------------| +| 20220408153650834 | + +Schedule compaction with table path and timestamp +``` +call run_compaction(op => 'schedule', path => '/tmp/hoodie/test_hudi_table', timestamp => '20220408153658568'); +``` +| instant | +|-------------------| +| 20220408153658568 | + +### show_compaction + +Show all compactions on a hoodie table, in-flight or completed compactions are included, and result will +be in reverse order according to trigger time. + +:::note +When calling this procedure, one of parameters ``table``and ``path`` must be specified at least. +If both parameters are given, ``table`` will take effect. +::: + +**Input** + +| Parameter Name | Type | Required | Default Value | Description | +|----------------|--------|----------|---------------|--------------------------------------| +| table | String | N | None | Name of table to show compaction | +| path | String | N | None | Path of table to show compaction | +| limit | Int | N | None | Max number of records to be returned | + +**Output** + +| Parameter Name | Type | Required | Default Value | Description | +|----------------|--------|----------|---------------|---------------------------------------| +| timestamp | String | N | None | Instant time | +| action | String | N | None | Action name of compaction | +| size | Int | N | None | Number of file slices to be compacted | + +**Example** + +Show compactions with table name +``` +call show_compaction(table => 'test_hudi_table'); +``` +| timestamp | action | size | +|-------------------|------------|---------| +| 20220408153707928 | compaction | 10 | +| 20220408153636963 | compaction | 10 | + +Show compactions with table path +``` +call show_compaction(path => '/tmp/hoodie/test_hudi_table'); +``` +| timestamp | action | size | +|-------------------|------------|---------| +| 20220408153707928 | compaction | 10 | +| 20220408153636963 | compaction | 10 | + +Show compactions with table name and limit 
+``` +call show_compaction(table => 'test_hudi_table', limit => 1); +``` +| timestamp | action | size | +|-------------------|------------|---------| +| 20220408153707928 | compaction | 10 | + +### run_clean + +Run cleaner on a hoodie table. + +**Input** + +| Parameter Name | Type | Required | Default Value | Description | +|---------------------------------------------------------------------------------------|---------|----------|---------------|-------------| +| table | String | Y | None | Name of table to be cleaned | +| schedule_in_line | Boolean | N | true | Set "true" if you want to schedule and run a clean. Set false if you have already scheduled a clean and want to run that. | +| [clean_policy](configurations#hoodiecleanerpolicy) | String | N | None | org.apache.hudi.common.model.HoodieCleaningPolicy: Cleaning policy to be used. The cleaner service deletes older file slices files to re-claim space. Long running query plans may often refer to older file slices and will break if those are cleaned, before the query has had a chance to run. So, it is good to make sure that the data is retained for more than the maximum query execution time. By default, the cleaning policy is determined based on one of the following configs explicitly set by the user (at most one of them can be set; otherwise, KEEP_LATEST_COMMITS cleaning policy is used). KEEP_LATEST_FILE_VERSIONS: keeps the last N versions of the file slices written; used when "hoodie.cleaner.fileversions.retained" is explicitly set only. KEEP_LATEST_COMMITS(default): keeps the file slices written by the last N commits; used when "hoodie.cleaner.commits.retained" is explicitly set only. KEEP_LATEST_BY_HOURS: keeps the file slices written in the last N hours based on the commit time; used when "hoodie.cleaner.hours.retained" is explicitly set only.
| +| [retain_commits](configurations#hoodiecleanercommitsretained) | Int | N | None | When KEEP_LATEST_COMMITS cleaning policy is used, the number of commits to retain, without cleaning. This will be retained for num_of_commits * time_between_commits (scheduled). This also directly translates into how much data retention the table supports for incremental queries. | +| [hours_retained](configurations#hoodiecleanerhoursretained) | Int | N | None | When KEEP_LATEST_BY_HOURS cleaning policy is used, the number of hours for which commits need to be retained. This config provides a more flexible option as compared to number of commits retained for cleaning service. Setting this property ensures all the files, but the latest in a file group, corresponding to commits with commit times older than the configured number of hours to be retained are cleaned. | +| [file_versions_retained](configurations#hoodiecleanerfileversionsretained) | Int | N | None | When KEEP_LATEST_FILE_VERSIONS cleaning policy is used, the minimum number of file slices to retain in each file group, during cleaning. | +| [trigger_strategy](/docs/next/configurations#hoodiecleantriggerstrategy) | String | N | None | org.apache.hudi.table.action.clean.CleaningTriggerStrategy: Controls when cleaning is scheduled. NUM_COMMITS(default): Trigger the cleaning service every N commits, determined by `hoodie.clean.max.commits` | +| [trigger_max_commits](/docs/next/configurations/#hoodiecleanmaxcommits) | Int | N | None | Number of commits after the last clean operation, before scheduling of a new clean is attempted. 
| +| [options](/docs/next/configurations/#Clean-Configs) | String | N | None | comma separated list of Hudi configs for cleaning in the format "config1=value1,config2=value2" | + +**Output** + +| Parameter Name | Type | +|---------------------------|--------| +| start_clean_time | String | +| time_taken_in_millis | Long | +| total_files_deleted | Int | +| earliest_commit_to_retain | String | +| bootstrap_part_metadata | String | +| version | Int | + +**Example** + +Run clean with table name +``` +call run_clean(table => 'test_hudi_table'); +``` + +Run clean with keep latest file versions policy +``` +call run_clean(table => 'test_hudi_table', trigger_max_commits => 2, clean_policy => 'KEEP_LATEST_FILE_VERSIONS', file_versions_retained => 1) +``` + +### delete_marker + +Delete marker files of a hudi table. + +**Input** + +| Parameter Name | Type | Required | Default Value | Description | +|----------------|--------|----------|---------------|-----------------| +| table | String | Y | None | Hudi table name | +| instant_time | String | Y | None | Instant name | + +**Output** + +| Output Name | Type | +|----------------------|---------| +| delete_marker_result | Boolean | + +**Example** + +``` +call delete_marker(table => 'test_hudi_table', instant_time => '20230206174349556'); +``` + +| delete_marker_result | +|:---------------------| +| true | + +### sync_validate + +Validate sync procedure. 
+ +**Input** + +| Parameter Name | Type | Required | Default Value | Description | +|-----------------|--------|----------|---------------|-------------------| +| src_table | String | Y | None | Source table name | +| dst_table | String | Y | None | Target table name | +| mode | String | Y | "complete" | Mode | +| hive_server_url | String | Y | None | Hive server url | +| hive_pass | String | Y | None | Hive password | +| src_db | String | N | "rawdata" | Source database | +| target_db | String | N | "dwh_hoodie" | Target database | +| partition_cnt | Int | N | 5 | Partition count | +| hive_user | String | N | "" | Hive user name | + +**Output** + +| Output Name | Type | +|-------------|--------| +| result | String | + +**Example** + +``` + call sync_validate(hive_server_url=>'jdbc:hive2://localhost:10000/default', src_table => 'test_hudi_table_src', dst_table=> 'test_hudi_table_dst', mode=>'complete', hive_pass=>'', src_db=> 'default', target_db=>'default'); +``` + +### hive_sync + +Sync the table's latest schema to Hive metastore. + +**Input** + +| Parameter Name | Type | Required | Default Value | Description | +|-----------------------------------------------------------------------------------------------------------|--------|----------|---------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| table | String | Y | None | Hudi table name | +| metastore_uri | String | N | "" | Metastore_uri | +| username | String | N | "" | User name | +| password | String | N | "" | Password | +| [use_jdbc](/docs/next/configurations#hoodiedatasourcehive_syncuse_jdbc) | String | N | "" | Use JDBC when hive synchronization is enabled | +| [mode](/docs/next/configurations#hoodiedatasourcehive_syncmode) | String | N | "" | Mode to choose for Hive ops.
Valid values are hms, jdbc and hiveql. | +| [partition_fields](/docs/next/configurations#hoodiedatasourcehive_syncpartition_fields) | String | N | "" | Field in the table to use for determining hive partition columns. | +| [partition_extractor_class](/docs/next/configurations#hoodiedatasourcehive_syncpartition_extractor_class) | String | N | "" | Class which implements PartitionValueExtractor to extract the partition values, default 'org.apache.hudi.hive.MultiPartKeysValueExtractor'. | +| [strategy](/docs/next/configurations#hoodiedatasourcehive_synctablestrategy) | String | N | "" | Hive table synchronization strategy. Available option: RO, RT, ALL. | +| [sync_incremental](/docs/next/configurations#hoodiemetasyncincremental) | String | N | "" | Whether to incrementally sync the partitions to the metastore, i.e., only added, changed, and deleted partitions based on the commit metadata. If set to `false`, the meta sync executes a full partition sync operation when partitions are lost. | + + + +**Output** + +| Output Name | Type | +|-------------|--------| +| result | String | + +**Example** + +``` +call hive_sync(table => 'test_hudi_table'); +``` + +| result | +|:-------| +| true | + +### hdfs_parquet_import + +Add parquet files to a hudi table.
+ +**Input** + +| Parameter Name | Type | Required | Default Value | Description | +|------------------|--------|----------|---------------|--------------------------------------------| +| table | String | Y | None | Hudi table name | +| table_type | String | Y | "" | Table type, MERGE_ON_READ or COPY_ON_WRITE | +| src_path | String | Y | "" | Source path | +| target_path | String | Y | "" | target path | +| row_key | String | Y | "" | Primary key | +| partition_key | String | Y | "" | Partition key | +| schema_file_path | String | Y | "" | Path of Schema file | +| format | String | N | "parquet" | File format | +| command | String | N | "insert" | Import command | +| retry | Int | N | 0 | Retry times | +| parallelism | Int | N | None | Parallelism | +| props_file_path | String | N | "" | Path of properties file | + +**Output** + +| Output Name | Type | +|---------------|------| +| import_result | Int | + +**Example** + +``` +call hdfs_parquet_import(table => 'test_hudi_table', table_type => 'COPY_ON_WRITE', src_path => '', target_path => '', row_key => 'id', partition_key => 'dt', schema_file_path => ''); +``` + +| import_result | +|:--------------| +| 0 | + + +### repair_add_partition_meta + +Repair add partition for a hudi table. + +**Input** + +| Parameter Name | Type | Required | Default Value | Description | +|----------------|---------|----------|---------------|-----------------| +| table | String | Y | None | Hudi table name | +| dry_run | Boolean | N | true | Dry run | + +**Output** + +| Output Name | Type | +|---------------------|--------| +| partition_path | String | +| metadata_is_present | String | +| action | String | + +**Example** + +``` +call repair_add_partition_meta(table => 'test_hudi_table'); +``` + +| partition_path | metadata_is_present | action | +|----------------|---------------------|--------| +| dt=2021-05-03 | Yes | None | + +### repair_corrupted_clean_files + +Repair corrupted clean files for a hudi table. 
+ +**Input** + +| Parameter Name | Type | Required | Default Value | Description | +|---------------------|---------|----------|---------------|--------------------| +| table | String | Y | None | Hudi table name | + +**Output** + +| Output Name | Type | +|-------------|---------| +| result | Boolean | + +**Example** + +``` +call repair_corrupted_clean_files(table => 'test_hudi_table'); +``` + +| result | +|--------| +| true | + +### repair_deduplicate + +Repair deduplicate records for a hudi table. The job deduplicates the data in the duplicated_partition_path and writes it into repaired_output_path. At the end of the job, the data in repaired_output_path is copied into the original path (duplicated_partition_path). + +**Input** + +| Parameter Name | Type | Required | Default Value | Description | +|---------------------------|---------|----------|---------------|---------------------------| +| table | String | Y | None | Hudi table name | +| duplicated_partition_path | String | Y | None | Duplicated partition path | +| repaired_output_path | String | Y | None | Repaired output path | +| dry_run | Boolean | N | true | Dry run | +| dedupe_type | String | N | "insert_type" | Dedupe type | + +**Output** + +| Output Name | Type | +|-------------|--------| +| result | String | + +**Example** + +``` +call repair_deduplicate(table => 'test_hudi_table', duplicated_partition_path => 'dt=2021-05-03', repaired_output_path => '/tmp/repair_path/'); +``` + +| result | +|----------------------------------------------| +| Reduplicated files placed in: /tmp/repair_path/. | + +### repair_migrate_partition_meta + +Migrate the partition meta files of a hudi table (from the text format to the base file format).
+ +**Input** + +| Parameter Name | Type | Required | Default Value | Description | +|----------------|---------|----------|---------------|-----------------| +| table | String | Y | None | Hudi table name | +| dry_run | Boolean | N | true | Dry run | + +**Output** + +| Output Name | Type | +|-----------------------|--------| +| partition_path | String | +| text_metafile_present | String | +| base_metafile_present | String | +| action | String | + +**Example** + +``` +call repair_migrate_partition_meta(table => 'test_hudi_table'); +``` + +### repair_overwrite_hoodie_props + +Overwrite the properties of a hudi table. + +**Input** + +| Parameter Name | Type | Required | Default Value | Description | +|---------------------|--------|----------|---------------|------------------------| +| table | String | Y | None | Hudi table name | +| new_props_file_path | String | Y | None | Path of new properties | + +**Output** + +| Output Name | Type | +|-------------|--------| +| property | String | +| old_value | String | +| new_value | String | + +**Example** + +``` +call repair_overwrite_hoodie_props(table => 'test_hudi_table', new_props_file_path => '/tmp/props'); +``` + +| property | old_value | new_value | +|--------------------------|-----------|-----------| +| hoodie.file.index.enable | true | false | + +## Bootstrap + +### run_bootstrap + +Convert an existing table to Hudi. 
+ +**Input** + +| Parameter Name | Type | Required | Default Value | Description | +|------------------------------------------------------------------------------|---------|----------|-------------------------------------------------------------------------------|-------------| +| table | String | Y | None | Name of table to be bootstrapped | +| table_type | String | Y | None | Table type, MERGE_ON_READ or COPY_ON_WRITE | +| [bootstrap_path](/docs/next/configurations#hoodiebootstrapbasepath) | String | Y | None | Base path of the dataset that needs to be bootstrapped as a Hudi table | +| base_path | String | Y | None | Base path | +| rowKey_field | String | Y | None | Primary key field | +| base_file_format | String | N | "PARQUET" | Format of base file | +| partition_path_field | String | N | "" | Partitioned column field | +| [bootstrap_index_class](/docs/next/configurations#hoodiebootstrapindexclass) | String | N | "org.apache.hudi.common.bootstrap.index.HFileBootstrapIndex" | Implementation to use, for mapping a skeleton base file to a bootstrap base file. | +| [selector_class](/docs/next/configurations#hoodiebootstrapmodeselector) | String | N | "org.apache.hudi.client.bootstrap.selector.MetadataOnlyBootstrapModeSelector" | Selects the mode in which each file/partition in the bootstrapped dataset gets bootstrapped | +| key_generator_class | String | N | "org.apache.hudi.keygen.SimpleKeyGenerator" | Class of key generator | +| full_bootstrap_input_provider | String | N | "org.apache.hudi.bootstrap.SparkParquetBootstrapDataProvider" | Class of full bootstrap input provider | +| schema_provider_class | String | N | "" | Class of schema provider | +| payload_class | String | N | "org.apache.hudi.common.model.OverwriteWithLatestAvroPayload" | Class of payload | +| [parallelism](/docs/next/configurations#hoodiebootstrapparallelism) | Int | N | 1500 | For metadata-only bootstrap, Hudi parallelizes the operation so that each table partition is handled by one Spark task. 
This config limits the number of parallelism. We pick the configured parallelism if the number of table partitions is larger than this configured value. The parallelism is assigned to the number of table partitions if it is smaller than the configured value. For full-record bootstrap, i.e., BULK_INSERT operation of the records, this configured value is passed as the BULK_INSERT shuffle parallelism (`hoodie.bulkinsert.shuffle.parallelism`), determining the BULK_INSERT write behavior. If you see that the bootstrap is slow due to the limited parallelism, you can increase this. | +| enable_hive_sync | Boolean | N | false | Whether to enable hive sync | +| props_file_path | String | N | "" | Path of properties file | +| bootstrap_overwrite | Boolean | N | false | Overwrite bootstrap path | + +**Output** + +| Output Name | Type | +|-------------|---------| +| status | Boolean | + +**Example** + +``` +call run_bootstrap(table => 'test_hudi_table', table_type => 'COPY_ON_WRITE', bootstrap_path => 'hdfs://ns1/hive/warehouse/hudi.db/test_hudi_table', base_path => 'hdfs://ns1//tmp/hoodie/test_hudi_table', rowKey_field => 'id', partition_path_field => 'dt',bootstrap_overwrite => true); +``` + +| status | +|--------| +| 0 | + +### show_bootstrap_mapping + +Show mapping files of a bootstrap table. 
+ +**Input** + +| Parameter Name | Type | Required | Default Value | Description | +|----------------|---------|----------|---------------|--------------------------------------| +| table | String | Y | None | Name of the bootstrapped table | +| partition_path | String | N | "" | Partition path | +| file_ids | String | N | "" | File ids | +| limit | Int | N | 10 | Max number of records to be returned | +| sort_by | String | N | "partition" | Sort by columns | +| desc | Boolean | N | false | Descending order | + +**Output** + +| Output Name | Type | +|------------------|--------| +| partition | String | +| file_id | String | +| source_base_path | String | +| source_partition | String | +| source_file | String | + +**Example** + +``` +call show_bootstrap_mapping(table => 'test_hudi_table'); +``` + +| partition | file_id | source_base_path | source_partition | source_file | +|---------------|----------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------|------------------|--------------------------------------------| +| dt=2021-05-03 | d0073a12-085d-4f49-83e9-402947e7e90a-0 | hdfs://ns1/hive/warehouse/hudi.db/test_hudi_table/dt=2021-05-03/d0073a12-085d-4f49-83e9-402947e7e90a-0_0-2-2_00000000000002.parquet | dt=2021-05-03 | hdfs://ns1/tmp/dt=2021-05-03/00001.parquet | + + +### show_bootstrap_partitions + +Show partitions of a bootstrap table. 
+ +**Input** + +| Parameter Name | Type | Required | Default Value | Description | +|----------------|--------|----------|---------------|--------------------------------------| +| table | String | Y | None | Name of table to be clustered | + +**Output** + +| Parameter Name | Type | +|--------------------|--------| +| indexed_partitions | String | + +**Example** + +``` +call show_bootstrap_partitions(table => 'test_hudi_table'); +``` + +| indexed_partitions | +|--------------------| +| dt=2021-05-03 | + +## Version management + +### upgrade_table + +upgrade a hudi table to a specific version. + +**Input** + +| Parameter Name | Type | Required | Default Value | Description | +|----------------|--------|----------|---------------|-------------------------| +| table | String | Y | None | Hudi table name | +| to_version | String | Y | None | Version of hoodie table | + +**Output** + +| Output Name | Type | +|-------------|---------| +| result | Boolean | + +**Example** + +``` +call upgrade_table(table => 'test_hudi_table', to_version => 'FIVE'); +``` + +| result | +|--------| +| true | + +### downgrade_table + +downgrade a hudi table to a specific version. 
+ +**Input** + +| Parameter Name | Type | Required | Default Value | Description | +|----------------|--------|----------|---------------|-------------------------| +| table | String | Y | None | Hudi table name | +| to_version | String | Y | None | Version of hoodie table | + +**Output** + +| Output Name | Type | +|-------------|---------| +| result | Boolean | + +**Example** + +``` +call downgrade_table(table => 'test_hudi_table', to_version => 'FOUR'); +``` + +| result | +|--------| +| true | diff --git a/website/versioned_docs/version-1.0.0/python-rust-quick-start-guide.md b/website/versioned_docs/version-1.0.0/python-rust-quick-start-guide.md new file mode 100644 index 0000000000000..b4aecb8d958fe --- /dev/null +++ b/website/versioned_docs/version-1.0.0/python-rust-quick-start-guide.md @@ -0,0 +1,119 @@ +--- +title: "Python/Rust Quick Start" +toc: true +last_modified_at: 2024-11-28T12:53:57+08:00 +--- +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +This guide will help you get started with [hudi-rs](https://github.com/apache/hudi-rs), a native Rust library for Apache Hudi with Python bindings. Learn how to install, set up, and perform basic operations using both Python and Rust interfaces. + +## Installation + +```bash +# Python +pip install hudi + +# Rust +cargo add hudi +``` + +## Basic Usage + +:::note +Currently, write capabilities and reading from MOR tables are not supported. + +The examples below expect a Hudi table exists at `/tmp/trips_table`, created using the [quick start guide](/docs/quick-start-guide). 
+::: + +### Python Example + +```python +from hudi import HudiTableBuilder +import pyarrow as pa + +hudi_table = ( + HudiTableBuilder + .from_base_uri("/tmp/trips_table") + .build() +) + +# Read with partition filters +records = hudi_table.read_snapshot(filters=[("city", "=", "san_francisco")]) + +# Convert to PyArrow table +arrow_table = pa.Table.from_batches(records) +result = arrow_table.select(["rider", "city", "ts", "fare"]) +``` + +### Rust Example (with DataFusion) + +1. Set up your project: + +```bash +cargo new my_project --bin && cd my_project +cargo add tokio@1 datafusion@42 +cargo add hudi --features datafusion +``` + +1. Add code to `src/main.rs`: + +```rust +use std::sync::Arc; +use datafusion::error::Result; +use datafusion::prelude::{DataFrame, SessionContext}; +use hudi::HudiDataSource; + +#[tokio::main] +async fn main() -> Result<()> { + let ctx = SessionContext::new(); + let hudi = HudiDataSource::new_with_options("/tmp/trips_table", []).await?; + ctx.register_table("trips_table", Arc::new(hudi))?; + // Read with partition filters + let df: DataFrame = ctx.sql("SELECT * from trips_table where city = 'san_francisco'").await?; + df.show().await?; + Ok(()) +} +``` + +## Cloud Storage Integration + +### Python + +```python +from hudi import HudiTableBuilder + +hudi_table = ( + HudiTableBuilder + .from_base_uri("s3://bucket/trips_table") + .with_option("aws_region", "us-west-2") + .build() +) +``` + +### Rust + +```rust +use hudi::HudiDataSource; + +let hudi = HudiDataSource::new_with_options( + "s3://bucket/trips_table", + [("aws_region", "us-west-2")] +).await?; +``` + +### Supported Cloud Storage + +- AWS S3 (`s3://`) +- Azure Storage (`az://`) +- Google Cloud Storage (`gs://`) + +Set appropriate environment variables (`AWS_*`, `AZURE_*`, or `GOOGLE_*`) for authentication, or pass through the `option()` API. 
+ +## Read with Timestamp + +Add timestamp option for time-travel queries: + +```python +.with_option("hoodie.read.as.of.timestamp", "20241122010827898") +``` diff --git a/website/versioned_docs/version-1.0.0/querying_data.md b/website/versioned_docs/version-1.0.0/querying_data.md new file mode 100644 index 0000000000000..d96d3b3875b75 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/querying_data.md @@ -0,0 +1,99 @@ +--- +title: Querying Data +keywords: [ hudi, hive, spark, sql, presto] +summary: In this page, we go over how to process data in Hudi tables. +toc: true +last_modified_at: 2019-12-30T15:59:57-04:00 +--- + +:::danger +This page is no longer maintained. Please refer to Hudi [SQL DDL](sql_ddl), [SQL DML](sql_dml), [SQL Queries](sql_queries) and [Procedures](procedures) for the latest documentation. +::: + +Conceptually, Hudi stores data physically once on DFS, while providing 3 different ways of querying, as explained [before](/docs/concepts#query-types). +Once the table is synced to the Hive metastore, it provides external Hive tables backed by Hudi's custom inputformats. Once the proper hudi +bundle has been installed, the table can be queried by popular query engines like Hive, Spark SQL, Flink, Trino and PrestoDB. + +In sections, below we will discuss specific setup to access different query types from different query engines. + +## Spark Datasource + +The Spark Datasource API is a popular way of authoring Spark ETL pipelines. Hudi tables can be queried via the Spark datasource with a simple `spark.read.parquet`. +See the [Spark Quick Start](/docs/quick-start-guide) for more examples of Spark datasource reading queries. + +**Setup** + +If your Spark environment does not have the Hudi jars installed, add [hudi-spark-bundle](https://mvnrepository.com/artifact/org.apache.hudi/hudi-spark-bundle) jar to the +classpath of drivers and executors using `--jars` option. 
Alternatively, hudi-spark-bundle can also be fetched via the +--packages option (e.g: --packages org.apache.hudi:hudi-spark3.5-bundle_2.12:1.0.0). + +### Snapshot query {#spark-snap-query} +Retrieve the data table at the present point in time. + +```scala +val hudiSnapshotQueryDF = spark + .read + .format("hudi") + .option("hoodie.datasource.query.type", "snapshot") + .load(tablePath) +``` + +### Incremental query {#spark-incr-query} +Of special interest to Spark pipelines is Hudi's ability to support incremental queries. A sample incremental query that obtains all records written since `beginInstantTime` is shown below. +Thanks to Hudi's support for record level change streams, these incremental pipelines often offer 10x efficiency over batch counterparts by only processing the changed records. + +The following snippet shows how to obtain all records changed after `beginInstantTime` and run some SQL on them. + +```java +Dataset<Row> hudiIncQueryDF = spark.read() + .format("org.apache.hudi") + .option("hoodie.datasource.query.type", "incremental") + .option("hoodie.datasource.read.begin.instanttime", beginInstantTime) + .option(DataSourceReadOptions.INCR_PATH_GLOB_OPT_KEY(), "/year=2020/month=*/day=*") // Optional, use glob pattern if querying certain partitions + .load(tablePath); // For incremental query, pass in the root/base path of table + +hudiIncQueryDF.createOrReplaceTempView("hudi_trips_incremental"); +spark.sql("select `_hoodie_commit_time`, fare, begin_lon, begin_lat, ts from hudi_trips_incremental where fare > 20.0").show(); +``` + +For examples, refer to [Incremental Queries](/docs/quick-start-guide#incremental-query) in the Spark quickstart. +Please refer to the [configurations](/docs/configurations#SPARK_DATASOURCE) section to view all datasource options. + +Additionally, `HoodieReadClient` offers the following functionality using Hudi's implicit indexing. 
+ +| **API** | **Description** | +|-------|--------| +| read(keys) | Read out the data corresponding to the keys as a DataFrame, using Hudi's own index for faster lookup | +| filterExists() | Filter out already existing records from the provided `RDD[HoodieRecord]`. Useful for de-duplication | +| checkExists(keys) | Check if the provided keys exist in a Hudi table | + +### Incremental query +`HiveIncrementalPuller` allows incrementally extracting changes from large fact/dimension tables via HiveQL, combining the benefits of Hive (reliably process complex SQL queries) and +incremental primitives (speed up querying tables incrementally instead of scanning fully). The tool uses Hive JDBC to run the hive query and saves its results in a temp table. +that can later be upserted. Upsert utility (`HoodieStreamer`) has all the state it needs from the directory structure to know what should be the commit time on the target table. +e.g: `/app/incremental-hql/intermediate/{source_table_name}_temp/{last_commit_included}`.The Delta Hive table registered will be of the form `{tmpdb}.{source_table}_{last_commit_included}`. + +The following are the configuration options for HiveIncrementalPuller + +| **Config** | **Description** | **Default** | +|-------|--------|--------| +|hiveUrl| Hive Server 2 URL to connect to | | +|hiveUser| Hive Server 2 Username | | +|hivePass| Hive Server 2 Password | | +|queue| YARN Queue name | | +|tmp| Directory where the temporary delta data is stored in DFS. The directory structure will follow conventions. Please see the below section. | | +|extractSQLFile| The SQL to execute on the source table to extract the data. The data extracted will be all the rows that changed since a particular point in time. | | +|sourceTable| Source Table Name. Needed to set hive environment properties. | | +|sourceDb| Source DB name. Needed to set hive environment properties.| | +|targetTable| Target Table Name. Needed for the intermediate storage directory structure. 
| | +|targetDb| Target table's DB name.| | +|tmpdb| The database to which the intermediate temp delta table will be created | hoodie_temp | +|fromCommitTime| This is the most important parameter. This is the point in time from which the changed records are queried from. | | +|maxCommits| Number of commits to include in the query. Setting this to -1 will include all the commits from fromCommitTime. Setting this to a value > 0, will include records that ONLY changed in the specified number of commits after fromCommitTime. This may be needed if you need to catch up say 2 commits at a time. | 3 | +|help| Utility Help | | + + +Setting fromCommitTime=0 and maxCommits=-1 will fetch the entire source table and can be used to initiate backfills. If the target table is a Hudi table, +then the utility can determine if the target table has no commits or is behind more than 24 hour (this is configurable), +it will automatically use the backfill configuration, since applying the last 24 hours incrementally could take more time than doing a backfill. The current limitation of the tool +is the lack of support for self-joining the same table in mixed mode (snapshot and incremental modes). \ No newline at end of file diff --git a/website/versioned_docs/version-1.0.0/quick-start-guide.md b/website/versioned_docs/version-1.0.0/quick-start-guide.md new file mode 100644 index 0000000000000..a9315c34e3bc1 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/quick-start-guide.md @@ -0,0 +1,1315 @@ +--- +title: "Spark Quick Start" +sidebar_position: 2 +toc: true +last_modified_at: 2023-08-23T21:14:52+09:00 +--- +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +This guide provides a quick peek at Hudi's capabilities using Spark. Using Spark Datasource APIs(both scala and python) and using Spark SQL, +we will walk through code snippets that allows you to insert, update, delete and query a Hudi table. + +## Setup + +Hudi works with Spark 3.3 and above versions. 
You can follow instructions [here](https://spark.apache.org/downloads) for setting up Spark. + +### Spark 3 Support Matrix + +| Hudi | Supported Spark 3 version | +|:----------------|:---------------------------------------------------------| +| 1.0.x | 3.5.x (default build), 3.4.x, 3.3.x | +| 0.15.x | 3.5.x (default build), 3.4.x, 3.3.x, 3.2.x, 3.1.x, 3.0.x | +| 0.14.x | 3.4.x (default build), 3.3.x, 3.2.x, 3.1.x, 3.0.x | +| 0.13.x | 3.3.x (default build), 3.2.x, 3.1.x | +| 0.12.x | 3.3.x (default build), 3.2.x, 3.1.x | +| 0.11.x | 3.2.x (default build, Spark bundle only), 3.1.x | +| 0.10.x | 3.1.x (default build), 3.0.x | +| 0.7.0 - 0.9.0 | 3.0.x | +| 0.6.0 and prior | not supported | + +The *default build* Spark version indicates how we build `hudi-spark3-bundle`. + +:::note Change summary +In 1.0.0, we dropped the support for Spark 3.2.x and lower Spark 3 versions. +In 0.15.0, we introduced the support for Spark 3.5.x. +In 0.14.0, we introduced the support for Spark 3.4.x and bring back the support for Spark 3.0.x. +In 0.12.0, we introduced the experimental support for Spark 3.3.0. +In 0.11.0, there are changes on using Spark bundles, please refer to [0.11.0 release notes](https://hudi.apache.org/releases/release-0.11.0/#spark-versions-and-bundles) for detailed instructions. 
+::: + +### Spark Shell/SQL + + + + + +From the extracted directory run spark-shell with Hudi: + + +```shell +# For Spark versions: 3.3 - 3.5 +export SPARK_VERSION=3.5 # or 3.4, 3.3 +spark-shell --packages org.apache.hudi:hudi-spark$SPARK_VERSION-bundle_2.12:1.0.0 \ +--conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' \ +--conf 'spark.sql.catalog.spark_catalog=org.apache.spark.sql.hudi.catalog.HoodieCatalog' \ +--conf 'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension' \ +--conf 'spark.kryo.registrator=org.apache.spark.HoodieSparkKryoRegistrar' +``` + + + + +From the extracted directory run pyspark with Hudi: + +```shell +# For Spark versions: 3.3 - 3.5 +export PYSPARK_PYTHON=$(which python3) +export SPARK_VERSION=3.5 # or 3.4, 3.3 +pyspark --packages org.apache.hudi:hudi-spark$SPARK_VERSION-bundle_2.12:1.0.0 --conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' --conf 'spark.sql.catalog.spark_catalog=org.apache.spark.sql.hudi.catalog.HoodieCatalog' --conf 'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension' --conf 'spark.kryo.registrator=org.apache.spark.HoodieSparkKryoRegistrar' +``` + + + + +Hudi supports using Spark SQL to write and read data with the **HoodieSparkSessionExtension** sql extension. 
+From the extracted directory run Spark SQL with Hudi: + +```shell +# For Spark versions: 3.3 - 3.5 +export SPARK_VERSION=3.5 # or 3.4, 3.3 +spark-sql --packages org.apache.hudi:hudi-spark$SPARK_VERSION-bundle_2.12:1.0.0 \ +--conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' \ +--conf 'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension' \ +--conf 'spark.sql.catalog.spark_catalog=org.apache.spark.sql.hudi.catalog.HoodieCatalog' \ +--conf 'spark.kryo.registrator=org.apache.spark.HoodieSparkKryoRegistrar' +``` + + + +:::note on Kryo serialization +Users are recommended to set this config to reduce Kryo serialization overhead + +``` +--conf 'spark.kryo.registrator=org.apache.spark.HoodieSparkKryoRegistrar' +``` +::: + +### Setup project +Below, we do imports and set up the table name and corresponding base path. + + + + + +```scala +// spark-shell +import scala.collection.JavaConversions._ +import org.apache.spark.sql.SaveMode._ +import org.apache.hudi.DataSourceReadOptions._ +import org.apache.hudi.DataSourceWriteOptions._ +import org.apache.hudi.common.table.HoodieTableConfig._ +import org.apache.hudi.config.HoodieWriteConfig._ +import org.apache.hudi.keygen.constant.KeyGeneratorOptions._ +import org.apache.hudi.common.model.HoodieRecord +import spark.implicits._ + +val tableName = "trips_table" +val basePath = "file:///tmp/trips_table" +``` + + + + +```python +# pyspark +from pyspark.sql.functions import lit, col + +tableName = "trips_table" +basePath = "file:///tmp/trips_table" +``` + + + + +```sql +-- Next section will go over create table commands +``` + + + + +## Create Table + +First, let's create a Hudi table. Here, we use a partitioned table for illustration, but Hudi also supports non-partitioned tables. + + + + +```scala +// scala +// First commit will auto-initialize the table, if it did not exist in the specified base path. 
+``` + + + + +```python +# pyspark +# First commit will auto-initialize the table, if it did not exist in the specified base path. +``` + + + + +:::note NOTE: +For users who have Spark-Hive integration in their environment, this guide assumes that you have the appropriate +settings configured to allow Spark to create tables and register in Hive Metastore. +::: + +Here is an example of creating a Hudi table. + +```sql +-- create a Hudi table that is partitioned. +CREATE TABLE hudi_table ( + ts BIGINT, + uuid STRING, + rider STRING, + driver STRING, + fare DOUBLE, + city STRING +) USING HUDI +PARTITIONED BY (city); +``` + +For more options for creating Hudi tables or if you're running into any issues, please refer to [SQL DDL](sql_ddl) reference guide. + + + + + + +## Insert data {#inserts} + + + + + +Generate some new records as a DataFrame and write the DataFrame into a Hudi table. +Since, this is the first write, it will also auto-create the table. + +```scala +// spark-shell +val columns = Seq("ts","uuid","rider","driver","fare","city") +val data = + Seq((1695159649087L,"334e26e9-8355-45cc-97c6-c31daf0df330","rider-A","driver-K",19.10,"san_francisco"), + (1695091554788L,"e96c4396-3fad-413a-a942-4cb36106d721","rider-C","driver-M",27.70 ,"san_francisco"), + (1695046462179L,"9909a8b1-2d15-4d3d-8ec9-efc48c536a00","rider-D","driver-L",33.90 ,"san_francisco"), + (1695516137016L,"e3cf430c-889d-4015-bc98-59bdce1e530c","rider-F","driver-P",34.15,"sao_paulo" ), + (1695115999911L,"c8abbe79-8d89-47ea-b4ce-4d224bae5bfa","rider-J","driver-T",17.85,"chennai")); + +var inserts = spark.createDataFrame(data).toDF(columns:_*) +inserts.write.format("hudi"). + option("hoodie.datasource.write.partitionpath.field", "city"). + option("hoodie.table.name", tableName). + mode(Overwrite). 
+ save(basePath) +``` + +:::info Mapping to Hudi write operations +Hudi provides a wide range of [write operations](write_operations) - both batch and incremental - to write data into Hudi tables, +with different semantics and performance. When record keys are not configured (see [keys](#keys) below), `bulk_insert` will be chosen as +the write operation, matching the out-of-the-box behavior of Spark's Parquet Datasource. +::: + + + + +Generate some new records as a DataFrame and write the DataFrame into a Hudi table. +Since this is the first write, it will also auto-create the table. + +```python +# pyspark +columns = ["ts","uuid","rider","driver","fare","city"] +data =[(1695159649087,"334e26e9-8355-45cc-97c6-c31daf0df330","rider-A","driver-K",19.10,"san_francisco"), + (1695091554788,"e96c4396-3fad-413a-a942-4cb36106d721","rider-C","driver-M",27.70 ,"san_francisco"), + (1695046462179,"9909a8b1-2d15-4d3d-8ec9-efc48c536a00","rider-D","driver-L",33.90 ,"san_francisco"), + (1695516137016,"e3cf430c-889d-4015-bc98-59bdce1e530c","rider-F","driver-P",34.15,"sao_paulo"), + (1695115999911,"c8abbe79-8d89-47ea-b4ce-4d224bae5bfa","rider-J","driver-T",17.85,"chennai")] +inserts = spark.createDataFrame(data).toDF(*columns) + +hudi_options = { + 'hoodie.table.name': tableName, + 'hoodie.datasource.write.partitionpath.field': 'city' +} + +inserts.write.format("hudi"). \ + options(**hudi_options). \ + mode("overwrite"). \ + save(basePath) +``` + +:::info Mapping to Hudi write operations +Hudi provides a wide range of [write operations](write_operations) - both batch and incremental - to write data into Hudi tables, +with different semantics and performance. When record keys are not configured (see [keys](#keys) below), `bulk_insert` will be chosen as +the write operation, matching the out-of-the-box behavior of Spark's Parquet Datasource. +::: + + + + + +Users can use 'INSERT INTO' to insert data into a Hudi table. See [Insert Into](sql_dml#insert-into) for more advanced options. 
+ +```sql +INSERT INTO hudi_table +VALUES +(1695159649087,'334e26e9-8355-45cc-97c6-c31daf0df330','rider-A','driver-K',19.10,'san_francisco'), +(1695091554788,'e96c4396-3fad-413a-a942-4cb36106d721','rider-C','driver-M',27.70 ,'san_francisco'), +(1695046462179,'9909a8b1-2d15-4d3d-8ec9-efc48c536a00','rider-D','driver-L',33.90 ,'san_francisco'), +(1695332066204,'1dced545-862b-4ceb-8b43-d2a568f6616b','rider-E','driver-O',93.50,'san_francisco'), +(1695516137016,'e3cf430c-889d-4015-bc98-59bdce1e530c','rider-F','driver-P',34.15,'sao_paulo' ), +(1695376420876,'7a84095f-737f-40bc-b62f-6b69664712d2','rider-G','driver-Q',43.40 ,'sao_paulo' ), +(1695173887231,'3eeb61f7-c2b0-4636-99bd-5d7a5a1d2c04','rider-I','driver-S',41.06 ,'chennai' ), +(1695115999911,'c8abbe79-8d89-47ea-b4ce-4d224bae5bfa','rider-J','driver-T',17.85,'chennai'); +``` + +If you want to control the Hudi write operation used for the INSERT statement, you can set the following config before issuing +the INSERT statement: + +```sql +-- bulk_insert using INSERT_INTO +SET hoodie.spark.sql.insert.into.operation = 'bulk_insert' +``` + + + + + +## Query data {#querying} + +Hudi tables can be queried back into a DataFrame or Spark SQL. 
+ + + + + +```scala +// spark-shell +val tripsDF = spark.read.format("hudi").load(basePath) +tripsDF.createOrReplaceTempView("trips_table") + +spark.sql("SELECT uuid, fare, ts, rider, driver, city FROM trips_table WHERE fare > 20.0").show() +spark.sql("SELECT _hoodie_commit_time, _hoodie_record_key, _hoodie_partition_path, rider, driver, fare FROM trips_table").show() +``` + + + +```python +# pyspark +tripsDF = spark.read.format("hudi").load(basePath) +tripsDF.createOrReplaceTempView("trips_table") + +spark.sql("SELECT uuid, fare, ts, rider, driver, city FROM trips_table WHERE fare > 20.0").show() +spark.sql("SELECT _hoodie_commit_time, _hoodie_record_key, _hoodie_partition_path, rider, driver, fare FROM trips_table").show() +``` + + + +```sql + SELECT ts, fare, rider, driver, city FROM hudi_table WHERE fare > 20.0; +``` + + + + +## Update data {#upserts} + +Hudi tables can be updated by streaming in a DataFrame or using a standard UPDATE statement. + + + + + +```scala +// Lets read data from target Hudi table, modify fare column for rider-D and update it. +val updatesDf = spark.read.format("hudi").load(basePath).filter($"rider" === "rider-D").withColumn("fare", col("fare") * 10) + +updatesDf.write.format("hudi"). + option("hoodie.datasource.write.operation", "upsert"). + option("hoodie.datasource.write.partitionpath.field", "city"). + option("hoodie.table.name", tableName). + mode(Append). + save(basePath) +``` + +:::info Key requirements +Updates with spark-datasource is feasible only when the source dataframe contains Hudi's meta fields or a [key field](#keys) is configured. +Notice that the save mode is now `Append`. In general, always use append mode unless you are trying to create the table for the first time. +::: + + + + + + +Hudi table can be update using a regular UPDATE statement. See [Update](sql_dml#update) for more advanced options. 
+ +```sql +UPDATE hudi_table SET fare = 25.0 WHERE rider = 'rider-D'; +``` + + + + +```python +# pyspark +# Lets read data from target Hudi table, modify fare column for rider-D and update it. +updatesDf = spark.read.format("hudi").load(basePath).filter("rider == 'rider-D'").withColumn("fare",col("fare")*10) + +updatesDf.write.format("hudi"). \ + options(**hudi_options). \ + mode("append"). \ + save(basePath) +``` + +:::info Key requirements +Updates with spark-datasource is feasible only when the source dataframe contains Hudi's meta fields or a [key field](#keys) is configured. +Notice that the save mode is now `Append`. In general, always use append mode unless you are trying to create the table for the first time. +::: + + + + + +[Querying](#querying) the data again will now show updated records. Each write operation generates a new [commit](concepts). +Look for changes in `_hoodie_commit_time`, `fare` fields for the given `_hoodie_record_key` value from a previous commit. + +## Merging Data {#merge} + + + + + + +```scala +// spark-shell +val adjustedFareDF = spark.read.format("hudi"). + load(basePath).limit(2). + withColumn("fare", col("fare") * 10) +adjustedFareDF.write.format("hudi"). +option("hoodie.datasource.write.payload.class","com.payloads.CustomMergeIntoConnector"). +mode(Append). +save(basePath) +// Notice Fare column has been updated but all other columns remain intact. +spark.read.format("hudi").load(basePath).show() +``` +The `com.payloads.CustomMergeIntoConnector` adds adjusted fare values to the original table and preserves all other fields. +Refer [here](https://gist.github.com/bhasudha/7ea07f2bb9abc5c6eb86dbd914eec4c6) for sample implementation of `com.payloads.CustomMergeIntoConnector`. + + + + + +```python +# pyspark +adjustedFareDF = spark.read.format("hudi").load(basePath). \ + limit(2).withColumn("fare", col("fare") * 100) +adjustedFareDF.write.format("hudi"). 
\ +option("hoodie.datasource.write.payload.class","com.payloads.CustomMergeIntoConnector"). \ +mode("append"). \ +save(basePath) +# Notice Fare column has been updated but all other columns remain intact. +spark.read.format("hudi").load(basePath).show() +``` + +The `com.payloads.CustomMergeIntoConnector` adds adjusted fare values to the original table and preserves all other fields. +Refer [here](https://gist.github.com/bhasudha/7ea07f2bb9abc5c6eb86dbd914eec4c6) for sample implementation of `com.payloads.CustomMergeIntoConnector`. + + + + + +```sql +-- source table using Hudi for testing merging into target Hudi table +CREATE TABLE fare_adjustment (ts BIGINT, uuid STRING, rider STRING, driver STRING, fare DOUBLE, city STRING) +USING HUDI; +INSERT INTO fare_adjustment VALUES +(1695091554788,'e96c4396-3fad-413a-a942-4cb36106d721','rider-C','driver-M',-2.70 ,'san_francisco'), +(1695530237068,'3f3d9565-7261-40e6-9b39-b8aa784f95e2','rider-K','driver-U',64.20 ,'san_francisco'), +(1695241330902,'ea4c36ff-2069-4148-9927-ef8c1a5abd24','rider-H','driver-R',66.60 ,'sao_paulo' ), +(1695115999911,'c8abbe79-8d89-47ea-b4ce-4d224bae5bfa','rider-J','driver-T',1.85,'chennai' ); + + +MERGE INTO hudi_table AS target +USING fare_adjustment AS source +ON target.uuid = source.uuid +WHEN MATCHED THEN UPDATE SET target.fare = target.fare + source.fare +WHEN NOT MATCHED THEN INSERT * +; + +``` + +:::info Key requirements +1. For a Hudi table with user defined primary record [keys](#keys), the join condition is expected to contain the primary keys of the table. +For a Hudi table with Hudi generated primary keys, the join condition can be on any arbitrary data columns. +::: + + + +## Merging Data (Partial Updates) {#merge-partial-update} + +Partial updates only write updated columns instead of full update record. This is useful when you have hundreds of +columns and only a few columns are updated. It reduces the write costs as well as storage costs. 
+`MERGE INTO` statement above can be modified to use partial updates as shown below. + +```sql +MERGE INTO hudi_table AS target +USING fare_adjustment AS source +ON target.uuid = source.uuid +WHEN MATCHED THEN UPDATE SET fare = source.fare +WHEN NOT MATCHED THEN INSERT * +; +``` + +Notice, instead of `UPDATE SET *`, we are updating only the `fare` column. + +## Delete data {#deletes} + +Delete operation removes the records specified from the table. For example, this code snippet deletes records +for the HoodieKeys passed in. Check out the [deletion section](/docs/writing_data#deletes) for more details. + + + + + + +```scala +// spark-shell +// Let's delete rider: rider-F +val deletesDF = spark.read.format("hudi").load(basePath).filter($"rider" === "rider-F") + +deletesDF.write.format("hudi"). + option("hoodie.datasource.write.operation", "delete"). + option("hoodie.datasource.write.partitionpath.field", "city"). + option("hoodie.table.name", tableName). + mode(Append). + save(basePath) + +``` +[Querying](#querying) the data again will not show the deleted record. + +:::info Key requirements +Deletes with spark-datasource is supported only when the source dataframe contains Hudi's meta fields or a [key field](#keys) is configured. +Notice that the save mode is again `Append`. +::: + + + + +```sql +DELETE FROM hudi_table WHERE uuid = '3f3d9565-7261-40e6-9b39-b8aa784f95e2'; +``` + + + + + +```python +# pyspark +# Let's delete rider: rider-F +deletesDF = spark.read.format("hudi").load(basePath).filter("rider == 'rider-F'") + +# issue deletes +hudi_hard_delete_options = { + 'hoodie.table.name': tableName, + 'hoodie.datasource.write.partitionpath.field': 'city', + 'hoodie.datasource.write.operation': 'delete', +} + +deletesDF.write.format("hudi"). \ +options(**hudi_hard_delete_options). \ +mode("append"). \ +save(basePath) +``` +[Querying](#querying) the data again will not show the deleted record. 
+:::info Key requirements +Deletes with spark-datasource is supported only when the source dataframe contains Hudi's meta fields or a [key field](#keys) is configured. +Notice that the save mode is again `Append`. +::: + + + + + +## Index data {#indexing} + +Hudi supports indexing on columns to speed up queries. Indexes can be created on columns using the `CREATE INDEX` statement. + +:::note +Please note in order to create secondary index: +1. The table must have a primary key and merge mode should be [COMMIT_TIME_ORDERING](/docs/next/record_merger#commit_time_ordering). +2. Record index must be enabled. This can be done by setting `hoodie.metadata.record.index.enable=true` and then creating `record_index`. Please note the example below. +::: + + + + + +Here is an example which shows how to create indexes for a table created using Datasource API. + +```scala +import scala.collection.JavaConversions._ +import org.apache.spark.sql.SaveMode._ +import org.apache.hudi.DataSourceReadOptions._ +import org.apache.hudi.DataSourceWriteOptions._ +import org.apache.hudi.common.table.HoodieTableConfig._ +import org.apache.hudi.config.HoodieWriteConfig._ +import org.apache.hudi.keygen.constant.KeyGeneratorOptions._ +import org.apache.hudi.common.model.HoodieRecord +import spark.implicits._ + +val tableName = "trips_table_index" +val basePath = "file:///tmp/hudi_indexed_table" + +val columns = Seq("ts","uuid","rider","driver","fare","city") +val data = + Seq((1695159649087L,"334e26e9-8355-45cc-97c6-c31daf0df330","rider-A","driver-K",19.10,"san_francisco"), + (1695091554788L,"e96c4396-3fad-413a-a942-4cb36106d721","rider-C","driver-M",27.70 ,"san_francisco"), + (1695046462179L,"9909a8b1-2d15-4d3d-8ec9-efc48c536a00","rider-D","driver-L",33.90 ,"san_francisco"), + (1695516137016L,"e3cf430c-889d-4015-bc98-59bdce1e530c","rider-F","driver-P",34.15,"sao_paulo" ), + (1695115999911L,"c8abbe79-8d89-47ea-b4ce-4d224bae5bfa","rider-J","driver-T",17.85,"chennai")); + +var inserts = 
spark.createDataFrame(data).toDF(columns:_*) +inserts.write.format("hudi"). + option("hoodie.datasource.write.partitionpath.field", "city"). + option("hoodie.table.name", tableName). + option("hoodie.write.record.merge.mode", "COMMIT_TIME_ORDERING"). + option(RECORDKEY_FIELD_OPT_KEY, "uuid"). + mode(Overwrite). + save(basePath) + +// Create record index and secondary index for the table +spark.sql(s"CREATE TABLE hudi_indexed_table USING hudi LOCATION '$basePath'") +// Create bloom filter expression index on driver column +spark.sql(s"CREATE INDEX idx_bloom_driver ON hudi_indexed_table USING bloom_filters(driver) OPTIONS(expr='identity')"); +// It would show bloom filter expression index +spark.sql(s"SHOW INDEXES FROM hudi_indexed_table"); +// Query on driver column would prune the data using the idx_bloom_driver index +spark.sql(s"SELECT uuid, rider FROM hudi_indexed_table WHERE driver = 'driver-S'"); + +// Create column stat expression index on ts column +spark.sql(s"CREATE INDEX idx_column_ts ON hudi_indexed_table USING column_stats(ts) OPTIONS(expr='from_unixtime', format = 'yyyy-MM-dd')"); +// Shows both expression indexes +spark.sql(s"SHOW INDEXES FROM hudi_indexed_table"); +// Query on ts column would prune the data using the idx_column_ts index +spark.sql(s"SELECT * FROM hudi_indexed_table WHERE from_unixtime(ts, 'yyyy-MM-dd') = '2023-09-24'"); + +// To create secondary index, first create the record index +spark.sql(s"SET hoodie.metadata.record.index.enable=true"); +spark.sql(s"CREATE INDEX record_index ON hudi_indexed_table (uuid)"); +// Create secondary index on rider column +spark.sql(s"CREATE INDEX idx_rider ON hudi_indexed_table (rider)"); + +// Expression index and secondary index should show up +spark.sql(s"SHOW INDEXES FROM hudi_indexed_table"); +// Query on rider column would leverage the secondary index idx_rider +spark.sql(s"SELECT * FROM hudi_indexed_table WHERE rider = 'rider-E'"); + +// Update a record and query the table based on indexed 
columns +spark.sql(s"UPDATE hudi_indexed_table SET rider = 'rider-B', driver = 'driver-N', ts = '1697516137' WHERE rider = 'rider-A'"); +// Data skipping would be performed using column stat expression index +spark.sql(s"SELECT uuid, rider FROM hudi_indexed_table WHERE from_unixtime(ts, 'yyyy-MM-dd') = '2023-10-17'"); +// Data skipping would be performed using bloom filter expression index +spark.sql(s"SELECT * FROM hudi_indexed_table WHERE driver = 'driver-N'"); +// Data skipping would be performed using secondary index +spark.sql(s"SELECT * FROM hudi_indexed_table WHERE rider = 'rider-B'"); + +// Drop all the indexes +spark.sql(s"DROP INDEX secondary_index_idx_rider on hudi_indexed_table"); +spark.sql(s"DROP INDEX record_index on hudi_indexed_table"); +spark.sql(s"DROP INDEX expr_index_idx_bloom_driver on hudi_indexed_table"); +spark.sql(s"DROP INDEX expr_index_idx_column_ts on hudi_indexed_table"); +// No indexes should show up for the table +spark.sql(s"SHOW INDEXES FROM hudi_indexed_table"); + +spark.sql(s"SET hoodie.metadata.record.index.enable=false"); +``` + + + + + +```sql +-- Create a table with primary key +CREATE TABLE hudi_indexed_table ( + ts BIGINT, + uuid STRING, + rider STRING, + driver STRING, + fare DOUBLE, + city STRING +) USING HUDI +options( + primaryKey ='uuid', + hoodie.write.record.merge.mode = "COMMIT_TIME_ORDERING" +) +PARTITIONED BY (city); + +INSERT INTO hudi_indexed_table +VALUES +(1695159649,'334e26e9-8355-45cc-97c6-c31daf0df330','rider-A','driver-K',19.10,'san_francisco'), +(1695091554,'e96c4396-3fad-413a-a942-4cb36106d721','rider-C','driver-M',27.70 ,'san_francisco'), +(1695046462,'9909a8b1-2d15-4d3d-8ec9-efc48c536a00','rider-D','driver-L',33.90 ,'san_francisco'), +(1695332066,'1dced545-862b-4ceb-8b43-d2a568f6616b','rider-E','driver-O',93.50,'san_francisco'), +(1695516137,'e3cf430c-889d-4015-bc98-59bdce1e530c','rider-F','driver-P',34.15,'sao_paulo' ), +(1695376420,'7a84095f-737f-40bc-b62f-6b69664712d2','rider-G','driver-Q',43.40 
,'sao_paulo' ), +(1695173887,'3eeb61f7-c2b0-4636-99bd-5d7a5a1d2c04','rider-I','driver-S',41.06 ,'chennai' ), +(1695115999,'c8abbe79-8d89-47ea-b4ce-4d224bae5bfa','rider-J','driver-T',17.85,'chennai'); + +-- Create bloom filter expression index on driver column +CREATE INDEX idx_bloom_driver ON hudi_indexed_table USING bloom_filters(driver) OPTIONS(expr='identity'); +-- It would show bloom filter expression index +SHOW INDEXES FROM hudi_indexed_table; +-- Query on driver column would prune the data using the idx_bloom_driver index +SELECT uuid, rider FROM hudi_indexed_table WHERE driver = 'driver-S'; + +-- Create column stat expression index on ts column +CREATE INDEX idx_column_ts ON hudi_indexed_table USING column_stats(ts) OPTIONS(expr='from_unixtime', format = 'yyyy-MM-dd'); +-- Shows both expression indexes +SHOW INDEXES FROM hudi_indexed_table; +-- Query on ts column would prune the data using the idx_column_ts index +SELECT * FROM hudi_indexed_table WHERE from_unixtime(ts, 'yyyy-MM-dd') = '2023-09-24'; + +-- To create secondary index, first create the record index +SET hoodie.metadata.record.index.enable=true; +CREATE INDEX record_index ON hudi_indexed_table (uuid); +-- Create secondary index on rider column +CREATE INDEX idx_rider ON hudi_indexed_table (rider); + +-- Expression index and secondary index should show up +SHOW INDEXES FROM hudi_indexed_table; +-- Query on rider column would leverage the secondary index idx_rider +SELECT * FROM hudi_indexed_table WHERE rider = 'rider-E'; + +-- Update a record and query the table based on indexed columns +UPDATE hudi_indexed_table SET rider = 'rider-B', driver = 'driver-N', ts = '1697516137' WHERE rider = 'rider-A'; +-- Data skipping would be performed using column stat expression index +SELECT uuid, rider FROM hudi_indexed_table WHERE from_unixtime(ts, 'yyyy-MM-dd') = '2023-10-17'; +-- Data skipping would be performed using bloom filter expression index +SELECT * FROM hudi_indexed_table WHERE driver = 'driver-N'; 
+-- Data skipping would be performed using secondary index +SELECT * FROM hudi_indexed_table WHERE rider = 'rider-B'; + +-- Drop all the indexes +DROP INDEX record_index on hudi_indexed_table; +DROP INDEX secondary_index_idx_rider on hudi_indexed_table; +DROP INDEX expr_index_idx_bloom_driver on hudi_indexed_table; +DROP INDEX expr_index_idx_column_ts on hudi_indexed_table; +-- No indexes should show up for the table +SHOW INDEXES FROM hudi_indexed_table; + +SET hoodie.metadata.record.index.enable=false; +``` + + + + +## Time Travel Query {#timetravel} + +Hudi supports time travel query to query the table as of a point-in-time in history. Three timestamp formats are supported as illustrated below. + + + + + +```scala +spark.read.format("hudi"). + option("as.of.instant", "20210728141108100"). + load(basePath) + +spark.read.format("hudi"). + option("as.of.instant", "2021-07-28 14:11:08.200"). + load(basePath) + +// It is equal to "as.of.instant = 2021-07-28 00:00:00" +spark.read.format("hudi"). + option("as.of.instant", "2021-07-28"). + load(basePath) + +``` + + + + +```python +# pyspark +spark.read.format("hudi"). \ + option("as.of.instant", "20210728141108100"). \ + load(basePath) + +spark.read.format("hudi"). \ + option("as.of.instant", "2021-07-28 14:11:08.000"). \ + load(basePath) + +# It is equal to "as.of.instant = 2021-07-28 00:00:00" +spark.read.format("hudi"). \ + option("as.of.instant", "2021-07-28"). 
\ + load(basePath) +``` + + + + + + +```sql + +-- time travel based on commit time, for eg: `20220307091628793` +SELECT * FROM hudi_table TIMESTAMP AS OF '20220307091628793' WHERE id = 1; +-- time travel based on different timestamp formats +SELECT * FROM hudi_table TIMESTAMP AS OF '2022-03-07 09:16:28.100' WHERE id = 1; +SELECT * FROM hudi_table TIMESTAMP AS OF '2022-03-08' WHERE id = 1; +``` + + + + + + +## Incremental query {#incremental-query} +Hudi provides the unique capability to obtain a set of records that changed between a start and end commit time, providing you with the +"latest state" for each such record as of the end commit time. By default, Hudi tables are configured to support incremental queries, using +record level [metadata tracking](https://hudi.apache.org/blog/2023/05/19/hudi-metafields-demystified). + +Below, we fetch changes since a given begin time while the end time defaults to the latest commit on the table. Users can also specify an +end time using `END_INSTANTTIME.key()` option. + + + + + +```scala +// spark-shell +spark.read.format("hudi").load(basePath).createOrReplaceTempView("trips_table") + +val commits = spark.sql("SELECT DISTINCT(_hoodie_commit_time) AS commitTime FROM trips_table ORDER BY commitTime").map(k => k.getString(0)).take(50) +val beginTime = commits(commits.length - 2) // commit time we are interested in + +// incrementally query data +val tripsIncrementalDF = spark.read.format("hudi"). + option("hoodie.datasource.query.type", "incremental"). + option("hoodie.datasource.read.begin.instanttime", 0). 
+ load(basePath) +tripsIncrementalDF.createOrReplaceTempView("trips_incremental") + +spark.sql("SELECT `_hoodie_commit_time`, fare, rider, driver, uuid, ts FROM trips_incremental WHERE fare > 20.0").show() +``` + + + + +```python +# pyspark +# reload data +spark.read.format("hudi").load(basePath).createOrReplaceTempView("trips_table") + +commits = list(map(lambda row: row[0], spark.sql("SELECT DISTINCT(_hoodie_commit_time) AS commitTime FROM trips_table ORDER BY commitTime").limit(50).collect())) +beginTime = commits[len(commits) - 2] # commit time we are interested in + +# incrementally query data +incremental_read_options = { + 'hoodie.datasource.query.type': 'incremental', + 'hoodie.datasource.read.begin.instanttime': beginTime, +} + +tripsIncrementalDF = spark.read.format("hudi"). \ + options(**incremental_read_options). \ + load(basePath) +tripsIncrementalDF.createOrReplaceTempView("trips_incremental") + +spark.sql("SELECT `_hoodie_commit_time`, fare, rider, driver, uuid, ts FROM trips_incremental WHERE fare > 20.0").show() +``` + + + + + +```sql +-- syntax +hudi_table_changes(table or path, queryType, beginTime [, endTime]); +-- table or path: table identifier, example: db.tableName, tableName, +-- or path for of your table, example: path/to/hudiTable +-- in this case table does not need to exist in the metastore, +-- queryType: incremental query mode, example: latest_state, cdc +-- (for cdc query, first enable cdc for your table by setting cdc.enabled=true), +-- beginTime: instantTime to begin query from, example: earliest, 202305150000, +-- endTime: optional instantTime to end query at, example: 202305160000, + +-- incrementally query data by table name +-- start from earliest available commit, end at latest available commit. +SELECT * FROM hudi_table_changes('db.table', 'latest_state', 'earliest'); + +-- start from earliest, end at 202305160000. 
+SELECT * FROM hudi_table_changes('table', 'latest_state', 'earliest', '202305160000'); + +-- start from 202305150000, end at 202305160000. +SELECT * FROM hudi_table_changes('table', 'latest_state', '202305150000', '202305160000'); +``` + + + + + + +## Change Data Capture Query {#cdc-query} + +Hudi also exposes first-class support for Change Data Capture (CDC) queries. CDC queries are useful for applications that need to +obtain all the changes, along with before/after images of records, given a commit time range. + + + + + +```scala +// spark-shell +// Lets first insert data to a new table with cdc enabled. +val columns = Seq("ts","uuid","rider","driver","fare","city") +val data = + Seq((1695158649187L,"334e26e9-8355-45cc-97c6-c31daf0df330","rider-A","driver-K",19.10,"san_francisco"), + (1695091544288L,"e96c4396-3fad-413a-a942-4cb36106d721","rider-B","driver-L",27.70 ,"san_paulo"), + (1695046452379L,"9909a8b1-2d15-4d3d-8ec9-efc48c536a00","rider-C","driver-M",33.90 ,"san_francisco"), + (1695332056404L,"1dced545-862b-4ceb-8b43-d2a568f6616b","rider-D","driver-N",93.50,"chennai")); +var df = spark.createDataFrame(data).toDF(columns:_*) + +// Insert data +df.write.format("hudi"). + option("hoodie.datasource.write.partitionpath.field", "city"). + option("hoodie.table.cdc.enabled", "true"). + option("hoodie.table.name", tableName). + mode(Overwrite). + save(basePath) + +// Update fare for riders: rider-A and rider-B +val updatesDf = spark.read.format("hudi").load(basePath).filter($"rider" === "rider-A" || $"rider" === "rider-B").withColumn("fare", col("fare") * 10) + +updatesDf.write.format("hudi"). + option("hoodie.datasource.write.operation", "upsert"). + option("hoodie.datasource.write.partitionpath.field", "city"). + option("hoodie.table.cdc.enabled", "true"). + option("hoodie.table.name", tableName). + mode(Append). + save(basePath) + + +// Query CDC data +spark.read.option("hoodie.datasource.read.begin.instanttime", 0). 
+ option("hoodie.datasource.query.type", "incremental"). + option("hoodie.datasource.query.incremental.format", "cdc"). + format("hudi").load(basePath).show(false) +``` + + + + +```python +# pyspark +# Lets first insert data to a new table with cdc enabled. +columns = ["ts","uuid","rider","driver","fare","city"] +data =[(1695159649087,"334e26e9-8355-45cc-97c6-c31daf0df330","rider-A","driver-K",19.10,"san_francisco"), + (1695091554788,"e96c4396-3fad-413a-a942-4cb36106d721","rider-B","driver-L",27.70 ,"san_francisco"), + (1695046462179,"9909a8b1-2d15-4d3d-8ec9-efc48c536a00","rider-C","driver-M",33.90 ,"san_francisco"), + (1695516137016,"e3cf430c-889d-4015-bc98-59bdce1e530c","rider-C","driver-N",34.15,"sao_paulo")] + + +inserts = spark.createDataFrame(data).toDF(*columns) + +hudi_options = { + 'hoodie.table.name': tableName, + 'hoodie.datasource.write.partitionpath.field': 'city', + 'hoodie.table.cdc.enabled': 'true' +} +# Insert data +inserts.write.format("hudi"). \ + options(**hudi_options). \ + mode("overwrite"). \ + save(basePath) + + +# Update fare for riders: rider-A and rider-B +updatesDf = spark.read.format("hudi").load(basePath).filter("rider == 'rider-A' or rider == 'rider-B'").withColumn("fare",col("fare")*10) + +updatesDf.write.format("hudi"). \ + mode("append"). \ + save(basePath) + +# Query CDC data +cdc_read_options = { + 'hoodie.datasource.query.incremental.format': 'cdc', + 'hoodie.datasource.query.type': 'incremental', + 'hoodie.datasource.read.begin.instanttime': 0 +} +spark.read.format("hudi"). \ + options(**cdc_read_options). \ + load(basePath).show(10, False) +``` + + + + +```sql +-- incrementally query data by path +-- start from earliest available commit, end at latest available commit. +SELECT * FROM hudi_table_changes('path/to/table', 'cdc', 'earliest'); + +-- start from earliest, end at 202305160000. +SELECT * FROM hudi_table_changes('path/to/table', 'cdc', 'earliest', '202305160000'); + +-- start from 202305150000, end at 202305160000. 
+SELECT * FROM hudi_table_changes('path/to/table', 'cdc', '202305150000', '202305160000'); +``` + + + +:::info Key requirements +Note that CDC queries are currently only supported on Copy-on-Write tables. +::: + +## Table Types + +The examples thus far have showcased one of the two table types, that Hudi supports - Copy-on-Write (COW) tables. Hudi also supports +a more advanced write-optimized table type called Merge-on-Read (MOR) tables, that can balance read and write performance in a more +flexible manner. See [table types](/docs/table_types) for more details. + +Any of these examples can be run on a Merge-on-Read table by simply changing the table type to MOR, while creating the table, as below. + + + + + +```scala +// spark-shell +inserts.write.format("hudi"). + ... + option("hoodie.datasource.write.table.type", "MERGE_ON_READ"). + ... +``` + + + + +```python +# pyspark +hudi_options = { + ... + 'hoodie.datasource.write.table.type': 'MERGE_ON_READ' +} + +inserts.write.format("hudi"). \ +options(**hudi_options). \ +mode("overwrite"). \ +save(basePath) +``` + + + + +```sql +CREATE TABLE hudi_table ( + uuid STRING, + rider STRING, + driver STRING, + fare DOUBLE, + city STRING +) USING HUDI TBLPROPERTIES (type = 'mor') +PARTITIONED BY (city); +``` + + + + +## Keys + +Hudi also allows users to specify a record key, which will be used to uniquely identify a record within a Hudi table. This is useful and +critical to support features like indexing and clustering, which speed up upserts and queries respectively, in a consistent manner. Some of the other +benefits of keys are explained in detail [here](https://hudi.apache.org/blog/2023/05/19/hudi-metafields-demystified). To this end, Hudi supports a +wide range of built-in [key generators](https://hudi.apache.org/blog/2021/02/13/hudi-key-generators), that make it easy to generate record +keys for a given table. In the absence of a user configured key, Hudi will auto generate record keys, which are highly compressible. 
+ + + + + +```scala +// spark-shell +inserts.write.format("hudi"). +... +option("hoodie.datasource.write.recordkey.field", "uuid"). +... +``` + + + + + +```python +# pyspark +hudi_options = { + ... + 'hoodie.datasource.write.recordkey.field': 'uuid' +} + +inserts.write.format("hudi"). \ +options(**hudi_options). \ +mode("overwrite"). \ +save(basePath) +``` + + + + + + +```sql +CREATE TABLE hudi_table ( + ts BIGINT, + uuid STRING, + rider STRING, + driver STRING, + fare DOUBLE, + city STRING +) USING HUDI TBLPROPERTIES (primaryKey = 'uuid') +PARTITIONED BY (city); +``` + + + +:::note Implications of defining record keys +Configuring keys for a Hudi table has a few implications on the table. If record key is set by the user, `upsert` is chosen as the [write operation](write_operations). +Also if a record key is configured, then it's also advisable to specify a precombine or ordering field, to correctly handle cases where the source data has +multiple records with the same key. See section below. +::: + +## Merge Modes +Hudi also allows users to specify a _precombine_ field, which will be used to order and resolve conflicts between multiple versions of the same record. This is very important for +use-cases like applying database CDC logs to a Hudi table, where a given record may appear multiple times in the source data due to repeated upstream updates. +Hudi also uses this mechanism to support out-of-order data arrival into a table, where records may need to be resolved in a different order than their commit time. +For e.g. using a _created_at_ timestamp field as the precombine field will prevent older versions of a record from overwriting newer ones or being exposed to queries, even +if they are written at a later commit time to the table. This is one of the key features that makes Hudi best suited for dealing with streaming data. + +To enable different merge semantics, Hudi supports [merge modes](record_merger). 
Commit time and event time based merge modes are supported out of the box. +Users can also define their own custom merge strategies, see [here](sql_ddl#create-table-with-record-merge-mode). + + + + + +```scala +// spark-shell +updatesDf.write.format("hudi"). + ... + option("hoodie.datasource.write.precombine.field", "ts"). + ... +``` + + + + + +```python +# pyspark +hudi_options = { +... +'hoodie.datasource.write.precombine.field': 'ts' +} + +upsert.write.format("hudi"). + options(**hudi_options). + mode("append"). + save(basePath) +``` + + + + + + +```sql +CREATE TABLE hudi_table ( + ts BIGINT, + uuid STRING, + rider STRING, + driver STRING, + fare DOUBLE, + city STRING +) USING HUDI TBLPROPERTIES (preCombineField = 'ts') +PARTITIONED BY (city); +``` + + + + +## Where to go from here? +You can also [build hudi yourself](https://github.com/apache/hudi#building-apache-hudi-from-source) and try this quickstart using `--jars `(see also [build with scala 2.12](https://github.com/apache/hudi#build-with-different-spark-versions)) +for more info. If you are looking for ways to migrate your existing data to Hudi, refer to [migration guide](migration_guide). + +### Spark SQL Reference + +For advanced usage of spark SQL, please refer to [Spark SQL DDL](sql_ddl) and [Spark SQL DML](sql_dml) reference guides. +For alter table commands, check out [this](sql_ddl#spark-alter-table). Stored procedures provide a lot of powerful capabilities using Hudi SparkSQL to assist with monitoring, managing and operating Hudi tables, please check [this](procedures) out. + +### Streaming workloads + +Hudi provides industry-leading performance and functionality for streaming data. 
+ +**Hudi Streamer** - Hudi provides an incremental ingestion/ETL tool - [HoodieStreamer](/docs/hoodie_streaming_ingestion#hudi-streamer), to assist with ingesting data into Hudi +from various sources in a streaming manner, with powerful built-in capabilities like auto checkpointing, schema enforcement via schema provider, +transformation support, automatic table services and so on. + +**Structured Streaming** - Hudi supports Spark Structured Streaming reads and writes as well. Please see [here](writing_tables_streaming_writes#spark-streaming) for more. + +Check out more information on [modeling data in Hudi](faq_general#how-do-i-model-the-data-stored-in-hudi) and different ways to perform [batch writes](/docs/writing_data) and [streaming writes](writing_tables_streaming_writes). + +### Dockerized Demo +Even as we showcased the core capabilities, Hudi supports a lot more advanced functionality that can make it easy +to get your transactional data lakes up and running quickly, across a variety of query engines like Hive, Flink, Spark, Presto, Trino and much more. +We have put together a [demo video](https://www.youtube.com/watch?v=VhNgUsxdrD0) that showcases all of this on a docker based setup with all +dependent systems running locally. We recommend you replicate the same setup and run the demo yourself, by following +steps [here](/docs/next/docker_demo) to get a taste for it. + diff --git a/website/versioned_docs/version-1.0.0/reading_tables_batch_reads.md b/website/versioned_docs/version-1.0.0/reading_tables_batch_reads.md new file mode 100644 index 0000000000000..d247fd4c3d082 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/reading_tables_batch_reads.md @@ -0,0 +1,35 @@ +--- +title: Batch Reads +keywords: [hudi, spark, flink, batch, processing] +--- +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +## Spark DataSource API + +The `hudi-spark` module offers the DataSource API to read a Hudi table into a Spark DataFrame. 
+ +A time-travel query example: + +```Scala +val tripsDF = spark.read. + option("as.of.instant", "2021-07-28 14:11:08.000"). + format("hudi"). + load(basePath) +tripsDF.where(tripsDF.fare > 20.0).show() +``` + +## Daft + +[Daft](https://www.getdaft.io/) supports reading Hudi tables using `daft.read_hudi()` function. + +```Python +# Read Apache Hudi table into a Daft DataFrame. +import daft + +df = daft.read_hudi("some-table-uri") +df = df.where(df["foo"] > 5) +df.show() +``` + +Check out the Daft docs for [Hudi integration](https://www.getdaft.io/projects/docs/en/latest/user_guide/integrations/hudi.html). diff --git a/website/versioned_docs/version-1.0.0/reading_tables_streaming_reads.md b/website/versioned_docs/version-1.0.0/reading_tables_streaming_reads.md new file mode 100644 index 0000000000000..5e73524e14d4a --- /dev/null +++ b/website/versioned_docs/version-1.0.0/reading_tables_streaming_reads.md @@ -0,0 +1,99 @@ +--- +title: Streaming Reads +keywords: [hudi, spark, flink, streaming, processing] +--- +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +## Spark Streaming + +Structured Streaming reads are based on Hudi's Incremental Query feature, therefore streaming read can return data for which +commits and base files were not yet removed by the cleaner. You can control commits retention time. + + + + + +```scala +// spark-shell +// reload data +df.write.format("hudi"). + options(getQuickstartWriteConfigs). + option("hoodie.datasource.write.precombine.field", "ts"). + option("hoodie.datasource.write.recordkey.field", "uuid"). + option("hoodie.datasource.write.partitionpath.field", "partitionpath"). + option("hoodie.table.name", tableName). + mode(Overwrite). + save(basePath) + +// read stream and output results to console +spark.readStream. + format("hudi"). + load(basePath). + writeStream. + format("console"). + start() + +// read stream to streaming df +val df = spark.readStream. + format("hudi"). 
+ load(basePath) + +``` + + + + +```python +# pyspark +# reload data +inserts = sc._jvm.org.apache.hudi.QuickstartUtils.convertToStringList( + dataGen.generateInserts(10)) +df = spark.read.json(spark.sparkContext.parallelize(inserts, 2)) + +hudi_options = { + 'hoodie.table.name': tableName, + 'hoodie.datasource.write.recordkey.field': 'uuid', + 'hoodie.datasource.write.partitionpath.field': 'partitionpath', + 'hoodie.datasource.write.table.name': tableName, + 'hoodie.datasource.write.operation': 'upsert', + 'hoodie.datasource.write.precombine.field': 'ts', + 'hoodie.upsert.shuffle.parallelism': 2, + 'hoodie.insert.shuffle.parallelism': 2 +} + +df.write.format("hudi"). \ + options(**hudi_options). \ + mode("overwrite"). \ + save(basePath) + +# read stream to streaming df +df = spark.readStream \ + .format("hudi") \ + .load(basePath) + +# read stream and output results to console +spark.readStream \ + .format("hudi") \ + .load(basePath) \ + .writeStream \ + .format("console") \ + .start() + +``` + + + + +:::info +Spark SQL can be used within ForeachBatch sink to do INSERT, UPDATE, DELETE and MERGE INTO. +Target table must exist before write. +::: diff --git a/website/versioned_docs/version-1.0.0/record_merger.md b/website/versioned_docs/version-1.0.0/record_merger.md new file mode 100644 index 0000000000000..378c5575ad19c --- /dev/null +++ b/website/versioned_docs/version-1.0.0/record_merger.md @@ -0,0 +1,253 @@ +--- +title: Record Mergers +keywords: [hudi, merge, upsert, precombine] +toc: true +toc_min_heading_level: 2 +toc_max_heading_level: 4 +--- + +Hudi handles mutations to records and streaming data, as we briefly touched upon in [timeline ordering](timeline#ordering-of-actions) section. +To provide users full-fledged support for stream processing, Hudi goes all the way making the storage engine and the underlying storage format +understand how to merge changes to the same record key, that may arrive even in different order at different times. 
With the rise of mobile applications + and IoT, these scenarios have become the norm rather than the exception. For e.g. a social networking application uploading user events several hours after they happened, +when the user connects to WiFi networks. + +To achieve this, Hudi supports merge modes, which define how the base and log files are ordered in a file slice and further how different records with +the same record key within that file slice are merged consistently to produce the same deterministic results for snapshot queries, writers and table services. Specifically, +there are three merge modes supported as a table-level configuration, invoked in the following places. + + * **(writing)** Combining multiple change records for the same record key while reading input data during writes. This is an optional optimization that + reduces the number of records written to log files to improve query and write performance subsequently. + + * **(writing)** Merging final change record (partial/full update/delete) against existing record in storage for CoW tables. + + * **(compaction)** Compaction service merges all change records in log files against base files, respecting the merge mode. + + * **(query)** Merging change records in log files, after filtering/projections against base file for MoR table queries. + +Note that the merge mode should not be altered once the table is created to avoid inconsistent behavior due to compaction producing +different merge results when switching between the modes. + +### COMMIT_TIME_ORDERING + +Here, we expect the input records to arrive in strict order such that arrival order is the same as their +delta commit order on the table. Merging simply picks the record belonging to the latest write as the merged result. In relational data model speak, +this provides overwrite semantics aligned with serializable writes on the timeline. + +
+ upsert_path.png +
+ +In the example above, the writer process consumes a database change log, expected to be in strict order of a logical sequence number (lsn) +that denotes the ordering of the writes in the upstream database. + +### EVENT_TIME_ORDERING + +This is the default merge mode. While commit time ordering provides a well-understood standard behavior, it's hardly sufficient. The commit time is unrelated to the actual +ordering of data that a user may care about and strict ordering of input in complex distributed systems is difficult to achieve. +With event time ordering, the merging picks the record with the highest value on a user specified _**ordering or precombine field**_ as the merged result. + +
+ upsert_path.png +
+ +In the example above, two microservices produce change records about orders at different times, that can arrive out-of-order. As color coded, +this can lead to application-level inconsistent states in the table if simply merged in commit time order like a cancelled order being re-created or +a paid order moved back to just created state expecting payment again. Event time ordering helps by ignoring older state changes that arrive late and +preventing order status from "jumping back" in time. Combined with [non-blocking concurrency control](concurrency_control#non-blocking-concurrency-control-mode), +this provides a very powerful way for processing such data streams efficiently and correctly. + +### CUSTOM + +In some cases, even more control and customization may be needed. Extending the same example above, the two microservices could be updating two different +sets of columns "order_info" and "payment_info", along with order state. The merge logic is then expected to not only resolve the correct status, but merge +order_info from the record in created state, into the record in cancelled state that already has payment_info fields populated with reasons payment failed. +Such reconciliation provides a simple denormalized data model for downstream consumption where queries (for e.g. fraud detection) can simply filter fields +across order_info and payment_info without costly self-join on each access. + +Hudi allows authoring of cross-language custom record mergers on top of a standard record merger API, that supports full and partial merges. The java APIs +are sketched below at a high-level. It simply takes older/newer records in engine native formats and produces a merged record or returns empty to skip them entirely (e.g. soft deletes). 
+Record merger is configured using a `hoodie.write.record.merge.strategy.id` write config whose value is a UUID, that is taken by the writer to persist in the table config, and is expected to be returned by `getMergingStrategy()` +method below. Using this mechanism, Hudi can automatically deduce the record merger to use for the table across different language/engine runtimes. + +```Java +interface HoodieRecordMerger { + + Option<Pair<HoodieRecord, Schema>> merge(HoodieRecord older, Schema oldSchema, + HoodieRecord newer, Schema newSchema, + TypedProperties props) { + ... + } + + Option<Pair<HoodieRecord, Schema>> partialMerge(HoodieRecord older, Schema oldSchema, + HoodieRecord newer, Schema newSchema, + Schema readerSchema, TypedProperties props) { + ... + } + + HoodieRecordType getRecordType() {...} + + String getMergingStrategy() {...} +} +``` + +### Record Merge Configs + +The record merge mode and optional record merge strategy ID and custom merge implementation classes can be specified using the below configs. + +| Config Name | Default | Description | +| ---------------------------------------| ---------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| hoodie.write.record.merge.mode | EVENT_TIME_ORDERING | Determines the logic of merging different records with the same record key. Valid values: (1) `COMMIT_TIME_ORDERING`: use commit time to merge records, i.e., the record from later commit overwrites the earlier record with the same key. (2) `EVENT_TIME_ORDERING` (default): use event time as the ordering to merge records, i.e., the record with the larger event time overwrites the record with the smaller event time on the same key, regardless of commit time. The event time or preCombine field needs to be specified by the user. 
(3) `CUSTOM`: use custom merging logic specified by the user.
`Config Param: RECORD_MERGE_MODE`
`Since Version: 1.0.0` | +| hoodie.write.record.merge.strategy.id | N/A (Optional) | ID of record merge strategy. When you specify this config, you also need to specify `hoodie.write.record.merge.custom.implementation.classes`. Hudi picks the `HoodieRecordMerger` implementation class from the list of classes in `hoodie.write.record.merge.custom.implementation.classes` that has the specified merge strategy ID.
`Config Param: RECORD_MERGE_STRATEGY_ID`
`Since Version: 0.13.0` | +| hoodie.write.record.merge.custom.implementation.classes | N/A (Optional) | List of `HoodieRecordMerger` implementations constituting Hudi's merging strategy based on the engine used. Hudi picks the `HoodieRecordMerger` implementation class from this list based on the specified `hoodie.write.record.merge.strategy.id`.
`Config Param: RECORD_MERGE_IMPL_CLASSES`
`Since Version: 0.13.0` | + + +### Record Payloads + +:::caution +Going forward, we recommend users to migrate and use the record merger APIs and not write new payload implementations. +::: +Record payload is an older abstraction/API for achieving similar record-level merge capabilities. While record payloads were very useful and popular, +they had drawbacks like lower performance due to conversion of engine native record formats to Apache Avro for merging and lack of cross-language support. +As we shall see below, Hudi provides out-of-box support for different payloads for different use cases. Hudi implements fallback from +record merger APIs to payload APIs internally, to provide backwards compatibility for existing payload implementations. + +#### OverwriteWithLatestAvroPayload +```scala +hoodie.datasource.write.payload.class=org.apache.hudi.common.model.OverwriteWithLatestAvroPayload +``` + +This is the default record payload implementation. It picks the record with the greatest value (determined by calling +`.compareTo()` on the value of precombine key) to break ties and simply picks the latest record while merging. This gives +latest-write-wins style semantics. + +#### DefaultHoodieRecordPayload +```scala +hoodie.datasource.write.payload.class=org.apache.hudi.common.model.DefaultHoodieRecordPayload +``` +While `OverwriteWithLatestAvroPayload` precombines based on an ordering field and picks the latest record while merging, +`DefaultHoodieRecordPayload` honors the ordering field for both precombining and merging. 
Let's understand the difference with an example: + +Let's say the ordering field is `ts` and record key is `id` and schema is: + +``` +{ + [ + {"name":"id","type":"string"}, + {"name":"ts","type":"long"}, + {"name":"name","type":"string"}, + {"name":"price","type":"string"} + ] +} +``` + +Current record in storage: + +``` + id ts name price + 1 2 name_2 price_2 +``` + +Incoming record: + +``` + id ts name price + 1 1 name_1 price_1 +``` + +Result data after merging using `OverwriteWithLatestAvroPayload` (latest-write-wins): + +``` + id ts name price + 1 1 name_1 price_1 +``` + +Result data after merging using `DefaultHoodieRecordPayload` (always honors ordering field): + +``` + id ts name price + 1 2 name_2 price_2 +``` + +#### EventTimeAvroPayload +```scala +hoodie.datasource.write.payload.class=org.apache.hudi.common.model.EventTimeAvroPayload +``` +This is the default record payload for Flink based writing. Some use cases require merging records by event time and +thus event time plays the role of an ordering field. This payload is particularly useful in the case of late-arriving data. +For such use cases, users need to set the [payload event time field](/docs/configurations#RECORD_PAYLOAD) configuration. + +#### OverwriteNonDefaultsWithLatestAvroPayload +```scala +hoodie.datasource.write.payload.class=org.apache.hudi.common.model.OverwriteNonDefaultsWithLatestAvroPayload +``` +This payload is quite similar to `OverwriteWithLatestAvroPayload` with slight difference while merging records. For +precombining, just like `OverwriteWithLatestAvroPayload`, it picks the latest record for a key, based on an ordering +field. While merging, it overwrites the existing record on storage only for the specified **fields that don't equal +default value** for that field. + +#### PartialUpdateAvroPayload +```scala +hoodie.datasource.write.payload.class=org.apache.hudi.common.model.PartialUpdateAvroPayload +``` +This payload supports partial update. 
Typically, once the merge step resolves which record to pick, then the record on +storage is fully replaced by the resolved record. But, in some cases, the requirement is to update only certain fields +and not replace the whole record. This is called partial update. `PartialUpdateAvroPayload` provides out-of-box support +for such use cases. To illustrate the point, let us look at a simple example: + +Let's say the ordering field is `ts` and record key is `id` and schema is: + +``` +{ + [ + {"name":"id","type":"string"}, + {"name":"ts","type":"long"}, + {"name":"name","type":"string"}, + {"name":"price","type":"string"} + ] +} +``` + +Current record in storage: + +``` + id ts name price + 1 2 name_1 null +``` + +Incoming record: + +``` + id ts name price + 1 1 null price_1 +``` + +Result data after merging using `PartialUpdateAvroPayload`: + +``` + id ts name price + 1 2 name_1 price_1 +``` + +#### Configs + +Payload class can be specified using the below configs. For more advanced configs refer [here](https://hudi.apache.org/docs/configurations#RECORD_PAYLOAD) + +**Spark based configs:** + +| Config Name | Default | Description | +| ---------------------------------------| ---------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| hoodie.datasource.write.payload.class | org.apache.hudi.common.model.OverwriteWithLatestAvroPayload (Optional) | Payload class used. Override this, if you like to roll your own merge logic, when upserting/inserting. This will render any value set for PRECOMBINE_FIELD_OPT_VAL in-effective

`Config Param: WRITE_PAYLOAD_CLASS_NAME` | + +**Flink based configs:** + +| Config Name | Default | Description | +| ---------------------------------------| ---------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| payload.class | org.apache.hudi.common.model.EventTimeAvroPayload (Optional) | Payload class used. Override this, if you like to roll your own merge logic, when upserting/inserting. This will render any value set for the option in-effective

`Config Param: PAYLOAD_CLASS_NAME` | + + +There are also quite a few other implementations. Developers may be interested in looking at the hierarchy of `HoodieRecordPayload` interface. For +example, [`MySqlDebeziumAvroPayload`](https://github.com/apache/hudi/blob/e76dd102bcaf8aec5a932e7277ccdbfd73ce1a32/hudi-common/src/main/java/org/apache/hudi/common/model/debezium/MySqlDebeziumAvroPayload.java) and [`PostgresDebeziumAvroPayload`](https://github.com/apache/hudi/blob/e76dd102bcaf8aec5a932e7277ccdbfd73ce1a32/hudi-common/src/main/java/org/apache/hudi/common/model/debezium/PostgresDebeziumAvroPayload.java) provides support for seamlessly applying changes +captured via Debezium for MySQL and PostgresDB. [`AWSDmsAvroPayload`](https://github.com/apache/hudi/blob/e76dd102bcaf8aec5a932e7277ccdbfd73ce1a32/hudi-common/src/main/java/org/apache/hudi/common/model/AWSDmsAvroPayload.java) provides support for applying changes captured via Amazon Database Migration Service onto S3. +For full configurations, go [here](/docs/configurations#RECORD_PAYLOAD) and please check out [this FAQ](faq_writing_tables/#can-i-implement-my-own-logic-for-how-input-records-are-merged-with-record-on-storage) if you want to implement your own custom payloads. + diff --git a/website/versioned_docs/version-1.0.0/rollbacks.md b/website/versioned_docs/version-1.0.0/rollbacks.md new file mode 100644 index 0000000000000..794f27c6d6863 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/rollbacks.md @@ -0,0 +1,72 @@ +--- +title: Auto Rollbacks +toc: true +toc_min_heading_level: 2 +toc_max_heading_level: 4 +--- + +Your pipelines could fail due to numerous reasons like crashes, valid bugs in the code, unavailability of any external +third-party system (like a lock provider), or user could kill the job midway to change some properties. A well-designed +system should detect such partially failed commits, ensure dirty data is not exposed to the queries, and clean them up. 
+Hudi's rollback mechanism takes care of cleaning up such failed writes. + +Hudi’s timeline forms the core for reader and writer isolation. If a commit has not transitioned to complete as per the +hudi timeline, the readers will ignore the data from the respective write. And so partially failed writes are never read +by any readers (for all query types). But the curious question is, how is the partially written data eventually deleted? +Does it require a manual command to be executed from time to time or should it be automatically handled by the system? This +page presents insights on how "rollback" in Hudi can automatically clean up partially failed writes without +manual input from users. + +### Handling partially failed commits +Hudi has a lot of platformization built in so as to ease the operationalization of [lakehouse](https://hudi.apache.org/blog/2024/07/11/what-is-a-data-lakehouse/) tables. One such feature +is the automatic cleanup of partially failed commits. Users don’t need to run any additional commands to clean up dirty +data or the data produced by failed commits. If you continue to write to hudi tables, one of your future commits will +take care of cleaning up older data that failed midway during a write/commit. We call this cleanup of a failed commit a +"rollback". A rollback will revert everything about a commit, including deleting data and removal from the timeline. +Additionally, the restore operation utilizes a series of rollbacks to undo completed commits. + +Let’s zoom in a bit and understand how such cleanups happen and the challenges involved in such cleaning when +multi-writers are involved. + +### Rolling back partially failed commits for a single writer +In case of single writer model, the rollback logic is fairly straightforward. Every action in Hudi's timeline goes +through 3 states, namely requested, inflight and completed. 
Whenever a new commit starts, hudi checks the timeline +for any actions/commits that are not yet committed, as these refer to partially failed commits. So, a rollback is immediately +triggered and all dirty data is cleaned up, followed by cleaning up the commit instants from the timeline. + + +![An example illustration of single writer rollbacks](/assets/images/blog/rollbacks/Rollback_1.png) +_Figure 1: single writer with eager rollbacks_ + + +### Rolling back of partially failed commits w/ multi-writers +The challenging part is when multi-writers are involved. Just because a commit is still non-completed as per the +timeline, it does not mean current writer (new) can assume that it's a partially failed commit. Because, there could be +a concurrent writer that’s currently making progress. Hudi has been designed to not have any centralized server +running always and in such a case Hudi has an ingenious way to deduce such partially failed writes. + +#### Heartbeats +We are leveraging heartbeats to our rescue here. Each commit will keep emitting heartbeats from the start of the +write until its completion. During rollback deduction, Hudi checks for heartbeat timeouts for all ongoing or incomplete +commits and detects partially failed commits on such timeouts. For any ongoing commits, the heartbeat should not +have elapsed the timeout. For example, if a commit’s heartbeat is not updated for 10+ mins, we can safely assume the +original writer has failed/crashed and the incomplete commit is safe to clean up. So, the rollbacks in case of +multi-writers are lazy and not eager as we saw with single writer model. But it is still automatic and users don’t +need to execute any explicit command to trigger such cleanup of failed writes. When such lazy rollback kicks in, both +data files and timeline files for the failed writes are deleted. + +Hudi employs a simple yet effective heartbeat mechanism to notify that a commit is still making progress. 
A heartbeat +file is created for every commit under “.hoodie/.heartbeat/” (for eg, “.hoodie/.heartbeat/20230819183853177”). +The writer will start a background thread which will keep updating this heartbeat file at a regular cadence to refresh +the last modification time of the file. So, checking for last modification time of the heartbeat file gives us +information whether the writer that started the commit of interest is still making progress or not. On completion of +the commit, the heartbeat file is deleted. Or if the write failed midway, the last modification time of the heartbeat +file is no longer updated, so other writers can deduce the failed write after a period of time elapses. + +![An example illustration of multi writer rollbacks](/assets/images/blog/rollbacks/rollback2_new.png) +_Figure 2: multi-writer with lazy cleaning of failed commits_ + +## Related Resources +

Videos

+ +* [How to Rollback to Previous Checkpoint during Disaster in Apache Hudi using Glue 4.0 Demo](https://www.youtube.com/watch?v=Vi25q4vzogs) \ No newline at end of file diff --git a/website/versioned_docs/version-1.0.0/s3_hoodie.md b/website/versioned_docs/version-1.0.0/s3_hoodie.md new file mode 100644 index 0000000000000..37f79ae753425 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/s3_hoodie.md @@ -0,0 +1,94 @@ +--- +title: AWS S3 +keywords: [ hudi, hive, aws, s3, spark, presto] +summary: In this page, we go over how to configure Hudi with S3 filesystem. +last_modified_at: 2019-12-30T15:59:57-04:00 +--- +In this page, we explain how to get your Hudi spark job to store into AWS S3. + +## AWS configs + +There are two configurations required for Hudi-S3 compatibility: + +- Adding AWS Credentials for Hudi +- Adding required Jars to classpath + +### AWS Credentials + +The simplest way to use Hudi with S3, is to configure your `SparkSession` or `SparkContext` with S3 credentials. Hudi will automatically pick this up and talk to S3. + +Alternatively, add the required configs in your core-site.xml from where Hudi can fetch them. Replace the `fs.defaultFS` with your S3 bucket name and Hudi should be able to read/write from the bucket. + +```xml + + fs.defaultFS + s3://ysharma + + + + fs.s3.awsAccessKeyId + AWS_KEY + + + + fs.s3.awsSecretAccessKey + AWS_SECRET + + + + fs.s3a.awsAccessKeyId + AWS_KEY + + + + fs.s3a.awsSecretAccessKey + AWS_SECRET + + + + fs.s3a.endpoint + http://IP-Address:Port + + + + fs.s3a.path.style.access + true + + + + fs.s3a.signing-algorithm + S3SignerType + +``` + + +Utilities such as hudi-cli or Hudi Streamer tool, can pick up s3 creds via environmental variable prefixed with `HOODIE_ENV_`. 
For example, below is a bash snippet to set up +such variables and then have cli be able to work on datasets stored in s3 + +```bash +export HOODIE_ENV_fs_DOT_s3a_DOT_access_DOT_key=$accessKey +export HOODIE_ENV_fs_DOT_s3a_DOT_secret_DOT_key=$secretKey +export HOODIE_ENV_fs_DOT_s3_DOT_awsAccessKeyId=$accessKey +export HOODIE_ENV_fs_DOT_s3_DOT_awsSecretAccessKey=$secretKey +``` + + + +### AWS Libs + +AWS hadoop libraries to add to our classpath + + - com.amazonaws:aws-java-sdk:1.10.34 + - org.apache.hadoop:hadoop-aws:2.7.3 + +AWS glue data libraries are needed if AWS glue data is used + + - com.amazonaws.glue:aws-glue-datacatalog-hive2-client:1.11.0 + - com.amazonaws:aws-java-sdk-glue:1.11.475 + +## AWS S3 Versioned Bucket + +With versioned buckets any object deleted creates a [Delete Marker](https://docs.aws.amazon.com/AmazonS3/latest/userguide/DeleteMarker.html), as Hudi cleans up files using [Cleaner utility](https://hudi.apache.org/docs/hoodie_cleaner) the number of Delete Markers increases over time. +It is important to configure the [Lifecycle Rule](https://docs.aws.amazon.com/AmazonS3/latest/userguide/object-lifecycle-mgmt.html) correctly +to clean up these delete markers as the List operation can choke if the number of delete markers reaches 1000. +We recommend cleaning up Delete Markers after 1 day in Lifecycle Rule. \ No newline at end of file diff --git a/website/versioned_docs/version-1.0.0/schema_evolution.md b/website/versioned_docs/version-1.0.0/schema_evolution.md new file mode 100755 index 0000000000000..1638d6ad1c6fb --- /dev/null +++ b/website/versioned_docs/version-1.0.0/schema_evolution.md @@ -0,0 +1,319 @@ +--- +title: Schema Evolution +keywords: [hudi, incremental, batch, stream, processing, schema, evolution] +summary: In this page, we will discuss schema evolution support in Hudi. 
+toc: true +last_modified_at: 2022-04-27T15:59:57-04:00 +--- + +Schema evolution is an essential aspect of data management, and Hudi supports schema evolution on write out-of-the-box, +and experimental support for schema evolution on read. This page will discuss the schema evolution support in Hudi. + +## Schema Evolution on Write +Hudi supports backwards-compatible schema evolution scenarios out of the box, such as adding a nullable field or promoting a field's datatype. + +:::info +We recommend employing this approach as much as possible. This is a practical and efficient way to evolve schemas, proven at large-scale +data lakes at companies like Uber, Walmart, and LinkedIn. It is also implemented at scale by vendors like Confluent for streaming data. +Given the continuous nature of streaming data, there are no boundaries to define a schema change that can be incompatible with +the previous schema (e.g., renaming a column). +::: + +Furthermore, the evolved schema is queryable across high-performance engines like Presto and Spark SQL without additional overhead for column ID translations or +type reconciliations. The following table summarizes the schema changes compatible with different Hudi table types. + +The incoming schema will automatically have missing columns added with null values from the table schema. +For this we need to enable the following config +`hoodie.write.set.null.for.missing.columns`, otherwise the pipeline will fail. 
+ +| Schema Change | COW | MOR | Remarks | +|:----------------------------------------------------------------|:----|:----|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| Add a new nullable column at root level at the end | Yes | Yes | `Yes` means that a write with evolved schema succeeds and a read following the write succeeds to read entire dataset. | +| Add a new nullable column to inner struct (at the end) | Yes | Yes | | +| Add a new complex type field with default (map and array) | Yes | Yes | | +| Add a new nullable column and change the ordering of fields | No | No | Write succeeds but read fails if the write with evolved schema updated only some of the base files but not all. Currently, Hudi does not maintain a schema registry with history of changes across base files. Nevertheless, if the upsert touched all base files then the read will succeed. | +| Add a custom nullable Hudi meta column, e.g. `_hoodie_meta_col` | Yes | Yes | | +| Promote datatype for a field at root level | Yes | Yes | | +| Promote datatype for a nested field | Yes | Yes | | +| Promote datatype for a complex type (value of map or array) | Yes | Yes | | +| Add a new non-nullable column at root level at the end | No | No | In case of MOR table with Spark data source, write succeeds but read fails. As a **workaround**, you can make the field nullable. 
| +| Add a new non-nullable column to inner struct (at the end) | No | No | | +| Demote datatype for a field at root level | No | No | | +| Demote datatype for a nested field | No | No | | +| Demote datatype for a complex type (value of map or array) | No | No | | + +### Type Promotions + +This chart shows what the table schema will be when an incoming column type has changed (X means that it is not allowed): + +| Incoming Schema ↓ \ Table Schema → | int | long | float | double | string | bytes | +|------------------------------------------------|--------|--------|--------|--------|--------|-------| +| int | int | long | float | double | string | X | +| long | long | long | float | double | string | X | +| float | float | float | float | double | string | X | +| double | double | double | double | double | string | X | +| string | string | string | string | string | string | bytes | +| bytes | X | X | X | X | string | bytes | + +## Schema Evolution on read + +There are often scenarios where it's desirable to have the ability to evolve the schema more flexibly. +For example, + +1. Columns (including nested columns) can be added, deleted, modified, and moved. +2. Renaming of columns (including nested columns). +3. Add, delete, or perform operations on nested columns of the Array type. + +Hudi has experimental support for allowing backward incompatible schema evolution scenarios on write while resolving +it during read time. To enable this feature, `hoodie.schema.on.read.enable=true` needs to be set on the writer config (Datasource) or table property (SQL). + +:::note +Hudi versions > 0.11 and Spark versions > 3.1.x, and 3.2.1 are required. For Spark 3.2.1 and above, +`spark.sql.catalog.spark_catalog` must also be set. If schema on read is enabled, it cannot be disabled again +since the table would have accepted such schema changes already. 
+::: + +### Adding Columns + +```sql +-- add columns +ALTER TABLE tableName ADD COLUMNS(col_spec[, col_spec ...]) +``` + +Column specification consists of five field, next to each other. + +| Parameter | Description | +|:-------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| col_name | name of the new column. To add sub-column col1 to a nested map type column member map\>, set this field to member.value.col1 | +| col_type | type of the new column. | +| nullable | whether or not the new column allows null values. (optional) | +| comment | comment of the new column. (optional) | +| col_position | The position where the new column is added. The value can be *FIRST* or *AFTER origin_col*. If it is set to *FIRST*, the new column will be added before the first column of the table. If it is set to *AFTER origin_col*, the new column will be added after the original column. *FIRST* can be used only when new sub-columns are added to nested columns and not in top-level columns. There are no restrictions on the usage of *AFTER*. | + +**Examples** + +```sql +ALTER TABLE h0 ADD COLUMNS(ext0 string); +ALTER TABLE h0 ADD COLUMNS(new_col int not null comment 'add new column' AFTER col1); +ALTER TABLE complex_table ADD COLUMNS(col_struct.col_name string comment 'add new column to a struct col' AFTER col_from_col_struct); +``` + +### Altering Columns +**Syntax** +```sql +-- alter table ... 
alter column +ALTER TABLE tableName ALTER [COLUMN] col_old_name TYPE column_type [COMMENT] col_comment[FIRST|AFTER] column_name +``` + +**Parameter Description** + +| Parameter | Description | +|:-----------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------| +| tableName | Table name. | +| col_old_name | Name of the column to be altered. | +| column_type | Type of the target column. | +| col_comment | Optional comments on the altered column. | +| column_name | The new position to place the altered column. For example, *AFTER* **column_name** indicates that the target column is placed after **column_name**. | + + +**Examples** + +```sql +--- Changing the column type +ALTER TABLE table1 ALTER COLUMN a.b.c TYPE bigint + +--- Altering other attributes +ALTER TABLE table1 ALTER COLUMN a.b.c COMMENT 'new comment' +ALTER TABLE table1 ALTER COLUMN a.b.c FIRST +ALTER TABLE table1 ALTER COLUMN a.b.c AFTER x +ALTER TABLE table1 ALTER COLUMN a.b.c DROP NOT NULL +``` + +**column type change** + +| Source\Target | long | float | double | string | decimal | date | int | +|--------------------|-------|-------|--------|--------|---------|------|-----| +| int | Y | Y | Y | Y | Y | N | Y | +| long | Y | Y | Y | Y | Y | N | N | +| float | N | Y | Y | Y | Y | N | N | +| double | N | N | Y | Y | Y | N | N | +| decimal | N | N | N | Y | Y | N | N | +| string | N | N | N | Y | Y | Y | N | +| date | N | N | N | Y | N | Y | N | + +### Deleting Columns +**Syntax** +```sql +-- alter table ... drop columns +ALTER TABLE tableName DROP COLUMN|COLUMNS cols +``` + +**Examples** + +```sql +ALTER TABLE table1 DROP COLUMN a.b.c +ALTER TABLE table1 DROP COLUMNS a.b.c, x, y +``` + +### Renaming columns +**Syntax** +```sql +-- alter table ... 
rename column +ALTER TABLE tableName RENAME COLUMN old_columnName TO new_columnName +``` + +**Examples** + +```sql +ALTER TABLE table1 RENAME COLUMN a.b.c TO x +``` + +:::note +When using hive metastore, please disable `hive.metastore.disallow.incompatible.col.type.changes` if you encounter this error: +`The following columns have types incompatible with the existing columns in their respective positions`. +::: + +## Schema Evolution in Action + +Let us walk through an example to demonstrate the schema evolution support in Hudi. In the below example, we are going to add a new string field and change the datatype of a field from int to long. + +```scala +scala> :paste +import org.apache.hudi.QuickstartUtils._ +import scala.collection.JavaConversions._ +import org.apache.spark.sql.SaveMode._ +import org.apache.hudi.DataSourceReadOptions._ +import org.apache.hudi.DataSourceWriteOptions._ +import org.apache.hudi.config.HoodieWriteConfig._ +import org.apache.spark.sql.types._ +import org.apache.spark.sql.Row + +val tableName = "hudi_trips_cow" +val basePath = "file:///tmp/hudi_trips_cow" +val schema = StructType( Array( + StructField("rowId", StringType,true), + StructField("partitionId", StringType,true), + StructField("preComb", LongType,true), + StructField("name", StringType,true), + StructField("versionId", StringType,true), + StructField("intToLong", IntegerType,true) +)) + + +val data1 = Seq(Row("row_1", "part_0", 0L, "bob", "v_0", 0), + Row("row_2", "part_0", 0L, "john", "v_0", 0), + Row("row_3", "part_0", 0L, "tom", "v_0", 0)) + +var dfFromData1 = spark.createDataFrame(data1, schema) +dfFromData1.write.format("hudi"). + options(getQuickstartWriteConfigs). + option("hoodie.datasource.write.precombine.field", "preComb"). + option("hoodie.datasource.write.recordkey.field", "rowId"). + option("hoodie.datasource.write.partitionpath.field", "partitionId"). + option("hoodie.index.type","SIMPLE"). + option("hoodie.table.name", tableName). + mode(Overwrite). 
+ save(basePath) + +var tripsSnapshotDF1 = spark.read.format("hudi").load(basePath + "/*/*") +tripsSnapshotDF1.createOrReplaceTempView("hudi_trips_snapshot") + +ctrl+D + +scala> spark.sql("desc hudi_trips_snapshot").show() + +--------------------+---------+-------+ + | col_name|data_type|comment| + +--------------------+---------+-------+ + | _hoodie_commit_time| string| null| + |_hoodie_commit_seqno| string| null| + | _hoodie_record_key| string| null| + |_hoodie_partition...| string| null| + | _hoodie_file_name| string| null| + | rowId| string| null| + | partitionId| string| null| + | preComb| bigint| null| + | name| string| null| + | versionId| string| null| + | intToLong| int| null| + +--------------------+---------+-------+ + +scala> spark.sql("select rowId, partitionId, preComb, name, versionId, intToLong from hudi_trips_snapshot").show() + +-----+-----------+-------+----+---------+---------+ + |rowId|partitionId|preComb|name|versionId|intToLong| + +-----+-----------+-------+----+---------+---------+ + |row_3| part_0| 0| tom| v_0| 0| + |row_2| part_0| 0|john| v_0| 0| + |row_1| part_0| 0| bob| v_0| 0| + +-----+-----------+-------+----+---------+---------+ + +// In the new schema, we are going to add a String field and +// change the datatype `intToLong` field from int to long. +scala> :paste +val newSchema = StructType( Array( + StructField("rowId", StringType,true), + StructField("partitionId", StringType,true), + StructField("preComb", LongType,true), + StructField("name", StringType,true), + StructField("versionId", StringType,true), + StructField("intToLong", LongType,true), + StructField("newField", StringType,true) +)) + +val data2 = Seq(Row("row_2", "part_0", 5L, "john", "v_3", 3L, "newField_1"), + Row("row_5", "part_0", 5L, "maroon", "v_2", 2L, "newField_1"), + Row("row_9", "part_0", 5L, "michael", "v_2", 2L, "newField_1")) + +var dfFromData2 = spark.createDataFrame(data2, newSchema) +dfFromData2.write.format("hudi"). 
+ options(getQuickstartWriteConfigs). + option("hoodie.datasource.write.precombine.field", "preComb"). + option("hoodie.datasource.write.recordkey.field", "rowId"). + option("hoodie.datasource.write.partitionpath.field", "partitionId"). + option("hoodie.index.type","SIMPLE"). + option("hoodie.table.name", tableName). + mode(Append). + save(basePath) + +var tripsSnapshotDF2 = spark.read.format("hudi").load(basePath + "/*/*") +tripsSnapshotDF2.createOrReplaceTempView("hudi_trips_snapshot") + +Ctrl + D + +scala> spark.sql("desc hudi_trips_snapshot").show() + +--------------------+---------+-------+ + | col_name|data_type|comment| + +--------------------+---------+-------+ + | _hoodie_commit_time| string| null| + |_hoodie_commit_seqno| string| null| + | _hoodie_record_key| string| null| + |_hoodie_partition...| string| null| + | _hoodie_file_name| string| null| + | rowId| string| null| + | partitionId| string| null| + | preComb| bigint| null| + | name| string| null| + | versionId| string| null| + | intToLong| bigint| null| + | newField| string| null| + +--------------------+---------+-------+ + + +scala> spark.sql("select rowId, partitionId, preComb, name, versionId, intToLong, newField from hudi_trips_snapshot").show() + +-----+-----------+-------+-------+---------+---------+----------+ + |rowId|partitionId|preComb| name|versionId|intToLong| newField| + +-----+-----------+-------+-------+---------+---------+----------+ + |row_3| part_0| 0| tom| v_0| 0| null| + |row_2| part_0| 5| john| v_3| 3|newField_1| + |row_1| part_0| 0| bob| v_0| 0| null| + |row_5| part_0| 5| maroon| v_2| 2|newField_1| + |row_9| part_0| 5|michael| v_2| 2|newField_1| + +-----+-----------+-------+-------+---------+---------+----------+ + +``` + + +## Related Resources +

Videos

+ +* [Learn Schema Evolution in Apache Hudi Transaction Datalake with hands on labs](https://youtu.be/s1_-zl3sfLE) +* [How do I identify Schema Changes in Hudi Tables and Send Email Alert when New Column added/removed](https://www.youtube.com/watch?v=_i5G4ojpwlk) diff --git a/website/versioned_docs/version-1.0.0/snapshot_exporter.md b/website/versioned_docs/version-1.0.0/snapshot_exporter.md new file mode 100644 index 0000000000000..59544734bf6af --- /dev/null +++ b/website/versioned_docs/version-1.0.0/snapshot_exporter.md @@ -0,0 +1,135 @@ +--- +title: Exporter +keywords: [hudi, snapshotexporter, export] +toc: true +--- + +## Introduction +HoodieSnapshotExporter allows you to copy data from one location to another for backups or other purposes. +You can write data as Hudi, Json, Orc, or Parquet file formats. In addition to copying data, you can also repartition data +with a provided field or implement custom repartitioning by extending a class shown in detail below. + +## Arguments +HoodieSnapshotExporter accepts a reference to a source path and a destination path. The utility will issue a +query, perform any repartitioning if required and will write the data as Hudi, parquet, or json format. 
+ +|Argument|Description|Required|Note| +|------------|--------|-----------|--| +|--source-base-path|Base path for the source Hudi dataset to be snapshotted|required|| +|--target-output-path|Output path for storing a particular snapshot|required|| +|--output-format|Output format for the exported dataset; accepts these values: json,parquet,hudi|required|| +|--output-partition-field|A field to be used by Spark repartitioning|optional|Ignored when using output-format "Hudi" or when --output-partitioner is specified. The output dataset's default partition field will be inherited from the source Hudi dataset.| +|--output-partitioner|A class to facilitate custom repartitioning|optional|Ignored when using output-format "Hudi"| +|--transformer-class|A subclass of org.apache.hudi.utilities.transform.Transformer. Allows transforming raw source Dataset to a target Dataset (conforming to target schema) before writing.|optional|Ignored when using output-format "Hudi". Available transformers: org.apache.hudi.utilities.transform.SqlQueryBasedTransformer, org.apache.hudi.utilities.transform.SqlFileBasedTransformer, org.apache.hudi.utilities.transform.FlatteningTransformer, org.apache.hudi.utilities.transform.AWSDmsTransformer.| +|--transformer-sql|SQL query template to be used to transform the source before writing. The query should reference the source as a table named "\<SRC\>".|optional|Is required for SqlQueryBasedTransformer transformer class, ignored in other cases| +|--transformer-sql-file|File with a SQL query to be executed during write. The query should reference the source as a table named "\<SRC\>".|optional|Is required for SqlFileBasedTransformer, ignored in other cases| + +## Examples + +### Copy a Hudi dataset + +Exporter scans the source dataset and then makes a copy of it to the target output path.
+```bash +spark-submit \ + --jars "packaging/hudi-spark-bundle/target/hudi-spark3.5-bundle_2.12-1.0.0.jar" \ + --deploy-mode "client" \ + --class "org.apache.hudi.utilities.HoodieSnapshotExporter" \ + packaging/hudi-utilities-slim-bundle/target/hudi-utilities-slim-bundle_2.12-1.0.0.jar \ + --source-base-path "/tmp/" \ + --target-output-path "/tmp/exported/hudi/" \ + --output-format "hudi" +``` + +### Export to json or parquet dataset +The Exporter can also convert the source dataset into other formats. Currently only "json" and "parquet" are supported. + +```bash +spark-submit \ + --jars "packaging/hudi-spark-bundle/target/hudi-spark3.5-bundle_2.12-1.0.0.jar" \ + --deploy-mode "client" \ + --class "org.apache.hudi.utilities.HoodieSnapshotExporter" \ + packaging/hudi-utilities-slim-bundle/target/hudi-utilities-slim-bundle_2.12-1.0.0.jar \ + --source-base-path "/tmp/" \ + --target-output-path "/tmp/exported/json/" \ + --output-format "json" # or "parquet" +``` + +### Export to json or parquet dataset with transformation/filtering +The Exporter supports custom transformation/filtering on records before writing to json or parquet dataset. This is done by supplying +implementation of `org.apache.hudi.utilities.transform.Transformer` via `--transformer-class` option. 
+ +```bash +spark-submit \ + --jars "packaging/hudi-spark-bundle/target/hudi-spark3.5-bundle_2.12-1.0.0.jar" \ + --deploy-mode "client" \ + --class "org.apache.hudi.utilities.HoodieSnapshotExporter" \ + packaging/hudi-utilities-slim-bundle/target/hudi-utilities-slim-bundle_2.12-1.0.0.jar \ + --source-base-path "/tmp/" \ + --target-output-path "/tmp/exported/json/" \ + --transformer-class "org.apache.hudi.utilities.transform.SqlQueryBasedTransformer" \ + --transformer-sql "SELECT substr(rider,1,10) as rider, trip_type as tripType FROM <SRC> WHERE trip_type = 'BLACK' LIMIT 10" \ + --output-format "json" # or "parquet" +``` + +### Re-partitioning +When exporting to a different format, the Exporter takes the `--output-partition-field` parameter to do some custom re-partitioning. +Note: All `_hoodie_*` metadata fields will be stripped during export, so make sure to use an existing non-metadata field as the output partition field. + +By default, if no partitioning parameters are given, the output dataset will have no partition. + +Example: +```bash +spark-submit \ + --jars "packaging/hudi-spark-bundle/target/hudi-spark3.5-bundle_2.12-1.0.0.jar" \ + --deploy-mode "client" \ + --class "org.apache.hudi.utilities.HoodieSnapshotExporter" \ + packaging/hudi-utilities-slim-bundle/target/hudi-utilities-slim-bundle_2.12-1.0.0.jar \ + --source-base-path "/tmp/" \ + --target-output-path "/tmp/exported/json/" \ + --output-format "json" \ + --output-partition-field "symbol" # assume the source dataset contains a field `symbol` +``` + +The output directory will look like this: + +```bash +`_SUCCESS symbol=AMRS symbol=AYX symbol=CDMO symbol=CRC symbol=DRNA ...` +``` + +### Custom Re-partitioning +The `--output-partitioner` parameter takes a fully-qualified name of a class that implements `HoodieSnapshotExporter.Partitioner`. +This parameter takes higher precedence than `--output-partition-field`, which will be ignored if this is provided.
+ +An example implementation is shown below: + +**MyPartitioner.java** +```java +package com.foo.bar; +public class MyPartitioner implements HoodieSnapshotExporter.Partitioner { + + private static final String PARTITION_NAME = "date"; + + @Override + public DataFrameWriter partition(Dataset source) { + // use the current hoodie partition path as the output partition + return source + .withColumnRenamed(HoodieRecord.PARTITION_PATH_METADATA_FIELD, PARTITION_NAME) + .repartition(new Column(PARTITION_NAME)) + .write() + .partitionBy(PARTITION_NAME); + } +} +``` + +After putting this class in `my-custom.jar`, which is then placed on the job classpath, the submit command will look like this: + +```bash +spark-submit \ + --jars "packaging/hudi-spark-bundle/target/hudi-spark3.5-bundle_2.12-1.0.0.jar,my-custom.jar" \ + --deploy-mode "client" \ + --class "org.apache.hudi.utilities.HoodieSnapshotExporter" \ + packaging/hudi-utilities-slim-bundle/target/hudi-utilities-slim-bundle_2.12-1.0.0.jar \ + --source-base-path "/tmp/" \ + --target-output-path "/tmp/exported/json/" \ + --output-format "json" \ + --output-partitioner "com.foo.bar.MyPartitioner" diff --git a/website/versioned_docs/version-1.0.0/sql_ddl.md b/website/versioned_docs/version-1.0.0/sql_ddl.md new file mode 100644 index 0000000000000..ed4b151c754ee --- /dev/null +++ b/website/versioned_docs/version-1.0.0/sql_ddl.md @@ -0,0 +1,969 @@ +--- +title: SQL DDL +summary: "In this page, we discuss using SQL DDL commands with Hudi" +toc: true +last_modified_at: +--- +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +This page describes support for creating and altering tables using SQL across various engines. + +## Spark SQL + +### Create table + +You can create tables using standard CREATE TABLE syntax, which supports partitioning and passing table properties. 
+ +```sql +CREATE TABLE [IF NOT EXISTS] [db_name.]table_name + [(col_name data_type [COMMENT col_comment], ...)] + [COMMENT table_comment] + [PARTITIONED BY (col_name, ...)] + [ROW FORMAT row_format] + [STORED AS file_format] + [LOCATION path] + [TBLPROPERTIES (property_name=property_value, ...)] + [AS select_statement]; +``` + +:::note NOTE: +For users running this tutorial locally and have a Spark-Hive(HMS) integration in their environment: If you use +`default` database or if you don't provide `[LOCATION path]` with the DDL statement, Spark will return +`java.io.IOException: Mkdirs failed to create file:/user/hive/warehouse/hudi_table/.hoodie` error. +To get around this, you can follow either of the two options mentioned below: +1. Create a database i.e. `CREATE DATABASE hudidb;` and use it i.e. `USE hudidb;` before running the DDL statement. +2. Or provide a path using `LOCATION` keyword to persist the data with the DDL statement. +::: + +### Create non-partitioned table + +Creating a non-partitioned table is as simple as creating a regular table. + +```sql +-- create a Hudi table +CREATE TABLE IF NOT EXISTS hudi_table ( + id INT, + name STRING, + price DOUBLE +) USING hudi; +``` + +### Create partitioned table +A partitioned table can be created by adding a `partitioned by` clause. Partitioning helps to organize the data into multiple folders +based on the partition columns. It can also help speed up queries and index lookups by limiting the amount of metadata, index and data scanned. + +```sql +CREATE TABLE IF NOT EXISTS hudi_table_partitioned ( + id BIGINT, + name STRING, + dt STRING, + hh STRING +) USING hudi +TBLPROPERTIES ( + type = 'cow' +) +PARTITIONED BY (dt); +``` + +:::note +You can also create a table partitioned by multiple fields by supplying comma-separated field names. 
+When creating a table partitioned by multiple fields, ensure that you specify the columns in the `PARTITIONED BY` clause +in the same order as they appear in the `CREATE TABLE` schema. For example, for the above table, the partition fields +should be specified as `PARTITIONED BY (dt, hh)`. +::: + +### Create table with record keys and ordering fields + +As discussed [here](/docs/quick-start-guide#keys), tables track each record in the table using a record key. Hudi auto-generated a highly compressed +key for each new record in the examples so far. If you want to use an existing field as the key, you can set the `primaryKey` option. +Typically, this is also accompanied by configuring a `preCombineField` option to deal with out-of-order data and potential +duplicate records with the same key in the incoming writes. + +:::note +You can choose multiple fields as primary keys for a given table as needed. For example, "primaryKey = 'id, name'" +materializes a composite key of the two fields, which can be useful for exploring the table. +::: + +Here is an example of creating a table using both options. Typically, a field that denotes the time of the event or +fact, e.g., order creation time, event generation time etc., is used as the _preCombineField_. Hudi resolves multiple versions +of the same record by ordering based on this field when queries are run on the table. + +```sql +CREATE TABLE IF NOT EXISTS hudi_table_keyed ( + id INT, + name STRING, + price DOUBLE, + ts BIGINT +) USING hudi +TBLPROPERTIES ( + type = 'cow', + primaryKey = 'id', + preCombineField = 'ts' +); +``` + +### Create table with merge modes {#create-table-with-record-merge-mode} + +Hudi supports different [record merge modes](record_merger) to handle merging of incoming records with existing +records. To create a table with a specific record merge mode, you can set the `recordMergeMode` option.
+ +```sql +CREATE TABLE IF NOT EXISTS hudi_table_merge_mode ( + id INT, + name STRING, + ts LONG, + price DOUBLE +) USING hudi +TBLPROPERTIES ( + type = 'mor', + primaryKey = 'id', + preCombineField = 'ts', + recordMergeMode = 'EVENT_TIME_ORDERING' +) +LOCATION 'file:///tmp/hudi_table_merge_mode/'; +``` + +With `EVENT_TIME_ORDERING`, the record with the larger event time (`preCombineField`) overwrites the record with the +smaller event time on the same key, regardless of the transaction's commit time. Users can set `CUSTOM` mode to provide their own +merge logic. With `CUSTOM` merge mode, you can provide a custom class that implements the merge logic. The interface +to implement is explained in detail [here](record_merger#custom). + +```sql +CREATE TABLE IF NOT EXISTS hudi_table_merge_mode_custom ( + id INT, + name STRING, + ts LONG, + price DOUBLE +) USING hudi +TBLPROPERTIES ( + type = 'mor', + primaryKey = 'id', + preCombineField = 'ts', + recordMergeMode = 'CUSTOM', + 'hoodie.record.merge.strategy.id' = '' +) +LOCATION 'file:///tmp/hudi_table_merge_mode_custom/'; +``` + +### Create table from an external location +Often, Hudi tables are created from streaming writers like the [streamer tool](/docs/hoodie_streaming_ingestion#hudi-streamer), which +may later need some SQL statements to run on them. You can create an External table using the `location` statement. + +```sql +CREATE TABLE hudi_table_external +USING hudi +LOCATION 'file:///tmp/hudi_table/'; +``` + +:::tip +You don't need to specify the schema and any properties except the partitioned columns if they exist. Hudi can automatically +recognize the schema and configurations. +::: + +### Create Table As Select (CTAS) + +Hudi supports CTAS (Create Table As Select) to support initial loads into Hudi tables.
To ensure this is done efficiently, +even for large loads, CTAS uses **bulk insert** as the write operation. + +```sql +-- create managed parquet table +CREATE TABLE parquet_table +USING parquet +LOCATION 'file:///tmp/parquet_dataset/'; + +-- CTAS by loading data into Hudi table +CREATE TABLE hudi_table_ctas +USING hudi +TBLPROPERTIES ( + type = 'cow', + preCombineField = 'ts' +) +PARTITIONED BY (dt) +AS SELECT * FROM parquet_table; +``` + +You can create a non-partitioned table as well. + +```sql +-- create managed parquet table +CREATE TABLE parquet_table +USING parquet +LOCATION 'file:///tmp/parquet_dataset/'; + +-- CTAS by loading data into Hudi table +CREATE TABLE hudi_table_ctas +USING hudi +TBLPROPERTIES ( + type = 'cow', + preCombineField = 'ts' +) +AS SELECT * FROM parquet_table; +``` + +If you prefer explicitly setting the record keys, you can do so by setting the `primaryKey` config in table properties. + +```sql +CREATE TABLE hudi_table_ctas +USING hudi +TBLPROPERTIES ( + type = 'cow', + primaryKey = 'id' +) +PARTITIONED BY (dt) +AS +SELECT 1 AS id, 'a1' AS name, 10 AS price, 1000 AS dt; +``` + +You can also use CTAS to copy data across external locations. + +```sql +-- create managed parquet table +CREATE TABLE parquet_table +USING parquet +LOCATION 'file:///tmp/parquet_dataset/*.parquet'; + +-- CTAS by loading data into hudi table +CREATE TABLE hudi_table_ctas +USING hudi +LOCATION 'file:///tmp/hudi/hudi_tbl/' +TBLPROPERTIES ( + type = 'cow' +) +AS SELECT * FROM parquet_table; +``` + +### Create Index + +Hudi supports creating and dropping different types of indexes on a table. For more information on the different +types of indexes, please refer to [multi-modal indexing](indexes#multi-modal-indexing). Secondary +indexes, expression indexes and record indexes can be created using the SQL create index command.
+ +```sql +-- Create Index +CREATE INDEX [IF NOT EXISTS] index_name ON [TABLE] table_name +[USING index_type] +(column_name1 [OPTIONS(key1=value1, key2=value2, ...)], column_name2 [OPTIONS(key1=value1, key2=value2, ...)], ...) +[OPTIONS (key1=value1, key2=value2, ...)] + +-- Record index syntax +CREATE INDEX indexName ON tableIdentifier (primaryKey1 [, primaryKey2 ...]); + +-- Secondary Index Syntax +CREATE INDEX indexName ON tableIdentifier (nonPrimaryKey); + +-- Expression Index Syntax +CREATE INDEX indexName ON tableIdentifier USING column_stats(col) OPTIONS(expr='expr_val', format='format_val'); +CREATE INDEX indexName ON tableIdentifier USING bloom_filters(col) OPTIONS(expr='expr_val'); + +-- Drop Index +DROP INDEX [IF EXISTS] index_name ON [TABLE] table_name +``` + +- `index_name` is the name of the index to be created or dropped. +- `table_name` is the name of the table on which the index is created or dropped. +- `index_type` is the type of the index to be created. Currently, only `column_stats` and `bloom_filters` are supported. + If the `using ..` clause is omitted, a secondary record index is created. +- `column_name` is the name of the column on which the index is created. + +Both the index and the column on which the index is created can be qualified with some options in the form of key-value pairs. + +:::note +Please note the following requirements for creating a secondary index: +1. The table must have a primary key and merge mode should be [COMMIT_TIME_ORDERING](/docs/next/record_merger#commit_time_ordering). +2. Record index must be enabled. This can be done by setting `hoodie.metadata.record.index.enable=true` and then creating `record_index`. Please note the example below. +3. Secondary index is not supported for [complex types](https://avro.apache.org/docs/1.11.1/specification/#complex-types).
+::: + +**Examples** +```sql +-- Create a table with primary key +CREATE TABLE hudi_indexed_table ( + ts BIGINT, + uuid STRING, + rider STRING, + driver STRING, + fare DOUBLE, + city STRING +) USING HUDI +options( + primaryKey ='uuid', + hoodie.write.record.merge.mode = "COMMIT_TIME_ORDERING" +) +PARTITIONED BY (city); + +-- Add some data. +INSERT INTO hudi_indexed_table +VALUES + ... + +-- Create bloom filter expression index on driver column +CREATE INDEX idx_bloom_driver ON hudi_indexed_table USING bloom_filters(driver) OPTIONS(expr='identity'); +-- It would show bloom filter expression index +SHOW INDEXES FROM hudi_indexed_table; +-- Query on driver column would prune the data using the idx_bloom_driver index +SELECT uuid, rider FROM hudi_indexed_table WHERE driver = 'driver-S'; + +-- Create column stat expression index on ts column +CREATE INDEX idx_column_ts ON hudi_indexed_table USING column_stats(ts) OPTIONS(expr='from_unixtime', format = 'yyyy-MM-dd'); +-- Shows both expression indexes +SHOW INDEXES FROM hudi_indexed_table; +-- Query on ts column would prune the data using the idx_column_ts index +SELECT * FROM hudi_indexed_table WHERE from_unixtime(ts, 'yyyy-MM-dd') = '2023-09-24'; + +-- Create secondary index on rider column +CREATE INDEX record_index ON hudi_indexed_table (uuid); +CREATE INDEX idx_rider ON hudi_indexed_table (rider); +SET hoodie.metadata.record.index.enable=true; +-- Expression index and secondary index should show up +SHOW INDEXES FROM hudi_indexed_table; +-- Query on rider column would leverage the secondary index idx_rider +SELECT * FROM hudi_indexed_table WHERE rider = 'rider-E'; + +``` + +### Create Expression Index + +A [expression index](https://github.com/apache/hudi/blob/00ece7bce0a4a8d0019721a28049723821e01842/rfc/rfc-63/rfc-63.md) is an index on a function of a column. 
+It is a new addition to Hudi's [multi-modal indexing](https://hudi.apache.org/blog/2022/05/17/Introducing-Multi-Modal-Index-for-the-Lakehouse-in-Apache-Hudi) +subsystem. Expression indexes can be used to implement logical partitioning of a table, by creating `column_stats` indexes +on an expression of a column. For example, an expression index extracting a date from a timestamp field can effectively implement +date-based partitioning and provide the same benefits to queries, even if the physical layout is different. + +```sql +-- Create an expression index on the column `ts` (unix epoch) of the table `hudi_table` using the function `from_unixtime` with the format `yyyy-MM-dd` +CREATE INDEX IF NOT EXISTS ts_datestr ON hudi_table + USING column_stats(ts) + OPTIONS(expr='from_unixtime', format='yyyy-MM-dd'); +-- Create an expression index on the column `ts` (timestamp in yyyy-MM-dd HH:mm:ss) of the table `hudi_table` using the function `hour` +CREATE INDEX ts_hour ON hudi_table + USING column_stats(ts) + options(expr='hour'); +``` + +:::note +1. Expression index can only be created for Spark engine using SQL. It is not supported yet with Spark DataSource API. +2. Expression index is not yet supported for [complex types](https://avro.apache.org/docs/1.11.1/specification/#complex-types). +3. Expression index is supported for unary and certain binary expressions. Please check [SQL DDL docs](sql_ddl#create-expression-index) for more details. + ::: + +The `expr` option is required for creating an expression index, and it should be a valid Spark SQL function. Please check the syntax +for the above functions in the [Spark SQL documentation](https://spark.apache.org/docs/latest/sql-ref-functions.html) and provide the options accordingly. For example, +the `format` option is required for the `from_unixtime` function. + +Some useful functions that are supported are listed below.
+ + - `identity` + - `from_unixtime` + - `date_format` + - `to_date` + - `to_timestamp` + - `year` + - `month` + - `day` + - `hour` + - `lower` + - `upper` + - `substring` + - `regexp_extract` + - `regexp_replace` + - `concat` + - `length` + +Note that, only functions that take a single column as input are supported currently and UDFs are not supported. + +
+ Full example of creating and using expression index + +```sql +CREATE TABLE hudi_table_expr_index ( + ts STRING, + uuid STRING, + rider STRING, + driver STRING, + fare DOUBLE, + city STRING +) USING HUDI +tblproperties (primaryKey = 'uuid') +PARTITIONED BY (city) +location 'file:///tmp/hudi_table_expr_index'; + +-- Query with hour function filter but no index yet -- +spark-sql> SELECT city, fare, rider, driver FROM hudi_table_expr_index WHERE city NOT IN ('chennai') AND hour(ts) > 12; +san_francisco 93.5 rider-E driver-O +san_francisco 33.9 rider-D driver-L +sao_paulo 43.4 rider-G driver-Q +Time taken: 0.208 seconds, Fetched 3 row(s) + +spark-sql> EXPLAIN COST SELECT city, fare, rider, driver FROM hudi_table_expr_index WHERE city NOT IN ('chennai') AND hour(ts) > 12; +== Optimized Logical Plan == +Project [city#3465, fare#3464, rider#3462, driver#3463], Statistics(sizeInBytes=899.5 KiB) ++- Filter ((isnotnull(city#3465) AND isnotnull(ts#3460)) AND (NOT (city#3465 = chennai) AND (hour(cast(ts#3460 as timestamp), Some(Asia/Kolkata)) > 12))), Statistics(sizeInBytes=2.5 MiB) + +- Relation default.hudi_table_expr_index[_hoodie_commit_time#3455,_hoodie_commit_seqno#3456,_hoodie_record_key#3457,_hoodie_partition_path#3458,_hoodie_file_name#3459,ts#3460,uuid#3461,rider#3462,driver#3463,fare#3464,city#3465] parquet, Statistics(sizeInBytes=2.5 MiB) + +== Physical Plan == +*(1) Project [city#3465, fare#3464, rider#3462, driver#3463] ++- *(1) Filter (isnotnull(ts#3460) AND (hour(cast(ts#3460 as timestamp), Some(Asia/Kolkata)) > 12)) + +- *(1) ColumnarToRow + +- FileScan parquet default.hudi_table_expr_index[ts#3460,rider#3462,driver#3463,fare#3464,city#3465] Batched: true, DataFilters: [isnotnull(ts#3460), (hour(cast(ts#3460 as timestamp), Some(Asia/Kolkata)) > 12)], Format: Parquet, Location: HoodieFileIndex(1 paths)[file:/tmp/hudi_table_expr_index], PartitionFilters: [isnotnull(city#3465), NOT (city#3465 = chennai)], PushedFilters: [IsNotNull(ts)], ReadSchema: struct + + 
+-- create the expression index -- +CREATE INDEX ts_hour ON hudi_table_expr_index USING column_stats(ts) options(expr='hour'); + +-- query after creating the index -- +spark-sql> SELECT city, fare, rider, driver FROM hudi_table_expr_index WHERE city NOT IN ('chennai') AND hour(ts) > 12; +san_francisco 93.5 rider-E driver-O +san_francisco 33.9 rider-D driver-L +sao_paulo 43.4 rider-G driver-Q +Time taken: 0.202 seconds, Fetched 3 row(s) +spark-sql> EXPLAIN COST SELECT city, fare, rider, driver FROM hudi_table_expr_index WHERE city NOT IN ('chennai') AND hour(ts) > 12; +== Optimized Logical Plan == +Project [city#2970, fare#2969, rider#2967, driver#2968], Statistics(sizeInBytes=449.8 KiB) ++- Filter ((isnotnull(city#2970) AND isnotnull(ts#2965)) AND (NOT (city#2970 = chennai) AND (hour(cast(ts#2965 as timestamp), Some(Asia/Kolkata)) > 12))), Statistics(sizeInBytes=1278.3 KiB) + +- Relation default.hudi_table_expr_index[_hoodie_commit_time#2960,_hoodie_commit_seqno#2961,_hoodie_record_key#2962,_hoodie_partition_path#2963,_hoodie_file_name#2964,ts#2965,uuid#2966,rider#2967,driver#2968,fare#2969,city#2970] parquet, Statistics(sizeInBytes=1278.3 KiB) + +== Physical Plan == +*(1) Project [city#2970, fare#2969, rider#2967, driver#2968] ++- *(1) Filter (isnotnull(ts#2965) AND (hour(cast(ts#2965 as timestamp), Some(Asia/Kolkata)) > 12)) + +- *(1) ColumnarToRow + +- FileScan parquet default.hudi_table_expr_index[ts#2965,rider#2967,driver#2968,fare#2969,city#2970] Batched: true, DataFilters: [isnotnull(ts#2965), (hour(cast(ts#2965 as timestamp), Some(Asia/Kolkata)) > 12)], Format: Parquet, Location: HoodieFileIndex(1 paths)[file:/tmp/hudi_table_expr_index], PartitionFilters: [isnotnull(city#2970), NOT (city#2970 = chennai)], PushedFilters: [IsNotNull(ts)], ReadSchema: struct + +``` +
+ +### Create Partition Stats Index + +Partition stats index is similar to column stats, in the sense that it tracks - `min, max, null, count, ..` statistics on columns in the +table, useful in query planning. The key difference is that, while `column_stats` tracks statistics about files, the `partition_stats` index +tracks aggregated statistics at the storage partition path level, to help more efficiently skip entire folder paths during query planning +and execution. + +To enable partition stats index, simply set `hoodie.metadata.index.partition.stats.enable = 'true'` in create table options. + +:::note +1. `column_stats` index is required to be enabled for `partition_stats` index. Both go hand in hand. +2. `partition_stats` index is not created automatically for all columns. Users must specify the list of columns for which they want to create partition stats index. +3. `column_stats` and `partition_stats` indexes are not yet supported for [complex types](https://avro.apache.org/docs/1.11.1/specification/#complex-types). +::: + +### Create Secondary Index + +Secondary indexes are record level indexes built on any column in the table. They support multiple records having the same +secondary column value efficiently and are built on top of the existing record level index built on the table's record key. +Secondary indexes are hash based indexes that offer horizontally scalable write performance by splitting key space into shards +by hashing, as well as fast lookups by employing row-based file formats. + +Let us now look at an example of creating a table with multiple indexes and how queries leverage the indexes for both +partition pruning and data skipping.
+ +```sql +DROP TABLE IF EXISTS hudi_table; +-- Let us create a table with multiple partition fields, and enable record index and partition stats index +CREATE TABLE hudi_table ( + ts BIGINT, + id STRING, + rider STRING, + driver STRING, + fare DOUBLE, + city STRING, + state STRING +) USING hudi + OPTIONS( + primaryKey ='id', + hoodie.metadata.record.index.enable = 'true', -- enable record index + hoodie.metadata.index.partition.stats.enable = 'true', -- enable partition stats index + hoodie.metadata.index.column.stats.enable = 'true', -- enable column stats + hoodie.metadata.index.column.stats.column.list = 'rider', -- create column stats index on rider column + hoodie.write.record.merge.mode = "COMMIT_TIME_ORDERING" -- enable commit time ordering, required for secondary index +) +PARTITIONED BY (city, state) +LOCATION 'file:///tmp/hudi_test_table'; + +INSERT INTO hudi_table VALUES (1695159649,'trip1','rider-A','driver-K',19.10,'san_francisco','california'); +INSERT INTO hudi_table VALUES (1695091554,'trip2','rider-C','driver-M',27.70,'sunnyvale','california'); +INSERT INTO hudi_table VALUES (1695332066,'trip3','rider-E','driver-O',93.50,'austin','texas'); +INSERT INTO hudi_table VALUES (1695516137,'trip4','rider-F','driver-P',34.15,'houston','texas'); + +-- simple partition predicate -- +select * from hudi_table where city = 'sunnyvale'; +20240710215107477 20240710215107477_0_0 trip2 city=sunnyvale/state=california 1dcb14a9-bc4a-4eac-aab5-015f2254b7ec-0_0-40-75_20240710215107477.parquet 1695091554 trip2 rider-C driver-M 27.7 sunnyvale california +Time taken: 0.58 seconds, Fetched 1 row(s) + +-- simple partition predicate on other partition field -- +select * from hudi_table where state = 'texas'; +20240710215119846 20240710215119846_0_0 trip4 city=houston/state=texas 08c6ed2c-a87b-4798-8f70-6d8b16cb1932-0_0-74-133_20240710215119846.parquet 1695516137 trip4 rider-F driver-P 34.15 houston texas +20240710215110584 20240710215110584_0_0 trip3 city=austin/state=texas 
0ab2243c-cc08-4da3-8302-4ce0b4c47a08-0_0-57-104_20240710215110584.parquet 1695332066 trip3 rider-E driver-O 93.5 austin texas +Time taken: 0.124 seconds, Fetched 2 row(s) + +-- predicate on a column for which partition stats are present -- +select id, rider, city, state from hudi_table where rider > 'rider-D'; +trip4 rider-F houston texas +trip3 rider-E austin texas +Time taken: 0.703 seconds, Fetched 2 row(s) + +-- record key predicate -- +SELECT id, rider, driver FROM hudi_table WHERE id = 'trip1'; +trip1 rider-A driver-K +Time taken: 0.368 seconds, Fetched 1 row(s) + +-- create secondary index on driver -- +CREATE INDEX driver_idx ON hudi_table (driver); + +-- secondary key predicate -- +SELECT id, driver, city, state FROM hudi_table WHERE driver IN ('driver-K', 'driver-M'); +trip1 driver-K san_francisco california +trip2 driver-M sunnyvale california +Time taken: 0.83 seconds, Fetched 2 row(s) +``` + +### Create Bloom Filter Index + +Bloom filter indexes store a bloom filter per file, on the column or column expression being indexed. They can be very +effective in skipping files that don't contain a high-cardinality column value, e.g. uuids. + +```sql +-- Create a bloom filter index on the column derived from expression `lower(rider)` of the table `hudi_indexed_table` +CREATE INDEX idx_bloom_rider ON hudi_indexed_table USING bloom_filters(rider) OPTIONS(expr='lower'); +``` + +### Setting Hudi configs + +There are different ways you can pass the configs for a given hudi table. + +#### Using set command +You can use the **set** command to set any of Hudi's write configs. This will apply to operations across the whole spark session. + +```sql +set hoodie.insert.shuffle.parallelism = 100; +set hoodie.upsert.shuffle.parallelism = 100; +set hoodie.delete.shuffle.parallelism = 100; +``` + +#### Using table properties +You can also configure table options when creating a table. This will be applied only for the table and override any SET command values.
+ +```sql +CREATE TABLE IF NOT EXISTS tableName ( + colName1 colType1, + colName2 colType2, + ... +) USING hudi +TBLPROPERTIES ( + primaryKey = '${colName1}', + type = 'cow', + ${hoodie.config.key1} = '${hoodie.config.value1}', + ${hoodie.config.key2} = '${hoodie.config.value2}', + .... +); + +-- e.g. +CREATE TABLE IF NOT EXISTS hudi_table ( + id BIGINT, + name STRING, + price DOUBLE +) USING hudi +TBLPROPERTIES ( + primaryKey = 'id', + type = 'cow', + hoodie.cleaner.fileversions.retained = '20', + hoodie.keep.max.commits = '20' +); +``` + +### Table Properties + +Users can set table properties while creating a table. The important table properties are discussed below. + +| Parameter Name | Default | Description | +|------------------|--------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| type | cow | The table type to create. `type = 'cow'` creates a COPY-ON-WRITE table, while `type = 'mor'` creates a MERGE-ON-READ table. Same as `hoodie.datasource.write.table.type`. More details can be found [here](/docs/table_types) | +| primaryKey | uuid | The primary key field names of the table separated by commas. Same as `hoodie.datasource.write.recordkey.field`. If this config is ignored, hudi will auto-generate primary keys. If explicitly set, primary key generation will honor user configuration. | +| preCombineField | | The pre-combine field of the table. It is used for resolving the final version of the record among multiple versions. Generally, `event time` or another similar column will be used for ordering purposes. Hudi will be able to handle out-of-order data using the preCombine field value. | + +:::note +`primaryKey`, `preCombineField`, `type`, and other properties are case-sensitive.
+::: + +#### Passing Lock Providers for Concurrent Writers + +Hudi requires a lock provider to support concurrent writers or asynchronous table services when using OCC +and [NBCC](concurrency_control#non-blocking-concurrency-control) (Non-Blocking Concurrency Control) +concurrency mode. For NBCC mode, locking is only used to write the commit metadata file in the timeline. Writes are +serialized by completion time. Users can pass these table properties into *TBLPROPERTIES* as well. Below is an example +for a Zookeeper based configuration. + +```sql +-- Properties to use Lock configurations to support Multi Writers +TBLPROPERTIES( + hoodie.write.lock.zookeeper.url = "zookeeper", + hoodie.write.lock.zookeeper.port = "2181", + hoodie.write.lock.zookeeper.lock_key = "tableName", + hoodie.write.lock.provider = "org.apache.hudi.client.transaction.lock.ZookeeperBasedLockProvider", + hoodie.write.concurrency.mode = "optimistic_concurrency_control", + hoodie.write.lock.zookeeper.base_path = "/tableName" +) +``` + +#### Enabling Column Stats / Record Level Index for the table +Hudi provides the ability to leverage rich metadata and indexes about the table to speed up DMLs and queries. +For example, collection of column statistics can be enabled to perform quick data skipping, or a record-level index can be used to perform fast updates or point lookups, +using the following table properties.
+ + +For more, see Metadata Configurations + +```sql +TBLPROPERTIES( + 'hoodie.metadata.index.column.stats.enable' = 'true', + 'hoodie.metadata.record.index.enable' = 'true' +) +``` + +### Spark Alter Table +**Syntax** +```sql +-- Alter table name +ALTER TABLE oldTableName RENAME TO newTableName; + +-- Alter table add columns +ALTER TABLE tableIdentifier ADD COLUMNS(colAndType [, colAndType]); +``` +**Examples** + +```sql +--rename to: +ALTER TABLE hudi_table RENAME TO hudi_table_renamed; + +--add column: +ALTER TABLE hudi_table ADD COLUMNS(remark STRING); +``` + +### Modifying Table Properties +**Syntax** +```sql +-- alter table ... set|unset +ALTER TABLE tableIdentifier SET|UNSET TBLPROPERTIES (table_property = 'property_value'); +``` + +**Examples** + +```sql +ALTER TABLE hudi_table SET TBLPROPERTIES (hoodie.keep.max.commits = '10'); +ALTER TABLE hudi_table SET TBLPROPERTIES ("note" = "don't drop this table"); + +ALTER TABLE hudi_table UNSET TBLPROPERTIES IF EXISTS (hoodie.keep.max.commits); +ALTER TABLE hudi_table UNSET TBLPROPERTIES IF EXISTS ('note'); +``` + +:::note +Currently, trying to change the column type may throw an error ```ALTER TABLE CHANGE COLUMN is not supported for changing column colName with oldColType to colName with newColType.```, due to an [open SPARK issue](https://issues.apache.org/jira/browse/SPARK-21823) +::: + +### Alter config options +You can also alter the write config for a table using the **ALTER TABLE SET SERDEPROPERTIES** command. + +**Syntax** + +```sql +-- alter table ... set|unset +ALTER TABLE tableName SET SERDEPROPERTIES ('property' = 'property_value'); +``` + +**Example** +```sql + ALTER TABLE hudi_table SET SERDEPROPERTIES ('key1' = 'value1'); +``` + +### Show and drop partitions + +**Syntax** + +```sql +-- Show partitions +SHOW PARTITIONS tableIdentifier; + +-- Drop partition +ALTER TABLE tableIdentifier DROP PARTITION ( partition_col_name = partition_col_val [ , ...
] ); +``` + +**Examples** +```sql +--Show partition: +SHOW PARTITIONS hudi_table; + +--Drop partition: +ALTER TABLE hudi_table DROP PARTITION (dt='2021-12-09', hh='10'); +``` + +### Show and drop index + +**Syntax** + +```sql +-- Show Indexes +SHOW INDEXES FROM tableIdentifier; + +-- Drop index +DROP INDEX indexIdentifier ON tableIdentifier; +``` + +**Examples** +```sql +-- Show indexes +SHOW INDEXES FROM hudi_indexed_table; + +-- Drop Index +DROP INDEX record_index ON hudi_indexed_table; +``` + +### Show create table + +**Syntax** + +```sql +SHOW CREATE TABLE tableIdentifier; +``` + +**Examples** +```sql +SHOW CREATE TABLE hudi_table; +``` + +### Caveats + +Hudi currently has the following limitations when using Spark SQL to create/alter tables. + + - `ALTER TABLE ... RENAME TO ...` is not supported when using AWS Glue Data Catalog as hive metastore as Glue itself does + not support table renames. + - A new Hudi table created by Spark SQL will by default set `hoodie.datasource.write.hive_style_partitioning=true`, for ease + of use. This can be overridden using table properties. + +## Flink SQL + +### Create Catalog +The catalog helps to manage the SQL tables; a table can be shared among sessions if the catalog persists the table definitions. +For `hms` mode, the catalog also supplements the hive syncing options.
+ +**Example** +```sql +CREATE CATALOG hoodie_catalog + WITH ( + 'type'='hudi', + 'catalog.path' = '${catalog default root path}', + 'hive.conf.dir' = '${directory where hive-site.xml is located}', + 'mode'='hms' -- supports 'dfs' mode that uses the DFS backend for table DDLs persistence + ); +``` + +#### Options +| Option Name | Required | Default | Remarks | +| ----------- | ------- | ------- |--------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `catalog.path` | true | -- | Default path for the catalog's table storage, the path is used to infer the table path automatically, the default table path: `${catalog.path}/${db_name}/${table_name}` | +| `default-database` | false | default | default database name | +| `hive.conf.dir` | false | -- | The directory where hive-site.xml is located, only valid in `hms` mode | +| `mode` | false | dfs | Supports `hms` mode that uses HMS to persist the table options | +| `table.external` | false | false | Whether to create the external table, only valid in `hms` mode | + +### Create Table + +You can create tables using standard FLINK SQL CREATE TABLE syntax, which supports partitioning and passing Flink options using WITH. + +```sql +CREATE TABLE [IF NOT EXISTS] [catalog_name.][db_name.]table_name + ( + { <physical_column_definition> }[ , ...n] + [ <table_constraint> ][ , ...n] + ) + [COMMENT table_comment] + [PARTITIONED BY (partition_column_name1, partition_column_name2, ...)] + WITH (key1=val1, key2=val2, ...) +``` + +### Create non-partitioned table + +Creating a non-partitioned table is as simple as creating a regular table. + +```sql +-- create a Hudi table +CREATE TABLE hudi_table( + id BIGINT, + name STRING, + price DOUBLE +) +WITH ( +'connector' = 'hudi', +'path' = 'file:///tmp/hudi_table', +'table.type' = 'MERGE_ON_READ' +); +``` + +### Create partitioned table + +The following is an example of creating a Flink partitioned table.
+ +```sql +CREATE TABLE hudi_table( + id BIGINT, + name STRING, + dt STRING, + hh STRING +) +PARTITIONED BY (`dt`) +WITH ( +'connector' = 'hudi', +'path' = 'file:///tmp/hudi_table', +'table.type' = 'MERGE_ON_READ' +); +``` + +### Create table with record keys and ordering fields + +The following is an example of creating a Flink table with record key and ordering field similarly to spark. + +```sql +CREATE TABLE hudi_table( + id BIGINT PRIMARY KEY NOT ENFORCED, + name STRING, + price DOUBLE, + ts BIGINT +) +PARTITIONED BY (`dt`) +WITH ( +'connector' = 'hudi', +'path' = 'file:///tmp/hudi_table', +'table.type' = 'MERGE_ON_READ', +'precombine.field' = 'ts' +); +``` + +### Create Table in Non-Blocking Concurrency Control Mode + +The following is an example of creating a Flink table in [Non-Blocking Concurrency Control mode](concurrency_control#non-blocking-concurrency-control). + +```sql +-- This is a datagen source that can generate records continuously +CREATE TABLE sourceT ( + uuid VARCHAR(20), + name VARCHAR(10), + age INT, + ts TIMESTAMP(3), + `partition` AS 'par1' +) WITH ( + 'connector' = 'datagen', + 'rows-per-second' = '200' +); + +-- pipeline1: by default, enable the compaction and cleaning services +CREATE TABLE t1 ( + uuid VARCHAR(20), + name VARCHAR(10), + age INT, + ts TIMESTAMP(3), + `partition` VARCHAR(20) +) WITH ( + 'connector' = 'hudi', + 'path' = '/tmp/hudi-demo/t1', + 'table.type' = 'MERGE_ON_READ', + 'index.type' = 'BUCKET', + 'hoodie.write.concurrency.mode' = 'NON_BLOCKING_CONCURRENCY_CONTROL', + 'write.tasks' = '2' +); + +-- pipeline2: disable the compaction and cleaning services manually +CREATE TABLE t1_2 ( + uuid VARCHAR(20), + name VARCHAR(10), + age INT, + ts TIMESTAMP(3), + `partition` VARCHAR(20) +) WITH ( + 'connector' = 'hudi', + 'path' = '/tmp/hudi-demo/t1', + 'table.type' = 'MERGE_ON_READ', + 'index.type' = 'BUCKET', + 'hoodie.write.concurrency.mode' = 'NON_BLOCKING_CONCURRENCY_CONTROL', + 'write.tasks' = '2', + 
'compaction.schedule.enabled' = 'false', + 'compaction.async.enabled' = 'false', + 'clean.async.enabled' = 'false' +); + +-- Submit the pipelines +INSERT INTO t1 +SELECT * FROM sourceT; + +INSERT INTO t1_2 +SELECT * FROM sourceT; + +SELECT * FROM t1 LIMIT 20; +``` + +### Alter Table +```sql +ALTER TABLE tableA RENAME TO tableB; +``` + +### Setting Hudi configs + +#### Using table options +You can configure hoodie configs in table options when creating a table. You can refer to the Flink-specific hoodie configs [here](configurations#FLINK_SQL). +These configs will be applied to all the operations on that table. + +```sql +CREATE TABLE IF NOT EXISTS tableName ( + colName1 colType1 PRIMARY KEY NOT ENFORCED, + colName2 colType2, + ... +) +WITH ( + 'connector' = 'hudi', + 'path' = '${path}', + ${hoodie.config.key1} = '${hoodie.config.value1}', + ${hoodie.config.key2} = '${hoodie.config.value2}', + .... +); + +-- e.g. +CREATE TABLE hudi_table( + id BIGINT PRIMARY KEY NOT ENFORCED, + name STRING, + price DOUBLE, + ts BIGINT +) +PARTITIONED BY (`dt`) +WITH ( +'connector' = 'hudi', +'path' = 'file:///tmp/hudi_table', +'table.type' = 'MERGE_ON_READ', +'precombine.field' = 'ts', +'hoodie.cleaner.fileversions.retained' = '20', +'hoodie.keep.max.commits' = '20', +'hoodie.datasource.write.hive_style_partitioning' = 'true' +); +``` + +## Supported Types + +| Spark | Hudi | Notes | +|---------------|--------------|---------------| +| boolean | boolean | | +| byte | int | | +| short | int | | +| integer | int | | +| long | long | | +| date | date | | +| timestamp | timestamp | | +| float | float | | +| double | double | | +| string | string | | +| decimal | decimal | | +| binary | bytes | | +| array | array | | +| map | map | | +| struct | struct | | +| char | | not supported | +| varchar | | not supported | +| numeric | | not supported | +| null | | not supported | +| object | | not supported | diff --git a/website/versioned_docs/version-1.0.0/sql_dml.md
b/website/versioned_docs/version-1.0.0/sql_dml.md new file mode 100644 index 0000000000000..8b4200154ec92 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/sql_dml.md @@ -0,0 +1,539 @@ +--- +title: SQL DML +summary: "On this page, we will cover details on how to use DML statements on Hudi tables." +toc: true +last_modified_at: +--- +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +## Spark SQL + +SparkSQL provides several Data Manipulation Language (DML) actions for interacting with Hudi tables. These operations allow you to insert, update, merge and delete data +from your Hudi tables. Let's explore them one by one. + +Please refer to [SQL DDL](sql_ddl) for creating Hudi tables using SQL. + +### Insert Into + +You can use the `INSERT INTO` statement to add data to a Hudi table using Spark SQL. Here are some examples: + +```sql +INSERT INTO <table>
+SELECT <columns> FROM <source>; +``` + +:::note Deprecations +From 0.14.0, `hoodie.sql.bulk.insert.enable` and `hoodie.sql.insert.mode` are deprecated. Users are expected to use `hoodie.spark.sql.insert.into.operation` instead. +To manage duplicates with `INSERT INTO`, please check out [insert dup policy config](configurations#hoodiedatasourceinsertduppolicy). +::: + +Examples: + +```sql +-- Insert into a copy-on-write (COW) Hudi table +INSERT INTO hudi_cow_nonpcf_tbl SELECT 1, 'a1', 20; + +-- Insert into a merge-on-read (MOR) Hudi table +INSERT INTO hudi_mor_tbl SELECT 1, 'a1', 20, 1000; + +-- Insert into a COW Hudi table with static partition +INSERT INTO hudi_cow_pt_tbl PARTITION(dt = '2021-12-09', hh='11') SELECT 2, 'a2', 1000; + +-- Insert into a COW Hudi table with dynamic partition +INSERT INTO hudi_cow_pt_tbl PARTITION(dt, hh) SELECT 1 AS id, 'a1' AS name, 1000 AS ts, '2021-12-09' AS dt, '10' AS hh; +``` + +:::note Mapping to write operations +Hudi offers flexibility in choosing the underlying [write operation](/docs/write_operations) of an `INSERT INTO` statement using +the `hoodie.spark.sql.insert.into.operation` configuration. Possible options include *"bulk_insert"* (large inserts), *"insert"* (with small file management), +and *"upsert"* (with deduplication/merging). If a precombine field is not set, *"insert"* is chosen as the default. For a table with preCombine field set, +*"upsert"* is chosen as the default operation. +::: + + +### Insert Overwrite + +The `INSERT OVERWRITE` statement is used to replace existing data in a Hudi table. + +```sql +INSERT OVERWRITE <table>
+SELECT <columns> FROM <source>; +``` + +All existing partitions that are affected by the `INSERT OVERWRITE` statement will be replaced with the source data. +Here are some examples: + +```sql +-- Overwrite non-partitioned table +INSERT OVERWRITE hudi_mor_tbl SELECT 99, 'a99', 20.0, 900; +INSERT OVERWRITE hudi_cow_nonpcf_tbl SELECT 99, 'a99', 20.0; + +-- Overwrite partitioned table with dynamic partition +INSERT OVERWRITE TABLE hudi_cow_pt_tbl SELECT 10, 'a10', 1100, '2021-12-09', '10'; + +-- Overwrite partitioned table with static partition +INSERT OVERWRITE hudi_cow_pt_tbl PARTITION(dt = '2021-12-09', hh='12') SELECT 13, 'a13', 1100; +``` + +### Update +You can use the `UPDATE` statement to modify existing data in a Hudi table directly. + +```sql +UPDATE tableIdentifier SET column = EXPRESSION(,column = EXPRESSION) [ WHERE boolExpression] +``` + +Here are some examples: + +```sql +-- Update data in a Hudi table +UPDATE hudi_mor_tbl SET price = price * 2, ts = 1111 WHERE id = 1; + +-- Update data in a partitioned Hudi table +UPDATE hudi_cow_pt_tbl SET name = 'a1_1', ts = 1001 WHERE id = 1; + +-- update using non-PK field +update hudi_cow_pt_tbl set ts = 1001 where name = 'a1'; +``` + +:::info +The `UPDATE` operation requires the specification of a `preCombineField`. +::: + +### Merge Into + +The `MERGE INTO` statement allows you to perform more complex updates and merges against source data. The `MERGE INTO` statement +is similar to the `UPDATE` statement, but it allows you to specify different actions for matched and unmatched records. + +```sql +MERGE INTO tableIdentifier AS target_alias +USING (sub_query | tableIdentifier) AS source_alias +ON <merge_condition> +[ WHEN MATCHED [ AND <condition> ] THEN <matched_action> ] +[ WHEN NOT MATCHED [ AND <condition> ] THEN <not_matched_action> ] + +<merge_condition> = an equality boolean condition +<matched_action> = + DELETE | + UPDATE SET * | + UPDATE SET column1 = expression1 [, column2 = expression2 ...]
+<not_matched_action> = + INSERT * | + INSERT (column1 [, column2 ...]) VALUES (value1 [, value2 ...]) +``` + +:::info +`WHEN NOT MATCHED` clauses specify the action to perform if the values do not match. +There are two kinds of `INSERT` clauses: +1. `INSERT *` clauses require that the source table has the same columns as those in the target table. +2. `INSERT (column1 [, column2 ...]) VALUES (value1 [, value2 ...])` clauses do not require all the columns of the target table to be specified. Unspecified target columns are set to `NULL`. +::: + +Examples below + +```sql +-- source table using hudi for testing merging into non-partitioned table +create table merge_source (id int, name string, price double, ts bigint) using hudi +tblproperties (primaryKey = 'id', preCombineField = 'ts'); +insert into merge_source values (1, "old_a1", 22.22, 900), (2, "new_a2", 33.33, 2000), (3, "new_a3", 44.44, 2000); + +merge into hudi_mor_tbl as target +using merge_source as source +on target.id = source.id +when matched then update set * +when not matched then insert * +; + +-- source table using parquet for testing merging into partitioned table +create table merge_source2 (id int, name string, flag string, dt string, hh string) using parquet; +insert into merge_source2 values (1, "new_a1", 'update', '2021-12-09', '10'), (2, "new_a2", 'delete', '2021-12-09', '11'), (3, "new_a3", 'insert', '2021-12-09', '12'); + +MERGE into hudi_cow_pt_tbl as target +using ( + select id, name, '1000' as ts, flag, dt, hh from merge_source2 +) source +on target.id = source.id +when matched and flag != 'delete' then + update set id = source.id, name = source.name, ts = source.ts, dt = source.dt, hh = source.hh +when matched and flag = 'delete' then delete +when not matched then + insert (id, name, ts, dt, hh) values(source.id, source.name, source.ts, source.dt, source.hh) +; + +``` + +:::note Key requirements +For a Hudi table with user configured primary keys, the join condition in `Merge Into` is expected to
contain the primary keys of the table. +For a table where Hudi auto-generates primary keys, the join condition in `MERGE INTO` can be on any arbitrary data columns. +::: + +### Merge Into with Partial Updates {#merge-into-partial-update} + +Partial updates only write updated columns instead of full change records. This is useful when you have wide tables (typical for ML feature stores) +with hundreds of columns and only a few columns are updated. It reduces the write amplification as well as helps in lowering the query +latency. The `MERGE INTO` statement above can be modified to use partial updates as shown below. + +```sql +-- Create a Merge-on-Read table +CREATE TABLE tableName ( + id INT, + name STRING, + price DOUBLE, + _ts LONG, + description STRING +) USING hudi +TBLPROPERTIES ( + type = 'mor', + primaryKey = 'id', + preCombineField = '_ts' +) +LOCATION '/location/to/basePath'; + +-- Insert values into the table +INSERT INTO tableName VALUES + (1, 'a1', 10, 1000, 'a1: desc1'), + (2, 'a2', 20, 1200, 'a2: desc2'), + (3, 'a3', 30, 1250, 'a3: desc3'); + +-- Perform partial updates using a MERGE INTO statement +MERGE INTO tableName t0 + USING ( + SELECT 1 AS id, 'a1' AS name, 12 AS price, 1001 AS ts + UNION ALL + SELECT 3 AS id, 'a3' AS name, 25 AS price, 1260 AS ts + ) s0 + ON t0.id = s0.id + WHEN MATCHED THEN UPDATE SET + price = s0.price, + _ts = s0.ts; + +SELECT id, name, price, _ts, description FROM tableName; +``` + +Notice that, instead of `UPDATE SET *`, we are updating only the `price` and `_ts` columns. + +:::note +Partial update is not yet supported in the following cases: +1. When the target table is a bootstrapped table. +2. When virtual keys are enabled. +3. When schema on read is enabled. +4. When there is an enum field in the source data. +::: + +### Delete From + +You can remove data from a Hudi table using the `DELETE FROM` statement.
+ +```sql +DELETE FROM tableIdentifier [ WHERE boolExpression ] +``` + +Examples below + +```sql +-- Delete data from a Hudi table +DELETE FROM hudi_cow_nonpcf_tbl WHERE uuid = 1; + +-- Delete data from a MOR Hudi table based on a condition +DELETE FROM hudi_mor_tbl WHERE id % 2 = 0; + +-- Delete data using a non-primary key field +DELETE FROM hudi_cow_pt_tbl WHERE name = 'a1'; +``` + +### Data Skipping and Indexing + +DML operations can be sped up using column statistics for data skipping and using indexes to reduce the amount of data scanned. +For example, the following helps speed up the `DELETE` operation on a Hudi table, by using the record level index. + +```sql +SET hoodie.metadata.record.index.enable=true; + +DELETE from hudi_table where uuid = 'c8abbe79-8d89-47ea-b4ce-4d224bae5bfa'; +``` + +These DML operations give you powerful tools for managing your tables using Spark SQL. +You can control the behavior of these operations using various configuration options, as explained in the documentation. + +## Flink SQL + +Flink SQL provides several Data Manipulation Language (DML) actions for interacting with Hudi tables. These operations allow you to insert, update and delete data from your Hudi tables. Let's explore them one by one. + +### Insert Into + +You can utilize the INSERT INTO statement to incorporate data into a Hudi table using Flink SQL. Here are a few illustrative examples: + +```sql +INSERT INTO <table>
+SELECT <columns> FROM <source>; +``` + +Examples: + +```sql +-- Insert into a Hudi table +INSERT INTO hudi_table SELECT 1, 'a1', 20; +``` + +If the `write.operation` is 'upsert', the INSERT INTO statement will not only insert new records but also update existing rows with the same record key. + +```sql +-- Insert into a Hudi table in upsert mode +INSERT INTO hudi_table/*+ OPTIONS('write.operation'='upsert')*/ SELECT 1, 'a1', 20; +``` + +### Update +With Flink SQL, you can use the `UPDATE` command to update a Hudi table. Here are a few illustrative examples: + +```sql +UPDATE tableIdentifier SET column = EXPRESSION(,column = EXPRESSION) [ WHERE boolExpression] +``` + +```sql +UPDATE hudi_table SET price = price * 2, ts = 1111 WHERE id = 1; +``` + +:::note Key requirements +Update queries only work in batch execution mode. +::: + +### Delete From +With Flink SQL, you can use the `DELETE` command to delete rows from a Hudi table. Here are a few illustrative examples: + +```sql +DELETE FROM tableIdentifier [ WHERE boolExpression ] +``` + +```sql +DELETE FROM hudi_table WHERE price < 100; +``` + + +```sql +DELETE FROM hudi_table WHERE price < 100; +``` + +:::note Key requirements +Delete queries only work in batch execution mode. +::: + +### Lookup Joins + +A lookup join is typically used to enrich a table with data that is queried from an external system. The join requires +one table to have a processing time attribute and the other table to be backed by a lookup source connector.
+ +```sql +CREATE TABLE datagen_source( + id int, + name STRING, + proctime as PROCTIME() +) WITH ( +'connector' = 'datagen', +'rows-per-second'='1', +'number-of-rows' = '2', +'fields.id.kind'='sequence', +'fields.id.start'='1', +'fields.id.end'='2' +); + +SELECT o.id,o.name,b.id as id2 +FROM datagen_source AS o +JOIN hudi_table/*+ OPTIONS('lookup.join.cache.ttl'= '2 day') */ FOR SYSTEM_TIME AS OF o.proctime AS b on o.id = b.id; +``` + +### Setting Writer/Reader Configs +With Flink SQL, you can additionally set the writer/reader configs along with the query. + +```sql +INSERT INTO hudi_table/*+ OPTIONS('${hoodie.config.key1}'='${hoodie.config.value1}')*/ +``` + +```sql +INSERT INTO hudi_table/*+ OPTIONS('hoodie.keep.max.commits'='true')*/ +``` + +## Flink SQL In Action + +The hudi-flink module defines the Flink SQL connector for both hudi source and sink. +There are a number of options available for the sink table: + +| Option Name | Required | Default | Remarks | +| ----------- | ------- | ------- | ------- | +| path | Y | N/A | Base path for the target hoodie table. The path would be created if it does not exist, otherwise a hudi table expects to be initialized successfully | +| table.type | N | COPY_ON_WRITE | Type of table to write. COPY_ON_WRITE (or) MERGE_ON_READ | +| write.operation | N | upsert | The write operation, that this write should do (insert or upsert is supported) | +| write.precombine.field | N | ts | Field used in preCombining before actual write. When two records have the same key value, we will pick the one with the largest value for the precombine field, determined by Object.compareTo(..) | +| write.payload.class | N | OverwriteWithLatestAvroPayload.class | Payload class used. Override this, if you like to roll your own merge logic, when upserting/inserting. This will render any value set for the option ineffective | +| write.insert.drop.duplicates | N | false | Flag to indicate whether to drop duplicates upon insert.
By default insert will accept duplicates, to gain extra performance | +| write.ignore.failed | N | true | Flag to indicate whether to ignore any non-exception error (e.g. writestatus error) within a checkpoint batch. By default true (in favor of streaming progressing over data integrity) | +| hoodie.datasource.write.recordkey.field | N | uuid | Record key field. Value to be used as the `recordKey` component of `HoodieKey`. Actual value will be obtained by invoking .toString() on the field value. Nested fields can be specified using the dot notation eg: `a.b.c` | +| hoodie.datasource.write.keygenerator.class | N | SimpleAvroKeyGenerator.class | Key generator class that extracts the key out of the incoming record | +| write.tasks | N | 4 | Parallelism of tasks that do actual write, default is 4 | +| write.batch.size.MB | N | 128 | Batch buffer size in MB to flush data into the underneath filesystem | + +If the table type is MERGE_ON_READ, you can also specify the asynchronous compaction strategy through options: + +| Option Name | Required | Default | Remarks | +| ----------- | ------- | ------- | ------- | +| compaction.async.enabled | N | true | Async Compaction, enabled by default for MOR | +| compaction.trigger.strategy | N | num_commits | Strategy to trigger compaction, options are 'num_commits': trigger compaction when N delta commits are reached; 'time_elapsed': trigger compaction when time elapsed > N seconds since last compaction; 'num_and_time': trigger compaction when both NUM_COMMITS and TIME_ELAPSED are satisfied; 'num_or_time': trigger compaction when NUM_COMMITS or TIME_ELAPSED is satisfied. Default is 'num_commits' | +| compaction.delta_commits | N | 5 | Max delta commits needed to trigger compaction, default 5 commits | +| compaction.delta_seconds | N | 3600 | Max delta seconds time needed to trigger compaction, default 1 hour | + +You can write the data using the SQL `INSERT INTO` statements: +```sql +INSERT INTO hudi_table select ...
from ...; +``` + +**Note**: INSERT OVERWRITE is not supported yet but already on the roadmap. + + +### Non-Blocking Concurrency Control (Experimental) + +Hudi Flink supports a new non-blocking concurrency control mode, where multiple writer tasks can be executed +concurrently without blocking each other. One can read more about this mode in +the [concurrency control](concurrency_control#model-c-multi-writer) docs. Let us see it in action here. + +In the below example, we have two streaming ingestion pipelines that concurrently update the same table. One of the +pipeline is responsible for the compaction and cleaning table services, while the other pipeline is just for data +ingestion. + +In order to commit the dataset, the checkpoint needs to be enabled, here is an example configuration for a flink-conf.yaml: +```yaml +-- set the interval as 30 seconds +execution.checkpointing.interval: 30000 +state.backend: rocksdb +``` + +```sql +-- This is a datagen source that can generate records continuously +CREATE TABLE sourceT ( + uuid varchar(20), + name varchar(10), + age int, + ts timestamp(3), + `partition` as 'par1' +) WITH ( + 'connector' = 'datagen', + 'rows-per-second' = '200' +); + +-- pipeline1: by default enable the compaction and cleaning services +CREATE TABLE t1( + uuid varchar(20), + name varchar(10), + age int, + ts timestamp(3), + `partition` varchar(20) +) WITH ( + 'connector' = 'hudi', + 'path' = '${work_path}/hudi-demo/t1', + 'table.type' = 'MERGE_ON_READ', + 'index.type' = 'BUCKET', + 'hoodie.write.concurrency.mode' = 'NON_BLOCKING_CONCURRENCY_CONTROL', + 'write.tasks' = '2' +); + +-- pipeline2: disable the compaction and cleaning services manually +CREATE TABLE t1_2( + uuid varchar(20), + name varchar(10), + age int, + ts timestamp(3), + `partition` varchar(20) +) WITH ( + 'connector' = 'hudi', + 'path' = '${work_path}/hudi-demo/t1', + 'table.type' = 'MERGE_ON_READ', + 'index.type' = 'BUCKET', + 'hoodie.write.concurrency.mode' = 
'NON_BLOCKING_CONCURRENCY_CONTROL', + 'write.tasks' = '2', + 'compaction.schedule.enabled' = 'false', + 'compaction.async.enabled' = 'false', + 'clean.async.enabled' = 'false' +); + +-- submit the pipelines +insert into t1 select * from sourceT; +insert into t1_2 select * from sourceT; + +select * from t1 limit 20; +``` + +As you can see from the above example, we have two pipelines with multiple tasks that concurrently write to the +same table. To use the new concurrency mode, all you need to do is set the `hoodie.write.concurrency.mode` +to `NON_BLOCKING_CONCURRENCY_CONTROL`. The `write.tasks` option is used to specify the number of write tasks that will +be used for writing to the table. The `compaction.schedule.enabled`, `compaction.async.enabled` +and `clean.async.enabled` options are used to disable the compaction and cleaning services for the second pipeline. +This is done to ensure that the compaction and cleaning services are not executed twice for the same table. + + +### Consistent hashing index (Experimental) + +We have introduced the Consistent Hashing Index since [0.13.0 release](/releases/release-0.13.0#consistent-hashing-index). In comparison to the static hashing index ([Bucket Index](/releases/release-0.11.0#bucket-index)), the consistent hashing index offers dynamic scalability of data buckets for the writer. +You can find the [RFC](https://github.com/apache/hudi/blob/master/rfc/rfc-42/rfc-42.md) for the design of this feature. +In the 0.13.X release, the Consistent Hashing Index is supported only for Spark engine. And since [release 0.14.0](/releases/release-0.14.0#consistent-hashing-index-support), the index is supported for Flink engine. + +To utilize this feature, configure the option `index.type` as `BUCKET` and set `hoodie.index.bucket.engine` to `CONSISTENT_HASHING`. +When enabling the consistent hashing index, it's important to enable clustering scheduling within the writer. 
During this process, the writer will perform dual writes for both the old and new data buckets while the clustering is pending. Although the dual write does not impact correctness, it is strongly recommended to execute clustering as quickly as possible. + +In the below example, we will create a datagen source and do streaming ingestion into Hudi table with consistent bucket index. In order to commit the dataset, the checkpoint needs to be enabled, here is an example configuration for a flink-conf.yaml: +```yaml +-- set the interval as 30 seconds +execution.checkpointing.interval: 30000 +state.backend: rocksdb +``` + +```sql +-- This is a datagen source that can generate records continuously +CREATE TABLE sourceT ( + uuid varchar(20), + name varchar(10), + age int, + ts timestamp(3), + `partition` as 'par1' +) WITH ( + 'connector' = 'datagen', + 'rows-per-second' = '200' +); + +-- Create the hudi table with consistent bucket index +CREATE TABLE t1( + uuid VARCHAR(20) PRIMARY KEY NOT ENFORCED, + name VARCHAR(10), + age INT, + ts TIMESTAMP(3), + `partition` VARCHAR(20) +) +PARTITIONED BY (`partition`) +WITH ( + 'connector'='hudi', + 'path' = '${work_path}/hudi-demo/hudiT', + 'table.type' = 'MERGE_ON_READ', + 'index.type' = 'BUCKET', + 'clustering.schedule.enabled'='true', + 'hoodie.index.bucket.engine'='CONSISTENT_HASHING', + 'hoodie.clustering.plan.strategy.class'='org.apache.hudi.client.clustering.plan.strategy.FlinkConsistentBucketClusteringPlanStrategy', + 'hoodie.clustering.execution.strategy.class'='org.apache.hudi.client.clustering.run.strategy.SparkConsistentBucketClusteringExecutionStrategy', + 'hoodie.bucket.index.num.buckets'='8', + 'hoodie.bucket.index.max.num.buckets'='128', + 'hoodie.bucket.index.min.num.buckets'='8', + 'hoodie.bucket.index.split.threshold'='1.5', + 'write.tasks'='2' +); + +-- submit the pipelines +insert into t1 select * from sourceT; + +select * from t1 limit 20; +``` + +:::caution +Consistent Hashing Index is supported for Flink 
engine since [release 0.14.0](/releases/release-0.14.0#consistent-hashing-index-support) and currently there are some limitations to use it as of 0.14.0: + +- This index is supported only for MOR tables. This limitation also applies when using the Spark engine. +- It does not work with metadata table enabled. This limitation also applies when using the Spark engine. +- Consistent hashing index does not work with bulk-insert using Flink engine yet; please use simple bucket index or Spark engine for bulk-insert pipelines. +- The resize plan generated by the Flink engine only supports merging small file groups; file splitting is not supported yet. +- The resize plan should be executed through an offline Spark job. The Flink engine does not support executing the resize plan yet. + ::: \ No newline at end of file diff --git a/website/versioned_docs/version-1.0.0/sql_queries.md b/website/versioned_docs/version-1.0.0/sql_queries.md new file mode 100644 index 0000000000000..5e7e3a45089cc --- /dev/null +++ b/website/versioned_docs/version-1.0.0/sql_queries.md @@ -0,0 +1,714 @@ +--- +title: SQL Queries +summary: "In this page, we go over querying Hudi tables using SQL" +toc: true +last_modified_at: +--- +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + + +Hudi stores and organizes data on storage while providing different ways of [querying](/docs/concepts#query-types), across a wide range of query engines. +This page will show how to issue different queries and discuss any specific instructions for each query engine. + +## Spark SQL +The Spark [quickstart](/docs/quick-start-guide) provides a good overview of how to use Spark SQL to query Hudi tables. This section will go into more advanced configurations and functionalities. + +### Snapshot Query +Snapshot queries are the most common query type for Hudi tables. Spark SQL supports snapshot queries on both COPY_ON_WRITE and MERGE_ON_READ tables.
+Using session properties, you can specify options around indexing to optimize query performance, as shown below. + +```sql +-- You can turn on relevant options for indexing. + +-- Turn on use of column stat index, to perform range queries. +SET hoodie.metadata.column.stats.enable=true; +SELECT * FROM hudi_table +WHERE price > 1.0 and price < 10.0 + +-- Turn on use of record level index, to perform point queries. +SET hoodie.metadata.record.index.enable=true; +SELECT * FROM hudi_table +WHERE uuid = 'c8abbe79-8d89-47ea-b4ce-4d224bae5bfa' +``` + +:::note Integration with Spark +Users are encouraged to migrate to Hudi versions > 0.12.x for the best Spark experience, and are discouraged from using any older approaches +using path filters. We expect that native integration with Spark's optimized table readers along with Hudi's automatic table +management will yield great performance benefits in those versions. +::: + +### Snapshot Query with Index Acceleration + +In this section we will go over the various indexes and how they help in data skipping in Hudi. We will first create +a hudi table without any index.
+ +```sql +-- Create a table with primary key +CREATE TABLE hudi_indexed_table ( + ts BIGINT, + uuid STRING, + rider STRING, + driver STRING, + fare DOUBLE, + city STRING +) USING HUDI +options( + primaryKey ='uuid', + hoodie.write.record.merge.mode = "COMMIT_TIME_ORDERING" +) +PARTITIONED BY (city); + +INSERT INTO hudi_indexed_table +VALUES +(1695159649,'334e26e9-8355-45cc-97c6-c31daf0df330','rider-A','driver-K',19.10,'san_francisco'), +(1695091554,'e96c4396-3fad-413a-a942-4cb36106d721','rider-C','driver-M',27.70 ,'san_francisco'), +(1695046462,'9909a8b1-2d15-4d3d-8ec9-efc48c536a00','rider-D','driver-L',33.90 ,'san_francisco'), +(1695332066,'1dced545-862b-4ceb-8b43-d2a568f6616b','rider-E','driver-O',93.50,'san_francisco'), +(1695516137,'e3cf430c-889d-4015-bc98-59bdce1e530c','rider-F','driver-P',34.15,'sao_paulo' ), +(1695376420,'7a84095f-737f-40bc-b62f-6b69664712d2','rider-G','driver-Q',43.40 ,'sao_paulo' ), +(1695173887,'3eeb61f7-c2b0-4636-99bd-5d7a5a1d2c04','rider-I','driver-S',41.06 ,'chennai' ), +(1695115999,'c8abbe79-8d89-47ea-b4ce-4d224bae5bfa','rider-J','driver-T',17.85,'chennai'); +UPDATE hudi_indexed_table SET rider = 'rider-B', driver = 'driver-N', ts = '1697516137' WHERE rider = 'rider-A'; +``` + +With the query run below, we will see no data skipping or pruning since there is no index created yet in the table as can +be seen in the image below. All the files are scanned in the table to fetch the data. Let's create a secondary index on the rider column. + +```sql +SHOW INDEXES FROM hudi_indexed_table; +SELECT * FROM hudi_indexed_table WHERE rider = 'rider-B'; +``` + +![Secondary Index Without Pruning Image](/assets/images/secondary-index-without-pruning.png) +

Figure: Query pruning without secondary index

+ +### Query using Secondary Index + +We will run the query again after creating secondary index on rider column. The query would now +show the files scanned as 1 compared to 3 files scanned without index. + +:::note +Please note in order to create secondary index: +1. The table must have a primary key and merge mode should be [COMMIT_TIME_ORDERING](/docs/next/record_merger#commit_time_ordering). +2. Record index must be enabled. This can be done by setting `hoodie.metadata.record.index.enable=true` and then creating `record_index`. Please note the example below. +::: + +```sql +-- We will first create a record index since secondary index is dependent upon it +CREATE INDEX record_index ON hudi_indexed_table (uuid); +-- We create a secondary index on rider column +CREATE INDEX idx_rider ON hudi_indexed_table (rider); +-- We run the same query again +SELECT * FROM hudi_indexed_table WHERE rider = 'rider-B'; +DROP INDEX record_index on hudi_indexed_table; +DROP INDEX secondary_index_idx_rider on hudi_indexed_table; +``` + +![Secondary Index With Pruning Image](/assets/images/secondary-index-with-pruning.png) +

Figure: Query pruning with secondary index

+ +### Query using Bloom Filter Expression Index + +With the query run below, we will see no data skipping or pruning since there is no index created yet on the `driver` column. +All the files are scanned in the table to fetch the data. + +```sql +SHOW INDEXES FROM hudi_indexed_table; +SELECT * FROM hudi_indexed_table WHERE driver = 'driver-N'; +``` + +![Bloom Filter Expression Index Without Pruning Image](/assets/images/bloom-filter-expression-index-without-pruning.png) +

Figure: Query pruning without bloom filter expression index

+ +We will run the query again after creating a bloom filter expression index on the driver column. The query would now +show the files scanned as 1 compared to 3 files scanned without index. + +```sql +-- We create a bloom filter expression index on driver column +CREATE INDEX idx_bloom_driver ON hudi_indexed_table USING bloom_filters(driver) OPTIONS(expr='identity'); +-- We run the same query again +SELECT * FROM hudi_indexed_table WHERE driver = 'driver-N'; +DROP INDEX expr_index_idx_bloom_driver on hudi_indexed_table; +``` + +![Bloom Filter Expression Index With Pruning Image](/assets/images/bloom-filter-expression-index-with-pruning.png) +

Figure: Query pruning with bloom filter expression index

+ +### Query using Column Stats Expression Index + +With the query run below, we will see no data skipping or pruning since there is no index created yet in the table as can +be seen in the image below. All the files are scanned in the table to fetch the data. + +```sql +SHOW INDEXES FROM hudi_indexed_table; +SELECT uuid, rider FROM hudi_indexed_table WHERE from_unixtime(ts, 'yyyy-MM-dd') = '2023-10-17'; +``` + +![Column Stats Expression Index Without Pruning Image](/assets/images/column-stat-expression-index-without-pruning.png) +

Figure: Query pruning without column stat expression index

+ +We will run the query again after creating column stat expression index on ts column. The query would now +show the files scanned as 1 compared to 3 files scanned without index. + +```sql +-- We create a column stat expression index on ts column +CREATE INDEX idx_column_ts ON hudi_indexed_table USING column_stats(ts) OPTIONS(expr='from_unixtime', format = 'yyyy-MM-dd'); +-- We run the same query again +SELECT uuid, rider FROM hudi_indexed_table WHERE from_unixtime(ts, 'yyyy-MM-dd') = '2023-10-17'; +DROP INDEX expr_index_idx_column_ts on hudi_indexed_table; +``` + +![Column Stats Expression Index With Pruning Image](/assets/images/column-stat-expression-index-with-pruning.png) +

Figure: Query pruning with column stat expression index

+ +### Query using Partition Stats Index + +With the query run below, we will see no data skipping or pruning since there is no partition stats index created yet in the table as can +be seen in the image below. All the partitions are scanned in the table to fetch the data. + +```sql +SHOW INDEXES FROM hudi_indexed_table; +SELECT * FROM hudi_indexed_table WHERE rider >= 'rider-H'; +``` + +![Partition Stats Index Without Pruning Image](/assets/images/partition-stat-index-without-pruning.png) +

Figure: Query pruning without partition stats index

+ +We will run the query again after creating partition stats index. The query would now show the partitions scanned as 1 +compared to 3 partitions scanned without index. + +```sql +-- We will need to enable column stats as well since partition stats index leverages it +SET hoodie.metadata.index.partition.stats.enable=true; +SET hoodie.metadata.index.column.stats.enable=true; +INSERT INTO hudi_indexed_table +VALUES +(1695159649,'854g46e0-8355-45cc-97c6-c31daf0df330','rider-H','driver-T',19.10,'chennai'); +-- Run the query again on the table with partition stats index +SELECT * FROM hudi_indexed_table WHERE rider >= 'rider-H'; +DROP INDEX column_stats on hudi_indexed_table; +DROP INDEX partition_stats on hudi_indexed_table; +``` + +![Partition Stats Index With Pruning Image](/assets/images/partition-stat-index-with-pruning.png) +

Figure: Query pruning with partition stats index

+ +### Snapshot Query with Event Time Ordering + +Hudi supports different [record merge modes](record_merger) for merging the records from the same key. Event +time ordering is one of the merge modes where the records are merged based on the event time. Let's create a table with +event time ordering merge mode. + +```sql +CREATE TABLE IF NOT EXISTS hudi_table_merge_mode ( + id INT, + name STRING, + ts LONG, + price DOUBLE +) USING hudi +TBLPROPERTIES ( + type = 'mor', + primaryKey = 'id', + precombineField = 'ts', + recordMergeMode = 'EVENT_TIME_ORDERING' +) +LOCATION 'file:///tmp/hudi_table_merge_mode/'; + +-- insert a record +INSERT INTO hudi_table_merge_mode VALUES (1, 'a1', 1000, 10.0); + +-- another record with the same key but lower ts +INSERT INTO hudi_table_merge_mode VALUES (1, 'a1', 900, 20.0); + +-- query the table, result should be id=1, name=a1, ts=1000, price=10.0 +SELECT id, name, ts, price FROM hudi_table_merge_mode; +``` + +With `EVENT_TIME_ORDERING`, the record with the larger event time (`precombineField`) overwrites the record with the +smaller event time on the same key, regardless of transaction time. + +### Snapshot Query with Custom Merge Mode + +Users can set `CUSTOM` mode to provide their own merge logic. With `CUSTOM` merge mode, you also need to provide your +payload class that implements the merge logic. For example, you can use `PartialUpdateAvroPayload` to merge the records +as below. 
+ +```sql +CREATE TABLE IF NOT EXISTS hudi_table_merge_mode_custom ( + id INT, + name STRING, + ts LONG, + price DOUBLE +) USING hudi +TBLPROPERTIES ( + type = 'mor', + primaryKey = 'id', + precombineField = 'ts', + recordMergeMode = 'CUSTOM', + 'hoodie.datasource.write.payload.class' = 'org.apache.hudi.common.model.PartialUpdateAvroPayload' +) +LOCATION 'file:///tmp/hudi_table_merge_mode_custom/'; + +-- insert a record +INSERT INTO hudi_table_merge_mode_custom VALUES (1, 'a1', 1000, 10.0); + +-- another record with the same key but set higher ts and name as null to show partial update +INSERT INTO hudi_table_merge_mode_custom VALUES (1, null, 2000, 20.0); + +-- query the table, result should be id=1, name=a1, ts=2000, price=20.0 +SELECT id, name, ts, price FROM hudi_table_merge_mode_custom; +``` + +As you can see, the record with the higher ordering value not only overwrites the record with the lower ordering value, +but the update is also applied partially: the null `name` in the second insert does not replace the existing value `a1`. + +### Time Travel Query + +You can also query the table at a specific commit time using the `AS OF` syntax. This is useful for debugging and auditing purposes, as well as for +machine learning pipelines where you want to train models on data as of a specific point in time. + +```sql +SELECT * FROM
+TIMESTAMP AS OF '' +WHERE +``` + +### Change Data Capture + +Change Data Capture (CDC) queries are useful when you want to obtain all changes to a Hudi table within a given time window, along with before/after images and change operation +of the changed records. Similar to many relational database counterparts, Hudi provides flexible ways of controlling supplemental logging levels, to balance storage/logging costs +by materializing more versus compute costs of computing the changes on the fly, using `hoodie.table.cdc.supplemental.logging.mode` configuration. + +```sql +-- Supported through the hudi_table_changes TVF +SELECT * +FROM hudi_table_changes( + , + 'cdc', + <'earliest' |