Skip to content

Commit a4db501

Browse files
author
infvg
committed
Added iceberg write configs
# Conflicts: # docs/velox-configuration.md
1 parent 30e6874 commit a4db501

4 files changed

Lines changed: 27 additions & 2 deletions

File tree

backends-velox/src/main/scala/org/apache/gluten/config/VeloxConfig.scala

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -755,4 +755,16 @@ object VeloxConfig extends ConfigRegistry {
755755
.doc("Maps table field names to file field names using names, not indices for Parquet files.")
756756
.booleanConf
757757
.createWithDefault(true)
758+
759+
val PARQUET_PAGE_SIZE_BYTES =
760+
buildConf("spark.gluten.sql.columnar.backend.velox.parquet.page-size-bytes")
761+
.doc("Page size in bytes for Parquet write operations.")
762+
.bytesConf(ByteUnit.BYTE)
763+
.createWithDefaultString("1MB")
764+
765+
val VELOX_TARGET_FILE_SIZE =
766+
buildConf("spark.gluten.sql.columnar.backend.velox.target-file-size-bytes")
767+
.doc("Target file size in bytes for write operations.")
768+
.bytesConf(ByteUnit.BYTE)
769+
.createWithDefaultString("0")
758770
}

cpp/velox/config/VeloxConfig.h

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -166,10 +166,16 @@ const std::string kMemoryPoolCapacityTransferAcrossTasks =
166166
const std::string kOrcUseColumnNames = "spark.gluten.sql.columnar.backend.velox.orcUseColumnNames";
167167
const std::string kParquetUseColumnNames = "spark.gluten.sql.columnar.backend.velox.parquetUseColumnNames";
168168

169-
// write fies
169+
// write files
170170
const std::string kMaxPartitions = "spark.gluten.sql.columnar.backend.velox.maxPartitionsPerWritersSession";
171171
const std::string kMaxTargetFileSize = "spark.gluten.sql.columnar.backend.velox.maxTargetFileSize";
172172

173+
// Iceberg write configs
174+
const std::string kWriteTargetFileSizeBytes =
175+
"spark.gluten.sql.columnar.backend.velox.target-file-size-bytes";
176+
const std::string kWriteParquetPageSizeBytes =
177+
"spark.gluten.sql.columnar.backend.velox.parquet.page-size-bytes";
178+
173179
const std::string kGlogVerboseLevel = "spark.gluten.sql.columnar.backend.velox.glogVerboseLevel";
174180
const uint32_t kGlogVerboseLevelDefault = 0;
175181
const uint32_t kGlogVerboseLevelMaximum = 99;

cpp/velox/utils/ConfigExtractor.cc

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,11 @@ std::shared_ptr<facebook::velox::config::ConfigBase> createHiveConnectorSessionC
242242
configs[facebook::velox::connector::hive::HiveConfig::kOrcUseColumnNamesSession] =
243243
conf->get<bool>(kOrcUseColumnNames, true) ? "true" : "false";
244244

245+
configs[facebook::velox::connector::hive::HiveConfig::kMaxTargetFileSize] =
246+
conf->get<std::string>(kWriteTargetFileSizeBytes, "0");
247+
configs[facebook::velox::parquet::WriterOptions::kParquetSessionWritePageSize] =
248+
conf->get<std::string>(kWriteParquetPageSizeBytes, "1MB");
249+
245250
overwriteVeloxConf(conf.get(), configs, kDynamicBackendConfPrefix);
246251
return std::make_shared<facebook::velox::config::ConfigBase>(std::move(configs));
247252
}

docs/velox-configuration.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,13 +47,14 @@ nav_order: 16
4747
| spark.gluten.sql.columnar.backend.velox.maxSpillFileSize | 1GB | The maximum size of a single spill file created |
4848
| spark.gluten.sql.columnar.backend.velox.maxSpillLevel | 4 | The max allowed spilling level with zero being the initial spilling level |
4949
| spark.gluten.sql.columnar.backend.velox.maxSpillRunRows | 3M | The maximum row size of a single spill run |
50-
| spark.gluten.sql.columnar.backend.velox.maxTargetFileSize | 0b | The target file size for each output file when writing data. 0 means no limit on target file size, and the actual file size will be determined by other factors such as max partition number and shuffle batch size. |
50+
| spark.gluten.sql.columnar.backend.velox.maxTargetFileSize | 0b | The target file size for each output file when writing data. 0 means no limit on target file size, and the actual file size will be determined by other factors such as max partition number and shuffle batch size. |
5151
| spark.gluten.sql.columnar.backend.velox.memCacheSize | 1GB | The memory cache size |
5252
| spark.gluten.sql.columnar.backend.velox.memInitCapacity | 8MB | The initial memory capacity to reserve for a newly created Velox query memory pool. |
5353
| spark.gluten.sql.columnar.backend.velox.memoryPoolCapacityTransferAcrossTasks | true | Whether to allow memory capacity transfer between memory pools from different tasks. |
5454
| spark.gluten.sql.columnar.backend.velox.memoryUseHugePages | false | Use explicit huge pages for Velox memory allocation. |
5555
| spark.gluten.sql.columnar.backend.velox.orc.scan.enabled | true | Enable velox orc scan. If disabled, vanilla spark orc scan will be used. |
5656
| spark.gluten.sql.columnar.backend.velox.orcUseColumnNames | true | Maps table field names to file field names using names, not indices for ORC files. |
57+
| spark.gluten.sql.columnar.backend.velox.parquet.page-size-bytes | 1MB | Page size in bytes for Parquet write operations. |
5758
| spark.gluten.sql.columnar.backend.velox.parquetUseColumnNames | true | Maps table field names to file field names using names, not indices for Parquet files. |
5859
| spark.gluten.sql.columnar.backend.velox.prefetchRowGroups | 1 | Set the prefetch row groups for velox file scan |
5960
| spark.gluten.sql.columnar.backend.velox.queryTraceEnabled | false | Enable query tracing flag. |
@@ -74,6 +75,7 @@ nav_order: 16
7475
| spark.gluten.sql.columnar.backend.velox.ssdChecksumReadVerificationEnabled | false | If true, checksum read verification from SSD is enabled. |
7576
| spark.gluten.sql.columnar.backend.velox.ssdDisableFileCow | false | True if copy on write should be disabled. |
7677
| spark.gluten.sql.columnar.backend.velox.ssdODirect | false | The O_DIRECT flag for cache writing |
78+
| spark.gluten.sql.columnar.backend.velox.target-file-size-bytes | 0 | Target file size in bytes for write operations. |
7779
| spark.gluten.sql.columnar.backend.velox.valueStream.dynamicFilter.enabled | false | Whether to apply dynamic filters pushed down from hash probe in the ValueStream (shuffle reader) operator to filter rows before they reach the hash join. |
7880
| spark.gluten.sql.enable.enhancedFeatures | true | Enable some features including iceberg native write and other features. |
7981
| spark.gluten.sql.rewrite.castArrayToString | true | When true, rewrite `cast(array as String)` to `concat('[', array_join(array, ', ', null), ']')` to allow offloading to Velox. |

0 commit comments

Comments
 (0)