Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[GLUTEN-8479][CORE][Part-1] Remove unnecessary config #8480

Merged
merged 2 commits into from
Jan 15, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,6 @@ object RunTPCHTest {
.config("spark.databricks.delta.snapshotPartitions", 1)
.config("spark.databricks.delta.properties.defaults.checkpointInterval", 5)
.config("spark.databricks.delta.stalenessLimit", 3600 * 1000)
.config("spark.gluten.sql.columnar.columnarToRow", columnarColumnToRow)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: Could remove the variable definition if unused?

Copy link
Contributor

@jackylee-ch jackylee-ch Jan 15, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

They have been removed in this PR; see the GlutenConfig change.

.config(ClickHouseConfig.CLICKHOUSE_WORKER_ID, "1")
.config(GlutenConfig.GLUTEN_LIB_PATH, libPath)
.config("spark.gluten.sql.columnar.iterator", "true")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@ class GlutenClickHouseJoinSuite extends GlutenClickHouseWholeStageTransformerSui
.set("spark.sql.shuffle.partitions", "5")
.set("spark.sql.adaptive.enabled", "false")
.set("spark.sql.files.minPartitionNum", "1")
.set("spark.gluten.sql.columnar.columnartorow", "true")
.set(ClickHouseConfig.CLICKHOUSE_WORKER_ID, "1")
.set(GlutenConfig.GLUTEN_LIB_PATH, UTSystemParameters.clickHouseLibPath)
.set("spark.gluten.sql.columnar.iterator", "true")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,6 @@ class GlutenClickHouseSyntheticDataSuite
.set("spark.databricks.delta.snapshotPartitions", "1")
.set("spark.databricks.delta.properties.defaults.checkpointInterval", "5")
.set("spark.databricks.delta.stalenessLimit", "3600000")
.set("spark.gluten.sql.columnar.columnarToRow", "true")
.set(ClickHouseConfig.CLICKHOUSE_WORKER_ID, "1")
.set(GlutenConfig.GLUTEN_LIB_PATH, UTSystemParameters.clickHouseLibPath)
.set("spark.gluten.sql.columnar.iterator", "true")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,6 @@ abstract class GlutenClickHouseTPCDSAbstractSuite
.set("spark.databricks.delta.snapshotPartitions", "1")
.set("spark.databricks.delta.properties.defaults.checkpointInterval", "5")
.set("spark.databricks.delta.stalenessLimit", "3600000")
.set("spark.gluten.sql.columnar.columnarToRow", "true")
.set(ClickHouseConfig.CLICKHOUSE_WORKER_ID, "1")
.set(GlutenConfig.GLUTEN_LIB_PATH, UTSystemParameters.clickHouseLibPath)
.set("spark.gluten.sql.columnar.iterator", "true")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -567,7 +567,6 @@ abstract class GlutenClickHouseTPCHAbstractSuite
.set("spark.databricks.delta.properties.defaults.checkpointInterval", "5")
.set("spark.databricks.delta.stalenessLimit", "3600000")
.set("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
.set("spark.gluten.sql.columnar.columnarToRow", "true")
.set(ClickHouseConfig.CLICKHOUSE_WORKER_ID, "1")
.set(GlutenConfig.GLUTEN_LIB_PATH, UTSystemParameters.clickHouseLibPath)
.set("spark.gluten.sql.columnar.iterator", "true")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,6 @@ class GlutenFunctionValidateSuite extends GlutenClickHouseWholeStageTransformerS
.set("spark.databricks.delta.snapshotPartitions", "1")
.set("spark.databricks.delta.properties.defaults.checkpointInterval", "5")
.set("spark.databricks.delta.stalenessLimit", "3600000")
.set("spark.gluten.sql.columnar.columnartorow", "true")
.set(ClickHouseConfig.CLICKHOUSE_WORKER_ID, "1")
.set(GlutenConfig.GLUTEN_LIB_PATH, UTSystemParameters.clickHouseLibPath)
.set("spark.gluten.sql.columnar.iterator", "true")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,6 @@ class GlutenClickhouseFunctionSuite extends GlutenClickHouseTPCHAbstractSuite {
.set("spark.databricks.delta.snapshotPartitions", "1")
.set("spark.databricks.delta.properties.defaults.checkpointInterval", "5")
.set("spark.databricks.delta.stalenessLimit", "3600000")
.set("spark.gluten.sql.columnar.columnartorow", "true")
.set(ClickHouseConfig.CLICKHOUSE_WORKER_ID, "1")
.set(GlutenConfig.GLUTEN_LIB_PATH, UTSystemParameters.clickHouseLibPath)
.set("spark.gluten.sql.columnar.iterator", "true")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,6 @@ class GlutenClickHouseHiveTableSuite
.set("spark.sql.shuffle.partitions", "5")
.set("spark.sql.adaptive.enabled", "false")
.set("spark.sql.files.minPartitionNum", "1")
.set("spark.gluten.sql.columnar.columnartorow", "true")
.set(ClickHouseConfig.CLICKHOUSE_WORKER_ID, "1")
.set(GlutenConfig.GLUTEN_LIB_PATH, UTSystemParameters.clickHouseLibPath)
.set("spark.gluten.sql.columnar.iterator", "true")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,6 @@ class GlutenClickHouseNativeWriteTableSuite
.set("spark.databricks.delta.snapshotPartitions", "1")
.set("spark.databricks.delta.properties.defaults.checkpointInterval", "5")
.set("spark.databricks.delta.stalenessLimit", "3600000")
.set("spark.gluten.sql.columnar.columnartorow", "true")
.set(ClickHouseConfig.CLICKHOUSE_WORKER_ID, "1")
.set(GlutenConfig.GLUTEN_LIB_PATH, UTSystemParameters.clickHouseLibPath)
.set("spark.gluten.sql.columnar.iterator", "true")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@ trait CHSqlBasedBenchmark extends SqlBasedBenchmark {
.set("spark.databricks.delta.snapshotPartitions", "1")
.set("spark.databricks.delta.properties.defaults.checkpointInterval", "5")
.set("spark.databricks.delta.stalenessLimit", "3600000")
.set("spark.gluten.sql.columnar.columnarToRow", "true")
.set("spark.gluten.sql.enable.native.validation", "false")
.set("spark.sql.adaptive.enabled", "false")
.setIfMissing("spark.memory.offHeap.size", offheapSize)
Expand Down
4 changes: 0 additions & 4 deletions docs/get-started/ClickHouse.md
Original file line number Diff line number Diff line change
Expand Up @@ -241,7 +241,6 @@ cd spark-3.2.2-bin-hadoop2.7
--conf spark.memory.offHeap.enabled=true \
--conf spark.memory.offHeap.size=6442450944 \
--conf spark.plugins=org.apache.gluten.GlutenPlugin \
--conf spark.gluten.sql.columnar.columnarToRow=true \
--conf spark.executorEnv.LD_PRELOAD=/path_to_clickhouse_library/libch.so\
--conf spark.gluten.sql.columnar.libpath=/path_to_clickhouse_library/libch.so \
--conf spark.gluten.sql.columnar.iterator=true \
Expand Down Expand Up @@ -422,7 +421,6 @@ cd spark-3.2.2-bin-hadoop2.7
--conf spark.memory.offHeap.enabled=true \
--conf spark.memory.offHeap.size=6442450944 \
--conf spark.plugins=org.apache.gluten.GlutenPlugin \
--conf spark.gluten.sql.columnar.columnarToRow=true \
--conf spark.executorEnv.LD_PRELOAD=/path_to_clickhouse_library/libch.so\
--conf spark.gluten.sql.columnar.libpath=/path_to_clickhouse_library/libch.so \
--conf spark.gluten.sql.columnar.iterator=true \
Expand Down Expand Up @@ -494,7 +492,6 @@ $spark_cmd \
--conf spark.sql.shuffle.partitions=112 \
--conf spark.sql.sources.useV1SourceList=avro \
--conf spark.sql.files.maxPartitionBytes=1073741824 \
--conf spark.gluten.sql.columnar.columnartorow=true \
--conf spark.gluten.sql.columnar.loadnative=true \
--conf spark.gluten.sql.columnar.libpath=$ch_lib \
--conf spark.gluten.sql.columnar.iterator=true \
Expand Down Expand Up @@ -614,7 +611,6 @@ cd spark-3.2.2-bin-hadoop2.7
--conf spark.serializer=org.apache.spark.serializer.JavaSerializer \
--conf spark.sql.sources.ignoreDataLocality=true \
--conf spark.plugins=org.apache.gluten.GlutenPlugin \
--conf spark.gluten.sql.columnar.columnarToRow=true \
--conf spark.gluten.sql.columnar.libpath=/path_to_clickhouse_library/libch.so \
--conf spark.gluten.sql.columnar.iterator=true \
--conf spark.gluten.sql.columnar.loadarrow=false \
Expand Down
1 change: 0 additions & 1 deletion ep/build-clickhouse/src/resources/conf/gluten.properties
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ spark.sql.columnVector.offheap.enabled=true
spark.memory.offHeap.enabled=true
spark.memory.offHeap.size=6442450944
spark.plugins=org.apache.gluten.GlutenPlugin
spark.gluten.sql.columnar.columnarToRow=true
spark.gluten.sql.columnar.iterator=true
spark.gluten.sql.columnar.loadarrow=false
spark.gluten.sql.columnar.hashagg.enablefinal=true
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,6 @@ object ParquetReadBenchmark extends SqlBasedBenchmark {
.set("spark.memory.offHeap.enabled", "true")
.setIfMissing("spark.memory.offHeap.size", offheapSize)
.setIfMissing("spark.sql.columnVector.offheap.enabled", "true")
.set("spark.gluten.sql.columnar.columnarToRow", "true")
.set("spark.sql.adaptive.enabled", "false")
.setIfMissing("spark.driver.memory", memorySize)
.setIfMissing("spark.executor.memory", memorySize)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,6 @@ object ParquetReadBenchmark extends SqlBasedBenchmark {
.set("spark.memory.offHeap.enabled", "true")
.setIfMissing("spark.memory.offHeap.size", offheapSize)
.setIfMissing("spark.sql.columnVector.offheap.enabled", "true")
.set("spark.gluten.sql.columnar.columnarToRow", "true")
.set("spark.sql.adaptive.enabled", "false")
.setIfMissing("spark.driver.memory", memorySize)
.setIfMissing("spark.executor.memory", memorySize)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,6 @@ object ParquetReadBenchmark extends SqlBasedBenchmark {
.set("spark.memory.offHeap.enabled", "true")
.setIfMissing("spark.memory.offHeap.size", offheapSize)
.setIfMissing("spark.sql.columnVector.offheap.enabled", "true")
.set("spark.gluten.sql.columnar.columnarToRow", "true")
.set("spark.sql.adaptive.enabled", "false")
.setIfMissing("spark.driver.memory", memorySize)
.setIfMissing("spark.executor.memory", memorySize)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,6 @@ object ParquetReadBenchmark extends SqlBasedBenchmark {
.set("spark.memory.offHeap.enabled", "true")
.setIfMissing("spark.memory.offHeap.size", offheapSize)
.setIfMissing("spark.sql.columnVector.offheap.enabled", "true")
.set("spark.gluten.sql.columnar.columnarToRow", "true")
.set("spark.sql.adaptive.enabled", "false")
.setIfMissing("spark.driver.memory", memorySize)
.setIfMissing("spark.executor.memory", memorySize)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -85,8 +85,6 @@ class GlutenConfig(conf: SQLConf) extends Logging {
def shuffledHashJoinOptimizeBuildSide: Boolean =
getConf(COLUMNAR_SHUFFLED_HASH_JOIN_OPTIMIZE_BUILD_SIDE)

def enableNativeColumnarToRow: Boolean = getConf(COLUMNAR_COLUMNAR_TO_ROW_ENABLED)

def forceShuffledHashJoin: Boolean = getConf(COLUMNAR_FORCE_SHUFFLED_HASH_JOIN_ENABLED)

def enableColumnarSortMergeJoin: Boolean = getConf(COLUMNAR_SORTMERGEJOIN_ENABLED)
Expand Down Expand Up @@ -169,17 +167,12 @@ class GlutenConfig(conf: SQLConf) extends Logging {

def enablePreferColumnar: Boolean = getConf(COLUMNAR_PREFER_ENABLED)

def enableOneRowRelationColumnar: Boolean = getConf(COLUMNAR_ONE_ROW_RELATION_ENABLED)

def physicalJoinOptimizationThrottle: Integer =
getConf(COLUMNAR_PHYSICAL_JOIN_OPTIMIZATION_THROTTLE)

def enablePhysicalJoinOptimize: Boolean =
getConf(COLUMNAR_PHYSICAL_JOIN_OPTIMIZATION_ENABLED)

def logicalJoinOptimizationThrottle: Integer =
getConf(COLUMNAR_LOGICAL_JOIN_OPTIMIZATION_THROTTLE)

def enableScanOnly: Boolean = getConf(COLUMNAR_SCAN_ONLY_ENABLED)

def tmpFile: Option[String] = getConf(COLUMNAR_TEMP_DIR)
Expand Down Expand Up @@ -365,7 +358,7 @@ class GlutenConfig(conf: SQLConf) extends Logging {
def chColumnarShuffleSpillThreshold: Long = {
val threshold = getConf(COLUMNAR_CH_SHUFFLE_SPILL_THRESHOLD)
if (threshold == 0) {
(getConf(COLUMNAR_TASK_OFFHEAP_SIZE_IN_BYTES) * 0.9).toLong
(taskOffHeapMemorySize * 0.9).toLong
} else {
threshold
}
Expand Down Expand Up @@ -426,47 +419,56 @@ class GlutenConfig(conf: SQLConf) extends Logging {
getConf(COLUMNAR_VELOX_MEMORY_USE_HUGE_PAGES)

def debug: Boolean = getConf(DEBUG_ENABLED)

def debugKeepJniWorkspace: Boolean = getConf(DEBUG_KEEP_JNI_WORKSPACE)

def collectUtStats: Boolean = getConf(UT_STATISTIC)

def benchmarkStageId: Int = getConf(BENCHMARK_TASK_STAGEID)

def benchmarkPartitionId: String = getConf(BENCHMARK_TASK_PARTITIONID)

def benchmarkTaskId: String = getConf(BENCHMARK_TASK_TASK_ID)

def benchmarkSaveDir: String = getConf(BENCHMARK_SAVE_DIR)

def textInputMaxBlockSize: Long = getConf(TEXT_INPUT_ROW_MAX_BLOCK_SIZE)

def textIputEmptyAsDefault: Boolean = getConf(TEXT_INPUT_EMPTY_AS_DEFAULT)

def enableParquetRowGroupMaxMinIndex: Boolean =
getConf(ENABLE_PARQUET_ROW_GROUP_MAX_MIN_INDEX)

def enableVeloxFlushablePartialAggregation: Boolean =
getConf(VELOX_FLUSHABLE_PARTIAL_AGGREGATION_ENABLED)
def maxFlushableAggregationMemoryRatio: Double =
getConf(MAX_PARTIAL_AGGREGATION_MEMORY_RATIO)
def maxExtendedFlushableAggregationMemoryRatio: Double =
getConf(MAX_PARTIAL_AGGREGATION_MEMORY_RATIO)
def abandonFlushableAggregationMinPct: Int =
getConf(ABANDON_PARTIAL_AGGREGATION_MIN_PCT)
def abandonFlushableAggregationMinRows: Int =
getConf(ABANDON_PARTIAL_AGGREGATION_MIN_ROWS)

def maxFlushableAggregationMemoryRatio: Double = getConf(MAX_PARTIAL_AGGREGATION_MEMORY_RATIO)

def maxExtendedFlushableAggregationMemoryRatio: Double = getConf(
MAX_PARTIAL_AGGREGATION_MEMORY_RATIO)

def abandonFlushableAggregationMinPct: Int = getConf(ABANDON_PARTIAL_AGGREGATION_MIN_PCT)

def abandonFlushableAggregationMinRows: Int = getConf(ABANDON_PARTIAL_AGGREGATION_MIN_ROWS)

// Please use `BackendsApiManager.getSettings.enableNativeWriteFiles()` instead
def enableNativeWriter: Option[Boolean] = getConf(NATIVE_WRITER_ENABLED)

def enableNativeArrowReader: Boolean = getConf(NATIVE_ARROW_READER_ENABLED)

def directorySizeGuess: Long =
getConf(DIRECTORY_SIZE_GUESS)
def filePreloadThreshold: Long =
getConf(FILE_PRELOAD_THRESHOLD)
def prefetchRowGroups: Int =
getConf(PREFETCH_ROW_GROUPS)
def loadQuantum: Long =
getConf(LOAD_QUANTUM)
def maxCoalescedDistance: String =
getConf(MAX_COALESCED_DISTANCE_BYTES)
def maxCoalescedBytes: Long =
getConf(MAX_COALESCED_BYTES)
def cachePrefetchMinPct: Int =
getConf(CACHE_PREFETCH_MINPCT)
def directorySizeGuess: Long = getConf(DIRECTORY_SIZE_GUESS)

def filePreloadThreshold: Long = getConf(FILE_PRELOAD_THRESHOLD)

def prefetchRowGroups: Int = getConf(PREFETCH_ROW_GROUPS)

def loadQuantum: Long = getConf(LOAD_QUANTUM)

def maxCoalescedDistance: String = getConf(MAX_COALESCED_DISTANCE_BYTES)

def maxCoalescedBytes: Long = getConf(MAX_COALESCED_BYTES)

def cachePrefetchMinPct: Int = getConf(CACHE_PREFETCH_MINPCT)

def enableColumnarProjectCollapse: Boolean = getConf(ENABLE_COLUMNAR_PROJECT_COLLAPSE)

Expand Down Expand Up @@ -1029,13 +1031,6 @@ object GlutenConfig {
.booleanConf
.createWithDefault(true)

val COLUMNAR_COLUMNAR_TO_ROW_ENABLED =
buildConf("spark.gluten.sql.columnar.columnarToRow")
.internal()
.doc("Enable or disable columnar columnarToRow.")
.booleanConf
.createWithDefault(true)

val COLUMNAR_SORTMERGEJOIN_ENABLED =
buildConf("spark.gluten.sql.columnar.sortMergeJoin")
.internal()
Expand Down Expand Up @@ -1123,13 +1118,6 @@ object GlutenConfig {
.booleanConf
.createWithDefault(true)

val COLUMNAR_ONE_ROW_RELATION_ENABLED =
buildConf("spark.gluten.sql.columnar.oneRowRelation")
.internal()
.doc("Enable or disable columnar `OneRowRelation`.")
.booleanConf
.createWithDefault(true)

val COLUMNAR_TABLE_CACHE_ENABLED =
buildConf("spark.gluten.sql.columnar.tableCache")
.internal()
Expand All @@ -1151,13 +1139,6 @@ object GlutenConfig {
.booleanConf
.createWithDefault(false)

val COLUMNAR_LOGICAL_JOIN_OPTIMIZATION_THROTTLE =
buildConf("spark.gluten.sql.columnar.logicalJoinOptimizationLevel")
.internal()
.doc("Fallback to row operators if there are several continuous joins.")
.intConf
.createWithDefault(12)

val COLUMNAR_SCAN_ONLY_ENABLED =
buildConf("spark.gluten.sql.columnar.scanOnly")
.internal()
Expand Down
1 change: 0 additions & 1 deletion tools/workload/benchmark_velox/native_sql_initialize.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -1276,7 +1276,6 @@
" .set('spark.gluten.sql.columnar.physicalJoinOptimizeEnable', 'true')\\\n",
" .set('spark.gluten.sql.columnar.physicalJoinOptimizationLevel', '18')\\\n",
" .set('spark.gluten.sql.columnar.logicalJoinOptimizeEnable', 'true')\\\n",
" .set('spark.gluten.sql.columnar.logicalJoinOptimizationLevel', '19')\n",
" return conf"
]
},
Expand Down
Loading