apache · mbutrovich · Oct 6, 2025 · Oct 6, 2025 · Oct 6, 2025 · Oct 6, 2025
diff --git a/.github/workflows/pr_build_linux.yml b/.github/workflows/pr_build_linux.yml
@@ -103,6 +103,7 @@ jobs:
             value: |
               org.apache.comet.CometFuzzTestSuite
               org.apache.comet.CometFuzzAggregateSuite
+              org.apache.comet.CometFuzzIcebergSuite
               org.apache.comet.CometFuzzMathSuite
               org.apache.comet.DataGeneratorSuite
           - name: "shuffle"
@@ -124,6 +125,7 @@ jobs:
               org.apache.spark.sql.comet.ParquetDatetimeRebaseV2Suite
               org.apache.spark.sql.comet.ParquetEncryptionITCase
               org.apache.comet.exec.CometNativeReaderSuite
+              org.apache.comet.CometIcebergNativeSuite
           - name: "exec"
             value: |
               org.apache.comet.exec.CometAggregateSuite

diff --git a/.github/workflows/pr_build_macos.yml b/.github/workflows/pr_build_macos.yml
@@ -68,6 +68,7 @@ jobs:
             value: |
               org.apache.comet.CometFuzzTestSuite
               org.apache.comet.CometFuzzAggregateSuite
+              org.apache.comet.CometFuzzIcebergSuite
               org.apache.comet.CometFuzzMathSuite
               org.apache.comet.DataGeneratorSuite
           - name: "shuffle"
@@ -89,6 +90,7 @@ jobs:
               org.apache.spark.sql.comet.ParquetDatetimeRebaseV2Suite
               org.apache.spark.sql.comet.ParquetEncryptionITCase
               org.apache.comet.exec.CometNativeReaderSuite
+              org.apache.comet.CometIcebergNativeSuite
           - name: "exec"
             value: |
               org.apache.comet.exec.CometAggregateSuite

diff --git a/common/src/main/scala/org/apache/comet/CometConf.scala b/common/src/main/scala/org/apache/comet/CometConf.scala
@@ -123,6 +123,16 @@ object CometConf extends ShimCometConf {
       .getOrElse("COMET_PARQUET_SCAN_IMPL", SCAN_AUTO)
       .toLowerCase(Locale.ROOT))
 
+  val COMET_ICEBERG_NATIVE_ENABLED: ConfigEntry[Boolean] =
+    conf("spark.comet.scan.icebergNative.enabled")
+      .category(CATEGORY_SCAN)
+      .doc(
+        "Whether to enable native Iceberg table scan using iceberg-rust. When enabled, " +
+          "Iceberg tables are read directly through native execution, bypassing Spark's " +
+          "DataSource V2 API for better performance.")
+      .booleanConf
+      .createWithDefault(false)
+
   val COMET_RESPECT_PARQUET_FILTER_PUSHDOWN: ConfigEntry[Boolean] =
     conf("spark.comet.parquet.respectFilterPushdown")
       .category(CATEGORY_PARQUET)

diff --git a/common/src/main/scala/org/apache/comet/objectstore/NativeConfig.scala b/common/src/main/scala/org/apache/comet/objectstore/NativeConfig.scala
@@ -56,7 +56,7 @@ object NativeConfig {
    * consistent and standardized cloud storage support across all providers.
    */
   def extractObjectStoreOptions(hadoopConf: Configuration, uri: URI): Map[String, String] = {
-    val scheme = uri.getScheme.toLowerCase(Locale.ROOT)
+    val scheme = Option(uri.getScheme).map(_.toLowerCase(Locale.ROOT)).getOrElse("file")
 
     import scala.jdk.CollectionConverters._
     val options = scala.collection.mutable.Map[String, String]()

diff --git a/dev/ci/check-suites.py b/dev/ci/check-suites.py
@@ -34,6 +34,7 @@ def file_to_class_name(path: Path) -> str | None:
     ignore_list = [
         "org.apache.comet.parquet.ParquetReadSuite", # abstract
         "org.apache.comet.parquet.ParquetReadFromS3Suite", # manual test suite
+        "org.apache.comet.IcebergReadFromS3Suite", # manual test suite
         "org.apache.spark.sql.comet.CometPlanStabilitySuite", # abstract
         "org.apache.spark.sql.comet.ParquetDatetimeRebaseSuite", # abstract
         "org.apache.comet.exec.CometColumnarShuffleSuite" # abstract

diff --git a/dev/diffs/iceberg/1.8.1.diff b/dev/diffs/iceberg/1.8.1.diff
diff --git a/dev/diffs/iceberg/1.9.1.diff b/dev/diffs/iceberg/1.9.1.diff
diff --git a/docs/source/user-guide/latest/compatibility.md b/docs/source/user-guide/latest/compatibility.md
@@ -56,8 +56,8 @@ and sorting on floating-point data can be enabled by setting `spark.comet.expres
 ## Incompatible Expressions
 
 Expressions that are not 100% Spark-compatible will fall back to Spark by default and can be enabled by setting
-`spark.comet.expression.EXPRNAME.allowIncompatible=true`, where `EXPRNAME` is the Spark expression class name. See 
-the [Comet Supported Expressions Guide](expressions.md) for more information on this configuration setting.  
+`spark.comet.expression.EXPRNAME.allowIncompatible=true`, where `EXPRNAME` is the Spark expression class name. See
+the [Comet Supported Expressions Guide](expressions.md) for more information on this configuration setting.
 
 It is also possible to specify `spark.comet.expression.allowIncompatible=true` to enable all
 incompatible expressions.

diff --git a/docs/source/user-guide/latest/configs.md b/docs/source/user-guide/latest/configs.md
@@ -32,6 +32,7 @@ Comet provides the following configuration settings.
 | `spark.comet.convert.parquet.enabled` | When enabled, data from Spark (non-native) Parquet v1 and v2 scans will be converted to Arrow format. Note that to enable native vectorized execution, both this config and `spark.comet.exec.enabled` need to be enabled. | false |
 | `spark.comet.scan.allowIncompatible` | Some Comet scan implementations are not currently fully compatible with Spark for all datatypes. Set this config to true to allow them anyway. For more information, refer to the [Comet Compatibility Guide](https://datafusion.apache.org/comet/user-guide/compatibility.html). | false |
 | `spark.comet.scan.enabled` | Whether to enable native scans. When this is turned on, Spark will use Comet to read supported data sources (currently only Parquet is supported natively). Note that to enable native vectorized execution, both this config and `spark.comet.exec.enabled` need to be enabled. | true |
+| `spark.comet.scan.icebergNative.enabled` | Whether to enable native Iceberg table scan using iceberg-rust. When enabled, Iceberg tables are read directly through native execution, bypassing Spark's DataSource V2 API for better performance. | false |
 | `spark.comet.scan.preFetch.enabled` | Whether to enable pre-fetching feature of CometScan. | false |
 | `spark.comet.scan.preFetch.threadNum` | The number of threads running pre-fetching for CometScan. Effective if spark.comet.scan.preFetch.enabled is enabled. Note that more pre-fetching threads means more memory requirement to store pre-fetched row groups. | 2 |
 | `spark.comet.sparkToColumnar.enabled` | Whether to enable Spark to Arrow columnar conversion. When this is turned on, Comet will convert operators in `spark.comet.sparkToColumnar.supportedOperatorList` into Arrow columnar format before processing. | false |