Support TIMESTAMP_NTZ type

rui-mo · rui-mo · commit f1d0b1635c09 · 2026-03-23T22:36:39.000Z
diff --git a/backends-velox/src-delta33/test/scala/org/apache/spark/sql/delta/DeltaInsertIntoTableSuite.scala b/backends-velox/src-delta33/test/scala/org/apache/spark/sql/delta/DeltaInsertIntoTableSuite.scala
@@ -1423,7 +1423,8 @@ abstract class DeltaInsertIntoTests(
     }
   }
 
-  test("insertInto: Timestamp No Timezone round trips across timezones") {
+  // Implicit cast from Timestamp to TimestampNTZ has not been supported.
+  ignore("insertInto: Timestamp No Timezone round trips across timezones") {
     val t1 = "timestamp_ntz"
     withTable(t1) {
       withTimeZone("GMT-8") {
diff --git a/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxValidatorApi.scala b/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxValidatorApi.scala
@@ -109,6 +109,7 @@ object VeloxValidatorApi {
           StringType | BinaryType | _: DecimalType | DateType | TimestampType |
           YearMonthIntervalType.DEFAULT | NullType =>
         true
+      case other if other.typeName == "timestamp_ntz" => true
       case _ => false
     }
   }
diff --git a/backends-velox/src/main/scala/org/apache/gluten/execution/VeloxColumnarToRowExec.scala b/backends-velox/src/main/scala/org/apache/gluten/execution/VeloxColumnarToRowExec.scala
@@ -50,6 +50,7 @@ case class VeloxColumnarToRowExec(child: SparkPlan) extends ColumnarToRowExecBas
         case _: DoubleType =>
         case _: StringType =>
         case _: TimestampType =>
+        case other if other.typeName == "timestamp_ntz" =>
         case _: DateType =>
         case _: BinaryType =>
         case _: DecimalType =>
diff --git a/backends-velox/src/test/scala/org/apache/gluten/execution/FallbackSuite.scala b/backends-velox/src/test/scala/org/apache/gluten/execution/FallbackSuite.scala
@@ -272,7 +272,7 @@ class FallbackSuite extends VeloxWholeStageTransformerSuite with AdaptiveSparkPl
     }
   }
 
-  test("fallback with index based schema evolution") {
+  testWithMinSparkVersion("fallback with index based schema evolution", "3.4") {
     val query = "SELECT c2 FROM test"
     Seq("parquet", "orc").foreach {
       format =>
@@ -295,9 +295,7 @@ class FallbackSuite extends VeloxWholeStageTransformerSuite with AdaptiveSparkPl
                     runQueryAndCompare(query) {
                       df =>
                         val plan = df.queryExecution.executedPlan
-                        val fallback = parquetUseColumnNames == "false" ||
-                          orcUseColumnNames == "false"
-                        assert(collect(plan) { case g: GlutenPlan => g }.isEmpty == fallback)
+                        assert(collect(plan) { case g: GlutenPlan => g }.nonEmpty)
                     }
                   }
                 }
diff --git a/backends-velox/src/test/scala/org/apache/gluten/execution/VeloxParquetDataTypeValidationSuite.scala b/backends-velox/src/test/scala/org/apache/gluten/execution/VeloxParquetDataTypeValidationSuite.scala
@@ -19,6 +19,8 @@ package org.apache.gluten.execution
 import org.apache.gluten.config.GlutenConfig
 
 import org.apache.spark.SparkConf
+import org.apache.spark.sql.Row
+import org.apache.spark.sql.types.{DataType, StructType}
 
 import java.io.File
 
@@ -465,17 +467,27 @@ class VeloxParquetDataTypeValidationSuite extends VeloxWholeStageTransformerSuit
     }
   }
 
-  testWithMinSparkVersion("Fallback for TimestampNTZ type scan", "3.4") {
+  testWithMinSparkVersion("TimestampNTZ type scan", "3.4") {
     withTempDir {
       dir =>
         val path = new File(dir, "ntz_data").toURI.getPath
         val inputDf =
           spark.sql("SELECT CAST('2024-01-01 00:00:00' AS TIMESTAMP_NTZ) AS ts_ntz")
         inputDf.write.format("parquet").save(path)
-        val df = spark.read.format("parquet").load(path)
+
+        // TODO: The Parquet writer creates TIMESTAMP(MICROS,true), but for timestamp_ntz type,
+        //  the 'isAdjustedToUTC' should be false. Without explicitly specifying the read schema,
+        //  file data will be read as Timestamp.
+        val dataType = Class
+          .forName("org.apache.spark.sql.types.TimestampNTZType$")
+          .getField("MODULE$")
+          .get(null)
+          .asInstanceOf[DataType]
+        val schema = new StructType().add("ts_ntz", dataType)
+        val df = spark.read.schema(schema).parquet(path)
         val executedPlan = getExecutedPlan(df)
-        assert(!executedPlan.exists(plan => plan.isInstanceOf[BatchScanExecTransformer]))
-        checkAnswer(df, inputDf)
+        assert(executedPlan.exists(plan => plan.isInstanceOf[BatchScanExecTransformer]))
+        checkAnswer(df, Seq(Row(java.time.LocalDateTime.of(2024, 1, 1, 0, 0, 0, 0))))
     }
   }
 
diff --git a/backends-velox/src/test/scala/org/apache/gluten/functions/DateFunctionsValidateSuite.scala b/backends-velox/src/test/scala/org/apache/gluten/functions/DateFunctionsValidateSuite.scala
@@ -16,10 +16,11 @@
  */
 package org.apache.gluten.functions
 
-import org.apache.gluten.execution.ProjectExecTransformer
+import org.apache.gluten.execution.{BatchScanExecTransformer, ProjectExecTransformer}
 
 import org.apache.spark.sql.execution.ProjectExec
-import org.apache.spark.sql.types.Decimal
+import org.apache.spark.sql.internal.SQLConf.TimestampTypes
+import org.apache.spark.sql.types.{DataType, Decimal, StructType}
 
 import java.sql.Timestamp
 
@@ -489,4 +490,78 @@ class DateFunctionsValidateSuite extends FunctionsValidateSuite {
         }
     }
   }
+
+  testWithMinSparkVersion("cast string to timestamp_ntz", "3.4") {
+    val inputs: Seq[String] = Seq(
+      "1970-01-01",
+      "1970-01-01 00:00:00-02:00",
+      "1970-01-01 00:00:00 +02:00",
+      "2000-01-01",
+      "1970-01-01 00:00:00",
+      "2000-01-01 12:21:56",
+      "2015-03-18T12:03:17Z",
+      "2015-03-18 12:03:17",
+      "2015-03-18T12:03:17",
+      "2015-03-18 12:03:17.123",
+      "2015-03-18T12:03:17.123",
+      "2015-03-18T12:03:17.456",
+      "2015-03-18 12:03:17.456"
+    )
+
+    inputs.foreach {
+      s =>
+        val query = s"select cast('$s' as timestamp_ntz)"
+        runQueryAndCompare(query) {
+          checkGlutenPlan[ProjectExecTransformer]
+        }
+    }
+  }
+
+  testWithMinSparkVersion("read as timestamp_ntz", "3.4") {
+    val inputs: Seq[String] = Seq(
+      "1970-01-01",
+      "1970-01-01 00:00:00-02:00",
+      "1970-01-01 00:00:00 +02:00",
+      "2000-01-01",
+      "1970-01-01 00:00:00",
+      "2000-01-01 12:21:56",
+      "2015-03-18T12:03:17Z",
+      "2015-03-18 12:03:17",
+      "2015-03-18T12:03:17",
+      "2015-03-18 12:03:17.123",
+      "2015-03-18T12:03:17.123",
+      "2015-03-18T12:03:17.456",
+      "2015-03-18 12:03:17.456"
+    )
+
+    withTempPath {
+      dir =>
+        withSQLConf("spark.sql.timestampType" -> TimestampTypes.TIMESTAMP_NTZ.toString) {
+          val path = dir.getAbsolutePath
+          val inputDF = spark.createDataset(inputs).toDF("input")
+          val df = inputDF.selectExpr("cast(input as timestamp_ntz) as ts")
+          // TODO: The Parquet writer creates TIMESTAMP(MICROS,true), but for timestamp_ntz type,
+          //  the 'isAdjustedToUTC' should be false. Spark will fail to read this file as
+          //  timestamp_ntz values.
+          df.coalesce(1).write.mode("overwrite").parquet(path)
+
+          val dataType = Class
+            .forName("org.apache.spark.sql.types.TimestampNTZType$")
+            .getField("MODULE$")
+            .get(null)
+            .asInstanceOf[DataType]
+          val schema = new StructType().add("ts", dataType)
+          val readDf = spark.read.schema(schema).parquet(path)
+          readDf.collect()
+          assert(
+            readDf.queryExecution.executedPlan.exists(
+              f => f.isInstanceOf[BatchScanExecTransformer]))
+
+          // Ensures the fallback of unsupported function works.
+          readDf.createOrReplaceTempView("view")
+          val testDf = spark.sql("select hour(ts) from view")
+          testDf.collect()
+        }
+    }
+  }
 }
diff --git a/cpp/velox/compute/VeloxBackend.cc b/cpp/velox/compute/VeloxBackend.cc
@@ -39,6 +39,7 @@
 #include "jni/JniFileSystem.h"
 #include "memory/GlutenBufferedInputBuilder.h"
 #include "operators/functions/SparkExprToSubfieldFilterParser.h"
+#include "operators/plannodes/RowVectorStream.h"
 #include "shuffle/ArrowShuffleDictionaryWriter.h"
 #include "udf/UdfLoader.h"
 #include "utils/Exception.h"
@@ -47,7 +48,6 @@
 #include "velox/connectors/hive/BufferedInputBuilder.h"
 #include "velox/connectors/hive/HiveConnector.h"
 #include "velox/connectors/hive/HiveDataSource.h"
-#include "operators/plannodes/RowVectorStream.h"
 #include "velox/connectors/hive/storage_adapters/abfs/RegisterAbfsFileSystem.h" // @manual
 #include "velox/connectors/hive/storage_adapters/gcs/RegisterGcsFileSystem.h" // @manual
 #include "velox/connectors/hive/storage_adapters/hdfs/HdfsFileSystem.h"
@@ -56,6 +56,7 @@
 #include "velox/dwio/orc/reader/OrcReader.h"
 #include "velox/dwio/parquet/RegisterParquetReader.h"
 #include "velox/dwio/parquet/RegisterParquetWriter.h"
+#include "velox/functions/sparksql/types/TimestampNTZRegistration.h"
 #include "velox/serializers/PrestoSerializer.h"
 
 DECLARE_bool(velox_exception_user_stacktrace_enabled);
@@ -195,6 +196,7 @@ void VeloxBackend::init(
   velox::orc::registerOrcReaderFactory();
   velox::exec::ExprToSubfieldFilterParser::registerParser(std::make_unique<SparkExprToSubfieldFilterParser>());
   velox::connector::hive::BufferedInputBuilder::registerBuilder(std::make_shared<GlutenBufferedInputBuilder>());
+  velox::functions::sparksql::registerTimestampNTZType();
 
   // Register Velox functions
   registerAllFunctions();
@@ -318,13 +320,13 @@ void VeloxBackend::initConnector(const std::shared_ptr<velox::config::ConfigBase
   }
   velox::connector::registerConnector(
       std::make_shared<velox::connector::hive::HiveConnector>(kHiveConnectorId, hiveConf, ioExecutor_.get()));
-  
+
   // Register value-stream connector for runtime iterator-based inputs
   auto valueStreamDynamicFilterEnabled =
       backendConf_->get<bool>(kValueStreamDynamicFilterEnabled, kValueStreamDynamicFilterEnabledDefault);
   velox::connector::registerConnector(
       std::make_shared<ValueStreamConnector>(kIteratorConnectorId, hiveConf, valueStreamDynamicFilterEnabled));
-  
+
 #ifdef GLUTEN_ENABLE_GPU
   if (backendConf_->get<bool>(kCudfEnableTableScan, kCudfEnableTableScanDefault) &&
       backendConf_->get<bool>(kCudfEnabled, kCudfEnabledDefault)) {
diff --git a/cpp/velox/substrait/SubstraitParser.cc b/cpp/velox/substrait/SubstraitParser.cc
@@ -17,9 +17,9 @@
 
 #include "SubstraitParser.h"
 #include "TypeUtils.h"
-#include "velox/common/base/Exceptions.h"
-
 #include "VeloxSubstraitSignature.h"
+#include "velox/common/base/Exceptions.h"
+#include "velox/functions/sparksql/types/TimestampNTZType.h"
 
 namespace gluten {
 
@@ -78,6 +78,8 @@ TypePtr SubstraitParser::parseType(const ::substrait::Type& substraitType, bool
       return DATE();
     case ::substrait::Type::KindCase::kTimestampTz:
       return TIMESTAMP();
+    case ::substrait::Type::KindCase::kTimestamp:
+      return facebook::velox::functions::sparksql::TIMESTAMP_NTZ();
     case ::substrait::Type::KindCase::kDecimal: {
       auto precision = substraitType.decimal().precision();
       auto scale = substraitType.decimal().scale();
@@ -356,6 +358,9 @@ int64_t SubstraitParser::getLiteralValue(const ::substrait::Expression::Literal&
     memcpy(&decimalValue, decimal.c_str(), 16);
     return static_cast<int64_t>(decimalValue);
   }
+  if (literal.has_timestamp()) {
+    return literal.timestamp();
+  }
   return literal.i64();
 }
 
diff --git a/cpp/velox/substrait/SubstraitToVeloxExpr.cc b/cpp/velox/substrait/SubstraitToVeloxExpr.cc
@@ -17,11 +17,11 @@
 
 #include "SubstraitToVeloxExpr.h"
 #include "TypeUtils.h"
+#include "velox/functions/sparksql/types/TimestampNTZType.h"
+#include "velox/type/Timestamp.h"
 #include "velox/vector/FlatVector.h"
 #include "velox/vector/VariantToVector.h"
 
-#include "velox/type/Timestamp.h"
-
 using namespace facebook::velox;
 
 namespace {
@@ -133,6 +133,8 @@ TypePtr getScalarType(const ::substrait::Expression::Literal& literal) {
       return DATE();
     case ::substrait::Expression_Literal::LiteralTypeCase::kTimestampTz:
       return TIMESTAMP();
+    case ::substrait::Expression_Literal::LiteralTypeCase::kTimestamp:
+      return facebook::velox::functions::sparksql::TIMESTAMP_NTZ();
     case ::substrait::Expression_Literal::LiteralTypeCase::kString:
       return VARCHAR();
     case ::substrait::Expression_Literal::LiteralTypeCase::kVarChar:
diff --git a/cpp/velox/substrait/SubstraitToVeloxPlan.cc b/cpp/velox/substrait/SubstraitToVeloxPlan.cc
@@ -24,6 +24,7 @@
 #include "operators/plannodes/RowVectorStream.h"
 #include "velox/connectors/hive/HiveDataSink.h"
 #include "velox/exec/TableWriter.h"
+#include "velox/functions/sparksql/types/TimestampNTZType.h"
 #include "velox/type/Type.h"
 
 #include "utils/ConfigExtractor.h"
@@ -1496,6 +1497,17 @@ core::PlanNodePtr SubstraitToVeloxPlanConverter::toVeloxPlan(const ::substrait::
   auto baseSchema = ROW(std::move(names), std::move(types));
   // The columns present in the table, if not available default to the baseSchema.
   auto tableSchema = splitInfo->tableSchema ? splitInfo->tableSchema : baseSchema;
+  if (tableSchema) {
+    auto tableNames = tableSchema->names();
+    auto tableTypes = tableSchema->children();
+    for (size_t i = 0; i < tableSchema->size(); i++) {
+      if (functions::sparksql::isTimestampNTZType(tableTypes[i])) {
+        // Spark's TimestampNTZ type is stored as TIMESTAMP in file.
+        tableTypes[i] = TIMESTAMP();
+      }
+    }
+    tableSchema = ROW(std::move(tableNames), std::move(tableTypes));
+  }
 
   connector::ConnectorTableHandlePtr tableHandle;
   auto remainingFilter = readRel.has_filter() ? exprConverter_->toVeloxExpr(readRel.filter(), baseSchema) : nullptr;
diff --git a/ep/build-velox/src/get-velox.sh b/ep/build-velox/src/get-velox.sh
@@ -17,8 +17,8 @@
 set -exu
 
 CURRENT_DIR=$(cd "$(dirname "$BASH_SOURCE")"; pwd)
-VELOX_REPO=https://github.com/IBM/velox.git
-VELOX_BRANCH=dft-2026_03_15-iceberg
+VELOX_REPO=https://github.com/rui-mo/velox.git
+VELOX_BRANCH=ts_ntz_gluten
 VELOX_ENHANCED_BRANCH=ibm-2026_03_15
 VELOX_HOME=""
 RUN_SETUP_SCRIPT=ON
diff --git a/gluten-delta/src/test/scala/org/apache/gluten/execution/DeltaSuite.scala b/gluten-delta/src/test/scala/org/apache/gluten/execution/DeltaSuite.scala
@@ -340,13 +340,13 @@ abstract class DeltaSuite extends WholeStageTransformerSuite {
 
   // TIMESTAMP_NTZ was introduced in Spark 3.4 / Delta 2.4
   testWithMinSparkVersion(
-    "delta: create table with TIMESTAMP_NTZ should fallback and return correct results",
+    "delta: create table with TIMESTAMP_NTZ and return correct results",
     "3.4") {
     withTable("delta_ntz") {
       spark.sql("CREATE TABLE delta_ntz(c1 STRING, c2 TIMESTAMP, c3 TIMESTAMP_NTZ) USING DELTA")
       spark.sql("""INSERT INTO delta_ntz VALUES
                   |('foo','2022-01-02 03:04:05.123456','2022-01-02 03:04:05.123456')""".stripMargin)
-      val df = runQueryAndCompare("select * from delta_ntz", noFallBack = false) { _ => }
+      val df = runQueryAndCompare("select * from delta_ntz") { _ => }
       checkAnswer(
         df,
         Row(
diff --git a/gluten-substrait/src/main/java/org/apache/gluten/substrait/expression/TimestampNTZLiteralNode.java b/gluten-substrait/src/main/java/org/apache/gluten/substrait/expression/TimestampNTZLiteralNode.java
@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.gluten.substrait.expression;
+
+import org.apache.gluten.substrait.type.TimestampTypeNode;
+import org.apache.gluten.substrait.type.TypeNode;
+
+import io.substrait.proto.Expression.Literal.Builder;
+
+public class TimestampNTZLiteralNode extends LiteralNodeWithValue<Long> {
+  public TimestampNTZLiteralNode(Long value) {
+    super(value, new TimestampTypeNode(true));
+  }
+
+  public TimestampNTZLiteralNode(Long value, TypeNode typeNode) {
+    super(value, typeNode);
+  }
+
+  @Override
+  protected void updateLiteralBuilder(Builder literalBuilder, Long value) {
+    literalBuilder.setTimestamp(value);
+  }
+}
diff --git a/gluten-substrait/src/main/java/org/apache/gluten/substrait/type/TimestampNTZTypeNode.java b/gluten-substrait/src/main/java/org/apache/gluten/substrait/type/TimestampNTZTypeNode.java
diff --git a/gluten-substrait/src/main/java/org/apache/gluten/substrait/type/TypeBuilder.java b/gluten-substrait/src/main/java/org/apache/gluten/substrait/type/TypeBuilder.java
diff --git a/gluten-substrait/src/main/scala/org/apache/gluten/execution/BasicScanExecTransformer.scala b/gluten-substrait/src/main/scala/org/apache/gluten/execution/BasicScanExecTransformer.scala
diff --git a/gluten-substrait/src/main/scala/org/apache/gluten/expression/ConverterUtils.scala b/gluten-substrait/src/main/scala/org/apache/gluten/expression/ConverterUtils.scala
diff --git a/gluten-substrait/src/main/scala/org/apache/gluten/extension/columnar/validator/Validators.scala b/gluten-substrait/src/main/scala/org/apache/gluten/extension/columnar/validator/Validators.scala

Original file line number	Diff line number	Diff line change
`@@ -1423,7 +1423,8 @@ abstract class DeltaInsertIntoTests(`
`1423`	`1423`	`}`
`1424`	`1424`	`}`
`1425`	`1425`
`1426`		`- test("insertInto: Timestamp No Timezone round trips across timezones") {`
	`1426`	`+ // Implicit cast from Timestamp to TimestampNTZ has not been supported.`
	`1427`	`+ ignore("insertInto: Timestamp No Timezone round trips across timezones") {`
`1427`	`1428`	`val t1 = "timestamp_ntz"`
`1428`	`1429`	`withTable(t1) {`
`1429`	`1430`	`withTimeZone("GMT-8") {`
Original file line number	Diff line number	Diff line change
`@@ -109,6 +109,7 @@ object VeloxValidatorApi {`
`109`	`109`	`StringType \| BinaryType \| _: DecimalType \| DateType \| TimestampType \|`
`110`	`110`	`YearMonthIntervalType.DEFAULT \| NullType =>`
`111`	`111`	`true`
	`112`	`+ case other if other.typeName == "timestamp_ntz" => true`
`112`	`113`	`case _ => false`
`113`	`114`	`}`
`114`	`115`	`}`
Original file line number	Diff line number	Diff line change
`@@ -272,7 +272,7 @@ class FallbackSuite extends VeloxWholeStageTransformerSuite with AdaptiveSparkPl`
`272`	`272`	`}`
`273`	`273`	`}`
`274`	`274`
`275`		`- test("fallback with index based schema evolution") {`
	`275`	`+ testWithMinSparkVersion("fallback with index based schema evolution", "3.4") {`
`276`	`276`	`val query = "SELECT c2 FROM test"`
`277`	`277`	`Seq("parquet", "orc").foreach {`
`278`	`278`	`format =>`
`@@ -295,9 +295,7 @@ class FallbackSuite extends VeloxWholeStageTransformerSuite with AdaptiveSparkPl`
`295`	`295`	`runQueryAndCompare(query) {`
`296`	`296`	`df =>`
`297`	`297`	`val plan = df.queryExecution.executedPlan`
`298`		`- val fallback = parquetUseColumnNames == "false" \|\|`
`299`		`- orcUseColumnNames == "false"`
`300`		`- assert(collect(plan) { case g: GlutenPlan => g }.isEmpty == fallback)`
	`298`	`+ assert(collect(plan) { case g: GlutenPlan => g }.nonEmpty)`
`301`	`299`	`}`
`302`	`300`	`}`
`303`	`301`	`}`