apache · akshatshenoi-db · Jun 15, 2026 · Jun 15, 2026 · cloud-fan · Jun 15, 2026
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/text/TextFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/text/TextFileFormat.scala
@@ -60,6 +60,10 @@ case class TextFileFormat() extends TextBasedFileFormat with DataSourceRegister
       options: Map[String, String],
       path: Path): Boolean = {
     val textOptions = new TextOptions(options)
+    if (textOptions.archiveFormatEnabled && ArchiveReader.isArchivePath(path)) {
+      // A tar archive is read as one sequential stream (entry by entry), so it is never split.
+      return false
+    }
     super.isSplitable(sparkSession, options, path) && !textOptions.wholeText
   }
 
@@ -108,7 +112,60 @@ case class TextFileFormat() extends TextBasedFileFormat with DataSourceRegister
     val textOptions = new TextOptions(options)
     val broadcastedHadoopConf =
       SerializableConfiguration.broadcast(sparkSession.sparkContext, hadoopConf)
-    readToUnsafeMem(broadcastedHadoopConf, requiredSchema, textOptions)
+    val perFileReader = readToUnsafeMem(broadcastedHadoopConf, requiredSchema, textOptions)
+    val archiveReader = readArchive(broadcastedHadoopConf, requiredSchema, textOptions)
+    // A tar archive (always a single split, see `isSplitable`) is streamed entry by entry when
+    // archive reads are enabled; otherwise the file is read directly. Archive scanning is wired
+    // into the V1 file source only, so this dispatch lives here rather than in a shared reader.
+    (file: PartitionedFile) => {
+      if (textOptions.archiveFormatEnabled && ArchiveReader.isArchivePath(file.toPath)) {
+        archiveReader(file)
+      } else {
+        perFileReader(file)
+      }
+    }
+  }
+
+  /**
+   * Streams a tar archive (`.tar`/`.tar.gz`/`.tgz`) entry by entry, emitting the same
+   * `value`-column rows the per-file reader produces -- each entry is read as if it were a
+   * standalone text file (one row per line, or a single row holding the whole entry when
+   * `wholeText` is set), without unpacking the archive to disk. The whole archive is a single
+   * split (see `isSplitable`).
+   *
+   * Kept separate from the per-file reader (rather than dispatched inside it) because only this V1
+   * read path supports archives; the V2 data source is intentionally left untouched.
+   */
+  private def readArchive(
+      conf: Broadcast[SerializableConfiguration],
+      requiredSchema: StructType,
+      textOptions: TextOptions): PartitionedFile => Iterator[UnsafeRow] = {
+    (file: PartitionedFile) => {
+      val hadoopConf = conf.value.value
+      // Per-file row holders, reused for every record: a zero-field row for an empty required
+      // schema (e.g. `count`) and a one-field writer backing the single `value` column.
+      val zeroFieldRow = new UnsafeRow(0)
+      val valueWriter = new UnsafeRowWriter(1)
+      def recordToRow(buffer: Array[Byte], numBytes: Int): UnsafeRow = {
+        if (requiredSchema.nonEmpty) {
+          valueWriter.reset()
+          valueWriter.write(0, buffer, 0, numBytes)
+          valueWriter.getRow()
+        } else {
+          zeroFieldRow
+        }
+      }
+      ArchiveReader(file.toPath).readEntries(hadoopConf) { (_, entryStream) =>
+        if (!textOptions.wholeText) {
+          // One row per line of the entry, split on the configured read separator.
+          val lines = ArchiveReader.lineIterator(entryStream, textOptions.lineSeparatorInRead)
+          lines.map(line => recordToRow(line.getBytes, line.getLength))
+        } else {
+          val entryBytes = entryStream.readAllBytes()
+          Iterator.single(recordToRow(entryBytes, entryBytes.length))
+        }
+      }
+    }
   }
 
   private def readToUnsafeMem(

diff --git a/...e/src/test/scala/org/apache/spark/sql/execution/datasources/TextTarArchiveReadSuite.scala b/...e/src/test/scala/org/apache/spark/sql/execution/datasources/TextTarArchiveReadSuite.scala
@@ -0,0 +1,205 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.datasources
+
+import java.io.{File, FileOutputStream, OutputStream}
+import java.nio.charset.StandardCharsets
+import java.nio.file.Files
+import java.util.Locale
+import java.util.zip.GZIPOutputStream
+
+import org.apache.commons.compress.archivers.tar.{TarArchiveEntry, TarArchiveOutputStream}
+
+import org.apache.spark.{SparkConf, SparkException}
+import org.apache.spark.sql.{DataFrame, QueryTest, Row}
+import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.test.SharedSparkSession
+import org.apache.spark.util.Utils
+
+/**
+ * Reads of text files packed in tar archives (`.tar`/`.tar.gz`/`.tgz`), streamed through the
+ * [[ArchiveReader]] path. Entries are streamed (never unpacked to disk), and the central contract
+ * verified throughout is parity with reading the same files from a directory.
+ *
+ * Unlike CSV/JSON this does not reuse [[ArchiveReadSuiteBase]]: the text data source has a single
+ * fixed `value` column (one row per line, or per entry with `wholetext`) and no schema inference,
+ * so the structured, two-column tests there do not apply.
+ */
+class TextTarArchiveReadSuite extends QueryTest with SharedSparkSession {
+  // The archive-format reader flag is set once here, so it applies to every test in the suite.
+  override def sparkConf: SparkConf =
+    super.sparkConf.set(SQLConf.ARCHIVE_FORMAT_READER_ENABLED.key, "true")
+
+  /** Extensions under test; `withArchiveFile` falls back to the first one. */
+  private val archiveExtensions: Seq[String] = List("tar", "tar.gz", "tgz")
+
+  private def textBytes(s: String): Array[Byte] = s.getBytes(StandardCharsets.UTF_8)
+
+  /** Packs `entries` (name -> bytes) into a tar at `dest`, gzipped for `.gz`/`.tgz` names. */
+  private def writeArchive(dest: File, entries: Seq[(String, Array[Byte])]): Unit = {
+    val lowerName = dest.getName.toLowerCase(Locale.ROOT)
+    val gzipped = Seq(".gz", ".tgz").exists(suffix => lowerName.endsWith(suffix))
+    val fileOut = new FileOutputStream(dest)
+    val sink: OutputStream = if (gzipped) new GZIPOutputStream(fileOut) else fileOut
+    val tarOut = new TarArchiveOutputStream(sink)
+    try {
+      entries.foreach { case (entryName, payload) =>
+        val header = new TarArchiveEntry(entryName)
+        header.setSize(payload.length.toLong)
+        tarOut.putArchiveEntry(header)
+        tarOut.write(payload)
+        tarOut.closeArchiveEntry()
+      }
+      tarOut.finish()
+    } finally {
+      tarOut.close()
+    }
+  }
+
+  /** Runs `f` on a not-yet-created `archive.<extension>` path in a temp dir, then cleans up. */
+  private def withArchiveFile(
+      extension: String = archiveExtensions.head)(f: File => Unit): Unit = {
+    val tmpDir = Utils.createTempDir(namePrefix = "archive-test")
+    try f(new File(tmpDir, s"archive.$extension")) finally Utils.deleteRecursively(tmpDir)
+  }
+
+  private def read(path: String, options: Map[String, String] = Map.empty): DataFrame =
+    spark.read.options(options).text(path)
+
+  test("read a tar archive of multiple text entries matches the union of the lines") {
+    for (extension <- archiveExtensions) {
+      withArchiveFile(extension) { tarFile =>
+        writeArchive(tarFile, Seq(
+          "a.txt" -> textBytes("line1\nline2\n"),
+          "b.txt" -> textBytes("line3\n"),
+          "c.txt" -> textBytes("line4\nline5\n")))
+        checkAnswer(
+          read(tarFile.getCanonicalPath),
+          (1 to 5).map(i => Row(s"line$i")))
+      }
+    }
+  }
+
+  test("archive entries read like a directory of the same files") {
+    val files = Seq("a.txt" -> textBytes("a1\na2\n"), "b.txt" -> textBytes("b1\n"))
+    withArchiveFile() { tarFile =>
+      writeArchive(tarFile, files)
+      val archiveDf = read(tarFile.getCanonicalPath)
+      withTempDir { dir =>
+        files.foreach { case (name, data) => Files.write(new File(dir, name).toPath, data) }
+        checkAnswer(archiveDf, read(dir.getCanonicalPath).collect().toSeq)
+      }
+    }
+  }
+
+  test("an empty archive yields no rows") {
+    withArchiveFile() { tarFile =>
+      writeArchive(tarFile, Nil)
+      checkAnswer(read(tarFile.getCanonicalPath), Seq.empty[Row])
+    }
+  }
+
+  test("an archive and loose text files in the same directory are all read") {
+    withTempDir { mixedDir =>
+      // An archive sits next to a plain text file; one directory read must return both.
+      val tarFile = new File(mixedDir, s"data.${archiveExtensions.head}")
+      writeArchive(tarFile, Seq("a.txt" -> textBytes("in-archive-1\nin-archive-2\n")))
+      Files.write(new File(mixedDir, "loose.txt").toPath, textBytes("loose-1\n"))
+      val expected = Seq(Row("in-archive-1"), Row("in-archive-2"), Row("loose-1"))
+      checkAnswer(
+        read(mixedDir.getCanonicalPath),
+        expected)
+    }
+  }
+
+  test("wholetext reads each entry as a single row") {
+    withArchiveFile() { tarFile =>
+      val names = Seq("a.txt", "b.txt")
+      val contents = Seq("l1\nl2", "only")
+      writeArchive(tarFile, names.zip(contents.map(textBytes)))
+      checkAnswer(
+        read(tarFile.getCanonicalPath, Map("wholetext" -> "true")),
+        contents.map(Row(_)))
+    }
+  }
+
+  test("a custom line separator splits entries into rows") {
+    withArchiveFile() { tarFile =>
+      writeArchive(tarFile, Seq("a.txt" -> textBytes("x;y;z")))
+      val semicolonSeparated = Map("lineSep" -> ";")
+      checkAnswer(
+        read(tarFile.getCanonicalPath, semicolonSeparated), Seq("x", "y", "z").map(Row(_)))
+    }
+  }
+
+  test("count over an archive reads the right number of rows with an empty required schema") {
+    withArchiveFile() { tarFile =>
+      val threeLines = "a.txt" -> textBytes("1\n2\n3\n")
+      val oneLine = "b.txt" -> textBytes("4\n")
+      writeArchive(tarFile, Seq(threeLines, oneLine))
+      assert(read(tarFile.getCanonicalPath).count() == 4L)
+    }
+  }
+
+  test("an archive always yields a single partition regardless of size") {
+    withArchiveFile() { tarFile =>
+      val payload = textBytes((1 to 1000).map(i => s"value-$i").mkString("", "\n", "\n"))
+      writeArchive(tarFile, (0 until 4).map(i => s"part-$i.txt" -> payload))
+      withSQLConf(SQLConf.FILES_MAX_PARTITION_BYTES.key -> "1024") {
+        val df = read(tarFile.getCanonicalPath)
+        assert(df.rdd.getNumPartitions == 1,
+          s"archive should be a single partition; got ${df.rdd.getNumPartitions}")
+        assert(df.count() == 4000L)
+      }
+    }
+  }
+
+  for (skipCorrupt <- Seq(true, false)) {
+    test(s"ignoreCorruptFiles=$skipCorrupt controls whether a corrupt archive is skipped") {
+      withArchiveFile("tar.gz") { tarFile =>
+        Files.write(tarFile.toPath, textBytes("this is not a valid gzip-compressed tar archive"))
+        withSQLConf(SQLConf.IGNORE_CORRUPT_FILES.key -> skipCorrupt.toString) {
+          if (!skipCorrupt) {
+            intercept[SparkException](read(tarFile.getCanonicalPath).collect())
+          } else {
+            checkAnswer(read(tarFile.getCanonicalPath), Seq.empty[Row])
+          }
+        }
+      }
+    }
+  }
+
+  for (skipMissing <- Seq(true, false)) {
+    test(s"ignoreMissingFiles=$skipMissing controls whether a missing archive is skipped") {
+      withArchiveFile() { tarFile =>
+        writeArchive(tarFile, Seq("a.txt" -> textBytes("line1\nline2\n")))
+        withSQLConf(SQLConf.IGNORE_MISSING_FILES.key -> skipMissing.toString) {
+          // Order matters: the DataFrame is built while the archive still exists, and the file
+          // is deleted before any action runs, so the scan itself meets a missing file.
+          val listedDf = read(tarFile.getCanonicalPath)
+          assert(tarFile.delete(), s"failed to delete $tarFile")
+          if (!skipMissing) {
+            intercept[SparkException](listedDf.collect())
+          } else {
+            checkAnswer(listedDf, Seq.empty[Row])
+          }
+        }
+      }
+    }
+  }
+}