Dazfuller/no data fix #43

Merged · 3 commits · Jun 29, 2024
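In summary, this change teaches the parser to handle workbooks without data: reading a completely empty workbook now fails fast with ExcelParserException("No data found on first row"), while a workbook containing only a header row falls back to an all-StringType schema and yields a single empty record. A minimal usage sketch of the new behaviour, assuming the data source from this repository is on the classpath (paths are illustrative):

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").getOrCreate()

// Headers-only workbook: schema inference falls back to StringType columns
// and the read yields exactly one empty record.
val headersOnly = spark.read
  .format("excel")
  .load("src/test/resources/Parser/NoData.xlsx")
headersOnly.count() // 1

// A completely empty workbook instead raises
// ExcelParserException("No data found on first row").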
69 changes: 38 additions & 31 deletions src/main/scala/com/elastacloud/spark/excel/parser/ExcelParser.scala
@@ -426,41 +426,48 @@ private[excel] class ExcelParser(inputStream: InputStream, options: ExcelParserOptions)
firstColumnIndex.to(lastColumnIndex).zipWithIndex.map { case (_, i) => s"col_$i" }
}

// Determine the last data row, this is either the last row of data, or the maximum number of rows defined by the user
val lastRowNum = options.maxRowCount match {
case rowNum if rowNum != 0 && rowNum + firstDataRow.getRowNum <= sheet.getLastRowNum => rowNum + firstDataRow.getRowNum
case _ => sheet.getLastRowNum
}
var fields = if (firstRow.getRowNum == sheet.getLastRowNum) {
// If there is no data in the file (other than the header) then return a default schema
firstColumnIndex.until(lastColumnIndex).zipWithIndex.map { case (_, i) =>
StructField(fieldNames(i), StringType, nullable = true)
}
} else {
// Determine the last data row, this is either the last row of data, or the maximum number of rows defined by the user
val lastRowNum = options.maxRowCount match {
case rowNum if rowNum != 0 && rowNum + firstDataRow.getRowNum <= sheet.getLastRowNum => rowNum + firstDataRow.getRowNum
case _ => sheet.getLastRowNum
}

// Get the field structure for data in the workbook
var fields = firstColumnIndex.until(lastColumnIndex).zipWithIndex.map { case (colIndex, i) =>
// Get the collection of types for the current column across the rows used for inferring the schema
val colTypes = firstDataRow.getRowNum.until(lastRowNum).flatMap(rowIndex => {
// Get the current cell (or cell containing data for part of a merged region), then determine the Spark DataType
// for the cell
val currentCell = sheet.getRow(rowIndex).getCell(colIndex, Row.MissingCellPolicy.RETURN_NULL_AND_BLANK)
val fieldType: Option[DataType] = if (currentCell == null || currentCell.getCellType == CellType.BLANK) None else {
val cellType = formulaEvaluator match {
case Some(evaluator) => evaluator.evaluate(currentCell).getCellType
case None => currentCell.getCellType
}
// Get the field structure for data in the workbook
firstColumnIndex.until(lastColumnIndex).zipWithIndex.map { case (colIndex, i) =>
// Get the collection of types for the current column across the rows used for inferring the schema
val colTypes = firstDataRow.getRowNum.until(lastRowNum).flatMap(rowIndex => {
// Get the current cell (or cell containing data for part of a merged region), then determine the Spark DataType
// for the cell
val currentCell = sheet.getRow(rowIndex).getCell(colIndex, Row.MissingCellPolicy.RETURN_NULL_AND_BLANK)
val fieldType: Option[DataType] = if (currentCell == null || currentCell.getCellType == CellType.BLANK) None else {
val cellType = formulaEvaluator match {
case Some(evaluator) => evaluator.evaluate(currentCell).getCellType
case None => currentCell.getCellType
}

cellType match {
case CellType._NONE | CellType.BLANK | CellType.ERROR => None
case CellType.BOOLEAN => Some(BooleanType)
case CellType.NUMERIC => if (DateUtil.isCellDateFormatted(currentCell)) Some(TimestampType) else Some(DoubleType)
case _ => Some(StringType)
cellType match {
case CellType._NONE | CellType.BLANK | CellType.ERROR => None
case CellType.BOOLEAN => Some(BooleanType)
case CellType.NUMERIC => if (DateUtil.isCellDateFormatted(currentCell)) Some(TimestampType) else Some(DoubleType)
case _ => Some(StringType)
}
}
fieldType
})

// If all of the cells in the inference set are of the same type, then use this as the schema type, otherwise
// default to data as a string
if (colTypes.distinct.length == 1) {
StructField(fieldNames(i), colTypes.head, nullable = true)
} else {
StructField(fieldNames(i), StringType, nullable = true)
}
fieldType
})

// If all of the cells in the inference set are of the same type, then use this as the schema type, otherwise
// default to data as a string
if (colTypes.distinct.length == 1) {
StructField(fieldNames(i), colTypes.head, nullable = true)
} else {
StructField(fieldNames(i), StringType, nullable = true)
}
}

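The inference logic in the hunk above reduces to two rules, sketched standalone below; these helper signatures are illustrative stand-ins, not the parser's real internals.

import org.apache.spark.sql.types._

// Last row used for inference: the user-supplied maxRowCount offset from the
// first data row, unless it is 0 ("no limit") or would overrun the sheet.
def lastRowNum(maxRowCount: Int, firstDataRowNum: Int, sheetLastRowNum: Int): Int =
  maxRowCount match {
    case n if n != 0 && n + firstDataRowNum <= sheetLastRowNum => n + firstDataRowNum
    case _ => sheetLastRowNum
  }

// Field type for one column: if every sampled non-blank cell agrees on a
// single type, use it; otherwise (mixed types, or no data at all, as in the
// new headers-only branch) fall back to StringType.
def inferField(name: String, colTypes: Seq[DataType]): StructField =
  if (colTypes.distinct.length == 1) StructField(name, colTypes.head, nullable = true)
  else StructField(name, StringType, nullable = true)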
3 changes: 3 additions & 0 deletions src/test/resources/Parser/Empty.xlsx
Git LFS file not shown
3 changes: 3 additions & 0 deletions src/test/resources/Parser/NoData.xlsx
Git LFS file not shown
4 changes: 2 additions & 2 deletions src/test/resources/Parser/VaryingTypes.xlsx
Git LFS file not shown
@@ -16,6 +16,7 @@

package com.elastacloud.spark.excel

import com.elastacloud.spark.excel.parser.ExcelParserException
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types._
import org.scalatest.BeforeAndAfterAll
@@ -180,6 +181,38 @@ class DefaultSourceTests extends AnyFlatSpec with Matchers with BeforeAndAfterAll
df.count() should be(3)
}

"Reading an empty workbook" should "throw an exception" in {
val inputPath = testFilePath("/Parser/Empty.xlsx")

val error = intercept[ExcelParserException] {
spark.read
.format("excel")
.load(inputPath.replace("%20", " "))
.count()
}

error.getMessage should be("No data found on first row")
}

it should "return a single empty record if only headers exist" in {
val inputPath = testFilePath("/Parser/NoData.xlsx")

val dataSchema = StructType(Array(
StructField("Col1", StringType, nullable = true),
StructField("Col2", StringType, nullable = true),
StructField("Col3", StringType, nullable = true),
StructField("Col4", StringType, nullable = true)
))

val df = spark.read
.format("com.elastacloud.spark.excel")
.schema(dataSchema)
.load(inputPath)

df.count() should be(1)
df.schema should equal(dataSchema)
}

"Attempting to write to Excel" should "raise an error" in {
import spark.implicits._

5 changes: 3 additions & 2 deletions src/test/scala/com/elastacloud/spark/excel/packageTests.scala
@@ -60,11 +60,12 @@ class packageTests extends AnyFlatSpec with Matchers with BeforeAndAfterAll {
it should "apply options to all files being read" in {
val input1 = testFilePath("/Parser/SimpleWorkbook.xlsx")
val input2 = testFilePath("/Parser/SimpleWorkbook.xls")
val input3 = testFilePath("/Parser/NoData.xlsx")

val df = spark.read
.option("headerRowCount", 0)
.option("cellAddress", "A2")
.excel(input1, input2)
.excel(input1, input2, input3)

val simpleWorkbookSchema = StructType(Array(
StructField("col_0", StringType, nullable = true),
@@ -73,6 +74,6 @@
))

df.schema should equal(simpleWorkbookSchema)
df.count() should be(6)
df.count() should be(7)
}
}
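(The expected count rises from 6 to 7 because the newly added NoData.xlsx contributes a single record alongside the six rows produced by the two SimpleWorkbook files.)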
@@ -122,29 +122,31 @@ class ExcelParserTests extends AnyFlatSpec with Matchers {
}
}

it should "handle cells with different types from the inferred schema" in {
it should "read data correctly using the inferred schema" in {
withInputStream("/Parser/VaryingTypes.xlsx") { inputStream =>
val options = new ExcelParserOptions(Map[String, String](
"evaluateFormulae" -> "false",
"maxRowCount" -> "3"
)) // Limit the row count so that it doesn't infer based on the string row

val expectedSchema = StructType(Array(
StructField("Item", StringType, nullable = true),
StructField("2010_0", DoubleType, nullable = true),
StructField("2011_0", DoubleType, nullable = true)
StructField("2011_0", DoubleType, nullable = true),
StructField("IsGood", BooleanType, nullable = true)
))

val expectedData = Seq(
Vector[Any]("Item 1".asUnsafe, 99.4, 99.4),
Vector[Any]("Item 2".asUnsafe, 12.4, 12.4),
Vector[Any]("Item 3".asUnsafe, 74.2, 74.2),
Vector[Any]("Item 4".asUnsafe, 36.8, 36.8),
Vector[Any]("Item 5".asUnsafe, 24.2, 24.2),
Vector[Any]("Item 6".asUnsafe, 11.6, 11.6),
Vector[Any]("Header Items".asUnsafe, null, null),
Vector[Any]("Item 12".asUnsafe, 99.2, 99.2),
Vector[Any]("Item 13".asUnsafe, 18.4, 18.4),
Vector[Any]("Item 14".asUnsafe, 12.3, 12.3)
Vector[Any]("Item 1".asUnsafe, 99.4, 99.4, true),
Vector[Any]("Item 2".asUnsafe, 12.4, 12.4, true),
Vector[Any]("Item 3".asUnsafe, 74.2, 74.2, true),
Vector[Any]("Item 4".asUnsafe, 36.8, 36.8, false),
Vector[Any]("Item 5".asUnsafe, 24.2, 24.2, false),
Vector[Any]("Item 6".asUnsafe, 11.6, 11.6, false),
Vector[Any]("Header Items".asUnsafe, null, null, null),
Vector[Any]("Item 12".asUnsafe, 99.2, 99.2, false),
Vector[Any]("Item 13".asUnsafe, 18.4, 18.4, true),
Vector[Any]("Item 14".asUnsafe, 12.3, 12.3, true)
)

val parser = new ExcelParser(inputStream, options)
@@ -537,20 +539,21 @@ class ExcelParserTests extends AnyFlatSpec with Matchers {
StructField("Item", StringType, nullable = true),
StructField("2010_0", DoubleType, nullable = true),
StructField("2011_0", DoubleType, nullable = true),
StructField("IsGood", BooleanType, nullable = true),
StructField("ValidRow", BooleanType, nullable = false)
))

val expectedData = Seq(
Vector[Any]("Item 1".asUnsafe, 99.4, 99.4, true),
Vector[Any]("Item 2".asUnsafe, 12.4, 12.4, true),
Vector[Any]("Item 3".asUnsafe, 74.2, 74.2, true),
Vector[Any]("Item 4".asUnsafe, 36.8, 36.8, true),
Vector[Any]("Item 5".asUnsafe, 24.2, 24.2, true),
Vector[Any]("Item 6".asUnsafe, 11.6, 11.6, true),
Vector[Any]("Header Items".asUnsafe, null, null, false),
Vector[Any]("Item 12".asUnsafe, 99.2, 99.2, true),
Vector[Any]("Item 13".asUnsafe, 18.4, 18.4, true),
Vector[Any]("Item 14".asUnsafe, 12.3, 12.3, true)
Vector[Any]("Item 1".asUnsafe, 99.4, 99.4, true, true),
Vector[Any]("Item 2".asUnsafe, 12.4, 12.4, true, true),
Vector[Any]("Item 3".asUnsafe, 74.2, 74.2, true, true),
Vector[Any]("Item 4".asUnsafe, 36.8, 36.8, false, true),
Vector[Any]("Item 5".asUnsafe, 24.2, 24.2, false, true),
Vector[Any]("Item 6".asUnsafe, 11.6, 11.6, false, true),
Vector[Any]("Header Items".asUnsafe, null, null, null, false),
Vector[Any]("Item 12".asUnsafe, 99.2, 99.2, false, true),
Vector[Any]("Item 13".asUnsafe, 18.4, 18.4, true, true),
Vector[Any]("Item 14".asUnsafe, 12.3, 12.3, true, true)
)

val parser = new ExcelParser(inputStream, options)
@@ -660,4 +663,27 @@ class ExcelParserTests extends AnyFlatSpec with Matchers {
actualData should equal(expectedData)
}
}

"Reading a file containing no data" should "throw an exception" in {
withInputStream("/Parser/Empty.xlsx") { inputStream =>
val parser = new ExcelParser(inputStream, new ExcelParserOptions())

val error = intercept[ExcelParserException] {
parser.getDataIterator.toList
}

error.getMessage should be("No data found on first row")
}
}

it should "return an single empty record if only headers exist" in {
withInputStream("/Parser/NoData.xlsx") { inputStream =>
val expectedData = Seq(Vector(null, null, null))

val parser = new ExcelParser(inputStream, new ExcelParserOptions())
val actualData = parser.getDataIterator.toList

actualData should be(expectedData)
}
}
}
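For completeness, the parser-level behaviour pinned down by the last two tests can be driven directly, as sketched below; ExcelParser is package-private (private[excel]), so code like this only compiles inside the com.elastacloud.spark.excel package tree, and the import locations of ExcelParser and ExcelParserOptions are assumed here.

import java.io.FileInputStream

val stream = new FileInputStream("src/test/resources/Parser/NoData.xlsx")
try {
  val parser = new ExcelParser(stream, new ExcelParserOptions())
  // A headers-only workbook yields a single all-null record
  val rows = parser.getDataIterator.toList
  assert(rows == List(Vector(null, null, null)))
} finally {
  stream.close()
}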