diff --git a/src/main/scala/com/elastacloud/spark/excel/parser/ExcelParser.scala b/src/main/scala/com/elastacloud/spark/excel/parser/ExcelParser.scala
index 04f17ff..ff10f62 100644
--- a/src/main/scala/com/elastacloud/spark/excel/parser/ExcelParser.scala
+++ b/src/main/scala/com/elastacloud/spark/excel/parser/ExcelParser.scala
@@ -426,41 +426,48 @@ private[excel] class ExcelParser(inputStream: InputStream, options: ExcelParserO
       firstColumnIndex.to(lastColumnIndex).zipWithIndex.map { case (_, i) => s"col_$i" }
     }
 
-    // Determine the last data row, this is either the last row of data, or the maximum number of rows defined by the user
-    val lastRowNum = options.maxRowCount match {
-      case rowNum if rowNum != 0 && rowNum + firstDataRow.getRowNum <= sheet.getLastRowNum => rowNum + firstDataRow.getRowNum
-      case _ => sheet.getLastRowNum
-    }
+    var fields = if (firstRow.getRowNum == sheet.getLastRowNum) {
+      // If there is no data in the file (other than the header) then return a default schema
+      firstColumnIndex.until(lastColumnIndex).zipWithIndex.map { case (_, i) =>
+        StructField(fieldNames(i), StringType, nullable = true)
+      }
+    } else {
+      // Determine the last data row; this is either the last row of data, or the maximum number of rows defined by the user
+      val lastRowNum = options.maxRowCount match {
+        case rowNum if rowNum != 0 && rowNum + firstDataRow.getRowNum <= sheet.getLastRowNum => rowNum + firstDataRow.getRowNum
+        case _ => sheet.getLastRowNum
+      }
 
-    // Get the field structure for data in the workbook
-    var fields = firstColumnIndex.until(lastColumnIndex).zipWithIndex.map { case (colIndex, i) =>
-      // Get the collection of types for the current column across the rows used for inferring the schema
-      val colTypes = firstDataRow.getRowNum.until(lastRowNum).flatMap(rowIndex => {
-        // Get the current cell (or cell containing data for part of a merged region), the determine the Spark DataType
-        // for the cell
-        val currentCell = sheet.getRow(rowIndex).getCell(colIndex, Row.MissingCellPolicy.RETURN_NULL_AND_BLANK)
-        val fieldType: Option[DataType] = if (currentCell == null || currentCell.getCellType == CellType.BLANK) None else {
-          val cellType = formulaEvaluator match {
-            case Some(evaluator) => evaluator.evaluate(currentCell).getCellType
-            case None => currentCell.getCellType
-          }
+      // Get the field structure for data in the workbook
+      firstColumnIndex.until(lastColumnIndex).zipWithIndex.map { case (colIndex, i) =>
+        // Get the collection of types for the current column across the rows used for inferring the schema
+        val colTypes = firstDataRow.getRowNum.until(lastRowNum).flatMap(rowIndex => {
+          // Get the current cell (or the cell containing data for part of a merged region), then determine the Spark DataType
+          // for the cell
+          val currentCell = sheet.getRow(rowIndex).getCell(colIndex, Row.MissingCellPolicy.RETURN_NULL_AND_BLANK)
+          val fieldType: Option[DataType] = if (currentCell == null || currentCell.getCellType == CellType.BLANK) None else {
+            val cellType = formulaEvaluator match {
+              case Some(evaluator) => evaluator.evaluate(currentCell).getCellType
+              case None => currentCell.getCellType
+            }
 
-          cellType match {
-            case CellType._NONE | CellType.BLANK | CellType.ERROR => None
-            case CellType.BOOLEAN => Some(BooleanType)
-            case CellType.NUMERIC => if (DateUtil.isCellDateFormatted(currentCell)) Some(TimestampType) else Some(DoubleType)
-            case _ => Some(StringType)
+            cellType match {
+              case CellType._NONE | CellType.BLANK | CellType.ERROR => None
+              case CellType.BOOLEAN => Some(BooleanType)
+              case CellType.NUMERIC => if (DateUtil.isCellDateFormatted(currentCell)) Some(TimestampType) else Some(DoubleType)
+              case _ => Some(StringType)
+            }
           }
+          fieldType
+        })
+
+        // If all of the cells in the inference set are of the same type then use this as the schema type, otherwise
+        // default to treating the data as a string
+        if (colTypes.distinct.length == 1) {
+          StructField(fieldNames(i), colTypes.head, nullable = true)
+        } else {
+          StructField(fieldNames(i), StringType, nullable = true)
        }
-        fieldType
-      })
-
-      // If all of the cells in the inference set are of the same type, then use this as the schema type, otherwise
-      // default to data as a string
-      if (colTypes.distinct.length == 1) {
-        StructField(fieldNames(i), colTypes.head, nullable = true)
-      } else {
-        StructField(fieldNames(i), StringType, nullable = true)
       }
     }
 
diff --git a/src/test/resources/Parser/Empty.xlsx b/src/test/resources/Parser/Empty.xlsx
new file mode 100644
index 0000000..322af83
--- /dev/null
+++ b/src/test/resources/Parser/Empty.xlsx
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:477e9ee8c58ba24b71b49169eb758b6c0a3505242dacfcc0bd34887f3536c133
+size 9316
diff --git a/src/test/resources/Parser/NoData.xlsx b/src/test/resources/Parser/NoData.xlsx
new file mode 100644
index 0000000..8328cd9
--- /dev/null
+++ b/src/test/resources/Parser/NoData.xlsx
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6f9a1ae3c97097df2702dee8b8023933df3af6902c421fd86a862dceeefa19d0
+size 9749
diff --git a/src/test/resources/Parser/VaryingTypes.xlsx b/src/test/resources/Parser/VaryingTypes.xlsx
index f8f9ab2..78ccd66 100644
--- a/src/test/resources/Parser/VaryingTypes.xlsx
+++ b/src/test/resources/Parser/VaryingTypes.xlsx
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1c7054a41ec2e4a1b3396adb83f82c243ee8bbdd997a8c2440fe941e0cfc0a07
-size 9152
+oid sha256:c9c4df21ae384d3aa8d0160ce69253d6edcfa8c171ae802af64e8b5da8aa0758
+size 10117
diff --git a/src/test/scala/com/elastacloud/spark/excel/DefaultSourceTests.scala b/src/test/scala/com/elastacloud/spark/excel/DefaultSourceTests.scala
index 1b9f53e..8e8175e 100644
--- a/src/test/scala/com/elastacloud/spark/excel/DefaultSourceTests.scala
+++ b/src/test/scala/com/elastacloud/spark/excel/DefaultSourceTests.scala
@@ -16,6 +16,7 @@
 
 package com.elastacloud.spark.excel
 
+import com.elastacloud.spark.excel.parser.ExcelParserException
 import org.apache.spark.sql.SparkSession
 import org.apache.spark.sql.types._
 import org.scalatest.BeforeAndAfterAll
@@ -180,6 +181,38 @@ class DefaultSourceTests extends AnyFlatSpec with Matchers with BeforeAndAfterAl
     df.count() should be(3)
   }
 
+  "Reading an empty workbook" should "throw an exception" in {
+    val inputPath = testFilePath("/Parser/Empty.xlsx")
+
+    val error = intercept[ExcelParserException] {
+      spark.read
+        .format("excel")
+        .load(inputPath.replace("%20", " "))
+        .count()
+    }
+
+    error.getMessage should be("No data found on first row")
+  }
+
+  it should "return a single empty record if only headers exist" in {
+    val inputPath = testFilePath("/Parser/NoData.xlsx")
+
+    val dataSchema = StructType(Array(
+      StructField("Col1", StringType, nullable = true),
+      StructField("Col2", StringType, nullable = true),
+      StructField("Col3", StringType, nullable = true),
+      StructField("Col4", StringType, nullable = true)
+    ))
+
+    val df = spark.read
+      .format("com.elastacloud.spark.excel")
+      .schema(dataSchema)
+      .load(inputPath)
+
+    df.count() should be(1)
+    df.schema should equal(dataSchema)
+  }
+
   "Attempting to write to Excel" should "raise an error" in {
     import spark.implicits._
 
diff --git a/src/test/scala/com/elastacloud/spark/excel/packageTests.scala b/src/test/scala/com/elastacloud/spark/excel/packageTests.scala
index 20e7261..41ba1a0 100644
--- a/src/test/scala/com/elastacloud/spark/excel/packageTests.scala
+++ b/src/test/scala/com/elastacloud/spark/excel/packageTests.scala
@@ -60,11 +60,12 @@ class packageTests extends AnyFlatSpec with Matchers with BeforeAndAfterAll {
   it should "apply options to all files being read" in {
     val input1 = testFilePath("/Parser/SimpleWorkbook.xlsx")
     val input2 = testFilePath("/Parser/SimpleWorkbook.xls")
+    val input3 = testFilePath("/Parser/NoData.xlsx")
 
     val df = spark.read
       .option("headerRowCount", 0)
       .option("cellAddress", "A2")
-      .excel(input1, input2)
+      .excel(input1, input2, input3)
 
     val simpleWorkbookSchema = StructType(Array(
       StructField("col_0", StringType, nullable = true),
@@ -73,6 +74,6 @@
     ))
 
     df.schema should equal(simpleWorkbookSchema)
-    df.count() should be(6)
+    df.count() should be(7)
   }
 }
diff --git a/src/test/scala/com/elastacloud/spark/excel/parser/ExcelParserTests.scala b/src/test/scala/com/elastacloud/spark/excel/parser/ExcelParserTests.scala
index 5e1bb20..7ebd803 100644
--- a/src/test/scala/com/elastacloud/spark/excel/parser/ExcelParserTests.scala
+++ b/src/test/scala/com/elastacloud/spark/excel/parser/ExcelParserTests.scala
@@ -122,29 +122,31 @@ class ExcelParserTests extends AnyFlatSpec with Matchers {
     }
   }
 
-  it should "handle cells with different types from the inferred schema" in {
+  it should "read data correctly using the inferred schema" in {
    withInputStream("/Parser/VaryingTypes.xlsx") { inputStream =>
       val options = new ExcelParserOptions(Map[String, String](
+        "evaluateFormulae" -> "false",
         "maxRowCount" -> "3"
       )) // Limit the row count so that it doesn't infer based on the string row
 
       val expectedSchema = StructType(Array(
         StructField("Item", StringType, nullable = true),
         StructField("2010_0", DoubleType, nullable = true),
-        StructField("2011_0", DoubleType, nullable = true)
+        StructField("2011_0", DoubleType, nullable = true),
+        StructField("IsGood", BooleanType, nullable = true)
       ))
 
       val expectedData = Seq(
-        Vector[Any]("Item 1".asUnsafe, 99.4, 99.4),
-        Vector[Any]("Item 2".asUnsafe, 12.4, 12.4),
-        Vector[Any]("Item 3".asUnsafe, 74.2, 74.2),
-        Vector[Any]("Item 4".asUnsafe, 36.8, 36.8),
-        Vector[Any]("Item 5".asUnsafe, 24.2, 24.2),
-        Vector[Any]("Item 6".asUnsafe, 11.6, 11.6),
-        Vector[Any]("Header Items".asUnsafe, null, null),
-        Vector[Any]("Item 12".asUnsafe, 99.2, 99.2),
-        Vector[Any]("Item 13".asUnsafe, 18.4, 18.4),
-        Vector[Any]("Item 14".asUnsafe, 12.3, 12.3)
+        Vector[Any]("Item 1".asUnsafe, 99.4, 99.4, true),
+        Vector[Any]("Item 2".asUnsafe, 12.4, 12.4, true),
+        Vector[Any]("Item 3".asUnsafe, 74.2, 74.2, true),
+        Vector[Any]("Item 4".asUnsafe, 36.8, 36.8, false),
+        Vector[Any]("Item 5".asUnsafe, 24.2, 24.2, false),
+        Vector[Any]("Item 6".asUnsafe, 11.6, 11.6, false),
+        Vector[Any]("Header Items".asUnsafe, null, null, null),
+        Vector[Any]("Item 12".asUnsafe, 99.2, 99.2, false),
+        Vector[Any]("Item 13".asUnsafe, 18.4, 18.4, true),
+        Vector[Any]("Item 14".asUnsafe, 12.3, 12.3, true)
       )
 
       val parser = new ExcelParser(inputStream, options)
@@ -537,20 +539,21 @@
         StructField("Item", StringType, nullable = true),
         StructField("2010_0", DoubleType, nullable = true),
         StructField("2011_0", DoubleType, nullable = true),
+        StructField("IsGood", BooleanType, nullable = true),
         StructField("ValidRow", BooleanType, nullable = false)
       ))
 
       val expectedData = Seq(
-        Vector[Any]("Item 1".asUnsafe, 99.4, 99.4, true),
-        Vector[Any]("Item 2".asUnsafe, 12.4, 12.4, true),
-        Vector[Any]("Item 3".asUnsafe, 74.2, 74.2, true),
-        Vector[Any]("Item 4".asUnsafe, 36.8, 36.8, true),
-        Vector[Any]("Item 5".asUnsafe, 24.2, 24.2, true),
-        Vector[Any]("Item 6".asUnsafe, 11.6, 11.6, true),
-        Vector[Any]("Header Items".asUnsafe, null, null, false),
-        Vector[Any]("Item 12".asUnsafe, 99.2, 99.2, true),
-        Vector[Any]("Item 13".asUnsafe, 18.4, 18.4, true),
-        Vector[Any]("Item 14".asUnsafe, 12.3, 12.3, true)
+        Vector[Any]("Item 1".asUnsafe, 99.4, 99.4, true, true),
+        Vector[Any]("Item 2".asUnsafe, 12.4, 12.4, true, true),
+        Vector[Any]("Item 3".asUnsafe, 74.2, 74.2, true, true),
+        Vector[Any]("Item 4".asUnsafe, 36.8, 36.8, false, true),
+        Vector[Any]("Item 5".asUnsafe, 24.2, 24.2, false, true),
+        Vector[Any]("Item 6".asUnsafe, 11.6, 11.6, false, true),
+        Vector[Any]("Header Items".asUnsafe, null, null, null, false),
+        Vector[Any]("Item 12".asUnsafe, 99.2, 99.2, false, true),
+        Vector[Any]("Item 13".asUnsafe, 18.4, 18.4, true, true),
+        Vector[Any]("Item 14".asUnsafe, 12.3, 12.3, true, true)
       )
 
       val parser = new ExcelParser(inputStream, options)
@@ -660,4 +663,27 @@
       actualData should equal(expectedData)
     }
   }
+
+  "Reading a file containing no data" should "throw an exception" in {
+    withInputStream("/Parser/Empty.xlsx") { inputStream =>
+      val parser = new ExcelParser(inputStream, new ExcelParserOptions())
+
+      val error = intercept[ExcelParserException] {
+        parser.getDataIterator.toList
+      }
+
+      error.getMessage should be("No data found on first row")
+    }
+  }
+
+  it should "return a single empty record if only headers exist" in {
+    withInputStream("/Parser/NoData.xlsx") { inputStream =>
+      val expectedData = Seq(Vector(null, null, null))
+
+      val parser = new ExcelParser(inputStream, new ExcelParserOptions())
+      val actualData = parser.getDataIterator.toList
+
+      actualData should be(expectedData)
+    }
+  }
 }
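For reviewers, the inference rule this change builds on is compact enough to state on its own. The helper below is a hypothetical, self-contained sketch (`resolveColumnType` does not exist in the repository); it mirrors the `colTypes.distinct.length == 1` check in `ExcelParser.scala`, including the empty-sample case that a header-only sheet produces.

```scala
import org.apache.spark.sql.types.{DataType, StringType}

// Sketch of the column-type inference rule: keep an inferred type only when
// every sampled, non-blank cell in the column agrees on it; any mixture of
// types (or an empty sample, e.g. a header-only sheet) falls back to StringType.
def resolveColumnType(observed: Seq[DataType]): DataType =
  if (observed.distinct.length == 1) observed.head
  else StringType
```

The net effect, as exercised by the new tests: a header-only workbook such as `NoData.xlsx` now infers an all-string schema and yields a single all-null record, while a completely empty workbook such as `Empty.xlsx` still fails with `ExcelParserException("No data found on first row")`.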