Dazfuller/no data fix #43

Merged · 3 commits · Jun 29, 2024
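In summary, this change teaches the parser to handle workbooks without data: reading a completely empty workbook now fails fast with ExcelParserException("No data found on first row"), while a workbook containing only a header row falls back to an all-StringType schema and yields a single empty record. A minimal usage sketch of the new behaviour, assuming the data source from this repository is on the classpath (paths are illustrative):

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").getOrCreate()

// Headers-only workbook: schema inference falls back to StringType columns
// and the read yields exactly one empty record.
val headersOnly = spark.read
  .format("excel")
  .load("src/test/resources/Parser/NoData.xlsx")
headersOnly.count() // 1

// A completely empty workbook instead raises
// ExcelParserException("No data found on first row").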
69 changes: 38 additions & 31 deletions src/main/scala/com/elastacloud/spark/excel/parser/ExcelParser.scala
@@ -426,41 +426,48 @@ private[excel] class ExcelParser(inputStream: InputStream, options: ExcelParserOptions)
firstColumnIndex.to(lastColumnIndex).zipWithIndex.map { case (_, i) => s"col_$i" }
}

// Determine the last data row, this is either the last row of data, or the maximum number of rows defined by the user
val lastRowNum = options.maxRowCount match {
case rowNum if rowNum != 0 && rowNum + firstDataRow.getRowNum <= sheet.getLastRowNum => rowNum + firstDataRow.getRowNum
case _ => sheet.getLastRowNum
}
var fields = if (firstRow.getRowNum == sheet.getLastRowNum) {
// If there is no data in the file (other than the header) then return a default schema
firstColumnIndex.until(lastColumnIndex).zipWithIndex.map { case (_, i) =>
StructField(fieldNames(i), StringType, nullable = true)
}
} else {
// Determine the last data row, this is either the last row of data, or the maximum number of rows defined by the user
val lastRowNum = options.maxRowCount match {
case rowNum if rowNum != 0 && rowNum + firstDataRow.getRowNum <= sheet.getLastRowNum => rowNum + firstDataRow.getRowNum
case _ => sheet.getLastRowNum
}

// Get the field structure for data in the workbook
var fields = firstColumnIndex.until(lastColumnIndex).zipWithIndex.map { case (colIndex, i) =>
// Get the collection of types for the current column across the rows used for inferring the schema
val colTypes = firstDataRow.getRowNum.until(lastRowNum).flatMap(rowIndex => {
// Get the current cell (or cell containing data for part of a merged region), then determine the Spark DataType
// for the cell
val currentCell = sheet.getRow(rowIndex).getCell(colIndex, Row.MissingCellPolicy.RETURN_NULL_AND_BLANK)
val fieldType: Option[DataType] = if (currentCell == null || currentCell.getCellType == CellType.BLANK) None else {
val cellType = formulaEvaluator match {
case Some(evaluator) => evaluator.evaluate(currentCell).getCellType
case None => currentCell.getCellType
}
// Get the field structure for data in the workbook
firstColumnIndex.until(lastColumnIndex).zipWithIndex.map { case (colIndex, i) =>
// Get the collection of types for the current column across the rows used for inferring the schema
val colTypes = firstDataRow.getRowNum.until(lastRowNum).flatMap(rowIndex => {
// Get the current cell (or cell containing data for part of a merged region), then determine the Spark DataType
// for the cell
val currentCell = sheet.getRow(rowIndex).getCell(colIndex, Row.MissingCellPolicy.RETURN_NULL_AND_BLANK)
val fieldType: Option[DataType] = if (currentCell == null || currentCell.getCellType == CellType.BLANK) None else {
val cellType = formulaEvaluator match {
case Some(evaluator) => evaluator.evaluate(currentCell).getCellType
case None => currentCell.getCellType
}

cellType match {
case CellType._NONE | CellType.BLANK | CellType.ERROR => None
case CellType.BOOLEAN => Some(BooleanType)
case CellType.NUMERIC => if (DateUtil.isCellDateFormatted(currentCell)) Some(TimestampType) else Some(DoubleType)
case _ => Some(StringType)
cellType match {
case CellType._NONE | CellType.BLANK | CellType.ERROR => None
case CellType.BOOLEAN => Some(BooleanType)
case CellType.NUMERIC => if (DateUtil.isCellDateFormatted(currentCell)) Some(TimestampType) else Some(DoubleType)
case _ => Some(StringType)
}
}
fieldType
})

// If all of the cells in the inference set are of the same type, then use this as the schema type, otherwise
// default to data as a string
if (colTypes.distinct.length == 1) {
StructField(fieldNames(i), colTypes.head, nullable = true)
} else {
StructField(fieldNames(i), StringType, nullable = true)
}
fieldType
})

// If all of the cells in the inference set are of the same type, then use this as the schema type, otherwise
// default to data as a string
if (colTypes.distinct.length == 1) {
StructField(fieldNames(i), colTypes.head, nullable = true)
} else {
StructField(fieldNames(i), StringType, nullable = true)
}
}

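The inference logic in the hunk above reduces to two rules, sketched standalone below; these helper signatures are illustrative stand-ins, not the parser's real internals.

import org.apache.spark.sql.types._

// Last row used for inference: the user-supplied maxRowCount offset from the
// first data row, unless it is 0 ("no limit") or would overrun the sheet.
def lastRowNum(maxRowCount: Int, firstDataRowNum: Int, sheetLastRowNum: Int): Int =
  maxRowCount match {
    case n if n != 0 && n + firstDataRowNum <= sheetLastRowNum => n + firstDataRowNum
    case _ => sheetLastRowNum
  }

// Field type for one column: if every sampled non-blank cell agrees on a
// single type, use it; otherwise (mixed types, or no data at all, as in the
// new headers-only branch) fall back to StringType.
def inferField(name: String, colTypes: Seq[DataType]): StructField =
  if (colTypes.distinct.length == 1) StructField(name, colTypes.head, nullable = true)
  else StructField(name, StringType, nullable = true)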
3 changes: 3 additions & 0 deletions src/test/resources/Parser/Empty.xlsx
Git LFS file not shown
3 changes: 3 additions & 0 deletions src/test/resources/Parser/NoData.xlsx
Git LFS file not shown
4 changes: 2 additions & 2 deletions src/test/resources/Parser/VaryingTypes.xlsx
Git LFS file not shown
@@ -16,6 +16,7 @@

package com.elastacloud.spark.excel

import com.elastacloud.spark.excel.parser.ExcelParserException
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types._
import org.scalatest.BeforeAndAfterAll
@@ -180,6 +181,38 @@ class DefaultSourceTests extends AnyFlatSpec with Matchers with BeforeAndAfterAll
df.count() should be(3)
}

"Reading an empty workbook" should "throw an exception" in {
val inputPath = testFilePath("/Parser/Empty.xlsx")

val error = intercept[ExcelParserException] {
spark.read
.format("excel")
.load(inputPath.replace("%20", " "))
.count()
}

error.getMessage should be("No data found on first row")
}

it should "return a single empty record if only headers exist" in {
val inputPath = testFilePath("/Parser/NoData.xlsx")

val dataSchema = StructType(Array(
StructField("Col1", StringType, nullable = true),
StructField("Col2", StringType, nullable = true),
StructField("Col3", StringType, nullable = true),
StructField("Col4", StringType, nullable = true)
))

val df = spark.read
.format("com.elastacloud.spark.excel")
.schema(dataSchema)
.load(inputPath)

df.count() should be(1)
df.schema should equal(dataSchema)
}

"Attempting to write to Excel" should "raise an error" in {
import spark.implicits._

5 changes: 3 additions & 2 deletions src/test/scala/com/elastacloud/spark/excel/packageTests.scala
@@ -60,11 +60,12 @@ class packageTests extends AnyFlatSpec with Matchers with BeforeAndAfterAll {
it should "apply options to all files being read" in {
val input1 = testFilePath("/Parser/SimpleWorkbook.xlsx")
val input2 = testFilePath("/Parser/SimpleWorkbook.xls")
val input3 = testFilePath("/Parser/NoData.xlsx")

val df = spark.read
.option("headerRowCount", 0)
.option("cellAddress", "A2")
.excel(input1, input2)
.excel(input1, input2, input3)

val simpleWorkbookSchema = StructType(Array(
StructField("col_0", StringType, nullable = true),
@@ -73,6 +74,6 @@
))

df.schema should equal(simpleWorkbookSchema)
df.count() should be(6)
df.count() should be(7)
}
}
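(The expected count rises from 6 to 7 because the newly added NoData.xlsx contributes a single record alongside the six rows produced by the two SimpleWorkbook files.)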
@@ -122,29 +122,31 @@ class ExcelParserTests extends AnyFlatSpec with Matchers {
}
}

it should "handle cells with different types from the inferred schema" in {
it should "read data correctly using the inferred schema" in {
withInputStream("/Parser/VaryingTypes.xlsx") { inputStream =>
val options = new ExcelParserOptions(Map[String, String](
"evaluateFormulae" -> "false",
"maxRowCount" -> "3"
)) // Limit the row count so that it doesn't infer based on the string row

val expectedSchema = StructType(Array(
StructField("Item", StringType, nullable = true),
StructField("2010_0", DoubleType, nullable = true),
StructField("2011_0", DoubleType, nullable = true)
StructField("2011_0", DoubleType, nullable = true),
StructField("IsGood", BooleanType, nullable = true)
))

val expectedData = Seq(
Vector[Any]("Item 1".asUnsafe, 99.4, 99.4),
Vector[Any]("Item 2".asUnsafe, 12.4, 12.4),
Vector[Any]("Item 3".asUnsafe, 74.2, 74.2),
Vector[Any]("Item 4".asUnsafe, 36.8, 36.8),
Vector[Any]("Item 5".asUnsafe, 24.2, 24.2),
Vector[Any]("Item 6".asUnsafe, 11.6, 11.6),
Vector[Any]("Header Items".asUnsafe, null, null),
Vector[Any]("Item 12".asUnsafe, 99.2, 99.2),
Vector[Any]("Item 13".asUnsafe, 18.4, 18.4),
Vector[Any]("Item 14".asUnsafe, 12.3, 12.3)
Vector[Any]("Item 1".asUnsafe, 99.4, 99.4, true),
Vector[Any]("Item 2".asUnsafe, 12.4, 12.4, true),
Vector[Any]("Item 3".asUnsafe, 74.2, 74.2, true),
Vector[Any]("Item 4".asUnsafe, 36.8, 36.8, false),
Vector[Any]("Item 5".asUnsafe, 24.2, 24.2, false),
Vector[Any]("Item 6".asUnsafe, 11.6, 11.6, false),
Vector[Any]("Header Items".asUnsafe, null, null, null),
Vector[Any]("Item 12".asUnsafe, 99.2, 99.2, false),
Vector[Any]("Item 13".asUnsafe, 18.4, 18.4, true),
Vector[Any]("Item 14".asUnsafe, 12.3, 12.3, true)
)

val parser = new ExcelParser(inputStream, options)
@@ -537,20 +539,21 @@ class ExcelParserTests extends AnyFlatSpec with Matchers {
StructField("Item", StringType, nullable = true),
StructField("2010_0", DoubleType, nullable = true),
StructField("2011_0", DoubleType, nullable = true),
StructField("IsGood", BooleanType, nullable = true),
StructField("ValidRow", BooleanType, nullable = false)
))

val expectedData = Seq(
Vector[Any]("Item 1".asUnsafe, 99.4, 99.4, true),
Vector[Any]("Item 2".asUnsafe, 12.4, 12.4, true),
Vector[Any]("Item 3".asUnsafe, 74.2, 74.2, true),
Vector[Any]("Item 4".asUnsafe, 36.8, 36.8, true),
Vector[Any]("Item 5".asUnsafe, 24.2, 24.2, true),
Vector[Any]("Item 6".asUnsafe, 11.6, 11.6, true),
Vector[Any]("Header Items".asUnsafe, null, null, false),
Vector[Any]("Item 12".asUnsafe, 99.2, 99.2, true),
Vector[Any]("Item 13".asUnsafe, 18.4, 18.4, true),
Vector[Any]("Item 14".asUnsafe, 12.3, 12.3, true)
Vector[Any]("Item 1".asUnsafe, 99.4, 99.4, true, true),
Vector[Any]("Item 2".asUnsafe, 12.4, 12.4, true, true),
Vector[Any]("Item 3".asUnsafe, 74.2, 74.2, true, true),
Vector[Any]("Item 4".asUnsafe, 36.8, 36.8, false, true),
Vector[Any]("Item 5".asUnsafe, 24.2, 24.2, false, true),
Vector[Any]("Item 6".asUnsafe, 11.6, 11.6, false, true),
Vector[Any]("Header Items".asUnsafe, null, null, null, false),
Vector[Any]("Item 12".asUnsafe, 99.2, 99.2, false, true),
Vector[Any]("Item 13".asUnsafe, 18.4, 18.4, true, true),
Vector[Any]("Item 14".asUnsafe, 12.3, 12.3, true, true)
)

val parser = new ExcelParser(inputStream, options)
@@ -660,4 +663,27 @@ class ExcelParserTests extends AnyFlatSpec with Matchers {
actualData should equal(expectedData)
}
}

"Reading a file containing no data" should "throw an exception" in {
withInputStream("/Parser/Empty.xlsx") { inputStream =>
val parser = new ExcelParser(inputStream, new ExcelParserOptions())

val error = intercept[ExcelParserException] {
parser.getDataIterator.toList
}

error.getMessage should be("No data found on first row")
}
}

it should "return an single empty record if only headers exist" in {
withInputStream("/Parser/NoData.xlsx") { inputStream =>
val expectedData = Seq(Vector(null, null, null))

val parser = new ExcelParser(inputStream, new ExcelParserOptions())
val actualData = parser.getDataIterator.toList

actualData should be(expectedData)
}
}
}
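For completeness, the parser-level behaviour pinned down by the last two tests can be driven directly, as sketched below; ExcelParser is package-private (private[excel]), so code like this only compiles inside the com.elastacloud.spark.excel package tree, and the import locations of ExcelParser and ExcelParserOptions are assumed here.

import java.io.FileInputStream

val stream = new FileInputStream("src/test/resources/Parser/NoData.xlsx")
try {
  val parser = new ExcelParser(stream, new ExcelParserOptions())
  // A headers-only workbook yields a single all-null record
  val rows = parser.getDataIterator.toList
  assert(rows == List(Vector(null, null, null)))
} finally {
  stream.close()
}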