Commit

Add usePlainNumberFormat config to allow toggling between PlainNumberFormat and ExcelGeneralNumberFormat.
waiyan1612 authored and nightscape committed Dec 9, 2020
1 parent ab5c0e8 commit 6e4f9b9
Showing 7 changed files with 55 additions and 17 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,7 @@
Next
====
- Feature: Add PlainNumberFormat that does not round or use scientific notation for long numbers.
  Can be enabled by setting `usePlainNumberFormat=true` when reading the Excel file.

0.13.2
======
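A minimal usage sketch of the option described in the CHANGELOG entry above, following the option-based read API shown in the README diff below. The file path is a placeholder, and the `format` identifier is assumed from the library's package name rather than shown in this excerpt:

```scala
import org.apache.spark.sql.{DataFrame, SparkSession}

object PlainNumberExample extends App {
  val spark = SparkSession.builder().master("local[*]").appName("plain-number-example").getOrCreate()

  // Read with the new option turned on; the file path is a placeholder.
  val df: DataFrame = spark.read
    .format("com.crealytics.spark.excel")
    .option("header", "true")                 // required by spark-excel
    .option("usePlainNumberFormat", "true")   // new in this commit: no rounding, no scientific notation
    .load("/path/to/plain_number.xlsx")

  df.show(truncate = false)
}
```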
2 changes: 2 additions & 0 deletions README.md
@@ -69,6 +69,7 @@ val df = spark.read
.option("dataAddress", "'My Sheet'!B3:C35") // Optional, default: "A1"
.option("header", "true") // Required
.option("treatEmptyValuesAsNulls", "false") // Optional, default: true
.option("usePlainNumberFormat", "false") // Optional, default: false, If true, format the cells without rounding and scientific notations
.option("inferSchema", "false") // Optional, default: false
.option("addColorColumns", "true") // Optional, default: false
.option("timestampFormat", "MM-dd-yyyy HH:mm:ss") // Optional, default: yyyy-mm-dd hh:mm:ss[.fffffffff]
@@ -91,6 +92,7 @@ val df = spark.read.excel(
header = true, // Required
dataAddress = "'My Sheet'!B3:C35", // Optional, default: "A1"
treatEmptyValuesAsNulls = false, // Optional, default: true
usePlainNumberFormat = false, // Optional, default: false. If true, format the cells without rounding or scientific notation
inferSchema = false, // Optional, default: false
addColorColumns = true, // Optional, default: false
timestampFormat = "MM-dd-yyyy HH:mm:ss", // Optional, default: yyyy-mm-dd hh:mm:ss[.fffffffff]
13 changes: 8 additions & 5 deletions src/main/scala/com/crealytics/spark/excel/DataColumn.scala
@@ -20,6 +20,7 @@ class HeaderDataColumn(
val field: StructField,
val columnIndex: Int,
treatEmptyValuesAsNulls: Boolean,
usePlainNumberFormat: Boolean,
parseTimestamp: String => Timestamp
) extends DataColumn {
def name: String = field.name
@@ -30,11 +31,13 @@
}

lazy val dataFormatter = new DataFormatter()
// Overwrite ExcelGeneralNumberFormat with custom PlainNumberFormat.
// See https://github.com/crealytics/spark-excel/issues/321
lazy val plainNumberFormat = PlainNumberFormat
dataFormatter.addFormat("General", plainNumberFormat)
dataFormatter.addFormat("@", plainNumberFormat)
if (usePlainNumberFormat) {
// Overwrite ExcelGeneralNumberFormat with custom PlainNumberFormat.
// See https://github.com/crealytics/spark-excel/issues/321
lazy val plainNumberFormat = PlainNumberFormat
dataFormatter.addFormat("General", plainNumberFormat)
dataFormatter.addFormat("@", plainNumberFormat)
}

lazy val stringValue =
cell.getCellType match {
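The PlainNumberFormat object itself is not shown in this diff. For context, here is a hypothetical sketch of what such a `java.text.Format` could look like, registered with POI's DataFormatter exactly as the new if-block above does; the name `PlainNumberFormatSketch` and the implementation details are assumptions, not the committed code:

```scala
import java.text.{FieldPosition, Format, ParsePosition}
import org.apache.poi.ss.usermodel.DataFormatter

// Hypothetical sketch: render numeric values via BigDecimal.toPlainString so
// the output is never rounded or switched to scientific notation.
// The real PlainNumberFormat in this commit may differ.
object PlainNumberFormatSketch extends Format {
  override def format(value: AnyRef, toAppendTo: StringBuffer, pos: FieldPosition): StringBuffer =
    toAppendTo.append(new java.math.BigDecimal(value.toString).toPlainString)

  override def parseObject(source: String, pos: ParsePosition): AnyRef = {
    pos.setIndex(source.length)
    new java.math.BigDecimal(source)
  }
}

object RegisterSketch extends App {
  val dataFormatter = new DataFormatter()
  // Same registration calls as the diff: override the "General" and text ("@") formats.
  dataFormatter.addFormat("General", PlainNumberFormatSketch)
  dataFormatter.addFormat("@", PlainNumberFormatSketch)
}
```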
@@ -24,6 +24,7 @@ class DefaultSource extends RelationProvider with SchemaRelationProvider with Cr
ExcelRelation(
header = checkParameter(parameters, "header").toBoolean,
treatEmptyValuesAsNulls = parameters.get("treatEmptyValuesAsNulls").fold(false)(_.toBoolean),
usePlainNumberFormat = parameters.get("usePlainNumberFormat").fold(false)(_.toBoolean),
userSchema = Option(schema),
inferSheetSchema = parameters.get("inferSchema").fold(false)(_.toBoolean),
addColorColumns = parameters.get("addColorColumns").fold(false)(_.toBoolean),
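The option parsing above uses `Option.fold` to fall back to a default when the key is absent. A small standalone illustration of that pattern, with made-up parameter maps:

```scala
object OptionFoldIllustration extends App {
  // fold(default)(f): returns the default when the key is missing,
  // otherwise applies f to the stored string value.
  val withFlag    = Map("usePlainNumberFormat" -> "true")
  val withoutFlag = Map.empty[String, String]

  println(withFlag.get("usePlainNumberFormat").fold(false)(_.toBoolean))    // true
  println(withoutFlag.get("usePlainNumberFormat").fold(false)(_.toBoolean)) // false
}
```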
@@ -15,6 +15,7 @@ case class ExcelRelation(
dataLocator: DataLocator,
header: Boolean,
treatEmptyValuesAsNulls: Boolean,
usePlainNumberFormat: Boolean,
inferSheetSchema: Boolean,
addColorColumns: Boolean = true,
userSchema: Option[StructType] = None,
@@ -153,7 +154,7 @@
}

firstRow.zip(fields).map { case (cell, field) =>
new HeaderDataColumn(field, cell.getColumnIndex, treatEmptyValuesAsNulls, timestampParser)
new HeaderDataColumn(field, cell.getColumnIndex, treatEmptyValuesAsNulls, usePlainNumberFormat, timestampParser)
}
}

2 changes: 2 additions & 0 deletions src/main/scala/com/crealytics/spark/excel/package.scala
@@ -59,6 +59,7 @@ package object excel {
header: Boolean = true,
treatEmptyValuesAsNulls: Boolean = false,
inferSchema: Boolean = false,
usePlainNumberFormat: Boolean = false,
addColorColumns: Boolean = false,
dataAddress: String = null,
timestampFormat: String = null,
Expand All @@ -69,6 +70,7 @@ package object excel {
Map(
"header" -> header,
"treatEmptyValuesAsNulls" -> treatEmptyValuesAsNulls,
"usePlainNumberFormat" -> usePlainNumberFormat,
"inferSchema" -> inferSchema,
"addColorColumns" -> addColorColumns,
"dataAddress" -> dataAddress,
@@ -3,7 +3,7 @@ package com.crealytics.spark.excel
import java.util

import com.holdenkarau.spark.testing.DataFrameSuiteBase
import org.apache.spark.sql._
import org.apache.spark.sql.{Row, _}
import org.apache.spark.sql.types._
import org.scalatest.funspec.AnyFunSpec
import org.scalatest.matchers.should.Matchers
@@ -19,13 +19,20 @@ object PlainNumberReadSuite {
)
)

val expectedDataInferSchema: util.List[Row] = List(
val expectedPlainDataInferSchema: util.List[Row] = List(
Row(12345678901d, "12345678901-123", "12/1/20"),
Row(123456789012d, "123456789012", "0.01"),
Row(-0.12345678901, "0.05", "0h 14m"),
Row(null, null, null)
).asJava

val expectedExcelDataInferSchema: util.List[Row] = List(
Row(1.2345678901e10, "12345678901-123", "12/1/20"),
Row(1.23456789012e11, "1.23457E+11", "0.01"), // values are displayed in scientific notation and rounded up
Row(-0.12345678901, "0.05", "0h 14m"),
Row(null, null, null)
).asJava

val expectedNonInferredSchema = StructType(
List(
StructField("only_numbers", StringType, true),
@@ -34,33 +41,53 @@
)
)

val expectedDataNonInferSchema: util.List[Row] = List(
val expectedPlainDataNonInferSchema: util.List[Row] = List(
Row("12345678901", "12345678901-123", "12/1/20"),
Row("123456789012", "123456789012", "0.01"),
Row("-0.12345678901", "0.05", "0h 14m"),
Row(null, null, null),
Row("-1/0", "abc.def", null)
).asJava

val expectedExcelDataNonInferSchema: util.List[Row] = List(
Row("12345678901", "12345678901-123", "12/1/20"),
Row("1.23457E+11", "1.23457E+11", "0.01"), // values are displayed in scientific notation and rounded up
Row("-0.123456789", "0.05", "0h 14m"), // values are rounded up
Row(null, null, null),
Row("-1/0", "abc.def", null)
).asJava
}

class PlainNumberReadSuite extends AnyFunSpec with DataFrameSuiteBase with Matchers {
import PlainNumberReadSuite._

def readFromResources(path: String, inferSchema: Boolean): DataFrame = {
def readFromResources(path: String, usePlainNumberFormat: Boolean, inferSchema: Boolean): DataFrame = {
val url = getClass.getResource(path)
spark.read.excel(inferSchema = inferSchema).load(url.getPath)
spark.read.excel(usePlainNumberFormat = usePlainNumberFormat, inferSchema = inferSchema).load(url.getPath)
}

describe("spark-excel") {
it("should read long numbers in plain number format when inferSchema is true") {
val df = readFromResources("/spreadsheets/plain_number.xlsx", true)
val expected = spark.createDataFrame(expectedDataInferSchema, expectedInferredSchema)
it("should read numbers in plain number format when usePlainNumberFormat=true and inferSchema=true") {
val df = readFromResources("/spreadsheets/plain_number.xlsx", true, true)
val expected = spark.createDataFrame(expectedPlainDataInferSchema, expectedInferredSchema)
assertDataFrameEquals(expected, df)
}

it("should read numbers in plain number format when usePlainNumberFormat=true and inferSchema=false") {
val df = readFromResources("/spreadsheets/plain_number.xlsx", true, false)
val expected = spark.createDataFrame(expectedPlainDataNonInferSchema, expectedNonInferredSchema)
assertDataFrameEquals(expected, df)
}

it("should read numbers in excel general number format when usePlainNumberFormat=false and inferSchema=true") {
val df = readFromResources("/spreadsheets/plain_number.xlsx", false, true)
val expected = spark.createDataFrame(expectedExcelDataInferSchema, expectedInferredSchema)
assertDataFrameEquals(expected, df)
}

it("should read long numbers in plain number format when inferSchema is false") {
val df = readFromResources("/spreadsheets/plain_number.xlsx", false)
val expected = spark.createDataFrame(expectedDataNonInferSchema, expectedNonInferredSchema)
it("should read numbers in excel general number format when usePlainNumberFormat=false and inferSchema=false") {
val df = readFromResources("/spreadsheets/plain_number.xlsx", false, false)
val expected = spark.createDataFrame(expectedExcelDataNonInferSchema, expectedNonInferredSchema)
assertDataFrameEquals(expected, df)
}

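The expected values in these tests boil down to the difference between a plain rendering and Excel's General format, which limits significant digits and may switch to scientific notation. A rough standalone illustration of the two displayed strings (not the POI code path the library actually uses):

```scala
object GeneralVsPlain extends App {
  val value = 123456789012d

  // Plain rendering keeps every digit, matching the expectedPlainData* rows above.
  println(new java.math.BigDecimal(value).toPlainString) // 123456789012

  // Excel's General format caps significant digits, producing the rounded,
  // scientific-notation string seen in the expectedExcelData* rows above.
  println("%.5E".format(value)) // 1.23457E+11
}
```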
