Skip to content

Commit

Permalink
Correctly check for errors when parsing text calendars
Browse files Browse the repository at this point in the history
The SimpleDateFormat parse() method does not set the error index on the
ParsePosition if there is an error. Instead, it only sets the normal
index to the last successful parse position of the input string.

Daffodil currently relies on the error index, which means our error
checking is incorrect.

Fortunately, in most cases, this doesn't matter because our error
checking also requires that we parse the entire input string, so if the
index is zero it wouldn't match the string length and we would still see
it as an error. The one exception to this is if the input string is zero
length, in which case we do not correctly error and just create a
date/time with value zero.

This is fixed by correctly checking index for zero instead of inspecting
the error index.

This also does a little refactoring to replace the very old cache
implementation with the much newer Evaluatable for creating
SimpleDateFormat instances.

DAFFODIL-2821
  • Loading branch information
stevedlawrence committed Jun 15, 2023
1 parent db12258 commit d670968
Show file tree
Hide file tree
Showing 7 changed files with 191 additions and 109 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ import org.apache.daffodil.lib.schema.annotation.props.gen.CalendarPatternKind
import org.apache.daffodil.lib.schema.annotation.props.gen.Representation
import org.apache.daffodil.runtime1.processors.CalendarEv
import org.apache.daffodil.runtime1.processors.CalendarLanguageEv
import org.apache.daffodil.runtime1.processors.DateTimeFormatterEv
import org.apache.daffodil.runtime1.processors.parsers.ConvertBinaryCalendarSecMilliParser
import org.apache.daffodil.runtime1.processors.parsers.ConvertTextCalendarParser
import org.apache.daffodil.unparsers.runtime1.ConvertBinaryCalendarSecMilliUnparser
Expand Down Expand Up @@ -165,6 +166,12 @@ abstract class ConvertTextCalendarPrimBase(e: ElementBase, guard: Boolean)
} else {
p
}

schemaDefinitionWhen(
patternToCheck.length == 0,
"dfdl:calendarPatttern contains no pattern letters",
)

patternToCheck.toSeq.foreach(char =>
if (!validFormatCharacters.contains(char)) {
if (e.representation == Representation.Binary)
Expand Down Expand Up @@ -201,18 +208,34 @@ abstract class ConvertTextCalendarPrimBase(e: ElementBase, guard: Boolean)
p
}

private lazy val dateTimeFormatterEv = {
val ev = new DateTimeFormatterEv(
calendarEv,
localeEv,
pattern,
e.eci,
)
ev.compile(e.tunable)
ev
}

override lazy val parser = new ConvertTextCalendarParser(
e.elementRuntimeData,
xsdType,
prettyType,
pattern,
hasTZ,
localeEv,
calendarEv,
dateTimeFormatterEv,
)

override lazy val unparser =
new ConvertTextCalendarUnparser(e.elementRuntimeData, pattern, localeEv, calendarEv)
new ConvertTextCalendarUnparser(
e.elementRuntimeData,
pattern,
calendarEv,
dateTimeFormatterEv,
)
}

case class ConvertTextDatePrim(e: ElementBase) extends ConvertTextCalendarPrimBase(e, true) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,26 +21,23 @@ import org.apache.daffodil.lib.calendar.DFDLCalendar
import org.apache.daffodil.lib.exceptions.Assert
import org.apache.daffodil.lib.util.Misc
import org.apache.daffodil.runtime1.processors.CalendarEv
import org.apache.daffodil.runtime1.processors.CalendarLanguageEv
import org.apache.daffodil.runtime1.processors.DateTimeFormatterEv
import org.apache.daffodil.runtime1.processors.ElementRuntimeData
import org.apache.daffodil.runtime1.processors.parsers.ConvertTextCalendarProcessorBase
import org.apache.daffodil.runtime1.processors.unparsers._

import com.ibm.icu.util.Calendar
import com.ibm.icu.util.ULocale

case class ConvertTextCalendarUnparser(
erd: ElementRuntimeData,
override val context: ElementRuntimeData,
pattern: String,
localeEv: CalendarLanguageEv,
calendarEv: CalendarEv,
) extends ConvertTextCalendarProcessorBase(erd, pattern)
with TextPrimUnparser {
dateTimeFormatterEv: DateTimeFormatterEv,
) extends TextPrimUnparser {

/**
* Primitive unparsers must override runtimeDependencies
*/
override lazy val runtimeDependencies = Vector(localeEv, calendarEv)
override lazy val runtimeDependencies = Vector(calendarEv, dateTimeFormatterEv)

def unparse(state: UState): Unit = {

Expand All @@ -59,8 +56,6 @@ case class ConvertTextCalendarUnparser(
)
}

val locale: ULocale = localeEv.evaluate(state)

val calendarOrig: Calendar = calendarEv.evaluate(state)

// The clear() here actually shouldn't be necessary since we call clear()
Expand All @@ -72,11 +67,7 @@ case class ConvertTextCalendarUnparser(
// the case.
calendarOrig.clear()

// It's important here to use the calendarOrig that results from
// calendarEv.evaluate() since the tlDataFormatter is a cache the uses
// reference equality. For everything else we want to use a clone for
// reasons described below.
val df = tlDataFormatter(locale, calendarOrig)
val df = dateTimeFormatterEv.evaluate(state).get

// When we evaluate calendarEV, if it is a constant we will always get back
// the same Calendar object. Because of this it is important here to clone
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import org.apache.daffodil.lib.cookers.Converter
import org.apache.daffodil.lib.exceptions._
import org.apache.daffodil.runtime1.dsom._

import com.ibm.icu.text.SimpleDateFormat
import com.ibm.icu.util.Calendar
import com.ibm.icu.util.TimeZone
import com.ibm.icu.util.ULocale
Expand Down Expand Up @@ -96,3 +97,33 @@ class CalendarEv(
cal
}
}

class DateTimeFormatterEv(
calendarEv: CalendarEv,
localeEv: CalendarLanguageEv,
pattern: String,
eci: DPathElementCompileInfo,
) extends Evaluatable[ThreadLocal[SimpleDateFormat]](eci)
with InfosetCachedEvaluatable[ThreadLocal[SimpleDateFormat]] {

override lazy val runtimeDependencies = Seq(localeEv)

override def compute(state: ParseOrUnparseState) = {
val calendar = calendarEv.evaluate(state)
val locale = localeEv.evaluate(state)

// As per ICU4J documentation, "Date formats are not synchronized. If multiple threads
// access a format concurrently, it must be synchronized externally." Rather than
// synchronzing, we create a ThreadLocal so each thread gets their own copy of the
// SimpleDateFormat
val dateFormatTL = new ThreadLocal[SimpleDateFormat] with Serializable {
override def initialValue = {
val formatter = new SimpleDateFormat(pattern, locale)
formatter.setCalendar(calendar)
formatter.setLenient(true) // TODO: should this use calendarCheckPolicy?
formatter
}
}
dateFormatTL
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -26,78 +26,22 @@ import org.apache.daffodil.lib.exceptions.Assert
import org.apache.daffodil.lib.schema.annotation.props.gen.BinaryCalendarRep
import org.apache.daffodil.runtime1.dsom.TunableLimitExceededError
import org.apache.daffodil.runtime1.processors.CalendarEv
import org.apache.daffodil.runtime1.processors.CalendarLanguageEv
import org.apache.daffodil.runtime1.processors.DateTimeFormatterEv
import org.apache.daffodil.runtime1.processors.ElementRuntimeData
import org.apache.daffodil.runtime1.processors.Processor

import com.ibm.icu.text.SimpleDateFormat
import com.ibm.icu.util.Calendar
import com.ibm.icu.util.ULocale

abstract class ConvertTextCalendarProcessorBase(
override val context: ElementRuntimeData,
pattern: String,
) extends Processor {
// The dfdl:calendarLanguage property can be a runtime-valued expression.
// Hence, locale and calendar, derived from it, can also be runtime-valued.
//
// To parse we need a SimpleDateFormat, and these are
// (1) not thread safe - so need to be a thread local
// (2) must be initialized passing the runtime-valued locale and calendar.
//
// Now, fact is, while this could be runtime-valued, even if it is, it is very unlikely
// to be changing a lot. So it's a shame to endlessly allocate SimpleDateFormat objects
//
// So we have a 1 slot cache here. We memorize the locale and calendar and associated
// thread local, and return that thread local if the locale and calendar are the same.
//
// Conservatively, they need to be the exact same object, not just equivalent in the "==" sense.
//
// This won't be great if some format really is switching among locales and calendars
// so that more than one of them really ought to be cached, but that's sufficiently unlikely
// that this trivial scheme will do for now.
//
private final class Cache(var cache: (ULocale, Calendar, SimpleDateFormat))

private object tlCache extends ThreadLocal[Cache] {
override def initialValue = new Cache(null)
}

/**
* tlDataFormatter can be runtime valued, as it depends on both locale and calendar
* each of which can be runtime valued.
*/
final protected def tlDataFormatter(locale: ULocale, calendar: Calendar) = {
val tl = tlCache.get
val cache = tl.cache
if ((cache ne null) && (locale eq cache._1) && (calendar eq cache._2)) {
// cache hit. Same formatter will do
cache._3
} else {
// As per ICU4J documentation, "Date formats are not synchronized. If
// multiple threads access a format concurrently, it must be synchronized
// externally."
val formatter = new SimpleDateFormat(pattern, locale)
formatter.setCalendar(calendar)
formatter.setLenient(true)
tl.cache = (locale, calendar, formatter)
formatter
}
}
}

case class ConvertTextCalendarParser(
erd: ElementRuntimeData,
override val context: ElementRuntimeData,
xsdType: String,
prettyType: String,
pattern: String,
hasTZ: Boolean,
localeEv: CalendarLanguageEv,
calendarEv: CalendarEv,
) extends ConvertTextCalendarProcessorBase(erd, pattern)
with TextPrimParser {
dateTimeFormatterEv: DateTimeFormatterEv,
) extends TextPrimParser {

override lazy val runtimeDependencies = Vector(localeEv, calendarEv)
override lazy val runtimeDependencies = Vector(calendarEv, dateTimeFormatterEv)

def parse(start: PState): Unit = {
val node = start.simpleElement
Expand All @@ -107,8 +51,6 @@ case class ConvertTextCalendarParser(

val pos = new ParsePosition(0)

val locale: ULocale = localeEv.evaluate(start)

val calendarOrig: Calendar = calendarEv.evaluate(start)

// The clear here actually shouldn't be necessary since we call clear()
Expand All @@ -120,17 +62,13 @@ case class ConvertTextCalendarParser(
// the case.
calendarOrig.clear()

// It's important here to use the calendarOrig that results from
// calendarEv.evaluate() since the tlDataFormatter is a cache the uses
// reference equality. For everything else we want to use a clone for
// reasons described below.
val df = tlDataFormatter(locale, calendarOrig)
val df = dateTimeFormatterEv.evaluate(start).get

// When we evaluate calendarEV, if it is a constant we will always get back
// the same Calendar object. Because of this it is important here to clone
// this calendar and always use the clone below for two reasons:
//
// 1) The below code will modify modify the calendar object based on the
// 1) The below code will modify the calendar object based on the
// value of the parsed string. Any changes to the object will persist
// and could affect future parses. By cloning, we ensure we do not
// modify the original calendar object.
Expand All @@ -143,10 +81,10 @@ case class ConvertTextCalendarParser(
df.setCalendar(calendar)
df.parse(str, calendar, pos);

// Verify that what was parsed was what was passed exactly in byte count
// Use pos to verify all characters consumed & check for errors
if (pos.getIndex != str.length || pos.getErrorIndex >= 0) {
val errIndex = if (pos.getErrorIndex >= 0) pos.getErrorIndex else pos.getIndex
// Verify that we did not fail to parse and that we consumed the entire string. Note that
// getErrorIndex is never set and is always -1. Only a getIndex value of zero means there
// was an error
if (pos.getIndex == 0 || pos.getIndex != str.length) {
PE(start, "Unable to parse xs:%s from text: %s", xsdType, str)
return
}
Expand All @@ -163,7 +101,7 @@ case class ConvertTextCalendarParser(
) < start.tunable.minValidYear)
)
throw new TunableLimitExceededError(
erd.schemaFileLocation,
context.schemaFileLocation,
"Year value of %s is not within the limits of the tunables minValidYear (%s) and maxValidYear (%s)",
calendar.get(Calendar.YEAR),
start.tunable.minValidYear,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,52 @@ class TestICU {
r.foreach(parseFractionalSeconds)
}

/**
* This test shows that error index is never set when parsing with a SimpleDateFormat. If ICU
* ever changes this to use error index, this test should fail.
*
* On success, ParsePosition.getIndex is set to where the parse finished. On error,
* ParsePosition.getIndex is set to zero and ParsePoition.getErrorIndex is -1 (unset)
*/
@Test def test_simpleDateFormatParse() = {
def parseWithPattern(pattern: String, data: String): ParsePosition = {
val df = new SimpleDateFormat(pattern)
val cal = Calendar.getInstance()
val pos = pp
df.parse(data, cal, pos)
pos
}

{
// success, consumes all data
val pos = parseWithPattern("HHmm", "1122")
assertEquals(4, pos.getIndex)
assertEquals(-1, pos.getErrorIndex)
}

{
// success, parses only first 4 characters
val pos = parseWithPattern("HHmm", "1122extra")
assertEquals(4, pos.getIndex)
assertEquals(-1, pos.getErrorIndex)
}

{
// failure, only partially correct data
val pos = parseWithPattern("HHmm", "11cd")
assertEquals(2, pos.getIndex)
assertEquals(-1, pos.getErrorIndex)
}

{
// failure, empty string
val pos = parseWithPattern("HHmm", "")
assertEquals(0, pos.getIndex)
assertEquals(-1, pos.getErrorIndex)
}

}

// The three following tests show an old ICU bug where if the decimal pattern
// uses scientific notation and the number to format/unparse is positive
// infinity, negative infinity, or not a number, ICU would include the
Expand Down
Loading

0 comments on commit d670968

Please sign in to comment.