Skip to content

Commit

Permalink
[GSOC'24 Amharic chapter] Implement Date time parser for Ethiopian Ca…
Browse files Browse the repository at this point in the history
…lendar (#763)

* Implements Ethiopian-to-Gregorian calendar date conversion for Amharic extractions.
* Implements Geez-to-Arabic numeral conversion for dates written in Geez numerals.
* Addresses the concern raised in [Issue 761](#761).

---------

Co-authored-by: Ted Thibodeau Jr <[email protected]>
  • Loading branch information
Meti-Adane and TallTed authored Dec 11, 2024
1 parent 2fca8ce commit 7567877
Show file tree
Hide file tree
Showing 7 changed files with 522 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ object DateTimeParserConfig
val monthsMap = Map(
// For "ar" configuration, right-to-left rendering may seem like a bug, but it's not.
// Don't change this unless you know how it is done.
"am" -> Map("january"->1,"february"->2,"march"->3,"april"->4,"may"->5,"june"->6,"july"->7,"august"->8,"september"->9,"october"->10,"november"->11,"december"->12,
"ጃንዩወሪ" -> 1, "ፌብሩወሪ" -> 2,"ማርች" -> 3,"ኤፕሪል" -> 4,"ሜይ" -> 5,"ጁን" -> 6,"ጁላይ" -> 7,"ኦገስት" -> 8,"ሴፕተምበር" -> 9,"ኦክቶበር" -> 10,"ኖቬምበር" -> 11,"ዲሴምበር" -> 12),
"ar" -> Map("جانفي"->1,"فيفري"->2,"مارس"->3,"أفريل"->4,"ماي"->5,"جوان"->6,"جويلية"->7,"أوت"->8,"سبتمبر"->9,"أكتوبر"->10,"نوفمبر"->11,"ديسمبر"->12,
"يناير"->1,"فبراير"->2,"أبريل"->4,"مايو"->5,"يونيو"->6,"يوليو"->7,"يوليوز"->7,"أغسطس"->8,"غشت"->8,"شتنبر"->9,"نونبر"->11,"دجنبر"->12),
"bg" -> Map("януари"->1,"февруари"->2,"март"->3,"април"->4,"май"->5,"юни"->6,"юли"->7,"август"->8,"септември"->9,"октомври"->10,"ноември"->11,"декември"->12),
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
package org.dbpedia.extraction.config.dataparser

/**
 * Static lookup tables used when parsing dates written in the Ethiopian calendar.
 */
object EthiopianDateParserConfig {

  /**
   * Day of month (1 to 30) mapped to its spelling in Geez numerals.
   *
   * Geez numerals are additive: ፲ is ten, ፳ is twenty and ፴ is thirty, and a units sign
   * (፩ to ፱) is appended for the values in between, e.g. 21 is written ፳፩.
   * Every value must be non-empty, because the values are joined into a regex alternation
   * by the date parser and an empty alternative would match the empty string.
   */
  val geezNumberDateMap: Map[Int, String] = Map(
    1 -> "፩",
    2 -> "፪",
    3 -> "፫",
    4 -> "፬",
    5 -> "፭",
    6 -> "፮",
    7 -> "፯",
    8 -> "፰",
    9 -> "፱",
    10 -> "፲",
    11 -> "፲፩",
    12 -> "፲፪",
    13 -> "፲፫",
    14 -> "፲፬",
    15 -> "፲፭",
    16 -> "፲፮",
    17 -> "፲፯",
    18 -> "፲፰",
    19 -> "፲፱",
    20 -> "፳",
    21 -> "፳፩",
    22 -> "፳፪",
    23 -> "፳፫",
    24 -> "፳፬",
    25 -> "፳፭",
    26 -> "፳፮",
    27 -> "፳፯",
    28 -> "፳፰",
    29 -> "፳፱",
    30 -> "፴"
  )

  /**
   * Amharic month name mapped to its 1-based position in the Ethiopian year.
   * The calendar has thirteen months; the thirteenth (ጳጉሜ) is the short closing month.
   */
  val monthsMap: Map[String, Int] = Map(
    "መስከረም" -> 1,
    "ጥቅምት" -> 2,
    "ኅዳር" -> 3,
    "ታኅሳስ" -> 4,
    "ጥር" -> 5,
    "የካቲት" -> 6,
    "መጋቢት" -> 7,
    "ሚያዝያ" -> 8,
    "ግንቦት" -> 9,
    "ሰኔ" -> 10,
    "ሐምሌ" -> 11,
    "ነሐሴ" -> 12,
    "ጳጉሜ" -> 13
  )

}
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,14 @@ class DateTimeParser ( context :
@transient private val logger = Logger.getLogger(getClass.getName)

// language-specific configurations

// Wiki language code used to pick the parser configuration. Any language that is not listed in
// DateTimeParserConfig.supportedLanguages is handled with the "en" configuration.
private val language = if(DateTimeParserConfig.supportedLanguages.contains(context.language.wikiCode)) context.language.wikiCode else "en"

// Per-language lookup tables. Months, era strings and the cardinality regex fall back to the
// "en" entry when the language has none; templates fall back to an empty map.
private val months = DateTimeParserConfig.monthsMap.getOrElse(language, DateTimeParserConfig.monthsMap("en"))
private val eraStr = DateTimeParserConfig.eraStrMap.getOrElse(language, DateTimeParserConfig.eraStrMap("en"))
private val cardinalityRegex = DateTimeParserConfig.cardinalityRegexMap.getOrElse(language, DateTimeParserConfig.cardinalityRegexMap("en"))
private val templates = DateTimeParserConfig.templateDateMap.getOrElse(language, Map())

// Parser for dates written in the Ethiopian calendar, built with this parser's datatype and
// strictness setting.
private val ethiopianDateParser = new EthiopianDateParser(datatype:Datatype, strict:Boolean);
// parse logic configurations

override val splitPropertyNodeRegex: String = if (DataParserConfig.splitPropertyNodeRegexDateTime.contains(language))
Expand Down Expand Up @@ -189,7 +189,17 @@ class DateTimeParser ( context :
}

private def findDate(input: String) : Option[Date] =

{

// scan for Ethiopian (geez) calendar dates
if(language == "am"){
for(date <- ethiopianDateParser.findGeezDate(input))
{
return Some(date)
}
}

for(date <- catchDate(input))
{
return Some(date)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,217 @@
package org.dbpedia.extraction.dataparser
import java.util.logging.{Logger, Level}
import scala.util.matching.Regex
import org.dbpedia.extraction.config.dataparser.{
EthiopianDateParserConfig,
DateTimeParserConfig
}
import org.dbpedia.extraction.util.{Language, Date}
import org.dbpedia.extraction.util.{GeezNumberUtils}
import org.dbpedia.extraction.ontology.datatypes.Datatype

/**
 * Finds dates written in the Ethiopian calendar inside a string and converts them to
 * Gregorian [[Date]] values.
 *
 * Days and years may be written with Arabic digits or with Geez numerals; months may be
 * written as a number (1 to 13) or as an Amharic month name.
 *
 * @param datatype datatype attached to every [[Date]] produced; must not be null
 * @param strict   when true only whitespace may surround the date, otherwise arbitrary text
 *                 before and after the date is tolerated
 */
class EthiopianDateParser(datatype: Datatype, val strict: Boolean = false) {
  require(datatype != null, "datatype != null")
  @transient private val logger = Logger.getLogger(getClass.getName)

  val geezNumberParser = new GeezNumberUtils()
  private val monthsMap = EthiopianDateParserConfig.monthsMap
  private val monthsName = monthsMap.keys.mkString("|")

  // Alternation for a day of month written either in Geez numerals or in Arabic digits.
  // Empty entries are dropped: an empty alternative would let the day group match the empty
  // string, which later fails numeric conversion.
  private val dayPattern: String =
    (EthiopianDateParserConfig.geezNumberDateMap.values.filter(_.nonEmpty).toSeq ++
      Seq("0?[1-9]", "[12][0-9]", "3[01]")).mkString("|")

  // Inputs carrying this marker are treated as Gregorian dates and skipped by findGeezDate.
  // The dots are unescaped, so any single character is accepted between the three letters.
  private val gregorianDateIndicator = s""".*(እ.ኤ.አ).*""".r
  private val prefix = if (strict) """\s*""" else """.*?"""
  private val postfix = if (strict) """\s*""" else ".*"

  // NOTE: every pattern below must expose exactly three capture groups, because
  // catchGeezDate destructures each match into three variables; a fourth group makes the
  // extractor silently never match.

  // dd-mm-yyyy with a numeric month up to 13, e.g. 21 13 2013, 21-13-2013, 21/13/2013
  private val dateRegex1: Regex =
    s"""$prefix\\b(0?[1-9]|[12][0-9]|3[01])\\b[-/\\s]\\b(0?[1-9]|1[0-2]|13)\\b[-/\\s](\\d{4}|[\\u1369-\\u137C]+)$postfix""".r

  // month name, Arabic-digit day, year: ጥቅምት-21-2013, ጥቅምት/21/2013, ጥቅምት 21 2013
  private val dateRegex2: Regex =
    s"""$prefix($monthsName)[\\s/-]\\b(0?[1-9]|[12][0-9]|3[01])\\b[\\s/-](\\d{4}|[\\u1369-\\u137C]+)$postfix""".r

  // Arabic-digit day, month name, year: 21-ጥቅምት-2013, 21/ጥቅምት/2013, 21 ጥቅምት 2013
  private val dateRegex3: Regex =
    s"""$prefix\\b(0?[1-9]|[12][0-9]|3[01])\\b[\\s/-]($monthsName)[\\s/-](\\d{4}|[\\u1369-\\u137C]+)$postfix""".r

  // month name, day in Geez numerals or Arabic digits, year: ጥቅምት ፳፩ ፳፻፲፫, ጥቅምት/፳፩/፳፻፲፫
  // No word-boundary assertion is used here: inside an alternation it would only bind to the
  // first alternative, whose identity depends on the iteration order of the month map.
  private val dateRegex4: Regex =
    s"""$prefix($monthsName)[\\s/-]($dayPattern)[\\s/-](\\d{4}|[\\u1369-\\u137C]+)$postfix""".r

  // day in Geez numerals or Arabic digits, month name, year: ፳፩ ጥቅምት ፳፻፲፫, 21/ጥቅምት/2013
  // No word-boundary assertion either; Geez numerals are not word characters for \b.
  private val dateRegex5: Regex =
    s"""$prefix($dayPattern)[\\s/-]($monthsName)[\\s/-](\\d{4}|[\\u1369-\\u137C]+)$postfix""".r

  /**
   * Extracts the raw (year, month, day) tokens of the first supported date layout that
   * matches the whole input. Patterns are tried in declaration order.
   *
   * @param dateString text to inspect
   * @return the tokens exactly as written (digits, Geez numerals or month name), or None
   */
  def catchGeezDate(dateString: String): Option[(String, String, String)] =
    dateString match {
      case dateRegex1(day, month, year) => Some((year, month, day))
      case dateRegex2(month, day, year) => Some((year, month, day))
      case dateRegex3(day, month, year) => Some((year, month, day))
      case dateRegex4(month, day, year) => Some((year, month, day))
      case dateRegex5(day, month, year) => Some((year, month, day))
      case _ => None
    }

  /** Returns true when the Ethiopian year leaves remainder 3 on division by 4. */
  def isLeapYear(year: Int): Boolean = year % 4 == 3

  /**
   * Checks that the triple is a possible Ethiopian calendar date: positive year, month 1 to
   * 13, day 1 to 30, and at most 5 days (6 in a leap year) in month 13.
   * The reason for a rejection is logged at FINE level.
   */
  def isValidEthiopianCalendarDate(year: Int, month: Int, day: Int): Boolean = {
    val problem: Option[String] =
      if (year <= 0)
        Some("Year must be greater than 0.")
      else if (month < 1 || month > 13)
        Some(s"Month must be between 1 and 13. Provided month: $month.")
      else if (day < 1 || day > 30)
        Some(s"Day must be between 1 and 30. Provided day: $day.")
      else if (month == 13 && day > 6)
        Some(s"Day in Pagume cannot exceed 6. Provided day: $day.")
      else if (month == 13 && !isLeapYear(year) && day > 5)
        Some(s"Pagume only has 5 days in non-leap years. Provided day: $day.")
      else
        None

    problem.foreach(message => logger.log(Level.FINE, message))
    problem.isEmpty
  }

  /** Converts an Ethiopian calendar date to its Julian Day Number. */
  private def ethiopianDateToJDN(year: Int, month: Int, day: Int): Double = {
    // Epoch offset of the Ethiopian calendar used by this Julian Day Number formula.
    val epoch: Long = 1723856
    (epoch + 365) + 365 * (year - 1) + year / 4 + 30 * month + day - 31
  }

  /**
   * Converts an Ethiopian calendar date to the corresponding Gregorian date by going
   * through the Julian Day Number.
   *
   * The arguments are not validated here; callers are expected to check them with
   * [[isValidEthiopianCalendarDate]] first.
   *
   * @param year     Ethiopian year
   * @param month    Ethiopian month (1 to 13)
   * @param day      Ethiopian day of month
   * @param datatype datatype attached to the resulting [[Date]]
   * @return the Gregorian date
   */
  def geezToGregorianDateConverter(
      year: Int,
      month: Int,
      day: Int,
      datatype: Datatype
  ): Option[Date] = {
    val jdn: Double = ethiopianDateToJDN(year, month, day)
    val q: Double = jdn + 0.5
    val z: Long = q.toLong
    val w: Long = ((z - 1867216.25) / 36524.25).toLong
    val x: Long = w / 4
    val a: Long = z + 1 + w - x
    val b: Long = a + 1524
    val c: Long = ((b - 122.1) / 365.25).toLong
    val d: Long = (365.25 * c).toLong
    val e: Long = ((b - d) / 30.6001).toLong
    val f: Long = (30.6001 * e).toLong

    val gregorianDay: Int = (b - d - f + (q - z)).toInt
    val gregorianMonth: Long = if (e - 1 <= 12) e - 1 else e - 13
    // The year offset depends on the computed Gregorian month (January/February belong to
    // the following year of the shifted calendar), not on the Ethiopian month passed in.
    val gregorianYear: Long = if (gregorianMonth <= 2) c - 4715 else c - 4716

    Some(
      new Date(
        Some(gregorianYear.toInt),
        Some(gregorianMonth.toInt),
        Some(gregorianDay),
        datatype
      )
    )
  }

  /** Returns true when every character of the string is a digit (also true for ""). */
  def isArabicNumeral(str: String): Boolean = str.forall(_.isDigit)

  // Converts one numeric token (Arabic digits or Geez numerals) to an Int.
  // Empty or unconvertible tokens yield 0, which the calendar validation rejects.
  private def toNumber(token: String): Int =
    if (token.isEmpty) 0
    else if (isArabicNumeral(token)) token.toInt
    else geezNumberParser.convertGeezToArabicNumeral(token).getOrElse(0)

  /**
   * Converts the raw (year, month, day) tokens into numbers.
   *
   * The month may be an Amharic month name, Arabic digits or Geez numerals. A token that
   * cannot be converted becomes 0.
   *
   * @param dateString tokens as returned by [[catchGeezDate]]
   * @return the numeric (year, month, day), or None when no tokens were supplied
   */
  def formatDate(
      dateString: Option[(String, String, String)]
  ): Option[(Int, Int, Int)] =
    dateString.map { case (year, month, day) =>
      val monthNum = monthsMap.getOrElse(month, toNumber(month))
      (toNumber(year), monthNum, toNumber(day))
    }

  /**
   * Looks for an Ethiopian calendar date in the input and returns it as a Gregorian date.
   *
   * @param input text to scan
   * @return the converted date; None when the input carries the Gregorian marker, contains
   *         no recognisable date, or the date is not valid in the Ethiopian calendar
   */
  def findGeezDate(input: String): Option[Date] =
    if (gregorianDateIndicator.findFirstIn(input).isDefined) None
    else
      formatDate(catchGeezDate(input))
        .filter { case (y, m, d) => isValidEthiopianCalendarDate(y, m, d) }
        .flatMap { case (y, m, d) => geezToGregorianDateConverter(y, m, d, datatype) }
}
Loading

0 comments on commit 7567877

Please sign in to comment.