Skip to content

Commit 7567877

Browse files
Meti-Adane and TallTed authored
[GSOC'24 Amharic chapter] Implement Date time parser for Ethiopian Calendar (#763)
* Implements Ethiopian-to-Gregorian calendar date conversion for Amharic extractions. * Implements Geez-to-Arabic numeral conversion for dates written in Geez numerals. * Addresses the concern raised in [Issue 761](#761). --------- Co-authored-by: Ted Thibodeau Jr <[email protected]>
1 parent 2fca8ce commit 7567877

File tree

7 files changed

+522
-2
lines changed

7 files changed

+522
-2
lines changed

core/src/main/scala/org/dbpedia/extraction/config/dataparser/DateTimeParserConfig.scala

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@ object DateTimeParserConfig
66
val monthsMap = Map(
77
// For "ar" configuration, right-to-left rendering may seem like a bug, but it's not.
88
// Don't change this unless you know how it is done.
9+
"am" -> Map("january"->1,"february"->2,"march"->3,"april"->4,"may"->5,"june"->6,"july"->7,"august"->8,"september"->9,"october"->10,"november"->11,"december"->12,
10+
"ጃንዩወሪ" -> 1, "ፌብሩወሪ" -> 2,"ማርች" -> 3,"ኤፕሪል" -> 4,"ሜይ" -> 5,"ጁን" -> 6,"ጁላይ" -> 7,"ኦገስት" -> 8,"ሴፕተምበር" -> 9,"ኦክቶበር" -> 10,"ኖቬምበር" -> 11,"ዲሴምበር" -> 12),
911
"ar" -> Map("جانفي"->1,"فيفري"->2,"مارس"->3,"أفريل"->4,"ماي"->5,"جوان"->6,"جويلية"->7,"أوت"->8,"سبتمبر"->9,"أكتوبر"->10,"نوفمبر"->11,"ديسمبر"->12,
1012
"يناير"->1,"فبراير"->2,"أبريل"->4,"مايو"->5,"يونيو"->6,"يوليو"->7,"يوليوز"->7,"أغسطس"->8,"غشت"->8,"شتنبر"->9,"نونبر"->11,"دجنبر"->12),
1113
"bg" -> Map("януари"->1,"февруари"->2,"март"->3,"април"->4,"май"->5,"юни"->6,"юли"->7,"август"->8,"септември"->9,"октомври"->10,"ноември"->11,"декември"->12),
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
package org.dbpedia.extraction.config.dataparser
2+
3+
/**
 * Static lookup tables used when parsing dates written in the Ethiopian calendar.
 */
object EthiopianDateParserConfig {

  /**
   * Day of month (1 to 30) mapped to its spelling in Geez (Ethiopic) numerals.
   *
   * Geez numerals are additive: a tens sign (፲ = 10, ፳ = 20, ፴ = 30) optionally followed by a
   * units sign (፩ .. ፱ = 1 .. 9). Every value must be non-empty, because the values are joined
   * with "|" to build a regular-expression alternation; an empty value would add an alternative
   * that matches the empty string.
   */
  val geezNumberDateMap: Map[Int, String] = Map(
    1 -> "፩",
    2 -> "፪",
    3 -> "፫",
    4 -> "፬",
    5 -> "፭",
    6 -> "፮",
    7 -> "፯",
    8 -> "፰",
    9 -> "፱",
    10 -> "፲",
    11 -> "፲፩",
    12 -> "፲፪",
    13 -> "፲፫",
    14 -> "፲፬",
    15 -> "፲፭",
    16 -> "፲፮",
    17 -> "፲፯",
    18 -> "፲፰",
    19 -> "፲፱",
    20 -> "፳",
    21 -> "፳፩",
    22 -> "፳፪",
    23 -> "፳፫",
    24 -> "፳፬",
    25 -> "፳፭",
    26 -> "፳፮",
    27 -> "፳፯",
    28 -> "፳፰",
    29 -> "፳፱",
    30 -> "፴"
  )

  /**
   * Amharic month name mapped to its month number in the Ethiopian calendar
   * (1 = Meskerem .. 12 = Nehase, 13 = Pagume, the short 13th month).
   */
  val monthsMap: Map[String, Int] = Map(
    "መስከረም" -> 1,
    "ጥቅምት" -> 2,
    "ኅዳር" -> 3,
    "ታኅሳስ" -> 4,
    "ጥር" -> 5,
    "የካቲት" -> 6,
    "መጋቢት" -> 7,
    "ሚያዝያ" -> 8,
    "ግንቦት" -> 9,
    "ሰኔ" -> 10,
    "ሐምሌ" -> 11,
    "ነሐሴ" -> 12,
    "ጳጉሜ" -> 13
  )

}

core/src/main/scala/org/dbpedia/extraction/dataparser/DateTimeParser.scala

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,14 +24,14 @@ class DateTimeParser ( context :
2424
@transient private val logger = Logger.getLogger(getClass.getName)
2525

2626
// language-specific configurations
27-
2827
private val language = if(DateTimeParserConfig.supportedLanguages.contains(context.language.wikiCode)) context.language.wikiCode else "en"
2928

3029
private val months = DateTimeParserConfig.monthsMap.getOrElse(language, DateTimeParserConfig.monthsMap("en"))
3130
private val eraStr = DateTimeParserConfig.eraStrMap.getOrElse(language, DateTimeParserConfig.eraStrMap("en"))
3231
private val cardinalityRegex = DateTimeParserConfig.cardinalityRegexMap.getOrElse(language, DateTimeParserConfig.cardinalityRegexMap("en"))
3332
private val templates = DateTimeParserConfig.templateDateMap.getOrElse(language, Map())
3433

34+
private val ethiopianDateParser = new EthiopianDateParser(datatype:Datatype, strict:Boolean);
3535
// parse logic configurations
3636

3737
override val splitPropertyNodeRegex: String = if (DataParserConfig.splitPropertyNodeRegexDateTime.contains(language))
@@ -189,7 +189,17 @@ class DateTimeParser ( context :
189189
}
190190

191191
private def findDate(input: String) : Option[Date] =
192+
192193
{
194+
195+
// scan for Ethiopian (geez) calendar dates
196+
if(language == "am"){
197+
for(date <- ethiopianDateParser.findGeezDate(input))
198+
{
199+
return Some(date)
200+
}
201+
}
202+
193203
for(date <- catchDate(input))
194204
{
195205
return Some(date)
Lines changed: 217 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,217 @@
1+
package org.dbpedia.extraction.dataparser
2+
import java.util.logging.{Logger, Level}
3+
import scala.util.matching.Regex
4+
import org.dbpedia.extraction.config.dataparser.{
5+
EthiopianDateParserConfig,
6+
DateTimeParserConfig
7+
}
8+
import org.dbpedia.extraction.util.{Language, Date}
9+
import org.dbpedia.extraction.util.{GeezNumberUtils}
10+
import org.dbpedia.extraction.ontology.datatypes.Datatype
11+
12+
/**
 * Finds dates written in the Ethiopian (Geez) calendar inside a string and converts them to
 * Gregorian [[Date]] values.
 *
 * Supported spellings use Arabic digits, Geez numerals and/or Amharic month names, separated by
 * whitespace, '/' or '-'.
 *
 * @param datatype datatype attached to every produced [[Date]]; must not be null
 * @param strict   when true a date must make up the whole input (surrounding whitespace allowed);
 *                 when false the date may be embedded anywhere in a single line of text
 */
class EthiopianDateParser(datatype: Datatype, val strict: Boolean = false) {
  require(datatype != null, "datatype != null")
  @transient private val logger = Logger.getLogger(getClass.getName)

  val geezNumberParser = new GeezNumberUtils()
  private val monthsMap = EthiopianDateParserConfig.monthsMap
  private val monthsName = monthsMap.keys.mkString("|")

  // Empty spellings are dropped: an empty alternative would let the day group match "".
  private val geezNumberDate =
    EthiopianDateParserConfig.geezNumberDateMap.values.filter(_.nonEmpty).mkString("|")

  // Marker whose presence means the date is already Gregorian and must not be converted.
  private val gregorianDateIndicator = """.*(እ.ኤ.አ).*""".r
  private val prefix = if (strict) """\s*""" else """.*?"""
  private val postfix = if (strict) """\s*""" else ".*"

  // NOTE: every pattern below must expose exactly three capturing groups, because
  // catchGeezDate destructures each match into three binders.

  // dd-mm-yyyy with Arabic digits, including a 13th month: 21 13 2013, 21-13-2013, 21/13/2013
  private val dateRegex1: Regex =
    s"""$prefix\\b(0?[1-9]|[12][0-9]|3[01])\\b[-/\\s]\\b(0?[1-9]|1[0-2]|13)\\b[-/\\s](\\d{4}|[\\u1369-\\u137C]+)$postfix""".r

  // month-dd-yyyy with an Amharic month name and an Arabic day:
  // ጥቅምት-21-2013 or ጥቅምት/21/2013 or ጥቅምት 21 2013
  private val dateRegex2: Regex =
    s"""$prefix($monthsName)[\\s/-]\\b(0?[1-9]|[12][0-9]|3[01])\\b[\\s/-](\\d{4}|[\\u1369-\\u137C]+)$postfix""".r

  // dd-month-yyyy with an Amharic month name and an Arabic day:
  // 21-ጥቅምት-2013 or 21/ጥቅምት/2013 or 21 ጥቅምት 2013
  private val dateRegex3: Regex =
    s"""$prefix\\b(0?[1-9]|[12][0-9]|3[01])\\b[\\s/-]($monthsName)[\\s/-](\\d{4}|[\\u1369-\\u137C]+)$postfix""".r

  // month-dd-yyyy where the day may be a Geez numeral: ጥቅምት ፳፩ ፳፻፲፫ or ጥቅምት/፳፩/፳፻፲፫
  // No \b is used here: placed inside the group it would only bind to the first alternative.
  private val dateRegex4: Regex =
    s"""$prefix($monthsName)[\\s/-]($geezNumberDate|0?[1-9]|[12][0-9]|3[01])[\\s/-](\\d{4}|[\\u1369-\\u137C]+)$postfix""".r

  // dd-month-yyyy where the day may be a Geez numeral: ፳፩ ጥቅምት ፳፻፲፫ or ፳፩/ጥቅምት/፳፻፲፫ or 21/ጥቅምት/2013
  private val dateRegex5: Regex =
    s"""$prefix($geezNumberDate|0?[1-9]|[12][0-9]|3[01])[\\s/-]($monthsName)[\\s/-](\\d{4}|[\\u1369-\\u137C]+)$postfix""".r

  /**
   * Extracts the raw textual components of the first recognised Ethiopian date.
   *
   * Patterns are tried in order; each must match the whole input (the prefix/postfix parts of
   * the patterns absorb surrounding text when not in strict mode).
   *
   * @param dateString text to scan
   * @return `Some((year, month, day))` exactly as written in the input, or `None` if no pattern matches
   */
  def catchGeezDate(dateString: String): Option[(String, String, String)] =
    dateString match {
      // numeric day-month-year
      case dateRegex1(day, month, year) => Some((year, month, day))
      // Amharic month names (month-day-year)
      case dateRegex2(month, day, year) => Some((year, month, day))
      // Amharic month names (day-month-year)
      case dateRegex3(day, month, year) => Some((year, month, day))
      // dates that contain geez/Amharic numbers (month-day-year)
      case dateRegex4(month, day, year) => Some((year, month, day))
      // dates that contain geez/Amharic numbers (day-month-year)
      case dateRegex5(day, month, year) => Some((year, month, day))
      case _ => None
    }

  /**
   * Tells whether an Ethiopian year is a leap year, i.e. whether Pagume has 6 days.
   *
   * @param year Ethiopian calendar year
   * @return true when `year % 4 == 3`
   */
  def isLeapYear(year: Int): Boolean = year % 4 == 3

  /**
   * Checks that the given components form an existing Ethiopian calendar date.
   * The reason for a rejection is logged at FINE level.
   *
   * @param year  Ethiopian year, must be positive
   * @param month month number, 1 to 13
   * @param day   day of month, 1 to 30 (1 to 5 in Pagume, 1 to 6 in a leap year)
   * @return true when the date is valid
   */
  def isValidEthiopianCalendarDate(year: Int, month: Int, day: Int): Boolean = {
    val problem: Option[String] =
      if (year <= 0)
        Some("Year must be greater than 0.")
      else if (month < 1 || month > 13)
        Some(s"Month must be between 1 and 13. Provided month: $month.")
      else if (day < 1 || day > 30)
        Some(s"Day must be between 1 and 30. Provided day: $day.")
      else if (month == 13 && day > 6)
        // Pagume (month 13) never has more than 6 days
        Some(s"Day in Pagume cannot exceed 6. Provided day: $day.")
      else if (month == 13 && !isLeapYear(year) && day > 5)
        Some(s"Pagume only has 5 days in non-leap years. Provided day: $day.")
      else
        None

    problem.foreach(message => logger.log(Level.FINE, message))
    problem.isEmpty
  }

  /** Converts an Ethiopian calendar date to its Julian day number. */
  private def ethiopianDateToJDN(year: Int, month: Int, day: Int): Double = {
    // Julian-day offset of the Ethiopian era used by this formula.
    val epoch: Long = 1723856L
    // year / 4 is integer division: one extra day for every completed four-year cycle.
    val julianDayNumber: Double =
      (epoch + 365) + 365 * (year - 1) + (year / 4) + 30 * month + day - 31
    julianDayNumber
  }

  /**
   * Converts an Ethiopian calendar date to the Gregorian calendar.
   *
   * The date is first turned into a Julian day number, which is then decomposed with the
   * usual Julian-day-to-Gregorian arithmetic. The input is not validated here; callers should
   * check it with [[isValidEthiopianCalendarDate]] first.
   *
   * @param year     Ethiopian year
   * @param month    Ethiopian month, 1 to 13
   * @param day      Ethiopian day of month
   * @param datatype datatype attached to the resulting [[Date]]
   * @return the corresponding Gregorian date
   */
  def geezToGregorianDateConverter(
      year: Int,
      month: Int,
      day: Int,
      datatype: Datatype
  ): Option[Date] = {
    val JDN: Double = ethiopianDateToJDN(year, month, day)
    val Q: Double = JDN + 0.5
    val Z: Long = Q.toLong
    val W: Long = ((Z - 1867216.25) / 36524.25).toLong
    val X: Long = W / 4
    val A: Long = Z + 1 + W - X
    val B: Long = A + 1524
    val C: Long = ((B - 122.1) / 365.25).toLong
    val D: Long = (365.25 * C).toLong
    val E: Long = ((B - D) / 30.6001).toLong
    val F: Long = (30.6001 * E).toLong
    val gregorianDay: Int = (B - D - F + (Q - Z)).toInt
    val gregorianMonth: Long = if (E - 1 <= 12) E - 1 else E - 13
    // The year offset depends on the *Gregorian* month just computed (January/February belong
    // to the later year), not on the Ethiopian month that was passed in.
    val gregorianYear: Long = if (gregorianMonth <= 2) C - 4715 else C - 4716

    Some(
      new Date(
        Some(gregorianYear.toInt),
        Some(gregorianMonth.toInt),
        Some(gregorianDay),
        datatype
      )
    )
  }

  /**
   * Tells whether a string consists only of digit characters.
   * Geez numerals are not digits in this sense. An empty string yields true.
   *
   * @param str text to test
   * @return true if every character is a digit
   */
  def isArabicNumeral(str: String): Boolean = str.forall(_.isDigit)

  /**
   * Reads a number written either with digits or with Geez numerals.
   * Anything that cannot be read (empty text, overflow, unknown numeral) becomes 0, which
   * later fails date validation instead of throwing.
   */
  private def toNumber(text: String): Int =
    if (text.isEmpty) 0
    else if (isArabicNumeral(text)) scala.util.Try(text.toInt).getOrElse(0)
    else geezNumberParser.convertGeezToArabicNumeral(text).getOrElse(0)

  /**
   * Turns the textual date components into numbers.
   *
   * @param dateString `(year, month, day)` as text; the month may be a number, a Geez numeral
   *                   or an Amharic month name
   * @return the numeric `(year, month, day)`; a component that cannot be interpreted is 0
   */
  def formatDate(
      dateString: Option[(String, String, String)]
  ): Option[(Int, Int, Int)] =
    dateString.map { case (year, month, day) =>
      // Month names are looked up first; numeric months are never keys of the map and fall
      // through to the numeric conversion.
      val monthNum = monthsMap.getOrElse(month, toNumber(month))
      (toNumber(year), monthNum, toNumber(day))
    }

  /**
   * Finds the first Ethiopian calendar date in the input and converts it to a Gregorian date.
   *
   * @param input text to scan
   * @return the Gregorian date, or `None` when the text carries the Gregorian marker, contains
   *         no recognisable date, or the date found is not a valid Ethiopian date
   */
  def findGeezDate(input: String): Option[Date] =
    if (gregorianDateIndicator.findFirstIn(input).isDefined) {
      // Explicitly marked as Gregorian: leave it to the regular date parser.
      None
    } else {
      formatDate(catchGeezDate(input))
        .filter { case (yearNum, monthNum, dayNum) =>
          isValidEthiopianCalendarDate(yearNum, monthNum, dayNum)
        }
        .flatMap { case (yearNum, monthNum, dayNum) =>
          geezToGregorianDateConverter(yearNum, monthNum, dayNum, datatype)
        }
    }
}

0 commit comments

Comments
 (0)