|
| 1 | +package org.dbpedia.extraction.dataparser |
| 2 | +import java.util.logging.{Logger, Level} |
| 3 | +import scala.util.matching.Regex |
| 4 | +import org.dbpedia.extraction.config.dataparser.{ |
| 5 | + EthiopianDateParserConfig, |
| 6 | + DateTimeParserConfig |
| 7 | +} |
| 8 | +import org.dbpedia.extraction.util.{Language, Date} |
| 9 | +import org.dbpedia.extraction.util.{GeezNumberUtils} |
| 10 | +import org.dbpedia.extraction.ontology.datatypes.Datatype |
| 11 | + |
/**
 * Finds dates written in the Ethiopian calendar (with Arabic or Ge'ez numerals and/or
 * Amharic month names) inside a string and converts them to Gregorian [[Date]]s.
 *
 * @param datatype datatype attached to every [[Date]] produced by [[findGeezDate]]; must not be null
 * @param strict   when true, only whitespace may surround the date in the input;
 *                 when false, arbitrary text before and after the date is tolerated
 */
class EthiopianDateParser(datatype: Datatype, val strict: Boolean = false) {
  require(datatype != null, "datatype != null")
  @transient private val logger = Logger.getLogger(getClass.getName)

  val geezNumberParser = new GeezNumberUtils()
  private val monthsMap = EthiopianDateParserConfig.monthsMap
  // Alternation of all configured month names, e.g. "name1|name2|...".
  private val monthsName = monthsMap.keys.mkString("|")
  // Alternation of the configured Ge'ez day numerals.
  private val geezNumberDate =
    EthiopianDateParserConfig.geezNumberDateMap.values.mkString("|")

  // Inputs containing this marker are treated as Gregorian dates and skipped.
  // NOTE(review): the dots are unescaped regex wildcards, so any character is accepted
  // between the letters - confirm whether that leniency is intended.
  private val gregorianDateIndicator = s""".*(እ.ኤ.አ).*""".r
  private val prefix = if (strict) """\s*""" else """.*?"""
  private val postfix = if (strict) """\s*""" else ".*"

  // Year alternatives used below: four ASCII digits, or a run of characters from
  // U+1369-U+137C (the Ethiopic digit/number block).

  // catches dd-mm-yyyy including a 13th month: 21 13 2013, 21-13-2013, 21/13/2013
  private val dateRegex1: Regex =
    s"""$prefix\\b(0?[1-9]|[12][0-9]|3[01])\\b[-/\\s]\\b(0?[1-9]|1[0-2]|13)\\b[-/\\s](\\d{4}|[\\u1369-\\u137C]+)$postfix""".r

  // catches month-dd-yyyy with an Amharic month name,
  // like ጥቅምት-21-2013 or ጥቅምት/21/2013 or ጥቅምት 21 2013.
  // Exactly three capture groups (month, day, year): the extractor in catchGeezDate
  // binds three values and would never match a pattern with a different group count.
  private val dateRegex2: Regex =
    s"""$prefix($monthsName)[\\s/-]\\b(0?[1-9]|[12][0-9]|3[01])\\b[\\s/-](\\d{4}|[\\u1369-\\u137C]+)$postfix""".r

  // catches dd-month-yyyy like 21-ጥቅምት-2013 or 21/ጥቅምት/2013 or 21 ጥቅምት 2013
  // (three capture groups: day, month, year)
  private val dateRegex3: Regex =
    s"""$prefix\\b(0?[1-9]|[12][0-9]|3[01])\\b[\\s/-]($monthsName)[\\s/-](\\d{4}|[\\u1369-\\u137C]+)$postfix""".r

  // catches month-dd-yyyy with Ge'ez numerals: ጥቅምት ፳፩ ፳፻፲፫ or ጥቅምት/፳፩/፳፻፲፫
  // NOTE(review): the leading \b applies only to the first alternative of the month list.
  private val dateRegex4: Regex =
    s"""$prefix(\\b$monthsName)[\\s/-]($geezNumberDate|0?[1-9]|[12][0-9]|3[01])[\\s/-](\\d{4}|[\\u1369-\\u137C]+)$postfix""".r

  // catches dd-month-yyyy like ፳፩ ጥቅምት ፳፻፲፫ or ፳፩/ጥቅምት/፳፻፲፫ or 21/ጥቅምት/2013
  // NOTE(review): the leading \b applies only to the first alternative of the day list.
  private val dateRegex5: Regex =
    s"""$prefix(\\b$geezNumberDate|0?[1-9]|[12][0-9]|3[01])[\\s/-]($monthsName)[\\s/-](\\d{4}|[\\u1369-\\u137C]+)$postfix""".r

  /**
   * Tries the date patterns in order and returns the raw matched components.
   *
   * @param dateString text that may contain an Ethiopian date
   * @return `Some((year, month, day))` with the components exactly as they appear in the
   *         text (digits, Ge'ez numerals or a month name), or `None` if no pattern matches
   */
  def catchGeezDate(dateString: String): Option[(String, String, String)] =
    dateString match {
      // purely numeric day-month-year
      case dateRegex1(day, month, year) => Some((year, month, day))
      // Amharic month names (month-day-year)
      case dateRegex2(month, day, year) => Some((year, month, day))
      // Amharic month names (day-month-year)
      case dateRegex3(day, month, year) => Some((year, month, day))
      // dates that contain geez/Amharic numbers (month-day-year)
      case dateRegex4(month, day, year) => Some((year, month, day))
      // dates that contain geez/Amharic numbers (day-month-year)
      case dateRegex5(day, month, year) => Some((year, month, day))
      case _                            => None
    }

  /**
   * Leap-year rule used by this parser for the Ethiopian calendar.
   *
   * @param year Ethiopian year
   * @return true when `year % 4 == 3`
   */
  def isLeapYear(year: Int): Boolean = year % 4 == 3

  /**
   * Checks that the given components form a valid Ethiopian calendar date.
   * Each rejection is logged at FINE level.
   *
   * @param year  must be greater than 0
   * @param month must be in 1..13
   * @param day   must be in 1..30; in month 13 at most 6 in leap years, otherwise at most 5
   * @return true if the date is valid
   */
  def isValidEthiopianCalendarDate(year: Int, month: Int, day: Int): Boolean =
    if (year <= 0) {
      logger.log(Level.FINE, "Year must be greater than 0.")
      false
    } else if (month < 1 || month > 13) {
      logger.log(
        Level.FINE,
        s"Month must be between 1 and 13. Provided month: $month."
      )
      false
    } else if (day < 1 || day > 30) {
      logger.log(
        Level.FINE,
        s"Day must be between 1 and 30. Provided day: $day."
      )
      false
    } else if (month == 13 && day > 6) {
      // Pagume (month 13) never has more than 6 days
      logger.log(
        Level.FINE,
        s"Day in Pagume cannot exceed 6. Provided day: $day."
      )
      false
    } else if (month == 13 && !isLeapYear(year) && day > 5) {
      logger.log(
        Level.FINE,
        s"Pagume only has 5 days in non-leap years. Provided day: $day."
      )
      false
    } else {
      true
    }

  /**
   * Converts an Ethiopian calendar date to a Julian day number.
   * The result is always integral; it is returned as Double for the arithmetic in
   * [[geezToGregorianDateConverter]].
   */
  private def ethiopianDateToJDN(year: Int, month: Int, day: Int): Double = {
    // presumably the Julian day number of the Ethiopian era epoch - TODO confirm source
    val epoch: Long = 1723856L
    // year / 4 is integer division and accounts for the accumulated leap days
    ((epoch + 365) + 365L * (year - 1) + (year / 4) + 30 * month + day - 31).toDouble
  }

  /**
   * Converts an Ethiopian calendar date to the corresponding Gregorian date by going
   * through the Julian day number.
   *
   * @param year     Ethiopian year
   * @param month    Ethiopian month (1..13)
   * @param day      Ethiopian day of month
   * @param datatype datatype attached to the resulting [[Date]]
   * @return the Gregorian date; always `Some` (the input is not validated here)
   */
  def geezToGregorianDateConverter(
      year: Int,
      month: Int,
      day: Int,
      datatype: Datatype
  ): Option[Date] = {
    val jdn: Double = ethiopianDateToJDN(year, month, day)
    val q: Double = jdn + 0.5
    val z: Long = q.toLong
    val w: Long = ((z - 1867216.25) / 36524.25).toLong
    val x: Long = w / 4
    val a: Long = z + 1 + w - x
    val b: Long = a + 1524
    val c: Long = ((b - 122.1) / 365.25).toLong
    val d: Long = (365.25 * c).toLong
    val e: Long = ((b - d) / 30.6001).toLong
    val f: Long = (30.6001 * e).toLong

    // q - z is the fractional part of the day and is dropped by the truncation
    val gregorianDay: Int = (b - d - f + (q - z)).toInt
    val gregorianMonth: Long = if (e - 1 <= 12) e - 1 else e - 13
    // The year offset depends on the *Gregorian* month (January/February belong to the
    // following year in this algorithm), not on the Ethiopian month that was passed in.
    val gregorianYear: Long = if (gregorianMonth <= 2) c - 4715 else c - 4716

    Some(
      new Date(
        Some(gregorianYear.toInt),
        Some(gregorianMonth.toInt),
        Some(gregorianDay),
        datatype
      )
    )
  }

  /**
   * @param str string to test
   * @return true if every character of `str` is a digit (vacuously true for an empty string)
   */
  def isArabicNumeral(str: String): Boolean = str.forall(_.isDigit)

  /** Parses a digit string directly, otherwise converts it as a Ge'ez numeral (0 on failure). */
  private def toNumber(text: String): Int =
    if (isArabicNumeral(text)) text.toInt
    else geezNumberParser.convertGeezToArabicNumeral(text).getOrElse(0)

  /**
   * Turns the raw string components of a matched date into integers.
   *
   * Digit strings are parsed directly; a month that is not a digit string is looked up in
   * the configured month-name map first; everything else goes through the Ge'ez numeral
   * converter, with 0 as the fallback when conversion fails.
   *
   * @param dateString optional `(year, month, day)` as returned by [[catchGeezDate]]
   * @return the numeric `(year, month, day)`, or `None` if the input was `None`
   */
  def formatDate(
      dateString: Option[(String, String, String)]
  ): Option[(Int, Int, Int)] =
    dateString.map { case (year, month, day) =>
      val monthNum =
        if (isArabicNumeral(month)) month.toInt
        else monthsMap.getOrElse(month, toNumber(month))

      (toNumber(year), monthNum, toNumber(day))
    }

  /**
   * Finds an Ethiopian calendar date in `input` and converts it to a Gregorian [[Date]].
   *
   * @param input text to search
   * @return the converted date, or `None` when the input carries the Gregorian marker,
   *         contains no recognisable date, or the date is not valid in the Ethiopian calendar
   */
  def findGeezDate(input: String): Option[Date] =
    if (gregorianDateIndicator.findFirstIn(input).isDefined) {
      // explicitly marked as Gregorian: nothing to convert
      None
    } else {
      formatDate(catchGeezDate(input))
        .filter { case (yearNum, monthNum, dayNum) =>
          isValidEthiopianCalendarDate(yearNum, monthNum, dayNum)
        }
        .flatMap { case (yearNum, monthNum, dayNum) =>
          geezToGregorianDateConverter(yearNum, monthNum, dayNum, datatype)
        }
    }
}
0 commit comments