Skip to content

Commit eae38fb

Browse files
Meti-Adane and TallTed authored
[GSOC'24 Amharic chapter] Extend Existing Extractors For Amharic (#766)
* Added Amharic configurations to extend existing extractors * Added associated tests * Included extended extractors in property files --------- Co-authored-by: Ted Thibodeau Jr <[email protected]>
1 parent 564b144 commit eae38fb

15 files changed

+96
-8
lines changed

core/src/main/scala/org/dbpedia/extraction/config/dataparser/GeoCoordinateParserConfig.scala

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ object GeoCoordinateParserConfig
1212

1313
//map longitude letters used in languages to the ones used in English ("E" for East and "W" for West)
1414
val longitudeLetterMap = Map(
15+
"am" -> Map("E" -> "E", "W" -> "W"),
1516
"de" -> Map("E" -> "E", "O" -> "E", "W" -> "W"),
1617
"en" -> Map("E" -> "E", "W" -> "W"),
1718
"cs" -> Map("E" -> "E", "W" -> "W"),
@@ -22,6 +23,7 @@ object GeoCoordinateParserConfig
2223

2324
//map latitude letters used in languages to the ones used in English ("N" for North and "S" for South)
2425
val latitudeLetterMap = Map(
26+
"am" -> Map("N" -> "N", "S" -> "S"),
2527
"en" -> Map("N" -> "N", "S" -> "S"),
2628
"cs" -> Map("N" -> "N", "S" -> "S"),
2729
"mk" -> Map("N" -> "N", "S" -> "S")

core/src/main/scala/org/dbpedia/extraction/config/dataparser/ParserUtilsConfig.scala

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,26 @@ object ParserUtilsConfig
1414
"bln" -> 9,
1515
"trillion" -> 12,
1616
"quadrillion" -> 15
17+
),
18+
"am" -> Map(
19+
"አስር" -> 1,
20+
"መቶ" -> 2,
21+
"መቶዎች" -> 2,
22+
"thousand" -> 3,
23+
"ሺህ" -> 3,
24+
"million" -> 6,
25+
"mln" -> 6,
26+
"ሚሊዮን" -> 6,
27+
"billion" -> 9,
28+
"ቢሊዮን" -> 9,
29+
"bln" -> 9,
30+
"trillion" -> 12,
31+
"ትሪሊዮን" -> 12,
32+
"quadrillion" -> 15,
33+
"ኳድሪሊየን" -> 15
1734
),
1835
// For "ar" configuration, rendering right-to-left may seem like a bug, but it's not.
19-
// Don't change this else if you know how it is done.
36+
// Don't change this unless you know how it works.
2037
"ar" -> Map(
2138
"عشرة" -> 1,
2239
"مئة" -> 2,

core/src/main/scala/org/dbpedia/extraction/config/mappings/DateIntervalMappingConfig.scala

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,10 @@ object DateIntervalMappingConfig
88
)
99

1010
// For "ar" configuration, rendering right-to-left may seem like a bug, but it's not.
11-
// Don't change this else if you know how it is done.
11+
// Don't change this unless you know how it works.
1212
val presentMap = Map(
1313
"en" -> Set("present", "now"), // for example see https://en.wikipedia.org/wiki/Donald_Trump -> Political party -> Republican (1987–1999, 2009–2011, 2012–present)
14+
"am" -> Set("አሁን", "እስካሁን", "እስካሁን ድረስ"),
1415
"ar" -> Set("الحاضر"),
1516
"be" -> Set("па гэты дзень", "па сучаснасць"),
1617
"bg" -> Set("до наши дни", "настояще", "досега"),
@@ -38,6 +39,7 @@ object DateIntervalMappingConfig
3839

3940
val sinceMap = Map(
4041
"en" -> "since",
42+
"am" -> "(?:ጀምሮ|አንሥቶ|አንስቶ|ከ)",
4143
"ca" -> "des del",
4244
"es" -> "desde",
4345
"fr" -> "depuis",
@@ -48,12 +50,14 @@ object DateIntervalMappingConfig
4850

4951
val onwardMap = Map(
5052
"en" -> "onward",
53+
"am" -> "በኋላ",
5154
"es" -> "en adelante",
5255
"pt" -> "adiante|avante"
5356
)
5457

5558
val splitMap = Map(
5659
"en" -> "to",
60+
"am" -> "እስከ",
5761
"es" -> "al|a la|a|hasta (?:el|la)",
5862
"fr" -> "à|au",
5963
"pl" -> "do",

core/src/main/scala/org/dbpedia/extraction/config/mappings/DisambiguationExtractorConfig.scala

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ object DisambiguationExtractorConfig
66
// For "ar" and "he" configurations, rendering right-to-left may seem like a bug, but it's not.
77
// Don't change this unless you know what you're doing.
88
val disambiguationTitlePartMap = Map(
9+
"am" -> " (መንታ)",
910
"ar" -> " (توضيح)",
1011
"bg" -> " (пояснение)",
1112
"ca" -> " (desambiguació)",

core/src/main/scala/org/dbpedia/extraction/config/mappings/GenderExtractorConfig.scala

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,19 @@ object GenderExtractorConfig
66
val pronounsMap = Map(
77
"en" -> Map("she" -> "female", "her" -> "female", "he" -> "male", "his" -> "male", "him" -> "male", "herself" -> "female", "himself" -> "male",
88
"She" -> "female", "Her" -> "female", "He" -> "male", "His" -> "male", "Him" -> "male", "Herself" -> "female", "Himself" -> "male" //TODO why not just do case insensitive matches?
9+
),
10+
"am" -> Map(
11+
"እሷ" -> "ሴት",
12+
"እሷን" -> "ሴት",
13+
"የሷ" -> "ሴት",
14+
"እራሷን" -> "ሴት",
15+
"እራሷ" -> "ሴት",
16+
"እሱ" -> "ወንድ",
17+
"እሱን" -> "ወንድ",
18+
"የእሱ" -> "ወንድ",
19+
"የራሱ" -> "ወንድ",
20+
"እራሱ" -> "ወንድ",
21+
"እራሱን" -> "ወንድ"
922
),
1023
"pt" -> Map ("ela"-> "mulher", "dela" -> "mulher", "ele" -> "homem", "dele" -> "homem", "nela" -> "mulher", "nele" -> "homem",
1124
"Ela"-> "mulher", "Dela" -> "mulher", "Ele" -> "homem", "Dele" -> "homem", "Nela" -> "mulher", "Nele" -> "homem"

core/src/main/scala/org/dbpedia/extraction/config/mappings/HomepageExtractorConfig.scala

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,23 @@ object HomepageExtractorConfig
99
// Don't change this unless you know how it works.
1010

1111
private val propertyNamesMap = Map(
12+
"am" -> Set(
13+
"ድህረገፅ",
14+
"ድህረ_ገፅ",
15+
"ገጽ",
16+
"ድህረ ገጽ",
17+
"ድህረ_ገጽ",
18+
"ድረ_ገፅ",
19+
"ድረገፅ",
20+
"ድረገጽ",
21+
"ድረ ገጽ",
22+
"ድረ_ገጽ",
23+
"ዋና_ገጽ",
24+
"ዌብሳይት",
25+
"website",
26+
"web",
27+
"site"
28+
),
1229
"ar" -> Set("الموقع", "الصفحة الرسمية", "موقع", "الصفحة الرئيسية", "صفحة ويب", "موقع ويب"),
1330
"bg" -> Set("сайт", "уебсайт"),
1431
"ca" -> Set("pàgina", "web", "lloc"),
@@ -38,6 +55,7 @@ object HomepageExtractorConfig
3855
val supportedLanguages = propertyNamesMap.keySet
3956

4057
private val externalLinkSectionsMap = Map(
58+
"am" -> "(?:የውጭ ንባብ|የውጭ ማያያዣ)",
4159
"ar" -> "وصلات خارجية",
4260
"bg" -> "Външни препратки",
4361
"ca" -> "(?:Enllaços externs|Enllaço extern)",
@@ -65,6 +83,7 @@ object HomepageExtractorConfig
6583
}
6684

6785
private val officialMap = Map(
86+
"am" -> "ዋና",
6887
"ar" -> "رسمي",
6988
"bg" -> "официален",
7089
"ca" -> "oficial",

core/src/main/scala/org/dbpedia/extraction/config/mappings/ImageExtractorConfig.scala

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ object ImageExtractorConfig
1212
// Don't change this unless you know how it works.
1313
val NonFreeRegex = Map(
1414
"ar" -> """(?i)\{\{\s?غير حر""".r,
15+
"am" ->"""(?i)\{\{\s?(non-free|Logo|Screenshot|Noncommercial|ነፃ_ያልሆነ)""".r,
1516
"bg" ->"""(?i)\{\{\s?non-free""".r,
1617
"de" -> """(?iu)\{\{\s?(Dateiüberprüfung/benachrichtigt_\(Kategorie\)|Geschützt|Geschützt-Ungeklärt|Bild-LogoSH|Bild-PD-alt-100|Bild-PD-alt-1923|Bild-WikimediaCopyright)\s?\}\}""".r ,
1718
"el" -> """(?iu)\{\{\s?(εύλογη χρήση|σήμα|σήμα αθλητικού σωματείου|αφίσα ταινίας|σκηνή από ταινία|γραφικά υπολογιστή|εξώφυλλο άλμπουμ|εξώφυλλο βιβλίου|μη ελεύθερο έργο τέχνης|σελίδα κόμικς|σελίδα εφημερίδας|εικόνα-βιντεοπαιχνίδι|ιδιοκτησία Wikimedia)\s?\}\}""".r ,
@@ -29,9 +30,9 @@ object ImageExtractorConfig
2930
"ru" -> """(?iu)\{\{\s?(CopyrightByWikimedia|Fairuse|несвободный файл|несвободная лицензия|запрещенная лицензия)\s?\}\}""".r
3031
)
3132

32-
val flagRegex = """(?iu)s?^([^a-zA-Z0-9]*|[\w\s]*[^a-zA-Z0-9]+)(flag|banner|pavillon|drapeau|bandera|pabellón|bandiera|флаг)([^\w]*|[_\s]+)""".r
33-
val mapRegex = """(?iu)s?^([^a-zA-Z0-9]*|[\w\s]*[^a-zA-Z0-9]+)(map|karte|location|position|carte|carta|lage)([^\w]*|[_\s]+)""".r
34-
val signatureRegex = """(?iu)s?^([^a-zA-Z0-9]*|[\w\s]*[^a-zA-Z0-9]+)(signature|unterschrift)""".r
33+
val flagRegex = """(?iu)s?^([^a-zA-Z0-9]*|[\w\s]*[^a-zA-Z0-9]+)(flag|banner|pavillon|drapeau|bandera|pabellón|bandiera|флаг|ባንዲራ|ሰንደቅ_ዓላማ)([^\w]*|[_\s]+)""".r
34+
val mapRegex = """(?iu)s?^([^a-zA-Z0-9]*|[\w\s]*[^a-zA-Z0-9]+)(map|karte|location|position|carte|carta|lage|ካርታ)([^\w]*|[_\s]+)""".r
35+
val signatureRegex = """(?iu)s?^([^a-zA-Z0-9]*|[\w\s]*[^a-zA-Z0-9]+)(signature|unterschrift|ፊርማ)""".r
3536
val cOARegex = """(?iu)s?^([^a-zA-Z0-9]*|[\w\s]*[^a-zA-Z0-9]+)(coat_of_arms|emblem|crest|wappen|grandes_armes|blason|armoiries)([^\w]*|[_\s]+)""".r
3637

3738

core/src/main/scala/org/dbpedia/extraction/config/mappings/InfoboxExtractorConfig.scala

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ object InfoboxExtractorConfig
1212

1313
val ignoreProperties = Map (
1414
"en"-> Set("image", "image_photo", "map"),
15+
"am"-> Set("ምስል", "ፎቶ", "ስዕል", "ካርታ", "አርማ"),
1516
"ar"-> Set("صورة"),
1617
"id"-> Set("foto", "gambar"),
1718
"el"-> Set("εικόνα", "εικονα", "Εικόνα", "Εικονα", "χάρτης", "Χάρτης"),

core/src/main/scala/org/dbpedia/extraction/config/mappings/TopicalConceptsExtractorConfig.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ object TopicalConceptsExtractorConfig
1010
val catMainTemplates = Set(
1111
"مزيد" ,// ar
1212
"Infocat", "Infocatm", // ca
13-
"Catmore", // el,ja
13+
"Catmore", // el,ja,am
1414
"Cat main", // en
1515
"AP", // es
1616
"Nagusia", // eu

core/src/test/scala/org/dbpedia/extraction/dataparser/GeoCoordinateParserTest.scala

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,16 @@ class GeoCoordinateParserTest extends FlatSpec with Matchers
2626
{
2727
parse("fr", "{{coord|51/12/N|03/13/E}}") should equal (Some(51.2,3.216666666666667))
2828
}
29-
29+
30+
// Tests for Amharic
31+
"GeoCoordinateParser(20º12'00\"N 03º13'00\"E)" should "return (20.2,3.216666666666667))" in
32+
{
33+
parse("am", "20º12'00\"N 03º13'00\"E") should equal (Some(20.2,3.216666666666667))
34+
}
35+
"GeoCoordinateParser({{coord|10.2|N|13.2|E}}" should "return (10.2,13.2)) for Amharic" in
36+
{
37+
parse("am", "{{coord|10.2|N|13.2|E}}") should equal (Some(10.2,13.2))
38+
}
3039

3140
private val wikiParser = WikiParser.getInstance()
3241

0 commit comments

Comments
 (0)