From 62d1a745d9f4c94298a6e7016e5cfcadd728b7b6 Mon Sep 17 00:00:00 2001 From: "Meti A. Bayissa" Date: Thu, 20 Jun 2024 17:12:44 +0300 Subject: [PATCH 1/2] Add extractor configurations for Amharic Language --- .../dataparser/DateTimeParserConfig.scala | 15 ++++++ .../dataparser/DurationParserConfig.scala | 49 +++++++++++++++++++ .../GeoCoordinateParserConfig.scala | 2 + .../config/dataparser/ParserUtilsConfig.scala | 17 +++++++ .../mappings/DateIntervalMappingConfig.scala | 4 ++ .../DisambiguationExtractorConfig.scala | 1 + .../mappings/GenderExtractorConfig.scala | 13 +++++ .../mappings/HomepageExtractorConfig.scala | 19 +++++++ .../mappings/InfoboxExtractorConfig.scala | 1 + .../TopicalConceptsExtractorConfig.scala | 2 +- dump/extraction.default.properties | 2 + dump/extraction.mappings.properties | 2 + dump/extraction.topical.properties | 2 +- 13 files changed, 127 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/dbpedia/extraction/config/dataparser/DateTimeParserConfig.scala b/core/src/main/scala/org/dbpedia/extraction/config/dataparser/DateTimeParserConfig.scala index e1f89ac10b..f9f04580b5 100644 --- a/core/src/main/scala/org/dbpedia/extraction/config/dataparser/DateTimeParserConfig.scala +++ b/core/src/main/scala/org/dbpedia/extraction/config/dataparser/DateTimeParserConfig.scala @@ -6,6 +6,21 @@ object DateTimeParserConfig val monthsMap = Map( // For "ar" configuration, right-to-left rendering may seem like a bug, but it's not. // Don't change this unless you know how it is done. 
+ "am" -> Map( + "መስከረም" -> 1, + "ጥቅምት" -> 2, + "ኅዳር" -> 3, + "ታኅሳስ" -> 4, + "ጥር" -> 5, + "የካቲት" -> 6, + "መጋቢት" -> 7, + "ሚያዝያ" -> 8, + "ግንቦት" -> 9, + "ሰኔ" -> 10, + "ሐምሌ" -> 11, + "ነሐሴ" -> 12, + "ጳጉሜ" -> 13 + ), "ar" -> Map("جانفي"->1,"فيفري"->2,"مارس"->3,"أفريل"->4,"ماي"->5,"جوان"->6,"جويلية"->7,"أوت"->8,"سبتمبر"->9,"أكتوبر"->10,"نوفمبر"->11,"ديسمبر"->12, "يناير"->1,"فبراير"->2,"أبريل"->4,"مايو"->5,"يونيو"->6,"يوليو"->7,"يوليوز"->7,"أغسطس"->8,"غشت"->8,"شتنبر"->9,"نونبر"->11,"دجنبر"->12), "bg" -> Map("януари"->1,"февруари"->2,"март"->3,"април"->4,"май"->5,"юни"->6,"юли"->7,"август"->8,"септември"->9,"октомври"->10,"ноември"->11,"декември"->12), diff --git a/core/src/main/scala/org/dbpedia/extraction/config/dataparser/DurationParserConfig.scala b/core/src/main/scala/org/dbpedia/extraction/config/dataparser/DurationParserConfig.scala index f91458058c..37978f11cd 100644 --- a/core/src/main/scala/org/dbpedia/extraction/config/dataparser/DurationParserConfig.scala +++ b/core/src/main/scala/org/dbpedia/extraction/config/dataparser/DurationParserConfig.scala @@ -39,6 +39,55 @@ object DurationParserConfig "years" -> "year", "yr" -> "year" ), + "am" -> Map( + "second" -> "second", + "s" -> "second", + "sec" -> "second", + "seconds" -> "second", + "secs" -> "second", + "\"" -> "second", + "ሰከንድ" -> "second", + "ሴኮንድ" -> "second", + "minute" -> "minute", + "m" -> "minute", + "min" -> "minute", + "minutes" -> "minute", + "min." -> "minute", + "mins" -> "minute", + "minu" -> "minute", + "'" -> "minute", + "ደቂቃ" -> "minute", + "ደቂቃዎች" -> "minute", + "hour" -> "hour", + "h" -> "hour", + "hours" -> "hour", + "hr" -> "hour", + "hr." -> "hour", + "hrs" -> "hour", + "hrs." -> "hour", + "ሰአት" -> "hour", + "ሰዓታት" -> "hour", + "ሰዓት" -> "hour", + "day" -> "day", + "d" -> "day", + "d." 
-> "day", + "days" -> "day", + "ቀን" -> "day", + "ቀናት" -> "day", + "ቀኖች" -> "day", + "month" -> "month", + "months" -> "month", + "ወር" -> "month", + "ወራት" -> "month", + "ወሮች" -> "month", + "year" -> "year", + "y" -> "year", + "years" -> "year", + "yr" -> "year", + "አመት" -> "year", + "ዓመት" -> "year", + "ዓመታት" -> "year" + ), // For "ar" configuration, rendering right-to-left may seems like a bug, but it's not. // Don't change this else if you know how it is done. "ar" -> Map( diff --git a/core/src/main/scala/org/dbpedia/extraction/config/dataparser/GeoCoordinateParserConfig.scala b/core/src/main/scala/org/dbpedia/extraction/config/dataparser/GeoCoordinateParserConfig.scala index 737a32f0e9..36b3e92128 100644 --- a/core/src/main/scala/org/dbpedia/extraction/config/dataparser/GeoCoordinateParserConfig.scala +++ b/core/src/main/scala/org/dbpedia/extraction/config/dataparser/GeoCoordinateParserConfig.scala @@ -12,6 +12,7 @@ object GeoCoordinateParserConfig //map latitude letters used in languages to the ones used in English ("E" for East and "W" for West) val longitudeLetterMap = Map( + "am" -> Map("E" -> "E", "W" -> "W"), "de" -> Map("E" -> "E", "O" -> "E", "W" -> "W"), "en" -> Map("E" -> "E", "W" -> "W"), "cs" -> Map("E" -> "E", "W" -> "W"), @@ -22,6 +23,7 @@ object GeoCoordinateParserConfig //map longitude letters used in languages to the ones used in English ("N" for North and "S" for South) val latitudeLetterMap = Map( + "am" -> Map("N" -> "N", "S" -> "S"), "en" -> Map("N" -> "N", "S" -> "S"), "cs" -> Map("N" -> "N", "S" -> "S"), "mk" -> Map("N" -> "N", "S" -> "S") diff --git a/core/src/main/scala/org/dbpedia/extraction/config/dataparser/ParserUtilsConfig.scala b/core/src/main/scala/org/dbpedia/extraction/config/dataparser/ParserUtilsConfig.scala index 7f802625eb..fa977e371e 100644 --- a/core/src/main/scala/org/dbpedia/extraction/config/dataparser/ParserUtilsConfig.scala +++ b/core/src/main/scala/org/dbpedia/extraction/config/dataparser/ParserUtilsConfig.scala @@ 
-14,6 +14,23 @@ object ParserUtilsConfig "bln" -> 9, "trillion" -> 12, "quadrillion" -> 15 + ), + "am" -> Map( + "አስር" -› 1, + "መቶ" -› 2, + "መቶዎች" -> 2, + "thousand" -> 3, + "ሺህ" -> 3, + "million" -> 6, + "mln" -> 6, + "ሚሊዮን" -> 6, + "billion" -> 9, + "ቢሊዮን" -> 9, + "bln" -> 9, + "trillion" -> 12, + "ትሪሊዮን" -> 12, + "quadrillion" -> 15, + "ኳድሪሊየን" -> 15 ), // For "ar" configuration, rendering right-to-left may seems like a bug, but it's not. // Don't change this else if you know how it is done. diff --git a/core/src/main/scala/org/dbpedia/extraction/config/mappings/DateIntervalMappingConfig.scala b/core/src/main/scala/org/dbpedia/extraction/config/mappings/DateIntervalMappingConfig.scala index 68a17c9409..2611d80eaa 100644 --- a/core/src/main/scala/org/dbpedia/extraction/config/mappings/DateIntervalMappingConfig.scala +++ b/core/src/main/scala/org/dbpedia/extraction/config/mappings/DateIntervalMappingConfig.scala @@ -11,6 +11,7 @@ object DateIntervalMappingConfig // Don't change this else if you know how it is done. 
val presentMap = Map( "en" -> Set("present", "now"), // for example see https://en.wikipedia.org/wiki/Donald_Trump -> Political party -> Republican (1987–1999, 2009–2011, 2012–present) + "am" -> Set("አሁን", "እስካሁን", "እስካሁን ድረስ"), "ar" -> Set("الحاضر"), "be" -> Set("па гэты дзень", "па сучаснасць"), "bg" -> Set("до наши дни", "настояще", "досега"), @@ -38,6 +39,7 @@ object DateIntervalMappingConfig val sinceMap = Map( "en" -> "since", + "am" -> "(?:ጀምሮ|አንሥቶ|አንስቶ)", "ca" -> "des del", "es" -> "desde", "fr" -> "depuis", @@ -48,12 +50,14 @@ object DateIntervalMappingConfig val onwardMap = Map( "en" -> "onward", + "am" -> "በኋላ", "es" -> "en adelante", "pt" -> "adiante|avante" ) val splitMap = Map( "en" -> "to", + "am" -> "እስከ", "es" -> "al|a la|a|hasta (?:el|la)", "fr" -> "à|au", "pl" -> "do", diff --git a/core/src/main/scala/org/dbpedia/extraction/config/mappings/DisambiguationExtractorConfig.scala b/core/src/main/scala/org/dbpedia/extraction/config/mappings/DisambiguationExtractorConfig.scala index 28328f6383..dc92f216ec 100644 --- a/core/src/main/scala/org/dbpedia/extraction/config/mappings/DisambiguationExtractorConfig.scala +++ b/core/src/main/scala/org/dbpedia/extraction/config/mappings/DisambiguationExtractorConfig.scala @@ -6,6 +6,7 @@ object DisambiguationExtractorConfig // For "ar" and "he" configurations, rendering right-to-left may seem like a bug, but it's not. // Don't change this unless you know what you're doing. 
val disambiguationTitlePartMap = Map( + "am" -> " (መንታ)", "ar" -> " (توضيح)", "bg" -> " (пояснение)", "ca" -> " (desambiguació)", diff --git a/core/src/main/scala/org/dbpedia/extraction/config/mappings/GenderExtractorConfig.scala b/core/src/main/scala/org/dbpedia/extraction/config/mappings/GenderExtractorConfig.scala index 06dbb67fc4..38dae8c840 100644 --- a/core/src/main/scala/org/dbpedia/extraction/config/mappings/GenderExtractorConfig.scala +++ b/core/src/main/scala/org/dbpedia/extraction/config/mappings/GenderExtractorConfig.scala @@ -6,6 +6,19 @@ object GenderExtractorConfig val pronounsMap = Map( "en" -> Map("she" -> "female", "her" -> "female", "he" -> "male", "his" -> "male", "him" -> "male", "herself" -> "female", "himself" -> "male", "She" -> "female", "Her" -> "female", "He" -> "male", "His" -> "male", "Him" -> "male", "Herself" -> "female", "Himself" -> "male" //TODO why not just do case insensitive matches? + ), + "am" -> Map( + "እሷ" -> "ሴት", + "እሷን" -> "ሴት", + "የሷ" -> "ሴት", + "እራሷን" -> "ሴት", + "እራሷ" -> "ሴት", + "እሱ" -> "ወንድ", + "እሱን" -> "ወንድ", + "የእሱ" -> "ወንድ", + "የራሱ" -> "ወንድ", + "እራሱ" -> "ወንድ", + "እራሱን" -> "ወንድ" ), "pt" -> Map ("ela"-> "mulher", "dela" -> "mulher", "ele" -> "homem", "dele" -> "homem", "nela" -> "mulher", "nele" -> "homem", "Ela"-> "mulher", "Dela" -> "mulher", "Ele" -> "homem", "Dele" -> "homem", "Nela" -> "mulher", "Nele" -> "homem" diff --git a/core/src/main/scala/org/dbpedia/extraction/config/mappings/HomepageExtractorConfig.scala b/core/src/main/scala/org/dbpedia/extraction/config/mappings/HomepageExtractorConfig.scala index e221e69af5..0d6ed99af1 100644 --- a/core/src/main/scala/org/dbpedia/extraction/config/mappings/HomepageExtractorConfig.scala +++ b/core/src/main/scala/org/dbpedia/extraction/config/mappings/HomepageExtractorConfig.scala @@ -9,6 +9,23 @@ object HomepageExtractorConfig // Don't change this else if you know how it is done. 
private val propertyNamesMap = Map( + "am" -> Set( + "ድህረገፅ", + "ድህረ_ገፅ", + "ገጽ", + "ድህረ ገጽ", + "ድህረ_ገጽ", + "ድረ_ገፅ", + "ድረገፅ", + "ድረገጽ", + "ድረ ገጽ", + "ድረ_ገጽ", + "ዋና_ገጽ", + "ዌብሳይት", + "website", + "web", + "site" + ), "ar" -> Set("الموقع", "الصفحة الرسمية", "موقع", "الصفحة الرئيسية", "صفحة ويب", "موقع ويب"), "bg" -> Set("сайт", "уебсайт"), "ca" -> Set("pàgina", "web", "lloc"), @@ -38,6 +55,7 @@ object HomepageExtractorConfig val supportedLanguages = propertyNamesMap.keySet private val externalLinkSectionsMap = Map( + "am" -> "(?:የውጭ ንባብ|የውጭ ማያያዣ)", "ar" -> "وصلات خارجية", "bg" -> "Външни препратки", "ca" -> "(?:Enllaços externs|Enllaço extern)", @@ -65,6 +83,7 @@ object HomepageExtractorConfig } private val officialMap = Map( + "am" -> "ዋና", "ar" -> "رسمي", "bg" -> "официален", "ca" -> "oficial", diff --git a/core/src/main/scala/org/dbpedia/extraction/config/mappings/InfoboxExtractorConfig.scala b/core/src/main/scala/org/dbpedia/extraction/config/mappings/InfoboxExtractorConfig.scala index c10043982d..c8be48610e 100644 --- a/core/src/main/scala/org/dbpedia/extraction/config/mappings/InfoboxExtractorConfig.scala +++ b/core/src/main/scala/org/dbpedia/extraction/config/mappings/InfoboxExtractorConfig.scala @@ -12,6 +12,7 @@ object InfoboxExtractorConfig val ignoreProperties = Map ( "en"-> Set("image", "image_photo", "map"), + "am"-> Set("ምስል", "ፎቶ", "ስዕል", "ካርታ", "አርማ"), "ar"-> Set("صورة"), "id"-> Set("foto", "gambar"), "el"-> Set("εικόνα", "εικονα", "Εικόνα", "Εικονα", "χάρτης", "Χάρτης"), diff --git a/core/src/main/scala/org/dbpedia/extraction/config/mappings/TopicalConceptsExtractorConfig.scala b/core/src/main/scala/org/dbpedia/extraction/config/mappings/TopicalConceptsExtractorConfig.scala index 6cfccdb3da..28ec4e46e1 100644 --- a/core/src/main/scala/org/dbpedia/extraction/config/mappings/TopicalConceptsExtractorConfig.scala +++ b/core/src/main/scala/org/dbpedia/extraction/config/mappings/TopicalConceptsExtractorConfig.scala @@ -10,7 +10,7 @@ object 
TopicalConceptsExtractorConfig val catMainTemplates = Set( "مزيد" ,// ar "Infocat", "Infocatm", // ca - "Catmore", // el,ja + "Catmore", // el,ja,am "Cat main", // en "AP", // es "Nagusia", // eu diff --git a/dump/extraction.default.properties b/dump/extraction.default.properties index c38cb7dc1a..7e7f5e2445 100644 --- a/dump/extraction.default.properties +++ b/dump/extraction.default.properties @@ -21,6 +21,8 @@ extractors=.ArticleCategoriesExtractor,.ArticlePageExtractor,.ArticleTemplatesEx .PageLinksExtractor,.RedirectExtractor,.RevisionIdExtractor,.ProvenanceExtractor,.SkosCategoriesExtractor,\ .WikiPageLengthExtractor,.WikiPageOutDegreeExtractor +extractors.am=.MappingExtractor,.DisambiguationExtractor,.HomepageExtractor,.GenderExtractor,.TopicalConceptsExtractor + extractors.ar=.MappingExtractor,.TopicalConceptsExtractor extractors.be=.MappingExtractor diff --git a/dump/extraction.mappings.properties b/dump/extraction.mappings.properties index aaeaeb340e..e387d61aec 100644 --- a/dump/extraction.mappings.properties +++ b/dump/extraction.mappings.properties @@ -18,6 +18,8 @@ languages=@mappings extractors=.MappingExtractor +#extractors.am=.MappingExtractor,.DisambiguationExtractor,.HomepageExtractor,.GenderExtractor,.TopicalConceptsExtractor +# #extractors.ar=.MappingExtractor,.TopicalConceptsExtractor # #extractors.be=.MappingExtractor diff --git a/dump/extraction.topical.properties b/dump/extraction.topical.properties index 17611f20d7..55085562e2 100644 --- a/dump/extraction.topical.properties +++ b/dump/extraction.topical.properties @@ -15,7 +15,7 @@ # use only directories that contain a 'download-complete' file? Default is false. require-download-complete=true -languages=ar,ca,el,en,es,eu,fr,it,pt,ru +languages=am,ar,ca,el,en,es,eu,fr,it,pt,ru # extractor class names starting with "." are prefixed by "org.dbpedia.extraction.mappings" From a65c5127fa73d71913ee83ba6adda2285d65df37 Mon Sep 17 00:00:00 2001 From: "Meti A. 
Bayissa" Date: Thu, 20 Jun 2024 17:19:22 +0300 Subject: [PATCH 2/2] Fix malformed "->" arrows in Amharic ParserUtilsConfig scale map --- .../extraction/config/dataparser/ParserUtilsConfig.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/dbpedia/extraction/config/dataparser/ParserUtilsConfig.scala b/core/src/main/scala/org/dbpedia/extraction/config/dataparser/ParserUtilsConfig.scala index fa977e371e..c0169a6c52 100644 --- a/core/src/main/scala/org/dbpedia/extraction/config/dataparser/ParserUtilsConfig.scala +++ b/core/src/main/scala/org/dbpedia/extraction/config/dataparser/ParserUtilsConfig.scala @@ -16,8 +16,8 @@ object ParserUtilsConfig "quadrillion" -> 15 ), "am" -> Map( - "አስር" -› 1, - "መቶ" -› 2, + "አስር" -> 1, + "መቶ" -> 2, "መቶዎች" -> 2, "thousand" -> 3, "ሺህ" -> 3,