From 3652da41b4a93996bf55adb01fbb53aaafc02c5d Mon Sep 17 00:00:00 2001 From: Blake-Madden Date: Sat, 9 Dec 2023 06:32:25 -0500 Subject: [PATCH] Update German and English to the latest Snowball standard --- src/english_stem.h | 28 ++++++++++++++++++++++------ src/german_stem.h | 33 +++++++++++++++++++++++---------- 2 files changed, 45 insertions(+), 16 deletions(-) diff --git a/src/english_stem.h b/src/english_stem.h index 21096b0..7f1f809 100644 --- a/src/english_stem.h +++ b/src/english_stem.h @@ -106,8 +106,9 @@ namespace stemming - ed edly+ ing ingly+ - Delete if the preceding word part contains a vowel, and then - If the word ends at, bl or iz add e (so luxuriat -> luxuriate), or - - If the word ends with a double remove the last letter (so hopp -> hop), or - - If the word is short, add e (so hop -> hope). + - If the word ends with a double preceded by something other than exactly 'a', 'e' or 'o' then + remove the last letter (so hopp -> hop but add, egg and off are not changed), or + - If the word does not end with a double and is short, add 'e' (so hop -> hope). Step 1c: @@ -808,6 +809,16 @@ namespace stemming } if (regress_trim) { + const bool isExactly3NotAEOStart + { + text.length() == 3 && + !(stem::is_either(text[0], + common_lang_constants::LOWER_A, common_lang_constants::UPPER_A) || + stem::is_either(text[0], + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E) || + stem::is_either(text[0], + common_lang_constants::LOWER_O, common_lang_constants::UPPER_O)) + }; if (stem::is_suffix(text, /*at*/common_lang_constants::LOWER_A, common_lang_constants::UPPER_A, common_lang_constants::LOWER_T, common_lang_constants::UPPER_T) || @@ -822,7 +833,9 @@ namespace stemming // need to search for r2 again because the 'e' added here may change that stem::find_r2(text, L"aeiouyAEIOUY"); } - else if (stem::is_suffix(text, + // undouble + else if ((text.length() > 3 || isExactly3NotAEOStart) && + (stem::is_suffix(text, /*bb*/ common_lang_constants::LOWER_B, common_lang_constants::UPPER_B, common_lang_constants::LOWER_B, common_lang_constants::UPPER_B) || @@ -856,15 +869,18 @@ namespace stemming stem::is_suffix(text, /*tt*/ common_lang_constants::LOWER_T, common_lang_constants::UPPER_T, - common_lang_constants::LOWER_T, common_lang_constants::UPPER_T) ) + common_lang_constants::LOWER_T, common_lang_constants::UPPER_T)) ) { text.erase(text.length()-1); stem::update_r_sections(text); } - else if (is_short_word(text, text.length() ) ) + else if ((text.length() < 2 || + stem::tolower_western(text[text.length() - 1]) != + stem::tolower_western(text[text.length() - 2]) ) && + is_short_word(text, text.length() ) ) { text += common_lang_constants::LOWER_E; - // need to search for r2 again because the 'e' added here may change that + // need to search for R2 again because the 'e' added here may change that stem::find_r2(text, L"aeiouyAEIOUY"); } } diff --git a/src/german_stem.h b/src/german_stem.h index 7fdf9e7..15326c3 100644 --- a/src/german_stem.h +++ b/src/german_stem.h @@ -40,10 +40,12 @@ namespace stemming Step 1: Search for the longest among the following suffixes: - - e em en ern er es - - s (preceded by a valid s-ending) + a.) em (not preceded by 'syst') + b.) ern er + c.) en es e + d.) s (preceded by a valid s-ending) and delete if in R1. (Of course the letter of the valid s-ending is not necessarily in R1). - If an ending of group (b) is deleted, and the ending is preceded by niss, delete the final s. + If an ending of group (c) is deleted, and the ending is preceded by 'niss', delete the final s. (For example, äckern -> äck, ackers -> acker, armes -> arm, bedürfnissen -> bedürfnis). @@ -160,7 +162,24 @@ namespace stemming void step_1(string_typeT& text) { bool stepBSucessfull{ false }; - if (stem::delete_if_is_in_r1(text, + // 'em', but not if 'system' + if ((is_suffix(text, + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, + common_lang_constants::LOWER_M, common_lang_constants::UPPER_M)) && + !(is_suffix(text, + common_lang_constants::LOWER_S, common_lang_constants::UPPER_S, + common_lang_constants::LOWER_Y, common_lang_constants::UPPER_Y, + common_lang_constants::LOWER_S, common_lang_constants::UPPER_S, + common_lang_constants::LOWER_T, common_lang_constants::UPPER_T, + common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, + common_lang_constants::LOWER_M, common_lang_constants::UPPER_M)) && + stem::delete_if_is_in_r1(text, + /*em*/common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, + common_lang_constants::LOWER_M, common_lang_constants::UPPER_M) ) + { + return; + } + else if (stem::delete_if_is_in_r1(text, /*ern*/common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, common_lang_constants::LOWER_R, common_lang_constants::UPPER_R, common_lang_constants::LOWER_N, common_lang_constants::UPPER_N) ) @@ -173,12 +192,6 @@ namespace stemming { return; } - else if (stem::delete_if_is_in_r1(text, - /*em*/common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, - common_lang_constants::LOWER_M, common_lang_constants::UPPER_M) ) - { - return; - } else if (stem::delete_if_is_in_r1(text, /*es*/common_lang_constants::LOWER_E, common_lang_constants::UPPER_E, common_lang_constants::LOWER_S, common_lang_constants::UPPER_S) )