Skip to content

Commit a051e12

Browse files
committed
Update the Swedish stemmer to the latest Snowball standard
1 parent 797458c commit a051e12

File tree

2 files changed

+30
-14
lines changed

2 files changed

+30
-14
lines changed

Changelog.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
## Change Log
22

3+
### 2023.1 Release
4+
- Updated the Swedish stemmer to the latest Snowball standard.
5+
36
### 2023 Release
47
- Updated Spanish, Russian, Italian, and French stemmers to the latest Snowball standard.
58
- Made stemming less aggressive with punctuation at the end of a word. Now, only trailing `'` and `'s`

src/swedish_stem.h

Lines changed: 27 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ namespace stemming
2929
3030
Define a valid s-ending as one of:
3131
- b c d f g h j k l m n o p r t v y
32-
32+
3333
Define a valid öst-ending as one of:
3434
3535
- i k l n p r t u v
@@ -58,12 +58,16 @@ namespace stemming
5858
<b>Step 3:</b>
5959
6060
Search for the longest among the following suffixes in R1, and perform the action indicated.
61-
- lig ig els
62-
- Delete
63-
- löst
64-
- Replace with lös
65-
- fullt
66-
- Replace with full
61+
- lig ig els
62+
- Delete.
63+
- öst
64+
- Replace with ös if preceded by a valid öst-ending.
65+
66+
The letter of the valid öst-ending is not necessarily in R1.
67+
Prior to Snowball 2.3.0, the only valid öst-ending was effectively
68+
l, and that letter was required to be in R1.
69+
- fullt
70+
- Replace with full.
6771
*/
6872
//------------------------------------------------------
6973
template <typename string_typeT = std::wstring>
@@ -378,7 +382,7 @@ namespace stemming
378382
void step_3(string_typeT& text)
379383
{
380384
if (stem<string_typeT>::is_suffix_in_r1(text,
381-
/*fullt*/
385+
/* fullt */
382386
common_lang_constants::LOWER_F, common_lang_constants::UPPER_F,
383387
common_lang_constants::LOWER_U, common_lang_constants::UPPER_U,
384388
common_lang_constants::LOWER_L, common_lang_constants::UPPER_L,
@@ -388,9 +392,18 @@ namespace stemming
388392
text.erase(text.length()-1);
389393
stem<string_typeT>::update_r_sections(text);
390394
}
391-
else if (stem<string_typeT>::is_suffix_in_r1(text,
392-
/*löst*/
393-
common_lang_constants::LOWER_L, common_lang_constants::UPPER_L,
395+
else if (text.length() >= 4 &&
396+
(stem<string_typeT>::is_either(text[text.length() - 4], common_lang_constants::LOWER_I, common_lang_constants::UPPER_I) ||
397+
stem<string_typeT>::is_either(text[text.length() - 4], common_lang_constants::LOWER_K, common_lang_constants::UPPER_K) ||
398+
stem<string_typeT>::is_either(text[text.length() - 4], common_lang_constants::LOWER_L, common_lang_constants::UPPER_L) ||
399+
stem<string_typeT>::is_either(text[text.length() - 4], common_lang_constants::LOWER_N, common_lang_constants::UPPER_N) ||
400+
stem<string_typeT>::is_either(text[text.length() - 4], common_lang_constants::LOWER_P, common_lang_constants::UPPER_P) ||
401+
stem<string_typeT>::is_either(text[text.length() - 4], common_lang_constants::LOWER_R, common_lang_constants::UPPER_R) ||
402+
stem<string_typeT>::is_either(text[text.length() - 4], common_lang_constants::LOWER_T, common_lang_constants::UPPER_T) ||
403+
stem<string_typeT>::is_either(text[text.length() - 4], common_lang_constants::LOWER_U, common_lang_constants::UPPER_U) ||
404+
stem<string_typeT>::is_either(text[text.length() - 4], common_lang_constants::LOWER_V, common_lang_constants::UPPER_V)) &&
405+
stem<string_typeT>::is_suffix_in_r1(text,
406+
/* öst (with valid character in front of it) */
394407
common_lang_constants::LOWER_O_UMLAUTS, common_lang_constants::UPPER_O_UMLAUTS,
395408
common_lang_constants::LOWER_S, common_lang_constants::UPPER_S,
396409
common_lang_constants::LOWER_T, common_lang_constants::UPPER_T) )
@@ -399,19 +412,19 @@ namespace stemming
399412
stem<string_typeT>::update_r_sections(text);
400413
}
401414
else if (stem<string_typeT>::delete_if_is_in_r1(text,
402-
/*lig*/
415+
/* lig */
403416
common_lang_constants::LOWER_L, common_lang_constants::UPPER_L,
404417
common_lang_constants::LOWER_I, common_lang_constants::UPPER_I,
405418
common_lang_constants::LOWER_G, common_lang_constants::UPPER_G, false) )
406419
{ return; }
407420
else if (stem<string_typeT>::delete_if_is_in_r1(text,
408-
/*els*/
421+
/* els */
409422
common_lang_constants::LOWER_E, common_lang_constants::UPPER_E,
410423
common_lang_constants::LOWER_L, common_lang_constants::UPPER_L,
411424
common_lang_constants::LOWER_S, common_lang_constants::UPPER_S, false) )
412425
{ return; }
413426
else if (stem<string_typeT>::delete_if_is_in_r1(text,
414-
/*ig*/
427+
/* ig */
415428
common_lang_constants::LOWER_I, common_lang_constants::UPPER_I,
416429
common_lang_constants::LOWER_G, common_lang_constants::UPPER_G, false) )
417430
{ return; }

0 commit comments

Comments
 (0)