Skip to content

Commit

Permalink
Support Grapheme_Cluster_Break=Prepend (#62)
Browse files Browse the repository at this point in the history
These characters act like combining marks,
except they go before the base character instead of after it.
  • Loading branch information
Jules-Bertholet authored Jun 17, 2024
1 parent afab363 commit 8e40640
Show file tree
Hide file tree
Showing 4 changed files with 42 additions and 9 deletions.
16 changes: 16 additions & 0 deletions scripts/unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
# - ReadMe.txt
# - Scripts.txt
# - UnicodeData.txt
# - auxiliary/GraphemeBreakProperty.txt
# - emoji/emoji-data.txt
# - emoji/emoji-variation-sequences.txt
# - extracted/DerivedGeneralCategory.txt
Expand Down Expand Up @@ -526,6 +527,21 @@ def load_zero_widths() -> list[bool]:
zw_map[0x0891] = True
zw_map[0x08E2] = True

# `[:Grapheme_Cluster_Break=Prepend:]-[:Prepended_Concatenation_Mark:]`
gcb_prepend = set()
load_property(
"auxiliary/GraphemeBreakProperty.txt",
"Prepend",
lambda cp: gcb_prepend.add(cp),
)
load_property(
"PropList.txt",
"Prepended_Concatenation_Mark",
lambda cp: gcb_prepend.remove(cp),
)
for cp in gcb_prepend:
zw_map[cp] = True

# HANGUL CHOSEONG FILLER
# U+115F is a `Default_Ignorable_Code_Point`, and therefore would normally have
# zero width. However, the expected usage is to combine it with vowel or trailing jamo
Expand Down
5 changes: 4 additions & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@
//! - **[Buginese]**: `"\u{1A15}\u{1A17}\u{200D}\u{1A10}"` (<a, -i> ya, `ᨕᨗ‍ᨐ`) has total width 1.
//! - **[Hebrew]**: `"א\u{200D}ל"` (Alef-Lamed, `א‍ל`) has total width 1.
//! - **[Khmer]**: Coeng signs consisting of `'\u{17D2}'` followed by a character in
//! `'\u{1780}'..='\u{1782}' | '\u{1784}'..='\u{1787}' | '\u{1789}'..='\u{178C}' | '\u{178E}'..='\u{1793}' | '\u{1795}'..='\u{1798}' | '\u{179B}'..='\u{179D}' | '\u{17A0}' | '\u{17A2}' | '\u{17A7}' | '\u{17AB}'..='\u{17AC}' | '\u{17AF}'`
//! `'\u{1780}'..='\u{1782}' | '\u{1784}'..='\u{1787}' | '\u{1789}'..='\u{178C}' | '\u{178E}'..='\u{1793}' | '\u{1795}'..='\u{1798}' | '\u{179B}'..='\u{179D}' | '\u{17A0}' | '\u{17A2}' | '\u{17A7}' | '\u{17AB}'..='\u{17AC}' | '\u{17AF}'`
//! have width 0.
//! - **[Lisu]**: Tone letter combinations consisting of a character in the range `'\u{A4F8}'..='\u{A4FB}'`
//! followed by a character in the range `'\u{A4FC}'..='\u{A4FD}'` have width 1. For example: `ꓹꓼ`
Expand Down Expand Up @@ -113,6 +113,8 @@
//! - [`'\u{0890}'` POUND MARK ABOVE](https://util.unicode.org/UnicodeJsps/character.jsp?a=0890),
//! - [`'\u{0891}'` PIASTRE MARK ABOVE](https://util.unicode.org/UnicodeJsps/character.jsp?a=0891), and
//! - [`'\u{08E2}'` DISPUTED END OF AYAH](https://util.unicode.org/UnicodeJsps/character.jsp?a=08E2).
//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BGrapheme_Cluster_Break%3DPrepend%7D-%5Cp%7BPrepended_Concatenation_Mark%7D)
//! with the [`Grapheme_Cluster_Break=Prepend`](https://www.unicode.org/reports/tr29/#Prepend) property that are not also [`Prepended_Concatenation_Mark`]s.
//! - [`'\u{A8FA}'` DEVANAGARI CARET](https://util.unicode.org/UnicodeJsps/character.jsp?a=A8FA).
//! 5. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D)
//! with an [`East_Asian_Width`] of [`Fullwidth`] or [`Wide`] have width 2.
Expand All @@ -132,6 +134,7 @@
//! [`East_Asian_Width`]: https://www.unicode.org/reports/tr11/#ED1
//! [`Emoji_Presentation`]: https://unicode.org/reports/tr51/#def_emoji_presentation
//! [`General_Category`]: https://www.unicode.org/versions/Unicode15.0.0/ch04.pdf#G124142
//! [`Grapheme_Extend=Prepend`]: https://www.unicode.org/reports/tr29/#Prepend
//! [`Grapheme_Extend`]: https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G52443
//! [`Hangul_Syllable_Type`]: https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G45593
//! [`Joining_Group`]: https://www.unicode.org/versions/Unicode14.0.0/ch09.pdf#G36862
Expand Down
24 changes: 16 additions & 8 deletions src/tables.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1162,7 +1162,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([
],
[
0x50, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x15,
0x44, 0x01, 0x54, 0x55, 0x51, 0x55, 0x15, 0x55, 0x55, 0x05, 0x55, 0x55, 0x55, 0x55, 0x55,
0x44, 0x01, 0x54, 0x55, 0x41, 0x55, 0x15, 0x55, 0x55, 0x05, 0x55, 0x55, 0x55, 0x55, 0x55,
0x55, 0x55,
],
[
Expand Down Expand Up @@ -1532,7 +1532,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([
],
[
0x50, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x05, 0x00,
0x40, 0x55, 0x55, 0x01, 0x14, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
0x40, 0x05, 0x55, 0x01, 0x14, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
0x55, 0x55,
],
[
Expand Down Expand Up @@ -1587,7 +1587,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([
],
[
0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x54, 0x55, 0x15,
0x44, 0x15, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
0x04, 0x11, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
0x55, 0x55,
],
[
Expand All @@ -1596,12 +1596,12 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([
0x55, 0x55,
],
[
0x01, 0x00, 0x40, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x15, 0x00, 0x14,
0x01, 0x00, 0x40, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x15, 0x00, 0x04,
0x40, 0x55, 0x15, 0x55, 0x55, 0x01, 0x40, 0x01, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
0x55, 0x55,
],
[
0x55, 0x55, 0x05, 0x00, 0x00, 0x40, 0x50, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
0x55, 0x00, 0x00, 0x00, 0x00, 0x40, 0x50, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
0x55, 0x55,
],
Expand All @@ -1617,7 +1617,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([
],
[
0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x01, 0x40, 0x45,
0x10, 0x00, 0x10, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
0x10, 0x00, 0x00, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
0x55, 0x55,
],
[
Expand All @@ -1631,7 +1631,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([
0x55, 0x55,
],
[
0x50, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x05, 0x40,
0x40, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x05, 0x40,
0x55, 0x44, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
0x55, 0x55,
],
Expand Down Expand Up @@ -1994,7 +1994,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([
/// Sorted list of codepoint ranges (inclusive)
/// that are zero-width but not `Joining_Type=Transparent`
/// FIXME: can we get better compression?
static NON_TRANSPARENT_ZERO_WIDTHS: [([u8; 3], [u8; 3]); 45] = [
static NON_TRANSPARENT_ZERO_WIDTHS: [([u8; 3], [u8; 3]); 53] = [
([0x05, 0x06, 0x00], [0x05, 0x06, 0x00]),
([0x90, 0x08, 0x00], [0x91, 0x08, 0x00]),
([0xE2, 0x08, 0x00], [0xE2, 0x08, 0x00]),
Expand All @@ -2010,6 +2010,7 @@ static NON_TRANSPARENT_ZERO_WIDTHS: [([u8; 3], [u8; 3]); 45] = [
([0xCA, 0x0C, 0x00], [0xCB, 0x0C, 0x00]),
([0xD5, 0x0C, 0x00], [0xD6, 0x0C, 0x00]),
([0x3E, 0x0D, 0x00], [0x3E, 0x0D, 0x00]),
([0x4E, 0x0D, 0x00], [0x4E, 0x0D, 0x00]),
([0x57, 0x0D, 0x00], [0x57, 0x0D, 0x00]),
([0xCF, 0x0D, 0x00], [0xCF, 0x0D, 0x00]),
([0xDF, 0x0D, 0x00], [0xDF, 0x0D, 0x00]),
Expand All @@ -2028,12 +2029,19 @@ static NON_TRANSPARENT_ZERO_WIDTHS: [([u8; 3], [u8; 3]); 45] = [
([0xCB, 0xD7, 0x00], [0xFB, 0xD7, 0x00]),
([0x9E, 0xFF, 0x00], [0xA0, 0xFF, 0x00]),
([0xF0, 0xFF, 0x00], [0xF8, 0xFF, 0x00]),
([0xC2, 0x11, 0x01], [0xC3, 0x11, 0x01]),
([0x3E, 0x13, 0x01], [0x3E, 0x13, 0x01]),
([0x57, 0x13, 0x01], [0x57, 0x13, 0x01]),
([0xB0, 0x14, 0x01], [0xB0, 0x14, 0x01]),
([0xBD, 0x14, 0x01], [0xBD, 0x14, 0x01]),
([0xAF, 0x15, 0x01], [0xAF, 0x15, 0x01]),
([0x30, 0x19, 0x01], [0x30, 0x19, 0x01]),
([0x3F, 0x19, 0x01], [0x3F, 0x19, 0x01]),
([0x41, 0x19, 0x01], [0x41, 0x19, 0x01]),
([0x3A, 0x1A, 0x01], [0x3A, 0x1A, 0x01]),
([0x84, 0x1A, 0x01], [0x89, 0x1A, 0x01]),
([0x46, 0x1D, 0x01], [0x46, 0x1D, 0x01]),
([0x02, 0x1F, 0x01], [0x02, 0x1F, 0x01]),
([0x65, 0xD1, 0x01], [0x65, 0xD1, 0x01]),
([0x6E, 0xD1, 0x01], [0x72, 0xD1, 0x01]),
([0x00, 0x00, 0x0E], [0x00, 0x00, 0x0E]),
Expand Down
6 changes: 6 additions & 0 deletions tests/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,12 @@ fn test_prepended_concatenation_marks() {
}
}

/// Checks the widths of `Grapheme_Cluster_Break=Prepend` characters that are
/// not `Prepended_Concatenation_Mark`s.
// NOTE(review): the two numeric arguments are presumably the expected
// non-CJK and CJK widths; confirm against the `assert_width!` definition.
#[test]
fn test_gcb_prepend() {
// U+0D4E (MALAYALAM LETTER DOT REPH, a Prepend character) followed by the
// base letter U+0D09: the whole string is expected to measure 1.
assert_width!("ൎഉ", 1, 1);
// A lone Prepend character (U+11A89, a Soyombo cluster-initial letter)
// is expected to measure 0.
assert_width!("\u{11A89}", 0, 0);
}

#[test]
fn test_interlinear_annotation_chars() {
assert_width!('\u{FFF9}', Some(1), Some(1));
Expand Down

0 comments on commit 8e40640

Please sign in to comment.