From 9b940f6d04cce957456eee0bab6bd0c8ea4c512d Mon Sep 17 00:00:00 2001 From: Jules Bertholet Date: Tue, 28 May 2024 09:58:44 -0400 Subject: [PATCH] Support emoji ZWJ sequences --- README.md | 15 +- scripts/unicode.py | 356 ++++++++++++++++++++++++++++++++++------------ src/lib.rs | 2 + src/tables.rs | 295 +++++++++++++++++++++++++++++++++----- tests/tests.rs | 138 +++++++++++++++++- 5 files changed, 658 insertions(+), 148 deletions(-) diff --git a/README.md b/README.md index 40b5947..d95c4c0 100644 --- a/README.md +++ b/README.md @@ -25,25 +25,26 @@ fn main() { ``` **NOTE:** The computed width values may not match the actual rendered column -width. For example, the woman scientist emoji comprises of a woman emoji, a -zero-width joiner and a microscope emoji. Such [emoji ZWJ sequences](https://www.unicode.org/reports/tr51/#Emoji_ZWJ_Sequences) -are considered to have the sum of the widths of their constituent parts: +width. For example, Brahmic scripts like Devanagari have complex rendering rules +which this crate does not currently handle (and will never fully handle, because +the exact rendering depends on the font): ```rust extern crate unicode_width; use unicode_width::UnicodeWidthStr; fn main() { - assert_eq!("👩".width(), 2); // Woman - assert_eq!("🔬".width(), 2); // Microscope - assert_eq!("👩‍🔬".width(), 4); // Woman scientist + assert_eq!("क".width(), 1); // Devanagari letter Ka + assert_eq!("ष".width(), 1); // Devanagari letter Ssa + assert_eq!("क्ष".width(), 2); // Ka + Virama + Ssa } ``` Additionally, [defective combining character sequences](https://unicode.org/glossary/#defective_combining_character_sequence) and nonstandard [Korean jamo](https://unicode.org/glossary/#jamo) sequences may be rendered with a different width than what this crate says. (This is not an -exhaustive list.) 
For a list of what this crate *does* handle, see +[docs.rs](https://docs.rs/unicode-width/latest/unicode_width/#rules-for-determining-width). ## crates.io diff --git a/scripts/unicode.py b/scripts/unicode.py index a5b478c..8d18546 100755 --- a/scripts/unicode.py +++ b/scripts/unicode.py @@ -152,121 +152,183 @@ class CharWidthInTable(enum.IntEnum): SPECIAL = 3 -class CharWidth(enum.IntEnum): - """The integer values of these variants have special meaning: +class WidthState(enum.IntEnum): + """ + Width calculation proceeds according to a state machine. + We iterate over the characters of the string from back to front; + the next character encountered determines the transition to take. + + The integer values of these variants have special meaning: - Top bit: whether this is Vs16 - 2nd from top: whether this is Vs15 - - 3rd from top: whether this is unaffected by ligature-transparent - - 4th bit: if 3rd is set but this one is not, then this is a ZWJ ligature state + - 3rd bit from top: whether this is transparent to emoji/text presentation + (if set, should also set 4th) + - 4th bit: whether to set top bit on emoji presentation. + If this is set but 3rd is not, the width mode is related to zwj sequences + - 5th from top: whether this is unaffected by ligature-transparent + - 6th bit: if 4th is set but this one is not, then this is a ZWJ ligature state where no ZWJ has been encountered yet; encountering one flips this on""" # BASIC WIDTHS - ZERO = 0x100 + ZERO = 0x1_0000 "Zero columns wide." - NARROW = 0x101 + NARROW = 0x1_0001 "One column wide." - WIDE = 0x102 + WIDE = 0x1_0002 "Two columns wide." 
- # TIGHTLY DEFINED SEQUENCES - # \r\n - LINE_FEED = 0b0000_0011 + LINE_FEED = 0b0000_0000_0000_0001 "\\n (CRLF has width 1)" + # EMOJI + # Emoji skintone modifiers - EMOJI_MODIFIER = 0b0000_0100 + EMOJI_MODIFIER = 0b0000_0000_0000_0010 "`Emoji_Modifier`" # Emoji ZWJ sequences - REGIONAL_INDICATOR = 0b0000_0101 - "`Regional_Indicator` (for ZWJ sequences)" + REGIONAL_INDICATOR = 0b0000_0000_0000_0011 + "`Regional_Indicator`" - EMOJI_PRESENTATION = 0b0000_0110 + SEVERAL_REGIONAL_INDICATOR = 0b0000_0000_0000_0100 + "At least two `Regional_Indicator` in sequence" + + EMOJI_PRESENTATION = 0b0000_0000_0000_0101 "`Emoji_Presentation`" + ZWJ_EMOJI_PRESENTATION = 0b0001_0000_0000_0110 + "\\u200D `Emoji_Presentation`" + + VS16_ZWJ_EMOJI_PRESENTATION = 0b1001_0000_0000_0110 + "\\uFE0F \\u200D `Emoji_Presentation`" + + KEYCAP_ZWJ_EMOJI_PRESENTATION = 0b0001_0000_0000_0111 + "\\u20E3 \\u200D `Emoji_Presentation`" + + VS16_KEYCAP_ZWJ_EMOJI_PRESENTATION = 0b1001_0000_0000_0111 + "\\uFE0F \\u20E3 \\u200D `Emoji_Presentation`" + + REGIONAL_INDICATOR_ZWJ_PRESENTATION = 0b0000_0000_0000_1001 + "`Regional_Indicator` \\u200D `Emoji_Presentation`" + + EVEN_REGIONAL_INDICATOR_ZWJ_PRESENTATION = 0b0000_0000_0000_1010 + "(`Regional_Indicator` `Regional_Indicator`)+ \\u200D `Emoji_Presentation`" + + ODD_REGIONAL_INDICATOR_ZWJ_PRESENTATION = 0b0000_0000_0000_1011 + "(`Regional_Indicator` `Regional_Indicator`)+ `Regional_Indicator` \\u200D `Emoji_Presentation`" + + TAG_END_ZWJ_EMOJI_PRESENTATION = 0b0000_0000_0001_0000 + "\\uE007F \\u200D `Emoji_Presentation`" + + TAG_D1_END_ZWJ_EMOJI_PRESENTATION = 0b0000_0000_0001_0001 + "\\uE0030..=\\uE0039 \\uE007F \\u200D `Emoji_Presentation`" + + TAG_D2_END_ZWJ_EMOJI_PRESENTATION = 0b0000_0000_0001_0010 + "(\\uE0030..=\\uE0039){2} \\uE007F \\u200D `Emoji_Presentation`" + + TAG_D3_END_ZWJ_EMOJI_PRESENTATION = 0b0000_0000_0001_0011 + "(\\uE0030..=\\uE0039){3} \\uE007F \\u200D `Emoji_Presentation`" + + TAG_A1_END_ZWJ_EMOJI_PRESENTATION = 
0b0000_0000_0001_1001 + "\\uE0061..=\\uE007A \\uE007F \\u200D `Emoji_Presentation`" + + TAG_A2_END_ZWJ_EMOJI_PRESENTATION = 0b0000_0000_0001_1010 + "(\\uE0061..=\\uE007A){2} \\uE007F \\u200D `Emoji_Presentation`" + + TAG_A3_END_ZWJ_EMOJI_PRESENTATION = 0b0000_0000_0001_1011 + "(\\uE0061..=\\uE007A){3} \\uE007F \\u200D `Emoji_Presentation`" + + TAG_A4_END_ZWJ_EMOJI_PRESENTATION = 0b0000_0000_0001_1100 + "(\\uE0061..=\\uE007A){4} \\uE007F \\u200D `Emoji_Presentation`" + + TAG_A5_END_ZWJ_EMOJI_PRESENTATION = 0b0000_0000_0001_1101 + "(\\uE0061..=\\uE007A){5} \\uE007F \\u200D `Emoji_Presentation`" + + TAG_A6_END_ZWJ_EMOJI_PRESENTATION = 0b0000_0000_0001_1110 + "(\\uE0061..=\\uE007A){6} \\uE007F \\u200D `Emoji_Presentation`" + + # VARIATION SELECTORS # Text presentation sequences (not CJK) - VARIATION_SELECTOR_15 = 0b0100_0000 + VARIATION_SELECTOR_15 = 0b0100_0000_0000_0000 "\\uFE0E (text presentation sequences)" # Emoji presentation sequences - VARIATION_SELECTOR_16 = 0b1000_0000 + VARIATION_SELECTOR_16 = 0b1000_0000_0000_0000 "\\uFE0F (emoji presentation sequences)" - # --- Width modes below this line can have 1 of their top 2 bits set to indicate presence of a VS --- - # ARABIC LAM ALEF - JOINING_GROUP_ALEF = 0b0000_1111 + JOINING_GROUP_ALEF = 0b0011_0000_1111_1111 "Joining_Group=Alef (Arabic Lam-Alef ligature)" # COMBINING SOLIDUS (CJK only) - COMBINING_LONG_SOLIDUS_OVERLAY = 0b0011_1111 + COMBINING_LONG_SOLIDUS_OVERLAY = 0b0011_1100_1111_1111 "\\u0338 (CJK only, makes <, =, > width 2)" # SOLIDUS + ALEF (solidus is Joining_Type=Transparent) - SOLIDUS_OVERLAY_ALEF = 0b0010_1111 + SOLIDUS_OVERLAY_ALEF = 0b0011_1000_1111_1111 "\\u0338 followed by Joining_Group=Alef" # SCRIPT ZWJ LIGATURES # Hebrew alef lamed - HEBREW_LETTER_LAMED = 0b0010_0000 + HEBREW_LETTER_LAMED = 0b0011_1000_0000_0000 "\\u05DC (Alef-ZWJ-Lamed ligature)" - ZWJ_HEBREW_LETTER_LAMED = 0b0011_0000 + ZWJ_HEBREW_LETTER_LAMED = 0b0011_1100_0000_0000 "\\u200D\\u05DC (Alef-ZWJ-Lamed ligature)" # Buginese ya 
- BUGINESE_LETTER_YA = 0b0010_0001 + BUGINESE_LETTER_YA = 0b0011_1000_0000_0001 "\\u1A10 ( + ya ligature)" - ZWJ_BUGINESE_LETTER_YA = 0b0011_0001 + ZWJ_BUGINESE_LETTER_YA = 0b0011_1100_0000_0001 "\\u200D\\u1A10 ( + ya ligature)" - BUGINESE_VOWEL_SIGN_I_ZWJ_LETTER_YA = 0b0011_0010 + BUGINESE_VOWEL_SIGN_I_ZWJ_LETTER_YA = 0b0011_1100_0000_0010 "\\u1A17\\u200D\\u1A10 ( + ya ligature)" # Tifinagh bi-consonants - TIFINAGH_CONSONANT = 0b0010_0011 + TIFINAGH_CONSONANT = 0b0011_1000_0000_0011 "\\u2D31..=\\u2D65 or \\u2D6F (joined by ZWJ or \\u2D7F TIFINAGH CONSONANT JOINER)" - ZWJ_TIFINAGH_CONSONANT = 0b0011_0011 + ZWJ_TIFINAGH_CONSONANT = 0b0011_1100_0000_0011 "ZWJ then \\u2D31..=\\u2D65 or \\u2D6F" - TIFINAGH_JOINER_CONSONANT = 0b0011_0100 + TIFINAGH_JOINER_CONSONANT = 0b0011_1100_0000_0100 "\\u2D7F then \\u2D31..=\\u2D65 or \\u2D6F" # Lisu tone letters - LISU_TONE_LETTER_MYA_NA_JEU = 0b0011_0101 + LISU_TONE_LETTER_MYA_NA_JEU = 0b0011_1100_0000_0101 "\\uA4FC or \\uA4FD (https://www.unicode.org/versions/Unicode15.0.0/ch18.pdf#G42078)" # Old Turkic orkhon ec - orkhon i - OLD_TURKIC_LETTER_ORKHON_I = 0b0010_0110 + OLD_TURKIC_LETTER_ORKHON_I = 0b0011_1000_0000_0110 "\\u10C03 (ORKHON EC-ZWJ-ORKHON I ligature)" - ZWJ_OLD_TURKIC_LETTER_ORKHON_I = 0b0011_0110 + ZWJ_OLD_TURKIC_LETTER_ORKHON_I = 0b0011_1100_0000_0110 "\\u10C03 (ORKHON EC-ZWJ-ORKHON I ligature)" def table_width(self) -> CharWidthInTable: "The width of a character as stored in the lookup tables." match self: - case CharWidth.ZERO: + case WidthState.ZERO: return CharWidthInTable.ZERO - case CharWidth.NARROW: + case WidthState.NARROW: return CharWidthInTable.ONE - case CharWidth.WIDE: + case WidthState.WIDE: return CharWidthInTable.TWO case _: return CharWidthInTable.SPECIAL @@ -275,21 +337,23 @@ def width_alone(self) -> int: "The width of a character with this type when it appears alone." 
match self: case ( - CharWidth.ZERO - | CharWidth.COMBINING_LONG_SOLIDUS_OVERLAY - | CharWidth.VARIATION_SELECTOR_15 - | CharWidth.VARIATION_SELECTOR_16 + WidthState.ZERO + | WidthState.COMBINING_LONG_SOLIDUS_OVERLAY + | WidthState.VARIATION_SELECTOR_15 + | WidthState.VARIATION_SELECTOR_16 ): return 0 case ( - CharWidth.WIDE | CharWidth.EMOJI_MODIFIER | CharWidth.EMOJI_PRESENTATION + WidthState.WIDE + | WidthState.EMOJI_MODIFIER + | WidthState.EMOJI_PRESENTATION ): return 2 case _: return 1 -assert len(set([v.value for v in CharWidth])) == len([v.value for v in CharWidth]) +assert len(set([v.value for v in WidthState])) == len([v.value for v in WidthState]) def load_east_asian_widths() -> list[EastAsianWidth]: @@ -454,7 +518,7 @@ def load_zero_widths() -> list[bool]: return zw_map -def load_width_maps() -> tuple[list[CharWidth], list[CharWidth]]: +def load_width_maps() -> tuple[list[WidthState], list[WidthState]]: """Load complete width table, including characters needing special handling. 
(Returns 2 tables, one for East Asian and one for not.)""" @@ -466,18 +530,18 @@ def load_width_maps() -> tuple[list[CharWidth], list[CharWidth]]: for eaw, zw in zip(eaws, zws): if zw: - not_ea.append(CharWidth.ZERO) - ea.append(CharWidth.ZERO) + not_ea.append(WidthState.ZERO) + ea.append(WidthState.ZERO) else: if eaw == EastAsianWidth.WIDE: - not_ea.append(CharWidth.WIDE) + not_ea.append(WidthState.WIDE) else: - not_ea.append(CharWidth.NARROW) + not_ea.append(WidthState.NARROW) if eaw == EastAsianWidth.NARROW: - ea.append(CharWidth.NARROW) + ea.append(WidthState.NARROW) else: - ea.append(CharWidth.WIDE) + ea.append(WidthState.WIDE) # Joining_Group=Alef (Arabic Lam-Alef ligature) alef_joining = [] @@ -512,29 +576,29 @@ def load_width_maps() -> tuple[list[CharWidth], list[CharWidth]]: ) for cps, width in [ - ([0x0A], CharWidth.LINE_FEED), - ([0x05DC], CharWidth.HEBREW_LETTER_LAMED), - (alef_joining, CharWidth.JOINING_GROUP_ALEF), - ([0x1A10], CharWidth.BUGINESE_LETTER_YA), - (range(0x2D31, 0x2D66), CharWidth.TIFINAGH_CONSONANT), - ([0x2D6F], CharWidth.TIFINAGH_CONSONANT), - ([0xA4FC], CharWidth.LISU_TONE_LETTER_MYA_NA_JEU), - ([0xA4FD], CharWidth.LISU_TONE_LETTER_MYA_NA_JEU), - ([0xFE0F], CharWidth.VARIATION_SELECTOR_16), - ([0x10C03], CharWidth.OLD_TURKIC_LETTER_ORKHON_I), - (regional_indicators, CharWidth.REGIONAL_INDICATOR), - (emoji_presentation, CharWidth.EMOJI_PRESENTATION), - (emoji_modifiers, CharWidth.EMOJI_MODIFIER), + ([0x0A], WidthState.LINE_FEED), + ([0x05DC], WidthState.HEBREW_LETTER_LAMED), + (alef_joining, WidthState.JOINING_GROUP_ALEF), + ([0x1A10], WidthState.BUGINESE_LETTER_YA), + (range(0x2D31, 0x2D66), WidthState.TIFINAGH_CONSONANT), + ([0x2D6F], WidthState.TIFINAGH_CONSONANT), + ([0xA4FC], WidthState.LISU_TONE_LETTER_MYA_NA_JEU), + ([0xA4FD], WidthState.LISU_TONE_LETTER_MYA_NA_JEU), + ([0xFE0F], WidthState.VARIATION_SELECTOR_16), + ([0x10C03], WidthState.OLD_TURKIC_LETTER_ORKHON_I), + (emoji_presentation, WidthState.EMOJI_PRESENTATION), + 
(emoji_modifiers, WidthState.EMOJI_MODIFIER), + (regional_indicators, WidthState.REGIONAL_INDICATOR), ]: for cp in cps: not_ea[cp] = width ea[cp] = width # East-Asian only - ea[0x0338] = CharWidth.COMBINING_LONG_SOLIDUS_OVERLAY + ea[0x0338] = WidthState.COMBINING_LONG_SOLIDUS_OVERLAY # Not East Asian only - not_ea[0xFE0E] = CharWidth.VARIATION_SELECTOR_15 + not_ea[0xFE0E] = WidthState.VARIATION_SELECTOR_15 return (not_ea, ea) @@ -552,7 +616,7 @@ def load_joining_group_lam() -> list[tuple[Codepoint, Codepoint]]: def load_non_transparent_zero_widths( - width_map: list[CharWidth], + width_map: list[WidthState], ) -> list[tuple[Codepoint, Codepoint]]: "Returns a list of characters with zero width but not 'Joining_Type=Transparent'" @@ -596,7 +660,7 @@ def load_ligature_transparent() -> list[tuple[Codepoint, Codepoint]]: def load_solidus_transparent( ligature_transparents: list[tuple[Codepoint, Codepoint]], - cjk_width_map: list[CharWidth], + cjk_width_map: list[WidthState], ) -> list[tuple[Codepoint, Codepoint]]: """Characters expanding to a canonical combining class above 1, plus `ligature_transparent`s from above. Ranges matching ones in `ligature_transparent` exactly are excluded (for compression), so it needs to bechecked also. 
@@ -641,13 +705,13 @@ def load_solidus_transparent( def make_special_ranges( - width_map: list[CharWidth], -) -> list[tuple[tuple[Codepoint, Codepoint], CharWidth]]: + width_map: list[WidthState], +) -> list[tuple[tuple[Codepoint, Codepoint], WidthState]]: "Assign ranges of characters to their special behavior (used in match)" ret = [] can_merge_with_prev = False for cp, width in enumerate(width_map): - if width == CharWidth.EMOJI_PRESENTATION: + if width == WidthState.EMOJI_PRESENTATION: can_merge_with_prev = False elif width.table_width() == CharWidthInTable.SPECIAL: if can_merge_with_prev and ret[-1][1] == width: @@ -824,8 +888,8 @@ def to_bytes(self) -> list[int]: def make_tables( - width_map: list[CharWidth], - cjk_width_map: list[CharWidth], + width_map: list[WidthState], + cjk_width_map: list[WidthState], ) -> list[Table]: """Creates a table for each configuration in `table_cfgs`, with the first config corresponding to the top-level lookup table, the second config corresponding to the second-level lookup @@ -1038,7 +1102,7 @@ def make_ranges_table( def lookup_fns( is_cjk: bool, - special_ranges: list[tuple[tuple[Codepoint, Codepoint], CharWidth]], + special_ranges: list[tuple[tuple[Codepoint, Codepoint], WidthState]], joining_group_lam: list[tuple[Codepoint, Codepoint]], ) -> str: if is_cjk: @@ -1125,7 +1189,12 @@ def lookup_fns( fn width_in_str{cjk_lo}(c: char, mut next_info: WidthInfo) -> (i8, WidthInfo) {{ if next_info.is_emoji_presentation() {{ if starts_emoji_presentation_seq(c) {{ - return (2, WidthInfo::DEFAULT); + let width = if next_info.is_zwj_emoji_presentation() {{ + 0 + }} else {{ + 2 + }}; + return (width, WidthInfo::EMOJI_PRESENTATION); }} else {{ next_info = next_info.unset_emoji_presentation(); }} @@ -1243,12 +1312,100 @@ def lookup_fns( }""" s += f""" - // Emoji modifier (WidthInfo::EMOJI_MODIFIER, _) if is_emoji_modifier_base(c) => {{ return (0, WidthInfo::EMOJI_PRESENTATION); }} + // Regional indicator + ( + WidthInfo::REGIONAL_INDICATOR 
| WidthInfo::SEVERAL_REGIONAL_INDICATOR, + '\\u{{1F1E6}}'..='\\u{{1F1FF}}', + ) => return (1, WidthInfo::SEVERAL_REGIONAL_INDICATOR), + + // ZWJ emoji + ( + WidthInfo::EMOJI_PRESENTATION + | WidthInfo::SEVERAL_REGIONAL_INDICATOR + | WidthInfo::EVEN_REGIONAL_INDICATOR_ZWJ_PRESENTATION + | WidthInfo::ODD_REGIONAL_INDICATOR_ZWJ_PRESENTATION + | WidthInfo::EMOJI_MODIFIER, + '\\u{{200D}}', + ) => return (0, WidthInfo::ZWJ_EMOJI_PRESENTATION), + (WidthInfo::ZWJ_EMOJI_PRESENTATION, '\\u{{20E3}}') => {{ + return (0, WidthInfo::KEYCAP_ZWJ_EMOJI_PRESENTATION); + }} + (WidthInfo::VS16_ZWJ_EMOJI_PRESENTATION, _) if starts_emoji_presentation_seq(c) => {{ + return (0, WidthInfo::EMOJI_PRESENTATION) + }} + (WidthInfo::VS16_KEYCAP_ZWJ_EMOJI_PRESENTATION, '0'..='9' | '#' | '*') => {{ + return (0, WidthInfo::EMOJI_PRESENTATION) + }} + (WidthInfo::ZWJ_EMOJI_PRESENTATION, '\\u{{1F1E6}}'..='\\u{{1F1FF}}') => {{ + return (1, WidthInfo::REGIONAL_INDICATOR_ZWJ_PRESENTATION); + }} + ( + WidthInfo::REGIONAL_INDICATOR_ZWJ_PRESENTATION + | WidthInfo::ODD_REGIONAL_INDICATOR_ZWJ_PRESENTATION, + '\\u{{1F1E6}}'..='\\u{{1F1FF}}', + ) => return (-1, WidthInfo::EVEN_REGIONAL_INDICATOR_ZWJ_PRESENTATION), + ( + WidthInfo::EVEN_REGIONAL_INDICATOR_ZWJ_PRESENTATION, + '\\u{{1F1E6}}'..='\\u{{1F1FF}}', + ) => return (3, WidthInfo::ODD_REGIONAL_INDICATOR_ZWJ_PRESENTATION), + (WidthInfo::ZWJ_EMOJI_PRESENTATION, '\\u{{1F3FB}}'..='\\u{{1F3FF}}') => {{ + return (0, WidthInfo::EMOJI_MODIFIER); + }} + (WidthInfo::ZWJ_EMOJI_PRESENTATION, '\\u{{E007F}}') => {{ + return (0, WidthInfo::TAG_END_ZWJ_EMOJI_PRESENTATION); + }} + (WidthInfo::TAG_END_ZWJ_EMOJI_PRESENTATION, '\\u{{E0061}}'..='\\u{{E007A}}') => {{ + return (0, WidthInfo::TAG_A1_END_ZWJ_EMOJI_PRESENTATION); + }} + (WidthInfo::TAG_A1_END_ZWJ_EMOJI_PRESENTATION, '\\u{{E0061}}'..='\\u{{E007A}}') => {{ + return (0, WidthInfo::TAG_A2_END_ZWJ_EMOJI_PRESENTATION) + }} + (WidthInfo::TAG_A2_END_ZWJ_EMOJI_PRESENTATION, '\\u{{E0061}}'..='\\u{{E007A}}') => {{ + return 
(0, WidthInfo::TAG_A3_END_ZWJ_EMOJI_PRESENTATION) + }} + (WidthInfo::TAG_A3_END_ZWJ_EMOJI_PRESENTATION, '\\u{{E0061}}'..='\\u{{E007A}}') => {{ + return (0, WidthInfo::TAG_A4_END_ZWJ_EMOJI_PRESENTATION) + }} + (WidthInfo::TAG_A4_END_ZWJ_EMOJI_PRESENTATION, '\\u{{E0061}}'..='\\u{{E007A}}') => {{ + return (0, WidthInfo::TAG_A5_END_ZWJ_EMOJI_PRESENTATION) + }} + (WidthInfo::TAG_A5_END_ZWJ_EMOJI_PRESENTATION, '\\u{{E0061}}'..='\\u{{E007A}}') => {{ + return (0, WidthInfo::TAG_A6_END_ZWJ_EMOJI_PRESENTATION) + }} + ( + WidthInfo::TAG_END_ZWJ_EMOJI_PRESENTATION + | WidthInfo::TAG_A1_END_ZWJ_EMOJI_PRESENTATION + | WidthInfo::TAG_A2_END_ZWJ_EMOJI_PRESENTATION + | WidthInfo::TAG_A3_END_ZWJ_EMOJI_PRESENTATION + | WidthInfo::TAG_A4_END_ZWJ_EMOJI_PRESENTATION, + '\\u{{E0030}}'..='\\u{{E0039}}', + ) => return (0, WidthInfo::TAG_D1_END_ZWJ_EMOJI_PRESENTATION), + (WidthInfo::TAG_D1_END_ZWJ_EMOJI_PRESENTATION, '\\u{{E0030}}'..='\\u{{E0039}}') => {{ + return (0, WidthInfo::TAG_D2_END_ZWJ_EMOJI_PRESENTATION); + }} + (WidthInfo::TAG_D2_END_ZWJ_EMOJI_PRESENTATION, '\\u{{E0030}}'..='\\u{{E0039}}') => {{ + return (0, WidthInfo::TAG_D3_END_ZWJ_EMOJI_PRESENTATION); + }} + ( + WidthInfo::TAG_A3_END_ZWJ_EMOJI_PRESENTATION + | WidthInfo::TAG_A4_END_ZWJ_EMOJI_PRESENTATION + | WidthInfo::TAG_A5_END_ZWJ_EMOJI_PRESENTATION + | WidthInfo::TAG_A6_END_ZWJ_EMOJI_PRESENTATION + | WidthInfo::TAG_D3_END_ZWJ_EMOJI_PRESENTATION, + '\\u{{1F3F4}}', + ) => return (0, WidthInfo::EMOJI_PRESENTATION), + (WidthInfo::ZWJ_EMOJI_PRESENTATION, _) + if lookup_width{cjk_lo}(c).1 == WidthInfo::EMOJI_PRESENTATION => + {{ + return (0, WidthInfo::EMOJI_PRESENTATION) + }} + + // Fallback _ => {{}} }} }} @@ -1279,8 +1436,8 @@ def emit_module( out_name: str, unicode_version: tuple[int, int, int], tables: list[Table], - special_ranges: list[tuple[tuple[Codepoint, Codepoint], CharWidth]], - special_ranges_cjk: list[tuple[tuple[Codepoint, Codepoint], CharWidth]], + special_ranges: list[tuple[tuple[Codepoint, Codepoint], 
WidthState]], + special_ranges_cjk: list[tuple[tuple[Codepoint, Codepoint], WidthState]], emoji_presentation_table: tuple[list[tuple[int, int]], list[list[int]]], text_presentation_table: tuple[list[tuple[int, int]], list[list[tuple[int, int]]]], emoji_modifier_table: tuple[list[tuple[int, int]], list[list[tuple[int, int]]]], @@ -1311,7 +1468,7 @@ def emit_module( use core::cmp::Ordering; #[derive(Clone, Copy, Debug, PartialEq, Eq)] -struct WidthInfo(u8); +struct WidthInfo(u16); impl WidthInfo { /// No special handling necessary @@ -1319,66 +1476,77 @@ def emit_module( """ ) - for variant in CharWidth: + for variant in WidthState: if variant.table_width() == CharWidthInTable.SPECIAL: if variant in [ - CharWidth.COMBINING_LONG_SOLIDUS_OVERLAY, - CharWidth.SOLIDUS_OVERLAY_ALEF, + WidthState.COMBINING_LONG_SOLIDUS_OVERLAY, + WidthState.SOLIDUS_OVERLAY_ALEF, ]: module.write(' #[cfg(feature = "cjk")]\n') module.write( - f" const {variant.name}: Self = Self(0b{variant.value:08b});\n" + f" const {variant.name}: Self = Self(0b{variant.value:016b});\n" ) module.write( f""" /// Whether this width mode is ligature_transparent - /// (has 3rd MSB set.) + /// (has 5th MSB set.) fn is_ligature_transparent(self) -> bool {{ - (self.0 & 0b0010_0000) == 0b0010_0000 + (self.0 & 0b0000_1000_0000_0000) == 0b0000_1000_0000_0000 }} - /// Sets 4th MSB. + /// Sets 6th MSB. 
fn set_zwj_bit(self) -> Self {{ - Self(self.0 | 0b001_0000) + Self(self.0 | 0b0000_0100_0000_0000) }} /// Has top bit set fn is_emoji_presentation(self) -> bool {{ - (self.0 & 0b1000_0000) == 0b1000_0000 + (self.0 & 0b1000_0000_0000_0000) == 0b1000_0000_0000_0000 + }} + + /// Has top bit set + fn is_zwj_emoji_presentation(self) -> bool {{ + (self.0 & 0b1011_0000_0000_0000) == 0b1001_0000_0000_0000 }} /// Set top bit fn set_emoji_presentation(self) -> Self {{ - if self.0 >= 0b0000_1111 {{ - Self(self.0 | 0b1000_0000) + if (self.0 & 0b0010_0000_0000_0000) == 0b0010_0000_0000_0000 + || (self.0 & 0b1001_0000_0000_0000) == 0b0001_0000_0000_0000 + {{ + Self(self.0 | 0b1000_0000_0000_0000) }} else {{ - Self(0b1000_0000) + Self::VARIATION_SELECTOR_16 }} }} /// Clear top bit fn unset_emoji_presentation(self) -> Self {{ - Self(self.0 & 0b0111_1111) + if (self.0 & 0b0010_0000_0000_0000) == 0b0010_0000_0000_0000 {{ + Self(self.0 & 0b0111_1111_1111_1111) + }} else {{ + Self::DEFAULT + }} }} /// Has 2nd bit set fn is_text_presentation(self) -> bool {{ - (self.0 & 0b0100_0000) == 0b0100_0000 + (self.0 & 0b0100_0000_0000_0000) == 0b0100_0000_0000_0000 }} /// Set 2nd bit fn set_text_presentation(self) -> Self {{ - if self.0 >= 0b0000_1111 {{ - Self(self.0 | 0b0100_0000) + if (self.0 & 0b0010_0000_0000_0000) == 0b0010_0000_0000_0000 {{ + Self(self.0 | 0b0100_0000_0000_0000) }} else {{ - Self(0b0100_0000) + Self(0b0100_0000_0000_0000) }} }} /// Clear 2nd bit fn unset_text_presentation(self) -> Self {{ - Self(self.0 & 0b1011_1111) + Self(self.0 & 0b1011_1111_1111_1111) }} }} diff --git a/src/lib.rs b/src/lib.rs index 98cb232..eecf49a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -56,6 +56,7 @@ //! 1. In the following cases, the width of a string differs from the sum of the widths of its constituent characters: //! - The sequence `"\r\n"` has width 1. //! - Emoji-specific ligatures: +//! - Well-formed, fully-qualified [emoji ZWJ sequences] have width 2. //! 
- [Emoji modifier sequences] have width 2. //! - [Emoji presentation sequences] have width 2. //! - Outside of an East Asian context, [text presentation sequences] have width 1 if their base character: @@ -139,6 +140,7 @@ //! //! [combining marks]: https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G30602 //! +//! [emoji ZWJ sequences]: https://www.unicode.org/reports/tr51/#def_emoji_sequence //! [Emoji modifier sequences]: https://www.unicode.org/reports/tr51/#def_emoji_modifier_sequence //! [Emoji presentation sequences]: https://unicode.org/reports/tr51/#def_emoji_presentation_sequence //! [text presentation sequences]: https://unicode.org/reports/tr51/#def_text_presentation_sequence diff --git a/src/tables.rs b/src/tables.rs index 78fbf6e..b36968d 100644 --- a/src/tables.rs +++ b/src/tables.rs @@ -13,81 +13,110 @@ use core::cmp::Ordering; #[derive(Clone, Copy, Debug, PartialEq, Eq)] -struct WidthInfo(u8); +struct WidthInfo(u16); impl WidthInfo { /// No special handling necessary const DEFAULT: Self = Self(0); - const LINE_FEED: Self = Self(0b00000011); - const EMOJI_MODIFIER: Self = Self(0b00000100); - const REGIONAL_INDICATOR: Self = Self(0b00000101); - const EMOJI_PRESENTATION: Self = Self(0b00000110); - const VARIATION_SELECTOR_15: Self = Self(0b01000000); - const VARIATION_SELECTOR_16: Self = Self(0b10000000); - const JOINING_GROUP_ALEF: Self = Self(0b00001111); + const LINE_FEED: Self = Self(0b0000000000000001); + const EMOJI_MODIFIER: Self = Self(0b0000000000000010); + const REGIONAL_INDICATOR: Self = Self(0b0000000000000011); + const SEVERAL_REGIONAL_INDICATOR: Self = Self(0b0000000000000100); + const EMOJI_PRESENTATION: Self = Self(0b0000000000000101); + const ZWJ_EMOJI_PRESENTATION: Self = Self(0b0001000000000110); + const VS16_ZWJ_EMOJI_PRESENTATION: Self = Self(0b1001000000000110); + const KEYCAP_ZWJ_EMOJI_PRESENTATION: Self = Self(0b0001000000000111); + const VS16_KEYCAP_ZWJ_EMOJI_PRESENTATION: Self = Self(0b1001000000000111); + const 
REGIONAL_INDICATOR_ZWJ_PRESENTATION: Self = Self(0b0000000000001001); + const EVEN_REGIONAL_INDICATOR_ZWJ_PRESENTATION: Self = Self(0b0000000000001010); + const ODD_REGIONAL_INDICATOR_ZWJ_PRESENTATION: Self = Self(0b0000000000001011); + const TAG_END_ZWJ_EMOJI_PRESENTATION: Self = Self(0b0000000000010000); + const TAG_D1_END_ZWJ_EMOJI_PRESENTATION: Self = Self(0b0000000000010001); + const TAG_D2_END_ZWJ_EMOJI_PRESENTATION: Self = Self(0b0000000000010010); + const TAG_D3_END_ZWJ_EMOJI_PRESENTATION: Self = Self(0b0000000000010011); + const TAG_A1_END_ZWJ_EMOJI_PRESENTATION: Self = Self(0b0000000000011001); + const TAG_A2_END_ZWJ_EMOJI_PRESENTATION: Self = Self(0b0000000000011010); + const TAG_A3_END_ZWJ_EMOJI_PRESENTATION: Self = Self(0b0000000000011011); + const TAG_A4_END_ZWJ_EMOJI_PRESENTATION: Self = Self(0b0000000000011100); + const TAG_A5_END_ZWJ_EMOJI_PRESENTATION: Self = Self(0b0000000000011101); + const TAG_A6_END_ZWJ_EMOJI_PRESENTATION: Self = Self(0b0000000000011110); + const VARIATION_SELECTOR_15: Self = Self(0b0100000000000000); + const VARIATION_SELECTOR_16: Self = Self(0b1000000000000000); + const JOINING_GROUP_ALEF: Self = Self(0b0011000011111111); #[cfg(feature = "cjk")] - const COMBINING_LONG_SOLIDUS_OVERLAY: Self = Self(0b00111111); + const COMBINING_LONG_SOLIDUS_OVERLAY: Self = Self(0b0011110011111111); #[cfg(feature = "cjk")] - const SOLIDUS_OVERLAY_ALEF: Self = Self(0b00101111); - const HEBREW_LETTER_LAMED: Self = Self(0b00100000); - const ZWJ_HEBREW_LETTER_LAMED: Self = Self(0b00110000); - const BUGINESE_LETTER_YA: Self = Self(0b00100001); - const ZWJ_BUGINESE_LETTER_YA: Self = Self(0b00110001); - const BUGINESE_VOWEL_SIGN_I_ZWJ_LETTER_YA: Self = Self(0b00110010); - const TIFINAGH_CONSONANT: Self = Self(0b00100011); - const ZWJ_TIFINAGH_CONSONANT: Self = Self(0b00110011); - const TIFINAGH_JOINER_CONSONANT: Self = Self(0b00110100); - const LISU_TONE_LETTER_MYA_NA_JEU: Self = Self(0b00110101); - const OLD_TURKIC_LETTER_ORKHON_I: Self = 
Self(0b00100110); - const ZWJ_OLD_TURKIC_LETTER_ORKHON_I: Self = Self(0b00110110); + const SOLIDUS_OVERLAY_ALEF: Self = Self(0b0011100011111111); + const HEBREW_LETTER_LAMED: Self = Self(0b0011100000000000); + const ZWJ_HEBREW_LETTER_LAMED: Self = Self(0b0011110000000000); + const BUGINESE_LETTER_YA: Self = Self(0b0011100000000001); + const ZWJ_BUGINESE_LETTER_YA: Self = Self(0b0011110000000001); + const BUGINESE_VOWEL_SIGN_I_ZWJ_LETTER_YA: Self = Self(0b0011110000000010); + const TIFINAGH_CONSONANT: Self = Self(0b0011100000000011); + const ZWJ_TIFINAGH_CONSONANT: Self = Self(0b0011110000000011); + const TIFINAGH_JOINER_CONSONANT: Self = Self(0b0011110000000100); + const LISU_TONE_LETTER_MYA_NA_JEU: Self = Self(0b0011110000000101); + const OLD_TURKIC_LETTER_ORKHON_I: Self = Self(0b0011100000000110); + const ZWJ_OLD_TURKIC_LETTER_ORKHON_I: Self = Self(0b0011110000000110); /// Whether this width mode is ligature_transparent - /// (has 3rd MSB set.) + /// (has 5th MSB set.) fn is_ligature_transparent(self) -> bool { - (self.0 & 0b0010_0000) == 0b0010_0000 + (self.0 & 0b0000_1000_0000_0000) == 0b0000_1000_0000_0000 } - /// Sets 4th MSB. + /// Sets 6th MSB. 
fn set_zwj_bit(self) -> Self { - Self(self.0 | 0b001_0000) + Self(self.0 | 0b0000_0100_0000_0000) } /// Has top bit set fn is_emoji_presentation(self) -> bool { - (self.0 & 0b1000_0000) == 0b1000_0000 + (self.0 & 0b1000_0000_0000_0000) == 0b1000_0000_0000_0000 + } + + /// Has top bit set + fn is_zwj_emoji_presentation(self) -> bool { + (self.0 & 0b1011_0000_0000_0000) == 0b1001_0000_0000_0000 } /// Set top bit fn set_emoji_presentation(self) -> Self { - if self.0 >= 0b0000_1111 { - Self(self.0 | 0b1000_0000) + if (self.0 & 0b0010_0000_0000_0000) == 0b0010_0000_0000_0000 + || (self.0 & 0b1001_0000_0000_0000) == 0b0001_0000_0000_0000 + { + Self(self.0 | 0b1000_0000_0000_0000) } else { - Self(0b1000_0000) + Self::VARIATION_SELECTOR_16 } } /// Clear top bit fn unset_emoji_presentation(self) -> Self { - Self(self.0 & 0b0111_1111) + if (self.0 & 0b0010_0000_0000_0000) == 0b0010_0000_0000_0000 { + Self(self.0 & 0b0111_1111_1111_1111) + } else { + Self::DEFAULT + } } /// Has 2nd bit set fn is_text_presentation(self) -> bool { - (self.0 & 0b0100_0000) == 0b0100_0000 + (self.0 & 0b0100_0000_0000_0000) == 0b0100_0000_0000_0000 } /// Set 2nd bit fn set_text_presentation(self) -> Self { - if self.0 >= 0b0000_1111 { - Self(self.0 | 0b0100_0000) + if (self.0 & 0b0010_0000_0000_0000) == 0b0010_0000_0000_0000 { + Self(self.0 | 0b0100_0000_0000_0000) } else { - Self(0b0100_0000) + Self(0b0100_0000_0000_0000) } } /// Clear 2nd bit fn unset_text_presentation(self) -> Self { - Self(self.0 & 0b1011_1111) + Self(self.0 & 0b1011_1111_1111_1111) } } @@ -136,6 +165,7 @@ fn lookup_width(c: char) -> (u8, WidthInfo) { '\u{FE0E}' => (0, WidthInfo::VARIATION_SELECTOR_15), '\u{FE0F}' => (0, WidthInfo::VARIATION_SELECTOR_16), '\u{10C03}' => (1, WidthInfo::OLD_TURKIC_LETTER_ORKHON_I), + '\u{1F1E6}'..='\u{1F1FF}' => (1, WidthInfo::REGIONAL_INDICATOR), '\u{1F3FB}'..='\u{1F3FF}' => (2, WidthInfo::EMOJI_MODIFIER), _ => (2, WidthInfo::EMOJI_PRESENTATION), } @@ -170,7 +200,12 @@ pub fn 
single_char_width(c: char) -> Option { fn width_in_str(c: char, mut next_info: WidthInfo) -> (i8, WidthInfo) { if next_info.is_emoji_presentation() { if starts_emoji_presentation_seq(c) { - return (2, WidthInfo::DEFAULT); + let width = if next_info.is_zwj_emoji_presentation() { + 0 + } else { + 2 + }; + return (width, WidthInfo::EMOJI_PRESENTATION); } else { next_info = next_info.unset_emoji_presentation(); } @@ -248,12 +283,100 @@ fn width_in_str(c: char, mut next_info: WidthInfo) -> (i8, WidthInfo) { (WidthInfo::ZWJ_OLD_TURKIC_LETTER_ORKHON_I, '\u{10C32}') => { return (0, WidthInfo::DEFAULT); } - // Emoji modifier (WidthInfo::EMOJI_MODIFIER, _) if is_emoji_modifier_base(c) => { return (0, WidthInfo::EMOJI_PRESENTATION); } + // Regional indicator + ( + WidthInfo::REGIONAL_INDICATOR | WidthInfo::SEVERAL_REGIONAL_INDICATOR, + '\u{1F1E6}'..='\u{1F1FF}', + ) => return (1, WidthInfo::SEVERAL_REGIONAL_INDICATOR), + + // ZWJ emoji + ( + WidthInfo::EMOJI_PRESENTATION + | WidthInfo::SEVERAL_REGIONAL_INDICATOR + | WidthInfo::EVEN_REGIONAL_INDICATOR_ZWJ_PRESENTATION + | WidthInfo::ODD_REGIONAL_INDICATOR_ZWJ_PRESENTATION + | WidthInfo::EMOJI_MODIFIER, + '\u{200D}', + ) => return (0, WidthInfo::ZWJ_EMOJI_PRESENTATION), + (WidthInfo::ZWJ_EMOJI_PRESENTATION, '\u{20E3}') => { + return (0, WidthInfo::KEYCAP_ZWJ_EMOJI_PRESENTATION); + } + (WidthInfo::VS16_ZWJ_EMOJI_PRESENTATION, _) if starts_emoji_presentation_seq(c) => { + return (0, WidthInfo::EMOJI_PRESENTATION) + } + (WidthInfo::VS16_KEYCAP_ZWJ_EMOJI_PRESENTATION, '0'..='9' | '#' | '*') => { + return (0, WidthInfo::EMOJI_PRESENTATION) + } + (WidthInfo::ZWJ_EMOJI_PRESENTATION, '\u{1F1E6}'..='\u{1F1FF}') => { + return (1, WidthInfo::REGIONAL_INDICATOR_ZWJ_PRESENTATION); + } + ( + WidthInfo::REGIONAL_INDICATOR_ZWJ_PRESENTATION + | WidthInfo::ODD_REGIONAL_INDICATOR_ZWJ_PRESENTATION, + '\u{1F1E6}'..='\u{1F1FF}', + ) => return (-1, WidthInfo::EVEN_REGIONAL_INDICATOR_ZWJ_PRESENTATION), + ( + 
WidthInfo::EVEN_REGIONAL_INDICATOR_ZWJ_PRESENTATION, + '\u{1F1E6}'..='\u{1F1FF}', + ) => return (3, WidthInfo::ODD_REGIONAL_INDICATOR_ZWJ_PRESENTATION), + (WidthInfo::ZWJ_EMOJI_PRESENTATION, '\u{1F3FB}'..='\u{1F3FF}') => { + return (0, WidthInfo::EMOJI_MODIFIER); + } + (WidthInfo::ZWJ_EMOJI_PRESENTATION, '\u{E007F}') => { + return (0, WidthInfo::TAG_END_ZWJ_EMOJI_PRESENTATION); + } + (WidthInfo::TAG_END_ZWJ_EMOJI_PRESENTATION, '\u{E0061}'..='\u{E007A}') => { + return (0, WidthInfo::TAG_A1_END_ZWJ_EMOJI_PRESENTATION); + } + (WidthInfo::TAG_A1_END_ZWJ_EMOJI_PRESENTATION, '\u{E0061}'..='\u{E007A}') => { + return (0, WidthInfo::TAG_A2_END_ZWJ_EMOJI_PRESENTATION) + } + (WidthInfo::TAG_A2_END_ZWJ_EMOJI_PRESENTATION, '\u{E0061}'..='\u{E007A}') => { + return (0, WidthInfo::TAG_A3_END_ZWJ_EMOJI_PRESENTATION) + } + (WidthInfo::TAG_A3_END_ZWJ_EMOJI_PRESENTATION, '\u{E0061}'..='\u{E007A}') => { + return (0, WidthInfo::TAG_A4_END_ZWJ_EMOJI_PRESENTATION) + } + (WidthInfo::TAG_A4_END_ZWJ_EMOJI_PRESENTATION, '\u{E0061}'..='\u{E007A}') => { + return (0, WidthInfo::TAG_A5_END_ZWJ_EMOJI_PRESENTATION) + } + (WidthInfo::TAG_A5_END_ZWJ_EMOJI_PRESENTATION, '\u{E0061}'..='\u{E007A}') => { + return (0, WidthInfo::TAG_A6_END_ZWJ_EMOJI_PRESENTATION) + } + ( + WidthInfo::TAG_END_ZWJ_EMOJI_PRESENTATION + | WidthInfo::TAG_A1_END_ZWJ_EMOJI_PRESENTATION + | WidthInfo::TAG_A2_END_ZWJ_EMOJI_PRESENTATION + | WidthInfo::TAG_A3_END_ZWJ_EMOJI_PRESENTATION + | WidthInfo::TAG_A4_END_ZWJ_EMOJI_PRESENTATION, + '\u{E0030}'..='\u{E0039}', + ) => return (0, WidthInfo::TAG_D1_END_ZWJ_EMOJI_PRESENTATION), + (WidthInfo::TAG_D1_END_ZWJ_EMOJI_PRESENTATION, '\u{E0030}'..='\u{E0039}') => { + return (0, WidthInfo::TAG_D2_END_ZWJ_EMOJI_PRESENTATION); + } + (WidthInfo::TAG_D2_END_ZWJ_EMOJI_PRESENTATION, '\u{E0030}'..='\u{E0039}') => { + return (0, WidthInfo::TAG_D3_END_ZWJ_EMOJI_PRESENTATION); + } + ( + WidthInfo::TAG_A3_END_ZWJ_EMOJI_PRESENTATION + | WidthInfo::TAG_A4_END_ZWJ_EMOJI_PRESENTATION + | 
WidthInfo::TAG_A5_END_ZWJ_EMOJI_PRESENTATION + | WidthInfo::TAG_A6_END_ZWJ_EMOJI_PRESENTATION + | WidthInfo::TAG_D3_END_ZWJ_EMOJI_PRESENTATION, + '\u{1F3F4}', + ) => return (0, WidthInfo::EMOJI_PRESENTATION), + (WidthInfo::ZWJ_EMOJI_PRESENTATION, _) + if lookup_width(c).1 == WidthInfo::EMOJI_PRESENTATION => + { + return (0, WidthInfo::EMOJI_PRESENTATION) + } + + // Fallback _ => {} } } @@ -318,6 +441,7 @@ fn lookup_width_cjk(c: char) -> (u8, WidthInfo) { '\u{A4FC}'..='\u{A4FD}' => (1, WidthInfo::LISU_TONE_LETTER_MYA_NA_JEU), '\u{FE0F}' => (0, WidthInfo::VARIATION_SELECTOR_16), '\u{10C03}' => (1, WidthInfo::OLD_TURKIC_LETTER_ORKHON_I), + '\u{1F1E6}'..='\u{1F1FF}' => (1, WidthInfo::REGIONAL_INDICATOR), '\u{1F3FB}'..='\u{1F3FF}' => (2, WidthInfo::EMOJI_MODIFIER), _ => (2, WidthInfo::EMOJI_PRESENTATION), } @@ -354,7 +478,12 @@ pub fn single_char_width_cjk(c: char) -> Option { fn width_in_str_cjk(c: char, mut next_info: WidthInfo) -> (i8, WidthInfo) { if next_info.is_emoji_presentation() { if starts_emoji_presentation_seq(c) { - return (2, WidthInfo::DEFAULT); + let width = if next_info.is_zwj_emoji_presentation() { + 0 + } else { + 2 + }; + return (width, WidthInfo::EMOJI_PRESENTATION); } else { next_info = next_info.unset_emoji_presentation(); } @@ -438,12 +567,100 @@ fn width_in_str_cjk(c: char, mut next_info: WidthInfo) -> (i8, WidthInfo) { (WidthInfo::JOINING_GROUP_ALEF, '\u{0338}') => { return (0, WidthInfo::SOLIDUS_OVERLAY_ALEF); } - // Emoji modifier (WidthInfo::EMOJI_MODIFIER, _) if is_emoji_modifier_base(c) => { return (0, WidthInfo::EMOJI_PRESENTATION); } + // Regional indicator + ( + WidthInfo::REGIONAL_INDICATOR | WidthInfo::SEVERAL_REGIONAL_INDICATOR, + '\u{1F1E6}'..='\u{1F1FF}', + ) => return (1, WidthInfo::SEVERAL_REGIONAL_INDICATOR), + + // ZWJ emoji + ( + WidthInfo::EMOJI_PRESENTATION + | WidthInfo::SEVERAL_REGIONAL_INDICATOR + | WidthInfo::EVEN_REGIONAL_INDICATOR_ZWJ_PRESENTATION + | WidthInfo::ODD_REGIONAL_INDICATOR_ZWJ_PRESENTATION + | 
WidthInfo::EMOJI_MODIFIER, + '\u{200D}', + ) => return (0, WidthInfo::ZWJ_EMOJI_PRESENTATION), + (WidthInfo::ZWJ_EMOJI_PRESENTATION, '\u{20E3}') => { + return (0, WidthInfo::KEYCAP_ZWJ_EMOJI_PRESENTATION); + } + (WidthInfo::VS16_ZWJ_EMOJI_PRESENTATION, _) if starts_emoji_presentation_seq(c) => { + return (0, WidthInfo::EMOJI_PRESENTATION) + } + (WidthInfo::VS16_KEYCAP_ZWJ_EMOJI_PRESENTATION, '0'..='9' | '#' | '*') => { + return (0, WidthInfo::EMOJI_PRESENTATION) + } + (WidthInfo::ZWJ_EMOJI_PRESENTATION, '\u{1F1E6}'..='\u{1F1FF}') => { + return (1, WidthInfo::REGIONAL_INDICATOR_ZWJ_PRESENTATION); + } + ( + WidthInfo::REGIONAL_INDICATOR_ZWJ_PRESENTATION + | WidthInfo::ODD_REGIONAL_INDICATOR_ZWJ_PRESENTATION, + '\u{1F1E6}'..='\u{1F1FF}', + ) => return (-1, WidthInfo::EVEN_REGIONAL_INDICATOR_ZWJ_PRESENTATION), + ( + WidthInfo::EVEN_REGIONAL_INDICATOR_ZWJ_PRESENTATION, + '\u{1F1E6}'..='\u{1F1FF}', + ) => return (3, WidthInfo::ODD_REGIONAL_INDICATOR_ZWJ_PRESENTATION), + (WidthInfo::ZWJ_EMOJI_PRESENTATION, '\u{1F3FB}'..='\u{1F3FF}') => { + return (0, WidthInfo::EMOJI_MODIFIER); + } + (WidthInfo::ZWJ_EMOJI_PRESENTATION, '\u{E007F}') => { + return (0, WidthInfo::TAG_END_ZWJ_EMOJI_PRESENTATION); + } + (WidthInfo::TAG_END_ZWJ_EMOJI_PRESENTATION, '\u{E0061}'..='\u{E007A}') => { + return (0, WidthInfo::TAG_A1_END_ZWJ_EMOJI_PRESENTATION); + } + (WidthInfo::TAG_A1_END_ZWJ_EMOJI_PRESENTATION, '\u{E0061}'..='\u{E007A}') => { + return (0, WidthInfo::TAG_A2_END_ZWJ_EMOJI_PRESENTATION) + } + (WidthInfo::TAG_A2_END_ZWJ_EMOJI_PRESENTATION, '\u{E0061}'..='\u{E007A}') => { + return (0, WidthInfo::TAG_A3_END_ZWJ_EMOJI_PRESENTATION) + } + (WidthInfo::TAG_A3_END_ZWJ_EMOJI_PRESENTATION, '\u{E0061}'..='\u{E007A}') => { + return (0, WidthInfo::TAG_A4_END_ZWJ_EMOJI_PRESENTATION) + } + (WidthInfo::TAG_A4_END_ZWJ_EMOJI_PRESENTATION, '\u{E0061}'..='\u{E007A}') => { + return (0, WidthInfo::TAG_A5_END_ZWJ_EMOJI_PRESENTATION) + } + (WidthInfo::TAG_A5_END_ZWJ_EMOJI_PRESENTATION, 
'\u{E0061}'..='\u{E007A}') => { + return (0, WidthInfo::TAG_A6_END_ZWJ_EMOJI_PRESENTATION) + } + ( + WidthInfo::TAG_END_ZWJ_EMOJI_PRESENTATION + | WidthInfo::TAG_A1_END_ZWJ_EMOJI_PRESENTATION + | WidthInfo::TAG_A2_END_ZWJ_EMOJI_PRESENTATION + | WidthInfo::TAG_A3_END_ZWJ_EMOJI_PRESENTATION + | WidthInfo::TAG_A4_END_ZWJ_EMOJI_PRESENTATION, + '\u{E0030}'..='\u{E0039}', + ) => return (0, WidthInfo::TAG_D1_END_ZWJ_EMOJI_PRESENTATION), + (WidthInfo::TAG_D1_END_ZWJ_EMOJI_PRESENTATION, '\u{E0030}'..='\u{E0039}') => { + return (0, WidthInfo::TAG_D2_END_ZWJ_EMOJI_PRESENTATION); + } + (WidthInfo::TAG_D2_END_ZWJ_EMOJI_PRESENTATION, '\u{E0030}'..='\u{E0039}') => { + return (0, WidthInfo::TAG_D3_END_ZWJ_EMOJI_PRESENTATION); + } + ( + WidthInfo::TAG_A3_END_ZWJ_EMOJI_PRESENTATION + | WidthInfo::TAG_A4_END_ZWJ_EMOJI_PRESENTATION + | WidthInfo::TAG_A5_END_ZWJ_EMOJI_PRESENTATION + | WidthInfo::TAG_A6_END_ZWJ_EMOJI_PRESENTATION + | WidthInfo::TAG_D3_END_ZWJ_EMOJI_PRESENTATION, + '\u{1F3F4}', + ) => return (0, WidthInfo::EMOJI_PRESENTATION), + (WidthInfo::ZWJ_EMOJI_PRESENTATION, _) + if lookup_width_cjk(c).1 == WidthInfo::EMOJI_PRESENTATION => + { + return (0, WidthInfo::EMOJI_PRESENTATION) + } + + // Fallback _ => {} } } diff --git a/tests/tests.rs b/tests/tests.rs index f198e85..80f19a5 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -43,10 +43,17 @@ fn test_str() { #[test] fn test_emoji() { - // Example from the README. 
assert_width!("๐Ÿ‘ฉ", 2, 2); // Woman assert_width!("๐Ÿ”ฌ", 2, 2); // Microscope - assert_width!("๐Ÿ‘ฉโ€๐Ÿ”ฌ", 4, 4); // Woman scientist + assert_width!("๐Ÿ‘ฉโ€๐Ÿ”ฌ", 2, 2); // Woman scientist +} + +// From README +#[test] +fn test_bad_devanagari() { + assert_eq!("เค•".width(), 1); // Devanagari letter Ka + assert_eq!("เคท".width(), 1); // Devanagari letter Ssa + assert_eq!("เค•เฅเคท".width(), 2); // Ka + Virama + Ssa } #[test] @@ -288,13 +295,13 @@ fn test_hebrew_alef_lamed() { assert_width!( "\u{05D0}\u{200D}\u{200D}\u{200D}\u{200D}\u{200D}\u{200D}\u{200D}\u{05DC}", 1, - 1 + 1, ); assert_width!("\u{05D0}\u{05D0}\u{200D}\u{05DC}", 2, 2); assert_width!( "\u{05D0}\u{05D0}\u{200D}\u{200D}\u{200D}\u{200D}\u{200D}\u{200D}\u{05DC}", 2, - 2 + 2, ); assert_width!("\u{05D0}\u{FE0F}\u{200D}\u{FE0F}\u{05DC}\u{FE0F}", 1, 1); assert_width!("\u{05D0}\u{FE0E}\u{200D}\u{FE0E}\u{05DC}\u{FE0E}", 1, 1); @@ -348,7 +355,7 @@ fn test_buginese_a_i_ya() { assert_width!( "\u{1A15}\u{1A17}\u{200D}\u{200D}\u{200D}\u{200D}\u{1A10}", 1, - 1 + 1, ); assert_width!("\u{1A15}\u{1A17}\u{200D}\u{338}", 1, 1); assert_width!("\u{1A15}\u{FE0E}\u{1A17}\u{200D}", 1, 1); @@ -360,14 +367,14 @@ fn test_buginese_a_i_ya() { assert_width!( "\u{1A15}\u{17B5}\u{200D}\u{FE0E}\u{1A17}\u{200D}\u{FE0F}\u{200D}\u{FE0F}", 1, - 1 + 1, ); assert_width!("\u{1A15}\u{1A15}\u{1A17}\u{200D}\u{1A10}", 2, 2); assert_width!( "\u{1A15}\u{1A15}\u{1A17}\u{200D}\u{200D}\u{200D}\u{200D}\u{1A10}", 2, - 2 + 2, ); assert_width!("\u{1A15}\u{1A17}\u{1A10}", 2, 2); @@ -389,7 +396,7 @@ fn test_tifinagh_biconsonants() { assert_width!( "\u{2D4F}\u{FE0F}\u{200D}\u{2D7F}\u{FE0E}\u{200D}\u{17B5}\u{2D3E}", 1, - 1 + 1, ); assert_width!("\u{2D4F}\u{301}\u{2D7F}\u{2D3E}", 3, 3); @@ -423,3 +430,118 @@ fn test_emoji_modifier() { assert_width!("\u{1F46A}\u{1F3FB}", 2, 2); assert_width!("\u{1F46A}\u{200D}\u{200D}\u{1F3FB}", 4, 4); } + +#[test] +fn test_emoji_zwj() { + assert_width!("๐Ÿง‘โ€๐Ÿคโ€๐Ÿง‘", 2, 2); + + 
assert_width!("๐Ÿ‡ฎ๐Ÿ‡ฑ๐Ÿ•Š๏ธ๐Ÿ‡ต๐Ÿ‡ธ", 6, 6); + assert_width!("๐Ÿ‡ต๐Ÿ‡ธ\u{200D}๐Ÿ•Š๏ธ\u{200D}๐Ÿ‡ฎ๐Ÿ‡ฑ", 2, 2); + assert_width!("๐Ÿ‡ฎ๐Ÿ‡ฑ\u{200D}๐Ÿ•Š๏ธ\u{200D}\u{200D}๐Ÿ‡ต๐Ÿ‡ธ", 4, 4); + assert_width!("๐Ÿ‡ต๐Ÿ‡ธ\u{200D}\u{200D}๐Ÿ•Š๏ธ\u{200D}๐Ÿ‡ฎ๐Ÿ‡ฑ", 4, 4); + + assert_width!("๐Ÿ‡ฆ๐Ÿ‡ฆ\u{200D}๐Ÿ‡ฆ๐Ÿ‡ฆ", 2, 2); + assert_width!("๐Ÿ‡ฆ๐Ÿ‡ฆ\u{200D}๐Ÿ‡ฆ๐Ÿ‡ฆ๐Ÿ‡ฆ", 3, 3); + assert_width!("๐Ÿ‡ฆ๐Ÿ‡ฆ\u{200D}๐Ÿ‡ฆ๐Ÿ‡ฆ๐Ÿ‡ฆ", 3, 3); + + assert_width!("๐Ÿ‡ฆ๐Ÿ‡ฆ\u{200D}\u{200D}๐Ÿ‡ฆ๐Ÿ‡ฆ", 4, 4); + assert_width!("๐Ÿ‡ฆ๐Ÿ‡ฆ\u{200D}๐Ÿ‡ฆ\u{200D}๐Ÿ‡ฆ๐Ÿ‡ฆ", 5, 5); + assert_width!("๐Ÿ‡ฆ๐Ÿ‡ฆ\u{200D}๐Ÿ‡ฆ๐Ÿ‡ฆ\u{200D}๐Ÿ‡ฆ๐Ÿ‡ฆ", 2, 2); + assert_width!("๐Ÿ‡ฆ๐Ÿ‡ฆ\u{200D}๐Ÿ‡ฆ๐Ÿ‡ฆ๐Ÿ‡ฆ\u{200D}๐Ÿ‡ฆ๐Ÿ‡ฆ", 5, 5); + assert_width!("๐Ÿ‡ฆ๐Ÿ‡ฆ\u{200D}๐Ÿ‡ฆ๐Ÿ‡ฆ๐Ÿ‡ฆ๐Ÿ‡ฆ\u{200D}๐Ÿ‡ฆ๐Ÿ‡ฆ", 4, 4); + assert_width!("๐Ÿ‡ฆ๐Ÿ‡ฆ\u{200D}๐Ÿ‡ฆ๐Ÿ‡ฆ๐Ÿ‡ฆ๐Ÿ‡ฆ๐Ÿ‡ฆ\u{200D}๐Ÿ‡ฆ๐Ÿ‡ฆ", 7, 7); + assert_width!("๐Ÿ‡ฆ๐Ÿ‡ฆ\u{200D}๐Ÿ‡ฆ๐Ÿ‡ฆ๐Ÿ‡ฆ๐Ÿ‡ฆ๐Ÿ‡ฆ๐Ÿ‡ฆ\u{200D}๐Ÿ‡ฆ๐Ÿ‡ฆ", 6, 6); + assert_width!("๐Ÿ‡ฆ๐Ÿ‡ฆ\u{200D}๐Ÿ‡ฆ๐Ÿ‡ฆ๐Ÿ‡ฆ๐Ÿ‡ฆ๐Ÿ‡ฆ๐Ÿ‡ฆ๐Ÿ‡ฆ\u{200D}๐Ÿ‡ฆ๐Ÿ‡ฆ", 9, 9); + + assert_width!("๐Ÿด๓ ง๓ ข๓ ท๓ ฌ๓ ณ๓ ฟ", 2, 2); + assert_width!("๐Ÿด๓ ง๓ ข๓ ฅ๓ ฎ๓ ง๓ ฟ\u{200D}๐Ÿด๓ ง๓ ข๓ ณ๓ ฃ๓ ด๓ ฟ\u{200D}๐Ÿด๓ ง๓ ข๓ ท๓ ฌ๓ ณ๓ ฟ", 2, 2); + + assert_width!("๐Ÿ‡ฆ๐Ÿ‘ช\u{200D}๐Ÿฟ", 3, 3); + assert_width!("๐Ÿ‡ฆ๐Ÿฟ\u{200D}๐Ÿฟ", 3, 3); + + assert_width!('๐Ÿด', Some(2), Some(2)); + assert_width!("\u{E0031}", 0, 0); + assert_width!("\u{E0063}", 0, 0); + assert_width!("\u{E007F}", 0, 0); + assert_width!("๐Ÿด\u{200D}โ“‚๏ธ", 2, 2); + assert_width!("๐Ÿด\u{E0031}\u{200D}โ“‚๏ธ", 4, 4); + assert_width!("๐Ÿด\u{E0063}\u{200D}โ“‚๏ธ", 4, 4); + assert_width!("๐Ÿด\u{E007F}\u{200D}โ“‚๏ธ", 4, 4); + assert_width!("๐Ÿด\u{E0031}\u{E007F}\u{200D}โ“‚๏ธ", 4, 4); + assert_width!("๐Ÿด\u{E0031}\u{E0031}\u{E007F}\u{200D}โ“‚๏ธ", 4, 4); + assert_width!("๐Ÿด\u{E0031}\u{E0031}\u{E0031}\u{E007F}\u{200D}โ“‚๏ธ", 2, 2); + assert_width!( + "๐Ÿด\u{E0031}\u{E0031}\u{E0031}\u{E0031}\u{E007F}\u{200D}โ“‚๏ธ", + 4, + 4, + ); + assert_width!( + 
"๐Ÿด\u{E0031}\u{E0031}\u{E0031}\u{E0063}\u{E007F}\u{200D}โ“‚๏ธ", + 2, + 2, + ); + assert_width!( + "๐Ÿด\u{E0031}\u{E0031}\u{E0031}\u{E0063}\u{E0063}\u{E007F}\u{200D}โ“‚๏ธ", + 2, + 2, + ); + assert_width!( + "๐Ÿด\u{E0031}\u{E0031}\u{E0031}\u{E0063}\u{E0063}\u{E0063}\u{E007F}\u{200D}โ“‚๏ธ", + 2, + 2, + ); + assert_width!( + "๐Ÿด\u{E0031}\u{E0031}\u{E0031}\u{E0063}\u{E0063}\u{E0063}\u{E0063}\u{E007F}\u{200D}โ“‚๏ธ", + 2, + 2, + ); + assert_width!( + "๐Ÿด\u{E0031}\u{E0031}\u{E0031}\u{E0063}\u{E0063}\u{E0063}\u{E0063}\u{E0063}\u{E007F}\u{200D}โ“‚๏ธ", + 4, + 4, + ); + assert_width!("๐Ÿด\u{E0063}\u{E0063}\u{E007F}\u{200D}โ“‚๏ธ", 4, 4); + assert_width!("๐Ÿด\u{E0063}\u{E0063}\u{E0063}\u{E007F}\u{200D}โ“‚๏ธ", 2, 2); + assert_width!( + "๐Ÿด\u{E0063}\u{E0063}\u{E0063}\u{E0063}\u{E007F}\u{200D}โ“‚๏ธ", + 2, + 2, + ); + assert_width!( + "๐Ÿด\u{E0063}\u{E0063}\u{E0063}\u{E0063}\u{E0063}\u{E007F}\u{200D}โ“‚๏ธ", + 2, + 2, + ); + assert_width!( + "๐Ÿด\u{E0063}\u{E0063}\u{E0063}\u{E0063}\u{E0063}\u{E0063}\u{E007F}\u{200D}โ“‚๏ธ", + 2, + 2, + ); + assert_width!( + "๐Ÿด\u{E0063}\u{E0063}\u{E0063}\u{E0063}\u{E0063}\u{E0063}\u{E0063}\u{E007F}\u{200D}โ“‚๏ธ", + 4, + 4, + ); + + assert_width!("a\u{200D}๐Ÿด๓ ง๓ ข๓ ท๓ ฌ๓ ณ๓ ฟ", 3, 3); + assert_width!("๐Ÿ‘ช\u{200D}a", 3, 3); + assert_width!("a\u{200D}a", 2, 2); + + assert_width!("*\u{FE0F}", 2, 2); + assert_width!("*\u{20E3}", 1, 1); + assert_width!("*๏ธโƒฃ", 2, 2); + assert_width!("*\u{FE0F}", 2, 2); + assert_width!("*\u{20E3}\u{FE0F}", 1, 1); + assert_width!("*๏ธโƒฃ\u{200D}๐Ÿ‘ช", 2, 2); + assert_width!("*\u{20E3}\u{FE0F}\u{200D}๐Ÿ‘ช", 3, 3); + assert_width!("*\u{20E3}\u{200D}๐Ÿ‘ช", 3, 3); + assert_width!("*\u{FE0F}\u{200D}๐Ÿ‘ช", 2, 2); + assert_width!("*๏ธโƒฃ\u{20E3}\u{200D}๐Ÿ‘ช", 4, 4); + assert_width!("*\u{FE0F}\u{FE0F}\u{20E3}\u{200D}๐Ÿ‘ช", 4, 4); + + assert_width!( + "๐Ÿ‡ฆ๐Ÿ‘ช\u{200D}๐Ÿฟ\u{200D}๐Ÿ‘ช๐Ÿป\u{200D}โ“‚๏ธ\u{200D}*\u{FE0F}\u{200D}๐Ÿ‡ฆ๐Ÿ‡ฆ\u{200D}๐Ÿด๓ ง๓ ข๓ ท๓ ฌ๓ ณ๓ ฟ\u{200D}๐Ÿ‘ช", + 3, + 3, 
+ ); +}