From 1496ffb8f67c54761d9630c7cd9a91cda9e83ca5 Mon Sep 17 00:00:00 2001 From: Ivan Chikish Date: Tue, 17 Dec 2024 20:44:02 +0300 Subject: [PATCH] [C++] Optimize unicode.cpp with binary search v2 Also split table to reduce memory size. --- scripts/generate_xcompose | 64 ++++++++++++++++++++++++++++++--------- 1 file changed, 49 insertions(+), 15 deletions(-) diff --git a/scripts/generate_xcompose b/scripts/generate_xcompose index 6084b3b..5b193b0 100755 --- a/scripts/generate_xcompose +++ b/scripts/generate_xcompose @@ -3,6 +3,7 @@ import sys codes = [] +codes2 = [] for line in open('data/unicode.txt').readlines(): # Original source: https://www.unicode.org/Public/14.0.0/ucd/UnicodeData.txt try: code = int(line.split(';')[0], 16) @@ -10,8 +11,10 @@ for line in open('data/unicode.txt').readlines(): # Original source: https://www # Ensure the character is encodable (surrogates are not) chr(code).encode('utf8') - if (code >= 128): + if (code >= 128 and code < 65536): codes.append(code) + if (code >= 65536): + codes2.append(code) except: pass @@ -35,6 +38,10 @@ for n, code in enumerate(codes): data += ' ' data += ' '.join(f'<{c}>' for c in base36(n)) data += f' : "{chr(code)}"\n' +for n, code in enumerate(codes2): + data += ' ' + data += ' '.join(f'<{c}>' for c in base36(n + (65536 - 128))) + data += f' : "{chr(code)}"\n' open('data/keyd.compose', 'w').write(data) @@ -52,26 +59,53 @@ open('src/unicode.cpp', 'w').write(f''' #include "keys.h" #include #include + #include + #include + + constexpr uint16_t unicode_table1[] = {{ {','.join(map(str, codes))} }}; + constexpr uint32_t unicode_table2[] = {{ {','.join(map(str, codes2))} }}; + + constexpr auto cb = std::cbegin(unicode_table1); + constexpr auto ce = std::cend(unicode_table1); - constexpr uint32_t unicode_table[] = {{ {','.join(map(str, codes))} }}; + // Compile time binary search assistant that will generate some branches to reduce search range + template + static std::pair lookup_range(uint32_t codepoint) + {{ + if constexpr (size_t(End - Begin) <= Max) {{ + return std::make_pair(Begin, End); + }} else {{ + static constexpr auto Mid = std::midpoint(Begin, End); + if (codepoint < 0u + *Mid) + return lookup_range(codepoint); + else + return lookup_range(codepoint); + }} + }} int unicode_lookup_index(uint32_t codepoint) {{ // Slight search optimizations: prevent CPU from fetching data from unlikely areas - constexpr auto cb = std::cbegin(unicode_table); - constexpr auto ce = std::cend(unicode_table); - constexpr auto ce2 = std::lower_bound(cb, ce, 0x1100); // before hangul - constexpr auto ce3 = std::lower_bound(cb, ce2, 0x531); // after cyrillic - constexpr auto ce4 = std::lower_bound(cb, ce3, 0x300); // after latin, modifiers - - auto beg = cb; + static constexpr auto ce3 = std::lower_bound(cb, ce, 0x1100); // before hangul + static constexpr auto ce4 = std::lower_bound(cb, ce3, 0x300); // after latin, modifiers + auto beg = ce3; auto end = ce; - if (codepoint < *ce4) - end = ce4; - else if (codepoint < *ce3) - beg = ce4, end = ce3; - else if (codepoint < *ce2) - beg = ce3, end = ce4; + + if (codepoint < 0u + *ce4) + std::tie(beg, end) = lookup_range(codepoint); + else if (codepoint < 0u + *ce3) + std::tie(beg, end) = lookup_range(codepoint); + else if (codepoint <= 0u + ce[-1]) + std::tie(beg, end) = lookup_range(codepoint); + else {{ + static constexpr auto cb2 = std::cbegin(unicode_table2); + static constexpr auto ce2 = std::cend(unicode_table2); + auto [beg2, end2] = lookup_range(codepoint); + auto res = std::lower_bound(beg2, end2, codepoint); + if (res != ce2 && *res == codepoint) + return res - cb2 + (ce - cb); + return -1; + }} auto res = std::lower_bound(beg, end, codepoint); if (res != end && *res == codepoint) return res - cb;