Skip to content

Commit

Permalink
[C++] Optimize unicode.cpp with binary search v2
Browse files Browse the repository at this point in the history
Also split table to reduce memory size.
  • Loading branch information
Nekotekina committed Dec 18, 2024
1 parent c07e52d commit f50af0e
Showing 1 changed file with 49 additions and 15 deletions.
64 changes: 49 additions & 15 deletions scripts/generate_xcompose
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,18 @@
import sys

codes = []
codes2 = []
for line in open('data/unicode.txt').readlines(): # Original source: https://www.unicode.org/Public/14.0.0/ucd/UnicodeData.txt
try:
code = int(line.split(';')[0], 16)

# Ensure the character is encodable (surrogates are not)
chr(code).encode('utf8')

if (code >= 128):
if (code >= 128 and code < 65536):
codes.append(code)
if (code >= 65536):
codes2.append(code)
except:
pass

Expand All @@ -35,6 +38,10 @@ for n, code in enumerate(codes):
data += '<Cancel> '
data += ' '.join(f'<{c}>' for c in base36(n))
data += f' : "{chr(code)}"\n'
for n, code in enumerate(codes2):
data += '<Cancel> '
data += ' '.join(f'<{c}>' for c in base36(n + (65536 - 128)))
data += f' : "{chr(code)}"\n'

open('data/keyd.compose', 'w').write(data)

Expand All @@ -52,26 +59,53 @@ open('src/unicode.cpp', 'w').write(f'''
#include "keys.h"
#include <algorithm>
#include <vector>
#include <utility>
#include <numeric>
constexpr uint16_t unicode_table1[] = {{ {','.join(map(str, codes))} }};
constexpr uint32_t unicode_table2[] = {{ {','.join(map(str, codes2))} }};
constexpr auto cb = std::cbegin(unicode_table1);
constexpr auto ce = std::cend(unicode_table1);
constexpr uint32_t unicode_table[] = {{ {','.join(map(str, codes))} }};
// Compile-time binary search helper: expands into a small tree of branches that
// narrows the search range, so the caller can run std::lower_bound on a short
// sub-range instead of the whole table.
//
// Template parameters:
//   T     - element type of the table being searched
//   Begin - reference to a constant pointer to the first element of the range
//   End   - reference to a constant pointer one past the last element of the range
//   Max   - splitting stops once the range holds at most this many elements
//
// Returns the [first, last) sub-range chosen for codepoint.
// NOTE(review): the halving only makes sense if the table is sorted in
// ascending order - confirm against the generator that fills the tables.
template <typename T, const T* const& Begin, const T* const& End, size_t Max>
static std::pair<const T*, const T*> lookup_range(uint32_t codepoint)
{{
// Range length is known at compile time, so the recursion is resolved by the compiler
if constexpr (size_t(End - Begin) <= Max) {{
// Small enough: hand the range back unchanged
return std::make_pair(Begin, End);
}} else {{
// Begin/End are reference template parameters, so the midpoint must be a named
// static object before it can be passed on as a template argument
static constexpr auto Mid = std::midpoint(Begin, End);
// 0u + forces the table element to unsigned before comparing with codepoint
if (codepoint < 0u + *Mid)
// Lower half: [Begin, Mid)
return lookup_range<T, Begin, Mid, Max>(codepoint);
else
// Upper half: [Mid, End)
return lookup_range<T, Mid, End, Max>(codepoint);
}}
}}
int unicode_lookup_index(uint32_t codepoint)
{{
// Slight search optimizations: prevent CPU from fetching data from unlikely areas
constexpr auto cb = std::cbegin(unicode_table);
constexpr auto ce = std::cend(unicode_table);
constexpr auto ce2 = std::lower_bound(cb, ce, 0x1100); // before hangul
constexpr auto ce3 = std::lower_bound(cb, ce2, 0x531); // after cyrillic
constexpr auto ce4 = std::lower_bound(cb, ce3, 0x300); // after latin, modifiers
auto beg = cb;
static constexpr auto ce3 = std::lower_bound(cb, ce, 0x1100); // before hangul
static constexpr auto ce4 = std::lower_bound(cb, ce3, 0x300); // after latin, modifiers
auto beg = ce3;
auto end = ce;
if (codepoint < *ce4)
end = ce4;
else if (codepoint < *ce3)
beg = ce4, end = ce3;
else if (codepoint < *ce2)
beg = ce3, end = ce4;
if (codepoint < 0u + *ce4)
std::tie(beg, end) = lookup_range<uint16_t, cb, ce4, 32>(codepoint);
else if (codepoint < 0u + *ce3)
std::tie(beg, end) = lookup_range<uint16_t, ce4, ce3, 64>(codepoint);
else if (codepoint <= 0u + ce[-1])
std::tie(beg, end) = lookup_range<uint16_t, ce3, ce, 256>(codepoint);
else {{
static constexpr auto cb2 = std::cbegin(unicode_table2);
static constexpr auto ce2 = std::cend(unicode_table2);
auto [beg2, end2] = lookup_range<uint32_t, cb2, ce2, 256>(codepoint);
auto res = std::lower_bound(beg2, end2, codepoint);
if (res != ce2 && *res == codepoint)
return res - cb2 + (ce - cb);
return -1;
}}
auto res = std::lower_bound(beg, end, codepoint);
if (res != end && *res == codepoint)
return res - cb;
Expand Down

0 comments on commit f50af0e

Please sign in to comment.