[C++] Optimize unicode.cpp with binary search v2

Also split the table in two (16-bit entries for code points below 65536, 32-bit entries for the rest) to reduce memory size.
keyd-project · Dec 18, 2024 · 1496ffb · 1496ffb
1 parent 4d440ac
commit 1496ffb
Showing 1 changed file with 49 additions and 15 deletions.
diff --git a/scripts/generate_xcompose b/scripts/generate_xcompose
@@ -3,15 +3,18 @@
 import sys
 
 codes = []
+codes2 = []  # code points >= 65536; emitted below as the 32-bit unicode_table2
 for line in open('data/unicode.txt').readlines(): # Original source: https://www.unicode.org/Public/14.0.0/ucd/UnicodeData.txt
     try:
         code = int(line.split(';')[0], 16)
 
         # Ensure the character is encodable (surrogates are not)
         chr(code).encode('utf8')
 
-        if (code >= 128):
+        if (code >= 128 and code < 65536):  # below 65536: kept in codes, emitted as the 16-bit unicode_table1
             codes.append(code)
+        if (code >= 65536):  # everything else goes to the second list
+            codes2.append(code)
     except:
         pass
 
@@ -35,6 +38,10 @@ for n, code in enumerate(codes):
         data += '<Cancel> '
         data += ' '.join(f'<{c}>' for c in base36(n))
         data += f' : "{chr(code)}"\n'
+for n, code in enumerate(codes2):  # compose entries for the second table (code points >= 65536)
+        data += '<Cancel> '
+        data += ' '.join(f'<{c}>' for c in base36(n + (65536 - 128)))  # NOTE(review): offset here is 65536 - 128, while the generated unicode_lookup_index() offsets second-table hits by the length of unicode_table1 (ce - cb); confirm the two agree
+        data += f' : "{chr(code)}"\n'
 
 open('data/keyd.compose', 'w').write(data)
 
@@ -52,26 +59,53 @@ open('src/unicode.cpp', 'w').write(f'''
 	#include "keys.h"
 	#include <algorithm>
 	#include <vector>
+	#include <utility>
+	#include <numeric>
+
+	constexpr uint16_t unicode_table1[] = {{ {','.join(map(str, codes))} }};
+	constexpr uint32_t unicode_table2[] = {{ {','.join(map(str, codes2))} }};
+
+	constexpr auto cb = std::cbegin(unicode_table1);
+	constexpr auto ce = std::cend(unicode_table1);
 
-	constexpr uint32_t unicode_table[] = {{ {','.join(map(str, codes))} }};
+	// Compile-time binary search helper: halves [Begin, End) through generated branches until at most Max entries remain, then returns that sub-range as a (begin, end) pair
+	template <typename T, const T* const& Begin, const T* const& End, size_t Max>
+	static std::pair<const T*, const T*> lookup_range(uint32_t codepoint)
+	{{
+		if constexpr (size_t(End - Begin) <= Max) {{ // range is small enough: stop splitting
+			return std::make_pair(Begin, End);
+		}} else {{
+			static constexpr auto Mid = std::midpoint(Begin, End); // static so it can be passed on as a reference template argument
+			if (codepoint < 0u + *Mid) // 0u + makes the comparison unsigned even when T is narrower than int
+				return lookup_range<T, Begin, Mid, Max>(codepoint);
+			else
+				return lookup_range<T, Mid, End, Max>(codepoint);
+		}}
+	}}
 
 	int unicode_lookup_index(uint32_t codepoint)
 	{{
 		// Slight search optimizations: prevent CPU from fetching data from unlikely areas
-		constexpr auto cb = std::cbegin(unicode_table);
-		constexpr auto ce = std::cend(unicode_table);
-		constexpr auto ce2 = std::lower_bound(cb, ce, 0x1100); // before hangul
-		constexpr auto ce3 = std::lower_bound(cb, ce2, 0x531); // after cyrillic
-		constexpr auto ce4 = std::lower_bound(cb, ce3, 0x300); // after latin, modifiers
-
-		auto beg = cb;
+		static constexpr auto ce3 = std::lower_bound(cb, ce, 0x1100); // before hangul
+		static constexpr auto ce4 = std::lower_bound(cb, ce3, 0x300); // after latin, modifiers
+		auto beg = ce3;
 		auto end = ce;
-		if (codepoint < *ce4)
-			end = ce4;
-		else if (codepoint < *ce3)
-			beg = ce4, end = ce3;
-		else if (codepoint < *ce2)
-			beg = ce3, end = ce4;
+
+		if (codepoint < 0u + *ce4)
+			std::tie(beg, end) = lookup_range<uint16_t, cb, ce4, 32>(codepoint);
+		else if (codepoint < 0u + *ce3)
+			std::tie(beg, end) = lookup_range<uint16_t, ce4, ce3, 64>(codepoint);
+		else if (codepoint <= 0u + ce[-1])
+			std::tie(beg, end) = lookup_range<uint16_t, ce3, ce, 256>(codepoint);
+		else {{
+			static constexpr auto cb2 = std::cbegin(unicode_table2);
+			static constexpr auto ce2 = std::cend(unicode_table2);
+			auto [beg2, end2] = lookup_range<uint32_t, cb2, ce2, 256>(codepoint);
+			auto res = std::lower_bound(beg2, end2, codepoint);
+			if (res != ce2 && *res == codepoint)
+				return res - cb2 + (ce - cb);
+			return -1;
+		}}
 		auto res = std::lower_bound(beg, end, codepoint);
 		if (res != end && *res == codepoint)
 			return res - cb;