Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 19 additions & 1 deletion devel/1113.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,20 @@
- `src/Data/String/converter.hpp`
- `src/Data/String/converter.cpp`
- `tests/Data/String/converter_test.cpp`
- `lolly/lolly/data/herk.hpp`
- `lolly/lolly/data/herk.cpp`
- `lolly/tests/lolly/data/herk_test.cpp`
- `src/Scheme/L2/glue_lolly.lua`
- `src/Scheme/L3/glue_moebius.lua`

## 如何测试

### 确定性测试(单元测试)
```bash
xmake b converter_test
xmake r converter_test
xmake b lolly_tests
xmake test lolly_tests/herk_test
```

### 预期输出
Expand All @@ -31,6 +38,8 @@ Totals: 6 passed, 0 failed, 0 skipped, 0 blacklisted, 9ms
********* Finished testing of TestConverter *********
```

lolly 的 `herk_test` 应同样全部通过。

## 性能评估

使用临时基准测试(10000 次迭代,混合 ASCII 与 CJK/特殊字符输入)对比修改前后:
Expand All @@ -46,9 +55,11 @@ Totals: 6 passed, 0 failed, 0 skipped, 0 blacklisted, 9ms

提交前执行以下最少步骤:
```bash
gf fmt --changed-since=main
gf fmt
xmake b converter_test
xmake r converter_test
xmake b lolly_tests
xmake test lolly_tests/herk_test
```

## What
Expand All @@ -73,5 +84,12 @@ Herk 编码是 TMU 序列化的核心编码,其转换正确性直接影响文
2. 重写 `utf8_to_herk` 与 `herk_to_utf8`,仅使用静态表完成转换。
3. 从 `converter_rep::load()` 中删除 `Hex-Cork` 相关分支。
4. 保留 `tests/Data/String/converter_test.cpp` 中已有的完整单元测试作为回归验证。
5. 将 `utf8_to_herk` 与 `herk_to_utf8` 的实现迁移到 `lolly/lolly/data/herk.cpp`,
在 `lolly::data` 命名空间中提供相同签名的函数;`src/Data/String/converter.cpp`
保留为对 `lolly::data` 的薄封装,保持现有 C++ 调用点不变。
6. 将对应的 Scheme glue(`utf8->herk`、`herk->utf8`)从 `glue_moebius.lua`
迁移到 `glue_lolly.lua`,使其绑定到 `lolly::data::utf8_to_herk` 与
`lolly::data::herk_to_utf8`。
7. 在 `lolly/tests/lolly/data/herk_test.cpp` 中添加针对 lolly 实现的单元测试。

整理时间:2026/06/25
168 changes: 168 additions & 0 deletions lolly/lolly/data/herk.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@

/******************************************************************************
* MODULE : herk.cpp
* DESCRIPTION: Herk encoding conversions
* COPYRIGHT : (C) 2026 Darcy Shen
*******************************************************************************
* This software falls under the GNU general public license version 3 or later.
* It comes WITHOUT ANY WARRANTY WHATSOEVER. For details, see the file LICENSE
* in the root directory or <http://www.gnu.org/licenses/gpl-3.0.html>.
******************************************************************************/

#include "herk.hpp"
#include "numeral.hpp"
#include "unicode.hpp"

namespace lolly {
namespace data {

// Herk byte -> Unicode code point.
// Indexed by the unsigned value of a Herk byte (0..255); herk_to_utf8 passes
// the looked-up code point to append_utf8_code for every byte that is not part
// of a "<#...>" escape.
// Entries 0x20..0x7E hold the identical ASCII code point, except 0x60 which
// holds U+2018; the ASCII grave accent U+0060 lives at index 0 instead.
static const int herk_to_utf8_code[256]= {
0x0060, 0x00B4, 0x02C6, 0x02DC, 0x00A8, 0x02DD, 0x02DA, 0x02C7, 0x02D8,
0x00AF, 0x02D9, 0x00B8, 0x02DB, 0x201A, 0x2039, 0x203A, 0x201C, 0x201D,
0x201E, 0x00AB, 0x00BB, 0x2013, 0x2014, 0x200B, 0x2080, 0x0131, 0x0237,
0xFB00, 0xFB01, 0xFB02, 0xFB03, 0xFB04, 0x0020, 0x0021, 0x0022, 0x0023,
0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002A, 0x002B, 0x002C,
0x002D, 0x002E, 0x002F, 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035,
0x0036, 0x0037, 0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E,
0x003F, 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F, 0x0050,
0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059,
0x005A, 0x005B, 0x005C, 0x005D, 0x005E, 0x005F, 0x2018, 0x0061, 0x0062,
0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006A, 0x006B,
0x006C, 0x006D, 0x006E, 0x006F, 0x0070, 0x0071, 0x0072, 0x0073, 0x0074,
0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D,
0x007E, 0x00AD, 0x0102, 0x0104, 0x0106, 0x010C, 0x010E, 0x011A, 0x0118,
0x011E, 0x0139, 0x013D, 0x0141, 0x0143, 0x0147, 0x014A, 0x0150, 0x0154,
0x0158, 0x015A, 0x0160, 0x015E, 0x0164, 0x0162, 0x0170, 0x016E, 0x0178,
0x0179, 0x017D, 0x017B, 0x0132, 0x0130, 0x0111, 0x00A7, 0x0103, 0x0105,
0x0107, 0x010D, 0x010F, 0x011B, 0x0119, 0x011F, 0x013A, 0x013E, 0x0142,
0x0144, 0x0148, 0x014B, 0x0151, 0x0155, 0x0159, 0x015B, 0x0161, 0x015F,
0x0165, 0x0163, 0x0171, 0x016F, 0x00FF, 0x017A, 0x017E, 0x017C, 0x0133,
0x00A1, 0x00BF, 0x00A3, 0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5,
0x00C6, 0x00C7, 0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE,
0x00CF, 0x00D0, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x0152,
0x00D8, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x00DD, 0x00DE, 0x1E9E, 0x00E0,
0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7, 0x00E8, 0x00E9,
0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF, 0x00F0, 0x00F1, 0x00F2,
0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x0153, 0x00F8, 0x00F9, 0x00FA, 0x00FB,
0x00FC, 0x00FD, 0x00FE, 0x00DF,
};

// Unicode code point (0..0xFF) -> Herk byte, or -1 when this table has no
// entry for the code point.
// utf8_to_herk consults this table only for code points <= 0xFF and falls back
// to lookup_utf8_to_herk_high / "<#...>" escaping when it reads -1.
// Code points 0..31 and 127..160 have no entry; U+0060 maps to Herk byte 0.
static const int utf8_to_herk_byte[256]= {
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
90, 91, 92, 93, 94, 95, 0, 97, 98, 99, 100, 101, 102, 103, 104,
105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
120, 121, 122, 123, 124, 125, 126, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 189, -1, 191, -1,
-1, -1, 159, 4, -1, -1, 19, -1, 127, -1, 9, -1, -1, -1, -1,
1, -1, -1, -1, 11, -1, -1, 20, -1, -1, -1, 190, 192, 193, 194,
195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
210, 211, 212, 213, 214, -1, 216, 217, 218, 219, 220, 221, 222, 255, 224,
225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
240, 241, 242, 243, 244, 245, 246, -1, 248, 249, 250, 251, 252, 253, 254,
184,
};

// {Unicode code point, Herk byte} pairs for the code points above 0xFF that
// have a single-byte Herk representation.
// The rows MUST stay sorted by ascending code point: the lookup below is a
// binary search over the first column.
static const int utf8_to_herk_high[][2]= {
    {0x0102, 0x80}, {0x0103, 0xA0}, {0x0104, 0x81}, {0x0105, 0xA1},
    {0x0106, 0x82}, {0x0107, 0xA2}, {0x010C, 0x83}, {0x010D, 0xA3},
    {0x010E, 0x84}, {0x010F, 0xA4}, {0x0111, 0x9E}, {0x0118, 0x86},
    {0x0119, 0xA6}, {0x011A, 0x85}, {0x011B, 0xA5}, {0x011E, 0x87},
    {0x011F, 0xA7}, {0x0130, 0x9D}, {0x0131, 0x19}, {0x0132, 0x9C},
    {0x0133, 0xBC}, {0x0139, 0x88}, {0x013A, 0xA8}, {0x013D, 0x89},
    {0x013E, 0xA9}, {0x0141, 0x8A}, {0x0142, 0xAA}, {0x0143, 0x8B},
    {0x0144, 0xAB}, {0x0147, 0x8C}, {0x0148, 0xAC}, {0x014A, 0x8D},
    {0x014B, 0xAD}, {0x0150, 0x8E}, {0x0151, 0xAE}, {0x0152, 0xD7},
    {0x0153, 0xF7}, {0x0154, 0x8F}, {0x0155, 0xAF}, {0x0158, 0x90},
    {0x0159, 0xB0}, {0x015A, 0x91}, {0x015B, 0xB1}, {0x015E, 0x93},
    {0x015F, 0xB3}, {0x0160, 0x92}, {0x0161, 0xB2}, {0x0162, 0x95},
    {0x0163, 0xB5}, {0x0164, 0x94}, {0x0165, 0xB4}, {0x016E, 0x97},
    {0x016F, 0xB7}, {0x0170, 0x96}, {0x0171, 0xB6}, {0x0178, 0x98},
    {0x0179, 0x99}, {0x017A, 0xB9}, {0x017B, 0x9B}, {0x017C, 0xBB},
    {0x017D, 0x9A}, {0x017E, 0xBA}, {0x0237, 0x1A}, {0x02C6, 0x02},
    {0x02C7, 0x07}, {0x02D8, 0x08}, {0x02D9, 0x0A}, {0x02DA, 0x06},
    {0x02DB, 0x0C}, {0x02DC, 0x03}, {0x02DD, 0x05}, {0x1E9E, 0xDF},
    {0x200B, 0x17}, {0x2013, 0x15}, {0x2014, 0x16}, {0x2018, 0x60},
    {0x201A, 0x0D}, {0x201C, 0x10}, {0x201D, 0x11}, {0x201E, 0x12},
    {0x2039, 0x0E}, {0x203A, 0x0F}, {0x2080, 0x18}, {0xFB00, 0x1B},
    {0xFB01, 0x1C}, {0xFB02, 0x1D}, {0xFB03, 0x1E}, {0xFB04, 0x1F},
};

// Row count, derived from the table itself so that adding or removing a row
// can never leave the binary search with a stale bound.
static const int utf8_to_herk_high_count=
    (int) (sizeof (utf8_to_herk_high) / sizeof (utf8_to_herk_high[0]));

/**
 * @brief Find the Herk byte for a code point listed in utf8_to_herk_high.
 * @param code Unicode code point to look up.
 * @return The Herk byte (0..255) when the code point is in the table,
 *         -1 otherwise.
 *
 * Binary search over the sorted first column of utf8_to_herk_high.
 */
static int
lookup_utf8_to_herk_high (uint32_t code) {
  int lo= 0, hi= utf8_to_herk_high_count - 1;
  while (lo <= hi) {
    int mid= lo + ((hi - lo) >> 1);
    // Compare as unsigned: every key is non-negative, and narrowing an
    // arbitrary uint32_t to int would be implementation-defined.
    uint32_t key= (uint32_t) utf8_to_herk_high[mid][0];
    if (key == code) return utf8_to_herk_high[mid][1];
    if (key < code) lo= mid + 1;
    else hi= mid - 1;
  }
  return -1;
}

/**
 * @brief Append the UTF-8 encoding of a Unicode code point to a string.
 * @param r    String that receives the encoded bytes (appended in place).
 * @param code Unicode code point to encode.
 *
 * Code points in [0, 0x10FFFF] are written as 1 to 4 bytes. A value outside
 * that range (e.g. produced by a malformed "<#...>" escape) cannot be encoded
 * and is replaced by U+FFFD REPLACEMENT CHARACTER, so the output stays
 * well-formed UTF-8.
 */
static inline void
append_utf8_code (string& r, int code) {
  if (code < 0 || code > 0x10FFFF) code= 0xFFFD;
  if (code < 0x80) {
    r << (char) code;
  }
  else if (code < 0x800) {
    r << (char) (0xC0 | (code >> 6));
    r << (char) (0x80 | (code & 0x3F));
  }
  else if (code < 0x10000) {
    r << (char) (0xE0 | (code >> 12));
    r << (char) (0x80 | ((code >> 6) & 0x3F));
    r << (char) (0x80 | (code & 0x3F));
  }
  else {
    // Supplementary planes (U+10000..U+10FFFF) need the 4-byte form; without
    // it an escape such as <#1F600> would be emitted as an invalid sequence.
    r << (char) (0xF0 | (code >> 18));
    r << (char) (0x80 | ((code >> 12) & 0x3F));
    r << (char) (0x80 | ((code >> 6) & 0x3F));
    r << (char) (0x80 | (code & 0x3F));
  }
}

/**
 * Encode UTF-8 text as Herk.
 *
 * Each decoded code point is emitted as a single Herk byte when one of the
 * mapping tables (utf8_to_herk_byte, then utf8_to_herk_high) has an entry
 * for it. Otherwise code points below 32 or at/above 128 are written as a
 * "<#HEX>" escape (with a leading 0 added for values below 16), and the
 * remaining code points are copied through as a raw byte.
 */
string
utf8_to_herk (string_u8 input) {
  string    result;
  const int len= N (input);
  int       pos= 0;
  while (pos < len) {
    // The loop has no increment of its own: decode_from_utf8 is expected to
    // advance pos past the sequence it consumed.
    uint32_t code= decode_from_utf8 (input, pos);

    int herk_byte= -1;
    if (code <= 0xFF) herk_byte= utf8_to_herk_byte[code];
    if (herk_byte == -1) herk_byte= lookup_utf8_to_herk_high (code);

    if (herk_byte >= 0) {
      result << (char) herk_byte;
      continue;
    }

    // No table entry. Within 32..127 only U+007F lacks one, so that is the
    // only code point that actually reaches the pass-through case.
    const bool plain_ascii= code >= 32 && code < 128;
    if (plain_ascii) result << (char) code;
    else if (code < 16) result << "<#0" * to_Hex (code) * ">";
    else result << "<#" * to_Hex (code) * ">";
  }
  return result;
}

/**
 * Decode Herk text into UTF-8.
 *
 * A "<#" pair opens an escape: the characters up to (not including) the next
 * '>' - or up to the end of the input when no '>' follows - are handed to
 * from_hex and the resulting code point is encoded. Every other byte is
 * translated through herk_to_utf8_code and encoded.
 */
string_u8
herk_to_utf8 (string input) {
  const int len= N (input);
  string    utf8;
  int       pos= 0;
  while (pos < len) {
    const bool at_escape=
        input[pos] == '<' && pos + 1 < len && input[pos + 1] == '#';
    if (!at_escape) {
      append_utf8_code (utf8, herk_to_utf8_code[(unsigned char) input[pos]]);
      pos++;
      continue;
    }

    // Skip "<#", then collect everything before the closing '>'
    int digits_begin= pos + 2;
    int digits_end  = digits_begin;
    while (digits_end < len && input[digits_end] != '>')
      digits_end++;
    append_utf8_code (utf8, from_hex (input (digits_begin, digits_end)));

    // Resume after the '>' (or past the end for an unterminated escape)
    pos= digits_end + 1;
  }
  return utf8;
}

} // namespace data
} // namespace lolly
34 changes: 34 additions & 0 deletions lolly/lolly/data/herk.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@

/******************************************************************************
* MODULE : herk.hpp
* DESCRIPTION: Herk encoding conversions
* COPYRIGHT : (C) 2026 Darcy Shen
*******************************************************************************
* This software falls under the GNU general public license version 3 or later.
* It comes WITHOUT ANY WARRANTY WHATSOEVER. For details, see the file LICENSE
* in the root directory or <http://www.gnu.org/licenses/gpl-3.0.html>.
******************************************************************************/

#pragma once

#include "string.hpp"

namespace lolly {
namespace data {

/**
 * @brief Convert a Herk-encoded string to UTF-8.
 *
 * Each input byte is translated to a Unicode code point through a fixed
 * 256-entry table. A "<#...>" sequence is treated as an escape: the text
 * between "<#" and the next '>' is passed to from_hex (presumably a
 * hexadecimal parser) to obtain the code point.
 *
 * @param input The Herk-encoded string.
 * @return The UTF-8 string.
 */
string_u8 herk_to_utf8 (string input);

/**
 * @brief Convert a UTF-8 string to Herk encoding.
 *
 * Code points that have a Herk byte in the static mapping tables are written
 * as that single byte. Unmapped code points below 32 or at/above 128 are
 * written as a "<#...>" escape built with to_Hex; other unmapped code points
 * are copied through as a raw byte.
 *
 * @param input The UTF-8 string.
 * @return The Herk-encoded string.
 */
string utf8_to_herk (string_u8 input);

} // namespace data
} // namespace lolly
Loading
Loading