From e6ef15f8e1139142e7df1d80e96fe9e8c18d2ffd Mon Sep 17 00:00:00 2001 From: nopdan Date: Thu, 21 Mar 2024 21:29:08 +0800 Subject: [PATCH] feat: support BaiduPinyin backup binary format Co-authored-by: stevenlele <15964380+stevenlele@users.noreply.github.com> --- src/ImeWlConverterCore/ConstantString.cs | 4 +- .../IME/BaiduPinyinBackup.cs | 169 ++++++++++++++++++ 2 files changed, 172 insertions(+), 1 deletion(-) create mode 100644 src/ImeWlConverterCore/IME/BaiduPinyinBackup.cs diff --git a/src/ImeWlConverterCore/ConstantString.cs b/src/ImeWlConverterCore/ConstantString.cs index 8e2fda55..5a287beb 100644 --- a/src/ImeWlConverterCore/ConstantString.cs +++ b/src/ImeWlConverterCore/ConstantString.cs @@ -29,6 +29,7 @@ public class ConstantString public const string BAIDU_BDICT = "百度分类词库bdict"; public const string BAIDU_BCD = "百度手机词库bcd"; public const string BAIDU_PINYIN = "百度拼音"; + public const string BAIDU_PINYIN_BACKUP = "百度拼音备份词库bin"; public const string QQ_PINYIN_ENG = "QQ拼音英文"; public const string QQ_SHOUJI = "QQ手机"; public const string QQ_WUBI = "QQ五笔"; @@ -81,6 +82,8 @@ public class ConstantString public const string BAIDU_SHOUJI_ENG_C = "bdsje"; public const string BAIDU_BDICT_C = "bdict"; public const string BAIDU_BCD_C = "bcd"; + public const string BAIDU_PINYIN_C = "bdpy"; + public const string BAIDU_PINYIN_BACKUP_C = "bdpybin"; public const string QQ_SHOUJI_C = "qqsj"; public const string QQ_WUBI_C = "qqwb"; public const string TOUCH_PAL_C = "cbsj"; @@ -114,7 +117,6 @@ public class ConstantString public const string RIME_USERDB_C = "rimedb"; public const string BING_PINYIN_C = "bing"; public const string LINGOES_LD2_C = "ld2"; - public const string BAIDU_PINYIN_C = "bdpy"; public const string QQ_PINYIN_ENG_C = "qqpye"; public const string XIAOYA_WUBI_C = "xywb"; public const string CANGJIE_PLATFORM_C = "cjpt"; diff --git a/src/ImeWlConverterCore/IME/BaiduPinyinBackup.cs b/src/ImeWlConverterCore/IME/BaiduPinyinBackup.cs new file mode 100644 index 
00000000..f3956591
--- /dev/null
+++ b/src/ImeWlConverterCore/IME/BaiduPinyinBackup.cs
@@ -0,0 +1,248 @@
+/*!
+ * This work contains codes translated from the original work
+ * by stevenlele (https://github.com/studyzy/imewlconverter/issues/204#issuecomment-2011007855)
+ * translate to csharp by nopdan
+ */
+
+using Studyzy.IMEWLConverter.Entities;
+using System;
+using System.Collections.Generic;
+using System.Globalization;
+using System.IO;
+using System.Linq;
+using System.Text;
+using System.Text.RegularExpressions;
+
+namespace Studyzy.IMEWLConverter.IME
+{
+    /// <summary>
+    /// Importer for the Baidu Pinyin backup word library, a binary file whose
+    /// lines are individually encoded (see <see cref="Decode"/>).
+    /// </summary>
+    [ComboBoxShow(ConstantString.BAIDU_PINYIN_BACKUP, ConstantString.BAIDU_PINYIN_BACKUP_C, 20)]
+    public class BaiduPinyinBackup : BaseImport, IWordLibraryImport
+    {
+        // Splits "word(py|py|...)" into the word (group 1) and its pinyin list (group 2).
+        private static readonly Regex ENTRY_PATTERN = new Regex(@"([^\(]+)\((.+)\)");
+
+        public BaiduPinyinBackup()
+        {
+        }
+
+        #region IWordLibraryImport members
+
+        /// <summary>
+        /// Reads a backup file line by line, decodes each line and collects every
+        /// line shaped like <c>word(py|py|...) rank ...</c>.
+        /// </summary>
+        /// <param name="path">Path of the backup file to read.</param>
+        /// <returns>The parsed entries, in file order.</returns>
+        /// <exception cref="ArgumentException">A line is not validly encoded.</exception>
+        public WordLibraryList Import(string path)
+        {
+            var wordLibraryList = new WordLibraryList();
+            // The using block closes the file even when a line fails to decode.
+            using (var fs = new FileStream(path, FileMode.Open, FileAccess.Read))
+            {
+                // Skip the two leading bytes (FF FE).
+                fs.Seek(2, SeekOrigin.Begin);
+                // Set once the section marker has been read; enables the stop check below.
+                // NOTE(review): the three marker literals compared below are empty strings
+                // in the patch as received, which looks like lost text - confirm upstream.
+                var cnFlag = false;
+                while (fs.Position < fs.Length - 4)
+                {
+                    var theLine = Encoding.Unicode.GetString(Decode(ReadEncodedLine(fs)));
+                    // Ignore English words: stop once a later marker is reached.
+                    if (cnFlag && (theLine == "" || theLine == ""))
+                    {
+                        break;
+                    }
+                    if (theLine == "")
+                    {
+                        cnFlag = true;
+                        continue;
+                    }
+                    // Entry format:
+                    // 百度输入法(bai|du|shu|ru|fa) 2 24 1703756731 N N
+                    var array = theLine.Split(' ');
+                    if (array.Length < 2)
+                    {
+                        continue;
+                    }
+                    // A non-numeric rank skips the line instead of aborting the import.
+                    int rank;
+                    if (!int.TryParse(array[1], NumberStyles.Integer, CultureInfo.InvariantCulture, out rank))
+                    {
+                        continue;
+                    }
+                    // Groups.Count is fixed by the pattern; only Success reveals a failed match.
+                    var match = ENTRY_PATTERN.Match(array[0]);
+                    if (!match.Success)
+                    {
+                        continue;
+                    }
+
+                    wordLibraryList.Add(new WordLibrary
+                    {
+                        Rank = rank,
+                        Word = match.Groups[1].Value,
+                        PinYin = match.Groups[2].Value.Split('|')
+                    });
+                }
+            }
+            return wordLibraryList;
+        }
+
+        /// <summary>
+        /// Always <c>false</c>. Presumably marks the format as binary for the host
+        /// application - confirm against BaseImport.
+        /// </summary>
+        public override bool IsText
+        {
+            get { return false; }
+        }
+
+        /// <summary>
+        /// Line-by-line import is not available for this format.
+        /// </summary>
+        /// <exception cref="NotSupportedException">Always thrown.</exception>
+        public WordLibraryList ImportLine(string line)
+        {
+            throw new NotSupportedException("百度输入法备份格式是二进制文件,不支持流转换");
+        }
+
+        #endregion
+
+        #region Decoding
+
+        private const uint MASK = 0x2D382324;
+        private static readonly byte[] TABLE = Encoding.ASCII.GetBytes("qogjOuCRNkfil5p4SQ3LAmxGKZTdesvB6z_YPahMI9t80rJyHW1DEwFbc7nUVX2-");
+        // Built during type initialization so the static Decode works before any instance exists.
+        private static readonly byte[] DECODE_TABLE = BuildDecodeTable();
+
+        /// <summary>
+        /// Builds the byte lookup used by <see cref="Decode"/>: every byte maps to itself,
+        /// except the characters of TABLE, which map to their index in TABLE.
+        /// </summary>
+        private static byte[] BuildDecodeTable()
+        {
+            var table = new byte[256];
+            for (var i = 0; i < table.Length; i++)
+            {
+                table[i] = (byte)i;
+            }
+            for (var i = 0; i < TABLE.Length; i++)
+            {
+                table[TABLE[i]] = (byte)i;
+            }
+            return table;
+        }
+
+        /// <summary>
+        /// Reads byte pairs until the 0A 00 terminator (consumed, not returned) or the
+        /// end of the stream, so a file without a final terminator cannot loop forever.
+        /// </summary>
+        private static byte[] ReadEncodedLine(Stream stream)
+        {
+            var lineBytes = new List<byte>();
+            while (true)
+            {
+                int first = stream.ReadByte();
+                int second = stream.ReadByte();
+                if (first < 0 || second < 0)
+                {
+                    break;
+                }
+                if (first == 0x0A && second == 0x00)
+                {
+                    break;
+                }
+                lineBytes.Add((byte)first);
+                lineBytes.Add((byte)second);
+            }
+            return lineBytes.ToArray();
+        }
+
+        /// <summary>
+        /// Decodes one encoded line into its raw bytes.
+        /// </summary>
+        /// <remarks>
+        /// The input is groups of four bytes followed by a two-byte trailer (a marker
+        /// 'A', 'B' or 'C', then a zero byte). Each group yields three bytes: bytes 1-3
+        /// give the low bits and byte 4 supplies the top two bits of each. The marker is
+        /// the payload length modulo 3, and the zero padding it implies is removed. Every
+        /// full 32-bit word is then XORed with MASK and rotated left by three bits; a
+        /// trailing partial word is only XORed.
+        /// </remarks>
+        /// <param name="data">The encoded line without its line terminator.</param>
+        /// <returns>The decoded bytes.</returns>
+        /// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
+        /// <exception cref="ArgumentException">The length or the padding is invalid.</exception>
+        public static byte[] Decode(byte[] data)
+        {
+            if (data == null)
+            {
+                throw new ArgumentNullException(nameof(data));
+            }
+            if (data.Length % 4 != 2)
+            {
+                throw new ArgumentException("Invalid data length");
+            }
+
+            int payloadLength = data.Length - 2;
+            // Kept as int: a marker below 'A' goes negative rather than wrapping around.
+            int base64Remainder = data[payloadLength] - 'A';
+            if (base64Remainder < 0 || base64Remainder > 2 || data[payloadLength + 1] != 0)
+            {
+                throw new ArgumentException("Invalid padding");
+            }
+
+            var transformed = new List<byte>(payloadLength / 4 * 3);
+            for (int i = 0; i < payloadLength; i += 4)
+            {
+                byte highBits = DECODE_TABLE[data[i + 3]];
+                transformed.Add((byte)(DECODE_TABLE[data[i]] | (highBits & 0b110000) << 2));
+                transformed.Add((byte)(DECODE_TABLE[data[i + 1]] | (highBits & 0b1100) << 4));
+                transformed.Add((byte)(DECODE_TABLE[data[i + 2]] | (highBits & 0b11) << 6));
+            }
+
+            if (base64Remainder > 0)
+            {
+                for (int i = 0; i < 3 - base64Remainder; i++)
+                {
+                    // An empty payload cannot carry padding; report it like any other bad padding.
+                    if (transformed.Count == 0 || transformed[transformed.Count - 1] != 0)
+                    {
+                        throw new ArgumentException("Invalid padding");
+                    }
+                    transformed.RemoveAt(transformed.Count - 1);
+                }
+            }
+            var result = transformed.ToArray();
+
+            int alignedLength = result.Length / 4 * 4;
+            var finalResult = new List<byte>(result.Length);
+            for (int i = 0; i < alignedLength; i += 4)
+            {
+                uint chunk = MASK ^ BitConverter.ToUInt32(result, i);
+                chunk = (chunk & 0x1FFFFFFF) << 3 | chunk >> 29;
+                finalResult.AddRange(BitConverter.GetBytes(chunk));
+            }
+
+            int tailLength = result.Length - alignedLength;
+            if (tailLength != 0)
+            {
+                uint tail = 0;
+                for (int i = 0; i < tailLength; i++)
+                {
+                    tail |= (uint)result[alignedLength + i] << (i * 8);
+                }
+                finalResult.AddRange(BitConverter.GetBytes(MASK ^ tail).Take(tailLength));
+            }
+
+            return finalResult.ToArray();
+        }
+
+        #endregion
+    }
+}
\ No newline at end of file