Skip to content

Commit

Permalink
Merge pull request #17 from xtne6f/pr-ucs-latin
Browse files Browse the repository at this point in the history
UCS(UTF-8)とラテン文字の字幕に対応
  • Loading branch information
DBCTRADO authored Nov 9, 2023
2 parents 255f0fb + 5a5bc30 commit 83a0dc2
Show file tree
Hide file tree
Showing 6 changed files with 167 additions and 17 deletions.
138 changes: 132 additions & 6 deletions LibISDB/Base/ARIBString.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ bool ARIBStringDecoder::DecodeInternal(

const bool IsCaption = !!(Flags & DecodeFlag::Caption);
const bool Is1Seg = !!(Flags & DecodeFlag::OneSeg);
const bool IsLatin = !!(Flags & DecodeFlag::Latin);

// 状態初期設定
m_ESCSeqCount = 0;
Expand All @@ -120,7 +121,13 @@ bool ARIBStringDecoder::DecodeInternal(
m_CodeG[2] = CodeSet::Hiragana;
m_CodeG[3] = IsCaption ? CodeSet::Macro : CodeSet::Katakana;

if (IsCaption && Is1Seg) {
if (IsLatin) {
m_CodeG[0] = CodeSet::Alphanumeric;
m_CodeG[2] = CodeSet::LatinExtension;
m_CodeG[3] = CodeSet::LatinSpecial;
m_LockingGL = 0;
m_LockingGR = 2;
} else if (IsCaption && Is1Seg) {
m_CodeG[1] = CodeSet::DRCS_1;
m_LockingGL = 1;
m_LockingGR = 0;
Expand All @@ -129,7 +136,7 @@ bool ARIBStringDecoder::DecodeInternal(
m_LockingGR = 2;
}

m_CharSize = CharSize::Normal;
m_CharSize = IsLatin ? CharSize::Medium : CharSize::Normal;
if (IsCaption) {
m_CharColorIndex = 7;
m_BackColorIndex = 8;
Expand All @@ -143,9 +150,11 @@ bool ARIBStringDecoder::DecodeInternal(
m_RPC = 1;

m_IsCaption = IsCaption;
m_IsLatin = IsLatin;
m_pFormatList = FormatList ? &*FormatList : nullptr;
m_pDRCSMap = pDRCSMap;

m_IsUCS = !!(Flags & DecodeFlag::UCS);
m_UseCharSize = !!(Flags & DecodeFlag::UseCharSize);
m_UnicodeSymbol = !!(Flags & DecodeFlag::UnicodeSymbol);

Expand Down Expand Up @@ -174,8 +183,44 @@ bool ARIBStringDecoder::DecodeString(const uint8_t *pSrcData, size_t SrcLength,
{
for (size_t SrcPos = 0; SrcPos < SrcLength; SrcPos++) {
if (m_ESCSeqCount == 0) {
// GL/GR領域
if ((pSrcData[SrcPos] >= 0x21) && (pSrcData[SrcPos] <= 0x7E)) {
if (m_IsUCS && (((pSrcData[SrcPos] >= 0x21) && (pSrcData[SrcPos] <= 0x7E))
|| ((pSrcData[SrcPos] >= 0x80)
&& ((pSrcData[SrcPos] != 0xC2) || (SrcLength - SrcPos < 2)
|| (pSrcData[SrcPos + 1] < 0x80) || (pSrcData[SrcPos + 1] >= 0xA1))))) {
// UCSの制御コード以外
if (pSrcData[SrcPos] >= 0xFE) {
// UTF-16のBOM。未サポート
return false;
}
size_t OldLength = pDstString->length();
uint32_t CodePoint;
const size_t CodeLength = UTF8ToCodePoint(pSrcData + SrcPos, SrcLength - SrcPos, &CodePoint);

if (CodePoint == 0) {
*pDstString += TOFU_STR;
} else if ((CodePoint >= 0xEC00) && (CodePoint <= 0xF8FF)) {
// U+EC00以降の私用領域はDRCS
PutDRCSChar(static_cast<uint16_t>(CodePoint), pDstString);
#ifdef LIBISDB_ARIB_STR_IS_WCHAR
} else if (CodePoint >= 0x10000) {
// サロゲートペア
*pDstString += static_cast<InternalChar>(0xD800 | ((CodePoint - 0x10000) >> 10));
*pDstString += static_cast<InternalChar>(0xDC00 | ((CodePoint - 0x10000) & 0x03FF));
} else {
*pDstString += static_cast<InternalChar>(CodePoint);
#else
} else {
for (size_t i = 0; i < CodeLength; i++)
*pDstString += static_cast<InternalChar>(pSrcData[SrcPos + i]);
#endif
}
for (; m_RPC > 1; m_RPC--) {
const size_t Length = pDstString->length();
*pDstString += pDstString->substr(OldLength);
OldLength = Length;
}
SrcPos += CodeLength - 1;
} else if (!m_IsUCS && (pSrcData[SrcPos] >= 0x21) && (pSrcData[SrcPos] <= 0x7E)) {
// GL領域
const CodeSet CurCodeSet = m_CodeG[(m_SingleGL >= 0) ? m_SingleGL : m_LockingGL];
m_SingleGL = -1;
Expand All @@ -190,7 +235,7 @@ bool ARIBStringDecoder::DecodeString(const uint8_t *pSrcData, size_t SrcLength,
// 1バイトコード
DecodeChar(pSrcData[SrcPos], CurCodeSet, pDstString);
}
} else if ((pSrcData[SrcPos] >= 0xA1) && (pSrcData[SrcPos] <= 0xFE)) {
} else if (!m_IsUCS && (pSrcData[SrcPos] >= 0xA1) && (pSrcData[SrcPos] <= 0xFE)) {
// GR領域
const CodeSet CurCodeSet = m_CodeG[m_LockingGR];

Expand All @@ -206,6 +251,10 @@ bool ARIBStringDecoder::DecodeString(const uint8_t *pSrcData, size_t SrcLength,
}
} else {
// 制御コード
if (m_IsUCS && (pSrcData[SrcPos] == 0xC2)) {
// UCSのC1制御コード
SrcPos++;
}
switch (pSrcData[SrcPos]) {
case 0x0D: // APR
*pDstString += ARIB_STR(LIBISDB_NEWLINE);
Expand Down Expand Up @@ -403,6 +452,16 @@ void ARIBStringDecoder::DecodeChar(uint16_t Code, CodeSet Set, InternalString *p
PutJISKatakanaChar(Code, pDstString);
break;

case CodeSet::LatinExtension:
// ラテン文字拡張コード出力
PutLatinExtensionChar(Code, pDstString);
break;

case CodeSet::LatinSpecial:
// ラテン文字特殊コード出力
PutLatinSpecialChar(Code, pDstString);
break;

case CodeSet::AdditionalSymbols:
// 追加シンボルコード出力
PutSymbolsChar(Code, pDstString);
Expand Down Expand Up @@ -591,7 +650,7 @@ void ARIBStringDecoder::PutAlphanumericChar(uint16_t Code, InternalString *pDstS
ARIB_STR_TABLE_END;

const ARIBStrTableType * Table =
(m_UseCharSize && m_CharSize == CharSize::Medium) ? AlphanumericHalfWidthTable : AlphanumericTable;
(m_IsLatin || (m_UseCharSize && m_CharSize == CharSize::Medium)) ? AlphanumericHalfWidthTable : AlphanumericTable;

*pDstString += Table[Code < 0x20 ? 0 : Code - 0x20];
}
Expand Down Expand Up @@ -646,6 +705,37 @@ void ARIBStringDecoder::PutJISKatakanaChar(uint16_t Code, InternalString *pDstSt
}


void ARIBStringDecoder::PutLatinExtensionChar(uint16_t Code, InternalString *pDstString)
{
	// Conversion table for the Latin Extension code set.
	// Entry 0 corresponds to code 0x20; each row covers 16 consecutive codes.
	static const ARIBStrTableType LatinExtensionTable[] =
		ARIB_STR_TABLE_BEGIN
		ARIB_STR_TABLE(" ", "\u00a1", "\u00a2", "\u00a3", "\u20ac", "\u00a5", "\u0160", "\u00a7", "\u0161", "\u00a9", "\u00aa", "\u00ab", "\u00ac", "\u00ff", "\u00ae", "\u00af")
		ARIB_STR_TABLE("\u00b0", "\u00b1", "\u00b2", "\u00b3", "\u017d", "\u03bc", "\u00b6", "\u00b7", "\u017e", "\u00b9", "\u00ba", "\u00bb", "\u0152", "\u0153", "\u0178", "\u00bf")
		ARIB_STR_TABLE("\u00c0", "\u00c1", "\u00c2", "\u00c3", "\u00c4", "\u00c5", "\u00c6", "\u00c7", "\u00c8", "\u00c9", "\u00ca", "\u00cb", "\u00cc", "\u00cd", "\u00ce", "\u00cf")
		ARIB_STR_TABLE("\u00d0", "\u00d1", "\u00d2", "\u00d3", "\u00d4", "\u00d5", "\u00d6", "\u00d7", "\u00d8", "\u00d9", "\u00da", "\u00db", "\u00dc", "\u00dd", "\u00de", "\u00df")
		ARIB_STR_TABLE("\u00e0", "\u00e1", "\u00e2", "\u00e3", "\u00e4", "\u00e5", "\u00e6", "\u00e7", "\u00e8", "\u00e9", "\u00ea", "\u00eb", "\u00ec", "\u00ed", "\u00ee", "\u00ef")
		ARIB_STR_TABLE("\u00f0", "\u00f1", "\u00f2", "\u00f3", "\u00f4", "\u00f5", "\u00f6", "\u00f7", "\u00f8", "\u00f9", "\u00fa", "\u00fb", "\u00fc", "\u00fd", "\u00fe", " ")
		ARIB_STR_TABLE_END;

	// Codes below 0x20 fall back to the first table entry (a space).
	int TableIndex = 0;
	if (Code >= 0x20)
		TableIndex = Code - 0x20;

	*pDstString += LatinExtensionTable[TableIndex];
}


void ARIBStringDecoder::PutLatinSpecialChar(uint16_t Code, InternalString *pDstString)
{
	// Conversion table for the Latin Special code set.
	// Entry 0 corresponds to code 0x20; the three rows cover codes 0x20-0x4F.
	static const ARIBStrTableType LatinSpecialTable[] =
		ARIB_STR_TABLE_BEGIN
		ARIB_STR_TABLE(" ", "\u266a", " ", " ", " ", " ", " ", " ", " ", " ", " ", " ", " ", " ", " ", " ")
		ARIB_STR_TABLE("\u00a4", "\u00a6", "\u00a8", "\u00b4", "\u00b8", "\u00bc", "\u00bd", "\u00be", " ", " ", " ", " ", " ", " ", " ", " ")
		ARIB_STR_TABLE("\u2026", "\u2588", "\u2018", "\u2019", "\u201c", "\u201d", "\u2022", "\u2122", "\u215b", "\u215c", "\u215d", "\u215e", " ", " ", " ", " ")
		ARIB_STR_TABLE_END;

	// Any code outside 0x20-0x4F maps to the first table entry (a space).
	const bool IsCovered = (Code >= 0x20) && (Code < 0x50);

	*pDstString += LatinSpecialTable[IsCovered ? Code - 0x20 : 0];
}


void ARIBStringDecoder::PutSymbolsChar(uint16_t Code, InternalString *pDstString)
{
// 追加シンボル文字コード変換
Expand Down Expand Up @@ -1066,6 +1156,8 @@ bool ARIBStringDecoder::DesignationGSET(uint8_t IndexG, uint8_t Code)
case 0x37: m_CodeG[IndexG] = CodeSet::ProportionalHiragana; return true; // Proportional Hiragana
case 0x38: m_CodeG[IndexG] = CodeSet::ProportionalKatakana; return true; // Proportional Katakana
case 0x49: m_CodeG[IndexG] = CodeSet::JIS_X0201_Katakana; return true; // JIS X 0201 Katakana
case 0x4B: m_CodeG[IndexG] = CodeSet::LatinExtension; return true; // Latin Extension
case 0x4C: m_CodeG[IndexG] = CodeSet::LatinSpecial; return true; // Latin Special
case 0x39: m_CodeG[IndexG] = CodeSet::JIS_KanjiPlane1; return true; // JIS compatible Kanji Plane 1
case 0x3A: m_CodeG[IndexG] = CodeSet::JIS_KanjiPlane2; return true; // JIS compatible Kanji Plane 2
case 0x3B: m_CodeG[IndexG] = CodeSet::AdditionalSymbols; return true; // Additional symbols
Expand Down Expand Up @@ -1132,4 +1224,38 @@ bool ARIBStringDecoder::IsDoubleByteCodeSet(CodeSet Set)
}


/**
 * Decodes a single UTF-8 encoded character from the head of a byte sequence.
 *
 * @param pData      Bytes to decode; read only within the first Length bytes.
 * @param Length     Number of bytes available at pData.
 * @param pCodePoint Receives the decoded code point. Set to 0 when the input is
 *                   empty, the sequence is malformed or overlong, the value is a
 *                   UTF-16 surrogate (U+D800-U+DFFF), or it exceeds U+10FFFF.
 *                   A literal NUL byte also yields 0.
 * @return Number of bytes consumed: 0 only when Length is 0; 2, 3 or 4 for a
 *         structurally valid multi-byte sequence; 1 in every other case
 *         (a single-byte character, or one invalid byte to be skipped).
 */
size_t ARIBStringDecoder::UTF8ToCodePoint(const uint8_t *pData, size_t Length, uint32_t *pCodePoint)
{
	// A continuation byte has the bit pattern 10xxxxxx, i.e. 0x80-0xBF.
	// The 2-byte branch previously compared against decimal 80 (0x50), which let
	// ASCII bytes 0x50-0x7F pass as continuation bytes.
	const auto IsContinuation = [](uint8_t Byte) -> bool { return (Byte & 0xC0) == 0x80; };

	*pCodePoint = 0;
	if (Length == 0) {
		return 0;
	} else if ((pData[0] >= 0xC2) && (pData[0] < 0xE0) && (Length >= 2)
			&& IsContinuation(pData[1])) {
		// 2-byte sequence. Lead bytes 0xC0/0xC1 are excluded because they can
		// only produce overlong encodings.
		*pCodePoint = ((pData[0] & 0x1F) << 6) | (pData[1] & 0x3F);
		return 2;
	} else if ((pData[0] >= 0xE0) && (pData[0] < 0xF0) && (Length >= 3)
			&& IsContinuation(pData[1])
			// Reject overlong forms: with lead 0xE0 the second byte must be >= 0xA0.
			&& ((pData[0] & 0x0F) || (pData[1] & 0x20))
			&& IsContinuation(pData[2])) {
		*pCodePoint = ((pData[0] & 0x0F) << 12) | ((pData[1] & 0x3F) << 6) | (pData[2] & 0x3F);
		// Surrogate code points are not valid in UTF-8.
		if ((*pCodePoint >= 0xD800) && (*pCodePoint < 0xE000))
			*pCodePoint = 0;
		return 3;
	} else if ((pData[0] >= 0xF0) && (pData[0] < 0xF8) && (Length >= 4)
			&& IsContinuation(pData[1])
			// Reject overlong forms: with lead 0xF0 the second byte must be >= 0x90.
			&& ((pData[0] & 0x07) || (pData[1] & 0x30))
			&& IsContinuation(pData[2])
			&& IsContinuation(pData[3])) {
		*pCodePoint = ((pData[0] & 0x07) << 18) | ((pData[1] & 0x3F) << 12)
			| ((pData[2] & 0x3F) << 6) | (pData[3] & 0x3F);
		// Values beyond the Unicode range are reported as invalid.
		if (*pCodePoint >= 0x110000)
			*pCodePoint = 0;
		return 4;
	} else if (pData[0] < 0x80) {
		// Single-byte (ASCII) character.
		*pCodePoint = pData[0];
	}
	// ASCII, or an invalid byte: consume exactly one byte.
	return 1;
}


} // namespace LibISDB
12 changes: 11 additions & 1 deletion LibISDB/Base/ARIBString.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,8 @@ namespace LibISDB
OneSeg = 0x0002U, /**< ワンセグ */
UseCharSize = 0x0004U, /**< 文字サイズを反映 */
UnicodeSymbol = 0x0008U, /**< Unicodeの記号を利用(Unicode 5.2以降) */
UCS = 0x0010U, /**< UCS符号化方式 */
Latin = 0x0020U, /**< SBTVD規格のラテン文字符号化方式 */
LIBISDB_ENUM_FLAGS_TRAILER
};

Expand Down Expand Up @@ -135,6 +137,8 @@ namespace LibISDB
ProportionalHiragana, /**< Proportional Hiragana */
ProportionalKatakana, /**< Proportional Katakana */
JIS_X0201_Katakana, /**< JIS X 0201 Katakana */
LatinExtension, /**< Latin Extension */
LatinSpecial, /**< Latin Special */
JIS_KanjiPlane1, /**< JIS compatible Kanji Plane 1 */
JIS_KanjiPlane2, /**< JIS compatible Kanji Plane 2 */
AdditionalSymbols, /**< Additional symbols */
Expand Down Expand Up @@ -176,6 +180,8 @@ namespace LibISDB
DRCSMap *m_pDRCSMap;

bool m_IsCaption;
bool m_IsLatin;
bool m_IsUCS;
bool m_UseCharSize;
bool m_UnicodeSymbol;

Expand All @@ -193,6 +199,8 @@ namespace LibISDB
void PutHiraganaChar(uint16_t Code, InternalString *pDstString);
void PutKatakanaChar(uint16_t Code, InternalString *pDstString);
void PutJISKatakanaChar(uint16_t Code, InternalString *pDstString);
void PutLatinExtensionChar(uint16_t Code, InternalString *pDstString);
void PutLatinSpecialChar(uint16_t Code, InternalString *pDstString);
void PutSymbolsChar(uint16_t Code, InternalString *pDstString);
void PutMacroChar(uint16_t Code, InternalString *pDstString);
void PutDRCSChar(uint16_t Code, InternalString *pDstString);
Expand All @@ -206,12 +214,14 @@ namespace LibISDB

/**
 * Reports whether the decoder is currently in a small-character mode.
 *
 * @return true when Latin decoding is active (m_IsLatin is set) or the current
 *         character size is Small, Medium or Micro; false otherwise.
 */
bool IsSmallCharMode() const noexcept
{
	// Single return expression; the superseded pre-Latin "return" line that
	// preceded it left the statement unterminated and has been dropped.
	return m_IsLatin
		|| (m_CharSize == CharSize::Small)
		|| (m_CharSize == CharSize::Medium)
		|| (m_CharSize == CharSize::Micro);
}

static bool IsDoubleByteCodeSet(CodeSet Set);
static size_t UTF8ToCodePoint(const uint8_t *pData, size_t Length, uint32_t *pCodePoint);
};

} // namespace LibISDB
Expand Down
1 change: 1 addition & 0 deletions LibISDB/LibISDBConsts.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,7 @@ namespace LibISDB
constexpr uint32_t LANGUAGE_CODE_RUS = 0x727573_u32; // ロシア語
constexpr uint32_t LANGUAGE_CODE_ZHO = 0x7A686F_u32; // 中国語
constexpr uint32_t LANGUAGE_CODE_KOR = 0x6B6F72_u32; // 韓国語
constexpr uint32_t LANGUAGE_CODE_POR = 0x706F72_u32; // ポルトガル語
constexpr uint32_t LANGUAGE_CODE_SPA = 0x737061_u32; // スペイン語
constexpr uint32_t LANGUAGE_CODE_ETC = 0x657463_u32; // その他
constexpr uint32_t LANGUAGE_CODE_INVALID = 0x000000_u32; // 無効
Expand Down
28 changes: 20 additions & 8 deletions LibISDB/TS/CaptionParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ void CaptionParser::OnPESPacket(const PESParser *pParser, const PESPacket *pPack
ParseManagementData(&pData[Pos], DataGroupSize);
} else {
// 字幕データ
ParseCaptionData(&pData[Pos], DataGroupSize);
ParseCaptionData(&pData[Pos], DataGroupSize, (DataGroupID & 0x1F));
}
}

Expand Down Expand Up @@ -228,7 +228,7 @@ bool CaptionParser::ParseManagementData(const uint8_t *pData, uint32_t DataSize)
uint32_t ReadSize = 0;
do {
uint32_t Size = UnitLoopLength - ReadSize;
if (!ParseUnitData(&pData[Pos + ReadSize], &Size))
if (!ParseUnitData(&pData[Pos + ReadSize], &Size, 0))
return false;
ReadSize += Size;
} while (ReadSize < UnitLoopLength);
Expand All @@ -238,7 +238,7 @@ bool CaptionParser::ParseManagementData(const uint8_t *pData, uint32_t DataSize)
}


bool CaptionParser::ParseCaptionData(const uint8_t *pData, uint32_t DataSize)
bool CaptionParser::ParseCaptionData(const uint8_t *pData, uint32_t DataSize, uint8_t DataGroupIndex)
{
// caption_data()

Expand Down Expand Up @@ -267,7 +267,7 @@ bool CaptionParser::ParseCaptionData(const uint8_t *pData, uint32_t DataSize)
uint32_t ReadSize = 0;
do {
uint32_t Size = UnitLoopLength - ReadSize;
if (!ParseUnitData(&pData[Pos + ReadSize], &Size))
if (!ParseUnitData(&pData[Pos + ReadSize], &Size, DataGroupIndex))
return false;
ReadSize += Size;
} while (ReadSize < UnitLoopLength);
Expand All @@ -277,7 +277,7 @@ bool CaptionParser::ParseCaptionData(const uint8_t *pData, uint32_t DataSize)
}


bool CaptionParser::ParseUnitData(const uint8_t *pData, uint32_t *pDataSize)
bool CaptionParser::ParseUnitData(const uint8_t *pData, uint32_t *pDataSize, uint8_t DataGroupIndex)
{
// data_unit()

Expand All @@ -303,13 +303,25 @@ bool CaptionParser::ParseUnitData(const uint8_t *pData, uint32_t *pDataSize)
}

if ((UnitSize > 0) && (m_pHandler != nullptr)) {
const ARIBStringDecoder::DecodeFlag Flags =
ARIBStringDecoder::DecodeFlag Flags =
m_1Seg ? ARIBStringDecoder::DecodeFlag::OneSeg : ARIBStringDecoder::DecodeFlag::None;
const int LangIndex =
DataGroupIndex != 0 ? GetLanguageIndexByTag(DataGroupIndex - 1) : m_LanguageList.empty() ? -1 : 0;
ARIBStringDecoder::FormatList FormatList;
String Text;

if (m_StringDecoder.DecodeCaption(&pData[5], UnitSize, &Text, Flags, &FormatList, m_pDRCSMap)) {
OnCaption(Text.c_str(), &FormatList);
// 原則として字幕管理の情報が必要。ただし1Segは運用によりパラメータ固定で送出頻度も低いため特別扱い
if ((LangIndex >= 0) || m_1Seg) {
if (LangIndex >= 0) {
const LanguageInfo &Info = m_LanguageList[LangIndex];
if (Info.TCS == 1)
Flags |= ARIBStringDecoder::DecodeFlag::UCS;
if ((Info.LanguageCode == LANGUAGE_CODE_POR) || (Info.LanguageCode == LANGUAGE_CODE_SPA))
Flags |= ARIBStringDecoder::DecodeFlag::Latin;
}
if (m_StringDecoder.DecodeCaption(&pData[5], UnitSize, &Text, Flags, &FormatList, m_pDRCSMap)) {
OnCaption(Text.c_str(), &FormatList);
}
}
}

Expand Down
4 changes: 2 additions & 2 deletions LibISDB/TS/CaptionParser.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -104,8 +104,8 @@ namespace LibISDB
void OnPESPacket(const PESParser *pParser, const PESPacket *pPacket) override;

bool ParseManagementData(const uint8_t *pData, uint32_t DataSize);
bool ParseCaptionData(const uint8_t *pData, uint32_t DataSize);
bool ParseUnitData(const uint8_t *pData, uint32_t *pDataSize);
bool ParseCaptionData(const uint8_t *pData, uint32_t DataSize, uint8_t DataGroupIndex);
bool ParseUnitData(const uint8_t *pData, uint32_t *pDataSize, uint8_t DataGroupIndex);
bool ParseDRCSUnitData(const uint8_t *pData, uint32_t DataSize);
void OnCaption(const CharType *pText, const ARIBStringDecoder::FormatList *pFormatList);

Expand Down
1 change: 1 addition & 0 deletions LibISDB/TS/TSInformation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,7 @@ bool GetLanguageText_ja(
{LANGUAGE_CODE_RUS, LIBISDB_STR("ロシア語"), LIBISDB_STR("露語"), LIBISDB_STR("")},
{LANGUAGE_CODE_ZHO, LIBISDB_STR("中国語"), LIBISDB_STR("中国語"), LIBISDB_STR("")},
{LANGUAGE_CODE_KOR, LIBISDB_STR("韓国語"), LIBISDB_STR("韓国語"), LIBISDB_STR("")},
{LANGUAGE_CODE_POR, LIBISDB_STR("ポルトガル語"), LIBISDB_STR("葡語"), LIBISDB_STR("")},
{LANGUAGE_CODE_SPA, LIBISDB_STR("スペイン語"), LIBISDB_STR("西語"), LIBISDB_STR("西")},
{LANGUAGE_CODE_ETC, LIBISDB_STR("外国語"), LIBISDB_STR("外国語"), LIBISDB_STR("")},
};
Expand Down

0 comments on commit 83a0dc2

Please sign in to comment.