From b00690038a913c6fd696c29a7240a672afc5e1cf Mon Sep 17 00:00:00 2001 From: pengzhendong <275331498@qq.com> Date: Mon, 15 Jun 2026 14:42:27 +0800 Subject: [PATCH] fix: add OOV character handling to C++ runtime for Chinese TN The C++ runtime previously relied on the pre-built .fst files to handle OOV (out-of-vocabulary) characters like Korean Hangul and Japanese Kana. When the FSTs were built without tag_oov=True (the default), these characters passed through unchanged instead of being wrapped in tags. This adds Unicode-range-based OOV detection as a post-processing step in Processor::Normalize. Characters outside the CJK Unified Ideographs, ASCII, and common punctuation ranges are wrapped in tags. The check is skipped if the output already contains tags (i.e., the FST was built with OOV support), avoiding double-wrapping. Fixes #368 --- runtime/processor/wetext_processor.cc | 63 ++++++++++++++++++++++++++- runtime/processor/wetext_processor.h | 1 + 2 files changed, 63 insertions(+), 1 deletion(-) diff --git a/runtime/processor/wetext_processor.cc b/runtime/processor/wetext_processor.cc index 1027a82a..e30d1176 100644 --- a/runtime/processor/wetext_processor.cc +++ b/runtime/processor/wetext_processor.cc @@ -15,8 +15,49 @@ #include "processor/wetext_processor.h" #include "utils/wetext_log.h" +#include "utils/wetext_string.h" namespace wetext { + +static char32_t UTF8ToCodePoint(const std::string& ch) { + int len = UTF8CharLength(ch[0]); + char32_t cp = 0; + if (len == 1) { + cp = static_cast(ch[0]); + } else if (len == 2) { + cp = ((static_cast(ch[0]) & 0x1F) << 6) | + (static_cast(ch[1]) & 0x3F); + } else if (len == 3) { + cp = ((static_cast(ch[0]) & 0x0F) << 12) | + ((static_cast(ch[1]) & 0x3F) << 6) | + (static_cast(ch[2]) & 0x3F); + } else if (len == 4) { + cp = ((static_cast(ch[0]) & 0x07) << 18) | + ((static_cast(ch[1]) & 0x3F) << 12) | + ((static_cast(ch[2]) & 0x3F) << 6) | + (static_cast(ch[3]) & 0x3F); + } + return cp; +} + +static bool IsKnownChar(char32_t cp) { + // ASCII printable characters (space to ~) + if (cp >= 0x0020 && cp <= 0x007E) return true; + // CJK Unified Ideographs + if (cp >= 0x4E00 && cp <= 0x9FFF) return true; + // CJK Unified Ideographs Extension A + if (cp >= 0x3400 && cp <= 0x4DBF) return true; + // CJK Compatibility Ideographs + if (cp >= 0xF900 && cp <= 0xFAFF) return true; + // CJK Symbols and Punctuation + if (cp >= 0x3000 && cp <= 0x303F) return true; + // General Punctuation + if (cp >= 0x2000 && cp <= 0x206F) return true; + // Fullwidth forms + if (cp >= 0xFF00 && cp <= 0xFFEF) return true; + return false; +} + Processor::Processor(const std::string& tagger_path, const std::string& verbalizer_path) { tagger_.reset(StdVectorFst::Read(tagger_path)); @@ -76,8 +117,28 @@ std::string Processor::Verbalize(const std::string& input) { return output; } +std::string Processor::TagOOV(const std::string& input) { + std::vector chars; + SplitUTF8StringToChars(input, &chars); + std::string output; + for (const auto& ch : chars) { + char32_t cp = UTF8ToCodePoint(ch); + if (IsKnownChar(cp)) { + output += ch; + } else { + output += "" + ch + ""; + } + } + return output; +} + std::string Processor::Normalize(const std::string& input) { - return Verbalize(Tag(input)); + std::string output = Verbalize(Tag(input)); + if (parse_type_ == ParseType::kZH_TN && + output.find("") == std::string::npos) { + output = TagOOV(output); + } + return output; } } // namespace wetext diff --git a/runtime/processor/wetext_processor.h b/runtime/processor/wetext_processor.h index e11d307e..78da5110 100644 --- a/runtime/processor/wetext_processor.h +++ b/runtime/processor/wetext_processor.h @@ -34,6 +34,7 @@ class Processor { std::string Tag(const std::string& input); std::string Verbalize(const std::string& input); std::string Normalize(const std::string& input); + std::string TagOOV(const std::string& input); private: std::string ShortestPath(const StdVectorFst& lattice);