// Copyright(C) 2024 InfiniFlow, Inc. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #define PCRE2_CODE_UNIT_WIDTH 8 #include "opencc/openccxx.h" #include "pcre2.h" #include "string_utils.h" #include "rag_analyzer.h" #include "re2/re2.h" #include #include #include #include #include #include // import :term; // import :stemmer; // import :analyzer; // import :darts_trie; // import :wordnet_lemmatizer; // import :stemmer; // import :term; // // import std.compat; namespace fs = std::filesystem; static const std::string DICT_PATH = "rag/huqie.txt"; static const std::string POS_DEF_PATH = "rag/pos-id.def"; static const std::string TRIE_PATH = "rag/huqie.trie"; static const std::string WORDNET_PATH = "wordnet"; static const std::string OPENCC_PATH = "opencc"; static const std::string REGEX_SPLIT_CHAR = R"#(([ ,\.<>/?;'\[\]\`!@#$%^&*$$\{\}\|_+=《》,。?、;‘’:“”【】~!¥%……()——-]+|[a-zA-Z\.-]+|[0-9,\.-]+))#"; static const std::string NLTK_TOKENIZE_PATTERN = R"((?:\-{2,}|\.{2,}|(?:\.\s){2,}\.)|(?=[^\(\"\`{\[:;&\#\*@\)}\]\-,])\S+?(?=\s|$|(?:[)\";}\]\*:@\'\({\[\?!])|(?:\-{2,}|\.{2,}|(?:\.\s){2,}\.)|,(?=$|\s|(?:[)\";}\]\*:@\'\({\[\?!])|(?:\-{2,}|\.{2,}|(?:\.\s){2,}\.)))|\S)"; static constexpr std::size_t MAX_SENTENCE_LEN = 100; static inline int32_t Encode(int32_t freq, int32_t idx) { uint32_t encoded_value = 0; if (freq < 0) { encoded_value |= static_cast(-freq); encoded_value |= (1U << 23); } else { encoded_value = static_cast(freq & 0x7FFFFF); } encoded_value |= static_cast(idx) << 24; return static_cast(encoded_value); } static inline int32_t DecodeFreq(int32_t value) { uint32_t v1 = static_cast(value) & 0xFFFFFF; if (v1 & (1 << 23)) { v1 &= 0x7FFFFF; return -static_cast(v1); } else { v1 = static_cast(v1); } return v1; } static inline int32_t DecodePOSIndex(int32_t value) { // POS index is stored in the high 8 bits (bits 24-31) return static_cast(static_cast(value) >> 24); } void Split(const std::string &input, const std::string &split_pattern, std::vector &result, bool keep_delim = false) { re2::RE2 pattern(split_pattern); re2::StringPiece leftover(input.data()); re2::StringPiece last_end = leftover; re2::StringPiece extracted_delim_token; while (RE2::FindAndConsume(&leftover, pattern, &extracted_delim_token)) { std::string_view token(last_end.data(), extracted_delim_token.data() - last_end.data()); if (!token.empty()) { result.emplace_back(token.data(), token.size()); } if (keep_delim) result.emplace_back(extracted_delim_token.data(), extracted_delim_token.size()); last_end = leftover; } if (!leftover.empty()) { result.emplace_back(leftover.data(), leftover.size()); } } void Split(const std::string &input, const re2::RE2 &pattern, std::vector &result, bool keep_delim = false) { re2::StringPiece leftover(input.data()); re2::StringPiece last_end = leftover; re2::StringPiece extracted_delim_token; while (RE2::FindAndConsume(&leftover, pattern, &extracted_delim_token)) { std::string_view token(last_end.data(), extracted_delim_token.data() - last_end.data()); if (!token.empty()) { result.emplace_back(token.data(), token.size()); } if (keep_delim) result.emplace_back(extracted_delim_token.data(), extracted_delim_token.size()); last_end = leftover; } if (!leftover.empty()) { result.emplace_back(leftover.data(), leftover.size()); } } std::string Replace(const re2::RE2 &re, const std::string &replacement, const std::string &input) { std::string output = input; re2::RE2::GlobalReplace(&output, re, replacement); return output; } template std::string Join(const std::vector &tokens, int start, int end, const std::string &delim = " ") { std::ostringstream oss; for (int i = start; i < end; ++i) { if (i > start) oss << delim; oss << tokens[i]; } return std::move(oss).str(); } template std::string Join(const std::vector &tokens, int start, const std::string &delim = " ") { return Join(tokens, start, tokens.size(), delim); } std::string Join(const TermList &tokens, int start, int end, const std::string &delim = " ") { std::ostringstream oss; for (int i = start; i < end; ++i) { if (i > start) oss << delim; oss << tokens[i].text_; } return std::move(oss).str(); } bool IsChinese(const std::string &str) { for (std::size_t i = 0; i < str.length(); ++i) { unsigned char c = str[i]; if (c >= 0xE4 && c <= 0xE9) { if (i + 2 < str.length()) { unsigned char c2 = str[i + 1]; unsigned char c3 = str[i + 2]; if ((c2 >= 0x80 && c2 <= 0xBF) && (c3 >= 0x80 && c3 <= 0xBF)) { return true; } } } } return false; } bool IsAlphabet(const std::string &str) { for (std::size_t i = 0; i < str.length(); ++i) { unsigned char c = str[i]; if (c > 0x7F) { return false; } } return true; } bool IsKorean(const std::string &str) { for (std::size_t i = 0; i < str.length(); ++i) { unsigned char c = str[i]; if (c == 0xE1) { if (i + 2 < str.length()) { unsigned char c2 = str[i + 1]; unsigned char c3 = str[i + 2]; if ((c2 == 0x84 || c2 == 0x85 || c2 == 0x86 || c2 == 0x87) && (c3 >= 0x80 && c3 <= 0xBF)) { return true; } } } } return false; } bool IsJapanese(const std::string &str) { for (std::size_t i = 0; i < str.length(); ++i) { unsigned char c = str[i]; if (c == 0xE3) { if (i + 2 < str.length()) { unsigned char c2 = str[i + 1]; unsigned char c3 = str[i + 2]; if ((c2 == 0x81 || c2 == 0x82 || c2 == 0x83) && (c3 >= 0x81 && c3 <= 0xBF)) { return true; } } } } return false; } bool IsCJK(const std::string &str) { for (std::size_t i = 0; i < str.length(); ++i) { unsigned char c = str[i]; // Check Chinese if (c >= 0xE4 && c <= 0xE9) { if (i + 2 < str.length()) { unsigned char c2 = str[i + 1]; unsigned char c3 = str[i + 2]; if ((c2 >= 0x80 && c2 <= 0xBF) && (c3 >= 0x80 && c3 <= 0xBF)) { return true; } } } // Check Japanese if (c == 0xE3) { if (i + 2 < str.length()) { unsigned char c2 = str[i + 1]; unsigned char c3 = str[i + 2]; if ((c2 == 0x81 || c2 == 0x82 || c2 == 0x83) && (c3 >= 0x81 && c3 <= 0xBF)) { return true; } } } // Check Korean if (c == 0xE1) { if (i + 2 < str.length()) { unsigned char c2 = str[i + 1]; unsigned char c3 = str[i + 2]; if ((c2 == 0x84 || c2 == 0x85 || c2 == 0x86 || c2 == 0x87) && (c3 >= 0x80 && c3 <= 0xBF)) { return true; } } } } return false; } class RegexTokenizer { public: RegexTokenizer() { int errorcode = 0; PCRE2_SIZE erroffset = 0; re_ = pcre2_compile((PCRE2_SPTR)(NLTK_TOKENIZE_PATTERN.c_str()), PCRE2_ZERO_TERMINATED, PCRE2_MULTILINE | PCRE2_UTF, &errorcode, &erroffset, nullptr); } ~RegexTokenizer() { pcre2_code_free(re_); } void RegexTokenize(const std::string &input, TermList &tokens) { PCRE2_SPTR subject = (PCRE2_SPTR)input.c_str(); PCRE2_SIZE subject_length = input.length(); pcre2_match_data_8 *match_data = pcre2_match_data_create_8(1024, nullptr); PCRE2_SIZE start_offset = 0; while (start_offset < subject_length) { int res = pcre2_match(re_, subject, subject_length, start_offset, 0, match_data, nullptr); if (res < 0) { if (res == PCRE2_ERROR_NOMATCH) { break; // No more matches } else { std::cerr << "Matching error code: " << res << std::endl; break; // Other error } } // Extract matched substring PCRE2_SIZE *ovector = pcre2_get_ovector_pointer(match_data); for (int i = 0; i < res; ++i) { PCRE2_SIZE start = ovector[2 * i]; PCRE2_SIZE end = ovector[2 * i + 1]; tokens.Add(input.c_str() + start, end - start, start, end); } // Update the start offset for the next search start_offset = ovector[1]; // Move to the end of the last match } // Free memory pcre2_match_data_free(match_data); } private: pcre2_code_8 *re_{nullptr}; }; class MacIntyreContractions { public: // List of contractions adapted from Robert MacIntyre's tokenizer. std::vector CONTRACTIONS2 = {R"((?i)\b(can)(?#X)(not)\b)", R"((?i)\b(d)(?#X)('ye)\b)", R"((?i)\b(gim)(?#X)(me)\b)", R"((?i)\b(gon)(?#X)(na)\b)", R"((?i)\b(got)(?#X)(ta)\b)", R"((?i)\b(lem)(?#X)(me)\b)", R"((?i)\b(more)(?#X)('n)\b)", R"((?i)\b(wan)(?#X)(na)(?=\s))"}; std::vector CONTRACTIONS3 = {R"((?i) ('t)(?#X)(is)\b)", R"((?i) ('t)(?#X)(was)\b)"}; std::vector CONTRACTIONS4 = {R"((?i)\b(whad)(dd)(ya)\b)", R"((?i)\b(wha)(t)(cha)\b)"}; }; // Structure to hold precompiled regex patterns struct CompiledRegex { pcre2_code *re{nullptr}; std::string substitution; CompiledRegex(pcre2_code *r, std::string sub) : re(r), substitution(std::move(sub)) { } CompiledRegex(const CompiledRegex &) = delete; CompiledRegex &operator=(const CompiledRegex &) = delete; CompiledRegex(CompiledRegex &&other) noexcept : re(other.re), substitution(std::move(other.substitution)) { other.re = nullptr; } CompiledRegex &operator=(CompiledRegex &&other) noexcept { if (this != &other) { if (re) pcre2_code_free(re); re = other.re; substitution = std::move(other.substitution); other.re = nullptr; } return *this; } ~CompiledRegex() { if (re) { pcre2_code_free(re); } } }; class NLTKWordTokenizer { MacIntyreContractions contractions_; // Static singleton instance static std::unique_ptr instance_; static std::once_flag init_flag_; public: // Static method to get the singleton instance static NLTKWordTokenizer &GetInstance() { std::call_once(init_flag_, []() { instance_ = std::make_unique(); }); return *instance_; } // Starting quotes. std::vector> STARTING_QUOTES = { {std::string(R"(([«“‘„]|[`]+))"), std::string(R"( $1 )")}, {std::string(R"(^\")"), std::string(R"(``)")}, {std::string(R"((``))"), std::string(R"( $1 )")}, {std::string(R"(([ \(\[{<])(\"|\'{2}))"), std::string(R"($1 `` )")}, {std::string(R"((?i)(\')(?!re|ve|ll|m|t|s|d|n)(\w)\b)"), std::string(R"($1 $2)")}}; // Ending quotes. std::vector> ENDING_QUOTES = { {std::string(R"(([»”’]))"), std::string(R"( $1 )")}, {std::string(R"('')"), std::string(R"( '' )")}, {std::string(R"(")"), std::string(R"( '' )")}, {std::string(R"(\s+)"), std::string(R"( )")}, {std::string(R"(([^' ])('[sS]|'[mM]|'[dD]|') )"), std::string(R"($1 $2 )")}, {std::string(R"(([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) )"), std::string(R"($1 $2 )")}}; // Punctuation. std::vector> PUNCTUATION = { {std::string(R"(([^\.])(\.)([\]\)}>"\'»”’ ]*)\s*$)"), std::string(R"($1 $2 $3 )")}, {std::string(R"(([:,])([^\d]))"), std::string(R"( $1 $2)")}, {std::string(R"(([:,])$)"), std::string(R"($1 )")}, {std::string(R"(\.{2,})"), std::string(R"($0 )")}, {std::string(R"([;@#$%&])"), std::string(R"($0 )")}, {std::string(R"(([^\.])(\.)([\]\)}>"\']*)\s*$)"), std::string(R"($1 $2 $3 )")}, {std::string(R"([?!])"), std::string(R"($0 )")}, {std::string(R"(([^'])' )"), std::string(R"($1 ' )")}, {std::string(R"([*])"), std::string(R"($0 )")}}; // Pads parentheses std::pair PARENS_BRACKETS = {std::string(R"([\]\[\(\)\{\}\<\>])"), std::string(R"( $0 )")}; std::vector> CONVERT_PARENTHESES = {{std::string(R"(\()"), std::string("-LRB-")}, {std::string(R"(\))"), std::string("-RRB-")}, {std::string(R"(\[)"), std::string("-LSB-")}, {std::string(R"(\])"), std::string("-RSB-")}, {std::string(R"(\{)"), std::string("-LCB-")}, {std::string(R"(\})"), std::string("-RCB-")}}; std::pair DOUBLE_DASHES = {std::string(R"(--)"), std::string(R"( -- )")}; // Cache for compiled regex patterns std::vector compiled_starting_quotes_; std::vector compiled_ending_quotes_; std::vector compiled_punctuation_; CompiledRegex compiled_parens_brackets_; std::vector compiled_convert_parentheses_; CompiledRegex compiled_double_dashes_; std::vector compiled_contractions2_; std::vector compiled_contractions3_; // Constructor that precompiles all regex patterns NLTKWordTokenizer() : compiled_parens_brackets_(nullptr, ""), compiled_double_dashes_(nullptr, "") { CompileRegexPatterns(); } void Tokenize(const std::string &text, std::vector &tokens, bool convert_parentheses = false) { std::string result = text; for (const auto &compiled : compiled_starting_quotes_) { result = ApplyRegex(result, compiled); } for (const auto &compiled : compiled_punctuation_) { result = ApplyRegex(result, compiled); } // Handles parentheses. result = ApplyRegex(result, compiled_parens_brackets_); // Optionally convert parentheses if (convert_parentheses) { for (const auto &compiled : compiled_convert_parentheses_) { result = ApplyRegex(result, compiled); } } // Handles double dash. result = ApplyRegex(result, compiled_double_dashes_); // Add extra space to make things easier result = " " + result + " "; for (const auto &compiled : compiled_ending_quotes_) { result = ApplyRegex(result, compiled); } for (const auto &compiled : compiled_contractions2_) { result = ApplyRegex(result, compiled); } for (const auto &compiled : compiled_contractions3_) { result = ApplyRegex(result, compiled); } // Split the result into tokens size_t start = 0; size_t end = result.find(' '); while (end != std::string::npos) { if (end != start) { std::string token = result.substr(start, end - start); // Handle underscore tokens properly if (token == "_") { // Single underscore token tokens.push_back("_"); } else if (token.find('_') != std::string::npos) { // Split tokens containing underscores and keep underscores as separate tokens std::stringstream ss(token); std::string sub_token; bool first = true; while (std::getline(ss, sub_token, '_')) { if (!first) { tokens.push_back("_"); } if (!sub_token.empty()) { tokens.push_back(sub_token); } first = false; } // Handle case where token ends with underscore if (token.back() == '_') { tokens.push_back("_"); } } else { tokens.push_back(token); } } start = end + 1; end = result.find(' ', start); } if (start != result.length()) { std::string token = result.substr(start); // Handle underscore tokens properly if (token == "_") { // Single underscore token tokens.push_back("_"); } else if (token.find('_') != std::string::npos) { // Split tokens containing underscores and keep underscores as separate tokens std::stringstream ss(token); std::string sub_token; bool first = true; while (std::getline(ss, sub_token, '_')) { if (!first) { tokens.push_back("_"); } if (!sub_token.empty()) { tokens.push_back(sub_token); } first = false; } // Handle case where token ends with underscore if (token.back() == '_') { tokens.push_back("_"); } } else { tokens.push_back(token); } } } private: void CompileRegexPatterns() { compiled_starting_quotes_.reserve(STARTING_QUOTES.size()); for (const auto &[pattern, substitution] : STARTING_QUOTES) { compiled_starting_quotes_.emplace_back(CompilePattern(pattern), substitution); } compiled_ending_quotes_.reserve(ENDING_QUOTES.size()); for (const auto &[pattern, substitution] : ENDING_QUOTES) { compiled_ending_quotes_.emplace_back(CompilePattern(pattern), substitution); } compiled_punctuation_.reserve(PUNCTUATION.size()); for (const auto &[pattern, substitution] : PUNCTUATION) { compiled_punctuation_.emplace_back(CompilePattern(pattern), substitution); } compiled_parens_brackets_ = CompiledRegex(CompilePattern(PARENS_BRACKETS.first), PARENS_BRACKETS.second); compiled_convert_parentheses_.reserve(CONVERT_PARENTHESES.size()); for (const auto &[pattern, substitution] : CONVERT_PARENTHESES) { compiled_convert_parentheses_.emplace_back(CompilePattern(pattern), substitution); } compiled_double_dashes_ = CompiledRegex(CompilePattern(DOUBLE_DASHES.first), DOUBLE_DASHES.second); compiled_contractions2_.reserve(contractions_.CONTRACTIONS2.size()); for (const auto &pattern : contractions_.CONTRACTIONS2) { compiled_contractions2_.emplace_back(CompilePattern(pattern), R"( $1 $2 )"); } compiled_contractions3_.reserve(contractions_.CONTRACTIONS3.size()); for (const auto &pattern : contractions_.CONTRACTIONS3) { compiled_contractions3_.emplace_back(CompilePattern(pattern), R"( $1 $2 )"); } } pcre2_code *CompilePattern(const std::string &pattern) { int errorcode = 0; PCRE2_SIZE erroffset = 0; pcre2_code *re = pcre2_compile(reinterpret_cast(pattern.c_str()), PCRE2_ZERO_TERMINATED, PCRE2_MULTILINE | PCRE2_UTF, &errorcode, &erroffset, nullptr); if (re == nullptr) { PCRE2_UCHAR buffer[256]; pcre2_get_error_message(errorcode, buffer, sizeof(buffer)); std::cerr << "PCRE2 compilation failed at offset " << erroffset << ": " << buffer << std::endl; return nullptr; } return re; } std::string ApplyRegex(const std::string &text, const CompiledRegex &compiled) { if (compiled.re == nullptr) { return text; } PCRE2_SPTR pcre2_subject = reinterpret_cast(text.c_str()); PCRE2_SPTR pcre2_replacement = reinterpret_cast(compiled.substitution.c_str()); size_t outlength = text.length() * 2 < 1024 ? 1024 : text.length() * 2; auto buffer = std::make_unique(outlength); int rc = pcre2_substitute(compiled.re, pcre2_subject, text.length(), 0, PCRE2_SUBSTITUTE_GLOBAL, nullptr, nullptr, pcre2_replacement, PCRE2_ZERO_TERMINATED, buffer.get(), &outlength); if (rc < 0) { return text; } return std::string(reinterpret_cast(buffer.get()), outlength); } }; // Static member definitions for NLTKWordTokenizer singleton std::unique_ptr NLTKWordTokenizer::instance_ = nullptr; std::once_flag NLTKWordTokenizer::init_flag_; void SentenceSplitter(const std::string &text, std::vector &result) { int error_code; PCRE2_SIZE error_offset; const char *pattern = R"( *[\.\?!]['"\)\]]* *)"; pcre2_code *re = pcre2_compile((PCRE2_SPTR)pattern, PCRE2_ZERO_TERMINATED, PCRE2_MULTILINE | PCRE2_UTF, &error_code, &error_offset, nullptr); if (re == nullptr) { PCRE2_UCHAR buffer[256]; pcre2_get_error_message(error_code, buffer, sizeof(buffer)); std::cerr << "PCRE2 compilation failed at offset " << error_offset << ": " << buffer << std::endl; return; } pcre2_match_data *match_data = pcre2_match_data_create_from_pattern(re, nullptr); PCRE2_SIZE start_offset = 0; while (start_offset < text.size()) { int rc = pcre2_match(re, (PCRE2_SPTR)text.c_str(), text.size(), start_offset, 0, match_data, nullptr); if (rc < 0) { result.push_back(text.substr(start_offset)); break; } PCRE2_SIZE *ovector = pcre2_get_ovector_pointer(match_data); PCRE2_SIZE match_start = ovector[0]; PCRE2_SIZE match_end = ovector[1]; if (match_start > start_offset) { result.push_back(text.substr(start_offset, match_end - start_offset)); } start_offset = match_end; } pcre2_match_data_free(match_data); pcre2_code_free(re); } RAGAnalyzer::RAGAnalyzer(const std::string &path) : dict_path_(path), stemmer_(std::make_unique()) { InitStemmer(STEM_LANG_ENGLISH); } RAGAnalyzer::RAGAnalyzer(const RAGAnalyzer &other) : own_dict_(false), trie_(other.trie_), pos_table_(other.pos_table_), wordnet_lemma_(other.wordnet_lemma_), stemmer_(std::make_unique()), opencc_(other.opencc_), fine_grained_(other.fine_grained_) { InitStemmer(STEM_LANG_ENGLISH); } RAGAnalyzer::~RAGAnalyzer() { if (own_dict_) { delete trie_; delete pos_table_; delete wordnet_lemma_; delete opencc_; } } int32_t RAGAnalyzer::Load() { fs::path root(dict_path_); fs::path dict_path(root / DICT_PATH); if (!fs::exists(dict_path)) { printf("Invalid analyzer file: %s", dict_path.string().c_str()); // return Status::InvalidAnalyzerFile(dict_path); return -1; } fs::path pos_def_path(root / POS_DEF_PATH); if (!fs::exists(pos_def_path)) { printf("Invalid post file: %s", pos_def_path.string().c_str()); // return Status::InvalidAnalyzerFile(pos_def_path); return -1; } own_dict_ = true; trie_ = new DartsTrie(); pos_table_ = new POSTable(pos_def_path.string()); if (pos_table_->Load() != 0) { printf("Fail to load post table: %s", pos_def_path.string().c_str()); return -1; // return Status::InvalidAnalyzerFile("Failed to load RAGAnalyzer POS definition"); } fs::path trie_path(root / TRIE_PATH); if (fs::exists(trie_path)) { trie_->Load(trie_path.string()); } else { // Build trie try { std::ifstream from(dict_path.string()); std::string line; re2::RE2 re_pattern(R"([\r\n]+)"); std::string split_pattern("([ \t])"); while (getline(from, line)) { line = line.substr(0, line.find('\r')); if (line.empty()) continue; line = Replace(re_pattern, "", line); std::vector results; Split(line, split_pattern, results); if (results.size() != 3) throw std::runtime_error("Invalid dictionary format"); int32_t freq = std::stoi(results[1]); freq = int32_t(std::log(float(freq) / DENOMINATOR) + 0.5); int32_t pos_idx = pos_table_->GetPOSIndex(results[2]); int value = Encode(freq, pos_idx); trie_->Add(results[0], value); std::string rkey = RKey(results[0]); trie_->Add(rkey, Encode(1, 0)); } trie_->Build(); } catch (const std::exception &e) { return -1; // return Status::InvalidAnalyzerFile("Failed to load RAGAnalyzer analyzer"); } trie_->Save(trie_path.string()); } fs::path lemma_path(root / WORDNET_PATH); if (!fs::exists(lemma_path)) { printf("Fail to load wordnet: %s", lemma_path.string().c_str()); return -1; // return Status::InvalidAnalyzerFile(lemma_path); } wordnet_lemma_ = new WordNetLemmatizer(lemma_path.string()); fs::path opencc_path(root / OPENCC_PATH); if (!fs::exists(opencc_path)) { printf("Fail to load opencc_path: %s", opencc_path.string().c_str()); return -1; // return Status::InvalidAnalyzerFile(opencc_path); } try { opencc_ = new ::OpenCC(opencc_path.string()); } catch (const std::exception &e) { return -1; // return Status::InvalidAnalyzerFile("Failed to load OpenCC"); } // return Status::OK(); return 0; } void RAGAnalyzer::BuildPositionMapping(const std::string &original, const std::string &converted, std::vector &pos_mapping) { pos_mapping.clear(); pos_mapping.resize(converted.size() + 1); size_t orig_pos = 0; size_t conv_pos = 0; // Map each character position from converted string to original string while (orig_pos < original.size() && conv_pos < converted.size()) { // Get character lengths size_t orig_char_len = UTF8_BYTE_LENGTH_TABLE[static_cast(original[orig_pos])]; size_t conv_char_len = UTF8_BYTE_LENGTH_TABLE[static_cast(converted[conv_pos])]; // Map all bytes of current converted character to current original position for (size_t i = 0; i < conv_char_len && conv_pos + i < pos_mapping.size(); ++i) { pos_mapping[conv_pos + i] = static_cast(orig_pos); } // Move to next character in both strings orig_pos += orig_char_len; conv_pos += conv_char_len; } // Fill any remaining positions for (size_t i = conv_pos; i < pos_mapping.size(); ++i) { pos_mapping[i] = static_cast(original.size()); } } std::string RAGAnalyzer::StrQ2B(const std::string &input) { std::string output; size_t i = 0; while (i < input.size()) { unsigned char c = input[i]; uint32_t codepoint = 0; if (c < 0x80) { codepoint = c; i += 1; } else if ((c & 0xE0) == 0xC0) { codepoint = (c & 0x1F) << 6; codepoint |= (input[i + 1] & 0x3F); i += 2; } else if ((c & 0xF0) == 0xE0) { codepoint = (c & 0x0F) << 12; codepoint |= (input[i + 1] & 0x3F) << 6; codepoint |= (input[i + 2] & 0x3F); i += 3; } else { output += c; i += 1; continue; } if (codepoint >= 0xFF01 && codepoint <= 0xFF5E) { output += static_cast(codepoint - 0xFEE0); } else if (codepoint == 0x3000) { output += ' '; } else { if (codepoint < 0x80) { output += static_cast(codepoint); } else if (codepoint < 0x800) { output += static_cast(0xC0 | (codepoint >> 6)); output += static_cast(0x80 | (codepoint & 0x3F)); } else if (codepoint < 0x10000) { output += static_cast(0xE0 | (codepoint >> 12)); output += static_cast(0x80 | ((codepoint >> 6) & 0x3F)); output += static_cast(0x80 | (codepoint & 0x3F)); } } } return output; } int32_t RAGAnalyzer::Freq(const std::string_view key) const { int32_t v = trie_->Get(key); v = DecodeFreq(v); return static_cast(std::exp(v) * DENOMINATOR + 0.5); } std::string RAGAnalyzer::Tag(std::string_view key) const { std::string lower_key = Key(std::string(key)); int32_t encoded_value = trie_->Get(lower_key); if (encoded_value == -1) { return ""; } int32_t pos_idx = DecodePOSIndex(encoded_value); if (pos_table_ == nullptr) { return ""; } const char* pos_tag = pos_table_->GetPOS(pos_idx); return pos_tag ? std::string(pos_tag) : ""; } std::string RAGAnalyzer::Key(const std::string_view line) { return ToLowerString(line); } std::string RAGAnalyzer::RKey(const std::string_view line) { std::string reversed; reversed.reserve(line.size() + 2); reversed += "DD"; for (size_t i = line.size(); i > 0;) { size_t start = i - 1; while (start > 0 && (line[start] & 0xC0) == 0x80) { --start; } reversed += line.substr(start, i - start); i = start; } ToLower(reversed.data() + 2, reversed.size() - 2); return reversed; } std::pair, double> RAGAnalyzer::Score(const std::vector> &token_freqs) { constexpr int64_t B = 30; int64_t F = 0, L = 0; std::vector tokens; tokens.reserve(token_freqs.size()); for (const auto &[token, freq_tag] : token_freqs) { F += DecodeFreq(freq_tag); L += (UTF8Length(token) < 2) ? 0 : 1; tokens.push_back(token); } const auto score = B / static_cast(tokens.size()) + L / static_cast(tokens.size()) + F; return {std::move(tokens), score}; } void RAGAnalyzer::SortTokens(const std::vector>> &token_list, std::vector, double>> &res) { for (const auto &tfts : token_list) { res.push_back(Score(tfts)); } std::sort(res.begin(), res.end(), [](const auto &a, const auto &b) { return a.second > b.second; }); } std::pair, double> RAGAnalyzer::MaxForward(const std::string &line) const { std::vector> res; std::size_t s = 0; std::size_t len = UTF8Length(line); while (s < len) { std::size_t e = s + 1; std::string t = UTF8Substr(line, s, e - s); while (e < len && trie_->HasKeysWithPrefix(Key(t))) { e += 1; t = UTF8Substr(line, s, e - s); } while (e - 1 > s && trie_->Get(Key(t)) == -1) { e -= 1; t = UTF8Substr(line, s, e - s); } int v = trie_->Get(Key(t)); if (v != -1) { res.emplace_back(std::move(t), v); } else { res.emplace_back(std::move(t), 0); } s = e; } return Score(res); } std::pair, double> RAGAnalyzer::MaxBackward(const std::string &line) const { std::vector> res; int s = UTF8Length(line) - 1; while (s >= 0) { const int e = s + 1; std::string t = UTF8Substr(line, s, e - s); while (s > 0 && trie_->HasKeysWithPrefix(RKey(t))) { s -= 1; t = UTF8Substr(line, s, e - s); } while (s + 1 < e && trie_->Get(Key(t)) == -1) { s += 1; t = UTF8Substr(line, s, e - s); } int v = trie_->Get(Key(t)); if (v != -1) { res.emplace_back(std::move(t), v); } else { res.emplace_back(std::move(t), 0); } s -= 1; } std::reverse(res.begin(), res.end()); return Score(res); } int RAGAnalyzer::DFS(const std::string &chars, const int s, std::vector> &pre_tokens, std::vector>> &token_list, std::vector &best_tokens, double &max_score, const bool memo_all) const { int res = s; const int len = UTF8Length(chars); if (s >= len) { if (memo_all) { token_list.push_back(pre_tokens); } else if (auto [vec_str, current_score] = Score(pre_tokens); current_score > max_score) { best_tokens = std::move(vec_str); max_score = current_score; } return res; } // pruning int S = s + 1; if (s + 2 <= len) { std::string t1 = UTF8Substr(chars, s, 1); std::string t2 = UTF8Substr(chars, s, 2); if (trie_->HasKeysWithPrefix(Key(t1)) && !trie_->HasKeysWithPrefix(Key(t2))) { S = s + 2; } } if (pre_tokens.size() > 2 && UTF8Length(pre_tokens[pre_tokens.size() - 1].first) == 1 && UTF8Length(pre_tokens[pre_tokens.size() - 2].first) == 1 && UTF8Length(pre_tokens[pre_tokens.size() - 3].first) == 1) { std::string t1 = pre_tokens[pre_tokens.size() - 1].first + UTF8Substr(chars, s, 1); if (trie_->HasKeysWithPrefix(Key(t1))) { S = s + 2; } } for (int e = S; e <= len; ++e) { std::string t = UTF8Substr(chars, s, e - s); std::string k = Key(t); if (e > s + 1 && !trie_->HasKeysWithPrefix(k)) { break; } if (const int v = trie_->Get(k); v != -1) { auto pretks = pre_tokens; pretks.emplace_back(std::move(t), v); res = std::max(res, DFS(chars, e, pretks, token_list, best_tokens, max_score, memo_all)); } } if (res > s) { return res; } std::string t = UTF8Substr(chars, s, 1); if (const int v = trie_->Get(Key(t)); v != -1) { pre_tokens.emplace_back(std::move(t), v); } else { pre_tokens.emplace_back(std::move(t), Encode(-12, 0)); } return DFS(chars, s + 1, pre_tokens, token_list, best_tokens, max_score, memo_all); } struct TokensList { const TokensList *prev = nullptr; std::string_view token = {}; }; struct BestTokenCandidate { static constexpr int64_t B = 30; TokensList tl{}; // N: token num // L: num of tokens with length >= 2 // F: sum of freq uint32_t N{}; uint32_t L{}; int64_t F{}; auto k() const { #ifdef DIVIDE_F_BY_N return N; #else return std::make_pair(N, L); #endif } auto v() const { return F; } auto score() const { #ifdef DIVIDE_F_BY_N return static_cast(B + L + F) / N; #else return F + (static_cast(B + L) / N); #endif } BestTokenCandidate update(const std::string_view new_token_sv, const int32_t key_f, const uint32_t add_l) const { return {{&tl, new_token_sv}, N + 1, L + add_l, F + key_f}; } }; struct GrowingBestTokenCandidatesTopN { int32_t top_n{}; std::vector candidates{}; explicit GrowingBestTokenCandidatesTopN(const int32_t top_n) : top_n(top_n) { } void AddBestTokenCandidateTopN(const BestTokenCandidate &add_candidate) { const auto [it_b, it_e] = std::equal_range(candidates.begin(), candidates.end(), add_candidate, [](const auto &a, const auto &b) { return a.k() < b.k(); }); auto target_it = it_b; bool do_replace = false; if (const auto match_cnt = std::distance(it_b, it_e); match_cnt >= top_n) { assert(match_cnt == top_n); const auto it = std::min_element(it_b, it_e, [](const auto &a, const auto &b) { return a.v() < b.v(); }); if (it->v() >= add_candidate.v()) { return; } target_it = it; do_replace = true; } if (do_replace) { *target_it = add_candidate; } else { candidates.insert(target_it, add_candidate); } } }; std::vector, double>> RAGAnalyzer::GetBestTokensTopN(const std::string_view chars, const uint32_t n) const { const auto utf8_len = UTF8Length(chars); std::vector dp_vec(utf8_len + 1, GrowingBestTokenCandidatesTopN(n)); dp_vec[0].candidates.resize(1); const char *current_utf8_ptr = chars.data(); uint32_t current_left_chars = chars.size(); std::string growing_key; // in lower case for (uint32_t i = 0; i < utf8_len; ++i) { const std::string_view current_chars{current_utf8_ptr, current_left_chars}; const uint32_t left_utf8_cnt = utf8_len - i; growing_key.clear(); const char *lookup_until = current_utf8_ptr; uint32_t lookup_left_chars = current_left_chars; std::size_t reuse_node_pos = 0; std::size_t reuse_key_pos = 0; for (uint32_t j = 1; j <= left_utf8_cnt; ++j) { { // handle growing_key const auto next_one_utf8 = UTF8Substrview({lookup_until, lookup_left_chars}, 0, 1); if (next_one_utf8.size() == 1 && next_one_utf8[0] >= 'A' && next_one_utf8[0] <= 'Z') { growing_key.push_back(next_one_utf8[0] - 'A' + 'a'); } else { growing_key.append(next_one_utf8); } lookup_until += next_one_utf8.size(); lookup_left_chars -= next_one_utf8.size(); } auto dp_f = [&dp_vec, i, j, original_sv = std::string_view{current_utf8_ptr, growing_key.size()}]( const int32_t key_f, const uint32_t add_l) { auto &target_dp = dp_vec[i + j]; for (const auto &c : dp_vec[i].candidates) { target_dp.AddBestTokenCandidateTopN(c.update(original_sv, key_f, add_l)); } }; if (const auto traverse_result = trie_->Traverse(growing_key.data(), reuse_node_pos, reuse_key_pos, growing_key.size()); traverse_result >= 0) { // in dictionary const int32_t key_f = DecodeFreq(traverse_result); const auto add_l = static_cast(j >= 2); dp_f(key_f, add_l); } else { // not in dictionary if (j == 1) { // also give a score: -12 dp_f(-12, 0); } if (traverse_result == -2) { // no more results break; } } } // update current_utf8_ptr and current_left_chars const auto forward_cnt = UTF8Substrview(current_chars, 0, 1).size(); current_utf8_ptr += forward_cnt; current_left_chars -= forward_cnt; } std::vector> mid_result; mid_result.reserve(n); for (const auto &c : dp_vec.back().candidates) { const auto new_pair = std::make_pair(&(c.tl), c.score()); if (mid_result.size() < n) { mid_result.push_back(new_pair); } else { assert(mid_result.size() == n); if (new_pair.second > mid_result.back().second) { mid_result.pop_back(); const auto insert_pos = std::lower_bound(mid_result.begin(), mid_result.end(), new_pair, [](const auto &a, const auto &b) { return a.second > b.second; }); mid_result.insert(insert_pos, new_pair); } } } class HelperFunc { uint32_t cnt = 0; std::vector result{}; void GetTokensInner(const TokensList *tl) { if (!tl->prev) { result.reserve(cnt); return; } ++cnt; GetTokensInner(tl->prev); result.push_back(tl->token); } public: std::vector GetTokens(const TokensList *tl) { GetTokensInner(tl); return std::move(result); } }; std::vector, double>> result; result.reserve(mid_result.size()); for (const auto [tl, score] : mid_result) { result.emplace_back(HelperFunc{}.GetTokens(tl), score); } return result; } // TODO: for test // #ifndef INFINITY_DEBUG // #define INFINITY_DEBUG 1 // #endif #ifdef INFINITY_DEBUG namespace dp_debug { template std::string TestPrintTokens(const std::vector &tokens) { std::ostringstream oss; for (std::size_t i = 0; i < tokens.size(); ++i) { oss << (i ? " #" : "#") << tokens[i] << "#"; } return std::move(oss).str(); } auto print_1 = [](const bool b) { return b ? "✅" : "❌"; }; auto print_2 = [](const bool b) { return b ? "equal" : "not equal"; }; void compare_score_and_tokens(const std::vector &dfs_tokens, const double dfs_score, const std::vector &dp_tokens, const double dp_score, const std::string &prefix) { std::ostringstream oss; const auto b_score_eq = dp_score == dfs_score; oss << fmt::format("\n{} {} DFS and DP score {}:\nDFS: {}\nDP : {}\n", print_1(b_score_eq), prefix, print_2(b_score_eq), dfs_score, dp_score); bool vec_equal = true; if (dp_tokens.size() != dfs_tokens.size()) { vec_equal = false; } else { for (std::size_t k = 0; k < dp_tokens.size(); ++k) { if (dp_tokens[k] != dfs_tokens[k]) { vec_equal = false; break; } } } oss << fmt::format("{} {} DFS and DP result {}:\nDFS: {}\nDP : {}\n", print_1(vec_equal), prefix, print_2(vec_equal), TestPrintTokens(dfs_tokens), TestPrintTokens(dp_tokens)); std::cerr << std::move(oss).str() << std::endl; } inline void CheckDP(const RAGAnalyzer *this_ptr, const std::string_view input_str, const std::vector &dfs_tokens, const double dfs_score, const auto t0, const auto t1) { const auto dp_result = this_ptr->GetBestTokensTopN(input_str, 1); const auto t2 = std::chrono::high_resolution_clock::now(); const auto dfs_duration = std::chrono::duration_cast>(t1 - t0); const auto dp_duration = std::chrono::duration_cast>(t2 - t1); const auto dp_faster = dp_duration < dfs_duration; std::cerr << "\n!!! " << print_1(dp_faster) << "\nTOP1 DFS duration: " << dfs_duration << " \nDP duration: " << dp_duration; const auto &[dp_vec, dp_score] = dp_result[0]; compare_score_and_tokens(dfs_tokens, dfs_score, dp_vec, dp_score, "[1 in top1]"); } inline void CheckDP2(const RAGAnalyzer *this_ptr, const std::string_view input_str, auto get_dfs_sorted_tokens, const auto t0, const auto t1) { constexpr int topn = 2; const auto dp_result = this_ptr->GetBestTokensTopN(input_str, topn); const auto t2 = std::chrono::high_resolution_clock::now(); const auto dfs_duration = std::chrono::duration_cast>(t1 - t0); const auto dp_duration = std::chrono::duration_cast>(t2 - t1); const auto dp_faster = dp_duration < dfs_duration; std::cerr << "\n!!! " << print_1(dp_faster) << "\nTOP2 DFS duration: " << dfs_duration << " \nTOP2 DP duration: " << dp_duration; const auto dfs_sorted_tokens = get_dfs_sorted_tokens(); for (int i = 0; i < std::min(topn, (int)dfs_sorted_tokens.size()); ++i) { compare_score_and_tokens(dfs_sorted_tokens[i].first, dfs_sorted_tokens[i].second, dp_result[i].first, dp_result[i].second, std::format("[{} in top{}]", i + 1, topn)); } } } // namespace dp_debug #endif std::string RAGAnalyzer::Merge(const std::string &tks_str) const { std::string tks = tks_str; tks = Replace(replace_space_pattern_, " ", tks); std::vector tokens; Split(tks, blank_pattern_, tokens); std::vector res; std::size_t s = 0; while (true) { if (s >= tokens.size()) break; std::size_t E = s + 1; for (std::size_t e = s + 2; e < std::min(tokens.size() + 1, s + 6); ++e) { std::string tk = Join(tokens, s, e, ""); if (re2::RE2::PartialMatch(tk, regex_split_pattern_)) { if (Freq(tk) > 0) { E = e; } } } res.push_back(Join(tokens, s, E, "")); s = E; } return Join(res, 0, res.size()); } void RAGAnalyzer::MergeWithPosition(const std::vector &tokens, const std::vector> &positions, std::vector &merged_tokens, std::vector> &merged_positions) const { // Filter out empty tokens first (like spaces) to match Merge behavior std::vector filtered_tokens; std::vector> filtered_positions; for (size_t i = 0; i < tokens.size(); ++i) { if (!tokens[i].empty() && tokens[i] != " ") { filtered_tokens.push_back(tokens[i]); filtered_positions.push_back(positions[i]); } } std::vector res; std::size_t s = 0; std::vector> res_positions; while (true) { if (s >= filtered_tokens.size()) break; std::size_t E = s + 1; for (std::size_t e = s + 2; e < std::min(filtered_tokens.size() + 1, s + 6); ++e) { std::string tk = Join(filtered_tokens, s, e, ""); if (re2::RE2::PartialMatch(tk, regex_split_pattern_)) { if (Freq(tk) > 0) { E = e; } } } std::string merged_token = Join(filtered_tokens, s, E, ""); res.push_back(merged_token); unsigned start_pos = filtered_positions[s].first; unsigned end_pos = filtered_positions[E - 1].second; res_positions.emplace_back(start_pos, end_pos); s = E; } merged_tokens = std::move(res); merged_positions = std::move(res_positions); } void RAGAnalyzer::EnglishNormalize(const std::vector &tokens, std::vector &res) const { for (auto &t : tokens) { if (re2::RE2::PartialMatch(t, pattern1_)) { //"[a-zA-Z_-]+$" std::string lemma_term = wordnet_lemma_->Lemmatize(t); std::vector lowercase_buffer(term_string_buffer_limit_); char *lowercase_term = lowercase_buffer.data(); ToLower(lemma_term.c_str(), lemma_term.size(), lowercase_term, term_string_buffer_limit_); std::string stem_term; stemmer_->Stem(lowercase_term, stem_term); res.push_back(stem_term); } else { res.push_back(t); } } } void RAGAnalyzer::SplitByLang(const std::string &line, std::vector> &txt_lang_pairs) const { std::vector arr; Split(line, regex_split_pattern_, arr, true); for (const auto &a : arr) { if (a.empty()) { continue; } std::size_t s = 0; std::size_t e = s + 1; bool zh = IsChinese(UTF8Substr(a, s, 1)); while (e < UTF8Length(a)) { bool _zh = IsChinese(UTF8Substr(a, e, 1)); if (_zh == zh) { e++; continue; } std::string segment = UTF8Substr(a, s, e - s); txt_lang_pairs.emplace_back(segment, zh); s = e; e = s + 1; zh = _zh; } if (s >= UTF8Length(a)) { continue; } std::string segment = UTF8Substr(a, s, e - s); txt_lang_pairs.emplace_back(segment, zh); } } void RAGAnalyzer::TokenizeInner(std::vector &res, const std::string &L) const { auto [tks, s] = MaxForward(L); auto [tks1, s1] = MaxBackward(L); #if 0 std::size_t i = 0, j = 0, _i = 0, _j = 0, same = 0; while ((i + same < tks1.size()) && (j + same < tks.size()) && tks1[i + same] == tks[j + same]) { same++; } if (same > 0) { res.push_back(Join(tks, j, j + same)); } _i = i + same; _j = j + same; j = _j + 1; i = _i + 1; while (i < tks1.size() && j < tks.size()) { std::string tk1 = Join(tks1, _i, i, ""); std::string tk = Join(tks, _j, j, ""); if (tk1 != tk) { if (tk1.length() > tk.length()) { j++; } else { i++; } continue; } if (tks1[i] != tks[j]) { i++; j++; continue; } std::vector> pre_tokens; std::vector>> token_list; std::vector best_tokens; double max_score = std::numeric_limits::lowest(); const auto str_for_dfs = Join(tks, _j, j, ""); #ifdef INFINITY_DEBUG const auto t0 = std::chrono::high_resolution_clock::now(); #endif DFS(str_for_dfs, 0, pre_tokens, token_list, best_tokens, max_score, false); #ifdef INFINITY_DEBUG const auto t1 = std::chrono::high_resolution_clock::now(); dp_debug::CheckDP(this, str_for_dfs, best_tokens, max_score, t0, t1); #endif res.push_back(Join(best_tokens, 0)); same = 1; while (i + same < tks1.size() && j + same < tks.size() && tks1[i + same] == tks[j + same]) same++; res.push_back(Join(tks, j, j + same)); _i = i + same; _j = j + same; j = _j + 1; i = _i + 1; } if (_i < tks1.size()) { std::vector> pre_tokens; std::vector>> token_list; std::vector best_tokens; double max_score = std::numeric_limits::lowest(); const auto str_for_dfs = Join(tks, _j, tks.size(), ""); #ifdef INFINITY_DEBUG const auto t0 = std::chrono::high_resolution_clock::now(); #endif DFS(str_for_dfs, 0, pre_tokens, token_list, best_tokens, max_score, false); #ifdef INFINITY_DEBUG const auto t1 = std::chrono::high_resolution_clock::now(); dp_debug::CheckDP(this, str_for_dfs, best_tokens, max_score, t0, t1); #endif res.push_back(Join(best_tokens, 0)); } #else std::size_t i = 0, j = 0, _i = 0, _j = 0, same = 0; while ((i + same < tks1.size()) && (j + same < tks.size()) && tks1[i + same] == tks[j + same]) { same++; } if (same > 0) { res.push_back(Join(tks, j, j + same)); } _i = i + same; _j = j + same; j = _j + 1; i = _i + 1; while (i < tks1.size() && j < tks.size()) { std::string tk1 = Join(tks1, _i, i, ""); std::string tk = Join(tks, _j, j, ""); if (tk1 != tk) { if (tk1.length() > tk.length()) { j++; } else { i++; } continue; } if (tks1[i] != tks[j]) { i++; j++; continue; } std::vector> pre_tokens; std::vector>> token_list; std::vector best_tokens; double max_score = std::numeric_limits::lowest(); const auto str_for_dfs = Join(tks, _j, j, ""); #ifdef INFINITY_DEBUG const auto t0 = std::chrono::high_resolution_clock::now(); #endif DFS(str_for_dfs, 0, pre_tokens, token_list, best_tokens, max_score, false); #ifdef INFINITY_DEBUG const auto t1 = std::chrono::high_resolution_clock::now(); dp_debug::CheckDP(this, str_for_dfs, best_tokens, max_score, t0, t1); #endif res.push_back(Join(best_tokens, 0)); same = 1; while (i + same < tks1.size() && j + same < tks.size() && tks1[i + same] == tks[j + same]) same++; res.push_back(Join(tks, j, j + same)); _i = i + same; _j = j + same; j = _j + 1; i = _i + 1; } if (_i < tks1.size()) { std::vector> pre_tokens; std::vector>> token_list; std::vector best_tokens; double max_score = std::numeric_limits::lowest(); const auto str_for_dfs = Join(tks, _j, tks.size(), ""); #ifdef INFINITY_DEBUG const auto t0 = std::chrono::high_resolution_clock::now(); #endif DFS(str_for_dfs, 0, pre_tokens, token_list, best_tokens, max_score, false); #ifdef INFINITY_DEBUG const auto t1 = std::chrono::high_resolution_clock::now(); dp_debug::CheckDP(this, str_for_dfs, best_tokens, max_score, t0, t1); #endif res.push_back(Join(best_tokens, 0)); } #endif } void RAGAnalyzer::SplitLongText(const std::string &L, uint32_t length, std::vector &sublines) const { uint32_t slice_count = length / MAX_SENTENCE_LEN + 1; sublines.reserve(slice_count); std::size_t last_sentence_start = 0; std::size_t next_sentence_start = 0; for (unsigned i = 0; i < slice_count; ++i) { next_sentence_start = MAX_SENTENCE_LEN * (i + 1) - 5; if (next_sentence_start + 5 < length) { std::size_t sentence_length = MAX_SENTENCE_LEN * (i + 1) + 5 > length ? length - next_sentence_start : 10; std::string substr = UTF8Substr(L, next_sentence_start, sentence_length); auto [tks, s] = MaxForward(substr); auto [tks1, s1] = MaxBackward(substr); std::vector diff(std::max(tks.size(), tks1.size()), 0); for (std::size_t j = 0; j < std::min(tks.size(), tks1.size()); ++j) { if (tks[j] != tks1[j]) { diff[j] = 1; } } if (s1 > s) { tks = tks1; } std::size_t start = 0; std::size_t forward_same_len = 0; while (start < tks.size() && diff[start] == 0) { forward_same_len += UTF8Length(tks[start]); start++; } if (forward_same_len == 0) { std::size_t end = tks.size() - 1; std::size_t backward_same_len = 0; while (end >= 0 && diff[end] == 0) { backward_same_len += UTF8Length(tks[end]); end--; } next_sentence_start += sentence_length - backward_same_len; } else next_sentence_start += forward_same_len; } else next_sentence_start = length; if (next_sentence_start == last_sentence_start) continue; std::string str = UTF8Substr(L, last_sentence_start, next_sentence_start - last_sentence_start); sublines.push_back(str); last_sentence_start = next_sentence_start; } } // PCRE2-based replacement function to match Python's re.sub behavior // Returns processed string and position mapping from processed to original std::pair>> PCRE2GlobalReplaceWithPosition(const std::string &text, const std::string &pattern, const std::string &replacement) { std::vector> pos_mapping; std::string result; pcre2_code *re; PCRE2_SPTR pcre2_pattern = reinterpret_cast(pattern.c_str()); PCRE2_SPTR pcre2_subject = reinterpret_cast(text.c_str()); // Note: pcre2_replacement is used in the replacement logic below int errorcode; PCRE2_SIZE erroroffset; // Compile the pattern with UTF and UCP flags for Unicode support re = pcre2_compile(pcre2_pattern, PCRE2_ZERO_TERMINATED, PCRE2_UCP | PCRE2_UTF, &errorcode, &erroroffset, nullptr); if (re == nullptr) { PCRE2_UCHAR buffer[256]; pcre2_get_error_message(errorcode, buffer, sizeof(buffer)); std::cerr << "PCRE2 compilation failed at offset " << erroroffset << ": " << buffer << std::endl; return {text, {}}; } pcre2_match_data *match_data = pcre2_match_data_create_from_pattern(re, nullptr); PCRE2_SIZE current_pos = 0; PCRE2_SIZE last_match_end = 0; // Process the string match by match while (current_pos < text.length()) { int rc = pcre2_match(re, pcre2_subject, text.length(), current_pos, 0, match_data, nullptr); if (rc < 0) { // No more matches, copy remaining text if (last_match_end < text.length()) { std::string remaining = text.substr(last_match_end); result += remaining; // Map each character in remaining text for (size_t i = 0; i < remaining.length(); ++i) { pos_mapping.emplace_back(last_match_end + i, last_match_end + i); } } break; } PCRE2_SIZE *ovector = pcre2_get_ovector_pointer(match_data); PCRE2_SIZE match_start = ovector[0]; PCRE2_SIZE match_end = ovector[1]; // Copy text before the match if (last_match_end < match_start) { std::string before_match = text.substr(last_match_end, match_start - last_match_end); result += before_match; // Map each character in before_match for (size_t i = 0; i < before_match.length(); ++i) { pos_mapping.emplace_back(last_match_end + i, last_match_end + i); } } // Add the replacement string result += replacement; // Map each character in replacement to the start of the match for (size_t i = 0; i < replacement.length(); ++i) { pos_mapping.emplace_back(match_start, match_start); } last_match_end = match_end; current_pos = match_end; // If the match was zero-length, move forward one character to avoid infinite loop if (match_start == match_end) { if (current_pos < text.length()) { current_pos++; } else { break; } } } pcre2_match_data_free(match_data); pcre2_code_free(re); return {result, pos_mapping}; } // Original PCRE2GlobalReplace for backward compatibility std::string PCRE2GlobalReplace(const std::string &text, const std::string &pattern, const std::string &replacement) { auto [result, _] = PCRE2GlobalReplaceWithPosition(text, pattern, replacement); return result; } std::string RAGAnalyzer::Tokenize(const std::string &line) const { // Python-style simple tokenization: re.sub(r"\\W+", " ", line) std::string processed_line = PCRE2GlobalReplace(line, R"#(\W+)#", " "); std::string str1 = StrQ2B(processed_line); std::string strline; opencc_->convert(str1, strline); std::vector res; // Use SplitByLang to separate by language std::vector> arr; SplitByLang(strline, arr); for (const auto &[L, lang] : arr) { if (!lang) { // Non-Chinese text: use NLTK tokenizer, lemmatize and stem std::vector term_list; std::vector sentences; SentenceSplitter(L, sentences); for (auto &sentence : sentences) { NLTKWordTokenizer::GetInstance().Tokenize(sentence, term_list); } for (unsigned i = 0; i < term_list.size(); ++i) { std::string t = wordnet_lemma_->Lemmatize(term_list[i]); std::vector lowercase_buffer(term_string_buffer_limit_); char *lowercase_term = lowercase_buffer.data(); ToLower(t.c_str(), t.size(), lowercase_term, term_string_buffer_limit_); std::string stem_term; stemmer_->Stem(lowercase_term, stem_term); res.push_back(stem_term); } continue; } auto length = UTF8Length(L); if (length < 2 || re2::RE2::PartialMatch(L, pattern2_) || re2::RE2::PartialMatch(L, pattern3_)) { //[a-z\\.-]+$ [0-9\\.-]+$ res.push_back(L); continue; } // Chinese processing: use TokenizeInner #if 0 if (length > MAX_SENTENCE_LEN) { std::vector sublines; SplitLongText(L, length, sublines); for (auto &l : sublines) { TokenizeInner(res, l); } } else #endif TokenizeInner(res, L); } // std::vector normalize_res; // EnglishNormalize(res, normalize_res); std::string r = Join(res, 0); std::string ret = Merge(r); return ret; } std::pair, std::vector>> RAGAnalyzer::TokenizeWithPosition(const std::string &line) const { // Python-style simple tokenization: re.sub(r"\W+", " ", line) // Get processed line and position mapping from PCRE2GlobalReplace auto [processed_line, pcre2_pos_mapping] = PCRE2GlobalReplaceWithPosition(line, R"#(\W+)#", " "); std::string str1 = StrQ2B(processed_line); std::string strline; opencc_->convert(str1, strline); std::vector tokens; std::vector> positions; // Build character position mapping from StrQ2B conversion std::vector strq2b_pos_mapping; BuildPositionMapping(processed_line, str1, strq2b_pos_mapping); // Build character position mapping from OpenCC conversion std::vector opencc_pos_mapping; BuildPositionMapping(str1, strline, opencc_pos_mapping); // Combine all position mappings: strline -> str1 -> processed_line -> line std::vector final_pos_mapping; final_pos_mapping.resize(strline.size() + 1); for (size_t i = 0; i < strline.size(); ++i) { if (i < opencc_pos_mapping.size()) { unsigned str1_pos = opencc_pos_mapping[i]; if (str1_pos < strq2b_pos_mapping.size()) { unsigned processed_pos = strq2b_pos_mapping[str1_pos]; if (processed_pos < pcre2_pos_mapping.size()) { final_pos_mapping[i] = pcre2_pos_mapping[processed_pos].first; } else { final_pos_mapping[i] = static_cast(line.size()); } } else { final_pos_mapping[i] = static_cast(line.size()); } } else { final_pos_mapping[i] = static_cast(line.size()); } } // Fill the last position if (strline.size() < final_pos_mapping.size()) { final_pos_mapping[strline.size()] = static_cast(line.size()); } // Use SplitByLang to separate by language std::vector> arr; SplitByLang(strline, arr); unsigned current_pos = 0; for (const auto &[L, lang] : arr) { if (L.empty()) { continue; } std::size_t processed_pos = strline.find(L, current_pos); if (processed_pos == std::string::npos) { continue; } unsigned original_start = current_pos; current_pos = original_start + static_cast(L.size()); if (!lang) { // Non-Chinese text: use NLTK tokenizer, lemmatize and stem std::vector term_list; std::vector sentences; SentenceSplitter(L, sentences); unsigned sentence_start_pos = original_start; for (auto &sentence : sentences) { std::vector sentence_terms; NLTKWordTokenizer::GetInstance().Tokenize(sentence, sentence_terms); unsigned current_search_pos = 0; for (auto &term : sentence_terms) { size_t pos_in_sentence = sentence.find(term, current_search_pos); if (pos_in_sentence != std::string::npos) { unsigned start_pos = sentence_start_pos + static_cast(pos_in_sentence); unsigned end_pos = start_pos + static_cast(term.size()); std::string t = wordnet_lemma_->Lemmatize(term); std::vector lowercase_buffer(term_string_buffer_limit_); char *lowercase_term = lowercase_buffer.data(); ToLower(t.c_str(), t.size(), lowercase_term, term_string_buffer_limit_); std::string stem_term; stemmer_->Stem(lowercase_term, stem_term); tokens.push_back(stem_term); // Map positions back to original string using final_pos_mapping if (start_pos < final_pos_mapping.size()) { positions.emplace_back(final_pos_mapping[start_pos], final_pos_mapping[end_pos]); } else { positions.emplace_back(static_cast(line.size()), static_cast(line.size())); } current_search_pos = pos_in_sentence + term.size(); } } sentence_start_pos += static_cast(sentence.size()); } continue; } auto length = UTF8Length(L); if (length < 2 || re2::RE2::PartialMatch(L, pattern2_) || re2::RE2::PartialMatch(L, pattern3_)) { tokens.push_back(L); // Map positions back to original string using final_pos_mapping unsigned start_pos = original_start; unsigned end_pos = original_start + static_cast(L.size()); if (start_pos < final_pos_mapping.size() && end_pos < final_pos_mapping.size()) { positions.emplace_back(final_pos_mapping[start_pos], final_pos_mapping[end_pos]); } else { positions.emplace_back(static_cast(line.size()), static_cast(line.size())); } continue; } // Chinese processing: use TokenizeInnerWithPosition #if 0 if (length > MAX_SENTENCE_LEN) { std::vector sublines; SplitLongText(L, length, sublines); unsigned subline_start_pos = original_start; for (auto &l : sublines) { TokenizeInnerWithPosition(l, tokens, positions, subline_start_pos, &final_pos_mapping); subline_start_pos += static_cast(l.size()); } } else #endif TokenizeInnerWithPosition(L, tokens, positions, original_start, &final_pos_mapping); } // std::vector normalize_tokens; // std::vector> normalize_positions; // EnglishNormalizeWithPosition(tokens, positions, normalize_tokens, normalize_positions); // Apply MergeWithPosition to match Tokenize behavior std::vector merged_tokens; std::vector> merged_positions; MergeWithPosition(tokens, positions, merged_tokens, merged_positions); tokens = std::move(merged_tokens); positions = std::move(merged_positions); return {std::move(tokens), std::move(positions)}; } unsigned RAGAnalyzer::MapToOriginalPosition(unsigned processed_pos, const std::vector> &mapping) const { for (const auto &[orig, proc] : mapping) { if (proc == processed_pos) { return orig; } } return processed_pos; } static unsigned CalculateTokensLength(const std::vector &tokens, int start, int end) { unsigned total_length = 0; for (int i = start; i < end; ++i) { total_length += static_cast(tokens[i].size()); } return total_length; } void RAGAnalyzer::TokenizeInnerWithPosition(const std::string &L, std::vector &tokens, std::vector> &positions, unsigned base_pos, const std::vector *pos_mapping) const { auto [tks, s] = MaxForward(L); auto [tks1, s1] = MaxBackward(L); // Use the same algorithm as Python version std::size_t i = 0, j = 0, _i = 0, _j = 0, same = 0; while ((i + same < tks1.size()) && (j + same < tks.size()) && tks1[i + same] == tks[j + same]) { same++; } if (same > 0) { std::string token_str = Join(tks, j, j + same); unsigned token_len = static_cast(token_str.size()); unsigned start_pos = base_pos + CalculateTokensLength(tks, 0, j); if (token_str.find(' ') != std::string::npos) { std::vector space_split_tokens; Split(token_str, blank_pattern_, space_split_tokens, false); unsigned space_start_pos = start_pos; for (const auto &space_token : space_split_tokens) { if (space_token.empty()) { continue; } unsigned space_token_len = static_cast(space_token.size()); tokens.push_back(space_token); // Map position back to original string if mapping is provided if (pos_mapping) { unsigned mapped_start = space_start_pos < pos_mapping->size() ? (*pos_mapping)[space_start_pos] : 0; unsigned mapped_end = (space_start_pos + space_token_len) < pos_mapping->size() ? (*pos_mapping)[space_start_pos + space_token_len] : 0; positions.emplace_back(mapped_start, mapped_end); } else { positions.emplace_back(space_start_pos, space_start_pos + space_token_len); } space_start_pos += space_token_len; } } else { tokens.push_back(token_str); // Map position back to original string if mapping is provided if (pos_mapping) { unsigned mapped_start = start_pos < pos_mapping->size() ? (*pos_mapping)[start_pos] : 0; unsigned mapped_end = (start_pos + token_len) < pos_mapping->size() ? (*pos_mapping)[start_pos + token_len] : 0; positions.emplace_back(mapped_start, mapped_end); } else { positions.emplace_back(start_pos, start_pos + token_len); } } } _i = i + same; _j = j + same; j = _j + 1; i = _i + 1; while (i < tks1.size() && j < tks.size()) { std::string tk1 = Join(tks1, _i, i, ""); std::string tk = Join(tks, _j, j, ""); if (tk1 != tk) { if (tk1.length() > tk.length()) { j++; } else { i++; } continue; } if (tks1[i] != tks[j]) { i++; j++; continue; } // Handle different part with DFS std::vector> pre_tokens; std::vector>> token_list; std::vector best_tokens; double max_score = std::numeric_limits::lowest(); const auto str_for_dfs = Join(tks, _j, j, ""); #ifdef INFINITY_DEBUG const auto t0 = std::chrono::high_resolution_clock::now(); #endif DFS(str_for_dfs, 0, pre_tokens, token_list, best_tokens, max_score, false); #ifdef INFINITY_DEBUG const auto t1 = std::chrono::high_resolution_clock::now(); dp_debug::CheckDP(this, str_for_dfs, best_tokens, max_score, t0, t1); #endif std::string best_token_str = Join(best_tokens, 0); unsigned start_pos = base_pos + CalculateTokensLength(tks, 0, _j); std::string original_token_str = Join(tks, _j, j, ""); unsigned end_pos = start_pos + static_cast(original_token_str.size()); if (best_token_str.find(' ') != std::string::npos) { std::vector space_split_tokens; Split(best_token_str, blank_pattern_, space_split_tokens, false); unsigned space_start_pos = start_pos; for (const auto &space_token : space_split_tokens) { if (space_token.empty()) { continue; } unsigned space_token_len = static_cast(space_token.size()); tokens.push_back(space_token); // Map position back to original string if mapping is provided if (pos_mapping) { unsigned mapped_start = space_start_pos < pos_mapping->size() ? (*pos_mapping)[space_start_pos] : 0; unsigned mapped_end = (space_start_pos + space_token_len) < pos_mapping->size() ? (*pos_mapping)[space_start_pos + space_token_len] : 0; positions.emplace_back(mapped_start, mapped_end); } else { positions.emplace_back(space_start_pos, space_start_pos + space_token_len); } space_start_pos += space_token_len; } } else { tokens.push_back(best_token_str); // Map position back to original string if mapping is provided if (pos_mapping) { unsigned mapped_start = start_pos < pos_mapping->size() ? (*pos_mapping)[start_pos] : 0; unsigned mapped_end = end_pos < pos_mapping->size() ? (*pos_mapping)[end_pos] : 0; positions.emplace_back(mapped_start, mapped_end); } else { positions.emplace_back(start_pos, end_pos); } } same = 1; while (i + same < tks1.size() && j + same < tks.size() && tks1[i + same] == tks[j + same]) same++; // Handle same part after different tokens std::string token_str = Join(tks, j, j + same); unsigned token_len = static_cast(token_str.size()); start_pos = base_pos + CalculateTokensLength(tks, 0, j); if (token_str.find(' ') != std::string::npos) { std::vector space_split_tokens; Split(token_str, blank_pattern_, space_split_tokens, false); unsigned space_start_pos = start_pos; for (const auto &space_token : space_split_tokens) { if (space_token.empty()) { continue; } unsigned space_token_len = static_cast(space_token.size()); tokens.push_back(space_token); // Map position back to original string if mapping is provided if (pos_mapping) { unsigned mapped_start = space_start_pos < pos_mapping->size() ? (*pos_mapping)[space_start_pos] : 0; unsigned mapped_end = (space_start_pos + space_token_len) < pos_mapping->size() ? (*pos_mapping)[space_start_pos + space_token_len] : 0; positions.emplace_back(mapped_start, mapped_end); } else { positions.emplace_back(space_start_pos, space_start_pos + space_token_len); } space_start_pos += space_token_len; } } else { tokens.push_back(token_str); // Map position back to original string if mapping is provided if (pos_mapping) { unsigned mapped_start = start_pos < pos_mapping->size() ? (*pos_mapping)[start_pos] : 0; unsigned mapped_end = (start_pos + token_len) < pos_mapping->size() ? (*pos_mapping)[start_pos + token_len] : 0; positions.emplace_back(mapped_start, mapped_end); } else { positions.emplace_back(start_pos, start_pos + token_len); } } _i = i + same; _j = j + same; j = _j + 1; i = _i + 1; } // Handle remaining part if (_i < tks1.size()) { std::vector> pre_tokens; std::vector>> token_list; std::vector best_tokens; double max_score = std::numeric_limits::lowest(); const auto str_for_dfs = Join(tks, _j, tks.size(), ""); #ifdef INFINITY_DEBUG const auto t0 = std::chrono::high_resolution_clock::now(); #endif DFS(str_for_dfs, 0, pre_tokens, token_list, best_tokens, max_score, false); #ifdef INFINITY_DEBUG const auto t1 = std::chrono::high_resolution_clock::now(); dp_debug::CheckDP(this, str_for_dfs, best_tokens, max_score, t0, t1); #endif std::string best_token_str = Join(best_tokens, 0); unsigned start_pos = base_pos + CalculateTokensLength(tks, 0, _j); std::string original_token_str = Join(tks, _j, tks.size(), ""); unsigned end_pos = start_pos + static_cast(original_token_str.size()); if (best_token_str.find(' ') != std::string::npos) { std::vector space_split_tokens; Split(best_token_str, blank_pattern_, space_split_tokens, false); unsigned space_start_pos = start_pos; for (const auto &space_token : space_split_tokens) { if (space_token.empty()) { continue; } unsigned space_token_len = static_cast(space_token.size()); tokens.push_back(space_token); // Map position back to original string if mapping is provided if (pos_mapping) { unsigned mapped_start = space_start_pos < pos_mapping->size() ? (*pos_mapping)[space_start_pos] : 0; unsigned mapped_end = (space_start_pos + space_token_len) < pos_mapping->size() ? (*pos_mapping)[space_start_pos + space_token_len] : 0; positions.emplace_back(mapped_start, mapped_end); } else { positions.emplace_back(space_start_pos, space_start_pos + space_token_len); } space_start_pos += space_token_len; } } else { tokens.push_back(best_token_str); // Map position back to original string if mapping is provided if (pos_mapping) { unsigned mapped_start = start_pos < pos_mapping->size() ? (*pos_mapping)[start_pos] : 0; unsigned mapped_end = end_pos < pos_mapping->size() ? (*pos_mapping)[end_pos] : 0; positions.emplace_back(mapped_start, mapped_end); } else { positions.emplace_back(start_pos, end_pos); } } } } void RAGAnalyzer::EnglishNormalizeWithPosition(const std::vector &tokens, const std::vector> &positions, std::vector &normalize_tokens, std::vector> &normalize_positions) const { for (size_t i = 0; i < tokens.size(); ++i) { const auto &token = tokens[i]; const auto &[start_pos, end_pos] = positions[i]; if (re2::RE2::PartialMatch(token, pattern1_)) { //"[a-zA-Z_-]+$" std::string lemma_term = wordnet_lemma_->Lemmatize(token); std::vector lowercase_buffer(term_string_buffer_limit_); char *lowercase_term = lowercase_buffer.data(); ToLower(lemma_term.c_str(), lemma_term.size(), lowercase_term, term_string_buffer_limit_); std::string stem_term; stemmer_->Stem(lowercase_term, stem_term); normalize_tokens.push_back(stem_term); normalize_positions.emplace_back(start_pos, end_pos); } else { normalize_tokens.push_back(token); normalize_positions.emplace_back(start_pos, end_pos); } } } void RAGAnalyzer::FineGrainedTokenizeWithPosition(const std::string &tokens_str, const std::vector> &positions, std::vector &fine_tokens, std::vector> &fine_positions) const { std::vector tks; Split(tokens_str, blank_pattern_, tks); std::size_t zh_num = 0; for (auto &token : tks) { int len = UTF8Length(token); for (int i = 0; i < len; ++i) { std::string t = UTF8Substr(token, i, 1); if (IsChinese(t)) { zh_num++; } } } if (zh_num < tks.size() * 0.2) { // English text processing - apply normalization std::vector temp_tokens; for (size_t i = 0; i < tks.size(); ++i) { const auto &token = tks[i]; const auto &[start_pos, end_pos] = positions[i]; std::istringstream iss(token); std::string sub_token; unsigned sub_start = start_pos; while (std::getline(iss, sub_token, '/')) { if (!sub_token.empty()) { unsigned sub_end = sub_start + sub_token.size(); fine_tokens.push_back(sub_token); fine_positions.emplace_back(sub_start, sub_end); sub_start = sub_end + 1; } } } // Apply English normalization to get lowercase and stemmed tokens // std::vector> temp_positions = fine_positions; // EnglishNormalizeWithPosition(temp_tokens, temp_positions, fine_tokens, fine_positions); } else { // Chinese or mixed text processing - match FineGrainedTokenize behavior for (size_t i = 0; i < tks.size(); ++i) { const auto &token = tks[i]; const auto &[start_pos, end_pos] = positions[i]; const auto token_len = UTF8Length(token); if (token_len < 3 || re2::RE2::PartialMatch(token, pattern4_)) { fine_tokens.push_back(token); fine_positions.emplace_back(start_pos, end_pos); continue; } std::vector>> token_list; if (token_len > 10) { std::vector> tk; tk.emplace_back(token, Encode(-1, 0)); token_list.push_back(tk); } else { std::vector> pre_tokens; std::vector best_tokens; double max_score = 0.0F; DFS(token, 0, pre_tokens, token_list, best_tokens, max_score, true); } if (token_list.size() < 2) { fine_tokens.push_back(token); fine_positions.emplace_back(start_pos, end_pos); continue; } std::vector, double>> sorted_tokens; SortTokens(token_list, sorted_tokens); const auto &stk = sorted_tokens[1].first; if (stk.size() == token_len) { fine_tokens.push_back(token); fine_positions.emplace_back(start_pos, end_pos); } else if (re2::RE2::PartialMatch(token, pattern5_)) { bool need_append_stk = true; for (auto &t : stk) { if (UTF8Length(t) < 3) { fine_tokens.push_back(token); fine_positions.emplace_back(start_pos, end_pos); need_append_stk = false; break; } } if (need_append_stk) { unsigned sub_pos = start_pos; for (auto &t : stk) { unsigned sub_end = sub_pos + UTF8Length(t); fine_tokens.push_back(t); fine_positions.emplace_back(sub_pos, sub_end); sub_pos = sub_end; } } } else { unsigned sub_pos = start_pos; for (auto &t : stk) { unsigned sub_end = sub_pos + static_cast(t.size()); fine_tokens.push_back(t); fine_positions.emplace_back(sub_pos, sub_end); sub_pos = sub_end; } } } } // Apply English normalization only if needed, similar to FineGrainedTokenize // For Chinese text, no additional normalization needed // fine_tokens already contains the correct Chinese tokens } void RAGAnalyzer::FineGrainedTokenize(const std::string &tokens, std::vector &result) const { std::vector tks; Split(tokens, blank_pattern_, tks); std::vector res; std::size_t zh_num = 0; for (auto &token : tks) { int len = UTF8Length(token); for (int i = 0; i < len; ++i) { std::string t = UTF8Substr(token, i, 1); if (IsChinese(t)) { zh_num++; } } } if (zh_num < tks.size() * 0.2) { for (auto &token : tks) { std::istringstream iss(token); std::string sub_token; while (std::getline(iss, sub_token, '/')) { result.push_back(sub_token); } } // std::string ret = Join(res, 0); return; } for (auto &token : tks) { const auto token_len = UTF8Length(token); if (token_len < 3 || re2::RE2::PartialMatch(token, pattern4_)) { //[0-9,\\.-]+$ res.push_back(token); continue; } std::vector>> token_list; if (token_len > 10) { std::vector> tk; tk.emplace_back(token, Encode(-1, 0)); token_list.push_back(tk); } else { std::vector> pre_tokens; std::vector best_tokens; double max_score = 0.0F; #ifdef INFINITY_DEBUG const auto t0 = std::chrono::high_resolution_clock::now(); #endif DFS(token, 0, pre_tokens, token_list, best_tokens, max_score, true); #ifdef INFINITY_DEBUG const auto t1 = std::chrono::high_resolution_clock::now(); auto get_dfs_sorted_tokens = [&]() { std::vector, double>> sorted_tokens; SortTokens(token_list, sorted_tokens); return sorted_tokens; }; dp_debug::CheckDP2(this, token, get_dfs_sorted_tokens, t0, t1); #endif } if (token_list.size() < 2) { res.push_back(token); continue; } std::vector, double>> sorted_tokens; SortTokens(token_list, sorted_tokens); const auto &stk = sorted_tokens[1].first; if (stk.size() == token_len) { res.push_back(token); } else if (re2::RE2::PartialMatch(token, pattern5_)) { // [a-z\\.-]+ bool need_append_stk = true; for (auto &t : stk) { if (UTF8Length(t) < 3) { res.push_back(token); need_append_stk = false; break; } } if (need_append_stk) { for (auto &t : stk) { res.push_back(t); } } } else { for (auto &t : stk) { res.push_back(t); } } } EnglishNormalize(res, result); // std::string ret = Join(normalize_res, 0); // return ret; } int RAGAnalyzer::AnalyzeImpl(const Term &input, void *data, bool fine_grained, bool enable_position, HookType func) const { if (enable_position) { auto [tokens, positions] = TokenizeWithPosition(input.text_); if (fine_grained) { std::vector fine_tokens; std::vector> fine_positions; FineGrainedTokenizeWithPosition(Join(tokens, 0), positions, fine_tokens, fine_positions); tokens = std::move(fine_tokens); positions = std::move(fine_positions); } for (size_t i = 0; i < tokens.size(); ++i) { if (tokens[i].empty()) continue; const auto &[start_pos, end_pos] = positions[i]; func(data, tokens[i].c_str(), tokens[i].size(), start_pos, end_pos, false, 0); } } else { std::string result = Tokenize(input.text_); std::vector tokens; if (fine_grained) { FineGrainedTokenize(result, tokens); } else { Split(result, blank_pattern_, tokens); } unsigned offset = 0; for (auto &t : tokens) { if (t.empty()) continue; func(data, t.c_str(), t.size(), offset++, 0, false, 0); } } return 0; }