From ac542da5053bfec48918bda211415c3c063a21f4 Mon Sep 17 00:00:00 2001 From: qinling0210 <88864212+qinling0210@users.noreply.github.com> Date: Mon, 23 Mar 2026 15:40:35 +0800 Subject: [PATCH] Fix tokenizer in cpp (#13735) ### What problem does this PR solve? The tokenizer in Infinity was modified in https://github.com/infiniflow/infinity/pull/3330; sync that code change to the cpp files in ragflow ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --- internal/cpp/rag_analyzer.cpp | 50 ++++++++++++++--------------- internal/cpp/rag_analyzer.h | 4 ++- internal/cpp/wordnet_lemmatizer.cpp | 44 ++++++++++++++----------- 3 files changed, 52 insertions(+), 46 deletions(-) diff --git a/internal/cpp/rag_analyzer.cpp b/internal/cpp/rag_analyzer.cpp index 9584b2c06..c52ab5745 100644 --- a/internal/cpp/rag_analyzer.cpp +++ b/internal/cpp/rag_analyzer.cpp @@ -659,13 +659,13 @@ void SentenceSplitter(const std::string &text, std::vector &result) } RAGAnalyzer::RAGAnalyzer(const std::string &path) - : dict_path_(path), stemmer_(std::make_unique()) { + : dict_path_(path), stemmer_(std::make_unique()), lowercase_string_buffer_(term_string_buffer_limit_) { InitStemmer(STEM_LANG_ENGLISH); } RAGAnalyzer::RAGAnalyzer(const RAGAnalyzer &other) : own_dict_(false), trie_(other.trie_), pos_table_(other.pos_table_), wordnet_lemma_(other.wordnet_lemma_), stemmer_(std::make_unique()), - opencc_(other.opencc_), fine_grained_(other.fine_grained_) { + opencc_(other.opencc_), lowercase_string_buffer_(term_string_buffer_limit_), fine_grained_(other.fine_grained_) { InitStemmer(STEM_LANG_ENGLISH); } @@ -1381,14 +1381,13 @@ void RAGAnalyzer::MergeWithPosition(const std::vector &tokens, void RAGAnalyzer::EnglishNormalize(const std::vector &tokens, std::vector &res) const { for (auto &t : tokens) { - if (re2::RE2::PartialMatch(t, pattern1_)) { - //"[a-zA-Z_-]+$" - std::string lemma_term = wordnet_lemma_->Lemmatize(t); - std::vector lowercase_buffer(term_string_buffer_limit_); - char 
*lowercase_term = lowercase_buffer.data(); - ToLower(lemma_term.c_str(), lemma_term.size(), lowercase_term, term_string_buffer_limit_); + if (re2::RE2::PartialMatch(t, pattern1_)) { //"[a-zA-Z_-]+$" + // Apply lowercase before lemmatization to match Python NLTK behavior + char *lowercase_term = lowercase_string_buffer_.data(); + ToLower(t.c_str(), t.size(), lowercase_term, term_string_buffer_limit_); + std::string lemma_term = wordnet_lemma_->Lemmatize(lowercase_term); std::string stem_term; - stemmer_->Stem(lowercase_term, stem_term); + stemmer_->Stem(lemma_term, stem_term); res.push_back(stem_term); } else { res.push_back(t); @@ -1745,12 +1744,12 @@ std::string RAGAnalyzer::Tokenize(const std::string &line) const { NLTKWordTokenizer::GetInstance().Tokenize(sentence, term_list); } for (unsigned i = 0; i < term_list.size(); ++i) { - std::string t = wordnet_lemma_->Lemmatize(term_list[i]); - std::vector lowercase_buffer(term_string_buffer_limit_); - char *lowercase_term = lowercase_buffer.data(); - ToLower(t.c_str(), t.size(), lowercase_term, term_string_buffer_limit_); + // Apply lowercase before lemmatization to match Python NLTK behavior + char *lowercase_term = lowercase_string_buffer_.data(); + ToLower(term_list[i].c_str(), term_list[i].size(), lowercase_term, term_string_buffer_limit_); + std::string lemma_term = wordnet_lemma_->Lemmatize(lowercase_term); std::string stem_term; - stemmer_->Stem(lowercase_term, stem_term); + stemmer_->Stem(lemma_term, stem_term); res.push_back(stem_term); } continue; @@ -1863,12 +1862,12 @@ std::pair, std::vector>> if (pos_in_sentence != std::string::npos) { unsigned start_pos = sentence_start_pos + static_cast(pos_in_sentence); unsigned end_pos = start_pos + static_cast(term.size()); - std::string t = wordnet_lemma_->Lemmatize(term); - std::vector lowercase_buffer(term_string_buffer_limit_); - char *lowercase_term = lowercase_buffer.data(); - ToLower(t.c_str(), t.size(), lowercase_term, term_string_buffer_limit_); + // Apply 
lowercase before lemmatization to match Python NLTK behavior + char *lowercase_term = lowercase_string_buffer_.data(); + ToLower(term.c_str(), term.size(), lowercase_term, term_string_buffer_limit_); + std::string lemma_term = wordnet_lemma_->Lemmatize(lowercase_term); std::string stem_term; - stemmer_->Stem(lowercase_term, stem_term); + stemmer_->Stem(lemma_term, stem_term); tokens.push_back(stem_term); @@ -2187,14 +2186,13 @@ void RAGAnalyzer::EnglishNormalizeWithPosition(const std::vector &t const auto &token = tokens[i]; const auto &[start_pos, end_pos] = positions[i]; - if (re2::RE2::PartialMatch(token, pattern1_)) { - //"[a-zA-Z_-]+$" - std::string lemma_term = wordnet_lemma_->Lemmatize(token); - std::vector lowercase_buffer(term_string_buffer_limit_); - char *lowercase_term = lowercase_buffer.data(); - ToLower(lemma_term.c_str(), lemma_term.size(), lowercase_term, term_string_buffer_limit_); + if (re2::RE2::PartialMatch(token, pattern1_)) { //"[a-zA-Z_-]+$" + // Apply lowercase before lemmatization to match Python NLTK behavior + char *lowercase_term = lowercase_string_buffer_.data(); + ToLower(token.c_str(), token.size(), lowercase_term, term_string_buffer_limit_); + std::string lemma_term = wordnet_lemma_->Lemmatize(lowercase_term); std::string stem_term; - stemmer_->Stem(lowercase_term, stem_term); + stemmer_->Stem(lemma_term, stem_term); normalize_tokens.push_back(stem_term); normalize_positions.emplace_back(start_pos, end_pos); diff --git a/internal/cpp/rag_analyzer.h b/internal/cpp/rag_analyzer.h index 78a75d713..9b3027ef9 100644 --- a/internal/cpp/rag_analyzer.h +++ b/internal/cpp/rag_analyzer.h @@ -136,7 +136,7 @@ public: [[nodiscard]] std::vector, double>> GetBestTokensTopN( std::string_view chars, uint32_t n) const; - static const size_t term_string_buffer_limit_ = 4096 * 3; + static constexpr size_t term_string_buffer_limit_ = 4096 * 3; std::string dict_path_; @@ -152,6 +152,8 @@ public: OpenCC* opencc_{nullptr}; + mutable std::vector 
lowercase_string_buffer_; + bool fine_grained_{false}; bool enable_position_{false}; diff --git a/internal/cpp/wordnet_lemmatizer.cpp b/internal/cpp/wordnet_lemmatizer.cpp index 673a008a0..d267beeba 100644 --- a/internal/cpp/wordnet_lemmatizer.cpp +++ b/internal/cpp/wordnet_lemmatizer.cpp @@ -182,28 +182,14 @@ std::vector WordNetLemmatizer::Morphy(const std::string &form, cons return FilterForms(forms, pos); } - // Apply morphological rules with recursion (like Java version) + // Apply morphological rules (only ONE level, not recursive like Java) + // This matches Python NLTK WordNet behavior std::vector forms = CollectSubstitutions(form, pos); std::vector combined_forms = forms; combined_forms.push_back(form); - // First attempt with original form and first-level substitutions auto results = FilterForms(combined_forms, pos); - if (!results.empty()) { - return results; - } - - // Recursively apply rules (Java version's while loop) - while (!forms.empty()) { - forms = CollectSubstitutions(forms, pos); - results = FilterForms(forms, pos); - if (!results.empty()) { - return results; - } - } - - // Return empty result if no valid lemma found - return {}; + return results; } std::string WordNetLemmatizer::Lemmatize(const std::string &form, const std::string &pos) { @@ -211,13 +197,33 @@ std::string WordNetLemmatizer::Lemmatize(const std::string &form, const std::str if (!pos.empty()) { parts_of_speech.push_back(pos); } else { - parts_of_speech = POS_LIST; + // Use only NOUN to match Python NLTK default behavior + parts_of_speech = {NOUN}; } for (const auto &part : parts_of_speech) { auto analyses = Morphy(form, part); if (!analyses.empty()) { - return analyses[0]; + // Python NLTK returns the SHORTEST lemma: min(lemmas, key=len) + // For "as" -> ["as", "a"] -> returns "a" + // For "data" -> ["data", "datum"] -> returns "data" + // For "men" -> ["men", "man"] -> returns "men" (original form preferred when same length) + std::string shortest = analyses[0]; + for (const 
auto &analysis : analyses) { + if (analysis.length() < shortest.length()) { + shortest = analysis; + } + } + // If original form is in the results and has same length as shortest, prefer original form + if (shortest != form) { + for (const auto &analysis : analyses) { + if (analysis == form && analysis.length() == shortest.length()) { + shortest = analysis; + break; + } + } + } + return shortest; } }