mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-05-03 00:37:48 +08:00
Fix tokenizer in cpp (#13735)
### What problem does this PR solve? The tokenizer in Infinity was modified in https://github.com/infiniflow/infinity/pull/3330; this PR syncs that code change to the cpp files in ragflow. ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
@ -659,13 +659,13 @@ void SentenceSplitter(const std::string &text, std::vector<std::string> &result)
|
||||
}
|
||||
|
||||
RAGAnalyzer::RAGAnalyzer(const std::string &path)
|
||||
: dict_path_(path), stemmer_(std::make_unique<Stemmer>()) {
|
||||
: dict_path_(path), stemmer_(std::make_unique<Stemmer>()), lowercase_string_buffer_(term_string_buffer_limit_) {
|
||||
InitStemmer(STEM_LANG_ENGLISH);
|
||||
}
|
||||
|
||||
RAGAnalyzer::RAGAnalyzer(const RAGAnalyzer &other)
|
||||
: own_dict_(false), trie_(other.trie_), pos_table_(other.pos_table_), wordnet_lemma_(other.wordnet_lemma_), stemmer_(std::make_unique<Stemmer>()),
|
||||
opencc_(other.opencc_), fine_grained_(other.fine_grained_) {
|
||||
opencc_(other.opencc_), lowercase_string_buffer_(term_string_buffer_limit_), fine_grained_(other.fine_grained_) {
|
||||
InitStemmer(STEM_LANG_ENGLISH);
|
||||
}
|
||||
|
||||
@ -1381,14 +1381,13 @@ void RAGAnalyzer::MergeWithPosition(const std::vector<std::string> &tokens,
|
||||
|
||||
void RAGAnalyzer::EnglishNormalize(const std::vector<std::string> &tokens, std::vector<std::string> &res) const {
|
||||
for (auto &t : tokens) {
|
||||
if (re2::RE2::PartialMatch(t, pattern1_)) {
|
||||
//"[a-zA-Z_-]+$"
|
||||
std::string lemma_term = wordnet_lemma_->Lemmatize(t);
|
||||
std::vector<char> lowercase_buffer(term_string_buffer_limit_);
|
||||
char *lowercase_term = lowercase_buffer.data();
|
||||
ToLower(lemma_term.c_str(), lemma_term.size(), lowercase_term, term_string_buffer_limit_);
|
||||
if (re2::RE2::PartialMatch(t, pattern1_)) { //"[a-zA-Z_-]+$"
|
||||
// Apply lowercase before lemmatization to match Python NLTK behavior
|
||||
char *lowercase_term = lowercase_string_buffer_.data();
|
||||
ToLower(t.c_str(), t.size(), lowercase_term, term_string_buffer_limit_);
|
||||
std::string lemma_term = wordnet_lemma_->Lemmatize(lowercase_term);
|
||||
std::string stem_term;
|
||||
stemmer_->Stem(lowercase_term, stem_term);
|
||||
stemmer_->Stem(lemma_term, stem_term);
|
||||
res.push_back(stem_term);
|
||||
} else {
|
||||
res.push_back(t);
|
||||
@ -1745,12 +1744,12 @@ std::string RAGAnalyzer::Tokenize(const std::string &line) const {
|
||||
NLTKWordTokenizer::GetInstance().Tokenize(sentence, term_list);
|
||||
}
|
||||
for (unsigned i = 0; i < term_list.size(); ++i) {
|
||||
std::string t = wordnet_lemma_->Lemmatize(term_list[i]);
|
||||
std::vector<char> lowercase_buffer(term_string_buffer_limit_);
|
||||
char *lowercase_term = lowercase_buffer.data();
|
||||
ToLower(t.c_str(), t.size(), lowercase_term, term_string_buffer_limit_);
|
||||
// Apply lowercase before lemmatization to match Python NLTK behavior
|
||||
char *lowercase_term = lowercase_string_buffer_.data();
|
||||
ToLower(term_list[i].c_str(), term_list[i].size(), lowercase_term, term_string_buffer_limit_);
|
||||
std::string lemma_term = wordnet_lemma_->Lemmatize(lowercase_term);
|
||||
std::string stem_term;
|
||||
stemmer_->Stem(lowercase_term, stem_term);
|
||||
stemmer_->Stem(lemma_term, stem_term);
|
||||
res.push_back(stem_term);
|
||||
}
|
||||
continue;
|
||||
@ -1863,12 +1862,12 @@ std::pair<std::vector<std::string>, std::vector<std::pair<unsigned, unsigned>>>
|
||||
if (pos_in_sentence != std::string::npos) {
|
||||
unsigned start_pos = sentence_start_pos + static_cast<unsigned>(pos_in_sentence);
|
||||
unsigned end_pos = start_pos + static_cast<unsigned>(term.size());
|
||||
std::string t = wordnet_lemma_->Lemmatize(term);
|
||||
std::vector<char> lowercase_buffer(term_string_buffer_limit_);
|
||||
char *lowercase_term = lowercase_buffer.data();
|
||||
ToLower(t.c_str(), t.size(), lowercase_term, term_string_buffer_limit_);
|
||||
// Apply lowercase before lemmatization to match Python NLTK behavior
|
||||
char *lowercase_term = lowercase_string_buffer_.data();
|
||||
ToLower(term.c_str(), term.size(), lowercase_term, term_string_buffer_limit_);
|
||||
std::string lemma_term = wordnet_lemma_->Lemmatize(lowercase_term);
|
||||
std::string stem_term;
|
||||
stemmer_->Stem(lowercase_term, stem_term);
|
||||
stemmer_->Stem(lemma_term, stem_term);
|
||||
|
||||
tokens.push_back(stem_term);
|
||||
|
||||
@ -2187,14 +2186,13 @@ void RAGAnalyzer::EnglishNormalizeWithPosition(const std::vector<std::string> &t
|
||||
const auto &token = tokens[i];
|
||||
const auto &[start_pos, end_pos] = positions[i];
|
||||
|
||||
if (re2::RE2::PartialMatch(token, pattern1_)) {
|
||||
//"[a-zA-Z_-]+$"
|
||||
std::string lemma_term = wordnet_lemma_->Lemmatize(token);
|
||||
std::vector<char> lowercase_buffer(term_string_buffer_limit_);
|
||||
char *lowercase_term = lowercase_buffer.data();
|
||||
ToLower(lemma_term.c_str(), lemma_term.size(), lowercase_term, term_string_buffer_limit_);
|
||||
if (re2::RE2::PartialMatch(token, pattern1_)) { //"[a-zA-Z_-]+$"
|
||||
// Apply lowercase before lemmatization to match Python NLTK behavior
|
||||
char *lowercase_term = lowercase_string_buffer_.data();
|
||||
ToLower(token.c_str(), token.size(), lowercase_term, term_string_buffer_limit_);
|
||||
std::string lemma_term = wordnet_lemma_->Lemmatize(lowercase_term);
|
||||
std::string stem_term;
|
||||
stemmer_->Stem(lowercase_term, stem_term);
|
||||
stemmer_->Stem(lemma_term, stem_term);
|
||||
|
||||
normalize_tokens.push_back(stem_term);
|
||||
normalize_positions.emplace_back(start_pos, end_pos);
|
||||
|
||||
@ -136,7 +136,7 @@ public:
|
||||
[[nodiscard]] std::vector<std::pair<std::vector<std::string_view>, double>> GetBestTokensTopN(
|
||||
std::string_view chars, uint32_t n) const;
|
||||
|
||||
static const size_t term_string_buffer_limit_ = 4096 * 3;
|
||||
static constexpr size_t term_string_buffer_limit_ = 4096 * 3;
|
||||
|
||||
std::string dict_path_;
|
||||
|
||||
@ -152,6 +152,8 @@ public:
|
||||
|
||||
OpenCC* opencc_{nullptr};
|
||||
|
||||
mutable std::vector<char> lowercase_string_buffer_;
|
||||
|
||||
bool fine_grained_{false};
|
||||
|
||||
bool enable_position_{false};
|
||||
|
||||
@ -182,28 +182,14 @@ std::vector<std::string> WordNetLemmatizer::Morphy(const std::string &form, cons
|
||||
return FilterForms(forms, pos);
|
||||
}
|
||||
|
||||
// Apply morphological rules with recursion (like Java version)
|
||||
// Apply morphological rules (only ONE level, not recursive like Java)
|
||||
// This matches Python NLTK WordNet behavior
|
||||
std::vector<std::string> forms = CollectSubstitutions(form, pos);
|
||||
std::vector<std::string> combined_forms = forms;
|
||||
combined_forms.push_back(form);
|
||||
|
||||
// First attempt with original form and first-level substitutions
|
||||
auto results = FilterForms(combined_forms, pos);
|
||||
if (!results.empty()) {
|
||||
return results;
|
||||
}
|
||||
|
||||
// Recursively apply rules (Java version's while loop)
|
||||
while (!forms.empty()) {
|
||||
forms = CollectSubstitutions(forms, pos);
|
||||
results = FilterForms(forms, pos);
|
||||
if (!results.empty()) {
|
||||
return results;
|
||||
}
|
||||
}
|
||||
|
||||
// Return empty result if no valid lemma found
|
||||
return {};
|
||||
return results;
|
||||
}
|
||||
|
||||
std::string WordNetLemmatizer::Lemmatize(const std::string &form, const std::string &pos) {
|
||||
@ -211,13 +197,33 @@ std::string WordNetLemmatizer::Lemmatize(const std::string &form, const std::str
|
||||
if (!pos.empty()) {
|
||||
parts_of_speech.push_back(pos);
|
||||
} else {
|
||||
parts_of_speech = POS_LIST;
|
||||
// Use only NOUN to match Python NLTK default behavior
|
||||
parts_of_speech = {NOUN};
|
||||
}
|
||||
|
||||
for (const auto &part : parts_of_speech) {
|
||||
auto analyses = Morphy(form, part);
|
||||
if (!analyses.empty()) {
|
||||
return analyses[0];
|
||||
// Python NLTK returns the SHORTEST lemma: min(lemmas, key=len)
|
||||
// For "as" -> ["as", "a"] -> returns "a"
|
||||
// For "data" -> ["data", "datum"] -> returns "data"
|
||||
// For "men" -> ["men", "man"] -> returns "men" (original form preferred when same length)
|
||||
std::string shortest = analyses[0];
|
||||
for (const auto &analysis : analyses) {
|
||||
if (analysis.length() < shortest.length()) {
|
||||
shortest = analysis;
|
||||
}
|
||||
}
|
||||
// If original form is in the results and has same length as shortest, prefer original form
|
||||
if (shortest != form) {
|
||||
for (const auto &analysis : analyses) {
|
||||
if (analysis == form && analysis.length() == shortest.length()) {
|
||||
shortest = analysis;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
return shortest;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user