// Copyright(C) 2024 InfiniFlow, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "opencc/openccxx.h"
#include "stemmer/stemmer.h"
#include "term.h"
#include "re2/re2.h"
#include "dart_trie.h"
#include "wordnet_lemmatizer.h"
#include "analyzer.h"

// NOTE(review): the five directives below carry no header name in this copy of
// the file; the names look like they were lost together with every other
// angle-bracket span. Restore them from version control before building.
#include
#include
#include
#include
#include

// C++ reimplementation of
// https://github.com/infiniflow/ragflow/blob/main/rag/nlp/rag_tokenizer.py

// Callback type accepted by RAGAnalyzer::AnalyzeImpl (its `func` parameter).
// It receives an opaque `data` pointer, a `text` pointer with a length `len`,
// two offsets (`offset`, `end_offset`), an `is_special_char` flag and a 16-bit
// `payload`, and returns nothing.
// NOTE(review): when and how often the callback is invoked is decided by the
// out-of-view implementation; presumably once per emitted token -- confirm.
typedef void (*HookType)(void* data,
                         const char* text,
                         const uint32_t len,
                         const uint32_t offset,
                         const uint32_t end_offset,
                         const bool is_special_char,
                         const uint16_t payload);

// Forward declaration only; the type is not referenced anywhere else in the
// part of this header shown here.
class NLTKWordTokenizer;

// Analyzer subclass declared in this header. Only the three small inline
// members below have bodies here; everything else is defined elsewhere.
//
// NOTE(review): throughout this class the template argument lists are missing
// (e.g. `std::pair, std::vector>>`, `std::vector&`, `std::unique_ptr stemmer_`).
// The declarations are kept exactly as found; the element types must be
// restored from the original source before this header can compile.
class RAGAnalyzer : public Analyzer {
public:
    // Constructs from a path string.
    // NOTE(review): presumably the dictionary/resource directory that ends up
    // in dict_path_ -- confirm against the constructor definition.
    explicit RAGAnalyzer(const std::string& path);

    // Copy constructor; user-declared, defined elsewhere.
    RAGAnalyzer(const RAGAnalyzer& other);

    // Destructor; user-declared, defined elsewhere.
    ~RAGAnalyzer();

    // Forwards `language` to stemmer_->Init(). stemmer_ is dereferenced
    // unconditionally, so it must be non-null when this is called.
    void InitStemmer(Language language) { stemmer_->Init(language); }

    // Returns an int32_t status.
    // NOTE(review): presumably loads the dictionary resources; the meaning of
    // the return value is not visible here -- confirm.
    int32_t Load();

    // Stores the flag in fine_grained_.
    void SetFineGrained(bool fine_grained) { fine_grained_ = fine_grained; }

    // Stores the flag in enable_position_.
    void SetEnablePosition(bool enable_position) { enable_position_ = enable_position; }

    // Takes one input line and returns a pair of containers.
    // NOTE(review): presumably (tokens, per-token position pairs); the element
    // types were stripped from this copy -- confirm.
    std::pair, std::vector>> TokenizeWithPosition(const std::string& line) const;

    // Takes one input line and returns a single string.
    // NOTE(review): presumably the tokens joined by a separator -- confirm.
    std::string Tokenize(const std::string& line) const;

    // Reads `tokens` and writes into the `result` out-parameter.
    void FineGrainedTokenize(const std::string& tokens, std::vector& result) const;

    // Position-tracking variant working on string `L`. `tokens` and
    // `positions` are out-parameters; `pos_mapping` is optional and defaults
    // to nullptr.
    // NOTE(review): the units of `base_pos` (bytes vs. characters) are not
    // visible here -- confirm.
    void TokenizeInnerWithPosition(const std::string& L,
                                   std::vector& tokens,
                                   std::vector>& positions,
                                   unsigned base_pos,
                                   const std::vector* pos_mapping = nullptr) const;

    // Reads `tokens_str` and `positions`; writes `fine_tokens` and
    // `fine_positions`.
    void FineGrainedTokenizeWithPosition(const std::string& tokens_str,
                                         const std::vector>& positions,
                                         std::vector& fine_tokens,
                                         std::vector>& fine_positions) const;

    // Reads `tokens` and `positions`; writes `normalize_tokens` and
    // `normalize_positions`.
    void EnglishNormalizeWithPosition(const std::vector& tokens,
                                      const std::vector>& positions,
                                      std::vector& normalize_tokens,
                                      std::vector>& normalize_positions) const;

    // Maps `processed_pos` through `mapping` and returns an unsigned position.
    unsigned MapToOriginalPosition(unsigned processed_pos, const std::vector>& mapping) const;

    // Reads `tokens` and `positions`; writes `merged_tokens` and
    // `merged_positions`.
    void MergeWithPosition(const std::vector& tokens,
                           const std::vector>& positions,
                           std::vector& merged_tokens,
                           std::vector>& merged_positions) const;

    // Reads `line` and writes into `txt_lang_pairs`.
    // NOTE(review): presumably (text segment, language marker) pairs; the pair
    // element types are missing in this copy -- confirm.
    void SplitByLang(const std::string& line, std::vector>& txt_lang_pairs) const;

    // Looks up `key` and returns an int32_t.
    // NOTE(review): value returned for an unknown key is not visible -- confirm.
    int32_t Freq(std::string_view key) const;

    // Looks up `key` and returns a string.
    // NOTE(review): presumably a part-of-speech tag (see pos_table_) -- confirm.
    std::string Tag(std::string_view key) const;

protected:
    // Analysis entry point taking the input Term, an opaque `data` pointer,
    // the two mode flags and a HookType callback; returns an int.
    // NOTE(review): presumably `data` is handed back to `func`; the meaning of
    // the return value is not visible here -- confirm.
    int AnalyzeImpl(const Term& input, void* data, bool fine_grained, bool enable_position, HookType func) const;

private:
    // Float constant 1000000; its use is in the out-of-view implementation.
    static constexpr float DENOMINATOR = 1000000;

    // String-to-string helper.
    // NOTE(review): the name suggests full-width to half-width conversion --
    // confirm.
    static std::string StrQ2B(const std::string& input);

    // Reads `original` and `converted`; writes `pos_mapping`.
    static void BuildPositionMapping(const std::string& original,
                                     const std::string& converted,
                                     std::vector& pos_mapping);

    // Derive a std::string from `line`.
    // NOTE(review): presumably forward / reverse dictionary lookup keys for
    // trie_ -- confirm.
    static std::string Key(std::string_view line);
    static std::string RKey(std::string_view line);

    // Takes `token_freqs` and returns a pair whose second member is a double.
    static std::pair, double> Score(const std::vector>& token_freqs);

    // Reads `token_list` and writes into `res`.
    static void SortTokens(const std::vector>>& token_list, std::vector, double>>& res);

    // Both take a line and return a pair whose second member is a double.
    // NOTE(review): presumably forward / backward maximum-matching
    // segmentation with a score -- confirm.
    std::pair, double> MaxForward(const std::string& line) const;
    std::pair, double> MaxBackward(const std::string& line) const;

    // Search over `chars` starting at index `s`. `pre_tokens`, `token_list`,
    // `best_tokens` and `max_score` are non-const references and can be
    // updated by the call; returns an int.
    // NOTE(review): meaning of the return value and of `memo_all` is not
    // visible here -- confirm.
    int DFS(const std::string& chars,
            int s,
            std::vector>& pre_tokens,
            std::vector>>& token_list,
            std::vector& best_tokens,
            double& max_score,
            bool memo_all) const;

    // Non-position counterpart of TokenizeInnerWithPosition by name and
    // signature; writes into `res`.
    void TokenizeInner(std::vector& res, const std::string& L) const;

    // Reads `L` and `length`; writes into `sublines`.
    void SplitLongText(const std::string& L, uint32_t length, std::vector& sublines) const;

    // String-to-string helper; the result must not be discarded.
    [[nodiscard]] std::string Merge(const std::string& tokens) const;

    // Non-position counterpart of EnglishNormalizeWithPosition by name and
    // signature; writes into `res`.
    void EnglishNormalize(const std::vector& tokens, std::vector& res) const;

public:
    // Takes `chars` and a count `n`; returns a container of pairs whose second
    // member is a double.
    // NOTE(review): presumably at most `n` candidate segmentations ordered by
    // score -- confirm.
    [[nodiscard]] std::vector, double>> GetBestTokensTopN(std::string_view chars, uint32_t n) const;

    // 4096 * 3 (= 12288).
    // NOTE(review): unit (bytes vs. characters) is not visible here -- confirm.
    static const size_t term_string_buffer_limit_ = 4096 * 3;

    std::string dict_path_;

    // Value-initialised to false.
    // NOTE(review): presumably records whether this instance owns the raw
    // pointers below and must release them in the destructor -- confirm.
    bool own_dict_{};

    // Raw pointers, null until assigned by the out-of-view implementation.
    DartsTrie* trie_{nullptr};

    POSTable* pos_table_{nullptr};

    WordNetLemmatizer* wordnet_lemma_{nullptr};

    // Dereferenced by InitStemmer(). The pointee type is missing in this copy.
    std::unique_ptr stemmer_;

    OpenCC* opencc_{nullptr};

    // Mode flags written by SetFineGrained() / SetEnablePosition().
    bool fine_grained_{false};

    bool enable_position_{false};

    // One or more ASCII letters, '_' or '-', followed by end of text.
    static inline re2::RE2 pattern1_{"[a-zA-Z_-]+$"};

    // One or more ASCII letters, '.' or '-', followed by end of text.
    static inline re2::RE2 pattern2_{"[a-zA-Z\\.-]+$"};

    // One or more ASCII digits, '.' or '-', followed by end of text.
    static inline re2::RE2 pattern3_{"[0-9\\.-]+$"};

    // One or more ASCII digits, ',', '.' or '-', followed by end of text.
    static inline re2::RE2 pattern4_{"[0-9,\\.-]+$"};

    // Same character class as pattern2_ but without the end-of-text anchor.
    static inline re2::RE2 pattern5_{"[a-zA-Z\\.-]+"};

    // One capture group with two alternatives: a run of the listed space,
    // ASCII and CJK punctuation characters, or a run of ASCII letters, digits,
    // ',', '.' and '-'.
    static inline re2::RE2 regex_split_pattern_{
        R"#(([ ,\.<>/?;:'\[\]\\`!@#$%^&*\{\}\|_+=《》，。？、；‘’：“”【】~！￥%……（）——-]+|[a-zA-Z0-9,\.-]+))#"
    };

    // Captures a single space character (as the literal appears in this copy).
    static inline re2::RE2 blank_pattern_{"( )"};

    // Captures a run of one or more space characters.
    static inline re2::RE2 replace_space_pattern_{R"#(([ ]+))#"};
};

// Free function declared after the class: reads `text` and writes into
// `result`.
// NOTE(review): presumably appends one entry per sentence; the element type
// of `result` is missing in this copy -- confirm.
void SentenceSplitter(const std::string& text, std::vector& result);