RAGFlow go API server (#13240)

# RAGFlow Go Implementation Plan 🚀 This repository tracks the progress of porting RAGFlow to Go. We'll implement core features and provide performance comparisons between Python and Go versions. ## Implementation Checklist - [x] User Management APIs - [x] Dataset Management Operations - [x] Retrieval Test - [x] Chat Management Operations - [x] Infinity Go SDK --------- Signed-off-by: Jin Hai <haijin.chn@gmail.com> Co-authored-by: Yingfeng Zhang <yingfeng.zhang@gmail.com>
2026-05-21 16:40:07 +08:00 · 2026-03-04 19:17:16 +08:00
parent 2508c46c8f
commit 70e9743ef1
257 changed files with 80490 additions and 6 deletions
--- a/internal/cpp/tokenizer.cpp
+++ b/internal/cpp/tokenizer.cpp
@ -0,0 +1,315 @@
+// Copyright(C) 2023 InfiniFlow, Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "tokenizer.h"
+#include <cstring>
+#include <cstdint>
+
+const CharType ALLOW_CHR = 0; /// < regular term
+const CharType DELIMITER_CHR = 1; /// < delimiter
+const CharType SPACE_CHR = 2; /// < space term
+const CharType UNITE_CHR = 3; /// < united term
+
+CharTypeTable::CharTypeTable(bool use_def_delim) {
+    memset(char_type_table_, 0, BYTE_MAX);
+    // if use_def_delim is set, all the characters are allows
+    if (!use_def_delim)
+        return;
+    // set the lower 4 bit to record default char type
+    for (uint8_t i = 0; i < BYTE_MAX; i++) {
+        if (std::isalnum(i) || i > 127)
+            continue;
+        else if (std::isspace(i))
+            char_type_table_[i] = SPACE_CHR;
+        else
+            char_type_table_[i] = DELIMITER_CHR;
+    }
+}
+
+void CharTypeTable::SetConfig(const TokenizeConfig &conf) {
+    // set the higher 4 bit to record user defined option type
+    std::string str; // why need to copy?
+
+    str = conf.divides_;
+    if (!str.empty()) {
+        for (unsigned int j = 0; j < str.length(); j++) {
+            char_type_table_[(uint8_t)str[j]] = DELIMITER_CHR;
+        }
+    }
+
+    str = conf.unites_;
+    if (!str.empty()) {
+        for (unsigned int j = 0; j < str.length(); j++) {
+            char_type_table_[(uint8_t)str[j]] = UNITE_CHR;
+        }
+    }
+
+    str = conf.allows_;
+    if (!str.empty()) {
+        for (unsigned int j = 0; j < str.length(); j++) {
+            char_type_table_[(uint8_t)str[j]] = ALLOW_CHR;
+        }
+    }
+}
+
+void Tokenizer::SetConfig(const TokenizeConfig &conf) { table_.SetConfig(conf); }
+
+void Tokenizer::Tokenize(const std::string &input) {
+    input_ = (std::string *)&input;
+    input_cursor_ = 0;
+}
+
+bool Tokenizer::NextToken() {
+    while (input_cursor_ < input_->length() && table_.GetType(input_->at(input_cursor_)) == SPACE_CHR) {
+        input_cursor_++;
+    }
+    if (input_cursor_ == input_->length())
+        return false;
+
+    output_buffer_cursor_ = 0;
+
+    if (output_buffer_cursor_ >= output_buffer_size_) {
+        GrowOutputBuffer();
+    }
+    token_start_cursor_ = input_cursor_;
+    output_buffer_[output_buffer_cursor_++] = input_->at(input_cursor_);
+    if (table_.GetType(input_->at(input_cursor_)) == DELIMITER_CHR) {
+        ++input_cursor_;
+        is_delimiter_ = true;
+        return true;
+    } else {
+        ++input_cursor_;
+        is_delimiter_ = false;
+
+        while (input_cursor_ < input_->length()) {
+            CharType cur_type = table_.GetType(input_->at(input_cursor_));
+            if (cur_type == SPACE_CHR || cur_type == DELIMITER_CHR) {
+                return true;
+            } else if (cur_type == ALLOW_CHR) {
+                if (output_buffer_cursor_ >= output_buffer_size_) {
+                    GrowOutputBuffer();
+                }
+                output_buffer_[output_buffer_cursor_++] = input_->at(input_cursor_++);
+            } else {
+                ++input_cursor_;
+            }
+        }
+        return true;
+    }
+}
+
+bool Tokenizer::GrowOutputBuffer() {
+    output_buffer_size_ *= 2;
+    output_buffer_ = std::make_unique<char[]>(output_buffer_size_);
+    return true;
+}
+
+bool Tokenizer::Tokenize(const std::string &input_string, TermList &special_terms, TermList &prim_terms) {
+    special_terms.clear();
+    prim_terms.clear();
+
+    size_t len = input_string.length();
+    if (len == 0)
+        return false;
+
+    Term t;
+    TermList::iterator it;
+
+    unsigned int word_off = 0, char_off = 0;
+
+    char cur_char;
+    CharType cur_type;
+
+    for (char_off = 0; char_off < len;) // char_off++ )   // char_off is always incremented inside
+    {
+        cur_type = table_.GetType(input_string.at(char_off));
+
+        if (cur_type == ALLOW_CHR || cur_type == UNITE_CHR) {
+            it = prim_terms.insert(prim_terms.end(), t);
+
+            do {
+                cur_char = input_string.at(char_off);
+                cur_type = table_.GetType(cur_char);
+
+                if (cur_type == ALLOW_CHR) {
+                    it->text_ += cur_char;
+                } else if (cur_type == SPACE_CHR || cur_type == DELIMITER_CHR) {
+                    break;
+                }
+
+                char_off++;
+            } while (char_off < len);
+
+            if (it->text_.length() == 0) {
+                prim_terms.erase(it);
+                continue;
+                // char_off--;
+            }
+
+            it->word_offset_ = word_off++;
+
+            // char_off--;
+        } else if (cur_type == DELIMITER_CHR) {
+
+            it = special_terms.insert(special_terms.end(), t);
+
+            do {
+                cur_char = input_string.at(char_off);
+                cur_type = table_.GetType(cur_char);
+
+                if (cur_type == DELIMITER_CHR)
+                    it->text_ += cur_char;
+                else
+                    break;
+                char_off++;
+            } while (char_off < len);
+
+            it->word_offset_ = word_off++;
+
+            // char_off--;
+        } else
+            char_off++;
+    }
+
+    return true;
+}
+
+bool Tokenizer::Tokenize(const std::string &input_string, TermList &prim_terms) {
+    prim_terms.clear();
+    size_t len = input_string.length();
+    if (len == 0)
+        return false;
+
+    Term t;
+    TermList::iterator it;
+
+    unsigned int word_off = 0, char_off = 0;
+
+    char cur_char;
+    CharType cur_type;
+
+    for (char_off = 0; char_off < len;) // char_off++ )
+    {
+        cur_type = table_.GetType(input_string.at(char_off));
+
+        if (cur_type == ALLOW_CHR || cur_type == UNITE_CHR) {
+
+            it = prim_terms.insert(prim_terms.end(), t);
+            // it->begin_ = char_off;
+
+            do {
+                cur_char = input_string.at(char_off);
+                cur_type = table_.GetType(cur_char);
+
+                if (cur_type == ALLOW_CHR) {
+                    it->text_ += cur_char;
+                } else if (cur_type == SPACE_CHR || cur_type == DELIMITER_CHR) {
+                    break;
+                }
+
+                char_off++;
+            } while (char_off < len);
+
+            if (it->text_.length() == 0) {
+                prim_terms.erase(it);
+                continue;
+                // char_off--;
+            }
+
+            it->word_offset_ = word_off++;
+
+            // char_off--;
+        } else if (cur_type == DELIMITER_CHR) {
+            if (((char_off + 1) < len) && table_.GetType(input_string.at(char_off + 1)) != DELIMITER_CHR) {
+                word_off++;
+            }
+            char_off++;
+        } else
+            char_off++;
+    }
+
+    return true;
+}
+
+bool Tokenizer::TokenizeWhite(const std::string &input_string, TermList &raw_terms) {
+    raw_terms.clear();
+
+    size_t len = input_string.length();
+    if (len == 0)
+        return false;
+
+    Term t;
+    TermList::iterator it;
+
+    unsigned int word_off = 0, char_off = 0;
+
+    char cur_char;
+    CharType cur_type;
+    // CharType cur_type, preType;
+
+    for (char_off = 0; char_off < len;) // char_off++ )
+    {
+        cur_type = table_.GetType(input_string.at(char_off));
+
+        if (cur_type == ALLOW_CHR || cur_type == UNITE_CHR) {
+            it = raw_terms.insert(raw_terms.end(), t);
+            // it->begin_ = char_off;
+
+            do {
+                cur_char = input_string.at(char_off);
+                cur_type = table_.GetType(cur_char);
+
+                if (cur_type == ALLOW_CHR) {
+                    it->text_ += cur_char;
+                } else if (cur_type == SPACE_CHR || cur_type == DELIMITER_CHR) {
+                    break;
+                }
+
+                char_off++;
+            } while (char_off < len);
+
+            if (it->text_.length() == 0) {
+                raw_terms.erase(it);
+                continue;
+                // char_off--;
+            }
+
+            it->word_offset_ = word_off++;
+
+            // char_off--;
+        } else if (cur_type == DELIMITER_CHR) {
+
+            it = raw_terms.insert(raw_terms.end(), t);
+
+            do {
+                cur_char = input_string.at(char_off);
+                cur_type = table_.GetType(cur_char);
+                if (cur_type == DELIMITER_CHR)
+                    it->text_ += cur_char;
+                else
+                    break;
+                char_off++;
+            } while (char_off < len);
+
+            it->word_offset_ = word_off++;
+
+            // char_off--;
+        } else {
+            // SPACE_CHR  nothing to do
+            char_off++;
+        }
+    }
+
+    return true;
+}