RAGFlow Go API server (#13240)

# RAGFlow Go Implementation Plan 🚀

This repository tracks the progress of porting RAGFlow to Go. We'll
implement core features and provide performance comparisons between
Python and Go versions.

## Implementation Checklist

- [x] User Management APIs
- [x] Dataset Management Operations
- [x] Retrieval Test
- [x] Chat Management Operations
- [x] Infinity Go SDK

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Yingfeng Zhang <yingfeng.zhang@gmail.com>
This commit is contained in:
Jin Hai
2026-03-04 19:17:16 +08:00
committed by GitHub
parent 2508c46c8f
commit 70e9743ef1
257 changed files with 80490 additions and 6 deletions

315
internal/cpp/tokenizer.cpp Normal file
View File

@ -0,0 +1,315 @@
// Copyright(C) 2023 InfiniFlow, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "tokenizer.h"

#include <cctype>
#include <cstdint>
#include <cstring>
// Character classes stored per byte in CharTypeTable and compared against by the tokenizer.
const CharType ALLOW_CHR = 0; ///< regular term character; also the value a zero-filled table holds
const CharType DELIMITER_CHR = 1; ///< delimiter character
const CharType SPACE_CHR = 2; ///< whitespace character
const CharType UNITE_CHR = 3; ///< united character: skipped inside a term without ending it
/// Builds the per-byte character classification table.
///
/// @param use_def_delim  When false, the table is only zero-filled, i.e. every
///                       byte is classified as ALLOW_CHR. When true, the default
///                       classes are applied to bytes 0..127: alphanumerics stay
///                       ALLOW_CHR, whitespace becomes SPACE_CHR and everything
///                       else becomes DELIMITER_CHR. Bytes above 127 always stay
///                       ALLOW_CHR.
CharTypeTable::CharTypeTable(bool use_def_delim) {
    // Zero == ALLOW_CHR. Size the fill by element so it stays correct even if
    // CharType is ever wider than one byte.
    std::memset(char_type_table_, 0, BYTE_MAX * sizeof(char_type_table_[0]));

    // Without the default delimiters every character is allowed.
    if (!use_def_delim)
        return;

    // The counter is deliberately wider than uint8_t: an 8-bit counter can never
    // reach 256, so the loop would not terminate if BYTE_MAX were 256.
    for (unsigned int i = 0; i < BYTE_MAX; ++i) {
        // Check the range first so the <cctype> classifiers only see 0..127.
        if (i > 127 || std::isalnum(static_cast<int>(i)))
            continue;
        char_type_table_[i] = std::isspace(static_cast<int>(i)) ? SPACE_CHR : DELIMITER_CHR;
    }
}
void CharTypeTable::SetConfig(const TokenizeConfig &conf) {
// set the higher 4 bit to record user defined option type
std::string str; // why need to copy?
str = conf.divides_;
if (!str.empty()) {
for (unsigned int j = 0; j < str.length(); j++) {
char_type_table_[(uint8_t)str[j]] = DELIMITER_CHR;
}
}
str = conf.unites_;
if (!str.empty()) {
for (unsigned int j = 0; j < str.length(); j++) {
char_type_table_[(uint8_t)str[j]] = UNITE_CHR;
}
}
str = conf.allows_;
if (!str.empty()) {
for (unsigned int j = 0; j < str.length(); j++) {
char_type_table_[(uint8_t)str[j]] = ALLOW_CHR;
}
}
}
/// Forwards the tokenizer configuration to the character type table.
///
/// @param conf  Configuration passed unchanged to CharTypeTable::SetConfig.
void Tokenizer::SetConfig(const TokenizeConfig &conf) {
    table_.SetConfig(conf);
}
/// Starts streaming tokenization of @p input; tokens are then pulled with NextToken().
///
/// Only the address of @p input is stored, not a copy, and NextToken() reads
/// through that pointer. The referenced string must therefore stay alive and
/// unmodified for as long as NextToken() is being called.
///
/// @param input  Text to tokenize.
void Tokenizer::Tokenize(const std::string &input) {
    input_cursor_ = 0;
    // The const qualifier is stripped only to fit the member's type.
    input_ = const_cast<std::string *>(&input);
}
bool Tokenizer::NextToken() {
while (input_cursor_ < input_->length() && table_.GetType(input_->at(input_cursor_)) == SPACE_CHR) {
input_cursor_++;
}
if (input_cursor_ == input_->length())
return false;
output_buffer_cursor_ = 0;
if (output_buffer_cursor_ >= output_buffer_size_) {
GrowOutputBuffer();
}
token_start_cursor_ = input_cursor_;
output_buffer_[output_buffer_cursor_++] = input_->at(input_cursor_);
if (table_.GetType(input_->at(input_cursor_)) == DELIMITER_CHR) {
++input_cursor_;
is_delimiter_ = true;
return true;
} else {
++input_cursor_;
is_delimiter_ = false;
while (input_cursor_ < input_->length()) {
CharType cur_type = table_.GetType(input_->at(input_cursor_));
if (cur_type == SPACE_CHR || cur_type == DELIMITER_CHR) {
return true;
} else if (cur_type == ALLOW_CHR) {
if (output_buffer_cursor_ >= output_buffer_size_) {
GrowOutputBuffer();
}
output_buffer_[output_buffer_cursor_++] = input_->at(input_cursor_++);
} else {
++input_cursor_;
}
}
return true;
}
}
/// Doubles the capacity of the token output buffer while keeping its contents.
///
/// NextToken() calls this in the middle of a token once the buffer is full, so
/// the bytes already written must survive the reallocation. A capacity of zero
/// is raised to a small non-zero size, because doubling zero would leave the
/// buffer unusable.
///
/// @return always true.
bool Tokenizer::GrowOutputBuffer() {
    const auto old_size = output_buffer_size_;
    auto new_size = old_size * 2;
    if (new_size == 0) {
        new_size = 16;
    }

    // Allocate first, so the members are untouched if the allocation throws.
    auto grown = std::make_unique<char[]>(new_size);
    if (output_buffer_ && old_size > 0) {
        std::memcpy(grown.get(), output_buffer_.get(), old_size);
    }

    // After the swap `grown` owns the old storage and releases it on scope exit.
    output_buffer_.swap(grown);
    output_buffer_size_ = new_size;
    return true;
}
bool Tokenizer::Tokenize(const std::string &input_string, TermList &special_terms, TermList &prim_terms) {
special_terms.clear();
prim_terms.clear();
size_t len = input_string.length();
if (len == 0)
return false;
Term t;
TermList::iterator it;
unsigned int word_off = 0, char_off = 0;
char cur_char;
CharType cur_type;
for (char_off = 0; char_off < len;) // char_off++ ) // char_off is always incremented inside
{
cur_type = table_.GetType(input_string.at(char_off));
if (cur_type == ALLOW_CHR || cur_type == UNITE_CHR) {
it = prim_terms.insert(prim_terms.end(), t);
do {
cur_char = input_string.at(char_off);
cur_type = table_.GetType(cur_char);
if (cur_type == ALLOW_CHR) {
it->text_ += cur_char;
} else if (cur_type == SPACE_CHR || cur_type == DELIMITER_CHR) {
break;
}
char_off++;
} while (char_off < len);
if (it->text_.length() == 0) {
prim_terms.erase(it);
continue;
// char_off--;
}
it->word_offset_ = word_off++;
// char_off--;
} else if (cur_type == DELIMITER_CHR) {
it = special_terms.insert(special_terms.end(), t);
do {
cur_char = input_string.at(char_off);
cur_type = table_.GetType(cur_char);
if (cur_type == DELIMITER_CHR)
it->text_ += cur_char;
else
break;
char_off++;
} while (char_off < len);
it->word_offset_ = word_off++;
// char_off--;
} else
char_off++;
}
return true;
}
bool Tokenizer::Tokenize(const std::string &input_string, TermList &prim_terms) {
prim_terms.clear();
size_t len = input_string.length();
if (len == 0)
return false;
Term t;
TermList::iterator it;
unsigned int word_off = 0, char_off = 0;
char cur_char;
CharType cur_type;
for (char_off = 0; char_off < len;) // char_off++ )
{
cur_type = table_.GetType(input_string.at(char_off));
if (cur_type == ALLOW_CHR || cur_type == UNITE_CHR) {
it = prim_terms.insert(prim_terms.end(), t);
// it->begin_ = char_off;
do {
cur_char = input_string.at(char_off);
cur_type = table_.GetType(cur_char);
if (cur_type == ALLOW_CHR) {
it->text_ += cur_char;
} else if (cur_type == SPACE_CHR || cur_type == DELIMITER_CHR) {
break;
}
char_off++;
} while (char_off < len);
if (it->text_.length() == 0) {
prim_terms.erase(it);
continue;
// char_off--;
}
it->word_offset_ = word_off++;
// char_off--;
} else if (cur_type == DELIMITER_CHR) {
if (((char_off + 1) < len) && table_.GetType(input_string.at(char_off + 1)) != DELIMITER_CHR) {
word_off++;
}
char_off++;
} else
char_off++;
}
return true;
}
bool Tokenizer::TokenizeWhite(const std::string &input_string, TermList &raw_terms) {
raw_terms.clear();
size_t len = input_string.length();
if (len == 0)
return false;
Term t;
TermList::iterator it;
unsigned int word_off = 0, char_off = 0;
char cur_char;
CharType cur_type;
// CharType cur_type, preType;
for (char_off = 0; char_off < len;) // char_off++ )
{
cur_type = table_.GetType(input_string.at(char_off));
if (cur_type == ALLOW_CHR || cur_type == UNITE_CHR) {
it = raw_terms.insert(raw_terms.end(), t);
// it->begin_ = char_off;
do {
cur_char = input_string.at(char_off);
cur_type = table_.GetType(cur_char);
if (cur_type == ALLOW_CHR) {
it->text_ += cur_char;
} else if (cur_type == SPACE_CHR || cur_type == DELIMITER_CHR) {
break;
}
char_off++;
} while (char_off < len);
if (it->text_.length() == 0) {
raw_terms.erase(it);
continue;
// char_off--;
}
it->word_offset_ = word_off++;
// char_off--;
} else if (cur_type == DELIMITER_CHR) {
it = raw_terms.insert(raw_terms.end(), t);
do {
cur_char = input_string.at(char_off);
cur_type = table_.GetType(cur_char);
if (cur_type == DELIMITER_CHR)
it->text_ += cur_char;
else
break;
char_off++;
} while (char_off < len);
it->word_offset_ = word_off++;
// char_off--;
} else {
// SPACE_CHR nothing to do
char_off++;
}
}
return true;
}