Files
ragflow/internal/cpp/tokenizer.h
Jin Hai 70e9743ef1 RAGFlow go API server (#13240)
# RAGFlow Go Implementation Plan 🚀

This repository tracks the progress of porting RAGFlow to Go. We'll
implement core features and provide performance comparisons between
Python and Go versions.

## Implementation Checklist

- [x] User Management APIs
- [x] Dataset Management Operations
- [x] Retrieval Test
- [x] Chat Management Operations
- [x] Infinity Go SDK

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Yingfeng Zhang <yingfeng.zhang@gmail.com>
2026-03-04 19:17:16 +08:00

114 lines
3.4 KiB
C++

// Copyright(C) 2023 InfiniFlow, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <memory>
#include <cstdint>
#include "term.h"
/// Largest value a single byte can hold (0xFF). A table with one entry for
/// every possible byte value therefore needs BYTE_MAX + 1 slots.
constexpr unsigned int BYTE_MAX = 0xFF;
/// \brief User-supplied character lists used to configure tokenization.
///
/// Three independent strings are accumulated; each Add* call appends to the
/// matching string and never clears it. How the characters are interpreted is
/// decided by whoever consumes the config (see the SetConfig declarations).
class TokenizeConfig {
public:
    std::string allows_;  ///< characters collected through AddAllows()
    std::string divides_; ///< characters collected through AddDivides()
    std::string unites_;  ///< characters collected through AddUnites()

    /// \brief append \p astr to allows_
    void AddAllows(std::string astr) { allows_.append(astr); }

    /// \brief append \p dstr to divides_
    void AddDivides(std::string dstr) { divides_.append(dstr); }

    /// \brief append \p ustr to unites_
    void AddUnites(std::string ustr) { unites_.append(ustr); }
};
/// Character-class code; CharTypeTable stores one of these per byte value.
using CharType = unsigned char;

// Character-class codes. Only declared here; the values are defined outside
// this header.
extern const CharType ALLOW_CHR;     ///< regular term
extern const CharType DELIMITER_CHR; ///< delimiter
extern const CharType SPACE_CHR;     ///< space term
extern const CharType UNITE_CHR;     ///< united term
/// \brief Lookup table that maps each byte value to a character-class code.
///
/// The constructor and SetConfig() are defined outside this header; the inline
/// accessors below only read the table.
class CharTypeTable {
    // One slot for every uint8_t value, 0..BYTE_MAX inclusive. Sizing the array
    // with BYTE_MAX alone (255 entries) made index 255 an out-of-bounds read.
    // The table is value-initialized so the last slot is well defined even if
    // an out-of-line initializer only touches the first BYTE_MAX entries.
    // NOTE(review): confirm the constructor/SetConfig in the .cpp populate all
    // BYTE_MAX + 1 entries as intended.
    CharType char_type_table_[BYTE_MAX + 1]{};

public:
    /// \param use_def_delim selects the default delimiter setup; the exact
    ///        effect is implemented in the out-of-line constructor
    CharTypeTable(bool use_def_delim = true);

    /// \brief apply the user-defined character lists in \p conf to the table
    void SetConfig(const TokenizeConfig &conf);

    /// \return the character-class code stored for byte \p c
    CharType GetType(uint8_t c) const { return char_type_table_[c]; }

    /// \return true when byte \p c is classified as ALLOW_CHR
    bool IsAllow(uint8_t c) const { return char_type_table_[c] == ALLOW_CHR; }

    /// \return true when byte \p c is classified as DELIMITER_CHR
    bool IsDivide(uint8_t c) const { return char_type_table_[c] == DELIMITER_CHR; }

    /// \return true when byte \p c is classified as UNITE_CHR
    bool IsUnite(uint8_t c) const { return char_type_table_[c] == UNITE_CHR; }

    /// \return true when bytes \p c1 and \p c2 share the same class code
    bool IsEqualType(uint8_t c1, uint8_t c2) const { return char_type_table_[c1] == char_type_table_[c2]; }
};
/// \brief Tokenizer driven by a CharTypeTable.
///
/// Owns a heap-allocated output buffer (output_buffer_size_ bytes at
/// construction). Apart from the inline accessors, the member functions are
/// defined outside this header. The unique_ptr member makes the class
/// non-copyable.
class Tokenizer {
public:
    /// \brief build the char type table and allocate the output buffer
    /// \param use_def_delim forwarded to the CharTypeTable constructor
    Tokenizer(bool use_def_delim = true)
        : table_(use_def_delim),
          // output_buffer_size_ is declared before output_buffer_, so its
          // default member initializer has already run at this point.
          output_buffer_(std::make_unique<char[]>(output_buffer_size_)) {}

    ~Tokenizer() = default;

    /// \brief set the user defined char types
    /// \param conf user-defined character lists
    void SetConfig(const TokenizeConfig &conf);

    /// \brief tokenize the input text, call NextToken(), GetToken(), GetLength() to get the result.
    /// \param input input text string
    /// NOTE(review): input_ is a raw pointer, so \p input presumably has to
    /// outlive the NextToken() iteration — confirm against the implementation.
    void Tokenize(const std::string &input);

    /// \brief advance to the next token of the text passed to Tokenize()
    /// \return presumably false once the input is exhausted — confirm against
    ///         the implementation
    bool NextToken();

    /// \return pointer to the start of the output buffer
    /// NOTE(review): null-termination is not visible here; use together with GetLength().
    inline const char *GetToken() const { return output_buffer_.get(); }

    /// \return current value of the output buffer cursor
    inline size_t GetLength() const { return output_buffer_cursor_; }

    /// \return the is_delimiter_ flag
    inline bool IsDelimiter() const { return is_delimiter_; }

    /// \return the token start cursor
    inline size_t GetTokenStartCursor() const { return token_start_cursor_; }

    /// \return the input cursor
    inline size_t GetInputCursor() const { return input_cursor_; }

    /// \brief tokenize the input text, output two term lists
    /// \param input_string input text string
    /// \param special_terms output term list
    /// \param prim_terms output term list
    bool Tokenize(const std::string &input_string, TermList &special_terms, TermList &prim_terms);

    /// \brief tokenize the input text, remove the space chars, output raw term list
    /// \param input_string input text string
    /// \param raw_terms output term list
    bool TokenizeWhite(const std::string &input_string, TermList &raw_terms);

    /// \brief tokenize the input text, output the primary term list
    /// \param input_string input text string
    /// \param prim_terms output term list
    /// NOTE(review): the previous comment described two output lists, but this
    /// overload takes only one — confirm against the implementation.
    bool Tokenize(const std::string &input_string, TermList &prim_terms);

private:
    /// \brief enlarge the output buffer; implemented outside this header
    bool GrowOutputBuffer();

private:
    CharTypeTable table_;

    // Non-owning pointer; presumably set by Tokenize(const std::string &) — confirm.
    std::string *input_{nullptr};

    size_t token_start_cursor_{0};

    size_t input_cursor_{0};

    // Must stay declared before output_buffer_: the constructor reads it to
    // size the allocation.
    size_t output_buffer_size_{4096};

    std::unique_ptr<char[]> output_buffer_;

    size_t output_buffer_cursor_{0};

    bool is_delimiter_{false};
};