Files
ragflow/internal/cpp/analyzer.h
Jin Hai 70e9743ef1 RAGFlow go API server (#13240)
# RAGFlow Go Implementation Plan 🚀

This repository tracks the progress of porting RAGFlow to Go. We'll
implement core features and provide performance comparisons between
Python and Go versions.

## Implementation Checklist

- [x] User Management APIs
- [x] Dataset Management Operations
- [x] Retrieval Test
- [x] Chat Management Operations
- [x] Infinity Go SDK

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Yingfeng Zhang <yingfeng.zhang@gmail.com>
2026-03-04 19:17:16 +08:00

89 lines
3.2 KiB
C++

// Copyright(C) 2023 InfiniFlow, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "tokenizer.h"
#include "term.h"
/// Selects between two cutting granularities: coarse (value 0) and fine (value 1).
/// NOTE(review): not referenced elsewhere in this header; presumably consumed by
/// concrete analyzers to pick a segmentation granularity — confirm against callers.
enum class CutGrain { kCoarse, kFine };
class Analyzer {
public:
Analyzer() = default;
virtual ~Analyzer() = default;
void SetExtractSpecialChar(bool extract_special_char, bool convert_to_placeholder = true) {
extract_special_char_ = extract_special_char;
convert_to_placeholder_ = convert_to_placeholder;
}
void SetCharOffset(bool set) { get_char_offset_ = set; }
void SetTokenizerConfig(const TokenizeConfig &conf) { tokenizer_.SetConfig(conf); }
int Analyze(const Term &input, TermList &output, bool fine_grained = false, bool enable_position = false) {
void *array[2] = {&output, this};
return AnalyzeImpl(input, &array, fine_grained, enable_position, Analyzer::AppendTermList);
}
protected:
typedef void (*HookType)(void *data,
const char *text,
const uint32_t len,
const uint32_t offset,
const uint32_t end_offset,
const bool is_special_char,
const uint16_t payload);
virtual int AnalyzeImpl(const Term &input, void *data, bool fine_grained, bool enable_position,HookType func) const { return -1; }
static void AppendTermList(void *data,
const char *text,
const uint32_t len,
const uint32_t offset,
const uint32_t end_offset,
const bool is_special_char,
const uint16_t payload) {
void **parameters = (void **)data;
TermList *output = (TermList *)parameters[0];
Analyzer *analyzer = (Analyzer *)parameters[1];
if (is_special_char && !analyzer->extract_special_char_)
return;
if (is_special_char && analyzer->convert_to_placeholder_) {
if (output->empty() == true || output->back().text_.compare(PLACE_HOLDER) != 0)
output->Add(PLACE_HOLDER.c_str(), PLACE_HOLDER.length(), offset, end_offset, payload);
} else {
output->Add(text, len, offset, end_offset, payload);
}
}
Tokenizer tokenizer_;
/// Whether including speical characters (e.g. puncutations) in the result.
bool extract_special_char_;
/// Whether converting speical characters (e.g. puncutations) into a particular place holder
/// symbol in the result.
/// Be effect only when extract_special_char_ is set.
bool convert_to_placeholder_;
bool get_char_offset_{false};
};