// ragflow/internal/cpp/analyzer.h

// Copyright(C) 2023 InfiniFlow, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "tokenizer.h"
#include "term.h"

/// Granularity selector with two levels.
/// NOTE(review): presumably selects coarse vs. fine segmentation in analyzers declared
/// elsewhere; nothing in this header reads it — confirm against callers.
enum class CutGrain {
    kCoarse = 0, ///< Coarse granularity.
    kFine = 1,   ///< Fine granularity.
};

class Analyzer {
public:
    Analyzer() = default;

    virtual ~Analyzer() = default;

    void SetExtractSpecialChar(bool extract_special_char, bool convert_to_placeholder = true) {
        extract_special_char_ = extract_special_char;
        convert_to_placeholder_ = convert_to_placeholder;
    }

    void SetCharOffset(bool set) { get_char_offset_ = set; }

    void SetTokenizerConfig(const TokenizeConfig &conf) { tokenizer_.SetConfig(conf); }

    int Analyze(const Term &input, TermList &output, bool fine_grained = false, bool enable_position = false) {
        void *array[2] = {&output, this};
        return AnalyzeImpl(input, &array, fine_grained, enable_position, Analyzer::AppendTermList);
    }

protected:
    typedef void (*HookType)(void *data,
                             const char *text,
                             const uint32_t len,
                             const uint32_t offset,
                             const uint32_t end_offset,
                             const bool is_special_char,
                             const uint16_t payload);

    virtual int AnalyzeImpl(const Term &input, void *data, bool fine_grained, bool enable_position,HookType func) const { return -1; }

    static void AppendTermList(void *data,
                               const char *text,
                               const uint32_t len,
                               const uint32_t offset,
                               const uint32_t end_offset,
                               const bool is_special_char,
                               const uint16_t payload) {
        void **parameters = (void **)data;
        TermList *output = (TermList *)parameters[0];
        Analyzer *analyzer = (Analyzer *)parameters[1];

        if (is_special_char && !analyzer->extract_special_char_)
            return;
        if (is_special_char && analyzer->convert_to_placeholder_) {
            if (output->empty() == true || output->back().text_.compare(PLACE_HOLDER) != 0)
                output->Add(PLACE_HOLDER.c_str(), PLACE_HOLDER.length(), offset, end_offset, payload);
        } else {
            output->Add(text, len, offset, end_offset, payload);
        }
    }

    Tokenizer tokenizer_;

    /// Whether including speical characters (e.g. puncutations) in the result.
    bool extract_special_char_;

    /// Whether converting speical characters (e.g. puncutations) into a particular place holder
    /// symbol in the result.
    /// Be effect only when extract_special_char_ is set.
    bool convert_to_placeholder_;

    bool get_char_offset_{false};
};