// Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package nlp

import (
	"fmt"
	"path/filepath"
	"regexp"
	"sort"
	"strings"
	"sync"
	"unicode/utf8"

	"ragflow/internal/engine/infinity"
	"ragflow/internal/tokenizer"

	"github.com/siongui/gojianfan"
)

var (
	// globalQueryBuilder is the global query builder instance.
	globalQueryBuilder *QueryBuilder
	// qbOnce ensures the query builder is initialized only once.
	qbOnce sync.Once
	// qbInitError stores any error encountered during initialization.
	qbInitError error
)

// QueryBuilder builds full-text query expressions from raw text, mirroring
// Python's FulltextQueryer and QueryBase.
type QueryBuilder struct {
	queryFields []string
	termWeight  *TermWeightDealer
	synonym     *Synonym
}

// InitQueryBuilder initializes the global QueryBuilder with the given wordnet directory.
// It should be called during the initialization phase of main.go, after tokenizer.Init.
// wordnetDir is typically filepath.Join(tokenizer.Config.DictPath, "wordnet").
func InitQueryBuilder(wordnetDir string) error {
	qbOnce.Do(func() {
		globalQueryBuilder = &QueryBuilder{
			queryFields: []string{
				"title_tks^10",
				"title_sm_tks^5",
				"important_kwd^30",
				"important_tks^20",
				"question_tks^20",
				"content_ltks^2",
				"content_sm_ltks",
			},
			termWeight: NewTermWeightDealer(""),
			synonym:    NewSynonym(nil, "", wordnetDir),
		}
	})
	return qbInitError
}

// InitQueryBuilderFromTokenizer initializes the global QueryBuilder using the
// tokenizer's DictPath. The wordnet directory is derived as DictPath/wordnet.
// This should be called after tokenizer.Init().
func InitQueryBuilderFromTokenizer(tokenizerDictPath string) error {
	wordnetDir := filepath.Join(tokenizerDictPath, "wordnet")
	return InitQueryBuilder(wordnetDir)
}
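
// Example wiring during startup (illustrative sketch; tokenizer.Init and
// tokenizer.Config.DictPath follow the doc comments above, and error handling
// is up to the caller):
//
//	if err := tokenizer.Init(); err != nil {
//		return err
//	}
//	if err := nlp.InitQueryBuilderFromTokenizer(tokenizer.Config.DictPath); err != nil {
//		return err
//	}
//	qb := nlp.GetQueryBuilder()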

// GetQueryBuilder returns the global QueryBuilder instance.
// It returns nil if InitQueryBuilder has not been called.
func GetQueryBuilder() *QueryBuilder {
	return globalQueryBuilder
}

// NewQueryBuilder creates a new QueryBuilder with default query fields.
//
// Deprecated: Use GetQueryBuilder to get the global instance for better performance.
func NewQueryBuilder() *QueryBuilder {
	return &QueryBuilder{
		queryFields: []string{
			"title_tks^10",
			"title_sm_tks^5",
			"important_kwd^30",
			"important_tks^20",
			"question_tks^20",
			"content_ltks^2",
			"content_sm_ltks",
		},
		termWeight: NewTermWeightDealer(""),
		synonym:    NewSynonym(nil, "", ""),
	}
}

// IsChinese reports whether a line of text is primarily Chinese.
// Heuristic: split the line on whitespace; with three or fewer fields, assume
// Chinese. Otherwise count the fields that are not purely ASCII letters and
// return true when their ratio is at least 0.7.
func (qb *QueryBuilder) IsChinese(line string) bool {
	fields := strings.Fields(line)
	if len(fields) <= 3 {
		return true
	}
	alphaRe := regexp.MustCompile(`^[a-zA-Z]+$`)
	nonAlpha := 0
	for _, f := range fields {
		if !alphaRe.MatchString(f) {
			nonAlpha++
		}
	}
	return float64(nonAlpha)/float64(len(fields)) >= 0.7
}
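
// Illustrative inputs and outputs of the heuristic:
//
//	qb.IsChinese("什么是 RAG")                  // true: 2 fields, at most 3
//	qb.IsChinese("what is retrieval augmented") // false: 4 fields, all alphabetic
//	qb.IsChinese("什么 是 检索 增强 生成")      // true: no purely alphabetic fields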

// SubSpecialChar escapes special characters for use in full-text queries by
// prefixing each of : { } / [ ] - * " ( ) | + ~ ^ with a backslash.
func (qb *QueryBuilder) SubSpecialChar(line string) string {
	re := regexp.MustCompile(`([:{}/\[\]\-\*"\(\)\|\+~\^])`)
	return re.ReplaceAllString(line, `\$1`)
}
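
// For example:
//
//	qb.SubSpecialChar(`c++ (lang)`) // `c\+\+ \(lang\)`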

// RmWWW removes common question words and stop words from queries.
// If stripping leaves the text empty, the original text is returned unchanged.
func (qb *QueryBuilder) RmWWW(txt string) string {
	patterns := []struct {
		regex string
		repl  string
	}{
		// Chinese question/stop words, e.g. 是什么, 怎么办, 哪里, 吗, 呢.
		{`是*(怎么办|什么样的|哪家|一下|那家|请问|啥样|咋样了|什么时候|何时|何地|何人|是否|是不是|多少|哪里|怎么|哪儿|怎么样|如何|哪些|是啥|啥是|啊|吗|呢|吧|咋|什么|有没有|呀|谁|哪位|哪个)是*`, ""},
		// English question words (case-insensitive).
		{`(^| )(what|who|how|which|where|why)('re|'s)? `, " "},
		// English stop words (case-insensitive).
		{`(^| )('s|'re|is|are|were|was|do|does|did|don't|doesn't|didn't|has|have|be|there|you|me|your|my|mine|just|please|may|i|should|would|wouldn't|will|won't|done|go|for|with|so|the|a|an|by|i'm|it's|he's|she's|they|they're|you're|as|by|on|in|at|up|out|down|of|to|or|and|if) `, " "},
	}
	original := txt
	for _, p := range patterns {
		re := regexp.MustCompile(`(?i)` + p.regex)
		txt = re.ReplaceAllString(txt, p.repl)
	}
	if txt == "" {
		txt = original
	}
	return txt
}
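
// For example (question and stop words are stripped; leftover spaces are
// collapsed later by callers such as Question):
//
//	qb.RmWWW("what is RAGFlow ") // " RAGFlow "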

// AddSpaceBetweenEngZh inserts spaces between English letters and Chinese
// characters to improve tokenization.
func (qb *QueryBuilder) AddSpaceBetweenEngZh(txt string) string {
	// (ENG/ENG+NUM) + ZH: e.g., "ABC123中文" -> "ABC123 中文"
	re1 := regexp.MustCompile(`([A-Za-z]+[0-9]*)([\x{4e00}-\x{9fa5}]+)`)
	txt = re1.ReplaceAllString(txt, "$1 $2")

	// ENG + ZH: e.g., "ABC中文" -> "ABC 中文"
	re2 := regexp.MustCompile(`([A-Za-z])([\x{4e00}-\x{9fa5}]+)`)
	txt = re2.ReplaceAllString(txt, "$1 $2")

	// ZH + (ENG/ENG+NUM): e.g., "中文ABC123" -> "中文 ABC123"
	re3 := regexp.MustCompile(`([\x{4e00}-\x{9fa5}]+)([A-Za-z]+[0-9]*)`)
	txt = re3.ReplaceAllString(txt, "$1 $2")

	// ZH + ENG: e.g., "中文ABC" -> "中文 ABC"
	re4 := regexp.MustCompile(`([\x{4e00}-\x{9fa5}]+)([A-Za-z])`)
	txt = re4.ReplaceAllString(txt, "$1 $2")
	return txt
}

// StrFullWidth2HalfWidth converts full-width characters to half-width characters.
// For each character:
//   - a full-width space (U+3000) becomes a half-width space (U+0020);
//   - otherwise 0xFEE0 is subtracted from the code point;
//   - if the result falls outside the half-width range (0x0020-0x007E),
//     the original character is kept.
func (qb *QueryBuilder) StrFullWidth2HalfWidth(ustring string) string {
	var rstring strings.Builder
	for _, uchar := range ustring {
		insideCode := uchar
		if insideCode == 0x3000 {
			insideCode = 0x0020
		} else {
			insideCode -= 0xFEE0
		}
		if insideCode < 0x0020 || insideCode > 0x7E {
			rstring.WriteRune(uchar)
		} else {
			rstring.WriteRune(insideCode)
		}
	}
	return rstring.String()
}
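
// For example, full-width letters and digits are mapped to their half-width
// counterparts, while characters with no half-width counterpart (such as CJK
// ideographs) pass through unchanged:
//
//	qb.StrFullWidth2HalfWidth("ＡＢＣ１２３中文") // "ABC123中文"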

// Traditional2Simplified converts traditional Chinese characters to simplified
// Chinese using the gojianfan library, similar to Python's HanziConv.
func (qb *QueryBuilder) Traditional2Simplified(line string) string {
	return gojianfan.T2S(line)
}
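
// For example:
//
//	qb.Traditional2Simplified("漢語") // "汉语"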

// NeedFineGrainedTokenize reports whether a token needs fine-grained tokenization.
// Reference: rag/nlp/query.py L88-93
func (qb *QueryBuilder) NeedFineGrainedTokenize(tk string) bool {
	// Count runes rather than bytes so short Chinese tokens are skipped,
	// matching the Python reference, which measures character length.
	if utf8.RuneCountInString(tk) < 3 {
		return false
	}
	if matched, _ := regexp.MatchString(`^[0-9a-z\.\+#_\*-]+$`, tk); matched {
		return false
	}
	return true
}
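
// For example:
//
//	qb.NeedFineGrainedTokenize("ai")       // false: fewer than 3 characters
//	qb.NeedFineGrainedTokenize("c++17")    // false: plain ASCII token pattern
//	qb.NeedFineGrainedTokenize("机器学习")  // true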

// Question builds a full-text query expression from the input text, returning
// the match expression and the extracted keywords.
// It references Python's FulltextQueryer.question method. Term weighting is
// applied on both the Chinese and non-Chinese paths; synonym expansion is
// currently implemented only for the Chinese path.
func (qb *QueryBuilder) Question(txt string, tbl string, minMatch float64) (*infinity.MatchTextExpr, []string) {
	// Keep the original input text so it can be attached to the query expression.
	originalQuery := txt

	// Add spaces between English and Chinese runs.
	txtWithSpaces := qb.AddSpaceBetweenEngZh(txt)

	// Lowercase.
	txtLower := strings.ToLower(txtWithSpaces)

	// Convert full-width characters to half-width.
	txtHalfWidth := qb.StrFullWidth2HalfWidth(txtLower)

	// Convert traditional Chinese to simplified.
	txtSimplified := qb.Traditional2Simplified(txtHalfWidth)

	// Replace punctuation and special characters with spaces.
	// Reference: rag/nlp/query.py L44-48
	re := regexp.MustCompile(`[ :|\r\n\t,,.。??/\` + "`" + `!!&^%()\[\]{}<>]+`)
	txtCleaned := re.ReplaceAllString(txtSimplified, " ")

	// Remove stop words.
	txtNoStopWords := qb.RmWWW(txtCleaned)

	if !qb.IsChinese(txtNoStopWords) {
		// Non-Chinese processing.
		// Reference: rag/nlp/query.py L52-88

		// Remove stop words once more after cleanup.
		txtFinal := qb.RmWWW(txtNoStopWords)

		// Tokenize using the rag tokenizer; fall back to the raw text on error.
		tokenized, err := tokenizer.Tokenize(txtFinal)
		if err != nil {
			tokenized = txtFinal
		}

		// Split the tokenized text on whitespace and keep the non-empty
		// tokens as keywords.
		tks := strings.Fields(tokenized)
		keywords := make([]string, 0, len(tks))
		for _, t := range tks {
			if t != "" {
				keywords = append(keywords, t)
			}
		}

		// Calculate term weights.
		// Reference: rag/nlp/query.py L56
		tws := qb.termWeight.Weights(tks, false)

		// Clean tokens and drop the ones that become empty.
		// Reference: rag/nlp/query.py L57-60
		type tokenWeight struct {
			tk string
			w  float64
		}
		// Compile the cleanup regexes once, outside the token loop.
		var (
			quoteRe  = regexp.MustCompile(`[ \"'^]+`)
			singleRe = regexp.MustCompile(`^[a-z0-9]$`)
			signRe   = regexp.MustCompile(`^[\+\-]+`)
		)
		var tksW []tokenWeight
		for _, tw := range tws {
			tk := tw.Term
			w := tw.Weight

			// Strip quotes, carets, and spaces.
			tk = quoteRe.ReplaceAllString(tk, "")
			// Drop single alphanumeric characters.
			tk = singleRe.ReplaceAllString(tk, "")
			// Strip leading +/- signs.
			tk = signRe.ReplaceAllString(tk, "")
			tk = strings.TrimSpace(tk)

			if tk == "" {
				continue
			}
			tksW = append(tksW, tokenWeight{tk, w})
		}

		// Limit to 256 tokens.
		// Reference: rag/nlp/query.py L62
		if len(tksW) > 256 {
			tksW = tksW[:256]
		}

		// TODO: synonym expansion (reference L61-67). For now use empty synonyms.
		syns := make([]string, len(tksW))

		// Build one query part per token, formatted as (token^weight synonym).
		// Reference: rag/nlp/query.py L69-70
		skipRe := regexp.MustCompile(`[.^+\(\)-]`)
		var q []string
		for i, tw := range tksW {
			tk := tw.tk
			w := tw.w
			// Skip tokens containing special regex characters.
			if skipRe.MatchString(tk) {
				continue
			}
			q = append(q, fmt.Sprintf("(%s^%.4f %s)", tk, w, syns[i]))
		}

		// Add phrase queries for adjacent token pairs, boosted by twice the
		// larger of the two token weights.
		// Reference: rag/nlp/query.py L71-82
		for i := 1; i < len(tksW); i++ {
			left := strings.TrimSpace(tksW[i-1].tk)
			right := strings.TrimSpace(tksW[i].tk)
			if left == "" || right == "" {
				continue
			}
			maxW := tksW[i-1].w
			if tksW[i].w > maxW {
				maxW = tksW[i].w
			}
			q = append(q, fmt.Sprintf(`"%s %s"^%.4f`, left, right, maxW*2))
		}

		if len(q) == 0 {
			q = append(q, txtFinal)
		}

		// Join all query parts into the final query string.
		query := strings.Join(q, " ")
		return &infinity.MatchTextExpr{
			Fields:       qb.queryFields,
			MatchingText: query,
			TopN:         100,
			ExtraOptions: map[string]interface{}{
				"original_query": originalQuery,
			},
		}, keywords
	}

	// Chinese processing.
	// Reference: rag/nlp/query.py L88-172

	// Keep the text before the second stop-word pass as a fallback.
	otxt := txtNoStopWords

	// Remove stop words once more for the Chinese path.
	txtChinese := qb.RmWWW(txtNoStopWords)

	// qs collects the query string built for each segment.
	var qs []string
	// keywords collects keywords extracted from the segments.
	var keywords []string

	// Split the text into segments and process at most 256 of them.
	segments := qb.termWeight.Split(txtChinese)
	if len(segments) > 256 {
		segments = segments[:256]
	}

	// Compile the regexes used inside the loops once.
	specialCharRe := regexp.MustCompile(`[,\.\/;'\[\]\\\` + "`" + `~!@#$%\^&\*\(\)=\+_<>\?:"\{\}\|,。;'‘’【】、!¥……()——《》?:"""-]+`)
	quoteSpaceRe := regexp.MustCompile(`[ \"']+`)

	for _, segment := range segments {
		if segment == "" {
			continue
		}
		keywords = append(keywords, segment)

		// Term weights for the current segment.
		termWeightList := qb.termWeight.Weights([]string{segment}, true)

		// Segment-level synonyms.
		syns := qb.synonym.Lookup(segment, 8)
		if len(syns) > 0 && len(keywords) < 32 {
			keywords = append(keywords, syns...)
		}

		// Sort terms by weight, descending.
		sort.Slice(termWeightList, func(i, j int) bool {
			return termWeightList[i].Weight > termWeightList[j].Weight
		})

		// terms accumulates the query part and weight for each term.
		var terms []struct {
			term   string
			weight float64
		}

		for _, termWeight := range termWeightList {
			term := termWeight.Term
			weight := termWeight.Weight

			// Fine-grained tokenization when needed.
			var sm []string
			if qb.NeedFineGrainedTokenize(term) {
				fineGrained, err := tokenizer.FineGrainedTokenize(term)
				if err == nil && fineGrained != "" {
					sm = strings.Fields(fineGrained)
				}
			}

			// Strip special characters from the fine-grained tokens and keep
			// only multi-character tokens (rune count rather than byte count,
			// matching the Python reference).
			var cleanSm []string
			for _, m := range sm {
				m = specialCharRe.ReplaceAllString(m, "")
				m = qb.SubSpecialChar(m)
				if utf8.RuneCountInString(m) > 1 {
					cleanSm = append(cleanSm, m)
				}
			}
			sm = cleanSm

			// Add the cleaned term and its fine-grained tokens to keywords
			// while under the 32-keyword limit.
			if len(keywords) < 32 {
				cleanTk := quoteSpaceRe.ReplaceAllString(term, "")
				if cleanTk != "" {
					keywords = append(keywords, cleanTk)
				}
				keywords = append(keywords, sm...)
			}

			// Term-level synonyms, escaped for query use.
			tkSyns := qb.synonym.Lookup(term, 8)
			for i, s := range tkSyns {
				tkSyns[i] = qb.SubSpecialChar(s)
			}
			if len(keywords) < 32 {
				for _, s := range tkSyns {
					if s != "" {
						keywords = append(keywords, s)
					}
				}
			}

			// Fine-grained tokenized synonyms, quoted when they contain spaces.
			var fineGrainedSyns []string
			for _, s := range tkSyns {
				if s == "" {
					continue
				}
				fg, err := tokenizer.FineGrainedTokenize(s)
				if err == nil && fg != "" {
					if strings.Contains(fg, " ") {
						fg = fmt.Sprintf(`"%s"`, fg)
					}
					fineGrainedSyns = append(fineGrainedSyns, fg)
				}
			}

			if len(keywords) >= 32 {
				break
			}

			// Escape the term for query use.
			term = qb.SubSpecialChar(term)
			if term == "" {
				continue
			}

			// Quote terms that contain spaces.
			if strings.Contains(term, " ") {
				term = fmt.Sprintf(`"%s"`, term)
			}

			// Attach synonyms and fine-grained variants to the term.
			if len(fineGrainedSyns) > 0 {
				term = fmt.Sprintf("(%s OR (%s)^0.2)", term, strings.Join(fineGrainedSyns, " "))
			}
			if len(sm) > 0 {
				smStr := strings.Join(sm, " ")
				term = fmt.Sprintf(`%s OR "%s" OR ("%s"~2)^0.5`, term, smStr, smStr)
			}

			terms = append(terms, struct {
				term   string
				weight float64
			}{term, weight})
		}

		// Build the query string for this segment.
		var termParts []string
		for _, termWeight := range terms {
			termParts = append(termParts, fmt.Sprintf("(%s)^%.4f", termWeight.term, termWeight.weight))
		}
		tmsStr := strings.Join(termParts, " ")

		// Add a proximity query over the whole segment when it has multiple tokens.
		if len(termWeightList) > 1 {
			tokenized, _ := tokenizer.Tokenize(segment)
			if tokenized != "" {
				tmsStr += fmt.Sprintf(` ("%s"~2)^1.5`, tokenized)
			}
		}

		// Add segment-level synonyms.
		if len(syns) > 0 && tmsStr != "" {
			var synParts []string
			for _, s := range syns {
				s = qb.SubSpecialChar(s)
				if s != "" {
					tokenized, _ := tokenizer.Tokenize(s)
					if tokenized != "" {
						synParts = append(synParts, fmt.Sprintf(`"%s"`, tokenized))
					}
				}
			}
			if len(synParts) > 0 {
				tmsStr = fmt.Sprintf("(%s)^5 OR (%s)^0.7", tmsStr, strings.Join(synParts, " OR "))
			}
		}

		if tmsStr != "" {
			qs = append(qs, tmsStr)
		}
	}

	// Build the final query from all segment queries.
	if len(qs) > 0 {
		var queryParts []string
		for _, q := range qs {
			if q != "" {
				queryParts = append(queryParts, fmt.Sprintf("(%s)", q))
			}
		}
		query := strings.Join(queryParts, " OR ")
		if query == "" {
			query = otxt
		}
		return &infinity.MatchTextExpr{
			Fields:       qb.queryFields,
			MatchingText: query,
			TopN:         100,
			ExtraOptions: map[string]interface{}{
				"minimum_should_match": minMatch,
				"original_query":       originalQuery,
			},
		}, keywords
	}

	return nil, keywords
}
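
// Example usage (illustrative sketch; the tbl argument is currently unused and
// the weights in MatchingText depend on the loaded dictionaries):
//
//	expr, keywords := qb.Question("什么是检索增强生成", "chunks", 0.3)
//	if expr != nil {
//		_ = expr.MatchingText // weighted full-text query over qb.queryFields
//		_ = keywords          // up to 32 extracted keywords and synonyms
//	}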

// Paragraph builds a query expression from content terms and keywords.
// It references Python's FulltextQueryer.paragraph method.
func (qb *QueryBuilder) Paragraph(contentTks string, keywords []string, keywordsTopN int) *infinity.MatchTextExpr {
	// Simplified implementation: quote each keyword; contentTks is not yet used.
	allTerms := make([]string, 0, len(keywords))
	for _, k := range keywords {
		k = strings.TrimSpace(k)
		if k != "" {
			allTerms = append(allTerms, `"`+k+`"`)
		}
	}
	// Cap the number of keywords.
	if keywordsTopN > 0 && len(allTerms) > keywordsTopN {
		allTerms = allTerms[:keywordsTopN]
	}
	// Content-term processing (tokenization, weight calculation) could be
	// added here; currently only the keywords are used.
	query := strings.Join(allTerms, " ")
	// Compute minimum_should_match as max(3, len(allTerms)/10); it is not yet
	// wired into ExtraOptions, so the value is discarded for now.
	if len(allTerms) > 0 {
		minShouldMatch := len(allTerms) / 10
		if minShouldMatch < 3 {
			minShouldMatch = 3
		}
		_ = minShouldMatch
	}
	return &infinity.MatchTextExpr{
		Fields:       qb.queryFields,
		MatchingText: query,
		TopN:         100,
	}
}
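
// Example usage (illustrative sketch; contentTks is currently ignored):
//
//	expr := qb.Paragraph("", []string{"vector database", "RAG"}, 30)
//	// expr.MatchingText == `"vector database" "RAG"`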

// Similarity computes the similarity between two term-weight maps:
//
//	s = sum(qtwt[k] for k in qtwt if k in dtwt) / sum(qtwt[k] for k in qtwt)
func (qb *QueryBuilder) Similarity(qtwt map[string]float64, dtwt map[string]float64) float64 {
	if len(qtwt) == 0 {
		return 0.0
	}
	var sum float64
	for k, v := range qtwt {
		if _, ok := dtwt[k]; ok {
			sum += v
		}
	}
	var total float64
	for _, v := range qtwt {
		total += v
	}
	if total == 0 {
		return 0.0
	}
	return sum / total
}
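
// Worked example: with query weights {"rag": 0.5, "flow": 0.25, "go": 0.25}
// and a document containing "rag" and "go", the matched weight is
// 0.5 + 0.25 = 0.75 against a total of 1.0, so the similarity is 0.75:
//
//	q := map[string]float64{"rag": 0.5, "flow": 0.25, "go": 0.25}
//	d := map[string]float64{"rag": 1, "go": 1}
//	s := qb.Similarity(q, d) // 0.75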

// TokenSimilarity computes the similarity between query terms and multiple
// document term sets. Not yet implemented: requires the term-weight module.
func (qb *QueryBuilder) TokenSimilarity(atks string, btkss []string) []float64 {
	// Placeholder: a zero-valued slice (make already zero-initializes).
	return make([]float64, len(btkss))
}

// HybridSimilarity computes a weighted combination of vector similarity and
// term similarity. Not yet implemented: requires vector cosine similarity.
func (qb *QueryBuilder) HybridSimilarity(avec []float64, bvecs [][]float64, atks string, btkss []string, tkweight float64, vtweight float64) ([]float64, []float64, []float64) {
	// Placeholder: zero-valued slices.
	n := len(btkss)
	sims := make([]float64, n)
	tksim := make([]float64, n)
	vecsim := make([]float64, n)
	return sims, tksim, vecsim
}

// SetQueryFields sets the list of query fields.
func (qb *QueryBuilder) SetQueryFields(fields []string) {
	qb.queryFields = fields
}