# RAGFlow Go Implementation Plan 🚀

This repository tracks the progress of porting RAGFlow to Go. We'll
implement core features and provide performance comparisons between
Python and Go versions.

## Implementation Checklist

- [x] User Management APIs
- [x] Dataset Management Operations
- [x] Retrieval Test
- [x] Chat Management Operations
- [x] Infinity Go SDK

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Yingfeng Zhang <yingfeng.zhang@gmail.com>

// Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package nlp
import (
"fmt"
"path/filepath"
"regexp"
"sort"
"strings"
"sync"
"ragflow/internal/engine/infinity"
"ragflow/internal/tokenizer"
"github.com/siongui/gojianfan"
)
var (
// globalQueryBuilder is the global query builder instance
globalQueryBuilder *QueryBuilder
// qbOnce ensures the query builder is initialized only once
qbOnce sync.Once
// qbInitError stores any error during initialization
qbInitError error
)
// QueryBuilder provides functionality to build query expressions based on text, referencing Python's FulltextQueryer and QueryBase.
type QueryBuilder struct {
queryFields []string
termWeight *TermWeightDealer
synonym *Synonym
}
// InitQueryBuilder initializes the global QueryBuilder with the given wordnet directory.
// It should be called during the initialization phase of main.go, after tokenizer.Init.
// The wordnetDir is typically filepath.Join(tokenizer.Config.DictPath, "wordnet")
func InitQueryBuilder(wordnetDir string) error {
qbOnce.Do(func() {
globalQueryBuilder = &QueryBuilder{
queryFields: []string{
"title_tks^10",
"title_sm_tks^5",
"important_kwd^30",
"important_tks^20",
"question_tks^20",
"content_ltks^2",
"content_sm_ltks",
},
termWeight: NewTermWeightDealer(""),
synonym: NewSynonym(nil, "", wordnetDir),
}
})
return qbInitError
}
// InitQueryBuilderFromTokenizer initializes the global QueryBuilder using tokenizer's DictPath.
// The wordnet directory is derived from tokenizer's DictPath as: DictPath/wordnet
// This should be called after tokenizer.Init().
func InitQueryBuilderFromTokenizer(tokenizerDictPath string) error {
wordnetDir := filepath.Join(tokenizerDictPath, "wordnet")
return InitQueryBuilder(wordnetDir)
}
// GetQueryBuilder returns the global QueryBuilder instance.
// Returns nil if InitQueryBuilder has not been called.
func GetQueryBuilder() *QueryBuilder {
return globalQueryBuilder
}
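
// Illustrative startup wiring (a sketch; error handling and the exact call
// site are up to the application):
//
//	// after tokenizer.Init() has succeeded:
//	if err := nlp.InitQueryBuilderFromTokenizer(tokenizer.Config.DictPath); err != nil {
//		panic(err)
//	}
//	qb := nlp.GetQueryBuilder()
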
// NewQueryBuilder creates a new QueryBuilder with default query fields.
// Deprecated: Use GetQueryBuilder() to get the global instance for better performance.
func NewQueryBuilder() *QueryBuilder {
return &QueryBuilder{
queryFields: []string{
"title_tks^10",
"title_sm_tks^5",
"important_kwd^30",
"important_tks^20",
"question_tks^20",
"content_ltks^2",
"content_sm_ltks",
},
termWeight: NewTermWeightDealer(""),
synonym: NewSynonym(nil, "", ""),
}
}
// IsChinese determines whether a line of text is primarily Chinese.
// Algorithm: split the line on whitespace; with 3 or fewer segments, assume Chinese;
// otherwise report Chinese when at least 70% of the segments are not purely alphabetic.
func (qb *QueryBuilder) IsChinese(line string) bool {
fields := strings.Fields(line)
if len(fields) <= 3 {
return true
}
// Compile the pattern once per call rather than once per field.
alphaRe := regexp.MustCompile(`^[a-zA-Z]+$`)
nonAlpha := 0
for _, f := range fields {
if !alphaRe.MatchString(f) {
nonAlpha++
}
}
return float64(nonAlpha)/float64(len(fields)) >= 0.7
}
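
// Illustrative examples of the heuristic (inputs are hypothetical):
//
//	qb.IsChinese("什么 是 RAG")                        // true: only 3 whitespace segments
//	qb.IsChinese("what is retrieval augmented search") // false: all 5 segments are purely alphabetic
//	qb.IsChinese("检索 增强 生成 模型 RAG")             // true: 4 of 5 segments are non-alphabetic (0.8 >= 0.7)
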
// SubSpecialChar escapes special characters for use in queries.
func (qb *QueryBuilder) SubSpecialChar(line string) string {
// Regex matches : { } / [ ] - * " ( ) | + ~ ^ and prepends backslash
re := regexp.MustCompile(`([:{}/\[\]\-\*"\(\)\|\+~\^])`)
return re.ReplaceAllString(line, `\$1`)
}
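
// Illustrative example: metacharacters used by the query syntax are backslash-escaped.
//
//	qb.SubSpecialChar(`a+b (c)`) // `a\+b \(c\)`
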
// RmWWW removes common stop words and question words from queries.
func (qb *QueryBuilder) RmWWW(txt string) string {
patterns := []struct {
regex string
repl string
}{
// Chinese stop words
{`是*(怎么办|什么样的|哪家|一下|那家|请问|啥样|咋样了|什么时候|何时|何地|何人|是否|是不是|多少|哪里|怎么|哪儿|怎么样|如何|哪些|是啥|啥是|啊|吗|呢|吧|咋|什么|有没有|呀|谁|哪位|哪个)是*`, ""},
// English stop words (case-insensitive)
{`(^| )(what|who|how|which|where|why)('re|'s)? `, " "},
{`(^| )('s|'re|is|are|were|was|do|does|did|don't|doesn't|didn't|has|have|be|there|you|me|your|my|mine|just|please|may|i|should|would|wouldn't|will|won't|done|go|for|with|so|the|a|an|by|i'm|it's|he's|she's|they|they're|you're|as|by|on|in|at|up|out|down|of|to|or|and|if) `, " "},
}
original := txt
for _, p := range patterns {
re := regexp.MustCompile(`(?i)` + p.regex)
txt = re.ReplaceAllString(txt, p.repl)
}
if txt == "" {
txt = original
}
return txt
}
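
// Illustrative examples (the replacement is a single space, so a leading space can remain):
//
//	qb.RmWWW("what is ragflow") // " ragflow"
//	qb.RmWWW("什么是RAG")        // "RAG"
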
// AddSpaceBetweenEngZh adds spaces between English letters and Chinese characters to improve tokenization.
func (qb *QueryBuilder) AddSpaceBetweenEngZh(txt string) string {
// (ENG/ENG+NUM) + ZH: e.g., "ABC123中文" -> "ABC123 中文"
re1 := regexp.MustCompile(`([A-Za-z]+[0-9]*)([\x{4e00}-\x{9fa5}]+)`)
txt = re1.ReplaceAllString(txt, "$1 $2")
// ENG + ZH: e.g., "ABC中文" -> "ABC 中文"
re2 := regexp.MustCompile(`([A-Za-z])([\x{4e00}-\x{9fa5}]+)`)
txt = re2.ReplaceAllString(txt, "$1 $2")
// ZH + (ENG/ENG+NUM): e.g., "中文ABC123" -> "中文 ABC123"
re3 := regexp.MustCompile(`([\x{4e00}-\x{9fa5}]+)([A-Za-z]+[0-9]*)`)
txt = re3.ReplaceAllString(txt, "$1 $2")
// ZH + ENG: e.g., "中文ABC" -> "中文 ABC"
re4 := regexp.MustCompile(`([\x{4e00}-\x{9fa5}]+)([A-Za-z])`)
txt = re4.ReplaceAllString(txt, "$1 $2")
return txt
}
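
// Illustrative example:
//
//	qb.AddSpaceBetweenEngZh("RAG模型v2") // "RAG 模型 v2"
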
// StrFullWidth2HalfWidth converts full-width characters to half-width characters.
// Algorithm: For each character:
// - Full-width space (U+3000) is converted to half-width space (U+0020).
// - For other characters, subtract 0xFEE0 from its code point.
// - If the resulting code point is not in the half-width character range (0x0020 to 0x7E),
// the original character is kept.
func (qb *QueryBuilder) StrFullWidth2HalfWidth(ustring string) string {
var rstring strings.Builder
for _, uchar := range ustring {
insideCode := int32(uchar)
if insideCode == 0x3000 {
insideCode = 0x0020
} else {
insideCode -= 0xFEE0
}
if insideCode < 0x0020 || insideCode > 0x7E {
rstring.WriteRune(uchar)
} else {
rstring.WriteRune(insideCode)
}
}
return rstring.String()
}
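
// Illustrative example: full-width letters, digits, punctuation, and the
// ideographic space all map to their ASCII counterparts; half-width input
// passes through unchanged.
//
//	qb.StrFullWidth2HalfWidth("ＡＢＣ１２３！　ok") // "ABC123! ok"
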
// Traditional2Simplified converts traditional Chinese characters to simplified Chinese characters.
// Uses gojianfan library which provides conversion similar to Python's HanziConv.
func (qb *QueryBuilder) Traditional2Simplified(line string) string {
return gojianfan.T2S(line)
}
// NeedFineGrainedTokenize determines if fine-grained tokenization is needed for a token.
// Reference: rag/nlp/query.py L88-93
func (qb *QueryBuilder) NeedFineGrainedTokenize(tk string) bool {
// Count runes, not bytes, so multi-byte (e.g. Chinese) tokens follow the
// Python reference's character-based length check.
if utf8.RuneCountInString(tk) < 3 {
return false
}
if matched, _ := regexp.MatchString(`^[0-9a-z\.\+#_\*-]+$`, tk); matched {
return false
}
return true
}
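
// Illustrative examples:
//
//	qb.NeedFineGrainedTokenize("中文")       // false: fewer than 3 characters
//	qb.NeedFineGrainedTokenize("gpt-4o")     // false: matches the code-like token pattern
//	qb.NeedFineGrainedTokenize("数据库系统")  // true
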
// Question builds a full-text query expression based on input text.
// References Python's FulltextQueryer.question method.
// This is currently a simplified version that returns a basic MatchTextExpr;
// fuller integration of term weights and synonyms is planned.
func (qb *QueryBuilder) Question(txt string, tbl string, minMatch float64) (*infinity.MatchTextExpr, []string) {
// originalQuery stores the original input text for later use in query expression.
originalQuery := txt
// Add space between English and Chinese
txtWithSpaces := qb.AddSpaceBetweenEngZh(txt)
// Convert to lowercase (punctuation is stripped separately below)
txtLower := strings.ToLower(txtWithSpaces)
// Convert to half-width
txtHalfWidth := qb.StrFullWidth2HalfWidth(txtLower)
// Convert to simplified Chinese
txtSimplified := qb.Traditional2Simplified(txtHalfWidth)
// Replace punctuation and special characters with space
// Reference: rag/nlp/query.py L44-48
// re is the regex pattern for matching punctuation and special characters.
re := regexp.MustCompile(`[ :|\r\n\t,.。?/\` + "`" + `!&^%()\[\]{}<>]+`)
// txtCleaned is the text after removing punctuation and special characters.
txtCleaned := re.ReplaceAllString(txtSimplified, " ")
// Remove stop words
txtNoStopWords := qb.RmWWW(txtCleaned)
// Determine if text is Chinese
if !qb.IsChinese(txtNoStopWords) {
// Non-Chinese processing
// Reference: rag/nlp/query.py L52-88
// Remove stop words again
// txtFinal is the text after removing stop words again.
txtFinal := qb.RmWWW(txtNoStopWords)
// Tokenize using rag_tokenizer
tokenized, err := tokenizer.Tokenize(txtFinal)
if err != nil {
// If the tokenizer fails, fall back to the raw text; the
// strings.Fields call below still splits it on whitespace.
tokenized = txtFinal
}
// tks are tokens obtained by splitting the tokenized text by whitespace.
tks := strings.Fields(tokenized)
// keywords stores the nonempty tokens as keywords.
keywords := make([]string, 0, len(tks))
for _, t := range tks {
if t != "" {
keywords = append(keywords, t)
}
}
// Calculate term weights using TermWeightDealer
// Reference: rag/nlp/query.py L56
// tws holds the term weight list for each token.
tws := qb.termWeight.Weights(tks, false)
// Clean tokens and filter
// Reference: rag/nlp/query.py L57-60
type tokenWeight struct {
tk string
w float64
}
// Compile the cleanup patterns once instead of on every loop iteration.
quoteRe := regexp.MustCompile(`[ \"'^]+`)
singleAlnumRe := regexp.MustCompile(`^[a-z0-9]$`)
leadingSignRe := regexp.MustCompile(`^[\+\-]+`)
// tksW holds the cleaned tokens with their weights.
var tksW []tokenWeight
for _, tw := range tws {
tk := tw.Term
w := tw.Weight
// Strip spaces, quotes, and carets
tk = quoteRe.ReplaceAllString(tk, "")
// Drop single alphanumeric characters
tk = singleAlnumRe.ReplaceAllString(tk, "")
// Strip leading + and - signs
tk = leadingSignRe.ReplaceAllString(tk, "")
tk = strings.TrimSpace(tk)
if tk == "" {
continue
}
tksW = append(tksW, tokenWeight{tk, w})
}
// Limit to 256 tokens
// Reference: rag/nlp/query.py L62
if len(tksW) > 256 {
tksW = tksW[:256]
}
// TODO: Synonym expansion (reference L61-67)
// For now, use empty synonyms
// syns is a placeholder for synonym expansion (currently empty).
syns := make([]string, len(tksW))
// Build query parts
// Reference: rag/nlp/query.py L69-70
// q collects the query part strings.
var q []string
for i, tw := range tksW {
tk := tw.tk
w := tw.w
// Skip tokens with special regex chars
if matched, _ := regexp.MatchString(`[.^+\(\)-]`, tk); matched {
continue
}
// Format: (token^weight synonym)
q = append(q, fmt.Sprintf("(%s^%.4f %s)", tk, w, syns[i]))
}
// Add phrase queries for adjacent tokens
// Reference: rag/nlp/query.py L71-82
for i := 1; i < len(tksW); i++ {
left := strings.TrimSpace(tksW[i-1].tk)
right := strings.TrimSpace(tksW[i].tk)
if left == "" || right == "" {
continue
}
// maxW is the maximum weight between two adjacent tokens.
maxW := tksW[i-1].w
if tksW[i].w > maxW {
maxW = tksW[i].w
}
q = append(q, fmt.Sprintf(`"%s %s"^%.4f`, left, right, maxW*2))
}
if len(q) == 0 {
q = append(q, txtFinal)
}
// query is the final query string built from all query parts.
query := strings.Join(q, " ")
return &infinity.MatchTextExpr{
Fields: qb.queryFields,
MatchingText: query,
TopN: 100,
ExtraOptions: map[string]interface{}{
"original_query": originalQuery,
},
}, keywords
}
// Chinese processing
// Reference: rag/nlp/query.py L88-172
// Save original text before removing stop words (for fallback)
// otxt holds the original text before removing stop words, used as fallback.
otxt := txtNoStopWords
// Remove stop words for Chinese processing
// txtChinese is the text after removing stop words for Chinese processing.
txtChinese := qb.RmWWW(txtNoStopWords)
// qs collects query strings for each segment.
var qs []string
// keywords stores keywords extracted from segments.
var keywords []string
// Split text and process each segment (limit to 256)
// segments are the text segments after splitting by term weight.
segments := qb.termWeight.Split(txtChinese)
if len(segments) > 256 {
segments = segments[:256]
}
for _, segment := range segments {
if segment == "" {
continue
}
keywords = append(keywords, segment)
// Get term weights
// termWeightList holds term weights for the current segment.
termWeightList := qb.termWeight.Weights([]string{segment}, true)
// Lookup synonyms
// syns are synonyms for the current segment.
syns := qb.synonym.Lookup(segment, 8)
if len(syns) > 0 && len(keywords) < 32 {
keywords = append(keywords, syns...)
}
// Sort by weight descending
sort.Slice(termWeightList, func(i, j int) bool {
return termWeightList[i].Weight > termWeightList[j].Weight
})
// terms stores term strings with their weights for the current segment.
var terms []struct {
term string
weight float64
}
for _, termWeight := range termWeightList {
term := termWeight.Term
weight := termWeight.Weight
// Fine-grained tokenization if needed
// sm holds fine-grained tokens for the current term.
var sm []string
if qb.NeedFineGrainedTokenize(term) {
fineGrained, err := tokenizer.FineGrainedTokenize(term)
if err == nil && fineGrained != "" {
sm = strings.Fields(fineGrained)
}
}
// Clean special characters from sm
// cleanSm holds cleaned fine-grained tokens with special characters removed.
var cleanSm []string
// specialCharRe is the regex pattern for matching special characters.
specialCharRe := regexp.MustCompile(`[,\.\/;'\[\]\\\` + "`" + `~!@#$%\^&\*\(\)=\+_<>\?:"\{\}\|,。;'‘’【】、!¥……()——《》?:"""-]+`)
for _, m := range sm {
m = specialCharRe.ReplaceAllString(m, "")
m = qb.SubSpecialChar(m)
if len(m) > 1 {
cleanSm = append(cleanSm, m)
}
}
sm = cleanSm
// Add to keywords if under limit
if len(keywords) < 32 {
// cleanTk is the term with quotes and spaces removed.
cleanTk := regexp.MustCompile(`[ \"']+`).ReplaceAllString(term, "")
if cleanTk != "" {
keywords = append(keywords, cleanTk)
}
keywords = append(keywords, sm...)
}
// Lookup synonyms for this token
// tkSyns are synonyms for the current term.
tkSyns := qb.synonym.Lookup(term, 8)
for i, s := range tkSyns {
tkSyns[i] = qb.SubSpecialChar(s)
}
if len(keywords) < 32 {
for _, s := range tkSyns {
if s != "" {
keywords = append(keywords, s)
}
}
}
// Fine-grained tokenize synonyms
// fineGrainedSyns holds fine-grained tokenized synonyms.
var fineGrainedSyns []string
for _, s := range tkSyns {
if s == "" {
continue
}
fg, err := tokenizer.FineGrainedTokenize(s)
if err == nil && fg != "" {
// Quote if contains space
if strings.Contains(fg, " ") {
fg = fmt.Sprintf(`"%s"`, fg)
}
fineGrainedSyns = append(fineGrainedSyns, fg)
}
}
if len(keywords) >= 32 {
break
}
// Clean token for query
term = qb.SubSpecialChar(term)
if term == "" {
continue
}
// Quote if contains space
if strings.Contains(term, " ") {
term = fmt.Sprintf(`"%s"`, term)
}
// Build query part with synonyms
if len(fineGrainedSyns) > 0 {
term = fmt.Sprintf("(%s OR (%s)^0.2)", term, strings.Join(fineGrainedSyns, " "))
}
if len(sm) > 0 {
smStr := strings.Join(sm, " ")
term = fmt.Sprintf(`%s OR "%s" OR ("%s"~2)^0.5`, term, smStr, smStr)
}
terms = append(terms, struct {
term string
weight float64
}{term, weight})
}
// Build query string for this segment
// termParts collects query parts for each term in the segment.
var termParts []string
for _, termWeight := range terms {
termParts = append(termParts, fmt.Sprintf("(%s)^%.4f", termWeight.term, termWeight.weight))
}
// tmsStr is the query string for the current segment.
tmsStr := strings.Join(termParts, " ")
// Add proximity query if multiple tokens
if len(termWeightList) > 1 {
// tokenized is the tokenized version of the segment.
tokenized, _ := tokenizer.Tokenize(segment)
if tokenized != "" {
tmsStr += fmt.Sprintf(` ("%s"~2)^1.5`, tokenized)
}
}
// Add segment-level synonyms
if len(syns) > 0 && tmsStr != "" {
// synParts collects synonym query parts.
var synParts []string
for _, s := range syns {
s = qb.SubSpecialChar(s)
if s != "" {
tokenized, _ := tokenizer.Tokenize(s)
if tokenized != "" {
synParts = append(synParts, fmt.Sprintf(`"%s"`, tokenized))
}
}
}
if len(synParts) > 0 {
tmsStr = fmt.Sprintf("(%s)^5 OR (%s)^0.7", tmsStr, strings.Join(synParts, " OR "))
}
}
// Segments that yield no query parts are skipped.
if tmsStr != "" {
qs = append(qs, tmsStr)
}
}
// Build final query
if len(qs) > 0 {
// queryParts collects final query parts for each segment.
var queryParts []string
for _, q := range qs {
if q != "" {
queryParts = append(queryParts, fmt.Sprintf("(%s)", q))
}
}
// query is the final query string built from all segments.
query := strings.Join(queryParts, " OR ")
if query == "" {
query = otxt
}
return &infinity.MatchTextExpr{
Fields: qb.queryFields,
MatchingText: query,
TopN: 100,
ExtraOptions: map[string]interface{}{
"minimum_should_match": minMatch,
"original_query": originalQuery,
},
}, keywords
}
return nil, keywords
}
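
// Illustrative use of Question (the table name and minMatch value here are
// hypothetical):
//
//	expr, kwds := qb.Question("RAGFlow是什么", "chunks", 0.3)
//	// expr.MatchingText carries the weighted boolean query; for Chinese input,
//	// minMatch is forwarded via ExtraOptions["minimum_should_match"], and kwds
//	// holds the extracted keywords and synonyms (capped at 32).
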
// Paragraph builds a query expression based on content terms and keywords.
// References Python FulltextQueryer.paragraph method.
func (qb *QueryBuilder) Paragraph(contentTks string, keywords []string, keywordsTopN int) *infinity.MatchTextExpr {
// Simplified implementation: merge keywords and content terms
allTerms := make([]string, 0, len(keywords))
for _, k := range keywords {
k = strings.TrimSpace(k)
if k != "" {
allTerms = append(allTerms, `"`+k+`"`)
}
}
// Limit number of keywords
if keywordsTopN > 0 && len(allTerms) > keywordsTopN {
allTerms = allTerms[:keywordsTopN]
}
// Could add content term processing here, e.g., tokenization, weight calculation
// Currently only uses keywords
query := strings.Join(allTerms, " ")
// minimum_should_match would be len(allTerms)/10 with a floor of 3, but it
// is not yet wired into ExtraOptions.
return &infinity.MatchTextExpr{
Fields: qb.queryFields,
MatchingText: query,
TopN: 100,
}
}
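
// Illustrative example: each keyword is quoted and joined with spaces.
//
//	expr := qb.Paragraph("", []string{"vector database", "RAG"}, 30)
//	// expr.MatchingText == `"vector database" "RAG"`
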
// Similarity calculates similarity between two term weight dictionaries.
// Algorithm: s = sum(qtwt[k] for k in qtwt if k in dtwt) / sum(qtwt[k])
func (qb *QueryBuilder) Similarity(qtwt map[string]float64, dtwt map[string]float64) float64 {
if len(qtwt) == 0 {
return 0.0
}
var sum float64
for k, v := range qtwt {
if _, ok := dtwt[k]; ok {
sum += v
}
}
var total float64
for _, v := range qtwt {
total += v
}
if total == 0 {
return 0.0
}
return sum / total
}
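
// Worked example:
//
//	q := map[string]float64{"rag": 0.6, "flow": 0.3, "go": 0.1}
//	d := map[string]float64{"rag": 0.8, "go": 0.2}
//	qb.Similarity(q, d) // (0.6 + 0.1) / (0.6 + 0.3 + 0.1) = 0.7
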
// TokenSimilarity calculates similarity between query terms and multiple document term sets.
// To be implemented: requires term weight processing module.
func (qb *QueryBuilder) TokenSimilarity(atks string, btkss []string) []float64 {
// Placeholder implementation: make already zero-initializes the slice.
return make([]float64, len(btkss))
}
// HybridSimilarity calculates weighted combination of vector similarity and term similarity.
// To be implemented: requires vector cosine similarity calculation.
func (qb *QueryBuilder) HybridSimilarity(avec []float64, bvecs [][]float64, atks string, btkss []string, tkweight float64, vtweight float64) ([]float64, []float64, []float64) {
// Placeholder implementation, returns zero values
n := len(btkss)
sims := make([]float64, n)
tksim := make([]float64, n)
vecsim := make([]float64, n)
return sims, tksim, vecsim
}
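
// A minimal sketch of the intended combination inside the placeholder above,
// assuming tksim and vecsim have been computed (the Python reference weights
// them as tkweight*token_similarity + vtweight*vector_cosine):
//
//	for i := range sims {
//		sims[i] = tkweight*tksim[i] + vtweight*vecsim[i]
//	}
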
// SetQueryFields sets the list of query fields.
func (qb *QueryBuilder) SetQueryFields(fields []string) {
qb.queryFields = fields
}