# RAGFlow Go Implementation Plan 🚀

This repository tracks the progress of porting RAGFlow to Go. We'll
implement core features and provide performance comparisons between
Python and Go versions.

## Implementation Checklist

- [x] User Management APIs
- [x] Dataset Management Operations
- [x] Retrieval Test
- [x] Chat Management Operations
- [x] Infinity Go SDK

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Yingfeng Zhang <yingfeng.zhang@gmail.com>

// Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package nlp
import (
"fmt"
"path/filepath"
"regexp"
"sort"
"strings"
"sync"
"ragflow/internal/engine/infinity"
"ragflow/internal/tokenizer"
"github.com/siongui/gojianfan"
)
var (
// globalQueryBuilder is the global query builder instance
globalQueryBuilder *QueryBuilder
// qbOnce ensures the query builder is initialized only once
qbOnce sync.Once
// qbInitError stores any error during initialization
qbInitError error
)
// QueryBuilder provides functionality to build query expressions based on text, referencing Python's FulltextQueryer and QueryBase.
type QueryBuilder struct {
queryFields []string
termWeight *TermWeightDealer
synonym *Synonym
}
// InitQueryBuilder initializes the global QueryBuilder with the given wordnet directory.
// It should be called during the initialization phase of main.go, after tokenizer.Init.
// The wordnetDir is typically filepath.Join(tokenizer.Config.DictPath, "wordnet")
func InitQueryBuilder(wordnetDir string) error {
qbOnce.Do(func() {
globalQueryBuilder = &QueryBuilder{
queryFields: []string{
"title_tks^10",
"title_sm_tks^5",
"important_kwd^30",
"important_tks^20",
"question_tks^20",
"content_ltks^2",
"content_sm_ltks",
},
termWeight: NewTermWeightDealer(""),
synonym: NewSynonym(nil, "", wordnetDir),
}
})
return qbInitError
}
// InitQueryBuilderFromTokenizer initializes the global QueryBuilder using tokenizer's DictPath.
// The wordnet directory is derived from tokenizer's DictPath as: DictPath/wordnet
// This should be called after tokenizer.Init().
func InitQueryBuilderFromTokenizer(tokenizerDictPath string) error {
wordnetDir := filepath.Join(tokenizerDictPath, "wordnet")
return InitQueryBuilder(wordnetDir)
}
// GetQueryBuilder returns the global QueryBuilder instance.
// Returns nil if InitQueryBuilder has not been called.
func GetQueryBuilder() *QueryBuilder {
return globalQueryBuilder
}
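
// Illustrative startup wiring (a sketch; error handling and the exact call
// site are up to the application):
//
//	// after tokenizer.Init() has succeeded:
//	if err := nlp.InitQueryBuilderFromTokenizer(tokenizer.Config.DictPath); err != nil {
//		panic(err)
//	}
//	qb := nlp.GetQueryBuilder()
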
// NewQueryBuilder creates a new QueryBuilder with default query fields.
// Deprecated: Use GetQueryBuilder() to get the global instance for better performance.
func NewQueryBuilder() *QueryBuilder {
return &QueryBuilder{
queryFields: []string{
"title_tks^10",
"title_sm_tks^5",
"important_kwd^30",
"important_tks^20",
"question_tks^20",
"content_ltks^2",
"content_sm_ltks",
},
termWeight: NewTermWeightDealer(""),
synonym: NewSynonym(nil, "", ""),
}
}
// IsChinese determines whether a line of text is primarily Chinese.
// Algorithm: split the line on whitespace; with 3 or fewer segments, assume Chinese;
// otherwise report Chinese when at least 70% of the segments are not purely alphabetic.
func (qb *QueryBuilder) IsChinese(line string) bool {
fields := strings.Fields(line)
if len(fields) <= 3 {
return true
}
// Compile the pattern once per call rather than once per field.
alphaRe := regexp.MustCompile(`^[a-zA-Z]+$`)
nonAlpha := 0
for _, f := range fields {
if !alphaRe.MatchString(f) {
nonAlpha++
}
}
return float64(nonAlpha)/float64(len(fields)) >= 0.7
}
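
// Illustrative examples of the heuristic (inputs are hypothetical):
//
//	qb.IsChinese("什么 是 RAG")                        // true: only 3 whitespace segments
//	qb.IsChinese("what is retrieval augmented search") // false: all 5 segments are purely alphabetic
//	qb.IsChinese("检索 增强 生成 模型 RAG")             // true: 4 of 5 segments are non-alphabetic (0.8 >= 0.7)
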
// SubSpecialChar escapes special characters for use in queries.
func (qb *QueryBuilder) SubSpecialChar(line string) string {
// Regex matches : { } / [ ] - * " ( ) | + ~ ^ and prepends backslash
re := regexp.MustCompile(`([:{}/\[\]\-\*"\(\)\|\+~\^])`)
return re.ReplaceAllString(line, `\$1`)
}
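
// Illustrative example: metacharacters used by the query syntax are backslash-escaped.
//
//	qb.SubSpecialChar(`a+b (c)`) // `a\+b \(c\)`
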
// RmWWW removes common stop words and question words from queries.
func (qb *QueryBuilder) RmWWW(txt string) string {
patterns := []struct {
regex string
repl string
}{
// Chinese stop words
{`是*(怎么办|什么样的|哪家|一下|那家|请问|啥样|咋样了|什么时候|何时|何地|何人|是否|是不是|多少|哪里|怎么|哪儿|怎么样|如何|哪些|是啥|啥是|啊|吗|呢|吧|咋|什么|有没有|呀|谁|哪位|哪个)是*`, ""},
// English stop words (case-insensitive)
{`(^| )(what|who|how|which|where|why)('re|'s)? `, " "},
{`(^| )('s|'re|is|are|were|was|do|does|did|don't|doesn't|didn't|has|have|be|there|you|me|your|my|mine|just|please|may|i|should|would|wouldn't|will|won't|done|go|for|with|so|the|a|an|by|i'm|it's|he's|she's|they|they're|you're|as|by|on|in|at|up|out|down|of|to|or|and|if) `, " "},
}
original := txt
for _, p := range patterns {
re := regexp.MustCompile(`(?i)` + p.regex)
txt = re.ReplaceAllString(txt, p.repl)
}
if txt == "" {
txt = original
}
return txt
}
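
// Illustrative examples (the replacement is a single space, so a leading space can remain):
//
//	qb.RmWWW("what is ragflow") // " ragflow"
//	qb.RmWWW("什么是RAG")        // "RAG"
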
// AddSpaceBetweenEngZh adds spaces between English letters and Chinese characters to improve tokenization.
func (qb *QueryBuilder) AddSpaceBetweenEngZh(txt string) string {
// (ENG/ENG+NUM) + ZH: e.g., "ABC123中文" -> "ABC123 中文"
re1 := regexp.MustCompile(`([A-Za-z]+[0-9]*)([\x{4e00}-\x{9fa5}]+)`)
txt = re1.ReplaceAllString(txt, "$1 $2")
// ENG + ZH: e.g., "ABC中文" -> "ABC 中文"
re2 := regexp.MustCompile(`([A-Za-z])([\x{4e00}-\x{9fa5}]+)`)
txt = re2.ReplaceAllString(txt, "$1 $2")
// ZH + (ENG/ENG+NUM): e.g., "中文ABC123" -> "中文 ABC123"
re3 := regexp.MustCompile(`([\x{4e00}-\x{9fa5}]+)([A-Za-z]+[0-9]*)`)
txt = re3.ReplaceAllString(txt, "$1 $2")
// ZH + ENG: e.g., "中文ABC" -> "中文 ABC"
re4 := regexp.MustCompile(`([\x{4e00}-\x{9fa5}]+)([A-Za-z])`)
txt = re4.ReplaceAllString(txt, "$1 $2")
return txt
}
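
// Illustrative example:
//
//	qb.AddSpaceBetweenEngZh("RAG模型v2") // "RAG 模型 v2"
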
// StrFullWidth2HalfWidth converts full-width characters to half-width characters.
// Algorithm: For each character:
// - Full-width space (U+3000) is converted to half-width space (U+0020).
// - For other characters, subtract 0xFEE0 from its code point.
// - If the resulting code point is not in the half-width character range (0x0020 to 0x7E),
// the original character is kept.
func (qb *QueryBuilder) StrFullWidth2HalfWidth(ustring string) string {
var rstring strings.Builder
for _, uchar := range ustring {
insideCode := int32(uchar)
if insideCode == 0x3000 {
insideCode = 0x0020
} else {
insideCode -= 0xFEE0
}
if insideCode < 0x0020 || insideCode > 0x7E {
rstring.WriteRune(uchar)
} else {
rstring.WriteRune(insideCode)
}
}
return rstring.String()
}
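
// Illustrative example: full-width letters, digits, punctuation, and the
// ideographic space all map to their ASCII counterparts; half-width input
// passes through unchanged.
//
//	qb.StrFullWidth2HalfWidth("ＡＢＣ１２３！　ok") // "ABC123! ok"
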
// Traditional2Simplified converts traditional Chinese characters to simplified Chinese characters.
// Uses gojianfan library which provides conversion similar to Python's HanziConv.
func (qb *QueryBuilder) Traditional2Simplified(line string) string {
return gojianfan.T2S(line)
}
// NeedFineGrainedTokenize determines if fine-grained tokenization is needed for a token.
// Reference: rag/nlp/query.py L88-93
func (qb *QueryBuilder) NeedFineGrainedTokenize(tk string) bool {
// Count runes, not bytes, so multi-byte (e.g. Chinese) tokens follow the
// Python reference's character-based length check.
if utf8.RuneCountInString(tk) < 3 {
return false
}
if matched, _ := regexp.MatchString(`^[0-9a-z\.\+#_\*-]+$`, tk); matched {
return false
}
return true
}
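
// Illustrative examples:
//
//	qb.NeedFineGrainedTokenize("中文")       // false: fewer than 3 characters
//	qb.NeedFineGrainedTokenize("gpt-4o")     // false: matches the code-like token pattern
//	qb.NeedFineGrainedTokenize("数据库系统")  // true
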
// Question builds a full-text query expression based on input text.
// References Python's FulltextQueryer.question method.
// This is currently a simplified version that returns a basic MatchTextExpr;
// fuller integration of term weights and synonyms is planned.
func (qb *QueryBuilder) Question(txt string, tbl string, minMatch float64) (*infinity.MatchTextExpr, []string) {
// originalQuery stores the original input text for later use in query expression.
originalQuery := txt
// Add space between English and Chinese
txtWithSpaces := qb.AddSpaceBetweenEngZh(txt)
// Convert to lowercase (punctuation is stripped separately below)
txtLower := strings.ToLower(txtWithSpaces)
// Convert to half-width
txtHalfWidth := qb.StrFullWidth2HalfWidth(txtLower)
// Convert to simplified Chinese
txtSimplified := qb.Traditional2Simplified(txtHalfWidth)
// Replace punctuation and special characters with space
// Reference: rag/nlp/query.py L44-48
// re is the regex pattern for matching punctuation and special characters.
re := regexp.MustCompile(`[ :|\r\n\t,.。?/\` + "`" + `!&^%()\[\]{}<>]+`)
// txtCleaned is the text after removing punctuation and special characters.
txtCleaned := re.ReplaceAllString(txtSimplified, " ")
// Remove stop words
txtNoStopWords := qb.RmWWW(txtCleaned)
// Determine if text is Chinese
if !qb.IsChinese(txtNoStopWords) {
// Non-Chinese processing
// Reference: rag/nlp/query.py L52-88
// Remove stop words again
// txtFinal is the text after removing stop words again.
txtFinal := qb.RmWWW(txtNoStopWords)
// Tokenize using rag_tokenizer
tokenized, err := tokenizer.Tokenize(txtFinal)
if err != nil {
// If the tokenizer fails, fall back to the raw text; the
// strings.Fields call below still splits it on whitespace.
tokenized = txtFinal
}
// tks are tokens obtained by splitting the tokenized text by whitespace.
tks := strings.Fields(tokenized)
// keywords stores the nonempty tokens as keywords.
keywords := make([]string, 0, len(tks))
for _, t := range tks {
if t != "" {
keywords = append(keywords, t)
}
}
// Calculate term weights using TermWeightDealer
// Reference: rag/nlp/query.py L56
// tws holds the term weight list for each token.
tws := qb.termWeight.Weights(tks, false)
// Clean tokens and filter
// Reference: rag/nlp/query.py L57-60
type tokenWeight struct {
tk string
w float64
}
// Compile the cleanup patterns once instead of on every loop iteration.
quoteRe := regexp.MustCompile(`[ \"'^]+`)
singleAlnumRe := regexp.MustCompile(`^[a-z0-9]$`)
leadingSignRe := regexp.MustCompile(`^[\+\-]+`)
// tksW holds the cleaned tokens with their weights.
var tksW []tokenWeight
for _, tw := range tws {
tk := tw.Term
w := tw.Weight
// Strip spaces, quotes, and carets
tk = quoteRe.ReplaceAllString(tk, "")
// Drop single alphanumeric characters
tk = singleAlnumRe.ReplaceAllString(tk, "")
// Strip leading + and - signs
tk = leadingSignRe.ReplaceAllString(tk, "")
tk = strings.TrimSpace(tk)
if tk == "" {
continue
}
tksW = append(tksW, tokenWeight{tk, w})
}
// Limit to 256 tokens
// Reference: rag/nlp/query.py L62
if len(tksW) > 256 {
tksW = tksW[:256]
}
// TODO: Synonym expansion (reference L61-67)
// For now, use empty synonyms
// syns is a placeholder for synonym expansion (currently empty).
syns := make([]string, len(tksW))
// Build query parts
// Reference: rag/nlp/query.py L69-70
// q collects the query part strings.
var q []string
for i, tw := range tksW {
tk := tw.tk
w := tw.w
// Skip tokens with special regex chars
if matched, _ := regexp.MatchString(`[.^+\(\)-]`, tk); matched {
continue
}
// Format: (token^weight synonym)
q = append(q, fmt.Sprintf("(%s^%.4f %s)", tk, w, syns[i]))
}
// Add phrase queries for adjacent tokens
// Reference: rag/nlp/query.py L71-82
for i := 1; i < len(tksW); i++ {
left := strings.TrimSpace(tksW[i-1].tk)
right := strings.TrimSpace(tksW[i].tk)
if left == "" || right == "" {
continue
}
// maxW is the maximum weight between two adjacent tokens.
maxW := tksW[i-1].w
if tksW[i].w > maxW {
maxW = tksW[i].w
}
q = append(q, fmt.Sprintf(`"%s %s"^%.4f`, left, right, maxW*2))
}
if len(q) == 0 {
q = append(q, txtFinal)
}
// query is the final query string built from all query parts.
query := strings.Join(q, " ")
return &infinity.MatchTextExpr{
Fields: qb.queryFields,
MatchingText: query,
TopN: 100,
ExtraOptions: map[string]interface{}{
"original_query": originalQuery,
},
}, keywords
}
// Chinese processing
// Reference: rag/nlp/query.py L88-172
// Save original text before removing stop words (for fallback)
// otxt holds the original text before removing stop words, used as fallback.
otxt := txtNoStopWords
// Remove stop words for Chinese processing
// txtChinese is the text after removing stop words for Chinese processing.
txtChinese := qb.RmWWW(txtNoStopWords)
// qs collects query strings for each segment.
var qs []string
// keywords stores keywords extracted from segments.
var keywords []string
// Split text and process each segment (limit to 256)
// segments are the text segments after splitting by term weight.
segments := qb.termWeight.Split(txtChinese)
if len(segments) > 256 {
segments = segments[:256]
}
for _, segment := range segments {
if segment == "" {
continue
}
keywords = append(keywords, segment)
// Get term weights
// termWeightList holds term weights for the current segment.
termWeightList := qb.termWeight.Weights([]string{segment}, true)
// Lookup synonyms
// syns are synonyms for the current segment.
syns := qb.synonym.Lookup(segment, 8)
if len(syns) > 0 && len(keywords) < 32 {
keywords = append(keywords, syns...)
}
// Sort by weight descending
sort.Slice(termWeightList, func(i, j int) bool {
return termWeightList[i].Weight > termWeightList[j].Weight
})
// terms stores term strings with their weights for the current segment.
var terms []struct {
term string
weight float64
}
for _, termWeight := range termWeightList {
term := termWeight.Term
weight := termWeight.Weight
// Fine-grained tokenization if needed
// sm holds fine-grained tokens for the current term.
var sm []string
if qb.NeedFineGrainedTokenize(term) {
fineGrained, err := tokenizer.FineGrainedTokenize(term)
if err == nil && fineGrained != "" {
sm = strings.Fields(fineGrained)
}
}
// Clean special characters from sm
// cleanSm holds cleaned fine-grained tokens with special characters removed.
var cleanSm []string
// specialCharRe is the regex pattern for matching special characters.
specialCharRe := regexp.MustCompile(`[,\.\/;'\[\]\\\` + "`" + `~!@#$%\^&\*\(\)=\+_<>\?:"\{\}\|,。;'‘’【】、!¥……()——《》?:"""-]+`)
for _, m := range sm {
m = specialCharRe.ReplaceAllString(m, "")
m = qb.SubSpecialChar(m)
if len(m) > 1 {
cleanSm = append(cleanSm, m)
}
}
sm = cleanSm
// Add to keywords if under limit
if len(keywords) < 32 {
// cleanTk is the term with quotes and spaces removed.
cleanTk := regexp.MustCompile(`[ \"']+`).ReplaceAllString(term, "")
if cleanTk != "" {
keywords = append(keywords, cleanTk)
}
keywords = append(keywords, sm...)
}
// Lookup synonyms for this token
// tkSyns are synonyms for the current term.
tkSyns := qb.synonym.Lookup(term, 8)
for i, s := range tkSyns {
tkSyns[i] = qb.SubSpecialChar(s)
}
if len(keywords) < 32 {
for _, s := range tkSyns {
if s != "" {
keywords = append(keywords, s)
}
}
}
// Fine-grained tokenize synonyms
// fineGrainedSyns holds fine-grained tokenized synonyms.
var fineGrainedSyns []string
for _, s := range tkSyns {
if s == "" {
continue
}
fg, err := tokenizer.FineGrainedTokenize(s)
if err == nil && fg != "" {
// Quote if contains space
if strings.Contains(fg, " ") {
fg = fmt.Sprintf(`"%s"`, fg)
}
fineGrainedSyns = append(fineGrainedSyns, fg)
}
}
if len(keywords) >= 32 {
break
}
// Clean token for query
term = qb.SubSpecialChar(term)
if term == "" {
continue
}
// Quote if contains space
if strings.Contains(term, " ") {
term = fmt.Sprintf(`"%s"`, term)
}
// Build query part with synonyms
if len(fineGrainedSyns) > 0 {
term = fmt.Sprintf("(%s OR (%s)^0.2)", term, strings.Join(fineGrainedSyns, " "))
}
if len(sm) > 0 {
smStr := strings.Join(sm, " ")
term = fmt.Sprintf(`%s OR "%s" OR ("%s"~2)^0.5`, term, smStr, smStr)
}
terms = append(terms, struct {
term string
weight float64
}{term, weight})
}
// Build query string for this segment
// termParts collects query parts for each term in the segment.
var termParts []string
for _, termWeight := range terms {
termParts = append(termParts, fmt.Sprintf("(%s)^%.4f", termWeight.term, termWeight.weight))
}
// tmsStr is the query string for the current segment.
tmsStr := strings.Join(termParts, " ")
// Add proximity query if multiple tokens
if len(termWeightList) > 1 {
// tokenized is the tokenized version of the segment.
tokenized, _ := tokenizer.Tokenize(segment)
if tokenized != "" {
tmsStr += fmt.Sprintf(` ("%s"~2)^1.5`, tokenized)
}
}
// Add segment-level synonyms
if len(syns) > 0 && tmsStr != "" {
// synParts collects synonym query parts.
var synParts []string
for _, s := range syns {
s = qb.SubSpecialChar(s)
if s != "" {
tokenized, _ := tokenizer.Tokenize(s)
if tokenized != "" {
synParts = append(synParts, fmt.Sprintf(`"%s"`, tokenized))
}
}
}
if len(synParts) > 0 {
tmsStr = fmt.Sprintf("(%s)^5 OR (%s)^0.7", tmsStr, strings.Join(synParts, " OR "))
}
}
// Segments that yield no query parts are skipped.
if tmsStr != "" {
qs = append(qs, tmsStr)
}
}
// Build final query
if len(qs) > 0 {
// queryParts collects final query parts for each segment.
var queryParts []string
for _, q := range qs {
if q != "" {
queryParts = append(queryParts, fmt.Sprintf("(%s)", q))
}
}
// query is the final query string built from all segments.
query := strings.Join(queryParts, " OR ")
if query == "" {
query = otxt
}
return &infinity.MatchTextExpr{
Fields: qb.queryFields,
MatchingText: query,
TopN: 100,
ExtraOptions: map[string]interface{}{
"minimum_should_match": minMatch,
"original_query": originalQuery,
},
}, keywords
}
return nil, keywords
}
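
// Illustrative use of Question (the table name and minMatch value here are
// hypothetical):
//
//	expr, kwds := qb.Question("RAGFlow是什么", "chunks", 0.3)
//	// expr.MatchingText carries the weighted boolean query; for Chinese input,
//	// minMatch is forwarded via ExtraOptions["minimum_should_match"], and kwds
//	// holds the extracted keywords and synonyms (capped at 32).
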
// Paragraph builds a query expression based on content terms and keywords.
// References Python FulltextQueryer.paragraph method.
func (qb *QueryBuilder) Paragraph(contentTks string, keywords []string, keywordsTopN int) *infinity.MatchTextExpr {
// Simplified implementation: merge keywords and content terms
allTerms := make([]string, 0, len(keywords))
for _, k := range keywords {
k = strings.TrimSpace(k)
if k != "" {
allTerms = append(allTerms, `"`+k+`"`)
}
}
// Limit number of keywords
if keywordsTopN > 0 && len(allTerms) > keywordsTopN {
allTerms = allTerms[:keywordsTopN]
}
// Could add content term processing here, e.g., tokenization, weight calculation
// Currently only uses keywords
query := strings.Join(allTerms, " ")
// minimum_should_match would be len(allTerms)/10 with a floor of 3, but it
// is not yet wired into ExtraOptions.
return &infinity.MatchTextExpr{
Fields: qb.queryFields,
MatchingText: query,
TopN: 100,
}
}
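
// Illustrative example: each keyword is quoted and joined with spaces.
//
//	expr := qb.Paragraph("", []string{"vector database", "RAG"}, 30)
//	// expr.MatchingText == `"vector database" "RAG"`
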
// Similarity calculates similarity between two term weight dictionaries.
// Algorithm: s = sum(qtwt[k] for k in qtwt if k in dtwt) / sum(qtwt[k])
func (qb *QueryBuilder) Similarity(qtwt map[string]float64, dtwt map[string]float64) float64 {
if len(qtwt) == 0 {
return 0.0
}
var sum float64
for k, v := range qtwt {
if _, ok := dtwt[k]; ok {
sum += v
}
}
var total float64
for _, v := range qtwt {
total += v
}
if total == 0 {
return 0.0
}
return sum / total
}
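
// Worked example:
//
//	q := map[string]float64{"rag": 0.6, "flow": 0.3, "go": 0.1}
//	d := map[string]float64{"rag": 0.8, "go": 0.2}
//	qb.Similarity(q, d) // (0.6 + 0.1) / (0.6 + 0.3 + 0.1) = 0.7
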
// TokenSimilarity calculates similarity between query terms and multiple document term sets.
// To be implemented: requires term weight processing module.
func (qb *QueryBuilder) TokenSimilarity(atks string, btkss []string) []float64 {
// Placeholder implementation: make already zero-initializes the slice.
return make([]float64, len(btkss))
}
// HybridSimilarity calculates weighted combination of vector similarity and term similarity.
// To be implemented: requires vector cosine similarity calculation.
func (qb *QueryBuilder) HybridSimilarity(avec []float64, bvecs [][]float64, atks string, btkss []string, tkweight float64, vtweight float64) ([]float64, []float64, []float64) {
// Placeholder implementation, returns zero values
n := len(btkss)
sims := make([]float64, n)
tksim := make([]float64, n)
vecsim := make([]float64, n)
return sims, tksim, vecsim
}
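
// A minimal sketch of the intended combination inside the placeholder above,
// assuming tksim and vecsim have been computed (the Python reference weights
// them as tkweight*token_similarity + vtweight*vector_cosine):
//
//	for i := range sims {
//		sims[i] = tkweight*tksim[i] + vtweight*vecsim[i]
//	}
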
// SetQueryFields sets the list of query fields.
func (qb *QueryBuilder) SetQueryFields(fields []string) {
qb.queryFields = fields
}