Implement GetChunk() in Infinity in GO (#13758)
### What problem does this PR solve?

Implement GetChunk() in Infinity in Go.

Adds CLI commands:

- `GET CHUNK 'XXX';`
- `LIST CHUNKS OF DOCUMENT 'XXX';`

### Type of change

- [x] Refactoring
internal/engine/infinity/get.go (new file, 219 lines)
@@ -0,0 +1,219 @@
//
// Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

package infinity

import (
	"context"
	"fmt"
	"strings"

	infinity "github.com/infiniflow/infinity-go-sdk"
	"ragflow/internal/logger"
	"ragflow/internal/utility"

	"go.uber.org/zap"
)

// GetChunk gets a chunk by ID
func (e *infinityEngine) GetChunk(ctx context.Context, tableName, chunkID string, kbIDs []string) (interface{}, error) {
	if e.client == nil || e.client.conn == nil {
		return nil, fmt.Errorf("Infinity client not initialized")
	}

	// Build list of table names to search
	var tableNames []string
	if strings.HasPrefix(tableName, "ragflow_doc_meta_") {
		tableNames = []string{tableName}
	} else {
		// Search in tables like <tableName>_<kb_id> for each kbID
		if len(kbIDs) > 0 {
			for _, kbID := range kbIDs {
				tableNames = append(tableNames, fmt.Sprintf("%s_%s", tableName, kbID))
			}
		}
		// Also try the base tableName
		tableNames = append(tableNames, tableName)
	}
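	// For example (hypothetical names): tableName "ragflow_t1" with kbIDs
	// ["kb_a", "kb_b"] yields ["ragflow_t1_kb_a", "ragflow_t1_kb_b", "ragflow_t1"];
	// a "ragflow_doc_meta_" table is searched as-is.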

	// Try each table and collect results from all tables
	db, err := e.client.conn.GetDatabase(e.client.dbName)
	if err != nil {
		return nil, fmt.Errorf("failed to get database: %w", err)
	}

	// Collect chunks from all tables (same as Python's concat_dataframes)
	allChunks := make(map[string]map[string]interface{})

	for _, tblName := range tableNames {
		table, err := db.GetTable(tblName)
		if err != nil {
			continue
		}

		// Query with filter for the specific chunk ID
		filter := fmt.Sprintf("id = '%s'", chunkID)
		result, err := table.Output([]string{"*"}).Filter(filter).ToResult()
		if err != nil {
			continue
		}

		qr, ok := result.(*infinity.QueryResult)
		if !ok {
			continue
		}

		if len(qr.Data) == 0 {
			continue
		}

		// Convert to chunk format
		chunks := make([]map[string]interface{}, 0)
		for colName, colData := range qr.Data {
			for i, val := range colData {
				for len(chunks) <= i {
					chunks = append(chunks, make(map[string]interface{}))
				}
				chunks[i][colName] = val
			}
		}
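		// e.g. (hypothetical values) qr.Data = {"id": ["c1"], "content": ["hi"]}
		// becomes chunks = [{"id": "c1", "content": "hi"}]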

		// Merge chunks into allChunks (by id), keeping the first non-empty value
		for _, chunk := range chunks {
			if idVal, ok := chunk["id"].(string); ok {
				if existing, exists := allChunks[idVal]; exists {
					// Merge: only overwrite a field when the existing value is missing or empty
					for k, v := range chunk {
						if ev, has := existing[k]; !has || utility.IsEmpty(ev) {
							existing[k] = v
						}
					}
				} else {
					allChunks[idVal] = chunk
				}
			}
		}
	}

	// Get the chunk by chunkID
	chunk, found := allChunks[chunkID]
	if !found {
		return nil, nil
	}

	getFields(chunk)

	logger.Debug("infinity get chunk", zap.String("chunkID", chunkID), zap.Any("tables", tableNames))

	return chunk, nil
}

// getFields applies field mappings to a chunk, similar to Python's get_fields function.
func getFields(chunk map[string]interface{}) {
	// Field mappings
	// docnm -> docnm_kwd, title_tks, title_sm_tks
	if val, ok := chunk["docnm"].(string); ok {
		chunk["docnm_kwd"] = val
		chunk["title_tks"] = val
		chunk["title_sm_tks"] = val
	}

	// important_keywords -> important_kwd (split by comma), important_tks
	if val, ok := chunk["important_keywords"].(string); ok {
		if val == "" {
			chunk["important_kwd"] = []interface{}{}
		} else {
			parts := strings.Split(val, ",")
			chunk["important_kwd"] = parts
		}
		chunk["important_tks"] = val
	} else {
		chunk["important_kwd"] = []interface{}{}
		chunk["important_tks"] = []interface{}{}
	}

	// questions -> question_kwd (split by newline), question_tks
	if val, ok := chunk["questions"].(string); ok {
		if val == "" {
			chunk["question_kwd"] = []interface{}{}
		} else {
			parts := strings.Split(val, "\n")
			chunk["question_kwd"] = parts
		}
		chunk["question_tks"] = val
	} else {
		chunk["question_kwd"] = []interface{}{}
		chunk["question_tks"] = []interface{}{}
	}

	// content -> content_with_weight, content_ltks, content_sm_ltks
	if val, ok := chunk["content"].(string); ok {
		chunk["content_with_weight"] = val
		chunk["content_ltks"] = val
		chunk["content_sm_ltks"] = val
	}

	// authors -> authors_tks, authors_sm_tks
	if val, ok := chunk["authors"].(string); ok {
		chunk["authors_tks"] = val
		chunk["authors_sm_tks"] = val
	}

	// position_int: convert from hex string to array format (grouped by 5)
	if val, ok := chunk["position_int"].(string); ok {
		chunk["position_int"] = utility.ConvertHexToPositionIntArray(val)
	} else {
		chunk["position_int"] = []interface{}{}
	}

	// Convert page_num_int and top_int from hex string to array
	for _, colName := range []string{"page_num_int", "top_int"} {
		if val, ok := chunk[colName].(string); ok && val != "" {
			chunk[colName] = utility.ConvertHexToIntArray(val)
		} else {
			chunk[colName] = []int{}
		}
	}

	// Post-process: convert nil/empty values to empty slices for array-like fields,
	// and split the remaining fields by "###" (except knowledge_graph_kwd, docnm_kwd,
	// important_kwd, question_kwd)
	kwdNoSplit := map[string]bool{
		"knowledge_graph_kwd": true, "docnm_kwd": true,
		"important_kwd": true, "question_kwd": true,
	}
	arrayFields := []string{
		"doc_type_kwd", "important_kwd", "important_tks", "question_tks",
		"question_kwd", "authors_tks", "authors_sm_tks", "title_tks",
		"title_sm_tks", "content_ltks", "content_sm_ltks",
	}
	for _, colName := range arrayFields {
		if val, ok := chunk[colName]; !ok || val == nil || val == "" {
			chunk[colName] = []interface{}{}
		} else if !kwdNoSplit[colName] {
			// Split by "###", dropping empty parts
			if strVal, ok := val.(string); ok && strings.Contains(strVal, "###") {
				parts := strings.Split(strVal, "###")
				var filtered []interface{}
				for _, p := range parts {
					if p != "" {
						filtered = append(filtered, p)
					}
				}
				chunk[colName] = filtered
			}
		}
	}
}
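To make the contract concrete, here is a minimal sketch of a call site. The engine value, table name, and IDs are hypothetical; only the signature and the `(nil, nil)` not-found behavior come from the code above:

```go
// Hypothetical caller: "engine" is an initialized *infinityEngine,
// table and ID values are made up for illustration.
chunk, err := engine.GetChunk(ctx, "ragflow_tenant1", "chunk_abc123", []string{"kb_1", "kb_2"})
if err != nil {
	return nil, err // client not initialized, or the database lookup failed
}
if chunk == nil {
	return nil, fmt.Errorf("chunk not found") // GetChunk returns (nil, nil) on a miss
}
// chunk is a map[string]interface{} with the getFields mappings already applied
```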

@@ -20,7 +20,7 @@ import (
 	"context"
 	"fmt"
 	"ragflow/internal/engine/types"
-	"strconv"
+	"ragflow/internal/utility"
 	"strings"
 	"unicode/utf8"
 
@@ -458,18 +458,25 @@ func (e *infinityEngine) searchUnified(ctx context.Context, req *types.SearchReq
 		}
 	}
 
+	// DocIDs filters by doc_id (document ID) to find all chunks belonging to a document
+	// This is used by ChunkService.List() to list all chunks for a document
 	if len(req.DocIDs) > 0 {
 		if len(req.DocIDs) == 1 {
-			filterParts = append(filterParts, fmt.Sprintf("id = '%s'", req.DocIDs[0]))
+			filterParts = append(filterParts, fmt.Sprintf("doc_id = '%s'", req.DocIDs[0]))
 		} else {
 			docIDs := strings.Join(req.DocIDs, "', '")
-			filterParts = append(filterParts, fmt.Sprintf("id IN ('%s')", docIDs))
+			filterParts = append(filterParts, fmt.Sprintf("doc_id IN ('%s')", docIDs))
 		}
 	}
 
-	if !isMetadataTable {
-		// Default filter for available chunks
-		filterParts = append(filterParts, "available_int=1")
+	// Only add available_int filter when there's text/vector match or AvailableInt is explicitly set
+	// This matches Python's behavior where chunk_list doesn't filter by available_int
+	if !isMetadataTable && (hasTextMatch || hasVectorMatch || req.AvailableInt != nil) {
+		if req.AvailableInt != nil {
+			filterParts = append(filterParts, fmt.Sprintf("available_int=%d", *req.AvailableInt))
+		} else {
+			filterParts = append(filterParts, "available_int=1")
+		}
 	}
 
 	filterStr := strings.Join(filterParts, " AND ")
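To trace the filter construction with hypothetical IDs — `req.DocIDs = []string{"d1", "d2"}`, a text match present, and `req.AvailableInt` unset:

```go
docIDs := strings.Join([]string{"d1", "d2"}, "', '") // "d1', 'd2"
filter := fmt.Sprintf("doc_id IN ('%s')", docIDs)    // "doc_id IN ('d1', 'd2')"
// filterParts = ["doc_id IN ('d1', 'd2')", "available_int=1"]
// filterStr   = "doc_id IN ('d1', 'd2') AND available_int=1"
```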

@@ -637,13 +644,13 @@ func calculateScores(chunks []map[string]interface{}, scoreColumn, pagerankField
 	for i := range chunks {
 		score := 0.0
 		if scoreVal, ok := chunks[i][scoreColumn]; ok {
-			if f, ok := toFloat64(scoreVal); ok {
+			if f, ok := utility.ToFloat64(scoreVal); ok {
 				score += f
 				fmt.Printf("[DEBUG] chunk[%d]: %s=%f\n", i, scoreColumn, f)
 			}
 		}
 		if pagerankVal, ok := chunks[i][pagerankField]; ok {
-			if f, ok := toFloat64(pagerankVal); ok {
+			if f, ok := utility.ToFloat64(pagerankVal); ok {
 				score += f
 			}
 		}

@@ -699,27 +706,6 @@ func getScore(chunk map[string]interface{}) float64 {
 	return 0.0
 }
 
-func toFloat64(val interface{}) (float64, bool) {
-	switch v := val.(type) {
-	case float64:
-		return v, true
-	case float32:
-		return float64(v), true
-	case int:
-		return float64(v), true
-	case int64:
-		return float64(v), true
-	case string:
-		f, err := strconv.ParseFloat(v, 64)
-		if err != nil {
-			return 0, false
-		}
-		return f, true
-	default:
-		return 0, false
-	}
-}
-
 // executeTableSearch executes search on a single table
 func (e *infinityEngine) executeTableSearch(db *infinity.Database, tableName string, outputColumns []string, question string, vector []float64, filterStr string, topK, pageSize, offset int, orderBy *OrderByExpr, rankFeature map[string]float64, similarityThreshold float64, minMatch float64) (*types.SearchResponse, error) {
 	// Debug logging
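The removed local helper is replaced by the shared `utility.ToFloat64`, which, judging from the call sites above, keeps the same `(float64, bool)` contract. Assuming it mirrors the deleted implementation, usage looks like:

```go
// Assumed to behave like the deleted toFloat64: numeric types convert
// directly, strings go through strconv.ParseFloat, anything else fails.
if f, ok := utility.ToFloat64("2.5"); ok {
	score += f // 2.5
}
```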

@@ -937,6 +923,18 @@ func (e *infinityEngine) executeQuery(table *infinity.Table) (*types.SearchRespo
 				chunks[i][colName] = []interface{}{}
 			}
 		}
+		// Convert position_int from hex string to array format
+		if posVal, ok := chunks[i]["position_int"].(string); ok {
+			chunks[i]["position_int"] = utility.ConvertHexToPositionIntArray(posVal)
+		} else {
+			chunks[i]["position_int"] = []interface{}{}
+		}
+		// Convert page_num_int and top_int from hex string to array
+		for _, colName := range []string{"page_num_int", "top_int"} {
+			if val, ok := chunks[i][colName].(string); ok {
+				chunks[i][colName] = utility.ConvertHexToIntArray(val)
+			}
+		}
 	}
 
 	return &types.SearchResponse{
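The hex helpers live in `ragflow/internal/utility` and are not part of this diff, so their exact wire format is not visible here. As a rough, illustrative sketch of the idea behind `ConvertHexToPositionIntArray` — assuming fixed-width 8-hex-digit integers, with five values per position box (page, left, right, top, bottom, the grouping the "grouped by 5" comment in get.go suggests); both assumptions may differ from the real implementation:

```go
package main

import (
	"fmt"
	"strconv"
)

// convertHexToPositionIntArraySketch is illustrative only; the real decoding
// (width, order, error handling) belongs to ragflow/internal/utility.
func convertHexToPositionIntArraySketch(s string) [][]int {
	const width = 8 // assumed: 8 hex digits per integer
	var ints []int
	for i := 0; i+width <= len(s); i += width {
		v, err := strconv.ParseInt(s[i:i+width], 16, 64)
		if err != nil {
			return nil
		}
		ints = append(ints, int(v))
	}
	var out [][]int
	for i := 0; i+5 <= len(ints); i += 5 {
		// one group of five per box: page, left, right, top, bottom (assumed order)
		out = append(out, ints[i:i+5])
	}
	return out
}

func main() {
	// 1, 10, 200, 30, 400 encoded as five 8-hex-digit integers
	s := "000000010000000a000000c80000001e00000190"
	fmt.Println(convertHexToPositionIntArraySketch(s)) // [[1 10 200 30 400]]
}
```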