Implement GetChunk() in Infinity in GO (#13758)
### What problem does this PR solve?

Implement GetChunk() in Infinity in Go.

Adds CLI commands:

- `GET CHUNK 'XXX';`
- `LIST CHUNKS OF DOCUMENT 'XXX';`

### Type of change

- [x] Refactoring
internal/engine/infinity/get.go (new file, 219 lines)
@@ -0,0 +1,219 @@
//
// Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

package infinity

import (
	"context"
	"fmt"
	"strings"

	infinity "github.com/infiniflow/infinity-go-sdk"
	"ragflow/internal/logger"
	"ragflow/internal/utility"

	"go.uber.org/zap"
)

// GetChunk gets a chunk by ID
func (e *infinityEngine) GetChunk(ctx context.Context, tableName, chunkID string, kbIDs []string) (interface{}, error) {
	if e.client == nil || e.client.conn == nil {
		return nil, fmt.Errorf("Infinity client not initialized")
	}

	// Build list of table names to search
	var tableNames []string
	if strings.HasPrefix(tableName, "ragflow_doc_meta_") {
		tableNames = []string{tableName}
	} else {
		// Search in tables like <tableName>_<kb_id> for each kbID
		if len(kbIDs) > 0 {
			for _, kbID := range kbIDs {
				tableNames = append(tableNames, fmt.Sprintf("%s_%s", tableName, kbID))
			}
		}
		// Also try the base tableName
		tableNames = append(tableNames, tableName)
	}
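	// For example (hypothetical names): tableName "ragflow_t1" with kbIDs
	// ["kb_a", "kb_b"] yields ["ragflow_t1_kb_a", "ragflow_t1_kb_b", "ragflow_t1"];
	// a "ragflow_doc_meta_" table is searched as-is.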

	// Try each table and collect results from all tables
	db, err := e.client.conn.GetDatabase(e.client.dbName)
	if err != nil {
		return nil, fmt.Errorf("failed to get database: %w", err)
	}

	// Collect chunks from all tables (same as Python's concat_dataframes)
	allChunks := make(map[string]map[string]interface{})

	for _, tblName := range tableNames {
		table, err := db.GetTable(tblName)
		if err != nil {
			continue
		}

		// Query with filter for the specific chunk ID
		filter := fmt.Sprintf("id = '%s'", chunkID)
		result, err := table.Output([]string{"*"}).Filter(filter).ToResult()
		if err != nil {
			continue
		}

		qr, ok := result.(*infinity.QueryResult)
		if !ok {
			continue
		}

		if len(qr.Data) == 0 {
			continue
		}

		// Convert to chunk format
		chunks := make([]map[string]interface{}, 0)
		for colName, colData := range qr.Data {
			for i, val := range colData {
				for len(chunks) <= i {
					chunks = append(chunks, make(map[string]interface{}))
				}
				chunks[i][colName] = val
			}
		}
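		// e.g. (hypothetical values) qr.Data = {"id": ["c1"], "content": ["hi"]}
		// becomes chunks = [{"id": "c1", "content": "hi"}]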

		// Merge chunks into allChunks (by id), keeping the first non-empty value
		for _, chunk := range chunks {
			if idVal, ok := chunk["id"].(string); ok {
				if existing, exists := allChunks[idVal]; exists {
					// Merge: only overwrite a field when the existing value is missing or empty
					for k, v := range chunk {
						if ev, has := existing[k]; !has || utility.IsEmpty(ev) {
							existing[k] = v
						}
					}
				} else {
					allChunks[idVal] = chunk
				}
			}
		}
	}

	// Get the chunk by chunkID
	chunk, found := allChunks[chunkID]
	if !found {
		return nil, nil
	}

	getFields(chunk)

	logger.Debug("infinity get chunk", zap.String("chunkID", chunkID), zap.Any("tables", tableNames))

	return chunk, nil
}

// getFields applies field mappings to a chunk, similar to Python's get_fields function.
func getFields(chunk map[string]interface{}) {
	// Field mappings
	// docnm -> docnm_kwd, title_tks, title_sm_tks
	if val, ok := chunk["docnm"].(string); ok {
		chunk["docnm_kwd"] = val
		chunk["title_tks"] = val
		chunk["title_sm_tks"] = val
	}

	// important_keywords -> important_kwd (split by comma), important_tks
	if val, ok := chunk["important_keywords"].(string); ok {
		if val == "" {
			chunk["important_kwd"] = []interface{}{}
		} else {
			parts := strings.Split(val, ",")
			chunk["important_kwd"] = parts
		}
		chunk["important_tks"] = val
	} else {
		chunk["important_kwd"] = []interface{}{}
		chunk["important_tks"] = []interface{}{}
	}

	// questions -> question_kwd (split by newline), question_tks
	if val, ok := chunk["questions"].(string); ok {
		if val == "" {
			chunk["question_kwd"] = []interface{}{}
		} else {
			parts := strings.Split(val, "\n")
			chunk["question_kwd"] = parts
		}
		chunk["question_tks"] = val
	} else {
		chunk["question_kwd"] = []interface{}{}
		chunk["question_tks"] = []interface{}{}
	}

	// content -> content_with_weight, content_ltks, content_sm_ltks
	if val, ok := chunk["content"].(string); ok {
		chunk["content_with_weight"] = val
		chunk["content_ltks"] = val
		chunk["content_sm_ltks"] = val
	}

	// authors -> authors_tks, authors_sm_tks
	if val, ok := chunk["authors"].(string); ok {
		chunk["authors_tks"] = val
		chunk["authors_sm_tks"] = val
	}

	// position_int: convert from hex string to array format (grouped by 5)
	if val, ok := chunk["position_int"].(string); ok {
		chunk["position_int"] = utility.ConvertHexToPositionIntArray(val)
	} else {
		chunk["position_int"] = []interface{}{}
	}

	// Convert page_num_int and top_int from hex string to array
	for _, colName := range []string{"page_num_int", "top_int"} {
		if val, ok := chunk[colName].(string); ok && val != "" {
			chunk[colName] = utility.ConvertHexToIntArray(val)
		} else {
			chunk[colName] = []int{}
		}
	}

	// Post-process: convert nil/empty values to empty slices for array-like fields,
	// and split the remaining fields by "###" (except knowledge_graph_kwd, docnm_kwd,
	// important_kwd, question_kwd)
	kwdNoSplit := map[string]bool{
		"knowledge_graph_kwd": true, "docnm_kwd": true,
		"important_kwd": true, "question_kwd": true,
	}
	arrayFields := []string{
		"doc_type_kwd", "important_kwd", "important_tks", "question_tks",
		"question_kwd", "authors_tks", "authors_sm_tks", "title_tks",
		"title_sm_tks", "content_ltks", "content_sm_ltks",
	}
	for _, colName := range arrayFields {
		if val, ok := chunk[colName]; !ok || val == nil || val == "" {
			chunk[colName] = []interface{}{}
		} else if !kwdNoSplit[colName] {
			// Split by "###", dropping empty parts
			if strVal, ok := val.(string); ok && strings.Contains(strVal, "###") {
				parts := strings.Split(strVal, "###")
				var filtered []interface{}
				for _, p := range parts {
					if p != "" {
						filtered = append(filtered, p)
					}
				}
				chunk[colName] = filtered
			}
		}
	}
}
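To make the contract concrete, here is a minimal sketch of a call site. The engine value, table name, and IDs are hypothetical; only the signature and the `(nil, nil)` not-found behavior come from the code above:

```go
// Hypothetical caller: "engine" is an initialized *infinityEngine,
// table and ID values are made up for illustration.
chunk, err := engine.GetChunk(ctx, "ragflow_tenant1", "chunk_abc123", []string{"kb_1", "kb_2"})
if err != nil {
	return nil, err // client not initialized, or the database lookup failed
}
if chunk == nil {
	return nil, fmt.Errorf("chunk not found") // GetChunk returns (nil, nil) on a miss
}
// chunk is a map[string]interface{} with the getFields mappings already applied
```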

@@ -20,7 +20,7 @@ import (
 	"context"
 	"fmt"
 	"ragflow/internal/engine/types"
-	"strconv"
+	"ragflow/internal/utility"
 	"strings"
 	"unicode/utf8"
 
@@ -458,18 +458,25 @@ func (e *infinityEngine) searchUnified(ctx context.Context, req *types.SearchReq
 		}
 	}
 
+	// DocIDs filters by doc_id (document ID) to find all chunks belonging to a document
+	// This is used by ChunkService.List() to list all chunks for a document
 	if len(req.DocIDs) > 0 {
 		if len(req.DocIDs) == 1 {
-			filterParts = append(filterParts, fmt.Sprintf("id = '%s'", req.DocIDs[0]))
+			filterParts = append(filterParts, fmt.Sprintf("doc_id = '%s'", req.DocIDs[0]))
 		} else {
 			docIDs := strings.Join(req.DocIDs, "', '")
-			filterParts = append(filterParts, fmt.Sprintf("id IN ('%s')", docIDs))
+			filterParts = append(filterParts, fmt.Sprintf("doc_id IN ('%s')", docIDs))
 		}
 	}
 
-	if !isMetadataTable {
-		// Default filter for available chunks
-		filterParts = append(filterParts, "available_int=1")
+	// Only add available_int filter when there's text/vector match or AvailableInt is explicitly set
+	// This matches Python's behavior where chunk_list doesn't filter by available_int
+	if !isMetadataTable && (hasTextMatch || hasVectorMatch || req.AvailableInt != nil) {
+		if req.AvailableInt != nil {
+			filterParts = append(filterParts, fmt.Sprintf("available_int=%d", *req.AvailableInt))
+		} else {
+			filterParts = append(filterParts, "available_int=1")
+		}
 	}
 
 	filterStr := strings.Join(filterParts, " AND ")
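To trace the filter construction with hypothetical IDs — `req.DocIDs = []string{"d1", "d2"}`, a text match present, and `req.AvailableInt` unset:

```go
docIDs := strings.Join([]string{"d1", "d2"}, "', '") // "d1', 'd2"
filter := fmt.Sprintf("doc_id IN ('%s')", docIDs)    // "doc_id IN ('d1', 'd2')"
// filterParts = ["doc_id IN ('d1', 'd2')", "available_int=1"]
// filterStr   = "doc_id IN ('d1', 'd2') AND available_int=1"
```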

@@ -637,13 +644,13 @@ func calculateScores(chunks []map[string]interface{}, scoreColumn, pagerankField
 	for i := range chunks {
 		score := 0.0
 		if scoreVal, ok := chunks[i][scoreColumn]; ok {
-			if f, ok := toFloat64(scoreVal); ok {
+			if f, ok := utility.ToFloat64(scoreVal); ok {
 				score += f
 				fmt.Printf("[DEBUG] chunk[%d]: %s=%f\n", i, scoreColumn, f)
 			}
 		}
 		if pagerankVal, ok := chunks[i][pagerankField]; ok {
-			if f, ok := toFloat64(pagerankVal); ok {
+			if f, ok := utility.ToFloat64(pagerankVal); ok {
 				score += f
 			}
 		}

@@ -699,27 +706,6 @@ func getScore(chunk map[string]interface{}) float64 {
 	return 0.0
 }
 
-func toFloat64(val interface{}) (float64, bool) {
-	switch v := val.(type) {
-	case float64:
-		return v, true
-	case float32:
-		return float64(v), true
-	case int:
-		return float64(v), true
-	case int64:
-		return float64(v), true
-	case string:
-		f, err := strconv.ParseFloat(v, 64)
-		if err != nil {
-			return 0, false
-		}
-		return f, true
-	default:
-		return 0, false
-	}
-}
-
 // executeTableSearch executes search on a single table
 func (e *infinityEngine) executeTableSearch(db *infinity.Database, tableName string, outputColumns []string, question string, vector []float64, filterStr string, topK, pageSize, offset int, orderBy *OrderByExpr, rankFeature map[string]float64, similarityThreshold float64, minMatch float64) (*types.SearchResponse, error) {
 	// Debug logging
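The removed local helper is replaced by the shared `utility.ToFloat64`, which, judging from the call sites above, keeps the same `(float64, bool)` contract. Assuming it mirrors the deleted implementation, usage looks like:

```go
// Assumed to behave like the deleted toFloat64: numeric types convert
// directly, strings go through strconv.ParseFloat, anything else fails.
if f, ok := utility.ToFloat64("2.5"); ok {
	score += f // 2.5
}
```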

@@ -937,6 +923,18 @@ func (e *infinityEngine) executeQuery(table *infinity.Table) (*types.SearchRespo
 				chunks[i][colName] = []interface{}{}
 			}
 		}
+		// Convert position_int from hex string to array format
+		if posVal, ok := chunks[i]["position_int"].(string); ok {
+			chunks[i]["position_int"] = utility.ConvertHexToPositionIntArray(posVal)
+		} else {
+			chunks[i]["position_int"] = []interface{}{}
+		}
+		// Convert page_num_int and top_int from hex string to array
+		for _, colName := range []string{"page_num_int", "top_int"} {
+			if val, ok := chunks[i][colName].(string); ok {
+				chunks[i][colName] = utility.ConvertHexToIntArray(val)
+			}
+		}
 	}
 
 	return &types.SearchResponse{
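The hex helpers live in `ragflow/internal/utility` and are not part of this diff, so their exact wire format is not visible here. As a rough, illustrative sketch of the idea behind `ConvertHexToPositionIntArray` — assuming fixed-width 8-hex-digit integers, with five values per position box (page, left, right, top, bottom, the grouping the "grouped by 5" comment in get.go suggests); both assumptions may differ from the real implementation:

```go
package main

import (
	"fmt"
	"strconv"
)

// convertHexToPositionIntArraySketch is illustrative only; the real decoding
// (width, order, error handling) belongs to ragflow/internal/utility.
func convertHexToPositionIntArraySketch(s string) [][]int {
	const width = 8 // assumed: 8 hex digits per integer
	var ints []int
	for i := 0; i+width <= len(s); i += width {
		v, err := strconv.ParseInt(s[i:i+width], 16, 64)
		if err != nil {
			return nil
		}
		ints = append(ints, int(v))
	}
	var out [][]int
	for i := 0; i+5 <= len(ints); i += 5 {
		// one group of five per box: page, left, right, top, bottom (assumed order)
		out = append(out, ints[i:i+5])
	}
	return out
}

func main() {
	// 1, 10, 200, 30, 400 encoded as five 8-hex-digit integers
	s := "000000010000000a000000c80000001e00000190"
	fmt.Println(convertHexToPositionIntArraySketch(s)) // [[1 10 200 30 400]]
}
```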