perf: optimize DatasetRetrieval.retrieve, RetrievalService._deduplicat… (#29981)

2026-05-04 17:38:04 +08:00 · 2025-12-22 20:08:21 +08:00
parent 4d8223d517
commit eaf4146e2f
3 changed files with 201 additions and 145 deletions
--- a/api/core/rag/datasource/keyword/jieba/jieba.py
+++ b/api/core/rag/datasource/keyword/jieba/jieba.py
@@ -90,13 +90,17 @@ class Jieba(BaseKeyword):
        sorted_chunk_indices = self._retrieve_ids_by_query(keyword_table or {}, query, k)

        documents = []
+
+        segment_query_stmt = db.session.query(DocumentSegment).where(
+            DocumentSegment.dataset_id == self.dataset.id, DocumentSegment.index_node_id.in_(sorted_chunk_indices)
+        )
+        if document_ids_filter:
+            segment_query_stmt = segment_query_stmt.where(DocumentSegment.document_id.in_(document_ids_filter))
+
+        segments = db.session.execute(segment_query_stmt).scalars().all()
+        segment_map = {segment.index_node_id: segment for segment in segments}
        for chunk_index in sorted_chunk_indices:
-            segment_query = db.session.query(DocumentSegment).where(
-                DocumentSegment.dataset_id == self.dataset.id, DocumentSegment.index_node_id == chunk_index
-            )
-            if document_ids_filter:
-                segment_query = segment_query.where(DocumentSegment.document_id.in_(document_ids_filter))
-            segment = segment_query.first()
+            segment = segment_map.get(chunk_index)

            if segment:
                documents.append(