feat: Add summary index for knowledge. (#31625)

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com> Co-authored-by: Jyong <76649700+JohnJyong@users.noreply.github.com> Co-authored-by: zxhlyh <jasonapring2015@outlook.com> Co-authored-by: Yansong Zhang <916125788@qq.com> Co-authored-by: hj24 <mambahj24@gmail.com> Co-authored-by: CodingOnStar <hanxujiang@dify.ai> Co-authored-by: CodingOnStar <hanxujiang@dify.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2026-05-02 08:28:03 +08:00 · 2026-01-29 13:47:35 +08:00
parent 5ce3a04a2c
commit c2473d85dc
51 changed files with 3797 additions and 60 deletions
--- a/api/core/rag/datasource/retrieval_service.py
+++ b/api/core/rag/datasource/retrieval_service.py
@ -24,7 +24,13 @@ from core.rag.rerank.rerank_type import RerankMode
 from core.rag.retrieval.retrieval_methods import RetrievalMethod
 from core.tools.signature import sign_upload_file
 from extensions.ext_database import db
-from models.dataset import ChildChunk, Dataset, DocumentSegment, SegmentAttachmentBinding
+from models.dataset import (
+    ChildChunk,
+    Dataset,
+    DocumentSegment,
+    DocumentSegmentSummary,
+    SegmentAttachmentBinding,
+)
 from models.dataset import Document as DatasetDocument
 from models.model import UploadFile
 from services.external_knowledge_service import ExternalDatasetService
@ -389,15 +395,15 @@ class RetrievalService:
                .all()
            }

-            records = []
-            include_segment_ids = set()
-            segment_child_map = {}
-
            valid_dataset_documents = {}
            image_doc_ids: list[Any] = []
            child_index_node_ids = []
            index_node_ids = []
            doc_to_document_map = {}
+            summary_segment_ids = set()  # Track segments retrieved via summary
+            summary_score_map: dict[str, float] = {}  # Map original_chunk_id to summary score
+
+            # First pass: collect all document IDs and identify summary documents
            for document in documents:
                document_id = document.metadata.get("document_id")
                if document_id not in dataset_documents:
@ -408,16 +414,39 @@ class RetrievalService:
                    continue
                valid_dataset_documents[document_id] = dataset_document

+                doc_id = document.metadata.get("doc_id") or ""
+                doc_to_document_map[doc_id] = document
+
+                # Check if this is a summary document
+                is_summary = document.metadata.get("is_summary", False)
+                if is_summary:
+                    # For summary documents, find the original chunk via original_chunk_id
+                    original_chunk_id = document.metadata.get("original_chunk_id")
+                    if original_chunk_id:
+                        summary_segment_ids.add(original_chunk_id)
+                        # Save summary's score for later use
+                        summary_score = document.metadata.get("score")
+                        if summary_score is not None:
+                            try:
+                                summary_score_float = float(summary_score)
+                                # If the same segment has multiple summary hits, take the highest score
+                                if original_chunk_id not in summary_score_map:
+                                    summary_score_map[original_chunk_id] = summary_score_float
+                                else:
+                                    summary_score_map[original_chunk_id] = max(
+                                        summary_score_map[original_chunk_id], summary_score_float
+                                    )
+                            except (ValueError, TypeError):
+                                # Skip invalid score values
+                                pass
+                    continue  # Skip adding to other lists for summary documents
+
                if dataset_document.doc_form == IndexStructureType.PARENT_CHILD_INDEX:
-                    doc_id = document.metadata.get("doc_id") or ""
-                    doc_to_document_map[doc_id] = document
                    if document.metadata.get("doc_type") == DocType.IMAGE:
                        image_doc_ids.append(doc_id)
                    else:
                        child_index_node_ids.append(doc_id)
                else:
-                    doc_id = document.metadata.get("doc_id") or ""
-                    doc_to_document_map[doc_id] = document
                    if document.metadata.get("doc_type") == DocType.IMAGE:
                        image_doc_ids.append(doc_id)
                    else:
@ -433,6 +462,7 @@ class RetrievalService:
            attachment_map: dict[str, list[dict[str, Any]]] = {}
            child_chunk_map: dict[str, list[ChildChunk]] = {}
            doc_segment_map: dict[str, list[str]] = {}
+            segment_summary_map: dict[str, str] = {}  # Map segment_id to summary content

            with session_factory.create_session() as session:
                attachments = cls.get_segment_attachment_infos(image_doc_ids, session)
@ -447,6 +477,7 @@ class RetrievalService:
                        doc_segment_map[attachment["segment_id"]].append(attachment["attachment_id"])
                    else:
                        doc_segment_map[attachment["segment_id"]] = [attachment["attachment_id"]]
+
                child_chunk_stmt = select(ChildChunk).where(ChildChunk.index_node_id.in_(child_index_node_ids))
                child_index_nodes = session.execute(child_chunk_stmt).scalars().all()

@ -470,6 +501,7 @@ class RetrievalService:
                    index_node_segments = session.execute(document_segment_stmt).scalars().all()  # type: ignore
                    for index_node_segment in index_node_segments:
                        doc_segment_map[index_node_segment.id] = [index_node_segment.index_node_id]
+
                if segment_ids:
                    document_segment_stmt = select(DocumentSegment).where(
                        DocumentSegment.enabled == True,
@ -481,6 +513,40 @@ class RetrievalService:
                if index_node_segments:
                    segments.extend(index_node_segments)

+                # Handle summary documents: query segments by original_chunk_id
+                if summary_segment_ids:
+                    summary_segment_ids_list = list(summary_segment_ids)
+                    summary_segment_stmt = select(DocumentSegment).where(
+                        DocumentSegment.enabled == True,
+                        DocumentSegment.status == "completed",
+                        DocumentSegment.id.in_(summary_segment_ids_list),
+                    )
+                    summary_segments = session.execute(summary_segment_stmt).scalars().all()  # type: ignore
+                    segments.extend(summary_segments)
+                    # Add summary segment IDs to segment_ids for summary query
+                    for seg in summary_segments:
+                        if seg.id not in segment_ids:
+                            segment_ids.append(seg.id)
+
+                # Batch query summaries for segments retrieved via summary (only enabled summaries)
+                if summary_segment_ids:
+                    summaries = (
+                        session.query(DocumentSegmentSummary)
+                        .filter(
+                            DocumentSegmentSummary.chunk_id.in_(list(summary_segment_ids)),
+                            DocumentSegmentSummary.status == "completed",
+                            DocumentSegmentSummary.enabled == True,  # Only retrieve enabled summaries
+                        )
+                        .all()
+                    )
+                    for summary in summaries:
+                        if summary.summary_content:
+                            segment_summary_map[summary.chunk_id] = summary.summary_content
+
+            include_segment_ids = set()
+            segment_child_map: dict[str, dict[str, Any]] = {}
+            records: list[dict[str, Any]] = []
+
            for segment in segments:
                child_chunks: list[ChildChunk] = child_chunk_map.get(segment.id, [])
                attachment_infos: list[dict[str, Any]] = attachment_map.get(segment.id, [])
@ -489,30 +555,44 @@ class RetrievalService:
                if ds_dataset_document and ds_dataset_document.doc_form == IndexStructureType.PARENT_CHILD_INDEX:
                    if segment.id not in include_segment_ids:
                        include_segment_ids.add(segment.id)
+                        # Check if this segment was retrieved via summary
+                        # Use summary score as base score if available, otherwise 0.0
+                        max_score = summary_score_map.get(segment.id, 0.0)
+
                        if child_chunks or attachment_infos:
                            child_chunk_details = []
-                            max_score = 0.0
                            for child_chunk in child_chunks:
-                                document = doc_to_document_map[child_chunk.index_node_id]
+                                child_document: Document | None = doc_to_document_map.get(child_chunk.index_node_id)
+                                if child_document:
+                                    child_score = child_document.metadata.get("score", 0.0)
+                                else:
+                                    child_score = 0.0
                                child_chunk_detail = {
                                    "id": child_chunk.id,
                                    "content": child_chunk.content,
                                    "position": child_chunk.position,
-                                    "score": document.metadata.get("score", 0.0) if document else 0.0,
+                                    "score": child_score,
                                }
                                child_chunk_details.append(child_chunk_detail)
-                                max_score = max(max_score, document.metadata.get("score", 0.0) if document else 0.0)
+                                max_score = max(max_score, child_score)
                            for attachment_info in attachment_infos:
-                                file_document = doc_to_document_map[attachment_info["id"]]
-                                max_score = max(
-                                    max_score, file_document.metadata.get("score", 0.0) if file_document else 0.0
-                                )
+                                file_document = doc_to_document_map.get(attachment_info["id"])
+                                if file_document:
+                                    max_score = max(max_score, file_document.metadata.get("score", 0.0))

                            map_detail = {
                                "max_score": max_score,
                                "child_chunks": child_chunk_details,
                            }
                            segment_child_map[segment.id] = map_detail
+                        else:
+                            # No child chunks or attachments, use summary score if available
+                            summary_score = summary_score_map.get(segment.id)
+                            if summary_score is not None:
+                                segment_child_map[segment.id] = {
+                                    "max_score": summary_score,
+                                    "child_chunks": [],
+                                }
                        record: dict[str, Any] = {
                            "segment": segment,
                        }
@ -520,14 +600,23 @@ class RetrievalService:
                else:
                    if segment.id not in include_segment_ids:
                        include_segment_ids.add(segment.id)
-                        max_score = 0.0
-                        segment_document = doc_to_document_map.get(segment.index_node_id)
-                        if segment_document:
-                            max_score = max(max_score, segment_document.metadata.get("score", 0.0))
+
+                        # Check if this segment was retrieved via summary
+                        # Use summary score if available (summary retrieval takes priority)
+                        max_score = summary_score_map.get(segment.id, 0.0)
+
+                        # If not retrieved via summary, use original segment's score
+                        if segment.id not in summary_score_map:
+                            segment_document = doc_to_document_map.get(segment.index_node_id)
+                            if segment_document:
+                                max_score = max(max_score, segment_document.metadata.get("score", 0.0))
+
+                        # Also consider attachment scores
                        for attachment_info in attachment_infos:
                            file_doc = doc_to_document_map.get(attachment_info["id"])
                            if file_doc:
                                max_score = max(max_score, file_doc.metadata.get("score", 0.0))
+
                        record = {
                            "segment": segment,
                            "score": max_score,
@ -576,9 +665,16 @@ class RetrievalService:
                    else None
                )

+                # Extract summary if this segment was retrieved via summary
+                summary_content = segment_summary_map.get(segment.id)
+
                # Create RetrievalSegments object
                retrieval_segment = RetrievalSegments(
-                    segment=segment, child_chunks=child_chunks_list, score=score, files=files
+                    segment=segment,
+                    child_chunks=child_chunks_list,
+                    score=score,
+                    files=files,
+                    summary=summary_content,
                )
                result.append(retrieval_segment)