Merge remote-tracking branch 'origin/feat/knowledgebase-summaryIndex' into feat/knowledgebase-summaryIndex

FFXN
2026-01-28 10:47:24 +08:00
12 changed files with 28 additions and 47 deletions

View File

@@ -450,4 +450,4 @@ Requirements:
Output only the summary text. Start summarizing now:
"""
)
)
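
Note: the hunk above is the tail of the summary-generation prompt template. As a minimal sketch of how a template ending this way is typically rendered and sent to a model (the SUMMARY_PROMPT constant and the llm.complete() call are illustrative placeholders, not this repository's actual API):

    # Illustrative placeholders; the real prompt text and model client live elsewhere in the codebase.
    SUMMARY_PROMPT = (
        "Summarize the following text.\n"
        "Requirements:\n"
        "Output only the summary text. Start summarizing now:\n"
        "{chunk_text}"
    )

    def generate_summary(llm, chunk_text: str) -> str:
        """Render the prompt for one chunk and return the model's summary."""
        prompt = SUMMARY_PROMPT.format(chunk_text=chunk_text)
        return llm.complete(prompt).strip()  # llm.complete() is an assumed interface.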

View File

@@ -554,7 +554,7 @@ class RetrievalService:
# Check if this segment was retrieved via summary
# Use summary score as base score if available, otherwise 0.0
max_score = summary_score_map.get(segment.id, 0.0)
if child_chunks or attachment_infos:
child_chunk_details = []
for child_chunk in child_chunks:
@@ -571,9 +571,7 @@ class RetrievalService:
for attachment_info in attachment_infos:
file_document = doc_to_document_map.get(attachment_info["id"])
if file_document:
-max_score = max(
-    max_score, file_document.metadata.get("score", 0.0)
-)
+max_score = max(max_score, file_document.metadata.get("score", 0.0))
map_detail = {
"max_score": max_score,
@@ -595,23 +593,23 @@ class RetrievalService:
else:
if segment.id not in include_segment_ids:
include_segment_ids.add(segment.id)
# Check if this segment was retrieved via summary
# Use summary score if available (summary retrieval takes priority)
max_score = summary_score_map.get(segment.id, 0.0)
# If not retrieved via summary, use original segment's score
if segment.id not in summary_score_map:
segment_document = doc_to_document_map.get(segment.index_node_id)
if segment_document:
max_score = max(max_score, segment_document.metadata.get("score", 0.0))
# Also consider attachment scores
for attachment_info in attachment_infos:
file_doc = doc_to_document_map.get(attachment_info["id"])
if file_doc:
max_score = max(max_score, file_doc.metadata.get("score", 0.0))
record = {
"segment": segment,
"score": max_score,

View File

@@ -309,10 +309,7 @@ class ParagraphIndexProcessor(BaseIndexProcessor):
errors: list[Exception] = []
with concurrent.futures.ThreadPoolExecutor(max_workers=min(10, len(preview_texts))) as executor:
-futures = [
-    executor.submit(process, preview)
-    for preview in preview_texts
-]
+futures = [executor.submit(process, preview) for preview in preview_texts]
# Wait for all tasks to complete with timeout
done, not_done = concurrent.futures.wait(futures, timeout=timeout_seconds)
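
The concurrency shape here (bounded thread pool, one wait call with an overall deadline) is self-contained enough to sketch on its own; run_with_deadline and its arguments are illustrative names, not this codebase's API:

    import concurrent.futures

    def run_with_deadline(items, worker, timeout_seconds: float):
        """Run worker over items concurrently and stop waiting at the deadline.

        Assumes items is non-empty, as the surrounding diff does.
        """
        with concurrent.futures.ThreadPoolExecutor(max_workers=min(10, len(items))) as executor:
            futures = [executor.submit(worker, item) for item in items]
            done, not_done = concurrent.futures.wait(futures, timeout=timeout_seconds)
            for future in not_done:
                # Best effort: queued tasks are cancelled; tasks already running
                # still finish, and the executor's exit waits for them.
                future.cancel()
        return done, not_done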

View File

@@ -362,7 +362,7 @@ class ParentChildIndexProcessor(BaseIndexProcessor):
For each parent chunk in preview_texts, concurrently call generate_summary to generate a summary
and write it to the summary attribute of PreviewDetail.
In preview mode (indexing-estimate), if any summary generation fails, the method will raise an exception.
Note: For parent-child structure, we only generate summaries for parent chunks.
"""
import concurrent.futures
@@ -379,6 +379,7 @@ class ParentChildIndexProcessor(BaseIndexProcessor):
def process(preview: PreviewDetail) -> None:
"""Generate summary for a single preview item (parent chunk)."""
from core.rag.index_processor.processor.paragraph_index_processor import ParagraphIndexProcessor
if flask_app:
# Ensure Flask app context in worker thread
with flask_app.app_context():
@@ -403,10 +404,7 @@ class ParentChildIndexProcessor(BaseIndexProcessor):
errors: list[Exception] = []
with concurrent.futures.ThreadPoolExecutor(max_workers=min(10, len(preview_texts))) as executor:
-futures = [
-    executor.submit(process, preview)
-    for preview in preview_texts
-]
+futures = [executor.submit(process, preview) for preview in preview_texts]
# Wait for all tasks to complete with timeout
done, not_done = concurrent.futures.wait(futures, timeout=timeout_seconds)
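
Per the docstring above, preview (indexing-estimate) mode is fail-fast. A sketch of how the errors list declared in this hunk is typically drained after the wait; the exact re-raise policy is an assumption inferred from the docstring, not shown in the diff:

    errors: list[Exception] = []
    for future in done:
        exc = future.exception()  # Returns None if the task succeeded.
        if exc is not None:
            errors.append(exc)
    if not_done:
        errors.append(TimeoutError(f"{len(not_done)} summary task(s) missed the deadline"))
    if errors:
        # Fail fast so the caller sees the first generation error.
        raise errors[0]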

View File

@@ -243,7 +243,7 @@ class QAIndexProcessor(BaseIndexProcessor):
) -> list[PreviewDetail]:
"""
QA model doesn't generate summaries, so this method returns preview_texts unchanged.
Note: QA model uses question-answer pairs, which don't require summary generation.
"""
# QA model doesn't generate summaries, return as-is

View File

@@ -241,13 +241,13 @@ class DatasetRetrieval:
segment_content = f"question:{segment.get_sign_content()} answer:{segment.answer}"
else:
segment_content = segment.get_sign_content()
# If summary exists, prepend it to the content
if record.summary:
final_content = f"{record.summary}\n{segment_content}"
else:
final_content = segment_content
document_context_list.append(
DocumentContext(
content=final_content,
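
As a concrete example of the prepend step: with record.summary set to "Overview of the refund policy." and a chunk body of "Refunds are issued within 14 days.", final_content becomes "Overview of the refund policy.\nRefunds are issued within 14 days.", so the generated summary always leads the raw chunk text in the retrieval context.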
@@ -321,7 +321,7 @@ class DatasetRetrieval:
else:
source.content = segment.content
# Add summary if this segment was retrieved via summary
-if hasattr(record, 'summary') and record.summary:
+if hasattr(record, "summary") and record.summary:
source.summary = record.summary
retrieval_resource_list.append(source)
if hit_callback and retrieval_resource_list:

View File

@@ -174,13 +174,13 @@ class DatasetRetrieverTool(DatasetRetrieverBaseTool):
segment_content = f"question:{segment.get_sign_content()} answer:{segment.answer}"
else:
segment_content = segment.get_sign_content()
# If summary exists, prepend it to the content
if record.summary:
final_content = f"{record.summary}\n{segment_content}"
else:
final_content = segment_content
document_context_list.append(
DocumentContext(
content=final_content,
@@ -221,7 +221,7 @@ class DatasetRetrieverTool(DatasetRetrieverBaseTool):
else:
source.content = segment.content
# Add summary if this segment was retrieved via summary
-if hasattr(record, 'summary') and record.summary:
+if hasattr(record, "summary") and record.summary:
source.summary = record.summary
retrieval_resource_list.append(source)

View File

@@ -385,7 +385,7 @@ class KnowledgeIndexNode(Node[KnowledgeIndexNodeData]):
# Set a reasonable timeout to prevent hanging (60 seconds per chunk, max 5 minutes total)
timeout_seconds = min(300, 60 * len(preview_output["preview"]))
errors: list[Exception] = []
with concurrent.futures.ThreadPoolExecutor(max_workers=min(10, len(preview_output["preview"]))) as executor:
futures = [
executor.submit(generate_summary_for_chunk, preview_item)
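
As a worked example of the timeout formula: a 3-chunk preview gets min(300, 60 * 3) = 180 seconds, while anything from 5 chunks upward hits the 300-second ceiling.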

View File

@@ -419,7 +419,7 @@ class KnowledgeRetrievalNode(LLMUsageTrackingMixin, Node[KnowledgeRetrievalNodeData]):
source["content"] = f"question:{segment.get_sign_content()} \nanswer:{segment.answer}"
else:
source["content"] = segment.get_sign_content()
# Add summary if available
if record.summary:
source["summary"] = record.summary
retrieval_resource_list.append(source)

View File

@@ -748,7 +748,7 @@ class LLMNode(Node[LLMNodeData]):
page=metadata.get("page"),
doc_metadata=metadata.get("doc_metadata"),
files=context_dict.get("files"),
summary=context_dict.get("summary"),
)
return source

View File

@@ -3384,9 +3384,7 @@ class SegmentService:
SummaryIndexService.generate_and_vectorize_summary(
segment, dataset, dataset.summary_index_setting
)
-logger.info(
-    "Auto-regenerated summary for segment %s after content change", segment.id
-)
+logger.info("Auto-regenerated summary for segment %s after content change", segment.id)
except Exception:
logger.exception("Failed to auto-regenerate summary for segment %s", segment.id)
# Don't fail the entire update if summary regeneration fails
@@ -3400,9 +3398,7 @@ class SegmentService:
try:
SummaryIndexService.update_summary_for_segment(segment, dataset, args.summary)
-logger.info(
-    "Updated summary for segment %s with user-provided content", segment.id
-)
+logger.info("Updated summary for segment %s with user-provided content", segment.id)
except Exception:
logger.exception("Failed to update summary for segment %s", segment.id)
# Don't fail the entire update if summary update fails
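
Both SegmentService hunks follow the same best-effort pattern: attempt the summary refresh, log on failure, and never let it abort the segment update itself. Condensed into a sketch (apply_content_update is a hypothetical stand-in for the surrounding update logic):

    import logging

    logger = logging.getLogger(__name__)

    def update_segment(segment, dataset, new_content) -> None:
        apply_content_update(segment, new_content)  # Hypothetical helper; its failures do propagate.
        try:
            SummaryIndexService.generate_and_vectorize_summary(segment, dataset, dataset.summary_index_setting)
            logger.info("Auto-regenerated summary for segment %s after content change", segment.id)
        except Exception:
            # Best effort: a summary failure must not lose the content edit.
            logger.exception("Failed to auto-regenerate summary for segment %s", segment.id)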

View File

@@ -216,7 +216,7 @@ class SummaryIndexService:
db.session.add(summary_record)
db.session.flush()
# Success, exit function
return
except (ConnectionError, Exception) as e:
error_str = str(e).lower()
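
The truncated except block above lowercases the error text, which points at keyword-based classification of transient failures before retrying the flush. A sketch of that retry shape; the marker list, attempt count, and backoff are assumptions, not taken from this diff:

    import time

    TRANSIENT_MARKERS = ("connection", "timeout", "temporarily unavailable")  # Assumed keywords.

    def flush_with_retry(write_once, max_attempts: int = 3) -> None:
        """Call write_once(), retrying only errors whose text looks transient."""
        for attempt in range(1, max_attempts + 1):
            try:
                write_once()
                return  # Success, exit function.
            except Exception as e:
                error_str = str(e).lower()
                if not any(m in error_str for m in TRANSIENT_MARKERS) or attempt == max_attempts:
                    raise
                time.sleep(2**attempt)  # Simple exponential backoff.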
@@ -333,9 +333,7 @@ class SummaryIndexService:
error: Error message
"""
summary_record = (
-db.session.query(DocumentSegmentSummary)
-.filter_by(chunk_id=segment.id, dataset_id=dataset.id)
-.first()
+db.session.query(DocumentSegmentSummary).filter_by(chunk_id=segment.id, dataset_id=dataset.id).first()
)
if summary_record:
@@ -344,9 +342,7 @@ class SummaryIndexService:
db.session.add(summary_record)
db.session.flush()
else:
-logger.warning(
-    "Summary record not found for segment %s when updating error", segment.id
-)
+logger.warning("Summary record not found for segment %s when updating error", segment.id)
@staticmethod
def generate_and_vectorize_summary(
@@ -371,16 +367,12 @@ class SummaryIndexService:
"""
# Get existing summary record (should have been created by batch_create_summary_records)
summary_record = (
-db.session.query(DocumentSegmentSummary)
-.filter_by(chunk_id=segment.id, dataset_id=dataset.id)
-.first()
+db.session.query(DocumentSegmentSummary).filter_by(chunk_id=segment.id, dataset_id=dataset.id).first()
)
if not summary_record:
# If not found (shouldn't happen), create one
-logger.warning(
-    "Summary record not found for segment %s, creating one", segment.id
-)
+logger.warning("Summary record not found for segment %s, creating one", segment.id)
summary_record = SummaryIndexService.create_summary_record(
segment, dataset, summary_content="", status="generating"
)