feat: Make summary index support vision, and make the code more standardized.

2026-05-05 18:08:07 +08:00 · 2026-01-14 17:52:27 +08:00
parent 9b7e807690
commit 7eb65b07c8
23 changed files with 569 additions and 307 deletions
--- a/api/core/indexing_runner.py
+++ b/api/core/indexing_runner.py
@ -367,8 +367,8 @@ class IndexingRunner:
            return IndexingEstimate(total_segments=total_segments * 20, qa_preview=qa_preview_texts, preview=[])

        # Generate summary preview
-        summary_index_setting = tmp_processing_rule["summary_index_setting"] if "summary_index_setting" in tmp_processing_rule else None
-        if summary_index_setting and summary_index_setting.get('enable') and preview_texts:
+        summary_index_setting = tmp_processing_rule.get("summary_index_setting")
+        if summary_index_setting and summary_index_setting.get("enable") and preview_texts:
            preview_texts = index_processor.generate_summary_preview(tenant_id, preview_texts, summary_index_setting)

        return IndexingEstimate(total_segments=total_segments, preview=preview_texts)
--- a/api/core/llm_generator/prompts.py
+++ b/api/core/llm_generator/prompts.py
@ -436,4 +436,6 @@ You should edit the prompt according to the IDEAL OUTPUT."""
 INSTRUCTION_GENERATE_TEMPLATE_CODE = """Please fix the errors in the {{#error_message#}}."""

 DEFAULT_GENERATOR_SUMMARY_PROMPT = """
-You are a helpful assistant that summarizes long pieces of text into concise summaries. Given the following text, generate a brief summary that captures the main points and key information. The summary should be clear, concise, and written in complete sentences. """
+You are a helpful assistant that summarizes long pieces of text into concise summaries. 
+Given the following text, generate a brief summary that captures the main points and key information. 
+The summary should be clear, concise, and written in complete sentences. """
--- a/api/core/rag/datasource/retrieval_service.py
+++ b/api/core/rag/datasource/retrieval_service.py
@ -395,7 +395,7 @@ class RetrievalService:
            index_node_ids = []
            doc_to_document_map = {}
            summary_segment_ids = set()  # Track segments retrieved via summary
-            
+
            # First pass: collect all document IDs and identify summary documents
            for document in documents:
                document_id = document.metadata.get("document_id")
@ -455,7 +455,7 @@ class RetrievalService:
                        doc_segment_map[attachment["segment_id"]].append(attachment["attachment_id"])
                    else:
                        doc_segment_map[attachment["segment_id"]] = [attachment["attachment_id"]]
-                        
+
                child_chunk_stmt = select(ChildChunk).where(ChildChunk.index_node_id.in_(child_index_node_ids))
                child_index_nodes = session.execute(child_chunk_stmt).scalars().all()

@ -479,7 +479,7 @@ class RetrievalService:
                    index_node_segments = session.execute(document_segment_stmt).scalars().all()  # type: ignore
                    for index_node_segment in index_node_segments:
                        doc_segment_map[index_node_segment.id] = [index_node_segment.index_node_id]
-                        
+
                if segment_ids:
                    document_segment_stmt = select(DocumentSegment).where(
                        DocumentSegment.enabled == True,
--- a/api/core/rag/index_processor/index_processor_base.py
+++ b/api/core/rag/index_processor/index_processor_base.py
@ -47,7 +47,9 @@ class BaseIndexProcessor(ABC):
        raise NotImplementedError

    @abstractmethod
-    def generate_summary_preview(self, tenant_id: str, preview_texts: list[PreviewDetail], summary_index_setting: dict) -> list[PreviewDetail]:
+    def generate_summary_preview(
+        self, tenant_id: str, preview_texts: list[PreviewDetail], summary_index_setting: dict
+    ) -> list[PreviewDetail]:
        """
        For each segment in preview_texts, generate a summary using LLM and attach it to the segment.
        The summary can be stored in a new attribute, e.g., summary.
--- a/api/core/rag/index_processor/processor/paragraph_index_processor.py
+++ b/api/core/rag/index_processor/processor/paragraph_index_processor.py
@ -1,6 +1,7 @@
 """Paragraph index processor."""

 import logging
+import re
 import uuid
 from collections.abc import Mapping
 from typing import Any
@ -8,6 +9,17 @@ from typing import Any
 logger = logging.getLogger(__name__)

 from core.entities.knowledge_entities import PreviewDetail
+from core.file import File, FileTransferMethod, FileType, file_manager
+from core.llm_generator.prompts import DEFAULT_GENERATOR_SUMMARY_PROMPT
+from core.model_manager import ModelInstance
+from core.model_runtime.entities.message_entities import (
+    ImagePromptMessageContent,
+    PromptMessageContentUnionTypes,
+    TextPromptMessageContent,
+    UserPromptMessage,
+)
+from core.model_runtime.entities.model_entities import ModelFeature, ModelType
+from core.provider_manager import ProviderManager
 from core.rag.cleaner.clean_processor import CleanProcessor
 from core.rag.datasource.keyword.keyword_factory import Keyword
 from core.rag.datasource.retrieval_service import RetrievalService
@ -22,18 +34,15 @@ from core.rag.models.document import AttachmentDocument, Document, MultimodalGen
 from core.rag.retrieval.retrieval_methods import RetrievalMethod
 from core.tools.utils.text_processing_utils import remove_leading_symbols
 from extensions.ext_database import db
+from factories.file_factory import build_from_mapping
 from libs import helper
+from models import UploadFile
 from models.account import Account
-from models.dataset import Dataset, DatasetProcessRule, DocumentSegment
+from models.dataset import Dataset, DatasetProcessRule, DocumentSegment, SegmentAttachmentBinding
 from models.dataset import Document as DatasetDocument
 from services.account_service import AccountService
 from services.entities.knowledge_entities.knowledge_entities import Rule
 from services.summary_index_service import SummaryIndexService
-from core.llm_generator.prompts import DEFAULT_GENERATOR_SUMMARY_PROMPT
-from core.model_runtime.entities.message_entities import UserPromptMessage
-from core.model_runtime.entities.model_entities import ModelType
-from core.provider_manager import ProviderManager
-from core.model_manager import ModelInstance


 class ParagraphIndexProcessor(BaseIndexProcessor):
@ -262,12 +271,15 @@ class ParagraphIndexProcessor(BaseIndexProcessor):
        else:
            raise ValueError("Chunks is not a list")

-    def generate_summary_preview(self, tenant_id: str, preview_texts: list[PreviewDetail], summary_index_setting: dict) -> list[PreviewDetail]:
+    def generate_summary_preview(
+        self, tenant_id: str, preview_texts: list[PreviewDetail], summary_index_setting: dict
+    ) -> list[PreviewDetail]:
        """
        For each segment, concurrently call generate_summary to generate a summary
        and write it to the summary attribute of PreviewDetail.
        """
        import concurrent.futures
+
        from flask import current_app

        # Capture Flask app context for worker threads
@ -289,8 +301,8 @@ class ParagraphIndexProcessor(BaseIndexProcessor):
                    # Fallback: try without app context (may fail)
                    summary = self.generate_summary(tenant_id, preview.content, summary_index_setting)
                    preview.summary = summary
-            except Exception as e:
-                logger.error(f"Failed to generate summary for preview: {str(e)}")
+            except Exception:
+                logger.exception("Failed to generate summary for preview")
                # Don't fail the entire preview if summary generation fails
                preview.summary = None

@ -299,9 +311,21 @@ class ParagraphIndexProcessor(BaseIndexProcessor):
        return preview_texts

    @staticmethod
-    def generate_summary(tenant_id: str, text: str, summary_index_setting: dict = None) -> str:
+    def generate_summary(
+        tenant_id: str,
+        text: str,
+        summary_index_setting: dict | None = None,
+        segment_id: str | None = None,
+    ) -> str:
        """
-        Generate summary for the given text using ModelInstance.invoke_llm and the default or custom summary prompt.
+        Generate summary for the given text using ModelInstance.invoke_llm and the default or custom summary prompt,
+        and supports vision models by including images from the segment attachments or text content.
+
+        Args:
+            tenant_id: Tenant ID
+            text: Text content to summarize
+            summary_index_setting: Summary index configuration
+            segment_id: Optional segment ID to fetch attachments from SegmentAttachmentBinding table
        """
        if not summary_index_setting or not summary_index_setting.get("enable"):
            raise ValueError("summary_index_setting is required and must be enabled to generate summary.")
@ -314,17 +338,195 @@ class ParagraphIndexProcessor(BaseIndexProcessor):
        if not summary_prompt:
            summary_prompt = DEFAULT_GENERATOR_SUMMARY_PROMPT

-        prompt = f"{summary_prompt}\n{text}"
-
        provider_manager = ProviderManager()
-        provider_model_bundle = provider_manager.get_provider_model_bundle(tenant_id, model_provider_name, ModelType.LLM)
-        model_instance = ModelInstance(provider_model_bundle, model_name)
-        prompt_messages = [UserPromptMessage(content=prompt)]
-
-        result = model_instance.invoke_llm(
-            prompt_messages=prompt_messages,
-            model_parameters={},
-            stream=False
+        provider_model_bundle = provider_manager.get_provider_model_bundle(
+            tenant_id, model_provider_name, ModelType.LLM
        )
+        model_instance = ModelInstance(provider_model_bundle, model_name)
+
+        # Get model schema to check if vision is supported
+        model_schema = model_instance.get_model_schema(model_name, provider_model_bundle.credentials)
+        supports_vision = model_schema and model_schema.features and ModelFeature.VISION in model_schema.features
+
+        # Extract images if model supports vision
+        image_files = []
+        if supports_vision:
+            # First, try to get images from SegmentAttachmentBinding (preferred method)
+            if segment_id:
+                image_files = ParagraphIndexProcessor._extract_images_from_segment_attachments(tenant_id, segment_id)
+
+            # If no images from attachments, fall back to extracting from text
+            if not image_files:
+                image_files = ParagraphIndexProcessor._extract_images_from_text(tenant_id, text)
+
+        # Build prompt messages
+        prompt_messages = []
+
+        if image_files:
+            # If we have images, create a UserPromptMessage with both text and images
+            prompt_message_contents: list[PromptMessageContentUnionTypes] = []
+
+            # Add images first
+            for file in image_files:
+                try:
+                    file_content = file_manager.to_prompt_message_content(
+                        file, image_detail_config=ImagePromptMessageContent.DETAIL.LOW
+                    )
+                    prompt_message_contents.append(file_content)
+                except Exception as e:
+                    logger.warning("Failed to convert image file to prompt message content: %s", str(e))
+                    continue
+
+            # Add text content
+            if prompt_message_contents:  # Only add text if we successfully added images
+                prompt_message_contents.append(TextPromptMessageContent(data=f"{summary_prompt}\n{text}"))
+                prompt_messages.append(UserPromptMessage(content=prompt_message_contents))
+            else:
+                # If image conversion failed, fall back to text-only
+                prompt = f"{summary_prompt}\n{text}"
+                prompt_messages.append(UserPromptMessage(content=prompt))
+        else:
+            # No images, use simple text prompt
+            prompt = f"{summary_prompt}\n{text}"
+            prompt_messages.append(UserPromptMessage(content=prompt))
+
+        result = model_instance.invoke_llm(prompt_messages=prompt_messages, model_parameters={}, stream=False)

        return getattr(result.message, "content", "")
+
+    @staticmethod
+    def _extract_images_from_text(tenant_id: str, text: str) -> list[File]:
+        """
+        Extract images from markdown text and convert them to File objects.
+
+        Only links to uploaded files are resolved: "/files/<id>/image-preview"
+        (data before v0.10.0) and "/files/<id>/file-preview" (data after v0.10.0).
+        Any other target, including "/files/tools/<id>.<ext>" links, is skipped for now.
+
+        Args:
+            tenant_id: Tenant ID; only upload files of this tenant are returned
+            text: Text content that may contain markdown image links
+
+        Returns:
+            List of File objects representing images found in the text, ordered by
+            first appearance in the text. Uploads that are missing, are not images,
+            or cannot be converted to a File are left out.
+        """
+        # Extract markdown image targets, i.e. the "(...)" part of "![alt](...)"
+        image_links = re.findall(r"!\[.*?\]\((.*?)\)", text)
+
+        if not image_links:
+            return []
+
+        # One pattern covers both preview routes ("image-preview" and "file-preview");
+        # it is compiled once instead of being rebuilt for every link
+        preview_pattern = re.compile(r"/files/([a-f0-9\-]+)/(?:image|file)-preview")
+
+        # A dict keeps the IDs unique and remembers the order they appear in the text
+        upload_file_ids: dict[str, None] = {}
+        for link in image_links:
+            match = preview_pattern.search(link)
+            if match:
+                upload_file_ids.setdefault(match.group(1), None)
+
+        if not upload_file_ids:
+            return []
+
+        # Filter by tenant so links to another tenant's uploads resolve to nothing
+        upload_files = (
+            db.session.query(UploadFile)
+            .where(UploadFile.id.in_(list(upload_file_ids)), UploadFile.tenant_id == tenant_id)
+            .all()
+        )
+
+        # The query result has no guaranteed order, so index it by ID and walk the
+        # IDs in text order; this keeps the images sent to the model deterministic
+        upload_files_by_id = {str(upload_file.id): upload_file for upload_file in upload_files}
+
+        # Create File objects from UploadFile records
+        file_objects = []
+        for upload_file_id in upload_file_ids:
+            upload_file = upload_files_by_id.get(upload_file_id)
+            if upload_file is None:
+                # No such upload for this tenant
+                continue
+
+            # Only process image files
+            if not upload_file.mime_type or "image" not in upload_file.mime_type:
+                continue
+
+            mapping = {
+                "upload_file_id": upload_file.id,
+                "transfer_method": FileTransferMethod.LOCAL_FILE.value,
+                "type": FileType.IMAGE.value,
+            }
+
+            try:
+                file_obj = build_from_mapping(
+                    mapping=mapping,
+                    tenant_id=tenant_id,
+                )
+            except Exception as e:
+                # Best effort: one unusable upload must not abort summary generation
+                logger.warning("Failed to create File object from UploadFile %s: %s", upload_file.id, str(e))
+                continue
+
+            file_objects.append(file_obj)
+
+        return file_objects
+
+    @staticmethod
+    def _extract_images_from_segment_attachments(tenant_id: str, segment_id: str) -> list[File]:
+        """
+        Load the image attachments bound to a segment as File objects (preferred method).
+
+        Attachments are read from the SegmentAttachmentBinding table joined with UploadFile.
+        This matches how DatasetRetrieval gets segment attachments.
+
+        Args:
+            tenant_id: Tenant ID the bindings must belong to
+            segment_id: Segment ID to fetch attachments for
+
+        Returns:
+            List of File objects representing images found in segment attachments;
+            non-image attachments and attachments that fail conversion are left out
+        """
+        from sqlalchemy import select
+
+        binding_query = (
+            select(SegmentAttachmentBinding, UploadFile)
+            .join(UploadFile, UploadFile.id == SegmentAttachmentBinding.attachment_id)
+            .where(
+                SegmentAttachmentBinding.segment_id == segment_id,
+                SegmentAttachmentBinding.tenant_id == tenant_id,
+            )
+        )
+        rows = db.session.execute(binding_query).all()
+
+        images: list[File] = []
+        for _binding, attachment in rows:
+            mime_type = attachment.mime_type
+            # Skip attachments without a MIME type and anything that is not an image
+            if not (mime_type and "image" in mime_type):
+                continue
+
+            try:
+                # Build the File straight from the record (similar to DatasetRetrieval)
+                image = File(
+                    id=attachment.id,
+                    filename=attachment.name,
+                    extension="." + attachment.extension,
+                    mime_type=mime_type,
+                    tenant_id=tenant_id,
+                    type=FileType.IMAGE,
+                    transfer_method=FileTransferMethod.LOCAL_FILE,
+                    remote_url=attachment.source_url,
+                    related_id=attachment.id,
+                    size=attachment.size,
+                    storage_key=attachment.key,
+                )
+                images.append(image)
+            except Exception as e:
+                logger.warning("Failed to create File object from UploadFile %s: %s", attachment.id, str(e))
+
+        return images
--- a/api/core/workflow/nodes/document_extractor/node.py
+++ b/api/core/workflow/nodes/document_extractor/node.py
@ -65,14 +65,14 @@ class DocumentExtractorNode(Node[DocumentExtractorNodeData]):
        # Ensure storage_key is loaded for File objects
        files_to_check = value if isinstance(value, list) else [value]
        files_needing_storage_key = [
-            f for f in files_to_check
-            if isinstance(f, File) and not f.storage_key and f.related_id
+            f for f in files_to_check if isinstance(f, File) and not f.storage_key and f.related_id
        ]
        if files_needing_storage_key:
-            from factories.file_factory import StorageKeyLoader
-            from extensions.ext_database import db
            from sqlalchemy.orm import Session
-            
+
+            from extensions.ext_database import db
+            from factories.file_factory import StorageKeyLoader
+
            with Session(bind=db.engine) as session:
                storage_key_loader = StorageKeyLoader(session, tenant_id=self.tenant_id)
                storage_key_loader.load_storage_keys(files_needing_storage_key)
@ -433,12 +433,13 @@ def _download_file_content(file: File) -> bytes:
            # Check if storage_key is set
            if not file.storage_key:
                raise FileDownloadError(f"File storage_key is missing for file: {file.filename}")
-            
+
            # Check if file exists before downloading
            from extensions.ext_storage import storage
+
            if not storage.exists(file.storage_key):
                raise FileDownloadError(f"File not found in storage: {file.storage_key}")
-            
+
            return file_manager.download(file)
    except Exception as e:
        raise FileDownloadError(f"Error downloading file: {str(e)}") from e
--- a/api/core/workflow/nodes/knowledge_index/knowledge_index_node.py
+++ b/api/core/workflow/nodes/knowledge_index/knowledge_index_node.py
@ -77,11 +77,13 @@ class KnowledgeIndexNode(Node[KnowledgeIndexNodeData]):
                # or fallback to dataset if not available in node_data
                indexing_technique = node_data.indexing_technique or dataset.indexing_technique
                summary_index_setting = node_data.summary_index_setting or dataset.summary_index_setting
-                
+
                outputs = self._get_preview_output_with_summaries(
-                    node_data.chunk_structure, chunks, dataset=dataset,
+                    node_data.chunk_structure,
+                    chunks,
+                    dataset=dataset,
                    indexing_technique=indexing_technique,
-                    summary_index_setting=summary_index_setting
+                    summary_index_setting=summary_index_setting,
                )
                return NodeRunResult(
                    status=WorkflowNodeExecutionStatus.SUCCEEDED,
@ -237,7 +239,7 @@ class KnowledgeIndexNode(Node[KnowledgeIndexNodeData]):
                segments = query.all()

                if not segments:
-                    logger.info(f"No segments found for document {document.id}")
+                    logger.info("No segments found for document %s", document.id)
                    return

                # Filter segments based on mode
@ -256,7 +258,7 @@ class KnowledgeIndexNode(Node[KnowledgeIndexNodeData]):
                    segments_to_process.append(segment)

                if not segments_to_process:
-                    logger.info(f"No segments need summary generation for document {document.id}")
+                    logger.info("No segments need summary generation for document %s", document.id)
                    return

                # Use ThreadPoolExecutor for concurrent generation
@ -267,46 +269,55 @@ class KnowledgeIndexNode(Node[KnowledgeIndexNodeData]):
                    """Process a single segment in a thread with Flask app context."""
                    with flask_app.app_context():
                        try:
-                            SummaryIndexService.generate_and_vectorize_summary(
-                                segment, dataset, summary_index_setting
+                            SummaryIndexService.generate_and_vectorize_summary(segment, dataset, summary_index_setting)
+                        except Exception:
+                            logger.exception(
+                                "Failed to generate summary for segment %s",
+                                segment.id,
                            )
-                        except Exception as e:
-                            logger.error(f"Failed to generate summary for segment {segment.id}: {str(e)}")
                            # Continue processing other segments

                with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
-                    futures = [
-                        executor.submit(process_segment, segment) for segment in segments_to_process
-                    ]
+                    futures = [executor.submit(process_segment, segment) for segment in segments_to_process]
                    # Wait for all tasks to complete
                    concurrent.futures.wait(futures)

                logger.info(
-                    f"Successfully generated summary index for {len(segments_to_process)} segments "
-                    f"in document {document.id}"
+                    "Successfully generated summary index for %s segments in document %s",
+                    len(segments_to_process),
+                    document.id,
                )
-            except Exception as e:
-                logger.exception(f"Failed to generate summary index for document {document.id}: {str(e)}")
+            except Exception:
+                logger.exception("Failed to generate summary index for document %s", document.id)
                # Don't fail the entire indexing process if summary generation fails
        else:
            # Production mode: asynchronous generation
-            logger.info(f"Queuing summary index generation task for document {document.id} (production mode)")
+            logger.info(
+                "Queuing summary index generation task for document %s (production mode)",
+                document.id,
+            )
            try:
                generate_summary_index_task.delay(dataset.id, document.id, None)
-                logger.info(f"Summary index generation task queued for document {document.id}")
-            except Exception as e:
-                logger.exception(f"Failed to queue summary index generation task for document {document.id}: {str(e)}")
+                logger.info("Summary index generation task queued for document %s", document.id)
+            except Exception:
+                logger.exception(
+                    "Failed to queue summary index generation task for document %s",
+                    document.id,
+                )
                # Don't fail the entire indexing process if task queuing fails

    def _get_preview_output_with_summaries(
-        self, chunk_structure: str, chunks: Any, dataset: Dataset,
+        self,
+        chunk_structure: str,
+        chunks: Any,
+        dataset: Dataset,
        indexing_technique: str | None = None,
-        summary_index_setting: dict | None = None
+        summary_index_setting: dict | None = None,
    ) -> Mapping[str, Any]:
        """
        Generate preview output with summaries for chunks in preview mode.
        This method generates summaries on-the-fly without saving to database.
-        
+
        Args:
            chunk_structure: Chunk structure type
            chunks: Chunks to generate preview for
@ -316,31 +327,32 @@ class KnowledgeIndexNode(Node[KnowledgeIndexNodeData]):
        """
        index_processor = IndexProcessorFactory(chunk_structure).init_index_processor()
        preview_output = index_processor.format_preview(chunks)
-        
+
        # Check if summary index is enabled
        if indexing_technique != "high_quality":
            return preview_output
-        
+
        if not summary_index_setting or not summary_index_setting.get("enable"):
            return preview_output
-        
+
        # Generate summaries for chunks
        if "preview" in preview_output and isinstance(preview_output["preview"], list):
            chunk_count = len(preview_output["preview"])
            logger.info(
-                f"Generating summaries for {chunk_count} chunks in preview mode "
-                f"(dataset: {dataset.id})"
+                "Generating summaries for %s chunks in preview mode (dataset: %s)",
+                chunk_count,
+                dataset.id,
            )
            # Use ParagraphIndexProcessor's generate_summary method
            from core.rag.index_processor.processor.paragraph_index_processor import ParagraphIndexProcessor
-            
+
            # Get Flask app for application context in worker threads
            flask_app = None
            try:
                flask_app = current_app._get_current_object()  # type: ignore
            except RuntimeError:
                logger.warning("No Flask application context available, summary generation may fail")
-            
+
            def generate_summary_for_chunk(preview_item: dict) -> None:
                """Generate summary for a single chunk."""
                if "content" in preview_item:
@ -364,10 +376,10 @@ class KnowledgeIndexNode(Node[KnowledgeIndexNodeData]):
                            )
                            if summary:
                                preview_item["summary"] = summary
-                    except Exception as e:
-                        logger.error(f"Failed to generate summary for chunk: {str(e)}")
+                    except Exception:
+                        logger.exception("Failed to generate summary for chunk")
                        # Don't fail the entire preview if summary generation fails
-            
+
            # Generate summaries concurrently using ThreadPoolExecutor
            # Set a reasonable timeout to prevent hanging (60 seconds per chunk, max 5 minutes total)
            timeout_seconds = min(300, 60 * len(preview_output["preview"]))
@ -378,31 +390,39 @@ class KnowledgeIndexNode(Node[KnowledgeIndexNodeData]):
                ]
                # Wait for all tasks to complete with timeout
                done, not_done = concurrent.futures.wait(futures, timeout=timeout_seconds)
-                
+
                # Cancel tasks that didn't complete in time
                if not_done:
                    logger.warning(
-                        f"Summary generation timeout: {len(not_done)} chunks did not complete within {timeout_seconds}s. "
-                        "Cancelling remaining tasks..."
+                        "Summary generation timeout: %s chunks did not complete within %ss. "
+                        "Cancelling remaining tasks...",
+                        len(not_done),
+                        timeout_seconds,
                    )
                    for future in not_done:
                        future.cancel()
                    # Wait a bit for cancellation to take effect
                    concurrent.futures.wait(not_done, timeout=5)
-            
+
            completed_count = sum(1 for item in preview_output["preview"] if item.get("summary") is not None)
            logger.info(
-                f"Completed summary generation for preview chunks: {completed_count}/{len(preview_output['preview'])} succeeded"
+                "Completed summary generation for preview chunks: %s/%s succeeded",
+                completed_count,
+                len(preview_output["preview"]),
            )
-        
+
        return preview_output

    def _get_preview_output(
-        self, chunk_structure: str, chunks: Any, dataset: Dataset | None = None, variable_pool: VariablePool | None = None
+        self,
+        chunk_structure: str,
+        chunks: Any,
+        dataset: Dataset | None = None,
+        variable_pool: VariablePool | None = None,
    ) -> Mapping[str, Any]:
        index_processor = IndexProcessorFactory(chunk_structure).init_index_processor()
        preview_output = index_processor.format_preview(chunks)
-        
+
        # If dataset is provided, try to enrich preview with summaries
        if dataset and variable_pool:
            document_id = variable_pool.get(["sys", SystemVariableKey.DOCUMENT_ID])
@ -420,7 +440,7 @@ class KnowledgeIndexNode(Node[KnowledgeIndexNodeData]):
                        )
                        .all()
                    )
-                    
+
                    if summaries:
                        # Create a map of segment content to summary for matching
                        # Use content matching as chunks in preview might not be indexed yet
@ -435,7 +455,7 @@ class KnowledgeIndexNode(Node[KnowledgeIndexNodeData]):
                                # Normalize content for matching (strip whitespace)
                                normalized_content = segment.content.strip()
                                summary_by_content[normalized_content] = summary.summary_content
-                        
+
                        # Enrich preview with summaries by content matching
                        if "preview" in preview_output and isinstance(preview_output["preview"], list):
                            matched_count = 0
@ -446,13 +466,15 @@ class KnowledgeIndexNode(Node[KnowledgeIndexNodeData]):
                                    if normalized_chunk_content in summary_by_content:
                                        preview_item["summary"] = summary_by_content[normalized_chunk_content]
                                        matched_count += 1
-                            
+
                            if matched_count > 0:
                                logger.info(
-                                    f"Enriched preview with {matched_count} existing summaries "
-                                    f"(dataset: {dataset.id}, document: {document.id})"
+                                    "Enriched preview with %s existing summaries (dataset: %s, document: %s)",
+                                    matched_count,
+                                    dataset.id,
+                                    document.id,
                                )
-        
+
        return preview_output

    @classmethod