feat: implement Summary Index feature.

This commit is contained in:
FFXN
2026-01-12 16:52:21 +08:00
parent f4a7efde3d
commit 25bfc1cc3b
36 changed files with 2290 additions and 32 deletions

View File

@ -117,6 +117,18 @@ def add_document_to_index_task(dataset_document_id: str):
)
db.session.commit()
# Enable summary indexes for all segments in this document
from services.summary_index_service import SummaryIndexService
segment_ids_list = [segment.id for segment in segments]
if segment_ids_list:
try:
SummaryIndexService.enable_summaries_for_segments(
dataset=dataset,
segment_ids=segment_ids_list,
)
except Exception as e:
logger.warning(f"Failed to enable summaries for document {dataset_document.id}: {str(e)}")
end_at = time.perf_counter()
logger.info(
click.style(f"Document added to index: {dataset_document.id} latency: {end_at - start_at}", fg="green")

View File

@ -42,6 +42,7 @@ def delete_segment_from_index_task(
doc_form = dataset_document.doc_form
# Proceed with index cleanup using the index_node_ids directly
# For actual deletion, we should delete summaries (not just disable them)
index_processor = IndexProcessorFactory(doc_form).init_index_processor()
index_processor.clean(
dataset,
@ -49,6 +50,7 @@ def delete_segment_from_index_task(
with_keywords=True,
delete_child_chunks=True,
precomputed_child_node_ids=child_node_ids,
delete_summaries=True, # Actually delete summaries when segment is deleted
)
if dataset.is_multimodal:
# delete segment attachment binding

View File

@ -53,6 +53,17 @@ def disable_segment_from_index_task(segment_id: str):
logger.info(click.style(f"Segment {segment.id} document status is invalid, pass.", fg="cyan"))
return
# Disable summary index for this segment
from services.summary_index_service import SummaryIndexService
try:
SummaryIndexService.disable_summaries_for_segments(
dataset=dataset,
segment_ids=[segment.id],
disabled_by=segment.disabled_by,
)
except Exception as e:
logger.warning(f"Failed to disable summary for segment {segment.id}: {str(e)}")
index_type = dataset_document.doc_form
index_processor = IndexProcessorFactory(index_type).init_index_processor()
index_processor.clean(dataset, [segment.index_node_id])

View File

@ -58,12 +58,25 @@ def disable_segments_from_index_task(segment_ids: list, dataset_id: str, documen
return
try:
# Disable summary indexes for these segments
from services.summary_index_service import SummaryIndexService
segment_ids_list = [segment.id for segment in segments]
try:
# Get disabled_by from first segment (they should all have the same disabled_by)
disabled_by = segments[0].disabled_by if segments else None
SummaryIndexService.disable_summaries_for_segments(
dataset=dataset,
segment_ids=segment_ids_list,
disabled_by=disabled_by,
)
except Exception as e:
logger.warning(f"Failed to disable summaries for segments: {str(e)}")
index_node_ids = [segment.index_node_id for segment in segments]
if dataset.is_multimodal:
segment_ids = [segment.id for segment in segments]
segment_attachment_bindings = (
db.session.query(SegmentAttachmentBinding)
.where(SegmentAttachmentBinding.segment_id.in_(segment_ids))
.where(SegmentAttachmentBinding.segment_id.in_(segment_ids_list))
.all()
)
if segment_attachment_bindings:

View File

@ -8,6 +8,7 @@ from celery import shared_task
from configs import dify_config
from core.entities.document_task import DocumentTask
from core.indexing_runner import DocumentIsPausedError, IndexingRunner
from tasks.generate_summary_index_task import generate_summary_index_task
from core.rag.pipeline.queue import TenantIsolatedTaskQueue
from enums.cloud_plan import CloudPlan
from extensions.ext_database import db
@ -100,6 +101,60 @@ def _document_indexing(dataset_id: str, document_ids: Sequence[str]):
indexing_runner.run(documents)
end_at = time.perf_counter()
logger.info(click.style(f"Processed dataset: {dataset_id} latency: {end_at - start_at}", fg="green"))
# Trigger summary index generation for completed documents if enabled
# Only generate for high_quality indexing technique and when summary_index_setting is enabled
# Re-query dataset to get latest summary_index_setting (in case it was updated)
dataset = db.session.query(Dataset).where(Dataset.id == dataset_id).first()
if not dataset:
logger.warning(f"Dataset {dataset_id} not found after indexing")
return
if dataset.indexing_technique == "high_quality":
summary_index_setting = dataset.summary_index_setting
if summary_index_setting and summary_index_setting.get("enable"):
# Check each document's indexing status and trigger summary generation if completed
for document_id in document_ids:
# Re-query document to get latest status (IndexingRunner may have updated it)
document = (
db.session.query(Document)
.where(Document.id == document_id, Document.dataset_id == dataset_id)
.first()
)
if document:
logger.info(
f"Checking document {document_id} for summary generation: "
f"status={document.indexing_status}, doc_form={document.doc_form}"
)
if document.indexing_status == "completed" and document.doc_form != "qa_model":
try:
generate_summary_index_task.delay(dataset.id, document_id, None)
logger.info(
f"Queued summary index generation task for document {document_id} "
f"in dataset {dataset.id} after indexing completed"
)
except Exception as e:
logger.exception(
f"Failed to queue summary index generation task for document {document_id}: {str(e)}"
)
# Don't fail the entire indexing process if summary task queuing fails
else:
logger.info(
f"Skipping summary generation for document {document_id}: "
f"status={document.indexing_status}, doc_form={document.doc_form}"
)
else:
logger.warning(f"Document {document_id} not found after indexing")
else:
logger.info(
f"Summary index generation skipped for dataset {dataset.id}: "
f"summary_index_setting.enable={summary_index_setting.get('enable') if summary_index_setting else None}"
)
else:
logger.info(
f"Summary index generation skipped for dataset {dataset.id}: "
f"indexing_technique={dataset.indexing_technique} (not 'high_quality')"
)
except DocumentIsPausedError as ex:
logger.info(click.style(str(ex), fg="yellow"))
except Exception:

View File

@ -103,6 +103,16 @@ def enable_segment_to_index_task(segment_id: str):
# save vector index
index_processor.load(dataset, [document], multimodal_documents=multimodel_documents)
# Enable summary index for this segment
from services.summary_index_service import SummaryIndexService
try:
SummaryIndexService.enable_summaries_for_segments(
dataset=dataset,
segment_ids=[segment.id],
)
except Exception as e:
logger.warning(f"Failed to enable summary for segment {segment.id}: {str(e)}")
end_at = time.perf_counter()
logger.info(click.style(f"Segment enabled to index: {segment.id} latency: {end_at - start_at}", fg="green"))
except Exception as e:

View File

@ -108,6 +108,17 @@ def enable_segments_to_index_task(segment_ids: list, dataset_id: str, document_i
# save vector index
index_processor.load(dataset, documents, multimodal_documents=multimodal_documents)
# Enable summary indexes for these segments
from services.summary_index_service import SummaryIndexService
segment_ids_list = [segment.id for segment in segments]
try:
SummaryIndexService.enable_summaries_for_segments(
dataset=dataset,
segment_ids=segment_ids_list,
)
except Exception as e:
logger.warning(f"Failed to enable summaries for segments: {str(e)}")
end_at = time.perf_counter()
logger.info(click.style(f"Segments enabled to index latency: {end_at - start_at}", fg="green"))
except Exception as e:

View File

@ -0,0 +1,113 @@
"""Async task for generating summary indexes."""
import logging
import time
import click
from celery import shared_task
from extensions.ext_database import db
from models.dataset import Dataset, DocumentSegment
from models.dataset import Document as DatasetDocument
from services.summary_index_service import SummaryIndexService
logger = logging.getLogger(__name__)
@shared_task(queue="dataset")
def generate_summary_index_task(dataset_id: str, document_id: str, segment_ids: list[str] | None = None):
    """
    Async generate summary index for document segments.

    Summaries are only generated for datasets using the "high_quality"
    indexing technique whose summary_index_setting is enabled; otherwise the
    task exits early with an informational log.

    Args:
        dataset_id: Dataset ID
        document_id: Document ID
        segment_ids: Optional list of specific segment IDs to process. If None, process all segments.

    Usage:
        generate_summary_index_task.delay(dataset_id, document_id)
        generate_summary_index_task.delay(dataset_id, document_id, segment_ids)
    """
    logger.info(
        click.style(
            f"Start generating summary index for document {document_id} in dataset {dataset_id}",
            fg="green",
        )
    )
    start_at = time.perf_counter()
    try:
        dataset = db.session.query(Dataset).where(Dataset.id == dataset_id).first()
        if not dataset:
            logger.error(click.style(f"Dataset not found: {dataset_id}", fg="red"))
            return
        document = db.session.query(DatasetDocument).where(DatasetDocument.id == document_id).first()
        if not document:
            logger.error(click.style(f"Document not found: {document_id}", fg="red"))
            return
        # Only generate summary index for high_quality indexing technique
        if dataset.indexing_technique != "high_quality":
            logger.info(
                click.style(
                    f"Skipping summary generation for dataset {dataset_id}: "
                    f"indexing_technique is {dataset.indexing_technique}, not 'high_quality'",
                    fg="cyan",
                )
            )
            return
        # Check if summary index is enabled
        summary_index_setting = dataset.summary_index_setting
        if not summary_index_setting or not summary_index_setting.get("enable"):
            logger.info(
                click.style(
                    f"Summary index is disabled for dataset {dataset_id}",
                    fg="cyan",
                )
            )
            return
        # For parent/child structured datasets only the parent chunks get summaries
        only_parent_chunks = dataset.chunk_structure == "parent_child_index"
        # Generate summaries (and their vectors) for the selected segments
        summary_records = SummaryIndexService.generate_summaries_for_document(
            dataset=dataset,
            document=document,
            summary_index_setting=summary_index_setting,
            segment_ids=segment_ids,
            only_parent_chunks=only_parent_chunks,
        )
        end_at = time.perf_counter()
        logger.info(
            click.style(
                f"Summary index generation completed for document {document_id}: "
                f"{len(summary_records)} summaries generated, latency: {end_at - start_at}",
                fg="green",
            )
        )
    except Exception as e:
        logger.exception(f"Failed to generate summary index for document {document_id}: {str(e)}")
        # Best-effort: record the failure on the affected segments. Guarded so a
        # DB error here cannot escape the handler and skip session cleanup.
        if segment_ids:
            try:
                db.session.query(DocumentSegment).filter(
                    DocumentSegment.id.in_(segment_ids),
                    DocumentSegment.dataset_id == dataset_id,
                ).update(
                    {
                        DocumentSegment.error: f"Summary generation failed: {str(e)}",
                    },
                    synchronize_session=False,
                )
                db.session.commit()
            except Exception:
                db.session.rollback()
                logger.exception(f"Failed to record summary generation error for document {document_id}")
    finally:
        # Single close point; the early returns above rely on this as well.
        db.session.close()

View File

@ -0,0 +1,219 @@
"""Task for regenerating summary indexes when dataset settings change."""
import logging
import time
from typing import Any
import click
from celery import shared_task
from sqlalchemy import select
from extensions.ext_database import db
from models.dataset import Dataset, DocumentSegment, DocumentSegmentSummary
from models.dataset import Document as DatasetDocument
from services.summary_index_service import SummaryIndexService
logger = logging.getLogger(__name__)
@shared_task(queue="dataset")
def regenerate_summary_index_task(
    dataset_id: str,
    regenerate_reason: str = "summary_model_changed",
    regenerate_vectors_only: bool = False,
):
    """
    Regenerate summary indexes for all documents in a dataset.

    This task is triggered when:
    1. summary_index_setting model changes (regenerate_reason="summary_model_changed")
       - Regenerates summary content and vectors for all existing summaries
    2. embedding_model changes (regenerate_reason="embedding_model_changed")
       - Only regenerates vectors for existing summaries (keeps summary content)

    Args:
        dataset_id: Dataset ID
        regenerate_reason: Reason for regeneration ("summary_model_changed" or "embedding_model_changed")
        regenerate_vectors_only: If True, only regenerate vectors without regenerating summary content
    """
    logger.info(
        click.style(
            f"Start regenerate summary index for dataset {dataset_id}, reason: {regenerate_reason}",
            fg="green",
        )
    )
    start_at = time.perf_counter()
    try:
        dataset = db.session.query(Dataset).filter_by(id=dataset_id).first()
        if not dataset:
            logger.error(click.style(f"Dataset not found: {dataset_id}", fg="red"))
            return
        # Only regenerate summary index for high_quality indexing technique
        if dataset.indexing_technique != "high_quality":
            logger.info(
                click.style(
                    f"Skipping summary regeneration for dataset {dataset_id}: "
                    f"indexing_technique is {dataset.indexing_technique}, not 'high_quality'",
                    fg="cyan",
                )
            )
            return
        # Check if summary index is enabled
        summary_index_setting = dataset.summary_index_setting
        if not summary_index_setting or not summary_index_setting.get("enable"):
            logger.info(
                click.style(
                    f"Summary index is disabled for dataset {dataset_id}",
                    fg="cyan",
                )
            )
            return
        # Get all documents with completed indexing status
        dataset_documents = db.session.scalars(
            select(DatasetDocument).where(
                DatasetDocument.dataset_id == dataset_id,
                DatasetDocument.indexing_status == "completed",
                DatasetDocument.enabled == True,
                DatasetDocument.archived == False,
            )
        ).all()
        if not dataset_documents:
            logger.info(
                click.style(
                    f"No documents found for summary regeneration in dataset {dataset_id}",
                    fg="cyan",
                )
            )
            return
        logger.info(
            f"Found {len(dataset_documents)} documents for summary regeneration in dataset {dataset_id}"
        )
        total_segments_processed = 0
        total_segments_failed = 0
        for dataset_document in dataset_documents:
            # Skip qa_model documents
            if dataset_document.doc_form == "qa_model":
                continue
            try:
                # Get all segments with existing summaries
                segments = (
                    db.session.query(DocumentSegment)
                    .join(
                        DocumentSegmentSummary,
                        DocumentSegment.id == DocumentSegmentSummary.chunk_id,
                    )
                    .where(
                        DocumentSegment.document_id == dataset_document.id,
                        DocumentSegment.dataset_id == dataset_id,
                        DocumentSegment.status == "completed",
                        DocumentSegment.enabled == True,
                        DocumentSegmentSummary.dataset_id == dataset_id,
                    )
                    .order_by(DocumentSegment.position.asc())
                    .all()
                )
                if not segments:
                    continue
                logger.info(
                    f"Regenerating summaries for {len(segments)} segments in document {dataset_document.id}"
                )
                for segment in segments:
                    # Reset per iteration: the except handler below reads this, and
                    # without the reset it would be unbound on a first-iteration
                    # failure (NameError) or hold the PREVIOUS segment's record on
                    # later failures, marking the wrong summary as errored.
                    summary_record = None
                    try:
                        # Get existing summary record
                        summary_record = (
                            db.session.query(DocumentSegmentSummary)
                            .filter_by(
                                chunk_id=segment.id,
                                dataset_id=dataset_id,
                            )
                            .first()
                        )
                        if not summary_record:
                            logger.warning(
                                f"Summary record not found for segment {segment.id}, skipping"
                            )
                            continue
                        if regenerate_vectors_only:
                            # Only regenerate vectors (for embedding_model change)
                            # Delete old vector
                            if summary_record.summary_index_node_id:
                                try:
                                    from core.rag.datasource.vdb.vector_factory import Vector

                                    vector = Vector(dataset)
                                    vector.delete_by_ids([summary_record.summary_index_node_id])
                                except Exception as e:
                                    logger.warning(
                                        f"Failed to delete old summary vector for segment {segment.id}: {str(e)}"
                                    )
                            # Re-vectorize with new embedding model
                            SummaryIndexService.vectorize_summary(
                                summary_record, segment, dataset
                            )
                            db.session.commit()
                        else:
                            # Regenerate both summary content and vectors (for summary_model change)
                            SummaryIndexService.generate_and_vectorize_summary(
                                segment, dataset, summary_index_setting
                            )
                            db.session.commit()
                        total_segments_processed += 1
                    except Exception as e:
                        logger.error(
                            f"Failed to regenerate summary for segment {segment.id}: {str(e)}",
                            exc_info=True,
                        )
                        total_segments_failed += 1
                        # Update summary record with error status (only the record
                        # belonging to THIS segment, thanks to the reset above)
                        if summary_record:
                            summary_record.status = "error"
                            summary_record.error = f"Regeneration failed: {str(e)}"
                            db.session.add(summary_record)
                            db.session.commit()
                        continue
            except Exception as e:
                logger.error(
                    f"Failed to process document {dataset_document.id} for summary regeneration: {str(e)}",
                    exc_info=True,
                )
                continue
        end_at = time.perf_counter()
        logger.info(
            click.style(
                f"Summary index regeneration completed for dataset {dataset_id}: "
                f"{total_segments_processed} segments processed successfully, "
                f"{total_segments_failed} segments failed, "
                f"total documents: {len(dataset_documents)}, "
                f"latency: {end_at - start_at:.2f}s",
                fg="green",
            )
        )
    except Exception:
        logger.exception(f"Regenerate summary index failed for dataset {dataset_id}")
    finally:
        # Single close point; early returns above rely on this as well.
        db.session.close()

View File

@ -47,6 +47,20 @@ def remove_document_from_index_task(document_id: str):
index_processor = IndexProcessorFactory(document.doc_form).init_index_processor()
segments = db.session.scalars(select(DocumentSegment).where(DocumentSegment.document_id == document.id)).all()
# Disable summary indexes for all segments in this document
from services.summary_index_service import SummaryIndexService
segment_ids_list = [segment.id for segment in segments]
if segment_ids_list:
try:
SummaryIndexService.disable_summaries_for_segments(
dataset=dataset,
segment_ids=segment_ids_list,
disabled_by=document.disabled_by,
)
except Exception as e:
logger.warning(f"Failed to disable summaries for document {document.id}: {str(e)}")
index_node_ids = [segment.index_node_id for segment in segments]
if index_node_ids:
try: