Fix document re-chunking

This commit is contained in:
jyong
2025-09-16 16:05:01 +08:00
parent 610f0414db
commit 05aec66424
7 changed files with 50 additions and 52 deletions

View File

@ -24,6 +24,7 @@ class SystemVariableKey(StrEnum):
WORKFLOW_EXECUTION_ID = "workflow_run_id"
# RAG Pipeline
DOCUMENT_ID = "document_id"
ORIGINAL_DOCUMENT_ID = "original_document_id"
BATCH = "batch"
DATASET_ID = "dataset_id"
DATASOURCE_TYPE = "datasource_type"

View File

@ -4,7 +4,7 @@ import time
from collections.abc import Mapping
from typing import Any, Optional, cast
from sqlalchemy import func
from sqlalchemy import func, select
from core.app.entities.app_invoke_entities import InvokeFrom
from core.rag.index_processor.index_processor_factory import IndexProcessorFactory
@ -128,6 +128,8 @@ class KnowledgeIndexNode(Node):
document_id = variable_pool.get(["sys", SystemVariableKey.DOCUMENT_ID])
if not document_id:
raise KnowledgeIndexNodeError("Document ID is required.")
original_document_id = variable_pool.get(["sys", SystemVariableKey.ORIGINAL_DOCUMENT_ID])
batch = variable_pool.get(["sys", SystemVariableKey.BATCH])
if not batch:
raise KnowledgeIndexNodeError("Batch is required.")
@ -137,6 +139,19 @@ class KnowledgeIndexNode(Node):
# chunk nodes by chunk size
indexing_start_at = time.perf_counter()
index_processor = IndexProcessorFactory(dataset.chunk_structure).init_index_processor()
if original_document_id:
segments = db.session.scalars(
select(DocumentSegment).where(DocumentSegment.document_id == document_id)
).all()
if segments:
index_node_ids = [segment.index_node_id for segment in segments]
# delete from vector index
index_processor.clean(dataset, index_node_ids, with_keywords=True, delete_child_chunks=True)
for segment in segments:
db.session.delete(segment)
db.session.commit()
index_processor.index(dataset, document, chunks)
indexing_end_at = time.perf_counter()
document.indexing_latency = indexing_end_at - indexing_start_at

View File

@ -44,6 +44,7 @@ class SystemVariable(BaseModel):
conversation_id: str | None = None
dialogue_count: int | None = None
document_id: str | None = None
original_document_id: str | None = None
dataset_id: str | None = None
batch: str | None = None
datasource_type: str | None = None
@ -94,6 +95,8 @@ class SystemVariable(BaseModel):
d[SystemVariableKey.DIALOGUE_COUNT] = self.dialogue_count
if self.document_id is not None:
d[SystemVariableKey.DOCUMENT_ID] = self.document_id
if self.original_document_id is not None:
d[SystemVariableKey.ORIGINAL_DOCUMENT_ID] = self.original_document_id
if self.dataset_id is not None:
d[SystemVariableKey.DATASET_ID] = self.dataset_id
if self.batch is not None: