mirror of
https://github.com/langgenius/dify.git
synced 2026-05-03 00:48:04 +08:00
fix re-chunk document
This commit is contained in:
@@ -24,6 +24,7 @@ class SystemVariableKey(StrEnum):
|
||||
WORKFLOW_EXECUTION_ID = "workflow_run_id"
|
||||
# RAG Pipeline
|
||||
DOCUMENT_ID = "document_id"
|
||||
ORIGINAL_DOCUMENT_ID = "original_document_id"
|
||||
BATCH = "batch"
|
||||
DATASET_ID = "dataset_id"
|
||||
DATASOURCE_TYPE = "datasource_type"
|
||||
|
||||
@@ -4,7 +4,7 @@ import time
|
||||
from collections.abc import Mapping
|
||||
from typing import Any, Optional, cast
|
||||
|
||||
from sqlalchemy import func
|
||||
from sqlalchemy import func, select
|
||||
|
||||
from core.app.entities.app_invoke_entities import InvokeFrom
|
||||
from core.rag.index_processor.index_processor_factory import IndexProcessorFactory
|
||||
@@ -128,6 +128,8 @@ class KnowledgeIndexNode(Node):
|
||||
document_id = variable_pool.get(["sys", SystemVariableKey.DOCUMENT_ID])
|
||||
if not document_id:
|
||||
raise KnowledgeIndexNodeError("Document ID is required.")
|
||||
original_document_id = variable_pool.get(["sys", SystemVariableKey.ORIGINAL_DOCUMENT_ID])
|
||||
|
||||
batch = variable_pool.get(["sys", SystemVariableKey.BATCH])
|
||||
if not batch:
|
||||
raise KnowledgeIndexNodeError("Batch is required.")
|
||||
@@ -137,6 +139,19 @@ class KnowledgeIndexNode(Node):
|
||||
# chunk nodes by chunk size
|
||||
indexing_start_at = time.perf_counter()
|
||||
index_processor = IndexProcessorFactory(dataset.chunk_structure).init_index_processor()
|
||||
if original_document_id:
|
||||
segments = db.session.scalars(
|
||||
select(DocumentSegment).where(DocumentSegment.document_id == document_id)
|
||||
).all()
|
||||
if segments:
|
||||
index_node_ids = [segment.index_node_id for segment in segments]
|
||||
|
||||
# delete from vector index
|
||||
index_processor.clean(dataset, index_node_ids, with_keywords=True, delete_child_chunks=True)
|
||||
|
||||
for segment in segments:
|
||||
db.session.delete(segment)
|
||||
db.session.commit()
|
||||
index_processor.index(dataset, document, chunks)
|
||||
indexing_end_at = time.perf_counter()
|
||||
document.indexing_latency = indexing_end_at - indexing_start_at
|
||||
|
||||
@@ -44,6 +44,7 @@ class SystemVariable(BaseModel):
|
||||
conversation_id: str | None = None
|
||||
dialogue_count: int | None = None
|
||||
document_id: str | None = None
|
||||
original_document_id: str | None = None
|
||||
dataset_id: str | None = None
|
||||
batch: str | None = None
|
||||
datasource_type: str | None = None
|
||||
@@ -94,6 +95,8 @@ class SystemVariable(BaseModel):
|
||||
d[SystemVariableKey.DIALOGUE_COUNT] = self.dialogue_count
|
||||
if self.document_id is not None:
|
||||
d[SystemVariableKey.DOCUMENT_ID] = self.document_id
|
||||
if self.original_document_id is not None:
|
||||
d[SystemVariableKey.ORIGINAL_DOCUMENT_ID] = self.original_document_id
|
||||
if self.dataset_id is not None:
|
||||
d[SystemVariableKey.DATASET_ID] = self.dataset_id
|
||||
if self.batch is not None:
|
||||
|
||||
Reference in New Issue
Block a user