mirror of
https://github.com/langgenius/dify.git
synced 2026-03-04 15:26:21 +08:00
153 lines
6.0 KiB
Python
153 lines
6.0 KiB
Python
import logging
|
|
from collections.abc import Mapping
|
|
from typing import TYPE_CHECKING, Any
|
|
|
|
from core.app.entities.app_invoke_entities import InvokeFrom
|
|
from dify_graph.entities.workflow_node_execution import WorkflowNodeExecutionStatus
|
|
from dify_graph.enums import NodeExecutionType, NodeType, SystemVariableKey
|
|
from dify_graph.node_events import NodeRunResult
|
|
from dify_graph.nodes.base.node import Node
|
|
from dify_graph.nodes.base.template import Template
|
|
from dify_graph.repositories.index_processor_protocol import IndexProcessorProtocol
|
|
from dify_graph.repositories.summary_index_service_protocol import SummaryIndexServiceProtocol
|
|
|
|
from .entities import KnowledgeIndexNodeData
|
|
from .exc import (
|
|
KnowledgeIndexNodeError,
|
|
)
|
|
|
|
if TYPE_CHECKING:
|
|
from dify_graph.entities import GraphInitParams
|
|
from dify_graph.runtime import GraphRuntimeState
|
|
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class KnowledgeIndexNode(Node[KnowledgeIndexNodeData]):
    """Workflow node that hands chunk data to an index processor.

    When the run is invoked from the debugger the node only asks the index
    processor for a preview output. Otherwise it calls the index processor's
    ``index_and_clean`` and then the summary index service's
    ``generate_and_vectorize_summary``.
    """

    node_type = NodeType.KNOWLEDGE_INDEX
    execution_type = NodeExecutionType.RESPONSE

    def __init__(
        self,
        id: str,
        config: Mapping[str, Any],
        graph_init_params: "GraphInitParams",
        graph_runtime_state: "GraphRuntimeState",
        index_processor: IndexProcessorProtocol,
        summary_index_service: SummaryIndexServiceProtocol,
    ) -> None:
        """Initialise the base node and store the two injected collaborators.

        Args:
            id: Forwarded unchanged to the base ``Node`` constructor.
            config: Forwarded unchanged to the base ``Node`` constructor.
            graph_init_params: Forwarded unchanged to the base constructor.
            graph_runtime_state: Forwarded unchanged to the base constructor.
            index_processor: Object used for preview output and indexing.
            summary_index_service: Object used to generate summaries after
                indexing.
        """
        super().__init__(id, config, graph_init_params, graph_runtime_state)
        self.index_processor = index_processor
        self.summary_index_service = summary_index_service

    def _run(self) -> NodeRunResult:  # type: ignore
        """Run the node against the current variable pool.

        Returns:
            A ``NodeRunResult`` with status ``SUCCEEDED`` carrying either the
            preview output or the indexing result, or with status ``FAILED``
            when the chunks are empty or any step inside the guarded section
            raises.

        Raises:
            KnowledgeIndexNodeError: If the dataset ID or the index chunk
                variable cannot be found in the variable pool. These checks
                run before the guarded section, so they propagate to the
                caller instead of being turned into a failed result.
        """
        data = self.node_data
        pool = self.graph_runtime_state.variable_pool

        # Dataset ID is mandatory.
        dataset_segment = pool.get(["sys", SystemVariableKey.DATASET_ID])
        if not dataset_segment:
            raise KnowledgeIndexNodeError("Dataset ID is required.")
        dataset_id: str = dataset_segment.value

        # Document ID is optional here and defaults to an empty string.
        document_segment = pool.get(["sys", SystemVariableKey.DOCUMENT_ID])
        document_id: str = ""
        if document_segment:
            document_id = document_segment.value

        # The variable that carries the chunks to index.
        chunk_variable = pool.get(data.index_chunk_variable_selector)
        if not chunk_variable:
            raise KnowledgeIndexNodeError("Index chunk variable is required.")

        # A run invoked from the debugger is treated as a preview.
        invoke_from_segment = pool.get(["sys", SystemVariableKey.INVOKE_FROM])
        is_preview = False
        if invoke_from_segment:
            is_preview = invoke_from_segment.value == InvokeFrom.DEBUGGER

        chunks = chunk_variable.value
        inputs = {"chunks": chunks}
        if not chunks:
            return NodeRunResult(
                status=WorkflowNodeExecutionStatus.FAILED,
                inputs=inputs,
                error="Chunks is required.",
            )

        try:
            summary_setting = data.summary_index_setting

            if is_preview:
                # Preview mode (per the original author's note): summaries are
                # generated on the fly for the chunks and nothing is saved to
                # the database. The summary setting comes from node_data.
                preview = self.index_processor.get_preview_output(
                    chunks, dataset_id, document_id, data.chunk_structure, summary_setting
                )
                return NodeRunResult(
                    status=WorkflowNodeExecutionStatus.SUCCEEDED,
                    inputs=inputs,
                    outputs=preview.model_dump(exclude_none=True),
                )

            original_segment = pool.get(["sys", SystemVariableKey.ORIGINAL_DOCUMENT_ID])
            batch_segment = pool.get(["sys", SystemVariableKey.BATCH])
            if not batch_segment:
                raise KnowledgeIndexNodeError("Batch is required.")

            # The original document ID is optional and defaults to "".
            original_document_id = ""
            if original_segment:
                original_document_id = original_segment.value

            index_outputs = self._invoke_knowledge_index(
                dataset_id=dataset_id,
                document_id=document_id,
                original_document_id=original_document_id,
                is_preview=is_preview,
                batch=batch_segment.value,
                chunks=chunks,
                summary_index_setting=summary_setting,
            )
            return NodeRunResult(
                status=WorkflowNodeExecutionStatus.SUCCEEDED,
                inputs=inputs,
                outputs=index_outputs,
            )
        except Exception as exc:
            # Node-specific errors are logged as warnings; anything else is
            # logged as an error. Both become the same kind of failed result.
            if isinstance(exc, KnowledgeIndexNodeError):
                logger.warning("Error when running knowledge index node", exc_info=True)
            else:
                logger.error(exc, exc_info=True)
            return NodeRunResult(
                status=WorkflowNodeExecutionStatus.FAILED,
                inputs=inputs,
                error=str(exc),
                error_type=type(exc).__name__,
            )

    def _invoke_knowledge_index(
        self,
        dataset_id: str,
        document_id: str,
        original_document_id: str,
        is_preview: bool,
        batch: Any,
        chunks: Mapping[str, Any],
        summary_index_setting: dict | None = None,
    ):
        """Index the chunks, then trigger summary generation.

        Args:
            dataset_id: Passed to both collaborators.
            document_id: Passed to both collaborators; must be non-empty.
            original_document_id: Passed to ``index_and_clean``.
            is_preview: Passed to ``generate_and_vectorize_summary``.
            batch: Passed to ``index_and_clean``.
            chunks: Chunk data passed to ``index_and_clean``.
            summary_index_setting: Optional setting passed to both
                collaborators.

        Returns:
            Whatever ``index_processor.index_and_clean`` returns.

        Raises:
            KnowledgeIndexNodeError: If ``document_id`` is empty.
        """
        if not document_id:
            raise KnowledgeIndexNodeError("document_id is required.")

        index_result = self.index_processor.index_and_clean(
            dataset_id, document_id, original_document_id, chunks, batch, summary_index_setting
        )
        # Summary generation runs after indexing; its return value is unused.
        self.summary_index_service.generate_and_vectorize_summary(
            dataset_id, document_id, is_preview, summary_index_setting
        )
        return index_result

    @classmethod
    def version(cls) -> str:
        """Return the version string of this node implementation."""
        return "1"

    def get_streaming_template(self) -> Template:
        """
        Get the template for streaming.

        Returns:
            A ``Template`` built with an empty list of segments.
        """
        return Template(segments=[])
|