feat(enterprise): Add OTEL telemetry with slim traces, metrics, and structured logs

- Add EnterpriseOtelTrace handler with span emission for workflows and nodes
- Implement minimal-span strategy: slim spans + detailed companion logs
- Add deterministic span/trace IDs for cross-workflow trace correlation
- Add metric collection at 100% accuracy (counters & histograms)
- Add event handlers for app lifecycle and feedback telemetry
- Add cross-workflow trace linking with parent context propagation
- Add OTEL exporter with configurable sampling and privacy controls
- Wire enterprise telemetry into workflow execution pipeline
- Add telemetry configuration in enterprise configs
This commit is contained in:
GareArc
2026-02-02 15:09:44 -08:00
parent d8402f686e
commit 3461c3a8ef
27 changed files with 1812 additions and 41 deletions

View File

@ -373,6 +373,7 @@ class WorkflowPersistenceLayer(GraphEngineLayer):
self._workflow_node_execution_repository.save(domain_execution)
self._workflow_node_execution_repository.save_execution_data(domain_execution)
self._enqueue_node_trace_task(domain_execution)
def _fail_running_node_executions(self, *, error_message: str) -> None:
now = naive_utc_now()
@ -390,8 +391,10 @@ class WorkflowPersistenceLayer(GraphEngineLayer):
conversation_id = self._system_variables().get(SystemVariableKey.CONVERSATION_ID.value)
external_trace_id = None
parent_trace_context = None
if isinstance(self._application_generate_entity, (WorkflowAppGenerateEntity, AdvancedChatAppGenerateEntity)):
external_trace_id = self._application_generate_entity.extras.get("external_trace_id")
parent_trace_context = self._application_generate_entity.extras.get("parent_trace_context")
trace_task = TraceTask(
TraceTaskName.WORKFLOW_TRACE,
@ -399,6 +402,99 @@ class WorkflowPersistenceLayer(GraphEngineLayer):
conversation_id=conversation_id,
user_id=self._trace_manager.user_id,
external_trace_id=external_trace_id,
parent_trace_context=parent_trace_context,
)
self._trace_manager.add_trace_task(trace_task)
def _enqueue_node_trace_task(self, domain_execution: WorkflowNodeExecution) -> None:
if not self._trace_manager:
return
from enterprise.telemetry.exporter import is_enterprise_telemetry_enabled
if not is_enterprise_telemetry_enabled():
return
execution = self._get_workflow_execution()
meta = domain_execution.metadata or {}
parent_trace_context = None
if isinstance(self._application_generate_entity, (WorkflowAppGenerateEntity, AdvancedChatAppGenerateEntity)):
parent_trace_context = self._application_generate_entity.extras.get("parent_trace_context")
node_data: dict[str, Any] = {
"workflow_id": domain_execution.workflow_id,
"workflow_execution_id": execution.id_,
"tenant_id": self._application_generate_entity.app_config.tenant_id,
"app_id": self._application_generate_entity.app_config.app_id,
"node_execution_id": domain_execution.id,
"node_id": domain_execution.node_id,
"node_type": str(domain_execution.node_type.value),
"title": domain_execution.title,
"status": str(domain_execution.status.value),
"error": domain_execution.error,
"elapsed_time": domain_execution.elapsed_time,
"index": domain_execution.index,
"predecessor_node_id": domain_execution.predecessor_node_id,
"created_at": domain_execution.created_at,
"finished_at": domain_execution.finished_at,
"total_tokens": meta.get(WorkflowNodeExecutionMetadataKey.TOTAL_TOKENS, 0),
"total_price": meta.get(WorkflowNodeExecutionMetadataKey.TOTAL_PRICE, 0.0),
"currency": meta.get(WorkflowNodeExecutionMetadataKey.CURRENCY),
"tool_name": (meta.get(WorkflowNodeExecutionMetadataKey.TOOL_INFO) or {}).get("tool_name")
if isinstance(meta.get(WorkflowNodeExecutionMetadataKey.TOOL_INFO), dict)
else None,
"iteration_id": meta.get(WorkflowNodeExecutionMetadataKey.ITERATION_ID),
"iteration_index": meta.get(WorkflowNodeExecutionMetadataKey.ITERATION_INDEX),
"loop_id": meta.get(WorkflowNodeExecutionMetadataKey.LOOP_ID),
"loop_index": meta.get(WorkflowNodeExecutionMetadataKey.LOOP_INDEX),
"parallel_id": meta.get(WorkflowNodeExecutionMetadataKey.PARALLEL_ID),
"node_inputs": dict(domain_execution.inputs) if domain_execution.inputs else None,
"node_outputs": dict(domain_execution.outputs) if domain_execution.outputs else None,
"process_data": dict(domain_execution.process_data) if domain_execution.process_data else None,
}
node_data["invoke_from"] = self._application_generate_entity.invoke_from.value
node_data["user_id"] = self._system_variables().get(SystemVariableKey.USER_ID.value)
if domain_execution.node_type.value == "knowledge-retrieval" and domain_execution.outputs:
results = domain_execution.outputs.get("result") or []
dataset_ids: list[str] = []
dataset_names: list[str] = []
for doc in results:
if not isinstance(doc, dict):
continue
doc_meta = doc.get("metadata") or {}
did = doc_meta.get("dataset_id")
dname = doc_meta.get("dataset_name")
if did and did not in dataset_ids:
dataset_ids.append(did)
if dname and dname not in dataset_names:
dataset_names.append(dname)
if dataset_ids:
node_data["dataset_ids"] = dataset_ids
if dataset_names:
node_data["dataset_names"] = dataset_names
tool_info = meta.get(WorkflowNodeExecutionMetadataKey.TOOL_INFO)
if isinstance(tool_info, dict):
plugin_id = tool_info.get("plugin_unique_identifier")
if plugin_id:
node_data["plugin_name"] = plugin_id
credential_id = tool_info.get("credential_id")
if credential_id:
node_data["credential_id"] = credential_id
node_data["credential_provider_type"] = tool_info.get("provider_type")
conversation_id = self._system_variables().get(SystemVariableKey.CONVERSATION_ID.value)
if conversation_id:
node_data["conversation_id"] = conversation_id
if parent_trace_context:
node_data["parent_trace_context"] = parent_trace_context
trace_task = TraceTask(
TraceTaskName.NODE_EXECUTION_TRACE,
node_execution_data=node_data,
)
self._trace_manager.add_trace_task(trace_task)