mirror of
https://github.com/langgenius/dify.git
synced 2026-01-19 11:45:05 +08:00
add service layer OTel Span (#28582)
Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
This commit is contained in:
@ -1,148 +1,22 @@
|
||||
import atexit
|
||||
import contextlib
|
||||
import logging
|
||||
import os
|
||||
import platform
|
||||
import socket
|
||||
import sys
|
||||
from typing import Union
|
||||
|
||||
import flask
|
||||
from celery.signals import worker_init
|
||||
from flask_login import user_loaded_from_request, user_logged_in
|
||||
|
||||
from configs import dify_config
|
||||
from dify_app import DifyApp
|
||||
from libs.helper import extract_tenant_id
|
||||
from models import Account, EndUser
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@user_logged_in.connect
|
||||
@user_loaded_from_request.connect
|
||||
def on_user_loaded(_sender, user: Union["Account", "EndUser"]):
|
||||
if dify_config.ENABLE_OTEL:
|
||||
from opentelemetry.trace import get_current_span
|
||||
|
||||
if user:
|
||||
try:
|
||||
current_span = get_current_span()
|
||||
tenant_id = extract_tenant_id(user)
|
||||
if not tenant_id:
|
||||
return
|
||||
if current_span:
|
||||
current_span.set_attribute("service.tenant.id", tenant_id)
|
||||
current_span.set_attribute("service.user.id", user.id)
|
||||
except Exception:
|
||||
logger.exception("Error setting tenant and user attributes")
|
||||
pass
|
||||
|
||||
|
||||
def init_app(app: DifyApp):
|
||||
from opentelemetry.semconv.trace import SpanAttributes
|
||||
|
||||
def is_celery_worker():
|
||||
return "celery" in sys.argv[0].lower()
|
||||
|
||||
def instrument_exception_logging():
|
||||
exception_handler = ExceptionLoggingHandler()
|
||||
logging.getLogger().addHandler(exception_handler)
|
||||
|
||||
def init_flask_instrumentor(app: DifyApp):
|
||||
meter = get_meter("http_metrics", version=dify_config.project.version)
|
||||
_http_response_counter = meter.create_counter(
|
||||
"http.server.response.count",
|
||||
description="Total number of HTTP responses by status code, method and target",
|
||||
unit="{response}",
|
||||
)
|
||||
|
||||
def response_hook(span: Span, status: str, response_headers: list):
|
||||
if span and span.is_recording():
|
||||
try:
|
||||
if status.startswith("2"):
|
||||
span.set_status(StatusCode.OK)
|
||||
else:
|
||||
span.set_status(StatusCode.ERROR, status)
|
||||
|
||||
status = status.split(" ")[0]
|
||||
status_code = int(status)
|
||||
status_class = f"{status_code // 100}xx"
|
||||
attributes: dict[str, str | int] = {"status_code": status_code, "status_class": status_class}
|
||||
request = flask.request
|
||||
if request and request.url_rule:
|
||||
attributes[SpanAttributes.HTTP_TARGET] = str(request.url_rule.rule)
|
||||
if request and request.method:
|
||||
attributes[SpanAttributes.HTTP_METHOD] = str(request.method)
|
||||
_http_response_counter.add(1, attributes)
|
||||
except Exception:
|
||||
logger.exception("Error setting status and attributes")
|
||||
pass
|
||||
|
||||
instrumentor = FlaskInstrumentor()
|
||||
if dify_config.DEBUG:
|
||||
logger.info("Initializing Flask instrumentor")
|
||||
instrumentor.instrument_app(app, response_hook=response_hook)
|
||||
|
||||
def init_sqlalchemy_instrumentor(app: DifyApp):
|
||||
with app.app_context():
|
||||
engines = list(app.extensions["sqlalchemy"].engines.values())
|
||||
SQLAlchemyInstrumentor().instrument(enable_commenter=True, engines=engines)
|
||||
|
||||
def setup_context_propagation():
|
||||
# Configure propagators
|
||||
set_global_textmap(
|
||||
CompositePropagator(
|
||||
[
|
||||
TraceContextTextMapPropagator(), # W3C trace context
|
||||
B3Format(), # B3 propagation (used by many systems)
|
||||
]
|
||||
)
|
||||
)
|
||||
|
||||
def shutdown_tracer():
|
||||
provider = trace.get_tracer_provider()
|
||||
if hasattr(provider, "force_flush"):
|
||||
provider.force_flush()
|
||||
|
||||
class ExceptionLoggingHandler(logging.Handler):
|
||||
"""Custom logging handler that creates spans for logging.exception() calls"""
|
||||
|
||||
def emit(self, record: logging.LogRecord):
|
||||
with contextlib.suppress(Exception):
|
||||
if record.exc_info:
|
||||
tracer = get_tracer_provider().get_tracer("dify.exception.logging")
|
||||
with tracer.start_as_current_span(
|
||||
"log.exception",
|
||||
attributes={
|
||||
"log.level": record.levelname,
|
||||
"log.message": record.getMessage(),
|
||||
"log.logger": record.name,
|
||||
"log.file.path": record.pathname,
|
||||
"log.file.line": record.lineno,
|
||||
},
|
||||
) as span:
|
||||
span.set_status(StatusCode.ERROR)
|
||||
if record.exc_info[1]:
|
||||
span.record_exception(record.exc_info[1])
|
||||
span.set_attribute("exception.message", str(record.exc_info[1]))
|
||||
if record.exc_info[0]:
|
||||
span.set_attribute("exception.type", record.exc_info[0].__name__)
|
||||
|
||||
from opentelemetry import trace
|
||||
from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter as GRPCMetricExporter
|
||||
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter as GRPCSpanExporter
|
||||
from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter as HTTPMetricExporter
|
||||
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter as HTTPSpanExporter
|
||||
from opentelemetry.instrumentation.celery import CeleryInstrumentor
|
||||
from opentelemetry.instrumentation.flask import FlaskInstrumentor
|
||||
from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
|
||||
from opentelemetry.instrumentation.redis import RedisInstrumentor
|
||||
from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor
|
||||
from opentelemetry.metrics import get_meter, get_meter_provider, set_meter_provider
|
||||
from opentelemetry.propagate import set_global_textmap
|
||||
from opentelemetry.propagators.b3 import B3Format
|
||||
from opentelemetry.propagators.composite import CompositePropagator
|
||||
from opentelemetry.metrics import set_meter_provider
|
||||
from opentelemetry.sdk.metrics import MeterProvider
|
||||
from opentelemetry.sdk.metrics.export import ConsoleMetricExporter, PeriodicExportingMetricReader
|
||||
from opentelemetry.sdk.resources import Resource
|
||||
@ -153,9 +27,10 @@ def init_app(app: DifyApp):
|
||||
)
|
||||
from opentelemetry.sdk.trace.sampling import ParentBasedTraceIdRatio
|
||||
from opentelemetry.semconv.resource import ResourceAttributes
|
||||
from opentelemetry.trace import Span, get_tracer_provider, set_tracer_provider
|
||||
from opentelemetry.trace.propagation.tracecontext import TraceContextTextMapPropagator
|
||||
from opentelemetry.trace.status import StatusCode
|
||||
from opentelemetry.trace import set_tracer_provider
|
||||
|
||||
from extensions.otel.instrumentation import init_instruments
|
||||
from extensions.otel.runtime import setup_context_propagation, shutdown_tracer
|
||||
|
||||
setup_context_propagation()
|
||||
# Initialize OpenTelemetry
|
||||
@ -177,6 +52,7 @@ def init_app(app: DifyApp):
|
||||
)
|
||||
sampler = ParentBasedTraceIdRatio(dify_config.OTEL_SAMPLING_RATE)
|
||||
provider = TracerProvider(resource=resource, sampler=sampler)
|
||||
|
||||
set_tracer_provider(provider)
|
||||
exporter: Union[GRPCSpanExporter, HTTPSpanExporter, ConsoleSpanExporter]
|
||||
metric_exporter: Union[GRPCMetricExporter, HTTPMetricExporter, ConsoleMetricExporter]
|
||||
@ -231,29 +107,11 @@ def init_app(app: DifyApp):
|
||||
export_timeout_millis=dify_config.OTEL_METRIC_EXPORT_TIMEOUT,
|
||||
)
|
||||
set_meter_provider(MeterProvider(resource=resource, metric_readers=[reader]))
|
||||
if not is_celery_worker():
|
||||
init_flask_instrumentor(app)
|
||||
CeleryInstrumentor(tracer_provider=get_tracer_provider(), meter_provider=get_meter_provider()).instrument()
|
||||
instrument_exception_logging()
|
||||
init_sqlalchemy_instrumentor(app)
|
||||
RedisInstrumentor().instrument()
|
||||
HTTPXClientInstrumentor().instrument()
|
||||
|
||||
init_instruments(app)
|
||||
|
||||
atexit.register(shutdown_tracer)
|
||||
|
||||
|
||||
def is_enabled():
|
||||
return dify_config.ENABLE_OTEL
|
||||
|
||||
|
||||
@worker_init.connect(weak=False)
|
||||
def init_celery_worker(*args, **kwargs):
|
||||
if dify_config.ENABLE_OTEL:
|
||||
from opentelemetry.instrumentation.celery import CeleryInstrumentor
|
||||
from opentelemetry.metrics import get_meter_provider
|
||||
from opentelemetry.trace import get_tracer_provider
|
||||
|
||||
tracer_provider = get_tracer_provider()
|
||||
metric_provider = get_meter_provider()
|
||||
if dify_config.DEBUG:
|
||||
logger.info("Initializing OpenTelemetry for Celery worker")
|
||||
CeleryInstrumentor(tracer_provider=tracer_provider, meter_provider=metric_provider).instrument()
|
||||
|
||||
11
api/extensions/otel/__init__.py
Normal file
11
api/extensions/otel/__init__.py
Normal file
@ -0,0 +1,11 @@
|
||||
from extensions.otel.decorators.base import trace_span
|
||||
from extensions.otel.decorators.handler import SpanHandler
|
||||
from extensions.otel.decorators.handlers.generate_handler import AppGenerateHandler
|
||||
from extensions.otel.decorators.handlers.workflow_app_runner_handler import WorkflowAppRunnerHandler
|
||||
|
||||
__all__ = [
|
||||
"AppGenerateHandler",
|
||||
"SpanHandler",
|
||||
"WorkflowAppRunnerHandler",
|
||||
"trace_span",
|
||||
]
|
||||
0
api/extensions/otel/decorators/__init__.py
Normal file
0
api/extensions/otel/decorators/__init__.py
Normal file
61
api/extensions/otel/decorators/base.py
Normal file
61
api/extensions/otel/decorators/base.py
Normal file
@ -0,0 +1,61 @@
|
||||
import functools
|
||||
import os
|
||||
from collections.abc import Callable
|
||||
from typing import Any, TypeVar, cast
|
||||
|
||||
from opentelemetry.trace import get_tracer
|
||||
|
||||
from configs import dify_config
|
||||
from extensions.otel.decorators.handler import SpanHandler
|
||||
|
||||
T = TypeVar("T", bound=Callable[..., Any])
|
||||
|
||||
_HANDLER_INSTANCES: dict[type[SpanHandler], SpanHandler] = {SpanHandler: SpanHandler()}
|
||||
|
||||
|
||||
def _is_instrument_flag_enabled() -> bool:
|
||||
"""
|
||||
Check if external instrumentation is enabled via environment variable.
|
||||
|
||||
Third-party non-invasive instrumentation agents set this flag to coordinate
|
||||
with Dify's manual OpenTelemetry instrumentation.
|
||||
"""
|
||||
return os.getenv("ENABLE_OTEL_FOR_INSTRUMENT", "").strip().lower() == "true"
|
||||
|
||||
|
||||
def _get_handler_instance(handler_class: type[SpanHandler]) -> SpanHandler:
|
||||
"""Get or create a singleton instance of the handler class."""
|
||||
if handler_class not in _HANDLER_INSTANCES:
|
||||
_HANDLER_INSTANCES[handler_class] = handler_class()
|
||||
return _HANDLER_INSTANCES[handler_class]
|
||||
|
||||
|
||||
def trace_span(handler_class: type[SpanHandler] | None = None) -> Callable[[T], T]:
|
||||
"""
|
||||
Decorator that traces a function with an OpenTelemetry span.
|
||||
|
||||
The decorator uses the provided handler class to create a singleton handler instance
|
||||
and delegates the wrapper implementation to that handler.
|
||||
|
||||
:param handler_class: Optional handler class to use for this span. If None, uses the default SpanHandler.
|
||||
"""
|
||||
|
||||
def decorator(func: T) -> T:
|
||||
@functools.wraps(func)
|
||||
def wrapper(*args: Any, **kwargs: Any) -> Any:
|
||||
if not (dify_config.ENABLE_OTEL or _is_instrument_flag_enabled()):
|
||||
return func(*args, **kwargs)
|
||||
|
||||
handler = _get_handler_instance(handler_class or SpanHandler)
|
||||
tracer = get_tracer(__name__)
|
||||
|
||||
return handler.wrapper(
|
||||
tracer=tracer,
|
||||
wrapped=func,
|
||||
args=args,
|
||||
kwargs=kwargs,
|
||||
)
|
||||
|
||||
return cast(T, wrapper)
|
||||
|
||||
return decorator
|
||||
95
api/extensions/otel/decorators/handler.py
Normal file
95
api/extensions/otel/decorators/handler.py
Normal file
@ -0,0 +1,95 @@
|
||||
import inspect
|
||||
from collections.abc import Callable, Mapping
|
||||
from typing import Any
|
||||
|
||||
from opentelemetry.trace import SpanKind, Status, StatusCode
|
||||
|
||||
|
||||
class SpanHandler:
|
||||
"""
|
||||
Base class for all span handlers.
|
||||
|
||||
Each instrumentation point provides a handler implementation that fully controls
|
||||
how spans are created, annotated, and finalized through the wrapper method.
|
||||
|
||||
This class provides a default implementation that creates a basic span and handles
|
||||
exceptions. Handlers can override the wrapper method to customize behavior.
|
||||
"""
|
||||
|
||||
_signature_cache: dict[Callable[..., Any], inspect.Signature] = {}
|
||||
|
||||
def _build_span_name(self, wrapped: Callable[..., Any]) -> str:
|
||||
"""
|
||||
Build the span name from the wrapped function.
|
||||
|
||||
Handlers can override this method to customize span name generation.
|
||||
|
||||
:param wrapped: The original function being traced
|
||||
:return: The span name
|
||||
"""
|
||||
return f"{wrapped.__module__}.{wrapped.__qualname__}"
|
||||
|
||||
def _extract_arguments(
|
||||
self,
|
||||
wrapped: Callable[..., Any],
|
||||
args: tuple[Any, ...],
|
||||
kwargs: Mapping[str, Any],
|
||||
) -> dict[str, Any] | None:
|
||||
"""
|
||||
Extract function arguments using inspect.signature.
|
||||
|
||||
Returns a dictionary of bound arguments, or None if extraction fails.
|
||||
Handlers can use this to safely extract parameters from args/kwargs.
|
||||
|
||||
The function signature is cached to improve performance on repeated calls.
|
||||
|
||||
:param wrapped: The function being traced
|
||||
:param args: Positional arguments
|
||||
:param kwargs: Keyword arguments
|
||||
:return: Dictionary of bound arguments, or None if extraction fails
|
||||
"""
|
||||
try:
|
||||
if wrapped not in self._signature_cache:
|
||||
self._signature_cache[wrapped] = inspect.signature(wrapped)
|
||||
|
||||
sig = self._signature_cache[wrapped]
|
||||
bound = sig.bind(*args, **kwargs)
|
||||
bound.apply_defaults()
|
||||
return bound.arguments
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
def wrapper(
|
||||
self,
|
||||
tracer: Any,
|
||||
wrapped: Callable[..., Any],
|
||||
args: tuple[Any, ...],
|
||||
kwargs: Mapping[str, Any],
|
||||
) -> Any:
|
||||
"""
|
||||
Fully control the wrapper behavior.
|
||||
|
||||
Default implementation creates a basic span and handles exceptions.
|
||||
Handlers can override this method to provide complete control over:
|
||||
- Span creation and configuration
|
||||
- Attribute extraction
|
||||
- Function invocation
|
||||
- Exception handling
|
||||
- Status setting
|
||||
|
||||
:param tracer: OpenTelemetry tracer instance
|
||||
:param wrapped: The original function being traced
|
||||
:param args: Positional arguments (including self/cls if applicable)
|
||||
:param kwargs: Keyword arguments
|
||||
:return: Result of calling wrapped function
|
||||
"""
|
||||
span_name = self._build_span_name(wrapped)
|
||||
with tracer.start_as_current_span(span_name, kind=SpanKind.INTERNAL) as span:
|
||||
try:
|
||||
result = wrapped(*args, **kwargs)
|
||||
span.set_status(Status(StatusCode.OK))
|
||||
return result
|
||||
except Exception as exc:
|
||||
span.record_exception(exc)
|
||||
span.set_status(Status(StatusCode.ERROR, str(exc)))
|
||||
raise
|
||||
1
api/extensions/otel/decorators/handlers/__init__.py
Normal file
1
api/extensions/otel/decorators/handlers/__init__.py
Normal file
@ -0,0 +1 @@
|
||||
|
||||
64
api/extensions/otel/decorators/handlers/generate_handler.py
Normal file
64
api/extensions/otel/decorators/handlers/generate_handler.py
Normal file
@ -0,0 +1,64 @@
|
||||
import logging
|
||||
from collections.abc import Callable, Mapping
|
||||
from typing import Any
|
||||
|
||||
from opentelemetry.trace import SpanKind, Status, StatusCode
|
||||
from opentelemetry.util.types import AttributeValue
|
||||
|
||||
from extensions.otel.decorators.handler import SpanHandler
|
||||
from extensions.otel.semconv import DifySpanAttributes, GenAIAttributes
|
||||
from models.model import Account
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class AppGenerateHandler(SpanHandler):
|
||||
"""Span handler for ``AppGenerateService.generate``."""
|
||||
|
||||
def wrapper(
|
||||
self,
|
||||
tracer: Any,
|
||||
wrapped: Callable[..., Any],
|
||||
args: tuple[Any, ...],
|
||||
kwargs: Mapping[str, Any],
|
||||
) -> Any:
|
||||
try:
|
||||
arguments = self._extract_arguments(wrapped, args, kwargs)
|
||||
if not arguments:
|
||||
return wrapped(*args, **kwargs)
|
||||
|
||||
app_model = arguments.get("app_model")
|
||||
user = arguments.get("user")
|
||||
args_dict = arguments.get("args", {})
|
||||
streaming = arguments.get("streaming", True)
|
||||
|
||||
if not app_model or not user or not isinstance(args_dict, dict):
|
||||
return wrapped(*args, **kwargs)
|
||||
app_id = getattr(app_model, "id", None) or "unknown"
|
||||
tenant_id = getattr(app_model, "tenant_id", None) or "unknown"
|
||||
user_id = getattr(user, "id", None) or "unknown"
|
||||
workflow_id = args_dict.get("workflow_id") or "unknown"
|
||||
|
||||
attributes: dict[str, AttributeValue] = {
|
||||
DifySpanAttributes.APP_ID: app_id,
|
||||
DifySpanAttributes.TENANT_ID: tenant_id,
|
||||
GenAIAttributes.USER_ID: user_id,
|
||||
DifySpanAttributes.USER_TYPE: "Account" if isinstance(user, Account) else "EndUser",
|
||||
DifySpanAttributes.STREAMING: streaming,
|
||||
DifySpanAttributes.WORKFLOW_ID: workflow_id,
|
||||
}
|
||||
|
||||
span_name = self._build_span_name(wrapped)
|
||||
except Exception as exc:
|
||||
logger.warning("Failed to prepare span attributes for AppGenerateService.generate: %s", exc, exc_info=True)
|
||||
return wrapped(*args, **kwargs)
|
||||
|
||||
with tracer.start_as_current_span(span_name, kind=SpanKind.INTERNAL, attributes=attributes) as span:
|
||||
try:
|
||||
result = wrapped(*args, **kwargs)
|
||||
span.set_status(Status(StatusCode.OK))
|
||||
return result
|
||||
except Exception as exc:
|
||||
span.record_exception(exc)
|
||||
span.set_status(Status(StatusCode.ERROR, str(exc)))
|
||||
raise
|
||||
@ -0,0 +1,65 @@
|
||||
import logging
|
||||
from collections.abc import Callable, Mapping
|
||||
from typing import Any
|
||||
|
||||
from opentelemetry.trace import SpanKind, Status, StatusCode
|
||||
from opentelemetry.util.types import AttributeValue
|
||||
|
||||
from extensions.otel.decorators.handler import SpanHandler
|
||||
from extensions.otel.semconv import DifySpanAttributes, GenAIAttributes
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class WorkflowAppRunnerHandler(SpanHandler):
|
||||
"""Span handler for ``WorkflowAppRunner.run``."""
|
||||
|
||||
def wrapper(
|
||||
self,
|
||||
tracer: Any,
|
||||
wrapped: Callable[..., Any],
|
||||
args: tuple[Any, ...],
|
||||
kwargs: Mapping[str, Any],
|
||||
) -> Any:
|
||||
try:
|
||||
arguments = self._extract_arguments(wrapped, args, kwargs)
|
||||
if not arguments:
|
||||
return wrapped(*args, **kwargs)
|
||||
|
||||
runner = arguments.get("self")
|
||||
if runner is None or not hasattr(runner, "application_generate_entity"):
|
||||
return wrapped(*args, **kwargs)
|
||||
|
||||
entity = runner.application_generate_entity
|
||||
app_config = getattr(entity, "app_config", None)
|
||||
if app_config is None:
|
||||
return wrapped(*args, **kwargs)
|
||||
|
||||
user_id: AttributeValue = getattr(entity, "user_id", None) or "unknown"
|
||||
app_id: AttributeValue = getattr(app_config, "app_id", None) or "unknown"
|
||||
tenant_id: AttributeValue = getattr(app_config, "tenant_id", None) or "unknown"
|
||||
workflow_id: AttributeValue = getattr(app_config, "workflow_id", None) or "unknown"
|
||||
streaming = getattr(entity, "stream", True)
|
||||
|
||||
attributes: dict[str, AttributeValue] = {
|
||||
DifySpanAttributes.APP_ID: app_id,
|
||||
DifySpanAttributes.TENANT_ID: tenant_id,
|
||||
GenAIAttributes.USER_ID: user_id,
|
||||
DifySpanAttributes.STREAMING: streaming,
|
||||
DifySpanAttributes.WORKFLOW_ID: workflow_id,
|
||||
}
|
||||
|
||||
span_name = self._build_span_name(wrapped)
|
||||
except Exception as exc:
|
||||
logger.warning("Failed to prepare span attributes for WorkflowAppRunner.run: %s", exc, exc_info=True)
|
||||
return wrapped(*args, **kwargs)
|
||||
|
||||
with tracer.start_as_current_span(span_name, kind=SpanKind.INTERNAL, attributes=attributes) as span:
|
||||
try:
|
||||
result = wrapped(*args, **kwargs)
|
||||
span.set_status(Status(StatusCode.OK))
|
||||
return result
|
||||
except Exception as exc:
|
||||
span.record_exception(exc)
|
||||
span.set_status(Status(StatusCode.ERROR, str(exc)))
|
||||
raise
|
||||
108
api/extensions/otel/instrumentation.py
Normal file
108
api/extensions/otel/instrumentation.py
Normal file
@ -0,0 +1,108 @@
|
||||
import contextlib
|
||||
import logging
|
||||
|
||||
import flask
|
||||
from opentelemetry.instrumentation.celery import CeleryInstrumentor
|
||||
from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
|
||||
from opentelemetry.instrumentation.redis import RedisInstrumentor
|
||||
from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor
|
||||
from opentelemetry.metrics import get_meter, get_meter_provider
|
||||
from opentelemetry.semconv.trace import SpanAttributes
|
||||
from opentelemetry.trace import Span, get_tracer_provider
|
||||
from opentelemetry.trace.status import StatusCode
|
||||
|
||||
from configs import dify_config
|
||||
from dify_app import DifyApp
|
||||
from extensions.otel.runtime import is_celery_worker
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ExceptionLoggingHandler(logging.Handler):
|
||||
def emit(self, record: logging.LogRecord):
|
||||
with contextlib.suppress(Exception):
|
||||
if record.exc_info:
|
||||
tracer = get_tracer_provider().get_tracer("dify.exception.logging")
|
||||
with tracer.start_as_current_span(
|
||||
"log.exception",
|
||||
attributes={
|
||||
"log.level": record.levelname,
|
||||
"log.message": record.getMessage(),
|
||||
"log.logger": record.name,
|
||||
"log.file.path": record.pathname,
|
||||
"log.file.line": record.lineno,
|
||||
},
|
||||
) as span:
|
||||
span.set_status(StatusCode.ERROR)
|
||||
if record.exc_info[1]:
|
||||
span.record_exception(record.exc_info[1])
|
||||
span.set_attribute("exception.message", str(record.exc_info[1]))
|
||||
if record.exc_info[0]:
|
||||
span.set_attribute("exception.type", record.exc_info[0].__name__)
|
||||
|
||||
|
||||
def instrument_exception_logging() -> None:
|
||||
exception_handler = ExceptionLoggingHandler()
|
||||
logging.getLogger().addHandler(exception_handler)
|
||||
|
||||
|
||||
def init_flask_instrumentor(app: DifyApp) -> None:
|
||||
meter = get_meter("http_metrics", version=dify_config.project.version)
|
||||
_http_response_counter = meter.create_counter(
|
||||
"http.server.response.count",
|
||||
description="Total number of HTTP responses by status code, method and target",
|
||||
unit="{response}",
|
||||
)
|
||||
|
||||
def response_hook(span: Span, status: str, response_headers: list) -> None:
|
||||
if span and span.is_recording():
|
||||
try:
|
||||
if status.startswith("2"):
|
||||
span.set_status(StatusCode.OK)
|
||||
else:
|
||||
span.set_status(StatusCode.ERROR, status)
|
||||
|
||||
status = status.split(" ")[0]
|
||||
status_code = int(status)
|
||||
status_class = f"{status_code // 100}xx"
|
||||
attributes: dict[str, str | int] = {"status_code": status_code, "status_class": status_class}
|
||||
request = flask.request
|
||||
if request and request.url_rule:
|
||||
attributes[SpanAttributes.HTTP_TARGET] = str(request.url_rule.rule)
|
||||
if request and request.method:
|
||||
attributes[SpanAttributes.HTTP_METHOD] = str(request.method)
|
||||
_http_response_counter.add(1, attributes)
|
||||
except Exception:
|
||||
logger.exception("Error setting status and attributes")
|
||||
|
||||
from opentelemetry.instrumentation.flask import FlaskInstrumentor
|
||||
|
||||
instrumentor = FlaskInstrumentor()
|
||||
if dify_config.DEBUG:
|
||||
logger.info("Initializing Flask instrumentor")
|
||||
instrumentor.instrument_app(app, response_hook=response_hook)
|
||||
|
||||
|
||||
def init_sqlalchemy_instrumentor(app: DifyApp) -> None:
|
||||
with app.app_context():
|
||||
engines = list(app.extensions["sqlalchemy"].engines.values())
|
||||
SQLAlchemyInstrumentor().instrument(enable_commenter=True, engines=engines)
|
||||
|
||||
|
||||
def init_redis_instrumentor() -> None:
|
||||
RedisInstrumentor().instrument()
|
||||
|
||||
|
||||
def init_httpx_instrumentor() -> None:
|
||||
HTTPXClientInstrumentor().instrument()
|
||||
|
||||
|
||||
def init_instruments(app: DifyApp) -> None:
|
||||
if not is_celery_worker():
|
||||
init_flask_instrumentor(app)
|
||||
CeleryInstrumentor(tracer_provider=get_tracer_provider(), meter_provider=get_meter_provider()).instrument()
|
||||
|
||||
instrument_exception_logging()
|
||||
init_sqlalchemy_instrumentor(app)
|
||||
init_redis_instrumentor()
|
||||
init_httpx_instrumentor()
|
||||
72
api/extensions/otel/runtime.py
Normal file
72
api/extensions/otel/runtime.py
Normal file
@ -0,0 +1,72 @@
|
||||
import logging
|
||||
import sys
|
||||
from typing import Union
|
||||
|
||||
from celery.signals import worker_init
|
||||
from flask_login import user_loaded_from_request, user_logged_in
|
||||
from opentelemetry import trace
|
||||
from opentelemetry.propagate import set_global_textmap
|
||||
from opentelemetry.propagators.b3 import B3Format
|
||||
from opentelemetry.propagators.composite import CompositePropagator
|
||||
from opentelemetry.trace.propagation.tracecontext import TraceContextTextMapPropagator
|
||||
|
||||
from configs import dify_config
|
||||
from libs.helper import extract_tenant_id
|
||||
from models import Account, EndUser
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def setup_context_propagation() -> None:
|
||||
set_global_textmap(
|
||||
CompositePropagator(
|
||||
[
|
||||
TraceContextTextMapPropagator(),
|
||||
B3Format(),
|
||||
]
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
def shutdown_tracer() -> None:
|
||||
provider = trace.get_tracer_provider()
|
||||
if hasattr(provider, "force_flush"):
|
||||
provider.force_flush()
|
||||
|
||||
|
||||
def is_celery_worker():
|
||||
return "celery" in sys.argv[0].lower()
|
||||
|
||||
|
||||
@user_logged_in.connect
|
||||
@user_loaded_from_request.connect
|
||||
def on_user_loaded(_sender, user: Union["Account", "EndUser"]):
|
||||
if dify_config.ENABLE_OTEL:
|
||||
from opentelemetry.trace import get_current_span
|
||||
|
||||
if user:
|
||||
try:
|
||||
current_span = get_current_span()
|
||||
tenant_id = extract_tenant_id(user)
|
||||
if not tenant_id:
|
||||
return
|
||||
if current_span:
|
||||
current_span.set_attribute("service.tenant.id", tenant_id)
|
||||
current_span.set_attribute("service.user.id", user.id)
|
||||
except Exception:
|
||||
logger.exception("Error setting tenant and user attributes")
|
||||
pass
|
||||
|
||||
|
||||
@worker_init.connect(weak=False)
|
||||
def init_celery_worker(*args, **kwargs):
|
||||
if dify_config.ENABLE_OTEL:
|
||||
from opentelemetry.instrumentation.celery import CeleryInstrumentor
|
||||
from opentelemetry.metrics import get_meter_provider
|
||||
from opentelemetry.trace import get_tracer_provider
|
||||
|
||||
tracer_provider = get_tracer_provider()
|
||||
metric_provider = get_meter_provider()
|
||||
if dify_config.DEBUG:
|
||||
logger.info("Initializing OpenTelemetry for Celery worker")
|
||||
CeleryInstrumentor(tracer_provider=tracer_provider, meter_provider=metric_provider).instrument()
|
||||
6
api/extensions/otel/semconv/__init__.py
Normal file
6
api/extensions/otel/semconv/__init__.py
Normal file
@ -0,0 +1,6 @@
|
||||
"""Semantic convention shortcuts for Dify-specific spans."""
|
||||
|
||||
from .dify import DifySpanAttributes
|
||||
from .gen_ai import GenAIAttributes
|
||||
|
||||
__all__ = ["DifySpanAttributes", "GenAIAttributes"]
|
||||
23
api/extensions/otel/semconv/dify.py
Normal file
23
api/extensions/otel/semconv/dify.py
Normal file
@ -0,0 +1,23 @@
|
||||
"""Dify-specific semantic convention definitions."""
|
||||
|
||||
|
||||
class DifySpanAttributes:
|
||||
"""Attribute names for Dify-specific spans."""
|
||||
|
||||
APP_ID = "dify.app_id"
|
||||
"""Application identifier."""
|
||||
|
||||
TENANT_ID = "dify.tenant_id"
|
||||
"""Tenant identifier."""
|
||||
|
||||
USER_TYPE = "dify.user_type"
|
||||
"""User type, e.g. Account, EndUser."""
|
||||
|
||||
STREAMING = "dify.streaming"
|
||||
"""Whether streaming response is enabled."""
|
||||
|
||||
WORKFLOW_ID = "dify.workflow_id"
|
||||
"""Workflow identifier."""
|
||||
|
||||
INVOKE_FROM = "dify.invoke_from"
|
||||
"""Invocation source, e.g. SERVICE_API, WEB_APP, DEBUGGER."""
|
||||
64
api/extensions/otel/semconv/gen_ai.py
Normal file
64
api/extensions/otel/semconv/gen_ai.py
Normal file
@ -0,0 +1,64 @@
|
||||
"""
|
||||
GenAI semantic conventions.
|
||||
"""
|
||||
|
||||
|
||||
class GenAIAttributes:
|
||||
"""Common GenAI attribute keys."""
|
||||
|
||||
USER_ID = "gen_ai.user.id"
|
||||
"""Identifier of the end user in the application layer."""
|
||||
|
||||
FRAMEWORK = "gen_ai.framework"
|
||||
"""Framework type. Fixed to 'dify' in this project."""
|
||||
|
||||
SPAN_KIND = "gen_ai.span.kind"
|
||||
"""Operation type. Extended specification, not in OTel standard."""
|
||||
|
||||
|
||||
class ChainAttributes:
|
||||
"""Chain operation attribute keys."""
|
||||
|
||||
OPERATION_NAME = "gen_ai.operation.name"
|
||||
"""Secondary operation type, e.g. WORKFLOW, TASK."""
|
||||
|
||||
INPUT_VALUE = "input.value"
|
||||
"""Input content."""
|
||||
|
||||
OUTPUT_VALUE = "output.value"
|
||||
"""Output content."""
|
||||
|
||||
TIME_TO_FIRST_TOKEN = "gen_ai.user.time_to_first_token"
|
||||
"""Time to first token in nanoseconds from receiving the request to first token return."""
|
||||
|
||||
|
||||
class RetrieverAttributes:
|
||||
"""Retriever operation attribute keys."""
|
||||
|
||||
QUERY = "retrieval.query"
|
||||
"""Retrieval query string."""
|
||||
|
||||
DOCUMENT = "retrieval.document"
|
||||
"""Retrieved document list as JSON array."""
|
||||
|
||||
|
||||
class ToolAttributes:
|
||||
"""Tool operation attribute keys."""
|
||||
|
||||
TOOL_CALL_ID = "gen_ai.tool.call.id"
|
||||
"""Tool call identifier."""
|
||||
|
||||
TOOL_DESCRIPTION = "gen_ai.tool.description"
|
||||
"""Tool description."""
|
||||
|
||||
TOOL_NAME = "gen_ai.tool.name"
|
||||
"""Tool name."""
|
||||
|
||||
TOOL_TYPE = "gen_ai.tool.type"
|
||||
"""Tool type. Examples: function, extension, datastore."""
|
||||
|
||||
TOOL_CALL_ARGUMENTS = "gen_ai.tool.call.arguments"
|
||||
"""Tool invocation arguments."""
|
||||
|
||||
TOOL_CALL_RESULT = "gen_ai.tool.call.result"
|
||||
"""Tool invocation result."""
|
||||
Reference in New Issue
Block a user