fix: Langfuse chat observation (#15026)

### What problem does this PR solve? Closes #15025 Langfuse-enabled `dialog_service.async_chat()` regressed to `langfuse_tracer.start_generation(...)` after the earlier Langfuse v4 migration. Langfuse v4 uses `start_observation(as_type="generation")`, so the remaining `start_generation` call can fail when chat tracing is enabled. This restores the migrated `start_observation(as_type="generation")` call for chat observations while preserving the existing trace context, model, input payload, and update/end flow. It also adds a regression test with a fake Langfuse v4-style client that exposes `start_observation()` but not `start_generation()`. ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) ### Tests - `.venv/bin/pytest test/unit_test/api/db/services/test_dialog_service_final_answer.py -q` - `.venv/bin/ruff check api/db/services/dialog_service.py test/unit_test/api/db/services/test_dialog_service_final_answer.py`
2026-05-28 03:33:05 +08:00 · 2026-05-20 04:01:19 -03:00
parent 1ed8a118cf
commit 6499bce2a6
2 changed files with 180 additions and 6 deletions
--- a/api/db/services/dialog_service.py
+++ b/api/db/services/dialog_service.py
@ -572,6 +572,7 @@ async def async_chat(dialog, messages, stream=True, **kwargs):
    check_llm_ts = timer()

    langfuse_tracer = None
+    langfuse_generation = None
    trace_context = {}
    langfuse_keys = TenantLangfuseService.filter_by_tenant(tenant_id=dialog.tenant_id)
    if langfuse_keys:
@ -783,7 +784,7 @@ async def async_chat(dialog, messages, stream=True, **kwargs):
        gen_conf["max_tokens"] = min(gen_conf["max_tokens"], max_tokens - used_token_count)

    async def decorate_answer(answer):
-        nonlocal embd_mdl, prompt_config, knowledges, kwargs, kbinfos, prompt, retrieval_ts, questions, langfuse_tracer
+        nonlocal embd_mdl, prompt_config, knowledges, kwargs, kbinfos, prompt, retrieval_ts, questions, langfuse_generation

        refs = []
        ans = answer.split("</think>")
@ -855,8 +856,8 @@ async def async_chat(dialog, messages, stream=True, **kwargs):
            f"  - Token speed: {int(tk_num / (generate_result_time_cost / 1000.0))}/s"
        )

-        # Add a condition check to call the end method only if langfuse_tracer exists
-        if langfuse_tracer and "langfuse_generation" in locals():
+        # Add a condition check to call the end method only if langfuse_generation exists
+        if langfuse_generation is not None:
            langfuse_output = "\n" + re.sub(r"^.*?(### Query:.*)", r"\1", prompt, flags=re.DOTALL)
            langfuse_output = {"time_elapsed:": re.sub(r"\n", "  \n", langfuse_output), "created_at": time.time()}
            langfuse_generation.update(
@ -872,9 +873,18 @@ async def async_chat(dialog, messages, stream=True, **kwargs):
        return {"answer": think + answer, "reference": refs, "prompt": re.sub(r"\n", "  \n", prompt), "created_at": time.time()}

    if langfuse_tracer:
-        langfuse_generation = langfuse_tracer.start_generation(
-            trace_context=trace_context, name="chat", model=llm_model_config["llm_name"], input={"prompt": prompt, "prompt4citation": prompt4citation, "messages": msg}
-        )
+        try:
+            langfuse_generation = langfuse_tracer.start_observation(
+                as_type="generation",
+                trace_context=trace_context,
+                name="chat",
+                model=llm_model_config["llm_name"],
+                input={"prompt": prompt, "prompt4citation": prompt4citation, "messages": msg},
+            )
+        except Exception as e:  # noqa: BLE001 - tracing must not break chat flow
+            logger.warning("Langfuse start_observation failed; continuing without tracing: %s", e)
+            langfuse_tracer = None
+            langfuse_generation = None

    if stream:
        if llm_type == "chat":