Feat/configurable metadata display (#13464)

### What problem does this PR solve?

Currently, RAGFlow's Search and Chat interfaces display only raw
vectorized text chunks during retrieval, without contextual information
about their source documents. Users cannot see document titles, page
numbers, upload dates, or custom metadata fields that would help them
understand and trust the retrieved results.

This PR introduces an **optional metadata display feature** that
enriches retrieved chunks with document-level metadata in both the
Search tab and Chatbot interface.

**Key improvements:**
- **Search results**: Display document metadata as styled badges beneath
chunk snippets
- **Chat citations**: Show metadata in citation popovers and reference
lists for better source context
- **LLM context**: Metadata is injected into the LLM prompt to enable
more accurate, citation-aware responses
- **External API support**: Applications using RAGFlow's SDK retrieval
endpoints (`/v1/retrieval`, `/v1/searchbots/retrieval_test`) can opt-in
via request parameters
- **User control**: Multi-select dropdown UI allows users to choose
which metadata fields to display

**Implementation approach:**
-  Reuses existing `DocMetadataService` infrastructure (no new database
tables or indices)
-  Settings stored in existing JSON configuration fields
(`search_config.reference_metadata`, `prompt_config.reference_metadata`)
-  No database migrations required
-  Disabled by default (fully opt-in and backward-compatible)
-  Dynamic metadata field selection populated from actual document
metadata keys
-  Fixed critical bug where Python's builtin `set()` was shadowed by a
route handler function

**Modified endpoints (all backward-compatible):**
- `POST /v1/retrieval` (Public SDK)
- `POST /v1/searchbots/retrieval_test` (Searchbots)
- `POST /v1/chunk/retrieval_test` (UI/Internal)
- Chat completions endpoints (via `extra_body.reference_metadata` or
`prompt_config`)

### Type of change

- [x] New Feature (non-breaking change which adds functionality)


### Images
-
<img width="879" height="1275" alt="image"
src="https://github.com/user-attachments/assets/95b2d731-31ae-45a1-b081-bf5893f52aeb"
/>
<br><br>
<br><br>

<img width="1532" height="362" alt="image"
src="https://github.com/user-attachments/assets/9cebc65b-b7a7-459f-b25e-3b13fa9b638e"
/>
<br><br>
<br><br>

<img width="2586" height="1320" alt="image"
src="https://github.com/user-attachments/assets/2153d493-d899-461f-a7a9-041391e07776"
/>

---------

Co-authored-by: Cursor Agent <cursoragent@cursor.com>
Co-authored-by: Attili-sys <Attili-sys@users.noreply.github.com>
Co-authored-by: Ahmad Intisar <ahmadintisar@Ahmads-MacBook-M4-Pro.local>
This commit is contained in:
Attili-sys
2026-04-30 18:13:27 +03:00
committed by GitHub
parent d38d6e7931
commit 24af0875e5
23 changed files with 1004 additions and 67 deletions

View File

@ -48,44 +48,35 @@ def _validate_llm_id(llm_id, tenant_id, llm_setting=None):
return None
import logging
from api.utils.reference_metadata_utils import enrich_chunks_with_document_metadata
def _build_reference_chunks(reference, include_metadata=False, metadata_fields=None):
    """Format reference chunks, optionally enriching them with document metadata.

    Args:
        reference: Raw reference structure accepted by ``chunks_format``.
        include_metadata: When False (the default) the feature is opt-out and
            the formatted chunks are returned untouched.
        metadata_fields: Optional list of metadata field names to keep.
            ``None`` means "all fields"; a non-list value, or a list that
            contains no string entries, disables enrichment entirely.

    Returns:
        The list of formatted chunk dicts; when enrichment runs, each chunk
        gains a ``document_metadata`` entry (added in place by the shared
        helper).
    """
    chunks = chunks_format(reference)
    if not include_metadata:
        logging.debug("Skipping document metadata enrichment (include_metadata=False)")
        return chunks

    # Normalize the requested field filter: None -> all fields; anything
    # that is not a list, or a list with no string entries, opts out.
    normalized_fields = None
    if metadata_fields is not None:
        if not isinstance(metadata_fields, list):
            return chunks
        normalized_fields = {f for f in metadata_fields if isinstance(f, str)}
        if not normalized_fields:
            return chunks

    # Lazy %-style args keep the debug call cheap when the level is disabled.
    logging.debug(
        "Enriching %d chunks with document metadata (fields: %s)",
        len(chunks),
        "ALL" if normalized_fields is None else list(normalized_fields),
    )
    # Shared helper resolves document metadata per knowledge base and
    # attaches it to each chunk in place.
    enrich_chunks_with_document_metadata(
        chunks,
        normalized_fields,
        kb_field="dataset_id",
        doc_field="document_id",
    )
    return chunks

View File

@ -13,6 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
from io import BytesIO
from quart import request, send_file
@ -37,6 +38,18 @@ from rag.prompts.generator import cross_languages, keyword_extraction
MAXIMUM_OF_UPLOADING_FILES = 256
from api.utils.reference_metadata_utils import (
enrich_chunks_with_document_metadata,
resolve_reference_metadata_preferences,
)
def _resolve_reference_metadata(req: dict, search_config: dict | None = None):
    # Module-local convenience wrapper: resolves the (include_metadata,
    # metadata_fields) preferences from the request body and, optionally,
    # the saved search_config, via the shared resolver.
    return resolve_reference_metadata_preferences(req, search_config)


def _enrich_chunks_with_document_metadata(chunks: list[dict], metadata_fields=None) -> None:
    # Delegates to the shared enrichment helper, which mutates `chunks`
    # in place (attaches document-level metadata); returns None.
    enrich_chunks_with_document_metadata(chunks, metadata_fields)
@manager.route("/datasets/<dataset_id>/documents/<document_id>", methods=["GET"]) # noqa: F821
@token_required
async def download(tenant_id, dataset_id, document_id):
@ -450,6 +463,7 @@ async def retrieval_test(tenant_id):
return get_error_data_result("`highlight` should be a boolean")
else:
return get_error_data_result("`highlight` should be a boolean")
include_metadata, metadata_fields = _resolve_reference_metadata(req)
try:
tenant_ids = list(set([kb.tenant_id for kb in kbs]))
e, kb = KnowledgebaseService.get_by_id(kb_ids[0])
@ -508,6 +522,15 @@ async def retrieval_test(tenant_id):
for c in ranks["chunks"]:
c.pop("vector", None)
if include_metadata:
logging.info(
"sdk.retrieval reference_metadata enabled dataset_ids=%s fields=%s chunks=%s",
kb_ids,
sorted(metadata_fields) if metadata_fields else None,
len(ranks["chunks"]),
)
enrich_chunks_with_document_metadata(ranks["chunks"], metadata_fields)
##rename keys
renamed_chunks = []
for chunk in ranks["chunks"]:

View File

@ -44,6 +44,10 @@ from rag.prompts.template import load_prompt
from rag.prompts.generator import cross_languages, keyword_extraction
from common.constants import RetCode, LLMType
from common import settings
from api.utils.reference_metadata_utils import (
enrich_chunks_with_document_metadata,
resolve_reference_metadata_preferences,
)
@token_required
@ -327,6 +331,7 @@ async def retrieval_test_embedded():
tenant_id = objs[0].tenant_id
if not tenant_id:
return get_error_data_result(message="permission denined.")
search_config = {}
async def _retrieval():
nonlocal similarity_threshold, vector_similarity_weight, top, rerank_id
@ -337,8 +342,11 @@ async def retrieval_test_embedded():
meta_data_filter = {}
chat_mdl = None
if req.get("search_id", ""):
search_config = SearchService.get_detail(req.get("search_id", "")).get("search_config", {})
meta_data_filter = search_config.get("meta_data_filter", {})
nonlocal search_config
detail = SearchService.get_detail(req.get("search_id", ""))
if detail:
search_config = detail.get("search_config", {})
meta_data_filter = search_config.get("meta_data_filter", {})
if meta_data_filter.get("method") in ["auto", "semi_auto"]:
chat_id = search_config.get("chat_id", "")
if chat_id:
@ -414,6 +422,11 @@ async def retrieval_test_embedded():
for c in ranks["chunks"]:
c.pop("vector", None)
include_metadata, metadata_fields = _resolve_reference_metadata(req, search_config)
if include_metadata:
enrich_chunks_with_document_metadata(ranks["chunks"], metadata_fields)
ranks["labels"] = labels
return get_json_result(data=ranks)
@ -529,3 +542,6 @@ async def mindmap():
return server_error_response(Exception(mind_map["error"]))
return get_json_result(data=mind_map)
def _resolve_reference_metadata(req, search_config=None):
    """Resolve the (include_metadata, metadata_fields) preferences for this route.

    Thin delegate to the shared resolver, which reads the opt-in flag and the
    optional field list from the request body and/or the saved search_config.
    """
    return resolve_reference_metadata_preferences(req, search_config)