Fix retrieval function when metadata_condition is specified in retrieval API (#13473)

### What problem does this PR solve?

Fix https://github.com/infiniflow/ragflow/issues/13388

The following command returns an empty result even when a document with the matching metadata exists:

```
curl --request POST \
  --url http://localhost:9222/api/v1/retrieval \
  --header 'Content-Type: application/json' \
  --header 'Authorization: Bearer ragflow-fO3mPFePfLgUYg8-9gjBVVXbvHqrvMPLGaW0P86PvAk' \
  --data '{
    "question": "any question",
    "dataset_ids": ["9bb4f0591b8811f18a4a84ba59049aa3"],
    "metadata_condition": {
      "logic": "and",
      "conditions": [
        {
          "name": "character",
          "comparison_operator": "is",
          "value": "刘备"
        }
      ]
    }
  }'
```

When metadata_condition is specified in the retrieval API, it is converted to doc_ids, and doc_ids is passed to the retrieval function. In the retrieval function, when doc_ids is explicitly provided, we should bypass the similarity threshold.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
2026-05-03 00:37:48 +08:00 · 2026-03-10 11:57:32 +08:00
parent 292a1a8566
commit 7c92f51133
3 changed files with 13 additions and 2 deletions
--- a/api/apps/sdk/doc.py
+++ b/api/apps/sdk/doc.py
@ -1682,7 +1682,7 @@ async def retrieval_test(tenant_id):
    if not doc_ids:
        metadata_condition = req.get("metadata_condition")
        if metadata_condition:
-            metas = DocMetadataService.get_meta_by_kbs(kb_ids)
+            metas = DocMetadataService.get_flatted_meta_by_kbs(kb_ids)
            doc_ids = meta_filter(metas, convert_conditions(metadata_condition), metadata_condition.get("logic", "and"))
            # If metadata_condition has conditions but no docs match, return empty result
            if not doc_ids and metadata_condition.get("conditions"):
--- a/rag/nlp/search.py
+++ b/rag/nlp/search.py
@ -438,6 +438,12 @@ class Dealer:

        # When vector_similarity_weight is 0, similarity_threshold is not meaningful for term-only scores.
        post_threshold = 0.0 if vector_similarity_weight <= 0 else similarity_threshold
+
+        # When doc_ids is explicitly provided (metadata or document filtering), bypass threshold
+        # User wants those specific documents regardless of their relevance score
+        if doc_ids:
+            post_threshold = 0.0
+
        valid_idx = [int(i) for i in sorted_idx if sim_np[i] >= post_threshold]
        filtered_count = len(valid_idx)
        ranks["total"] = int(filtered_count)
--- a/test/testcases/test_http_api/test_file_management_within_dataset/test_doc_sdk_routes_unit.py
+++ b/test/testcases/test_http_api/test_file_management_within_dataset/test_doc_sdk_routes_unit.py
@ -220,6 +220,11 @@ def _load_doc_module(monkeypatch):
    
    tenant_llm_service_mod.TenantService = _StubTenantService
    tenant_llm_service_mod.TenantLLMService = _StubTenantLLMService
+
+    class _StubLLMFactoriesService:
+        pass
+
+    tenant_llm_service_mod.LLMFactoriesService = _StubLLMFactoriesService
    monkeypatch.setitem(sys.modules, "api.db.services.tenant_llm_service", tenant_llm_service_mod)

    # Mock LLMService
@ -993,7 +998,7 @@ class TestDocRoutesUnit:
            "get_request_json",
            lambda: _AwaitableValue({"dataset_ids": ["ds-1"], "question": "q", "metadata_condition": {"logic": "and"}}),
        )
-        monkeypatch.setattr(module.DocMetadataService, "get_meta_by_kbs", lambda _ids: [])
+        monkeypatch.setattr(module.DocMetadataService, "get_flatted_meta_by_kbs", lambda _kbs: [])
        monkeypatch.setattr(module, "meta_filter", lambda *_args, **_kwargs: [])
        res = _run(module.retrieval_test.__wrapped__("tenant-1"))
        assert "code" in res