From 1815f5950bae3e6be2b4ecb7e435a1f58219c8b5 Mon Sep 17 00:00:00 2001 From: qinling0210 <88864212+qinling0210@users.noreply.github.com> Date: Wed, 11 Mar 2026 13:42:24 +0800 Subject: [PATCH] Call get_flatted_meta_by_kbs in dify retrieval (#13509) ### What problem does this PR solve? Fix https://github.com/infiniflow/ragflow/issues/13388 Call get_flatted_meta_by_kbs in dify retrieval. Remove get_meta_by_kbs. ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --- api/apps/sdk/dify_retrieval.py | 2 +- api/db/services/doc_metadata_service.py | 76 ------------------- .../test_dify_retrieval_routes_unit.py | 13 +++- 3 files changed, 10 insertions(+), 81 deletions(-) diff --git a/api/apps/sdk/dify_retrieval.py b/api/apps/sdk/dify_retrieval.py index 07ad5d200..e6dd61d03 100644 --- a/api/apps/sdk/dify_retrieval.py +++ b/api/apps/sdk/dify_retrieval.py @@ -123,7 +123,7 @@ async def retrieval(tenant_id): similarity_threshold = float(retrieval_setting.get("score_threshold", 0.0)) top = int(retrieval_setting.get("top_k", 1024)) metadata_condition = req.get("metadata_condition", {}) or {} - metas = DocMetadataService.get_meta_by_kbs([kb_id]) + metas = DocMetadataService.get_flatted_meta_by_kbs([kb_id]) doc_ids = [] try: diff --git a/api/db/services/doc_metadata_service.py b/api/db/services/doc_metadata_service.py index 02a6b8927..f2ee29e6d 100644 --- a/api/db/services/doc_metadata_service.py +++ b/api/db/services/doc_metadata_service.py @@ -694,82 +694,6 @@ class DocMetadataService: logging.error(f"Error getting metadata for document {doc_id}: {e}") return {} - @classmethod - @DB.connection_context() - def get_meta_by_kbs(cls, kb_ids: List[str]) -> Dict: - """ - Get metadata for documents in knowledge bases (Legacy). - - Legacy metadata aggregator (backward-compatible). - - Does NOT expand list values and a list is kept as one string key. - Example: {"tags": ["foo","bar"]} -> meta["tags"]["['foo', 'bar']"] = [doc_id] - - Expects meta_fields is a dict. - Use when existing callers rely on the old list-as-string semantics. - - Args: - kb_ids: List of knowledge base IDs - - Returns: - Metadata dictionary in format: {field_name: {value: [doc_ids]}} - """ - try: - # Get tenant_id from first KB - kb = Knowledgebase.get_by_id(kb_ids[0]) - if not kb: - return {} - - tenant_id = kb.tenant_id - index_name = cls._get_doc_meta_index_name(tenant_id) - - condition = {"kb_id": kb_ids} - order_by = OrderByExpr() - - # Query with large limit - results = settings.docStoreConn.search( - select_fields=["*"], - highlight_fields=[], - condition=condition, - match_expressions=[], - order_by=order_by, - offset=0, - limit=10000, - index_names=index_name, - knowledgebase_ids=kb_ids - ) - - logging.debug(f"[get_meta_by_kbs] index_name: {index_name}, kb_ids: {kb_ids}") - - # Aggregate metadata (legacy: keeps lists as string keys) - meta = {} - - # Use helper to iterate over results in any format - for doc_id, doc in cls._iter_search_results(results): - # Extract metadata fields (exclude system fields) - doc_meta = cls._extract_metadata(doc) - - # Legacy: Keep lists as string keys (do NOT expand) - for k, v in doc_meta.items(): - if k not in meta: - meta[k] = {} - # If not list, make it a list - if not isinstance(v, list): - v = [v] - # Legacy: Use the entire list as a string key - # Skip nested lists/dicts - if isinstance(v, list) and any(isinstance(x, (list, dict)) for x in v): - continue - list_key = str(v) - if list_key not in meta[k]: - meta[k][list_key] = [] - meta[k][list_key].append(doc_id) - - logging.debug(f"[get_meta_by_kbs] KBs: {kb_ids}, Returning metadata: {meta}") - return meta - - except Exception as e: - logging.error(f"Error getting metadata for KBs {kb_ids}: {e}") - return {} - @classmethod @DB.connection_context() def get_flatted_meta_by_kbs(cls, kb_ids: List[str]) -> Dict: diff --git a/test/testcases/test_http_api/test_dataset_management/test_dify_retrieval_routes_unit.py b/test/testcases/test_http_api/test_dataset_management/test_dify_retrieval_routes_unit.py index b7c625cbe..ac98d9e1d 100644 --- a/test/testcases/test_http_api/test_dataset_management/test_dify_retrieval_routes_unit.py +++ b/test/testcases/test_http_api/test_dataset_management/test_dify_retrieval_routes_unit.py @@ -161,6 +161,11 @@ def _load_dify_retrieval_module(monkeypatch): tenant_llm_service_mod.TenantService = _StubTenantService tenant_llm_service_mod.TenantLLMService = _StubTenantLLMService + + class _StubLLMFactoriesService: + pass + + tenant_llm_service_mod.LLMFactoriesService = _StubLLMFactoriesService monkeypatch.setitem(sys.modules, "api.db.services.tenant_llm_service", tenant_llm_service_mod) # Mock llm_service for LLMService @@ -265,7 +270,7 @@ def test_retrieval_success_with_metadata_and_kg(monkeypatch): ) monkeypatch.setattr(module, "jsonify", lambda payload: payload) - monkeypatch.setattr(module.DocMetadataService, "get_meta_by_kbs", lambda _kb_ids: [{"doc_id": "doc-1"}]) + monkeypatch.setattr(module.DocMetadataService, "get_flatted_meta_by_kbs", lambda _kbs: [{"doc_id": "doc-1"}]) monkeypatch.setattr(module.KnowledgebaseService, "get_by_id", lambda _kb_id: (True, _DummyKB())) monkeypatch.setattr(module, "convert_conditions", lambda cond: cond.get("conditions", [])) monkeypatch.setattr(module, "meta_filter", lambda *_args, **_kwargs: []) @@ -303,7 +308,7 @@ def test_retrieval_success_with_metadata_and_kg(monkeypatch): def test_retrieval_kb_not_found(monkeypatch): module = _load_dify_retrieval_module(monkeypatch) _set_request_json(monkeypatch, module, {"knowledge_id": "kb-missing", "query": "hello"}) - monkeypatch.setattr(module.DocMetadataService, "get_meta_by_kbs", lambda _kb_ids: []) + monkeypatch.setattr(module.DocMetadataService, "get_flatted_meta_by_kbs", lambda _kbs: []) monkeypatch.setattr(module.KnowledgebaseService, "get_by_id", lambda _kb_id: (False, None)) res = _run(inspect.unwrap(module.retrieval)("tenant-1")) @@ -315,7 +320,7 @@ def test_retrieval_kb_not_found(monkeypatch): def test_retrieval_not_found_exception_mapping(monkeypatch): module = _load_dify_retrieval_module(monkeypatch) _set_request_json(monkeypatch, module, {"knowledge_id": "kb-1", "query": "hello"}) - monkeypatch.setattr(module.DocMetadataService, "get_meta_by_kbs", lambda _kb_ids: []) + monkeypatch.setattr(module.DocMetadataService, "get_flatted_meta_by_kbs", lambda _kbs: []) monkeypatch.setattr(module.KnowledgebaseService, "get_by_id", lambda _kb_id: (True, _DummyKB())) monkeypatch.setattr(module, "label_question", lambda *_args, **_kwargs: []) @@ -334,7 +339,7 @@ def test_retrieval_not_found_exception_mapping(monkeypatch): def test_retrieval_generic_exception_mapping(monkeypatch): module = _load_dify_retrieval_module(monkeypatch) _set_request_json(monkeypatch, module, {"knowledge_id": "kb-1", "query": "hello"}) - monkeypatch.setattr(module.DocMetadataService, "get_meta_by_kbs", lambda _kb_ids: []) + monkeypatch.setattr(module.DocMetadataService, "get_flatted_meta_by_kbs", lambda _kbs: []) monkeypatch.setattr(module.KnowledgebaseService, "get_by_id", lambda _kb_id: (True, _DummyKB())) monkeypatch.setattr(module, "label_question", lambda *_args, **_kwargs: [])