Call get_flatted_meta_by_kbs in dify retrieval (#13509)

### What problem does this PR solve?

Fix https://github.com/infiniflow/ragflow/issues/13388

The Dify retrieval endpoint now calls `DocMetadataService.get_flatted_meta_by_kbs` instead of `get_meta_by_kbs`, and the legacy `get_meta_by_kbs` method is removed.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
qinling0210
2026-03-11 13:42:24 +08:00
committed by GitHub
parent 2d2d3cdbcf
commit 1815f5950b
3 changed files with 10 additions and 81 deletions

View File

@ -123,7 +123,7 @@ async def retrieval(tenant_id):
similarity_threshold = float(retrieval_setting.get("score_threshold", 0.0))
top = int(retrieval_setting.get("top_k", 1024))
metadata_condition = req.get("metadata_condition", {}) or {}
metas = DocMetadataService.get_meta_by_kbs([kb_id])
metas = DocMetadataService.get_flatted_meta_by_kbs([kb_id])
doc_ids = []
try:

View File

@ -694,82 +694,6 @@ class DocMetadataService:
logging.error(f"Error getting metadata for document {doc_id}: {e}")
return {}
@classmethod
@DB.connection_context()
def get_meta_by_kbs(cls, kb_ids: List[str]) -> Dict:
    """
    Aggregate document metadata across knowledge bases (legacy format).

    Unlike a flattening aggregator, list values are NOT expanded: the whole
    list is stringified and used as a single key. Scalar values are first
    wrapped in a one-element list, so they are keyed the same way.
        Example: {"tags": ["foo","bar"]} -> meta["tags"]["['foo', 'bar']"] = [doc_id]
    Values whose list contains a nested list or dict are skipped, although
    the field name itself is still registered in the result.

    The tenant (and therefore the metadata index) is resolved from the
    first entry of ``kb_ids`` only.

    Args:
        kb_ids: List of knowledge base IDs

    Returns:
        Metadata dictionary in format: {field_name: {value: [doc_ids]}}.
        An empty dict is returned when the first knowledge base cannot be
        found or when any exception is raised (the error is logged).
    """
    try:
        # The tenant is taken from the first KB; it determines the index name.
        kb = Knowledgebase.get_by_id(kb_ids[0])
        if not kb:
            return {}
        index_name = cls._get_doc_meta_index_name(kb.tenant_id)

        # Single query with a large fixed limit (no pagination).
        results = settings.docStoreConn.search(
            select_fields=["*"],
            highlight_fields=[],
            condition={"kb_id": kb_ids},
            match_expressions=[],
            order_by=OrderByExpr(),
            offset=0,
            limit=10000,
            index_names=index_name,
            knowledgebase_ids=kb_ids,
        )
        logging.debug(f"[get_meta_by_kbs] index_name: {index_name}, kb_ids: {kb_ids}")

        meta = {}
        # _iter_search_results yields (doc_id, doc) pairs from the raw results.
        for doc_id, doc in cls._iter_search_results(results):
            for field, raw in cls._extract_metadata(doc).items():
                # Register the field before the skip check below, so a field
                # with only nested values still appears (with an empty dict).
                per_value = meta.setdefault(field, {})
                values = raw if isinstance(raw, list) else [raw]
                # Nested lists/dicts are not aggregated.
                if any(isinstance(item, (list, dict)) for item in values):
                    continue
                # Legacy semantics: the stringified list is the key.
                per_value.setdefault(str(values), []).append(doc_id)

        logging.debug(f"[get_meta_by_kbs] KBs: {kb_ids}, Returning metadata: {meta}")
        return meta
    except Exception as e:
        logging.error(f"Error getting metadata for KBs {kb_ids}: {e}")
        return {}
@classmethod
@DB.connection_context()
def get_flatted_meta_by_kbs(cls, kb_ids: List[str]) -> Dict:

View File

@ -161,6 +161,11 @@ def _load_dify_retrieval_module(monkeypatch):
tenant_llm_service_mod.TenantService = _StubTenantService
tenant_llm_service_mod.TenantLLMService = _StubTenantLLMService
class _StubLLMFactoriesService:
pass
tenant_llm_service_mod.LLMFactoriesService = _StubLLMFactoriesService
monkeypatch.setitem(sys.modules, "api.db.services.tenant_llm_service", tenant_llm_service_mod)
# Mock llm_service for LLMService
@ -265,7 +270,7 @@ def test_retrieval_success_with_metadata_and_kg(monkeypatch):
)
monkeypatch.setattr(module, "jsonify", lambda payload: payload)
monkeypatch.setattr(module.DocMetadataService, "get_meta_by_kbs", lambda _kb_ids: [{"doc_id": "doc-1"}])
monkeypatch.setattr(module.DocMetadataService, "get_flatted_meta_by_kbs", lambda _kbs: [{"doc_id": "doc-1"}])
monkeypatch.setattr(module.KnowledgebaseService, "get_by_id", lambda _kb_id: (True, _DummyKB()))
monkeypatch.setattr(module, "convert_conditions", lambda cond: cond.get("conditions", []))
monkeypatch.setattr(module, "meta_filter", lambda *_args, **_kwargs: [])
@ -303,7 +308,7 @@ def test_retrieval_success_with_metadata_and_kg(monkeypatch):
def test_retrieval_kb_not_found(monkeypatch):
module = _load_dify_retrieval_module(monkeypatch)
_set_request_json(monkeypatch, module, {"knowledge_id": "kb-missing", "query": "hello"})
monkeypatch.setattr(module.DocMetadataService, "get_meta_by_kbs", lambda _kb_ids: [])
monkeypatch.setattr(module.DocMetadataService, "get_flatted_meta_by_kbs", lambda _kbs: [])
monkeypatch.setattr(module.KnowledgebaseService, "get_by_id", lambda _kb_id: (False, None))
res = _run(inspect.unwrap(module.retrieval)("tenant-1"))
@ -315,7 +320,7 @@ def test_retrieval_kb_not_found(monkeypatch):
def test_retrieval_not_found_exception_mapping(monkeypatch):
module = _load_dify_retrieval_module(monkeypatch)
_set_request_json(monkeypatch, module, {"knowledge_id": "kb-1", "query": "hello"})
monkeypatch.setattr(module.DocMetadataService, "get_meta_by_kbs", lambda _kb_ids: [])
monkeypatch.setattr(module.DocMetadataService, "get_flatted_meta_by_kbs", lambda _kbs: [])
monkeypatch.setattr(module.KnowledgebaseService, "get_by_id", lambda _kb_id: (True, _DummyKB()))
monkeypatch.setattr(module, "label_question", lambda *_args, **_kwargs: [])
@ -334,7 +339,7 @@ def test_retrieval_not_found_exception_mapping(monkeypatch):
def test_retrieval_generic_exception_mapping(monkeypatch):
module = _load_dify_retrieval_module(monkeypatch)
_set_request_json(monkeypatch, module, {"knowledge_id": "kb-1", "query": "hello"})
monkeypatch.setattr(module.DocMetadataService, "get_meta_by_kbs", lambda _kb_ids: [])
monkeypatch.setattr(module.DocMetadataService, "get_flatted_meta_by_kbs", lambda _kbs: [])
monkeypatch.setattr(module.KnowledgebaseService, "get_by_id", lambda _kb_id: (True, _DummyKB()))
monkeypatch.setattr(module, "label_question", lambda *_args, **_kwargs: [])