Fix document metadata loading for paged listings (#13515)

## Summary

- Scope normal document-list metadata lookups to the current page's document IDs.
- Keep the `return_empty_metadata=True` path dataset-wide, because it needs full knowledge of which docs already have metadata.
- Add unit tests for both paged listing paths and for the unchanged empty-metadata behavior.

## Why

`DocumentService.get_list()` and the normal `get_by_kb_id()` path were calling `DocMetadataService.get_metadata_for_documents(None, kb_id)`, which loads metadata for the entire dataset on every page request.

That becomes especially problematic on large datasets. The metadata scan path paginates through the full metadata index without an explicit sort, while the ES helper only switches to `search_after` beyond `10000` results when a sort is present. In practice this can lead to unnecessary full-dataset metadata work, slower document-list loading, and unreliable `meta_fields` in list responses for large KBs.

This change keeps the existing empty-metadata filter behavior intact, but scopes normal list responses to metadata for the current page only.
2026-04-26 05:25:41 +08:00 · 2026-03-11 06:42:16 +01:00
parent 507ba4ea20
commit 2d2d3cdbcf
2 changed files with 204 additions and 5 deletions
--- a/api/db/services/document_service.py
+++ b/api/db/services/document_service.py
@@ -107,7 +107,8 @@ class DocumentService(CommonService):
        docs = docs.paginate(page_number, items_per_page)

        docs_list = list(docs.dicts())
-        metadata_map = DocMetadataService.get_metadata_for_documents(None, kb_id)
+        doc_ids_on_page = [doc["id"] for doc in docs_list]
+        metadata_map = DocMetadataService.get_metadata_for_documents(doc_ids_on_page, kb_id) if doc_ids_on_page else {}
        for doc in docs_list:
            doc["meta_fields"] = metadata_map.get(doc["id"], {})
        return docs_list, count
@@ -156,10 +157,12 @@ class DocumentService(CommonService):
        if suffix:
            docs = docs.where(cls.model.suffix.in_(suffix))

-        metadata_map = DocMetadataService.get_metadata_for_documents(None, kb_id)
-        doc_ids_with_metadata = set(metadata_map.keys())
-        if return_empty_metadata and doc_ids_with_metadata:
-            docs = docs.where(cls.model.id.not_in(doc_ids_with_metadata))
+        metadata_map = {}
+        if return_empty_metadata:
+            metadata_map = DocMetadataService.get_metadata_for_documents(None, kb_id)
+            doc_ids_with_metadata = set(metadata_map.keys())
+            if doc_ids_with_metadata:
+                docs = docs.where(cls.model.id.not_in(doc_ids_with_metadata))

        count = docs.count()
        if desc:
@@ -175,6 +178,8 @@ class DocumentService(CommonService):
            for doc in docs_list:
                doc["meta_fields"] = {}
        else:
+            doc_ids_on_page = [doc["id"] for doc in docs_list]
+            metadata_map = DocMetadataService.get_metadata_for_documents(doc_ids_on_page, kb_id) if doc_ids_on_page else {}
            for doc in docs_list:
                doc["meta_fields"] = metadata_map.get(doc["id"], {})
        return docs_list, count