Fix document metadata loading for paged listings (#13515)

## Summary
- scope normal document-list metadata lookups to the current page's
document IDs
- keep the `return_empty_metadata=True` path dataset-wide because it
must know every document that already has metadata in order to exclude
those documents from the listing
- add unit tests for both paged listing paths and the unchanged
empty-metadata behavior

## Why
`DocumentService.get_list()` and the normal `get_by_kb_id()` path were
calling `DocMetadataService.get_metadata_for_documents(None, kb_id)`,
which loads metadata for the entire dataset on every page request.

That becomes especially problematic on large datasets. The metadata scan
path paginates through the full metadata index without an explicit sort,
while the ES helper only switches to `search_after` beyond `10000`
results when a sort is present, so an unsorted scan cannot reliably page
past that limit. In practice this can lead to unnecessary full-dataset
metadata work, slower document-list loading, and unreliable
`meta_fields` in list responses for large knowledge bases (KBs).

This change keeps the existing empty-metadata filter behavior intact,
but scopes normal list responses to metadata for the current page only.
This commit is contained in:
Josh
2026-03-11 06:42:16 +01:00
committed by GitHub
parent 507ba4ea20
commit 2d2d3cdbcf
2 changed files with 204 additions and 5 deletions

View File

@ -107,7 +107,8 @@ class DocumentService(CommonService):
docs = docs.paginate(page_number, items_per_page)
docs_list = list(docs.dicts())
metadata_map = DocMetadataService.get_metadata_for_documents(None, kb_id)
doc_ids_on_page = [doc["id"] for doc in docs_list]
metadata_map = DocMetadataService.get_metadata_for_documents(doc_ids_on_page, kb_id) if doc_ids_on_page else {}
for doc in docs_list:
doc["meta_fields"] = metadata_map.get(doc["id"], {})
return docs_list, count
@ -156,10 +157,12 @@ class DocumentService(CommonService):
if suffix:
docs = docs.where(cls.model.suffix.in_(suffix))
metadata_map = DocMetadataService.get_metadata_for_documents(None, kb_id)
doc_ids_with_metadata = set(metadata_map.keys())
if return_empty_metadata and doc_ids_with_metadata:
docs = docs.where(cls.model.id.not_in(doc_ids_with_metadata))
metadata_map = {}
if return_empty_metadata:
metadata_map = DocMetadataService.get_metadata_for_documents(None, kb_id)
doc_ids_with_metadata = set(metadata_map.keys())
if doc_ids_with_metadata:
docs = docs.where(cls.model.id.not_in(doc_ids_with_metadata))
count = docs.count()
if desc:
@ -175,6 +178,8 @@ class DocumentService(CommonService):
for doc in docs_list:
doc["meta_fields"] = {}
else:
doc_ids_on_page = [doc["id"] for doc in docs_list]
metadata_map = DocMetadataService.get_metadata_for_documents(doc_ids_on_page, kb_id) if doc_ids_on_page else {}
for doc in docs_list:
doc["meta_fields"] = metadata_map.get(doc["id"], {})
return docs_list, count