Consolidateion metadata summary API (#14031)

### What problem does this PR solve?

Consolidation WEB API & HTTP API for document metadata summary

Before consolidation
Web API: POST /api/v1/document/metadata/summary
Http API - GET /v1/datasets/<dataset_id>/metadata/summary

After consolidation, Restful API -- GET
/v1/datasets/<dataset_id>/metadata/summary

### Type of change

- [x] Refactoring
This commit is contained in:
Jack
2026-04-10 18:41:30 +08:00
committed by GitHub
parent 11c89d87da
commit 4046a4cfb6
8 changed files with 108 additions and 126 deletions

View File

@ -423,29 +423,6 @@ async def doc_infos():
return get_json_result(data=docs_list)
@manager.route("/metadata/summary", methods=["POST"]) # noqa: F821
@login_required
async def metadata_summary():
req = await get_request_json()
kb_id = req.get("kb_id")
doc_ids = req.get("doc_ids")
if not kb_id:
return get_json_result(data=False, message='Lack of "KB ID"', code=RetCode.ARGUMENT_ERROR)
tenants = UserTenantService.query(user_id=current_user.id)
for tenant in tenants:
if KnowledgebaseService.query(tenant_id=tenant.tenant_id, id=kb_id):
break
else:
return get_json_result(data=False, message="Only owner of dataset authorized for this operation.", code=RetCode.OPERATING_ERROR)
try:
summary = DocMetadataService.get_metadata_summary(kb_id, doc_ids)
return get_json_result(data={"summary": summary})
except Exception as e:
return server_error_response(e)
@manager.route("/metadata/update", methods=["POST"]) # noqa: F821
@login_required
@validate_request("doc_ids")

View File

@ -15,6 +15,7 @@
#
import logging
from quart import request
from peewee import OperationalError
from pydantic import ValidationError
@ -25,7 +26,8 @@ from api.db.services.document_service import DocumentService
from api.db.services.knowledgebase_service import KnowledgebaseService
from common.constants import RetCode
from api.apps import login_required
from api.utils.api_utils import get_error_data_result, get_result, add_tenant_id_to_kwargs, get_request_json
from api.utils.api_utils import get_error_data_result, get_result, add_tenant_id_to_kwargs, get_request_json, \
server_error_response
from api.utils.validation_utils import (
UpdateDocumentReq, format_validation_error_message,
)
@ -144,3 +146,40 @@ async def update_document(tenant_id, dataset_id, document_id):
renamed_doc = rename_doc_key(doc)
return get_result(data=renamed_doc)
@manager.route("/datasets/<dataset_id>/metadata/summary", methods=["GET"]) # noqa: F821
@login_required
@add_tenant_id_to_kwargs
async def metadata_summary(dataset_id, tenant_id):
"""
Get metadata summary for a dataset.
---
tags:
- Documents
security:
- ApiKeyAuth: []
parameters:
- in: path
name: dataset_id
type: string
required: true
description: ID of the dataset.
- in: query
name: doc_ids
type: string
required: false
description: Comma-separated document IDs to filter metadata.
responses:
200:
description: Metadata summary retrieved successfully.
"""
if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id):
return get_error_data_result(message=f"You don't own the dataset {dataset_id}. ")
# Get doc_ids from query parameters (comma-separated string)
doc_ids_param = request.args.get("doc_ids", "")
doc_ids = doc_ids_param.split(",") if doc_ids_param else None
try:
summary = DocMetadataService.get_metadata_summary(dataset_id, doc_ids)
return get_result(data={"summary": summary})
except Exception as e:
return server_error_response(e)

View File

@ -451,19 +451,6 @@ def list_docs(dataset_id, tenant_id):
return get_result(data={"total": total, "docs": output_docs})
@manager.route("/datasets/<dataset_id>/metadata/summary", methods=["GET"]) # noqa: F821
@token_required
async def metadata_summary(dataset_id, tenant_id):
if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id):
return get_error_data_result(message=f"You don't own the dataset {dataset_id}. ")
req = await get_request_json()
try:
summary = DocMetadataService.get_metadata_summary(dataset_id, req.get("doc_ids"))
return get_result(data={"summary": summary})
except Exception as e:
return server_error_response(e)
@manager.route("/datasets/<dataset_id>/metadata/update", methods=["POST"]) # noqa: F821
@token_required
async def metadata_batch_update(dataset_id, tenant_id):

View File

@ -472,7 +472,7 @@ class TestDocRoutesUnit:
assert res["code"] == 0
assert res["data"]["docs"] == []
def test_metadata_summary_and_batch_update(self, monkeypatch):
def test_metadata_batch_update(self, monkeypatch):
module = _load_doc_module(monkeypatch)
monkeypatch.setattr(module, "convert_conditions", lambda cond: cond)
monkeypatch.setattr(module.KnowledgebaseService, "accessible", lambda **_kwargs: False)
@ -480,22 +480,7 @@ class TestDocRoutesUnit:
res = _run(module.metadata_batch_update.__wrapped__("ds-1", "tenant-1"))
assert "don't own the dataset" in res["message"]
monkeypatch.setattr(module.KnowledgebaseService, "accessible", lambda **_kwargs: False)
res = _run(module.metadata_summary.__wrapped__("ds-1", "tenant-1"))
assert "don't own the dataset" in res["message"]
monkeypatch.setattr(module.KnowledgebaseService, "accessible", lambda **_kwargs: True)
monkeypatch.setattr(module, "get_request_json", lambda: _AwaitableValue({"doc_ids": ["d1"]}))
monkeypatch.setattr(module.DocMetadataService, "get_metadata_summary", lambda *_args, **_kwargs: {"k": 1})
res = _run(module.metadata_summary.__wrapped__("ds-1", "tenant-1"))
assert res["code"] == 0
assert res["data"]["summary"] == {"k": 1}
monkeypatch.setattr(module.DocMetadataService, "get_metadata_summary", lambda *_args, **_kwargs: (_ for _ in ()).throw(RuntimeError("x")))
monkeypatch.setattr(module, "server_error_response", lambda e: {"code": 500, "message": str(e)})
res = _run(module.metadata_summary.__wrapped__("ds-1", "tenant-1"))
assert res["code"] == 500
monkeypatch.setattr(module, "get_request_json", lambda: _AwaitableValue({"selector": [1]}))
res = _run(module.metadata_batch_update.__wrapped__("ds-1", "tenant-1"))
assert res["message"] == "selector must be an object."
@ -576,6 +561,7 @@ class TestDocRoutesUnit:
assert res["data"]["updated"] == 1
assert res["data"]["matched_docs"] == 1
def test_delete_branches(self, monkeypatch):
module = _load_doc_module(monkeypatch)
monkeypatch.setattr(module.KnowledgebaseService, "accessible", lambda **_kwargs: False)

View File

@ -13,11 +13,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Although the docs group this under "chunk management," the backend aggregates
# Document.meta_fields via document_service#get_metadata_summary and the test
# uses update_document, so it belongs with file/document management tests.
# import pytest
#from common import metadata_summary, update_document
import pytest
from common import metadata_summary, update_document
def _summary_to_counts(summary):
@ -30,29 +27,63 @@ def _summary_to_counts(summary):
class TestMetadataSummary:
pass
@pytest.mark.p2
def test_metadata_summary_missing_kb_id(self, HttpApiAuth, add_document_func):
"""
Call with non-existent dataset
:param HttpApiAuth:
:param add_document_func:
:return:
"""
res = metadata_summary(HttpApiAuth, "")
assert res["code"] == 404, res
assert res["message"] == "Not Found: /api/v1/datasets//metadata/summary", res
# Alteration of API
# TODO
#@pytest.mark.p2
#def test_metadata_summary_counts(self, HttpApiAuth, add_documents_func):
# dataset_id, document_ids = add_documents_func
# payloads = [
# {"tags": ["foo", "bar"], "author": "alice"},
# {"tags": ["foo"], "author": "bob"},
# {"tags": ["bar", "baz"], "author": None},
# ]
# for doc_id, meta_fields in zip(document_ids, payloads):
# res = update_document(HttpApiAuth, dataset_id, doc_id, {"meta_fields": meta_fields})
# assert res["code"] == 0, res
@pytest.mark.p2
def test_metadata_summary_invalid_kb_id(self, HttpApiAuth, add_document_func):
"""Test metadata summary when user doesn't have access to the dataset."""
kb_id, doc_id = add_document_func
invalid_kb_id = "invalid_" + kb_id
# Call with a dataset that the user doesn't have access to
res = metadata_summary(HttpApiAuth, invalid_kb_id)
assert res["code"] == 102, res
assert res["message"] == f"You don't own the dataset {invalid_kb_id}. "
# res = metadata_summary(HttpApiAuth, dataset_id)
# assert res["code"] == 0, res
# summary = res["data"]["summary"]
# counts = _summary_to_counts(summary)
# assert counts["tags"]["foo"] == 2, counts
# assert counts["tags"]["bar"] == 2, counts
# assert counts["tags"]["baz"] == 1, counts
# assert counts["author"]["alice"] == 1, counts
# assert counts["author"]["bob"] == 1, counts
# assert "None" not in counts["author"], counts
@pytest.mark.p2
def test_metadata_summary_success(self, HttpApiAuth, add_document_func):
"""Test metadata summary success case"""
kb_id, doc_id = add_document_func
# Test successful case
res = metadata_summary(HttpApiAuth, kb_id)
assert res["code"] == 0, res
assert "summary" in res["data"], res
@pytest.mark.p2
def test_metadata_summary_counts(self, HttpApiAuth, add_documents_func):
"""
test normal cases
:param HttpApiAuth:
:param add_documents_func:
:return:
"""
dataset_id, document_ids = add_documents_func
payloads = [
{"tags": ["foo", "bar"], "author": "alice"},
{"tags": ["foo"], "author": "bob"},
{"tags": ["bar", "baz"], "author": ""},
]
for doc_id, meta_fields in zip(document_ids, payloads):
res = update_document(HttpApiAuth, dataset_id, doc_id, {"meta_fields": meta_fields})
assert res["code"] == 0, res
res = metadata_summary(HttpApiAuth, dataset_id)
assert res["code"] == 0, res
summary = res["data"]["summary"]
counts = _summary_to_counts(summary)
assert counts["tags"]["foo"] == 2, counts
assert counts["tags"]["bar"] == 2, counts
assert counts["tags"]["baz"] == 1, counts
assert counts["author"]["alice"] == 1, counts
assert counts["author"]["bob"] == 1, counts
assert "None" not in counts["author"], counts

View File

@ -319,48 +319,6 @@ class TestDocumentMetadataUnit:
assert res["code"] == 0
assert res["data"][0]["meta_fields"]["author"] == "alice"
def test_metadata_summary_missing_kb_id(self, document_app_module, monkeypatch):
module = document_app_module
async def fake_request_json():
return {"doc_ids": ["doc1"]}
monkeypatch.setattr(module, "get_request_json", fake_request_json)
res = _run(module.metadata_summary())
assert res["code"] == 101
def test_metadata_summary_unauthorized(self, document_app_module, monkeypatch):
module = document_app_module
monkeypatch.setattr(module.UserTenantService, "query", lambda **_kwargs: [SimpleNamespace(tenant_id="tenant1")])
monkeypatch.setattr(module.KnowledgebaseService, "query", lambda **_kwargs: False)
async def fake_request_json():
return {"kb_id": "kb1", "doc_ids": ["doc1"]}
monkeypatch.setattr(module, "get_request_json", fake_request_json)
res = _run(module.metadata_summary())
assert res["code"] == 103
def test_metadata_summary_success_and_exception(self, document_app_module, monkeypatch):
module = document_app_module
self._allow_kb(module, monkeypatch)
monkeypatch.setattr(module.DocMetadataService, "get_metadata_summary", lambda *_args, **_kwargs: {"author": {"alice": 1}})
async def fake_request_json():
return {"kb_id": "kb1", "doc_ids": ["doc1"]}
monkeypatch.setattr(module, "get_request_json", fake_request_json)
res = _run(module.metadata_summary())
assert res["code"] == 0
assert "summary" in res["data"]
def raise_error(*_args, **_kwargs):
raise RuntimeError("boom")
monkeypatch.setattr(module.DocMetadataService, "get_metadata_summary", raise_error)
res = _run(module.metadata_summary())
assert res["code"] == 100
def test_metadata_update_missing_kb_id(self, document_app_module, monkeypatch):
module = document_app_module

View File

@ -263,7 +263,10 @@ export const getMetaDataService = ({
}: {
kb_id: string;
doc_ids?: string[];
}) => request.post(api.getMetaData, { data: { kb_id, doc_ids } });
}) =>
request.get(api.getMetaData(kb_id), {
params: doc_ids?.length ? { doc_ids: doc_ids.join(',') } : undefined,
});
export const updateMetaData = ({
kb_id,
doc_ids,

View File

@ -83,7 +83,8 @@ export default {
unbindPipelineTask: ({ kb_id, type }: { kb_id: string; type: string }) =>
`${webAPI}/kb/unbind_task?kb_id=${kb_id}&pipeline_task_type=${type}`,
pipelineRerun: `${webAPI}/canvas/rerun`,
getMetaData: `${webAPI}/document/metadata/summary`,
getMetaData: (datasetId: string) =>
`${restAPIv1}/datasets/${datasetId}/metadata/summary`,
updateMetaData: `${webAPI}/document/metadata/update`,
kbUpdateMetaData: `${webAPI}/kb/update_metadata_setting`,
documentUpdateMetaData: `${webAPI}/document/update_metadata_setting`,