mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-04-20 18:57:28 +08:00
Consolidateion metadata summary API (#14031)
### What problem does this PR solve? Consolidation WEB API & HTTP API for document metadata summary Before consolidation Web API: POST /api/v1/document/metadata/summary Http API - GET /v1/datasets/<dataset_id>/metadata/summary After consolidation, Restful API -- GET /v1/datasets/<dataset_id>/metadata/summary ### Type of change - [x] Refactoring
This commit is contained in:
@ -423,29 +423,6 @@ async def doc_infos():
|
||||
return get_json_result(data=docs_list)
|
||||
|
||||
|
||||
@manager.route("/metadata/summary", methods=["POST"]) # noqa: F821
|
||||
@login_required
|
||||
async def metadata_summary():
|
||||
req = await get_request_json()
|
||||
kb_id = req.get("kb_id")
|
||||
doc_ids = req.get("doc_ids")
|
||||
if not kb_id:
|
||||
return get_json_result(data=False, message='Lack of "KB ID"', code=RetCode.ARGUMENT_ERROR)
|
||||
|
||||
tenants = UserTenantService.query(user_id=current_user.id)
|
||||
for tenant in tenants:
|
||||
if KnowledgebaseService.query(tenant_id=tenant.tenant_id, id=kb_id):
|
||||
break
|
||||
else:
|
||||
return get_json_result(data=False, message="Only owner of dataset authorized for this operation.", code=RetCode.OPERATING_ERROR)
|
||||
|
||||
try:
|
||||
summary = DocMetadataService.get_metadata_summary(kb_id, doc_ids)
|
||||
return get_json_result(data={"summary": summary})
|
||||
except Exception as e:
|
||||
return server_error_response(e)
|
||||
|
||||
|
||||
@manager.route("/metadata/update", methods=["POST"]) # noqa: F821
|
||||
@login_required
|
||||
@validate_request("doc_ids")
|
||||
|
||||
@ -15,6 +15,7 @@
|
||||
#
|
||||
import logging
|
||||
|
||||
from quart import request
|
||||
from peewee import OperationalError
|
||||
from pydantic import ValidationError
|
||||
|
||||
@ -25,7 +26,8 @@ from api.db.services.document_service import DocumentService
|
||||
from api.db.services.knowledgebase_service import KnowledgebaseService
|
||||
from common.constants import RetCode
|
||||
from api.apps import login_required
|
||||
from api.utils.api_utils import get_error_data_result, get_result, add_tenant_id_to_kwargs, get_request_json
|
||||
from api.utils.api_utils import get_error_data_result, get_result, add_tenant_id_to_kwargs, get_request_json, \
|
||||
server_error_response
|
||||
from api.utils.validation_utils import (
|
||||
UpdateDocumentReq, format_validation_error_message,
|
||||
)
|
||||
@ -144,3 +146,40 @@ async def update_document(tenant_id, dataset_id, document_id):
|
||||
renamed_doc = rename_doc_key(doc)
|
||||
return get_result(data=renamed_doc)
|
||||
|
||||
|
||||
@manager.route("/datasets/<dataset_id>/metadata/summary", methods=["GET"]) # noqa: F821
|
||||
@login_required
|
||||
@add_tenant_id_to_kwargs
|
||||
async def metadata_summary(dataset_id, tenant_id):
|
||||
"""
|
||||
Get metadata summary for a dataset.
|
||||
---
|
||||
tags:
|
||||
- Documents
|
||||
security:
|
||||
- ApiKeyAuth: []
|
||||
parameters:
|
||||
- in: path
|
||||
name: dataset_id
|
||||
type: string
|
||||
required: true
|
||||
description: ID of the dataset.
|
||||
- in: query
|
||||
name: doc_ids
|
||||
type: string
|
||||
required: false
|
||||
description: Comma-separated document IDs to filter metadata.
|
||||
responses:
|
||||
200:
|
||||
description: Metadata summary retrieved successfully.
|
||||
"""
|
||||
if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id):
|
||||
return get_error_data_result(message=f"You don't own the dataset {dataset_id}. ")
|
||||
# Get doc_ids from query parameters (comma-separated string)
|
||||
doc_ids_param = request.args.get("doc_ids", "")
|
||||
doc_ids = doc_ids_param.split(",") if doc_ids_param else None
|
||||
try:
|
||||
summary = DocMetadataService.get_metadata_summary(dataset_id, doc_ids)
|
||||
return get_result(data={"summary": summary})
|
||||
except Exception as e:
|
||||
return server_error_response(e)
|
||||
|
||||
@ -451,19 +451,6 @@ def list_docs(dataset_id, tenant_id):
|
||||
return get_result(data={"total": total, "docs": output_docs})
|
||||
|
||||
|
||||
@manager.route("/datasets/<dataset_id>/metadata/summary", methods=["GET"]) # noqa: F821
|
||||
@token_required
|
||||
async def metadata_summary(dataset_id, tenant_id):
|
||||
if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id):
|
||||
return get_error_data_result(message=f"You don't own the dataset {dataset_id}. ")
|
||||
req = await get_request_json()
|
||||
try:
|
||||
summary = DocMetadataService.get_metadata_summary(dataset_id, req.get("doc_ids"))
|
||||
return get_result(data={"summary": summary})
|
||||
except Exception as e:
|
||||
return server_error_response(e)
|
||||
|
||||
|
||||
@manager.route("/datasets/<dataset_id>/metadata/update", methods=["POST"]) # noqa: F821
|
||||
@token_required
|
||||
async def metadata_batch_update(dataset_id, tenant_id):
|
||||
|
||||
@ -472,7 +472,7 @@ class TestDocRoutesUnit:
|
||||
assert res["code"] == 0
|
||||
assert res["data"]["docs"] == []
|
||||
|
||||
def test_metadata_summary_and_batch_update(self, monkeypatch):
|
||||
def test_metadata_batch_update(self, monkeypatch):
|
||||
module = _load_doc_module(monkeypatch)
|
||||
monkeypatch.setattr(module, "convert_conditions", lambda cond: cond)
|
||||
monkeypatch.setattr(module.KnowledgebaseService, "accessible", lambda **_kwargs: False)
|
||||
@ -480,22 +480,7 @@ class TestDocRoutesUnit:
|
||||
res = _run(module.metadata_batch_update.__wrapped__("ds-1", "tenant-1"))
|
||||
assert "don't own the dataset" in res["message"]
|
||||
|
||||
monkeypatch.setattr(module.KnowledgebaseService, "accessible", lambda **_kwargs: False)
|
||||
res = _run(module.metadata_summary.__wrapped__("ds-1", "tenant-1"))
|
||||
assert "don't own the dataset" in res["message"]
|
||||
|
||||
monkeypatch.setattr(module.KnowledgebaseService, "accessible", lambda **_kwargs: True)
|
||||
monkeypatch.setattr(module, "get_request_json", lambda: _AwaitableValue({"doc_ids": ["d1"]}))
|
||||
monkeypatch.setattr(module.DocMetadataService, "get_metadata_summary", lambda *_args, **_kwargs: {"k": 1})
|
||||
res = _run(module.metadata_summary.__wrapped__("ds-1", "tenant-1"))
|
||||
assert res["code"] == 0
|
||||
assert res["data"]["summary"] == {"k": 1}
|
||||
|
||||
monkeypatch.setattr(module.DocMetadataService, "get_metadata_summary", lambda *_args, **_kwargs: (_ for _ in ()).throw(RuntimeError("x")))
|
||||
monkeypatch.setattr(module, "server_error_response", lambda e: {"code": 500, "message": str(e)})
|
||||
res = _run(module.metadata_summary.__wrapped__("ds-1", "tenant-1"))
|
||||
assert res["code"] == 500
|
||||
|
||||
monkeypatch.setattr(module, "get_request_json", lambda: _AwaitableValue({"selector": [1]}))
|
||||
res = _run(module.metadata_batch_update.__wrapped__("ds-1", "tenant-1"))
|
||||
assert res["message"] == "selector must be an object."
|
||||
@ -576,6 +561,7 @@ class TestDocRoutesUnit:
|
||||
assert res["data"]["updated"] == 1
|
||||
assert res["data"]["matched_docs"] == 1
|
||||
|
||||
|
||||
def test_delete_branches(self, monkeypatch):
|
||||
module = _load_doc_module(monkeypatch)
|
||||
monkeypatch.setattr(module.KnowledgebaseService, "accessible", lambda **_kwargs: False)
|
||||
|
||||
@ -13,11 +13,8 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
# Although the docs group this under "chunk management," the backend aggregates
|
||||
# Document.meta_fields via document_service#get_metadata_summary and the test
|
||||
# uses update_document, so it belongs with file/document management tests.
|
||||
# import pytest
|
||||
#from common import metadata_summary, update_document
|
||||
import pytest
|
||||
from common import metadata_summary, update_document
|
||||
|
||||
|
||||
def _summary_to_counts(summary):
|
||||
@ -30,29 +27,63 @@ def _summary_to_counts(summary):
|
||||
|
||||
|
||||
class TestMetadataSummary:
|
||||
pass
|
||||
@pytest.mark.p2
|
||||
def test_metadata_summary_missing_kb_id(self, HttpApiAuth, add_document_func):
|
||||
"""
|
||||
Call with non-existent dataset
|
||||
:param HttpApiAuth:
|
||||
:param add_document_func:
|
||||
:return:
|
||||
"""
|
||||
res = metadata_summary(HttpApiAuth, "")
|
||||
assert res["code"] == 404, res
|
||||
assert res["message"] == "Not Found: /api/v1/datasets//metadata/summary", res
|
||||
|
||||
# Alteration of API
|
||||
# TODO
|
||||
#@pytest.mark.p2
|
||||
#def test_metadata_summary_counts(self, HttpApiAuth, add_documents_func):
|
||||
# dataset_id, document_ids = add_documents_func
|
||||
# payloads = [
|
||||
# {"tags": ["foo", "bar"], "author": "alice"},
|
||||
# {"tags": ["foo"], "author": "bob"},
|
||||
# {"tags": ["bar", "baz"], "author": None},
|
||||
# ]
|
||||
# for doc_id, meta_fields in zip(document_ids, payloads):
|
||||
# res = update_document(HttpApiAuth, dataset_id, doc_id, {"meta_fields": meta_fields})
|
||||
# assert res["code"] == 0, res
|
||||
@pytest.mark.p2
|
||||
def test_metadata_summary_invalid_kb_id(self, HttpApiAuth, add_document_func):
|
||||
"""Test metadata summary when user doesn't have access to the dataset."""
|
||||
kb_id, doc_id = add_document_func
|
||||
invalid_kb_id = "invalid_" + kb_id
|
||||
# Call with a dataset that the user doesn't have access to
|
||||
res = metadata_summary(HttpApiAuth, invalid_kb_id)
|
||||
assert res["code"] == 102, res
|
||||
assert res["message"] == f"You don't own the dataset {invalid_kb_id}. "
|
||||
|
||||
# res = metadata_summary(HttpApiAuth, dataset_id)
|
||||
# assert res["code"] == 0, res
|
||||
# summary = res["data"]["summary"]
|
||||
# counts = _summary_to_counts(summary)
|
||||
# assert counts["tags"]["foo"] == 2, counts
|
||||
# assert counts["tags"]["bar"] == 2, counts
|
||||
# assert counts["tags"]["baz"] == 1, counts
|
||||
# assert counts["author"]["alice"] == 1, counts
|
||||
# assert counts["author"]["bob"] == 1, counts
|
||||
# assert "None" not in counts["author"], counts
|
||||
@pytest.mark.p2
|
||||
def test_metadata_summary_success(self, HttpApiAuth, add_document_func):
|
||||
"""Test metadata summary success case"""
|
||||
kb_id, doc_id = add_document_func
|
||||
# Test successful case
|
||||
res = metadata_summary(HttpApiAuth, kb_id)
|
||||
assert res["code"] == 0, res
|
||||
assert "summary" in res["data"], res
|
||||
|
||||
@pytest.mark.p2
|
||||
def test_metadata_summary_counts(self, HttpApiAuth, add_documents_func):
|
||||
"""
|
||||
test normal cases
|
||||
:param HttpApiAuth:
|
||||
:param add_documents_func:
|
||||
:return:
|
||||
"""
|
||||
dataset_id, document_ids = add_documents_func
|
||||
payloads = [
|
||||
{"tags": ["foo", "bar"], "author": "alice"},
|
||||
{"tags": ["foo"], "author": "bob"},
|
||||
{"tags": ["bar", "baz"], "author": ""},
|
||||
]
|
||||
for doc_id, meta_fields in zip(document_ids, payloads):
|
||||
res = update_document(HttpApiAuth, dataset_id, doc_id, {"meta_fields": meta_fields})
|
||||
assert res["code"] == 0, res
|
||||
|
||||
res = metadata_summary(HttpApiAuth, dataset_id)
|
||||
assert res["code"] == 0, res
|
||||
|
||||
summary = res["data"]["summary"]
|
||||
counts = _summary_to_counts(summary)
|
||||
assert counts["tags"]["foo"] == 2, counts
|
||||
assert counts["tags"]["bar"] == 2, counts
|
||||
assert counts["tags"]["baz"] == 1, counts
|
||||
assert counts["author"]["alice"] == 1, counts
|
||||
assert counts["author"]["bob"] == 1, counts
|
||||
assert "None" not in counts["author"], counts
|
||||
|
||||
@ -319,48 +319,6 @@ class TestDocumentMetadataUnit:
|
||||
assert res["code"] == 0
|
||||
assert res["data"][0]["meta_fields"]["author"] == "alice"
|
||||
|
||||
def test_metadata_summary_missing_kb_id(self, document_app_module, monkeypatch):
|
||||
module = document_app_module
|
||||
|
||||
async def fake_request_json():
|
||||
return {"doc_ids": ["doc1"]}
|
||||
|
||||
monkeypatch.setattr(module, "get_request_json", fake_request_json)
|
||||
res = _run(module.metadata_summary())
|
||||
assert res["code"] == 101
|
||||
|
||||
def test_metadata_summary_unauthorized(self, document_app_module, monkeypatch):
|
||||
module = document_app_module
|
||||
monkeypatch.setattr(module.UserTenantService, "query", lambda **_kwargs: [SimpleNamespace(tenant_id="tenant1")])
|
||||
monkeypatch.setattr(module.KnowledgebaseService, "query", lambda **_kwargs: False)
|
||||
|
||||
async def fake_request_json():
|
||||
return {"kb_id": "kb1", "doc_ids": ["doc1"]}
|
||||
|
||||
monkeypatch.setattr(module, "get_request_json", fake_request_json)
|
||||
res = _run(module.metadata_summary())
|
||||
assert res["code"] == 103
|
||||
|
||||
def test_metadata_summary_success_and_exception(self, document_app_module, monkeypatch):
|
||||
module = document_app_module
|
||||
self._allow_kb(module, monkeypatch)
|
||||
monkeypatch.setattr(module.DocMetadataService, "get_metadata_summary", lambda *_args, **_kwargs: {"author": {"alice": 1}})
|
||||
|
||||
async def fake_request_json():
|
||||
return {"kb_id": "kb1", "doc_ids": ["doc1"]}
|
||||
|
||||
monkeypatch.setattr(module, "get_request_json", fake_request_json)
|
||||
res = _run(module.metadata_summary())
|
||||
assert res["code"] == 0
|
||||
assert "summary" in res["data"]
|
||||
|
||||
def raise_error(*_args, **_kwargs):
|
||||
raise RuntimeError("boom")
|
||||
|
||||
monkeypatch.setattr(module.DocMetadataService, "get_metadata_summary", raise_error)
|
||||
res = _run(module.metadata_summary())
|
||||
assert res["code"] == 100
|
||||
|
||||
def test_metadata_update_missing_kb_id(self, document_app_module, monkeypatch):
|
||||
module = document_app_module
|
||||
|
||||
|
||||
@ -263,7 +263,10 @@ export const getMetaDataService = ({
|
||||
}: {
|
||||
kb_id: string;
|
||||
doc_ids?: string[];
|
||||
}) => request.post(api.getMetaData, { data: { kb_id, doc_ids } });
|
||||
}) =>
|
||||
request.get(api.getMetaData(kb_id), {
|
||||
params: doc_ids?.length ? { doc_ids: doc_ids.join(',') } : undefined,
|
||||
});
|
||||
export const updateMetaData = ({
|
||||
kb_id,
|
||||
doc_ids,
|
||||
|
||||
@ -83,7 +83,8 @@ export default {
|
||||
unbindPipelineTask: ({ kb_id, type }: { kb_id: string; type: string }) =>
|
||||
`${webAPI}/kb/unbind_task?kb_id=${kb_id}&pipeline_task_type=${type}`,
|
||||
pipelineRerun: `${webAPI}/canvas/rerun`,
|
||||
getMetaData: `${webAPI}/document/metadata/summary`,
|
||||
getMetaData: (datasetId: string) =>
|
||||
`${restAPIv1}/datasets/${datasetId}/metadata/summary`,
|
||||
updateMetaData: `${webAPI}/document/metadata/update`,
|
||||
kbUpdateMetaData: `${webAPI}/kb/update_metadata_setting`,
|
||||
documentUpdateMetaData: `${webAPI}/document/update_metadata_setting`,
|
||||
|
||||
Reference in New Issue
Block a user