From 059f375d858ab0ed93d25703737dd4464571f003 Mon Sep 17 00:00:00 2001 From: Yongteng Lei Date: Thu, 25 Dec 2025 14:06:50 +0800 Subject: [PATCH] Feat: supports filter documents by empty metadata (#12180) ### What problem does this PR solve? Supports filtering documents by empty metadata. ### Type of change - [x] New Feature (non-breaking change which adds functionality) --- api/apps/document_app.py | 33 ++++++++++++++++--- api/db/services/document_service.py | 49 +++++++++++++++-------------- 2 files changed, 54 insertions(+), 28 deletions(-) diff --git a/api/apps/document_app.py b/api/apps/document_app.py index f724a26b5..067d10da2 100644 --- a/api/apps/document_app.py +++ b/api/apps/document_app.py @@ -234,6 +234,10 @@ async def list_docs(): req = await get_request_json() + return_empty_metadata = req.get("return_empty_metadata", False) + if isinstance(return_empty_metadata, str): + return_empty_metadata = return_empty_metadata.lower() == "true" + run_status = req.get("run_status", []) if run_status: invalid_status = {s for s in run_status if s not in VALID_TASK_STATUS} @@ -248,11 +252,18 @@ async def list_docs(): suffix = req.get("suffix", []) metadata_condition = req.get("metadata_condition", {}) or {} - if metadata_condition and not isinstance(metadata_condition, dict): - return get_data_error_result(message="metadata_condition must be an object.") metadata = req.get("metadata", {}) or {} - if metadata and not isinstance(metadata, dict): - return get_data_error_result(message="metadata must be an object.") + if isinstance(metadata, dict) and metadata.get("empty_metadata"): + return_empty_metadata = True + metadata = {k: v for k, v in metadata.items() if k != "empty_metadata"} + if return_empty_metadata: + metadata_condition = {} + metadata = {} + else: + if metadata_condition and not isinstance(metadata_condition, dict): + return get_data_error_result(message="metadata_condition must be an object.") + if metadata and not isinstance(metadata, dict): + return
get_data_error_result(message="metadata must be an object.") doc_ids_filter = None metas = None @@ -295,7 +306,19 @@ async def list_docs(): doc_ids_filter = list(doc_ids_filter) try: - docs, tol = DocumentService.get_by_kb_id(kb_id, page_number, items_per_page, orderby, desc, keywords, run_status, types, suffix, doc_ids_filter) + docs, tol = DocumentService.get_by_kb_id( + kb_id, + page_number, + items_per_page, + orderby, + desc, + keywords, + run_status, + types, + suffix, + doc_ids_filter, + return_empty_metadata=return_empty_metadata, + ) if create_time_from or create_time_to: filtered_docs = [] diff --git a/api/db/services/document_service.py b/api/db/services/document_service.py index 8a9d36361..c0c9181fd 100644 --- a/api/db/services/document_service.py +++ b/api/db/services/document_service.py @@ -125,26 +125,26 @@ class DocumentService(CommonService): @classmethod @DB.connection_context() - def get_by_kb_id(cls, kb_id, page_number, items_per_page, - orderby, desc, keywords, run_status, types, suffix, doc_ids=None): + def get_by_kb_id(cls, kb_id, page_number, items_per_page, orderby, desc, keywords, run_status, types, suffix, doc_ids=None, return_empty_metadata=False): fields = cls.get_cls_model_fields() if keywords: - docs = cls.model.select(*[*fields, UserCanvas.title.alias("pipeline_name"), User.nickname])\ - .join(File2Document, on=(File2Document.document_id == cls.model.id))\ - .join(File, on=(File.id == File2Document.file_id))\ - .join(UserCanvas, on=(cls.model.pipeline_id == UserCanvas.id), join_type=JOIN.LEFT_OUTER)\ - .join(User, on=(cls.model.created_by == User.id), join_type=JOIN.LEFT_OUTER)\ - .where( - (cls.model.kb_id == kb_id), - (fn.LOWER(cls.model.name).contains(keywords.lower())) - ) + docs = ( + cls.model.select(*[*fields, UserCanvas.title.alias("pipeline_name"), User.nickname]) + .join(File2Document, on=(File2Document.document_id == cls.model.id)) + .join(File, on=(File.id == File2Document.file_id)) + .join(UserCanvas, 
on=(cls.model.pipeline_id == UserCanvas.id), join_type=JOIN.LEFT_OUTER) + .join(User, on=(cls.model.created_by == User.id), join_type=JOIN.LEFT_OUTER) + .where((cls.model.kb_id == kb_id), (fn.LOWER(cls.model.name).contains(keywords.lower()))) + ) else: - docs = cls.model.select(*[*fields, UserCanvas.title.alias("pipeline_name"), User.nickname])\ - .join(File2Document, on=(File2Document.document_id == cls.model.id))\ - .join(UserCanvas, on=(cls.model.pipeline_id == UserCanvas.id), join_type=JOIN.LEFT_OUTER)\ - .join(File, on=(File.id == File2Document.file_id))\ - .join(User, on=(cls.model.created_by == User.id), join_type=JOIN.LEFT_OUTER)\ + docs = ( + cls.model.select(*[*fields, UserCanvas.title.alias("pipeline_name"), User.nickname]) + .join(File2Document, on=(File2Document.document_id == cls.model.id)) + .join(UserCanvas, on=(cls.model.pipeline_id == UserCanvas.id), join_type=JOIN.LEFT_OUTER) + .join(File, on=(File.id == File2Document.file_id)) + .join(User, on=(cls.model.created_by == User.id), join_type=JOIN.LEFT_OUTER) .where(cls.model.kb_id == kb_id) + ) if doc_ids: docs = docs.where(cls.model.id.in_(doc_ids)) @@ -154,6 +154,8 @@ class DocumentService(CommonService): docs = docs.where(cls.model.type.in_(types)) if suffix: docs = docs.where(cls.model.suffix.in_(suffix)) + if return_empty_metadata: + docs = docs.where(fn.COALESCE(fn.JSON_LENGTH(cls.model.meta_fields), 0) == 0) count = docs.count() if desc: @@ -161,7 +163,6 @@ class DocumentService(CommonService): else: docs = docs.order_by(cls.model.getter_by(orderby).asc()) - if page_number and items_per_page: docs = docs.paginate(page_number, items_per_page) @@ -217,18 +218,16 @@ class DocumentService(CommonService): suffix_counter = {} run_status_counter = {} metadata_counter = {} + empty_metadata_count = 0 for row in rows: suffix_counter[row.suffix] = suffix_counter.get(row.suffix, 0) + 1 run_status_counter[str(row.run)] = run_status_counter.get(str(row.run), 0) + 1 meta_fields = row.meta_fields or {} - if 
isinstance(meta_fields, str): - try: - meta_fields = json.loads(meta_fields) - except Exception: - meta_fields = {} - if not isinstance(meta_fields, dict): + if not meta_fields: + empty_metadata_count += 1 continue + has_valid_meta = False for key, value in meta_fields.items(): values = value if isinstance(value, list) else [value] for vv in values: @@ -240,7 +239,11 @@ class DocumentService(CommonService): if key not in metadata_counter: metadata_counter[key] = {} metadata_counter[key][sv] = metadata_counter[key].get(sv, 0) + 1 + has_valid_meta = True + if not has_valid_meta: + empty_metadata_count += 1 + metadata_counter["empty_metadata"] = {"true": empty_metadata_count} return { "suffix": suffix_counter, "run_status": run_status_counter,