# # Copyright 2026 The InfiniFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # import logging import json import os import re from pathlib import Path from quart import request, make_response from peewee import OperationalError from pydantic import ValidationError from api.apps import current_user, login_required from api.constants import FILE_NAME_LEN_LIMIT, IMG_BASE64_PREFIX from api.apps.services.document_api_service import validate_document_update_fields, map_doc_keys, \ map_doc_keys_with_run_status, update_document_name_only, update_chunk_method, update_document_status_only, \ reset_document_for_reparse from api.db import VALID_FILE_TYPES, FileType from api.db.services import duplicate_name from api.db.services.doc_metadata_service import DocMetadataService from api.db.db_models import Task from api.db.services.document_service import DocumentService from api.db.services.file2document_service import File2DocumentService from api.db.services.file_service import FileService from api.db.services.knowledgebase_service import KnowledgebaseService from api.common.check_team_permission import check_kb_team_permission from api.db.services.task_service import TaskService, cancel_all_task_of from api.utils.api_utils import get_data_error_result, get_error_data_result, get_result, get_json_result, \ server_error_response, add_tenant_id_to_kwargs, get_request_json, get_error_argument_result, check_duplicate_ids from api.utils.validation_utils import ( UpdateDocumentReq, format_validation_error_message, validate_and_parse_json_request, DeleteDocumentReq, ) from common import settings from common.constants import ParserType, RetCode, TaskStatus, SANDBOX_ARTIFACT_BUCKET from common.metadata_utils import convert_conditions, meta_filter, turn2jsonschema from common.misc_utils import get_uuid, thread_pool_exec from api.utils.file_utils import filename_type, thumbnail from api.utils.web_utils import CONTENT_TYPE_MAP, html2pdf, is_valid_url, apply_safe_file_response_headers from common.ssrf_guard import assert_url_is_safe from rag.nlp import search @manager.route("/documents/upload", methods=["POST"]) # noqa: F821 @login_required @add_tenant_id_to_kwargs async def upload_info(tenant_id: str): """ Upload a document and get its parsed info. --- tags: - Documents security: - ApiKeyAuth: [] parameters: - in: header name: Authorization type: string required: true description: Bearer token for authentication. - in: formData name: file type: file required: false description: File to upload. - in: query name: url type: string required: false description: URL to fetch file from. responses: 200: description: Successful operation. """ files = await request.files file_objs = files.getlist("file") if files and files.get("file") else [] url = request.args.get("url") if file_objs and url: return get_error_argument_result("Provide either multipart file(s) or ?url=..., not both.") if not file_objs and not url: return get_error_argument_result("Missing input: provide multipart file(s) or url") try: if url and not file_objs: try: assert_url_is_safe(url) except ValueError as ve: logging.warning("upload_info: rejected unsafe url: %s", ve) return get_error_argument_result(str(ve)) data = await thread_pool_exec(FileService.upload_info, tenant_id, None, url) return get_result(data=data) if len(file_objs) == 1: data = await thread_pool_exec(FileService.upload_info, tenant_id, file_objs[0], None) return get_result(data=data) results = [await thread_pool_exec(FileService.upload_info, tenant_id, f, None) for f in file_objs] return get_result(data=results) except Exception as e: logging.exception("upload_info failed") return server_error_response(e) @manager.route("/datasets//documents/", methods=["PATCH"]) # noqa: F821 @login_required @add_tenant_id_to_kwargs async def update_document(tenant_id, dataset_id, document_id): """ Update a document within a dataset. --- tags: - Documents security: - ApiKeyAuth: [] parameters: - in: path name: dataset_id type: string required: true description: ID of the dataset. - in: path name: document_id type: string required: true description: ID of the document to update. - in: header name: Authorization type: string required: true description: Bearer token for authentication. - in: body name: body description: Document update parameters. required: true schema: type: object properties: name: type: string description: New name of the document. parser_config: type: object description: Parser configuration. chunk_method: type: string description: Chunking method. enabled: type: boolean description: Document status. responses: 200: description: Document updated successfully. schema: type: object """ req = await get_request_json() # Verify ownership and existence of dataset and document if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id): return get_error_data_result(message="You don't own the dataset.") e, kb = KnowledgebaseService.get_by_id(dataset_id) if not e: return get_error_data_result(message="Can't find this dataset!") # Prepare data for validation docs = DocumentService.query(kb_id=dataset_id, id=document_id) if not docs: return get_error_data_result(message="The dataset doesn't own the document.") # Validate document update request parameters try: update_doc_req = UpdateDocumentReq(**req) except ValidationError as e: return get_error_data_result(message=format_validation_error_message(e), code=RetCode.DATA_ERROR) doc = docs[0] # further check with inner status (from DB) error_msg, error_code = validate_document_update_fields(update_doc_req, doc, req) if error_msg: return get_error_data_result(message=error_msg, code=error_code) # All validations passed, now perform all updates # meta_fields provided, then update it if "meta_fields" in req: if not DocMetadataService.update_document_metadata(document_id, update_doc_req.meta_fields): return get_error_data_result(message="Failed to update metadata") # doc name provided from request and diff with existing value, update if "name" in req and req["name"] != doc.name: if error := update_document_name_only(document_id, req["name"]): return error # "parser_id" provided but does not match with existing doc's file type if "parser_id" in req and ((doc.type == FileType.VISUAL and req["parser_id"] != "picture") or (re.search(r"\.(ppt|pptx|pages)$", doc.name) and req["parser_id"] != "presentation")): return get_data_error_result(message="Not supported yet!") # parser config provided (already validated in UpdateDocumentReq), update it if update_doc_req.parser_config: req["parser_config"].update(update_doc_req.parser_config.ext) DocumentService.update_parser_config(doc.id, req["parser_config"]) # pipeline_id provided - reset document for reparse if update_doc_req.pipeline_id: if error := reset_document_for_reparse(doc, tenant_id, pipeline_id=update_doc_req.pipeline_id): return error # chunk method provided - the update method will check if it's different with existing one elif update_doc_req.chunk_method: if error := update_chunk_method(req, doc, tenant_id): return error if "enabled" in req: # already checked in UpdateDocumentReq - it's int if present # "enabled" flag provided, the update method will check if it's changed and then update if so if error := update_document_status_only(int(req["enabled"]), doc, kb): return error try: original_doc_id = doc.id ok, doc = DocumentService.get_by_id(doc.id) if not ok: return get_error_data_result(message=f"Can not get document by id:{original_doc_id}") except OperationalError as e: logging.exception(e) return get_error_data_result(message="Database operation failed") renamed_doc = map_doc_keys(doc) return get_result(data=renamed_doc) @manager.route("/datasets//metadata/summary", methods=["GET"]) # noqa: F821 @login_required @add_tenant_id_to_kwargs async def metadata_summary(dataset_id, tenant_id): """ Get metadata summary for a dataset. --- tags: - Documents security: - ApiKeyAuth: [] parameters: - in: path name: dataset_id type: string required: true description: ID of the dataset. - in: query name: doc_ids type: string required: false description: Comma-separated document IDs to filter metadata. responses: 200: description: Metadata summary retrieved successfully. """ if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id): return get_error_data_result(message=f"You don't own the dataset {dataset_id}. ") # Get doc_ids from query parameters (comma-separated string) doc_ids_param = request.args.get("doc_ids", "") doc_ids = doc_ids_param.split(",") if doc_ids_param else None try: summary = DocMetadataService.get_metadata_summary(dataset_id, doc_ids) return get_result(data={"summary": summary}) except Exception as e: return server_error_response(e) @manager.route("/datasets//metadata/update", methods=["POST"]) # noqa: F821 @login_required @add_tenant_id_to_kwargs async def metadata_batch_update(dataset_id, tenant_id): """ Batch update metadata for documents in a dataset. --- tags: - Documents security: - ApiKeyAuth: [] parameters: - in: path name: dataset_id type: string required: true description: ID of the dataset. requestBody: required: true content: application/json: schema: type: object properties: selector: type: object updates: type: array deletes: type: array responses: 200: description: Metadata updated successfully. """ if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id): return get_error_data_result(message=f"You don't own the dataset {dataset_id}. ") req = await get_request_json() selector = req.get("selector", {}) or {} updates = req.get("updates", []) or [] deletes = req.get("deletes", []) or [] if not isinstance(selector, dict): return get_error_data_result(message="selector must be an object.") if not isinstance(updates, list) or not isinstance(deletes, list): return get_error_data_result(message="updates and deletes must be lists.") metadata_condition = selector.get("metadata_condition", {}) or {} if metadata_condition and not isinstance(metadata_condition, dict): return get_error_data_result(message="metadata_condition must be an object.") document_ids = selector.get("document_ids", []) or [] if document_ids and not isinstance(document_ids, list): return get_error_data_result(message="document_ids must be a list.") for upd in updates: if not isinstance(upd, dict) or not upd.get("key") or "value" not in upd: return get_error_data_result(message="Each update requires key and value.") for d in deletes: if not isinstance(d, dict) or not d.get("key"): return get_error_data_result(message="Each delete requires key.") target_doc_ids = set() if document_ids: kb_doc_ids = KnowledgebaseService.list_documents_by_ids([dataset_id]) invalid_ids = set(document_ids) - set(kb_doc_ids) if invalid_ids: return get_error_data_result(message=f"These documents do not belong to dataset {dataset_id}: {', '.join(invalid_ids)}") target_doc_ids = set(document_ids) if metadata_condition: metas = DocMetadataService.get_flatted_meta_by_kbs([dataset_id]) filtered_ids = set(meta_filter(metas, convert_conditions(metadata_condition), metadata_condition.get("logic", "and"))) target_doc_ids = target_doc_ids & filtered_ids if metadata_condition.get("conditions") and not target_doc_ids: return get_result(data={"updated": 0, "matched_docs": 0}) target_doc_ids = list(target_doc_ids) updated = DocMetadataService.batch_update_metadata(dataset_id, target_doc_ids, updates, deletes) return get_result(data={"updated": updated, "matched_docs": len(target_doc_ids)}) @manager.route("/datasets//documents", methods=["POST"]) # noqa: F821 @login_required @add_tenant_id_to_kwargs async def upload_document(dataset_id, tenant_id): """ Upload documents to a dataset. --- tags: - Documents security: - ApiKeyAuth: [] parameters: - in: path name: dataset_id type: string required: true description: ID of the dataset. - in: header name: Authorization type: string required: true description: Bearer token for authentication. - in: formData name: file type: file required: true description: Document files to upload. - in: formData name: parent_path type: string description: Optional nested path under the parent folder. Uses '/' separators. - in: query name: return_raw_files type: boolean required: false default: false description: Whether to skip document key mapping and return raw document data responses: 200: description: Successfully uploaded documents. schema: type: object properties: data: type: array items: type: object properties: id: type: string description: Document ID. name: type: string description: Document name. chunk_count: type: integer description: Number of chunks. token_count: type: integer description: Number of tokens. dataset_id: type: string description: ID of the dataset. chunk_method: type: string description: Chunking method used. run: type: string description: Processing status. """ upload_type = (request.args.get("type") or "local").lower() e, kb = KnowledgebaseService.get_by_id(dataset_id) if not e: logging.error(f"Can't find the dataset with ID {dataset_id}!") return get_error_data_result(message=f"Can't find the dataset with ID {dataset_id}!", code=RetCode.DATA_ERROR) if not check_kb_team_permission(kb, tenant_id): logging.error("No authorization.") return get_error_data_result(message="No authorization.", code=RetCode.AUTHENTICATION_ERROR) if upload_type == "web": return await _upload_web_document(dataset_id, kb, tenant_id) if upload_type == "empty": return await _upload_empty_document(dataset_id, kb, tenant_id) if upload_type != "local": return get_error_data_result( message='`type` must be one of "local", "web", or "empty".', code=RetCode.ARGUMENT_ERROR, ) return await _upload_local_documents(kb, tenant_id) async def _upload_web_document(dataset_id, kb, tenant_id): form = await request.form name = (form.get("name") or "").strip() url = form.get("url") if not name: return get_error_data_result(message='Lack of "name"', code=RetCode.ARGUMENT_ERROR) if not url: return get_error_data_result(message='Lack of "url"', code=RetCode.ARGUMENT_ERROR) if len(name.encode("utf-8")) > FILE_NAME_LEN_LIMIT: return get_error_data_result( message=f"File name must be {FILE_NAME_LEN_LIMIT} bytes or less.", code=RetCode.ARGUMENT_ERROR, ) if not is_valid_url(url): return get_error_data_result(message="The URL format is invalid", code=RetCode.ARGUMENT_ERROR) blob = html2pdf(url) if not blob: return server_error_response(ValueError("Download failure.")) root_folder = FileService.get_root_folder(tenant_id) FileService.init_knowledgebase_docs(root_folder["id"], tenant_id) kb_root_folder = FileService.get_kb_folder(tenant_id) kb_folder = FileService.new_a_file_from_kb(kb.tenant_id, kb.name, kb_root_folder["id"]) try: filename = duplicate_name(DocumentService.query, name=f"{name}.pdf", kb_id=kb.id) filetype = filename_type(filename) if filetype == FileType.OTHER.value: raise RuntimeError("This type of file has not been supported yet!") location = filename while settings.STORAGE_IMPL.obj_exist(dataset_id, location): location += "_" settings.STORAGE_IMPL.put(dataset_id, location, blob) doc = { "id": get_uuid(), "kb_id": kb.id, "parser_id": kb.parser_id, "pipeline_id": kb.pipeline_id, "parser_config": kb.parser_config, "created_by": tenant_id, "type": filetype, "name": filename, "location": location, "size": len(blob), "thumbnail": thumbnail(filename, blob), "suffix": Path(filename).suffix.lstrip("."), } if doc["type"] == FileType.VISUAL: doc["parser_id"] = ParserType.PICTURE.value if doc["type"] == FileType.AURAL: doc["parser_id"] = ParserType.AUDIO.value if re.search(r"\.(ppt|pptx|pages)$", filename): doc["parser_id"] = ParserType.PRESENTATION.value if re.search(r"\.(eml)$", filename): doc["parser_id"] = ParserType.EMAIL.value DocumentService.insert(doc) FileService.add_file_from_kb(doc, kb_folder["id"], kb.tenant_id) return get_result(data=map_doc_keys_with_run_status(doc, run_status="0")) except Exception as e: return server_error_response(e) async def _upload_empty_document(dataset_id, kb, tenant_id): req = await get_request_json() name = (req.get("name") or "").strip() if not name: return get_error_data_result(message="File name can't be empty.", code=RetCode.ARGUMENT_ERROR) if len(name.encode("utf-8")) > FILE_NAME_LEN_LIMIT: return get_error_data_result( message=f"File name must be {FILE_NAME_LEN_LIMIT} bytes or less.", code=RetCode.ARGUMENT_ERROR, ) if DocumentService.query(name=name, kb_id=dataset_id): return get_error_data_result(message="Duplicated document name in the same dataset.") try: kb_root_folder = FileService.get_kb_folder(kb.tenant_id) if not kb_root_folder: return get_error_data_result(message="Cannot find the root folder.") kb_folder = FileService.new_a_file_from_kb(kb.tenant_id, kb.name, kb_root_folder["id"]) if not kb_folder: return get_error_data_result(message="Cannot find the kb folder for this file.") doc = DocumentService.insert( { "id": get_uuid(), "kb_id": kb.id, "parser_id": kb.parser_id, "pipeline_id": kb.pipeline_id, "parser_config": kb.parser_config, "created_by": tenant_id, "type": FileType.VIRTUAL, "name": name, "suffix": Path(name).suffix.lstrip("."), "location": "", "size": 0, } ) FileService.add_file_from_kb(doc.to_dict(), kb_folder["id"], kb.tenant_id) return get_result(data=map_doc_keys(doc)) except Exception as e: return server_error_response(e) async def _upload_local_documents(kb, tenant_id): form = await request.form files = await request.files if "file" not in files: logging.error("No file part!") return get_error_data_result(message="No file part!", code=RetCode.ARGUMENT_ERROR) file_objs = files.getlist("file") for file_obj in file_objs: if file_obj is None or file_obj.filename is None or file_obj.filename == "": logging.error("No file selected!") return get_error_data_result(message="No file selected!", code=RetCode.ARGUMENT_ERROR) if len(file_obj.filename.encode("utf-8")) > FILE_NAME_LEN_LIMIT: msg = f"File name must be {FILE_NAME_LEN_LIMIT} bytes or less." logging.error(msg) return get_error_data_result(message=msg, code=RetCode.ARGUMENT_ERROR) err, files = await thread_pool_exec( FileService.upload_document, kb, file_objs, tenant_id, parent_path=form.get("parent_path") ) if err: msg = "\n".join(err) logging.error(msg) return get_error_data_result(message=msg, code=RetCode.SERVER_ERROR) if not files: msg = "There seems to be an issue with your file format. please verify it is correct and not corrupted." logging.error(msg) return get_error_data_result(message=msg, code=RetCode.DATA_ERROR) files = [f[0] for f in files] # remove the blob return_raw_files = request.args.get("return_raw_files", "false").lower() == "true" if return_raw_files: return get_result(data=files) renamed_doc_list = [map_doc_keys_with_run_status(doc, run_status="0") for doc in files] return get_result(data=renamed_doc_list) @manager.route("/datasets//documents", methods=["GET"]) # noqa: F821 @login_required @add_tenant_id_to_kwargs def list_docs(dataset_id, tenant_id): """ List documents in a dataset. --- tags: - Documents security: - ApiKeyAuth: [] parameters: - in: path name: dataset_id type: string required: true description: ID of the dataset. - in: query name: page type: integer required: false default: 1 description: Page number. - in: query name: page_size type: integer required: false default: 30 description: Number of items per page. - in: query name: orderby type: string required: false default: "create_time" description: Field to order by. - in: query name: desc type: boolean required: false default: true description: Order in descending. - in: query name: create_time_from type: integer required: false default: 0 description: Unix timestamp for filtering documents created after this time. 0 means no filter. - in: query name: create_time_to type: integer required: false default: 0 description: Unix timestamp for filtering documents created before this time. 0 means no filter. - in: query name: suffix type: array items: type: string required: false description: Filter by file suffix (e.g., ["pdf", "txt", "docx"]). - in: query name: run type: array items: type: string required: false description: Filter by document run status. Supports both numeric ("0", "1", "2", "3", "4") and text formats ("UNSTART", "RUNNING", "CANCEL", "DONE", "FAIL"). - in: header name: Authorization type: string required: true description: Bearer token for authentication. responses: 200: description: List of documents. schema: type: object properties: total: type: integer description: Total number of documents. docs: type: array items: type: object properties: id: type: string description: Document ID. name: type: string description: Document name. chunk_count: type: integer description: Number of chunks. token_count: type: integer description: Number of tokens. dataset_id: type: string description: ID of the dataset. chunk_method: type: string description: Chunking method used. run: type: string description: Processing status. """ if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id): logging.error(f"You don't own the dataset {dataset_id}. ") return get_error_data_result(message=f"You don't own the dataset {dataset_id}. ") if request.args.get("type") == "filter": err_code, err_msg, payload, total = _get_doc_filters_with_request(request, dataset_id) if err_code != RetCode.SUCCESS: return get_data_error_result(code=err_code, message=err_msg) return get_json_result(data={"total": total, "filter": payload}) err_code, err_msg, payload, total = _get_docs_with_request(request, dataset_id) if err_code != RetCode.SUCCESS: return get_data_error_result(code=err_code, message=err_msg) renamed_doc_list = [map_doc_keys(doc) for doc in payload] for doc_item in renamed_doc_list: if doc_item["thumbnail"] and not doc_item["thumbnail"].startswith(IMG_BASE64_PREFIX): doc_item["thumbnail"] = f"/api/v1/documents/images/{dataset_id}-{doc_item['thumbnail']}" if doc_item.get("source_type"): doc_item["source_type"] = doc_item["source_type"].split("/")[0] if doc_item["parser_config"].get("metadata"): doc_item["parser_config"]["metadata"] = turn2jsonschema(doc_item["parser_config"]["metadata"]) return get_json_result(data={"total": total, "docs": renamed_doc_list}) def _get_docs_with_request(req, dataset_id:str): """Get documents with request parameters from a dataset. This function extracts filtering parameters from the request and returns a list of documents matching the specified criteria. Args: req: The request object containing query parameters. - page (int): Page number for pagination (default: 1). - page_size (int): Number of documents per page (default: 30). - orderby (str): Field to order by (default: "create_time"). - desc (bool): Whether to order in descending order (default: True). - keywords (str): Keywords to search in document names. - suffix (list): File suffix filters. - types (list): Document type filters. - run (list): Processing status filters. - create_time_from (int): Start timestamp for time range filter. - create_time_to (int): End timestamp for time range filter. - return_empty_metadata (bool|str): Whether to return documents with empty metadata. - metadata_condition (str): JSON string for complex metadata conditions. - metadata (str): JSON string for simple metadata key-value matching. dataset_id: The dataset ID to retrieve documents from. Returns: A tuple of (err_code, err_message, docs, total): - err_code (int): Success code (RetCode.SUCCESS) if successful, or error code if validation fails. - err_message (str): Empty string if successful, or error message if validation fails. - docs (list): List of document dictionaries matching the criteria, or empty list on error. - total (int): Total number of documents matching the criteria. Note: - The function supports filtering by document types, processing status, keywords, and time range. - Metadata filtering supports both simple key-value matching and complex conditions with operators. """ q = req.args page = int(q.get("page", 1)) page_size = int(q.get("page_size", 30)) orderby = q.get("orderby", "create_time") desc = str(q.get("desc", "true")).strip().lower() != "false" keywords = q.get("keywords", "") # filters - align with OpenAPI parameter names suffix = q.getlist("suffix") types = q.getlist("types") if types: invalid_types = {t for t in types if t not in VALID_FILE_TYPES} if invalid_types: msg = f"Invalid filter conditions: {', '.join(invalid_types)} type{'s' if len(invalid_types) > 1 else ''}" return RetCode.DATA_ERROR, msg, [], 0 # map run status (text or numeric) - align with API parameter run_status = q.getlist("run") run_status_text_to_numeric = {"UNSTART": "0", "RUNNING": "1", "CANCEL": "2", "DONE": "3", "FAIL": "4"} run_status_converted = [run_status_text_to_numeric.get(v, v) for v in run_status] if run_status_converted: invalid_status = {s for s in run_status_converted if s not in run_status_text_to_numeric.values()} if invalid_status: msg = f"Invalid filter run status conditions: {', '.join(invalid_status)}" return RetCode.DATA_ERROR, msg, [], 0 err_code, err_message, doc_ids_filter, return_empty_metadata = _parse_doc_id_filter_with_metadata(q, dataset_id) if err_code != RetCode.SUCCESS: return err_code, err_message, [], 0 doc_name = q.get("name") doc_id = q.get("id") if doc_id: if not DocumentService.query(id=doc_id, kb_id=dataset_id): return RetCode.DATA_ERROR, f"You don't own the document {doc_id}.", [], 0 doc_ids_filter = [doc_id] # id provided, ignore other filters if doc_name and not DocumentService.query(name=doc_name, kb_id=dataset_id): return RetCode.DATA_ERROR, f"You don't own the document {doc_name}.", [], 0 doc_ids = q.getlist("ids") if doc_id and len(doc_ids) > 0: return RetCode.DATA_ERROR, f"Should not provide both 'id':{doc_id} and 'ids'{doc_ids}" if len(doc_ids) > 0: doc_ids_filter = doc_ids docs, total = DocumentService.get_by_kb_id(dataset_id, page, page_size, orderby, desc, keywords, run_status_converted, types, suffix, name=doc_name, doc_ids=doc_ids_filter, return_empty_metadata=return_empty_metadata) # time range filter (0 means no bound) create_time_from = int(q.get("create_time_from", 0)) create_time_to = int(q.get("create_time_to", 0)) if create_time_from or create_time_to: docs = [d for d in docs if (create_time_from == 0 or d.get("create_time", 0) >= create_time_from) and (create_time_to == 0 or d.get("create_time", 0) <= create_time_to)] return RetCode.SUCCESS, "", docs, total def _get_doc_filters_with_request(req, dataset_id: str): """Get aggregated document filters with request parameters from a dataset.""" q = req.args keywords = q.get("keywords", "") suffix = q.getlist("suffix") types = q.getlist("types") if types: invalid_types = {t for t in types if t not in VALID_FILE_TYPES} if invalid_types: msg = f"Invalid filter conditions: {', '.join(invalid_types)} type{'s' if len(invalid_types) > 1 else ''}" return RetCode.DATA_ERROR, msg, {}, 0 run_status = q.getlist("run") run_status_text_to_numeric = {"UNSTART": "0", "RUNNING": "1", "CANCEL": "2", "DONE": "3", "FAIL": "4"} run_status_converted = [run_status_text_to_numeric.get(v, v) for v in run_status] if run_status_converted: invalid_status = {s for s in run_status_converted if s not in run_status_text_to_numeric.values()} if invalid_status: msg = f"Invalid filter run status conditions: {', '.join(invalid_status)}" return RetCode.DATA_ERROR, msg, {}, 0 docs_filter, total = DocumentService.get_filter_by_kb_id( dataset_id, keywords, run_status_converted, types, suffix, ) return RetCode.SUCCESS, "", docs_filter, total def _parse_doc_id_filter_with_metadata(req, kb_id): """Parse document ID filter based on metadata conditions from the request. This function extracts and processes metadata filtering parameters from the request and returns a list of document IDs that match the specified criteria. It supports two filtering modes: simple metadata key-value matching and complex metadata conditions with operators. Args: req: The request object containing filtering parameters. - return_empty_metadata (bool|str): If True, returns all documents regardless of their metadata. Can be a boolean or string "true"/"false". - metadata_condition (str): JSON string containing complex metadata conditions with optional "logic" (and/or) and "conditions" list. Each condition should have "name" (key), "comparison_operator", and "value" fields. - metadata (str): JSON string containing key-value pairs for exact metadata matching. Values can be a single value or list of values (OR logic within same key). Can include special key "empty_metadata" to indicate documents with empty metadata. kb_id: The knowledge base ID to filter documents from. Returns: A tuple of (err_code, err_message, docs, return_empty_metadata): - err_code (int): Success code (RetCode.SUCCESS) if successful, or error code if validation fails. - err_message (str): Empty string if successful, or error message if validation fails. - docs (list): List of document IDs matching the metadata criteria, or empty list if no filter should be applied or on error. - return_empty_metadata (bool): The processed flag indicating whether to return documents with empty metadata. Note: - When both metadata and metadata_condition are provided, they are combined with AND logic. - The metadata_condition uses operators like: =, !=, >, <, >=, <=, contains, not contains, in, not in, start with, end with, empty, not empty. - The metadata parameter performs exact matching where values are OR'd within the same key & AND'd across different keys. Examples: Simple metadata filter (exact match): req = {"metadata": '{"author": ["John", "Jane"]}'} # Returns documents where author is John OR Jane Simple metadata filter with multiple keys: req = {"metadata": '{"author": "John", "status": "published"}'} # Returns documents where author is John AND status is published Complex metadata conditions: req = {"metadata_condition": '{"logic": "and", "conditions": [{"name": "status", "comparison_operator": "eq", "value": "published"}]}'} # Returns documents where status equals "published" Complex conditions with multiple operators: req = {"metadata_condition": '{"logic": "or", "conditions": [{"name": "priority", "comparison_operator": "=", "value": "high"}, {"name": "status", "comparison_operator": "contains", "value": "urgent"}]}'} # Returns documents where priority is high OR status contains "urgent" Return empty metadata: req = {"return_empty_metadata": True} # Returns all documents regardless of metadata Combined metadata and metadata_condition: req = {"metadata": '{"author": "John"}', "metadata_condition": '{"logic": "and", "conditions": [{"name": "status", "comparison_operator": "=", "value": "published"}]}'} # Returns documents where author is John AND status equals published """ return_empty_metadata = req.get("return_empty_metadata", False) if isinstance(return_empty_metadata, str): return_empty_metadata = return_empty_metadata.lower() == "true" try: metadata_condition = json.loads(req.get("metadata_condition", "{}")) except json.JSONDecodeError: msg = f'metadata_condition must be valid JSON: {req.get("metadata_condition")}.' return RetCode.DATA_ERROR, msg, [], return_empty_metadata try: metadata = json.loads(req.get("metadata", "{}")) except json.JSONDecodeError: logging.error(msg=f'metadata must be valid JSON: {req.get("metadata")}.') return RetCode.DATA_ERROR, "metadata must be valid JSON.", [], return_empty_metadata if isinstance(metadata, dict) and metadata.get("empty_metadata"): return_empty_metadata = True metadata = {k: v for k, v in metadata.items() if k != "empty_metadata"} if return_empty_metadata: metadata_condition = {} metadata = {} else: if metadata_condition and not isinstance(metadata_condition, dict): return RetCode.DATA_ERROR, "metadata_condition must be an object.", [], return_empty_metadata if metadata and not isinstance(metadata, dict): return RetCode.DATA_ERROR, "metadata must be an object.", [], return_empty_metadata metas = dict() if metadata_condition or metadata: metas = DocMetadataService.get_flatted_meta_by_kbs([kb_id]) doc_ids_filter = None if metadata_condition: doc_ids_filter = set(meta_filter(metas, convert_conditions(metadata_condition), metadata_condition.get("logic", "and"))) if metadata_condition.get("conditions") and not doc_ids_filter: return RetCode.SUCCESS, "", [], return_empty_metadata if metadata: metadata_doc_ids = None for key, values in metadata.items(): if not values: continue if not isinstance(values, list): values = [values] values = [str(v) for v in values if v is not None and str(v).strip()] if not values: continue key_doc_ids = set() for value in values: key_doc_ids.update(metas.get(key, {}).get(value, [])) if metadata_doc_ids is None: metadata_doc_ids = key_doc_ids else: metadata_doc_ids &= key_doc_ids if not metadata_doc_ids: return RetCode.SUCCESS, "", [], return_empty_metadata if metadata_doc_ids is not None: if doc_ids_filter is None: doc_ids_filter = metadata_doc_ids else: doc_ids_filter &= metadata_doc_ids if not doc_ids_filter: return RetCode.SUCCESS, "", [], return_empty_metadata return RetCode.SUCCESS, "", list(doc_ids_filter) if doc_ids_filter is not None else None, return_empty_metadata @manager.route("/datasets//documents", methods=["DELETE"]) # noqa: F821 @login_required @add_tenant_id_to_kwargs async def delete_documents(tenant_id, dataset_id): """ Delete documents from a dataset. --- tags: - Documents security: - ApiKeyAuth: [] parameters: - in: path name: dataset_id type: string required: true description: ID of the dataset containing the documents. - in: header name: Authorization type: string required: true description: Bearer token for authentication. - in: body name: body description: Document deletion parameters. required: true schema: type: object properties: ids: type: array or null items: type: string description: | Specifies the documents to delete: - An array of IDs, only the specified documents will be deleted. delete_all: type: boolean default: false description: Whether to delete all documents in the dataset. responses: 200: description: Successful operation. schema: type: object """ req, err = await validate_and_parse_json_request(request, DeleteDocumentReq) if err is not None or req is None: return get_error_argument_result(err) try: # Validate dataset exists and user has permission if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id): return get_error_data_result(message=f"You don't own the dataset {dataset_id}. ") # Get documents to delete doc_ids = req.get("ids") or [] delete_all = req.get("delete_all", False) if not delete_all and len(doc_ids) == 0: return get_error_data_result(message=f"should either provide doc ids or set delete_all(true), dataset: {dataset_id}. ") if len(doc_ids) > 0 and delete_all: return get_error_data_result(message=f"should not provide both doc ids and delete_all(true), dataset: {dataset_id}. ") if delete_all: doc_ids = [doc.id for doc in DocumentService.query(kb_id=dataset_id)] dataset_doc_ids = {doc.id for doc in DocumentService.query(kb_id=dataset_id)} invalid_ids = [doc_id for doc_id in doc_ids if doc_id not in dataset_doc_ids] if invalid_ids: return get_error_data_result( message=f"These documents do not belong to dataset {dataset_id} or Document not found: {', '.join(invalid_ids)}" ) # make sure each id is unique unique_doc_ids, duplicate_messages = check_duplicate_ids(doc_ids, "document") if duplicate_messages: logging.warning(f"duplicate_messages:{duplicate_messages}") else: doc_ids = unique_doc_ids # Delete documents using existing FileService.delete_docs errors = await thread_pool_exec(FileService.delete_docs, doc_ids, tenant_id) if errors: return get_error_data_result(message=str(errors)) return get_result(data={"deleted": len(doc_ids)}) except Exception as e: logging.exception(e) return get_error_data_result(message="Internal server error") @manager.route("/datasets//documents//metadata/config", methods=["PUT"]) # noqa: F821 @login_required @add_tenant_id_to_kwargs async def update_metadata_config(tenant_id, dataset_id, document_id): """ Update document metadata configuration. --- tags: - Documents security: - ApiKeyAuth: [] parameters: - in: path name: dataset_id type: string required: true description: ID of the dataset. - in: path name: document_id type: string required: true description: ID of the document. - in: header name: Authorization type: string required: true description: Bearer token for authentication. - in: body name: body description: Metadata configuration. required: true schema: type: object properties: metadata: type: object description: Metadata configuration JSON. responses: 200: description: Document updated successfully. """ # Verify ownership and existence of dataset if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id): return get_error_data_result(message="You don't own the dataset.") # Verify document exists in the dataset doc = DocumentService.query(id=document_id, kb_id=dataset_id) if not doc: msg = f"Document {document_id} not found in dataset {dataset_id}" return get_error_data_result(message=msg) doc = doc[0] # Get request body req = await get_request_json() if "metadata" not in req: return get_error_argument_result(message="metadata is required") # Update parser config with metadata try: DocumentService.update_parser_config(doc.id, {"metadata": req["metadata"]}) except Exception as e: logging.error("error when update_parser_config", exc_info=e) return get_json_result(code=RetCode.EXCEPTION_ERROR, message=repr(e)) # Get updated document try: e, doc = DocumentService.get_by_id(doc.id) if not e: return get_data_error_result(message="Document not found!") except Exception as e: return get_json_result(code=RetCode.EXCEPTION_ERROR, message=repr(e)) return get_result(data=doc.to_dict()) @manager.route("/thumbnails", methods=["GET"]) # noqa: F821 def list_thumbnails(): """ Get thumbnails for documents. --- tags: - Documents parameters: - in: query name: doc_ids type: array required: true description: List of document IDs to get thumbnails for. responses: 200: description: Successfully retrieved thumbnails 400: description: Missing document IDs """ from api.constants import IMG_BASE64_PREFIX from api.db.services.document_service import DocumentService doc_ids = request.args.getlist("doc_ids") if not doc_ids: return get_json_result(data=False, message='Lack of "Document ID"', code=RetCode.ARGUMENT_ERROR) try: docs = DocumentService.get_thumbnails(doc_ids) for doc_item in docs: if doc_item["thumbnail"] and not doc_item["thumbnail"].startswith(IMG_BASE64_PREFIX): doc_item["thumbnail"] = f"/api/v1/documents/images/{doc_item['kb_id']}-{doc_item['thumbnail']}" return get_json_result(data={d["id"]: d["thumbnail"] for d in docs}) except Exception as e: return server_error_response(e) @manager.route("/datasets//documents/metadatas", methods=["PATCH"]) # noqa: F821 @login_required @add_tenant_id_to_kwargs async def update_metadata(tenant_id, dataset_id): """ Update document metadata in batch. --- tags: - Documents security: - ApiKeyAuth: [] parameters: - in: path name: dataset_id type: string required: true description: ID of the dataset. - in: header name: Authorization type: string required: true description: Bearer token for authentication. - in: body name: body description: Metadata update request. required: true schema: type: object properties: selector: type: object description: Document selector. properties: document_ids: type: array items: type: string description: List of document IDs to update. metadata_condition: type: object description: Filter documents by existing metadata. updates: type: array items: type: object properties: key: type: string value: type: any description: List of metadata key-value pairs to update. deletes: type: array items: type: object properties: key: type: string description: List of metadata keys to delete. responses: 200: description: Metadata updated successfully. """ # Verify ownership of dataset if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id): return get_error_data_result(message=f"You don't own the dataset {dataset_id}.") # Get request body req = await get_request_json() selector = req.get("selector", {}) or {} updates = req.get("updates", []) or [] deletes = req.get("deletes", []) or [] # Validate selector if not isinstance(selector, dict): return get_error_data_result(message="selector must be an object.") if not isinstance(updates, list) or not isinstance(deletes, list): return get_error_data_result(message="updates and deletes must be lists.") # Validate metadata_condition metadata_condition = selector.get("metadata_condition", {}) or {} if metadata_condition and not isinstance(metadata_condition, dict): return get_error_data_result(message="metadata_condition must be an object.") # Validate document_ids document_ids = selector.get("document_ids", []) or [] if document_ids and not isinstance(document_ids, list): return get_error_data_result(message="document_ids must be a list.") # Validate updates for upd in updates: if not isinstance(upd, dict) or not upd.get("key") or "value" not in upd: return get_error_data_result(message="Each update requires key and value.") # Validate deletes for d in deletes: if not isinstance(d, dict) or not d.get("key"): return get_error_data_result(message="Each delete requires key.") # Initialize target document IDs target_doc_ids = set() # If document_ids provided, validate they belong to the dataset if document_ids: kb_doc_ids = KnowledgebaseService.list_documents_by_ids([dataset_id]) invalid_ids = set(document_ids) - set(kb_doc_ids) if invalid_ids: return get_error_data_result( message=f"These documents do not belong to dataset {dataset_id}: {', '.join(invalid_ids)}" ) target_doc_ids = set(document_ids) # Apply metadata_condition filtering if provided if metadata_condition: metas = DocMetadataService.get_flatted_meta_by_kbs([dataset_id]) filtered_ids = set( meta_filter(metas, convert_conditions(metadata_condition), metadata_condition.get("logic", "and")) ) target_doc_ids = target_doc_ids & filtered_ids if metadata_condition.get("conditions") and not target_doc_ids: return get_result(data={"updated": 0, "matched_docs": 0}) # Convert to list and perform update target_doc_ids = list(target_doc_ids) updated = DocMetadataService.batch_update_metadata(dataset_id, target_doc_ids, updates, deletes) return get_result(data={"updated": updated, "matched_docs": len(target_doc_ids)}) @manager.route("/documents/ingest", methods=["POST"]) # noqa: F821 @login_required @add_tenant_id_to_kwargs async def ingest(tenant_id): req = await get_request_json() try: user_id = tenant_id error_code, error_message = await thread_pool_exec(_run_sync, user_id, req) if error_code: logging.error(f"error when ingest documents:{req}, error message:{error_message}") return get_json_result(error_code, error_message) return get_json_result(data=True) except Exception as e: logging.exception("document ingest/run failed") return server_error_response(e) def _run_sync(user_id:str, req): for doc_id in req["doc_ids"]: if not DocumentService.accessible(doc_id, user_id): return RetCode.AUTHENTICATION_ERROR, "No authorization." kb_table_num_map = {} for doc_id in req["doc_ids"]: info = {"run": str(req["run"]), "progress": 0} rerun_with_delete = str(req["run"]) == TaskStatus.RUNNING.value and req.get("delete", False) if rerun_with_delete: info["progress_msg"] = "" info["chunk_num"] = 0 info["token_num"] = 0 doc_tenant_id = DocumentService.get_tenant_id(doc_id) if not doc_tenant_id: return RetCode.DATA_ERROR, "Tenant not found!" e, doc = DocumentService.get_by_id(doc_id) if not e: return RetCode.DATA_ERROR, "Document not found!" if str(req["run"]) == TaskStatus.CANCEL.value: tasks = list(TaskService.query(doc_id=doc_id)) has_unfinished_task = any((task.progress or 0) < 1 for task in tasks) if str(doc.run) in [TaskStatus.RUNNING.value, TaskStatus.CANCEL.value] or has_unfinished_task: cancel_all_task_of(doc_id) else: return RetCode.DATA_ERROR, "Cannot cancel a task that is not in RUNNING status" if all([rerun_with_delete, str(doc.run) == TaskStatus.DONE.value]): DocumentService.clear_chunk_num_when_rerun(doc_id) DocumentService.update_by_id(doc_id, info) if req.get("delete", False): TaskService.filter_delete([Task.doc_id == doc_id]) if settings.docStoreConn.index_exist(search.index_name(doc_tenant_id), doc.kb_id): settings.docStoreConn.delete({"doc_id": doc_id}, search.index_name(doc_tenant_id), doc.kb_id) if str(req["run"]) == TaskStatus.RUNNING.value: if req.get("apply_kb"): e, kb = KnowledgebaseService.get_by_id(doc.kb_id) if not e: raise LookupError("Can't find this dataset!") doc.parser_config["llm_id"] = kb.parser_config.get("llm_id") doc.parser_config["enable_metadata"] = kb.parser_config.get("enable_metadata", False) doc.parser_config["metadata"] = kb.parser_config.get("metadata", {}) DocumentService.update_parser_config(doc.id, doc.parser_config) doc_dict = doc.to_dict() DocumentService.run(doc_tenant_id, doc_dict, kb_table_num_map) return None, None @manager.route("/datasets//documents/parse", methods=["POST"]) # noqa: F821 @login_required @add_tenant_id_to_kwargs async def parse_documents(tenant_id, dataset_id): """ Start parsing documents in a dataset. --- tags: - Documents security: - ApiKeyAuth: [] parameters: - in: path name: dataset_id type: string required: true description: ID of the dataset. - in: header name: Authorization type: string required: true description: Bearer token for authentication. - in: body name: body description: Document parse parameters. required: true schema: type: object properties: document_ids: type: array items: type: string description: List of document IDs to parse. responses: 200: description: Successful operation. """ if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id): return get_error_data_result(message=f"You don't own the dataset {dataset_id}.") req = await get_request_json() if req is None: return get_error_data_result(message="Request body is required") document_ids = req.get("document_ids") if document_ids is None or not isinstance(document_ids, list): return get_error_data_result(message="`document_ids` is required") if len(document_ids) == 0: return get_error_data_result(message="`document_ids` is required") # Check for duplicate document IDs unique_doc_ids, duplicate_messages = check_duplicate_ids(document_ids, "document") errors = duplicate_messages if duplicate_messages else [] # Validate all document IDs belong to the dataset not_found_ids = [] valid_doc_ids = [] for doc_id in unique_doc_ids: docs = DocumentService.query(kb_id=dataset_id, id=doc_id) if not docs: not_found_ids.append(doc_id) else: valid_doc_ids.append(doc_id) if not_found_ids: errors.append(f"Documents not found: {not_found_ids}") # Still parse valid documents, but return error code if not valid_doc_ids: return get_error_data_result(message=f"Documents not found: {not_found_ids}") try: def _run_sync(): kb_table_num_map = {} success_count = 0 for doc_id in valid_doc_ids: e, doc = DocumentService.get_by_id(doc_id) if not e: errors.append(f"Document not found: {doc_id}") continue info = {"run": str(TaskStatus.RUNNING.value), "progress": 0} # If re-running a completed document, clear previous chunks if str(doc.run) == TaskStatus.DONE.value: DocumentService.clear_chunk_num_when_rerun(doc.id) info["progress_msg"] = "" info["chunk_num"] = 0 info["token_num"] = 0 DocumentService.update_by_id(doc_id, info) TaskService.filter_delete([Task.doc_id == doc_id]) if settings.docStoreConn.index_exist(search.index_name(tenant_id), doc.kb_id): settings.docStoreConn.delete({"doc_id": doc_id}, search.index_name(tenant_id), doc.kb_id) doc_dict = doc.to_dict() DocumentService.run(tenant_id, doc_dict, kb_table_num_map) success_count += 1 result = {"success_count": success_count} if errors: result["errors"] = errors return result result = await thread_pool_exec(_run_sync) if not_found_ids: return get_error_data_result(message=f"Documents not found: {not_found_ids}") return get_result(data=result) except Exception as e: logging.exception(e) return get_error_data_result(message="Internal server error") @manager.route("/datasets//documents/stop", methods=["POST"]) # noqa: F821 @login_required @add_tenant_id_to_kwargs async def stop_parse_documents(tenant_id, dataset_id): """ Stop parsing documents in a dataset. --- tags: - Documents security: - ApiKeyAuth: [] parameters: - in: path name: dataset_id type: string required: true description: ID of the dataset. - in: header name: Authorization type: string required: true description: Bearer token for authentication. - in: body name: body description: Document stop parse parameters. required: true schema: type: object properties: document_ids: type: array items: type: string description: List of document IDs to stop parsing. responses: 200: description: Successful operation. """ if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id): return get_error_data_result(message=f"You don't own the dataset {dataset_id}.") req = await get_request_json() if req is None: return get_error_data_result(message="Request body is required") document_ids = req.get("document_ids") if document_ids is None or not isinstance(document_ids, list): return get_error_data_result(message="`document_ids` is required") if len(document_ids) == 0: return get_error_data_result(message="`document_ids` is required") # Check for duplicate document IDs unique_doc_ids, duplicate_messages = check_duplicate_ids(document_ids, "document") errors = duplicate_messages if duplicate_messages else [] # Validate all document IDs belong to the dataset not_found_ids = [] valid_doc_ids = [] for doc_id in unique_doc_ids: docs = DocumentService.query(kb_id=dataset_id, id=doc_id) if not docs: not_found_ids.append(doc_id) else: valid_doc_ids.append(doc_id) if not_found_ids: return get_error_data_result(message=f"Documents not found: {not_found_ids}") try: def _run_sync(): success_count = 0 for doc_id in valid_doc_ids: e, doc = DocumentService.get_by_id(doc_id) if not e: errors.append(f"Document not found: {doc_id}") continue # Check if the document is currently running tasks = list(TaskService.query(doc_id=doc_id)) has_unfinished_task = any((task.progress or 0) < 1 for task in tasks) if str(doc.run) not in [TaskStatus.RUNNING.value, TaskStatus.CANCEL.value] and not has_unfinished_task: errors.append("Can't stop parsing document that has not started or already completed") continue cancel_all_task_of(doc_id) DocumentService.update_by_id(doc_id, {"run": str(TaskStatus.CANCEL.value)}) success_count += 1 result = {"success_count": success_count} if errors: result["errors"] = errors return result result = await thread_pool_exec(_run_sync) if not_found_ids: return get_error_data_result(message=f"Documents not found: {not_found_ids}") return get_result(data=result) except Exception as e: logging.exception(e) return get_error_data_result(message="Internal server error") @manager.route("/documents/images/", methods=["GET"]) # noqa: F821 async def get_document_image(image_id): """ Get a document image by ID. --- tags: - Documents parameters: - name: image_id in: path required: true schema: type: string description: The image ID (format: bucket-name-image-name) responses: 200: description: Image file content: image/jpeg: schema: type: string format: binary """ try: arr = image_id.split("-") if len(arr) != 2: return get_data_error_result(message="Image not found.") bkt, nm = image_id.split("-") data = await thread_pool_exec(settings.STORAGE_IMPL.get, bkt, nm) response = await make_response(data) response.headers.set("Content-Type", "image/JPEG") return response except Exception as e: return server_error_response(e) ARTIFACT_CONTENT_TYPES = { ".png": "image/png", ".jpg": "image/jpeg", ".jpeg": "image/jpeg", ".svg": "image/svg+xml", ".pdf": "application/pdf", ".csv": "text/csv", ".json": "application/json", ".html": "text/html", } @manager.route("/documents/artifact/", methods=["GET"]) # noqa: F821 @login_required async def get_artifact(filename): """ Get an artifact file. --- tags: - Documents security: - ApiKeyAuth: [] parameters: - in: path name: filename type: string required: true description: Name of the artifact file. - in: header name: Authorization type: string required: true description: Bearer token for authentication. responses: 200: description: Artifact file returned successfully. """ from common import settings try: bucket = SANDBOX_ARTIFACT_BUCKET # Validate filename: must be uuid hex + allowed extension, nothing else basename = os.path.basename(filename) if basename != filename or "/" in filename or "\\" in filename: return get_data_error_result(message="Invalid filename.") ext = os.path.splitext(basename)[1].lower() if ext not in ARTIFACT_CONTENT_TYPES: return get_data_error_result(message="Invalid file type.") data = await thread_pool_exec(settings.STORAGE_IMPL.get, bucket, basename) if not data: return get_data_error_result(message="Artifact not found.") content_type = ARTIFACT_CONTENT_TYPES.get(ext, "application/octet-stream") response = await make_response(data) safe_filename = re.sub(r"[^\w.\-]", "_", basename) apply_safe_file_response_headers(response, content_type, ext) if not response.headers.get("Content-Disposition"): response.headers.set("Content-Disposition", f'inline; filename="{safe_filename}"') return response except Exception as e: return server_error_response(e) @manager.route("/datasets//documents/batch-update-status", methods=["POST"]) # noqa: F821 @login_required @add_tenant_id_to_kwargs async def batch_update_document_status(tenant_id, dataset_id): """ Batch update status of documents within a dataset. --- tags: - Documents security: - ApiKeyAuth: [] parameters: - in: path name: dataset_id type: string required: true description: ID of the dataset. - in: header name: Authorization type: string required: true description: Bearer token for authentication. - in: body name: body description: Document status update parameters. required: true schema: type: object required: - doc_ids - status properties: doc_ids: type: array items: type: string description: List of document IDs to update. status: type: string enum: ["0", "1"] description: New status (0 = disabled, 1 = enabled). responses: 200: description: Document statuses updated successfully. """ req = await get_request_json() doc_ids = req.get("doc_ids", []) if not isinstance(doc_ids, list) or not doc_ids: return get_error_argument_result(message='"doc_ids" must be a non-empty list.') if any(not isinstance(doc_id, str) or not doc_id for doc_id in doc_ids): return get_error_argument_result(message='"doc_ids" must contain non-empty document IDs.') status = str(req.get("status", -1)) if status not in ["0", "1"]: return get_error_argument_result(message=f'"Status" must be either 0 or 1:{status}!') # Verify dataset ownership if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id): return get_error_data_result(message="You don't own the dataset.") e, kb = KnowledgebaseService.get_by_id(dataset_id) if not e: return get_error_data_result(message="Can't find this dataset!") result = {} has_error = False for doc_id in doc_ids: try: e, doc = DocumentService.get_by_id(doc_id) if not e: result[doc_id] = {"error": "Document not found"} has_error = True continue if doc.kb_id != dataset_id: logging.warning(f"Document {doc.kb_id} not in dataset {dataset_id}") result[doc_id] = {"error": "Document not found in this dataset."} has_error = True continue current_status = str(doc.status) if current_status == status: result[doc_id] = {"status": status} continue if not DocumentService.update_by_id(doc_id, {"status": str(status)}): result[doc_id] = {"error": "Database error (Document update)!"} has_error = True continue status_int = int(status) if getattr(doc, "chunk_num", 0) > 0: try: ok = settings.docStoreConn.update( {"doc_id": doc_id}, {"available_int": status_int}, search.index_name(kb.tenant_id), doc.kb_id, ) except Exception as exc: msg = str(exc) if "3022" in msg: result[doc_id] = {"error": "Document store table missing."} else: result[doc_id] = {"error": f"Document store update failed: {msg}"} has_error = True continue if not ok: result[doc_id] = {"error": "Database error (docStore update)!"} has_error = True continue result[doc_id] = {"status": status} except Exception as e: result[doc_id] = {"error": f"Internal server error: {str(e)}"} has_error = True if has_error: return get_json_result(data=result, message="Partial failure", code=RetCode.SERVER_ERROR) return get_json_result(data=result) @manager.route("/documents//preview", methods=["GET"]) # noqa: F821 @login_required async def get(doc_id): """Return the raw file bytes for a document the requesting user is authorized to read. The user must belong to the tenant that owns the document's knowledge base; otherwise the response is indistinguishable from a missing document to avoid cross-tenant ID enumeration. """ try: if not DocumentService.accessible(doc_id, current_user.id): return get_data_error_result(message="Document not found!") e, doc = DocumentService.get_by_id(doc_id) if not e: return get_data_error_result(message="Document not found!") b, n = File2DocumentService.get_storage_address(doc_id=doc_id) data = await thread_pool_exec(settings.STORAGE_IMPL.get, b, n) response = await make_response(data) ext = re.search(r"\.([^.]+)$", doc.name.lower()) ext = ext.group(1) if ext else None content_type = None if ext: fallback_prefix = "image" if doc.type == FileType.VISUAL.value else "application" content_type = CONTENT_TYPE_MAP.get(ext, f"{fallback_prefix}/{ext}") apply_safe_file_response_headers(response, content_type, ext) return response except Exception as e: return server_error_response(e) @manager.route("/documents//download", methods=["GET"]) # noqa: F821 @login_required @add_tenant_id_to_kwargs async def download_attachment(tenant_id=None, doc_id=None, attachment_id=None): """Stream a document's underlying file to the requesting user. Mirrors the authorization model of the preview endpoint: the user must belong to the tenant that owns the document's knowledge base. A denial returns the same "Document not found!" response so the endpoint cannot be used to enumerate doc ids across tenants. """ try: # Keep backward compatibility with older callers and unit tests that still # pass `attachment_id` instead of the route parameter name. doc_id = doc_id or attachment_id ext = request.args.get("ext", "markdown") data = await thread_pool_exec(settings.STORAGE_IMPL.get, tenant_id, doc_id) response = await make_response(data) content_type = CONTENT_TYPE_MAP.get(ext, f"application/{ext}") apply_safe_file_response_headers(response, content_type, ext) return response except Exception as e: return server_error_response(e)