# # Copyright 2026 The InfiniFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # import logging from api.db.services.document_service import DocumentService from api.db.services.file2document_service import File2DocumentService from api.db.services.file_service import FileService from api.utils import validation_utils from common import settings from common.constants import TaskStatus from api.utils.api_utils import get_error_data_result, server_error_response, get_parser_config from api.utils.validation_utils import UpdateDocumentReq from rag.nlp import rag_tokenizer, search def update_document_name_only(document_id, req_doc_name): """ Update document name only (without validation). :param document_id: id (string) of the document :param req_doc_name: new name (string) from request for the document :return: None if all are good; otherwise returns the error message in the JSON format """ if not DocumentService.update_by_id(document_id, {"name": req_doc_name}): return get_error_data_result(message="Database error (Document rename)!") informs = File2DocumentService.get_by_document_id(document_id) if informs: e, file = FileService.get_by_id(informs[0].file_id) FileService.update_by_id(file.id, {"name": req_doc_name}) # Add logic to update index - refer to rename method in document_app.py tenant_id = DocumentService.get_tenant_id(document_id) title_tks = rag_tokenizer.tokenize(req_doc_name) es_body = { "docnm_kwd": req_doc_name, "title_tks": title_tks, "title_sm_tks": rag_tokenizer.fine_grained_tokenize(title_tks), } ok, doc = DocumentService.get_by_id(document_id) if not ok: return get_error_data_result(message=f"Not able to find document by id:{document_id}") if settings.docStoreConn.index_exist(search.index_name(tenant_id), doc.kb_id): settings.docStoreConn.update( {"doc_id": document_id}, es_body, search.index_name(tenant_id), doc.kb_id, ) return None def update_chunk_method(req, doc, tenant_id): """ Update chunk method only (without validation). Updates the chunk method and parser configuration for a document, and resets the document's progress if the chunk method changes. Also clears existing chunks from the document store if the method changes. Args: req: The request dictionary containing chunk_method and parser_config. doc: The document model from the database. tenant_id: The tenant ID for the document store. Returns: None if successful, or an error result dictionary if failed. """ if doc.parser_id.lower() != req["chunk_method"].lower(): # if chunk method changed, reset document for reparse result = reset_document_for_reparse(doc, tenant_id, parser_id=req["chunk_method"]) if result: return result if not req.get("parser_config"): req["parser_config"] = get_parser_config(req["chunk_method"], req.get("parser_config")) DocumentService.update_parser_config(doc.id, req["parser_config"]) return None def reset_document_for_reparse(doc, tenant_id, parser_id=None, pipeline_id=None): """ Reset document for reparsing. Updates the parser_id and/or pipeline_id for a document, resets its progress, clears existing chunks from the document store, and removes chunk images. Args: doc: The document model from the database. tenant_id: The tenant ID for the document store. parser_id: Optional new parser_id (chunk method). If None, keeps existing. pipeline_id: Optional new pipeline_id. If None, keeps existing. Returns: None if successful, or an error result dictionary if failed. """ # Build update fields update_fields = { "progress": 0, "progress_msg": "", "run": TaskStatus.UNSTART.value, } if parser_id is not None: update_fields["parser_id"] = parser_id if pipeline_id is not None: update_fields["pipeline_id"] = pipeline_id # Update document e = DocumentService.update_by_id(doc.id, update_fields) if not e: return get_error_data_result(message="Document not found!") # Delete chunks from document store if doc.token_num > 0: e = DocumentService.increment_chunk_num( doc.id, doc.kb_id, doc.token_num * -1, doc.chunk_num * -1, doc.process_duration * -1, ) if not e: return get_error_data_result(message="Document not found!") settings.docStoreConn.delete({"doc_id": doc.id}, search.index_name(tenant_id), doc.kb_id) # Delete chunk images try: DocumentService.delete_chunk_images(doc, tenant_id) except Exception as e: logging.error(f"error when delete chunk images:{e}") return None def update_document_status_only(status:int, doc, kb): """ Update document status only (without validation). Updates the enabled/disabled status of a document and updates the corresponding index in the document store. Args: status: The new status value (0 for disabled, 1 for enabled). doc: The document model from the database. kb: The knowledge base model. Returns: None if successful, or an error result dictionary if failed. """ if doc.status is None or (int(doc.status) != status): try: if not DocumentService.update_by_id(doc.id, {"status": str(status)}): return get_error_data_result(message="Database error (Document update)!") settings.docStoreConn.update({"doc_id": doc.id}, {"available_int": status}, search.index_name(kb.tenant_id), doc.kb_id) except Exception as e: return server_error_response(e) return None def validate_document_update_fields(update_doc_req:UpdateDocumentReq, doc, req): """ Validate document update fields in a single method. Performs comprehensive validation of all document update fields, including immutable fields, document name, and chunk method. Args: update_doc_req: The validated update document request. doc: The document model from the database. req: The original request dictionary. Returns: A tuple of (error_message, error_code) if validation fails, or (None, None) if validation passes. """ # Validate immutable fields error_msg, error_code = validation_utils.validate_immutable_fields(update_doc_req, doc) if error_msg: return error_msg, error_code # Validate document name if present if "name" in req and req["name"] != doc.name: docs_from_name = DocumentService.query(name=req["name"], kb_id=doc.kb_id) error_msg, error_code = validation_utils.validate_document_name(req["name"], doc, docs_from_name) if error_msg: return error_msg, error_code # Validate chunk method if present if "chunk_method" in req: error_msg, error_code = validation_utils.validate_chunk_method(doc, req["chunk_method"]) if error_msg: return error_msg, error_code return None, None def map_doc_keys(doc): """ Rename document keys to match API response format. Converts internal document model field names to the external API response field names (e.g., 'chunk_num' -> 'chunk_count'). Args: doc: The document model from the database. Returns: A dictionary with renamed keys for API response. """ renamed_doc = _process_key_mappings(doc) if "run" in renamed_doc.keys(): renamed_doc = _process_run_mapping(renamed_doc, renamed_doc["run"]) return renamed_doc def map_doc_keys_with_run_status(doc, run_status): """ Map document keys to match API response format. Converts internal document model field names to the external API response field names (e.g., 'chunk_num' -> 'chunk_count'). Args: doc: The document model from the database OR a dictionary. run_status: Optional explicit run status value. If not provided: - If doc has 'run' field, it will be mapped using run_mapping - Otherwise, 'run' will be set to 'UNSTART' (for new uploads) Returns: A dictionary with renamed keys for API response. """ renamed_doc = _process_key_mappings(doc) renamed_doc = _process_run_mapping(renamed_doc, run_status) return renamed_doc def _process_key_mappings(doc): """ Map document keys to match API response format. Converts internal document model field names to the external API response field names (e.g., 'chunk_num' -> 'chunk_count'). Args: doc: The document model from the database OR a dictionary. Returns: A dictionary with renamed keys for API response. """ key_mapping = { "chunk_num": "chunk_count", "kb_id": "dataset_id", "token_num": "token_count", "parser_id": "chunk_method", } # Handle both dict and model input items = doc.to_dict().items() if hasattr(doc, 'to_dict') else doc.items() renamed_doc = {} for key, value in items: new_key = key_mapping.get(key, key) renamed_doc[new_key] = value return renamed_doc def _process_run_mapping(doc, run_status): """ Map document keys to match API response format. Args: doc: The document model from the database OR a dictionary. run_status: Optional explicit run status value. If provided, 'run' field of doc will be set to run_status. If not provided, 'run' will be set to 'UNSTART' (for new uploads) Returns: A dictionary with renamed keys for API response. """ run_mapping = { "0": "UNSTART", "1": "RUNNING", "2": "CANCEL", "3": "DONE", "4": "FAIL", } # Handle run field if run_status is None or run_status not in run_mapping.keys(): run_status = "0" doc["run"] = run_mapping[run_status] return doc