mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-05-21 08:37:05 +08:00
### What problem does this PR solve? Fixes #14866. Previously, `DocumentService.increment_chunk_num` and `decrement_chunk_num` updated the `Document` row and its parent `Knowledgebase` row in two separate, non-transactional statements. If the second update failed (DB error, connection drop, etc.) after the first one succeeded, the document and knowledge base chunk/token counters would drift apart and stay inconsistent. There was also a behavioral asymmetry between the two methods: - `increment_chunk_num` only logged a warning when the document row was missing and returned a value that callers usually treated as success. - `decrement_chunk_num` raised `LookupError` in the same situation. This PR makes the counter updates atomic and aligns the missing-document behavior between the two methods: - Wrap the `Document` and `Knowledgebase` updates in `increment_chunk_num` / `decrement_chunk_num` inside a `DB.atomic()` block so both succeed or both roll back together. - Raise `LookupError` from `increment_chunk_num` when the target document no longer exists, matching `decrement_chunk_num`. - Update `reset_document_for_reparse` in `document_api_service.py` to catch the new `LookupError` and return a proper "Document not found!" API error instead of propagating the exception. No schema changes, no API contract changes for the success path; only the failure mode for a missing document during reparse is now a clean error response instead of an uncaught exception. ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue)
307 lines
11 KiB
Python
307 lines
11 KiB
Python
#
|
|
# Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
#
|
|
import logging
|
|
|
|
from api.db.services.document_service import DocumentService
|
|
from api.db.services.file2document_service import File2DocumentService
|
|
from api.db.services.file_service import FileService
|
|
from api.utils import validation_utils
|
|
from common import settings
|
|
from common.constants import TaskStatus
|
|
from api.utils.api_utils import get_error_data_result, server_error_response, get_parser_config
|
|
from api.utils.validation_utils import UpdateDocumentReq
|
|
from rag.nlp import rag_tokenizer, search
|
|
|
|
|
|
def update_document_name_only(document_id, req_doc_name):
    """
    Rename a document and propagate the new name (without validation).

    The new name is written to the document row, to the file row linked to the
    document through File2DocumentService (when such a link exists), and to the
    title fields of the document's entries in the document store (when the
    index exists).

    :param document_id: id (string) of the document
    :param req_doc_name: new name (string) from request for the document
    :return: None if all are good; otherwise returns the error message in the JSON format
    """
    if not DocumentService.update_by_id(document_id, {"name": req_doc_name}):
        return get_error_data_result(message="Database error (Document rename)!")

    informs = File2DocumentService.get_by_document_id(document_id)
    if informs:
        found, file = FileService.get_by_id(informs[0].file_id)
        if found:
            FileService.update_by_id(file.id, {"name": req_doc_name})
        else:
            # The document row is already renamed at this point, so a dangling
            # file link must not abort the remaining index update.
            logging.warning(
                "File %s linked to document %s not found; skipping file rename",
                informs[0].file_id,
                document_id,
            )

    # Add logic to update index - refer to rename method in document_app.py
    tenant_id = DocumentService.get_tenant_id(document_id)
    title_tks = rag_tokenizer.tokenize(req_doc_name)
    es_body = {
        "docnm_kwd": req_doc_name,
        "title_tks": title_tks,
        "title_sm_tks": rag_tokenizer.fine_grained_tokenize(title_tks),
    }
    ok, doc = DocumentService.get_by_id(document_id)
    if not ok:
        return get_error_data_result(message=f"Not able to find document by id:{document_id}")

    index_name = search.index_name(tenant_id)
    if settings.docStoreConn.index_exist(index_name, doc.kb_id):
        settings.docStoreConn.update(
            {"doc_id": document_id},
            es_body,
            index_name,
            doc.kb_id,
        )
    return None
|
|
|
|
def update_chunk_method(req, doc, tenant_id):
    """
    Switch a document to a different chunk method (without validation).

    The requested chunk method is compared case-insensitively with the
    document's current parser_id. When they differ, the document is reset for
    reparsing with the new method and the parser configuration is stored; if
    the request carries no parser_config, one is obtained from
    get_parser_config and written back into the request.

    Args:
        req: The request dictionary containing chunk_method and, optionally,
            parser_config. Its parser_config entry may be filled in here.
        doc: The document model from the database.
        tenant_id: The tenant ID for the document store.

    Returns:
        None if successful, or an error result dictionary if failed.
    """
    # NOTE(review): the nesting of the parser_config update under the
    # "method changed" branch was reconstructed from a copy of this file whose
    # indentation was lost - confirm against the repository.
    current_method = doc.parser_id.lower()
    requested_method = req["chunk_method"]
    if current_method == requested_method.lower():
        return None

    # Chunk method changed: reset the document so it gets reparsed.
    failure = reset_document_for_reparse(doc, tenant_id, parser_id=requested_method)
    if failure:
        return failure

    if not req.get("parser_config"):
        req["parser_config"] = get_parser_config(requested_method, req.get("parser_config"))
    DocumentService.update_parser_config(doc.id, req["parser_config"])
    return None
|
|
|
|
|
|
def reset_document_for_reparse(doc, tenant_id, parser_id=None, pipeline_id=None):
    """
    Put a document back into the "not started" state so it can be reparsed.

    Steps, in order:
      1. Write progress=0, an empty progress_msg and run=TaskStatus.UNSTART to
         the document row, together with parser_id / pipeline_id when given.
      2. If the document has a positive token_num, pass its negated token_num,
         chunk_num and process_duration to DocumentService.increment_chunk_num
         and delete the document's entries from the document store.
      3. Try to delete the document's chunk images; a failure here is only
         logged and does not change the return value.

    Args:
        doc: The document model from the database.
        tenant_id: The tenant ID for the document store.
        parser_id: Optional new parser_id (chunk method). If None, keeps existing.
        pipeline_id: Optional new pipeline_id. If None, keeps existing.

    Returns:
        None if successful, or an error result dictionary if failed.
    """
    update_fields = {
        "progress": 0,
        "progress_msg": "",
        "run": TaskStatus.UNSTART.value,
    }
    for field_name, new_value in (("parser_id", parser_id), ("pipeline_id", pipeline_id)):
        if new_value is not None:
            update_fields[field_name] = new_value

    if not DocumentService.update_by_id(doc.id, update_fields):
        return get_error_data_result(message="Document not found!")

    if doc.token_num > 0:
        try:
            counters_updated = DocumentService.increment_chunk_num(
                doc.id,
                doc.kb_id,
                doc.token_num * -1,
                doc.chunk_num * -1,
                doc.process_duration * -1,
            )
        except LookupError:
            # Raised by increment_chunk_num; reported the same way as a falsy result.
            return get_error_data_result(message="Document not found!")
        if not counters_updated:
            return get_error_data_result(message="Document not found!")
        settings.docStoreConn.delete({"doc_id": doc.id}, search.index_name(tenant_id), doc.kb_id)

    # Best effort: image cleanup problems are logged, never returned.
    try:
        DocumentService.delete_chunk_images(doc, tenant_id)
    except Exception as exc:
        logging.error(f"error when delete chunk images:{exc}")

    return None
|
|
|
|
|
|
def update_document_status_only(status: int, doc, kb):
    """
    Apply a new status to a document (without validation).

    Nothing is written when the document already carries the requested status.
    Otherwise the status is stored on the document row (as a string) and the
    document store entries of the document get "available_int" set to it.

    Args:
        status: The new status value (0 for disabled, 1 for enabled).
        doc: The document model from the database.
        kb: The knowledge base model; its tenant_id selects the index.

    Returns:
        None if successful, or an error result dictionary if failed.
    """
    # Already in the requested state: no database or document store write.
    if doc.status is not None and int(doc.status) == status:
        return None

    try:
        row_updated = DocumentService.update_by_id(doc.id, {"status": str(status)})
        if not row_updated:
            return get_error_data_result(message="Database error (Document update)!")
        settings.docStoreConn.update(
            {"doc_id": doc.id},
            {"available_int": status},
            search.index_name(kb.tenant_id),
            doc.kb_id,
        )
    except Exception as e:
        return server_error_response(e)
    return None
|
|
|
|
|
|
def validate_document_update_fields(update_doc_req: UpdateDocumentReq, doc, req):
    """
    Run every document-update validation and stop at the first failure.

    Checks are applied in this order: immutable fields, the document name
    (only when the request changes it), and the chunk method (only when the
    request contains one).

    Args:
        update_doc_req: The validated update document request.
        doc: The document model from the database.
        req: The original request dictionary.

    Returns:
        A tuple of (error_message, error_code) if validation fails,
        or (None, None) if validation passes.
    """
    message, code = validation_utils.validate_immutable_fields(update_doc_req, doc)
    if message:
        return message, code

    # The name is validated only when the request actually changes it.
    if "name" in req:
        new_name = req["name"]
        if new_name != doc.name:
            same_name_docs = DocumentService.query(name=new_name, kb_id=doc.kb_id)
            message, code = validation_utils.validate_document_name(new_name, doc, same_name_docs)
            if message:
                return message, code

    if "chunk_method" in req:
        message, code = validation_utils.validate_chunk_method(doc, req["chunk_method"])
        if message:
            return message, code

    return None, None
|
|
|
|
|
|
def map_doc_keys(doc):
    """
    Build the API representation of a document.

    Internal field names are replaced by their external API names
    (e.g., 'chunk_num' -> 'chunk_count'). If the result contains a 'run'
    entry, its value is additionally translated to the run status name.

    Args:
        doc: The document model from the database OR a dictionary.

    Returns:
        A dictionary with renamed keys for API response.
    """
    api_doc = _process_key_mappings(doc)
    if "run" not in api_doc:
        return api_doc
    return _process_run_mapping(api_doc, api_doc["run"])
|
|
|
|
|
|
def map_doc_keys_with_run_status(doc, run_status):
    """
    Build the API representation of a document with an explicit run status.

    Internal field names are replaced by their external API names
    (e.g., 'chunk_num' -> 'chunk_count'), then the 'run' entry is set from
    run_status, replacing any 'run' value the document itself carried.

    Args:
        doc: The document model from the database OR a dictionary.
        run_status: Run status code to report. None or an unrecognized code
            results in 'UNSTART'.

    Returns:
        A dictionary with renamed keys for API response.
    """
    return _process_run_mapping(_process_key_mappings(doc), run_status)
|
|
|
|
|
|
def _process_key_mappings(doc):
|
|
"""
|
|
Map document keys to match API response format.
|
|
|
|
Converts internal document model field names to the external API
|
|
response field names (e.g., 'chunk_num' -> 'chunk_count').
|
|
|
|
Args:
|
|
doc: The document model from the database OR a dictionary.
|
|
|
|
Returns:
|
|
A dictionary with renamed keys for API response.
|
|
"""
|
|
key_mapping = {
|
|
"chunk_num": "chunk_count",
|
|
"kb_id": "dataset_id",
|
|
"token_num": "token_count",
|
|
"parser_id": "chunk_method",
|
|
}
|
|
|
|
# Handle both dict and model input
|
|
items = doc.to_dict().items() if hasattr(doc, 'to_dict') else doc.items()
|
|
|
|
renamed_doc = {}
|
|
for key, value in items:
|
|
new_key = key_mapping.get(key, key)
|
|
renamed_doc[new_key] = value
|
|
return renamed_doc
|
|
|
|
|
|
def _process_run_mapping(doc, run_status):
|
|
"""
|
|
Map document keys to match API response format.
|
|
|
|
Args:
|
|
doc: The document model from the database OR a dictionary.
|
|
run_status: Optional explicit run status value.
|
|
If provided, 'run' field of doc will be set to run_status.
|
|
If not provided, 'run' will be set to 'UNSTART' (for new uploads)
|
|
|
|
Returns:
|
|
A dictionary with renamed keys for API response.
|
|
"""
|
|
run_mapping = {
|
|
"0": "UNSTART",
|
|
"1": "RUNNING",
|
|
"2": "CANCEL",
|
|
"3": "DONE",
|
|
"4": "FAIL",
|
|
}
|
|
|
|
# Handle run field
|
|
if run_status is None or run_status not in run_mapping.keys():
|
|
run_status = "0"
|
|
|
|
doc["run"] = run_mapping[run_status]
|
|
return doc
|