Files
ragflow/api/apps/file2document_app.py
Syed Shahmeer Ali ff92b5575b Fix: /file2document/convert blocks event loop on large folders causing 504 timeout (#13784)
Problem

The /file2document/convert endpoint ran all file lookups, document
deletions, and insertions synchronously inside the request cycle.
Linking a large folder (~1.7GB with many files) caused a 504 Gateway
Timeout because the blocking DB loop held the HTTP connection open for
too long.

  Fix

- Extracted the heavy DB work into a plain sync function _convert_files
- Inputs are validated and folder file IDs expanded upfront (fast path)
- The blocking work is dispatched to a thread pool via
get_running_loop().run_in_executor() and the endpoint returns 200
  immediately
- The frontend only checks data.code === 0, so the response payload
change (file2documents list → True) has no client-side impact

  Fixes #13781

---------

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-26 16:45:10 +08:00

154 lines
6.0 KiB
Python

#
# Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License
#
import asyncio
import logging
from pathlib import Path
from api.db.services.file2document_service import File2DocumentService
from api.db.services.file_service import FileService
from api.apps import login_required, current_user
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.utils.api_utils import get_data_error_result, get_json_result, get_request_json, server_error_response, validate_request
from common.misc_utils import get_uuid
from common.constants import RetCode
from api.db import FileType
from api.db.services.document_service import DocumentService
def _convert_files(file_ids, kb_ids, user_id):
    """Synchronous worker: unlink old documents and create fresh ones for each file/kb pair.

    Designed to be dispatched to a thread-pool executor so the blocking DB
    work does not stall the event loop.

    Args:
        file_ids: IDs of the (already folder-expanded) files to link.
        kb_ids: IDs of the target knowledge bases (datasets).
        user_id: ID recorded as ``created_by`` on the new documents.

    Missing files, knowledge bases, or tenants are skipped rather than raised,
    so one bad entry does not abort the whole batch.
    """
    for file_id in file_ids:  # renamed from ``id`` to avoid shadowing the builtin
        # Tear down any documents previously generated from this file.
        for inform in File2DocumentService.get_by_file_id(file_id):
            doc_id = inform.document_id
            found, old_doc = DocumentService.get_by_id(doc_id)
            if not found:
                continue
            tenant_id = DocumentService.get_tenant_id(doc_id)
            if not tenant_id:
                logging.warning("tenant_id not found for doc_id=%s, skipping remove_document", doc_id)
                continue
            DocumentService.remove_document(old_doc, tenant_id)
        File2DocumentService.delete_by_file_id(file_id)

        found, file = FileService.get_by_id(file_id)
        if not found:
            continue
        # Create one new document (plus its file mapping) per knowledge base.
        for kb_id in kb_ids:
            found, kb = KnowledgebaseService.get_by_id(kb_id)
            if not found:
                continue
            new_doc = DocumentService.insert({
                "id": get_uuid(),
                "kb_id": kb.id,
                "parser_id": FileService.get_parser(file.type, file.name, kb.parser_id),
                "pipeline_id": kb.pipeline_id,
                "parser_config": kb.parser_config,
                "created_by": user_id,
                "type": file.type,
                "name": file.name,
                "suffix": Path(file.name).suffix.lstrip("."),
                "location": file.location,
                "size": file.size
            })
            File2DocumentService.insert({
                "id": get_uuid(),
                "file_id": file_id,
                "document_id": new_doc.id,
            })
@manager.route('/convert', methods=['POST'])  # noqa: F821
@login_required
@validate_request("file_ids", "kb_ids")
async def convert():
    """Link files (or folders) into knowledge bases.

    Fast path: validate the requested files and datasets and expand folders
    to their innermost file IDs. The heavy delete/insert work is then handed
    to the default thread-pool executor and the endpoint returns immediately,
    so large folders no longer hold the HTTP connection open (504 timeouts).

    Request JSON: ``file_ids`` (list of file IDs, folders allowed) and
    ``kb_ids`` (list of dataset IDs). Returns ``data=True`` once the
    background task has been scheduled.
    """
    req = await get_request_json()
    kb_ids = req["kb_ids"]
    file_ids = req["file_ids"]
    try:
        files = FileService.get_by_ids(file_ids)
        files_set = {file.id: file for file in files}
        # Validate all files exist before starting any work
        for file_id in file_ids:
            if not files_set.get(file_id):
                return get_data_error_result(message="File not found!")
        # Validate all kb_ids exist before scheduling background work
        for kb_id in kb_ids:
            e, _ = KnowledgebaseService.get_by_id(kb_id)
            if not e:
                return get_data_error_result(message="Can't find this dataset!")
        # Expand folders to their innermost file IDs
        all_file_ids = []
        for file_id in file_ids:
            file = files_set[file_id]
            if file.type == FileType.FOLDER.value:
                all_file_ids.extend(FileService.get_all_innermost_file_ids(file_id, []))
            else:
                all_file_ids.append(file_id)
        # Capture the user ID now: current_user is request-bound and must not
        # be touched from the worker thread.
        user_id = current_user.id
        # Run the blocking DB work in a thread so the event loop is not blocked.
        # For large folders this prevents 504 Gateway Timeout by returning as
        # soon as the background task is scheduled.
        loop = asyncio.get_running_loop()
        future = loop.run_in_executor(None, _convert_files, all_file_ids, kb_ids, user_id)

        def _log_failure(f):
            # Future.exception() raises CancelledError on a cancelled future,
            # so check cancellation first; log with exc_info to keep the
            # traceback (the old lambda dropped it).
            if f.cancelled():
                logging.warning("_convert_files task was cancelled")
                return
            exc = f.exception()
            if exc is not None:
                logging.error("_convert_files failed", exc_info=exc)

        future.add_done_callback(_log_failure)
        return get_json_result(data=True)
    except Exception as e:
        return server_error_response(e)
@manager.route('/rm', methods=['POST'])  # noqa: F821
@login_required
@validate_request("file_ids")
async def rm():
    """Unlink files from knowledge bases.

    For each file ID, removes every document generated from the file and then
    deletes the file→document mappings. Request JSON: ``file_ids`` (non-empty
    list). Returns ``data=True`` on success, or an error result if any file
    has no mapping, a document/tenant cannot be found, or removal fails.
    """
    req = await get_request_json()
    file_ids = req["file_ids"]
    if not file_ids:
        return get_json_result(
            data=False, message='Lack of "Files ID"', code=RetCode.ARGUMENT_ERROR)
    try:
        for file_id in file_ids:
            informs = File2DocumentService.get_by_file_id(file_id)
            if not informs:
                return get_data_error_result(message="Inform not found!")
            for inform in informs:
                if not inform:
                    return get_data_error_result(message="Inform not found!")
                doc_id = inform.document_id
                e, doc = DocumentService.get_by_id(doc_id)
                if not e:
                    return get_data_error_result(message="Document not found!")
                tenant_id = DocumentService.get_tenant_id(doc_id)
                if not tenant_id:
                    return get_data_error_result(message="Tenant not found!")
                if not DocumentService.remove_document(doc, tenant_id):
                    return get_data_error_result(
                        message="Database error (Document removal)!")
            # Delete the mappings once per file, and only after every document
            # has been removed. The old code called this inside the loop
            # (redundant after the first pass) and before remove_document, so
            # a failed removal left orphaned documents with no mapping.
            File2DocumentService.delete_by_file_id(file_id)
        return get_json_result(data=True)
    except Exception as e:
        return server_error_response(e)