Fix: /file2document/convert blocks event loop on large folders causing 504 timeout (#13784)

Problem

The /file2document/convert endpoint ran all file lookups, document
deletions, and insertions synchronously inside the
request cycle. Linking a large folder (~1.7GB with many files) caused
504 Gateway Timeout because the blocking DB loop
  held the HTTP connection open for too long.

  Fix

- Extracted the heavy DB work into a plain sync function _convert_files
- Inputs are validated and folder file IDs expanded upfront (fast path)
- The blocking work is dispatched to a thread pool via
get_running_loop().run_in_executor() and the endpoint returns 200
  immediately
- Frontend only checks data.code === 0 so the response change
(file2documents list → True) has no impact

  Fixes #13781

---------

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Syed Shahmeer Ali
2026-03-26 13:45:10 +05:00
committed by GitHub
parent e705ac6643
commit ff92b5575b
6 changed files with 98 additions and 102 deletions

View File

@ -244,69 +244,33 @@ def test_convert_branch_matrix_unit(monkeypatch):
req_state = {"kb_ids": ["kb-1"], "file_ids": ["f1"]}
_set_request_json(monkeypatch, module, req_state)
events = {"deleted": []}
# Falsy file → "File not found!" (synchronous validation)
monkeypatch.setattr(module.FileService, "get_by_ids", lambda _ids: [_FalsyFile("f1", module.FileType.DOC.value)])
res = _run(module.convert())
assert res["message"] == "File not found!"
# Valid file but invalid kb → "Can't find this dataset!" (synchronous validation)
# KnowledgebaseService stub returns (False, None) by default
monkeypatch.setattr(module.FileService, "get_by_ids", lambda _ids: [_DummyFile("f1", module.FileType.DOC.value)])
monkeypatch.setattr(module.File2DocumentService, "get_by_file_id", lambda _file_id: [SimpleNamespace(document_id="doc-1")])
monkeypatch.setattr(module.DocumentService, "get_by_id", lambda _doc_id: (False, None))
res = _run(module.convert())
assert res["message"] == "Document not found!"
monkeypatch.setattr(module.DocumentService, "get_by_id", lambda _doc_id: (True, SimpleNamespace(id=_doc_id)))
monkeypatch.setattr(module.DocumentService, "get_tenant_id", lambda _doc_id: None)
res = _run(module.convert())
assert res["message"] == "Tenant not found!"
monkeypatch.setattr(module.DocumentService, "get_tenant_id", lambda _doc_id: "tenant-1")
monkeypatch.setattr(module.DocumentService, "remove_document", lambda *_args, **_kwargs: False)
res = _run(module.convert())
assert "Document removal" in res["message"]
monkeypatch.setattr(module.DocumentService, "remove_document", lambda *_args, **_kwargs: True)
monkeypatch.setattr(module.File2DocumentService, "get_by_file_id", lambda _file_id: [])
monkeypatch.setattr(module.File2DocumentService, "delete_by_file_id", lambda file_id: events["deleted"].append(file_id))
monkeypatch.setattr(module.KnowledgebaseService, "get_by_id", lambda _kb_id: (False, None))
res = _run(module.convert())
assert res["message"] == "Can't find this dataset!"
assert events["deleted"] == ["f1"]
# Valid file and kb → schedules background work, returns data=True immediately
kb = SimpleNamespace(id="kb-1", parser_id="naive", pipeline_id="p1", parser_config={})
monkeypatch.setattr(module.KnowledgebaseService, "get_by_id", lambda _kb_id: (True, kb))
monkeypatch.setattr(module.FileService, "get_by_id", lambda _file_id: (False, None))
res = _run(module.convert())
assert res["message"] == "Can't find this file!"
assert res["code"] == 0
assert res["data"] is True
# Folder expansion → schedules background work, returns data=True immediately
req_state["file_ids"] = ["folder-1"]
monkeypatch.setattr(module.FileService, "get_by_ids", lambda _ids: [_DummyFile("folder-1", module.FileType.FOLDER.value, name="folder")])
monkeypatch.setattr(module.FileService, "get_all_innermost_file_ids", lambda _file_id, _acc: ["inner-1"])
monkeypatch.setattr(
module.FileService,
"get_by_id",
lambda _file_id: (True, _DummyFile("inner-1", module.FileType.DOC.value, name="inner.txt", location="inner.loc", size=2)),
)
inserted = {}
def _insert(payload):
inserted.update(payload)
return SimpleNamespace(id="doc-new")
monkeypatch.setattr(module.DocumentService, "insert", _insert)
monkeypatch.setattr(module.FileService, "get_parser", lambda _ft, _name, _parser_id: "picked-parser")
monkeypatch.setattr(
module.File2DocumentService,
"insert",
lambda _payload: SimpleNamespace(to_json=lambda: {"file_id": "inner-1", "document_id": "doc-new"}),
)
res = _run(module.convert())
assert res["code"] == 0
assert res["data"] == [{"file_id": "inner-1", "document_id": "doc-new"}]
assert inserted["parser_id"] == "picked-parser"
assert inserted["pipeline_id"] == "p1"
assert res["data"] is True
# Exception in file lookup → 500
req_state["file_ids"] = ["f1"]
monkeypatch.setattr(
module.FileService,