Refactor: migrate document run api (#14351)

### What problem does this PR solve?

Before migration: POST /v1/document/run
After migration: POST /api/v1/documents/ingest/

### Type of change

- [x] Refactoring
This commit is contained in:
Jack
2026-04-27 21:25:58 +08:00
committed by GitHub
parent 965717c4fb
commit 49912a156e
9 changed files with 178 additions and 165 deletions

View File

@ -15,7 +15,6 @@
#
import asyncio
from concurrent.futures import ThreadPoolExecutor, as_completed
from types import SimpleNamespace
import pytest
from test_common import bulk_upload_documents, list_documents, parse_documents
@ -124,6 +123,102 @@ class TestDocumentsParse:
assert res["code"] == 109, res
assert res["message"] == "No authorization.", res
@pytest.mark.p2
def test_document_not_found(self, WebApiAuth, add_documents_func):
"""Test document not found error."""
kb_id, document_ids = add_documents_func
# Try to parse a non-existent document
res = parse_documents(WebApiAuth, {"doc_ids": ["non_existent_doc_id"], "run": "1"})
assert res["code"] == 109, res
assert "No authorization" in res["message"], res
@pytest.mark.p2
def test_cancel_non_running_task_error(self, WebApiAuth, add_documents_func):
"""Test cancel error when task is not in RUNNING status."""
kb_id, document_ids = add_documents_func
doc_id = document_ids[0]
# First, run the document parsing
res = parse_documents(WebApiAuth, {"doc_ids": [doc_id], "run": "1"})
assert res["code"] == 0, res
# Wait for parsing to complete
condition(WebApiAuth, kb_id, [doc_id])
validate_document_parse_done(WebApiAuth, kb_id, [doc_id])
# Now try to cancel a completed task - should fail
res = parse_documents(WebApiAuth, {"doc_ids": [doc_id], "run": "2"})
assert res["code"] == 102, res
assert res["message"] == "Cannot cancel a task that is not in RUNNING status", res
@pytest.mark.p2
def test_rerun_with_delete(self, WebApiAuth, add_documents_func):
"""Test rerun with delete scenario."""
kb_id, document_ids = add_documents_func
doc_id = document_ids[0]
# First, run the document parsing
res = parse_documents(WebApiAuth, {"doc_ids": [doc_id], "run": "1"})
assert res["code"] == 0, res
# Wait for parsing to complete
condition(WebApiAuth, kb_id, [doc_id])
validate_document_parse_done(WebApiAuth, kb_id, [doc_id])
# Verify document has chunks
res = list_documents(WebApiAuth, {"kb_id": kb_id})
doc = next((d for d in res["data"]["docs"] if d["id"] == doc_id), None)
assert doc is not None
assert doc["chunk_count"] > 0, "Document should have chunks after parsing"
# Now rerun with delete - this should clear chunks and re-parse
res = parse_documents(WebApiAuth, {"doc_ids": [doc_id], "run": "1", "delete": True})
assert res["code"] == 0, res
# Wait for parsing to complete
condition(WebApiAuth, kb_id, [doc_id])
validate_document_parse_done(WebApiAuth, kb_id, [doc_id])
@pytest.mark.p2
def test_apply_kb_dataset_not_found(self, WebApiAuth, add_documents_func):
"""Test apply_kb when dataset is not found."""
kb_id, document_ids = add_documents_func
doc_id = document_ids[0]
# Try to apply_kb with a non-existent dataset - this is tricky to test
# because we can't easily delete the dataset after getting the doc_id
# This test verifies the happy path works
res = parse_documents(WebApiAuth, {"doc_ids": [doc_id], "run": "1"})
assert res["code"] == 0, res
# Wait for parsing to complete
condition(WebApiAuth, kb_id, [doc_id])
validate_document_parse_done(WebApiAuth, kb_id, [doc_id])
@pytest.mark.p2
def test_successful_parse(self, WebApiAuth, add_documents_func):
"""Test successful document parsing."""
kb_id, document_ids = add_documents_func
doc_id = document_ids[0]
# Run the document parsing
res = parse_documents(WebApiAuth, {"doc_ids": [doc_id], "run": "1"})
assert res["code"] == 0, res
# Wait for parsing to complete
condition(WebApiAuth, kb_id, [doc_id])
validate_document_parse_done(WebApiAuth, kb_id, [doc_id])
# Verify the document is properly parsed
res = list_documents(WebApiAuth, {"kb_id": kb_id})
doc = next((d for d in res["data"]["docs"] if d["id"] == doc_id), None)
assert doc is not None
assert doc["run"] == "DONE"
assert doc["chunk_count"] > 0
assert len(doc["process_begin_at"]) > 0
assert doc["process_duration"] > 0
@pytest.mark.p3
def test_repeated_parse(self, WebApiAuth, add_documents_func):
kb_id, document_ids = add_documents_func
@ -199,94 +294,6 @@ def test_concurrent_parse(WebApiAuth, add_dataset_func, tmp_path):
validate_document_parse_done(WebApiAuth, kb_id, document_ids)
@pytest.mark.p2
class TestDocumentsParseUnit:
def test_run_branch_matrix_unit(self, document_app_module, monkeypatch):
module = document_app_module
calls = {"clear": [], "filter_delete": [], "docstore_delete": [], "cancel": [], "run": []}
async def fake_thread_pool_exec(func, *args, **kwargs):
return func(*args, **kwargs)
monkeypatch.setattr(module, "thread_pool_exec", fake_thread_pool_exec)
monkeypatch.setattr(module, "server_error_response", lambda e: {"code": 500, "message": str(e)})
monkeypatch.setattr(module.search, "index_name", lambda tenant_id: f"idx_{tenant_id}")
monkeypatch.setattr(module, "cancel_all_task_of", lambda doc_id: calls["cancel"].append(doc_id))
class _DocStore:
def index_exist(self, _index_name, _kb_id):
return True
def delete(self, where, _index_name, _kb_id):
calls["docstore_delete"].append(where["doc_id"])
monkeypatch.setattr(module.settings, "docStoreConn", _DocStore())
async def set_request(payload):
return payload
def apply_request(payload):
async def fake_request_json():
return await set_request(payload)
monkeypatch.setattr(module, "get_request_json", fake_request_json)
apply_request({"doc_ids": ["doc1"], "run": module.TaskStatus.RUNNING.value})
monkeypatch.setattr(module.DocumentService, "accessible", lambda *_args, **_kwargs: False)
res = _run(module.run.__wrapped__())
assert res["code"] == module.RetCode.AUTHENTICATION_ERROR
monkeypatch.setattr(module.DocumentService, "accessible", lambda *_args, **_kwargs: True)
monkeypatch.setattr(module.DocumentService, "get_tenant_id", lambda _doc_id: None)
res = _run(module.run.__wrapped__())
assert res["code"] == module.RetCode.DATA_ERROR
assert "Tenant not found!" in res["message"]
monkeypatch.setattr(module.DocumentService, "get_tenant_id", lambda _doc_id: "tenant1")
monkeypatch.setattr(module.DocumentService, "get_by_id", lambda _doc_id: (False, None))
res = _run(module.run.__wrapped__())
assert res["code"] == module.RetCode.DATA_ERROR
assert "Document not found!" in res["message"]
apply_request({"doc_ids": ["doc1"], "run": module.TaskStatus.CANCEL.value})
doc_cancel = SimpleNamespace(id="doc1", run=module.TaskStatus.DONE.value, kb_id="kb1", parser_config={}, to_dict=lambda: {"id": "doc1"})
monkeypatch.setattr(module.DocumentService, "get_by_id", lambda _doc_id: (True, doc_cancel))
monkeypatch.setattr(module.TaskService, "query", lambda **_kwargs: [SimpleNamespace(progress=1)])
res = _run(module.run.__wrapped__())
assert res["code"] == module.RetCode.DATA_ERROR
assert "Cannot cancel a task that is not in RUNNING status" in res["message"]
apply_request({"doc_ids": ["doc1"], "run": module.TaskStatus.RUNNING.value, "delete": True})
doc_rerun = SimpleNamespace(id="doc1", run=module.TaskStatus.DONE.value, kb_id="kb1", parser_config={}, to_dict=lambda: {"id": "doc1"})
monkeypatch.setattr(module.DocumentService, "get_by_id", lambda _doc_id: (True, doc_rerun))
monkeypatch.setattr(module.DocumentService, "clear_chunk_num_when_rerun", lambda doc_id: calls["clear"].append(doc_id))
monkeypatch.setattr(module.TaskService, "filter_delete", lambda _filters: calls["filter_delete"].append(True))
monkeypatch.setattr(module.DocumentService, "update_by_id", lambda *_args, **_kwargs: True)
monkeypatch.setattr(module.DocumentService, "run", lambda tenant_id, doc_dict, _kb_map: calls["run"].append((tenant_id, doc_dict)))
res = _run(module.run.__wrapped__())
assert res["code"] == 0
assert calls["clear"] == ["doc1"]
assert calls["filter_delete"] == [True]
assert calls["docstore_delete"] == ["doc1"]
assert calls["run"] == [("tenant1", {"id": "doc1"})]
apply_request({"doc_ids": ["doc1"], "run": module.TaskStatus.RUNNING.value, "apply_kb": True})
monkeypatch.setattr(module.KnowledgebaseService, "get_by_id", lambda _kb_id: (False, None))
res = _run(module.run.__wrapped__())
assert res["code"] == 500
assert "Can't find this dataset!" in res["message"]
apply_request({"doc_ids": ["doc1"], "run": module.TaskStatus.RUNNING.value})
def raise_run_error(*_args, **_kwargs):
raise RuntimeError("run boom")
monkeypatch.setattr(module.DocumentService, "run", raise_run_error)
res = _run(module.run.__wrapped__())
assert res["code"] == 500
assert "run boom" in res["message"]
# @pytest.mark.skip
class TestDocumentsParseStop:
@pytest.mark.parametrize(