diff --git a/api/apps/document_app.py b/api/apps/document_app.py index aa23edb0b..cdbe728fb 100644 --- a/api/apps/document_app.py +++ b/api/apps/document_app.py @@ -14,9 +14,7 @@ # limitations under the License # import logging -import os.path import re -from pathlib import PurePosixPath, PureWindowsPath from quart import make_response, request @@ -26,7 +24,6 @@ from api.db import FileType from api.db.db_models import Task from api.db.services.document_service import DocumentService, doc_upload_and_parse from api.db.services.file2document_service import File2DocumentService -from api.db.services.file_service import FileService from api.db.services.knowledgebase_service import KnowledgebaseService from api.db.services.task_service import TaskService, cancel_all_task_of from api.utils.api_utils import ( @@ -36,28 +33,13 @@ from api.utils.api_utils import ( server_error_response, validate_request, ) -from api.utils.web_utils import CONTENT_TYPE_MAP, apply_safe_file_response_headers, is_valid_url +from api.utils.web_utils import CONTENT_TYPE_MAP, apply_safe_file_response_headers from common import settings from common.constants import RetCode, TaskStatus -from common.file_utils import get_project_base_directory from common.misc_utils import thread_pool_exec -from deepdoc.parser.html_parser import RAGFlowHtmlParser from rag.nlp import search -def _is_safe_download_filename(name: str) -> bool: - if not name or name in {".", ".."}: - return False - if "\x00" in name or len(name) > 255: - return False - if name != PurePosixPath(name).name: - return False - if name != PureWindowsPath(name).name: - return False - return True - - - @manager.route("/thumbnails", methods=["GET"]) # noqa: F821 # @login_required def thumbnails(): @@ -339,62 +321,3 @@ async def upload_and_parse(): form = await request.form doc_ids = doc_upload_and_parse(form.get("conversation_id"), file_objs, current_user.id) return get_json_result(data=doc_ids) - - -@manager.route("/parse", methods=["POST"]) # noqa: F821 -@login_required -async def parse(): - req = await get_request_json() - url = req.get("url", "") - if url: - if not is_valid_url(url): - return get_json_result(data=False, message="The URL format is invalid", code=RetCode.ARGUMENT_ERROR) - download_path = os.path.join(get_project_base_directory(), "logs/downloads") - os.makedirs(download_path, exist_ok=True) - from seleniumwire.webdriver import Chrome, ChromeOptions - - options = ChromeOptions() - options.add_argument("--headless") - options.add_argument("--disable-gpu") - options.add_argument("--no-sandbox") - options.add_argument("--disable-dev-shm-usage") - options.add_experimental_option("prefs", {"download.default_directory": download_path, "download.prompt_for_download": False, "download.directory_upgrade": True, "safebrowsing.enabled": True}) - driver = Chrome(options=options) - driver.get(url) - res_headers = [r.response.headers for r in driver.requests if r and r.response] - if len(res_headers) > 1: - sections = RAGFlowHtmlParser().parser_txt(driver.page_source) - driver.quit() - return get_json_result(data="\n".join(sections)) - - class File: - filename: str - filepath: str - - def __init__(self, filename, filepath): - self.filename = filename - self.filepath = filepath - - def read(self): - with open(self.filepath, "rb") as f: - return f.read() - - r = re.search(r"filename=\"([^\"]+)\"", str(res_headers)) - if not r or not r.group(1): - return get_json_result(data=False, message="Can't not identify downloaded file", code=RetCode.ARGUMENT_ERROR) - filename = r.group(1).strip() - if not _is_safe_download_filename(filename): - return get_json_result(data=False, message="Invalid downloaded filename", code=RetCode.ARGUMENT_ERROR) - filepath = os.path.join(download_path, filename) - f = File(filename, filepath) - txt = FileService.parse_docs([f], current_user.id) - return get_json_result(data=txt) - - files = await request.files - if "file" not in files: - return get_json_result(data=False, message="No file part!", code=RetCode.ARGUMENT_ERROR) - - file_objs = files.getlist("file") - txt = FileService.parse_docs(file_objs, current_user.id) - - return get_json_result(data=txt) diff --git a/test/testcases/test_web_api/test_document_app/test_upload_documents.py b/test/testcases/test_web_api/test_document_app/test_upload_documents.py index bb8d80577..2c74b1b8e 100644 --- a/test/testcases/test_web_api/test_document_app/test_upload_documents.py +++ b/test/testcases/test_web_api/test_document_app/test_upload_documents.py @@ -196,8 +196,7 @@ class TestDocumentsUpload: import asyncio -import sys -from types import ModuleType, SimpleNamespace +from types import SimpleNamespace class _AwaitableValue: @@ -329,122 +328,6 @@ class TestDocumentsUploadUnit: assert res["code"] == 0 assert res["data"] == ["doc-1"] - def test_parse_url_and_multipart_matrix_unit(self, document_app_module, monkeypatch, tmp_path): - module = document_app_module - - async def req_invalid_url(): - return {"url": "not-a-url"} - - monkeypatch.setattr(module, "get_request_json", req_invalid_url) - monkeypatch.setattr(module, "is_valid_url", lambda _url: False) - res = _run(module.parse()) - assert res["code"] == module.RetCode.ARGUMENT_ERROR - assert res["message"] == "The URL format is invalid" - - webdriver_mod = ModuleType("seleniumwire.webdriver") - - class _FakeChromeOptions: - def __init__(self): - self.args = [] - self.experimental = {} - - def add_argument(self, arg): - self.args.append(arg) - - def add_experimental_option(self, key, value): - self.experimental[key] = value - - class _Req: - def __init__(self, headers): - self.response = SimpleNamespace(headers=headers) - - class _FakeDriver: - def __init__(self, requests, page_source): - self.requests = requests - self.page_source = page_source - self.quit_called = False - self.visited = [] - self.options = None - - def get(self, url): - self.visited.append(url) - - def quit(self): - self.quit_called = True - - queue = [] - created = [] - - def _fake_chrome(options=None): - driver = queue.pop(0) - driver.options = options - created.append(driver) - return driver - - webdriver_mod.Chrome = _fake_chrome - webdriver_mod.ChromeOptions = _FakeChromeOptions - - seleniumwire_mod = ModuleType("seleniumwire") - seleniumwire_mod.webdriver = webdriver_mod - monkeypatch.setitem(sys.modules, "seleniumwire", seleniumwire_mod) - monkeypatch.setitem(sys.modules, "seleniumwire.webdriver", webdriver_mod) - monkeypatch.setattr(module, "get_project_base_directory", lambda: str(tmp_path)) - monkeypatch.setattr(module, "is_valid_url", lambda _url: True) - - class _Parser: - def parser_txt(self, page_source): - assert "page" in page_source - return ["section1", "section2"] - - monkeypatch.setattr(module, "RAGFlowHtmlParser", lambda: _Parser()) - queue.append(_FakeDriver([_Req({"x": "1"}), _Req({"y": "2"})], "page")) - - async def req_url_html(): - return {"url": "http://example.com/html"} - - monkeypatch.setattr(module, "get_request_json", req_url_html) - res = _run(module.parse()) - assert res["code"] == 0 - assert res["data"] == "section1\nsection2" - assert created[-1].quit_called is True - - (tmp_path / "logs" / "downloads").mkdir(parents=True, exist_ok=True) - (tmp_path / "logs" / "downloads" / "doc.txt").write_bytes(b"downloaded-bytes") - queue.append(_FakeDriver([_Req({"content-disposition": 'attachment; filename="doc.txt"'})], "file")) - captured = {} - - def parse_docs_read(files, _uid): - captured["filename"] = files[0].filename - captured["content"] = files[0].read() - return "parsed-download" - - monkeypatch.setattr(module.FileService, "parse_docs", parse_docs_read) - - async def req_url_file(): - return {"url": "http://example.com/file"} - - monkeypatch.setattr(module, "get_request_json", req_url_file) - res = _run(module.parse()) - assert res["code"] == 0 - assert res["data"] == "parsed-download" - assert captured["filename"] == "doc.txt" - assert captured["content"] == b"downloaded-bytes" - - async def req_no_url(): - return {} - - monkeypatch.setattr(module, "get_request_json", req_no_url) - monkeypatch.setattr(module, "request", _DummyRequest(files=_DummyFiles())) - res = _run(module.parse()) - assert res["code"] == module.RetCode.ARGUMENT_ERROR - assert res["message"] == "No file part!" - - monkeypatch.setattr(module, "request", _DummyRequest(files=_DummyFiles({"file": [_DummyFile("f1.txt")]}))) - monkeypatch.setattr(module.FileService, "parse_docs", lambda _files, _uid: "parsed-upload") - res = _run(module.parse()) - assert res["code"] == 0 - assert res["data"] == "parsed-upload" - @pytest.mark.p2 class TestWebCrawlUnit: