mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-05-25 18:36:59 +08:00
Refactor: remove the doc-parse API that is no longer used (#14367)
### What problem does this PR solve? Delete the unused API "POST /v1/document/parse". ### Type of change - [x] Refactoring
This commit is contained in:
@ -14,9 +14,7 @@
|
||||
# limitations under the License
|
||||
#
|
||||
import logging
|
||||
import os.path
|
||||
import re
|
||||
from pathlib import PurePosixPath, PureWindowsPath
|
||||
|
||||
from quart import make_response, request
|
||||
|
||||
@ -26,7 +24,6 @@ from api.db import FileType
|
||||
from api.db.db_models import Task
|
||||
from api.db.services.document_service import DocumentService, doc_upload_and_parse
|
||||
from api.db.services.file2document_service import File2DocumentService
|
||||
from api.db.services.file_service import FileService
|
||||
from api.db.services.knowledgebase_service import KnowledgebaseService
|
||||
from api.db.services.task_service import TaskService, cancel_all_task_of
|
||||
from api.utils.api_utils import (
|
||||
@ -36,28 +33,13 @@ from api.utils.api_utils import (
|
||||
server_error_response,
|
||||
validate_request,
|
||||
)
|
||||
from api.utils.web_utils import CONTENT_TYPE_MAP, apply_safe_file_response_headers, is_valid_url
|
||||
from api.utils.web_utils import CONTENT_TYPE_MAP, apply_safe_file_response_headers
|
||||
from common import settings
|
||||
from common.constants import RetCode, TaskStatus
|
||||
from common.file_utils import get_project_base_directory
|
||||
from common.misc_utils import thread_pool_exec
|
||||
from deepdoc.parser.html_parser import RAGFlowHtmlParser
|
||||
from rag.nlp import search
|
||||
|
||||
|
||||
def _is_safe_download_filename(name: str) -> bool:
|
||||
if not name or name in {".", ".."}:
|
||||
return False
|
||||
if "\x00" in name or len(name) > 255:
|
||||
return False
|
||||
if name != PurePosixPath(name).name:
|
||||
return False
|
||||
if name != PureWindowsPath(name).name:
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
|
||||
@manager.route("/thumbnails", methods=["GET"]) # noqa: F821
|
||||
# @login_required
|
||||
def thumbnails():
|
||||
@ -339,62 +321,3 @@ async def upload_and_parse():
|
||||
form = await request.form
|
||||
doc_ids = doc_upload_and_parse(form.get("conversation_id"), file_objs, current_user.id)
|
||||
return get_json_result(data=doc_ids)
|
||||
|
||||
|
||||
@manager.route("/parse", methods=["POST"])  # noqa: F821
@login_required
async def parse():
    """Parse either a remote URL or uploaded files and return the extracted text.

    If the JSON body carries a non-empty ``url``, the URL is validated and
    opened in a headless Chrome session (selenium-wire). When more than one
    response was captured, the rendered page source is run through
    ``RAGFlowHtmlParser`` and the sections are returned joined by newlines.
    Otherwise the captured response headers are searched for a
    ``filename="..."`` entry, and the file of that name in the download
    directory is passed to ``FileService.parse_docs``.

    Without a ``url``, the multipart ``file`` parts of the request are passed
    to ``FileService.parse_docs``.

    Returns:
        The result of ``get_json_result``: parsed text on success, or
        ``data=False`` with ``RetCode.ARGUMENT_ERROR`` for an invalid URL, an
        unidentifiable or unsafe downloaded filename, or a missing ``file`` part.
    """
    req = await get_request_json()
    url = req.get("url", "")
    if url:
        if not is_valid_url(url):
            return get_json_result(data=False, message="The URL format is invalid", code=RetCode.ARGUMENT_ERROR)
        download_path = os.path.join(get_project_base_directory(), "logs/downloads")
        os.makedirs(download_path, exist_ok=True)
        # Imported lazily so the dependency is only needed when a URL is parsed.
        from seleniumwire.webdriver import Chrome, ChromeOptions

        options = ChromeOptions()
        options.add_argument("--headless")
        options.add_argument("--disable-gpu")
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")
        options.add_experimental_option(
            "prefs",
            {
                "download.default_directory": download_path,
                "download.prompt_for_download": False,
                "download.directory_upgrade": True,
                "safebrowsing.enabled": True,
            },
        )
        # NOTE(review): these webdriver calls are synchronous and run inside the
        # coroutine; presumably they block the event loop while the page loads.
        driver = Chrome(options=options)
        page_source = None
        try:
            driver.get(url)
            res_headers = [r.response.headers for r in driver.requests if r and r.response]
            # More than one captured response is treated as a rendered web page.
            is_web_page = len(res_headers) > 1
            if is_web_page:
                page_source = driver.page_source
        finally:
            # Always release the browser: on the download branch, on the error
            # returns below, and when navigation raises.
            driver.quit()

        if is_web_page:
            sections = RAGFlowHtmlParser().parser_txt(page_source)
            return get_json_result(data="\n".join(sections))

        class File:
            """Minimal file-like wrapper around a downloaded file on disk."""

            filename: str
            filepath: str

            def __init__(self, filename, filepath):
                self.filename = filename
                self.filepath = filepath

            def read(self):
                """Return the full content of the file as bytes."""
                with open(self.filepath, "rb") as f:
                    return f.read()

        # The downloaded file's name is recovered from the captured headers.
        r = re.search(r"filename=\"([^\"]+)\"", str(res_headers))
        if not r or not r.group(1):
            return get_json_result(data=False, message="Can't identify downloaded file", code=RetCode.ARGUMENT_ERROR)
        filename = r.group(1).strip()
        # The name comes from a remote server; refuse anything that is not a
        # plain file name before joining it to the download directory.
        if not _is_safe_download_filename(filename):
            return get_json_result(data=False, message="Invalid downloaded filename", code=RetCode.ARGUMENT_ERROR)
        filepath = os.path.join(download_path, filename)
        f = File(filename, filepath)
        txt = FileService.parse_docs([f], current_user.id)
        return get_json_result(data=txt)

    files = await request.files
    if "file" not in files:
        return get_json_result(data=False, message="No file part!", code=RetCode.ARGUMENT_ERROR)

    file_objs = files.getlist("file")
    txt = FileService.parse_docs(file_objs, current_user.id)

    return get_json_result(data=txt)
|
||||
|
||||
@ -196,8 +196,7 @@ class TestDocumentsUpload:
|
||||
|
||||
|
||||
import asyncio
|
||||
import sys
|
||||
from types import ModuleType, SimpleNamespace
|
||||
from types import SimpleNamespace
|
||||
|
||||
|
||||
class _AwaitableValue:
|
||||
@ -329,122 +328,6 @@ class TestDocumentsUploadUnit:
|
||||
assert res["code"] == 0
|
||||
assert res["data"] == ["doc-1"]
|
||||
|
||||
def test_parse_url_and_multipart_matrix_unit(self, document_app_module, monkeypatch, tmp_path):
    """Drive ``parse()`` through its URL and multipart branches using fakes.

    Scenarios, in order: a URL rejected by validation, a URL rendered as an
    HTML page, a URL that results in a downloaded file, a multipart request
    without a ``file`` part, and a multipart upload that gets parsed.
    """
    module = document_app_module

    def _json_body(payload):
        # Async stand-in for get_request_json that yields a copy of ``payload``.
        async def _get_request_json():
            return dict(payload)

        return _get_request_json

    # --- URL that fails validation -------------------------------------
    monkeypatch.setattr(module, "get_request_json", _json_body({"url": "not-a-url"}))
    monkeypatch.setattr(module, "is_valid_url", lambda _url: False)
    res = _run(module.parse())
    assert res["code"] == module.RetCode.ARGUMENT_ERROR
    assert res["message"] == "The URL format is invalid"

    # --- Fake selenium-wire so no real browser is started ----------------
    class _FakeChromeOptions:
        def __init__(self):
            self.args = []
            self.experimental = {}

        def add_argument(self, arg):
            self.args.append(arg)

        def add_experimental_option(self, key, value):
            self.experimental[key] = value

    def _captured_request(headers):
        # Mimics a captured request: only ``.response.headers`` is read.
        return SimpleNamespace(response=SimpleNamespace(headers=headers))

    class _FakeDriver:
        def __init__(self, requests, page_source):
            self.requests = requests
            self.page_source = page_source
            self.quit_called = False
            self.visited = []
            self.options = None

        def get(self, url):
            self.visited.append(url)

        def quit(self):
            self.quit_called = True

    pending_drivers = []
    launched = []

    def _fake_chrome(options=None):
        # Hand out the next prepared driver and remember that it was started.
        driver = pending_drivers.pop(0)
        driver.options = options
        launched.append(driver)
        return driver

    webdriver_mod = ModuleType("seleniumwire.webdriver")
    webdriver_mod.Chrome = _fake_chrome
    webdriver_mod.ChromeOptions = _FakeChromeOptions

    seleniumwire_mod = ModuleType("seleniumwire")
    seleniumwire_mod.webdriver = webdriver_mod
    monkeypatch.setitem(sys.modules, "seleniumwire", seleniumwire_mod)
    monkeypatch.setitem(sys.modules, "seleniumwire.webdriver", webdriver_mod)
    monkeypatch.setattr(module, "get_project_base_directory", lambda: str(tmp_path))
    monkeypatch.setattr(module, "is_valid_url", lambda _url: True)

    # --- URL rendered as an HTML page -----------------------------------
    class _Parser:
        def parser_txt(self, page_source):
            assert "page" in page_source
            return ["section1", "section2"]

    monkeypatch.setattr(module, "RAGFlowHtmlParser", _Parser)
    html_driver = _FakeDriver(
        [_captured_request({"x": "1"}), _captured_request({"y": "2"})],
        "<html>page</html>",
    )
    pending_drivers.append(html_driver)

    monkeypatch.setattr(module, "get_request_json", _json_body({"url": "http://example.com/html"}))
    res = _run(module.parse())
    assert res["code"] == 0
    assert res["data"] == "section1\nsection2"
    assert launched[-1].quit_called is True

    # --- URL that produces a downloaded file ----------------------------
    download_dir = tmp_path / "logs" / "downloads"
    download_dir.mkdir(parents=True, exist_ok=True)
    (download_dir / "doc.txt").write_bytes(b"downloaded-bytes")
    file_driver = _FakeDriver(
        [_captured_request({"content-disposition": 'attachment; filename="doc.txt"'})],
        "<html>file</html>",
    )
    pending_drivers.append(file_driver)
    captured = {}

    def parse_docs_read(files, _uid):
        first = files[0]
        captured["filename"] = first.filename
        captured["content"] = first.read()
        return "parsed-download"

    monkeypatch.setattr(module.FileService, "parse_docs", parse_docs_read)

    monkeypatch.setattr(module, "get_request_json", _json_body({"url": "http://example.com/file"}))
    res = _run(module.parse())
    assert res["code"] == 0
    assert res["data"] == "parsed-download"
    assert captured["filename"] == "doc.txt"
    assert captured["content"] == b"downloaded-bytes"

    # --- Multipart request without a "file" part ------------------------
    monkeypatch.setattr(module, "get_request_json", _json_body({}))
    monkeypatch.setattr(module, "request", _DummyRequest(files=_DummyFiles()))
    res = _run(module.parse())
    assert res["code"] == module.RetCode.ARGUMENT_ERROR
    assert res["message"] == "No file part!"

    # --- Multipart upload that is parsed --------------------------------
    uploaded = _DummyFiles({"file": [_DummyFile("f1.txt")]})
    monkeypatch.setattr(module, "request", _DummyRequest(files=uploaded))
    monkeypatch.setattr(module.FileService, "parse_docs", lambda _files, _uid: "parsed-upload")
    res = _run(module.parse())
    assert res["code"] == 0
    assert res["data"] == "parsed-upload"
|
||||
|
||||
|
||||
@pytest.mark.p2
|
||||
class TestWebCrawlUnit:
|
||||
|
||||
Reference in New Issue
Block a user