mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-05-20 16:26:42 +08:00
### What problem does this PR solve?

The POST /upload_info?url=<url> endpoint accepted a user-supplied URL and passed it directly to AsyncWebCrawler without any validation. There were no restrictions on URL scheme, destination hostname, or resolved IP address. This allowed any authenticated user to instruct the server to make outbound HTTP requests to internal infrastructure — including RFC 1918 private networks, loopback addresses, and cloud metadata services such as http://169.254.169.254 — effectively using the server as a proxy for internal network reconnaissance or credential theft.

This PR adds an SSRF guard (_validate_url_for_crawl) that runs before any crawl is initiated. It enforces an allowlist of safe schemes (http/https), resolves the hostname at validation time, and rejects any URL whose resolved IP falls within a private or reserved network range.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
596 lines
24 KiB
Python
596 lines
24 KiB
Python
#
|
|
# Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License
|
|
#
|
|
import os.path
|
|
import re
|
|
from pathlib import Path, PurePosixPath, PureWindowsPath
|
|
|
|
from quart import make_response, request
|
|
|
|
from api.apps import current_user, login_required
|
|
from api.common.check_team_permission import check_kb_team_permission
|
|
from api.constants import FILE_NAME_LEN_LIMIT, IMG_BASE64_PREFIX
|
|
from api.db import FileType
|
|
from api.db.db_models import Task
|
|
from api.db.services import duplicate_name
|
|
from api.db.services.document_service import DocumentService, doc_upload_and_parse
|
|
from api.db.services.file2document_service import File2DocumentService
|
|
from api.db.services.file_service import FileService
|
|
from api.db.services.knowledgebase_service import KnowledgebaseService
|
|
from api.db.services.task_service import TaskService, cancel_all_task_of
|
|
from api.utils.api_utils import (
|
|
get_data_error_result,
|
|
get_json_result,
|
|
get_request_json,
|
|
server_error_response,
|
|
validate_request,
|
|
)
|
|
from api.utils.file_utils import filename_type, thumbnail
|
|
from api.utils.web_utils import CONTENT_TYPE_MAP, apply_safe_file_response_headers, html2pdf, is_valid_url
|
|
from common import settings
|
|
from common.constants import SANDBOX_ARTIFACT_BUCKET, ParserType, RetCode, TaskStatus
|
|
from common.file_utils import get_project_base_directory
|
|
from common.misc_utils import get_uuid, thread_pool_exec
|
|
from common.ssrf_guard import assert_url_is_safe
|
|
from deepdoc.parser.html_parser import RAGFlowHtmlParser
|
|
from rag.nlp import search
|
|
|
|
|
|
def _is_safe_download_filename(name: str) -> bool:
|
|
if not name or name in {".", ".."}:
|
|
return False
|
|
if "\x00" in name or len(name) > 255:
|
|
return False
|
|
if name != PurePosixPath(name).name:
|
|
return False
|
|
if name != PureWindowsPath(name).name:
|
|
return False
|
|
return True
|
|
|
|
|
|
@manager.route("/web_crawl", methods=["POST"])  # noqa: F821
@login_required
@validate_request("kb_id", "name", "url")
async def web_crawl():
    """Fetch a web page, convert it to PDF and store it as a document in a dataset.

    Form fields:
        kb_id: ID of the target dataset.
        name: Base name of the new document; ``.pdf`` is appended.
        url: Address of the page to fetch.

    Returns:
        A JSON result with ``data=True`` on success. An error result is
        returned when an argument is missing or invalid, the URL is rejected
        by the SSRF guard, the dataset cannot be found, the caller has no
        permission on the dataset, the download yields nothing, or storing
        the document fails.
    """
    form = await request.form
    kb_id = form.get("kb_id")
    if not kb_id:
        return get_json_result(data=False, message='Lack of "KB ID"', code=RetCode.ARGUMENT_ERROR)
    name = form.get("name")
    url = form.get("url")
    if not is_valid_url(url):
        return get_json_result(data=False, message="The URL format is invalid", code=RetCode.ARGUMENT_ERROR)
    # The server fetches this caller-supplied URL itself, so run the same SSRF
    # guard that upload_info applies before any outbound request is made.
    # NOTE(review): is_valid_url's own checks are not visible from this file.
    try:
        assert_url_is_safe(url)
    except Exception as exc:
        return server_error_response(exc)

    e, kb = KnowledgebaseService.get_by_id(kb_id)
    if not e:
        # Report a missing dataset as an error result (as `create` does)
        # instead of raising outside any try block.
        return get_data_error_result(message="Can't find this dataset!")
    if not check_kb_team_permission(kb, current_user.id):
        return get_json_result(data=False, message="No authorization.", code=RetCode.AUTHENTICATION_ERROR)

    # NOTE(review): html2pdf is called synchronously inside this coroutine;
    # consider dispatching it through thread_pool_exec if it is slow.
    blob = html2pdf(url)
    if not blob:
        return server_error_response(ValueError("Download failure."))

    root_folder = FileService.get_root_folder(current_user.id)
    pf_id = root_folder["id"]
    FileService.init_knowledgebase_docs(pf_id, current_user.id)
    kb_root_folder = FileService.get_kb_folder(current_user.id)
    kb_folder = FileService.new_a_file_from_kb(kb.tenant_id, kb.name, kb_root_folder["id"])

    try:
        filename = duplicate_name(DocumentService.query, name=name + ".pdf", kb_id=kb.id)
        filetype = filename_type(filename)
        if filetype == FileType.OTHER.value:
            raise RuntimeError("This type of file has not been supported yet!")

        # Pick a storage key that does not collide with an existing object.
        location = filename
        while settings.STORAGE_IMPL.obj_exist(kb_id, location):
            location += "_"
        settings.STORAGE_IMPL.put(kb_id, location, blob)
        doc = {
            "id": get_uuid(),
            "kb_id": kb.id,
            "parser_id": kb.parser_id,
            "parser_config": kb.parser_config,
            "created_by": current_user.id,
            "type": filetype,
            "name": filename,
            "location": location,
            "size": len(blob),
            "thumbnail": thumbnail(filename, blob),
            "suffix": Path(filename).suffix.lstrip("."),
        }
        # Override the dataset's default parser for file kinds that need a dedicated one.
        if doc["type"] == FileType.VISUAL:
            doc["parser_id"] = ParserType.PICTURE.value
        if doc["type"] == FileType.AURAL:
            doc["parser_id"] = ParserType.AUDIO.value
        if re.search(r"\.(ppt|pptx|pages)$", filename):
            doc["parser_id"] = ParserType.PRESENTATION.value
        if re.search(r"\.(eml)$", filename):
            doc["parser_id"] = ParserType.EMAIL.value
        DocumentService.insert(doc)
        FileService.add_file_from_kb(doc, kb_folder["id"], kb.tenant_id)
    except Exception as e:
        return server_error_response(e)
    return get_json_result(data=True)
|
|
|
|
|
|
@manager.route("/create", methods=["POST"])  # noqa: F821
@login_required
@validate_request("name", "kb_id")
async def create():
    """Create an empty virtual document inside a dataset.

    JSON body:
        name: Document name; surrounding whitespace is stripped. Must be a
            non-empty string of at most ``FILE_NAME_LEN_LIMIT`` UTF-8 bytes.
        kb_id: ID of the dataset that will own the document.

    Returns:
        A JSON result carrying the new document on success, or an error
        result when an argument is invalid, the dataset is missing, the caller
        has no permission on it, the name is already taken in the dataset, a
        required folder cannot be resolved, or persistence fails.
    """
    req = await get_request_json()
    kb_id = req["kb_id"]
    if not kb_id:
        return get_json_result(data=False, message='Lack of "KB ID"', code=RetCode.ARGUMENT_ERROR)

    name = req["name"]
    # A non-string name would otherwise fail on .encode()/.strip() outside the try block.
    if not isinstance(name, str):
        return get_json_result(data=False, message="File name must be a string.", code=RetCode.ARGUMENT_ERROR)
    if len(name.encode("utf-8")) > FILE_NAME_LEN_LIMIT:
        return get_json_result(data=False, message=f"File name must be {FILE_NAME_LEN_LIMIT} bytes or less.", code=RetCode.ARGUMENT_ERROR)

    if name.strip() == "":
        return get_json_result(data=False, message="File name can't be empty.", code=RetCode.ARGUMENT_ERROR)
    req["name"] = name.strip()

    try:
        e, kb = KnowledgebaseService.get_by_id(kb_id)
        if not e:
            return get_data_error_result(message="Can't find this dataset!")
        # Same dataset-level permission gate that web_crawl applies before
        # adding a document to a dataset.
        if not check_kb_team_permission(kb, current_user.id):
            return get_json_result(data=False, message="No authorization.", code=RetCode.AUTHENTICATION_ERROR)

        if DocumentService.query(name=req["name"], kb_id=kb_id):
            return get_data_error_result(message="Duplicated document name in the same dataset.")

        kb_root_folder = FileService.get_kb_folder(kb.tenant_id)
        if not kb_root_folder:
            return get_data_error_result(message="Cannot find the root folder.")
        kb_folder = FileService.new_a_file_from_kb(
            kb.tenant_id,
            kb.name,
            kb_root_folder["id"],
        )
        if not kb_folder:
            return get_data_error_result(message="Cannot find the kb folder for this file.")

        doc = DocumentService.insert(
            {
                "id": get_uuid(),
                "kb_id": kb.id,
                "parser_id": kb.parser_id,
                "pipeline_id": kb.pipeline_id,
                "parser_config": kb.parser_config,
                "created_by": current_user.id,
                "type": FileType.VIRTUAL,
                "name": req["name"],
                "suffix": Path(req["name"]).suffix.lstrip("."),
                "location": "",
                "size": 0,
            }
        )

        FileService.add_file_from_kb(doc.to_dict(), kb_folder["id"], kb.tenant_id)

        return get_json_result(data=doc.to_json())
    except Exception as e:
        return server_error_response(e)
|
|
|
|
|
|
@manager.route("/thumbnails", methods=["GET"])  # noqa: F821
# @login_required
def thumbnails():
    """Return a mapping of document ID to thumbnail for the requested documents.

    Query parameters:
        doc_ids: One or more document IDs (repeatable).

    Thumbnails that are non-empty and do not start with ``IMG_BASE64_PREFIX``
    are rewritten to a ``/v1/document/image/<kb_id>-<thumbnail>`` path; all
    other values are passed through unchanged.
    """
    requested_ids = request.args.getlist("doc_ids")
    if not requested_ids:
        return get_json_result(data=False, message='Lack of "Document ID"', code=RetCode.ARGUMENT_ERROR)

    try:
        records = DocumentService.get_thumbnails(requested_ids)

        # First pass: turn stored thumbnail keys into image-endpoint paths.
        for record in records:
            stored = record["thumbnail"]
            if not stored or stored.startswith(IMG_BASE64_PREFIX):
                continue
            record["thumbnail"] = f"/v1/document/image/{record['kb_id']}-{stored}"

        # Second pass: index the (possibly rewritten) thumbnails by document ID.
        by_id = {}
        for record in records:
            by_id[record["id"]] = record["thumbnail"]
        return get_json_result(data=by_id)
    except Exception as e:
        return server_error_response(e)
|
|
|
|
|
|
@manager.route("/change_status", methods=["POST"])  # noqa: F821
@login_required
@validate_request("doc_ids", "status")
async def change_status():
    """Set the status flag of several documents and report a per-document outcome.

    JSON body:
        doc_ids: Document IDs to update.
        status: ``0`` or ``1`` (compared after conversion to string).

    Returns:
        A JSON result whose data maps each document ID to either
        ``{"status": ...}`` or ``{"error": ...}``. When any document failed,
        the result carries the message "Partial failure" and
        ``RetCode.SERVER_ERROR``.
    """
    req = await get_request_json()
    doc_ids = req.get("doc_ids", [])
    status = str(req.get("status", ""))

    if status not in ("0", "1"):
        return get_json_result(data=False, message='"Status" must be either 0 or 1!', code=RetCode.ARGUMENT_ERROR)

    result = {}
    has_error = False

    def _fail(target_id, message):
        # Record a failure for one document and remember that the batch is partial.
        nonlocal has_error
        result[target_id] = {"error": message}
        has_error = True

    for doc_id in doc_ids:
        if not DocumentService.accessible(doc_id, current_user.id):
            _fail(doc_id, "No authorization.")
            continue

        try:
            found, doc = DocumentService.get_by_id(doc_id)
            if not found:
                _fail(doc_id, "No authorization.")
                continue
            found, kb = KnowledgebaseService.get_by_id(doc.kb_id)
            if not found:
                _fail(doc_id, "Can't find this dataset!")
                continue

            # Nothing to do when the document already has the requested status.
            if str(doc.status) == status:
                result[doc_id] = {"status": status}
                continue
            if not DocumentService.update_by_id(doc_id, {"status": status}):
                _fail(doc_id, "Database error (Document update)!")
                continue

            # Documents that have chunks also need the flag updated in the document store.
            if getattr(doc, "chunk_num", 0) > 0:
                try:
                    ok = settings.docStoreConn.update(
                        {"doc_id": doc_id},
                        {"available_int": int(status)},
                        search.index_name(kb.tenant_id),
                        doc.kb_id,
                    )
                except Exception as store_exc:
                    msg = str(store_exc)
                    if "3022" in msg:
                        _fail(doc_id, "Document store table missing.")
                    else:
                        _fail(doc_id, f"Document store update failed: {msg}")
                    continue
                if not ok:
                    _fail(doc_id, "Database error (docStore update)!")
                    continue
            result[doc_id] = {"status": status}
        except Exception as exc:
            _fail(doc_id, f"Internal server error: {str(exc)}")

    if not has_error:
        return get_json_result(data=result)
    return get_json_result(data=result, message="Partial failure", code=RetCode.SERVER_ERROR)
|
|
|
|
|
|
@manager.route("/run", methods=["POST"])  # noqa: F821
@login_required
@validate_request("doc_ids", "run")
async def run():
    """Start, restart or cancel processing for a list of documents.

    JSON body:
        doc_ids: Documents to act on.
        run: Target run state, compared (as a string) against ``TaskStatus`` values.
        delete: Optional; when truthy, existing tasks and stored chunks of
            each document are removed.
        apply_kb: Optional; when truthy and the target state is RUNNING, a few
            parser-config entries are copied from the dataset to the document.

    The work is dispatched through ``thread_pool_exec``. Every document must
    be accessible to the current user, otherwise nothing is processed.
    Documents are handled in order and the first failure returns immediately.
    """
    req = await get_request_json()
    uid = current_user.id
    try:

        def _run_sync():
            # Authorize the whole batch before touching any document.
            for candidate in req["doc_ids"]:
                if not DocumentService.accessible(candidate, uid):
                    return get_json_result(data=False, message="No authorization.", code=RetCode.AUTHENTICATION_ERROR)

            kb_table_num_map = {}
            for doc_id in req["doc_ids"]:
                run_flag = str(req["run"])
                starting = run_flag == TaskStatus.RUNNING.value
                delete_requested = req.get("delete", False)

                info = {"run": run_flag, "progress": 0}
                if starting and delete_requested:
                    info.update({"progress_msg": "", "chunk_num": 0, "token_num": 0})

                tenant_id = DocumentService.get_tenant_id(doc_id)
                if not tenant_id:
                    return get_data_error_result(message="Tenant not found!")
                found, doc = DocumentService.get_by_id(doc_id)
                if not found:
                    return get_data_error_result(message="Document not found!")

                if run_flag == TaskStatus.CANCEL.value:
                    tasks = list(TaskService.query(doc_id=doc_id))
                    has_unfinished_task = any((task.progress or 0) < 1 for task in tasks)
                    cancellable = str(doc.run) in (TaskStatus.RUNNING.value, TaskStatus.CANCEL.value) or has_unfinished_task
                    if not cancellable:
                        return get_data_error_result(message="Cannot cancel a task that is not in RUNNING status")
                    cancel_all_task_of(doc_id)

                # An absent "delete" key counts as true for this check only.
                if req.get("delete", True) and starting and str(doc.run) == TaskStatus.DONE.value:
                    DocumentService.clear_chunk_num_when_rerun(doc.id)

                DocumentService.update_by_id(doc_id, info)
                if delete_requested:
                    TaskService.filter_delete([Task.doc_id == doc_id])
                    if settings.docStoreConn.index_exist(search.index_name(tenant_id), doc.kb_id):
                        settings.docStoreConn.delete({"doc_id": doc_id}, search.index_name(tenant_id), doc.kb_id)

                if not starting:
                    continue

                if req.get("apply_kb"):
                    found, kb = KnowledgebaseService.get_by_id(doc.kb_id)
                    if not found:
                        raise LookupError("Can't find this dataset!")
                    doc.parser_config["llm_id"] = kb.parser_config.get("llm_id")
                    doc.parser_config["enable_metadata"] = kb.parser_config.get("enable_metadata", False)
                    doc.parser_config["metadata"] = kb.parser_config.get("metadata", {})
                    DocumentService.update_parser_config(doc.id, doc.parser_config)
                DocumentService.run(tenant_id, doc.to_dict(), kb_table_num_map)

            return get_json_result(data=True)

        return await thread_pool_exec(_run_sync)
    except Exception as e:
        return server_error_response(e)
|
|
|
|
|
|
@manager.route("/get/<doc_id>", methods=["GET"])  # noqa: F821
@login_required
async def get(doc_id):
    """Return the stored file content of a document.

    Args:
        doc_id: ID of the document, taken from the URL path.

    Returns:
        A response carrying the file bytes, with headers set by
        ``apply_safe_file_response_headers``. An error result is returned when
        the caller may not access the document, the document does not exist,
        or reading it fails.
    """
    try:
        # Gate on the same per-document access check the other document
        # endpoints in this module use, so file content is not served to
        # users who cannot access the document.
        if not DocumentService.accessible(doc_id, current_user.id):
            return get_json_result(data=False, message="No authorization.", code=RetCode.AUTHENTICATION_ERROR)

        e, doc = DocumentService.get_by_id(doc_id)
        if not e:
            return get_data_error_result(message="Document not found!")

        b, n = File2DocumentService.get_storage_address(doc_id=doc_id)
        data = await thread_pool_exec(settings.STORAGE_IMPL.get, b, n)
        response = await make_response(data)

        # Derive the extension (text after the last dot) from the lower-cased name.
        ext = re.search(r"\.([^.]+)$", doc.name.lower())
        ext = ext.group(1) if ext else None
        content_type = None
        if ext:
            # Unknown extensions fall back to image/<ext> for visual documents
            # and application/<ext> for everything else.
            fallback_prefix = "image" if doc.type == FileType.VISUAL.value else "application"
            content_type = CONTENT_TYPE_MAP.get(ext, f"{fallback_prefix}/{ext}")
        apply_safe_file_response_headers(response, content_type, ext)
        return response
    except Exception as e:
        return server_error_response(e)
|
|
|
|
|
|
@manager.route("/download/<attachment_id>", methods=["GET"])  # noqa: F821
@login_required
async def download_attachment(attachment_id):
    """Return an attachment stored under the current user's ID.

    Args:
        attachment_id: Storage key of the attachment, taken from the URL path.

    Query parameters:
        ext: Extension used to choose the content type; defaults to
            ``markdown``. Extensions missing from ``CONTENT_TYPE_MAP`` map to
            ``application/<ext>``.
    """
    try:
        extension = request.args.get("ext", "markdown")
        mime = CONTENT_TYPE_MAP.get(extension, f"application/{extension}")

        # The current user's ID is passed as the first storage argument.
        payload = await thread_pool_exec(settings.STORAGE_IMPL.get, current_user.id, attachment_id)
        response = await make_response(payload)
        apply_safe_file_response_headers(response, mime, extension)
        return response
    except Exception as e:
        return server_error_response(e)
|
|
|
|
|
|
@manager.route("/change_parser", methods=["POST"])  # noqa: F821
@login_required
@validate_request("doc_id")
async def change_parser():
    """Switch a document to a different pipeline or parser and reset its parse state.

    JSON body:
        doc_id: Document to change.
        pipeline_id: Optional; when present and non-empty the pipeline is switched.
        parser_id: Parser to switch to; read whenever the document is reset
            and whenever no non-empty ``pipeline_id`` is given.
        parser_config: Optional replacement parser configuration.

    Returns:
        A JSON result with ``data=True`` when nothing needed changing or the
        change succeeded. An error result is returned when the caller has no
        access, the document is missing, the parser is not supported for the
        document, or resetting the document fails.
    """
    req = await get_request_json()
    if not DocumentService.accessible(req["doc_id"], current_user.id):
        return get_json_result(data=False, message="No authorization.", code=RetCode.AUTHENTICATION_ERROR)

    e, doc = DocumentService.get_by_id(req["doc_id"])
    if not e:
        return get_data_error_result(message="Document not found!")

    def reset_doc():
        """Reset progress and counters and drop stored chunks; return an error result or None."""
        e = DocumentService.update_by_id(doc.id, {"pipeline_id": req["pipeline_id"], "parser_id": req["parser_id"], "progress": 0, "progress_msg": "", "run": TaskStatus.UNSTART.value})
        if not e:
            return get_data_error_result(message="Document not found!")
        if doc.token_num > 0:
            # Subtract this document's contribution by passing negated counters.
            e = DocumentService.increment_chunk_num(doc.id, doc.kb_id, doc.token_num * -1, doc.chunk_num * -1, doc.process_duration * -1)
            if not e:
                return get_data_error_result(message="Document not found!")
            tenant_id = DocumentService.get_tenant_id(req["doc_id"])
            if not tenant_id:
                return get_data_error_result(message="Tenant not found!")
            DocumentService.delete_chunk_images(doc, tenant_id)
            if settings.docStoreConn.index_exist(search.index_name(tenant_id), doc.kb_id):
                settings.docStoreConn.delete({"doc_id": doc.id}, search.index_name(tenant_id), doc.kb_id)
        return None

    try:
        if "pipeline_id" in req and req["pipeline_id"] != "":
            if doc.pipeline_id == req["pipeline_id"]:
                return get_json_result(data=True)
            DocumentService.update_by_id(doc.id, {"pipeline_id": req["pipeline_id"]})
            # Surface a failed reset instead of reporting success.
            reset_error = reset_doc()
            if reset_error is not None:
                return reset_error
            return get_json_result(data=True)

        if doc.parser_id.lower() == req["parser_id"].lower():
            # Same parser: only continue when a different parser_config was supplied.
            if "parser_config" in req:
                if req["parser_config"] == doc.parser_config:
                    return get_json_result(data=True)
            else:
                return get_json_result(data=True)

        if (doc.type == FileType.VISUAL and req["parser_id"] != "picture") or (re.search(r"\.(ppt|pptx|pages)$", doc.name) and req["parser_id"] != "presentation"):
            return get_data_error_result(message="Not supported yet!")
        if "parser_config" in req:
            DocumentService.update_parser_config(doc.id, req["parser_config"])
        # Surface a failed reset instead of reporting success.
        reset_error = reset_doc()
        if reset_error is not None:
            return reset_error
        return get_json_result(data=True)
    except Exception as e:
        return server_error_response(e)
|
|
|
|
|
|
@manager.route("/image/<image_id>", methods=["GET"])  # noqa: F821
# @login_required
async def get_image(image_id):
    """Return a stored image addressed as ``<bucket>-<name>``.

    Args:
        image_id: Two storage coordinates joined by a single hyphen; any
            other number of hyphen-separated parts yields "Image not found.".

    The response is always labelled ``image/JPEG``.
    """
    try:
        parts = image_id.split("-")
        if len(parts) != 2:
            return get_data_error_result(message="Image not found.")
        bucket, key = parts
        payload = await thread_pool_exec(settings.STORAGE_IMPL.get, bucket, key)
        response = await make_response(payload)
        response.headers.set("Content-Type", "image/JPEG")
        return response
    except Exception as e:
        return server_error_response(e)
|
|
|
|
|
|
# Allowed artifact file extensions (lower-case, including the leading dot)
# mapped to the Content-Type used when serving them. get_artifact rejects
# any extension that is not a key of this mapping.
ARTIFACT_CONTENT_TYPES = {
    # Images
    ".png": "image/png",
    ".jpg": "image/jpeg",
    ".jpeg": "image/jpeg",
    ".svg": "image/svg+xml",
    # Documents and data
    ".pdf": "application/pdf",
    ".csv": "text/csv",
    ".json": "application/json",
    ".html": "text/html",
}
|
|
|
|
|
|
@manager.route("/artifact/<filename>", methods=["GET"])  # noqa: F821
@login_required
async def get_artifact(filename):
    """Serve a file from the sandbox artifact bucket.

    Args:
        filename: Name of the artifact. It must be a bare file name (no
            ``/`` or ``\\`` and unchanged by ``os.path.basename``) whose
            lower-cased extension is a key of ``ARTIFACT_CONTENT_TYPES``.

    Returns:
        The artifact bytes with headers set by
        ``apply_safe_file_response_headers``; an inline Content-Disposition
        with a sanitized file name is added only when none is present after
        that call. An error result is returned for an invalid name, a
        disallowed extension, a missing artifact, or any exception.
    """
    try:
        basename = os.path.basename(filename)
        is_bare_name = basename == filename and "/" not in filename and "\\" not in filename
        if not is_bare_name:
            return get_data_error_result(message="Invalid filename.")

        _, ext = os.path.splitext(basename)
        ext = ext.lower()
        if ext not in ARTIFACT_CONTENT_TYPES:
            return get_data_error_result(message="Invalid file type.")

        data = await thread_pool_exec(settings.STORAGE_IMPL.get, SANDBOX_ARTIFACT_BUCKET, basename)
        if not data:
            return get_data_error_result(message="Artifact not found.")

        response = await make_response(data)
        # NOTE(review): ext keeps its leading dot here, while get() passes the
        # extension without one to the same helper - confirm which is expected.
        apply_safe_file_response_headers(response, ARTIFACT_CONTENT_TYPES[ext], ext)
        if not response.headers.get("Content-Disposition"):
            # Replace anything outside word characters, dots and hyphens.
            safe_filename = re.sub(r"[^\w.\-]", "_", basename)
            response.headers.set("Content-Disposition", f'inline; filename="{safe_filename}"')
        return response
    except Exception as e:
        return server_error_response(e)
|
|
|
|
|
|
@manager.route("/upload_and_parse", methods=["POST"])  # noqa: F821
@login_required
@validate_request("conversation_id")
async def upload_and_parse():
    """Upload one or more files for a conversation and parse them.

    Multipart fields:
        file: One or more files; each must have a non-empty file name.
        conversation_id: Conversation the files belong to.

    Returns:
        A JSON result carrying whatever ``doc_upload_and_parse`` returns, or
        an argument error when no file part or an unnamed file is submitted.
    """
    uploaded = await request.files
    if "file" not in uploaded:
        return get_json_result(data=False, message="No file part!", code=RetCode.ARGUMENT_ERROR)

    file_objs = uploaded.getlist("file")
    if any(item.filename == "" for item in file_objs):
        return get_json_result(data=False, message="No file selected!", code=RetCode.ARGUMENT_ERROR)

    form = await request.form
    conversation_id = form.get("conversation_id")
    return get_json_result(data=doc_upload_and_parse(conversation_id, file_objs, current_user.id))
|
|
|
|
|
|
@manager.route("/parse", methods=["POST"])  # noqa: F821
@login_required
async def parse():
    """Extract text either from a URL or from uploaded files.

    JSON body:
        url: Optional. When given, the page is loaded in a headless Chrome
            instance. If more than one response was captured, the page source
            is parsed as HTML; otherwise the response headers are searched
            for a ``filename="..."`` value and the file of that name in the
            download directory is parsed.

    Multipart fields (used when no URL is given):
        file: One or more files to parse.

    Returns:
        A JSON result carrying the extracted text, or an error result when
        the URL is invalid or rejected, the downloaded file cannot be
        identified or has an unsafe name, or no file part was submitted.
    """
    req = await get_request_json()
    url = req.get("url", "")
    if url:
        if not is_valid_url(url):
            return get_json_result(data=False, message="The URL format is invalid", code=RetCode.ARGUMENT_ERROR)
        # The browser below fetches this caller-supplied URL from the server,
        # so apply the same SSRF guard that upload_info uses.
        # NOTE(review): is_valid_url's own checks are not visible from this file.
        try:
            assert_url_is_safe(url)
        except Exception as exc:
            return server_error_response(exc)

        download_path = os.path.join(get_project_base_directory(), "logs/downloads")
        os.makedirs(download_path, exist_ok=True)
        from seleniumwire.webdriver import Chrome, ChromeOptions

        class File:
            """Minimal file-like wrapper around a downloaded file on disk."""

            filename: str
            filepath: str

            def __init__(self, filename, filepath):
                self.filename = filename
                self.filepath = filepath

            def read(self):
                with open(self.filepath, "rb") as f:
                    return f.read()

        options = ChromeOptions()
        options.add_argument("--headless")
        options.add_argument("--disable-gpu")
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")
        options.add_experimental_option("prefs", {"download.default_directory": download_path, "download.prompt_for_download": False, "download.directory_upgrade": True, "safebrowsing.enabled": True})
        driver = Chrome(options=options)
        # Always shut the browser down: previously it was only quit on the
        # HTML branch, so the download branch and any exception leaked it.
        try:
            driver.get(url)
            res_headers = [r.response.headers for r in driver.requests if r and r.response]
            if len(res_headers) > 1:
                sections = RAGFlowHtmlParser().parser_txt(driver.page_source)
                return get_json_result(data="\n".join(sections))

            r = re.search(r"filename=\"([^\"]+)\"", str(res_headers))
            if not r or not r.group(1):
                return get_json_result(data=False, message="Can't not identify downloaded file", code=RetCode.ARGUMENT_ERROR)
            filename = r.group(1).strip()
            # The name comes from a remote server's headers; refuse anything
            # that is not a bare file name before joining it onto a path.
            if not _is_safe_download_filename(filename):
                return get_json_result(data=False, message="Invalid downloaded filename", code=RetCode.ARGUMENT_ERROR)
            filepath = os.path.join(download_path, filename)
            f = File(filename, filepath)
            txt = FileService.parse_docs([f], current_user.id)
            return get_json_result(data=txt)
        finally:
            driver.quit()

    files = await request.files
    if "file" not in files:
        return get_json_result(data=False, message="No file part!", code=RetCode.ARGUMENT_ERROR)

    file_objs = files.getlist("file")
    txt = FileService.parse_docs(file_objs, current_user.id)

    return get_json_result(data=txt)
|
|
|
|
|
|
@manager.route("/upload_info", methods=["POST"])  # noqa: F821
@login_required
async def upload_info():
    """Register upload information for either uploaded files or a remote URL.

    Exactly one input kind must be supplied:
        file: One or more multipart files.
        url: Query parameter naming a remote resource; it is passed through
            ``assert_url_is_safe`` before use.

    Returns:
        A JSON result carrying the ``FileService.upload_info`` output: a
        single item for a URL or one file, a list for several files. A
        ``BAD_REQUEST`` result is returned when both or neither input kind is
        supplied; any exception is turned into a server error response.
    """
    uploaded = await request.files
    if uploaded and uploaded.get("file"):
        file_objs = uploaded.getlist("file")
    else:
        file_objs = []
    url = request.args.get("url")

    problem = None
    if file_objs and url:
        problem = "Provide either multipart file(s) or ?url=..., not both."
    elif not file_objs and not url:
        problem = "Missing input: provide multipart file(s) or url"
    if problem is not None:
        return get_json_result(
            data=False,
            message=problem,
            code=RetCode.BAD_REQUEST,
        )

    try:
        # Past the guards above, exactly one of url / file_objs is set.
        if url:
            assert_url_is_safe(url)
            payload = FileService.upload_info(current_user.id, None, url)
        elif len(file_objs) == 1:
            payload = FileService.upload_info(current_user.id, file_objs[0], None)
        else:
            payload = [FileService.upload_info(current_user.id, item, None) for item in file_objs]
        return get_json_result(data=payload)
    except Exception as e:
        return server_error_response(e)
|