mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-05-22 00:50:10 +08:00
### What problem does this PR solve?

The POST /upload_info?url=<url> endpoint accepted a user-supplied URL and passed it directly to AsyncWebCrawler without any validation. There were no restrictions on URL scheme, destination hostname, or resolved IP address. This allowed any authenticated user to instruct the server to make outbound HTTP requests to internal infrastructure — including RFC 1918 private networks, loopback addresses, and cloud metadata services such as http://169.254.169.254 — effectively using the server as a proxy for internal network reconnaissance or credential theft.

This PR adds an SSRF guard (_validate_url_for_crawl) that runs before any crawl is initiated. It enforces an allowlist of safe schemes (http/https), resolves the hostname at validation time, and rejects any URL whose resolved IP falls within a private or reserved network range.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
77 lines
2.7 KiB
Python
77 lines
2.7 KiB
Python
#
|
|
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
#
|
|
from abc import ABC
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler
|
|
from agent.tools.base import ToolParamBase, ToolBase
|
|
|
|
|
|
class CrawlerParam(ToolParamBase):
    """
    Parameters for the Crawler component.

    Attributes set in ``__init__``:
        proxy: proxy value forwarded to the crawler; ``None`` by default.
        extract_type: which form of the crawl result is returned;
            ``"markdown"`` by default.
    """

    def __init__(self):
        super().__init__()
        self.extract_type = "markdown"
        self.proxy = None

    def check(self):
        """Validate that ``extract_type`` is one of the supported values."""
        allowed_types = ["html", "markdown", "content"]
        self.check_valid_value(self.extract_type, "Type of content from the crawler", allowed_types)
|
|
|
|
|
|
class Crawler(ToolBase, ABC):
    """
    Tool component that crawls a single URL with crawl4ai and outputs the
    page in the form selected by ``extract_type``.
    """

    component_name = "Crawler"

    def _run(self, history, **kwargs):
        """
        Build the target URL from the component input, check it with the SSRF
        guard, crawl it and wrap the outcome with ``be_output``.

        Returns ``be_output("URL not valid")`` when the guard raises
        ``ValueError``, and an "unexpected error" output when anything in the
        crawl step raises.
        """
        from common.ssrf_guard import assert_url_is_safe, pin_dns_global

        inputs = self.get_input()
        if "content" in inputs:
            url = " - ".join(inputs["content"])
        else:
            url = ""

        # The guard's return value is unpacked as a (hostname, ip) pair;
        # presumably the validated host and its resolved address.
        try:
            safe_host, safe_ip = assert_url_is_safe(url)
        except ValueError:
            return Crawler.be_output("URL not valid")

        try:
            # pin_dns_global is used (not thread-local) because crawl4ai resolves
            # DNS in asyncio executor threads that don't share thread-local state.
            with pin_dns_global(safe_host, safe_ip):
                page = asyncio.run(self.get_web(url))
            return Crawler.be_output(page)
        except Exception as exc:
            return Crawler.be_output(f"An unexpected error occurred: {exc!s}")

    async def get_web(self, url):
        """
        Crawl ``url`` and return the representation chosen by
        ``self._param.extract_type``.

        Returns ``None`` when cancellation is detected before or right after
        the crawl. Unrecognised extract types fall back to markdown.
        """
        cancel_tag = "Crawler async operation"
        if self.check_if_canceled(cancel_tag):
            return None

        proxy = self._param.proxy or None
        async with AsyncWebCrawler(verbose=True, proxy=proxy) as crawler:
            result = await crawler.arun(url=url, bypass_cache=True)

            # Re-check after the (potentially slow) crawl has finished.
            if self.check_if_canceled(cancel_tag):
                return None

            wanted = self._param.extract_type
            if wanted == "html":
                return result.cleaned_html
            if wanted == "content":
                return result.extracted_content
            # "markdown" and any unrecognised type both yield markdown.
            return result.markdown
|