fix: SSRF in WordExtractor URL download (credit to @EaEa0001) (#31678)

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
This commit is contained in:
盐粒 Yanli
2026-01-29 14:01:21 +08:00
committed by GitHub
parent c2473d85dc
commit dbfc47e8b0
4 changed files with 68 additions and 13 deletions

View File

@ -1,4 +1,7 @@
"""Abstract interface for document loader implementations."""
"""Word (.docx) document extractor used for RAG ingestion.
Supports local file paths and remote URLs (downloaded via `core.helper.ssrf_proxy`).
"""
import logging
import mimetypes
@ -8,7 +11,6 @@ import tempfile
import uuid
from urllib.parse import urlparse
import httpx
from docx import Document as DocxDocument
from docx.oxml.ns import qn
from docx.text.run import Run
@ -44,7 +46,7 @@ class WordExtractor(BaseExtractor):
# If file_path is not a local file but is a valid URL, download it to a temporary file and use that file's path instead
if not os.path.isfile(self.file_path) and self._is_valid_url(self.file_path):
response = httpx.get(self.file_path, timeout=None)
response = ssrf_proxy.get(self.file_path)
if response.status_code != 200:
response.close()
@ -55,6 +57,7 @@ class WordExtractor(BaseExtractor):
self.temp_file = tempfile.NamedTemporaryFile() # noqa SIM115
try:
self.temp_file.write(response.content)
self.temp_file.flush()
finally:
response.close()
self.file_path = self.temp_file.name