Fix docx segment image URLs

This commit is contained in:
Yanli 盐粒
2026-02-09 20:14:52 +08:00
parent e4ab6e0919
commit fc91a7a38b
6 changed files with 38 additions and 43 deletions

View File

@@ -114,7 +114,6 @@ class PdfExtractor(BaseExtractor):
"""
image_content = []
upload_files = []
base_url = dify_config.INTERNAL_FILES_URL or dify_config.FILES_URL
try:
image_objects = page.get_objects(filter=(pdfium_c.FPDF_PAGEOBJ_IMAGE,))
@@ -164,7 +163,7 @@ class PdfExtractor(BaseExtractor):
used_at=naive_utc_now(),
)
upload_files.append(upload_file)
image_content.append(f"![image]({base_url}/files/{upload_file.id}/file-preview)")
image_content.append(f"![image](/files/{upload_file.id}/file-preview)")
except Exception as e:
logger.warning("Failed to extract image from PDF: %s", e)
continue

View File

@@ -87,7 +87,6 @@ class WordExtractor(BaseExtractor):
def _extract_images_from_docx(self, doc):
image_count = 0
image_map = {}
base_url = dify_config.INTERNAL_FILES_URL or dify_config.FILES_URL
for r_id, rel in doc.part.rels.items():
if "image" in rel.target_ref:
@@ -126,7 +125,7 @@ class WordExtractor(BaseExtractor):
used_at=naive_utc_now(),
)
db.session.add(upload_file)
image_map[r_id] = f"![image]({base_url}/files/{upload_file.id}/file-preview)"
image_map[r_id] = f"![image](/files/{upload_file.id}/file-preview)"
else:
image_ext = rel.target_ref.split(".")[-1]
if image_ext is None:
@@ -154,7 +153,7 @@ class WordExtractor(BaseExtractor):
used_at=naive_utc_now(),
)
db.session.add(upload_file)
image_map[rel.target_part] = f"![image]({base_url}/files/{upload_file.id}/file-preview)"
image_map[rel.target_part] = f"![image](/files/{upload_file.id}/file-preview)"
db.session.commit()
return image_map

View File

@@ -809,7 +809,7 @@ class DocumentSegment(Base):
text = self.content
# For data before v0.10.0
pattern = r"/files/([a-f0-9\-]+)/image-preview(?:\?.*?)?"
pattern = r"(?:https?://[^\s\)\"\']+)?/files/([a-f0-9\-]+)/image-preview(?:\?[^\s\)\"\']*)?"
matches = re.finditer(pattern, text)
for match in matches:
upload_file_id = match.group(1)
@@ -826,7 +826,7 @@ class DocumentSegment(Base):
signed_urls.append((match.start(), match.end(), signed_url))
# For data after v0.10.0
pattern = r"/files/([a-f0-9\-]+)/file-preview(?:\?.*?)?"
pattern = r"(?:https?://[^\s\)\"\']+)?/files/([a-f0-9\-]+)/file-preview(?:\?[^\s\)\"\']*)?"
matches = re.finditer(pattern, text)
for match in matches:
upload_file_id = match.group(1)

View File

@@ -87,7 +87,7 @@ def test_extract_images_formats(mock_dependencies, monkeypatch, image_bytes, exp
mock_raw.FPDF_PAGEOBJ_IMAGE = 1
result = extractor._extract_images(mock_page)
assert f"![image](http://files.local/files/{file_id}/file-preview)" in result
assert f"![image](/files/{file_id}/file-preview)" in result
assert len(saves) == 1
assert saves[0][1] == image_bytes
assert len(db_stub.session.added) == 1
@@ -180,7 +180,7 @@ def test_extract_images_failures(mock_dependencies):
result = extractor._extract_images(mock_page)
# Should have one success
assert "![image](http://files.local/files/test_file_id/file-preview)" in result
assert "![image](/files/test_file_id/file-preview)" in result
assert len(saves) == 1
assert saves[0][1] == jpeg_bytes
assert db_stub.session.committed is True

View File

@@ -123,6 +123,7 @@ def test_extract_images_from_docx(monkeypatch):
# Patch config values used for URL composition and storage type
monkeypatch.setattr(we.dify_config, "FILES_URL", "http://files.local", raising=False)
monkeypatch.setattr(we.dify_config, "INTERNAL_FILES_URL", "http://internal.docker:5001", raising=False)
monkeypatch.setattr(we.dify_config, "STORAGE_TYPE", "local", raising=False)
# Patch UploadFile to avoid real DB models
@@ -164,7 +165,7 @@ def test_extract_images_from_docx(monkeypatch):
# Returned map should contain entries for external (keyed by rId) and internal (keyed by target_part)
assert set(image_map.keys()) == {"rId1", internal_part}
assert all(v.startswith("![image](") and v.endswith("/file-preview)") for v in image_map.values())
assert all(v.startswith("![image](/files/") and v.endswith("/file-preview)") for v in image_map.values())
# Storage should receive both payloads
payloads = {data for _, data in saves}
@@ -176,39 +177,6 @@ def test_extract_images_from_docx(monkeypatch):
assert db_stub.session.committed is True
def test_extract_images_from_docx_uses_internal_files_url():
"""Test that INTERNAL_FILES_URL takes precedence over FILES_URL for plugin access."""
# Test the URL generation logic directly
from configs import dify_config
# Mock the configuration values
original_files_url = getattr(dify_config, "FILES_URL", None)
original_internal_files_url = getattr(dify_config, "INTERNAL_FILES_URL", None)
try:
# Set both URLs - INTERNAL should take precedence
dify_config.FILES_URL = "http://external.example.com"
dify_config.INTERNAL_FILES_URL = "http://internal.docker:5001"
# Test the URL generation logic (same as in word_extractor.py)
upload_file_id = "test_file_id"
# This is the pattern we fixed in the word extractor
base_url = dify_config.INTERNAL_FILES_URL or dify_config.FILES_URL
generated_url = f"{base_url}/files/{upload_file_id}/file-preview"
# Verify that INTERNAL_FILES_URL is used instead of FILES_URL
assert "http://internal.docker:5001" in generated_url, f"Expected internal URL, got: {generated_url}"
assert "http://external.example.com" not in generated_url, f"Should not use external URL, got: {generated_url}"
finally:
# Restore original values
if original_files_url is not None:
dify_config.FILES_URL = original_files_url
if original_internal_files_url is not None:
dify_config.INTERNAL_FILES_URL = original_internal_files_url
def test_extract_hyperlinks(monkeypatch):
# Mock db and storage to avoid issues during image extraction (even if no images are present)
monkeypatch.setattr(we, "storage", SimpleNamespace(save=lambda k, d: None))

View File

@@ -547,6 +547,35 @@ class TestDocumentSegmentIndexing:
assert segment.index_node_hash == index_node_hash
assert segment.keywords == keywords
def test_document_segment_sign_content_strips_absolute_files_host(self):
"""Test that sign_content strips scheme/host from absolute /files URLs and returns a signed relative URL."""
# Arrange
upload_file_id = "1602650a-4fe4-423c-85a2-af76c083e3c4"
segment = DocumentSegment(
tenant_id=str(uuid4()),
dataset_id=str(uuid4()),
document_id=str(uuid4()),
position=1,
content=f"![image](http://internal.docker:5001/files/{upload_file_id}/file-preview)",
word_count=1,
tokens=1,
created_by=str(uuid4()),
)
import models.dataset as dataset_module
# Act
with patch.object(dataset_module.dify_config, "SECRET_KEY", "secret", create=True), patch(
"models.dataset.time.time", return_value=1700000000
), patch("models.dataset.os.urandom", return_value=b"\x00" * 16):
signed = segment.get_sign_content()
# Assert
assert "internal.docker:5001" not in signed
assert f"/files/{upload_file_id}/file-preview?timestamp=" in signed
assert "&nonce=" in signed
assert "&sign=" in signed
def test_document_segment_with_answer_field(self):
"""Test creating a document segment with answer field for QA model."""
# Arrange