mirror of
https://github.com/langgenius/dify.git
synced 2026-05-04 01:18:05 +08:00
Fix docx segment image URLs
This commit is contained in:
@ -114,7 +114,6 @@ class PdfExtractor(BaseExtractor):
|
||||
"""
|
||||
image_content = []
|
||||
upload_files = []
|
||||
base_url = dify_config.INTERNAL_FILES_URL or dify_config.FILES_URL
|
||||
|
||||
try:
|
||||
image_objects = page.get_objects(filter=(pdfium_c.FPDF_PAGEOBJ_IMAGE,))
|
||||
@ -164,7 +163,7 @@ class PdfExtractor(BaseExtractor):
|
||||
used_at=naive_utc_now(),
|
||||
)
|
||||
upload_files.append(upload_file)
|
||||
image_content.append(f"")
|
||||
image_content.append(f"")
|
||||
except Exception as e:
|
||||
logger.warning("Failed to extract image from PDF: %s", e)
|
||||
continue
|
||||
|
||||
@ -87,7 +87,6 @@ class WordExtractor(BaseExtractor):
|
||||
def _extract_images_from_docx(self, doc):
|
||||
image_count = 0
|
||||
image_map = {}
|
||||
base_url = dify_config.INTERNAL_FILES_URL or dify_config.FILES_URL
|
||||
|
||||
for r_id, rel in doc.part.rels.items():
|
||||
if "image" in rel.target_ref:
|
||||
@ -126,7 +125,7 @@ class WordExtractor(BaseExtractor):
|
||||
used_at=naive_utc_now(),
|
||||
)
|
||||
db.session.add(upload_file)
|
||||
image_map[r_id] = f""
|
||||
image_map[r_id] = f""
|
||||
else:
|
||||
image_ext = rel.target_ref.split(".")[-1]
|
||||
if image_ext is None:
|
||||
@ -154,7 +153,7 @@ class WordExtractor(BaseExtractor):
|
||||
used_at=naive_utc_now(),
|
||||
)
|
||||
db.session.add(upload_file)
|
||||
image_map[rel.target_part] = f""
|
||||
image_map[rel.target_part] = f""
|
||||
db.session.commit()
|
||||
return image_map
|
||||
|
||||
|
||||
@ -809,7 +809,7 @@ class DocumentSegment(Base):
|
||||
text = self.content
|
||||
|
||||
# For data before v0.10.0
|
||||
pattern = r"/files/([a-f0-9\-]+)/image-preview(?:\?.*?)?"
|
||||
pattern = r"(?:https?://[^\s\)\"\']+)?/files/([a-f0-9\-]+)/image-preview(?:\?[^\s\)\"\']*)?"
|
||||
matches = re.finditer(pattern, text)
|
||||
for match in matches:
|
||||
upload_file_id = match.group(1)
|
||||
@ -826,7 +826,7 @@ class DocumentSegment(Base):
|
||||
signed_urls.append((match.start(), match.end(), signed_url))
|
||||
|
||||
# For data after v0.10.0
|
||||
pattern = r"/files/([a-f0-9\-]+)/file-preview(?:\?.*?)?"
|
||||
pattern = r"(?:https?://[^\s\)\"\']+)?/files/([a-f0-9\-]+)/file-preview(?:\?[^\s\)\"\']*)?"
|
||||
matches = re.finditer(pattern, text)
|
||||
for match in matches:
|
||||
upload_file_id = match.group(1)
|
||||
|
||||
@ -87,7 +87,7 @@ def test_extract_images_formats(mock_dependencies, monkeypatch, image_bytes, exp
|
||||
mock_raw.FPDF_PAGEOBJ_IMAGE = 1
|
||||
result = extractor._extract_images(mock_page)
|
||||
|
||||
assert f"" in result
|
||||
assert f"" in result
|
||||
assert len(saves) == 1
|
||||
assert saves[0][1] == image_bytes
|
||||
assert len(db_stub.session.added) == 1
|
||||
@ -180,7 +180,7 @@ def test_extract_images_failures(mock_dependencies):
|
||||
result = extractor._extract_images(mock_page)
|
||||
|
||||
# Should have one success
|
||||
assert "" in result
|
||||
assert "" in result
|
||||
assert len(saves) == 1
|
||||
assert saves[0][1] == jpeg_bytes
|
||||
assert db_stub.session.committed is True
|
||||
|
||||
@ -123,6 +123,7 @@ def test_extract_images_from_docx(monkeypatch):
|
||||
|
||||
# Patch config values used for URL composition and storage type
|
||||
monkeypatch.setattr(we.dify_config, "FILES_URL", "http://files.local", raising=False)
|
||||
monkeypatch.setattr(we.dify_config, "INTERNAL_FILES_URL", "http://internal.docker:5001", raising=False)
|
||||
monkeypatch.setattr(we.dify_config, "STORAGE_TYPE", "local", raising=False)
|
||||
|
||||
# Patch UploadFile to avoid real DB models
|
||||
@ -164,7 +165,7 @@ def test_extract_images_from_docx(monkeypatch):
|
||||
|
||||
# Returned map should contain entries for external (keyed by rId) and internal (keyed by target_part)
|
||||
assert set(image_map.keys()) == {"rId1", internal_part}
|
||||
assert all(v.startswith(" and v.endswith("/file-preview)") for v in image_map.values())
|
||||
assert all(v.startswith(" and v.endswith("/file-preview)") for v in image_map.values())
|
||||
|
||||
# Storage should receive both payloads
|
||||
payloads = {data for _, data in saves}
|
||||
@ -176,39 +177,6 @@ def test_extract_images_from_docx(monkeypatch):
|
||||
assert db_stub.session.committed is True
|
||||
|
||||
|
||||
def test_extract_images_from_docx_uses_internal_files_url():
    """Test that INTERNAL_FILES_URL takes precedence over FILES_URL for plugin access.

    The URL-composition expression is evaluated directly in this test (it is
    not routed through the extractor), so what is pinned here is the
    ``INTERNAL_FILES_URL or FILES_URL`` precedence rule itself.
    """
    from configs import dify_config

    # Snapshot the current values so the shared config object can be put back
    # exactly as it was. A private sentinel distinguishes "attribute absent"
    # from "attribute present but None".
    missing = object()
    overrides = {
        "FILES_URL": "http://external.example.com",
        "INTERNAL_FILES_URL": "http://internal.docker:5001",
    }
    snapshot = {name: getattr(dify_config, name, missing) for name in overrides}

    try:
        # Set both URLs - INTERNAL should take precedence
        for name, value in overrides.items():
            setattr(dify_config, name, value)

        upload_file_id = "test_file_id"

        # Same expression the word extractor used to build the preview URL
        base_url = dify_config.INTERNAL_FILES_URL or dify_config.FILES_URL
        generated_url = f"{base_url}/files/{upload_file_id}/file-preview"

        # Verify that INTERNAL_FILES_URL is used instead of FILES_URL
        assert "http://internal.docker:5001" in generated_url, f"Expected internal URL, got: {generated_url}"
        assert "http://external.example.com" not in generated_url, f"Should not use external URL, got: {generated_url}"
    finally:
        # Restore unconditionally. Skipping the restore when the snapshot was
        # None would leave the test's URLs on the shared config and leak them
        # into every test that runs afterwards.
        for name, previous in snapshot.items():
            if previous is missing:
                delattr(dify_config, name)
            else:
                setattr(dify_config, name, previous)
|
||||
|
||||
|
||||
def test_extract_hyperlinks(monkeypatch):
|
||||
# Mock db and storage to avoid issues during image extraction (even if no images are present)
|
||||
monkeypatch.setattr(we, "storage", SimpleNamespace(save=lambda k, d: None))
|
||||
|
||||
@ -547,6 +547,35 @@ class TestDocumentSegmentIndexing:
|
||||
assert segment.index_node_hash == index_node_hash
|
||||
assert segment.keywords == keywords
|
||||
|
||||
def test_document_segment_sign_content_strips_absolute_files_host(self):
    """Test that sign_content strips scheme/host from absolute /files URLs and returns a signed relative URL."""
    # Arrange
    upload_file_id = "1602650a-4fe4-423c-85a2-af76c083e3c4"
    segment = DocumentSegment(
        tenant_id=str(uuid4()),
        dataset_id=str(uuid4()),
        document_id=str(uuid4()),
        position=1,
        # The content must embed an absolute preview URL on the internal host;
        # with an empty string there is nothing to sign and the assertions
        # below cannot hold.
        content=f"![image](http://internal.docker:5001/files/{upload_file_id}/file-preview)",
        word_count=1,
        tokens=1,
        created_by=str(uuid4()),
    )

    import models.dataset as dataset_module

    # Act
    # Secret key, clock and nonce source are pinned so signing is deterministic.
    with patch.object(dataset_module.dify_config, "SECRET_KEY", "secret", create=True), patch(
        "models.dataset.time.time", return_value=1700000000
    ), patch("models.dataset.os.urandom", return_value=b"\x00" * 16):
        signed = segment.get_sign_content()

    # Assert
    # The internal host must be gone, leaving a relative, signed preview URL.
    assert "internal.docker:5001" not in signed
    assert f"/files/{upload_file_id}/file-preview?timestamp=" in signed
    assert "&nonce=" in signed
    assert "&sign=" in signed
|
||||
|
||||
def test_document_segment_with_answer_field(self):
|
||||
"""Test creating a document segment with answer field for QA model."""
|
||||
# Arrange
|
||||
|
||||
Reference in New Issue
Block a user