fix: handle backslash path separators in DOCX ZIP entries exported on…(#33129) (#33131)

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
2026-03-30 02:20:16 +08:00 · 2026-03-09 10:49:42 +08:00
parent 2cc0de9c1b
commit 322cd37de1
2 changed files with 92 additions and 1 deletions
--- a/api/dify_graph/nodes/document_extractor/node.py
+++ b/api/dify_graph/nodes/document_extractor/node.py
@ -4,6 +4,7 @@ import json
 import logging
 import os
 import tempfile
+import zipfile
 from collections.abc import Mapping, Sequence
 from typing import TYPE_CHECKING, Any

@ -385,6 +386,32 @@ def parser_docx_part(block, doc: Document, content_items, i):
        content_items.append((i, "table", Table(block, doc)))


+def _normalize_docx_zip(file_content: bytes) -> bytes:
+    """
+    Some DOCX files (e.g. exported by Evernote on Windows) are malformed:
+    ZIP entry names use backslash (\\) as path separator instead of the forward
+    slash (/) required by both the ZIP spec and OOXML.  On Linux/Mac the entry
+    "word\\document.xml" is never found when python-docx looks for
+    "word/document.xml", which triggers a KeyError about a missing relationship.
+
+    This function rewrites the ZIP in-memory, normalizing all entry names to
+    use forward slashes without touching any actual document content.
+    """
+    try:
+        with zipfile.ZipFile(io.BytesIO(file_content), "r") as zin:
+            out_buf = io.BytesIO()
+            with zipfile.ZipFile(out_buf, "w", compression=zipfile.ZIP_DEFLATED) as zout:
+                for item in zin.infolist():
+                    data = zin.read(item.filename)
+                    # Normalize backslash path separators to forward slash
+                    item.filename = item.filename.replace("\\", "/")
+                    zout.writestr(item, data)
+            return out_buf.getvalue()
+    except zipfile.BadZipFile:
+        # Not a valid zip — return as-is and let python-docx report the real error
+        return file_content
+
+
 def _extract_text_from_docx(file_content: bytes) -> str:
    """
    Extract text from a DOCX file.
@ -392,7 +419,15 @@ def _extract_text_from_docx(file_content: bytes) -> str:
    """
    try:
        doc_file = io.BytesIO(file_content)
-        doc = docx.Document(doc_file)
+        try:
+            doc = docx.Document(doc_file)
+        except Exception as e:
+            logger.warning("Failed to parse DOCX, attempting to normalize ZIP entry paths: %s", e)
+            # Some DOCX files exported by tools like Evernote on Windows use
+            # backslash path separators in ZIP entries and/or single-quoted XML
+            # attributes, both of which break python-docx on Linux. Normalize and retry.
+            file_content = _normalize_docx_zip(file_content)
+            doc = docx.Document(io.BytesIO(file_content))
        text = []

        # Keep track of paragraph and table positions