mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-01-19 11:45:10 +08:00
### What problem does this PR solve?

Fixes #12604 - DOCX files containing hyperlinks to internal bookmarks (e.g., `#_文档目录`) cause a `KeyError` during parsing:

```
KeyError: "There is no item named 'word/#_文档目录' in the archive"
```

This happens because python-docx incorrectly tries to read internal bookmark references as files from the ZIP archive. Internal bookmarks are relationship targets starting with `#`; they refer to anchors inside the document and are not actual files in the archive.

This PR extends the existing `load_from_xml_v2` workaround (which already handles `NULL` targets) to also skip relationship targets starting with `#`.

Related upstream issue: https://github.com/python-openxml/python-docx/issues/902

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

---

Contribution by Gittensor, see my contribution statistics at https://gittensor.io/miners/details?githubId=94194147
This commit is contained in:
@@ -727,7 +727,7 @@ def load_from_xml_v2(baseURI, rels_item_xml):
|
|||||||
if rels_item_xml is not None:
|
if rels_item_xml is not None:
|
||||||
rels_elm = parse_xml(rels_item_xml)
|
rels_elm = parse_xml(rels_item_xml)
|
||||||
for rel_elm in rels_elm.Relationship_lst:
|
for rel_elm in rels_elm.Relationship_lst:
|
||||||
if rel_elm.target_ref in ("../NULL", "NULL"):
|
if rel_elm.target_ref in ("../NULL", "NULL") or rel_elm.target_ref.startswith("#"):
|
||||||
continue
|
continue
|
||||||
srels._srels.append(_SerializedRelationship(baseURI, rel_elm))
|
srels._srels.append(_SerializedRelationship(baseURI, rel_elm))
|
||||||
return srels
|
return srels
|
||||||
|
|||||||
Reference in New Issue
Block a user