mirror of
https://github.com/langgenius/dify.git
synced 2026-05-03 00:48:04 +08:00
Merge branch 'main' into deploy/dev
This commit is contained in:
@ -59,8 +59,6 @@ class DatasourcePluginProviderController(ABC):
|
||||
:param credentials: the credentials of the tool
|
||||
"""
|
||||
credentials_schema = dict[str, ProviderConfig]()
|
||||
if credentials_schema is None:
|
||||
return
|
||||
|
||||
for credential in self.entity.credentials_schema:
|
||||
credentials_schema[credential.name] = credential
|
||||
|
||||
@ -74,7 +74,8 @@ class ExtractProcessor:
|
||||
else:
|
||||
suffix = ""
|
||||
# https://stackoverflow.com/questions/26541416/generate-temporary-file-names-without-creating-actual-file-in-python#comment90414256_26541521
|
||||
file_path = f"{temp_dir}/{tempfile.gettempdir()}{suffix}"
|
||||
# Generate a temporary filename under temp_dir (name only; the file itself is written just below). Relies on the private tempfile._get_candidate_names() helper.
|
||||
file_path = f"{temp_dir}/{next(tempfile._get_candidate_names())}{suffix}" # type: ignore
|
||||
Path(file_path).write_bytes(response.content)
|
||||
extract_setting = ExtractSetting(datasource_type=DatasourceType.FILE, document_model="text_model")
|
||||
if return_text:
|
||||
|
||||
@ -204,26 +204,61 @@ class WordExtractor(BaseExtractor):
|
||||
return " ".join(unique_content)
|
||||
|
||||
def _parse_cell_paragraph(self, paragraph, image_map):
|
||||
paragraph_content = []
|
||||
for run in paragraph.runs:
|
||||
if run.element.xpath(".//a:blip"):
|
||||
for blip in run.element.xpath(".//a:blip"):
|
||||
image_id = blip.get("{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed")
|
||||
if not image_id:
|
||||
continue
|
||||
rel = paragraph.part.rels.get(image_id)
|
||||
if rel is None:
|
||||
continue
|
||||
# For external images, use image_id as key; for internal, use target_part
|
||||
if rel.is_external:
|
||||
if image_id in image_map:
|
||||
paragraph_content.append(image_map[image_id])
|
||||
else:
|
||||
image_part = rel.target_part
|
||||
if image_part in image_map:
|
||||
paragraph_content.append(image_map[image_part])
|
||||
else:
|
||||
paragraph_content.append(run.text)
|
||||
paragraph_content: list[str] = []
|
||||
|
||||
for child in paragraph._element:
|
||||
tag = child.tag
|
||||
if tag == qn("w:hyperlink"):
|
||||
# Note: w:hyperlink elements may also use w:anchor for internal bookmarks.
|
||||
# This extractor intentionally only converts external links (HTTP/mailto, etc.)
|
||||
# that are backed by a relationship id (r:id) with rel.is_external == True.
|
||||
# Hyperlinks without such an external rel (including anchor-only bookmarks)
|
||||
# are left as plain text link_text.
|
||||
r_id = child.get(qn("r:id"))
|
||||
link_text_parts: list[str] = []
|
||||
for run_elem in child.findall(qn("w:r")):
|
||||
run = Run(run_elem, paragraph)
|
||||
if run.text:
|
||||
link_text_parts.append(run.text)
|
||||
link_text = "".join(link_text_parts).strip()
|
||||
if r_id:
|
||||
try:
|
||||
rel = paragraph.part.rels.get(r_id)
|
||||
if rel:
|
||||
target_ref = getattr(rel, "target_ref", None)
|
||||
if target_ref:
|
||||
parsed_target = urlparse(str(target_ref))
|
||||
if rel.is_external or parsed_target.scheme in ("http", "https", "mailto"):
|
||||
display_text = link_text or str(target_ref)
|
||||
link_text = f"[{display_text}]({target_ref})"
|
||||
except Exception:
|
||||
logger.exception("Failed to resolve URL for hyperlink with r:id: %s", r_id)
|
||||
if link_text:
|
||||
paragraph_content.append(link_text)
|
||||
|
||||
elif tag == qn("w:r"):
|
||||
run = Run(child, paragraph)
|
||||
if run.element.xpath(".//a:blip"):
|
||||
for blip in run.element.xpath(".//a:blip"):
|
||||
image_id = blip.get(
|
||||
"{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed"
|
||||
)
|
||||
if not image_id:
|
||||
continue
|
||||
rel = paragraph.part.rels.get(image_id)
|
||||
if rel is None:
|
||||
continue
|
||||
if rel.is_external:
|
||||
if image_id in image_map:
|
||||
paragraph_content.append(image_map[image_id])
|
||||
else:
|
||||
image_part = rel.target_part
|
||||
if image_part in image_map:
|
||||
paragraph_content.append(image_map[image_part])
|
||||
else:
|
||||
if run.text:
|
||||
paragraph_content.append(run.text)
|
||||
|
||||
return "".join(paragraph_content).strip()
|
||||
|
||||
def parse_docx(self, docx_path):
|
||||
|
||||
Reference in New Issue
Block a user