Merge branch 'main' into deploy/dev

This commit is contained in:
Stephen Zhou
2026-03-11 10:20:51 +08:00
29 changed files with 3560 additions and 721 deletions

View File

@ -59,8 +59,6 @@ class DatasourcePluginProviderController(ABC):
:param credentials: the credentials of the tool
"""
credentials_schema = dict[str, ProviderConfig]()
if credentials_schema is None:
return
for credential in self.entity.credentials_schema:
credentials_schema[credential.name] = credential

View File

@ -74,7 +74,8 @@ class ExtractProcessor:
else:
suffix = ""
# https://stackoverflow.com/questions/26541416/generate-temporary-file-names-without-creating-actual-file-in-python#comment90414256_26541521
file_path = f"{temp_dir}/{tempfile.gettempdir()}{suffix}"
# Generate a temporary filename under the created temp_dir and ensure the directory exists
file_path = f"{temp_dir}/{next(tempfile._get_candidate_names())}{suffix}" # type: ignore
Path(file_path).write_bytes(response.content)
extract_setting = ExtractSetting(datasource_type=DatasourceType.FILE, document_model="text_model")
if return_text:

View File

@ -204,26 +204,61 @@ class WordExtractor(BaseExtractor):
return " ".join(unique_content)
def _parse_cell_paragraph(self, paragraph, image_map) -> str:
    """Render one table-cell paragraph as a single stripped string.

    The paragraph's direct XML children are visited in document order so
    that hyperlinks, which wrap their runs in a ``w:hyperlink`` element,
    are seen together with ordinary runs:

    * ``w:hyperlink`` -- the text of its direct ``w:r`` children is joined.
      When the element carries an ``r:id`` whose relationship is external,
      or whose target uses an http/https/mailto scheme, the text is emitted
      as a Markdown link ``[text](target)``. Otherwise the plain link text
      is kept.
    * ``w:r`` -- a run containing ``a:blip`` elements contributes the
      ``image_map`` entry of each embedded image it references (the run's
      own text is not emitted in that case); any other run contributes its
      text.

    :param paragraph: paragraph object exposing ``_element`` and ``part.rels``
        (presumably a python-docx ``Paragraph`` -- confirm against callers)
    :param image_map: mapping from relationship id (external images) or
        target part (embedded images) to the replacement text for that image;
        images missing from the mapping are skipped
    :return: the concatenated content with surrounding whitespace removed
    """
    # Resolve the namespaced names once instead of once per child element.
    hyperlink_tag = qn("w:hyperlink")
    run_tag = qn("w:r")
    rel_id_attr = qn("r:id")
    embed_attr = "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed"
    linkable_schemes = ("http", "https", "mailto")

    paragraph_content: list[str] = []
    for child in paragraph._element:
        tag = child.tag
        if tag == hyperlink_tag:
            # Note: w:hyperlink elements may also use w:anchor for internal bookmarks.
            # This extractor intentionally only converts external links (HTTP/mailto, etc.)
            # that are backed by a relationship id (r:id) with rel.is_external == True.
            # Hyperlinks without such an external rel (including anchor-only bookmarks)
            # are left as plain text link_text.
            r_id = child.get(rel_id_attr)
            run_texts = (Run(run_elem, paragraph).text for run_elem in child.findall(run_tag))
            link_text = "".join(text for text in run_texts if text).strip()
            if r_id:
                try:
                    rel = paragraph.part.rels.get(r_id)
                    target_ref = getattr(rel, "target_ref", None) if rel else None
                    if target_ref:
                        scheme = urlparse(str(target_ref)).scheme
                        if rel.is_external or scheme in linkable_schemes:
                            display_text = link_text or str(target_ref)
                            link_text = f"[{display_text}]({target_ref})"
                except Exception:
                    # Best effort: a broken relationship must not abort the
                    # extraction, so fall back to the plain link text.
                    logger.exception("Failed to resolve URL for hyperlink with r:id: %s", r_id)
            if link_text:
                paragraph_content.append(link_text)
        elif tag == run_tag:
            run = Run(child, paragraph)
            # Evaluate the XPath once; it decides between image and text handling.
            blips = run.element.xpath(".//a:blip")
            if not blips:
                if run.text:
                    paragraph_content.append(run.text)
                continue
            for blip in blips:
                image_id = blip.get(embed_attr)
                if not image_id:
                    continue
                rel = paragraph.part.rels.get(image_id)
                if rel is None:
                    continue
                # For external images, use image_id as key; for internal, use target_part
                image_key = image_id if rel.is_external else rel.target_part
                if image_key in image_map:
                    paragraph_content.append(image_map[image_key])
    return "".join(paragraph_content).strip()
def parse_docx(self, docx_path):