Merge branch 'main' into deploy/dev

2026-05-03 00:48:04 +08:00 · 2026-03-11 10:20:51 +08:00
parent 0128c609eb 54637144c5
commit cc35e92258
29 changed files with 3560 additions and 721 deletions
--- a/api/core/datasource/__base/datasource_provider.py
+++ b/api/core/datasource/__base/datasource_provider.py
@ -59,8 +59,6 @@ class DatasourcePluginProviderController(ABC):
        :param credentials: the credentials of the tool
        """
        credentials_schema = dict[str, ProviderConfig]()
-        if credentials_schema is None:
-            return

        for credential in self.entity.credentials_schema:
            credentials_schema[credential.name] = credential
--- a/api/core/rag/extractor/extract_processor.py
+++ b/api/core/rag/extractor/extract_processor.py
@ -74,7 +74,8 @@ class ExtractProcessor:
                        else:
                            suffix = ""
            # https://stackoverflow.com/questions/26541416/generate-temporary-file-names-without-creating-actual-file-in-python#comment90414256_26541521
-            file_path = f"{temp_dir}/{tempfile.gettempdir()}{suffix}"
+            # Build a random file name inside temp_dir (uses the private tempfile._get_candidate_names helper)
+            file_path = f"{temp_dir}/{next(tempfile._get_candidate_names())}{suffix}"  # type: ignore
            Path(file_path).write_bytes(response.content)
            extract_setting = ExtractSetting(datasource_type=DatasourceType.FILE, document_model="text_model")
            if return_text:
--- a/api/core/rag/extractor/word_extractor.py
+++ b/api/core/rag/extractor/word_extractor.py
@ -204,26 +204,61 @@ class WordExtractor(BaseExtractor):
        return " ".join(unique_content)

    def _parse_cell_paragraph(self, paragraph, image_map):
-        paragraph_content = []
-        for run in paragraph.runs:
-            if run.element.xpath(".//a:blip"):
-                for blip in run.element.xpath(".//a:blip"):
-                    image_id = blip.get("{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed")
-                    if not image_id:
-                        continue
-                    rel = paragraph.part.rels.get(image_id)
-                    if rel is None:
-                        continue
-                    # For external images, use image_id as key; for internal, use target_part
-                    if rel.is_external:
-                        if image_id in image_map:
-                            paragraph_content.append(image_map[image_id])
-                    else:
-                        image_part = rel.target_part
-                        if image_part in image_map:
-                            paragraph_content.append(image_map[image_part])
-            else:
-                paragraph_content.append(run.text)
+        paragraph_content: list[str] = []
+
+        for child in paragraph._element:
+            tag = child.tag
+            if tag == qn("w:hyperlink"):
+                # Note: w:hyperlink elements may also use w:anchor for internal bookmarks.
+                # This extractor intentionally only converts links that are backed by a
+                # relationship id (r:id) whose rel is external or whose target uses an
+                # http, https, or mailto scheme. All other hyperlinks (including
+                # anchor-only bookmarks) are left as plain text link_text.
+                r_id = child.get(qn("r:id"))
+                link_text_parts: list[str] = []
+                for run_elem in child.findall(qn("w:r")):
+                    run = Run(run_elem, paragraph)
+                    if run.text:
+                        link_text_parts.append(run.text)
+                link_text = "".join(link_text_parts).strip()
+                if r_id:
+                    try:
+                        rel = paragraph.part.rels.get(r_id)
+                        if rel:
+                            target_ref = getattr(rel, "target_ref", None)
+                            if target_ref:
+                                parsed_target = urlparse(str(target_ref))
+                                if rel.is_external or parsed_target.scheme in ("http", "https", "mailto"):
+                                    display_text = link_text or str(target_ref)
+                                    link_text = f"[{display_text}]({target_ref})"
+                    except Exception:
+                        logger.exception("Failed to resolve URL for hyperlink with r:id: %s", r_id)
+                if link_text:
+                    paragraph_content.append(link_text)
+
+            elif tag == qn("w:r"):
+                run = Run(child, paragraph)
+                if run.element.xpath(".//a:blip"):
+                    for blip in run.element.xpath(".//a:blip"):
+                        image_id = blip.get(
+                            "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed"
+                        )
+                        if not image_id:
+                            continue
+                        rel = paragraph.part.rels.get(image_id)
+                        if rel is None:
+                            continue
+                        if rel.is_external:
+                            if image_id in image_map:
+                                paragraph_content.append(image_map[image_id])
+                        else:
+                            image_part = rel.target_part
+                            if image_part in image_map:
+                                paragraph_content.append(image_map[image_part])
+                else:
+                    if run.text:
+                        paragraph_content.append(run.text)
+
        return "".join(paragraph_content).strip()

    def parse_docx(self, docx_path):