Merge remote-tracking branch 'origin/main' into feat/model-plugins-implementing

This commit is contained in:
yyh
2026-03-11 11:57:11 +08:00
10 changed files with 198 additions and 274 deletions

View File

@ -157,6 +157,7 @@ class PluginInstallTaskPluginStatus(BaseModel):
message: str = Field(description="The message of the install task.")
icon: str = Field(description="The icon of the plugin.")
labels: I18nObject = Field(description="The labels of the plugin.")
source: str | None = Field(default=None, description="The installation source of the plugin")
class PluginInstallTask(BasePluginEntity):

View File

@ -204,26 +204,61 @@ class WordExtractor(BaseExtractor):
return " ".join(unique_content)
def _parse_cell_paragraph(self, paragraph, image_map):
paragraph_content = []
for run in paragraph.runs:
if run.element.xpath(".//a:blip"):
for blip in run.element.xpath(".//a:blip"):
image_id = blip.get("{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed")
if not image_id:
continue
rel = paragraph.part.rels.get(image_id)
if rel is None:
continue
# For external images, use image_id as key; for internal, use target_part
if rel.is_external:
if image_id in image_map:
paragraph_content.append(image_map[image_id])
else:
image_part = rel.target_part
if image_part in image_map:
paragraph_content.append(image_map[image_part])
else:
paragraph_content.append(run.text)
paragraph_content: list[str] = []
for child in paragraph._element:
tag = child.tag
if tag == qn("w:hyperlink"):
# Note: w:hyperlink elements may also use w:anchor for internal bookmarks.
# This extractor intentionally only converts external links (HTTP/mailto, etc.)
# that are backed by a relationship id (r:id) with rel.is_external == True.
# Hyperlinks without such an external rel (including anchor-only bookmarks)
# are left as plain text link_text.
r_id = child.get(qn("r:id"))
link_text_parts: list[str] = []
for run_elem in child.findall(qn("w:r")):
run = Run(run_elem, paragraph)
if run.text:
link_text_parts.append(run.text)
link_text = "".join(link_text_parts).strip()
if r_id:
try:
rel = paragraph.part.rels.get(r_id)
if rel:
target_ref = getattr(rel, "target_ref", None)
if target_ref:
parsed_target = urlparse(str(target_ref))
if rel.is_external or parsed_target.scheme in ("http", "https", "mailto"):
display_text = link_text or str(target_ref)
link_text = f"[{display_text}]({target_ref})"
except Exception:
logger.exception("Failed to resolve URL for hyperlink with r:id: %s", r_id)
if link_text:
paragraph_content.append(link_text)
elif tag == qn("w:r"):
run = Run(child, paragraph)
if run.element.xpath(".//a:blip"):
for blip in run.element.xpath(".//a:blip"):
image_id = blip.get(
"{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed"
)
if not image_id:
continue
rel = paragraph.part.rels.get(image_id)
if rel is None:
continue
if rel.is_external:
if image_id in image_map:
paragraph_content.append(image_map[image_id])
else:
image_part = rel.target_part
if image_part in image_map:
paragraph_content.append(image_map[image_part])
else:
if run.text:
paragraph_content.append(run.text)
return "".join(paragraph_content).strip()
def parse_docx(self, docx_path):

View File

@ -423,15 +423,6 @@ def test_table_to_markdown_and_parse_helpers(monkeypatch):
markdown = extractor._table_to_markdown(table, {})
assert markdown == "| H1 | H2 |\n| --- | --- |\n| A | B |"
class FakeRunElement:
def __init__(self, blips):
self._blips = blips
def xpath(self, pattern):
if pattern == ".//a:blip":
return self._blips
return []
class FakeBlip:
def __init__(self, image_id):
self.image_id = image_id
@ -439,11 +430,31 @@ def test_table_to_markdown_and_parse_helpers(monkeypatch):
def get(self, key):
return self.image_id
class FakeRunChild:
def __init__(self, blips, text=""):
self._blips = blips
self.text = text
self.tag = qn("w:r")
def xpath(self, pattern):
if pattern == ".//a:blip":
return self._blips
return []
class FakeRun:
def __init__(self, element, paragraph):
# Mirror the subset used by _parse_cell_paragraph
self.element = element
self.text = getattr(element, "text", "")
# Patch we.Run so our lightweight child objects work with the extractor
monkeypatch.setattr(we, "Run", FakeRun)
image_part = object()
paragraph = SimpleNamespace(
runs=[
SimpleNamespace(element=FakeRunElement([FakeBlip(None), FakeBlip("ext"), FakeBlip("int")]), text=""),
SimpleNamespace(element=FakeRunElement([]), text="plain"),
_element=[
FakeRunChild([FakeBlip(None), FakeBlip("ext"), FakeBlip("int")], text=""),
FakeRunChild([], text="plain"),
],
part=SimpleNamespace(
rels={
@ -452,6 +463,7 @@ def test_table_to_markdown_and_parse_helpers(monkeypatch):
}
),
)
image_map = {"ext": "EXT-IMG", image_part: "INT-IMG"}
assert extractor._parse_cell_paragraph(paragraph, image_map) == "EXT-IMGINT-IMGplain"
@ -625,3 +637,83 @@ def test_parse_docx_covers_drawing_shapes_hyperlink_error_and_table_branch(monke
assert "BrokenLink" in content
assert "TABLE-MARKDOWN" in content
logger_exception.assert_called_once()
def test_parse_cell_paragraph_hyperlink_in_table_cell_http():
    """An http hyperlink inside a table cell is extracted as a markdown link."""
    document = Document()
    cell_paragraph = document.add_table(rows=1, cols=1).cell(0, 0).paragraphs[0]
    # Hand-assemble a modern (relationship-backed) w:hyperlink element.
    rel_id = "rIdHttp1"
    link_elem = OxmlElement("w:hyperlink")
    link_elem.set(qn("r:id"), rel_id)
    run_node = OxmlElement("w:r")
    text_node = OxmlElement("w:t")
    text_node.text = "Dify"
    run_node.append(text_node)
    link_elem.append(run_node)
    cell_paragraph._p.append(link_elem)
    # Register the external relationship that the r:id above points at.
    document.part.rels.add_relationship(
        "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink",
        "https://dify.ai",
        rel_id,
        is_external=True,
    )
    with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as handle:
        document.save(handle.name)
        saved_path = handle.name
    try:
        reopened_cell = Document(saved_path).tables[0].cell(0, 0)
        extractor = object.__new__(WordExtractor)
        result = extractor._parse_cell_paragraph(reopened_cell.paragraphs[0], {})
        assert result == "[Dify](https://dify.ai)"
    finally:
        if os.path.exists(saved_path):
            os.remove(saved_path)
def test_parse_cell_paragraph_hyperlink_in_table_cell_mailto():
    """A mailto hyperlink inside a table cell is extracted as a markdown link."""
    document = Document()
    cell_paragraph = document.add_table(rows=1, cols=1).cell(0, 0).paragraphs[0]
    # Hand-assemble a modern (relationship-backed) w:hyperlink element.
    rel_id = "rIdMail1"
    link_elem = OxmlElement("w:hyperlink")
    link_elem.set(qn("r:id"), rel_id)
    run_node = OxmlElement("w:r")
    text_node = OxmlElement("w:t")
    text_node.text = "john@test.com"
    run_node.append(text_node)
    link_elem.append(run_node)
    cell_paragraph._p.append(link_elem)
    # Register the external relationship that the r:id above points at.
    document.part.rels.add_relationship(
        "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink",
        "mailto:john@test.com",
        rel_id,
        is_external=True,
    )
    with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as handle:
        document.save(handle.name)
        saved_path = handle.name
    try:
        reopened_cell = Document(saved_path).tables[0].cell(0, 0)
        extractor = object.__new__(WordExtractor)
        result = extractor._parse_cell_paragraph(reopened_cell.paragraphs[0], {})
        assert result == "[john@test.com](mailto:john@test.com)"
    finally:
        if os.path.exists(saved_path):
            os.remove(saved_path)

View File

@ -35,7 +35,7 @@ COPY --from=packages /app/web/ .
COPY . .
ENV NODE_OPTIONS="--max-old-space-size=4096"
RUN pnpm build:docker
RUN pnpm build
# production stage

View File

@ -20,17 +20,21 @@ const OnBlurBlock: FC<OnBlurBlockProps> = ({
}) => {
const [editor] = useLexicalComposerContext()
const ref = useRef<any>(null)
const ref = useRef<ReturnType<typeof setTimeout> | null>(null)
useEffect(() => {
return mergeRegister(
const clearHideMenuTimeout = () => {
if (ref.current) {
clearTimeout(ref.current)
ref.current = null
}
}
const unregister = mergeRegister(
editor.registerCommand(
CLEAR_HIDE_MENU_TIMEOUT,
() => {
if (ref.current) {
clearTimeout(ref.current)
ref.current = null
}
clearHideMenuTimeout()
return true
},
COMMAND_PRIORITY_EDITOR,
@ -41,6 +45,7 @@ const OnBlurBlock: FC<OnBlurBlockProps> = ({
// Check if the clicked target element is var-search-input
const target = event?.relatedTarget as HTMLElement
if (!target?.classList?.contains('var-search-input')) {
clearHideMenuTimeout()
ref.current = setTimeout(() => {
editor.dispatchCommand(KEY_ESCAPE_COMMAND, new KeyboardEvent('keydown', { key: 'Escape' }))
}, 200)
@ -61,6 +66,11 @@ const OnBlurBlock: FC<OnBlurBlockProps> = ({
COMMAND_PRIORITY_EDITOR,
),
)
return () => {
clearHideMenuTimeout()
unregister()
}
}, [editor, onBlur, onFocus])
return null

View File

@ -3565,11 +3565,6 @@
"count": 2
}
},
"app/components/base/prompt-editor/plugins/on-blur-or-focus-block.tsx": {
"ts/no-explicit-any": {
"count": 1
}
},
"app/components/base/prompt-editor/plugins/query-block/index.tsx": {
"react-refresh/only-export-components": {
"count": 2
@ -12614,14 +12609,6 @@
"count": 2
}
},
"scripts/optimize-standalone.js": {
"e18e/prefer-static-regex": {
"count": 1
},
"unused-imports/no-unused-vars": {
"count": 2
}
},
"scripts/refactor-component.js": {
"e18e/prefer-static-regex": {
"count": 14

View File

@ -218,3 +218,6 @@ export default antfu(
},
},
)
.disableRulesFix([
'e18e/prefer-array-at',
])

View File

@ -26,39 +26,36 @@
"node": "^22.22.1"
},
"scripts": {
"analyze": "next experimental-analyze",
"analyze-component": "node ./scripts/analyze-component.js",
"build": "next build",
"build:vinext": "vinext build",
"dev": "next dev",
"dev:inspect": "next dev --inspect",
"dev:vinext": "vinext dev",
"build": "next build",
"build:docker": "next build && node scripts/optimize-standalone.js",
"build:vinext": "vinext build",
"start": "node ./scripts/copy-and-start.mjs",
"start:vinext": "vinext start",
"gen-doc-paths": "tsx ./scripts/gen-doc-paths.ts",
"gen-icons": "node ./scripts/gen-icons.mjs && eslint --fix app/components/base/icons/src/",
"i18n:check": "tsx ./scripts/check-i18n.js",
"knip": "knip",
"lint": "eslint --cache --concurrency=auto",
"lint:ci": "eslint --cache --concurrency 2",
"lint:fix": "pnpm lint --fix",
"lint:quiet": "pnpm lint --quiet",
"lint:complexity": "pnpm lint --rule 'complexity: [error, {max: 15}]' --quiet",
"lint:report": "pnpm lint --output-file eslint_report.json --format json",
"lint:tss": "tsslint --project tsconfig.json",
"type-check": "tsc --noEmit",
"type-check:tsgo": "tsgo --noEmit",
"preinstall": "npx only-allow pnpm",
"prepare": "cd ../ && node -e \"if (process.env.NODE_ENV !== 'production'){process.exit(1)} \" || husky ./web/.husky",
"gen-icons": "node ./scripts/gen-icons.mjs && eslint --fix app/components/base/icons/src/",
"gen-doc-paths": "tsx ./scripts/gen-doc-paths.ts",
"uglify-embed": "node ./bin/uglify-embed",
"i18n:check": "tsx ./scripts/check-i18n.js",
"test": "vitest run",
"test:coverage": "vitest run --coverage",
"test:ci": "vitest run --coverage --silent=passed-only",
"test:watch": "vitest --watch",
"analyze-component": "node ./scripts/analyze-component.js",
"refactor-component": "node ./scripts/refactor-component.js",
"start": "node ./scripts/copy-and-start.mjs",
"start:vinext": "vinext start",
"storybook": "storybook dev -p 6006",
"storybook:build": "storybook build",
"preinstall": "npx only-allow pnpm",
"analyze": "next experimental-analyze",
"knip": "knip"
"test": "vitest run",
"test:ci": "vitest run --coverage --silent=passed-only",
"test:coverage": "vitest run --coverage",
"test:watch": "vitest --watch",
"type-check": "tsc --noEmit",
"type-check:tsgo": "tsgo --noEmit",
"uglify-embed": "node ./bin/uglify-embed"
},
"dependencies": {
"@amplitude/analytics-browser": "2.36.3",

View File

@ -1,38 +0,0 @@
# Production Build Optimization Scripts
## optimize-standalone.js
This script removes unnecessary development dependencies from the Next.js standalone build output to reduce the production Docker image size.
### What it does
The script specifically targets and removes `jest-worker` packages that are bundled with Next.js but not needed in production. These packages are included because:
1. Next.js includes jest-worker in its compiled dependencies
2. terser-webpack-plugin (used by Next.js for minification) depends on jest-worker
3. pnpm's dependency resolution creates symlinks to jest-worker in various locations
### Usage
The script is automatically run during Docker builds via the `build:docker` npm script:
```bash
# Docker build (removes jest-worker after build)
pnpm build:docker
```
To run the optimization manually:
```bash
node scripts/optimize-standalone.js
```
### What gets removed
- `node_modules/.pnpm/next@*/node_modules/next/dist/compiled/jest-worker`
- `node_modules/.pnpm/terser-webpack-plugin@*/node_modules/jest-worker` (symlinks)
- `node_modules/.pnpm/jest-worker@*` (actual packages)
### Impact
Removing jest-worker saves approximately 36KB per instance from the production image. While this may seem small, it helps ensure production images only contain necessary runtime dependencies.

View File

@ -1,163 +0,0 @@
/**
* Script to optimize Next.js standalone output for production
* Removes unnecessary files like jest-worker that are bundled with Next.js
*/
import fs from 'node:fs'
import path from 'node:path'
import { fileURLToPath } from 'node:url'
const __filename = fileURLToPath(import.meta.url)
const __dirname = path.dirname(__filename)
console.log('🔧 Optimizing standalone output...')
const standaloneDir = path.join(__dirname, '..', '.next', 'standalone')
// Check if standalone directory exists
if (!fs.existsSync(standaloneDir)) {
console.error('❌ Standalone directory not found. Please run "next build" first.')
process.exit(1)
}
// List of paths to remove (relative to standalone directory)
const pathsToRemove = [
// Remove jest-worker from Next.js compiled dependencies
'node_modules/.pnpm/next@*/node_modules/next/dist/compiled/jest-worker',
// Remove jest-worker symlinks from terser-webpack-plugin
'node_modules/.pnpm/terser-webpack-plugin@*/node_modules/jest-worker',
// Remove actual jest-worker packages (directories only, not symlinks)
'node_modules/.pnpm/jest-worker@*',
]
// Function to safely remove a path
/**
 * Safely remove `relativePath` from under `basePath`.
 *
 * `relativePath` may contain `*` wildcard segments (e.g. `next@*`), which are
 * expanded against the actual directory entries on disk. Symlinks are removed
 * with `unlinkSync` so their targets are never touched; real files/directories
 * are removed recursively. Missing paths are ignored (ENOENT is swallowed).
 *
 * NOTE(review): only the FIRST wildcard segment is expanded — the function
 * `return`s after processing it, so any `*` appearing later in the remaining
 * path would be treated literally. All patterns in `pathsToRemove` contain at
 * most one wildcard, so this is fine for current usage.
 */
function removePath(basePath, relativePath) {
  const fullPath = path.join(basePath, relativePath)
  // Handle wildcard patterns
  if (relativePath.includes('*')) {
    const parts = relativePath.split('/')
    let currentPath = basePath
    // Walk segment by segment until the wildcard segment is reached.
    for (let i = 0; i < parts.length; i++) {
      const part = parts[i]
      if (part.includes('*')) {
        // Find matching directories
        if (fs.existsSync(currentPath)) {
          const entries = fs.readdirSync(currentPath)
          // replace '*' with '.*'
          const regexPattern = part.replace(/\*/g, '.*')
          const regex = new RegExp(`^${regexPattern}$`)
          for (const entry of entries) {
            if (regex.test(entry)) {
              // Re-attach the (literal) tail of the pattern to each match.
              const remainingPath = parts.slice(i + 1).join('/')
              const matchedPath = path.join(currentPath, entry, remainingPath)
              try {
                // Use lstatSync to check if path exists (works for both files and symlinks)
                const stats = fs.lstatSync(matchedPath)
                if (stats.isSymbolicLink()) {
                  // Remove symlink
                  fs.unlinkSync(matchedPath)
                  console.log(`✅ Removed symlink: ${path.relative(basePath, matchedPath)}`)
                }
                else {
                  // Remove directory/file
                  fs.rmSync(matchedPath, { recursive: true, force: true })
                  console.log(`✅ Removed: ${path.relative(basePath, matchedPath)}`)
                }
              }
              catch (error) {
                // Silently ignore ENOENT (path not found) errors
                if (error.code !== 'ENOENT') {
                  console.error(`❌ Failed to remove ${matchedPath}: ${error.message}`)
                }
              }
            }
          }
        }
        return
      }
      else {
        currentPath = path.join(currentPath, part)
      }
    }
  }
  else {
    // Direct path removal
    if (fs.existsSync(fullPath)) {
      try {
        fs.rmSync(fullPath, { recursive: true, force: true })
        console.log(`✅ Removed: ${relativePath}`)
      }
      catch (error) {
        console.error(`❌ Failed to remove ${fullPath}: ${error.message}`)
      }
    }
  }
}
// Remove unnecessary paths
console.log('🗑️ Removing unnecessary files...')
for (const pathToRemove of pathsToRemove) {
removePath(standaloneDir, pathToRemove)
}
// Calculate size reduction
console.log('\n📊 Optimization complete!')
// Optional: Display the size of remaining jest-related files (if any)
/**
 * Walk `dir` and collect (standalone-relative) paths of any remaining files
 * whose name contains "jest" — a sanity check after the removal pass above.
 *
 * Symlinked directories are not descended into (lstat + isSymbolicLink check)
 * and nested `node_modules` trees below the standalone root are skipped to
 * keep traversal shallow. Unreadable entries/directories are silently skipped.
 *
 * Fix: drop the unused `err` catch bindings (optional catch binding, ES2019) —
 * they were flagged by the project's `unused-imports/no-unused-vars` lint rule.
 */
const checkForJest = (dir) => {
  const jestFiles = []
  function walk(currentPath) {
    if (!fs.existsSync(currentPath))
      return
    try {
      const entries = fs.readdirSync(currentPath)
      for (const entry of entries) {
        const fullPath = path.join(currentPath, entry)
        try {
          const stat = fs.lstatSync(fullPath) // Use lstatSync to handle symlinks
          if (stat.isDirectory() && !stat.isSymbolicLink()) {
            // Skip node_modules subdirectories to avoid deep traversal
            if (entry === 'node_modules' && currentPath !== standaloneDir) {
              continue
            }
            walk(fullPath)
          }
          else if (stat.isFile() && entry.includes('jest')) {
            jestFiles.push(path.relative(standaloneDir, fullPath))
          }
        }
        catch {
          // Skip files that can't be accessed
          continue
        }
      }
    }
    catch {
      // Skip directories that can't be read
    }
  }
  walk(dir)
  return jestFiles
}
const remainingJestFiles = checkForJest(standaloneDir)
if (remainingJestFiles.length > 0) {
console.log('\n⚠ Warning: Some jest-related files still remain:')
remainingJestFiles.forEach(file => console.log(` - ${file}`))
}
else {
console.log('\n✨ No jest-related files found in standalone output!')
}