diff --git a/api/core/plugin/entities/plugin_daemon.py b/api/core/plugin/entities/plugin_daemon.py index 2dc540e6a8..416e0f6b4d 100644 --- a/api/core/plugin/entities/plugin_daemon.py +++ b/api/core/plugin/entities/plugin_daemon.py @@ -157,6 +157,7 @@ class PluginInstallTaskPluginStatus(BaseModel): message: str = Field(description="The message of the install task.") icon: str = Field(description="The icon of the plugin.") labels: I18nObject = Field(description="The labels of the plugin.") + source: str | None = Field(default=None, description="The installation source of the plugin") class PluginInstallTask(BasePluginEntity): diff --git a/api/core/rag/extractor/word_extractor.py b/api/core/rag/extractor/word_extractor.py index 1ddbfc5864..d6b6ca35be 100644 --- a/api/core/rag/extractor/word_extractor.py +++ b/api/core/rag/extractor/word_extractor.py @@ -204,26 +204,61 @@ class WordExtractor(BaseExtractor): return " ".join(unique_content) def _parse_cell_paragraph(self, paragraph, image_map): - paragraph_content = [] - for run in paragraph.runs: - if run.element.xpath(".//a:blip"): - for blip in run.element.xpath(".//a:blip"): - image_id = blip.get("{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed") - if not image_id: - continue - rel = paragraph.part.rels.get(image_id) - if rel is None: - continue - # For external images, use image_id as key; for internal, use target_part - if rel.is_external: - if image_id in image_map: - paragraph_content.append(image_map[image_id]) - else: - image_part = rel.target_part - if image_part in image_map: - paragraph_content.append(image_map[image_part]) - else: - paragraph_content.append(run.text) + paragraph_content: list[str] = [] + + for child in paragraph._element: + tag = child.tag + if tag == qn("w:hyperlink"): + # Note: w:hyperlink elements may also use w:anchor for internal bookmarks. + # This extractor intentionally only converts external links (HTTP/mailto, etc.) + # that are backed by a relationship id (r:id) with rel.is_external == True. + # Hyperlinks without such an external rel (including anchor-only bookmarks) + # are left as plain text link_text. + r_id = child.get(qn("r:id")) + link_text_parts: list[str] = [] + for run_elem in child.findall(qn("w:r")): + run = Run(run_elem, paragraph) + if run.text: + link_text_parts.append(run.text) + link_text = "".join(link_text_parts).strip() + if r_id: + try: + rel = paragraph.part.rels.get(r_id) + if rel: + target_ref = getattr(rel, "target_ref", None) + if target_ref: + parsed_target = urlparse(str(target_ref)) + if rel.is_external or parsed_target.scheme in ("http", "https", "mailto"): + display_text = link_text or str(target_ref) + link_text = f"[{display_text}]({target_ref})" + except Exception: + logger.exception("Failed to resolve URL for hyperlink with r:id: %s", r_id) + if link_text: + paragraph_content.append(link_text) + + elif tag == qn("w:r"): + run = Run(child, paragraph) + if run.element.xpath(".//a:blip"): + for blip in run.element.xpath(".//a:blip"): + image_id = blip.get( + "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed" + ) + if not image_id: + continue + rel = paragraph.part.rels.get(image_id) + if rel is None: + continue + if rel.is_external: + if image_id in image_map: + paragraph_content.append(image_map[image_id]) + else: + image_part = rel.target_part + if image_part in image_map: + paragraph_content.append(image_map[image_part]) + else: + if run.text: + paragraph_content.append(run.text) + return "".join(paragraph_content).strip() def parse_docx(self, docx_path): diff --git a/api/tests/unit_tests/core/rag/extractor/test_word_extractor.py b/api/tests/unit_tests/core/rag/extractor/test_word_extractor.py index 12a26ef75a..64eb89590a 100644 --- a/api/tests/unit_tests/core/rag/extractor/test_word_extractor.py +++ b/api/tests/unit_tests/core/rag/extractor/test_word_extractor.py @@ -423,15 +423,6 @@ def test_table_to_markdown_and_parse_helpers(monkeypatch): markdown = extractor._table_to_markdown(table, {}) assert markdown == "| H1 | H2 |\n| --- | --- |\n| A | B |" - class FakeRunElement: - def __init__(self, blips): - self._blips = blips - - def xpath(self, pattern): - if pattern == ".//a:blip": - return self._blips - return [] - class FakeBlip: def __init__(self, image_id): self.image_id = image_id @@ -439,11 +430,31 @@ def test_table_to_markdown_and_parse_helpers(monkeypatch): def get(self, key): return self.image_id + class FakeRunChild: + def __init__(self, blips, text=""): + self._blips = blips + self.text = text + self.tag = qn("w:r") + + def xpath(self, pattern): + if pattern == ".//a:blip": + return self._blips + return [] + + class FakeRun: + def __init__(self, element, paragraph): + # Mirror the subset used by _parse_cell_paragraph + self.element = element + self.text = getattr(element, "text", "") + + # Patch we.Run so our lightweight child objects work with the extractor + monkeypatch.setattr(we, "Run", FakeRun) + image_part = object() paragraph = SimpleNamespace( - runs=[ - SimpleNamespace(element=FakeRunElement([FakeBlip(None), FakeBlip("ext"), FakeBlip("int")]), text=""), - SimpleNamespace(element=FakeRunElement([]), text="plain"), + _element=[ + FakeRunChild([FakeBlip(None), FakeBlip("ext"), FakeBlip("int")], text=""), + FakeRunChild([], text="plain"), ], part=SimpleNamespace( rels={ @@ -452,6 +463,7 @@ def test_table_to_markdown_and_parse_helpers(monkeypatch): } ), ) + image_map = {"ext": "EXT-IMG", image_part: "INT-IMG"} assert extractor._parse_cell_paragraph(paragraph, image_map) == "EXT-IMGINT-IMGplain" @@ -625,3 +637,83 @@ def test_parse_docx_covers_drawing_shapes_hyperlink_error_and_table_branch(monke assert "BrokenLink" in content assert "TABLE-MARKDOWN" in content logger_exception.assert_called_once() + + +def test_parse_cell_paragraph_hyperlink_in_table_cell_http(): + doc = Document() + table = doc.add_table(rows=1, cols=1) + cell = table.cell(0, 0) + p = cell.paragraphs[0] + + # Build modern hyperlink inside table cell + r_id = "rIdHttp1" + hyperlink = OxmlElement("w:hyperlink") + hyperlink.set(qn("r:id"), r_id) + + run_elem = OxmlElement("w:r") + t = OxmlElement("w:t") + t.text = "Dify" + run_elem.append(t) + hyperlink.append(run_elem) + p._p.append(hyperlink) + + # Relationship for external http link + doc.part.rels.add_relationship( + "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink", + "https://dify.ai", + r_id, + is_external=True, + ) + + with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as tmp: + doc.save(tmp.name) + tmp_path = tmp.name + + try: + reopened = Document(tmp_path) + para = reopened.tables[0].cell(0, 0).paragraphs[0] + extractor = object.__new__(WordExtractor) + out = extractor._parse_cell_paragraph(para, {}) + assert out == "[Dify](https://dify.ai)" + finally: + if os.path.exists(tmp_path): + os.remove(tmp_path) + + +def test_parse_cell_paragraph_hyperlink_in_table_cell_mailto(): + doc = Document() + table = doc.add_table(rows=1, cols=1) + cell = table.cell(0, 0) + p = cell.paragraphs[0] + + r_id = "rIdMail1" + hyperlink = OxmlElement("w:hyperlink") + hyperlink.set(qn("r:id"), r_id) + + run_elem = OxmlElement("w:r") + t = OxmlElement("w:t") + t.text = "john@test.com" + run_elem.append(t) + hyperlink.append(run_elem) + p._p.append(hyperlink) + + doc.part.rels.add_relationship( + "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink", + "mailto:john@test.com", + r_id, + is_external=True, + ) + + with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as tmp: + doc.save(tmp.name) + tmp_path = tmp.name + + try: + reopened = Document(tmp_path) + para = reopened.tables[0].cell(0, 0).paragraphs[0] + extractor = object.__new__(WordExtractor) + out = extractor._parse_cell_paragraph(para, {}) + assert out == "[john@test.com](mailto:john@test.com)" + finally: + if os.path.exists(tmp_path): + os.remove(tmp_path) diff --git a/web/Dockerfile b/web/Dockerfile index 9b24f9ea0a..b54bae706c 100644 --- a/web/Dockerfile +++ b/web/Dockerfile @@ -35,7 +35,7 @@ COPY --from=packages /app/web/ . COPY . . ENV NODE_OPTIONS="--max-old-space-size=4096" -RUN pnpm build:docker +RUN pnpm build # production stage diff --git a/web/app/components/base/prompt-editor/plugins/on-blur-or-focus-block.tsx b/web/app/components/base/prompt-editor/plugins/on-blur-or-focus-block.tsx index 84eacb01ed..80c3ddba21 100644 --- a/web/app/components/base/prompt-editor/plugins/on-blur-or-focus-block.tsx +++ b/web/app/components/base/prompt-editor/plugins/on-blur-or-focus-block.tsx @@ -20,17 +20,21 @@ const OnBlurBlock: FC = ({ }) => { const [editor] = useLexicalComposerContext() - const ref = useRef(null) + const ref = useRef | null>(null) useEffect(() => { - return mergeRegister( + const clearHideMenuTimeout = () => { + if (ref.current) { + clearTimeout(ref.current) + ref.current = null + } + } + + const unregister = mergeRegister( editor.registerCommand( CLEAR_HIDE_MENU_TIMEOUT, () => { - if (ref.current) { - clearTimeout(ref.current) - ref.current = null - } + clearHideMenuTimeout() return true }, COMMAND_PRIORITY_EDITOR, @@ -41,6 +45,7 @@ const OnBlurBlock: FC = ({ // Check if the clicked target element is var-search-input const target = event?.relatedTarget as HTMLElement if (!target?.classList?.contains('var-search-input')) { + clearHideMenuTimeout() ref.current = setTimeout(() => { editor.dispatchCommand(KEY_ESCAPE_COMMAND, new KeyboardEvent('keydown', { key: 'Escape' })) }, 200) @@ -61,6 +66,11 @@ const OnBlurBlock: FC = ({ COMMAND_PRIORITY_EDITOR, ), ) + + return () => { + clearHideMenuTimeout() + unregister() + } }, [editor, onBlur, onFocus]) return null diff --git a/web/eslint-suppressions.json b/web/eslint-suppressions.json index d5b563420b..66d757ab50 100644 --- a/web/eslint-suppressions.json +++ b/web/eslint-suppressions.json @@ -3565,11 +3565,6 @@ "count": 2 } }, - "app/components/base/prompt-editor/plugins/on-blur-or-focus-block.tsx": { - "ts/no-explicit-any": { - "count": 1 - } - }, "app/components/base/prompt-editor/plugins/query-block/index.tsx": { "react-refresh/only-export-components": { "count": 2 @@ -12614,14 +12609,6 @@ "count": 2 } }, - "scripts/optimize-standalone.js": { - "e18e/prefer-static-regex": { - "count": 1 - }, - "unused-imports/no-unused-vars": { - "count": 2 - } - }, "scripts/refactor-component.js": { "e18e/prefer-static-regex": { "count": 14 diff --git a/web/eslint.config.mjs b/web/eslint.config.mjs index 145df1484e..de78e90548 100644 --- a/web/eslint.config.mjs +++ b/web/eslint.config.mjs @@ -218,3 +218,6 @@ export default antfu( }, }, ) + .disableRulesFix([ + 'e18e/prefer-array-at', + ]) diff --git a/web/package.json b/web/package.json index dcb060e6bd..30434e8707 100644 --- a/web/package.json +++ b/web/package.json @@ -26,39 +26,36 @@ "node": "^22.22.1" }, "scripts": { + "analyze": "next experimental-analyze", + "analyze-component": "node ./scripts/analyze-component.js", + "build": "next build", + "build:vinext": "vinext build", "dev": "next dev", "dev:inspect": "next dev --inspect", "dev:vinext": "vinext dev", - "build": "next build", - "build:docker": "next build && node scripts/optimize-standalone.js", - "build:vinext": "vinext build", - "start": "node ./scripts/copy-and-start.mjs", - "start:vinext": "vinext start", + "gen-doc-paths": "tsx ./scripts/gen-doc-paths.ts", + "gen-icons": "node ./scripts/gen-icons.mjs && eslint --fix app/components/base/icons/src/", + "i18n:check": "tsx ./scripts/check-i18n.js", + "knip": "knip", "lint": "eslint --cache --concurrency=auto", "lint:ci": "eslint --cache --concurrency 2", "lint:fix": "pnpm lint --fix", "lint:quiet": "pnpm lint --quiet", - "lint:complexity": "pnpm lint --rule 'complexity: [error, {max: 15}]' --quiet", - "lint:report": "pnpm lint --output-file eslint_report.json --format json", "lint:tss": "tsslint --project tsconfig.json", - "type-check": "tsc --noEmit", - "type-check:tsgo": "tsgo --noEmit", + "preinstall": "npx only-allow pnpm", "prepare": "cd ../ && node -e \"if (process.env.NODE_ENV !== 'production'){process.exit(1)} \" || husky ./web/.husky", - "gen-icons": "node ./scripts/gen-icons.mjs && eslint --fix app/components/base/icons/src/", - "gen-doc-paths": "tsx ./scripts/gen-doc-paths.ts", - "uglify-embed": "node ./bin/uglify-embed", - "i18n:check": "tsx ./scripts/check-i18n.js", - "test": "vitest run", - "test:coverage": "vitest run --coverage", - "test:ci": "vitest run --coverage --silent=passed-only", - "test:watch": "vitest --watch", - "analyze-component": "node ./scripts/analyze-component.js", "refactor-component": "node ./scripts/refactor-component.js", + "start": "node ./scripts/copy-and-start.mjs", + "start:vinext": "vinext start", "storybook": "storybook dev -p 6006", "storybook:build": "storybook build", - "preinstall": "npx only-allow pnpm", - "analyze": "next experimental-analyze", - "knip": "knip" + "test": "vitest run", + "test:ci": "vitest run --coverage --silent=passed-only", + "test:coverage": "vitest run --coverage", + "test:watch": "vitest --watch", + "type-check": "tsc --noEmit", + "type-check:tsgo": "tsgo --noEmit", + "uglify-embed": "node ./bin/uglify-embed" }, "dependencies": { "@amplitude/analytics-browser": "2.36.3", diff --git a/web/scripts/README.md b/web/scripts/README.md deleted file mode 100644 index 2c575a244c..0000000000 --- a/web/scripts/README.md +++ /dev/null @@ -1,38 +0,0 @@ -# Production Build Optimization Scripts - -## optimize-standalone.js - -This script removes unnecessary development dependencies from the Next.js standalone build output to reduce the production Docker image size. - -### What it does - -The script specifically targets and removes `jest-worker` packages that are bundled with Next.js but not needed in production. These packages are included because: - -1. Next.js includes jest-worker in its compiled dependencies -1. terser-webpack-plugin (used by Next.js for minification) depends on jest-worker -1. pnpm's dependency resolution creates symlinks to jest-worker in various locations - -### Usage - -The script is automatically run during Docker builds via the `build:docker` npm script: - -```bash -# Docker build (removes jest-worker after build) -pnpm build:docker -``` - -To run the optimization manually: - -```bash -node scripts/optimize-standalone.js -``` - -### What gets removed - -- `node_modules/.pnpm/next@*/node_modules/next/dist/compiled/jest-worker` -- `node_modules/.pnpm/terser-webpack-plugin@*/node_modules/jest-worker` (symlinks) -- `node_modules/.pnpm/jest-worker@*` (actual packages) - -### Impact - -Removing jest-worker saves approximately 36KB per instance from the production image. While this may seem small, it helps ensure production images only contain necessary runtime dependencies. diff --git a/web/scripts/optimize-standalone.js b/web/scripts/optimize-standalone.js deleted file mode 100644 index b73667eac6..0000000000 --- a/web/scripts/optimize-standalone.js +++ /dev/null @@ -1,163 +0,0 @@ -/** - * Script to optimize Next.js standalone output for production - * Removes unnecessary files like jest-worker that are bundled with Next.js - */ - -import fs from 'node:fs' -import path from 'node:path' -import { fileURLToPath } from 'node:url' - -const __filename = fileURLToPath(import.meta.url) -const __dirname = path.dirname(__filename) - -console.log('šŸ”§ Optimizing standalone output...') - -const standaloneDir = path.join(__dirname, '..', '.next', 'standalone') - -// Check if standalone directory exists -if (!fs.existsSync(standaloneDir)) { - console.error('āŒ Standalone directory not found. Please run "next build" first.') - process.exit(1) -} - -// List of paths to remove (relative to standalone directory) -const pathsToRemove = [ - // Remove jest-worker from Next.js compiled dependencies - 'node_modules/.pnpm/next@*/node_modules/next/dist/compiled/jest-worker', - // Remove jest-worker symlinks from terser-webpack-plugin - 'node_modules/.pnpm/terser-webpack-plugin@*/node_modules/jest-worker', - // Remove actual jest-worker packages (directories only, not symlinks) - 'node_modules/.pnpm/jest-worker@*', -] - -// Function to safely remove a path -function removePath(basePath, relativePath) { - const fullPath = path.join(basePath, relativePath) - - // Handle wildcard patterns - if (relativePath.includes('*')) { - const parts = relativePath.split('/') - let currentPath = basePath - - for (let i = 0; i < parts.length; i++) { - const part = parts[i] - if (part.includes('*')) { - // Find matching directories - if (fs.existsSync(currentPath)) { - const entries = fs.readdirSync(currentPath) - - // replace '*' with '.*' - const regexPattern = part.replace(/\*/g, '.*') - - const regex = new RegExp(`^${regexPattern}$`) - - for (const entry of entries) { - if (regex.test(entry)) { - const remainingPath = parts.slice(i + 1).join('/') - const matchedPath = path.join(currentPath, entry, remainingPath) - - try { - // Use lstatSync to check if path exists (works for both files and symlinks) - const stats = fs.lstatSync(matchedPath) - - if (stats.isSymbolicLink()) { - // Remove symlink - fs.unlinkSync(matchedPath) - console.log(`āœ… Removed symlink: ${path.relative(basePath, matchedPath)}`) - } - else { - // Remove directory/file - fs.rmSync(matchedPath, { recursive: true, force: true }) - console.log(`āœ… Removed: ${path.relative(basePath, matchedPath)}`) - } - } - catch (error) { - // Silently ignore ENOENT (path not found) errors - if (error.code !== 'ENOENT') { - console.error(`āŒ Failed to remove ${matchedPath}: ${error.message}`) - } - } - } - } - } - return - } - else { - currentPath = path.join(currentPath, part) - } - } - } - else { - // Direct path removal - if (fs.existsSync(fullPath)) { - try { - fs.rmSync(fullPath, { recursive: true, force: true }) - console.log(`āœ… Removed: ${relativePath}`) - } - catch (error) { - console.error(`āŒ Failed to remove ${fullPath}: ${error.message}`) - } - } - } -} - -// Remove unnecessary paths -console.log('šŸ—‘ļø Removing unnecessary files...') -for (const pathToRemove of pathsToRemove) { - removePath(standaloneDir, pathToRemove) -} - -// Calculate size reduction -console.log('\nšŸ“Š Optimization complete!') - -// Optional: Display the size of remaining jest-related files (if any) -const checkForJest = (dir) => { - const jestFiles = [] - - function walk(currentPath) { - if (!fs.existsSync(currentPath)) - return - - try { - const entries = fs.readdirSync(currentPath) - for (const entry of entries) { - const fullPath = path.join(currentPath, entry) - - try { - const stat = fs.lstatSync(fullPath) // Use lstatSync to handle symlinks - - if (stat.isDirectory() && !stat.isSymbolicLink()) { - // Skip node_modules subdirectories to avoid deep traversal - if (entry === 'node_modules' && currentPath !== standaloneDir) { - continue - } - walk(fullPath) - } - else if (stat.isFile() && entry.includes('jest')) { - jestFiles.push(path.relative(standaloneDir, fullPath)) - } - } - catch (err) { - // Skip files that can't be accessed - continue - } - } - } - catch (err) { - // Skip directories that can't be read - - } - } - - walk(dir) - return jestFiles -} - -const remainingJestFiles = checkForJest(standaloneDir) -if (remainingJestFiles.length > 0) { - console.log('\nāš ļø Warning: Some jest-related files still remain:') - remainingJestFiles.forEach(file => console.log(` - ${file}`)) -} -else { - console.log('\n✨ No jest-related files found in standalone output!') -}