From 9007109a6bf97ddd1766e13313a36d16b6d11182 Mon Sep 17 00:00:00 2001 From: wangxiaolei Date: Wed, 31 Dec 2025 10:30:15 +0800 Subject: [PATCH] fix: [xxx](xxx) render as xxx](xxx) (#30392) Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com> --- api/core/rag/cleaner/clean_processor.py | 44 ++-- api/core/tools/utils/text_processing_utils.py | 6 + .../unit_tests/core/rag/cleaner/__init__.py | 0 .../core/rag/cleaner/test_clean_processor.py | 213 ++++++++++++++++++ .../unit_tests/utils/test_text_processing.py | 5 + 5 files changed, 255 insertions(+), 13 deletions(-) create mode 100644 api/tests/unit_tests/core/rag/cleaner/__init__.py create mode 100644 api/tests/unit_tests/core/rag/cleaner/test_clean_processor.py diff --git a/api/core/rag/cleaner/clean_processor.py b/api/core/rag/cleaner/clean_processor.py index 9cb009035b..e182c35b99 100644 --- a/api/core/rag/cleaner/clean_processor.py +++ b/api/core/rag/cleaner/clean_processor.py @@ -27,26 +27,44 @@ class CleanProcessor: pattern = r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)" text = re.sub(pattern, "", text) - # Remove URL but keep Markdown image URLs - # First, temporarily replace Markdown image URLs with a placeholder - markdown_image_pattern = r"!\[.*?\]\((https?://[^\s)]+)\)" - placeholders: list[str] = [] + # Remove URL but keep Markdown image URLs and link URLs + # Replace the ENTIRE markdown link/image with a single placeholder to protect + # the link text (which might also be a URL) from being removed + markdown_link_pattern = r"\[([^\]]*)\]\((https?://[^)]+)\)" + markdown_image_pattern = r"!\[.*?\]\((https?://[^)]+)\)" + placeholders: list[tuple[str, str, str]] = [] # (type, text, url) - def replace_with_placeholder(match, placeholders=placeholders): + def replace_markdown_with_placeholder(match, placeholders=placeholders): + link_type = "link" + link_text = match.group(1) + url = match.group(2) + placeholder = f"__MARKDOWN_PLACEHOLDER_{len(placeholders)}__" + placeholders.append((link_type, link_text, url)) + return placeholder + + def replace_image_with_placeholder(match, placeholders=placeholders): + link_type = "image" url = match.group(1) - placeholder = f"__MARKDOWN_IMAGE_URL_{len(placeholders)}__" - placeholders.append(url) - return f"![image]({placeholder})" + placeholder = f"__MARKDOWN_PLACEHOLDER_{len(placeholders)}__" + placeholders.append((link_type, "image", url)) + return placeholder - text = re.sub(markdown_image_pattern, replace_with_placeholder, text) + # Protect markdown links first + text = re.sub(markdown_link_pattern, replace_markdown_with_placeholder, text) + # Then protect markdown images + text = re.sub(markdown_image_pattern, replace_image_with_placeholder, text) # Now remove all remaining URLs - url_pattern = r"https?://[^\s)]+" + url_pattern = r"https?://\S+" text = re.sub(url_pattern, "", text) - # Finally, restore the Markdown image URLs - for i, url in enumerate(placeholders): - text = text.replace(f"__MARKDOWN_IMAGE_URL_{i}__", url) + # Restore the Markdown links and images + for i, (link_type, text_or_alt, url) in enumerate(placeholders): + placeholder = f"__MARKDOWN_PLACEHOLDER_{i}__" + if link_type == "link": + text = text.replace(placeholder, f"[{text_or_alt}]({url})") + else: # image + text = text.replace(placeholder, f"![{text_or_alt}]({url})") return text def filter_string(self, text): diff --git a/api/core/tools/utils/text_processing_utils.py b/api/core/tools/utils/text_processing_utils.py index 0f9a91a111..4bfaa5e49b 100644 --- a/api/core/tools/utils/text_processing_utils.py +++ b/api/core/tools/utils/text_processing_utils.py @@ -4,6 +4,7 @@ import re def remove_leading_symbols(text: str) -> str: """ Remove leading punctuation or symbols from the given text. + Preserves markdown links like [text](url) at the start. Args: text (str): The input text to process. @@ -11,6 +12,11 @@ def remove_leading_symbols(text: str) -> str: Returns: str: The text with leading punctuation or symbols removed. """ + # Check if text starts with a markdown link - preserve it + markdown_link_pattern = r"^\[([^\]]+)\]\((https?://[^)]+)\)" + if re.match(markdown_link_pattern, text): + return text + # Match Unicode ranges for punctuation and symbols # FIXME this pattern is confused quick fix for #11868 maybe refactor it later pattern = r'^[\[\]\u2000-\u2025\u2027-\u206F\u2E00-\u2E7F\u3000-\u300F\u3011-\u303F"#$%&\'()*+,./:;<=>?@^_`~]+' diff --git a/api/tests/unit_tests/core/rag/cleaner/__init__.py b/api/tests/unit_tests/core/rag/cleaner/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/api/tests/unit_tests/core/rag/cleaner/test_clean_processor.py b/api/tests/unit_tests/core/rag/cleaner/test_clean_processor.py new file mode 100644 index 0000000000..65ee62b8dd --- /dev/null +++ b/api/tests/unit_tests/core/rag/cleaner/test_clean_processor.py @@ -0,0 +1,213 @@ +from core.rag.cleaner.clean_processor import CleanProcessor + + +class TestCleanProcessor: + """Test cases for CleanProcessor.clean method.""" + + def test_clean_default_removal_of_invalid_symbols(self): + """Test default cleaning removes invalid symbols.""" + # Test <| replacement + assert CleanProcessor.clean("text<|with<|invalid", None) == "text replacement + assert CleanProcessor.clean("text|>with|>invalid", None) == "text>with>invalid" + + # Test removal of control characters + text_with_control = "normal\x00text\x1fwith\x07control\x7fchars" + expected = "normaltextwithcontrolchars" + assert CleanProcessor.clean(text_with_control, None) == expected + + # Test U+FFFE removal + text_with_ufffe = "normal\ufffepadding" + expected = "normalpadding" + assert CleanProcessor.clean(text_with_ufffe, None) == expected + + def test_clean_with_none_process_rule(self): + """Test cleaning with None process_rule - only default cleaning applied.""" + text = "Hello<|World\x00" + expected = "Hello becomes >, control chars and U+FFFE are removed + assert CleanProcessor.clean(text, None) == "<<>>" + + def test_clean_multiple_markdown_links_preserved(self): + """Test multiple markdown links are all preserved.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": True}]}} + + text = "[One](https://one.com) [Two](http://two.org) [Three](https://three.net)" + expected = "[One](https://one.com) [Two](http://two.org) [Three](https://three.net)" + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_markdown_link_text_as_url(self): + """Test markdown link where the link text itself is a URL.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": True}]}} + + # Link text that looks like URL should be preserved + text = "[https://text-url.com](https://actual-url.com)" + expected = "[https://text-url.com](https://actual-url.com)" + assert CleanProcessor.clean(text, process_rule) == expected + + # Text URL without markdown should be removed + text = "https://text-url.com https://actual-url.com" + expected = " " + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_complex_markdown_link_content(self): + """Test markdown links with complex content - known limitation with brackets in link text.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": True}]}} + + # Note: The regex pattern [^\]]* cannot handle ] within link text + # This is a known limitation - the pattern stops at the first ] + text = "[Text with [brackets] and (parens)](https://example.com)" + # Actual behavior: only matches up to first ], URL gets removed + expected = "[Text with [brackets] and (parens)](" + assert CleanProcessor.clean(text, process_rule) == expected + + # Test that properly formatted markdown links work + text = "[Text with (parens) and symbols](https://example.com)" + expected = "[Text with (parens) and symbols](https://example.com)" + assert CleanProcessor.clean(text, process_rule) == expected diff --git a/api/tests/unit_tests/utils/test_text_processing.py b/api/tests/unit_tests/utils/test_text_processing.py index 11e017464a..bf61162a66 100644 --- a/api/tests/unit_tests/utils/test_text_processing.py +++ b/api/tests/unit_tests/utils/test_text_processing.py @@ -15,6 +15,11 @@ from core.tools.utils.text_processing_utils import remove_leading_symbols ("", ""), (" ", " "), ("【测试】", "【测试】"), + # Markdown link preservation - should be preserved if text starts with a markdown link + ("[Google](https://google.com) is a search engine", "[Google](https://google.com) is a search engine"), + ("[Example](http://example.com) some text", "[Example](http://example.com) some text"), + # Leading symbols before markdown link are removed, including the opening bracket [ + ("@[Test](https://example.com)", "Test](https://example.com)"), ], ) def test_remove_leading_symbols(input_text, expected_output):