mirror of
https://github.com/langgenius/dify.git
synced 2026-01-19 11:45:05 +08:00
fix: [xxx](xxx) render as xxx](xxx) (#30392)
Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
This commit is contained in:
@ -27,26 +27,44 @@ class CleanProcessor:
|
||||
pattern = r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)"
|
||||
text = re.sub(pattern, "", text)
|
||||
|
||||
# Remove URL but keep Markdown image URLs
|
||||
# First, temporarily replace Markdown image URLs with a placeholder
|
||||
markdown_image_pattern = r"!\[.*?\]\((https?://[^\s)]+)\)"
|
||||
placeholders: list[str] = []
|
||||
# Remove URL but keep Markdown image URLs and link URLs
|
||||
# Replace the ENTIRE markdown link/image with a single placeholder to protect
|
||||
# the link text (which might also be a URL) from being removed
|
||||
markdown_link_pattern = r"\[([^\]]*)\]\((https?://[^)]+)\)"
|
||||
markdown_image_pattern = r"!\[.*?\]\((https?://[^)]+)\)"
|
||||
placeholders: list[tuple[str, str, str]] = [] # (type, text, url)
|
||||
|
||||
def replace_with_placeholder(match, placeholders=placeholders):
|
||||
def replace_markdown_with_placeholder(match, placeholders=placeholders):
|
||||
link_type = "link"
|
||||
link_text = match.group(1)
|
||||
url = match.group(2)
|
||||
placeholder = f"__MARKDOWN_PLACEHOLDER_{len(placeholders)}__"
|
||||
placeholders.append((link_type, link_text, url))
|
||||
return placeholder
|
||||
|
||||
def replace_image_with_placeholder(match, placeholders=placeholders):
|
||||
link_type = "image"
|
||||
url = match.group(1)
|
||||
placeholder = f"__MARKDOWN_IMAGE_URL_{len(placeholders)}__"
|
||||
placeholders.append(url)
|
||||
return f""
|
||||
placeholder = f"__MARKDOWN_PLACEHOLDER_{len(placeholders)}__"
|
||||
placeholders.append((link_type, "image", url))
|
||||
return placeholder
|
||||
|
||||
text = re.sub(markdown_image_pattern, replace_with_placeholder, text)
|
||||
# Protect markdown links first
|
||||
text = re.sub(markdown_link_pattern, replace_markdown_with_placeholder, text)
|
||||
# Then protect markdown images
|
||||
text = re.sub(markdown_image_pattern, replace_image_with_placeholder, text)
|
||||
|
||||
# Now remove all remaining URLs
|
||||
url_pattern = r"https?://[^\s)]+"
|
||||
url_pattern = r"https?://\S+"
|
||||
text = re.sub(url_pattern, "", text)
|
||||
|
||||
# Finally, restore the Markdown image URLs
|
||||
for i, url in enumerate(placeholders):
|
||||
text = text.replace(f"__MARKDOWN_IMAGE_URL_{i}__", url)
|
||||
# Restore the Markdown links and images
|
||||
for i, (link_type, text_or_alt, url) in enumerate(placeholders):
|
||||
placeholder = f"__MARKDOWN_PLACEHOLDER_{i}__"
|
||||
if link_type == "link":
|
||||
text = text.replace(placeholder, f"[{text_or_alt}]({url})")
|
||||
else: # image
|
||||
text = text.replace(placeholder, f"")
|
||||
return text
|
||||
|
||||
def filter_string(self, text):
|
||||
|
||||
@ -4,6 +4,7 @@ import re
|
||||
def remove_leading_symbols(text: str) -> str:
|
||||
"""
|
||||
Remove leading punctuation or symbols from the given text.
|
||||
Preserves markdown links like [text](url) at the start.
|
||||
|
||||
Args:
|
||||
text (str): The input text to process.
|
||||
@ -11,6 +12,11 @@ def remove_leading_symbols(text: str) -> str:
|
||||
Returns:
|
||||
str: The text with leading punctuation or symbols removed.
|
||||
"""
|
||||
# Check if text starts with a markdown link - preserve it
|
||||
markdown_link_pattern = r"^\[([^\]]+)\]\((https?://[^)]+)\)"
|
||||
if re.match(markdown_link_pattern, text):
|
||||
return text
|
||||
|
||||
# Match Unicode ranges for punctuation and symbols
|
||||
# FIXME this pattern is confused quick fix for #11868 maybe refactor it later
|
||||
pattern = r'^[\[\]\u2000-\u2025\u2027-\u206F\u2E00-\u2E7F\u3000-\u300F\u3011-\u303F"#$%&\'()*+,./:;<=>?@^_`~]+'
|
||||
|
||||
Reference in New Issue
Block a user