Feature: RTL support (#13118)

### What problem does this PR solve?

This PR adds comprehensive **Right-to-Left (RTL) language support**, primarily targeting Arabic and other RTL scripts (Hebrew, Persian, Urdu, etc.).

Previously, RTL content had multiple rendering issues:

- Incorrect sentence splitting for Arabic punctuation in citation logic
- Misaligned text in chat messages and markdown components
- Improper positioning of blockquotes and “think” sections
- Incorrect table alignment
- Citation placement ambiguity in RTL prompts
- UI layout inconsistencies when mixing LTR and RTL text

This PR introduces backend and frontend improvements to properly detect, render, and style RTL content while preserving existing LTR behavior.

#### Backend

- Updated sentence boundary regex in `rag/nlp/search.py` to include Arabic punctuation:
  - `،` (comma)
  - `؛` (semicolon)
  - `؟` (question mark)
  - `۔` (Arabic full stop)
- Ensures citation insertion works correctly in RTL sentences.
- Updated citation prompt instructions to clarify citation placement rules for RTL languages.

#### Frontend

- Introduced a new utility: `text-direction.ts`
  - Detects text direction based on Unicode ranges.
  - Supports Arabic, Hebrew, Syriac, Thaana, and related scripts.
  - Provides `getDirAttribute()` for automatic `dir` assignment.
- Applied dynamic `dir` attributes across:
  - Markdown rendering
  - Chat messages
  - Search results
  - Tables
  - Hover cards and reference popovers
- Added proper RTL styling in LESS:
  - Text alignment adjustments
  - Blockquote border flipping
  - Section indentation correction
  - Table direction switching
- Use of `<bdi>` for figure labels to prevent bidirectional conflicts

#### DevOps / Environment

- Added Windows backend launch script with retry handling.
- Updated dependency metadata.
- Adjusted development-only React debugging behavior.
---

### Type of change

- [x] Bug Fix (non-breaking change which fixes RTL rendering and citation issues)
- [x] New Feature (non-breaking change which adds RTL detection and dynamic direction handling)

---------

Co-authored-by: 6ba3i <isbaaoui09@gmail.com>
Co-authored-by: Ahmad Intisar <ahmadintisar@Ahmads-MacBook-M4-Pro.local>
Co-authored-by: Ahmad Intisar <168020872+ahmadintisar@users.noreply.github.com>
Co-authored-by: Liu An <asiro@qq.com>
2026-05-05 01:37:46 +08:00 · 2026-03-02 08:03:44 +03:00
parent a897aedea9
commit 21bc1ab7ec
54 changed files with 828 additions and 303 deletions
--- a/api/db/services/dialog_service.py
+++ b/api/db/services/dialog_service.py
@@ -35,6 +35,7 @@ from api.db.services.llm_service import LLMBundle
 from common.metadata_utils import apply_meta_data_filter
 from api.db.services.tenant_llm_service import TenantLLMService
 from common.time_utils import current_timestamp, datetime_format
+from common.text_utils import normalize_arabic_digits
 from rag.graphrag.general.mind_map_extractor import MindMapExtractor
 from rag.advanced_rag import DeepResearcher
 from rag.app.tag import label_question
@@ -377,10 +378,12 @@ BAD_CITATION_PATTERNS = [
    re.compile(r"【\s*ID\s*[: ]*\s*(\d+)\s*】"),  # 【ID: 12】
    re.compile(r"ref\s*(\d+)", flags=re.IGNORECASE),  # ref12、REF 12
 ]
+CITATION_MARKER_PATTERN = re.compile(r"\[(?:ID:)?([0-9\u0660-\u0669\u06F0-\u06F9]+)\]")


 def repair_bad_citation_formats(answer: str, kbinfos: dict, idx: set):
    max_index = len(kbinfos["chunks"])
+    normalized_answer = normalize_arabic_digits(answer) or ""

    def safe_add(i):
        if 0 <= i < max_index:
@@ -388,19 +391,36 @@ def repair_bad_citation_formats(answer: str, kbinfos: dict, idx: set):
            return True
        return False

-    def find_and_replace(pattern, group_index=1, repl=lambda i: f"ID:{i}", flags=0):
+    def find_and_replace(pattern, group_index=1, repl=lambda digits: f"ID:{digits}"):
        nonlocal answer
+        nonlocal normalized_answer

-        def replacement(match):
+        matches = list(pattern.finditer(normalized_answer))
+        if not matches:
+            return
+
+        parts = []
+        last_idx = 0
+        for match in matches:
+            parts.append(answer[last_idx:match.start()])
            try:
                i = int(match.group(group_index))
-                if safe_add(i):
-                    return f"[{repl(i)}]"
            except Exception:
-                pass
-            return match.group(0)
+                parts.append(answer[match.start():match.end()])
+                last_idx = match.end()
+                continue

-        answer = re.sub(pattern, replacement, answer, flags=flags)
+            if safe_add(i):
+                digit_start, digit_end = match.span(group_index)
+                digits_original = answer[digit_start:digit_end]
+                parts.append(f"[{repl(digits_original)}]")
+            else:
+                parts.append(answer[match.start():match.end()])
+            last_idx = match.end()
+
+        parts.append(answer[last_idx:])
+        answer = "".join(parts)
+        normalized_answer = normalize_arabic_digits(answer) or ""

    for pattern in BAD_CITATION_PATTERNS:
        find_and_replace(pattern)
@@ -627,7 +647,8 @@ async def async_chat(dialog, messages, stream=True, **kwargs):

        if knowledges and (prompt_config.get("quote", True) and kwargs.get("quote", True)):
            idx = set([])
-            if embd_mdl and not re.search(r"\[ID:([0-9]+)\]", answer):
+            normalized_answer = normalize_arabic_digits(answer) or ""
+            if embd_mdl and not CITATION_MARKER_PATTERN.search(normalized_answer):
                answer, idx = retriever.insert_citations(
                    answer,
                    [ck["content_ltks"] for ck in kbinfos["chunks"]],
@@ -637,7 +658,7 @@ async def async_chat(dialog, messages, stream=True, **kwargs):
                    vtweight=dialog.vector_similarity_weight,
                )
            else:
-                for match in re.finditer(r"\[ID:([0-9]+)\]", answer):
+                for match in CITATION_MARKER_PATTERN.finditer(normalized_answer):
                    i = int(match.group(1))
                    if i < len(kbinfos["chunks"]):
                        idx.add(i)