mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-05-05 01:37:46 +08:00
Feature rtl support (#13118)
### What problem does this PR solve?

This PR adds comprehensive **Right-to-Left (RTL) language support**, primarily targeting Arabic and other RTL scripts (Hebrew, Persian, Urdu, etc.).

Previously, RTL content had multiple rendering issues:

- Incorrect sentence splitting for Arabic punctuation in citation logic
- Misaligned text in chat messages and markdown components
- Improper positioning of blockquotes and “think” sections
- Incorrect table alignment
- Citation placement ambiguity in RTL prompts
- UI layout inconsistencies when mixing LTR and RTL text

This PR introduces backend and frontend improvements to properly detect, render, and style RTL content while preserving existing LTR behavior.

#### Backend

- Updated sentence boundary regex in `rag/nlp/search.py` to include Arabic punctuation:
  - `،` (comma)
  - `؛` (semicolon)
  - `؟` (question mark)
  - `۔` (Arabic full stop)
- Ensures citation insertion works correctly in RTL sentences.
- Updated citation prompt instructions to clarify citation placement rules for RTL languages.

#### Frontend

- Introduced a new utility: `text-direction.ts`
  - Detects text direction based on Unicode ranges.
  - Supports Arabic, Hebrew, Syriac, Thaana, and related scripts.
  - Provides `getDirAttribute()` for automatic `dir` assignment.
- Applied dynamic `dir` attributes across:
  - Markdown rendering
  - Chat messages
  - Search results
  - Tables
  - Hover cards and reference popovers
- Added proper RTL styling in LESS:
  - Text alignment adjustments
  - Blockquote border flipping
  - Section indentation correction
  - Table direction switching
- Use of `<bdi>` for figure labels to prevent bidirectional conflicts

#### DevOps / Environment

- Added Windows backend launch script with retry handling.
- Updated dependency metadata.
- Adjusted development-only React debugging behavior.
---

### Type of change

- [x] Bug Fix (non-breaking change which fixes RTL rendering and citation issues)
- [x] New Feature (non-breaking change which adds RTL detection and dynamic direction handling)

---------

Co-authored-by: 6ba3i <isbaaoui09@gmail.com>
Co-authored-by: Ahmad Intisar <ahmadintisar@Ahmads-MacBook-M4-Pro.local>
Co-authored-by: Ahmad Intisar <168020872+ahmadintisar@users.noreply.github.com>
Co-authored-by: Liu An <asiro@qq.com>
This commit is contained in:
@ -35,6 +35,7 @@ from api.db.services.llm_service import LLMBundle
|
||||
from common.metadata_utils import apply_meta_data_filter
|
||||
from api.db.services.tenant_llm_service import TenantLLMService
|
||||
from common.time_utils import current_timestamp, datetime_format
|
||||
from common.text_utils import normalize_arabic_digits
|
||||
from rag.graphrag.general.mind_map_extractor import MindMapExtractor
|
||||
from rag.advanced_rag import DeepResearcher
|
||||
from rag.app.tag import label_question
|
||||
@ -377,10 +378,12 @@ BAD_CITATION_PATTERNS = [
|
||||
re.compile(r"【\s*ID\s*[: ]*\s*(\d+)\s*】"), # 【ID: 12】
|
||||
re.compile(r"ref\s*(\d+)", flags=re.IGNORECASE), # ref12、REF 12
|
||||
]
|
||||
CITATION_MARKER_PATTERN = re.compile(r"\[(?:ID:)?([0-9\u0660-\u0669\u06F0-\u06F9]+)\]")
|
||||
|
||||
|
||||
def repair_bad_citation_formats(answer: str, kbinfos: dict, idx: set):
|
||||
max_index = len(kbinfos["chunks"])
|
||||
normalized_answer = normalize_arabic_digits(answer) or ""
|
||||
|
||||
def safe_add(i):
|
||||
if 0 <= i < max_index:
|
||||
@ -388,19 +391,36 @@ def repair_bad_citation_formats(answer: str, kbinfos: dict, idx: set):
|
||||
return True
|
||||
return False
|
||||
|
||||
def find_and_replace(pattern, group_index=1, repl=lambda i: f"ID:{i}", flags=0):
|
||||
def find_and_replace(pattern, group_index=1, repl=lambda digits: f"ID:{digits}"):
|
||||
nonlocal answer
|
||||
nonlocal normalized_answer
|
||||
|
||||
def replacement(match):
|
||||
matches = list(pattern.finditer(normalized_answer))
|
||||
if not matches:
|
||||
return
|
||||
|
||||
parts = []
|
||||
last_idx = 0
|
||||
for match in matches:
|
||||
parts.append(answer[last_idx:match.start()])
|
||||
try:
|
||||
i = int(match.group(group_index))
|
||||
if safe_add(i):
|
||||
return f"[{repl(i)}]"
|
||||
except Exception:
|
||||
pass
|
||||
return match.group(0)
|
||||
parts.append(answer[match.start():match.end()])
|
||||
last_idx = match.end()
|
||||
continue
|
||||
|
||||
answer = re.sub(pattern, replacement, answer, flags=flags)
|
||||
if safe_add(i):
|
||||
digit_start, digit_end = match.span(group_index)
|
||||
digits_original = answer[digit_start:digit_end]
|
||||
parts.append(f"[{repl(digits_original)}]")
|
||||
else:
|
||||
parts.append(answer[match.start():match.end()])
|
||||
last_idx = match.end()
|
||||
|
||||
parts.append(answer[last_idx:])
|
||||
answer = "".join(parts)
|
||||
normalized_answer = normalize_arabic_digits(answer) or ""
|
||||
|
||||
for pattern in BAD_CITATION_PATTERNS:
|
||||
find_and_replace(pattern)
|
||||
@ -627,7 +647,8 @@ async def async_chat(dialog, messages, stream=True, **kwargs):
|
||||
|
||||
if knowledges and (prompt_config.get("quote", True) and kwargs.get("quote", True)):
|
||||
idx = set([])
|
||||
if embd_mdl and not re.search(r"\[ID:([0-9]+)\]", answer):
|
||||
normalized_answer = normalize_arabic_digits(answer) or ""
|
||||
if embd_mdl and not CITATION_MARKER_PATTERN.search(normalized_answer):
|
||||
answer, idx = retriever.insert_citations(
|
||||
answer,
|
||||
[ck["content_ltks"] for ck in kbinfos["chunks"]],
|
||||
@ -637,7 +658,7 @@ async def async_chat(dialog, messages, stream=True, **kwargs):
|
||||
vtweight=dialog.vector_similarity_weight,
|
||||
)
|
||||
else:
|
||||
for match in re.finditer(r"\[ID:([0-9]+)\]", answer):
|
||||
for match in CITATION_MARKER_PATTERN.finditer(normalized_answer):
|
||||
i = int(match.group(1))
|
||||
if i < len(kbinfos["chunks"]):
|
||||
idx.add(i)
|
||||
|
||||
Reference in New Issue
Block a user