Files
ragflow/agent/dsl_migration.py
Magicbook1108 1376c004a9 Fix: update docs generator (#14070)
### What problem does this PR solve?

Refactor: update docs generator

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

1. Support multiple document generator components and correctly display
messages in the message component. The document generator will not
overwrite other messages.

<img width="700" alt="Screenshot from 2026-04-13 13-56-17"
src="https://github.com/user-attachments/assets/3f3e06e8-33ce-4df1-8b05-510c86af70a4"
/>

2. Support Chinese content and ensure correct Markdown rendering in PDF
and DOCX
<img width="700" alt="image"
src="https://github.com/user-attachments/assets/69bf1f7b-261d-48e5-a9f3-8e94462b90ed"
/>

3. Simplify the configuration page and support more output formats
 
<img height="700" alt="image"
src="https://github.com/user-attachments/assets/8647374c-c055-4daa-ad71-cd9052eb138e"
/>

4. Hide the download option in every component except the message component
<img width="700" alt="image"
src="https://github.com/user-attachments/assets/a723dfcb-b60d-4eb5-b2f6-d41ca5955eb4"
/>

<img width="700" alt="image"
src="https://github.com/user-attachments/assets/a8762ac4-807b-4f0b-9287-65f82f7c9c98"
/>

5. Sanitize filename
 
<img width="700" alt="image"
src="https://github.com/user-attachments/assets/df49509f-37c0-40f9-b03d-bd6ce7fdefa8"
/>


6. Additional usability improvements
2026-04-14 15:24:43 +08:00

179 lines
6.5 KiB
Python

#
# Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import copy
import re
# Legacy operator name -> current operator name. Every rename lives in this one
# table so the migration code never has to hard-code an operator name.
COMPONENT_RENAMES: dict[str, str] = {
    "Splitter": "TokenChunker",
    "HierarchicalMerger": "TitleChunker",
    "PDFGenerator": "DocGenerator",
}

# Legacy canvas node type -> current canvas node type.
NODE_TYPE_RENAMES: dict[str, str] = {"splitterNode": "chunkerNode"}

# Matches a brace-delimited variable reference of the form "{<id>@<field>}".
# Capture groups:
#   1. opening brace(s) plus optional whitespace
#   2. the component id
#   3. the "@field" suffix
#   4. optional whitespace plus closing brace(s)
VARIABLE_REF_PATTERN: re.Pattern[str] = re.compile(
    r"(\{+\s*)([A-Za-z0-9:_-]+)(@[A-Za-z0-9_.-]+)(\s*\}+)"
)
def normalize_chunker_dsl(dsl: dict) -> dict:
    """
    Rewrite legacy chunker component names and ids into the current DSL schema.

    This is intentionally a pure migration step:
    - it does not change business params
    - it only rewrites structural identifiers used by the canvas/runtime
    - custom human-authored names are preserved unless they are still the exact
      built-in legacy operator name
    - keys that are absent from the input are never added to the output

    Args:
        dsl: The canvas DSL. Anything that is not a dict is returned unchanged.

    Returns:
        A rewritten deep copy of ``dsl``; the argument itself is not mutated.
        When ``dsl["components"]`` is missing or is not a dict, the deep copy
        is returned without any further rewriting.
    """
    if not isinstance(dsl, dict):
        return dsl

    normalized = copy.deepcopy(dsl)
    components = normalized.get("components")
    if not isinstance(components, dict):
        return normalized

    # Old component id -> new component id. Ids that need no rename map to
    # themselves so that every known id is present as a key.
    component_id_map: dict[str, str] = {}
    for component_id in components:
        new_component_id = component_id
        # Only string ids can carry a "<LegacyName>:" prefix.
        if isinstance(component_id, str):
            for old_name, new_name in COMPONENT_RENAMES.items():
                prefix = f"{old_name}:"
                if component_id.startswith(prefix):
                    new_component_id = f"{new_name}:{component_id[len(prefix):]}"
                    break
        component_id_map[component_id] = new_component_id

    def remap_ids(value):
        """Map exact component ids, descending into lists; leave the rest as is."""
        if isinstance(value, str):
            return component_id_map.get(value, value)
        if isinstance(value, list):
            return [remap_ids(item) for item in value]
        # Non-string entries (dicts, numbers, None, ...) cannot be component
        # ids; looking up an unhashable one in the map would raise TypeError.
        return value

    def rewrite_variable_refs(text: str) -> str:
        # A string that is exactly a component id is replaced wholesale.
        if text in component_id_map:
            return component_id_map[text]

        def repl(match: re.Match[str]) -> str:
            component_id = match.group(2)
            return (
                match.group(1)
                + component_id_map.get(component_id, component_id)
                + match.group(3)
                + match.group(4)
            )

        # Otherwise rewrite only the id part of embedded variable references.
        return VARIABLE_REF_PATTERN.sub(repl, text)

    def rewrite_value(value):
        """Recursively rewrite strings in lists and dict values (keys are kept)."""
        if isinstance(value, str):
            return rewrite_variable_refs(value)
        if isinstance(value, list):
            return [rewrite_value(item) for item in value]
        if isinstance(value, dict):
            return {key: rewrite_value(item) for key, item in value.items()}
        return value

    rewritten_components = {}
    for old_component_id, component in components.items():
        new_component = rewrite_value(component)
        if isinstance(new_component, dict):
            obj = new_component.get("obj")
            if isinstance(obj, dict):
                component_name = obj.get("component_name")
                # Only touch an existing string name: a missing key must stay
                # missing, and a non-string value is not a legacy operator name.
                if isinstance(component_name, str):
                    obj["component_name"] = COMPONENT_RENAMES.get(component_name, component_name)
            for link_key in ("downstream", "upstream"):
                links = new_component.get(link_key)
                if isinstance(links, list):
                    new_component[link_key] = [remap_ids(item) for item in links]
            parent_id = new_component.get("parent_id")
            if isinstance(parent_id, str):
                new_component["parent_id"] = component_id_map.get(parent_id, parent_id)
        rewritten_components[component_id_map[old_component_id]] = new_component
    normalized["components"] = rewritten_components

    path = normalized.get("path")
    if isinstance(path, list):
        # Entries may themselves be lists; remap_ids walks them instead of
        # using a list as a dict key.
        normalized["path"] = [remap_ids(item) for item in path]

    graph = normalized.get("graph")
    if isinstance(graph, dict):
        nodes = graph.get("nodes")
        if isinstance(nodes, list):
            for node in nodes:
                if not isinstance(node, dict):
                    continue
                node_id = node.get("id")
                if isinstance(node_id, str):
                    node["id"] = component_id_map.get(node_id, node_id)
                parent_id = node.get("parentId")
                if isinstance(parent_id, str):
                    node["parentId"] = component_id_map.get(parent_id, parent_id)
                node_type = node.get("type")
                if isinstance(node_type, str):
                    node["type"] = NODE_TYPE_RENAMES.get(node_type, node_type)
                data = node.get("data")
                if not isinstance(data, dict):
                    continue
                label = data.get("label")
                if isinstance(label, str):
                    data["label"] = COMPONENT_RENAMES.get(label, label)
                # A custom name is kept; only the exact legacy operator name
                # is renamed.
                name = data.get("name")
                if isinstance(name, str) and name in COMPONENT_RENAMES:
                    data["name"] = COMPONENT_RENAMES[name]
                if "form" in data:
                    data["form"] = rewrite_value(data["form"])

        edges = graph.get("edges")
        if isinstance(edges, list):
            # Edge ids embed component ids as substrings. Only ids that really
            # changed need replacing (replacing an id with itself is a no-op);
            # longest first, so a shorter id cannot clobber part of a longer one.
            replacements = sorted(
                (
                    (old_id, new_id)
                    for old_id, new_id in component_id_map.items()
                    if old_id != new_id
                ),
                key=lambda item: len(item[0]),
                reverse=True,
            )
            for edge in edges:
                if not isinstance(edge, dict):
                    continue
                for key in ("source", "target"):
                    value = edge.get(key)
                    if isinstance(value, str):
                        edge[key] = component_id_map.get(value, value)
                edge_id = edge.get("id")
                if isinstance(edge_id, str):
                    for old_component_id, new_component_id in replacements:
                        edge_id = edge_id.replace(old_component_id, new_component_id)
                    edge["id"] = edge_id

    # These top-level sections may also contain ids or variable references.
    for key in ("history", "messages", "reference"):
        if key in normalized:
            normalized[key] = rewrite_value(normalized[key])
    return normalized