mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-05-23 01:18:22 +08:00
### What problem does this PR solve? Refactor: update the document generator ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) 1. Support multiple document generator components and correctly display messages in the message component. The document generator will not overwrite other messages. <img width="700" alt="Screenshot from 2026-04-13 13-56-17" src="https://github.com/user-attachments/assets/3f3e06e8-33ce-4df1-8b05-510c86af70a4" /> 2. Support Chinese content and ensure correct Markdown rendering in PDF and DOCX <img width="700" alt="image" src="https://github.com/user-attachments/assets/69bf1f7b-261d-48e5-a9f3-8e94462b90ed" /> 3. Simplify the configuration page and support more output formats <img height="700" alt="image" src="https://github.com/user-attachments/assets/8647374c-c055-4daa-ad71-cd9052eb138e" /> 4. Hide the download option in all components except the message component <img width="700" alt="image" src="https://github.com/user-attachments/assets/a723dfcb-b60d-4eb5-b2f6-d41ca5955eb4" /> <img width="700" alt="image" src="https://github.com/user-attachments/assets/a8762ac4-807b-4f0b-9287-65f82f7c9c98" /> 5. Sanitize file names <img width="700" alt="image" src="https://github.com/user-attachments/assets/df49509f-37c0-40f9-b03d-bd6ce7fdefa8" /> 6. Other usability improvements
179 lines
6.5 KiB
Python
179 lines
6.5 KiB
Python
#
|
|
# Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
#
|
|
|
|
import copy
|
|
import re
|
|
|
|
|
|
# Keep all legacy chunker renames in one place so the migration rule stays readable.
# Keys are legacy built-in component names, values are the current names. The same
# table is applied to component-id prefixes ("<Name>:<suffix>"), to
# ``obj["component_name"]`` and to the graph node ``label`` / ``name`` fields.
COMPONENT_RENAMES = {
    "Splitter": "TokenChunker",
    "HierarchicalMerger": "TitleChunker",
    "PDFGenerator": "DocGenerator",
}

# Legacy graph node ``type`` values mapped to their current equivalents.
NODE_TYPE_RENAMES = {
    "splitterNode": "chunkerNode",
}

# Matches a brace-delimited variable reference such as ``{Splitter:abc@chunks}``.
# Groups: 1 = opening brace(s) + optional whitespace, 2 = component id,
# 3 = "@variable" suffix, 4 = optional whitespace + closing brace(s).
VARIABLE_REF_PATTERN = re.compile(r"(\{+\s*)([A-Za-z0-9:_-]+)(@[A-Za-z0-9_.-]+)(\s*\}+)")


def _renamed_component_id(component_id):
    """Return ``component_id`` with a legacy ``<Name>:`` prefix replaced by its current name.

    Ids that do not start with a legacy prefix, and keys that are not strings,
    are returned unchanged.
    """
    if not isinstance(component_id, str):
        return component_id
    for old_name, new_name in COMPONENT_RENAMES.items():
        prefix = f"{old_name}:"
        if component_id.startswith(prefix):
            return f"{new_name}:{component_id[len(prefix):]}"
    return component_id


def normalize_chunker_dsl(dsl: dict) -> dict:
    """
    Rewrite legacy chunker component names and ids into the current DSL schema.

    This is intentionally a pure migration step:
    - it does not change business params
    - it only rewrites structural identifiers used by the canvas/runtime
    - custom human-authored names are preserved unless they are still the exact
      built-in legacy operator name

    Args:
        dsl: The canvas DSL. Anything that is not a dict is returned as-is.

    Returns:
        A deep copy of ``dsl`` with legacy ids/names rewritten; the input is never
        mutated. If ``dsl["components"]`` is missing or not a dict, the copy is
        returned without any rewriting.
    """
    if not isinstance(dsl, dict):
        return dsl

    # Work on a private copy so the caller's DSL is left untouched.
    normalized = copy.deepcopy(dsl)
    components = normalized.get("components")
    if not isinstance(components, dict):
        return normalized

    # old component id -> new component id (identity for ids that need no rename).
    component_id_map = {
        component_id: _renamed_component_id(component_id)
        for component_id in components
    }

    def remap_id(value):
        """Map a string that is exactly a known component id; pass anything else through."""
        if isinstance(value, str):
            return component_id_map.get(value, value)
        return value

    def rewrite_variable_refs(text: str) -> str:
        """Rewrite a bare component id, or every ``{id@var}`` reference inside ``text``."""
        if text in component_id_map:
            return component_id_map[text]

        def repl(match: re.Match[str]) -> str:
            component_id = match.group(2)
            # Only the component id changes; braces, whitespace and "@var" are kept.
            return (
                match.group(1)
                + component_id_map.get(component_id, component_id)
                + match.group(3)
                + match.group(4)
            )

        return VARIABLE_REF_PATTERN.sub(repl, text)

    def rewrite_value(value):
        """Recursively rewrite strings in lists and dict values (dict keys are kept)."""
        if isinstance(value, str):
            return rewrite_variable_refs(value)
        if isinstance(value, list):
            return [rewrite_value(item) for item in value]
        if isinstance(value, dict):
            return {key: rewrite_value(item) for key, item in value.items()}
        return value

    rewritten_components = {}
    for old_component_id, component in components.items():
        # rewrite_value maps every string that is exactly a known component id, so
        # ``downstream`` / ``upstream`` / ``parent_id`` need no separate pass.
        new_component = rewrite_value(component)

        if isinstance(new_component, dict):
            obj = new_component.get("obj")
            if isinstance(obj, dict):
                component_name = obj.get("component_name")
                # Touch the field only for an exact legacy name, so a missing key is
                # not turned into ``"component_name": None``.
                if isinstance(component_name, str) and component_name in COMPONENT_RENAMES:
                    obj["component_name"] = COMPONENT_RENAMES[component_name]

        rewritten_components[component_id_map[old_component_id]] = new_component

    normalized["components"] = rewritten_components

    if isinstance(normalized.get("path"), list):
        normalized["path"] = [remap_id(component_id) for component_id in normalized["path"]]

    graph = normalized.get("graph")
    if isinstance(graph, dict):
        nodes = graph.get("nodes")
        if isinstance(nodes, list):
            for node in nodes:
                if not isinstance(node, dict):
                    continue

                if isinstance(node.get("id"), str):
                    node["id"] = remap_id(node["id"])

                if isinstance(node.get("parentId"), str):
                    node["parentId"] = remap_id(node["parentId"])

                node_type = node.get("type")
                if isinstance(node_type, str):
                    node["type"] = NODE_TYPE_RENAMES.get(node_type, node_type)

                data = node.get("data")
                if not isinstance(data, dict):
                    continue

                # ``label`` / ``name`` are renamed only when they are still the exact
                # built-in legacy name; custom names are preserved.
                label = data.get("label")
                if isinstance(label, str):
                    data["label"] = COMPONENT_RENAMES.get(label, label)

                name = data.get("name")
                if isinstance(name, str) and name in COMPONENT_RENAMES:
                    data["name"] = COMPONENT_RENAMES[name]

                if "form" in data:
                    data["form"] = rewrite_value(data["form"])

        edges = graph.get("edges")
        if isinstance(edges, list):
            # Edge ids embed component ids as substrings. Replace longer ids first so
            # an id that is a prefix of another (e.g. "Splitter:1" vs "Splitter:12")
            # cannot clobber it. Ids that did not change are skipped.
            replacements = sorted(
                (
                    (old_id, new_id)
                    for old_id, new_id in component_id_map.items()
                    if old_id != new_id
                ),
                key=lambda item: len(item[0]),
                reverse=True,
            )
            for edge in edges:
                if not isinstance(edge, dict):
                    continue

                for key in ("source", "target"):
                    if isinstance(edge.get(key), str):
                        edge[key] = remap_id(edge[key])

                edge_id = edge.get("id")
                if isinstance(edge_id, str):
                    for old_component_id, new_component_id in replacements:
                        edge_id = edge_id.replace(old_component_id, new_component_id)
                    edge["id"] = edge_id

    # These top-level sections may also carry ids or ``{id@var}`` references.
    for key in ("history", "messages", "reference"):
        if key in normalized:
            normalized[key] = rewrite_value(normalized[key])

    return normalized
|