mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-05-20 16:26:42 +08:00
## Summary

RAPTOR's recursive clustering builds a `layers` list tracking `(start_idx, end_idx)` boundaries per level, but currently discards this information — only the flat `chunks` list is returned. This makes it impossible to distinguish leaf-level summaries from top-level ones.

This PR:

- Returns a `(chunks, layers)` tuple from `raptor.py`'s `__call__`
- Annotates each RAPTOR summary chunk with `raptor_layer_int` (1 = first summary level, 2 = summary-of-summaries, etc.)
- Adds `raptor_layer_int` to `infinity_mapping.json` (Elasticsearch handles it via the existing `*_int` dynamic template)

### Why this matters

Downstream features need to know which RAPTOR layer a summary belongs to:

- **Retrieving the top-level document summary** for entity extraction, search snippets, or document comparison
- **Filtering by abstraction level** — users may want only high-level summaries or only leaf-level cluster summaries
- **RAPTOR recall quality** — #10951 reports summaries not being recalled for definition queries; layer metadata enables targeted retrieval

### Changes

| File | Change | LOC |
|------|--------|-----|
| `rag/raptor.py` | Return `(chunks, layers)` tuple | ~3 |
| `rag/svr/task_executor.py` | Build `chunk_layer` mapping, set `raptor_layer_int` | ~12 |
| `conf/infinity_mapping.json` | Add `raptor_layer_int` integer field | ~1 |

### Backward compatibility

- **Additive only** — no existing fields or behavior changed
- Existing RAPTOR chunks continue to work (they'll have `raptor_layer_int = 0` by default)
- New RAPTOR chunks get layer metadata automatically

## Test plan

- [ ] Parse a document with RAPTOR enabled, verify `raptor_layer_int` is set on indexed chunks
- [ ] Verify `raptor_layer_int` values increase with abstraction level (layer 1 < layer 2 < ...)
- [ ] Verify existing RAPTOR deletion (`delete by raptor_kwd`) still works
- [ ] Verify Infinity backend accepts the new field

Fixes #7488
Related: #4104, #11191, #10951

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-authored-by: yuch85 <yuch85.1@gmail.com>
Co-authored-by: Wang Qi <wangq8@outlook.com>
44 lines
3.0 KiB
JSON
44 lines
3.0 KiB
JSON
{
|
|
"id": {"type": "varchar", "default": ""},
|
|
"doc_id": {"type": "varchar", "default": ""},
|
|
"kb_id": {"type": "varchar", "default": "", "index_type": {"type": "secondary", "cardinality": "low"}},
|
|
"mom_id": {"type": "varchar", "default": ""},
|
|
"mom": {"type": "varchar", "default": ""},
|
|
"create_time": {"type": "varchar", "default": ""},
|
|
"create_timestamp_flt": {"type": "float", "default": 0.0},
|
|
"img_id": {"type": "varchar", "default": ""},
|
|
"docnm": {"type": "varchar", "default": "", "analyzer": ["rag-coarse", "rag-fine"], "comment": "docnm_kwd, title_tks, title_sm_tks"},
|
|
"name_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
|
|
"tag_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
|
|
"important_kwd_empty_count": {"type": "integer", "default": 0},
|
|
"important_keywords": {"type": "varchar", "default": "", "analyzer": ["rag-coarse", "rag-fine"], "comment": "important_kwd, important_tks"},
|
|
"questions": {"type": "varchar", "default": "", "analyzer": ["rag-coarse", "rag-fine"], "comment": "question_kwd, question_tks"},
|
|
"content": {"type": "varchar", "default": "", "analyzer": ["rag-coarse", "rag-fine"], "comment": "content_with_weight, content_ltks, content_sm_ltks"},
|
|
"authors": {"type": "varchar", "default": "", "analyzer": ["rag-coarse", "rag-fine"], "comment": "authors_tks, authors_sm_tks"},
|
|
"page_num_int": {"type": "varchar", "default": ""},
|
|
"top_int": {"type": "varchar", "default": ""},
|
|
"position_int": {"type": "varchar", "default": ""},
|
|
"weight_int": {"type": "integer", "default": 0},
|
|
"weight_flt": {"type": "float", "default": 0.0},
|
|
"chunk_order_int": {"type": "integer", "default": 0},
|
|
"rank_int": {"type": "integer", "default": 0},
|
|
"rank_flt": {"type": "float", "default": 0},
|
|
"available_int": {"type": "integer", "default": 1, "index_type": {"type": "secondary", "cardinality": "low"}},
|
|
"knowledge_graph_kwd": {"type": "varchar", "default": ""},
|
|
"entities_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
|
|
"pagerank_fea": {"type": "integer", "default": 0},
|
|
"tag_feas": {"type": "varchar", "default": "", "analyzer": "rankfeatures"},
|
|
"from_entity_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
|
|
"to_entity_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
|
|
"entity_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
|
|
"entity_type_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
|
|
"source_id": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
|
|
"n_hop_with_weight": {"type": "varchar", "default": ""},
|
|
"mom_with_weight": {"type": "varchar", "default": ""},
|
|
"removed_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
|
|
"doc_type_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
|
|
"toc_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
|
|
"raptor_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
|
|
"raptor_layer_int": {"type": "integer", "default": 0}
|
|
}
|