mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-03-12 02:29:01 +08:00
### What problem does this PR solve?

Improve image and table context.

Current strategy in `attach_media_context`:

- Order by position when possible: if any chunk has page/position info, sort by (page, top, left); otherwise keep the original order.
- Apply only to media chunks: images use `image_context_size`, tables use `table_context_size`.
- Primary matching: on the same page, choose a text chunk whose vertical span overlaps the media, then pick the one with the closest vertical midpoint.
- Fallback matching: if there is no overlap on that page, choose the nearest text chunk on the same page (page-head uses the next text; page-tail uses the previous text).
- Context extraction: inside the chosen text chunk, find a mid-sentence boundary near the text midpoint, then take `context_size` tokens split before/after (total budget).
- No multi-chunk stitching: context comes from a single text chunk to avoid mixing unrelated segments.

### Type of change

- [x] Refactoring

---------

Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
80 lines
1.7 KiB
TypeScript
80 lines
1.7 KiB
TypeScript
import { RunningStatus } from '@/constants/knowledge';
|
|
|
|
/**
 * Describes a single document record.
 *
 * Every field is required except `process_begin_at` and `meta_fields`.
 *
 * NOTE(review): the meaning of individual fields is not established in this
 * file; the remarks below are inferred from names and should be confirmed
 * against the API that produces these records.
 */
export interface IDocumentInfo {
  chunk_num: number;
  // `create_date` / `create_time` carry the same event as a string and as a
  // number — presumably a formatted date and an epoch timestamp; TODO confirm.
  create_date: string;
  create_time: number;
  created_by: string;
  nickname: string;
  id: string;
  // presumably the id of the owning knowledge base — verify against caller.
  kb_id: string;
  location: string;
  name: string;
  /** Parsing options attached to this document; see `IParserConfig`. */
  parser_config: IParserConfig;
  parser_id: string;
  pipeline_id: string;
  pipeline_name: string;
  /** Optional: may be absent from the record. */
  process_begin_at?: string;
  process_duration: number;
  progress: number;
  progress_msg: string;
  /** Processing state, typed by the imported `RunningStatus`. */
  run: RunningStatus;
  size: number;
  source_type: string;
  status: string;
  suffix: string;
  thumbnail: string;
  token_num: number;
  type: string;
  // Same string/number pairing as `create_date` / `create_time`.
  update_date: string;
  update_time: number;
  /** Optional free-form key/value bag; values are intentionally untyped. */
  meta_fields?: Record<string, any>;
}
|
|
|
|
/**
 * Parser configuration attached to a document (see
 * `IDocumentInfo.parser_config`).
 *
 * Every option is optional, so an empty object is a valid configuration.
 *
 * NOTE(review): defaults and valid ranges are not visible in this file —
 * confirm them against the backend before relying on any particular value.
 */
export interface IParserConfig {
  delimiter?: string;
  html4excel?: boolean;
  layout_recognize?: string;
  // Element type is left untyped here; the page-range shape is not shown in this file.
  pages?: any[];
  chunk_token_num?: number;
  auto_keywords?: number;
  auto_questions?: number;
  toc_extraction?: boolean;
  task_page_size?: number;
  /** RAPTOR settings; see `Raptor`. */
  raptor?: Raptor;
  /** GraphRAG settings; see `GraphRag`. */
  graphrag?: GraphRag;
  // NOTE(review): `image_context_window` / `image_table_context_window` look
  // like earlier names for the context-size options below — confirm whether
  // they are still read anywhere.
  image_context_window?: number;
  image_table_context_window?: number;
  /**
   * Context budget applied to image chunks. Per the change description, the
   * budget is a total token count split before/after a boundary in the
   * matched text chunk.
   */
  image_context_size?: number;
  /** Context budget applied to table chunks; same semantics as `image_context_size`. */
  table_context_size?: number;
  mineru_parse_method?: 'auto' | 'txt' | 'ocr';
  mineru_formula_enable?: boolean;
  mineru_table_enable?: boolean;
  mineru_lang?: string;
  entity_types?: string[];
  /** Metadata field descriptors; each entry's properties are all optional. */
  metadata?: Array<{
    key?: string;
    description?: string;
    enum?: string[];
  }>;
  enable_metadata?: boolean;
}
|
|
|
|
/**
 * RAPTOR settings nested under `IParserConfig.raptor`.
 *
 * File-private (not exported); reachable by other modules only through
 * `IParserConfig`.
 */
interface Raptor {
  /** Required flag; presumably toggles RAPTOR processing — TODO confirm. */
  use_raptor: boolean;
}
|
|
|
|
/**
 * GraphRAG settings nested under `IParserConfig.graphrag`.
 *
 * File-private (not exported). All properties are optional, so `{}` is a
 * valid value.
 *
 * NOTE(review): the accepted values of `method` are not visible in this
 * file — confirm against the backend.
 */
interface GraphRag {
  community?: boolean;
  entity_types?: string[];
  method?: string;
  resolution?: boolean;
  use_graphrag?: boolean;
}
|
|
|
|
/**
 * Numeric tallies grouped three ways: by run status, by suffix, and by
 * metadata key and then value.
 *
 * NOTE(review): the numbers are presumably document counts per bucket for
 * building list filters — confirm against the endpoint that returns this.
 */
export type IDocumentInfoFilter = {
  /** Number keyed by a numeric run-status code. */
  run_status: Record<number, number>;
  /** Number keyed by suffix string. */
  suffix: Record<string, number>;
  /** Two-level map: outer key → inner key → number. */
  metadata: Record<string, Record<string, number>>;
};
|