# ragflow/rag/app/resume.py

#
#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#

"""
Resume parsing module (aligned with SmartResume Pipeline architecture optimization)

Key optimizations (ref: arXiv:2510.09722):
    1. PDF text fusion: metadata + OCR dual-path extraction and fusion
    2. Layout-aware reconstruction: YOLOv10 layout segmentation + hierarchical sorting + line indexing
    3. Parallel task decomposition: basic info / work experience / education - 3-way parallel LLM extraction
    4. Index pointer mechanism: LLM returns line number ranges instead of generating full text, reducing hallucination
    5. Four-stage post-processing: source text re-extraction, domain normalization, context deduplication, source text validation

Compatibility:
    - chunk(filename, binary, callback, **kwargs) signature remains unchanged
    - Compatible with FACTORY[ParserType.RESUME.value] in task_executor.py
"""

import json
import re
import random
import datetime
import unicodedata
import concurrent.futures
from io import BytesIO
from typing import Optional
import numpy as np

# tiktoken for long random string filtering (ref: SmartResume should_remove strategy)
# The encoder is optional: when it cannot be obtained, _tiktoken_encoding stays None and
# _should_remove_random_str falls back to its character-alternation heuristic.
try:
    import tiktoken
    _tiktoken_encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
except Exception:
    # ImportError when tiktoken is not installed. encoding_for_model() can also fail for
    # other reasons (for example when the encoding data cannot be loaded); that must not
    # prevent this module from being imported, so every failure selects the fallback.
    _tiktoken_encoding = None

# Long random string pattern: a run of 40 or more characters drawn from ASCII letters,
# digits, '-', '~' and '_' (hash, token, tracking ID, etc.).
# A match is only a candidate: _clean_line_content removes it only when
# _should_remove_random_str returns True for that match.
_LONG_RANDOM_PATTERN = re.compile(r'[a-zA-Z0-9\-~_]{40,}')

import logging as logger
from rag.nlp import rag_tokenizer
from deepdoc.parser.utils import get_text

# json_repair for fixing malformed JSON from LLM responses (ref: SmartResume fault-tolerance strategy)
try:
    import json_repair
except ImportError:
    json_repair = None

# Cached YOLOv10 layout detector (created lazily so the model is not loaded when unused).
# Tri-state: None = loading not attempted yet, False = a previous attempt failed,
# anything else = the loaded detector instance.
_layout_recognizer = None


def _get_layout_recognizer():
    """
    Return the shared YOLOv10 layout detector, creating it on first use

    The detector is deepdoc's LayoutRecognizer constructed with the "layout" model name.
    A failed load is remembered, so later calls return None without retrying.

    Returns:
        LayoutRecognizer instance, or None if loading fails
    """
    global _layout_recognizer

    # A previous attempt already failed: do not retry
    if _layout_recognizer is False:
        return None
    # Already loaded: reuse the cached instance
    if _layout_recognizer is not None:
        return _layout_recognizer

    try:
        from deepdoc.vision import LayoutRecognizer
        _layout_recognizer = LayoutRecognizer("layout")
        logger.info("YOLOv10 layout detector loaded successfully")
    except Exception as e:
        logger.warning(f"YOLOv10 layout detector loading failed, falling back to heuristic sorting: {e}")
        _layout_recognizer = False  # Remember the failure to avoid repeated attempts
        return None
    return _layout_recognizer

# ==================== Constants ====================

# Fields forbidden from being used as select fields in resume
# Every name listed here is also a key of FIELD_MAP_ZH / FIELD_MAP_EN.
# NOTE(review): the list is not read anywhere in the visible part of this file;
# presumably the code that builds the field map for chunks consults it — confirm.
FORBIDDEN_SELECT_FIELDS = [
    "name_pinyin_kwd", "edu_first_fea_kwd", "degree_kwd",
    "sch_rank_kwd", "edu_fea_kwd"
]

# Field name to description mapping (bilingual versions for chunk construction)
# Chinese version: get_field_map() returns it for every language value that is not
# recognized as English. FIELD_MAP_EN defines the same keys in the same order.
# NOTE(review): the key suffixes (_kwd, _tks, _int, _flt, _dt) look like storage-type
# hints for the index — confirm against the code that writes these fields.
FIELD_MAP_ZH = {
    "name_kwd": "姓名/名字",
    "name_pinyin_kwd": "姓名拼音/名字拼音",
    "gender_kwd": "性别（男，女）",
    "age_int": "年龄/岁/年纪",
    "phone_kwd": "电话/手机/微信",
    "email_tks": "email/e-mail/邮箱",
    "position_name_tks": "职位/职能/岗位/职责",
    "expect_city_names_tks": "期望城市",
    "work_exp_flt": "工作年限/工作年份/N年经验/毕业了多少年",
    "corporation_name_tks": "最近就职(上班)的公司/上一家公司",
    "first_school_name_tks": "第一学历毕业学校",
    "first_degree_kwd": "第一学历",
    "highest_degree_kwd": "最高学历",
    "first_major_tks": "第一学历专业",
    "edu_first_fea_kwd": "第一学历标签",
    "degree_kwd": "过往学历",
    "major_tks": "学过的专业/过往专业",
    "school_name_tks": "学校/毕业院校",
    "sch_rank_kwd": "学校标签",
    "edu_fea_kwd": "教育标签",
    "corp_nm_tks": "就职过的公司/之前的公司/上过班的公司",
    "edu_end_int": "毕业年份",
    "industry_name_tks": "所在行业",
    "birth_dt": "生日/出生年份",
    "expect_position_name_tks": "期望职位/期望职能/期望岗位",
    "skill_tks": "技能/技术栈/编程语言/框架/工具",
    "language_tks": "语言能力/外语水平",
    "certificate_tks": "证书/资质/认证",
    "project_tks": "项目经验/项目名称",
    "work_desc_tks": "工作职责/工作描述",
    "project_desc_tks": "项目描述/项目职责",
    "self_evaluation_tks": "自我评价/个人优势/个人总结",
}

# English version of the field name to description mapping.
# get_field_map() returns it when the language parameter is "english" or "en"
# (case-insensitive). The keys and their order mirror FIELD_MAP_ZH.
FIELD_MAP_EN = {
    "name_kwd": "Name",
    "name_pinyin_kwd": "Name Pinyin",
    "gender_kwd": "Gender (Male, Female)",
    "age_int": "Age",
    "phone_kwd": "Phone/Mobile/WeChat",
    "email_tks": "Email",
    "position_name_tks": "Position/Title/Role",
    "expect_city_names_tks": "Preferred City",
    "work_exp_flt": "Years of Experience",
    "corporation_name_tks": "Most Recent Company",
    "first_school_name_tks": "First Degree School",
    "first_degree_kwd": "First Degree",
    "highest_degree_kwd": "Highest Degree",
    "first_major_tks": "First Degree Major",
    "edu_first_fea_kwd": "First Degree Tag",
    "degree_kwd": "Past Degrees",
    "major_tks": "Past Majors",
    "school_name_tks": "School/University",
    "sch_rank_kwd": "School Tag",
    "edu_fea_kwd": "Education Tag",
    "corp_nm_tks": "Past Companies",
    "edu_end_int": "Graduation Year",
    "industry_name_tks": "Industry",
    "birth_dt": "Date of Birth",
    "expect_position_name_tks": "Preferred Position/Role",
    "skill_tks": "Skills/Tech Stack/Languages/Frameworks/Tools",
    "language_tks": "Language Proficiency",
    "certificate_tks": "Certificates/Qualifications",
    "project_tks": "Project Experience/Project Name",
    "work_desc_tks": "Job Responsibilities/Description",
    "project_desc_tks": "Project Description/Responsibilities",
    "self_evaluation_tks": "Self-Evaluation/Personal Strengths/Summary",
}


def _is_english(lang: str) -> bool:
    """Determine if the language parameter indicates English

    Args:
        lang: Language parameter, e.g. "Chinese", "English" or "en"; compared
            case-insensitively. A non-string value (such as None when no
            language was supplied) is treated as not English instead of raising.
    Returns:
        True if lang is "english" or "en" in any letter case, otherwise False
    """
    if not isinstance(lang, str):
        return False
    return lang.lower() in ("english", "en")


def get_field_map(lang: str) -> dict:
    """Select the field description mapping that matches the language parameter

    Args:
        lang: Language parameter, e.g. "Chinese" or "English"
    Returns:
        FIELD_MAP_EN when lang indicates English, FIELD_MAP_ZH otherwise
    """
    if _is_english(lang):
        return FIELD_MAP_EN
    return FIELD_MAP_ZH


# Backward compatible: default to Chinese version
# FIELD_MAP is an alias of (the same dict object as) FIELD_MAP_ZH, not a copy.
FIELD_MAP = FIELD_MAP_ZH


# ==================== Parallel LLM Extraction Prompt Templates ====================
# Ref: SmartResume task decomposition strategy, splitting extraction into independent subtasks
# Each prompt ends with /no_think marker to suppress reasoning model's thinking output
# Prompts loaded from md files under rag/prompts/, supporting bilingual versions

from rag.prompts.template import load_prompt


def _load_resume_prompt(name: str, lang: str) -> str:
    """Load the resume prompt template variant that matches the language parameter

    The English variant is stored under the prompt name with an "_en" suffix;
    every other language uses the unsuffixed name.

    Args:
        name: Prompt name (without language suffix), e.g. "resume_system"
        lang: Language parameter, e.g. "Chinese" or "English"
    Returns:
        Prompt template string
    """
    if _is_english(lang):
        template_name = f"{name}_en"
    else:
        template_name = f"{name}"
    return load_prompt(template_name)


def get_system_prompt(lang: str) -> str:
    """Get system prompt

    Loads the "resume_system" template via _load_resume_prompt, which selects
    the "_en" variant when lang indicates English.

    Args:
        lang: Language parameter, e.g. "Chinese" or "English"
    Returns:
        Prompt template string
    """
    return _load_resume_prompt("resume_system", lang)


def get_basic_info_prompt(lang: str) -> str:
    """Get basic info extraction prompt

    Loads the "resume_basic_info" template via _load_resume_prompt, which selects
    the "_en" variant when lang indicates English.

    Args:
        lang: Language parameter, e.g. "Chinese" or "English"
    Returns:
        Prompt template string
    """
    return _load_resume_prompt("resume_basic_info", lang)


def get_work_exp_prompt(lang: str) -> str:
    """Get work experience extraction prompt

    Loads the "resume_work_exp" template via _load_resume_prompt, which selects
    the "_en" variant when lang indicates English.

    Args:
        lang: Language parameter, e.g. "Chinese" or "English"
    Returns:
        Prompt template string
    """
    return _load_resume_prompt("resume_work_exp", lang)


def get_education_prompt(lang: str) -> str:
    """Get education background extraction prompt

    Loads the "resume_education" template via _load_resume_prompt, which selects
    the "_en" variant when lang indicates English.

    Args:
        lang: Language parameter, e.g. "Chinese" or "English"
    Returns:
        Prompt template string
    """
    return _load_resume_prompt("resume_education", lang)


def get_project_exp_prompt(lang: str) -> str:
    """Get project experience extraction prompt

    Loads the "resume_project_exp" template via _load_resume_prompt, which selects
    the "_en" variant when lang indicates English.

    Args:
        lang: Language parameter, e.g. "Chinese" or "English"
    Returns:
        Prompt template string
    """
    return _load_resume_prompt("resume_project_exp", lang)


# Backward compatible: default Chinese version constants (for possible external direct references)
# These call load_prompt() at module import time with the unsuffixed (non-English)
# template names; the get_*_prompt(lang) functions resolve the language per call.
SYSTEM_PROMPT = load_prompt("resume_system")
BASIC_INFO_PROMPT = load_prompt("resume_basic_info")
WORK_EXP_PROMPT = load_prompt("resume_work_exp")
EDUCATION_PROMPT = load_prompt("resume_education")
PROJECT_EXP_PROMPT = load_prompt("resume_project_exp")

# LLM call max retry count (ref: SmartResume retry strategy)
# NOTE(review): not referenced in the visible part of this file; whether the value
# counts retries or total attempts depends on the calling code — confirm there.
_LLM_MAX_RETRIES = 2


def _normalize_whitespace(text: str) -> str:
    """
    Normalize Unicode whitespace in a piece of text (ref: SmartResume _clean_text_content)

    Steps, in order:
    1. NFKC normalization (fullwidth to halfwidth, compatibility forms, etc.)
    2. Replace the listed space-like code points (no-break space, Ogham space mark,
       U+2000-U+200A, line/paragraph separators, narrow no-break space,
       medium mathematical space, ideographic space) with a regular space
    3. Collapse runs of two or more regular spaces and strip both ends

    Args:
        text: Original text
    Returns:
        Normalized text; an empty string when text is empty or None
    """
    if not text:
        return ""

    # NOTE(review): the class also contains U+00A7 (section sign), which is not a
    # whitespace character; it is turned into a space as well — confirm this is intended.
    space_like = r'[\u0020\u00A0\u1680\u2000-\u200A\u2028\u2029\u202F\u205F\u3000\u00A7]'

    normalized = unicodedata.normalize('NFKC', text)
    single_kind = re.sub(space_like, ' ', normalized)
    collapsed = re.sub(r' {2,}', ' ', single_kind)
    return collapsed.strip()


def _should_remove_random_str(match: re.Match) -> bool:
    """
    Decide whether a matched long string is meaningless random text (ref: SmartResume should_remove)

    With tiktoken available, the string is encoded and judged random when the token
    count exceeds 50% of its character count (hashes, tokens, tracking IDs encode
    poorly, ordinary words encode compactly).
    Without tiktoken, a simple heuristic is used instead: count adjacent character
    pairs that switch between digit and non-digit, or between upper and lower case
    letters, and judge the string random when more than 30% of its length are switches.

    Args:
        match: Regex match object
    Returns:
        True means it should be removed
    """
    candidate = match.group(0)

    if _tiktoken_encoding is not None:
        token_count = len(_tiktoken_encoding.encode(candidate))
        return token_count > len(candidate) * 0.5

    # Fallback heuristic when tiktoken is unavailable
    switches = 0
    for prev, cur in zip(candidate, candidate[1:]):
        digit_switch = cur.isdigit() != prev.isdigit()
        case_switch = cur.isalpha() and prev.isalpha() and cur.isupper() != prev.isupper()
        if digit_switch or case_switch:
            switches += 1
    return switches / len(candidate) > 0.3


def _clean_line_content(text: str) -> str:
    """
    Clean the text of a single line

    Applies Unicode whitespace normalization, drops long random strings
    (hash, token and other meaningless content) and tidies the spaces left behind.

    Args:
        text: Original line text
    Returns:
        Cleaned text; an empty string when text is empty or None
    """
    if not text:
        return ""

    def _keep_or_drop(m):
        # Replace the candidate with nothing only when it is judged random
        return '' if _should_remove_random_str(m) else m.group(0)

    normalized = _normalize_whitespace(text)
    without_random = _LONG_RANDOM_PATTERN.sub(_keep_or_drop, normalized)
    # Removing a string can leave two spaces next to each other
    return re.sub(r' {2,}', ' ', without_random).strip()


# ==================== Phase 1: PDF Text Fusion and Layout Reconstruction ====================


def _is_noise_char(obj: dict) -> bool:
    """
    Determine if a PDF character object is a decorative layer noise character

    Uses a "body text whitelist" strategy instead of enumerating noise features,
    to handle noise patterns from different resume templates.

    A character is treated as body text when either condition holds:
    1. Embedded font: the font name contains '+' (format XXXXXX+FontName)
    2. Structure tag: its "tag" is one of the whitelisted PDF structure tags
       (Span, P, headings, table and list tags, Figure, Caption, NonStruct)

    Everything else (no '+' in the font name and no whitelisted tag, e.g. tag is
    None or 'OC') is treated as noise such as template decorations or watermarks.

    Args:
        obj: pdfplumber character/text object dictionary
    Returns:
        True means it's a noise character that should be filtered
    """
    # Whitelist condition 1: Embedded font (font name contains '+' prefix)
    # The key may be present with a None value, which would make the substring test
    # raise TypeError; treat that like a missing font name.
    fontname = obj.get("fontname") or ""
    if isinstance(fontname, bytes):
        # Defensive: a byte-string font name is decoded so the '+' test still works
        fontname = fontname.decode("latin-1")
    if "+" in fontname:
        return False  # Embedded font = body content

    # Whitelist condition 2: Has PDF structure tag
    tag = obj.get("tag")
    if tag in ("Span", "NonStruct", "P", "H1", "H2", "H3", "H4", "H5", "H6",
               "TD", "TH", "LI", "L", "Table", "TR", "Figure", "Caption"):
        return False  # Has semantic structure tag = body content

    # Doesn't meet any whitelist condition, treat as noise
    return True


def _meta_line_block(line_words: list[dict], page_idx: int) -> Optional[dict]:
    """Merge the words of one visual line into a single text block (None if blank)."""
    # Sort by x0 to ensure left-to-right order
    line_words.sort(key=lambda w: float(w.get("x0", 0)))
    merged_text = " ".join(w.get("text", "") for w in line_words).strip()
    if not merged_text:
        return None
    return {
        "text": merged_text,
        "x0": float(min(w.get("x0", 0) for w in line_words)),
        "top": float(min(w.get("top", 0) for w in line_words)),
        "x1": float(max(w.get("x1", 0) for w in line_words)),
        "bottom": float(max(w.get("bottom", 0) for w in line_words)),
        "page": page_idx,
    }


def _meta_words_to_blocks(words: list[dict], page_idx: int, line_threshold: float = 5) -> list[dict]:
    """Group consecutive words whose top differs by at most line_threshold into line blocks."""
    line_blocks = []
    current_line_words = [words[0]]
    for w in words[1:]:
        w_top = float(w.get("top", 0))
        # Compare against the first word collected for the current line
        cur_top = float(current_line_words[0].get("top", 0))
        if abs(w_top - cur_top) <= line_threshold:
            current_line_words.append(w)
            continue
        block = _meta_line_block(current_line_words, page_idx)
        if block:
            line_blocks.append(block)
        current_line_words = [w]

    # Process the last line
    block = _meta_line_block(current_line_words, page_idx)
    if block:
        line_blocks.append(block)
    return line_blocks


def _meta_fallback_text_blocks(page, page_idx: int, page_width) -> list[dict]:
    """Build full-width line blocks with synthetic coordinates from page.extract_text()."""
    page_text = None
    try:
        page_text = page.extract_text()
    except Exception as e:
        logger.debug(f"PDF extract_text fallback failed (page {page_idx}): {e}")
    if not (page_text and page_text.strip()):
        return []

    # No real coordinates are available here: stack the lines at a fixed height
    line_height = 16
    line_blocks = []
    for i, line in enumerate(page_text.split("\n")):
        cleaned = line.strip()
        if not cleaned:
            continue
        line_blocks.append({
            "text": cleaned,
            "x0": 0,
            "top": i * line_height,
            "x1": page_width,
            "bottom": i * line_height + line_height - 2,
            "page": page_idx,
        })
    return line_blocks


def _meta_append_table_rows(page, page_idx: int, page_width, blocks: list[dict]) -> None:
    """Append table rows not already covered by this page's text blocks to blocks (in place)."""
    tables = page.extract_tables()
    if not tables:
        return

    # Snapshot of the text blocks of this page, taken before any table row is added
    page_blocks = [b for b in blocks if b["page"] == page_idx]
    # Table rows get synthetic coordinates below the lowest existing block
    next_top = max((b["top"] for b in page_blocks), default=0) + 20
    row_height = 16

    for table in tables:
        for row in table:
            if not row:
                continue
            cells = [str(c).strip() for c in row if c and str(c).strip()]
            if not cells:
                continue
            # Dedup: the row counts as already extracted when its first two cells
            # both occur inside one existing text block of the page
            if any(all(c in pb["text"] for c in cells[:2]) for pb in page_blocks):
                continue
            blocks.append({
                "text": " | ".join(cells),
                "x0": 0,
                "top": next_top,
                "x1": page_width,
                "bottom": next_top + row_height - 2,
                "page": page_idx,
            })
            next_top += row_height


def _extract_metadata_text(binary: bytes) -> list[dict]:
    """
    Extract text blocks from PDF metadata (with coordinate info)

    Strategy:
    1. Use whitelist strategy to filter decorative layer noise chars (embedded font or structure tag = body text)
    2. Safe fallback: if filtered chars are less than 30% of original, skip filtering to avoid false positives
    3. Use extract_words for word-level extraction (with real coordinates)
    4. Aggregate adjacent words into line-level text blocks by Y coordinate
    5. If no words are found, fall back to extract_text with synthetic line coordinates
    6. Additionally extract table content (many resumes use table layouts)

    Per-page failures of the individual steps are logged at debug level and the page
    continues with the corresponding fallback; any other failure is logged as a
    warning and yields an empty result.

    Args:
        binary: PDF file binary content
    Returns:
        List of text blocks, each containing text, x0, top, x1, bottom, page fields;
        an empty list if extraction fails
    """
    try:
        import pdfplumber
        blocks = []
        with pdfplumber.open(BytesIO(binary)) as pdf:
            for page_idx, page in enumerate(pdf.pages):
                page_width = page.width or 600

                # Filter decorative layer noise chars (whitelist strategy based on embedded font + structure tag)
                # Safe fallback: if filtered chars are less than 30% of original, the PDF's body text
                # may use non-embedded fonts without structure tags, skip filtering to avoid false positives
                try:
                    original_char_count = len(page.chars)
                    filtered_page = page.filter(
                        lambda obj: not _is_noise_char(obj)
                    )
                    filtered_char_count = len(filtered_page.chars)
                    if original_char_count > 0 and filtered_char_count < original_char_count * 0.3:
                        # Filtered out over 70% of chars, likely false positives, fall back to original page
                        filtered_page = page
                except Exception as e:
                    logger.debug(f"PDF noise filtering skipped (page {page_idx}): {e}")
                    filtered_page = page

                # Use extract_words for extraction (with real coordinates)
                words = []
                try:
                    words = filtered_page.extract_words(
                        keep_blank_chars=False, use_text_flow=True
                    )
                except Exception as e:
                    logger.debug(f"PDF extract_words failed (page {page_idx}): {e}")

                if words:
                    # Aggregate adjacent words into line-level text blocks by Y coordinate
                    blocks.extend(_meta_words_to_blocks(words, page_idx))
                else:
                    # Fall back to extract_text when extract_words fails or finds nothing
                    blocks.extend(_meta_fallback_text_blocks(page, page_idx, page_width))

                # Extract table content from the page
                # Many resumes use table layouts (e.g., personal info section), extract_words may miss table structure
                try:
                    _meta_append_table_rows(page, page_idx, page_width, blocks)
                except Exception as e:
                    logger.debug(f"PDF table extraction skipped (page {page_idx}): {e}")
        return blocks
    except Exception as e:
        logger.warning(f"PDF metadata extraction failed: {e}")
        return []

def _extract_ocr_text(binary: bytes, meta_blocks: list[dict] | None = None) -> list[dict]:
    """
    Extract OCR text blocks using blackout strategy (with coordinate info).

    Strategy (ref: SmartResume):
    1. Render PDF pages to images
    2. Black out regions already extracted by metadata
    3. Run OCR on the blacked-out image, only recognizing content metadata missed
    4. Eliminates duplication at source, no IoU dedup needed downstream

    Args:
        binary: PDF file binary content
        meta_blocks: Text blocks from metadata extraction, used to black out existing text regions
    Returns:
        List of text blocks, each containing text, x0, top, x1, bottom, page fields
    """
    if meta_blocks is None:
        meta_blocks = []
    try:
        import pdfplumber
        from deepdoc.vision.ocr import OCR
        import numpy as np

        ocr = OCR()
        blocks = []

        with pdfplumber.open(BytesIO(binary)) as pdf:
            for page_idx, page in enumerate(pdf.pages):
                # Render page to image (resolution=216 = 3x scale, since PDF default is 72 DPI)
                img = page.to_image(resolution=216)
                page_img = np.array(img.annotated)

                # Scale factor from PDF coordinates to image coordinates
                pdf_to_img_scale = 216.0 / 72.0  # = 3.0

                # Black out metadata-extracted text regions before OCR
                page_meta_blocks = [b for b in meta_blocks if b.get("page") == page_idx]
                if page_meta_blocks:
                    page_img = _blackout_text_regions(page_img, meta_blocks, page_idx, pdf_to_img_scale)

                ocr_result = ocr(page_img)
                if not ocr_result:
                    continue
                for box_info in ocr_result:
                    if isinstance(box_info, (list, tuple)) and len(box_info) >= 2:
                        coords = box_info[0]  # Coordinate points
                        text_info = box_info[1]
                        text = text_info[0] if isinstance(text_info, (list, tuple)) else str(text_info)
                        if text.strip() and isinstance(coords, (list, tuple)) and len(coords) >= 4:
                            # Extract bounding box from four corner points
                            xs = [p[0] for p in coords if isinstance(p, (list, tuple))]
                            ys = [p[1] for p in coords if isinstance(p, (list, tuple))]
                            if xs and ys:
                                blocks.append({
                                    "text": text.strip(),
                                    "x0": min(xs), "top": min(ys),
                                    "x1": max(xs), "bottom": max(ys),
                                    "page": page_idx,
                                })
        return blocks
    except Exception as e:
        logger.warning(f"OCR extraction failed: {e}")
        return []


def _fuse_text_blocks(meta_blocks: list[dict], ocr_blocks: list[dict]) -> list[dict]:
    """
    Combine PDF metadata text blocks with OCR text blocks (blackout strategy version).

    The OCR phase blacks out metadata-extracted regions first, so OCR blocks are
    expected to hold only what metadata missed and no IoU dedup is performed here.

    Behaviour:
    - No OCR blocks: the metadata list is returned unchanged (no garbled filtering)
    - No metadata blocks: the OCR list is returned unchanged
    - Otherwise: metadata blocks whose text fails _is_valid_line are dropped and
      the OCR blocks are appended after the remaining metadata blocks

    Args:
        meta_blocks: Text blocks from metadata extraction
        ocr_blocks: Text blocks from OCR extraction (already deduplicated via blackout strategy)
    Returns:
        Fused text block list
    """
    # One side is empty: hand back the other side as-is
    if not (ocr_blocks and meta_blocks):
        return meta_blocks if not ocr_blocks else ocr_blocks

    # Keep only metadata blocks whose text is not garbled
    valid_meta = [b for b in meta_blocks if _is_valid_line(b.get("text", ""))]
    garbled_count = len(meta_blocks) - len(valid_meta)
    if garbled_count:
        logger.info(f"Detected {garbled_count} garbled blocks in metadata, filtered out")

    # Under blackout strategy, OCR won't re-recognize existing text, just concatenate
    return valid_meta + ocr_blocks


def _layout_aware_reorder(blocks: list[dict]) -> list[dict]:
    """
    Put text blocks into reading order (ref: SmartResume Hierarchical Re-ordering)

    Blocks are grouped by page and pages are emitted in ascending order. Within a page:
    - Default (single column): sort by top, then by x0
    - Two-column layout: all left-column blocks first, then all right-column blocks,
      each column sorted by top, then by x0

    Two-column detection only runs for pages with more than 5 blocks. It takes the
    midpoint between the smallest and largest block x-center and requires more than
    3 blocks clearly left of it and more than 3 clearly right of it, where "clearly"
    means beyond a margin of 10% of the x-center spread.

    Args:
        blocks: Text block list (with coordinate info)
    Returns:
        Sorted text block list
    """
    if not blocks:
        return blocks

    def _reading_key(blk):
        return (blk["top"], blk["x0"])

    def _x_center(blk):
        return (blk["x0"] + blk["x1"]) / 2

    # Group by page
    by_page = {}
    for blk in blocks:
        by_page.setdefault(blk.get("page", 0), []).append(blk)

    ordered = []
    for page_no in sorted(by_page):
        members = by_page[page_no]

        if len(members) > 5:
            centers = [_x_center(blk) for blk in members]
            lo, hi = min(centers), max(centers)
            spread = hi - lo if hi > lo else 1
            mid_x = (lo + hi) / 2
            margin = spread * 0.1

            clearly_left = sum(1 for c in centers if c < mid_x - margin)
            clearly_right = sum(1 for c in centers if c > mid_x + margin)

            if clearly_left > 3 and clearly_right > 3:
                # Two columns: split at the midpoint, left column first
                left_col = [blk for blk, c in zip(members, centers) if c < mid_x]
                right_col = [blk for blk, c in zip(members, centers) if c >= mid_x]
                ordered += sorted(left_col, key=_reading_key)
                ordered += sorted(right_col, key=_reading_key)
                continue

        # Single column: top to bottom, same row left to right
        ordered += sorted(members, key=_reading_key)

    return ordered


def _build_indexed_text(blocks: list[dict]) -> tuple[str, list[str], list[dict]]:
    """
    Build indexed text with line numbers (ref: SmartResume Indexed Linearization)

    Merges sorted text blocks into lines and adds a unique index number to each line.
    Includes garbled line filtering logic and field label split repair.
    Also preserves coordinate info for each line, used for writing position_int etc. to chunks.

    A new line is started when the next block's top differs from the current line's
    top by more than 10, when it lies on a different page, or when both the current
    and the next block carry a (non-empty) layoutno and the two differ.

    Args:
        blocks: Sorted text block list
    Returns:
        (indexed_text, lines, line_positions) tuple:
        - indexed_text: Text string with line numbers, one "[i]: text" entry per line
        - lines: Original line text list (without line numbers)
        - line_positions: Coordinate info for each line, format:
          {"page": ..., "x0": ..., "x1": ..., "top": ..., "bottom": ...}
          (bounding rectangle of the blocks merged into that line)
    """
    if not blocks:
        return "", [], []

    raw_lines = []
    raw_positions = []
    current_line_parts = []
    current_line_blocks = []
    current_top = blocks[0].get("top", 0)
    current_page = blocks[0].get("page", 0)
    current_layoutno = blocks[0].get("layoutno", "")
    threshold = 10

    def _merge_line_position(line_blocks: list[dict]) -> dict:
        """Merge coordinates of all blocks in a line into outer bounding rectangle"""
        return {
            "page": line_blocks[0].get("page", 0),
            "x0": min(b.get("x0", 0) for b in line_blocks),
            "x1": max(b.get("x1", 0) for b in line_blocks),
            "top": min(b.get("top", 0) for b in line_blocks),
            "bottom": max(b.get("bottom", 0) for b in line_blocks),
        }

    for b in blocks:
        b_top = b.get("top", 0)
        b_page = b.get("page", 0)
        b_layoutno = b.get("layoutno", "")
        y_changed = abs(b_top - current_top) > threshold
        # A line never spans pages: without this check the last block of one page and
        # the first block of the next were merged whenever their top values were close,
        # which also produced a bounding box mixing coordinates of two pages
        page_changed = b_page != current_page
        layout_changed = bool(current_layoutno and b_layoutno and b_layoutno != current_layoutno)
        if (y_changed or page_changed or layout_changed) and current_line_parts:
            raw_lines.append(" ".join(current_line_parts))
            raw_positions.append(_merge_line_position(current_line_blocks))
            current_line_parts = []
            current_line_blocks = []
            current_top = b_top
            current_page = b_page
            current_layoutno = b_layoutno
        current_line_parts.append(b["text"])
        current_line_blocks.append(b)

    if current_line_parts:
        raw_lines.append(" ".join(current_line_parts))
        raw_positions.append(_merge_line_position(current_line_blocks))

    # Filter empty and garbled lines (sync filter coordinates)
    lines = []
    line_positions = []
    for line, pos in zip(raw_lines, raw_positions):
        # Unicode normalization + long random string filtering (ref: SmartResume _clean_text_content)
        line = _clean_line_content(line)
        if not line:
            continue
        # Garbled detection: skip if valid chars (Chinese/ASCII letters/digits/common punctuation) ratio is too low
        if not _is_valid_line(line):
            continue
        lines.append(line)
        line_positions.append(pos)

    # Fix field label split issues
    # _fix_split_labels returns one entry per input line, so line_positions stays aligned
    lines = _fix_split_labels(lines)

    # Build indexed text with line numbers
    indexed_text = "\n".join(f"[{i}]: {line}" for i, line in enumerate(lines))

    return indexed_text, lines, line_positions

def _is_valid_line(line: str) -> bool:
    """
    Tell whether a text line is real content rather than garbled output

    Checks, applied in order (the first failing check rejects the line):
    1. Lines of 3 characters or fewer are always accepted (may be names etc.)
    2. Three or more "(cid:N)" markers reject the line
    3. Valid character ratio (Chinese, ASCII alphanumeric, whitespace, common
       ASCII/CJK/fullwidth punctuation) must be at least 0.5
    4. Single-character spacing anomaly ("O U W Z_W V 2" pattern): rejected when the
       isolated single letters/digits exceed 30% of the non-space characters
       (only for lines with more than 5 non-space characters)
    5. Alphanumeric runs of 4+ characters (spaces removed first): a run is garbled
       when more than half of its length are case or digit/non-digit switches;
       the line is rejected when more than 40% of its runs are garbled

    Args:
        line: Text line to check
    Returns:
        True means valid line, False means garbled line
    """
    # Short lines may be valid content like names, keep them
    if len(line) <= 3:
        return True

    # Unmapped glyphs show up as "(cid:N)" markers
    if len(re.findall(r'\(cid:\d+\)', line)) >= 3:
        return False

    # Valid characters: Chinese (incl. extension), ASCII alphanumeric, common punctuation and spaces,
    # fullwidth chars, CJK punctuation
    accepted = re.findall(
        r'[\u4e00-\u9fff\u3400-\u4dbf\uf900-\ufaff'
        r'a-zA-Z0-9\s@.,:;!?()（）【】\-_/\\|·•'
        r'、，。：；！？\u201c\u201d\u2018\u2019《》'
        r'\uff01-\uff5e'
        r'\u3000-\u303f'
        r'#%&+=~`\u00b7\u2022\u2013\u2014'
        r']',
        line
    )
    if len(accepted) / len(line) < 0.5:
        return False

    # Isolated single letters/digits surrounded by whitespace or line boundaries
    isolated = re.findall(r'(?:^|\s)([a-zA-Z0-9])(?:\s|$)', line)
    compact = line.replace(" ", "")
    if len(compact) > 5 and isolated and len(isolated) / len(compact) > 0.3:
        return False

    def _switch_count(seq: str) -> int:
        """Count adjacent pairs switching letter case or digit/non-digit"""
        total = 0
        for prev, cur in zip(seq, seq[1:]):
            case_switch = cur.isupper() != prev.isupper() and cur.isalpha() and prev.isalpha()
            digit_switch = cur.isdigit() != prev.isdigit()
            if case_switch or digit_switch:
                total += 1
        return total

    # Normal words switch rarely (e.g. "Spring" switches once); sequences such as
    # "UJqZX9V2" switch at most positions
    runs = re.findall(r'[a-zA-Z0-9]{4,}', compact)
    if runs:
        garbled_runs = sum(1 for run in runs if _switch_count(run) / len(run) > 0.5)
        if garbled_runs / len(runs) > 0.4:
            return False

    return True


def _fix_split_labels(lines: list[str]) -> list[str]:
    """
    Fix field label split issues

    Some PDF layouts split field labels across line start/end, e.g.:
    - "名：陈晓俐 姓" -> should be fixed to "姓名：陈晓俐"
    - "别：男 性" -> should be fixed to "性别：男"

    Args:
        lines: Original line text list
    Returns:
        Fixed line text list
    """
    # Common split field label patterns: (line-end part, line-start part) -> full label
    split_patterns = {
        ("姓", "名"): "姓名",
        ("性", "别"): "性别",
        ("年", "龄"): "年龄",
        ("电", "话"): "电话",
        ("邮", "箱"): "邮箱",
        ("学", "历"): "学历",
        ("专", "业"): "专业",
        ("地", "址"): "地址",
        ("籍", "贯"): "籍贯",
        ("民", "族"): "民族",
    }

    fixed = []
    for line in lines:
        # Detect in-line split patterns: "X：content Y" where (Y, X) is a split pair
        for (suffix_char, prefix_char), full_label in split_patterns.items():
            # Pattern: "prefix_char：content suffix_char" (first half at line start, second half at line end)
            pattern = rf'^({re.escape(prefix_char)})\s*[:：]\s*(.+?)\s+{re.escape(suffix_char)}\s*$'
            m = re.match(pattern, line)
            if m:
                content = m.group(2).strip()
                line = f"{full_label}：{content}"
                break
            # Pattern: "suffix_char content prefix_char：" (second half at line start, first half at line end)
            pattern2 = rf'^{re.escape(suffix_char)}\s*[:：]?\s*(.+?)\s+{re.escape(prefix_char)}\s*$'
            m2 = re.match(pattern2, line)
            if m2:
                content = m2.group(1).strip()
                line = f"{full_label}：{content}"
                break
        fixed.append(line)
    return fixed


def extract_text(filename: str, binary: bytes) -> tuple[str, list[str], list[dict]]:
    """
    Turn a resume file into line-indexed text (Pipeline Phase 1).

    PDF files go through metadata extraction, optional OCR supplementation,
    block fusion, layout reordering and line indexing. DOCX files contribute
    their paragraphs followed by their table rows. Every other extension is
    read through get_text and split on newlines.

    Args:
        filename: File name (only its lower-cased extension selects the path)
        binary: File binary content
    Returns:
        (indexed_text, lines, line_positions) tuple:
        - indexed_text: Text where each line is prefixed with "[i]: "
        - lines: List of original line texts
        - line_positions: Per-line coordinate info (empty list for non-PDF formats)
        Any exception is logged and ("", [], []) is returned instead.
    """
    lowered_name = filename.lower()

    def _with_line_numbers(rows: list[str]) -> str:
        # Same "[index]: text" layout for every non-PDF format
        return "\n".join(f"[{i}]: {line}" for i, line in enumerate(rows))

    try:
        if lowered_name.endswith(".pdf"):
            # Path 1: text embedded in the PDF
            meta_blocks = _extract_metadata_text(binary)
            embedded_chars = sum(len(b["text"]) for b in meta_blocks)

            # OCR is enabled when the embedded text is too short (< 100 chars)
            # or when too few of its non-empty blocks pass _is_valid_line
            if embedded_chars < 100:
                logger.info("PDF metadata text too short, enabling OCR supplementation")
                need_ocr = True
            else:
                need_ocr = False
                block_texts = [
                    stripped
                    for stripped in (b.get("text", "").strip() for b in meta_blocks)
                    if stripped
                ]
                if block_texts:
                    valid_ratio = sum(1 for t in block_texts if _is_valid_line(t)) / len(block_texts)
                    if valid_ratio < 0.6:
                        logger.info(
                            f"PDF metadata text quality low (valid line ratio {valid_ratio:.1%}), enabling OCR supplementation"
                        )
                        need_ocr = True

            # Path 2: OCR, given the metadata blocks so it can skip what is already covered
            ocr_blocks = _extract_ocr_text(binary, meta_blocks=meta_blocks) if need_ocr else []

            # Fuse both paths, reorder by layout, then build the indexed text
            fused_blocks = _fuse_text_blocks(meta_blocks, ocr_blocks)
            sorted_blocks = _layout_detect_reorder(fused_blocks, binary)
            return _build_indexed_text(sorted_blocks)

        if lowered_name.endswith(".docx"):
            from docx import Document
            document = Document(BytesIO(binary))

            lines = []
            for paragraph in document.paragraphs:
                stripped = paragraph.text.strip()
                if stripped:
                    lines.append(stripped)

            # Resumes often keep personal info in tables, which paragraphs do not cover
            for table in document.tables:
                for row in table.rows:
                    cell_texts = [t for t in (cell.text.strip() for cell in row.cells) if t]
                    if not cell_texts:
                        continue
                    row_text = " | ".join(cell_texts)
                    # Skip a row whose joined text is already present
                    if row_text not in lines:
                        lines.append(row_text)

            # DOCX has no coordinate info
            return _with_line_numbers(lines), lines, []

        # Any other extension: plain text split into non-empty stripped lines
        raw_text = get_text(filename, binary)
        lines = [piece.strip() for piece in raw_text.split("\n") if piece.strip()]
        return _with_line_numbers(lines), lines, []

    except Exception:
        logger.exception(f"Text extraction failed: {filename}")
        return "", [], []


# ==================== Phase 2: Parallel LLM Structured Extraction ====================


def _clean_llm_json_response(response: str) -> str:
    """
    Clean LLM JSON response.

    Uses SmartResume's lightweight string extraction strategy:
    1. Remove markdown code block markers
    2. Remove <think>...</think> thinking tags (reasoning models may output these)
    3. text.find("{") and text.rfind("}") to locate valid JSON block

    Args:
        response: Raw LLM response text
    Returns:
        Cleaned JSON string
    """
    text = response.strip()
    # Remove markdown code block markers
    text = text.replace("```json", "").replace("```", "").strip()
    # Remove reasoning model thinking tags
    text = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL).strip()
    # Clean escaped quotes (SmartResume's approach)
    text = text.replace('\\"', '"')
    # SmartResume strategy: locate first { and last }
    start = text.find("{")
    end = text.rfind("}")
    if start != -1 and end != -1 and end > start:
        return text[start:end + 1]
    return text


def _parse_json_with_repair(text: str) -> dict:
    """
    Parse JSON string, attempt repair on failure (ref SmartResume's json_repair strategy).

    Repair strategies:
    1. Standard json.loads
    2. Replace Python-style booleans/None
    3. Use json_repair library

    Args:
        text: JSON string
    Returns:
        Parsed dictionary
    Raises:
        json.JSONDecodeError: Raised when all repair strategies fail
    """
    # First attempt: standard parsing
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass

    # Second attempt: replace Python-style values (ref SmartResume)
    repaired = text.replace("'", '"')
    repaired = repaired.replace('True', 'true')
    repaired = repaired.replace('False', 'false')
    repaired = repaired.replace('None', 'null')
    try:
        return json.loads(repaired)
    except json.JSONDecodeError:
        pass

    # Third attempt: use json_repair library
    if json_repair is not None:
        try:
            return json_repair.loads(text)
        except Exception:
            pass

    # All strategies failed
    raise json.JSONDecodeError("All JSON repair strategies failed", text, 0)


def _call_llm(prompt: str, tenant_id, lang: str) -> Optional[dict]:
    """
    Send one prompt to the chat LLM and return its reply parsed as JSON
    (ref SmartResume's retry + fault-tolerance strategy).

    The first attempt uses temperature 0.1. Each retry after a JSON parse
    failure uses temperature 1.0 and a random seed for output diversity.
    At most _LLM_MAX_RETRIES retries are made.

    Args:
        prompt: User prompt
        tenant_id: Passed to LLMBundle to select the chat model
        lang: Language (selects the system prompt, also passed to LLMBundle)
    Returns:
        Parsed dictionary, or None when parsing keeps failing or the call
        itself raises
    """
    try:
        from api.db.services.llm_service import LLMBundle
        from common.constants import LLMType

        llm = LLMBundle(tenant_id, LLMType.CHAT, lang=lang)

        for attempt in range(_LLM_MAX_RETRIES + 1):
            is_retry = attempt > 0
            try:
                # Low temperature first; hotter with a fresh seed on retries
                gen_conf = {"temperature": 1.0 if is_retry else 0.1, "max_tokens": 2048}
                if is_retry:
                    gen_conf["seed"] = random.randint(0, 1000000)

                chat_call = llm.async_chat(
                    system=get_system_prompt(lang),
                    history=[{"role": "user", "content": prompt}],
                    gen_conf=gen_conf,
                )
                response = llm._run_coroutine_sync(chat_call)
                return _parse_json_with_repair(_clean_llm_json_response(response))

            except json.JSONDecodeError as e:
                if attempt >= _LLM_MAX_RETRIES:
                    logger.warning(f"LLM JSON parse failed (retries exhausted): {e}")
                    return None
                logger.info(f"LLM JSON parse failed (attempt {attempt + 1}), retrying: {e}")

    except Exception as e:
        logger.warning(f"LLM call failed: {e}")
        return None


def _normalize_for_comparison(text: str) -> str:
    """
    Normalize text for comparison (ref SmartResume's _normalize_for_comparison).

    Unify fullwidth/halfwidth, remove whitespace, Unicode normalization,
    so that "阿里巴巴" and "阿 里 巴 巴" can match.

    Args:
        text: Original text
    Returns:
        Normalized text
    """
    if not text:
        return ""
    # Unicode NFKC normalization (fullwidth to halfwidth, etc.)
    text = unicodedata.normalize("NFKC", text)
    # Remove all whitespace characters
    text = re.sub(r'\s+', '', text)
    return text.lower()

def _calc_single_exp_years(start_str: str, end_str: str) -> float:
    """
    Calculate years for a single experience entry.

    Args:
        start_str: Start date string
        end_str: End date string ("至今" etc. means current)
    Returns:
        Years (float, 1 decimal place), returns 0 if unable to calculate
    """
    from datetime import datetime

    start_str = str(start_str).strip()
    end_str = str(end_str).strip()
    if not start_str:
        return 0

    start_date = _parse_date_str(start_str)
    if not start_date:
        return 0

    if end_str in ("至今", "现在", "present", "Present", "now", "Now", ""):
        end_date = datetime.now()
    else:
        end_date = _parse_date_str(end_str)
        if not end_date:
            end_date = datetime.now()

    months = (end_date.year - start_date.year) * 12 + (end_date.month - start_date.month)
    if months <= 0:
        return 0
    return round(months / 12.0, 1)


def _calculate_work_years(experiences: list[dict]) -> float:
    """
    Calculate total work years based on start/end dates of each work experience.

    Args:
        experiences: List of work experiences, each containing start_date, end_date fields
    Returns:
        Total work years (float), returns 0 if unable to calculate
    """
    total = 0.0
    for exp in experiences:
        total += _calc_single_exp_years(
            exp.get("start_date", ""), exp.get("end_date", "")
        )
    return round(total, 1)


def _parse_date_str(date_str: str) -> Optional[datetime.datetime]:
    """
    Parse date string, supporting multiple common formats.

    Supported formats:
    - 2024.1 / 2024.01
    - 2024-1 / 2024-01
    - 2024/1 / 2024/01
    - 2024年1月
    - 2024 (year only, defaults to January)

    Args:
        date_str: Date string
    Returns:
        datetime object, or None on parse failure
    """
    from datetime import datetime

    date_str = date_str.strip()
    # Try matching year.month / year-month / year/month / year(nian)month(yue) formats
    patterns = [
        (r"((?:19|20)\d{2})[.\-/年](\d{1,2})", "%Y-%m"),
        (r"^((?:19|20)\d{2})$", "%Y"),
    ]
    for pattern, _ in patterns:
        m = re.search(pattern, date_str)
        if m:
            try:
                year = int(m.group(1))
                month = int(m.group(2)) if len(m.groups()) > 1 else 1
                # Month range validation
                if month < 1 or month > 12:
                    month = 1
                return datetime(year, month, 1)
            except (ValueError, IndexError):
                continue
    return None


def _extract_description_from_range(
        index_range: list, lines: list[str],
        company: str = "", position: str = ""
) -> str:
    """
    Extract description from original text by index range (ref SmartResume's _extract_description_from_range).

    Key improvement:
    - Filter out lines containing both company name and position title (avoid mixing header lines into description)
    - Boundary safety checks

    Args:
        index_range: [start_line_number, end_line_number]
        lines: List of original line texts
        company: Company name (used to filter header lines)
        position: Position title (used to filter header lines)
    Returns:
        Extracted description text
    """
    if not index_range or len(index_range) != 2:
        return ""

    start_idx, end_idx = int(index_range[0]), int(index_range[1])

    # Boundary safety check
    if start_idx < 0 or end_idx >= len(lines) or start_idx > end_idx:
        return ""

    extracted_lines = lines[start_idx:end_idx + 1]

    # Filter out lines containing both company name and position title (ref SmartResume)
    if company or position:
        norm_company = _normalize_for_comparison(company)
        norm_position = _normalize_for_comparison(position)
        filtered = []
        for line in extracted_lines:
            norm_line = _normalize_for_comparison(line)
            # If a line contains both company name and position title, it's likely a header line, skip
            if norm_company and norm_position and norm_company in norm_line and norm_position in norm_line:
                continue
            # If a line exactly equals company name or position title, also skip
            if norm_line == norm_company or norm_line == norm_position:
                continue
            filtered.append(line)
        extracted_lines = filtered

    if not extracted_lines:
        return ""

    return "\n".join(line.strip() for line in extracted_lines if line.strip())


def _extract_basic_info(indexed_text: str, tenant_id, lang: str) -> Optional[dict]:
    """Extract basic info (subtask 1).

    Only the first 8000 characters of the indexed text are sent, since
    basic info is usually at the beginning of the resume.

    Returns:
        Whatever _call_llm returns for the basic-info prompt (dict or None)
    """
    template = get_basic_info_prompt(lang)
    leading_text = indexed_text[:8000]
    return _call_llm(template.format(indexed_text=leading_text), tenant_id, lang)


def _extract_work_experience(indexed_text: str, tenant_id, lang: str) -> Optional[dict]:
    """Extract work experience (subtask 2, using index pointers).

    The full indexed text is sent, because work experience may span the
    middle-to-end of the resume and must not be truncated.

    Returns:
        Whatever _call_llm returns for the work-experience prompt (dict or None)
    """
    template = get_work_exp_prompt(lang)
    filled_prompt = template.format(indexed_text=indexed_text)
    return _call_llm(filled_prompt, tenant_id, lang)


def _extract_education(indexed_text: str, tenant_id, lang: str) -> Optional[dict]:
    """Extract education background (subtask 3).

    The full indexed text is sent, because education is usually at the end
    of the resume. Resume text is generally under 30K chars, within the LLM
    context window.

    Returns:
        Whatever _call_llm returns for the education prompt (dict or None)
    """
    template = get_education_prompt(lang)
    filled_prompt = template.format(indexed_text=indexed_text)
    return _call_llm(filled_prompt, tenant_id, lang)


def _extract_project_experience(indexed_text: str, tenant_id, lang: str) -> Optional[dict]:
    """Extract project experience (subtask 4, using index pointers).

    The full indexed text is sent, because project experience may span the
    middle-to-end of the resume and must not be truncated.

    Returns:
        Whatever _call_llm returns for the project-experience prompt (dict or None)
    """
    template = get_project_exp_prompt(lang)
    filled_prompt = template.format(indexed_text=indexed_text)
    return _call_llm(filled_prompt, tenant_id, lang)


def parse_with_llm(indexed_text: str, lines: list[str], tenant_id , lang: str) -> Optional[dict]:
    """
    Extract resume info using parallel task decomposition strategy (ref SmartResume Section 3.2).

    Runs four independent LLM subtasks in parallel and merges their results:
    1. Basic info (name, phone, skills, self-evaluation, etc.)
    2. Work experience (company, position, description line ranges)
    3. Education background (school, major, degree)
    4. Project experience (project name, role, description line ranges)

    Each result is waited on for at most 60 seconds. The executor is shut
    down without waiting, so a timeout returns promptly instead of blocking
    until every LLM call has finished (which is what leaving a
    `with ThreadPoolExecutor(...)` block does). Calls already in flight
    keep running in their worker threads.

    LLM output is treated as untrusted: a section that is not a list, or an
    entry that is not a dict, is skipped rather than aborting the merge.

    Args:
        indexed_text: Line-indexed resume text
        lines: List of original line texts (for index-based extraction)
        tenant_id: Forwarded to each subtask's LLM call
        lang: Language
    Returns:
        Merged structured resume dictionary; None on timeout, on any error,
        or when fewer than three fields (including name_kwd) were obtained
    """
    try:
        subtasks = (
            _extract_basic_info,
            _extract_work_experience,
            _extract_education,
            _extract_project_experience,
        )
        executor = concurrent.futures.ThreadPoolExecutor(max_workers=len(subtasks))
        try:
            futures = [executor.submit(task, indexed_text, tenant_id, lang) for task in subtasks]
            basic_info, work_exp, education, project_exp = [
                future.result(timeout=60) for future in futures
            ]
        finally:
            # wait=False: never block on stragglers; drop anything not yet started
            executor.shutdown(wait=False, cancel_futures=True)

        resume = {}

        # Merge basic info
        if isinstance(basic_info, dict) and basic_info:
            resume.update(basic_info)
            logger.info(f"Basic info extraction succeeded: {len(basic_info)} fields")

        # Process work experience (index pointer extraction)
        if isinstance(work_exp, dict) and "workExperience" in work_exp:
            _llm_merge_work(resume, _llm_dict_entries(work_exp["workExperience"]), lines)

        # Process education background
        if isinstance(education, dict) and "education" in education:
            _llm_merge_education(resume, _llm_dict_entries(education["education"]))

        # Process project experience (index pointer extraction)
        if isinstance(project_exp, dict) and "projectExperience" in project_exp:
            _llm_merge_projects(resume, _llm_dict_entries(project_exp["projectExperience"]), lines)

        if not resume.get("name_kwd"):
            resume["name_kwd"] = "Unknown" if _is_english(lang) else "未知"

        return resume if len(resume) > 2 else None

    except concurrent.futures.TimeoutError:
        logger.warning("LLM parallel extraction timed out")
        return None
    except Exception as e:
        logger.warning(f"LLM parallel extraction failed: {e}")
        return None


def _llm_dict_entries(value) -> list[dict]:
    """Return the dict elements of an LLM-produced list; [] when *value* is not a list."""
    if not isinstance(value, list):
        return []
    return [item for item in value if isinstance(item, dict)]


def _llm_entry_description(entry: dict, lines: list[str], company, position) -> str:
    """Resolve an entry's "desc_lines" pointer to stripped source text ("" when unusable)."""
    desc_lines = entry.get("desc_lines", [])
    if not (isinstance(desc_lines, list) and len(desc_lines) == 2):
        return ""
    try:
        desc = _extract_description_from_range(
            desc_lines, lines, company=company, position=position
        )
    except (TypeError, ValueError):
        # Non-numeric line numbers from the LLM: treat as "no description"
        return ""
    return desc.strip()


def _llm_merge_work(resume: dict, experiences: list[dict], lines: list[str]) -> None:
    """Write company/position/description/years fields from work entries into *resume*."""
    companies = []
    positions = []
    work_descs = []
    # Per-entry details (dates, years) kept for chunk generation
    work_exp_details = []
    for exp in experiences:
        company = exp.get("company", "")
        position = exp.get("position", "")
        start_date = exp.get("start_date", "")
        end_date = exp.get("end_date", "")
        if company:
            companies.append(company)
        if position:
            positions.append(position)
        work_exp_details.append({
            "company": company,
            "position": position,
            "start_date": start_date,
            "end_date": end_date,
            "years": _calc_single_exp_years(start_date, end_date),
        })
        # Index pointer mechanism: description is re-extracted from the source lines
        desc = _llm_entry_description(exp, lines, company, position)
        if desc:
            work_descs.append(desc)

    if companies:
        resume["corp_nm_tks"] = companies
        resume["corporation_name_tks"] = companies[0]
    if positions:
        resume["position_name_tks"] = positions
    if work_descs:
        resume["work_desc_tks"] = work_descs
    if work_exp_details:
        resume["_work_exp_details"] = work_exp_details
    # Total years computed from the entries' dates overrides any value from basic info
    calculated_years = _calculate_work_years(experiences)
    if calculated_years > 0:
        resume["work_exp_flt"] = calculated_years
    logger.info(f"Work experience extraction succeeded: {len(experiences)} entries, calculated total years: {calculated_years}")


def _llm_merge_education(resume: dict, edu_list: list[dict]) -> None:
    """Write school/major/degree fields and the graduation year from education entries into *resume*."""
    schools = []
    majors = []
    degrees = []
    for edu in edu_list:
        if edu.get("school"):
            schools.append(edu["school"])
        if edu.get("major"):
            majors.append(edu["major"])
        if edu.get("degree"):
            degrees.append(edu["degree"])
        # Graduation year: the first entry with a 19xx/20xx year wins, unless already set
        end_date = edu.get("end_date", "")
        if end_date and not resume.get("edu_end_int"):
            year_match = re.search(r"(19|20)\d{2}", str(end_date))
            if year_match:
                resume["edu_end_int"] = int(year_match.group(0))

    if schools:
        resume["school_name_tks"] = schools
        resume["first_school_name_tks"] = schools[-1]  # Earliest school is usually last
    if majors:
        resume["major_tks"] = majors
        resume["first_major_tks"] = majors[-1]
    if degrees:
        resume["degree_kwd"] = degrees
        # Infer highest degree (supports both Chinese and English degree names)
        degree_rank = {
            "博士": 5, "PhD": 5, "Doctor": 5,
            "硕士": 4, "Master": 4, "MBA": 4, "EMBA": 4, "MPA": 4,
            "本科": 3, "Bachelor": 3,
            "大专": 2, "专科": 2, "Associate": 2, "Diploma": 2,
            "高中": 1, "High School": 1,
        }
        highest = max(degrees, key=lambda d: degree_rank.get(d, 0), default="")
        if highest:
            resume["highest_degree_kwd"] = highest
        resume["first_degree_kwd"] = degrees[-1]
    logger.info(f"Education extraction succeeded: {len(edu_list)} entries")


def _llm_merge_projects(resume: dict, projects: list[dict], lines: list[str]) -> None:
    """Write project names and source-extracted descriptions from project entries into *resume*."""
    project_names = []
    project_descs = []
    for proj in projects:
        name = proj.get("project_name", "")
        if name:
            project_names.append(name)
        # Project name and role act as the header-line filter
        desc = _llm_entry_description(proj, lines, name, proj.get("role", ""))
        if desc:
            project_descs.append(desc)

    if project_names:
        resume["project_tks"] = project_names
    if project_descs:
        resume["project_desc_tks"] = project_descs
    logger.info(f"Project experience extraction succeeded: {len(projects)} entries")


# ==================== Phase 3: Regex Fallback Parsing ====================


def parse_with_regex(text: str, lang: str = "Chinese") -> dict:
    """
    Parse resume text using regex (fallback strategy)

    When LLM parsing fails, use regex to extract basic structured info from text.

    Args:
        text: Resume text content (without line number index)
        lang: Language parameter, default "Chinese"
    Returns:
        Structured resume info dictionary
    """
    resume: dict = {}
    lines = [line.strip() for line in text.split("\n") if line.strip()]

    # --- Extract Name ---
    if _is_english(lang):
        # English resume: extract from "Name: XXX" format
        for line in lines[:30]:
            name_match = re.search(r'(?:Name|Full\s*Name)\s*[:：]\s*([A-Za-z][A-Za-z\s\-\.]{1,40})', line, re.IGNORECASE)
            if name_match:
                resume["name_kwd"] = name_match.group(1).strip()
                break
        # English resume strategy 2: first line if short text without digits, may be a name
        if "name_kwd" not in resume and lines:
            first = lines[0].strip()
            if len(first) <= 40 and not re.search(r"\d", first) and re.match(r'^[A-Za-z][A-Za-z\s\-\.]+$', first):
                resume["name_kwd"] = first
    else:
        # Chinese resume: extract from "姓名：XXX" format
        for line in lines[:30]:
            name_match = re.search(r'姓\s*名\s*[:：]\s*([\u4e00-\u9fa5]{2,4})', line)
            if name_match:
                resume["name_kwd"] = name_match.group(1)
                break

        # Strategy 2: search first 20 lines for standalone Chinese names (2-4 chars), excluding common title words
        if "name_kwd" not in resume:
            title_words = {
                "个人", "简历", "求职", "应聘", "基本", "信息", "概述", "简介",
                "教育", "工作", "经历", "经验", "技能", "项目", "自我", "评价",
                "专业", "技术", "证书", "语言", "能力", "培训", "荣誉", "奖项",
            }
            for line in lines[:20]:
                if any(w in line for w in title_words):
                    continue
                if re.search(r'[:：]', line) and len(line) > 6:
                    continue
                cleaned = re.sub(r"^[A-Za-z_\-\d\s]+\s+", "", line)
                cleaned = re.sub(r"\s+[A-Za-z_\-\d\s]+$", "", cleaned).strip()
                if 2 <= len(cleaned) <= 4 and re.match(r"^[\u4e00-\u9fa5]{2,4}$", cleaned):
                    resume["name_kwd"] = cleaned
                    break

        # Strategy 3: first line if short without digits, may be a name
        if "name_kwd" not in resume and lines:
            first = lines[0].strip()
            if len(first) <= 10 and not re.search(r"\d", first):
                cn_part = re.findall(r'[\u4e00-\u9fa5]+', first)
                if cn_part and 2 <= len(cn_part[0]) <= 4:
                    resume["name_kwd"] = cn_part[0]

    # --- Extract Phone Number ---
    phones = re.findall(r"1[3-9]\d{9}", text)
    if phones:
        resume["phone_kwd"] = phones[0]

    # --- Extract Email ---
    emails = re.findall(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", text)
    if emails:
        resume["email_tks"] = emails[0]

    # --- Extract Gender ---
    if _is_english(lang):
        # English resume: extract from "Gender: Male/Female" format
        gender_label = re.search(r'(?:Gender|Sex)\s*[:：]\s*(Male|Female|M|F)', text, re.IGNORECASE)
        if gender_label:
            raw = gender_label.group(1).strip().upper()
            resume["gender_kwd"] = "Male" if raw in ("M", "MALE") else "Female"
        else:
            gender_match = re.search(r'\b(Male|Female)\b', text[:500], re.IGNORECASE)
            if gender_match:
                resume["gender_kwd"] = gender_match.group(1).capitalize()
    else:
        # Chinese resume: extract from "性别：男/女" format
        gender_label = re.search(r'性\s*别\s*[:：]\s*(男|女)', text)
        if gender_label:
            resume["gender_kwd"] = gender_label.group(1)
        else:
            gender_match = re.search(r"(男|女)", text[:500])
            if gender_match:
                resume["gender_kwd"] = gender_match.group(1)

    # --- Extract Age ---
    if _is_english(lang):
        # English resume: match "25 years old" or "Age: 25"
        age_match = re.search(r'(?:Age)\s*[:：]\s*(\d{1,2})', text, re.IGNORECASE)
        if not age_match:
            age_match = re.search(r'(\d{1,2})\s*years?\s*old', text, re.IGNORECASE)
        if age_match:
            resume["age_int"] = int(age_match.group(1))
    else:
        # Chinese resume: match "25岁"
        age_match = re.search(r"(\d{1,2})\s*岁", text)
        if age_match:
            resume["age_int"] = int(age_match.group(1))

    # --- Extract Date of Birth ---
    if _is_english(lang):
        # English resume: match "1990-01-15" or "Jan 15, 1990" etc.
        birth_match = re.search(r'(?:Birth|DOB|Date\s*of\s*Birth)\s*[:：]\s*(.{6,20})', text, re.IGNORECASE)
        if birth_match:
            resume["birth_dt"] = birth_match.group(1).strip()
        else:
            birth_match = re.search(r"(19|20)\d{2}[-/]\d{1,2}[-/]\d{1,2}", text)
            if birth_match:
                resume["birth_dt"] = birth_match.group(0)
    else:
        # Chinese resume: match "1990年1月15日" or "1990-01-15"
        birth_match = re.search(r"(19|20)\d{2}[年/-]\d{1,2}[月/-]\d{1,2}", text)
        if birth_match:
            resume["birth_dt"] = birth_match.group(0)

    # --- Extract Education Level ---
    degree_keywords_zh = ["博士", "硕士", "本科", "大专", "专科", "高中", "MBA", "EMBA", "MPA"]
    degree_keywords_en = ["PhD", "Master", "Bachelor", "Associate", "Diploma", "High School",
                          "MBA", "EMBA", "MPA", "Doctor"]
    degree_keywords = degree_keywords_en if _is_english(lang) else degree_keywords_zh
    found_degrees = [d for d in degree_keywords if d in text]
    if found_degrees:
        resume["degree_kwd"] = found_degrees

    # --- Extract School ---
    if _is_english(lang):
        # English resume: match "University/College/Institute/School" keywords
        schools = re.findall(
            r'([A-Z][A-Za-z\s\-&]{2,40}(?:University|College|Institute|School|Academy))',
            text
        )
        # Remove extra whitespace
        schools = [re.sub(r'\s+', ' ', s).strip() for s in schools]
    else:
        # Chinese resume: match "XX大学/学院/职业技术学院"
        schools = re.findall(r"[\u4e00-\u9fa5]{2,15}(?:大学|学院|职业技术学院)", text)
    if schools:
        resume["school_name_tks"] = list(set(schools))
        resume["first_school_name_tks"] = schools[0]

    # --- Extract Major ---
    if _is_english(lang):
        # English resume: match "Major: XXX" / "Field of Study: XXX" / "Specialization: XXX"
        majors = re.findall(
            r'(?:Major|Field\s*of\s*Study|Specialization|Concentration)\s*[:：]\s*([A-Za-z\s\-&,]{2,40})',
            text, re.IGNORECASE
        )
        majors = [m.strip() for m in majors if m.strip()]
    else:
        # Chinese resume: match "专业：XXX"
        majors = re.findall(r"专业[:：]\s*([\u4e00-\u9fa5]{2,20})", text)
    if majors:
        resume["major_tks"] = majors
        resume["first_major_tks"] = majors[0]

    # --- Extract Company Names ---
    if _is_english(lang):
        # English resume: match common company suffixes
        en_company_patterns = [
            r'([A-Z][A-Za-z\s\-&,\.]{2,40}(?:Inc\.|Corp\.|Ltd\.|LLC|Co\.|Company|Group|Technologies|Technology|Solutions|Consulting|Services|Bank))',
        ]
        companies = []
        for pattern in en_company_patterns:
            companies.extend(re.findall(pattern, text))
        companies = [re.sub(r'\s+', ' ', c).strip() for c in companies]
    else:
        # Chinese resume: match "XX有限公司" format
        company_patterns = [
            r"[\u4e00-\u9fa5]{2,20}[（(][\u4e00-\u9fa5]{2,10}[)）](?:科技|信息技术|网络科技)?(?:股份)?有限公司",
            r"[\u4e00-\u9fa5]{4,20}(?:科技|信息技术|网络科技|银行)?(?:股份)?有限公司",
        ]
        companies = []
        for pattern in company_patterns:
            companies.extend(re.findall(pattern, text))

    unique_companies = []
    seen = set()
    # Filter verb list (bilingual)
    filter_verbs = (
        ["completed", "conducted", "implemented", "responsible", "participated", "developed"]
        if _is_english(lang)
        else ["完成", "进行", "实施", "负责", "参与", "开发"]
    )
    min_len = 3 if _is_english(lang) else 6
    for c in companies:
        if len(c) < min_len or any(v in c.lower() for v in filter_verbs) or c in seen:
            continue
        is_sub = False
        for existing in list(unique_companies):
            if c in existing:
                is_sub = True
                break
            if existing in c:
                unique_companies.remove(existing)
                seen.discard(existing)
        if not is_sub:
            unique_companies.append(c)
            seen.add(c)

    if unique_companies:
        resume["corp_nm_tks"] = unique_companies
        resume["corporation_name_tks"] = unique_companies[0]

    # --- Extract Position (improved: context constraints to reduce noise) ---
    if _is_english(lang):
        # English resume: Strategy 1 - extract from "Title: XXX" / "Position: XXX" / "Role: XXX" format
        position_label_matches = re.findall(
            r'(?:Title|Position|Role|Job\s*Title)\s*[:：]\s*([A-Za-z\s\-/&]{2,30})',
            text, re.IGNORECASE
        )
        positions = [p.strip() for p in position_label_matches if p.strip()]

        # English resume: Strategy 2 - match common position suffix keywords
        en_position_suffixes = [
            "Engineer", "Manager", "Director", "Supervisor", "Specialist",
            "Designer", "Consultant", "Assistant", "Architect", "Analyst",
            "Developer", "Lead", "Officer", "Coordinator", "Administrator",
            "Intern", "VP", "President",
        ]
        for line in lines:
            if len(line) > 60:
                continue  # Skip overly long lines (usually description text)
            for suffix in en_position_suffixes:
                match = re.search(rf'([A-Za-z\s\-]{{1,25}}{suffix})\b', line, re.IGNORECASE)
                if match:
                    pos = match.group(1).strip()
                    # Filter out matches that are clearly not positions (contain verbs)
                    filter_pos_verbs = ["responsible", "participated", "completed", "developed", "designed"]
                    if not any(v in pos.lower() for v in filter_pos_verbs) and len(pos) > 3:
                        positions.append(pos)
    else:
        # Chinese resume: Strategy 1 - extract from "职位/岗位：XXX" format
        position_label_matches = re.findall(
            r'(?:职位|岗位|职务|职称|担任)\s*[:：]\s*([\u4e00-\u9fa5a-zA-Z]{2,15})',
            text
        )
        positions = list(position_label_matches)

        # Chinese resume: Strategy 2 - extract from work experience paragraphs (company name followed by position)
        for line in lines:
            pos_match = re.search(
                r'(?:有限公司|集团|银行)\s+([\u4e00-\u9fa5]{2,8}(?:工程师|经理|总监|主管|专员|设计师|顾问|助理|架构师|分析师|运营|产品))',
                line
            )
            if pos_match:
                positions.append(pos_match.group(1))

        # Chinese resume: Strategy 3 - position keywords in standalone lines (length-limited to avoid matching description text)
        position_suffixes = ["工程师", "经理", "总监", "主管", "专员", "设计师", "顾问",
                             "助理", "架构师", "分析师", "开发者", "负责人"]
        for line in lines:
            if len(line) > 20:
                continue  # Skip overly long lines
            for suffix in position_suffixes:
                match = re.search(rf'([\u4e00-\u9fa5]{{1,6}}{suffix})', line)
                if match:
                    pos = match.group(1)
                    if not any(v in pos for v in ["负责", "参与", "完成", "开发了", "设计了"]):
                        positions.append(pos)

    if positions:
        # Deduplicate while preserving order
        seen_pos = set()
        unique_positions = []
        for p in positions:
            if p not in seen_pos:
                seen_pos.add(p)
                unique_positions.append(p)
        resume["position_name_tks"] = unique_positions

    # --- Extract Years of Experience ---
    if _is_english(lang):
        # English resume: match "5 years experience" / "5+ years of experience"
        work_exp_match = re.search(r'(\d+)\+?\s*years?\s*(?:of\s*)?(?:experience|work)', text, re.IGNORECASE)
        if work_exp_match:
            resume["work_exp_flt"] = float(work_exp_match.group(1))
    else:
        # Chinese resume: match "5年...经验"
        work_exp_match = re.search(r"(\d+)\s*年.*?经验", text)
        if work_exp_match:
            resume["work_exp_flt"] = float(work_exp_match.group(1))

    # --- Extract Graduation Year ---
    if _is_english(lang):
        # English resume: match "Graduated 2020" / "Graduation: 2020" / "Class of 2020"
        grad_match = re.search(r'(?:Graduat(?:ed|ion)|Class\s*of)\s*[:：]?\s*((?:19|20)\d{2})', text, re.IGNORECASE)
        if grad_match:
            resume["edu_end_int"] = int(grad_match.group(1))
    else:
        # Chinese resume: match "2020年...毕业"
        grad_match = re.search(r"((?:19|20)\d{2})\s*年.*?毕业", text)
        if grad_match:
            resume["edu_end_int"] = int(grad_match.group(1))

    if "name_kwd" not in resume:
        resume["name_kwd"] = "Unknown" if _is_english(lang) else "未知"

    return resume


# ==================== Phase 4: Post-processing Pipeline ====================


def _postprocess_resume(resume: dict, lines: list[str], lang: str = "Chinese") -> dict:
    """
    Four-phase post-processing pipeline (ref: SmartResume Section 3.2.3)

    1. Source text validation: drop name / company / school / position values that
       cannot be found in the normalized source text
    2. Domain normalization: standardize birth date, phone number and gender
    3. Contextual deduplication: order-preserving list dedup, work-experience dedup
       (company name + time period, then description similarity), and merging of
       project descriptions into work descriptions
    4. Field completion: ensure all required fields exist

    The dictionary is modified in place and also returned. Besides the cleaned
    fields, a debugging key ``_raw_project_descs`` is written and the internal
    ``_name_confidence`` marker is removed.

    Args:
        resume: Raw resume dictionary extracted by LLM (or by the regex fallback)
        lines: Original line text list (for source text validation)
        lang: Language parameter, default "Chinese"
    Returns:
        Post-processed resume dictionary (the same object as ``resume``)
    """
    _en = _is_english(lang)
    full_text = "\n".join(lines) if lines else ""
    # Normalize full text for comparison (ref: SmartResume _validate_fields_in_text)
    norm_full_text = _normalize_for_comparison(full_text)

    # --- Phase 1: Source text validation (prune hallucinations, ref: SmartResume _validate_fields_in_text) ---
    # Name validation: clear if not found in source text (SmartResume strategy: discard hallucinated fields)
    _unknown_names = ("未知", "Unknown")
    if resume.get("name_kwd") and resume["name_kwd"] not in _unknown_names:
        norm_name = _normalize_for_comparison(resume["name_kwd"])
        if norm_full_text and norm_name and norm_name not in norm_full_text:
            logger.warning(f"Name '{resume['name_kwd']}' not found in source text, classified as LLM hallucination, cleared")
            resume["name_kwd"] = ""

    # Validate company names (strict matching: the full normalized name must appear in the source text)
    if resume.get("corp_nm_tks") and norm_full_text:
        verified_companies = []
        for company in resume["corp_nm_tks"]:
            norm_company = _normalize_for_comparison(company)
            if norm_company and norm_company in norm_full_text:
                verified_companies.append(company)
            else:
                logger.debug(f"Company '{company}' not found in source text, filtered out")
        # Update even if all filtered out (SmartResume strategy: prefer missing over wrong)
        resume["corp_nm_tks"] = verified_companies
        if verified_companies:
            resume["corporation_name_tks"] = verified_companies[0]
        else:
            resume["corporation_name_tks"] = ""

    # Validate school names (ref: SmartResume _validate_fields_in_text)
    if resume.get("school_name_tks") and norm_full_text:
        verified_schools = []
        for school in resume["school_name_tks"]:
            norm_school = _normalize_for_comparison(school)
            if norm_school and norm_school in norm_full_text:
                verified_schools.append(school)
            else:
                logger.debug(f"School '{school}' not found in source text, filtered out")
        resume["school_name_tks"] = verified_schools
        if verified_schools:
            if resume.get("first_school_name_tks"):
                # Ensure first_school is also in the verified list; otherwise fall back
                # to the last verified school
                if resume["first_school_name_tks"] not in verified_schools:
                    resume["first_school_name_tks"] = verified_schools[-1]
        else:
            resume["first_school_name_tks"] = ""

    # Validate position names. Unlike companies/schools, the original list is kept
    # when nothing can be verified.
    if resume.get("position_name_tks") and norm_full_text:
        verified_positions = []
        for pos in resume["position_name_tks"]:
            norm_pos = _normalize_for_comparison(pos)
            if norm_pos and norm_pos in norm_full_text:
                verified_positions.append(pos)
        if verified_positions:
            resume["position_name_tks"] = verified_positions

    # --- Phase 2: Domain normalization ---
    # Standardize date format: "1990年1月15日" -> "1990-1-15", "1990年1月" -> "1990-1"
    if resume.get("birth_dt"):
        birth = re.sub(r"[年月]", "-", str(resume["birth_dt"]))
        # Drop the day marker that follows the day number; otherwise it would be
        # left dangling at the end of the normalized date
        birth = re.sub(r"(\d)\s*日", r"\1", birth)
        resume["birth_dt"] = birth.strip().rstrip("-")

    # Clean non-digit characters from phone number (keep + sign)
    if resume.get("phone_kwd"):
        phone = re.sub(r"[^\d+]", "", str(resume["phone_kwd"]))
        if phone:
            resume["phone_kwd"] = phone

    # Standardize gender (output format determined by language parameter).
    # Matching is case-insensitive so that "MALE" / "FEMALE" are normalized too;
    # unrecognized values are left untouched.
    if resume.get("gender_kwd"):
        gender = str(resume["gender_kwd"]).strip().lower()
        if gender in ("male", "m", "男"):
            resume["gender_kwd"] = "Male" if _en else "男"
        elif gender in ("female", "f", "女"):
            resume["gender_kwd"] = "Female" if _en else "女"

    # --- Phase 3: Contextual deduplication ---
    for list_field in ["corp_nm_tks", "school_name_tks", "major_tks",
                       "position_name_tks", "skill_tks"]:
        if isinstance(resume.get(list_field), list):
            # Order-preserving deduplication (items are stringified and stripped)
            seen = set()
            deduped = []
            for item in resume[list_field]:
                item_str = str(item).strip()
                if item_str and item_str not in seen:
                    seen.add(item_str)
                    deduped.append(item_str)
            resume[list_field] = deduped

    # --- Phase 3.4: work_desc_tks dedup by company name + time period ---
    # LLM often extracts the same company's content twice: once from the "Work Experience"
    # section and once from the "Project Experience" section.
    # These have different descriptions (daily work vs project details), so content-based
    # Jaccard dedup cannot catch them. Instead, we detect duplicate companies by checking
    # if one company name is a substring of another AND their time periods overlap.
    # This also fixes the inflated work_exp_flt (e.g. 25.5 years instead of ~14).
    # NOTE(review): entry i of work_desc_tks is paired with entry i of corp_nm_tks,
    # _work_exp_details and position_name_tks. Phase 1 filtering and the Phase 3 list
    # dedup above can shorten corp_nm_tks / position_name_tks, so the indices may no
    # longer line up -- confirm whether alignment is guaranteed upstream.
    work_descs = resume.get("work_desc_tks", [])
    if len(work_descs) > 1:
        corp_names = resume.get("corp_nm_tks", [])
        work_details = resume.get("_work_exp_details", [])
        positions = resume.get("position_name_tks", [])
        kept_indices = []
        for i, desc_i in enumerate(work_descs):
            is_dup = False
            corp_i = _normalize_for_comparison(corp_names[i]) if i < len(corp_names) else ""
            detail_i = work_details[i] if i < len(work_details) else {}
            start_i = detail_i.get("start_date", "")
            end_i = detail_i.get("end_date", "")
            # Parse dates for entry i once (reused across inner loop)
            dt_start_i = _parse_date_str(start_i) if start_i else None
            dt_end_i = _parse_date_str(end_i) if end_i else None
            for j in kept_indices:
                # Strategy A: company name substring + time period overlap
                corp_j = _normalize_for_comparison(corp_names[j]) if j < len(corp_names) else ""
                if corp_i and corp_j:
                    shorter_c, longer_c = (corp_i, corp_j) if len(corp_i) <= len(corp_j) else (corp_j, corp_i)
                    if shorter_c in longer_c:
                        # Check time period overlap using parsed dates
                        # Two intervals [s1,e1] and [s2,e2] overlap iff s1 <= e2 and s2 <= e1
                        # Use <= because resume dates are month-granularity (e.g. "2018.03" means "sometime in March 2018")
                        detail_j = work_details[j] if j < len(work_details) else {}
                        start_j = detail_j.get("start_date", "")
                        end_j = detail_j.get("end_date", "")
                        dt_start_j = _parse_date_str(start_j) if start_j else None
                        dt_end_j = _parse_date_str(end_j) if end_j else None
                        # Need a valid start date on each side to compare
                        if dt_start_i and dt_start_j:
                            # Use far-future as default end if missing
                            eff_end_i = dt_end_i or datetime.datetime(2099, 12, 1)
                            eff_end_j = dt_end_j or datetime.datetime(2099, 12, 1)
                            if dt_start_i <= eff_end_j and dt_start_j <= eff_end_i:
                                is_dup = True
                                break
                        elif (start_i and start_j and start_i == start_j) or \
                                (end_i and end_j and end_i == end_j):
                            # Fallback: exact string match if date parsing fails
                            is_dup = True
                            break
                # Strategy B: content-based check (substring containment, then Jaccard similarity)
                norm_i = _normalize_for_comparison(desc_i)
                norm_j = _normalize_for_comparison(work_descs[j])
                shorter, longer = (norm_i, norm_j) if len(norm_i) <= len(norm_j) else (norm_j, norm_i)
                if shorter and longer and shorter in longer:
                    is_dup = True
                    break
                jac = _shingling_jaccard(desc_i, work_descs[j], n=5)
                if jac > 0.5:
                    is_dup = True
                    break
            if is_dup:
                dup_corp = corp_names[i] if i < len(corp_names) else f"#{i+1}"
                logger.debug(f"Work desc internal duplicate removed: {dup_corp}")
            else:
                kept_indices.append(i)
        # Only update when entries were actually removed
        if len(kept_indices) < len(work_descs):
            resume["work_desc_tks"] = [work_descs[i] for i in kept_indices]
            if corp_names:
                resume["corp_nm_tks"] = [corp_names[i] for i in kept_indices if i < len(corp_names)]
            if work_details:
                resume["_work_exp_details"] = [work_details[i] for i in kept_indices if i < len(work_details)]
            if positions:
                resume["position_name_tks"] = [positions[i] for i in kept_indices if i < len(positions)]
            # Recalculate work years based on deduplicated entries
            new_details = resume.get("_work_exp_details", [])
            if new_details:
                recalc_years = sum(d.get("years", 0) for d in new_details)
                recalc_years = round(recalc_years, 1)
                if recalc_years > 0:
                    resume["work_exp_flt"] = recalc_years
                    # Pre-dedup figure is computed only for the log message
                    years_before = _calculate_work_years([{'start_date': d.get('start_date',''), 'end_date': d.get('end_date','')} for d in work_details])
                    logger.info(f"Work years recalculated: {recalc_years} yrs (before dedup: {years_before} yrs)")
            new_corps = resume.get("corp_nm_tks", [])
            if new_corps:
                resume["corporation_name_tks"] = new_corps[0]

    # --- Phase 3.5: Merge project_desc_tks into work_desc_tks ---
    # Instead of complex cross-dedup, we simply merge unique project descriptions into
    # work_desc_tks and clear project_desc_tks. This avoids the problem where LLM extracts
    # the same content into both fields with slightly different wording.
    # After merge, project_desc_tks is emptied so _build_chunk_document won't generate
    # duplicate chunks. Project names are preserved in project_tks for reference.
    work_descs = resume.get("work_desc_tks", [])
    project_descs = resume.get("project_desc_tks", [])
    # Save pre-merge project descriptions for debugging
    resume["_raw_project_descs"] = list(project_descs) if project_descs else []
    if project_descs:
        project_names = resume.get("project_tks", [])
        merged_count = 0
        skipped_count = 0
        for i, proj_desc in enumerate(project_descs):
            norm_proj = _normalize_for_comparison(proj_desc)
            if not norm_proj:
                continue
            # Check if this project desc already exists in work_descs (exact or near-duplicate).
            # Descriptions merged earlier in this loop are part of work_descs as well.
            already_exists = False
            for wd in work_descs:
                norm_wd = _normalize_for_comparison(wd)
                if not norm_wd:
                    continue
                # Substring containment check
                shorter, longer = (norm_proj, norm_wd) if len(norm_proj) <= len(norm_wd) else (norm_wd, norm_proj)
                if shorter in longer:
                    already_exists = True
                    break
                # Jaccard similarity check
                if _shingling_jaccard(proj_desc, wd, n=5) > 0.5:
                    already_exists = True
                    break
            if already_exists:
                skipped_count += 1
                proj_name = project_names[i] if i < len(project_names) else f"#{i+1}"
                logger.debug(f"Project desc already in work_desc, skipped: {proj_name}")
            else:
                # Append to work_desc_tks with project name prefix for context
                proj_name = project_names[i] if i < len(project_names) else ""
                if proj_name:
                    proj_desc_with_prefix = f"[{proj_name}] {proj_desc}"
                else:
                    proj_desc_with_prefix = proj_desc
                work_descs.append(proj_desc_with_prefix)
                merged_count += 1
        resume["work_desc_tks"] = work_descs
        # Clear project_desc_tks — all content is now in work_desc_tks
        resume["project_desc_tks"] = []
        logger.info(f"Merged project descs into work_desc_tks: {merged_count} merged, {skipped_count} skipped (duplicate)")

    # --- Phase 4: Field completion ---
    required_fields = [
        "name_kwd", "gender_kwd", "phone_kwd", "email_tks",
        "position_name_tks", "school_name_tks", "major_tks",
    ]
    for field in required_fields:
        if field not in resume:
            # Default value is chosen from the field-name suffix
            if field.endswith("_tks"):
                resume[field] = []
            elif field.endswith("_int") or field.endswith("_flt"):
                resume[field] = 0
            else:
                resume[field] = ""

    # Drop the internal `_name_confidence` marker if an earlier step left it in place
    resume.pop("_name_confidence", None)

    return resume


# ==================== Pipeline Orchestration & Chunk Construction ====================


def parse_resume(filename: str, binary: bytes, tenant_id, lang: str = "Chinese") -> tuple[dict, list[str], list[dict]]:
    """
    Orchestrate the resume parsing pipeline end to end

    Steps:
        1. Extract text from the file via extract_text (indexed text, lines, line positions)
        2. Run LLM structured extraction via parse_with_llm
        3. If the LLM result is empty, fall back to parse_with_regex on the joined lines
        4. Run _postprocess_resume on whichever result was obtained

    When text extraction yields nothing, the remaining steps are skipped and a
    placeholder resume containing only a default name is returned.

    Args:
        filename: File name
        binary: File binary content
        tenant_id: Tenant identifier, forwarded unchanged to parse_with_llm
        lang: Language, default "Chinese"
    Returns:
        (resume, lines, line_positions) tuple:
        - resume: Structured resume information dictionary
        - lines: Original line text list (empty when extraction failed)
        - line_positions: Per-line coordinate info list (empty when extraction failed)
    """
    indexed_text, lines, line_positions = extract_text(filename, binary)

    # Guard: nothing could be extracted, so there is nothing to parse
    if not (indexed_text and lines):
        logger.warning(f"Text extraction returned empty: {filename}")
        placeholder = {"name_kwd": "Unknown" if _is_english(lang) else "未知"}
        return placeholder, [], []

    # LLM extraction first; the regex parser is only a fallback for an empty result
    resume = parse_with_llm(indexed_text, lines, tenant_id, lang)
    if not resume:
        logger.info(f"LLM parsing failed, falling back to regex parsing: {filename}")
        resume = parse_with_regex("\n".join(lines), lang)

    # Post-processing applies to both the LLM and the regex result
    return _postprocess_resume(resume, lines, lang), lines, line_positions


def _build_chunk_document(filename: str, resume: dict,
                          lang: str = "Chinese") -> list[dict]:
    """
    Build a list of document chunks from structured resume information

    Each field generates an independent chunk containing tokenization results and metadata.
    Compatible with the build_chunks flow in task_executor.py.

    Key design: Each chunk redundantly includes key identity fields (name, phone, email, etc.),
    so that when any chunk is retrieved, the candidate's identity can be immediately identified.
    The full resume can be fetched via doc_id to get all chunks for complete information.

    Args:
        filename: File name
        resume: Structured resume information dictionary
        lang: Language parameter, default "Chinese"
    Returns:
        Document chunk list, each chunk contains content_with_weight, content_ltks,
        position_int, page_num_int, top_int and other fields
    """
    chunks = []
    # Get the corresponding field map version based on language parameter
    field_map = get_field_map(lang)
    doc = {
        "docnm_kwd": filename,
        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)),
    }
    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])

    # Extract key identity fields, redundantly written to each chunk
    # These fields are small in size but high in information density; once retrieved, the candidate can be immediately identified
    _IDENTITY_FIELDS = ("name_kwd", "phone_kwd", "email_tks", "gender_kwd",
                        "highest_degree_kwd", "work_exp_flt")
    identity_meta = {}
    for ik in _IDENTITY_FIELDS:
        iv = resume.get(ik)
        if not iv:
            continue
        if ik.endswith("_tks"):
            identity_meta[ik] = rag_tokenizer.tokenize(
                " ".join(iv) if isinstance(iv, list) else str(iv)
            )
        elif ik.endswith("_kwd"):
            identity_meta[ik] = iv if isinstance(iv, list) else str(iv)
        elif ik.endswith("_flt"):
            try:
                identity_meta[ik] = float(iv)
            except (ValueError, TypeError):
                pass
        else:
            identity_meta[ik] = str(iv)

    # Build resume summary text, appended to each chunk's content to improve semantic retrieval recall
    summary_parts = []
    _en = _is_english(lang)
    if resume.get("name_kwd"):
        summary_parts.append(f"{'Name' if _en else '姓名'}:{resume['name_kwd']}")
    if resume.get("phone_kwd"):
        summary_parts.append(f"{'Phone' if _en else '电话'}:{resume['phone_kwd']}")
    if resume.get("corporation_name_tks"):
        corp = resume["corporation_name_tks"]
        summary_parts.append(f"{'Company' if _en else '公司'}:{corp if isinstance(corp, str) else ' '.join(corp)}")
    if resume.get("highest_degree_kwd"):
        summary_parts.append(f"{'Degree' if _en else '学历'}:{resume['highest_degree_kwd']}")
    if resume.get("work_exp_flt"):
        if _en:
            summary_parts.append(f"Experience:{resume['work_exp_flt']}yrs")
        else:
            summary_parts.append(f"经验:{resume['work_exp_flt']}年")
    resume_summary = " | ".join(summary_parts) if summary_parts else ""

    # List fields that need per-element splitting (each experience/project generates a separate chunk to avoid oversized merged chunks)
    _SPLIT_LIST_FIELDS = {"work_desc_tks", "project_desc_tks"}

    # Basic info field set: these fields should be merged into one chunk to avoid splitting name, phone, email, etc.
    _BASIC_INFO_FIELDS = {
        "name_kwd", "name_pinyin_kwd", "gender_kwd", "age_int",
        "phone_kwd", "email_tks", "birth_dt", "work_exp_flt",
        "position_name_tks", "expect_city_names_tks",
        "expect_position_name_tks",
    }

    # Education field set: degree, school, major, tags, etc. should be merged into one chunk
    _EDUCATION_FIELDS = {
        "first_school_name_tks", "first_degree_kwd", "highest_degree_kwd",
        "first_major_tks", "edu_first_fea_kwd", "degree_kwd", "major_tks",
        "school_name_tks", "sch_rank_kwd", "edu_fea_kwd", "edu_end_int",
    }

    # Skills & certificates field set: skills, languages, certificates are small, merge into one chunk
    _SKILL_CERT_FIELDS = {
        "skill_tks", "language_tks", "certificate_tks",
    }

    # Work overview field set: company list, industry, most recent company merged into one chunk
    _WORK_OVERVIEW_FIELDS = {
        "corporation_name_tks", "corp_nm_tks", "industry_name_tks",
    }

    # All merge groups: (field_set, group_title) tuple list
    _MERGE_GROUPS = [
        (_BASIC_INFO_FIELDS, "Basic Info" if _en else "基本信息"),
        (_EDUCATION_FIELDS, "Education" if _en else "教育背景"),
        (_SKILL_CERT_FIELDS, "Skills & Certificates" if _en else "技能与证书"),
        (_WORK_OVERVIEW_FIELDS, "Work Overview" if _en else "工作概况"),
    ]

    # Collect all fields that need merge processing; skip them during individual iteration
    _ALL_MERGED_FIELDS = set()
    for fields_set, _ in _MERGE_GROUPS:
        _ALL_MERGED_FIELDS.update(fields_set)

    # Merge fields by group, generating one chunk per group
    for fields_set, group_title in _MERGE_GROUPS:
        group_parts = []
        group_field_values = {}  # Store structured values for each field, to be written into chunk
        for field_key in field_map:
            if field_key not in fields_set:
                continue
            value = resume.get(field_key)
            if not value:
                continue
            field_desc = field_map[field_key]
            if isinstance(value, list):
                text_value = " ".join(str(v) for v in value if v)
            else:
                text_value = str(value)
            if not text_value.strip():
                continue
            group_parts.append(f"{field_desc}: {text_value}")
            group_field_values[field_key] = value

        if not group_parts:
            continue

        content = f"{group_title}\n" + "\n".join(group_parts)
        if resume_summary:
            content += f"\n[{resume_summary}]"
        chunk = {
            "content_with_weight": content,
            "content_ltks": rag_tokenizer.tokenize(content),
            "content_sm_ltks": rag_tokenizer.fine_grained_tokenize(
                rag_tokenizer.tokenize(content)
            ),
        }
        chunk.update(doc)
        # Redundantly write identity fields
        for mk, mv in identity_meta.items():
            chunk[mk] = mv
        # Write each field's structured value into chunk (for structured retrieval)
        for fk, fv in group_field_values.items():
            if fk.endswith("_tks"):
                text_val = " ".join(str(v) for v in fv) if isinstance(fv, list) else str(fv)
                chunk[fk] = rag_tokenizer.tokenize(text_val)
            elif fk.endswith("_kwd"):
                chunk[fk] = fv if isinstance(fv, list) else str(fv)
            elif fk.endswith("_int"):
                try:
                    chunk[fk] = int(fv)
                except (ValueError, TypeError):
                    pass
            elif fk.endswith("_flt"):
                try:
                    chunk[fk] = float(fv)
                except (ValueError, TypeError):
                    pass
            else:
                chunk[fk] = str(fv)
        chunks.append(chunk)

    # Iterate over field map, generating a chunk for each non-merged field with a value
    for field_key, field_desc in field_map.items():
        # Skip fields already processed in merge groups
        if field_key in _ALL_MERGED_FIELDS:
            continue
        value = resume.get(field_key)
        if not value:
            continue

        # For work/project descriptions (long text lists), split into multiple chunks per element
        if field_key in _SPLIT_LIST_FIELDS and isinstance(value, list):
            # Get company name list to add context to each work description
            corp_list = resume.get("corp_nm_tks", []) if field_key == "work_desc_tks" else []
            project_list = resume.get("project_tks", []) if field_key == "project_desc_tks" else []
            # Get detailed info for each work experience entry (time period, years)
            work_details = resume.get("_work_exp_details", []) if field_key == "work_desc_tks" else []

            for idx, item in enumerate(value):
                item_text = str(item).strip()
                if not item_text:
                    continue

                # Add company/project name prefix to each description for context
                if field_key == "work_desc_tks" and idx < len(work_details):
                    # Use detailed info to build prefix, including company, time range, years
                    detail = work_details[idx]
                    company = detail.get("company", "")
                    start_d = detail.get("start_date", "")
                    end_d = detail.get("end_date", "")
                    years = detail.get("years", 0)
                    # Build time range text
                    time_parts = []
                    if start_d:
                        time_range = f"{start_d}-{end_d}" if end_d else str(start_d)
                        time_parts.append(time_range)
                    if years > 0:
                        time_parts.append(f"{years}{'yrs' if _en else '年'}")
                    time_text = " ".join(time_parts)
                    if company and time_text:
                        content_prefix = f"{field_desc}（{company} {time_text}）"
                    elif company:
                        content_prefix = f"{field_desc}（{company}）"
                    else:
                        content_prefix = f"{field_desc}（{'#' if _en else '第'}{idx + 1}{'' if _en else '段'}）"
                elif field_key == "work_desc_tks" and idx < len(corp_list):
                    content_prefix = f"{field_desc}（{corp_list[idx]}）"
                elif field_key == "project_desc_tks" and idx < len(project_list):
                    content_prefix = f"{field_desc}（{project_list[idx]}）"
                else:
                    content_prefix = f"{field_desc}（{'#' if _en else '第'}{idx + 1}{'' if _en else '段'}）"

                if resume_summary:
                    content = f"{content_prefix}: {item_text}\n[{resume_summary}]"
                else:
                    content = f"{content_prefix}: {item_text}"

                chunk = {
                    "content_with_weight": content,
                    "content_ltks": rag_tokenizer.tokenize(content),
                    "content_sm_ltks": rag_tokenizer.fine_grained_tokenize(
                        rag_tokenizer.tokenize(content)
                    ),
                }
                chunk.update(doc)

                # Redundantly write identity fields
                for mk, mv in identity_meta.items():
                    if mk != field_key:
                        chunk[mk] = mv

                # Tokenization result for current segment
                chunk[field_key] = rag_tokenizer.tokenize(item_text)
                chunks.append(chunk)
            continue

        # Merge list values into text
        if isinstance(value, list):
            text_value = " ".join(str(v) for v in value if v)
        else:
            text_value = str(value)

        if not text_value.strip():
            continue

        # Build chunk content: "field_desc: field_value", append summary for semantic association
        if resume_summary and field_key not in ("name_kwd", "phone_kwd"):
            content = f"{field_desc}: {text_value}\n[{resume_summary}]"
        else:
            content = f"{field_desc}: {text_value}"

        chunk = {
            "content_with_weight": content,
            "content_ltks": rag_tokenizer.tokenize(content),
            "content_sm_ltks": rag_tokenizer.fine_grained_tokenize(
                rag_tokenizer.tokenize(content)
            ),
        }
        chunk.update(doc)

        # Redundantly write identity fields (do not overwrite the current field's own value)
        for mk, mv in identity_meta.items():
            if mk != field_key:
                chunk[mk] = mv

        # Write resume field value into the chunk's corresponding field (for structured retrieval)
        if field_key.endswith("_tks"):
            chunk[field_key] = rag_tokenizer.tokenize(text_value)
        elif field_key.endswith("_kwd"):
            if isinstance(value, list):
                chunk[field_key] = value
            else:
                chunk[field_key] = text_value
        elif field_key.endswith("_int"):
            try:
                chunk[field_key] = int(value)
            except (ValueError, TypeError):
                pass
        elif field_key.endswith("_flt"):
            try:
                chunk[field_key] = float(value)
            except (ValueError, TypeError):
                pass
        else:
            chunk[field_key] = text_value

        chunks.append(chunk)

    # If no chunks were generated, create at least one chunk containing the name
    if not chunks:
        name = resume.get("name_kwd", "Unknown" if _en else "未知")
        content = f"{'Name' if _en else '姓名'}: {name}"
        chunk = {
            "content_with_weight": content,
            "content_ltks": rag_tokenizer.tokenize(content),
            "content_sm_ltks": rag_tokenizer.fine_grained_tokenize(
                rag_tokenizer.tokenize(content)
            ),
        }
        chunk.update(doc)
        chunks.append(chunk)

    # Write coordinate info to each chunk (position_int, page_num_int, top_int)
    #
    # Resume chunks are split by semantic fields (basic info, education, work description, etc.),
    # not by PDF physical regions. Field values may be scattered across multiple locations in the PDF,
    # and using text matching to reverse-lookup coordinates would cause disordered sorting.
    #
    # Therefore, assign incrementing coordinates based on chunk generation order (i.e., semantic logical order),
    # ensuring display order: basic info -> education -> skills/certs -> work overview -> work desc -> project desc...
    #
    # add_positions input format: [(page, left, right, top, bottom), ...]
    #   - page starts from 0, function internally stores +1
    #   - task_executor sorts by page_num_int and top_int (page first, then Y coordinate)
    from rag.nlp import add_positions

    for i, ck in enumerate(chunks):
        # All chunks placed on page=0, top increments by index to ensure logical ordering
        add_positions(ck, [[0, 0, 0, i, i]])

    return chunks

def _blackout_text_regions(image: "np.ndarray", meta_blocks: list[dict], page_idx: int,
                           pdf_to_img_scale: float) -> "np.ndarray":
    """
    Black out metadata-extracted text regions on the page image to prevent OCR duplication.

    Ref: SmartResume blackout strategy — extract metadata text first, black out those regions,
    then run OCR on the blacked-out image so it only recognizes content metadata missed.
    More reliable than IoU-based deduplication.

    Every block whose "page" equals ``page_idx`` is scaled from PDF to image coordinates,
    grown by a small padding, clamped to the image and filled with zeros (both corner
    pixels inclusive). Blocks that are empty after clamping (i.e. lie entirely outside
    the image, or have inverted edges) are skipped instead of being painted as a strip
    along the image border. The input image itself is never modified.

    Args:
        image: Page image (numpy array, H x W or H x W x C)
        meta_blocks: Text blocks from metadata extraction (dicts with page/x0/top/x1/bottom)
        page_idx: Current page number
        pdf_to_img_scale: Scale factor from PDF coordinates to image coordinates
    Returns:
        Copy of the image with text regions blacked out
    """
    blacked = image.copy()
    height, width = blacked.shape[:2]
    padding = 2  # Extra pixels to ensure full coverage
    for b in meta_blocks:
        if b.get("page") != page_idx:
            continue
        # Scale to image space, pad, then clamp to the last valid pixel index
        x0 = max(0, int(b["x0"] * pdf_to_img_scale) - padding)
        y0 = max(0, int(b["top"] * pdf_to_img_scale) - padding)
        x1 = min(width - 1, int(b["x1"] * pdf_to_img_scale) + padding)
        y1 = min(height - 1, int(b["bottom"] * pdf_to_img_scale) + padding)
        if x1 < x0 or y1 < y0:
            # Nothing of this block is inside the image
            continue
        # Inclusive fill of rows y0..y1 and columns x0..x1 across all channels
        blacked[y0:y1 + 1, x0:x1 + 1] = 0
    return blacked


def chunk(filename, binary, tenant_id, from_page=0, to_page=100000,
          lang="Chinese", callback=None, **kwargs):
    """
    Resume parsing entry function (compatible with task_executor.py)

    Registered as FACTORY[ParserType.RESUME.value]. Runs the structured extraction
    via parse_resume(), turns the result into chunks via _build_chunk_document(),
    and reports progress through ``callback`` along the way. Any exception is logged,
    reported to the callback with progress -1, and converted into an empty result.

    Args:
        filename: File name
        binary: File binary content
        tenant_id: Tenant identifier, forwarded to parse_resume()
        from_page: Start page number (not used in resume parsing)
        to_page: End page number (not used in resume parsing)
        lang: Language, default "Chinese"
        callback: Progress callback function, accepts (progress, message) parameters;
            when None, progress reports are discarded
        **kwargs: Other parameters; accepted for signature compatibility and not read here
    Returns:
        Document chunk list, or an empty list when parsing failed
    """
    def _discard(prog, msg):
        return None

    report = _discard if callback is None else callback

    try:
        report(0.1, "Starting resume parsing...")

        # Structured extraction; the line list and line positions are not needed here
        resume, _lines, _line_positions = parse_resume(filename, binary, tenant_id, lang)
        report(0.6, "Resume structured extraction complete")

        # Turn the structured resume into document chunks (with coordinate info)
        chunks = _build_chunk_document(filename, resume, lang)
        report(0.9, f"Document chunk construction complete, {len(chunks)} chunks total")

        report(1.0, "Resume parsing complete")
    except Exception as e:
        logger.exception(f"Resume parsing exception: {filename}")
        report(-1, f"Resume parsing failed: {str(e)}")
        return []

    return chunks


def _resort_page_with_layout(page_blocks: list[dict], layout_regions: list[dict]) -> list[dict]:
    """
    Sort the text blocks of one page using detected layout regions.

    Steps:
        1. Without layout regions, simply sort blocks by (center y, center x).
        2. Give every region a key "<type>-<index within its type>".
        3. Assign each block that has no "layoutno" yet to the first region
           containing its center point (sets "layoutno" and "layout_type").
        4. Untag all blocks of a region when their summed area covers less than
           7.5% of the region area ("layoutno" / "layout_type" reset to "").
        5. Sort by (region center y, region center x, block center y, block center x).
           Blocks without a region borrow the center of the nearest region that
           still owns blocks; if no region owns any block they use their own center.

    The block dicts are modified in place (layout tags); temporary sort keys are
    removed again before returning.

    Args:
        page_blocks: Text blocks of a single page (dicts with x0/x1/top/bottom)
        layout_regions: Layout regions of that page (dicts with type/x0/x1/top/bottom)
    Returns:
        New list holding the same block dicts in sorted order
    """
    def _mid_x(box: dict) -> float:
        return (box.get("x0", 0) + box.get("x1", 0)) / 2

    def _mid_y(box: dict) -> float:
        return (box.get("top", 0) + box.get("bottom", 0)) / 2

    def _gap(value, low, high):
        # Distance from value to the interval [low, high]; 0 when inside
        if value < low:
            return low - value
        if value > high:
            return value - high
        return 0

    if not page_blocks:
        return []

    if not layout_regions:
        return sorted(page_blocks, key=lambda blk: (_mid_y(blk), _mid_x(blk)))

    # Number the regions per type, keeping first-seen type order
    regions_by_type: dict[str, list] = {}
    for region in layout_regions:
        regions_by_type.setdefault(region.get("type", ""), []).append(region)

    regions = []
    for kind, members in regions_by_type.items():
        for ordinal, region in enumerate(members):
            left, right = region.get("x0", 0), region.get("x1", 0)
            upper, lower = region.get("top", 0), region.get("bottom", 0)
            regions.append({
                "key": f"{kind}-{ordinal}", "type": kind,
                "x0": left, "top": upper, "x1": right, "bottom": lower,
                "cy": (upper + lower) / 2, "cx": (left + right) / 2,
            })

    # Tag untagged blocks with the first region that contains their center
    for blk in page_blocks:
        if blk.get("layoutno"):
            continue
        mx, my = _mid_x(blk), _mid_y(blk)
        container = next(
            (r for r in regions
             if r["x0"] <= mx <= r["x1"] and r["top"] <= my <= r["bottom"]),
            None,
        )
        if container is not None:
            blk["layoutno"] = container["key"]
            blk["layout_type"] = container["type"]

    # Drop the tags of regions that are almost empty of text
    for region in regions:
        region_area = (region["x1"] - region["x0"]) * (region["bottom"] - region["top"])
        if region_area <= 0:
            continue
        members = [blk for blk in page_blocks if blk.get("layoutno") == region["key"]]
        if not members:
            continue
        covered = sum(
            (blk.get("x1", 0) - blk.get("x0", 0)) * (blk.get("bottom", 0) - blk.get("top", 0))
            for blk in members
        )
        if covered / region_area < 0.075:
            for blk in members:
                blk["layoutno"] = ""
                blk["layout_type"] = ""

    # Temporary sort keys: own center plus the center of the owning region
    region_by_key = {r["key"]: r for r in regions}
    for blk in page_blocks:
        mx, my = _mid_x(blk), _mid_y(blk)
        blk["_x_center"] = mx
        blk["_y_center"] = my
        tag = blk.get("layoutno", "")
        owner = region_by_key[tag] if tag and tag in region_by_key else None
        if owner is None:
            blk["_lx_center"], blk["_ly_center"] = mx, my
        else:
            blk["_lx_center"], blk["_ly_center"] = owner["cx"], owner["cy"]

    used_keys = {blk.get("layoutno") for blk in page_blocks if blk.get("layoutno")}
    active = [r for r in regions if r["key"] in used_keys]

    # Region-less blocks borrow the center of the closest region that owns blocks
    if active:
        for blk in page_blocks:
            if blk.get("layoutno"):
                continue
            mx, my = blk["_x_center"], blk["_y_center"]
            nearest = float("inf")
            anchor = (mx, my)
            for r in active:
                dy = _gap(my, r["top"], r["bottom"])
                dx = _gap(mx, r["x0"], r["x1"])
                distance = (dx ** 2 + dy ** 2) ** 0.5
                if distance < nearest:
                    nearest = distance
                    anchor = (r["cx"], r["cy"])
            blk["_lx_center"], blk["_ly_center"] = anchor

    ordered = sorted(page_blocks, key=lambda blk: (
        blk["_ly_center"], blk["_lx_center"], blk["_y_center"], blk["_x_center"],
    ))

    # Remove the temporary sort keys again
    for blk in ordered:
        for scratch in ("_ly_center", "_lx_center", "_y_center", "_x_center"):
            blk.pop(scratch, None)

    return ordered


def _layout_detect_reorder(blocks: list[dict], binary: bytes) -> list[dict]:
    """
    Reorder text blocks into reading order with the help of the layout detector.

    Each page that has blocks is rendered at 3x (216 dpi) and handed to the layout
    recognizer together with slim copies of its blocks (x0/top/x1/bottom/text/page).
    The tagged blocks returned by the recognizer are then sorted page by page with
    _resort_page_with_layout().

    Falls back to _layout_aware_reorder(blocks) when the recognizer is unavailable,
    no page could be rendered, the recognizer returns no blocks, or any exception
    is raised along the way.

    Blocks whose page index lies beyond the PDF's page count cannot be rendered;
    they are kept (sorted by position) and appended after the detected pages
    instead of being dropped from the result.

    Args:
        blocks: Text blocks of the whole document (dicts with page/x0/top/x1/bottom/text)
        binary: PDF file content
    Returns:
        Blocks in reading order
    """
    if not blocks:
        return blocks

    recognizer = _get_layout_recognizer()
    if recognizer is None:
        logger.info("Layout detector unavailable, falling back to heuristic sorting")
        return _layout_aware_reorder(blocks)

    try:
        import pdfplumber

        pages_blocks: dict[int, list[dict]] = {}
        for b in blocks:
            pages_blocks.setdefault(b.get("page", 0), []).append(b)

        rendered_pages: list[int] = []  # pages sent to the detector, aligned with image_list
        skipped_pages: list[int] = []   # pages that do not exist in the PDF
        image_list = []
        ocr_res_per_page = []

        with pdfplumber.open(BytesIO(binary)) as pdf:
            page_count = len(pdf.pages)
            for pg in sorted(pages_blocks):
                if pg >= page_count:
                    skipped_pages.append(pg)
                    continue
                # Render at 3x so that it matches scale_factor=3 passed to the recognizer
                image_list.append(pdf.pages[pg].to_image(resolution=72 * 3).annotated)
                rendered_pages.append(pg)
                ocr_res_per_page.append([
                    {
                        "x0": float(b["x0"]),
                        "top": float(b["top"]),
                        "x1": float(b["x1"]),
                        "bottom": float(b["bottom"]),
                        "text": b["text"],
                        "page": pg,
                    }
                    for b in pages_blocks[pg]
                ])

        if not image_list:
            return _layout_aware_reorder(blocks)

        tagged_blocks, page_layouts = recognizer(
            image_list, ocr_res_per_page, scale_factor=3, thr=0.2, drop=False
        )

        if not tagged_blocks:
            logger.warning("Layout detector returned no blocks, falling back to heuristic sorting")
            return _layout_aware_reorder(blocks)

        tagged_per_page: dict[int, list[dict]] = {}
        for b in tagged_blocks:
            tagged_per_page.setdefault(b.get("page", 0), []).append(b)

        sorted_all = []
        total_layout_count = 0
        # page_layouts is indexed by position in image_list, i.e. by rendered page
        for pn, pg in enumerate(rendered_pages):
            lts = page_layouts[pn] if pn < len(page_layouts) else []
            total_layout_count += len(lts)
            sorted_all.extend(_resort_page_with_layout(tagged_per_page.get(pg, []), lts))

        if skipped_pages:
            # Keep the text of pages that could not be rendered (plain positional order)
            logger.warning(f"Pages {skipped_pages} are outside the PDF page range, "
                           f"keeping their blocks without layout detection")
            for pg in skipped_pages:
                sorted_all.extend(_resort_page_with_layout(pages_blocks[pg], []))

        for b in sorted_all:
            b.setdefault("page", 0)

        logger.info(f"YOLOv10 detector completed， {len(sorted_all)} total chunks，"
                    f"checked {total_layout_count} layout")
        return sorted_all

    except Exception as e:
        # Best effort: any failure in rendering/detection degrades to heuristic sorting
        logger.warning(f"Layout detector unavailable, falling back to heuristic sorting: {e}")
        return _layout_aware_reorder(blocks)


def _text_shingles(text: str, n: int = 5) -> set[tuple[int, ...]]:
    """
    Generate text fingerprint set using tiktoken BPE tokenization + n-gram shingling.

    Compared to character-level splitting, BPE tokens have better granularity,
    and n-grams preserve word order, providing more accurate overlap measurement.

    Special-token markers that happen to appear in the text (e.g. "<|endoftext|>")
    are tokenized as ordinary text; with tiktoken's default settings they would
    make encode() raise ValueError.

    Args:
        text: Original text
        n: Shingling window size, default 5
    Returns:
        Set of n-gram shingles (each shingle is a tuple of token ids);
        empty when the text is empty or tiktoken is not available
    """
    if not text or _tiktoken_encoding is None:
        return set()
    # disallowed_special=() turns off the special-token check for arbitrary document text
    tokens = _tiktoken_encoding.encode(text, disallowed_special=())
    if len(tokens) < n:
        # Text too short: return the entire token sequence as a single shingle
        return {tuple(tokens)} if tokens else set()
    return {tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)}


def _shingling_jaccard(text1: str, text2: str, n: int = 5) -> float:
    """
    Compute Jaccard similarity between two texts using tiktoken shingling.

    When neither text yields any shingle (both texts empty, or tiktoken is not
    installed so no shingles can be produced), the result is 1.0 only if the raw
    texts are equal after stripping whitespace, otherwise 0.0. This keeps
    unrelated texts from being reported as identical when the tokenizer is missing.

    Args:
        text1: First text
        text2: Second text
        n: Shingling window size
    Returns:
        Jaccard similarity [0.0, 1.0]
    """
    s1 = _text_shingles(text1, n=n)
    s2 = _text_shingles(text2, n=n)
    union = s1 | s2
    if not union:
        # No fingerprints on either side: fall back to a plain equality check
        same = (text1 or "").strip() == (text2 or "").strip()
        return 1.0 if same else 0.0
    return len(s1 & s2) / len(union)