Files
ragflow/rag/app/resume.py
Yao Wei c99b53064d fix: remove company info from resume_summary to prevent over-retrieval (#13358)
### What problem does this PR solve?

Problem: When searching for a specific company name such as "Daofeng
Technology", the search would incorrectly return unrelated resumes whose
company names merely contain a generic term such as "Technology".

Root Cause: The `corporation_name_tks` field was included in the
identity fields that are redundantly written to every chunk. This caused
common words like "科技" to match across all chunks, leading to
over-retrieval of irrelevant resumes.

Solution: Remove `corporation_name_tks` from the `_IDENTITY_FIELDS`
list. Company information is still preserved in the "Work Overview"
chunk where it belongs, allowing proper company-based searches while
preventing false positives from generic terms.

---------

Co-authored-by: Aron.Yao <yaowei@192.168.1.68>
Co-authored-by: Aron.Yao <yaowei@yaoweideMacBook-Pro.local>
Co-authored-by: Liu An <asiro@qq.com>
2026-03-04 19:24:49 +08:00

2741 lines
112 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
Resume parsing module (aligned with SmartResume Pipeline architecture optimization)
Key optimizations (ref: arXiv:2510.09722):
1. PDF text fusion: metadata + OCR dual-path extraction and fusion
2. Layout-aware reconstruction: YOLOv10 layout segmentation + hierarchical sorting + line indexing
3. Parallel task decomposition: basic info / work experience / education - 3-way parallel LLM extraction
4. Index pointer mechanism: LLM returns line number ranges instead of generating full text, reducing hallucination
5. Four-stage post-processing: source text re-extraction, domain normalization, context deduplication, source text validation
Compatibility:
- chunk(filename, binary, callback, **kwargs) signature remains unchanged
- Compatible with FACTORY[ParserType.RESUME.value] in task_executor.py
"""
import json
import re
import random
import datetime
import unicodedata
import concurrent.futures
from io import BytesIO
from typing import Optional
import numpy as np
# tiktoken for long random string filtering (ref: SmartResume should_remove strategy)
try:
import tiktoken
_tiktoken_encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
except ImportError:
_tiktoken_encoding = None
# Long random string pattern: 40+ char alphanumeric mixed strings (hash, token, tracking ID, etc.)
_LONG_RANDOM_PATTERN = re.compile(r'[a-zA-Z0-9\-~_]{40,}')
import logging as logger
from rag.nlp import rag_tokenizer
from deepdoc.parser.utils import get_text
# json_repair for fixing malformed JSON from LLM responses (ref: SmartResume fault-tolerance strategy)
try:
import json_repair
except ImportError:
json_repair = None
# YOLOv10 layout detector (lazy initialization to avoid loading model when unused)
_layout_recognizer = None
def _get_layout_recognizer():
"""
Get YOLOv10 layout detector singleton (lazy loading)
Uses the existing deepdoc LayoutRecognizer based on layout.onnx model.
Returns:
LayoutRecognizer instance, or None if loading fails
"""
global _layout_recognizer
if _layout_recognizer is None:
try:
from deepdoc.vision import LayoutRecognizer
_layout_recognizer = LayoutRecognizer("layout")
logger.info("YOLOv10 layout detector loaded successfully")
except Exception as e:
logger.warning(f"YOLOv10 layout detector loading failed, falling back to heuristic sorting: {e}")
_layout_recognizer = False # Mark as failed to avoid repeated attempts
return _layout_recognizer if _layout_recognizer is not False else None
# ==================== Constants ====================
# Fields forbidden from being used as select fields in resume.
# Every name listed here is also a key of FIELD_MAP_ZH / FIELD_MAP_EN.
FORBIDDEN_SELECT_FIELDS = [
    "name_pinyin_kwd", "edu_first_fea_kwd", "degree_kwd",
    "sch_rank_kwd", "edu_fea_kwd"
]
# Field name to description mapping (bilingual versions for chunk construction)
# Chinese variant: field key -> Chinese description. It has the same keys as FIELD_MAP_EN
# and is what get_field_map() returns for any language parameter that is not English.
# NOTE(review): the key suffixes (_kwd, _tks, _int, _flt, _dt) presumably encode the index
# field type - confirm against the document store schema.
FIELD_MAP_ZH = {
    "name_kwd": "姓名/名字",
    "name_pinyin_kwd": "姓名拼音/名字拼音",
    "gender_kwd": "性别(男,女)",
    "age_int": "年龄/岁/年纪",
    "phone_kwd": "电话/手机/微信",
    "email_tks": "email/e-mail/邮箱",
    "position_name_tks": "职位/职能/岗位/职责",
    "expect_city_names_tks": "期望城市",
    "work_exp_flt": "工作年限/工作年份/N年经验/毕业了多少年",
    "corporation_name_tks": "最近就职(上班)的公司/上一家公司",
    "first_school_name_tks": "第一学历毕业学校",
    "first_degree_kwd": "第一学历",
    "highest_degree_kwd": "最高学历",
    "first_major_tks": "第一学历专业",
    "edu_first_fea_kwd": "第一学历标签",
    "degree_kwd": "过往学历",
    "major_tks": "学过的专业/过往专业",
    "school_name_tks": "学校/毕业院校",
    "sch_rank_kwd": "学校标签",
    "edu_fea_kwd": "教育标签",
    "corp_nm_tks": "就职过的公司/之前的公司/上过班的公司",
    "edu_end_int": "毕业年份",
    "industry_name_tks": "所在行业",
    "birth_dt": "生日/出生年份",
    "expect_position_name_tks": "期望职位/期望职能/期望岗位",
    "skill_tks": "技能/技术栈/编程语言/框架/工具",
    "language_tks": "语言能力/外语水平",
    "certificate_tks": "证书/资质/认证",
    "project_tks": "项目经验/项目名称",
    "work_desc_tks": "工作职责/工作描述",
    "project_desc_tks": "项目描述/项目职责",
    "self_evaluation_tks": "自我评价/个人优势/个人总结",
}
# English variant of the field description mapping: field key -> English description.
# It has the same keys as FIELD_MAP_ZH and is what get_field_map() returns when the
# language parameter is "english" / "en" (case-insensitive).
FIELD_MAP_EN = {
    "name_kwd": "Name",
    "name_pinyin_kwd": "Name Pinyin",
    "gender_kwd": "Gender (Male, Female)",
    "age_int": "Age",
    "phone_kwd": "Phone/Mobile/WeChat",
    "email_tks": "Email",
    "position_name_tks": "Position/Title/Role",
    "expect_city_names_tks": "Preferred City",
    "work_exp_flt": "Years of Experience",
    "corporation_name_tks": "Most Recent Company",
    "first_school_name_tks": "First Degree School",
    "first_degree_kwd": "First Degree",
    "highest_degree_kwd": "Highest Degree",
    "first_major_tks": "First Degree Major",
    "edu_first_fea_kwd": "First Degree Tag",
    "degree_kwd": "Past Degrees",
    "major_tks": "Past Majors",
    "school_name_tks": "School/University",
    "sch_rank_kwd": "School Tag",
    "edu_fea_kwd": "Education Tag",
    "corp_nm_tks": "Past Companies",
    "edu_end_int": "Graduation Year",
    "industry_name_tks": "Industry",
    "birth_dt": "Date of Birth",
    "expect_position_name_tks": "Preferred Position/Role",
    "skill_tks": "Skills/Tech Stack/Languages/Frameworks/Tools",
    "language_tks": "Language Proficiency",
    "certificate_tks": "Certificates/Qualifications",
    "project_tks": "Project Experience/Project Name",
    "work_desc_tks": "Job Responsibilities/Description",
    "project_desc_tks": "Project Description/Responsibilities",
    "self_evaluation_tks": "Self-Evaluation/Personal Strengths/Summary",
}
def _is_english(lang: str) -> bool:
"""Determine if the language parameter indicates English"""
return lang.lower() in ("english", "en")
def get_field_map(lang: str) -> dict:
    """
    Select the field description mapping for the requested language
    Args:
        lang: Language parameter, e.g. "Chinese" or "English"
    Returns:
        FIELD_MAP_EN when lang denotes English, otherwise FIELD_MAP_ZH
    """
    if _is_english(lang):
        return FIELD_MAP_EN
    return FIELD_MAP_ZH
# Backward compatible: default to Chinese version.
# FIELD_MAP is an alias of (the same dict object as) FIELD_MAP_ZH.
FIELD_MAP = FIELD_MAP_ZH
# ==================== Parallel LLM Extraction Prompt Templates ====================
# Ref: SmartResume task decomposition strategy, splitting extraction into independent subtasks
# Each prompt ends with /no_think marker to suppress reasoning model's thinking output
# Prompts loaded from md files under rag/prompts/, supporting bilingual versions
from rag.prompts.template import load_prompt
def _load_resume_prompt(name: str, lang: str) -> str:
    """
    Load the resume prompt template that matches the language parameter
    The English template is looked up under the name with an "_en" suffix;
    every other language uses the bare name.
    Args:
        name: Prompt name (without language suffix), e.g. "resume_system"
        lang: Language parameter, e.g. "Chinese" or "English"
    Returns:
        Prompt template string
    """
    prompt_name = f"{name}_en" if _is_english(lang) else name
    return load_prompt(prompt_name)
def get_system_prompt(lang: str) -> str:
    """
    Get the system prompt for the given language
    Args:
        lang: Language parameter, e.g. "Chinese" or "English"
    Returns:
        The "resume_system" prompt template in the matching language
    """
    return _load_resume_prompt(name="resume_system", lang=lang)
def get_basic_info_prompt(lang: str) -> str:
    """
    Get the basic info extraction prompt for the given language
    Args:
        lang: Language parameter, e.g. "Chinese" or "English"
    Returns:
        The "resume_basic_info" prompt template in the matching language
    """
    return _load_resume_prompt(name="resume_basic_info", lang=lang)
def get_work_exp_prompt(lang: str) -> str:
    """
    Get the work experience extraction prompt for the given language
    Args:
        lang: Language parameter, e.g. "Chinese" or "English"
    Returns:
        The "resume_work_exp" prompt template in the matching language
    """
    return _load_resume_prompt(name="resume_work_exp", lang=lang)
def get_education_prompt(lang: str) -> str:
    """
    Get the education background extraction prompt for the given language
    Args:
        lang: Language parameter, e.g. "Chinese" or "English"
    Returns:
        The "resume_education" prompt template in the matching language
    """
    return _load_resume_prompt(name="resume_education", lang=lang)
def get_project_exp_prompt(lang: str) -> str:
    """
    Get the project experience extraction prompt for the given language
    Args:
        lang: Language parameter, e.g. "Chinese" or "English"
    Returns:
        The "resume_project_exp" prompt template in the matching language
    """
    return _load_resume_prompt(name="resume_project_exp", lang=lang)
# Backward compatible: default Chinese version constants (for possible external direct references).
# These are loaded once at import time under the bare prompt names (no "_en" suffix);
# language-aware code should use the get_*_prompt(lang) functions instead.
SYSTEM_PROMPT = load_prompt("resume_system")
BASIC_INFO_PROMPT = load_prompt("resume_basic_info")
WORK_EXP_PROMPT = load_prompt("resume_work_exp")
EDUCATION_PROMPT = load_prompt("resume_education")
PROJECT_EXP_PROMPT = load_prompt("resume_project_exp")
# LLM call max retry count (ref: SmartResume retry strategy)
# NOTE(review): whether this counts total attempts or extra attempts after the first
# depends on the retry loop, which is outside this section - confirm there.
_LLM_MAX_RETRIES = 2
def _normalize_whitespace(text: str) -> str:
"""
Unicode whitespace normalization (ref: SmartResume _clean_text_content)
Replaces various Unicode spaces (\u00A0 non-breaking space, \u3000 fullwidth space,
\u2000-\u200A various width spaces, etc.) with regular spaces,
then applies NFKC normalization (fullwidth to halfwidth) and merges consecutive spaces.
Args:
text: Original text
Returns:
Normalized text
"""
if not text:
return ""
# NFKC normalization (fullwidth to halfwidth, etc.)
text = unicodedata.normalize('NFKC', text)
# Unify various Unicode spaces to regular space
text = re.sub(
r'[\u0020\u00A0\u1680\u2000-\u200A\u2028\u2029\u202F\u205F\u3000\u00A7]',
' ', text
)
# Merge consecutive spaces
text = re.sub(r' {2,}', ' ', text)
return text.strip()
def _should_remove_random_str(match: re.Match) -> bool:
    """
    Decide whether a matched long string is meaningless random text (ref: SmartResume should_remove)
    With tiktoken available: the string is removed when its token count exceeds 50% of its
    character count (ordinary English words encode into far fewer tokens than characters).
    Without tiktoken: the string is removed when more than 30% of its adjacent character
    pairs switch between digit/non-digit or between upper/lower case letters.
    Args:
        match: Regex match object
    Returns:
        True means it should be removed
    """
    candidate = match.group(0)
    if _tiktoken_encoding is not None:
        token_count = len(_tiktoken_encoding.encode(candidate))
        return token_count > len(candidate) * 0.5
    # Heuristic fallback: count digit and letter-case transitions between neighbours
    transitions = 0
    for prev, cur in zip(candidate, candidate[1:]):
        digit_flip = cur.isdigit() != prev.isdigit()
        case_flip = cur.isalpha() and prev.isalpha() and cur.isupper() != prev.isupper()
        if digit_flip or case_flip:
            transitions += 1
    return transitions / len(candidate) > 0.3
def _clean_line_content(text: str) -> str:
"""
Clean single line text content (Unicode normalization + long random string filtering)
Args:
text: Original line text
Returns:
Cleaned text
"""
if not text:
return ""
# Unicode whitespace normalization
text = _normalize_whitespace(text)
# Filter long random strings (hash, token and other meaningless content)
text = _LONG_RANDOM_PATTERN.sub(
lambda m: '' if _should_remove_random_str(m) else m.group(0),
text
)
# Clean up extra spaces after filtering
text = re.sub(r' {2,}', ' ', text).strip()
return text
# ==================== Phase 1: PDF Text Fusion and Layout Reconstruction ====================
def _is_noise_char(obj: dict) -> bool:
"""
Determine if a PDF character object is a decorative layer noise character
Uses a "body text whitelist" strategy instead of enumerating noise features,
to handle noise patterns from different resume templates:
Two reliable features of body text characters (either one means body text):
1. Embedded font: Font name format is XXXXXX+FontName (contains '+'),
indicating the font is embedded in the PDF, chosen by the document author
2. Structure tag: Has PDF Tagged Structure tags (e.g., Span, P, NonStruct, etc.),
indicating the character belongs to the document's semantic structure tree
Common features of noise characters:
- Uses system fonts (e.g., Helvetica, Arial), font name doesn't contain '+'
- No structure tags (tag is None or non-semantic tags like 'OC')
- Common in resume template background decorations, watermarks, tracking marks
Args:
obj: pdfplumber character/text object dictionary
Returns:
True means it's a noise character that should be filtered
"""
# Whitelist condition 1: Embedded font (font name contains '+' prefix)
fontname = obj.get("fontname", "")
if "+" in fontname:
return False # Embedded font = body content
# Whitelist condition 2: Has PDF structure tag
tag = obj.get("tag")
if tag in ("Span", "NonStruct", "P", "H1", "H2", "H3", "H4", "H5", "H6",
"TD", "TH", "LI", "L", "Table", "TR", "Figure", "Caption"):
return False # Has semantic structure tag = body content
# Doesn't meet any whitelist condition, treat as noise
return True
def _extract_metadata_text(binary: bytes) -> list[dict]:
    """
    Extract text blocks from PDF metadata (with coordinate info)
    Strategy (per page):
    1. Use whitelist strategy to filter decorative layer noise chars (embedded font or structure tag = body text)
    2. Safe fallback: if the filter keeps less than 30% of the original chars (or raises), use the unfiltered page
    3. Use extract_words for word-level extraction (with real coordinates)
    4. Aggregate consecutive words into line-level text blocks by Y coordinate
    5. If no words were extracted, fall back to extract_text on the unfiltered page; these lines get
       synthetic coordinates (16 units per line, full page width)
    6. Additionally extract table content (many resumes use table layouts); rows not already covered
       by a text block are appended with synthetic coordinates below the page's existing blocks
    Args:
        binary: PDF file binary content
    Returns:
        List of text blocks, each containing text, x0, top, x1, bottom, page fields.
        Returns [] if opening or processing the PDF raises (the error is logged as a warning).
    """
    try:
        import pdfplumber
        blocks = []
        with pdfplumber.open(BytesIO(binary)) as pdf:
            for page_idx, page in enumerate(pdf.pages):
                # 600 is only a stand-in width used for synthetic block coordinates
                page_width = page.width or 600
                # Filter decorative layer noise chars (whitelist strategy based on embedded font + structure tag)
                # Safe fallback: if filtered chars are less than 30% of original, the PDF's body text
                # may use non-embedded fonts without structure tags, skip filtering to avoid false positives
                try:
                    original_char_count = len(page.chars)
                    filtered_page = page.filter(
                        lambda obj: not _is_noise_char(obj)
                    )
                    filtered_char_count = len(filtered_page.chars)
                    if original_char_count > 0 and filtered_char_count < original_char_count * 0.3:
                        # Filtered out over 70% of chars, likely false positives, fall back to original page
                        filtered_page = page
                except Exception:
                    # Filtering is best-effort: any failure means "use the page as-is"
                    filtered_page = page
                # Use extract_words for extraction (with real coordinates)
                words = []
                try:
                    words = filtered_page.extract_words(
                        keep_blank_chars=False, use_text_flow=True
                    )
                except Exception:
                    # Leave words empty so the extract_text fallback below takes over
                    pass
                if words:
                    # Aggregate adjacent words into line-level text blocks by Y coordinate
                    # Words on the same line: top coordinate difference within threshold
                    line_threshold = 5  # Y coordinate difference threshold (unit: PDF points)
                    current_line_words = [words[0]]
                    def _flush_line(line_words):
                        """Merge the words of one line into a single text block (None if the text is blank)"""
                        # Sort by x0 to ensure left-to-right order
                        line_words.sort(key=lambda w: float(w.get("x0", 0)))
                        texts = []
                        for w in line_words:
                            texts.append(w.get("text", ""))
                        merged_text = " ".join(texts)
                        if not merged_text.strip():
                            return None
                        # Bounding box = union of all word boxes on the line
                        return {
                            "text": merged_text.strip(),
                            "x0": float(min(w.get("x0", 0) for w in line_words)),
                            "top": float(min(w.get("top", 0) for w in line_words)),
                            "x1": float(max(w.get("x1", 0) for w in line_words)),
                            "bottom": float(max(w.get("bottom", 0) for w in line_words)),
                            "page": page_idx,
                        }
                    for w in words[1:]:
                        w_top = float(w.get("top", 0))
                        # Compare against the first word of the current line, not the previous word
                        cur_top = float(current_line_words[0].get("top", 0))
                        if abs(w_top - cur_top) <= line_threshold:
                            current_line_words.append(w)
                        else:
                            block = _flush_line(current_line_words)
                            if block:
                                blocks.append(block)
                            current_line_words = [w]
                    # Process the last line
                    if current_line_words:
                        block = _flush_line(current_line_words)
                        if block:
                            blocks.append(block)
                else:
                    # Fall back to extract_text when extract_words fails or returns nothing
                    # (note: this reads the unfiltered page)
                    page_text = None
                    try:
                        page_text = page.extract_text()
                    except Exception:
                        pass
                    if page_text and page_text.strip():
                        raw_lines = page_text.split("\n")
                        # extract_text gives no coordinates: synthesize them from the line index
                        line_height = 16
                        for i, line in enumerate(raw_lines):
                            cleaned = line.strip()
                            if not cleaned:
                                continue
                            blocks.append({
                                "text": cleaned,
                                "x0": 0,
                                "top": i * line_height,
                                "x1": page_width,
                                "bottom": i * line_height + line_height - 2,
                                "page": page_idx,
                            })
                # Extract table content from the page
                # Many resumes use table layouts (e.g., personal info section), extract_words may miss table structure
                try:
                    tables = page.extract_tables()
                    if tables:
                        # Snapshot of this page's blocks taken before any table row is appended;
                        # it is used both for placement and for duplicate detection
                        page_blocks = [b for b in blocks if b["page"] == page_idx]
                        # Table rows are placed 20 units below the lowest existing block top
                        max_top = max((b["top"] for b in page_blocks), default=0) + 20
                        row_height = 16
                        for table in tables:
                            for row in table:
                                if not row:
                                    continue
                                cells = [str(c).strip() for c in row if c and str(c).strip()]
                                if not cells:
                                    continue
                                row_text = " | ".join(cells)
                                # Dedup: check if table content was already extracted by extract_words
                                # (a row counts as duplicate when its first two cells both occur in one block)
                                is_dup = False
                                for pb in page_blocks:
                                    if all(c in pb["text"] for c in cells[:2]):
                                        is_dup = True
                                        break
                                if is_dup:
                                    continue
                                blocks.append({
                                    "text": row_text,
                                    "x0": 0,
                                    "top": max_top,
                                    "x1": page_width,
                                    "bottom": max_top + row_height - 2,
                                    "page": page_idx,
                                })
                                max_top += row_height
                except Exception as e:
                    # Table extraction is optional; keep the blocks collected so far
                    logger.debug(f"PDF table extraction skipped (page {page_idx}): {e}")
        return blocks
    except Exception as e:
        logger.warning(f"PDF metadata extraction failed: {e}")
        return []
def _extract_ocr_text(binary: bytes, meta_blocks: list[dict] | None = None) -> list[dict]:
    """
    Extract OCR text blocks using blackout strategy (with coordinate info).
    Strategy (ref: SmartResume):
    1. Render PDF pages to images
    2. Black out regions already extracted by metadata
    3. Run OCR on the blacked-out image, only recognizing content metadata missed
    4. Convert the OCR box coordinates from image pixels back to PDF points
    Args:
        binary: PDF file binary content
        meta_blocks: Text blocks from metadata extraction (PDF-point coordinates), used to black out
            existing text regions; None is treated as an empty list
    Returns:
        List of text blocks, each containing text, x0, top, x1, bottom, page fields.
        Coordinates are in PDF points, i.e. the same coordinate system as the metadata
        blocks these results are later merged and sorted with.
        Returns [] if anything fails (the error is logged as a warning).
    """
    if meta_blocks is None:
        meta_blocks = []
    try:
        import pdfplumber
        from deepdoc.vision.ocr import OCR
        import numpy as np
        ocr = OCR()
        blocks = []
        # Pages are rendered at 216 DPI while PDF coordinates use 72 units per inch,
        # so image coordinates are 3x the PDF coordinates
        render_resolution = 216
        pdf_to_img_scale = render_resolution / 72.0  # = 3.0
        with pdfplumber.open(BytesIO(binary)) as pdf:
            for page_idx, page in enumerate(pdf.pages):
                img = page.to_image(resolution=render_resolution)
                page_img = np.array(img.annotated)
                # Black out metadata-extracted text regions before OCR
                page_meta_blocks = [b for b in meta_blocks if b.get("page") == page_idx]
                if page_meta_blocks:
                    page_img = _blackout_text_regions(page_img, meta_blocks, page_idx, pdf_to_img_scale)
                ocr_result = ocr(page_img)
                if not ocr_result:
                    continue
                for box_info in ocr_result:
                    # Expected item shape: (corner points, text or (text, ...))
                    if not (isinstance(box_info, (list, tuple)) and len(box_info) >= 2):
                        continue
                    coords = box_info[0]  # Coordinate points
                    text_info = box_info[1]
                    text = text_info[0] if isinstance(text_info, (list, tuple)) else str(text_info)
                    if not (text.strip() and isinstance(coords, (list, tuple)) and len(coords) >= 4):
                        continue
                    # Extract bounding box from four corner points
                    xs = [p[0] for p in coords if isinstance(p, (list, tuple))]
                    ys = [p[1] for p in coords if isinstance(p, (list, tuple))]
                    if not (xs and ys):
                        continue
                    # OCR ran on the rendered image: scale pixels back to PDF points so these
                    # blocks can be ordered together with metadata blocks on the same page
                    blocks.append({
                        "text": text.strip(),
                        "x0": min(xs) / pdf_to_img_scale,
                        "top": min(ys) / pdf_to_img_scale,
                        "x1": max(xs) / pdf_to_img_scale,
                        "bottom": max(ys) / pdf_to_img_scale,
                        "page": page_idx,
                    })
        return blocks
    except Exception as e:
        logger.warning(f"OCR extraction failed: {e}")
        return []
def _fuse_text_blocks(meta_blocks: list[dict], ocr_blocks: list[dict]) -> list[dict]:
"""
Fuse PDF metadata text and OCR text (blackout strategy version).
Since the OCR phase already blacks out metadata-extracted regions, OCR only recognizes
content that metadata missed. Therefore this function only needs to:
1. Filter out garbled blocks from metadata
2. Directly merge valid metadata blocks and OCR blocks (no IoU dedup needed)
Args:
meta_blocks: Text blocks from metadata extraction
ocr_blocks: Text blocks from OCR extraction (already deduplicated via blackout strategy)
Returns:
Fused text block list
"""
if not ocr_blocks:
return meta_blocks
if not meta_blocks:
return ocr_blocks
# Filter out garbled blocks from metadata
valid_meta = []
garbled_count = 0
for b in meta_blocks:
if _is_valid_line(b.get("text", "")):
valid_meta.append(b)
else:
garbled_count += 1
if garbled_count:
logger.info(f"Detected {garbled_count} garbled blocks in metadata, filtered out")
# Under blackout strategy, OCR won't re-recognize existing text, just merge directly
fused = valid_meta + ocr_blocks
return fused
def _layout_aware_reorder(blocks: list[dict]) -> list[dict]:
"""
Layout-aware hierarchical sorting (ref: SmartResume Hierarchical Re-ordering)
Two-level sorting strategy:
1. Inter-segment sorting: first by page number, then by Y coordinate (top to bottom), same row by X coordinate (left to right)
2. Intra-segment sorting: within each logical segment, sort by reading order
For multi-column resumes, detect column positions by clustering X coordinates,
then sort by column order.
Args:
blocks: Text block list (with coordinate info)
Returns:
Sorted text block list
"""
if not blocks:
return blocks
# Group by page
pages = {}
for b in blocks:
pg = b.get("page", 0)
pages.setdefault(pg, []).append(b)
sorted_blocks = []
for pg in sorted(pages.keys()):
page_blocks = pages[pg]
# Detect multi-column layout: by X coordinate median
if len(page_blocks) > 5:
x_centers = [(b["x0"] + b["x1"]) / 2 for b in page_blocks]
x_min, x_max = min(x_centers), max(x_centers)
page_width = x_max - x_min if x_max > x_min else 1
# Simple two-column detection: if text blocks are clearly distributed on left and right sides
mid_x = (x_min + x_max) / 2
left_count = sum(1 for x in x_centers if x < mid_x - page_width * 0.1)
right_count = sum(1 for x in x_centers if x > mid_x + page_width * 0.1)
if left_count > 3 and right_count > 3:
# Multi-column layout: left column first then right column, each column top to bottom
left_blocks = [b for b in page_blocks if (b["x0"] + b["x1"]) / 2 < mid_x]
right_blocks = [b for b in page_blocks if (b["x0"] + b["x1"]) / 2 >= mid_x]
left_blocks.sort(key=lambda b: (b["top"], b["x0"]))
right_blocks.sort(key=lambda b: (b["top"], b["x0"]))
sorted_blocks.extend(left_blocks)
sorted_blocks.extend(right_blocks)
continue
# Single-column layout: top to bottom, same row left to right
page_blocks.sort(key=lambda b: (b["top"], b["x0"]))
sorted_blocks.extend(page_blocks)
return sorted_blocks
def _build_indexed_text(blocks: list[dict]) -> tuple[str, list[str], list[dict]]:
"""
Build indexed text with line numbers (ref: SmartResume Indexed Linearization)
Merges sorted text blocks into lines and adds a unique index number to each line.
Includes garbled line filtering logic and field label split repair.
Also preserves coordinate info for each line, used for writing position_int etc. to chunks.
Args:
blocks: Sorted text block list
Returns:
(indexed_text, lines, line_positions) tuple:
- indexed_text: Text string with line numbers
- lines: Original line text list (without line numbers)
- line_positions: Coordinate info for each line, format:
"""
if not blocks:
return "", [], []
raw_lines = []
raw_positions = []
current_line_parts = []
current_line_blocks = []
current_top = blocks[0].get("top", 0)
current_layoutno = blocks[0].get("layoutno", "")
threshold = 10
def _merge_line_position(line_blocks: list[dict]) -> dict:
"""Merge coordinates of all blocks in a line into outer bounding rectangle"""
return {
"page": line_blocks[0].get("page", 0),
"x0": min(b.get("x0", 0) for b in line_blocks),
"x1": max(b.get("x1", 0) for b in line_blocks),
"top": min(b.get("top", 0) for b in line_blocks),
"bottom": max(b.get("bottom", 0) for b in line_blocks),
}
for b in blocks:
b_layoutno = b.get("layoutno", "")
y_changed = abs(b.get("top", 0) - current_top) > threshold
layout_changed = b_layoutno != current_layoutno and current_layoutno and b_layoutno
if (y_changed or layout_changed) and current_line_parts:
raw_lines.append(" ".join(current_line_parts))
raw_positions.append(_merge_line_position(current_line_blocks))
current_line_parts = []
current_line_blocks = []
current_top = b.get("top", 0)
current_layoutno = b_layoutno
current_line_parts.append(b["text"])
current_line_blocks.append(b)
if current_line_parts:
raw_lines.append(" ".join(current_line_parts))
raw_positions.append(_merge_line_position(current_line_blocks))
# Filter empty and garbled lines (sync filter coordinates)
lines = []
line_positions = []
for line, pos in zip(raw_lines, raw_positions):
# Unicode normalization + long random string filtering (ref: SmartResume _clean_text_content)
line = _clean_line_content(line)
if not line:
continue
# Garbled detection: skip if valid chars (Chinese/ASCII letters/digits/common punctuation) ratio is too low
if not _is_valid_line(line):
continue
lines.append(line)
line_positions.append(pos)
# Fix field label split issues
# Coordinates are not affected, keep original positions
lines = _fix_split_labels(lines)
# Build indexed text with line numbers
indexed_parts = [f"[{i}]: {line}" for i, line in enumerate(lines)]
indexed_text = "\n".join(indexed_parts)
return indexed_text, lines, line_positions
def _is_valid_line(line: str) -> bool:
"""
Check if a text line is valid content (not garbled)
Multi-dimensional detection:
1. Valid character ratio (Chinese, ASCII alphanumeric, common punctuation)
2. Single-character spacing anomaly detection (PDF custom font mapping causing "O U W Z_W V 2" pattern)
3. Consecutive meaningless alphanumeric sequence detection
Args:
line: Text line to check
Returns:
True means valid line, False means garbled line
"""
if len(line) <= 3:
# Short lines may be valid content like names, keep them
return True
cid_count = len(re.findall(r'\(cid:\d+\)', line))
if cid_count >= 3:
return False
# Valid characters: Chinese (incl. extension), ASCII alphanumeric, common punctuation and spaces, fullwidth chars, CJK punctuation
valid_chars = re.findall(
r'[\u4e00-\u9fff\u3400-\u4dbf\uf900-\ufaff'
r'a-zA-Z0-9\s@.,:;!?()()【】\-_/\\|·•'
r'、,。:;!?\u201c\u201d\u2018\u2019《》'
r'\uff01-\uff5e'
r'\u3000-\u303f'
r'#%&+=~`\u00b7\u2022\u2013\u2014'
r']',
line
)
ratio = len(valid_chars) / len(line) if len(line) > 0 else 0
if ratio < 0.5:
return False
# Detect PDF custom font mapping causing single-character spacing anomaly pattern
# Feature: lots of "single letter space single letter space" sequences, e.g. "O U W Z_W V 2 X 3"
# Stats: ratio of space-separated single chars among non-space chars
spaced_singles = re.findall(r'(?:^|\s)([a-zA-Z0-9])(?:\s|$)', line)
non_space_len = len(line.replace(" ", ""))
if non_space_len > 5 and len(spaced_singles) > 0:
# If ratio of space-separated single chars to non-space chars is too high, classify as garbled
single_ratio = len(spaced_singles) / non_space_len
if single_ratio > 0.3:
return False
# Detect consecutive meaningless mixed-case alphanumeric sequences (e.g. "UJqZX9V2")
# Normal English words don't have such frequent case alternation patterns
garbled_seqs = re.findall(r'[a-zA-Z0-9]{4,}', line.replace(" ", ""))
if garbled_seqs:
garbled_count = 0
for seq in garbled_seqs:
# Count case alternations
case_changes = sum(
1 for i in range(1, len(seq))
if (seq[i].isupper() != seq[i-1].isupper() and seq[i].isalpha() and seq[i-1].isalpha())
or (seq[i].isdigit() != seq[i-1].isdigit())
)
# Too high alternation frequency = garbled sequence (normal words like "Spring" have only 1 alternation)
if len(seq) >= 4 and case_changes / len(seq) > 0.5:
garbled_count += 1
# If garbled sequence ratio is too high
if len(garbled_seqs) > 0 and garbled_count / len(garbled_seqs) > 0.4:
return False
return True
def _fix_split_labels(lines: list[str]) -> list[str]:
"""
Fix field label split issues
Some PDF layouts split field labels across line start/end, e.g.:
- "名:陈晓俐 姓" -> should be fixed to "姓名:陈晓俐"
- "别:男 性" -> should be fixed to "性别:男"
Args:
lines: Original line text list
Returns:
Fixed line text list
"""
# Common split field label patterns: (line-end part, line-start part) -> full label
split_patterns = {
("", ""): "姓名",
("", ""): "性别",
("", ""): "年龄",
("", ""): "电话",
("", ""): "邮箱",
("", ""): "学历",
("", ""): "专业",
("", ""): "地址",
("", ""): "籍贯",
("", ""): "民族",
}
fixed = []
for line in lines:
# Detect in-line split patterns: "Xcontent Y" where (Y, X) is a split pair
for (suffix_char, prefix_char), full_label in split_patterns.items():
# Pattern: "prefix_charcontent suffix_char" (first half at line start, second half at line end)
pattern = rf'^({re.escape(prefix_char)})\s*[:]\s*(.+?)\s+{re.escape(suffix_char)}\s*$'
m = re.match(pattern, line)
if m:
content = m.group(2).strip()
line = f"{full_label}{content}"
break
# Pattern: "suffix_char content prefix_char" (second half at line start, first half at line end)
pattern2 = rf'^{re.escape(suffix_char)}\s*[:]?\s*(.+?)\s+{re.escape(prefix_char)}\s*$'
m2 = re.match(pattern2, line)
if m2:
content = m2.group(1).strip()
line = f"{full_label}{content}"
break
fixed.append(line)
return fixed
def extract_text(filename: str, binary: bytes) -> tuple[str, list[str], list[dict]]:
    """
    Extract text content based on file type (Pipeline Phase 1).
    - .pdf: metadata extraction, optional OCR supplementation, fusion, layout reordering,
      then line indexing with coordinates
    - .docx: non-empty paragraphs followed by table rows (cells joined with " | ")
    - anything else: plain text via get_text, split into non-empty lines
    Any exception is logged and results in an empty result.
    Args:
        filename: File name (its extension selects the extraction path, case-insensitive)
        binary: File binary content
    Returns:
        (indexed_text, lines, line_positions) tuple:
        - indexed_text: Text with line number indices ("[i]: text" per line)
        - lines: List of line texts
        - line_positions: List of per-line coordinate info (empty list for non-PDF formats)
    """
    lowered = filename.lower()
    try:
        if lowered.endswith(".pdf"):
            # Dual-path extraction: metadata first, OCR only when metadata looks insufficient
            meta_blocks = _extract_metadata_text(binary)
            need_ocr = False
            if sum(len(b["text"]) for b in meta_blocks) < 100:
                # Trigger 1: metadata text too short (< 100 chars)
                logger.info("PDF metadata text too short, enabling OCR supplementation")
                need_ocr = True
            else:
                # Trigger 2: too many metadata lines look garbled (custom font mapping issues)
                stripped_texts = [b.get("text", "").strip() for b in meta_blocks]
                non_empty = [t for t in stripped_texts if t]
                if non_empty:
                    valid_ratio = sum(1 for t in non_empty if _is_valid_line(t)) / len(non_empty)
                    if valid_ratio < 0.6:
                        logger.info(
                            f"PDF metadata text quality low (valid line ratio {valid_ratio:.1%}), enabling OCR supplementation"
                        )
                        need_ocr = True
            # Blackout strategy: metadata-extracted regions are blacked out before OCR
            ocr_blocks = _extract_ocr_text(binary, meta_blocks=meta_blocks) if need_ocr else []
            fused_blocks = _fuse_text_blocks(meta_blocks, ocr_blocks)
            # Layout-aware sorting (prefer YOLOv10 layout detection, fall back to heuristic on failure)
            sorted_blocks = _layout_detect_reorder(fused_blocks, binary)
            # Build line-indexed text (with coordinate info)
            return _build_indexed_text(sorted_blocks)
        if lowered.endswith(".docx"):
            from docx import Document
            doc = Document(BytesIO(binary))
            lines = [p.text.strip() for p in doc.paragraphs if p.text.strip()]
            # Many resumes keep personal info in tables, which paragraph iteration alone would miss
            # (reference: table handling in naive.py Docx class)
            for table in doc.tables:
                for row in table.rows:
                    cells = [c for c in (cell.text.strip() for cell in row.cells) if c]
                    if not cells:
                        continue
                    row_text = " | ".join(cells)
                    # Deduplicate: skip if this row text already exists in lines
                    if row_text not in lines:
                        lines.append(row_text)
            indexed = "\n".join(f"[{i}]: {line}" for i, line in enumerate(lines))
            # DOCX has no coordinate info, return empty list
            return indexed, lines, []
        # Other formats: simple text extraction, no coordinates
        text = get_text(filename, binary)
        lines = [stripped for stripped in (raw.strip() for raw in text.split("\n")) if stripped]
        indexed = "\n".join(f"[{i}]: {line}" for i, line in enumerate(lines))
        return indexed, lines, []
    except Exception:
        logger.exception(f"Text extraction failed: {filename}")
        return "", [], []
# ==================== Phase 2: Parallel LLM Structured Extraction ====================
def _clean_llm_json_response(response: str) -> str:
"""
Clean LLM JSON response.
Uses SmartResume's lightweight string extraction strategy:
1. Remove markdown code block markers
2. Remove <think>...</think> thinking tags (reasoning models may output these)
3. text.find("{") and text.rfind("}") to locate valid JSON block
Args:
response: Raw LLM response text
Returns:
Cleaned JSON string
"""
text = response.strip()
# Remove markdown code block markers
text = text.replace("```json", "").replace("```", "").strip()
# Remove reasoning model thinking tags
text = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL).strip()
# Clean escaped quotes (SmartResume's approach)
text = text.replace('\\"', '"')
# SmartResume strategy: locate first { and last }
start = text.find("{")
end = text.rfind("}")
if start != -1 and end != -1 and end > start:
return text[start:end + 1]
return text
def _parse_json_with_repair(text: str) -> dict:
"""
Parse JSON string, attempt repair on failure (ref SmartResume's json_repair strategy).
Repair strategies:
1. Standard json.loads
2. Replace Python-style booleans/None
3. Use json_repair library
Args:
text: JSON string
Returns:
Parsed dictionary
Raises:
json.JSONDecodeError: Raised when all repair strategies fail
"""
# First attempt: standard parsing
try:
return json.loads(text)
except json.JSONDecodeError:
pass
# Second attempt: replace Python-style values (ref SmartResume)
repaired = text.replace("'", '"')
repaired = repaired.replace('True', 'true')
repaired = repaired.replace('False', 'false')
repaired = repaired.replace('None', 'null')
try:
return json.loads(repaired)
except json.JSONDecodeError:
pass
# Third attempt: use json_repair library
if json_repair is not None:
try:
return json_repair.loads(text)
except Exception:
pass
# All strategies failed
raise json.JSONDecodeError("All JSON repair strategies failed", text, 0)
def _call_llm(prompt: str, tenant_id, lang: str) -> Optional[dict]:
    """
    Send one prompt to the chat model and return its reply parsed as JSON
    (ref SmartResume's retry + fault-tolerance strategy).

    Retry mechanism:
    - Up to _LLM_MAX_RETRIES retries, triggered only by a JSON parse failure
    - The first attempt uses temperature 0.1; retries use temperature 1.0 and
      a random seed for output diversity
    - Parsing goes through _clean_llm_json_response and _parse_json_with_repair

    Args:
        prompt: User prompt
        tenant_id: Passed to LLMBundle to select the chat model
        lang: Language

    Returns:
        Parsed dictionary, or None on failure
    """
    try:
        from api.db.services.llm_service import LLMBundle
        from common.constants import LLMType

        llm = LLMBundle(tenant_id, LLMType.CHAT, lang=lang)
        for attempt in range(_LLM_MAX_RETRIES + 1):
            is_retry = attempt > 0
            try:
                # Low temperature first; more diversity once a parse has failed
                gen_conf = {
                    "temperature": 1.0 if is_retry else 0.1,
                    "max_tokens": 2048,
                }
                if is_retry:
                    gen_conf["seed"] = random.randint(0, 1000000)
                raw_reply = llm._run_coroutine_sync(
                    llm.async_chat(
                        system=get_system_prompt(lang),
                        history=[{"role": "user", "content": prompt}],
                        gen_conf=gen_conf,
                    )
                )
                return _parse_json_with_repair(_clean_llm_json_response(raw_reply))
            except json.JSONDecodeError as e:
                if attempt >= _LLM_MAX_RETRIES:
                    logger.warning(f"LLM JSON parse failed (retries exhausted): {e}")
                    return None
                logger.info(f"LLM JSON parse failed (attempt {attempt + 1}), retrying: {e}")
    except Exception as e:
        # Any non-parse error (import, model construction, chat call) ends the call
        logger.warning(f"LLM call failed: {e}")
        return None
    return None
def _normalize_for_comparison(text: str) -> str:
"""
Normalize text for comparison (ref SmartResume's _normalize_for_comparison).
Unify fullwidth/halfwidth, remove whitespace, Unicode normalization,
so that "阿里巴巴" and "阿 里 巴 巴" can match.
Args:
text: Original text
Returns:
Normalized text
"""
if not text:
return ""
# Unicode NFKC normalization (fullwidth to halfwidth, etc.)
text = unicodedata.normalize("NFKC", text)
# Remove all whitespace characters
text = re.sub(r'\s+', '', text)
return text.lower()
def _calc_single_exp_years(start_str: str, end_str: str) -> float:
"""
Calculate years for a single experience entry.
Args:
start_str: Start date string
end_str: End date string ("至今" etc. means current)
Returns:
Years (float, 1 decimal place), returns 0 if unable to calculate
"""
from datetime import datetime
start_str = str(start_str).strip()
end_str = str(end_str).strip()
if not start_str:
return 0
start_date = _parse_date_str(start_str)
if not start_date:
return 0
if end_str in ("至今", "现在", "present", "Present", "now", "Now", ""):
end_date = datetime.now()
else:
end_date = _parse_date_str(end_str)
if not end_date:
end_date = datetime.now()
months = (end_date.year - start_date.year) * 12 + (end_date.month - start_date.month)
if months <= 0:
return 0
return round(months / 12.0, 1)
def _calculate_work_years(experiences: list[dict]) -> float:
"""
Calculate total work years based on start/end dates of each work experience.
Args:
experiences: List of work experiences, each containing start_date, end_date fields
Returns:
Total work years (float), returns 0 if unable to calculate
"""
total = 0.0
for exp in experiences:
total += _calc_single_exp_years(
exp.get("start_date", ""), exp.get("end_date", "")
)
return round(total, 1)
def _parse_date_str(date_str: str) -> Optional[datetime.datetime]:
"""
Parse date string, supporting multiple common formats.
Supported formats:
- 2024.1 / 2024.01
- 2024-1 / 2024-01
- 2024/1 / 2024/01
- 2024年1月
- 2024 (year only, defaults to January)
Args:
date_str: Date string
Returns:
datetime object, or None on parse failure
"""
from datetime import datetime
date_str = date_str.strip()
# Try matching year.month / year-month / year/month / year(nian)month(yue) formats
patterns = [
(r"((?:19|20)\d{2})[.\-/年](\d{1,2})", "%Y-%m"),
(r"^((?:19|20)\d{2})$", "%Y"),
]
for pattern, _ in patterns:
m = re.search(pattern, date_str)
if m:
try:
year = int(m.group(1))
month = int(m.group(2)) if len(m.groups()) > 1 else 1
# Month range validation
if month < 1 or month > 12:
month = 1
return datetime(year, month, 1)
except (ValueError, IndexError):
continue
return None
def _extract_description_from_range(
index_range: list, lines: list[str],
company: str = "", position: str = ""
) -> str:
"""
Extract description from original text by index range (ref SmartResume's _extract_description_from_range).
Key improvement:
- Filter out lines containing both company name and position title (avoid mixing header lines into description)
- Boundary safety checks
Args:
index_range: [start_line_number, end_line_number]
lines: List of original line texts
company: Company name (used to filter header lines)
position: Position title (used to filter header lines)
Returns:
Extracted description text
"""
if not index_range or len(index_range) != 2:
return ""
start_idx, end_idx = int(index_range[0]), int(index_range[1])
# Boundary safety check
if start_idx < 0 or end_idx >= len(lines) or start_idx > end_idx:
return ""
extracted_lines = lines[start_idx:end_idx + 1]
# Filter out lines containing both company name and position title (ref SmartResume)
if company or position:
norm_company = _normalize_for_comparison(company)
norm_position = _normalize_for_comparison(position)
filtered = []
for line in extracted_lines:
norm_line = _normalize_for_comparison(line)
# If a line contains both company name and position title, it's likely a header line, skip
if norm_company and norm_position and norm_company in norm_line and norm_position in norm_line:
continue
# If a line exactly equals company name or position title, also skip
if norm_line == norm_company or norm_line == norm_position:
continue
filtered.append(line)
extracted_lines = filtered
if not extracted_lines:
return ""
return "\n".join(line.strip() for line in extracted_lines if line.strip())
def _extract_basic_info(indexed_text: str, tenant_id, lang: str) -> Optional[dict]:
    """Run subtask 1: basic info extraction.

    Only the first 8000 characters of the indexed text are put into the
    prompt, on the premise that basic info sits at the beginning of a resume.
    Returns whatever _call_llm returns (parsed dict, or None on failure).
    """
    template = get_basic_info_prompt(lang)
    head_text = indexed_text[:8000]
    return _call_llm(template.format(indexed_text=head_text), tenant_id, lang)
def _extract_work_experience(indexed_text: str, tenant_id, lang: str) -> Optional[dict]:
    """Run subtask 2: work experience extraction (index-pointer based).

    The complete indexed text is put into the prompt, since work experience
    may span the middle-to-end of the resume and must not be truncated.
    Returns whatever _call_llm returns (parsed dict, or None on failure).
    """
    template = get_work_exp_prompt(lang)
    return _call_llm(template.format(indexed_text=indexed_text), tenant_id, lang)
def _extract_education(indexed_text: str, tenant_id, lang: str) -> Optional[dict]:
    """Run subtask 3: education background extraction.

    The complete indexed text is put into the prompt: education is usually
    near the end of a resume, so truncating would lose it. Resume text is
    generally under 30K chars, within the LLM context window.
    Returns whatever _call_llm returns (parsed dict, or None on failure).
    """
    template = get_education_prompt(lang)
    return _call_llm(template.format(indexed_text=indexed_text), tenant_id, lang)
def _extract_project_experience(indexed_text: str, tenant_id, lang: str) -> Optional[dict]:
    """Run subtask 4: project experience extraction (index-pointer based).

    The complete indexed text is put into the prompt, since project
    experience may span the middle-to-end of the resume and must not be
    truncated.
    Returns whatever _call_llm returns (parsed dict, or None on failure).
    """
    template = get_project_exp_prompt(lang)
    return _call_llm(template.format(indexed_text=indexed_text), tenant_id, lang)
def parse_with_llm(indexed_text: str, lines: list[str], tenant_id, lang: str) -> Optional[dict]:
    """
    Extract resume info using parallel task decomposition strategy (ref SmartResume Section 3.2).

    Decomposes extraction into four independent subtasks executed in parallel:
    1. Basic info (name, phone, skills, self-evaluation, etc.)
    2. Work experience (company, position, description line ranges)
    3. Education background (school, major, degree)
    4. Project experience (project name, role, description line ranges)

    Each subtask result is waited for with a 60 second timeout. Malformed
    subtask payloads (a non-dict result, a non-list section, non-dict entries)
    are ignored for that section only, so the other sections are still merged.

    Args:
        indexed_text: Line-indexed resume text
        lines: List of original line texts (for index-based extraction)
        tenant_id: Forwarded to the subtasks to select the chat model
        lang: Language

    Returns:
        Merged structured resume dictionary, or None on failure (timeout, any
        exception, or a merged result with at most two keys)
    """

    def _dict_entries(payload, key):
        # None  -> section absent (payload unusable or key missing)
        # list  -> the section's dict entries; anything else is dropped
        if not isinstance(payload, dict) or key not in payload:
            return None
        raw = payload[key]
        if not isinstance(raw, list):
            return []
        return [item for item in raw if isinstance(item, dict)]

    try:
        # Execute four subtasks in parallel.
        # The executor is deliberately not used as a context manager: leaving
        # a `with` block calls shutdown(wait=True), which would block until
        # every LLM call has returned even after result() timed out.
        executor = concurrent.futures.ThreadPoolExecutor(max_workers=4)
        try:
            future_basic = executor.submit(_extract_basic_info, indexed_text, tenant_id, lang)
            future_work = executor.submit(_extract_work_experience, indexed_text, tenant_id, lang)
            future_edu = executor.submit(_extract_education, indexed_text, tenant_id, lang)
            future_project = executor.submit(_extract_project_experience, indexed_text, tenant_id, lang)

            basic_info = future_basic.result(timeout=60)
            work_exp = future_work.result(timeout=60)
            education = future_edu.result(timeout=60)
            project_exp = future_project.result(timeout=60)
        finally:
            # Do not wait for still-running calls; drop any that never started
            executor.shutdown(wait=False, cancel_futures=True)

        # Merge results
        resume = {}

        # Merge basic info
        if basic_info and isinstance(basic_info, dict):
            resume.update(basic_info)
            logger.info(f"Basic info extraction succeeded: {len(basic_info)} fields")

        # Process work experience (index pointer extraction)
        experiences = _dict_entries(work_exp, "workExperience")
        if experiences is not None:
            companies = []
            positions = []
            work_descs = []
            # Save detailed info for each experience (dates, years) for chunk generation
            work_exp_details = []

            for exp in experiences:
                company = exp.get("company", "")
                position = exp.get("position", "")
                start_date = exp.get("start_date", "")
                end_date = exp.get("end_date", "")

                # Calculate years for this experience entry
                years = _calc_single_exp_years(start_date, end_date)

                if company:
                    companies.append(company)
                if position:
                    positions.append(position)

                # Save detailed info for each experience entry
                work_exp_details.append({
                    "company": company,
                    "position": position,
                    "start_date": start_date,
                    "end_date": end_date,
                    "years": years,
                })

                # Index pointer mechanism: extract description from original text by line range
                # Use _extract_description_from_range to filter header lines (ref SmartResume)
                desc_lines = exp.get("desc_lines", [])
                if isinstance(desc_lines, list) and len(desc_lines) == 2:
                    desc = _extract_description_from_range(
                        desc_lines, lines, company=company, position=position
                    )
                    if desc.strip():
                        work_descs.append(desc.strip())

            if companies:
                resume["corp_nm_tks"] = companies
                resume["corporation_name_tks"] = companies[0]
            if positions:
                resume["position_name_tks"] = positions
            if work_descs:
                resume["work_desc_tks"] = work_descs
            # Save experience details for _build_chunk_document
            if work_exp_details:
                resume["_work_exp_details"] = work_exp_details

            # Calculate total work years from each experience's dates (overrides LLM's guess in basic info)
            calculated_years = _calculate_work_years(experiences)
            if calculated_years > 0:
                resume["work_exp_flt"] = calculated_years

            logger.info(f"Work experience extraction succeeded: {len(experiences)} entries, calculated total years: {calculated_years}")

        # Process education background
        edu_list = _dict_entries(education, "education")
        if edu_list is not None:
            schools = []
            majors = []
            degrees = []

            for edu in edu_list:
                if edu.get("school"):
                    schools.append(edu["school"])
                if edu.get("major"):
                    majors.append(edu["major"])
                if edu.get("degree"):
                    degrees.append(edu["degree"])

                # Extract graduation year (first entry that provides one wins)
                end_date = edu.get("end_date", "")
                if end_date and not resume.get("edu_end_int"):
                    year_match = re.search(r"(19|20)\d{2}", str(end_date))
                    if year_match:
                        resume["edu_end_int"] = int(year_match.group(0))

            if schools:
                resume["school_name_tks"] = schools
                resume["first_school_name_tks"] = schools[-1]  # Earliest school is usually last
            if majors:
                resume["major_tks"] = majors
                resume["first_major_tks"] = majors[-1]
            if degrees:
                resume["degree_kwd"] = degrees
                # Infer highest degree (supports both Chinese and English degree names)
                degree_rank = {
                    "博士": 5, "PhD": 5, "Doctor": 5,
                    "硕士": 4, "Master": 4, "MBA": 4, "EMBA": 4, "MPA": 4,
                    "本科": 3, "Bachelor": 3,
                    "大专": 2, "专科": 2, "Associate": 2, "Diploma": 2,
                    "高中": 1, "High School": 1,
                }
                highest = max(degrees, key=lambda d: degree_rank.get(d, 0), default="")
                if highest:
                    resume["highest_degree_kwd"] = highest
                resume["first_degree_kwd"] = degrees[-1]

            logger.info(f"Education extraction succeeded: {len(edu_list)} entries")

        # Process project experience (index pointer extraction, similar to work experience)
        projects = _dict_entries(project_exp, "projectExperience")
        if projects is not None:
            project_names = []
            project_descs = []

            for proj in projects:
                name = proj.get("project_name", "")
                if name:
                    project_names.append(name)

                # Index pointer mechanism: extract project description from original text by line range
                desc_lines = proj.get("desc_lines", [])
                if isinstance(desc_lines, list) and len(desc_lines) == 2:
                    desc = _extract_description_from_range(
                        desc_lines, lines, company=name, position=proj.get("role", "")
                    )
                    if desc.strip():
                        project_descs.append(desc.strip())

            if project_names:
                resume["project_tks"] = project_names
            if project_descs:
                resume["project_desc_tks"] = project_descs

            logger.info(f"Project experience extraction succeeded: {len(projects)} entries")

        if not resume.get("name_kwd"):
            resume["name_kwd"] = "Unknown" if _is_english(lang) else "未知"

        # name_kwd is always present here, so require at least two more fields
        return resume if len(resume) > 2 else None

    except concurrent.futures.TimeoutError:
        logger.warning("LLM parallel extraction timed out")
        return None
    except Exception as e:
        logger.warning(f"LLM parallel extraction failed: {e}")
        return None
# ==================== Phase 3: Regex Fallback Parsing ====================
def parse_with_regex(text: str, lang: str = "Chinese") -> dict:
    """
    Parse resume text using regex (fallback strategy).

    When LLM parsing fails, use regex to extract basic structured info from text.
    Label-based patterns accept both the ASCII colon and the fullwidth colon
    (U+FF1A) as separator.

    Args:
        text: Resume text content (without line number index)
        lang: Language parameter, default "Chinese"

    Returns:
        Structured resume info dictionary. Only fields that were found are
        present, except "name_kwd", which is always set ("Unknown" / "未知"
        when no name was found).
    """
    resume: dict = {}
    lines = [line.strip() for line in text.split("\n") if line.strip()]
    _en = _is_english(lang)

    # --- Extract Name ---
    if _en:
        # English resume: extract from "Name: XXX" format
        for line in lines[:30]:
            name_match = re.search(r'(?:Name|Full\s*Name)\s*[:\uff1a]\s*([A-Za-z][A-Za-z\s\-\.]{1,40})', line, re.IGNORECASE)
            if name_match:
                resume["name_kwd"] = name_match.group(1).strip()
                break
        # English resume strategy 2: first line if short text without digits, may be a name
        if "name_kwd" not in resume and lines:
            first = lines[0].strip()
            if len(first) <= 40 and not re.search(r"\d", first) and re.match(r'^[A-Za-z][A-Za-z\s\-\.]+$', first):
                resume["name_kwd"] = first
    else:
        # Chinese resume: extract from "姓名: XXX" format. The label must be
        # 姓名 itself; a bare "名" would also match labels such as "项目名:".
        for line in lines[:30]:
            name_match = re.search(r'姓\s*名\s*[:\uff1a]\s*([\u4e00-\u9fa5]{2,4})', line)
            if name_match:
                resume["name_kwd"] = name_match.group(1)
                break
        # Strategy 2: search first 20 lines for standalone Chinese names (2-4 chars), excluding common title words
        if "name_kwd" not in resume:
            title_words = {
                "个人", "简历", "求职", "应聘", "基本", "信息", "概述", "简介",
                "教育", "工作", "经历", "经验", "技能", "项目", "自我", "评价",
                "专业", "技术", "证书", "语言", "能力", "培训", "荣誉", "奖项",
            }
            for line in lines[:20]:
                if any(w in line for w in title_words):
                    continue
                if re.search(r'[:\uff1a]', line) and len(line) > 6:
                    continue
                cleaned = re.sub(r"^[A-Za-z_\-\d\s]+\s+", "", line)
                cleaned = re.sub(r"\s+[A-Za-z_\-\d\s]+$", "", cleaned).strip()
                if 2 <= len(cleaned) <= 4 and re.match(r"^[\u4e00-\u9fa5]{2,4}$", cleaned):
                    resume["name_kwd"] = cleaned
                    break
        # Strategy 3: first line if short without digits, may be a name
        if "name_kwd" not in resume and lines:
            first = lines[0].strip()
            if len(first) <= 10 and not re.search(r"\d", first):
                cn_part = re.findall(r'[\u4e00-\u9fa5]+', first)
                if cn_part and 2 <= len(cn_part[0]) <= 4:
                    resume["name_kwd"] = cn_part[0]

    # --- Extract Phone Number ---
    # Digit boundaries keep the pattern from matching inside longer digit runs
    # (e.g. an 18-digit ID number); an optional +86 / 0086 / 86 country prefix
    # is skipped and not included in the captured number.
    phones = re.findall(r"(?<!\d)(?:(?:\+|00)?86[-\s]?)?(1[3-9]\d{9})(?!\d)", text)
    if phones:
        resume["phone_kwd"] = phones[0]

    # --- Extract Email ---
    emails = re.findall(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", text)
    if emails:
        resume["email_tks"] = emails[0]

    # --- Extract Gender ---
    if _en:
        # English resume: extract from "Gender: Male/Female" format.
        # The trailing \b stops "M"/"F" from matching the first letter of a
        # longer word (e.g. "Sex: Married").
        gender_label = re.search(r'(?:Gender|Sex)\s*[:\uff1a]\s*(Male|Female|M|F)\b', text, re.IGNORECASE)
        if gender_label:
            raw = gender_label.group(1).strip().upper()
            resume["gender_kwd"] = "Male" if raw in ("M", "MALE") else "Female"
        else:
            gender_match = re.search(r'\b(Male|Female)\b', text[:500], re.IGNORECASE)
            if gender_match:
                resume["gender_kwd"] = gender_match.group(1).capitalize()
    else:
        # Chinese resume: extract from "性别:男/女" format. The label must be
        # 性别 itself; a bare "别" would also match labels such as "类别:".
        gender_label = re.search(r'性\s*别\s*[:\uff1a]\s*(男|女)', text)
        if gender_label:
            resume["gender_kwd"] = gender_label.group(1)
        else:
            gender_match = re.search(r"(男|女)", text[:500])
            if gender_match:
                resume["gender_kwd"] = gender_match.group(1)

    # --- Extract Age ---
    if _en:
        # English resume: match "25 years old" or "Age: 25".
        # \b keeps "Age" from matching inside "Language:", "Page:", etc.
        age_match = re.search(r'\b(?:Age)\s*[:\uff1a]\s*(\d{1,2})', text, re.IGNORECASE)
        if not age_match:
            age_match = re.search(r'(\d{1,2})\s*years?\s*old', text, re.IGNORECASE)
        if age_match:
            resume["age_int"] = int(age_match.group(1))
    else:
        # Chinese resume: match "25岁"
        age_match = re.search(r"(\d{1,2})\s*岁", text)
        if age_match:
            resume["age_int"] = int(age_match.group(1))

    # --- Extract Date of Birth ---
    if _en:
        # English resume: match "1990-01-15" or "Jan 15, 1990" etc.
        birth_match = re.search(r'(?:Birth|DOB|Date\s*of\s*Birth)\s*[:\uff1a]\s*(.{6,20})', text, re.IGNORECASE)
        if birth_match:
            resume["birth_dt"] = birth_match.group(1).strip()
        else:
            birth_match = re.search(r"(19|20)\d{2}[-/]\d{1,2}[-/]\d{1,2}", text)
            if birth_match:
                resume["birth_dt"] = birth_match.group(0)
    else:
        # Chinese resume: match "1990年1月15日" or "1990-01-15"
        birth_match = re.search(r"(19|20)\d{2}[年/-]\d{1,2}[月/-]\d{1,2}", text)
        if birth_match:
            resume["birth_dt"] = birth_match.group(0)

    # --- Extract Education Level ---
    degree_keywords_zh = ["博士", "硕士", "本科", "大专", "专科", "高中", "MBA", "EMBA", "MPA"]
    degree_keywords_en = ["PhD", "Master", "Bachelor", "Associate", "Diploma", "High School",
                          "MBA", "EMBA", "MPA", "Doctor"]
    degree_keywords = degree_keywords_en if _en else degree_keywords_zh
    found_degrees = [d for d in degree_keywords if d in text]
    if found_degrees:
        resume["degree_kwd"] = found_degrees

    # --- Extract School ---
    if _en:
        # English resume: match "University/College/Institute/School" keywords
        schools = re.findall(
            r'([A-Z][A-Za-z\s\-&]{2,40}(?:University|College|Institute|School|Academy))',
            text
        )
        # Remove extra whitespace
        schools = [re.sub(r'\s+', ' ', s).strip() for s in schools]
    else:
        # Chinese resume: match "XX大学/学院/职业技术学院"
        schools = re.findall(r"[\u4e00-\u9fa5]{2,15}(?:大学|学院|职业技术学院)", text)
    if schools:
        # Order-preserving dedup: a set would make the list order vary
        # between runs and disagree with first_school_name_tks.
        resume["school_name_tks"] = list(dict.fromkeys(schools))
        resume["first_school_name_tks"] = schools[0]

    # --- Extract Major ---
    if _en:
        # English resume: match "Major: XXX" / "Field of Study: XXX" / "Specialization: XXX"
        majors = re.findall(
            r'(?:Major|Field\s*of\s*Study|Specialization|Concentration)\s*[:\uff1a]\s*([A-Za-z\s\-&,]{2,40})',
            text, re.IGNORECASE
        )
        majors = [m.strip() for m in majors if m.strip()]
    else:
        # Chinese resume: match "专业: XXX"
        majors = re.findall(r"专业[:\uff1a]\s*([\u4e00-\u9fa5]{2,20})", text)
    if majors:
        resume["major_tks"] = majors
        resume["first_major_tks"] = majors[0]

    # --- Extract Company Names ---
    if _en:
        # English resume: match common company suffixes
        en_company_patterns = [
            r'([A-Z][A-Za-z\s\-&,\.]{2,40}(?:Inc\.|Corp\.|Ltd\.|LLC|Co\.|Company|Group|Technologies|Technology|Solutions|Consulting|Services|Bank))',
        ]
        companies = []
        for pattern in en_company_patterns:
            companies.extend(re.findall(pattern, text))
        companies = [re.sub(r'\s+', ' ', c).strip() for c in companies]
    else:
        # Chinese resume: match "XX有限公司" format; the parenthesised region
        # part accepts both ASCII and fullwidth parentheses.
        company_patterns = [
            r"[\u4e00-\u9fa5]{2,20}[(\uff08][\u4e00-\u9fa5]{2,10}[)\uff09](?:科技|信息技术|网络科技)?(?:股份)?有限公司",
            r"[\u4e00-\u9fa5]{4,20}(?:科技|信息技术|网络科技|银行)?(?:股份)?有限公司",
        ]
        companies = []
        for pattern in company_patterns:
            companies.extend(re.findall(pattern, text))

    unique_companies = []
    seen = set()
    # Filter verb list (bilingual)
    filter_verbs = (
        ["completed", "conducted", "implemented", "responsible", "participated", "developed"]
        if _en
        else ["完成", "进行", "实施", "负责", "参与", "开发"]
    )
    min_len = 3 if _en else 6
    for c in companies:
        if len(c) < min_len or any(v in c.lower() for v in filter_verbs) or c in seen:
            continue
        # Keep only the longest form: drop c when an accepted name contains
        # it, and drop accepted names that c contains.
        is_sub = False
        for existing in list(unique_companies):
            if c in existing:
                is_sub = True
                break
            if existing in c:
                unique_companies.remove(existing)
                seen.discard(existing)
        if not is_sub:
            unique_companies.append(c)
            seen.add(c)
    if unique_companies:
        resume["corp_nm_tks"] = unique_companies
        resume["corporation_name_tks"] = unique_companies[0]

    # --- Extract Position (improved: context constraints to reduce noise) ---
    if _en:
        # English resume: Strategy 1 - extract from "Title: XXX" / "Position: XXX" / "Role: XXX" format
        position_label_matches = re.findall(
            r'(?:Title|Position|Role|Job\s*Title)\s*[:\uff1a]\s*([A-Za-z\s\-/&]{2,30})',
            text, re.IGNORECASE
        )
        positions = [p.strip() for p in position_label_matches if p.strip()]
        # English resume: Strategy 2 - match common position suffix keywords
        en_position_suffixes = [
            "Engineer", "Manager", "Director", "Supervisor", "Specialist",
            "Designer", "Consultant", "Assistant", "Architect", "Analyst",
            "Developer", "Lead", "Officer", "Coordinator", "Administrator",
            "Intern", "VP", "President",
        ]
        # Matches containing these verbs are clearly not positions
        filter_pos_verbs = ["responsible", "participated", "completed", "developed", "designed"]
        for line in lines:
            if len(line) > 60:
                continue  # Skip overly long lines (usually description text)
            for suffix in en_position_suffixes:
                match = re.search(rf'([A-Za-z\s\-]{{1,25}}{suffix})\b', line, re.IGNORECASE)
                if match:
                    pos = match.group(1).strip()
                    if not any(v in pos.lower() for v in filter_pos_verbs) and len(pos) > 3:
                        positions.append(pos)
    else:
        # Chinese resume: Strategy 1 - extract from "职位/岗位: XXX" format
        position_label_matches = re.findall(
            r'(?:职位|岗位|职务|职称|担任)\s*[:\uff1a]\s*([\u4e00-\u9fa5a-zA-Z]{2,15})',
            text
        )
        positions = list(position_label_matches)
        # Chinese resume: Strategy 2 - extract from work experience paragraphs (company name followed by position)
        for line in lines:
            pos_match = re.search(
                r'(?:有限公司|集团|银行)\s+([\u4e00-\u9fa5]{2,8}(?:工程师|经理|总监|主管|专员|设计师|顾问|助理|架构师|分析师|运营|产品))',
                line
            )
            if pos_match:
                positions.append(pos_match.group(1))
        # Chinese resume: Strategy 3 - position keywords in standalone lines (length-limited to avoid matching description text)
        position_suffixes = ["工程师", "经理", "总监", "主管", "专员", "设计师", "顾问",
                             "助理", "架构师", "分析师", "开发者", "负责人"]
        for line in lines:
            if len(line) > 20:
                continue  # Skip overly long lines
            for suffix in position_suffixes:
                match = re.search(rf'([\u4e00-\u9fa5]{{1,6}}{suffix})', line)
                if match:
                    pos = match.group(1)
                    if not any(v in pos for v in ["负责", "参与", "完成", "开发了", "设计了"]):
                        positions.append(pos)
    if positions:
        # Deduplicate while preserving order
        seen_pos = set()
        unique_positions = []
        for p in positions:
            if p not in seen_pos:
                seen_pos.add(p)
                unique_positions.append(p)
        resume["position_name_tks"] = unique_positions

    # --- Extract Years of Experience ---
    if _en:
        # English resume: match "5 years experience" / "5+ years of experience"
        work_exp_match = re.search(r'(\d+)\+?\s*years?\s*(?:of\s*)?(?:experience|work)', text, re.IGNORECASE)
        if work_exp_match:
            resume["work_exp_flt"] = float(work_exp_match.group(1))
    else:
        # Chinese resume: match "5年...经验". The count is limited to one or
        # two digits not preceded by another digit, so a calendar year such as
        # "2018年" on the same line as "经验" is not read as 2018 years.
        work_exp_match = re.search(r"(?<!\d)(\d{1,2})\s*年.*?经验", text)
        if work_exp_match:
            resume["work_exp_flt"] = float(work_exp_match.group(1))

    # --- Extract Graduation Year ---
    if _en:
        # English resume: match "Graduated 2020" / "Graduation: 2020" / "Class of 2020"
        grad_match = re.search(r'(?:Graduat(?:ed|ion)|Class\s*of)\s*[:\uff1a]?\s*((?:19|20)\d{2})', text, re.IGNORECASE)
        if grad_match:
            resume["edu_end_int"] = int(grad_match.group(1))
    else:
        # Chinese resume: match "2020年...毕业"
        grad_match = re.search(r"((?:19|20)\d{2})\s*年.*?毕业", text)
        if grad_match:
            resume["edu_end_int"] = int(grad_match.group(1))

    if "name_kwd" not in resume:
        resume["name_kwd"] = "Unknown" if _en else "未知"
    return resume
# ==================== Phase 4: Post-processing Pipeline ====================
def _postprocess_resume(resume: dict, lines: list[str], lang: str = "Chinese") -> dict:
"""
Four-phase post-processing pipeline (ref: SmartResume Section 3.2.3)
1. Source text validation: check if key fields can be found in the original text
2. Domain normalization: standardize date formats, clean company name suffix noise
3. Contextual deduplication: remove duplicate company/school entries
4. Field completion: ensure all required fields exist
Args:
resume: Raw resume dictionary extracted by LLM
lines: Original line text list (for source text validation)
lang: Language parameter, default "Chinese"
Returns:
Post-processed resume dictionary
"""
_en = _is_english(lang)
full_text = "\n".join(lines) if lines else ""
# Normalize full text for comparison (ref: SmartResume _validate_fields_in_text)
norm_full_text = _normalize_for_comparison(full_text)
# --- Phase 1: Source text validation (prune hallucinations, ref: SmartResume _validate_fields_in_text) ---
# Name validation: clear if not found in source text (SmartResume strategy: discard hallucinated fields)
_unknown_names = ("未知", "Unknown")
if resume.get("name_kwd") and resume["name_kwd"] not in _unknown_names:
norm_name = _normalize_for_comparison(resume["name_kwd"])
if norm_full_text and norm_name and norm_name not in norm_full_text:
logger.warning(f"Name '{resume['name_kwd']}' not found in source text, classified as LLM hallucination, cleared")
resume["name_kwd"] = ""
# Validate company names (strict matching: full name must appear in source text, no longer using loose 4-char prefix matching)
if resume.get("corp_nm_tks") and norm_full_text:
verified_companies = []
for company in resume["corp_nm_tks"]:
norm_company = _normalize_for_comparison(company)
if norm_company and norm_company in norm_full_text:
verified_companies.append(company)
else:
logger.debug(f"Company '{company}' not found in source text, filtered out")
# Update even if all filtered out (SmartResume strategy: prefer missing over wrong)
resume["corp_nm_tks"] = verified_companies
if verified_companies:
resume["corporation_name_tks"] = verified_companies[0]
else:
resume["corporation_name_tks"] = ""
# Validate school names (ref: SmartResume _validate_fields_in_text)
if resume.get("school_name_tks") and norm_full_text:
verified_schools = []
for school in resume["school_name_tks"]:
norm_school = _normalize_for_comparison(school)
if norm_school and norm_school in norm_full_text:
verified_schools.append(school)
else:
logger.debug(f"School '{school}' not found in source text, filtered out")
resume["school_name_tks"] = verified_schools
if verified_schools:
if resume.get("first_school_name_tks"):
# Ensure first_school is also in the verified list
if resume["first_school_name_tks"] not in verified_schools:
resume["first_school_name_tks"] = verified_schools[-1]
else:
resume["first_school_name_tks"] = ""
# Validate position names
if resume.get("position_name_tks") and norm_full_text:
verified_positions = []
for pos in resume["position_name_tks"]:
norm_pos = _normalize_for_comparison(pos)
if norm_pos and norm_pos in norm_full_text:
verified_positions.append(pos)
if verified_positions:
resume["position_name_tks"] = verified_positions
# --- Phase 2: Domain normalization ---
# Standardize date format
if resume.get("birth_dt"):
resume["birth_dt"] = re.sub(r"[年月]", "-", str(resume["birth_dt"])).rstrip("-")
# Clean non-digit characters from phone number (keep + sign)
if resume.get("phone_kwd"):
phone = re.sub(r"[^\d+]", "", str(resume["phone_kwd"]))
if phone:
resume["phone_kwd"] = phone
# Standardize gender (output format determined by language parameter)
if resume.get("gender_kwd"):
gender = str(resume["gender_kwd"]).strip()
if gender in ("male", "Male", "M", "m", ""):
resume["gender_kwd"] = "Male" if _en else ""
elif gender in ("female", "Female", "F", "f", ""):
resume["gender_kwd"] = "Female" if _en else ""
# --- Phase 3: Contextual deduplication ---
for list_field in ["corp_nm_tks", "school_name_tks", "major_tks",
"position_name_tks", "skill_tks"]:
if isinstance(resume.get(list_field), list):
# Order-preserving deduplication
seen = set()
deduped = []
for item in resume[list_field]:
item_str = str(item).strip()
if item_str and item_str not in seen:
seen.add(item_str)
deduped.append(item_str)
resume[list_field] = deduped
# --- Phase 3.4: work_desc_tks dedup by company name + time period ---
# LLM often extracts the same company's content twice: once from the "Work Experience"
# section and once from the "Project Experience" section, producing entries like
# These have different descriptions (daily work vs project details), so content-based
# Jaccard dedup cannot catch them. Instead, we detect duplicate companies by checking
# if one company name is a substring of another AND their time periods overlap.
# This also fixes the inflated work_exp_flt (e.g. 25.5 years instead of ~14).
work_descs = resume.get("work_desc_tks", [])
if len(work_descs) > 1:
corp_names = resume.get("corp_nm_tks", [])
work_details = resume.get("_work_exp_details", [])
positions = resume.get("position_name_tks", [])
kept_indices = []
for i in range(len(work_descs)):
is_dup = False
corp_i = _normalize_for_comparison(corp_names[i]) if i < len(corp_names) else ""
detail_i = work_details[i] if i < len(work_details) else {}
start_i = detail_i.get("start_date", "")
end_i = detail_i.get("end_date", "")
# Parse dates for entry i once (reused across inner loop)
dt_start_i = _parse_date_str(start_i) if start_i else None
dt_end_i = _parse_date_str(end_i) if end_i else None
for j in kept_indices:
# Strategy A: company name substring + time period overlap
corp_j = _normalize_for_comparison(corp_names[j]) if j < len(corp_names) else ""
if corp_i and corp_j:
shorter_c, longer_c = (corp_i, corp_j) if len(corp_i) <= len(corp_j) else (corp_j, corp_i)
if shorter_c in longer_c:
# Check time period overlap using parsed dates
# Two intervals [s1,e1] and [s2,e2] overlap iff s1 <= e2 and s2 <= e1
# Use <= because resume dates are month-granularity (e.g. "2018.03" means "sometime in March 2018")
detail_j = work_details[j] if j < len(work_details) else {}
start_j = detail_j.get("start_date", "")
end_j = detail_j.get("end_date", "")
dt_start_j = _parse_date_str(start_j) if start_j else None
dt_end_j = _parse_date_str(end_j) if end_j else None
# Need at least one valid date on each side to compare
if dt_start_i and dt_start_j:
# Use far-future as default end if missing
eff_end_i = dt_end_i or datetime.datetime(2099, 12, 1)
eff_end_j = dt_end_j or datetime.datetime(2099, 12, 1)
if dt_start_i <= eff_end_j and dt_start_j <= eff_end_i:
is_dup = True
break
elif (start_i and start_j and start_i == start_j) or \
(end_i and end_j and end_i == end_j):
# Fallback: exact string match if date parsing fails
is_dup = True
break
# Strategy B: content-based Jaccard similarity (fallback)
norm_i = _normalize_for_comparison(work_descs[i])
norm_j = _normalize_for_comparison(work_descs[j])
shorter, longer = (norm_i, norm_j) if len(norm_i) <= len(norm_j) else (norm_j, norm_i)
if shorter and longer and shorter in longer:
is_dup = True
break
jac = _shingling_jaccard(work_descs[i], work_descs[j], n=5)
if jac > 0.5:
is_dup = True
break
if is_dup:
dup_corp = corp_names[i] if i < len(corp_names) else f"#{i+1}"
logger.debug(f"Work desc internal duplicate removed: {dup_corp}")
else:
kept_indices.append(i)
# Only update when entries were actually removed
if len(kept_indices) < len(work_descs):
resume["work_desc_tks"] = [work_descs[i] for i in kept_indices]
if corp_names:
resume["corp_nm_tks"] = [corp_names[i] for i in kept_indices if i < len(corp_names)]
if work_details:
resume["_work_exp_details"] = [work_details[i] for i in kept_indices if i < len(work_details)]
if positions:
resume["position_name_tks"] = [positions[i] for i in kept_indices if i < len(positions)]
# Recalculate work years based on deduplicated entries
new_details = resume.get("_work_exp_details", [])
if new_details:
recalc_years = sum(d.get("years", 0) for d in new_details)
recalc_years = round(recalc_years, 1)
if recalc_years > 0:
resume["work_exp_flt"] = recalc_years
logger.info(f"Work years recalculated: {recalc_years} yrs (before dedup: {_calculate_work_years([{'start_date': d.get('start_date',''), 'end_date': d.get('end_date','')} for d in work_details])} yrs)")
new_corps = resume.get("corp_nm_tks", [])
if new_corps:
resume["corporation_name_tks"] = new_corps[0]
# --- Phase 3.5: Merge project_desc_tks into work_desc_tks ---
# Instead of complex cross-dedup, we simply merge unique project descriptions into
# work_desc_tks and clear project_desc_tks. This avoids the problem where LLM extracts
# the same content into both fields with slightly different wording.
# After merge, project_desc_tks is emptied so _build_chunk_document won't generate
# duplicate chunks. Project names are preserved in project_tks for reference.
work_descs = resume.get("work_desc_tks", [])
project_descs = resume.get("project_desc_tks", [])
# Save pre-merge project descriptions for debugging
resume["_raw_project_descs"] = list(project_descs) if project_descs else []
if project_descs:
project_names = resume.get("project_tks", [])
merged_count = 0
skipped_count = 0
for i, proj_desc in enumerate(project_descs):
norm_proj = _normalize_for_comparison(proj_desc)
if not norm_proj:
continue
# Check if this project desc already exists in work_descs (exact or near-duplicate)
already_exists = False
for wd in work_descs:
norm_wd = _normalize_for_comparison(wd)
if not norm_wd:
continue
# Substring containment check
shorter, longer = (norm_proj, norm_wd) if len(norm_proj) <= len(norm_wd) else (norm_wd, norm_proj)
if shorter in longer:
already_exists = True
break
# Jaccard similarity check
if _shingling_jaccard(proj_desc, wd, n=5) > 0.5:
already_exists = True
break
if already_exists:
skipped_count += 1
proj_name = project_names[i] if i < len(project_names) else f"#{i+1}"
logger.debug(f"Project desc already in work_desc, skipped: {proj_name}")
else:
# Append to work_desc_tks with project name prefix for context
proj_name = project_names[i] if i < len(project_names) else ""
if proj_name:
proj_desc_with_prefix = f"[{proj_name}] {proj_desc}"
else:
proj_desc_with_prefix = proj_desc
work_descs.append(proj_desc_with_prefix)
merged_count += 1
resume["work_desc_tks"] = work_descs
# Clear project_desc_tks — all content is now in work_desc_tks
resume["project_desc_tks"] = []
logger.info(f"Merged project descs into work_desc_tks: {merged_count} merged, {skipped_count} skipped (duplicate)")
# --- Phase 4: Field completion ---
required_fields = [
"name_kwd", "gender_kwd", "phone_kwd", "email_tks",
"position_name_tks", "school_name_tks", "major_tks",
]
for field in required_fields:
if field not in resume:
if field.endswith("_tks"):
resume[field] = []
elif field.endswith("_int") or field.endswith("_flt"):
resume[field] = 0
else:
resume[field] = ""
# Clean internal marker fields (already handled in Phase 1, this is a safety fallback)
resume.pop("_name_confidence", None)
return resume
# ==================== Pipeline Orchestration & Chunk Construction ====================
def parse_resume(filename: str, binary: bytes, tenant_id, lang: str = "Chinese") -> tuple[dict, list[str], list[dict]]:
    """
    Run the resume parsing pipeline for a single file.

    Steps, in order:
        1. Text extraction via ``extract_text``.
        2. LLM structured extraction via ``parse_with_llm``.
        3. Regex parsing via ``parse_with_regex`` when the LLM result is empty.
        4. Post-processing via ``_postprocess_resume``.

    Args:
        filename: File name
        binary: File binary content
        tenant_id: Tenant identifier, forwarded unchanged to ``parse_with_llm``
        lang: Language, default "Chinese"

    Returns:
        (resume, lines, line_positions) tuple:
            - resume: Structured resume information dictionary
            - lines: Line text list as returned by ``extract_text``
            - line_positions: Per-line position list as returned by ``extract_text``
        When text extraction yields nothing, the tuple is
        ``({"name_kwd": <placeholder name>}, [], [])`` and no parsing is attempted.
    """
    # Step 1: text extraction
    indexed_text, lines, line_positions = extract_text(filename, binary)
    if not (indexed_text and lines):
        logger.warning(f"Text extraction returned empty: {filename}")
        placeholder_name = "Unknown" if _is_english(lang) else "未知"
        return {"name_kwd": placeholder_name}, [], []

    # Step 2: LLM structured extraction
    resume = parse_with_llm(indexed_text, lines, tenant_id, lang)

    # Step 3: regex fallback over the plain text when the LLM produced nothing
    if not resume:
        logger.info(f"LLM parsing failed, falling back to regex parsing: {filename}")
        resume = parse_with_regex("\n".join(lines), lang)

    # Step 4: post-processing
    return _postprocess_resume(resume, lines, lang), lines, line_positions
def _build_chunk_document(filename: str, resume: dict,
                          lang: str = "Chinese") -> list[dict]:
    """
    Build a list of document chunks from structured resume information.

    Chunks are generated in this order:
        1. One merged chunk per field group (basic info, education,
           skills & certificates, work overview) that has at least one value.
        2. For ``work_desc_tks`` / ``project_desc_tks`` list values, one chunk
           per non-empty list element.
        3. One chunk for every other field of the field map that has a value.
        4. If nothing was generated, a single fallback chunk holding the name.

    Key design: chunks from steps 1-3 redundantly carry the key identity fields
    (name, phone, email, etc.), so that when any chunk is retrieved the candidate
    can be identified immediately. A one-line resume summary is also appended to
    the chunk text when available.

    Args:
        filename: File name
        resume: Structured resume information dictionary
        lang: Language parameter, default "Chinese"

    Returns:
        Document chunk list. Each chunk contains content_with_weight, content_ltks,
        content_sm_ltks, the document title fields, and the positional fields
        written by ``add_positions``.
    """
    chunks = []
    # Get the corresponding field map version based on language parameter
    field_map = get_field_map(lang)
    _en = _is_english(lang)

    # Document-level fields copied into every chunk
    doc = {
        "docnm_kwd": filename,
        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)),
    }
    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])

    def _new_chunk(content: str) -> dict:
        """Create the chunk skeleton for *content*; the text is tokenized only once."""
        content_ltks = rag_tokenizer.tokenize(content)
        skeleton = {
            "content_with_weight": content,
            "content_ltks": content_ltks,
            "content_sm_ltks": rag_tokenizer.fine_grained_tokenize(content_ltks),
        }
        skeleton.update(doc)
        return skeleton

    def _ordinal_prefix(desc: str, idx: int) -> str:
        """Prefix used when no company/project name is available for a list element."""
        return f"{desc}{'#' if _en else ''}{idx + 1}{'' if _en else ''}"

    # Extract key identity fields, redundantly written to each chunk
    # These fields are small in size but high in information density
    _IDENTITY_FIELDS = ("name_kwd", "phone_kwd", "email_tks", "gender_kwd",
                        "highest_degree_kwd", "work_exp_flt")
    identity_meta = {}
    for ik in _IDENTITY_FIELDS:
        iv = resume.get(ik)
        if not iv:
            continue
        if ik.endswith("_tks"):
            identity_meta[ik] = rag_tokenizer.tokenize(
                " ".join(iv) if isinstance(iv, list) else str(iv)
            )
        elif ik.endswith("_kwd"):
            identity_meta[ik] = iv if isinstance(iv, list) else str(iv)
        elif ik.endswith("_flt"):
            try:
                identity_meta[ik] = float(iv)
            except (ValueError, TypeError):
                # Unparseable number: leave the field out of the identity metadata
                pass
        else:
            identity_meta[ik] = str(iv)

    # Build resume summary text, appended to chunk content to improve semantic retrieval recall
    summary_parts = []
    if resume.get("name_kwd"):
        summary_parts.append(f"{'Name' if _en else '姓名'}:{resume['name_kwd']}")
    if resume.get("phone_kwd"):
        summary_parts.append(f"{'Phone' if _en else '电话'}:{resume['phone_kwd']}")
    if resume.get("corporation_name_tks"):
        corp = resume["corporation_name_tks"]
        summary_parts.append(f"{'Company' if _en else '公司'}:{corp if isinstance(corp, str) else ' '.join(corp)}")
    if resume.get("highest_degree_kwd"):
        summary_parts.append(f"{'Degree' if _en else '学历'}:{resume['highest_degree_kwd']}")
    if resume.get("work_exp_flt"):
        if _en:
            summary_parts.append(f"Experience:{resume['work_exp_flt']}yrs")
        else:
            summary_parts.append(f"经验:{resume['work_exp_flt']}")
    resume_summary = " | ".join(summary_parts) if summary_parts else ""

    # List fields that need per-element splitting (each experience/project generates
    # a separate chunk to avoid oversized merged chunks)
    _SPLIT_LIST_FIELDS = {"work_desc_tks", "project_desc_tks"}
    # Basic info field set: merged into one chunk to avoid splitting name, phone, email, etc.
    _BASIC_INFO_FIELDS = {
        "name_kwd", "name_pinyin_kwd", "gender_kwd", "age_int",
        "phone_kwd", "email_tks", "birth_dt", "work_exp_flt",
        "position_name_tks", "expect_city_names_tks",
        "expect_position_name_tks",
    }
    # Education field set: degree, school, major, tags, etc. merged into one chunk
    _EDUCATION_FIELDS = {
        "first_school_name_tks", "first_degree_kwd", "highest_degree_kwd",
        "first_major_tks", "edu_first_fea_kwd", "degree_kwd", "major_tks",
        "school_name_tks", "sch_rank_kwd", "edu_fea_kwd", "edu_end_int",
    }
    # Skills & certificates field set: small fields, merged into one chunk
    _SKILL_CERT_FIELDS = {
        "skill_tks", "language_tks", "certificate_tks",
    }
    # Work overview field set: company list, industry, most recent company merged into one chunk
    _WORK_OVERVIEW_FIELDS = {
        "corporation_name_tks", "corp_nm_tks", "industry_name_tks",
    }
    # All merge groups: (field_set, group_title) tuple list
    _MERGE_GROUPS = [
        (_BASIC_INFO_FIELDS, "Basic Info" if _en else "基本信息"),
        (_EDUCATION_FIELDS, "Education" if _en else "教育背景"),
        (_SKILL_CERT_FIELDS, "Skills & Certificates" if _en else "技能与证书"),
        (_WORK_OVERVIEW_FIELDS, "Work Overview" if _en else "工作概况"),
    ]
    # Collect all fields that need merge processing; skip them during individual iteration
    _ALL_MERGED_FIELDS = set()
    for fields_set, _ in _MERGE_GROUPS:
        _ALL_MERGED_FIELDS.update(fields_set)

    # Merge fields by group, generating one chunk per group
    for fields_set, group_title in _MERGE_GROUPS:
        group_parts = []
        group_field_values = {}  # Structured value of each field, to be written into the chunk
        # Iterate the field map (not the set) so the text follows the field map's order
        for field_key in field_map:
            if field_key not in fields_set:
                continue
            value = resume.get(field_key)
            if not value:
                continue
            field_desc = field_map[field_key]
            if isinstance(value, list):
                text_value = " ".join(str(v) for v in value if v)
            else:
                text_value = str(value)
            if not text_value.strip():
                continue
            group_parts.append(f"{field_desc}: {text_value}")
            group_field_values[field_key] = value
        if not group_parts:
            continue
        content = f"{group_title}\n" + "\n".join(group_parts)
        if resume_summary:
            content += f"\n[{resume_summary}]"
        ck = _new_chunk(content)
        # Redundantly write identity fields
        for mk, mv in identity_meta.items():
            ck[mk] = mv
        # Write each field's structured value into chunk (for structured retrieval);
        # group values intentionally overwrite identity values of the same key
        for fk, fv in group_field_values.items():
            if fk.endswith("_tks"):
                text_val = " ".join(str(v) for v in fv) if isinstance(fv, list) else str(fv)
                ck[fk] = rag_tokenizer.tokenize(text_val)
            elif fk.endswith("_kwd"):
                ck[fk] = fv if isinstance(fv, list) else str(fv)
            elif fk.endswith("_int"):
                try:
                    ck[fk] = int(fv)
                except (ValueError, TypeError):
                    pass
            elif fk.endswith("_flt"):
                try:
                    ck[fk] = float(fv)
                except (ValueError, TypeError):
                    pass
            else:
                ck[fk] = str(fv)
        chunks.append(ck)

    # Iterate over field map, generating a chunk for each non-merged field with a value
    for field_key, field_desc in field_map.items():
        # Skip fields already processed in merge groups
        if field_key in _ALL_MERGED_FIELDS:
            continue
        value = resume.get(field_key)
        if not value:
            continue

        # For work/project descriptions (long text lists), split into multiple chunks per element
        if field_key in _SPLIT_LIST_FIELDS and isinstance(value, list):
            is_work = field_key == "work_desc_tks"
            is_project = field_key == "project_desc_tks"
            # Company / project names add context to each description
            corp_list = resume.get("corp_nm_tks", []) if is_work else []
            project_list = resume.get("project_tks", []) if is_project else []
            # Detailed info for each work experience entry (time period, years)
            work_details = resume.get("_work_exp_details", []) if is_work else []
            for idx, item in enumerate(value):
                item_text = str(item).strip()
                if not item_text:
                    continue
                # Add company/project name prefix to each description for context
                if is_work and idx < len(work_details):
                    # Use detailed info to build prefix: company, time range, years
                    detail = work_details[idx]
                    company = detail.get("company", "")
                    start_d = detail.get("start_date", "")
                    end_d = detail.get("end_date", "")
                    # A stored None would make the comparison below raise; treat it as 0
                    years = detail.get("years") or 0
                    time_parts = []
                    if start_d:
                        time_parts.append(f"{start_d}-{end_d}" if end_d else str(start_d))
                    if years > 0:
                        time_parts.append(f"{years}{'yrs' if _en else ''}")
                    time_text = " ".join(time_parts)
                    if company and time_text:
                        content_prefix = f"{field_desc}{company} {time_text}"
                    elif company:
                        content_prefix = f"{field_desc}{company}"
                    else:
                        content_prefix = _ordinal_prefix(field_desc, idx)
                elif is_work and idx < len(corp_list):
                    content_prefix = f"{field_desc}{corp_list[idx]}"
                elif is_project and idx < len(project_list):
                    content_prefix = f"{field_desc}{project_list[idx]}"
                else:
                    content_prefix = _ordinal_prefix(field_desc, idx)
                if resume_summary:
                    content = f"{content_prefix}: {item_text}\n[{resume_summary}]"
                else:
                    content = f"{content_prefix}: {item_text}"
                ck = _new_chunk(content)
                # Redundantly write identity fields
                for mk, mv in identity_meta.items():
                    if mk != field_key:
                        ck[mk] = mv
                # Tokenization result for current segment
                ck[field_key] = rag_tokenizer.tokenize(item_text)
                chunks.append(ck)
            continue

        # Merge list values into text
        if isinstance(value, list):
            text_value = " ".join(str(v) for v in value if v)
        else:
            text_value = str(value)
        if not text_value.strip():
            continue
        # Build chunk content: "field_desc: field_value", append summary for semantic association
        if resume_summary and field_key not in ("name_kwd", "phone_kwd"):
            content = f"{field_desc}: {text_value}\n[{resume_summary}]"
        else:
            content = f"{field_desc}: {text_value}"
        ck = _new_chunk(content)
        # Redundantly write identity fields (do not overwrite the current field's own value)
        for mk, mv in identity_meta.items():
            if mk != field_key:
                ck[mk] = mv
        # Write resume field value into the chunk's corresponding field (for structured retrieval)
        if field_key.endswith("_tks"):
            ck[field_key] = rag_tokenizer.tokenize(text_value)
        elif field_key.endswith("_kwd"):
            ck[field_key] = value if isinstance(value, list) else text_value
        elif field_key.endswith("_int"):
            try:
                ck[field_key] = int(value)
            except (ValueError, TypeError):
                pass
        elif field_key.endswith("_flt"):
            try:
                ck[field_key] = float(value)
            except (ValueError, TypeError):
                pass
        else:
            ck[field_key] = text_value
        chunks.append(ck)

    # If no chunks were generated, create at least one chunk containing the name.
    # `or` (rather than a .get default) so an empty name_kwd also falls back to the placeholder.
    if not chunks:
        name = resume.get("name_kwd") or ("Unknown" if _en else "未知")
        chunks.append(_new_chunk(f"{'Name' if _en else '姓名'}: {name}"))

    # Write positional info to each chunk.
    #
    # Resume chunks are split by semantic fields (basic info, education, work description, etc.),
    # not by PDF physical regions, so coordinates are assigned from the chunk generation order:
    # every chunk is placed on page 0 and its top/bottom is its index in the list.
    #
    # add_positions input format: [(page, left, right, top, bottom), ...]
    from rag.nlp import add_positions
    for i, ck in enumerate(chunks):
        add_positions(ck, [[0, 0, 0, i, i]])
    return chunks
def _blackout_text_regions(image: "np.ndarray", meta_blocks: list[dict], page_idx: int,
                           pdf_to_img_scale: float) -> "np.ndarray":
    """
    Return a copy of the page image with the given page's text blocks painted black.

    Intended use (per the original design note): text already obtained from PDF
    metadata is blacked out before OCR, so OCR only sees what metadata missed.

    Args:
        image: Page image (numpy array); it is copied, not modified
        meta_blocks: Text blocks; each is read for "page", "x0", "top", "x1", "bottom"
        page_idx: Only blocks whose "page" equals this value are painted
        pdf_to_img_scale: Factor applied to block coordinates to get image pixels

    Returns:
        Image copy with the selected text regions filled with black
    """
    import cv2

    margin = 2  # Extra pixels around each block to ensure full coverage
    canvas = image.copy()
    img_height, img_width = canvas.shape[0], canvas.shape[1]

    for block in meta_blocks:
        if block.get("page") != page_idx:
            continue
        # Scale to pixels, grow by the margin, then clamp to the image boundaries
        left = max(0, int(block["x0"] * pdf_to_img_scale) - margin)
        upper = max(0, int(block["top"] * pdf_to_img_scale) - margin)
        right = min(img_width, int(block["x1"] * pdf_to_img_scale) + margin)
        lower = min(img_height, int(block["bottom"] * pdf_to_img_scale) + margin)
        # thickness -1 draws a filled rectangle
        cv2.rectangle(canvas, (left, upper), (right, lower), (0, 0, 0), -1)
    return canvas
def chunk(filename, binary, tenant_id, from_page=0, to_page=100000,
          lang="Chinese", callback=None, **kwargs):
    """
    Resume parsing entry function.

    Parses the file with ``parse_resume`` and turns the result into chunks with
    ``_build_chunk_document``, reporting progress through ``callback``.

    Args:
        filename: File name
        binary: File binary content
        tenant_id: Tenant identifier, forwarded to ``parse_resume``
        from_page: Start page number (not used in resume parsing)
        to_page: End page number (not used in resume parsing)
        lang: Language, default "Chinese"
        callback: Progress callback function, called as callback(progress, message);
            a no-op is used when omitted
        **kwargs: Other parameters; accepted and ignored by this function

    Returns:
        Document chunk list; an empty list if any exception occurred
        (the failure is logged and reported as callback(-1, message))
    """
    if callback is None:
        def _noop(prog, msg):
            return None
        callback = _noop

    try:
        callback(0.1, "Starting resume parsing...")
        # Only the structured resume is needed here; the line data is discarded
        resume, _lines, _line_positions = parse_resume(filename, binary, tenant_id, lang)
        callback(0.6, "Resume structured extraction complete")
        built = _build_chunk_document(filename, resume, lang)
        callback(0.9, f"Document chunk construction complete, {len(built)} chunks total")
        callback(1.0, "Resume parsing complete")
    except Exception as e:
        logger.exception(f"Resume parsing exception: {filename}")
        callback(-1, f"Resume parsing failed: {str(e)}")
        return []
    else:
        return built
def _resort_page_with_layout(page_blocks: list[dict], layout_regions: list[dict]) -> list[dict]:
if not page_blocks:
return []
if not layout_regions:
return sorted(page_blocks, key=lambda b: (
(b.get("top", 0) + b.get("bottom", 0)) / 2,
(b.get("x0", 0) + b.get("x1", 0)) / 2,
))
type_groups: dict[str, list] = {}
for lt in layout_regions:
tp = lt.get("type", "")
type_groups.setdefault(tp, []).append(lt)
entries = []
for tp, group in type_groups.items():
for idx, lt in enumerate(group):
key = f"{tp}-{idx}"
x0, x1 = lt.get("x0", 0), lt.get("x1", 0)
top, bottom = lt.get("top", 0), lt.get("bottom", 0)
entries.append({
"key": key, "type": tp,
"x0": x0, "top": top, "x1": x1, "bottom": bottom,
"cy": (top + bottom) / 2, "cx": (x0 + x1) / 2,
})
for b in page_blocks:
if b.get("layoutno"):
continue
b_cx = (b.get("x0", 0) + b.get("x1", 0)) / 2
b_cy = (b.get("top", 0) + b.get("bottom", 0)) / 2
for entry in entries:
if (entry["x0"] <= b_cx <= entry["x1"]
and entry["top"] <= b_cy <= entry["bottom"]):
b["layoutno"] = entry["key"]
b["layout_type"] = entry["type"]
break
for entry in entries:
layout_key = entry["key"]
layout_area = (entry["x1"] - entry["x0"]) * (entry["bottom"] - entry["top"])
if layout_area <= 0:
continue
layout_blocks = [b for b in page_blocks if b.get("layoutno") == layout_key]
if not layout_blocks:
continue
text_total_area = sum(
(b.get("x1", 0) - b.get("x0", 0)) * (b.get("bottom", 0) - b.get("top", 0))
for b in layout_blocks
)
if text_total_area / layout_area < 0.075:
for b in layout_blocks:
b["layoutno"] = ""
b["layout_type"] = ""
entry_map = {e["key"]: e for e in entries}
for b in page_blocks:
b_cx = (b.get("x0", 0) + b.get("x1", 0)) / 2
b_cy = (b.get("top", 0) + b.get("bottom", 0)) / 2
b["_x_center"] = b_cx
b["_y_center"] = b_cy
layoutno = b.get("layoutno", "")
if layoutno and layoutno in entry_map:
b["_lx_center"] = entry_map[layoutno]["cx"]
b["_ly_center"] = entry_map[layoutno]["cy"]
else:
b["_lx_center"] = b_cx
b["_ly_center"] = b_cy
active_keys = {b.get("layoutno") for b in page_blocks if b.get("layoutno")}
active_entries = [e for e in entries if e["key"] in active_keys]
for b in page_blocks:
if b.get("layoutno"):
continue
if not active_entries:
continue
b_cx, b_cy = b["_x_center"], b["_y_center"]
min_dist = float("inf")
best_cx, best_cy = b_cx, b_cy
for ae in active_entries:
lx1, ly1, lx2, ly2 = ae["x0"], ae["top"], ae["x1"], ae["bottom"]
if b_cy < ly1:
dy = ly1 - b_cy
elif b_cy > ly2:
dy = b_cy - ly2
else:
dy = 0
if b_cx < lx1:
dx = lx1 - b_cx
elif b_cx > lx2:
dx = b_cx - lx2
else:
dx = 0
dist = (dx ** 2 + dy ** 2) ** 0.5
if dist < min_dist:
min_dist = dist
best_cx, best_cy = ae["cx"], ae["cy"]
b["_lx_center"] = best_cx
b["_ly_center"] = best_cy
sorted_blocks = sorted(page_blocks, key=lambda b: (
b.get("_ly_center", 0),
b.get("_lx_center", 0),
b.get("_y_center", 0),
b.get("_x_center", 0),
))
for b in sorted_blocks:
b.pop("_ly_center", None)
b.pop("_lx_center", None)
b.pop("_y_center", None)
b.pop("_x_center", None)
return sorted_blocks
def _layout_detect_reorder(blocks: list[dict], binary: bytes) -> list[dict]:
if not blocks:
return blocks
recognizer = _get_layout_recognizer()
if recognizer is None:
logger.info("Layout detector unavailable, falling back to heuristic sorting")
return _layout_aware_reorder(blocks)
try:
import pdfplumber
pages_blocks: dict[int, list[dict]] = {}
for b in blocks:
pg = b.get("page", 0)
pages_blocks.setdefault(pg, []).append(b)
page_indices = sorted(pages_blocks.keys())
image_list = []
ocr_res_per_page = []
with pdfplumber.open(BytesIO(binary)) as pdf:
for pg in page_indices:
if pg >= len(pdf.pages):
continue
page = pdf.pages[pg]
pil_img = page.to_image(resolution=72 * 3).annotated
image_list.append(pil_img)
page_bxs = []
for b in pages_blocks[pg]:
page_bxs.append({
"x0": float(b["x0"]),
"top": float(b["top"]),
"x1": float(b["x1"]),
"bottom": float(b["bottom"]),
"text": b["text"],
"page": pg,
})
ocr_res_per_page.append(page_bxs)
if not image_list:
return _layout_aware_reorder(blocks)
tagged_blocks, page_layouts = recognizer(
image_list, ocr_res_per_page, scale_factor=3, thr=0.2, drop=False
)
if not tagged_blocks:
logger.warning("Layout detector unavailable, falling back to heuristic sorting")
return _layout_aware_reorder(blocks)
tagged_per_page: dict[int, list[dict]] = {}
for b in tagged_blocks:
pg = b.get("page", 0)
tagged_per_page.setdefault(pg, []).append(b)
sorted_all = []
total_layout_count = 0
for pn, pg in enumerate(page_indices):
page_bxs = tagged_per_page.get(pg, [])
lts = page_layouts[pn] if pn < len(page_layouts) else []
total_layout_count += len(lts)
sorted_page = _resort_page_with_layout(page_bxs, lts)
sorted_all.extend(sorted_page)
for b in sorted_all:
if "page" not in b:
b["page"] = 0
logger.info(f"YOLOv10 detector completed {len(sorted_all)} total chunks"
f"checked {total_layout_count} layout")
return sorted_all
except Exception as e:
logger.warning(f"Layout detector unavailable, falling back to heuristic sorting: {e}")
return _layout_aware_reorder(blocks)
def _text_shingles(text: str, n: int = 5) -> set[tuple[int, ...]]:
    """
    Build the set of token n-grams (shingles) of a text.

    The text is encoded with the module-level ``_tiktoken_encoding`` and every
    window of ``n`` consecutive token ids becomes one shingle, so word order
    is reflected in the fingerprint.

    Args:
        text: Original text
        n: Shingling window size, default 5

    Returns:
        Set of shingles (each a tuple of token ids). Empty when the text is
        empty or the encoder is unavailable. When the text has fewer than
        ``n`` tokens, the set holds the whole token sequence as its only shingle.
    """
    encoder = _tiktoken_encoding
    if not text or encoder is None:
        return set()

    token_ids = encoder.encode(text)
    window_count = len(token_ids) - n + 1
    if window_count <= 0:
        # Text too short for a full window: the entire sequence is the single shingle
        return {tuple(token_ids)} if token_ids else set()
    return {tuple(token_ids[start:start + n]) for start in range(window_count)}
def _shingling_jaccard(text1: str, text2: str, n: int = 5) -> float:
    """
    Compute Jaccard similarity between two texts using token shingling.

    Args:
        text1: First text
        text2: Second text
        n: Shingling window size

    Returns:
        Jaccard similarity in [0.0, 1.0]: |A ∩ B| / |A ∪ B| over the two
        shingle sets. When neither text yields any shingle, the result is 1.0
        if both texts are empty or equal, else 0.0.
    """
    s1 = _text_shingles(text1, n=n)
    s2 = _text_shingles(text2, n=n)
    if not s1 and not s2:
        # No shingles on either side. This happens for two empty texts, but also
        # for ANY texts when the tokenizer is unavailable; in that case returning
        # 1.0 would make callers treat unrelated texts as duplicates, so only
        # report a match when the raw texts really are the same.
        if not text1 and not text2:
            return 1.0
        return 1.0 if text1 == text2 else 0.0
    return len(s1 & s2) / len(s1 | s2)