Mirror of https://github.com/infiniflow/ragflow.git, synced 2026-05-03 00:37:48 +08:00
Fix: (resume) Cross-verify project experience and work experience, and remove duplicate text (#13323)
Cross-verify project experience and work experience, and remove duplicate text

---------

Co-authored-by: Aron.Yao <yaowei@192.168.1.68>
Co-authored-by: Aron.Yao <yaowei@yaoweideMacBook-Pro.local>
@@ -1892,7 +1892,142 @@ def _postprocess_resume(resume: dict, lines: list[str], lang: str = "Chinese") -
                seen.add(item_str)
                deduped.append(item_str)
            resume[list_field] = deduped

    # --- Phase 3.4: work_desc_tks dedup by company name + time period ---
    # The LLM often extracts the same company's content twice: once from the "Work Experience"
    # section and once from the "Project Experience" section, producing two entries for the
    # same employer. These have different descriptions (daily work vs project details), so
    # content-based Jaccard dedup cannot catch them. Instead, we detect duplicate companies by
    # checking whether one company name is a substring of the other AND their time periods overlap.
    # This also fixes the inflated work_exp_flt (e.g. 25.5 years instead of ~14).
    work_descs = resume.get("work_desc_tks", [])
    if len(work_descs) > 1:
        corp_names = resume.get("corp_nm_tks", [])
        work_details = resume.get("_work_exp_details", [])
        positions = resume.get("position_name_tks", [])
        kept_indices = []
        for i in range(len(work_descs)):
            is_dup = False
            corp_i = _normalize_for_comparison(corp_names[i]) if i < len(corp_names) else ""
            detail_i = work_details[i] if i < len(work_details) else {}
            start_i = detail_i.get("start_date", "")
            end_i = detail_i.get("end_date", "")
            # Parse dates for entry i once (reused across the inner loop)
            dt_start_i = _parse_date_str(start_i) if start_i else None
            dt_end_i = _parse_date_str(end_i) if end_i else None
            for j in kept_indices:
                # Strategy A: company name substring + time period overlap
                corp_j = _normalize_for_comparison(corp_names[j]) if j < len(corp_names) else ""
                if corp_i and corp_j:
                    shorter_c, longer_c = (corp_i, corp_j) if len(corp_i) <= len(corp_j) else (corp_j, corp_i)
                    if shorter_c in longer_c:
                        # Check time period overlap using parsed dates.
                        # Two intervals [s1,e1] and [s2,e2] overlap iff s1 <= e2 and s2 <= e1.
                        # Use <= because resume dates are month-granularity (e.g. "2018.03" means "sometime in March 2018").
                        detail_j = work_details[j] if j < len(work_details) else {}
                        start_j = detail_j.get("start_date", "")
                        end_j = detail_j.get("end_date", "")
                        dt_start_j = _parse_date_str(start_j) if start_j else None
                        dt_end_j = _parse_date_str(end_j) if end_j else None
                        # Need at least one valid date on each side to compare
                        if dt_start_i and dt_start_j:
                            # Use far-future as the default end if missing
                            eff_end_i = dt_end_i or datetime.datetime(2099, 12, 1)
                            eff_end_j = dt_end_j or datetime.datetime(2099, 12, 1)
                            if dt_start_i <= eff_end_j and dt_start_j <= eff_end_i:
                                is_dup = True
                                break
                        elif (start_i and start_j and start_i == start_j) or \
                                (end_i and end_j and end_i == end_j):
                            # Fallback: exact string match if date parsing fails
                            is_dup = True
                            break
                # Strategy B: content-based Jaccard similarity (fallback)
                norm_i = _normalize_for_comparison(work_descs[i])
                norm_j = _normalize_for_comparison(work_descs[j])
                shorter, longer = (norm_i, norm_j) if len(norm_i) <= len(norm_j) else (norm_j, norm_i)
                if shorter and longer and shorter in longer:
                    is_dup = True
                    break
                jac = _shingling_jaccard(work_descs[i], work_descs[j], n=5)
                if jac > 0.5:
                    is_dup = True
                    break
            if is_dup:
                dup_corp = corp_names[i] if i < len(corp_names) else f"#{i+1}"
                logger.debug(f"Work desc internal duplicate removed: {dup_corp}")
            else:
                kept_indices.append(i)
        # Only update when entries were actually removed
        if len(kept_indices) < len(work_descs):
            resume["work_desc_tks"] = [work_descs[i] for i in kept_indices]
            if corp_names:
                resume["corp_nm_tks"] = [corp_names[i] for i in kept_indices if i < len(corp_names)]
            if work_details:
                resume["_work_exp_details"] = [work_details[i] for i in kept_indices if i < len(work_details)]
            if positions:
                resume["position_name_tks"] = [positions[i] for i in kept_indices if i < len(positions)]
            # Recalculate work years based on the deduplicated entries
            new_details = resume.get("_work_exp_details", [])
            if new_details:
                recalc_years = sum(d.get("years", 0) for d in new_details)
                recalc_years = round(recalc_years, 1)
                if recalc_years > 0:
                    resume["work_exp_flt"] = recalc_years
                    logger.info(f"Work years recalculated: {recalc_years} yrs (before dedup: {_calculate_work_years([{'start_date': d.get('start_date',''), 'end_date': d.get('end_date','')} for d in work_details])} yrs)")
            new_corps = resume.get("corp_nm_tks", [])
            if new_corps:
                resume["corporation_name_tks"] = new_corps[0]
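The Strategy A check above hinges on the interval-overlap test for month-granularity dates. The sketch below is illustrative only: parse_month and same_company_overlap are simplified stand-ins for the module's _parse_date_str and _normalize_for_comparison helpers (which are not part of this hunk), and the two entries are hypothetical.

import datetime
import re


def parse_month(s):
    """Parse 'YYYY.MM' / 'YYYY-MM' / 'YYYY/MM' into a datetime, or return None."""
    m = re.match(r"(\d{4})[./-](\d{1,2})", s or "")
    return datetime.datetime(int(m.group(1)), int(m.group(2)), 1) if m else None


def same_company_overlap(a, b):
    """True when one company name contains the other AND their periods overlap."""
    name_a, name_b = a["corp"].lower(), b["corp"].lower()
    shorter, longer = (name_a, name_b) if len(name_a) <= len(name_b) else (name_b, name_a)
    if shorter not in longer:
        return False
    start_a, start_b = parse_month(a["start"]), parse_month(b["start"])
    if not (start_a and start_b):
        return False
    far_future = datetime.datetime(2099, 12, 1)  # default end for ongoing jobs
    end_a = parse_month(a.get("end", "")) or far_future
    end_b = parse_month(b.get("end", "")) or far_future
    # [s1, e1] and [s2, e2] overlap iff s1 <= e2 and s2 <= e1 (month granularity, so <=)
    return start_a <= end_b and start_b <= end_a


# Hypothetical "Work Experience" vs "Project Experience" entries for one employer
work = {"corp": "Acme Technology Co., Ltd.", "start": "2018.03", "end": "2021.06"}
proj = {"corp": "Acme Technology", "start": "2019.01", "end": "2020.12"}
print(same_company_overlap(work, proj))  # True -> the second entry would be dropped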
    # --- Phase 3.5: Merge project_desc_tks into work_desc_tks ---
    # Instead of a complex cross-dedup, we simply merge unique project descriptions into
    # work_desc_tks and clear project_desc_tks. This avoids the problem where the LLM extracts
    # the same content into both fields with slightly different wording.
    # After the merge, project_desc_tks is emptied so _build_chunk_document won't generate
    # duplicate chunks. Project names are preserved in project_tks for reference.
    work_descs = resume.get("work_desc_tks", [])
    project_descs = resume.get("project_desc_tks", [])
    # Save pre-merge project descriptions for debugging
    resume["_raw_project_descs"] = list(project_descs) if project_descs else []
    if project_descs:
        project_names = resume.get("project_tks", [])
        merged_count = 0
        skipped_count = 0
        for i, proj_desc in enumerate(project_descs):
            norm_proj = _normalize_for_comparison(proj_desc)
            if not norm_proj:
                continue
            # Check if this project desc already exists in work_descs (exact or near-duplicate)
            already_exists = False
            for wd in work_descs:
                norm_wd = _normalize_for_comparison(wd)
                if not norm_wd:
                    continue
                # Substring containment check
                shorter, longer = (norm_proj, norm_wd) if len(norm_proj) <= len(norm_wd) else (norm_wd, norm_proj)
                if shorter in longer:
                    already_exists = True
                    break
                # Jaccard similarity check
                if _shingling_jaccard(proj_desc, wd, n=5) > 0.5:
                    already_exists = True
                    break
            if already_exists:
                skipped_count += 1
                proj_name = project_names[i] if i < len(project_names) else f"#{i+1}"
                logger.debug(f"Project desc already in work_desc, skipped: {proj_name}")
            else:
                # Append to work_desc_tks with a project name prefix for context
                proj_name = project_names[i] if i < len(project_names) else ""
                if proj_name:
                    proj_desc_with_prefix = f"[{proj_name}] {proj_desc}"
                else:
                    proj_desc_with_prefix = proj_desc
                work_descs.append(proj_desc_with_prefix)
                merged_count += 1
        resume["work_desc_tks"] = work_descs
        # Clear project_desc_tks; all content is now in work_desc_tks
        resume["project_desc_tks"] = []
        logger.info(f"Merged project descs into work_desc_tks: {merged_count} merged, {skipped_count} skipped (duplicate)")

    # --- Phase 4: Field completion ---
    required_fields = [
        "name_kwd", "gender_kwd", "phone_kwd", "email_tks",
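For reference, here is a toy illustration of the Phase 3.5 merge decision. It is a sketch only: a lowercased-substring test stands in for the module's _normalize_for_comparison and _shingling_jaccard checks, and the descriptions and project names are hypothetical.

work_descs = ["Maintained the data pipeline and on-call rotation at Acme."]
project_descs = [
    "Maintained the data pipeline and on-call rotation at Acme.",  # duplicate -> skipped
    "Built a resume-parsing service with layout-aware chunking.",  # unique -> merged
]
project_names = ["Data Platform", "Resume Parser"]

for name, desc in zip(project_names, project_descs):
    dup = any(desc.lower() in wd.lower() or wd.lower() in desc.lower() for wd in work_descs)
    if not dup:
        # Unique project descriptions get a "[project name] " prefix for context
        work_descs.append(f"[{name}] {desc}")

project_descs = []  # cleared so downstream chunking does not emit duplicate chunks
print(work_descs)
# ['Maintained the data pipeline and on-call rotation at Acme.',
#  '[Resume Parser] Built a resume-parsing service with layout-aware chunking.']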
@@ -2561,3 +2696,45 @@ def _layout_detect_reorder(blocks: list[dict], binary: bytes) -> list[dict]:
    except Exception as e:
        logger.warning(f"Layout detector unavailable, falling back to heuristic sorting: {e}")
        return _layout_aware_reorder(blocks)


def _text_shingles(text: str, n: int = 5) -> set[tuple[int, ...]]:
    """
    Generate a text fingerprint set using tiktoken BPE tokenization + n-gram shingling.

    Compared to character-level splitting, BPE tokens have better granularity,
    and n-grams preserve word order, providing a more accurate overlap measurement.

    Args:
        text: Original text
        n: Shingling window size, default 5

    Returns:
        Set of n-gram shingles (each shingle is a tuple of token ids)
    """
    if not text or _tiktoken_encoding is None:
        return set()
    tokens = _tiktoken_encoding.encode(text)
    if len(tokens) < n:
        # Text too short: return the entire token sequence as a single shingle
        return {tuple(tokens)} if tokens else set()
    return {tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)}


def _shingling_jaccard(text1: str, text2: str, n: int = 5) -> float:
    """
    Compute the Jaccard similarity between two texts using tiktoken shingling.

    Args:
        text1: First text
        text2: Second text
        n: Shingling window size

    Returns:
        Jaccard similarity in [0.0, 1.0]
    """
    s1 = _text_shingles(text1, n=n)
    s2 = _text_shingles(text2, n=n)
    union = s1 | s2
    if not union:
        return 1.0
    return len(s1 & s2) / len(union)
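A usage sketch of the new shingling helpers follows. It assumes tiktoken is installed and uses tiktoken.get_encoding("cl100k_base") as a stand-in for the module-level _tiktoken_encoding, which is initialized elsewhere in the file and is not part of this diff; the repo's actual encoding choice may differ, and the texts are hypothetical.

import tiktoken

enc = tiktoken.get_encoding("cl100k_base")  # assumption: the module's _tiktoken_encoding may use a different encoding


def text_shingles(text, n=5):
    tokens = enc.encode(text)
    if len(tokens) < n:
        return {tuple(tokens)} if tokens else set()
    return {tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)}


def shingling_jaccard(a, b, n=5):
    s1, s2 = text_shingles(a, n), text_shingles(b, n)
    union = s1 | s2
    return len(s1 & s2) / len(union) if union else 1.0


near_dup_a = ("Responsible for the design and development of the order management system, "
              "built the payment reconciliation pipeline, and led a team of five backend engineers.")
near_dup_b = ("Responsible for the design and development of the order management system, "
              "built the payment reconciliation pipeline, and led a team of four backend engineers.")
unrelated = "Planned marketing campaigns and managed social media accounts for the brand."

print(round(shingling_jaccard(near_dup_a, near_dup_b), 2))  # high: a one-word change leaves most shingles shared
print(round(shingling_jaccard(near_dup_a, unrelated), 2))   # near 0.0: unrelated texts share almost no shingles

With texts of this length, the near-duplicate pair should land well above the 0.5 threshold used by the dedup code above, while the unrelated pair stays near zero.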