Feat: support context window for docx (#12455)

### What problem does this PR solve?

Feat: support context window for docx. Table and image chunks now carry `context_above`/`context_below` text taken from neighboring text chunks, within a configurable token budget.

#12303

Done:
- [x] naive.py
- [x] one.py

TODO:
- [ ] book.py
- [ ] manual.py

Fix: incorrect image position
Fix: incorrect chunk type tag
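
For reviewers, a minimal sketch of how the new knobs are meant to be used (the parser_config key names below are assumptions; only the `table_context_size` / `image_context_size` variables appear in this diff):

```python
# Hypothetical parser_config (key names assumed, not confirmed by this diff):
# a token budget of neighboring text is attached to each table/image chunk
# as context_above / context_below.
parser_config = {
    "chunk_token_num": 128,
    "delimiter": "\n!?。;!?",
    "table_context_size": 64,   # tokens of surrounding text per table chunk
    "image_context_size": 64,   # tokens of surrounding text per image chunk
}
```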

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
Magicbook1108
2026-01-07 15:08:17 +08:00
committed by GitHub
parent a442c9cac6
commit 011bbe9556
7 changed files with 397 additions and 120 deletions

View File

@ -529,6 +529,7 @@ def cancel_all_task_of(doc_id):
def has_canceled(task_id):
try:
if REDIS_CONN.get(f"{task_id}-cancel"):
logging.info(f"Task: {task_id} has been canceled")
return True
except Exception as e:
logging.exception(e)
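
For context, a sketch of the producer side this check pairs with; `cancel_task` below is illustrative (not in this diff), assuming REDIS_CONN exposes a redis-py-style `set`:

```python
def cancel_task(task_id: str):
    # Hypothetical counterpart (not in this diff): set the flag that
    # has_canceled() polls, so the running task stops at its next check.
    REDIS_CONN.set(f"{task_id}-cancel", "1")
```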

View File

@ -25,7 +25,7 @@ from rag.app.picture import vision_llm_chunk as picture_vision_llm_chunk
from rag.prompts.generator import vision_llm_figure_describe_prompt, vision_llm_figure_describe_prompt_with_context
from rag.nlp import append_context2table_image4pdf
def vision_figure_parser_figure_data_wrapper(figures_data_without_positions):
if not figures_data_without_positions:
return []
@ -38,7 +38,6 @@ def vision_figure_parser_figure_data_wrapper(figures_data_without_positions):
if isinstance(figure_data[1], Image.Image)
]
def vision_figure_parser_docx_wrapper(sections, tbls, callback=None, **kwargs):
if not sections:
return tbls
@ -124,8 +123,56 @@ def vision_figure_parser_pdf_wrapper(tbls, callback=None, **kwargs):
return tbls
shared_executor = ThreadPoolExecutor(max_workers=10)
def vision_figure_parser_docx_wrapper_naive(chunks, idx_lst, callback=None, **kwargs):
if not chunks:
return []
try:
vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT)
callback(0.7, "Visual model detected. Attempting to enhance figure extraction...")
except Exception:
vision_model = None
logging.info("No vision model available; skip figure description enhancement.")
if vision_model:
@timeout(30, 3)
def worker(idx, ck):
context_above = ck.get("context_above", "")
context_below = ck.get("context_below", "")
if context_above or context_below:
prompt = vision_llm_figure_describe_prompt_with_context(
# context_above plus the caption text, if any
context_above=ck.get("context_above", "") + ck.get("text", ""),
context_below=ck.get("context_below", ""),
)
logging.info(f"[VisionFigureParser] figure={idx} context_above_len={len(context_above)} context_below_len={len(context_below)} prompt=with_context")
logging.info(f"[VisionFigureParser] figure={idx} context_above_snippet={context_above[:512]}")
logging.info(f"[VisionFigureParser] figure={idx} context_below_snippet={context_below[:512]}")
else:
prompt = vision_llm_figure_describe_prompt()
logging.info(f"[VisionFigureParser] figure={idx} context_len=0 prompt=default")
description_text = picture_vision_llm_chunk(
binary=ck.get("image"),
vision_model=vision_model,
prompt=prompt,
callback=callback,
)
return idx, description_text
with ThreadPoolExecutor(max_workers=10) as executor:
futures = [
executor.submit(worker, idx, chunks[idx])
for idx in idx_lst
]
for future in as_completed(futures):
idx, description = future.result()
chunks[idx]['text'] += description
shared_executor = ThreadPoolExecutor(max_workers=10)
class VisionFigureParser:
def __init__(self, vision_model, figures_data, *args, **kwargs):

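A minimal sketch of the chunk shape the new wrapper consumes, inferred from the fields read in `worker()`; the image, tenant id, and callback are placeholders:

```python
from PIL import Image

# The wrapper appends the vision model's description to chunks[idx]["text"]
# for every idx in idx_lst; context fields are optional.
chunks = [
    {"text": "Intro paragraph.", "image": None, "ck_type": "text"},
    {
        "text": "Figure 1: throughput.",      # caption, if present
        "image": Image.new("RGB", (8, 8)),    # placeholder figure
        "ck_type": "image",
        "context_above": "We benchmarked three configurations.",
        "context_below": "Throughput peaked on the third one.",
    },
]
vision_figure_parser_docx_wrapper_naive(
    chunks, idx_lst=[1], callback=lambda prog, msg: None, tenant_id="tenant-id"
)
```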
View File

@ -87,10 +87,18 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
callback(0.1, "Start to parse.")
doc_parser = naive.Docx()
# TODO: the table of contents needs to be removed
sections, tbls = doc_parser(
main_sections = doc_parser(
filename, binary=binary, from_page=from_page, to_page=to_page)
sections = []
tbls = []
for text, image, html in main_sections:
sections.append((text, image))
tbls.append(((None, html), ""))
remove_contents_table(sections, eng=is_english(
random_choices([t for t, _ in sections], k=200)))
tbls = vision_figure_parser_docx_wrapper(sections=sections, tbls=tbls, callback=callback, **kwargs)
# tbls = [((None, lns), None) for lns in tbls]
sections = [(item[0], item[1] if item[1] is not None else "") for item in sections if

View File

@ -23,6 +23,8 @@ from timeit import default_timer as timer
from docx import Document
from docx.image.exceptions import InvalidImageStreamError, UnexpectedEndOfFileError, UnrecognizedImageError
from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship
from docx.table import Table as DocxTable
from docx.text.paragraph import Paragraph
from docx.opc.oxml import parse_xml
from markdown import markdown
from PIL import Image
@ -33,15 +35,15 @@ from api.db.services.llm_service import LLMBundle
from rag.utils.file_utils import extract_embed_file, extract_links_from_pdf, extract_links_from_docx, extract_html
from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownElementExtractor, MarkdownParser, \
PdfParser, TxtParser
from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_docx_wrapper, \
from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_docx_wrapper_naive, \
vision_figure_parser_pdf_wrapper
from deepdoc.parser.pdf_parser import PlainParser, VisionParser
from deepdoc.parser.docling_parser import DoclingParser
from deepdoc.parser.tcadp_parser import TCADPParser
from common.parser_config_utils import normalize_layout_recognizer
from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, \
tokenize_chunks, tokenize_chunks_with_images, tokenize_table, attach_media_context, append_context2table_image4pdf
tokenize_chunks, doc_tokenize_chunks_with_images, tokenize_table, append_context2table_image4pdf, tokenize_chunks_with_images, \
attach_media_context # noqa: F401
def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls=None,
**kwargs):
@ -343,67 +345,116 @@ class Docx(DocxParser):
pn = 0
lines = []
last_image = None
for p in self.doc.paragraphs:
table_idx = 0
def flush_last_image():
nonlocal last_image, lines
if last_image is not None:
lines.append({"text": "", "image": last_image, "table": None, "style": "Image"})
last_image = None
for block in self.doc._element.body:
if pn > to_page:
break
if from_page <= pn < to_page:
if p.text.strip():
if p.style and p.style.name == 'Caption':
former_image = None
if lines and lines[-1][1] and lines[-1][2] != 'Caption':
former_image = lines[-1][1].pop()
elif last_image:
former_image = last_image
last_image = None
lines.append((self.__clean(p.text), [former_image], p.style.name))
if block.tag.endswith('p'):
p = Paragraph(block, self.doc)
if from_page <= pn < to_page:
text = p.text.strip()
style_name = p.style.name if p.style else ""
if text:
if style_name == "Caption":
former_image = None
if lines and lines[-1].get("image") and lines[-1].get("style") != "Caption":
former_image = lines[-1].get("image")
lines.pop()
elif last_image is not None:
former_image = last_image
last_image = None
lines.append(
{
"text": self.__clean(text),
"image": former_image if former_image else None,
"table": None,
}
)
else:
flush_last_image()
lines.append(
{
"text": self.__clean(text),
"image": None,
"table": None,
}
)
current_image = self.get_picture(self.doc, p)
if current_image is not None:
lines.append(
{
"text": "",
"image": current_image,
"table": None,
}
)
else:
current_image = self.get_picture(self.doc, p)
image_list = [current_image]
if last_image:
image_list.insert(0, last_image)
last_image = None
lines.append((self.__clean(p.text), image_list, p.style.name if p.style else ""))
else:
if current_image := self.get_picture(self.doc, p):
if lines:
lines[-1][1].append(current_image)
else:
if current_image is not None:
last_image = current_image
for run in p.runs:
if 'lastRenderedPageBreak' in run._element.xml:
pn += 1
continue
if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
pn += 1
new_line = [(line[0], reduce(concat_img, line[1]) if line[1] else None) for line in lines]
tbls = []
for i, tb in enumerate(self.doc.tables):
title = self.__get_nearest_title(i, filename)
html = "<table>"
if title:
html += f"<caption>Table Location: {title}</caption>"
for r in tb.rows:
html += "<tr>"
i = 0
try:
while i < len(r.cells):
span = 1
c = r.cells[i]
for j in range(i + 1, len(r.cells)):
if c.text == r.cells[j].text:
span += 1
i = j
else:
break
i += 1
html += f"<td>{c.text}</td>" if span == 1 else f"<td colspan='{span}'>{c.text}</td>"
except Exception as e:
logging.warning(f"Error parsing table, ignore: {e}")
html += "</tr>"
html += "</table>"
tbls.append(((None, html), ""))
return new_line, tbls
for run in p.runs:
xml = run._element.xml
if "lastRenderedPageBreak" in xml:
pn += 1
continue
if "w:br" in xml and 'type="page"' in xml:
pn += 1
elif block.tag.endswith('tbl'):
if pn < from_page or pn > to_page:
table_idx += 1
continue
flush_last_image()
tb = DocxTable(block, self.doc)
title = self.__get_nearest_title(table_idx, filename)
html = "<table>"
if title:
html += f"<caption>Table Location: {title}</caption>"
for r in tb.rows:
html += "<tr>"
col_idx = 0
try:
while col_idx < len(r.cells):
span = 1
c = r.cells[col_idx]
for j in range(col_idx + 1, len(r.cells)):
if c.text == r.cells[j].text:
span += 1
col_idx = j
else:
break
col_idx += 1
html += f"<td>{c.text}</td>" if span == 1 else f"<td colspan='{span}'>{c.text}</td>"
except Exception as e:
logging.warning(f"Error parsing table, ignore: {e}")
html += "</tr>"
html += "</table>"
lines.append({"text": "", "image": None, "table": html})
table_idx += 1
flush_last_image()
new_line = [(line.get("text"), line.get("image"), line.get("table")) for line in lines]
return new_line
def to_markdown(self, filename=None, binary=None, inline_images: bool = True):
"""
@ -727,26 +778,26 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
# fix "There is no item named 'word/NULL' in the archive", referring to https://github.com/python-openxml/python-docx/issues/1105#issuecomment-1298075246
_SerializedRelationships.load_from_xml = load_from_xml_v2
sections, tables = Docx()(filename, binary)
tables = vision_figure_parser_docx_wrapper(sections=sections, tbls=tables, callback=callback, **kwargs)
res = tokenize_table(tables, doc, is_english)
callback(0.8, "Finish parsing.")
st = timer()
# each section is a (text, image, table_html) triple
sections = Docx()(filename, binary)
# chunks: list[dict]
# images: indexes of the image chunks within chunks
chunks, images = naive_merge_docx(
sections, int(parser_config.get(
"chunk_token_num", 128)), parser_config.get(
"delimiter", "\n!?。;!?"))
"delimiter", "\n!?。;!?"), table_context_size, image_context_size)
vision_figure_parser_docx_wrapper_naive(chunks=chunks, idx_lst=images, callback=callback, **kwargs)
res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images, child_delimiters_pattern=child_deli))
callback(0.8, "Finish parsing.")
st = timer()
res.extend(doc_tokenize_chunks_with_images(chunks, doc, is_english, child_delimiters_pattern=child_deli))
logging.info("naive_merge({}): {}".format(filename, timer() - st))
res.extend(embed_res)
res.extend(url_res)
if table_context_size or image_context_size:
attach_media_context(res, table_context_size, image_context_size)
return res
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
@ -1012,7 +1063,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
res.extend(embed_res)
if url_res:
res.extend(url_res)
#if table_context_size or image_context_size:
# if table_context_size or image_context_size:
# attach_media_context(res, table_context_size, image_context_size)
return res

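Putting the pieces of this file together, a sketch of the new .docx flow (filename and context sizes are placeholders):

```python
# Illustrative flow: Docx() now yields (text, image, table_html) triples;
# tables carry html, images carry a PIL.Image, plain text carries neither.
sections = Docx()("report.docx")
chunks, image_idxs = naive_merge_docx(
    sections,
    chunk_token_num=128,
    delimiter="\n!?。;!?",
    table_context_size=64,   # 0 disables context attachment
    image_context_size=64,
)
# image_idxs indexes the image chunks in `chunks`, ready for the vision wrapper.
```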
View File

@ -22,7 +22,7 @@ from deepdoc.parser.utils import get_text
from rag.app import naive
from rag.nlp import rag_tokenizer, tokenize
from deepdoc.parser import PdfParser, ExcelParser, HtmlParser
from deepdoc.parser.figure_parser import vision_figure_parser_docx_wrapper
from deepdoc.parser.figure_parser import vision_figure_parser_docx_wrapper_naive
from rag.app.naive import by_plaintext, PARSERS
from common.parser_config_utils import normalize_layout_recognizer
@ -76,11 +76,26 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
if re.search(r"\.docx$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
sections, tbls = naive.Docx()(filename, binary)
tbls = vision_figure_parser_docx_wrapper(sections=sections, tbls=tbls, callback=callback, **kwargs)
sections = [s for s, _ in sections if s]
for (_, html), _ in tbls:
sections.append(html)
sections = naive.Docx()(filename, binary)
cks = []
image_idxs = []
for text, image, table in sections:
if table is not None:
text = (text or "") + str(table)
ck_type = "table"
else:
ck_type = "image" if image is not None else "text"
if ck_type == "image":
image_idxs.append(len(cks))
cks.append({"text": text, "image": image, "ck_type": ck_type})
vision_figure_parser_docx_wrapper_naive(cks, image_idxs, callback, **kwargs)
sections = [ck["text"] for ck in cks if ck.get("text")]
callback(0.8, "Finish parsing.")
elif re.search(r"\.pdf$", filename, re.IGNORECASE):

View File

@ -316,6 +316,32 @@ def tokenize_chunks(chunks, doc, eng, pdf_parser=None, child_delimiters_pattern=
return res
def doc_tokenize_chunks_with_images(chunks, doc, eng, child_delimiters_pattern=None, batch_size=10):
res = []
for ii, ck in enumerate(chunks):
text = ck.get('context_above', "") + ck.get('text', "") + ck.get('context_below', "")
if len(text.strip()) == 0:
continue
logging.debug("-- {}".format(ck))
d = copy.deepcopy(doc)
if ck.get("image"):
d["image"] = ck.get("image")
add_positions(d, [[ii] * 5])
if ck.get("ck_type") == "text":
if child_delimiters_pattern:
d["mom_with_weight"] = ck
res.extend(split_with_pattern(d, child_delimiters_pattern, text, eng))
continue
elif ck.get("ck_type") == "image":
d["doc_type_kwd"] = "image"
elif ck.get("ck_type") == "table":
d["doc_type_kwd"] = "table"
tokenize(d, text, eng)
res.append(d)
return res
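A minimal usage sketch for `doc_tokenize_chunks_with_images`; the base `doc` fields are placeholders:

```python
# Each chunk's text is expanded with its context window, then tagged by
# ck_type: "table" and "image" chunks get doc_type_kwd, "text" chunks may
# be split into children when a child delimiter pattern is given.
doc = {"docnm_kwd": "report.docx"}
chunks = [
    {"text": "Plain prose paragraph.", "ck_type": "text"},
    {"text": "<table><tr><td>v</td></tr></table>", "ck_type": "table"},
    {"text": "Figure description.", "ck_type": "image",
     "context_above": "See the setup above.", "context_below": ""},
]
res = doc_tokenize_chunks_with_images(chunks, doc, eng=True)
```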
def tokenize_chunks_with_images(chunks, doc, eng, images, child_delimiters_pattern=None):
res = []
# wrap up as es documents
@ -789,6 +815,11 @@ def append_context2table_image4pdf(sections: list, tabls: list, table_context_si
if len(contexts) < len(res) + 1:
contexts.append(("", ""))
res.append(((img, tb), poss))
return contexts if return_context else res
@ -1200,57 +1231,181 @@ def concat_img(img1, img2):
new_image.paste(img2, (0, height1))
return new_image
def naive_merge_docx(sections, chunk_token_num=128, delimiter="\n。;!?"):
if not sections:
return [], []
def _build_cks(sections, delimiter):
cks = []
tables = []
images = []
tk_nums = []
def add_chunk(t, image, pos=""):
nonlocal cks, images, tk_nums
tnum = num_tokens_from_string(t)
if tnum < 8:
pos = ""
if not cks or tk_nums[-1] > chunk_token_num:
# new chunk
if pos and t.find(pos) < 0:
t += pos
cks.append(t)
images.append(image)
tk_nums.append(tnum)
else:
# add to last chunk
if pos and cks[-1].find(pos) < 0:
t += pos
cks[-1] += t
images[-1] = concat_img(images[-1], image)
tk_nums[-1] += tnum
custom_delimiters = [m.group(1) for m in re.finditer(r"`([^`]+)`", delimiter)]
has_custom = bool(custom_delimiters)
if has_custom:
custom_pattern = "|".join(re.escape(t) for t in sorted(set(custom_delimiters), key=len, reverse=True))
cks, images, tk_nums = [], [], []
custom_pattern = "|".join(
re.escape(t) for t in sorted(set(custom_delimiters), key=len, reverse=True)
)
pattern = r"(%s)" % custom_pattern
for sec, image in sections:
split_sec = re.split(pattern, sec)
for text, image, table in sections:
# normalize text
if not text:
text = "\n"
else:
text = "\n" + str(text)
if table:
# table ck
ck_text = text + str(table)
idx = len(cks)
cks.append({"text": ck_text, "image": image, "ck_type": "table", "tk_nums": num_tokens_from_string(ck_text)})
tables.append(idx)
continue
if image:
# image ck; keep any caption text as-is for the downstream description step
idx = len(cks)
cks.append({"text": text, "image": image, "ck_type": "image", "tk_nums": num_tokens_from_string(text)})
images.append(idx)
continue
# pure text ck(s)
if has_custom:
split_sec = re.split(pattern, text)
for sub_sec in split_sec:
if not sub_sec or re.fullmatch(custom_pattern, sub_sec):
continue
text_seg = "\n" + sub_sec
cks.append(text_seg)
images.append(image)
tk_nums.append(num_tokens_from_string(text_seg))
return cks, images
seg = "\n" + sub_sec if not sub_sec.startswith("\n") else sub_sec
cks.append({"text": seg, "image": None, "ck_type": "text", "tk_nums": num_tokens_from_string(seg)})
else:
cks.append({"text": text, "image": None, "ck_type": "text", "tk_nums": num_tokens_from_string(text)})
for sec, image in sections:
add_chunk("\n" + sec, image, "")
return cks, tables, images
return cks, images
def _add_context(cks, idx, context_size):
if cks[idx]["ck_type"] not in ("image", "table"):
return
prev = idx - 1
after = idx + 1
remain_above = context_size
remain_below = context_size
cks[idx]["context_above"] = ""
cks[idx]["context_below"] = ""
split_pat = r"([。!?\n]|\. )"
def take_sentences_from_end(cnt, need_tokens):
txts = re.split(split_pat, cnt, flags=re.DOTALL)
sents = []
for j in range(0, len(txts), 2):
sents.append(txts[j] + (txts[j + 1] if j + 1 < len(txts) else ""))
acc = ""
for s in reversed(sents):
acc = s + acc
if num_tokens_from_string(acc) >= need_tokens:
break
return acc
def take_sentences_from_start(cnt, need_tokens):
txts = re.split(split_pat, cnt, flags=re.DOTALL)
acc = ""
for j in range(0, len(txts), 2):
acc += txts[j] + (txts[j + 1] if j + 1 < len(txts) else "")
if num_tokens_from_string(acc) >= need_tokens:
break
return acc
# above
parts_above = []
while prev >= 0 and remain_above > 0:
if cks[prev]["ck_type"] == "text":
tk = cks[prev]["tk_nums"]
if tk >= remain_above:
piece = take_sentences_from_end(cks[prev]["text"], remain_above)
parts_above.insert(0, piece)
remain_above = 0
break
else:
parts_above.insert(0, cks[prev]["text"])
remain_above -= tk
prev -= 1
# below
parts_below = []
while after < len(cks) and remain_below > 0:
if cks[after]["ck_type"] == "text":
tk = cks[after]["tk_nums"]
if tk >= remain_below:
piece = take_sentences_from_start(cks[after]["text"], remain_below)
parts_below.append(piece)
remain_below = 0
break
else:
parts_below.append(cks[after]["text"])
remain_below -= tk
after += 1
cks[idx]["context_above"] = "".join(parts_above) if parts_above else ""
cks[idx]["context_below"] = "".join(parts_below) if parts_below else ""
def _merge_cks(cks, chunk_token_num):
merged = []
image_idxs = []
prev_text_ck = -1
for i in range(len(cks)):
ck_type = cks[i]["ck_type"]
if ck_type != "text":
merged.append(cks[i])
if ck_type == "image":
image_idxs.append(len(merged) - 1)
continue
if prev_text_ck < 0 or merged[prev_text_ck]["tk_nums"] >= chunk_token_num:
merged.append(cks[i])
prev_text_ck = len(merged) - 1
continue
merged[prev_text_ck]["text"] = (merged[prev_text_ck].get("text") or "") + (cks[i].get("text") or "")
merged[prev_text_ck]["tk_nums"] = merged[prev_text_ck].get("tk_nums", 0) + cks[i].get("tk_nums", 0)
return merged, image_idxs
def naive_merge_docx(
sections,
chunk_token_num=128,
delimiter="\n。;!?",
table_context_size=0,
image_context_size=0,
):
if not sections:
return [], []
cks, tables, images = _build_cks(sections, delimiter)
if table_context_size > 0:
for i in tables:
_add_context(cks, i, table_context_size)
if image_context_size > 0:
for i in images:
_add_context(cks, i, image_context_size)
merged_cks, merged_image_idx = _merge_cks(cks, chunk_token_num)
return merged_cks, merged_image_idx
def extract_between(text: str, start_tag: str, end_tag: str) -> list[str]:

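A small worked example of `_build_cks` plus `_add_context`; the 32-token budget is arbitrary:

```python
from PIL import Image

# Sections follow the (text, image, table_html) triple shape used above.
sections = [
    ("First paragraph about the setup.", None, None),
    ("", Image.new("RGB", (8, 8)), None),              # an image section
    ("Second paragraph about the results.", None, None),
]
cks, tables, images = _build_cks(sections, delimiter="\n。;!?")
for i in images:
    _add_context(cks, i, context_size=32)
# The image chunk now has context_above (tail sentences of the preceding text
# chunk) and context_below (head sentences of the following one), each cut off
# once roughly 32 tokens have been accumulated.
```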
View File

@ -1127,7 +1127,7 @@ async def do_handle_task(task):
if has_canceled(task_id):
try:
exists = await asyncio.to_thread(
settings.docStoreConn.indexExist,
settings.docStoreConn.index_exist,
search.index_name(task_tenant_id),
task_dataset_id,
)