Fix: Correctly handle merged cells in DOCX tables to prevent content duplication and loss (#27871)

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com>
This commit is contained in:
李龙飞
2025-11-13 15:56:24 +08:00
committed by CodingOnStar
parent 9c10731494
commit 8dc5d98174
2 changed files with 54 additions and 3 deletions

View File

@ -152,13 +152,15 @@ class WordExtractor(BaseExtractor):
# Initialize a row, all of which are empty by default
row_cells = [""] * total_cols
col_index = 0
for cell in row.cells:
while col_index < len(row.cells):
# make sure the col_index is not out of range
while col_index < total_cols and row_cells[col_index] != "":
while col_index < len(row.cells) and row_cells[col_index] != "":
col_index += 1
# if col_index is out of range the loop is jumped
if col_index >= total_cols:
if col_index >= len(row.cells):
break
# get the correct cell
cell = row.cells[col_index]
cell_content = self._parse_cell(cell, image_map).strip()
cell_colspan = cell.grid_span or 1
for i in range(cell_colspan):