Fix: upgrade pypdf to 6.7.5 and migrate from deprecated pypdf2 to fix CVE-2026-28804 and CVE-2023-36464 (#13454)

### What problem does this PR solve?

This PR addresses security vulnerabilities in the PDF processing
dependencies identified by a Trivy security scan:

1. CVE-2026-28804 (MEDIUM): pypdf 6.7.4 is vulnerable to inefficient
decoding of ASCIIHexDecode streams
2. CVE-2023-36464 (MEDIUM): pypdf2 3.0.1 is susceptible to an infinite loop
when parsing malformed comments

Since pypdf2 is deprecated and no fixed release is available, this PR
migrates all pypdf2 usage to the actively maintained pypdf library
(version 6.7.5), which resolves both vulnerabilities.


### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
guptas6est
2026-03-09 04:06:00 +00:00
committed by GitHub
parent 2634cfc06f
commit 32d31284cc
4 changed files with 7 additions and 19 deletions

View File

@ -81,8 +81,7 @@ dependencies = [
"pyobvector==0.2.22",
"pyodbc>=5.2.0,<6.0.0",
"pypandoc>=1.16",
"pypdf>=6.6.2",
"pypdf2>=3.0.1,<4.0.0",
"pypdf>=6.7.5",
"python-calamine>=0.4.0",
"python-docx>=1.1.2,<2.0.0",
"python-pptx>=1.0.2,<2.0.0",

View File

@ -20,7 +20,7 @@ import re
from collections import defaultdict
from io import BytesIO
from PyPDF2 import PdfReader as pdf2_read
from pypdf import PdfReader as pdf2_read
from deepdoc.parser import PdfParser, PlainParser
from deepdoc.parser.ppt_parser import RAGFlowPptParser

View File

@ -21,7 +21,7 @@ import requests
from requests.exceptions import Timeout, RequestException
from io import BytesIO
from typing import List, Union, Tuple, Optional, Dict
import PyPDF2
import pypdf as PyPDF2
from docx import Document
import olefile

19
uv.lock generated
View File

@ -5760,20 +5760,11 @@ wheels = [
[[package]]
name = "pypdf"
version = "6.7.4"
version = "6.7.5"
source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/09/dc/f52deef12797ad58b88e4663f097a343f53b9361338aef6573f135ac302f/pypdf-6.7.4.tar.gz", hash = "sha256:9edd1cd47938bb35ec87795f61225fd58a07cfaf0c5699018ae1a47d6f8ab0e3", size = 5304821, upload-time = "2026-02-27T10:44:39.395Z" }
sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/f6/52/37cc0aa9e9d1bf7729a737a0d83f8b3f851c8eb137373d9f71eafb0a3405/pypdf-6.7.5.tar.gz", hash = "sha256:40bb2e2e872078655f12b9b89e2f900888bb505e88a82150b64f9f34fa25651d", size = 5304278, upload-time = "2026-03-02T09:05:21.464Z" }
wheels = [
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/c1/be/cded021305f5c81b47265b8c5292b99388615a4391c21ff00fd538d34a56/pypdf-6.7.4-py3-none-any.whl", hash = "sha256:527d6da23274a6c70a9cb59d1986d93946ba8e36a6bc17f3f7cce86331492dda", size = 331496, upload-time = "2026-02-27T10:44:37.527Z" },
]
[[package]]
name = "pypdf2"
version = "3.0.1"
source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/9f/bb/18dc3062d37db6c491392007dfd1a7f524bb95886eb956569ac38a23a784/PyPDF2-3.0.1.tar.gz", hash = "sha256:a74408f69ba6271f71b9352ef4ed03dc53a31aa404d29b5d31f53bfecfee1440", size = 227419, upload-time = "2022-12-31T10:36:13.13Z" }
wheels = [
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/8e/5e/c86a5643653825d3c913719e788e41386bee415c2b87b4f955432f2de6b2/pypdf2-3.0.1-py3-none-any.whl", hash = "sha256:d16e4205cfee272fbdc0568b68d82be796540b1537508cef59388f839c191928", size = 232572, upload-time = "2022-12-31T10:36:10.327Z" },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/05/89/336673efd0a88956562658aba4f0bbef7cb92a6fbcbcaf94926dbc82b408/pypdf-6.7.5-py3-none-any.whl", hash = "sha256:07ba7f1d6e6d9aa2a17f5452e320a84718d4ce863367f7ede2fd72280349ab13", size = 331421, upload-time = "2026-03-02T09:05:19.722Z" },
]
[[package]]
@ -6323,7 +6314,6 @@ dependencies = [
{ name = "pyodbc" },
{ name = "pypandoc" },
{ name = "pypdf" },
{ name = "pypdf2" },
{ name = "python-calamine" },
{ name = "python-docx" },
{ name = "python-gitlab" },
@ -6462,8 +6452,7 @@ requires-dist = [
{ name = "pyobvector", specifier = "==0.2.22" },
{ name = "pyodbc", specifier = ">=5.2.0,<6.0.0" },
{ name = "pypandoc", specifier = ">=1.16" },
{ name = "pypdf", specifier = ">=6.6.2" },
{ name = "pypdf2", specifier = ">=3.0.1,<4.0.0" },
{ name = "pypdf", specifier = ">=6.7.5" },
{ name = "python-calamine", specifier = ">=0.4.0" },
{ name = "python-docx", specifier = ">=1.1.2,<2.0.0" },
{ name = "python-gitlab", specifier = ">=7.0.0" },