Fix: zip extraction vulnerabilities in MinerU and TCADP (#12527)

### What problem does this PR solve?

Fix zip extraction vulnerabilities:
   - Block symlink entries in zip files.
   - Reject encrypted zip entries.
   - Prevent absolute path attacks (including Windows paths).
   - Block path traversal attempts (../).
   - Stop zip slip exploits (directory escape).
   - Use streaming for memory-safe file handling.
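For reference, the per-entry validation introduced by this change follows the pattern sketched below. This is an illustrative condensation only (the helper name `safe_extract` is not from the codebase); the real implementations are `_extract_zip_no_root` and the TCADP zip reader in the diff:

```python
import os
import re
import shutil
import stat
import zipfile
from pathlib import Path


def safe_extract(zip_path: str, dest_dir: str) -> None:
    """Extract a zip archive while rejecting unsafe entries."""
    base = Path(dest_dir).resolve()
    with zipfile.ZipFile(zip_path, "r") as zf:
        for member in zf.infolist():
            name = member.filename.replace("\\", "/")
            # Reject encrypted entries (bit 0 of the general-purpose flag).
            if member.flag_bits & 0x1:
                raise RuntimeError(f"Encrypted zip entry not supported: {name}")
            # Reject symlinks: the upper 16 bits of external_attr carry the Unix mode.
            if stat.S_IFMT(member.external_attr >> 16) == stat.S_IFLNK:
                raise RuntimeError(f"Symlink zip entry not supported: {name}")
            # Reject absolute paths, including Windows drive letters.
            if name.startswith("/") or re.match(r"^[A-Za-z]:", name):
                raise RuntimeError(f"Unsafe zip path (absolute): {name}")
            # Reject path traversal components.
            parts = [p for p in name.split("/") if p not in ("", ".")]
            if any(p == ".." for p in parts):
                raise RuntimeError(f"Unsafe zip path (traversal): {name}")
            if not parts:
                continue
            # Zip slip: the resolved destination must stay inside dest_dir.
            target = (base / os.path.join(*parts)).resolve(strict=False)
            if target != base and base not in target.parents:
                raise RuntimeError(f"Unsafe zip path (escape): {name}")
            if member.is_dir():
                os.makedirs(target, exist_ok=True)
                continue
            os.makedirs(target.parent, exist_ok=True)
            # Stream each entry to disk instead of reading it fully into memory.
            with zf.open(member) as src, open(target, "wb") as dst:
                shutil.copyfileobj(src, dst)
```

The final containment check on the resolved destination is a belt-and-braces guard on top of the explicit `..` and absolute-path rejections.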

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
Author: Yongteng Lei
Date: 2026-01-13 12:24:50 +08:00
Committed by: GitHub
Parent: 41c84fd78f
Commit: 64c75d558e
2 changed files with 130 additions and 83 deletions


@ -17,6 +17,7 @@ import json
import logging
import os
import re
import shutil
import sys
import tempfile
import threading
@ -138,39 +139,58 @@ class MinerUParser(RAGFlowPdfParser):
self.outlines = []
self.logger = logging.getLogger(self.__class__.__name__)
@staticmethod
def _is_zipinfo_symlink(member: zipfile.ZipInfo) -> bool:
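# The upper 16 bits of external_attr hold the Unix mode bits when the archive was
# created on a POSIX system; 0o170000 masks the file-type bits and 0o120000 is the
# symlink type (S_IFLNK), so this flags entries stored as symbolic links.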
return (member.external_attr >> 16) & 0o170000 == 0o120000
def _extract_zip_no_root(self, zip_path, extract_to, root_dir):
self.logger.info(f"[MinerU] Extract zip: zip_path={zip_path}, extract_to={extract_to}, root_hint={root_dir}")
base_dir = Path(extract_to).resolve()
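# base_dir is the canonical extraction root; every entry's resolved destination is checked against it below.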
with zipfile.ZipFile(zip_path, "r") as zip_ref:
members = zip_ref.infolist()
if not root_dir:
if members and members[0].filename.endswith("/"):
root_dir = members[0].filename
else:
root_dir = None
if root_dir:
root_dir = root_dir.replace("\\", "/")
if not root_dir.endswith("/"):
root_dir += "/"
for member in members:
if member.flag_bits & 0x1:
raise RuntimeError(f"[MinerU] Encrypted zip entry not supported: {member.filename}")
if self._is_zipinfo_symlink(member):
raise RuntimeError(f"[MinerU] Symlink zip entry not supported: {member.filename}")
name = member.filename.replace("\\", "/")
if root_dir and name == root_dir:
self.logger.info("[MinerU] Ignore root folder...")
continue
if root_dir and name.startswith(root_dir):
name = name[len(root_dir) :]
if not name:
continue
if name.startswith("/") or name.startswith("//") or re.match(r"^[A-Za-z]:", name):
raise RuntimeError(f"[MinerU] Unsafe zip path (absolute): {member.filename}")
parts = [p for p in name.split("/") if p not in ("", ".")]
if any(p == ".." for p in parts):
raise RuntimeError(f"[MinerU] Unsafe zip path (traversal): {member.filename}")
rel_path = os.path.join(*parts) if parts else ""
dest_path = (Path(extract_to) / rel_path).resolve(strict=False)
if dest_path != base_dir and base_dir not in dest_path.parents:
raise RuntimeError(f"[MinerU] Unsafe zip path (escape): {member.filename}")
if member.is_dir():
os.makedirs(dest_path, exist_ok=True)
continue
os.makedirs(dest_path.parent, exist_ok=True)
with zip_ref.open(member) as src, open(dest_path, "wb") as dst:
shutil.copyfileobj(src, dst)
@staticmethod
def _is_http_endpoint_valid(url, timeout=5):
@ -237,8 +257,6 @@ class MinerUParser(RAGFlowPdfParser):
output_path = tempfile.mkdtemp(prefix=f"{pdf_file_name}_{options.method}_", dir=str(output_dir))
output_zip_path = os.path.join(str(output_dir), f"{Path(output_path).name}.zip")
files = {"files": (pdf_file_name + ".pdf", open(pdf_file_path, "rb"), "application/pdf")}
data = {
"output_dir": "./output",
"lang_list": options.lang,
@ -270,26 +288,35 @@ class MinerUParser(RAGFlowPdfParser):
self.logger.info(f"[MinerU] invoke api: {self.mineru_api}/file_parse backend={options.backend} server_url={data.get('server_url')}")
if callback:
callback(0.20, f"[MinerU] invoke api: {self.mineru_api}/file_parse")
with open(pdf_file_path, "rb") as pdf_file:
files = {"files": (pdf_file_name + ".pdf", pdf_file, "application/pdf")}
with requests.post(
url=f"{self.mineru_api}/file_parse",
files=files,
data=data,
headers=headers,
timeout=1800,
stream=True,
) as response:
response.raise_for_status()
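# stream=True defers the body download; the returned zip is copied to disk in chunks below instead of being buffered in memory.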
content_type = response.headers.get("Content-Type", "")
if content_type.startswith("application/zip"):
self.logger.info(f"[MinerU] zip file returned, saving to {output_zip_path}...")
if callback:
callback(0.30, f"[MinerU] zip file returned, saving to {output_zip_path}...")
with open(output_zip_path, "wb") as f:
response.raw.decode_content = True
shutil.copyfileobj(response.raw, f)
self.logger.info(f"[MinerU] Unzip to {output_path}...")
self._extract_zip_no_root(output_zip_path, output_path, pdf_file_name + "/")
if callback:
callback(0.40, f"[MinerU] Unzip to {output_path}...")
else:
self.logger.warning(f"[MinerU] not zip returned from api: {content_type}")
except Exception as e:
raise RuntimeError(f"[MinerU] api failed with exception {e}")
self.logger.info("[MinerU] Api completed successfully.")


@ -17,6 +17,7 @@ import base64
import json
import logging
import os
import re
import shutil
import tempfile
import time
@ -48,10 +49,10 @@ class TencentCloudAPIClient:
self.secret_key = secret_key
self.region = region
self.outlines = []
# Create credentials
self.cred = credential.Credential(secret_id, secret_key)
# Instantiate an http option, optional, can be skipped if no special requirements
self.httpProfile = HttpProfile()
self.httpProfile.endpoint = "lkeap.tencentcloudapi.com"
@ -59,7 +60,7 @@ class TencentCloudAPIClient:
# Instantiate a client option, optional, can be skipped if no special requirements
self.clientProfile = ClientProfile()
self.clientProfile.httpProfile = self.httpProfile
# Instantiate the client object for the product to be requested, clientProfile is optional
self.client = lkeap_client.LkeapClient(self.cred, region, self.clientProfile)
@ -68,14 +69,14 @@ class TencentCloudAPIClient:
try:
# Instantiate a request object, each interface corresponds to a request object
req = models.ReconstructDocumentSSERequest()
# Build request parameters
params = {
"FileType": file_type,
"FileStartPageNumber": file_start_page,
"FileEndPageNumber": file_end_page,
}
# According to Tencent Cloud API documentation, either FileUrl or FileBase64 parameter must be provided, if both are provided only FileUrl will be used
if file_url:
params["FileUrl"] = file_url
@ -94,7 +95,7 @@ class TencentCloudAPIClient:
# The returned resp is an instance of ReconstructDocumentSSEResponse, corresponding to the request object
resp = self.client.ReconstructDocumentSSE(req)
parser_result = {}
# Output json format string response
if isinstance(resp, types.GeneratorType): # Streaming response
logging.info("[TCADP] Detected streaming response")
@ -104,7 +105,7 @@ class TencentCloudAPIClient:
try:
data_dict = json.loads(event['data'])
logging.info(f"[TCADP] Parsed data: {data_dict}")
if data_dict.get('Progress') == "100":
parser_result = data_dict
logging.info("[TCADP] Document parsing completed!")
@ -118,14 +119,14 @@ class TencentCloudAPIClient:
logging.warning("[TCADP] Failed parsing pages:")
for page in failed_pages:
logging.warning(f"[TCADP] Page number: {page.get('PageNumber')}, Error: {page.get('ErrorMsg')}")
# Check if there is a download link
download_url = data_dict.get("DocumentRecognizeResultUrl")
if download_url:
logging.info(f"[TCADP] Got download link: {download_url}")
else:
logging.warning("[TCADP] No download link obtained")
break # Found final result, exit loop
else:
# Print progress information
@ -168,9 +169,6 @@ class TencentCloudAPIClient:
return None
try:
# Ensure output directory exists
os.makedirs(output_dir, exist_ok=True)
@ -179,29 +177,36 @@ class TencentCloudAPIClient:
filename = f"tcadp_result_{timestamp}.zip"
file_path = os.path.join(output_dir, filename)
# Save file
with open(file_path, "wb") as f:
f.write(response.content)
with requests.get(download_url, stream=True) as response:
response.raise_for_status()
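# Stream the body straight to disk; decode_content=True makes urllib3 undo any gzip/deflate transfer encoding before the bytes are written.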
with open(file_path, "wb") as f:
response.raw.decode_content = True
shutil.copyfileobj(response.raw, f)
logging.info(f"[TCADP] Document parsing result downloaded to: {os.path.basename(file_path)}")
return file_path
except Exception as e:
logging.error(f"[TCADP] Failed to download file: {e}")
try:
if "file_path" in locals() and os.path.exists(file_path):
os.unlink(file_path)
except Exception:
pass
return None
class TCADPParser(RAGFlowPdfParser):
def __init__(self, secret_id: str = None, secret_key: str = None, region: str = "ap-guangzhou",
table_result_type: str = None, markdown_image_response_type: str = None):
super().__init__()
# First initialize logger
self.logger = logging.getLogger(self.__class__.__name__)
# Log received parameters
self.logger.info(f"[TCADP] Initializing with parameters - table_result_type: {table_result_type}, markdown_image_response_type: {markdown_image_response_type}")
# Priority: read configuration from RAGFlow configuration system (service_conf.yaml)
try:
tcadp_parser = get_base_config("tcadp_config", {})
@ -212,7 +217,7 @@ class TCADPParser(RAGFlowPdfParser):
# Set table_result_type and markdown_image_response_type from config or parameters
self.table_result_type = table_result_type if table_result_type is not None else tcadp_parser.get("table_result_type", "1")
self.markdown_image_response_type = markdown_image_response_type if markdown_image_response_type is not None else tcadp_parser.get("markdown_image_response_type", "1")
else:
self.logger.error("[TCADP] Please configure tcadp_config in service_conf.yaml first")
# If config file is empty, use provided parameters or defaults
@ -237,6 +242,10 @@ class TCADPParser(RAGFlowPdfParser):
if not self.secret_id or not self.secret_key:
raise ValueError("[TCADP] Please set Tencent Cloud API keys, configure tcadp_config in service_conf.yaml")
@staticmethod
def _is_zipinfo_symlink(member: zipfile.ZipInfo) -> bool:
return (member.external_attr >> 16) & 0o170000 == 0o120000
def check_installation(self) -> bool:
"""Check if Tencent Cloud API configuration is correct"""
try:
@ -255,7 +264,7 @@ class TCADPParser(RAGFlowPdfParser):
def _file_to_base64(self, file_path: str, binary: bytes = None) -> str:
"""Convert file to Base64 format"""
if binary:
# If binary data is directly available, convert directly
return base64.b64encode(binary).decode('utf-8')
@ -271,23 +280,34 @@ class TCADPParser(RAGFlowPdfParser):
try:
with zipfile.ZipFile(zip_path, "r") as zip_file:
# Find JSON and Markdown result files, validating each entry first
members = zip_file.infolist()
for member in members:
name = member.filename.replace("\\", "/")
if member.is_dir():
continue
if member.flag_bits & 0x1:
raise RuntimeError(f"[TCADP] Encrypted zip entry not supported: {member.filename}")
if self._is_zipinfo_symlink(member):
raise RuntimeError(f"[TCADP] Symlink zip entry not supported: {member.filename}")
if name.startswith("/") or name.startswith("//") or re.match(r"^[A-Za-z]:", name):
raise RuntimeError(f"[TCADP] Unsafe zip path (absolute): {member.filename}")
parts = [p for p in name.split("/") if p not in ("", ".")]
if any(p == ".." for p in parts):
raise RuntimeError(f"[TCADP] Unsafe zip path (traversal): {member.filename}")
if not (name.endswith(".json") or name.endswith(".md")):
continue
with zip_file.open(member) as f:
if name.endswith(".json"):
data = json.load(f)
if isinstance(data, list):
results.extend(data)
else:
results.append(data)
else:
content = f.read().decode("utf-8")
results.append({"type": "text", "content": content, "file": name})
except Exception as e:
self.logger.error(f"[TCADP] Failed to extract ZIP file content: {e}")
@ -395,7 +415,7 @@ class TCADPParser(RAGFlowPdfParser):
# Convert file to Base64 format
if callback:
callback(0.2, "[TCADP] Converting file to Base64 format")
file_base64 = self._file_to_base64(file_path, binary)
if callback:
callback(0.25, f"[TCADP] File converted to Base64, size: {len(file_base64)} characters")
@ -420,23 +440,23 @@ class TCADPParser(RAGFlowPdfParser):
"TableResultType": self.table_result_type,
"MarkdownImageResponseType": self.markdown_image_response_type
}
self.logger.info(f"[TCADP] API request config - TableResultType: {self.table_result_type}, MarkdownImageResponseType: {self.markdown_image_response_type}")
result = client.reconstruct_document_sse(
file_type=file_type,
file_base64=file_base64,
file_start_page=file_start_page,
file_end_page=file_end_page,
config=config
)
if result:
self.logger.info(f"[TCADP] Attempt {attempt + 1} successful")
break
else:
self.logger.warning(f"[TCADP] Attempt {attempt + 1} failed, result is None")
except Exception as e:
self.logger.error(f"[TCADP] Attempt {attempt + 1} exception: {e}")
if attempt == max_retries - 1: