diff --git a/deepdoc/parser/mineru_parser.py b/deepdoc/parser/mineru_parser.py index aba237dd1..cc4c99c76 100644 --- a/deepdoc/parser/mineru_parser.py +++ b/deepdoc/parser/mineru_parser.py @@ -17,6 +17,7 @@ import json import logging import os import re +import shutil import sys import tempfile import threading @@ -138,39 +139,58 @@ class MinerUParser(RAGFlowPdfParser): self.outlines = [] self.logger = logging.getLogger(self.__class__.__name__) + @staticmethod + def _is_zipinfo_symlink(member: zipfile.ZipInfo) -> bool: + return (member.external_attr >> 16) & 0o170000 == 0o120000 + def _extract_zip_no_root(self, zip_path, extract_to, root_dir): self.logger.info(f"[MinerU] Extract zip: zip_path={zip_path}, extract_to={extract_to}, root_hint={root_dir}") + base_dir = Path(extract_to).resolve() with zipfile.ZipFile(zip_path, "r") as zip_ref: + members = zip_ref.infolist() if not root_dir: - files = zip_ref.namelist() - if files and files[0].endswith("/"): - root_dir = files[0] + if members and members[0].filename.endswith("/"): + root_dir = members[0].filename else: root_dir = None + if root_dir: + root_dir = root_dir.replace("\\", "/") + if not root_dir.endswith("/"): + root_dir += "/" - if not root_dir or not root_dir.endswith("/"): - self.logger.info(f"[MinerU] No root directory found, extracting all (root_hint={root_dir})") - zip_ref.extractall(extract_to) - return + for member in members: + if member.flag_bits & 0x1: + raise RuntimeError(f"[MinerU] Encrypted zip entry not supported: {member.filename}") + if self._is_zipinfo_symlink(member): + raise RuntimeError(f"[MinerU] Symlink zip entry not supported: {member.filename}") - root_len = len(root_dir) - for member in zip_ref.infolist(): - filename = member.filename - if filename == root_dir: + name = member.filename.replace("\\", "/") + if root_dir and name == root_dir: self.logger.info("[MinerU] Ignore root folder...") continue + if root_dir and name.startswith(root_dir): + name = name[len(root_dir) :] + if not name: + continue + if name.startswith("/") or name.startswith("//") or re.match(r"^[A-Za-z]:", name): + raise RuntimeError(f"[MinerU] Unsafe zip path (absolute): {member.filename}") - path = filename - if path.startswith(root_dir): - path = path[root_len:] + parts = [p for p in name.split("/") if p not in ("", ".")] + if any(p == ".." for p in parts): + raise RuntimeError(f"[MinerU] Unsafe zip path (traversal): {member.filename}") + + rel_path = os.path.join(*parts) if parts else "" + dest_path = (Path(extract_to) / rel_path).resolve(strict=False) + if dest_path != base_dir and base_dir not in dest_path.parents: + raise RuntimeError(f"[MinerU] Unsafe zip path (escape): {member.filename}") - full_path = os.path.join(extract_to, path) if member.is_dir(): - os.makedirs(full_path, exist_ok=True) - else: - os.makedirs(os.path.dirname(full_path), exist_ok=True) - with open(full_path, "wb") as f: - f.write(zip_ref.read(filename)) + os.makedirs(dest_path, exist_ok=True) + continue + + os.makedirs(dest_path.parent, exist_ok=True) + with zip_ref.open(member) as src, open(dest_path, "wb") as dst: + shutil.copyfileobj(src, dst) @staticmethod def _is_http_endpoint_valid(url, timeout=5): @@ -237,8 +257,6 @@ class MinerUParser(RAGFlowPdfParser): output_path = tempfile.mkdtemp(prefix=f"{pdf_file_name}_{options.method}_", dir=str(output_dir)) output_zip_path = os.path.join(str(output_dir), f"{Path(output_path).name}.zip") - files = {"files": (pdf_file_name + ".pdf", open(pdf_file_path, "rb"), "application/pdf")} - data = { "output_dir": "./output", "lang_list": options.lang, @@ -270,26 +288,35 @@ class MinerUParser(RAGFlowPdfParser): self.logger.info(f"[MinerU] invoke api: {self.mineru_api}/file_parse backend={options.backend} server_url={data.get('server_url')}") if callback: callback(0.20, f"[MinerU] invoke api: {self.mineru_api}/file_parse") - response = requests.post(url=f"{self.mineru_api}/file_parse", files=files, data=data, headers=headers, - timeout=1800) + with open(pdf_file_path, "rb") as pdf_file: + files = {"files": (pdf_file_name + ".pdf", pdf_file, "application/pdf")} + with requests.post( + url=f"{self.mineru_api}/file_parse", + files=files, + data=data, + headers=headers, + timeout=1800, + stream=True, + ) as response: + response.raise_for_status() + content_type = response.headers.get("Content-Type", "") + if content_type.startswith("application/zip"): + self.logger.info(f"[MinerU] zip file returned, saving to {output_zip_path}...") - response.raise_for_status() - if response.headers.get("Content-Type") == "application/zip": - self.logger.info(f"[MinerU] zip file returned, saving to {output_zip_path}...") + if callback: + callback(0.30, f"[MinerU] zip file returned, saving to {output_zip_path}...") - if callback: - callback(0.30, f"[MinerU] zip file returned, saving to {output_zip_path}...") + with open(output_zip_path, "wb") as f: + response.raw.decode_content = True + shutil.copyfileobj(response.raw, f) - with open(output_zip_path, "wb") as f: - f.write(response.content) + self.logger.info(f"[MinerU] Unzip to {output_path}...") + self._extract_zip_no_root(output_zip_path, output_path, pdf_file_name + "/") - self.logger.info(f"[MinerU] Unzip to {output_path}...") - self._extract_zip_no_root(output_zip_path, output_path, pdf_file_name + "/") - - if callback: - callback(0.40, f"[MinerU] Unzip to {output_path}...") - else: - self.logger.warning(f"[MinerU] not zip returned from api: {response.headers.get('Content-Type')}") + if callback: + callback(0.40, f"[MinerU] Unzip to {output_path}...") + else: + self.logger.warning(f"[MinerU] not zip returned from api: {content_type}") except Exception as e: raise RuntimeError(f"[MinerU] api failed with exception {e}") self.logger.info("[MinerU] Api completed successfully.") diff --git a/deepdoc/parser/tcadp_parser.py b/deepdoc/parser/tcadp_parser.py index 8d704baed..af1c90348 100644 --- a/deepdoc/parser/tcadp_parser.py +++ b/deepdoc/parser/tcadp_parser.py @@ -17,6 +17,7 @@ import base64 import json import logging import os +import re import shutil import tempfile import time @@ -48,10 +49,10 @@ class TencentCloudAPIClient: self.secret_key = secret_key self.region = region self.outlines = [] - + # Create credentials self.cred = credential.Credential(secret_id, secret_key) - + # Instantiate an http option, optional, can be skipped if no special requirements self.httpProfile = HttpProfile() self.httpProfile.endpoint = "lkeap.tencentcloudapi.com" @@ -59,7 +60,7 @@ class TencentCloudAPIClient: # Instantiate a client option, optional, can be skipped if no special requirements self.clientProfile = ClientProfile() self.clientProfile.httpProfile = self.httpProfile - + # Instantiate the client object for the product to be requested, clientProfile is optional self.client = lkeap_client.LkeapClient(self.cred, region, self.clientProfile) @@ -68,14 +69,14 @@ class TencentCloudAPIClient: try: # Instantiate a request object, each interface corresponds to a request object req = models.ReconstructDocumentSSERequest() - + # Build request parameters params = { "FileType": file_type, "FileStartPageNumber": file_start_page, "FileEndPageNumber": file_end_page, } - + # According to Tencent Cloud API documentation, either FileUrl or FileBase64 parameter must be provided, if both are provided only FileUrl will be used if file_url: params["FileUrl"] = file_url @@ -94,7 +95,7 @@ class TencentCloudAPIClient: # The returned resp is an instance of ReconstructDocumentSSEResponse, corresponding to the request object resp = self.client.ReconstructDocumentSSE(req) parser_result = {} - + # Output json format string response if isinstance(resp, types.GeneratorType): # Streaming response logging.info("[TCADP] Detected streaming response") @@ -104,7 +105,7 @@ class TencentCloudAPIClient: try: data_dict = json.loads(event['data']) logging.info(f"[TCADP] Parsed data: {data_dict}") - + if data_dict.get('Progress') == "100": parser_result = data_dict logging.info("[TCADP] Document parsing completed!") @@ -118,14 +119,14 @@ class TencentCloudAPIClient: logging.warning("[TCADP] Failed parsing pages:") for page in failed_pages: logging.warning(f"[TCADP] Page number: {page.get('PageNumber')}, Error: {page.get('ErrorMsg')}") - + # Check if there is a download link download_url = data_dict.get("DocumentRecognizeResultUrl") if download_url: logging.info(f"[TCADP] Got download link: {download_url}") else: logging.warning("[TCADP] No download link obtained") - + break # Found final result, exit loop else: # Print progress information @@ -168,9 +169,6 @@ class TencentCloudAPIClient: return None try: - response = requests.get(download_url) - response.raise_for_status() - # Ensure output directory exists os.makedirs(output_dir, exist_ok=True) @@ -179,29 +177,36 @@ class TencentCloudAPIClient: filename = f"tcadp_result_{timestamp}.zip" file_path = os.path.join(output_dir, filename) - # Save file - with open(file_path, "wb") as f: - f.write(response.content) + with requests.get(download_url, stream=True) as response: + response.raise_for_status() + with open(file_path, "wb") as f: + response.raw.decode_content = True + shutil.copyfileobj(response.raw, f) logging.info(f"[TCADP] Document parsing result downloaded to: {os.path.basename(file_path)}") return file_path - except requests.exceptions.RequestException as e: + except Exception as e: logging.error(f"[TCADP] Failed to download file: {e}") + try: + if "file_path" in locals() and os.path.exists(file_path): + os.unlink(file_path) + except Exception: + pass return None class TCADPParser(RAGFlowPdfParser): - def __init__(self, secret_id: str = None, secret_key: str = None, region: str = "ap-guangzhou", + def __init__(self, secret_id: str = None, secret_key: str = None, region: str = "ap-guangzhou", table_result_type: str = None, markdown_image_response_type: str = None): super().__init__() - + # First initialize logger self.logger = logging.getLogger(self.__class__.__name__) - + # Log received parameters self.logger.info(f"[TCADP] Initializing with parameters - table_result_type: {table_result_type}, markdown_image_response_type: {markdown_image_response_type}") - + # Priority: read configuration from RAGFlow configuration system (service_conf.yaml) try: tcadp_parser = get_base_config("tcadp_config", {}) @@ -212,7 +217,7 @@ class TCADPParser(RAGFlowPdfParser): # Set table_result_type and markdown_image_response_type from config or parameters self.table_result_type = table_result_type if table_result_type is not None else tcadp_parser.get("table_result_type", "1") self.markdown_image_response_type = markdown_image_response_type if markdown_image_response_type is not None else tcadp_parser.get("markdown_image_response_type", "1") - + else: self.logger.error("[TCADP] Please configure tcadp_config in service_conf.yaml first") # If config file is empty, use provided parameters or defaults @@ -237,6 +242,10 @@ class TCADPParser(RAGFlowPdfParser): if not self.secret_id or not self.secret_key: raise ValueError("[TCADP] Please set Tencent Cloud API keys, configure tcadp_config in service_conf.yaml") + @staticmethod + def _is_zipinfo_symlink(member: zipfile.ZipInfo) -> bool: + return (member.external_attr >> 16) & 0o170000 == 0o120000 + def check_installation(self) -> bool: """Check if Tencent Cloud API configuration is correct""" try: @@ -255,7 +264,7 @@ class TCADPParser(RAGFlowPdfParser): def _file_to_base64(self, file_path: str, binary: bytes = None) -> str: """Convert file to Base64 format""" - + if binary: # If binary data is directly available, convert directly return base64.b64encode(binary).decode('utf-8') @@ -271,23 +280,34 @@ class TCADPParser(RAGFlowPdfParser): try: with zipfile.ZipFile(zip_path, "r") as zip_file: - # Find JSON result files - json_files = [f for f in zip_file.namelist() if f.endswith(".json")] + members = zip_file.infolist() + for member in members: + name = member.filename.replace("\\", "/") + if member.is_dir(): + continue + if member.flag_bits & 0x1: + raise RuntimeError(f"[TCADP] Encrypted zip entry not supported: {member.filename}") + if self._is_zipinfo_symlink(member): + raise RuntimeError(f"[TCADP] Symlink zip entry not supported: {member.filename}") + if name.startswith("/") or name.startswith("//") or re.match(r"^[A-Za-z]:", name): + raise RuntimeError(f"[TCADP] Unsafe zip path (absolute): {member.filename}") + parts = [p for p in name.split("/") if p not in ("", ".")] + if any(p == ".." for p in parts): + raise RuntimeError(f"[TCADP] Unsafe zip path (traversal): {member.filename}") - for json_file in json_files: - with zip_file.open(json_file) as f: - data = json.load(f) - if isinstance(data, list): - results.extend(data) + if not (name.endswith(".json") or name.endswith(".md")): + continue + + with zip_file.open(member) as f: + if name.endswith(".json"): + data = json.load(f) + if isinstance(data, list): + results.extend(data) + else: + results.append(data) else: - results.append(data) - - # Find Markdown files - md_files = [f for f in zip_file.namelist() if f.endswith(".md")] - for md_file in md_files: - with zip_file.open(md_file) as f: - content = f.read().decode("utf-8") - results.append({"type": "text", "content": content, "file": md_file}) + content = f.read().decode("utf-8") + results.append({"type": "text", "content": content, "file": name}) except Exception as e: self.logger.error(f"[TCADP] Failed to extract ZIP file content: {e}") @@ -395,7 +415,7 @@ class TCADPParser(RAGFlowPdfParser): # Convert file to Base64 format if callback: callback(0.2, "[TCADP] Converting file to Base64 format") - + file_base64 = self._file_to_base64(file_path, binary) if callback: callback(0.25, f"[TCADP] File converted to Base64, size: {len(file_base64)} characters") @@ -420,23 +440,23 @@ class TCADPParser(RAGFlowPdfParser): "TableResultType": self.table_result_type, "MarkdownImageResponseType": self.markdown_image_response_type } - + self.logger.info(f"[TCADP] API request config - TableResultType: {self.table_result_type}, MarkdownImageResponseType: {self.markdown_image_response_type}") result = client.reconstruct_document_sse( - file_type=file_type, - file_base64=file_base64, - file_start_page=file_start_page, - file_end_page=file_end_page, + file_type=file_type, + file_base64=file_base64, + file_start_page=file_start_page, + file_end_page=file_end_page, config=config ) - + if result: self.logger.info(f"[TCADP] Attempt {attempt + 1} successful") break else: self.logger.warning(f"[TCADP] Attempt {attempt + 1} failed, result is None") - + except Exception as e: self.logger.error(f"[TCADP] Attempt {attempt + 1} exception: {e}") if attempt == max_retries - 1: