refactor: move workflow package to dify_graph (#32844)

2026-05-31 22:26:19 +08:00 · 2026-03-02 18:42:30 +08:00
parent 9c33923985
commit c917838f9c
613 changed files with 2008 additions and 2012 deletions
--- a/api/dify_graph/nodes/document_extractor/node.py
+++ b/api/dify_graph/nodes/document_extractor/node.py
@ -0,0 +1,729 @@
+import csv
+import io
+import json
+import logging
+import os
+import tempfile
+from collections.abc import Mapping, Sequence
+from typing import TYPE_CHECKING, Any
+
+import charset_normalizer
+import docx
+import pandas as pd
+import pypandoc
+import pypdfium2
+import webvtt
+import yaml
+from docx.document import Document
+from docx.oxml.table import CT_Tbl
+from docx.oxml.text.paragraph import CT_P
+from docx.table import Table
+from docx.text.paragraph import Paragraph
+
+from core.helper import ssrf_proxy
+from dify_graph.enums import NodeType, WorkflowNodeExecutionStatus
+from dify_graph.file import File, FileTransferMethod, file_manager
+from dify_graph.node_events import NodeRunResult
+from dify_graph.nodes.base.node import Node
+from dify_graph.variables import ArrayFileSegment
+from dify_graph.variables.segments import ArrayStringSegment, FileSegment
+
+from .entities import DocumentExtractorNodeData, UnstructuredApiConfig
+from .exc import DocumentExtractorError, FileDownloadError, TextExtractionError, UnsupportedFileTypeError
+
+logger = logging.getLogger(__name__)
+
+if TYPE_CHECKING:
+    from dify_graph.entities import GraphInitParams
+    from dify_graph.runtime import GraphRuntimeState
+
+
+class DocumentExtractorNode(Node[DocumentExtractorNodeData]):
+    """
+    Extracts text content from various file types.
+    Supports plain text, PDF, and DOC/DOCX files.
+    """
+
+    node_type = NodeType.DOCUMENT_EXTRACTOR
+
+    @classmethod
+    def version(cls) -> str:
+        return "1"
+
+    def __init__(
+        self,
+        id: str,
+        config: Mapping[str, Any],
+        graph_init_params: "GraphInitParams",
+        graph_runtime_state: "GraphRuntimeState",
+        *,
+        unstructured_api_config: UnstructuredApiConfig | None = None,
+    ) -> None:
+        super().__init__(
+            id=id,
+            config=config,
+            graph_init_params=graph_init_params,
+            graph_runtime_state=graph_runtime_state,
+        )
+        self._unstructured_api_config = unstructured_api_config or UnstructuredApiConfig()
+
+    def _run(self):
+        variable_selector = self.node_data.variable_selector
+        variable = self.graph_runtime_state.variable_pool.get(variable_selector)
+
+        if variable is None:
+            error_message = f"File variable not found for selector: {variable_selector}"
+            return NodeRunResult(status=WorkflowNodeExecutionStatus.FAILED, error=error_message)
+        if variable.value and not isinstance(variable, ArrayFileSegment | FileSegment):
+            error_message = f"Variable {variable_selector} is not an ArrayFileSegment"
+            return NodeRunResult(status=WorkflowNodeExecutionStatus.FAILED, error=error_message)
+
+        value = variable.value
+        inputs = {"variable_selector": variable_selector}
+        process_data = {"documents": value if isinstance(value, list) else [value]}
+
+        try:
+            if isinstance(value, list):
+                extracted_text_list = [
+                    _extract_text_from_file(file, unstructured_api_config=self._unstructured_api_config)
+                    for file in value
+                ]
+                return NodeRunResult(
+                    status=WorkflowNodeExecutionStatus.SUCCEEDED,
+                    inputs=inputs,
+                    process_data=process_data,
+                    outputs={"text": ArrayStringSegment(value=extracted_text_list)},
+                )
+            elif isinstance(value, File):
+                extracted_text = _extract_text_from_file(value, unstructured_api_config=self._unstructured_api_config)
+                return NodeRunResult(
+                    status=WorkflowNodeExecutionStatus.SUCCEEDED,
+                    inputs=inputs,
+                    process_data=process_data,
+                    outputs={"text": extracted_text},
+                )
+            else:
+                raise DocumentExtractorError(f"Unsupported variable type: {type(value)}")
+        except DocumentExtractorError as e:
+            return NodeRunResult(
+                status=WorkflowNodeExecutionStatus.FAILED,
+                error=str(e),
+                inputs=inputs,
+                process_data=process_data,
+            )
+
+    @classmethod
+    def _extract_variable_selector_to_variable_mapping(
+        cls,
+        *,
+        graph_config: Mapping[str, Any],
+        node_id: str,
+        node_data: Mapping[str, Any],
+    ) -> Mapping[str, Sequence[str]]:
+        # Create typed NodeData from dict
+        typed_node_data = DocumentExtractorNodeData.model_validate(node_data)
+
+        return {node_id + ".files": typed_node_data.variable_selector}
+
+
+def _extract_text_by_mime_type(
+    *,
+    file_content: bytes,
+    mime_type: str,
+    unstructured_api_config: UnstructuredApiConfig,
+) -> str:
+    """Extract text from a file based on its MIME type."""
+    match mime_type:
+        case "text/plain" | "text/html" | "text/htm" | "text/markdown" | "text/xml":
+            return _extract_text_from_plain_text(file_content)
+        case "application/pdf":
+            return _extract_text_from_pdf(file_content)
+        case "application/msword":
+            return _extract_text_from_doc(file_content, unstructured_api_config=unstructured_api_config)
+        case "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
+            return _extract_text_from_docx(file_content)
+        case "text/csv":
+            return _extract_text_from_csv(file_content)
+        case "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" | "application/vnd.ms-excel":
+            return _extract_text_from_excel(file_content)
+        case "application/vnd.ms-powerpoint":
+            return _extract_text_from_ppt(file_content, unstructured_api_config=unstructured_api_config)
+        case "application/vnd.openxmlformats-officedocument.presentationml.presentation":
+            return _extract_text_from_pptx(file_content, unstructured_api_config=unstructured_api_config)
+        case "application/epub+zip":
+            return _extract_text_from_epub(file_content, unstructured_api_config=unstructured_api_config)
+        case "message/rfc822":
+            return _extract_text_from_eml(file_content)
+        case "application/vnd.ms-outlook":
+            return _extract_text_from_msg(file_content)
+        case "application/json":
+            return _extract_text_from_json(file_content)
+        case "application/x-yaml" | "text/yaml":
+            return _extract_text_from_yaml(file_content)
+        case "text/vtt":
+            return _extract_text_from_vtt(file_content)
+        case "text/properties":
+            return _extract_text_from_properties(file_content)
+        case _:
+            raise UnsupportedFileTypeError(f"Unsupported MIME type: {mime_type}")
+
+
+def _extract_text_by_file_extension(
+    *,
+    file_content: bytes,
+    file_extension: str,
+    unstructured_api_config: UnstructuredApiConfig,
+) -> str:
+    """Extract text from a file based on its file extension."""
+    match file_extension:
+        case (
+            ".txt"
+            | ".markdown"
+            | ".md"
+            | ".mdx"
+            | ".html"
+            | ".htm"
+            | ".xml"
+            | ".c"
+            | ".h"
+            | ".cpp"
+            | ".hpp"
+            | ".cc"
+            | ".cxx"
+            | ".c++"
+            | ".py"
+            | ".js"
+            | ".ts"
+            | ".jsx"
+            | ".tsx"
+            | ".java"
+            | ".php"
+            | ".rb"
+            | ".go"
+            | ".rs"
+            | ".swift"
+            | ".kt"
+            | ".scala"
+            | ".sh"
+            | ".bash"
+            | ".bat"
+            | ".ps1"
+            | ".sql"
+            | ".r"
+            | ".m"
+            | ".pl"
+            | ".lua"
+            | ".vim"
+            | ".asm"
+            | ".s"
+            | ".css"
+            | ".scss"
+            | ".less"
+            | ".sass"
+            | ".ini"
+            | ".cfg"
+            | ".conf"
+            | ".toml"
+            | ".env"
+            | ".log"
+            | ".vtt"
+        ):
+            return _extract_text_from_plain_text(file_content)
+        case ".json":
+            return _extract_text_from_json(file_content)
+        case ".yaml" | ".yml":
+            return _extract_text_from_yaml(file_content)
+        case ".pdf":
+            return _extract_text_from_pdf(file_content)
+        case ".doc":
+            return _extract_text_from_doc(file_content, unstructured_api_config=unstructured_api_config)
+        case ".docx":
+            return _extract_text_from_docx(file_content)
+        case ".csv":
+            return _extract_text_from_csv(file_content)
+        case ".xls" | ".xlsx":
+            return _extract_text_from_excel(file_content)
+        case ".ppt":
+            return _extract_text_from_ppt(file_content, unstructured_api_config=unstructured_api_config)
+        case ".pptx":
+            return _extract_text_from_pptx(file_content, unstructured_api_config=unstructured_api_config)
+        case ".epub":
+            return _extract_text_from_epub(file_content, unstructured_api_config=unstructured_api_config)
+        case ".eml":
+            return _extract_text_from_eml(file_content)
+        case ".msg":
+            return _extract_text_from_msg(file_content)
+        case ".properties":
+            return _extract_text_from_properties(file_content)
+        case _:
+            raise UnsupportedFileTypeError(f"Unsupported Extension Type: {file_extension}")
+
+
+def _extract_text_from_plain_text(file_content: bytes) -> str:
+    try:
+        # Detect encoding using charset_normalizer
+        result = charset_normalizer.from_bytes(file_content, cp_isolation=["utf_8", "latin_1", "cp1252"]).best()
+        if result:
+            encoding = result.encoding
+        else:
+            encoding = "utf-8"
+
+        # Fallback to utf-8 if detection fails
+        if not encoding:
+            encoding = "utf-8"
+
+        return file_content.decode(encoding, errors="ignore")
+    except (UnicodeDecodeError, LookupError) as e:
+        # If decoding fails, try with utf-8 as last resort
+        try:
+            return file_content.decode("utf-8", errors="ignore")
+        except UnicodeDecodeError:
+            raise TextExtractionError(f"Failed to decode plain text file: {e}") from e
+
+
+def _extract_text_from_json(file_content: bytes) -> str:
+    try:
+        # Detect encoding using charset_normalizer
+        result = charset_normalizer.from_bytes(file_content).best()
+        if result:
+            encoding = result.encoding
+        else:
+            encoding = "utf-8"
+
+        # Fallback to utf-8 if detection fails
+        if not encoding:
+            encoding = "utf-8"
+
+        json_data = json.loads(file_content.decode(encoding, errors="ignore"))
+        return json.dumps(json_data, indent=2, ensure_ascii=False)
+    except (UnicodeDecodeError, LookupError, json.JSONDecodeError) as e:
+        # If decoding fails, try with utf-8 as last resort
+        try:
+            json_data = json.loads(file_content.decode("utf-8", errors="ignore"))
+            return json.dumps(json_data, indent=2, ensure_ascii=False)
+        except (UnicodeDecodeError, json.JSONDecodeError):
+            raise TextExtractionError(f"Failed to decode or parse JSON file: {e}") from e
+
+
+def _extract_text_from_yaml(file_content: bytes) -> str:
+    """Extract the content from yaml file"""
+    try:
+        # Detect encoding using charset_normalizer
+        result = charset_normalizer.from_bytes(file_content).best()
+        if result:
+            encoding = result.encoding
+        else:
+            encoding = "utf-8"
+
+        # Fallback to utf-8 if detection fails
+        if not encoding:
+            encoding = "utf-8"
+
+        yaml_data = yaml.safe_load_all(file_content.decode(encoding, errors="ignore"))
+        return yaml.dump_all(yaml_data, allow_unicode=True, sort_keys=False)
+    except (UnicodeDecodeError, LookupError, yaml.YAMLError) as e:
+        # If decoding fails, try with utf-8 as last resort
+        try:
+            yaml_data = yaml.safe_load_all(file_content.decode("utf-8", errors="ignore"))
+            return yaml.dump_all(yaml_data, allow_unicode=True, sort_keys=False)
+        except (UnicodeDecodeError, yaml.YAMLError):
+            raise TextExtractionError(f"Failed to decode or parse YAML file: {e}") from e
+
+
+def _extract_text_from_pdf(file_content: bytes) -> str:
+    try:
+        pdf_file = io.BytesIO(file_content)
+        pdf_document = pypdfium2.PdfDocument(pdf_file, autoclose=True)
+        text = ""
+        for page in pdf_document:
+            text_page = page.get_textpage()
+            text += text_page.get_text_range()
+            text_page.close()
+            page.close()
+        return text
+    except Exception as e:
+        raise TextExtractionError(f"Failed to extract text from PDF: {str(e)}") from e
+
+
+def _extract_text_from_doc(file_content: bytes, *, unstructured_api_config: UnstructuredApiConfig) -> str:
+    """
+    Extract text from a DOC file.
+    """
+    from unstructured.partition.api import partition_via_api
+
+    if not unstructured_api_config.api_url:
+        raise TextExtractionError("Unstructured API URL is not configured for DOC file processing.")
+    api_key = unstructured_api_config.api_key or ""
+
+    try:
+        with tempfile.NamedTemporaryFile(suffix=".doc", delete=False) as temp_file:
+            temp_file.write(file_content)
+            temp_file.flush()
+            with open(temp_file.name, "rb") as file:
+                elements = partition_via_api(
+                    file=file,
+                    metadata_filename=temp_file.name,
+                    api_url=unstructured_api_config.api_url,
+                    api_key=api_key,
+                )
+            os.unlink(temp_file.name)
+        return "\n".join([getattr(element, "text", "") for element in elements])
+    except Exception as e:
+        raise TextExtractionError(f"Failed to extract text from DOC: {str(e)}") from e
+
+
+def parser_docx_part(block, doc: Document, content_items, i):
+    if isinstance(block, CT_P):
+        content_items.append((i, "paragraph", Paragraph(block, doc)))
+    elif isinstance(block, CT_Tbl):
+        content_items.append((i, "table", Table(block, doc)))
+
+
+def _extract_text_from_docx(file_content: bytes) -> str:
+    """
+    Extract text from a DOCX file.
+    For now support only paragraph and table add more if needed
+    """
+    try:
+        doc_file = io.BytesIO(file_content)
+        doc = docx.Document(doc_file)
+        text = []
+
+        # Keep track of paragraph and table positions
+        content_items: list[tuple[int, str, Table | Paragraph]] = []
+
+        it = iter(doc.element.body)
+        part = next(it, None)
+        i = 0
+        while part is not None:
+            parser_docx_part(part, doc, content_items, i)
+            i = i + 1
+            part = next(it, None)
+
+        # Process sorted content
+        for _, item_type, item in content_items:
+            if item_type == "paragraph":
+                if isinstance(item, Table):
+                    continue
+                text.append(item.text)
+            elif item_type == "table":
+                # Process tables
+                if not isinstance(item, Table):
+                    continue
+                try:
+                    # Check if any cell in the table has text
+                    has_content = False
+                    for row in item.rows:
+                        if any(cell.text.strip() for cell in row.cells):
+                            has_content = True
+                            break
+
+                    if has_content:
+                        cell_texts = [cell.text.replace("\n", "<br>") for cell in item.rows[0].cells]
+                        markdown_table = f"| {' | '.join(cell_texts)} |\n"
+                        markdown_table += f"| {' | '.join(['---'] * len(item.rows[0].cells))} |\n"
+
+                        for row in item.rows[1:]:
+                            # Replace newlines with <br> in each cell
+                            row_cells = [cell.text.replace("\n", "<br>") for cell in row.cells]
+                            markdown_table += "| " + " | ".join(row_cells) + " |\n"
+
+                        text.append(markdown_table)
+                except Exception as e:
+                    logger.warning("Failed to extract table from DOC: %s", e)
+                    continue
+
+        return "\n".join(text)
+
+    except Exception as e:
+        raise TextExtractionError(f"Failed to extract text from DOCX: {str(e)}") from e
+
+
+def _download_file_content(file: File) -> bytes:
+    """Download the content of a file based on its transfer method."""
+    try:
+        if file.transfer_method == FileTransferMethod.REMOTE_URL:
+            if file.remote_url is None:
+                raise FileDownloadError("Missing URL for remote file")
+            response = ssrf_proxy.get(file.remote_url)
+            response.raise_for_status()
+            return response.content
+        else:
+            return file_manager.download(file)
+    except Exception as e:
+        raise FileDownloadError(f"Error downloading file: {str(e)}") from e
+
+
+def _extract_text_from_file(file: File, *, unstructured_api_config: UnstructuredApiConfig) -> str:
+    file_content = _download_file_content(file)
+    if file.extension:
+        extracted_text = _extract_text_by_file_extension(
+            file_content=file_content,
+            file_extension=file.extension,
+            unstructured_api_config=unstructured_api_config,
+        )
+    elif file.mime_type:
+        extracted_text = _extract_text_by_mime_type(
+            file_content=file_content,
+            mime_type=file.mime_type,
+            unstructured_api_config=unstructured_api_config,
+        )
+    else:
+        raise UnsupportedFileTypeError("Unable to determine file type: MIME type or file extension is missing")
+    return extracted_text
+
+
+def _extract_text_from_csv(file_content: bytes) -> str:
+    try:
+        # Detect encoding using charset_normalizer
+        result = charset_normalizer.from_bytes(file_content).best()
+        if result:
+            encoding = result.encoding
+        else:
+            encoding = "utf-8"
+
+        # Fallback to utf-8 if detection fails
+        if not encoding:
+            encoding = "utf-8"
+
+        try:
+            csv_file = io.StringIO(file_content.decode(encoding, errors="ignore"))
+        except (UnicodeDecodeError, LookupError):
+            # If decoding fails, try with utf-8 as last resort
+            csv_file = io.StringIO(file_content.decode("utf-8", errors="ignore"))
+
+        csv_reader = csv.reader(csv_file)
+        rows = list(csv_reader)
+
+        if not rows:
+            return ""
+
+        # Combine multi-line text in the header row
+        header_row = [cell.replace("\n", " ").replace("\r", "") for cell in rows[0]]
+
+        # Create Markdown table
+        markdown_table = "| " + " | ".join(header_row) + " |\n"
+        markdown_table += "| " + " | ".join(["-" * len(col) for col in rows[0]]) + " |\n"
+
+        # Process each data row and combine multi-line text in each cell
+        for row in rows[1:]:
+            processed_row = [cell.replace("\n", " ").replace("\r", "") for cell in row]
+            markdown_table += "| " + " | ".join(processed_row) + " |\n"
+
+        return markdown_table
+    except Exception as e:
+        raise TextExtractionError(f"Failed to extract text from CSV: {str(e)}") from e
+
+
+def _extract_text_from_excel(file_content: bytes) -> str:
+    """Extract text from an Excel file using pandas."""
+
+    def _construct_markdown_table(df: pd.DataFrame) -> str:
+        """Manually construct a Markdown table from a DataFrame."""
+        # Construct the header row
+        header_row = "| " + " | ".join(df.columns) + " |"
+
+        # Construct the separator row
+        separator_row = "| " + " | ".join(["-" * len(col) for col in df.columns]) + " |"
+
+        # Construct the data rows
+        data_rows = []
+        for _, row in df.iterrows():
+            data_row = "| " + " | ".join(map(str, row)) + " |"
+            data_rows.append(data_row)
+
+        # Combine all rows into a single string
+        markdown_table = "\n".join([header_row, separator_row] + data_rows)
+        return markdown_table
+
+    try:
+        excel_file = pd.ExcelFile(io.BytesIO(file_content))
+        markdown_table = ""
+        for sheet_name in excel_file.sheet_names:
+            try:
+                df = excel_file.parse(sheet_name=sheet_name)
+                df.dropna(how="all", inplace=True)
+
+                # Combine multi-line text in each cell into a single line
+                df = df.map(lambda x: " ".join(str(x).splitlines()) if isinstance(x, str) else x)
+
+                # Combine multi-line text in column names into a single line
+                df.columns = pd.Index([" ".join(str(col).splitlines()) for col in df.columns])
+
+                # Manually construct the Markdown table
+                markdown_table += _construct_markdown_table(df) + "\n\n"
+            except Exception:
+                continue
+        return markdown_table
+    except Exception as e:
+        raise TextExtractionError(f"Failed to extract text from Excel file: {str(e)}") from e
+
+
+def _extract_text_from_ppt(file_content: bytes, *, unstructured_api_config: UnstructuredApiConfig) -> str:
+    from unstructured.partition.api import partition_via_api
+    from unstructured.partition.ppt import partition_ppt
+
+    api_key = unstructured_api_config.api_key or ""
+
+    try:
+        if unstructured_api_config.api_url:
+            with tempfile.NamedTemporaryFile(suffix=".ppt", delete=False) as temp_file:
+                temp_file.write(file_content)
+                temp_file.flush()
+                with open(temp_file.name, "rb") as file:
+                    elements = partition_via_api(
+                        file=file,
+                        metadata_filename=temp_file.name,
+                        api_url=unstructured_api_config.api_url,
+                        api_key=api_key,
+                    )
+                os.unlink(temp_file.name)
+        else:
+            with io.BytesIO(file_content) as file:
+                elements = partition_ppt(file=file)
+        return "\n".join([getattr(element, "text", "") for element in elements])
+
+    except Exception as e:
+        raise TextExtractionError(f"Failed to extract text from PPTX: {str(e)}") from e
+
+
+def _extract_text_from_pptx(file_content: bytes, *, unstructured_api_config: UnstructuredApiConfig) -> str:
+    from unstructured.partition.api import partition_via_api
+    from unstructured.partition.pptx import partition_pptx
+
+    api_key = unstructured_api_config.api_key or ""
+
+    try:
+        if unstructured_api_config.api_url:
+            with tempfile.NamedTemporaryFile(suffix=".pptx", delete=False) as temp_file:
+                temp_file.write(file_content)
+                temp_file.flush()
+                with open(temp_file.name, "rb") as file:
+                    elements = partition_via_api(
+                        file=file,
+                        metadata_filename=temp_file.name,
+                        api_url=unstructured_api_config.api_url,
+                        api_key=api_key,
+                    )
+                os.unlink(temp_file.name)
+        else:
+            with io.BytesIO(file_content) as file:
+                elements = partition_pptx(file=file)
+        return "\n".join([getattr(element, "text", "") for element in elements])
+    except Exception as e:
+        raise TextExtractionError(f"Failed to extract text from PPTX: {str(e)}") from e
+
+
+def _extract_text_from_epub(file_content: bytes, *, unstructured_api_config: UnstructuredApiConfig) -> str:
+    from unstructured.partition.api import partition_via_api
+    from unstructured.partition.epub import partition_epub
+
+    api_key = unstructured_api_config.api_key or ""
+
+    try:
+        if unstructured_api_config.api_url:
+            with tempfile.NamedTemporaryFile(suffix=".epub", delete=False) as temp_file:
+                temp_file.write(file_content)
+                temp_file.flush()
+                with open(temp_file.name, "rb") as file:
+                    elements = partition_via_api(
+                        file=file,
+                        metadata_filename=temp_file.name,
+                        api_url=unstructured_api_config.api_url,
+                        api_key=api_key,
+                    )
+                os.unlink(temp_file.name)
+        else:
+            pypandoc.download_pandoc()
+            with io.BytesIO(file_content) as file:
+                elements = partition_epub(file=file)
+        return "\n".join([str(element) for element in elements])
+    except Exception as e:
+        raise TextExtractionError(f"Failed to extract text from EPUB: {str(e)}") from e
+
+
+def _extract_text_from_eml(file_content: bytes) -> str:
+    from unstructured.partition.email import partition_email
+
+    try:
+        with io.BytesIO(file_content) as file:
+            elements = partition_email(file=file)
+        return "\n".join([str(element) for element in elements])
+    except Exception as e:
+        raise TextExtractionError(f"Failed to extract text from EML: {str(e)}") from e
+
+
+def _extract_text_from_msg(file_content: bytes) -> str:
+    from unstructured.partition.msg import partition_msg
+
+    try:
+        with io.BytesIO(file_content) as file:
+            elements = partition_msg(file=file)
+        return "\n".join([str(element) for element in elements])
+    except Exception as e:
+        raise TextExtractionError(f"Failed to extract text from MSG: {str(e)}") from e
+
+
+def _extract_text_from_vtt(vtt_bytes: bytes) -> str:
+    text = _extract_text_from_plain_text(vtt_bytes)
+
+    # remove bom
+    text = text.lstrip("\ufeff")
+
+    raw_results = []
+    for caption in webvtt.from_string(text):
+        raw_results.append((caption.voice, caption.text))
+
+    # Merge consecutive utterances by the same speaker
+    merged_results = []
+    if raw_results:
+        current_speaker, current_text = raw_results[0]
+
+        for i in range(1, len(raw_results)):
+            spk, txt = raw_results[i]
+            if spk is None:
+                merged_results.append((None, current_text))
+                continue
+
+            if spk == current_speaker:
+                # If it is the same speaker, merge the utterances (joined by space)
+                current_text += " " + txt
+            else:
+                # If the speaker changes, register the utterance so far and move on
+                merged_results.append((current_speaker, current_text))
+                current_speaker, current_text = spk, txt
+
+        # Add the last element
+        merged_results.append((current_speaker, current_text))
+    else:
+        merged_results = raw_results
+
+    # Return the result in the specified format: Speaker "text" style
+    formatted = [f'{spk or ""} "{txt}"' for spk, txt in merged_results]
+    return "\n".join(formatted)
+
+
+def _extract_text_from_properties(file_content: bytes) -> str:
+    try:
+        text = _extract_text_from_plain_text(file_content)
+        lines = text.splitlines()
+        result = []
+        for line in lines:
+            line = line.strip()
+            # Preserve comments and empty lines
+            if not line or line.startswith("#") or line.startswith("!"):
+                result.append(line)
+                continue
+
+            if "=" in line:
+                key, value = line.split("=", 1)
+            elif ":" in line:
+                key, value = line.split(":", 1)
+            else:
+                key, value = line, ""
+
+            result.append(f"{key.strip()}: {value.strip()}")
+
+        return "\n".join(result)
+    except Exception as e:
+        raise TextExtractionError(f"Failed to extract text from properties file: {str(e)}") from e