mirror of
https://github.com/langgenius/dify.git
synced 2026-03-13 11:07:40 +08:00
refactor: move workflow package to dify_graph (#32844)
This commit is contained in:
729
api/dify_graph/nodes/document_extractor/node.py
Normal file
729
api/dify_graph/nodes/document_extractor/node.py
Normal file
@ -0,0 +1,729 @@
|
||||
import csv
|
||||
import io
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import tempfile
|
||||
from collections.abc import Mapping, Sequence
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
import charset_normalizer
|
||||
import docx
|
||||
import pandas as pd
|
||||
import pypandoc
|
||||
import pypdfium2
|
||||
import webvtt
|
||||
import yaml
|
||||
from docx.document import Document
|
||||
from docx.oxml.table import CT_Tbl
|
||||
from docx.oxml.text.paragraph import CT_P
|
||||
from docx.table import Table
|
||||
from docx.text.paragraph import Paragraph
|
||||
|
||||
from core.helper import ssrf_proxy
|
||||
from dify_graph.enums import NodeType, WorkflowNodeExecutionStatus
|
||||
from dify_graph.file import File, FileTransferMethod, file_manager
|
||||
from dify_graph.node_events import NodeRunResult
|
||||
from dify_graph.nodes.base.node import Node
|
||||
from dify_graph.variables import ArrayFileSegment
|
||||
from dify_graph.variables.segments import ArrayStringSegment, FileSegment
|
||||
|
||||
from .entities import DocumentExtractorNodeData, UnstructuredApiConfig
|
||||
from .exc import DocumentExtractorError, FileDownloadError, TextExtractionError, UnsupportedFileTypeError
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from dify_graph.entities import GraphInitParams
|
||||
from dify_graph.runtime import GraphRuntimeState
|
||||
|
||||
|
||||
class DocumentExtractorNode(Node[DocumentExtractorNodeData]):
|
||||
"""
|
||||
Extracts text content from various file types.
|
||||
Supports plain text, PDF, and DOC/DOCX files.
|
||||
"""
|
||||
|
||||
node_type = NodeType.DOCUMENT_EXTRACTOR
|
||||
|
||||
@classmethod
|
||||
def version(cls) -> str:
|
||||
return "1"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
id: str,
|
||||
config: Mapping[str, Any],
|
||||
graph_init_params: "GraphInitParams",
|
||||
graph_runtime_state: "GraphRuntimeState",
|
||||
*,
|
||||
unstructured_api_config: UnstructuredApiConfig | None = None,
|
||||
) -> None:
|
||||
super().__init__(
|
||||
id=id,
|
||||
config=config,
|
||||
graph_init_params=graph_init_params,
|
||||
graph_runtime_state=graph_runtime_state,
|
||||
)
|
||||
self._unstructured_api_config = unstructured_api_config or UnstructuredApiConfig()
|
||||
|
||||
def _run(self):
|
||||
variable_selector = self.node_data.variable_selector
|
||||
variable = self.graph_runtime_state.variable_pool.get(variable_selector)
|
||||
|
||||
if variable is None:
|
||||
error_message = f"File variable not found for selector: {variable_selector}"
|
||||
return NodeRunResult(status=WorkflowNodeExecutionStatus.FAILED, error=error_message)
|
||||
if variable.value and not isinstance(variable, ArrayFileSegment | FileSegment):
|
||||
error_message = f"Variable {variable_selector} is not an ArrayFileSegment"
|
||||
return NodeRunResult(status=WorkflowNodeExecutionStatus.FAILED, error=error_message)
|
||||
|
||||
value = variable.value
|
||||
inputs = {"variable_selector": variable_selector}
|
||||
process_data = {"documents": value if isinstance(value, list) else [value]}
|
||||
|
||||
try:
|
||||
if isinstance(value, list):
|
||||
extracted_text_list = [
|
||||
_extract_text_from_file(file, unstructured_api_config=self._unstructured_api_config)
|
||||
for file in value
|
||||
]
|
||||
return NodeRunResult(
|
||||
status=WorkflowNodeExecutionStatus.SUCCEEDED,
|
||||
inputs=inputs,
|
||||
process_data=process_data,
|
||||
outputs={"text": ArrayStringSegment(value=extracted_text_list)},
|
||||
)
|
||||
elif isinstance(value, File):
|
||||
extracted_text = _extract_text_from_file(value, unstructured_api_config=self._unstructured_api_config)
|
||||
return NodeRunResult(
|
||||
status=WorkflowNodeExecutionStatus.SUCCEEDED,
|
||||
inputs=inputs,
|
||||
process_data=process_data,
|
||||
outputs={"text": extracted_text},
|
||||
)
|
||||
else:
|
||||
raise DocumentExtractorError(f"Unsupported variable type: {type(value)}")
|
||||
except DocumentExtractorError as e:
|
||||
return NodeRunResult(
|
||||
status=WorkflowNodeExecutionStatus.FAILED,
|
||||
error=str(e),
|
||||
inputs=inputs,
|
||||
process_data=process_data,
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def _extract_variable_selector_to_variable_mapping(
|
||||
cls,
|
||||
*,
|
||||
graph_config: Mapping[str, Any],
|
||||
node_id: str,
|
||||
node_data: Mapping[str, Any],
|
||||
) -> Mapping[str, Sequence[str]]:
|
||||
# Create typed NodeData from dict
|
||||
typed_node_data = DocumentExtractorNodeData.model_validate(node_data)
|
||||
|
||||
return {node_id + ".files": typed_node_data.variable_selector}
|
||||
|
||||
|
||||
def _extract_text_by_mime_type(
|
||||
*,
|
||||
file_content: bytes,
|
||||
mime_type: str,
|
||||
unstructured_api_config: UnstructuredApiConfig,
|
||||
) -> str:
|
||||
"""Extract text from a file based on its MIME type."""
|
||||
match mime_type:
|
||||
case "text/plain" | "text/html" | "text/htm" | "text/markdown" | "text/xml":
|
||||
return _extract_text_from_plain_text(file_content)
|
||||
case "application/pdf":
|
||||
return _extract_text_from_pdf(file_content)
|
||||
case "application/msword":
|
||||
return _extract_text_from_doc(file_content, unstructured_api_config=unstructured_api_config)
|
||||
case "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
|
||||
return _extract_text_from_docx(file_content)
|
||||
case "text/csv":
|
||||
return _extract_text_from_csv(file_content)
|
||||
case "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" | "application/vnd.ms-excel":
|
||||
return _extract_text_from_excel(file_content)
|
||||
case "application/vnd.ms-powerpoint":
|
||||
return _extract_text_from_ppt(file_content, unstructured_api_config=unstructured_api_config)
|
||||
case "application/vnd.openxmlformats-officedocument.presentationml.presentation":
|
||||
return _extract_text_from_pptx(file_content, unstructured_api_config=unstructured_api_config)
|
||||
case "application/epub+zip":
|
||||
return _extract_text_from_epub(file_content, unstructured_api_config=unstructured_api_config)
|
||||
case "message/rfc822":
|
||||
return _extract_text_from_eml(file_content)
|
||||
case "application/vnd.ms-outlook":
|
||||
return _extract_text_from_msg(file_content)
|
||||
case "application/json":
|
||||
return _extract_text_from_json(file_content)
|
||||
case "application/x-yaml" | "text/yaml":
|
||||
return _extract_text_from_yaml(file_content)
|
||||
case "text/vtt":
|
||||
return _extract_text_from_vtt(file_content)
|
||||
case "text/properties":
|
||||
return _extract_text_from_properties(file_content)
|
||||
case _:
|
||||
raise UnsupportedFileTypeError(f"Unsupported MIME type: {mime_type}")
|
||||
|
||||
|
||||
def _extract_text_by_file_extension(
|
||||
*,
|
||||
file_content: bytes,
|
||||
file_extension: str,
|
||||
unstructured_api_config: UnstructuredApiConfig,
|
||||
) -> str:
|
||||
"""Extract text from a file based on its file extension."""
|
||||
match file_extension:
|
||||
case (
|
||||
".txt"
|
||||
| ".markdown"
|
||||
| ".md"
|
||||
| ".mdx"
|
||||
| ".html"
|
||||
| ".htm"
|
||||
| ".xml"
|
||||
| ".c"
|
||||
| ".h"
|
||||
| ".cpp"
|
||||
| ".hpp"
|
||||
| ".cc"
|
||||
| ".cxx"
|
||||
| ".c++"
|
||||
| ".py"
|
||||
| ".js"
|
||||
| ".ts"
|
||||
| ".jsx"
|
||||
| ".tsx"
|
||||
| ".java"
|
||||
| ".php"
|
||||
| ".rb"
|
||||
| ".go"
|
||||
| ".rs"
|
||||
| ".swift"
|
||||
| ".kt"
|
||||
| ".scala"
|
||||
| ".sh"
|
||||
| ".bash"
|
||||
| ".bat"
|
||||
| ".ps1"
|
||||
| ".sql"
|
||||
| ".r"
|
||||
| ".m"
|
||||
| ".pl"
|
||||
| ".lua"
|
||||
| ".vim"
|
||||
| ".asm"
|
||||
| ".s"
|
||||
| ".css"
|
||||
| ".scss"
|
||||
| ".less"
|
||||
| ".sass"
|
||||
| ".ini"
|
||||
| ".cfg"
|
||||
| ".conf"
|
||||
| ".toml"
|
||||
| ".env"
|
||||
| ".log"
|
||||
| ".vtt"
|
||||
):
|
||||
return _extract_text_from_plain_text(file_content)
|
||||
case ".json":
|
||||
return _extract_text_from_json(file_content)
|
||||
case ".yaml" | ".yml":
|
||||
return _extract_text_from_yaml(file_content)
|
||||
case ".pdf":
|
||||
return _extract_text_from_pdf(file_content)
|
||||
case ".doc":
|
||||
return _extract_text_from_doc(file_content, unstructured_api_config=unstructured_api_config)
|
||||
case ".docx":
|
||||
return _extract_text_from_docx(file_content)
|
||||
case ".csv":
|
||||
return _extract_text_from_csv(file_content)
|
||||
case ".xls" | ".xlsx":
|
||||
return _extract_text_from_excel(file_content)
|
||||
case ".ppt":
|
||||
return _extract_text_from_ppt(file_content, unstructured_api_config=unstructured_api_config)
|
||||
case ".pptx":
|
||||
return _extract_text_from_pptx(file_content, unstructured_api_config=unstructured_api_config)
|
||||
case ".epub":
|
||||
return _extract_text_from_epub(file_content, unstructured_api_config=unstructured_api_config)
|
||||
case ".eml":
|
||||
return _extract_text_from_eml(file_content)
|
||||
case ".msg":
|
||||
return _extract_text_from_msg(file_content)
|
||||
case ".properties":
|
||||
return _extract_text_from_properties(file_content)
|
||||
case _:
|
||||
raise UnsupportedFileTypeError(f"Unsupported Extension Type: {file_extension}")
|
||||
|
||||
|
||||
def _extract_text_from_plain_text(file_content: bytes) -> str:
|
||||
try:
|
||||
# Detect encoding using charset_normalizer
|
||||
result = charset_normalizer.from_bytes(file_content, cp_isolation=["utf_8", "latin_1", "cp1252"]).best()
|
||||
if result:
|
||||
encoding = result.encoding
|
||||
else:
|
||||
encoding = "utf-8"
|
||||
|
||||
# Fallback to utf-8 if detection fails
|
||||
if not encoding:
|
||||
encoding = "utf-8"
|
||||
|
||||
return file_content.decode(encoding, errors="ignore")
|
||||
except (UnicodeDecodeError, LookupError) as e:
|
||||
# If decoding fails, try with utf-8 as last resort
|
||||
try:
|
||||
return file_content.decode("utf-8", errors="ignore")
|
||||
except UnicodeDecodeError:
|
||||
raise TextExtractionError(f"Failed to decode plain text file: {e}") from e
|
||||
|
||||
|
||||
def _extract_text_from_json(file_content: bytes) -> str:
|
||||
try:
|
||||
# Detect encoding using charset_normalizer
|
||||
result = charset_normalizer.from_bytes(file_content).best()
|
||||
if result:
|
||||
encoding = result.encoding
|
||||
else:
|
||||
encoding = "utf-8"
|
||||
|
||||
# Fallback to utf-8 if detection fails
|
||||
if not encoding:
|
||||
encoding = "utf-8"
|
||||
|
||||
json_data = json.loads(file_content.decode(encoding, errors="ignore"))
|
||||
return json.dumps(json_data, indent=2, ensure_ascii=False)
|
||||
except (UnicodeDecodeError, LookupError, json.JSONDecodeError) as e:
|
||||
# If decoding fails, try with utf-8 as last resort
|
||||
try:
|
||||
json_data = json.loads(file_content.decode("utf-8", errors="ignore"))
|
||||
return json.dumps(json_data, indent=2, ensure_ascii=False)
|
||||
except (UnicodeDecodeError, json.JSONDecodeError):
|
||||
raise TextExtractionError(f"Failed to decode or parse JSON file: {e}") from e
|
||||
|
||||
|
||||
def _extract_text_from_yaml(file_content: bytes) -> str:
|
||||
"""Extract the content from yaml file"""
|
||||
try:
|
||||
# Detect encoding using charset_normalizer
|
||||
result = charset_normalizer.from_bytes(file_content).best()
|
||||
if result:
|
||||
encoding = result.encoding
|
||||
else:
|
||||
encoding = "utf-8"
|
||||
|
||||
# Fallback to utf-8 if detection fails
|
||||
if not encoding:
|
||||
encoding = "utf-8"
|
||||
|
||||
yaml_data = yaml.safe_load_all(file_content.decode(encoding, errors="ignore"))
|
||||
return yaml.dump_all(yaml_data, allow_unicode=True, sort_keys=False)
|
||||
except (UnicodeDecodeError, LookupError, yaml.YAMLError) as e:
|
||||
# If decoding fails, try with utf-8 as last resort
|
||||
try:
|
||||
yaml_data = yaml.safe_load_all(file_content.decode("utf-8", errors="ignore"))
|
||||
return yaml.dump_all(yaml_data, allow_unicode=True, sort_keys=False)
|
||||
except (UnicodeDecodeError, yaml.YAMLError):
|
||||
raise TextExtractionError(f"Failed to decode or parse YAML file: {e}") from e
|
||||
|
||||
|
||||
def _extract_text_from_pdf(file_content: bytes) -> str:
|
||||
try:
|
||||
pdf_file = io.BytesIO(file_content)
|
||||
pdf_document = pypdfium2.PdfDocument(pdf_file, autoclose=True)
|
||||
text = ""
|
||||
for page in pdf_document:
|
||||
text_page = page.get_textpage()
|
||||
text += text_page.get_text_range()
|
||||
text_page.close()
|
||||
page.close()
|
||||
return text
|
||||
except Exception as e:
|
||||
raise TextExtractionError(f"Failed to extract text from PDF: {str(e)}") from e
|
||||
|
||||
|
||||
def _extract_text_from_doc(file_content: bytes, *, unstructured_api_config: UnstructuredApiConfig) -> str:
|
||||
"""
|
||||
Extract text from a DOC file.
|
||||
"""
|
||||
from unstructured.partition.api import partition_via_api
|
||||
|
||||
if not unstructured_api_config.api_url:
|
||||
raise TextExtractionError("Unstructured API URL is not configured for DOC file processing.")
|
||||
api_key = unstructured_api_config.api_key or ""
|
||||
|
||||
try:
|
||||
with tempfile.NamedTemporaryFile(suffix=".doc", delete=False) as temp_file:
|
||||
temp_file.write(file_content)
|
||||
temp_file.flush()
|
||||
with open(temp_file.name, "rb") as file:
|
||||
elements = partition_via_api(
|
||||
file=file,
|
||||
metadata_filename=temp_file.name,
|
||||
api_url=unstructured_api_config.api_url,
|
||||
api_key=api_key,
|
||||
)
|
||||
os.unlink(temp_file.name)
|
||||
return "\n".join([getattr(element, "text", "") for element in elements])
|
||||
except Exception as e:
|
||||
raise TextExtractionError(f"Failed to extract text from DOC: {str(e)}") from e
|
||||
|
||||
|
||||
def parser_docx_part(block, doc: Document, content_items, i):
|
||||
if isinstance(block, CT_P):
|
||||
content_items.append((i, "paragraph", Paragraph(block, doc)))
|
||||
elif isinstance(block, CT_Tbl):
|
||||
content_items.append((i, "table", Table(block, doc)))
|
||||
|
||||
|
||||
def _extract_text_from_docx(file_content: bytes) -> str:
|
||||
"""
|
||||
Extract text from a DOCX file.
|
||||
For now support only paragraph and table add more if needed
|
||||
"""
|
||||
try:
|
||||
doc_file = io.BytesIO(file_content)
|
||||
doc = docx.Document(doc_file)
|
||||
text = []
|
||||
|
||||
# Keep track of paragraph and table positions
|
||||
content_items: list[tuple[int, str, Table | Paragraph]] = []
|
||||
|
||||
it = iter(doc.element.body)
|
||||
part = next(it, None)
|
||||
i = 0
|
||||
while part is not None:
|
||||
parser_docx_part(part, doc, content_items, i)
|
||||
i = i + 1
|
||||
part = next(it, None)
|
||||
|
||||
# Process sorted content
|
||||
for _, item_type, item in content_items:
|
||||
if item_type == "paragraph":
|
||||
if isinstance(item, Table):
|
||||
continue
|
||||
text.append(item.text)
|
||||
elif item_type == "table":
|
||||
# Process tables
|
||||
if not isinstance(item, Table):
|
||||
continue
|
||||
try:
|
||||
# Check if any cell in the table has text
|
||||
has_content = False
|
||||
for row in item.rows:
|
||||
if any(cell.text.strip() for cell in row.cells):
|
||||
has_content = True
|
||||
break
|
||||
|
||||
if has_content:
|
||||
cell_texts = [cell.text.replace("\n", "<br>") for cell in item.rows[0].cells]
|
||||
markdown_table = f"| {' | '.join(cell_texts)} |\n"
|
||||
markdown_table += f"| {' | '.join(['---'] * len(item.rows[0].cells))} |\n"
|
||||
|
||||
for row in item.rows[1:]:
|
||||
# Replace newlines with <br> in each cell
|
||||
row_cells = [cell.text.replace("\n", "<br>") for cell in row.cells]
|
||||
markdown_table += "| " + " | ".join(row_cells) + " |\n"
|
||||
|
||||
text.append(markdown_table)
|
||||
except Exception as e:
|
||||
logger.warning("Failed to extract table from DOC: %s", e)
|
||||
continue
|
||||
|
||||
return "\n".join(text)
|
||||
|
||||
except Exception as e:
|
||||
raise TextExtractionError(f"Failed to extract text from DOCX: {str(e)}") from e
|
||||
|
||||
|
||||
def _download_file_content(file: File) -> bytes:
|
||||
"""Download the content of a file based on its transfer method."""
|
||||
try:
|
||||
if file.transfer_method == FileTransferMethod.REMOTE_URL:
|
||||
if file.remote_url is None:
|
||||
raise FileDownloadError("Missing URL for remote file")
|
||||
response = ssrf_proxy.get(file.remote_url)
|
||||
response.raise_for_status()
|
||||
return response.content
|
||||
else:
|
||||
return file_manager.download(file)
|
||||
except Exception as e:
|
||||
raise FileDownloadError(f"Error downloading file: {str(e)}") from e
|
||||
|
||||
|
||||
def _extract_text_from_file(file: File, *, unstructured_api_config: UnstructuredApiConfig) -> str:
|
||||
file_content = _download_file_content(file)
|
||||
if file.extension:
|
||||
extracted_text = _extract_text_by_file_extension(
|
||||
file_content=file_content,
|
||||
file_extension=file.extension,
|
||||
unstructured_api_config=unstructured_api_config,
|
||||
)
|
||||
elif file.mime_type:
|
||||
extracted_text = _extract_text_by_mime_type(
|
||||
file_content=file_content,
|
||||
mime_type=file.mime_type,
|
||||
unstructured_api_config=unstructured_api_config,
|
||||
)
|
||||
else:
|
||||
raise UnsupportedFileTypeError("Unable to determine file type: MIME type or file extension is missing")
|
||||
return extracted_text
|
||||
|
||||
|
||||
def _extract_text_from_csv(file_content: bytes) -> str:
|
||||
try:
|
||||
# Detect encoding using charset_normalizer
|
||||
result = charset_normalizer.from_bytes(file_content).best()
|
||||
if result:
|
||||
encoding = result.encoding
|
||||
else:
|
||||
encoding = "utf-8"
|
||||
|
||||
# Fallback to utf-8 if detection fails
|
||||
if not encoding:
|
||||
encoding = "utf-8"
|
||||
|
||||
try:
|
||||
csv_file = io.StringIO(file_content.decode(encoding, errors="ignore"))
|
||||
except (UnicodeDecodeError, LookupError):
|
||||
# If decoding fails, try with utf-8 as last resort
|
||||
csv_file = io.StringIO(file_content.decode("utf-8", errors="ignore"))
|
||||
|
||||
csv_reader = csv.reader(csv_file)
|
||||
rows = list(csv_reader)
|
||||
|
||||
if not rows:
|
||||
return ""
|
||||
|
||||
# Combine multi-line text in the header row
|
||||
header_row = [cell.replace("\n", " ").replace("\r", "") for cell in rows[0]]
|
||||
|
||||
# Create Markdown table
|
||||
markdown_table = "| " + " | ".join(header_row) + " |\n"
|
||||
markdown_table += "| " + " | ".join(["-" * len(col) for col in rows[0]]) + " |\n"
|
||||
|
||||
# Process each data row and combine multi-line text in each cell
|
||||
for row in rows[1:]:
|
||||
processed_row = [cell.replace("\n", " ").replace("\r", "") for cell in row]
|
||||
markdown_table += "| " + " | ".join(processed_row) + " |\n"
|
||||
|
||||
return markdown_table
|
||||
except Exception as e:
|
||||
raise TextExtractionError(f"Failed to extract text from CSV: {str(e)}") from e
|
||||
|
||||
|
||||
def _extract_text_from_excel(file_content: bytes) -> str:
|
||||
"""Extract text from an Excel file using pandas."""
|
||||
|
||||
def _construct_markdown_table(df: pd.DataFrame) -> str:
|
||||
"""Manually construct a Markdown table from a DataFrame."""
|
||||
# Construct the header row
|
||||
header_row = "| " + " | ".join(df.columns) + " |"
|
||||
|
||||
# Construct the separator row
|
||||
separator_row = "| " + " | ".join(["-" * len(col) for col in df.columns]) + " |"
|
||||
|
||||
# Construct the data rows
|
||||
data_rows = []
|
||||
for _, row in df.iterrows():
|
||||
data_row = "| " + " | ".join(map(str, row)) + " |"
|
||||
data_rows.append(data_row)
|
||||
|
||||
# Combine all rows into a single string
|
||||
markdown_table = "\n".join([header_row, separator_row] + data_rows)
|
||||
return markdown_table
|
||||
|
||||
try:
|
||||
excel_file = pd.ExcelFile(io.BytesIO(file_content))
|
||||
markdown_table = ""
|
||||
for sheet_name in excel_file.sheet_names:
|
||||
try:
|
||||
df = excel_file.parse(sheet_name=sheet_name)
|
||||
df.dropna(how="all", inplace=True)
|
||||
|
||||
# Combine multi-line text in each cell into a single line
|
||||
df = df.map(lambda x: " ".join(str(x).splitlines()) if isinstance(x, str) else x)
|
||||
|
||||
# Combine multi-line text in column names into a single line
|
||||
df.columns = pd.Index([" ".join(str(col).splitlines()) for col in df.columns])
|
||||
|
||||
# Manually construct the Markdown table
|
||||
markdown_table += _construct_markdown_table(df) + "\n\n"
|
||||
except Exception:
|
||||
continue
|
||||
return markdown_table
|
||||
except Exception as e:
|
||||
raise TextExtractionError(f"Failed to extract text from Excel file: {str(e)}") from e
|
||||
|
||||
|
||||
def _extract_text_from_ppt(file_content: bytes, *, unstructured_api_config: UnstructuredApiConfig) -> str:
|
||||
from unstructured.partition.api import partition_via_api
|
||||
from unstructured.partition.ppt import partition_ppt
|
||||
|
||||
api_key = unstructured_api_config.api_key or ""
|
||||
|
||||
try:
|
||||
if unstructured_api_config.api_url:
|
||||
with tempfile.NamedTemporaryFile(suffix=".ppt", delete=False) as temp_file:
|
||||
temp_file.write(file_content)
|
||||
temp_file.flush()
|
||||
with open(temp_file.name, "rb") as file:
|
||||
elements = partition_via_api(
|
||||
file=file,
|
||||
metadata_filename=temp_file.name,
|
||||
api_url=unstructured_api_config.api_url,
|
||||
api_key=api_key,
|
||||
)
|
||||
os.unlink(temp_file.name)
|
||||
else:
|
||||
with io.BytesIO(file_content) as file:
|
||||
elements = partition_ppt(file=file)
|
||||
return "\n".join([getattr(element, "text", "") for element in elements])
|
||||
|
||||
except Exception as e:
|
||||
raise TextExtractionError(f"Failed to extract text from PPTX: {str(e)}") from e
|
||||
|
||||
|
||||
def _extract_text_from_pptx(file_content: bytes, *, unstructured_api_config: UnstructuredApiConfig) -> str:
|
||||
from unstructured.partition.api import partition_via_api
|
||||
from unstructured.partition.pptx import partition_pptx
|
||||
|
||||
api_key = unstructured_api_config.api_key or ""
|
||||
|
||||
try:
|
||||
if unstructured_api_config.api_url:
|
||||
with tempfile.NamedTemporaryFile(suffix=".pptx", delete=False) as temp_file:
|
||||
temp_file.write(file_content)
|
||||
temp_file.flush()
|
||||
with open(temp_file.name, "rb") as file:
|
||||
elements = partition_via_api(
|
||||
file=file,
|
||||
metadata_filename=temp_file.name,
|
||||
api_url=unstructured_api_config.api_url,
|
||||
api_key=api_key,
|
||||
)
|
||||
os.unlink(temp_file.name)
|
||||
else:
|
||||
with io.BytesIO(file_content) as file:
|
||||
elements = partition_pptx(file=file)
|
||||
return "\n".join([getattr(element, "text", "") for element in elements])
|
||||
except Exception as e:
|
||||
raise TextExtractionError(f"Failed to extract text from PPTX: {str(e)}") from e
|
||||
|
||||
|
||||
def _extract_text_from_epub(file_content: bytes, *, unstructured_api_config: UnstructuredApiConfig) -> str:
|
||||
from unstructured.partition.api import partition_via_api
|
||||
from unstructured.partition.epub import partition_epub
|
||||
|
||||
api_key = unstructured_api_config.api_key or ""
|
||||
|
||||
try:
|
||||
if unstructured_api_config.api_url:
|
||||
with tempfile.NamedTemporaryFile(suffix=".epub", delete=False) as temp_file:
|
||||
temp_file.write(file_content)
|
||||
temp_file.flush()
|
||||
with open(temp_file.name, "rb") as file:
|
||||
elements = partition_via_api(
|
||||
file=file,
|
||||
metadata_filename=temp_file.name,
|
||||
api_url=unstructured_api_config.api_url,
|
||||
api_key=api_key,
|
||||
)
|
||||
os.unlink(temp_file.name)
|
||||
else:
|
||||
pypandoc.download_pandoc()
|
||||
with io.BytesIO(file_content) as file:
|
||||
elements = partition_epub(file=file)
|
||||
return "\n".join([str(element) for element in elements])
|
||||
except Exception as e:
|
||||
raise TextExtractionError(f"Failed to extract text from EPUB: {str(e)}") from e
|
||||
|
||||
|
||||
def _extract_text_from_eml(file_content: bytes) -> str:
|
||||
from unstructured.partition.email import partition_email
|
||||
|
||||
try:
|
||||
with io.BytesIO(file_content) as file:
|
||||
elements = partition_email(file=file)
|
||||
return "\n".join([str(element) for element in elements])
|
||||
except Exception as e:
|
||||
raise TextExtractionError(f"Failed to extract text from EML: {str(e)}") from e
|
||||
|
||||
|
||||
def _extract_text_from_msg(file_content: bytes) -> str:
|
||||
from unstructured.partition.msg import partition_msg
|
||||
|
||||
try:
|
||||
with io.BytesIO(file_content) as file:
|
||||
elements = partition_msg(file=file)
|
||||
return "\n".join([str(element) for element in elements])
|
||||
except Exception as e:
|
||||
raise TextExtractionError(f"Failed to extract text from MSG: {str(e)}") from e
|
||||
|
||||
|
||||
def _extract_text_from_vtt(vtt_bytes: bytes) -> str:
|
||||
text = _extract_text_from_plain_text(vtt_bytes)
|
||||
|
||||
# remove bom
|
||||
text = text.lstrip("\ufeff")
|
||||
|
||||
raw_results = []
|
||||
for caption in webvtt.from_string(text):
|
||||
raw_results.append((caption.voice, caption.text))
|
||||
|
||||
# Merge consecutive utterances by the same speaker
|
||||
merged_results = []
|
||||
if raw_results:
|
||||
current_speaker, current_text = raw_results[0]
|
||||
|
||||
for i in range(1, len(raw_results)):
|
||||
spk, txt = raw_results[i]
|
||||
if spk is None:
|
||||
merged_results.append((None, current_text))
|
||||
continue
|
||||
|
||||
if spk == current_speaker:
|
||||
# If it is the same speaker, merge the utterances (joined by space)
|
||||
current_text += " " + txt
|
||||
else:
|
||||
# If the speaker changes, register the utterance so far and move on
|
||||
merged_results.append((current_speaker, current_text))
|
||||
current_speaker, current_text = spk, txt
|
||||
|
||||
# Add the last element
|
||||
merged_results.append((current_speaker, current_text))
|
||||
else:
|
||||
merged_results = raw_results
|
||||
|
||||
# Return the result in the specified format: Speaker "text" style
|
||||
formatted = [f'{spk or ""} "{txt}"' for spk, txt in merged_results]
|
||||
return "\n".join(formatted)
|
||||
|
||||
|
||||
def _extract_text_from_properties(file_content: bytes) -> str:
|
||||
try:
|
||||
text = _extract_text_from_plain_text(file_content)
|
||||
lines = text.splitlines()
|
||||
result = []
|
||||
for line in lines:
|
||||
line = line.strip()
|
||||
# Preserve comments and empty lines
|
||||
if not line or line.startswith("#") or line.startswith("!"):
|
||||
result.append(line)
|
||||
continue
|
||||
|
||||
if "=" in line:
|
||||
key, value = line.split("=", 1)
|
||||
elif ":" in line:
|
||||
key, value = line.split(":", 1)
|
||||
else:
|
||||
key, value = line, ""
|
||||
|
||||
result.append(f"{key.strip()}: {value.strip()}")
|
||||
|
||||
return "\n".join(result)
|
||||
except Exception as e:
|
||||
raise TextExtractionError(f"Failed to extract text from properties file: {str(e)}") from e
|
||||
Reference in New Issue
Block a user