Merge branch 'main' into feat/rag-2

# Conflicts:
#	api/core/app/entities/queue_entities.py
#	api/core/workflow/graph_engine/entities/event.py
This commit is contained in:
jyong
2025-08-29 11:29:51 +08:00
106 changed files with 5142 additions and 2399 deletions

View File

@ -169,7 +169,7 @@ class QueueLoopNextEvent(AppQueueEvent):
parent_parallel_start_node_id: Optional[str] = None
"""parent parallel start node id if node is in parallel"""
parallel_mode_run_id: Optional[str] = None
"""iteratoin run in parallel mode run id"""
"""iteration run in parallel mode run id"""
node_run_index: int
output: Optional[Any] = None # output for the current loop

View File

@ -1,8 +1,8 @@
class TaskPipilineError(ValueError):
class TaskPipelineError(ValueError):
pass
class RecordNotFoundError(TaskPipilineError):
class RecordNotFoundError(TaskPipelineError):
def __init__(self, record_name: str, record_id: str):
super().__init__(f"{record_name} with id {record_id} not found")

View File

@ -43,7 +43,7 @@ class GPT2Tokenizer:
except Exception:
from os.path import abspath, dirname, join
from transformers import GPT2Tokenizer as TransformerGPT2Tokenizer # type: ignore
from transformers import GPT2Tokenizer as TransformerGPT2Tokenizer
base_path = abspath(__file__)
gpt2_tokenizer_path = join(dirname(base_path), "gpt2")

View File

@ -332,7 +332,7 @@ class OpsTraceManager:
except KeyError:
raise ValueError(f"Invalid tracing provider: {tracing_provider}")
else:
if tracing_provider is not None:
if tracing_provider is None:
raise ValueError(f"Invalid tracing provider: {tracing_provider}")
app_config: Optional[App] = db.session.query(App).where(App.id == app_id).first()

View File

@ -375,16 +375,16 @@ Here is the extra instruction you need to follow:
# merge lines into messages with max tokens
messages: list[str] = []
for i in new_lines: # type: ignore
for line in new_lines:
if len(messages) == 0:
messages.append(i) # type: ignore
messages.append(line)
else:
if len(messages[-1]) + len(i) < max_tokens * 0.5: # type: ignore
messages[-1] += i # type: ignore
if get_prompt_tokens(messages[-1] + i) > max_tokens * 0.7: # type: ignore
messages.append(i) # type: ignore
if len(messages[-1]) + len(line) < max_tokens * 0.5:
messages[-1] += line
if get_prompt_tokens(messages[-1] + line) > max_tokens * 0.7:
messages.append(line)
else:
messages[-1] += i # type: ignore
messages[-1] += line
summaries = []
for i in range(len(messages)):

View File

@ -3,8 +3,8 @@ import uuid
from contextlib import contextmanager
from typing import Any
import psycopg2.extras # type: ignore
import psycopg2.pool # type: ignore
import psycopg2.extras
import psycopg2.pool
from pydantic import BaseModel, model_validator
from core.rag.models.document import Document

View File

@ -3,8 +3,8 @@ import uuid
from contextlib import contextmanager
from typing import Any
import psycopg2.extras # type: ignore
import psycopg2.pool # type: ignore
import psycopg2.extras
import psycopg2.pool
from pydantic import BaseModel, model_validator
from configs import dify_config

View File

@ -48,7 +48,7 @@ class OpenSearchConfig(BaseModel):
return values
def create_aws_managed_iam_auth(self) -> Urllib3AWSV4SignerAuth:
import boto3 # type: ignore
import boto3
return Urllib3AWSV4SignerAuth(
credentials=boto3.Session().get_credentials(),

View File

@ -6,8 +6,8 @@ from contextlib import contextmanager
from typing import Any
import psycopg2.errors
import psycopg2.extras # type: ignore
import psycopg2.pool # type: ignore
import psycopg2.extras
import psycopg2.pool
from pydantic import BaseModel, model_validator
from configs import dify_config

View File

@ -3,8 +3,8 @@ import uuid
from contextlib import contextmanager
from typing import Any
import psycopg2.extras # type: ignore
import psycopg2.pool # type: ignore
import psycopg2.extras
import psycopg2.pool
from pydantic import BaseModel, model_validator
from configs import dify_config

View File

@ -3,7 +3,7 @@ import os
import uuid
from collections.abc import Generator, Iterable, Sequence
from itertools import islice
from typing import TYPE_CHECKING, Any, Optional, Union, cast
from typing import TYPE_CHECKING, Any, Optional, Union
import qdrant_client
import requests
@ -398,7 +398,6 @@ class TidbOnQdrantVector(BaseVector):
def _reload_if_needed(self):
if isinstance(self._client, QdrantLocal):
self._client = cast(QdrantLocal, self._client)
self._client._load()
@classmethod

View File

@ -4,7 +4,7 @@ import os
from typing import Optional, cast
import pandas as pd
from openpyxl import load_workbook # type: ignore
from openpyxl import load_workbook
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document

View File

@ -73,8 +73,8 @@ class ExtractProcessor:
suffix = "." + match.group(1)
else:
suffix = ""
# FIXME mypy: Cannot determine type of 'tempfile._get_candidate_names' better not use it here
file_path = f"{temp_dir}/{next(tempfile._get_candidate_names())}{suffix}" # type: ignore
# https://stackoverflow.com/questions/26541416/generate-temporary-file-names-without-creating-actual-file-in-python#comment90414256_26541521
file_path = f"{temp_dir}/{tempfile.gettempdir()}{suffix}"
Path(file_path).write_bytes(response.content)
extract_setting = ExtractSetting(datasource_type="upload_file", document_model="text_model")
if return_text:

View File

@ -1,6 +1,6 @@
"""Abstract interface for document loader implementations."""
from bs4 import BeautifulSoup # type: ignore
from bs4 import BeautifulSoup
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document

View File

@ -3,7 +3,7 @@ import contextlib
import logging
from typing import Optional
from bs4 import BeautifulSoup # type: ignore
from bs4 import BeautifulSoup
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document

View File

@ -144,7 +144,7 @@ class TextSplitter(BaseDocumentTransformer, ABC):
def from_huggingface_tokenizer(cls, tokenizer: Any, **kwargs: Any) -> TextSplitter:
"""Text splitter that uses HuggingFace tokenizer to count length."""
try:
from transformers import PreTrainedTokenizerBase # type: ignore
from transformers import PreTrainedTokenizerBase
if not isinstance(tokenizer, PreTrainedTokenizerBase):
raise ValueError("Tokenizer received was not an instance of PreTrainedTokenizerBase")

View File

@ -6,7 +6,7 @@ from typing import Optional
from flask import request
from requests import get
from yaml import YAMLError, safe_load # type: ignore
from yaml import YAMLError, safe_load
from core.tools.entities.common_entities import I18nObject
from core.tools.entities.tool_bundle import ApiToolBundle

View File

@ -514,14 +514,14 @@ def _extract_text_from_excel(file_content: bytes) -> str:
df.dropna(how="all", inplace=True)
# Combine multi-line text in each cell into a single line
df = df.applymap(lambda x: " ".join(str(x).splitlines()) if isinstance(x, str) else x) # type: ignore
df = df.map(lambda x: " ".join(str(x).splitlines()) if isinstance(x, str) else x)
# Combine multi-line text in column names into a single line
df.columns = pd.Index([" ".join(str(col).splitlines()) for col in df.columns])
# Manually construct the Markdown table
markdown_table += _construct_markdown_table(df) + "\n\n"
except Exception as e:
except Exception:
continue
return markdown_table
except Exception as e: