external knowledge api

This commit is contained in:
jyong
2024-09-18 14:36:51 +08:00
parent 89e81873c4
commit 9f894bb3b3
12 changed files with 330 additions and 274 deletions

View File

@ -32,6 +32,7 @@ from models.dataset import (
DatasetQuery,
Document,
DocumentSegment,
ExternalKnowledgeBindings,
)
from models.model import UploadFile
from models.source import DataSourceOauthBinding
@ -39,6 +40,7 @@ from services.errors.account import NoPermissionError
from services.errors.dataset import DatasetNameDuplicateError
from services.errors.document import DocumentIndexingError
from services.errors.file import FileNotExistsError
from services.external_knowledge_service import ExternalDatasetService
from services.feature_service import FeatureModel, FeatureService
from services.tag_service import TagService
from services.vector_service import VectorService
@ -137,7 +139,14 @@ class DatasetService:
@staticmethod
def create_empty_dataset(
tenant_id: str, name: str, indexing_technique: Optional[str], account: Account, permission: Optional[str] = None
tenant_id: str,
name: str,
indexing_technique: Optional[str],
account: Account,
permission: Optional[str] = None,
provider: str = "vendor",
external_api_template_id: Optional[str] = None,
external_knowledge_id: Optional[str] = None,
):
# check if dataset name already exists
if Dataset.query.filter_by(name=name, tenant_id=tenant_id).first():
@ -156,7 +165,23 @@ class DatasetService:
dataset.embedding_model_provider = embedding_model.provider if embedding_model else None
dataset.embedding_model = embedding_model.model if embedding_model else None
dataset.permission = permission if permission else DatasetPermissionEnum.ONLY_ME
dataset.provider = provider
db.session.add(dataset)
db.session.flush()
if provider == "external" and external_api_template_id:
external_api_template = ExternalDatasetService.get_api_template(external_api_template_id)
if not external_api_template:
raise ValueError("External API template not found.")
external_knowledge_binding = ExternalKnowledgeBindings(
tenant_id=tenant_id,
dataset_id=dataset.id,
external_api_template_id=external_api_template_id,
external_knowledge_id=external_knowledge_id,
created_by=account.id,
)
db.session.add(external_knowledge_binding)
db.session.commit()
return dataset

View File

@ -1,16 +1,16 @@
from typing import Literal, Union, Optional
from typing import Literal, Optional, Union
from pydantic import BaseModel
class AuthorizationConfig(BaseModel):
type: Literal[None, 'basic', 'bearer', 'custom']
type: Literal[None, "basic", "bearer", "custom"]
api_key: Union[None, str] = None
header: Union[None, str] = None
class Authorization(BaseModel):
type: Literal['no-auth', 'api-key']
type: Literal["no-auth", "api-key"]
config: Optional[AuthorizationConfig] = None

View File

@ -3,59 +3,45 @@ import random
import time
from copy import deepcopy
from datetime import datetime, timezone
from typing import Any, Union, Optional
from typing import Any, Optional, Union
import httpx
from core.helper import ssrf_proxy
from extensions.ext_database import db
from extensions.ext_redis import redis_client
from libs import helper
from models.account import Account, TenantAccountRole
from models.dataset import (
AppDatasetJoin,
Dataset,
DatasetCollectionBinding,
DatasetPermission,
DatasetProcessRule,
DatasetQuery,
Document,
DocumentSegment, ExternalApiTemplates, ExternalKnowledgeBindings,
ExternalApiTemplates,
ExternalKnowledgeBindings,
)
from models.model import UploadFile
from services.dataset_service import DocumentService
from services.entities.external_knowledge_entities.external_knowledge_entities import Authorization, ApiTemplateSetting
from services.entities.external_knowledge_entities.external_knowledge_entities import ApiTemplateSetting, Authorization
from services.errors.dataset import DatasetNameDuplicateError
from tasks.external_document_indexing_task import external_document_indexing_task
# from tasks.external_document_indexing_task import external_document_indexing_task
class ExternalDatasetService:
@staticmethod
def get_external_api_templates(page, per_page, tenant_id, search=None) -> tuple[list[ExternalApiTemplates], int]:
query = ExternalApiTemplates.query.filter(ExternalApiTemplates.tenant_id == tenant_id).order_by(
ExternalApiTemplates.created_at.desc()
)
if search:
query = query.filter(ExternalApiTemplates.name.ilike(f'%{search}%'))
query = query.filter(ExternalApiTemplates.name.ilike(f"%{search}%"))
api_templates = query.paginate(
page=page,
per_page=per_page,
max_per_page=100,
error_out=False
)
api_templates = query.paginate(page=page, per_page=per_page, max_per_page=100, error_out=False)
return api_templates.items, api_templates.total
@classmethod
def validate_api_list(cls, api_settings: dict):
if not api_settings:
raise ValueError('api list is empty')
if 'endpoint' not in api_settings and not api_settings['endpoint']:
raise ValueError('endpoint is required')
if 'api_key' not in api_settings and not api_settings['api_key']:
raise ValueError('api_key is required')
raise ValueError("api list is empty")
if "endpoint" not in api_settings and not api_settings["endpoint"]:
raise ValueError("endpoint is required")
if "api_key" not in api_settings and not api_settings["api_key"]:
raise ValueError("api_key is required")
@staticmethod
def create_api_template(tenant_id: str, user_id: str, args: dict) -> ExternalApiTemplates:
@ -63,9 +49,9 @@ class ExternalDatasetService:
tenant_id=tenant_id,
created_by=user_id,
updated_by=user_id,
name=args.get('name'),
description=args.get('description', ''),
settings=json.dumps(args.get('settings'), ensure_ascii=False),
name=args.get("name"),
description=args.get("description", ""),
settings=json.dumps(args.get("settings"), ensure_ascii=False),
)
db.session.add(api_template)
@ -74,22 +60,17 @@ class ExternalDatasetService:
@staticmethod
def get_api_template(api_template_id: str) -> ExternalApiTemplates:
return ExternalApiTemplates.query.filter_by(
id=api_template_id
).first()
return ExternalApiTemplates.query.filter_by(id=api_template_id).first()
@staticmethod
def update_api_template(tenant_id, user_id, api_template_id, args) -> ExternalApiTemplates:
api_template = ExternalApiTemplates.query.filter_by(
id=api_template_id,
tenant_id=tenant_id
).first()
api_template = ExternalApiTemplates.query.filter_by(id=api_template_id, tenant_id=tenant_id).first()
if api_template is None:
raise ValueError('api template not found')
raise ValueError("api template not found")
api_template.name = args.get('name')
api_template.description = args.get('description', '')
api_template.settings = json.dumps(args.get('settings'), ensure_ascii=False)
api_template.name = args.get("name")
api_template.description = args.get("description", "")
api_template.settings = json.dumps(args.get("settings"), ensure_ascii=False)
api_template.updated_by = user_id
api_template.updated_at = datetime.now(timezone.utc).replace(tzinfo=None)
db.session.commit()
@ -98,21 +79,16 @@ class ExternalDatasetService:
@staticmethod
def delete_api_template(tenant_id: str, api_template_id: str):
api_template = ExternalApiTemplates.query.filter_by(
id=api_template_id,
tenant_id=tenant_id
).first()
api_template = ExternalApiTemplates.query.filter_by(id=api_template_id, tenant_id=tenant_id).first()
if api_template is None:
raise ValueError('api template not found')
raise ValueError("api template not found")
db.session.delete(api_template)
db.session.commit()
@staticmethod
def external_api_template_use_check(api_template_id: str) -> bool:
count = ExternalKnowledgeBindings.query.filter_by(
external_api_template_id=api_template_id
).count()
count = ExternalKnowledgeBindings.query.filter_by(external_api_template_id=api_template_id).count()
if count > 0:
return True
return False
@ -120,66 +96,63 @@ class ExternalDatasetService:
@staticmethod
def get_external_knowledge_binding_with_dataset_id(tenant_id: str, dataset_id: str) -> ExternalKnowledgeBindings:
external_knowledge_binding = ExternalKnowledgeBindings.query.filter_by(
dataset_id=dataset_id,
tenant_id=tenant_id
dataset_id=dataset_id, tenant_id=tenant_id
).first()
if not external_knowledge_binding:
raise ValueError('external knowledge binding not found')
raise ValueError("external knowledge binding not found")
return external_knowledge_binding
@staticmethod
def document_create_args_validate(tenant_id: str, api_template_id: str, process_parameter: dict):
api_template = ExternalApiTemplates.query.filter_by(
id=api_template_id,
tenant_id=tenant_id
).first()
api_template = ExternalApiTemplates.query.filter_by(id=api_template_id, tenant_id=tenant_id).first()
if api_template is None:
raise ValueError('api template not found')
raise ValueError("api template not found")
settings = json.loads(api_template.settings)
for settings in settings:
custom_parameters = settings.get('document_process_setting')
custom_parameters = settings.get("document_process_setting")
if custom_parameters:
for parameter in custom_parameters:
if parameter.get('required', False) and not process_parameter.get(parameter.get('name')):
if parameter.get("required", False) and not process_parameter.get(parameter.get("name")):
raise ValueError(f'{parameter.get("name")} is required')
@staticmethod
def init_external_dataset(tenant_id: str, user_id: str, args: dict, created_from: str = 'web'):
api_template_id = args.get('api_template_id')
def init_external_dataset(tenant_id: str, user_id: str, args: dict, created_from: str = "web"):
api_template_id = args.get("api_template_id")
data_source = args.get('data_source')
data_source = args.get("data_source")
if data_source is None:
raise ValueError('data source is required')
raise ValueError("data source is required")
process_parameter = args.get('process_parameter')
api_template = ExternalApiTemplates.query.filter_by(
id=api_template_id,
tenant_id=tenant_id
).first()
process_parameter = args.get("process_parameter")
api_template = ExternalApiTemplates.query.filter_by(id=api_template_id, tenant_id=tenant_id).first()
if api_template is None:
raise ValueError('api template not found')
raise ValueError("api template not found")
dataset = Dataset(
tenant_id=tenant_id,
name=args.get('name'),
description=args.get('description', ''),
provider='external',
name=args.get("name"),
description=args.get("description", ""),
provider="external",
created_by=user_id,
)
db.session.add(dataset)
db.session.flush()
position = DocumentService.get_documents_position(dataset.id)
batch = time.strftime('%Y%m%d%H%M%S') + str(random.randint(100000, 999999))
document = Document.query.filter_by(dataset_id=dataset.id).order_by(Document.position.desc()).first()
position = document.position + 1 if document else 1
batch = time.strftime("%Y%m%d%H%M%S") + str(random.randint(100000, 999999))
document_ids = []
if data_source["type"] == "upload_file":
upload_file_list = data_source["info_list"]['file_info_list']['file_ids']
upload_file_list = data_source["info_list"]["file_info_list"]["file_ids"]
for file_id in upload_file_list:
file = db.session.query(UploadFile).filter(
UploadFile.tenant_id == dataset.tenant_id,
UploadFile.id == file_id
).first()
file = (
db.session.query(UploadFile)
.filter(UploadFile.tenant_id == dataset.tenant_id, UploadFile.id == file_id)
.first()
)
if file:
data_source_info = {
"upload_file_id": file_id,
@ -200,21 +173,20 @@ class ExternalDatasetService:
db.session.flush()
document_ids.append(document.id)
db.session.commit()
external_document_indexing_task.delay(dataset.id, api_template_id, data_source, process_parameter)
#external_document_indexing_task.delay(dataset.id, api_template_id, data_source, process_parameter)
return dataset
@staticmethod
def process_external_api(settings: ApiTemplateSetting,
files: Union[None, dict[str, Any]]) -> httpx.Response:
def process_external_api(settings: ApiTemplateSetting, files: Union[None, dict[str, Any]]) -> httpx.Response:
"""
do http request depending on api bundle
"""
kwargs = {
'url': settings.url,
'headers': settings.headers,
'follow_redirects': True,
"url": settings.url,
"headers": settings.headers,
"follow_redirects": True,
}
response = getattr(ssrf_proxy, settings.request_method)(data=settings.params, files=files, **kwargs)
@ -228,21 +200,21 @@ class ExternalDatasetService:
headers = deepcopy(headers)
else:
headers = {}
if authorization.type == 'api-key':
if authorization.type == "api-key":
if authorization.config is None:
raise ValueError('authorization config is required')
raise ValueError("authorization config is required")
if authorization.config.api_key is None:
raise ValueError('api_key is required')
raise ValueError("api_key is required")
if not authorization.config.header:
authorization.config.header = 'Authorization'
authorization.config.header = "Authorization"
if authorization.config.type == 'bearer':
headers[authorization.config.header] = f'Bearer {authorization.config.api_key}'
elif authorization.config.type == 'basic':
headers[authorization.config.header] = f'Basic {authorization.config.api_key}'
elif authorization.config.type == 'custom':
if authorization.config.type == "bearer":
headers[authorization.config.header] = f"Bearer {authorization.config.api_key}"
elif authorization.config.type == "basic":
headers[authorization.config.header] = f"Basic {authorization.config.api_key}"
elif authorization.config.type == "custom":
headers[authorization.config.header] = authorization.config.api_key
return headers
@ -254,21 +226,20 @@ class ExternalDatasetService:
@staticmethod
def create_external_dataset(tenant_id: str, user_id: str, args: dict) -> Dataset:
# check if dataset name already exists
if Dataset.query.filter_by(name=args.get('name'), tenant_id=tenant_id).first():
if Dataset.query.filter_by(name=args.get("name"), tenant_id=tenant_id).first():
raise DatasetNameDuplicateError(f"Dataset with name {args.get('name')} already exists.")
api_template = ExternalApiTemplates.query.filter_by(
id=args.get('api_template_id'),
tenant_id=tenant_id
id=args.get("external_api_template_id"), tenant_id=tenant_id
).first()
if api_template is None:
raise ValueError('api template not found')
raise ValueError("api template not found")
dataset = Dataset(
tenant_id=tenant_id,
name=args.get('name'),
description=args.get('description', ''),
provider='external',
name=args.get("name"),
description=args.get("description", ""),
provider="external",
created_by=user_id,
)
@ -278,8 +249,8 @@ class ExternalDatasetService:
external_knowledge_binding = ExternalKnowledgeBindings(
tenant_id=tenant_id,
dataset_id=dataset.id,
external_api_template_id=args.get('api_template_id'),
external_knowledge_id=args.get('external_knowledge_id'),
external_api_template_id=args.get("external_api_template_id"),
external_knowledge_id=args.get("external_knowledge_id"),
created_by=user_id,
)
db.session.add(external_knowledge_binding)
@ -289,36 +260,32 @@ class ExternalDatasetService:
return dataset
@staticmethod
def fetch_external_knowledge_retrival(tenant_id: str,
dataset_id: str,
query: str,
external_retrival_parameters: dict):
def fetch_external_knowledge_retrival(
tenant_id: str, dataset_id: str, query: str, external_retrival_parameters: dict
):
external_knowledge_binding = ExternalKnowledgeBindings.query.filter_by(
dataset_id=dataset_id,
tenant_id=tenant_id
dataset_id=dataset_id, tenant_id=tenant_id
).first()
if not external_knowledge_binding:
raise ValueError('external knowledge binding not found')
raise ValueError("external knowledge binding not found")
external_api_template = ExternalApiTemplates.query.filter_by(
id=external_knowledge_binding.external_api_template_id
).first()
if not external_api_template:
raise ValueError('external api template not found')
raise ValueError("external api template not found")
settings = json.loads(external_api_template.settings)
headers = {}
if settings.get('api_token'):
headers['Authorization'] = f"Bearer {settings.get('api_token')}"
if settings.get("api_token"):
headers["Authorization"] = f"Bearer {settings.get('api_token')}"
external_retrival_parameters['query'] = query
external_retrival_parameters["query"] = query
api_template_setting = {
'url': f"{settings.get('endpoint')}/dify/external-knowledge/retrival-documents",
'request_method': 'post',
'headers': settings.get('headers'),
'params': external_retrival_parameters
"url": f"{settings.get('endpoint')}/dify/external-knowledge/retrival-documents",
"request_method": "post",
"headers": settings.get("headers"),
"params": external_retrival_parameters,
}
response = ExternalDatasetService.process_external_api(
ApiTemplateSetting(**api_template_setting), None
)
response = ExternalDatasetService.process_external_api(ApiTemplateSetting(**api_template_setting), None)

View File

@ -19,8 +19,15 @@ default_retrieval_model = {
class HitTestingService:
@classmethod
def retrieve(cls, dataset: Dataset, query: str, account: Account,
retrieval_model: dict, external_retrieval_model: dict, limit: int = 10) -> dict:
def retrieve(
cls,
dataset: Dataset,
query: str,
account: Account,
retrieval_model: dict,
external_retrieval_model: dict,
limit: int = 10,
) -> dict:
if dataset.available_document_count == 0 or dataset.available_segment_count == 0:
return {
"query": {