Fix/upload limit (#2521)

Co-authored-by: jyong <jyong@dify.ai> Co-authored-by: StyleZhang <jasonapring2015@outlook.com>
2026-05-03 00:48:04 +08:00 · 2024-02-22 17:16:22 +08:00
parent 52b12ed7eb
commit 97fe817186
12 changed files with 97 additions and 14 deletions
--- a/api/.env.example
+++ b/api/.env.example
@ -130,3 +130,5 @@ UNSTRUCTURED_API_URL=

 SSRF_PROXY_HTTP_URL=
 SSRF_PROXY_HTTPS_URL=
+
+BATCH_UPLOAD_LIMIT=10
--- a/api/config.py
+++ b/api/config.py
@ -56,6 +56,7 @@ DEFAULTS = {
    'BILLING_ENABLED': 'False',
    'CAN_REPLACE_LOGO': 'False',
    'ETL_TYPE': 'dify',
+    'BATCH_UPLOAD_LIMIT': 20
 }


@ -285,6 +286,8 @@ class Config:
        self.BILLING_ENABLED = get_bool_env('BILLING_ENABLED')
        self.CAN_REPLACE_LOGO = get_bool_env('CAN_REPLACE_LOGO')

+        self.BATCH_UPLOAD_LIMIT = get_env('BATCH_UPLOAD_LIMIT')
+

 class CloudEditionConfig(Config):

--- a/api/core/indexing_runner.py
+++ b/api/core/indexing_runner.py
@ -32,6 +32,7 @@ from models.dataset import Dataset, DatasetProcessRule, DocumentSegment
 from models.dataset import Document as DatasetDocument
 from models.model import UploadFile
 from models.source import DataSourceBinding
+from services.feature_service import FeatureService


 class IndexingRunner:
@ -244,6 +245,14 @@ class IndexingRunner:
        """
        Estimate the indexing for the document.
        """
+        # check batch upload limit (enforced only when billing is enabled)
+        features = FeatureService.get_features(tenant_id)
+        if features.billing.enabled:
+            count = len(file_details)
+            batch_upload_limit = int(current_app.config['BATCH_UPLOAD_LIMIT'])
+            if count > batch_upload_limit:
+                raise ValueError(f"You have reached the batch upload limit of {batch_upload_limit}.")
+
        embedding_model_instance = None
        if dataset_id:
            dataset = Dataset.query.filter_by(
@ -361,6 +370,14 @@ class IndexingRunner:
        """
        Estimate the indexing for the document.
        """
+        # check batch upload limit (enforced only when billing is enabled)
+        features = FeatureService.get_features(tenant_id)
+        if features.billing.enabled:
+            count = len(notion_info_list)
+            batch_upload_limit = int(current_app.config['BATCH_UPLOAD_LIMIT'])
+            if count > batch_upload_limit:
+                raise ValueError(f"You have reached the batch upload limit of {batch_upload_limit}.")
+
        embedding_model_instance = None
        if dataset_id:
            dataset = Dataset.query.filter_by(
--- a/api/services/annotation_service.py
+++ b/api/services/annotation_service.py
@ -10,6 +10,7 @@ from werkzeug.exceptions import NotFound
 from extensions.ext_database import db
 from extensions.ext_redis import redis_client
 from models.model import App, AppAnnotationHitHistory, AppAnnotationSetting, Message, MessageAnnotation
+from services.feature_service import FeatureService
 from tasks.annotation.add_annotation_to_index_task import add_annotation_to_index_task
 from tasks.annotation.batch_import_annotations_task import batch_import_annotations_task
 from tasks.annotation.delete_annotation_index_task import delete_annotation_index_task
@ -284,6 +285,12 @@ class AppAnnotationService:
                result.append(content)
            if len(result) == 0:
                raise ValueError("The CSV file is empty.")
+            # check annotation limit
+            features = FeatureService.get_features(current_user.current_tenant_id)
+            if features.billing.enabled:
+                annotation_quota_limit = features.annotation_quota_limit
+                if annotation_quota_limit.limit < len(result) + annotation_quota_limit.size:
+                    raise ValueError("The number of annotations exceeds the limit of your subscription.")
            # async job
            job_id = str(uuid.uuid4())
            indexing_cache_key = 'app_annotation_batch_import_{}'.format(str(job_id))
--- a/api/services/dataset_service.py
+++ b/api/services/dataset_service.py
@ -36,6 +36,7 @@ from services.errors.account import NoPermissionError
 from services.errors.dataset import DatasetNameDuplicateError
 from services.errors.document import DocumentIndexingError
 from services.errors.file import FileNotExistsError
+from services.feature_service import FeatureService
 from services.vector_service import VectorService
 from tasks.clean_notion_document_task import clean_notion_document_task
 from tasks.deal_dataset_vector_index_task import deal_dataset_vector_index_task
@ -452,7 +453,9 @@ class DocumentService:
                                      created_from: str = 'web'):

        # check document limit
-        if current_app.config['EDITION'] == 'CLOUD':
+        features = FeatureService.get_features(current_user.current_tenant_id)
+
+        if features.billing.enabled:
            if 'original_document_id' not in document_data or not document_data['original_document_id']:
                count = 0
                if document_data["data_source"]["type"] == "upload_file":
@ -462,6 +465,9 @@ class DocumentService:
                    notion_info_list = document_data["data_source"]['info_list']['notion_info_list']
                    for notion_info in notion_info_list:
                        count = count + len(notion_info['pages'])
+                batch_upload_limit = int(current_app.config['BATCH_UPLOAD_LIMIT'])
+                if count > batch_upload_limit:
+                    raise ValueError(f"You have reached the batch upload limit of {batch_upload_limit}.")
        # if dataset is empty, update dataset data_source_type
        if not dataset.data_source_type:
            dataset.data_source_type = document_data["data_source"]["type"]
@ -741,14 +747,20 @@ class DocumentService:

    @staticmethod
    def save_document_without_dataset_id(tenant_id: str, document_data: dict, account: Account):
-        count = 0
-        if document_data["data_source"]["type"] == "upload_file":
-            upload_file_list = document_data["data_source"]["info_list"]['file_info_list']['file_ids']
-            count = len(upload_file_list)
-        elif document_data["data_source"]["type"] == "notion_import":
-            notion_info_list = document_data["data_source"]['info_list']['notion_info_list']
-            for notion_info in notion_info_list:
-                count = count + len(notion_info['pages'])
+        features = FeatureService.get_features(current_user.current_tenant_id)
+
+        if features.billing.enabled:
+            count = 0
+            if document_data["data_source"]["type"] == "upload_file":
+                upload_file_list = document_data["data_source"]["info_list"]['file_info_list']['file_ids']
+                count = len(upload_file_list)
+            elif document_data["data_source"]["type"] == "notion_import":
+                notion_info_list = document_data["data_source"]['info_list']['notion_info_list']
+                for notion_info in notion_info_list:
+                    count = count + len(notion_info['pages'])
+            batch_upload_limit = int(current_app.config['BATCH_UPLOAD_LIMIT'])
+            if count > batch_upload_limit:
+                raise ValueError(f"You have reached the batch upload limit of {batch_upload_limit}.")

        embedding_model = None
        dataset_collection_binding_id = None
@ -1139,7 +1151,7 @@ class SegmentService:
                    segment.answer = args['answer']
                if 'keywords' in args and args['keywords']:
                    segment.keywords = args['keywords']
-                if'enabled' in args and args['enabled'] is not None:
+                if 'enabled' in args and args['enabled'] is not None:
                    segment.enabled = args['enabled']
                db.session.add(segment)
                db.session.commit()
--- a/api/services/file_service.py
+++ b/api/services/file_service.py
@ -20,9 +20,9 @@ from services.errors.file import FileTooLargeError, UnsupportedFileTypeError
 IMAGE_EXTENSIONS = ['jpg', 'jpeg', 'png', 'webp', 'gif', 'svg']
 IMAGE_EXTENSIONS.extend([ext.upper() for ext in IMAGE_EXTENSIONS])

-ALLOWED_EXTENSIONS = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx', 'docx', 'csv'] + IMAGE_EXTENSIONS
+ALLOWED_EXTENSIONS = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx', 'docx', 'csv']
 UNSTRUSTURED_ALLOWED_EXTENSIONS = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx',
-                                   'docx', 'csv', 'eml', 'msg', 'pptx', 'ppt', 'xml'] + IMAGE_EXTENSIONS
+                                   'docx', 'csv', 'eml', 'msg', 'pptx', 'ppt', 'xml']
 PREVIEW_WORDS_LIMIT = 3000


--- a/api/tasks/document_indexing_task.py
+++ b/api/tasks/document_indexing_task.py
@ -4,10 +4,12 @@ import time

 import click
 from celery import shared_task
+from flask import current_app

 from core.indexing_runner import DocumentIsPausedException, IndexingRunner
 from extensions.ext_database import db
-from models.dataset import Document
+from models.dataset import Dataset, Document
+from services.feature_service import FeatureService


@shared_task(queue='dataset')
@ -21,6 +23,35 @@ def document_indexing_task(dataset_id: str, document_ids: list):
    """
    documents = []
    start_at = time.perf_counter()
+
+    dataset = db.session.query(Dataset).filter(Dataset.id == dataset_id).first()
+
+    # check batch upload limit and vector space quota (enforced only when billing is enabled)
+    features = FeatureService.get_features(dataset.tenant_id)
+    try:
+        if features.billing.enabled:
+            vector_space = features.vector_space
+            count = len(document_ids)
+            batch_upload_limit = int(current_app.config['BATCH_UPLOAD_LIMIT'])
+            if count > batch_upload_limit:
+                raise ValueError(f"You have reached the batch upload limit of {batch_upload_limit}.")
+            if 0 < vector_space.limit <= vector_space.size:
+                raise ValueError("Your total number of documents plus the number of uploads have over the limit of "
+                                 "your subscription.")
+    except Exception as e:
+        for document_id in document_ids:
+            document = db.session.query(Document).filter(
+                Document.id == document_id,
+                Document.dataset_id == dataset_id
+            ).first()
+            if document:
+                document.indexing_status = 'error'
+                document.error = str(e)
+                document.stopped_at = datetime.datetime.utcnow()
+                db.session.add(document)
+        db.session.commit()
+        return
+
    for document_id in document_ids:
        logging.info(click.style('Start process document: {}'.format(document_id), fg='green'))