ragflow/test/unit_test/api/utils/test_api_file_utils.py

#
#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#

"""Unit tests for api.utils.file_utils (filename_type, thumbnail_img, sanitize_path, read_potential_broken_pdf)."""

import pytest
from api.db import FileType
from api.utils.file_utils import (
    MAX_BLOB_SIZE_PDF,
    MAX_BLOB_SIZE_THUMBNAIL,
    GHOSTSCRIPT_TIMEOUT_SEC,
    filename_type,
    thumbnail_img,
    thumbnail,
    sanitize_path,
    read_potential_broken_pdf,
    repair_pdf_with_ghostscript,
)


class TestFilenameType:
    """Checks of filename_type against recognised names and unusable input."""

    # (filename, expected FileType value) pairs: plain names, upper-case
    # extensions, and names that carry directory components.
    _RECOGNISED = (
        ("doc.pdf", FileType.PDF.value),
        ("a.PDF", FileType.PDF.value),
        ("x.png", FileType.VISUAL.value),
        ("file.docx", FileType.DOC.value),
        ("a/b/c.pdf", FileType.PDF.value),
        ("path/to/file.txt", FileType.DOC.value),
        ("book.epub", FileType.DOC.value),
        ("BOOK.EPUB", FileType.DOC.value),
        ("path/to/book.epub", FileType.DOC.value),
    )

    # Missing, blank, and non-string inputs that are expected to map to OTHER.
    _UNUSABLE = (None, "", "   ", 123, [])

    @pytest.mark.parametrize(("filename", "expected"), _RECOGNISED)
    def test_valid_filenames(self, filename, expected):
        """Each recognised filename yields its expected file type value."""
        detected = filename_type(filename)
        assert detected == expected

    @pytest.mark.parametrize("filename", _UNUSABLE)
    def test_invalid_or_empty_returns_other(self, filename):
        """Missing, blank, or non-string input yields FileType.OTHER."""
        detected = filename_type(filename)
        assert detected == FileType.OTHER.value

    def test_path_with_basename_uses_extension(self):
        """A nested path is classified by the extension of its last component."""
        nested = "folder/subfolder/document.pdf"
        detected = filename_type(nested)
        assert detected == FileType.PDF.value


class TestSanitizePath:
    """Table-driven checks of sanitize_path: raw input against expected output."""

    # (raw input, expected result) pairs. The table covers None, blank and
    # non-string input, a parent-directory segment, surrounding slashes, and
    # backslash separators.
    _CASES = (
        (None, ""),
        ("", ""),
        ("  ", ""),
        (42, ""),
        ("a/b", "a/b"),
        ("a/../b", "a/b"),
        ("/leading/", "leading"),
        ("\\mixed\\path", "mixed/path"),
    )

    @pytest.mark.parametrize(("raw", "expected"), _CASES)
    def test_sanitize_cases(self, raw, expected):
        """sanitize_path(raw) equals the expected cleaned path."""
        cleaned = sanitize_path(raw)
        assert cleaned == expected


class TestReadPotentialBrokenPdf:
    """Degenerate inputs to read_potential_broken_pdf all produce empty bytes."""

    def test_none_returns_empty_bytes(self):
        """None is answered with b""."""
        outcome = read_potential_broken_pdf(None)
        assert outcome == b""

    def test_empty_bytes_returns_as_is(self):
        """An empty bytes object comes back as empty bytes."""
        outcome = read_potential_broken_pdf(b"")
        assert outcome == b""

    def test_non_len_raises_or_returns_empty(self):
        """An object without __len__ is answered with b"".

        Despite the test name, only the "returns empty" outcome is accepted:
        an exception raised by the call would fail this test.
        """

        class NoLen:
            """Plain object that defines no __len__."""

        outcome = read_potential_broken_pdf(NoLen())
        assert outcome == b""


class TestThumbnailImg:
    """Inputs for which thumbnail_img is expected to return None."""

    def test_none_blob_returns_none(self):
        """A None blob produces no thumbnail."""
        produced = thumbnail_img("x.pdf", None)
        assert produced is None

    def test_none_filename_returns_none(self):
        """A None filename produces no thumbnail."""
        produced = thumbnail_img(None, b"fake pdf content")
        assert produced is None

    def test_empty_blob_returns_none(self):
        """A zero-length blob produces no thumbnail."""
        produced = thumbnail_img("x.pdf", b"")
        assert produced is None

    def test_empty_filename_returns_none(self):
        """An empty filename produces no thumbnail."""
        produced = thumbnail_img("", b"x")
        assert produced is None

    def test_oversized_blob_returns_none(self):
        """A blob one byte over MAX_BLOB_SIZE_THUMBNAIL produces no thumbnail."""
        over_limit = MAX_BLOB_SIZE_THUMBNAIL + 1
        oversized = b"x" * over_limit
        produced = thumbnail_img("x.pdf", oversized)
        assert produced is None


class TestThumbnail:
    """thumbnail() is expected to return a prefixed base64 string or ""."""

    def test_none_img_returns_empty_string(self):
        """An unrecognised extension with junk content yields an empty string."""
        encoded = thumbnail("x.xyz", b"garbage")
        assert encoded == ""

    def test_valid_img_returns_base64_prefix(self):
        """A .png payload yields an IMG_BASE64_PREFIX string or an empty string."""
        from api.constants import IMG_BASE64_PREFIX

        png_payload = b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01\x08\x02\x00\x00\x00\x90wS\xde\x00\x00\x00\x0cIDATx\x9cc\xf8\x0f\x00\x00\x01\x01\x00\x05\x18\xd8N\x00\x00\x00\x00IEND\xaeB`\x82"
        result = thumbnail("x.png", png_payload)

        # Both outcomes are accepted: a prefixed string, or "" when no
        # thumbnail could be produced from the payload.
        has_prefix = result.startswith(IMG_BASE64_PREFIX)
        assert has_prefix or result == ""


class TestRepairPdfWithGhostscript:
    """Edge cases for repair_pdf_with_ghostscript that need no real repair.

    Covers None input, empty input, and input larger than MAX_BLOB_SIZE_PDF.
    """

    def test_none_returns_empty_bytes(self):
        """None is answered with b""."""
        assert repair_pdf_with_ghostscript(None) == b""

    def test_empty_bytes_returns_empty(self):
        """Empty bytes are answered with b""."""
        assert repair_pdf_with_ghostscript(b"") == b""

    def test_oversized_returns_original_without_calling_gs(self, monkeypatch):
        """Oversized input is returned unchanged and no process is launched.

        The process-spawning entry points of ``subprocess`` are replaced with
        a recorder so the "without calling gs" half of the test name is
        actually checked.
        NOTE(review): this assumes the implementation launches Ghostscript
        through ``subprocess.run`` / ``subprocess.Popen`` looked up on the
        module — confirm against api.utils.file_utils.
        """
        import subprocess

        launches = []

        def _record_and_refuse(*args, **kwargs):
            # Record before raising: the code under test may swallow the
            # exception, so the recorded call is what the assertion relies on.
            launches.append((args, kwargs))
            raise RuntimeError("subprocess must not be started for oversized input")

        monkeypatch.setattr(subprocess, "run", _record_and_refuse)
        monkeypatch.setattr(subprocess, "Popen", _record_and_refuse)

        huge = b"%" * (MAX_BLOB_SIZE_PDF + 1)
        result = repair_pdf_with_ghostscript(huge)

        assert result == huge
        assert launches == []


class TestConstants:
    """Sanity checks: each imported resource limit is strictly positive."""

    def test_thumbnail_limit_positive(self):
        """MAX_BLOB_SIZE_THUMBNAIL is greater than zero."""
        assert 0 < MAX_BLOB_SIZE_THUMBNAIL

    def test_pdf_limit_positive(self):
        """MAX_BLOB_SIZE_PDF is greater than zero."""
        assert 0 < MAX_BLOB_SIZE_PDF

    def test_gs_timeout_positive(self):
        """GHOSTSCRIPT_TIMEOUT_SEC is greater than zero."""
        assert 0 < GHOSTSCRIPT_TIMEOUT_SEC