mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-03-13 02:57:48 +08:00
### What problem does this PR solve?
The SeaFile connector currently synchronises the entire account — every
library
visible to the authenticated user. This is impractical for users who
only need
a subset of their data indexed, especially on large SeaFile instances
with many
shared libraries.
This PR introduces granular sync scope support, allowing users to choose
between
syncing their entire account, a single library, or a specific directory
within a
library. It also adds support for SeaFile library-scoped API tokens
(`/api/v2.1/via-repo-token/` endpoints), enabling tighter access control
without
exposing account-level credentials.
### Type of change
- [ ] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [ ] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):
### Test
```
from seafile_connector import SeaFileConnector
import logging
import os
logging.basicConfig(level=logging.DEBUG)
URL = os.environ.get("SEAFILE_URL", "https://seafile.example.com")
TOKEN = os.environ.get("SEAFILE_TOKEN", "")
REPO_ID = os.environ.get("SEAFILE_REPO_ID", "")
SYNC_PATH = os.environ.get("SEAFILE_SYNC_PATH", "/Documents")
REPO_TOKEN = os.environ.get("SEAFILE_REPO_TOKEN", "")
def _test_scope(scope, repo_id=None, sync_path=None):
print(f"\n{'='*50}")
print(f"Testing scope: {scope}")
print(f"{'='*50}")
creds = {"seafile_token": TOKEN} if TOKEN else {}
if REPO_TOKEN and scope in ("library", "directory"):
creds["repo_token"] = REPO_TOKEN
connector = SeaFileConnector(
seafile_url=URL,
batch_size=5,
sync_scope=scope,
        include_shared=False,
repo_id=repo_id,
sync_path=sync_path,
)
connector.load_credentials(creds)
connector.validate_connector_settings()
count = 0
for batch in connector.load_from_state():
for doc in batch:
count += 1
print(f" [{count}] {doc.semantic_identifier} "
f"({doc.size_bytes} bytes, {doc.extension})")
print(f"\n-> {scope} scope: {count} document(s) found.\n")
# 1. Account scope
if TOKEN:
_test_scope("account")
else:
print("\nSkipping account scope (set SEAFILE_TOKEN)")
# 2. Library scope
if REPO_ID and (TOKEN or REPO_TOKEN):
_test_scope("library", repo_id=REPO_ID)
else:
print("\nSkipping library scope (set SEAFILE_REPO_ID + token)")
# 3. Directory scope
if REPO_ID and SYNC_PATH and (TOKEN or REPO_TOKEN):
_test_scope("directory", repo_id=REPO_ID, sync_path=SYNC_PATH)
else:
print("\nSkipping directory scope (set SEAFILE_REPO_ID + SEAFILE_SYNC_PATH + token)")
```
542 lines
20 KiB
Python
542 lines
20 KiB
Python
"""SeaFile connector with granular sync support"""
|
|
import logging
|
|
from datetime import datetime, timezone
|
|
from typing import Any, Optional
|
|
|
|
from retry import retry
|
|
|
|
from common.data_source.utils import (
|
|
get_file_ext,
|
|
rl_requests,
|
|
)
|
|
from common.data_source.config import (
|
|
DocumentSource,
|
|
INDEX_BATCH_SIZE,
|
|
BLOB_STORAGE_SIZE_THRESHOLD,
|
|
)
|
|
from common.data_source.exceptions import (
|
|
ConnectorMissingCredentialError,
|
|
ConnectorValidationError,
|
|
CredentialExpiredError,
|
|
InsufficientPermissionsError,
|
|
)
|
|
from common.data_source.interfaces import LoadConnector, PollConnector
|
|
from common.data_source.models import (
|
|
Document,
|
|
SecondsSinceUnixEpoch,
|
|
GenerateDocumentsOutput,
|
|
SeafileSyncScope,
|
|
)
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
class SeaFileConnector(LoadConnector, PollConnector):
|
|
"""SeaFile connector supporting account-, library- and directory-level sync.
|
|
|
|
API endpoints used:
|
|
Account token (api2):
|
|
GET /api2/account/info/
|
|
GET /api2/repos/
|
|
GET /api2/repos/{repo_id}/
|
|
GET /api2/repos/{repo_id}/dir/?p=...
|
|
GET /api2/repos/{repo_id}/file/?p=...&reuse=1
|
|
|
|
Repo token (api/v2.1/via-repo-token):
|
|
GET /api/v2.1/via-repo-token/repo-info/
|
|
GET /api/v2.1/via-repo-token/dir/?path=...
|
|
GET /api/v2.1/via-repo-token/download-link/?path=...
|
|
"""
|
|
|
|
def __init__(
|
|
self,
|
|
seafile_url: str,
|
|
batch_size: int = INDEX_BATCH_SIZE,
|
|
include_shared: bool = True,
|
|
sync_scope: str = SeafileSyncScope.ACCOUNT,
|
|
repo_id: Optional[str] = None,
|
|
sync_path: Optional[str] = None,
|
|
) -> None:
|
|
self.seafile_url = seafile_url.rstrip("/")
|
|
self.batch_size = batch_size
|
|
self.include_shared = include_shared
|
|
self.sync_scope = SeafileSyncScope(sync_scope)
|
|
self.repo_id = repo_id
|
|
self.sync_path = self._normalise_path(sync_path)
|
|
|
|
self.token: Optional[str] = None # account-level
|
|
self.repo_token: Optional[str] = None # library-scoped
|
|
self.current_user_email: Optional[str] = None
|
|
self.size_threshold: int = BLOB_STORAGE_SIZE_THRESHOLD
|
|
|
|
self._validate_scope_params()
|
|
|
|
|
|
@staticmethod
|
|
def _normalise_path(path: Optional[str]) -> str:
|
|
if not path:
|
|
return "/"
|
|
path = path.strip()
|
|
if not path.startswith("/"):
|
|
path = f"/{path}"
|
|
return path.rstrip("/") or "/"
|
|
|
|
@staticmethod
|
|
def _parse_mtime(raw_mtime) -> datetime:
|
|
"""Parse mtime from SeaFile API response.
|
|
|
|
Handles:
|
|
- Unix timestamp as int: 1575514722
|
|
- Unix timestamp as str: "1575514722"
|
|
- ISO 8601 datetime str: "2026-02-15T17:26:53+01:00"
|
|
- None / missing
|
|
"""
|
|
if not raw_mtime:
|
|
return datetime.now(timezone.utc)
|
|
|
|
# Try as unix timestamp (int or numeric string)
|
|
if isinstance(raw_mtime, (int, float)):
|
|
return datetime.fromtimestamp(raw_mtime, tz=timezone.utc)
|
|
|
|
if isinstance(raw_mtime, str):
|
|
# Try numeric string first
|
|
try:
|
|
return datetime.fromtimestamp(int(raw_mtime), tz=timezone.utc)
|
|
except ValueError:
|
|
pass
|
|
|
|
# Try ISO 8601
|
|
try:
|
|
return datetime.fromisoformat(raw_mtime)
|
|
except ValueError:
|
|
pass
|
|
|
|
logger.warning("Unparseable mtime %r, using current time", raw_mtime)
|
|
return datetime.now(timezone.utc)
|
|
|
|
def _validate_scope_params(self) -> None:
|
|
if self.sync_scope in (SeafileSyncScope.LIBRARY, SeafileSyncScope.DIRECTORY):
|
|
if not self.repo_id:
|
|
raise ConnectorValidationError(
|
|
f"sync_scope={self.sync_scope.value!r} requires 'repo_id'."
|
|
)
|
|
if self.sync_scope == SeafileSyncScope.DIRECTORY:
|
|
if self.sync_path == "/":
|
|
raise ConnectorValidationError(
|
|
"sync_scope='directory' requires a non-root 'sync_path'. "
|
|
"Use sync_scope='library' to sync an entire library."
|
|
)
|
|
|
|
@property
|
|
def _use_repo_token(self) -> bool:
|
|
"""Whether we should use repo-token endpoints."""
|
|
return self.repo_token is not None
|
|
|
|
|
|
def _account_headers(self) -> dict[str, str]:
|
|
if not self.token:
|
|
raise ConnectorMissingCredentialError("Account token not set")
|
|
return {
|
|
"Authorization": f"Token {self.token}",
|
|
"Accept": "application/json",
|
|
}
|
|
|
|
def _repo_token_headers(self) -> dict[str, str]:
|
|
if not self.repo_token:
|
|
raise ConnectorMissingCredentialError("Repo token not set")
|
|
return {
|
|
"Authorization": f"Bearer {self.repo_token}", # <-- Bearer, not Token
|
|
"Accept": "application/json",
|
|
}
|
|
|
|
def _account_get(self, endpoint: str, params: Optional[dict] = None):
|
|
"""GET against /api2/... using the account token."""
|
|
url = f"{self.seafile_url}/api2/{endpoint.lstrip('/')}"
|
|
resp = rl_requests.get(
|
|
url, headers=self._account_headers(), params=params, timeout=60,
|
|
)
|
|
return resp
|
|
|
|
def _repo_token_get(self, endpoint: str, params: Optional[dict] = None):
|
|
"""GET against /api/v2.1/via-repo-token/... using the repo token."""
|
|
url = f"{self.seafile_url}/api/v2.1/via-repo-token/{endpoint.lstrip('/')}"
|
|
resp = rl_requests.get(
|
|
url, headers=self._repo_token_headers(), params=params, timeout=60,
|
|
)
|
|
return resp
|
|
|
|
|
|
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
|
|
logger.debug("Loading credentials for SeaFile server %s", self.seafile_url)
|
|
|
|
token = credentials.get("seafile_token")
|
|
repo_token = credentials.get("repo_token")
|
|
username = credentials.get("username")
|
|
password = credentials.get("password")
|
|
|
|
if token:
|
|
self.token = token
|
|
elif username and password:
|
|
self.token = self._authenticate_with_password(username, password)
|
|
|
|
if repo_token and self.sync_scope in (SeafileSyncScope.LIBRARY, SeafileSyncScope.DIRECTORY):
|
|
self.repo_token = repo_token
|
|
elif repo_token:
|
|
logger.debug(
|
|
"repo_token supplied but scope=%s; ignoring.",
|
|
self.sync_scope.value,
|
|
)
|
|
|
|
if not self.token and not self.repo_token:
|
|
raise ConnectorMissingCredentialError(
|
|
"SeaFile requires 'seafile_token', 'repo_token', "
|
|
"or 'username'/'password'."
|
|
)
|
|
|
|
try:
|
|
self._validate_credentials()
|
|
except ConnectorMissingCredentialError:
|
|
raise
|
|
except Exception as e:
|
|
raise CredentialExpiredError(
|
|
f"SeaFile credential validation failed: {e}"
|
|
)
|
|
|
|
return None
|
|
|
|
def _authenticate_with_password(self, username: str, password: str) -> str:
|
|
try:
|
|
resp = rl_requests.post(
|
|
f"{self.seafile_url}/api2/auth-token/",
|
|
data={"username": username, "password": password},
|
|
timeout=30,
|
|
)
|
|
resp.raise_for_status()
|
|
token = resp.json().get("token")
|
|
if not token:
|
|
raise CredentialExpiredError("No token returned")
|
|
return token
|
|
except Exception as e:
|
|
raise ConnectorMissingCredentialError(
|
|
f"Failed to authenticate with SeaFile: {e}"
|
|
)
|
|
|
|
def _validate_credentials(self) -> None:
|
|
if self.token:
|
|
self._validate_account_token()
|
|
|
|
if self.repo_token:
|
|
self._validate_repo_token()
|
|
elif self.sync_scope in (SeafileSyncScope.LIBRARY, SeafileSyncScope.DIRECTORY):
|
|
self._validate_repo_access_via_account()
|
|
|
|
def _validate_account_token(self) -> dict:
|
|
resp = self._account_get("/account/info/")
|
|
resp.raise_for_status()
|
|
info = resp.json()
|
|
self.current_user_email = info.get("email")
|
|
logger.info("SeaFile authenticated as: %s", self.current_user_email)
|
|
return info
|
|
|
|
def _validate_repo_token(self) -> None:
|
|
"""Validate repo token using /api/v2.1/via-repo-token/repo-info/"""
|
|
try:
|
|
resp = self._repo_token_get("repo-info/")
|
|
resp.raise_for_status()
|
|
info = resp.json()
|
|
logger.info(
|
|
"Repo token validated — library: %s (id: %s)",
|
|
info.get("repo_name", "?"), info.get("repo_id", self.repo_id),
|
|
)
|
|
# Update repo_id from response if not set
|
|
if not self.repo_id and info.get("repo_id"):
|
|
self.repo_id = info["repo_id"]
|
|
except Exception as e:
|
|
raise CredentialExpiredError(
|
|
f"Repo token validation failed: {e}"
|
|
)
|
|
|
|
def _validate_repo_access_via_account(self) -> None:
|
|
repo_info = self._get_repo_info_via_account(self.repo_id)
|
|
if not repo_info:
|
|
raise ConnectorValidationError(
|
|
f"Library {self.repo_id} not accessible with account token."
|
|
)
|
|
if self.sync_scope == SeafileSyncScope.DIRECTORY:
|
|
entries = self._get_directory_entries(self.repo_id, self.sync_path)
|
|
if entries is None:
|
|
raise ConnectorValidationError(
|
|
f"Directory {self.sync_path!r} does not exist "
|
|
f"in library {self.repo_id}."
|
|
)
|
|
|
|
|
|
def validate_connector_settings(self) -> None:
|
|
if not self.token and not self.repo_token:
|
|
raise ConnectorMissingCredentialError("SeaFile credentials not loaded.")
|
|
if not self.seafile_url:
|
|
raise ConnectorValidationError("No SeaFile URL was provided.")
|
|
|
|
try:
|
|
if self.sync_scope == SeafileSyncScope.ACCOUNT:
|
|
libs = self._get_libraries()
|
|
logger.info("Validated (account scope). %d libraries.", len(libs))
|
|
elif self.sync_scope == SeafileSyncScope.LIBRARY:
|
|
info = self._get_repo_info()
|
|
logger.info(
|
|
"Validated (library scope): %s", info.get("name", self.repo_id)
|
|
)
|
|
elif self.sync_scope == SeafileSyncScope.DIRECTORY:
|
|
entries = self._get_directory_entries(self.repo_id, self.sync_path)
|
|
logger.info(
|
|
"Validated (directory scope): %s:%s (%d entries)",
|
|
self.repo_id, self.sync_path, len(entries),
|
|
)
|
|
except (
|
|
ConnectorValidationError, ConnectorMissingCredentialError,
|
|
CredentialExpiredError, InsufficientPermissionsError,
|
|
):
|
|
raise
|
|
except Exception as e:
|
|
status = getattr(getattr(e, "response", None), "status_code", None)
|
|
if status == 401:
|
|
raise CredentialExpiredError("Token invalid or expired.")
|
|
if status == 403:
|
|
raise InsufficientPermissionsError("Insufficient permissions.")
|
|
raise ConnectorValidationError(f"Validation failed: {repr(e)}")
|
|
|
|
|
|
@retry(tries=3, delay=1, backoff=2)
|
|
def _get_libraries(self) -> list[dict]:
|
|
"""List all libraries (account token only)."""
|
|
resp = self._account_get("/repos/")
|
|
resp.raise_for_status()
|
|
libraries = resp.json()
|
|
|
|
if not self.include_shared and self.current_user_email:
|
|
libraries = [
|
|
lib for lib in libraries
|
|
if lib.get("owner") == self.current_user_email
|
|
or lib.get("owner_email") == self.current_user_email
|
|
]
|
|
|
|
return libraries
|
|
|
|
@retry(tries=3, delay=1, backoff=2)
|
|
def _get_repo_info_via_account(self, repo_id: str) -> Optional[dict]:
|
|
"""GET /api2/repos/{repo_id}/ — account token."""
|
|
try:
|
|
resp = self._account_get(f"/repos/{repo_id}/")
|
|
resp.raise_for_status()
|
|
return resp.json()
|
|
except Exception as e:
|
|
logger.warning("Error fetching repo info for %s: %s", repo_id, e)
|
|
return None
|
|
|
|
@retry(tries=3, delay=1, backoff=2)
|
|
def _get_repo_info_via_repo_token(self) -> Optional[dict]:
|
|
"""GET /api/v2.1/via-repo-token/repo-info/ — repo token."""
|
|
try:
|
|
resp = self._repo_token_get("repo-info/")
|
|
resp.raise_for_status()
|
|
return resp.json()
|
|
except Exception as e:
|
|
logger.warning("Error fetching repo info via repo token: %s", e)
|
|
return None
|
|
|
|
def _get_repo_info(self) -> Optional[dict]:
|
|
"""Get repo info using whichever token is available."""
|
|
if self._use_repo_token:
|
|
info = self._get_repo_info_via_repo_token()
|
|
if info:
|
|
# Normalise keys to match account-token response shape
|
|
return {
|
|
"id": info.get("repo_id", self.repo_id),
|
|
"name": info.get("repo_name", self.repo_id),
|
|
}
|
|
return None
|
|
return self._get_repo_info_via_account(self.repo_id)
|
|
|
|
@retry(tries=3, delay=1, backoff=2)
|
|
def _get_directory_entries(self, repo_id: str, path: str = "/") -> list[dict]:
|
|
"""List directory contents using the appropriate endpoint."""
|
|
try:
|
|
if self._use_repo_token:
|
|
# GET /api/v2.1/via-repo-token/dir/?path=/foo
|
|
resp = self._repo_token_get("dir/", params={"path": path})
|
|
else:
|
|
# GET /api2/repos/{repo_id}/dir/?p=/foo
|
|
resp = self._account_get(
|
|
f"/repos/{repo_id}/dir/", params={"p": path},
|
|
)
|
|
resp.raise_for_status()
|
|
data = resp.json()
|
|
|
|
# v2.1 wraps entries in {"dirent_list": [...]}
|
|
if isinstance(data, dict) and "dirent_list" in data:
|
|
return data["dirent_list"]
|
|
return data
|
|
|
|
except Exception as e:
|
|
logger.warning(
|
|
"Error fetching directory %s in repo %s: %s", path, repo_id, e,
|
|
)
|
|
return []
|
|
|
|
@retry(tries=3, delay=1, backoff=2)
|
|
def _get_file_download_link(
|
|
self, repo_id: str, path: str
|
|
) -> Optional[str]:
|
|
"""Get a temporary download URL for a file."""
|
|
try:
|
|
if self._use_repo_token:
|
|
# GET /api/v2.1/via-repo-token/download-link/?path=/foo.pdf
|
|
resp = self._repo_token_get(
|
|
"download-link/", params={"path": path},
|
|
)
|
|
else:
|
|
# GET /api2/repos/{repo_id}/file/?p=/foo.pdf&reuse=1
|
|
resp = self._account_get(
|
|
f"/repos/{repo_id}/file/", params={"p": path, "reuse": 1},
|
|
)
|
|
resp.raise_for_status()
|
|
return resp.text.strip('"')
|
|
except Exception as e:
|
|
logger.warning("Error getting download link for %s: %s", path, e)
|
|
return None
|
|
|
|
|
|
def _list_files_recursive(
|
|
self,
|
|
repo_id: str,
|
|
repo_name: str,
|
|
path: str,
|
|
start: datetime,
|
|
end: datetime,
|
|
) -> list[tuple[str, dict, dict]]:
|
|
files = []
|
|
entries = self._get_directory_entries(repo_id, path)
|
|
|
|
for entry in entries:
|
|
entry_type = entry.get("type")
|
|
entry_name = entry.get("name", "")
|
|
entry_path = f"{path.rstrip('/')}/{entry_name}"
|
|
|
|
if entry_type == "dir":
|
|
files.extend(
|
|
self._list_files_recursive(
|
|
repo_id, repo_name, entry_path, start, end,
|
|
)
|
|
)
|
|
elif entry_type == "file":
|
|
modified = self._parse_mtime(entry.get("mtime"))
|
|
if start < modified <= end:
|
|
files.append(
|
|
(entry_path, entry,
|
|
{"id": repo_id, "name": repo_name})
|
|
)
|
|
|
|
return files
|
|
|
|
def _resolve_libraries_to_scan(self) -> list[dict]:
|
|
if self.sync_scope == SeafileSyncScope.ACCOUNT:
|
|
return [
|
|
{"id": lib["id"], "name": lib.get("name", "Unknown")}
|
|
for lib in self._get_libraries() if lib.get("id")
|
|
]
|
|
|
|
info = self._get_repo_info()
|
|
if info:
|
|
return [{"id": info.get("id", self.repo_id),
|
|
"name": info.get("name", self.repo_id)}]
|
|
return [{"id": self.repo_id, "name": self.repo_id}]
|
|
|
|
def _root_path_for_repo(self, repo_id: str) -> str:
|
|
if (self.sync_scope == SeafileSyncScope.DIRECTORY
|
|
and repo_id == self.repo_id):
|
|
return self.sync_path
|
|
return "/"
|
|
|
|
|
|
    def _yield_seafile_documents(
        self, start: datetime, end: datetime,
    ) -> GenerateDocumentsOutput:
        """Yield batches of Documents for files modified in (start, end].

        Enumerates every library selected by the configured scope, downloads
        each file whose mtime falls in the window, and yields lists of at most
        self.batch_size Documents. Oversized files, empty files, and per-file
        failures are logged and skipped — a single bad file or library never
        aborts the sync.
        """
        libraries = self._resolve_libraries_to_scan()
        logger.info(
            "Processing %d library(ies) [scope=%s]",
            len(libraries), self.sync_scope.value,
        )

        # Phase 1: enumerate candidate files across all selected libraries.
        all_files: list[tuple[str, dict, dict]] = []
        for lib in libraries:
            # Directory scope narrows the walk root for the configured repo.
            root = self._root_path_for_repo(lib["id"])
            logger.debug("Scanning %s starting at %s", lib["name"], root)
            try:
                files = self._list_files_recursive(
                    lib["id"], lib["name"], root, start, end,
                )
                all_files.extend(files)
            except Exception as e:
                # Best effort: one broken library must not abort the sync.
                logger.error("Error in library %s: %s", lib["name"], e)

        logger.info("Found %d file(s) matching criteria", len(all_files))

        # Phase 2: download contents and emit Document batches.
        batch: list[Document] = []
        for file_path, file_entry, library in all_files:
            file_name = file_entry.get("name", "")
            file_size = file_entry.get("size", 0)
            file_id = file_entry.get("id", "")
            repo_id = library["id"]
            repo_name = library["name"]

            modified = self._parse_mtime(file_entry.get("mtime"))

            # Skip oversized files before spending a download on them.
            if file_size > self.size_threshold:
                logger.warning("Skipping large file: %s (%d B)", file_path, file_size)
                continue

            try:
                download_link = self._get_file_download_link(repo_id, file_path)
                if not download_link:
                    # Link retrieval already logged its failure; move on.
                    continue

                resp = rl_requests.get(download_link, timeout=120)
                resp.raise_for_status()
                blob = resp.content
                if not blob:
                    # Empty files carry no indexable content.
                    continue

                batch.append(Document(
                    id=f"seafile:{repo_id}:{file_id}",
                    blob=blob,
                    source=DocumentSource.SEAFILE,
                    semantic_identifier=f"{repo_name}{file_path}",
                    extension=get_file_ext(file_name),
                    doc_updated_at=modified,  # <-- already parsed
                    size_bytes=len(blob),
                ))

                if len(batch) >= self.batch_size:
                    yield batch
                    batch = []

            except Exception as e:
                # Per-file failures are logged and skipped, not fatal. Note
                # the yield above sits inside this try, so exceptions thrown
                # into the generator at that point are also absorbed here.
                logger.error("Error downloading %s: %s", file_path, e)

        # Flush the final partial batch.
        if batch:
            yield batch
|
def load_from_state(self) -> GenerateDocumentsOutput:
|
|
return self._yield_seafile_documents(
|
|
start=datetime(1970, 1, 1, tzinfo=timezone.utc),
|
|
end=datetime.now(timezone.utc),
|
|
)
|
|
|
|
def poll_source(
|
|
self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch,
|
|
) -> GenerateDocumentsOutput:
|
|
start_dt = datetime.fromtimestamp(start, tz=timezone.utc)
|
|
end_dt = datetime.fromtimestamp(end, tz=timezone.utc)
|
|
for batch in self._yield_seafile_documents(start_dt, end_dt):
|
|
yield batch
|
|
|
|
|