mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-01 05:17:51 +08:00
feat(moodle): support deleted-file sync (#14548)
Fixes #14551 ### What problem does this PR solve? The Moodle connector did not let the sync runner clean up indexed documents that were deleted from the source. Other connectors such as dropbox, seafile, webdav, and rss already do this through a slim snapshot pass. This PR adds the same support for Moodle. When `sync_deleted_files` is on, the runner now asks the Moodle connector for a lightweight list of every module id that could be indexed. The runner then compares this list with the index and removes any indexed document whose id is not in the list. The slim pass does not download files. It only goes through courses and modules and yields ids. The id format matches the ids that the loader produces, so the match is exact. ### Type of change - [x] New Feature (non-breaking change which adds functionality) ### Notes - `MoodleConnector` now also implements `SlimConnectorWithPermSync`. - New `retrieve_all_slim_docs_perm_sync` yields slim docs with the same ids the loader uses (`moodle_resource_<id>`, `moodle_forum_<id>`, `moodle_page_<id>`, `moodle_book_<id>`, `moodle_assign_<id>`, `moodle_quiz_<id>`). - The `Moodle` sync class now returns `(document_generator, file_list)` so the runner can do the cleanup. If the slim snapshot fails, `file_list` is set back to `None` and the run continues without cleanup. - The web data source map exposes `syncDeletedFiles` for Moodle so the option shows up in the UI. ### How was this tested? - `ruff check` passes on the changed Python files. - Manual review of the produced slim ids against the ids the loader builds in `_process_resource`, `_process_forum`, `_process_page`, `_process_book`, and `_process_activity`. - Behavior parity with the merged dropbox (#14476), seafile (#14499), webdav (#14491), and rss (#14493) PRs.
This commit is contained in:
@ -21,14 +21,19 @@ from common.data_source.interfaces import (
|
||||
LoadConnector,
|
||||
PollConnector,
|
||||
SecondsSinceUnixEpoch,
|
||||
SlimConnectorWithPermSync,
|
||||
)
|
||||
from common.data_source.models import (
|
||||
Document,
|
||||
GenerateSlimDocumentOutput,
|
||||
SlimDocument,
|
||||
)
|
||||
from common.data_source.models import Document
|
||||
from common.data_source.utils import batch_generator, rl_requests
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class MoodleConnector(LoadConnector, PollConnector):
|
||||
class MoodleConnector(LoadConnector, PollConnector, SlimConnectorWithPermSync):
|
||||
"""Moodle LMS connector for accessing course content"""
|
||||
|
||||
def __init__(self, moodle_url: str, batch_size: int = INDEX_BATCH_SIZE) -> None:
|
||||
@ -137,6 +142,78 @@ class MoodleConnector(LoadConnector, PollConnector):
|
||||
self._get_updated_content(courses, start, end)
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _slim_doc_id_for_module(module) -> Optional[str]:
|
||||
"""Return the indexed document id for a Moodle module, or None.
|
||||
|
||||
The id format must match the ones produced by the _process_*
|
||||
helpers below. Module types that we never ingest (label, url) and
|
||||
modules with no id return None.
|
||||
"""
|
||||
mtype = getattr(module, "modname", None)
|
||||
mid = getattr(module, "id", None)
|
||||
if not mtype or mid is None:
|
||||
return None
|
||||
if mtype in ("label", "url"):
|
||||
return None
|
||||
if mtype == "resource":
|
||||
return f"moodle_resource_{mid}"
|
||||
if mtype == "forum":
|
||||
return f"moodle_forum_{mid}"
|
||||
if mtype == "page":
|
||||
return f"moodle_page_{mid}"
|
||||
if mtype == "book":
|
||||
return f"moodle_book_{mid}"
|
||||
if mtype in ("assign", "quiz"):
|
||||
return f"moodle_{mtype}_{mid}"
|
||||
return None
|
||||
|
||||
def retrieve_all_slim_docs_perm_sync(
|
||||
self,
|
||||
callback: Any = None,
|
||||
) -> GenerateSlimDocumentOutput:
|
||||
"""List the ids of every Moodle module that could be indexed.
|
||||
|
||||
This is a lightweight pass over courses and modules with no file
|
||||
downloads. The caller compares the returned ids against the index
|
||||
and removes any indexed document whose id is not in this list.
|
||||
"""
|
||||
del callback
|
||||
if not self.moodle_client:
|
||||
raise ConnectorMissingCredentialError("Moodle client not initialized")
|
||||
|
||||
logger.info("Starting Moodle slim snapshot for stale-document cleanup")
|
||||
courses = self._get_enrolled_courses()
|
||||
if not courses:
|
||||
logger.warning("No courses found for slim snapshot")
|
||||
return
|
||||
|
||||
batch: list[SlimDocument] = []
|
||||
total = 0
|
||||
for course in courses:
|
||||
try:
|
||||
contents = self._get_course_contents(course.id)
|
||||
for section in contents:
|
||||
for module in section.modules:
|
||||
slim_id = self._slim_doc_id_for_module(module)
|
||||
if slim_id is None:
|
||||
continue
|
||||
batch.append(SlimDocument(id=slim_id))
|
||||
total += 1
|
||||
if len(batch) >= self.batch_size:
|
||||
yield batch
|
||||
batch = []
|
||||
except Exception as e:
|
||||
self._log_error(
|
||||
f"slim snapshot for course {getattr(course, 'fullname', '?')}",
|
||||
e,
|
||||
)
|
||||
|
||||
if batch:
|
||||
yield batch
|
||||
|
||||
logger.info(f"Moodle slim snapshot completed: {total} documents listed")
|
||||
|
||||
@retry(tries=3, delay=1, backoff=2)
|
||||
def _get_enrolled_courses(self) -> list:
|
||||
if not self.moodle_client:
|
||||
|
||||
@ -976,19 +976,40 @@ class Moodle(SyncBase):
|
||||
|
||||
# Determine the time range for synchronization based on reindex or poll_range_start
|
||||
poll_start = task.get("poll_range_start")
|
||||
file_list = None
|
||||
|
||||
if task["reindex"] == "1" or poll_start is None:
|
||||
document_generator = self.connector.load_from_state()
|
||||
_begin_info = "totally"
|
||||
else:
|
||||
# Freeze the poll end time BEFORE the slim snapshot so that the
|
||||
# snapshot and the poll cover the same point in time. Without
|
||||
# this, a module created between the snapshot and the poll
|
||||
# could be polled as new and at the same time be missing from
|
||||
# the slim list, which would mark it as stale and delete it.
|
||||
end_ts = datetime.now(timezone.utc).timestamp()
|
||||
|
||||
if self.conf.get("sync_deleted_files"):
|
||||
file_list = []
|
||||
try:
|
||||
for slim_batch in self.connector.retrieve_all_slim_docs_perm_sync():
|
||||
file_list.extend(slim_batch)
|
||||
except Exception:
|
||||
logging.exception(
|
||||
"Moodle slim snapshot failed; skipping stale-document cleanup "
|
||||
"(connector_id=%s, kb_id=%s)",
|
||||
task.get("connector_id"),
|
||||
task.get("kb_id"),
|
||||
)
|
||||
file_list = None
|
||||
document_generator = self.connector.poll_source(
|
||||
poll_start.timestamp(),
|
||||
datetime.now(timezone.utc).timestamp(),
|
||||
end_ts,
|
||||
)
|
||||
_begin_info = f"from {poll_start}"
|
||||
|
||||
self.log_connection("Moodle", self.conf["moodle_url"], task)
|
||||
return document_generator
|
||||
return document_generator, file_list
|
||||
|
||||
|
||||
class BOX(SyncBase):
|
||||
|
||||
@ -42,14 +42,23 @@ export default function ChatBasicSetting() {
|
||||
}, [metadataKeys]);
|
||||
|
||||
useEffect(() => {
|
||||
const currentFields = form.getValues('prompt_config.reference_metadata.fields');
|
||||
if (metadataInclude && Array.isArray(currentFields) && currentFields.length > 0 && metadataKeys) {
|
||||
const validFields = currentFields.filter((field) => metadataKeys.includes(field));
|
||||
const currentFields = form.getValues(
|
||||
'prompt_config.reference_metadata.fields',
|
||||
);
|
||||
if (
|
||||
metadataInclude &&
|
||||
Array.isArray(currentFields) &&
|
||||
currentFields.length > 0 &&
|
||||
metadataKeys
|
||||
) {
|
||||
const validFields = currentFields.filter((field) =>
|
||||
metadataKeys.includes(field),
|
||||
);
|
||||
if (validFields.length !== currentFields.length) {
|
||||
form.setValue('prompt_config.reference_metadata.fields', validFields);
|
||||
}
|
||||
} else if (!metadataInclude) {
|
||||
form.setValue('prompt_config.reference_metadata.fields', undefined);
|
||||
form.setValue('prompt_config.reference_metadata.fields', undefined);
|
||||
}
|
||||
}, [kbIds, metadataKeys, metadataInclude, form]);
|
||||
|
||||
|
||||
@ -26,11 +26,7 @@ import { MultiSelect } from '@/components/ui/multi-select';
|
||||
import { RAGFlowSelect } from '@/components/ui/select';
|
||||
import { Spin } from '@/components/ui/spin';
|
||||
import { Switch } from '@/components/ui/switch';
|
||||
import { Textarea } from '@/components/ui/textarea';
|
||||
import {
|
||||
useFetchKnowledgeList,
|
||||
useFetchKnowledgeMetadataKeys,
|
||||
} from '@/hooks/use-knowledge-request';
|
||||
import { useFetchKnowledgeMetadataKeys } from '@/hooks/use-knowledge-request';
|
||||
import {
|
||||
useComposeLlmOptionsByModelTypes,
|
||||
useSelectLlmOptionsByModelType,
|
||||
@ -232,14 +228,29 @@ const SearchSetting: React.FC<SearchSettingProps> = ({
|
||||
}, [metadataKeys]);
|
||||
|
||||
useEffect(() => {
|
||||
const currentFields = formMethods.getValues('search_config.reference_metadata.fields');
|
||||
if (referenceMetadataEnabled && Array.isArray(currentFields) && currentFields.length > 0 && metadataKeys) {
|
||||
const validFields = currentFields.filter((field) => metadataKeys.includes(field));
|
||||
const currentFields = formMethods.getValues(
|
||||
'search_config.reference_metadata.fields',
|
||||
);
|
||||
if (
|
||||
referenceMetadataEnabled &&
|
||||
Array.isArray(currentFields) &&
|
||||
currentFields.length > 0 &&
|
||||
metadataKeys
|
||||
) {
|
||||
const validFields = currentFields.filter((field) =>
|
||||
metadataKeys.includes(field),
|
||||
);
|
||||
if (validFields.length !== currentFields.length) {
|
||||
formMethods.setValue('search_config.reference_metadata.fields', validFields);
|
||||
formMethods.setValue(
|
||||
'search_config.reference_metadata.fields',
|
||||
validFields,
|
||||
);
|
||||
}
|
||||
} else if (!referenceMetadataEnabled) {
|
||||
formMethods.setValue('search_config.reference_metadata.fields', undefined);
|
||||
formMethods.setValue(
|
||||
'search_config.reference_metadata.fields',
|
||||
undefined,
|
||||
);
|
||||
}
|
||||
}, [selectedKbIds, metadataKeys, referenceMetadataEnabled, formMethods]);
|
||||
|
||||
|
||||
@ -126,6 +126,9 @@ export const DataSourceFeatureVisibilityMap: Partial<
|
||||
[DataSourceKey.RSS]: {
|
||||
syncDeletedFiles: true,
|
||||
},
|
||||
[DataSourceKey.MOODLE]: {
|
||||
syncDeletedFiles: true,
|
||||
},
|
||||
[DataSourceKey.MYSQL]: {
|
||||
syncDeletedFiles: true,
|
||||
},
|
||||
|
||||
@ -23,4 +23,4 @@ test('handles mixed double-escaped delimiters with HTML entities', () => {
|
||||
test('passes through already correct single-escaped delimiters unchanged', () => {
|
||||
const result = preprocessLaTeX('\\(x = 1\\)');
|
||||
expect(result).toBe('$x = 1$');
|
||||
});
|
||||
});
|
||||
|
||||
Reference in New Issue
Block a user