feat(moodle): support deleted-file sync (#14548)

Fixes #14551 

### What problem does this PR solve?

The Moodle connector did not let the sync runner clean up indexed
documents that were deleted from the source. Other connectors such as
dropbox, seafile, webdav, and rss already do this through a slim
snapshot pass. This PR adds the same support for Moodle.

When `sync_deleted_files` is on, the runner now asks the Moodle
connector for a lightweight list of every module id that could be
indexed. The runner then compares this list with the index and removes
any indexed document whose id is not in the list.

The slim pass does not download files. It only goes through courses and
modules and yields ids. The id format matches the ids that the loader
produces, so the match is exact.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

### Notes

- `MoodleConnector` now also implements `SlimConnectorWithPermSync`.
- New `retrieve_all_slim_docs_perm_sync` yields slim docs with the same
ids the loader uses (`moodle_resource_<id>`, `moodle_forum_<id>`,
`moodle_page_<id>`, `moodle_book_<id>`, `moodle_assign_<id>`,
`moodle_quiz_<id>`).
- The `Moodle` sync class now returns `(document_generator, file_list)`
so the runner can do the cleanup. If the slim snapshot fails,
`file_list` is set back to `None` and the run continues without cleanup.
- The web data source map exposes `syncDeletedFiles` for Moodle so the
option shows up in the UI.

### How was this tested?

- `ruff check` passes on the changed Python files.
- Manual review of the produced slim ids against the ids the loader
builds in `_process_resource`, `_process_forum`, `_process_page`,
`_process_book`, and `_process_activity`.
- Behavior parity with the merged dropbox (#14476), seafile (#14499),
webdav (#14491), and rss (#14493) PRs.
This commit is contained in:
Jack Storment
2026-05-07 11:44:46 +02:00
committed by GitHub
parent 94324afee9
commit 59bb184e63
6 changed files with 140 additions and 19 deletions

View File

@ -21,14 +21,19 @@ from common.data_source.interfaces import (
LoadConnector,
PollConnector,
SecondsSinceUnixEpoch,
SlimConnectorWithPermSync,
)
from common.data_source.models import (
Document,
GenerateSlimDocumentOutput,
SlimDocument,
)
from common.data_source.models import Document
from common.data_source.utils import batch_generator, rl_requests
logger = logging.getLogger(__name__)
class MoodleConnector(LoadConnector, PollConnector):
class MoodleConnector(LoadConnector, PollConnector, SlimConnectorWithPermSync):
"""Moodle LMS connector for accessing course content"""
def __init__(self, moodle_url: str, batch_size: int = INDEX_BATCH_SIZE) -> None:
@ -137,6 +142,78 @@ class MoodleConnector(LoadConnector, PollConnector):
self._get_updated_content(courses, start, end)
)
@staticmethod
def _slim_doc_id_for_module(module) -> Optional[str]:
"""Return the indexed document id for a Moodle module, or None.
The id format must match the ones produced by the _process_*
helpers below. Module types that we never ingest (label, url) and
modules with no id return None.
"""
mtype = getattr(module, "modname", None)
mid = getattr(module, "id", None)
if not mtype or mid is None:
return None
if mtype in ("label", "url"):
return None
if mtype == "resource":
return f"moodle_resource_{mid}"
if mtype == "forum":
return f"moodle_forum_{mid}"
if mtype == "page":
return f"moodle_page_{mid}"
if mtype == "book":
return f"moodle_book_{mid}"
if mtype in ("assign", "quiz"):
return f"moodle_{mtype}_{mid}"
return None
def retrieve_all_slim_docs_perm_sync(
self,
callback: Any = None,
) -> GenerateSlimDocumentOutput:
"""List the ids of every Moodle module that could be indexed.
This is a lightweight pass over courses and modules with no file
downloads. The caller compares the returned ids against the index
and removes any indexed document whose id is not in this list.
"""
del callback
if not self.moodle_client:
raise ConnectorMissingCredentialError("Moodle client not initialized")
logger.info("Starting Moodle slim snapshot for stale-document cleanup")
courses = self._get_enrolled_courses()
if not courses:
logger.warning("No courses found for slim snapshot")
return
batch: list[SlimDocument] = []
total = 0
for course in courses:
try:
contents = self._get_course_contents(course.id)
for section in contents:
for module in section.modules:
slim_id = self._slim_doc_id_for_module(module)
if slim_id is None:
continue
batch.append(SlimDocument(id=slim_id))
total += 1
if len(batch) >= self.batch_size:
yield batch
batch = []
except Exception as e:
self._log_error(
f"slim snapshot for course {getattr(course, 'fullname', '?')}",
e,
)
if batch:
yield batch
logger.info(f"Moodle slim snapshot completed: {total} documents listed")
@retry(tries=3, delay=1, backoff=2)
def _get_enrolled_courses(self) -> list:
if not self.moodle_client:

View File

@ -976,19 +976,40 @@ class Moodle(SyncBase):
# Determine the time range for synchronization based on reindex or poll_range_start
poll_start = task.get("poll_range_start")
file_list = None
if task["reindex"] == "1" or poll_start is None:
document_generator = self.connector.load_from_state()
_begin_info = "totally"
else:
# Freeze the poll end time BEFORE the slim snapshot so that the
# snapshot and the poll cover the same point in time. Without
# this, a module created between the snapshot and the poll
# could be polled as new and at the same time be missing from
# the slim list, which would mark it as stale and delete it.
end_ts = datetime.now(timezone.utc).timestamp()
if self.conf.get("sync_deleted_files"):
file_list = []
try:
for slim_batch in self.connector.retrieve_all_slim_docs_perm_sync():
file_list.extend(slim_batch)
except Exception:
logging.exception(
"Moodle slim snapshot failed; skipping stale-document cleanup "
"(connector_id=%s, kb_id=%s)",
task.get("connector_id"),
task.get("kb_id"),
)
file_list = None
document_generator = self.connector.poll_source(
poll_start.timestamp(),
datetime.now(timezone.utc).timestamp(),
end_ts,
)
_begin_info = f"from {poll_start}"
self.log_connection("Moodle", self.conf["moodle_url"], task)
return document_generator
return document_generator, file_list
class BOX(SyncBase):

View File

@ -42,14 +42,23 @@ export default function ChatBasicSetting() {
}, [metadataKeys]);
useEffect(() => {
const currentFields = form.getValues('prompt_config.reference_metadata.fields');
if (metadataInclude && Array.isArray(currentFields) && currentFields.length > 0 && metadataKeys) {
const validFields = currentFields.filter((field) => metadataKeys.includes(field));
const currentFields = form.getValues(
'prompt_config.reference_metadata.fields',
);
if (
metadataInclude &&
Array.isArray(currentFields) &&
currentFields.length > 0 &&
metadataKeys
) {
const validFields = currentFields.filter((field) =>
metadataKeys.includes(field),
);
if (validFields.length !== currentFields.length) {
form.setValue('prompt_config.reference_metadata.fields', validFields);
}
} else if (!metadataInclude) {
form.setValue('prompt_config.reference_metadata.fields', undefined);
form.setValue('prompt_config.reference_metadata.fields', undefined);
}
}, [kbIds, metadataKeys, metadataInclude, form]);

View File

@ -26,11 +26,7 @@ import { MultiSelect } from '@/components/ui/multi-select';
import { RAGFlowSelect } from '@/components/ui/select';
import { Spin } from '@/components/ui/spin';
import { Switch } from '@/components/ui/switch';
import { Textarea } from '@/components/ui/textarea';
import {
useFetchKnowledgeList,
useFetchKnowledgeMetadataKeys,
} from '@/hooks/use-knowledge-request';
import { useFetchKnowledgeMetadataKeys } from '@/hooks/use-knowledge-request';
import {
useComposeLlmOptionsByModelTypes,
useSelectLlmOptionsByModelType,
@ -232,14 +228,29 @@ const SearchSetting: React.FC<SearchSettingProps> = ({
}, [metadataKeys]);
useEffect(() => {
const currentFields = formMethods.getValues('search_config.reference_metadata.fields');
if (referenceMetadataEnabled && Array.isArray(currentFields) && currentFields.length > 0 && metadataKeys) {
const validFields = currentFields.filter((field) => metadataKeys.includes(field));
const currentFields = formMethods.getValues(
'search_config.reference_metadata.fields',
);
if (
referenceMetadataEnabled &&
Array.isArray(currentFields) &&
currentFields.length > 0 &&
metadataKeys
) {
const validFields = currentFields.filter((field) =>
metadataKeys.includes(field),
);
if (validFields.length !== currentFields.length) {
formMethods.setValue('search_config.reference_metadata.fields', validFields);
formMethods.setValue(
'search_config.reference_metadata.fields',
validFields,
);
}
} else if (!referenceMetadataEnabled) {
formMethods.setValue('search_config.reference_metadata.fields', undefined);
formMethods.setValue(
'search_config.reference_metadata.fields',
undefined,
);
}
}, [selectedKbIds, metadataKeys, referenceMetadataEnabled, formMethods]);

View File

@ -126,6 +126,9 @@ export const DataSourceFeatureVisibilityMap: Partial<
[DataSourceKey.RSS]: {
syncDeletedFiles: true,
},
[DataSourceKey.MOODLE]: {
syncDeletedFiles: true,
},
[DataSourceKey.MYSQL]: {
syncDeletedFiles: true,
},

View File

@ -23,4 +23,4 @@ test('handles mixed double-escaped delimiters with HTML entities', () => {
test('passes through already correct single-escaped delimiters unchanged', () => {
const result = preprocessLaTeX('\\(x = 1\\)');
expect(result).toBe('$x = 1$');
});
});