mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-05-22 00:50:10 +08:00
## Summary Fix critical severity security issue in `internal/cpp/opencc/dictionary/text.c`. ## Vulnerability | Field | Value | |-------|-------| | **ID** | V-001 | | **Severity** | CRITICAL | | **Scanner** | multi_agent_ai | | **Rule** | `V-001` | | **File** | `internal/cpp/opencc/dictionary/text.c:107` | **Description**: The OpenCC C library uses fgets() to read dictionary and configuration files without proper bounds validation on subsequent buffer operations. While fgets() itself is bounds-checked, the sprintf() call at config_reader.c:174 constructs file paths by concatenating home_path and filename without verifying the result fits in pkg_filename buffer. An attacker providing malformed OpenCC configuration files with excessively long path components can overflow the fixed-size buffer, overwriting adjacent memory including return addresses and function pointers. ## Changes - `internal/cpp/opencc/config_reader.c` - `internal/cpp/opencc/dictionary/text.c` - `internal/cpp/opencc/utils.c` ## Verification - [x] Build passes - [x] Scanner re-scan confirms fix - [x] LLM code review passed --- *Automated security fix by [OrbisAI Security](https://orbisappsec.com)* <!-- This is an auto-generated comment: release notes by coderabbit.ai --> ## Summary by CodeRabbit * **Bug Fixes** * Improved error detection and handling for malformed configuration and dictionary entries during file parsing. * Enhanced memory cleanup in error recovery paths to prevent potential issues. * Strengthened robustness of string operations and buffer handling throughout the library. <!-- end of auto-generated comment: release notes by coderabbit.ai --> Co-authored-by: Ubuntu <ubuntu@ip-172-31-32-15.us-west-2.compute.internal>
248 lines
8.0 KiB
C
248 lines
8.0 KiB
C
/*
|
|
* Open Chinese Convert
|
|
*
|
|
* Copyright 2010 BYVoid <byvoid.kcp@gmail.com>
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
#include "text.h"
|
|
#include "../encoding.h"
|
|
|
|
/* Initial capacity, in elements, of the growable arrays built while loading. */
#define INITIAL_DICTIONARY_SIZE 1024
/* Size in bytes of the buffer that receives one line of the dictionary file. */
#define ENTRY_BUFF_SIZE 4096
/* ENTRY_BUFF_SIZE expressed in size_t-sized units. The expansion is
 * parenthesized so the division stays intact when the macro is used inside a
 * larger expression (e.g. `x / ENTRY_WBUFF_SIZE`). */
#define ENTRY_WBUFF_SIZE (ENTRY_BUFF_SIZE / sizeof(size_t))
|
|
|
|
struct _text_dictionary {
|
|
size_t entry_count;
|
|
size_t max_length;
|
|
entry *lexicon;
|
|
ucs4_t *word_buff;
|
|
};
|
|
typedef struct _text_dictionary text_dictionary_desc;
|
|
|
|
int qsort_entry_cmp(const void *a, const void *b) { return ucs4cmp(((entry *)a)->key, ((entry *)b)->key); }
|
|
|
|
int parse_entry(const char *buff, entry *entry_i) {
|
|
size_t length;
|
|
const char *pbuff;
|
|
|
|
/* 解析鍵 */
|
|
for (pbuff = buff; *pbuff != '\t' && *pbuff != '\0'; ++pbuff)
|
|
;
|
|
if (*pbuff == '\0')
|
|
return -1;
|
|
length = pbuff - buff;
|
|
|
|
ucs4_t *ucs4_buff;
|
|
ucs4_buff = utf8_to_ucs4(buff, length);
|
|
if (ucs4_buff == (ucs4_t *)-1)
|
|
return -1;
|
|
entry_i->key = (ucs4_t *)malloc((length + 1) * sizeof(ucs4_t));
|
|
ucs4cpy(entry_i->key, ucs4_buff);
|
|
free(ucs4_buff);
|
|
|
|
/* 解析值 */
|
|
size_t value_i, value_count = INITIAL_DICTIONARY_SIZE;
|
|
entry_i->value = (ucs4_t **)malloc(value_count * sizeof(ucs4_t *));
|
|
|
|
for (value_i = 0; *pbuff != '\0' && *pbuff != '\n'; ++value_i) {
|
|
if (value_i >= value_count) {
|
|
value_count += value_count;
|
|
entry_i->value = (ucs4_t **)realloc(entry_i->value, value_count * sizeof(ucs4_t *));
|
|
}
|
|
|
|
for (buff = ++pbuff; *pbuff != ' ' && *pbuff != '\0' && *pbuff != '\n'; ++pbuff)
|
|
;
|
|
length = pbuff - buff;
|
|
ucs4_buff = utf8_to_ucs4(buff, length);
|
|
if (ucs4_buff == (ucs4_t *)-1) {
|
|
/* 發生錯誤 回退內存申請 */
|
|
ssize_t i;
|
|
for (i = value_i - 1; i >= 0; --i) {
|
|
free(entry_i->value[i]);
|
|
entry_i->value[i] = NULL;
|
|
}
|
|
free(entry_i->value);
|
|
entry_i->value = NULL;
|
|
free(entry_i->key);
|
|
entry_i->key = NULL;
|
|
return -1;
|
|
}
|
|
|
|
entry_i->value[value_i] = (ucs4_t *)malloc((length + 1) * sizeof(ucs4_t));
|
|
ucs4cpy(entry_i->value[value_i], ucs4_buff);
|
|
free(ucs4_buff);
|
|
}
|
|
|
|
entry_i->value = (ucs4_t **)realloc(entry_i->value, value_count * sizeof(ucs4_t *));
|
|
entry_i->value[value_i] = NULL;
|
|
|
|
return 0;
|
|
}
|
|
|
|
dictionary_t dictionary_text_open(const char *filename) {
|
|
text_dictionary_desc *text_dictionary;
|
|
text_dictionary = (text_dictionary_desc *)malloc(sizeof(text_dictionary_desc));
|
|
text_dictionary->entry_count = INITIAL_DICTIONARY_SIZE;
|
|
text_dictionary->max_length = 0;
|
|
text_dictionary->lexicon = (entry *)malloc(sizeof(entry) * text_dictionary->entry_count);
|
|
text_dictionary->word_buff = NULL;
|
|
|
|
char buff[ENTRY_BUFF_SIZE];
|
|
|
|
FILE *fp = fopen(filename, "rb");
|
|
if (fp == NULL) {
|
|
dictionary_text_close((dictionary_t)text_dictionary);
|
|
return (dictionary_t)-1;
|
|
}
|
|
|
|
size_t i = 0;
|
|
while (fgets(buff, ENTRY_BUFF_SIZE, fp)) {
|
|
/* Detect line truncation: if buffer is full and last char is not newline,
|
|
* the line was longer than ENTRY_BUFF_SIZE-1 bytes. Drain the remainder
|
|
* and skip this malformed entry to prevent parsing partial data. */
|
|
size_t buff_len = strlen(buff);
|
|
if (buff_len == ENTRY_BUFF_SIZE - 1 && buff[buff_len - 1] != '\n') {
|
|
int c;
|
|
while ((c = fgetc(fp)) != '\n' && c != EOF)
|
|
;
|
|
continue;
|
|
}
|
|
|
|
if (i >= text_dictionary->entry_count) {
|
|
text_dictionary->entry_count += text_dictionary->entry_count;
|
|
text_dictionary->lexicon = (entry *)realloc(text_dictionary->lexicon, sizeof(entry) * text_dictionary->entry_count);
|
|
}
|
|
|
|
if (parse_entry(buff, text_dictionary->lexicon + i) == -1) {
|
|
text_dictionary->entry_count = i;
|
|
dictionary_text_close((dictionary_t)text_dictionary);
|
|
return (dictionary_t)-1;
|
|
}
|
|
|
|
size_t length = ucs4len(text_dictionary->lexicon[i].key);
|
|
if (length > text_dictionary->max_length)
|
|
text_dictionary->max_length = length;
|
|
|
|
i++;
|
|
}
|
|
|
|
fclose(fp);
|
|
|
|
text_dictionary->entry_count = i;
|
|
text_dictionary->lexicon = (entry *)realloc(text_dictionary->lexicon, sizeof(entry) * text_dictionary->entry_count);
|
|
text_dictionary->word_buff = (ucs4_t *)malloc(sizeof(ucs4_t) * (text_dictionary->max_length + 1));
|
|
|
|
qsort(text_dictionary->lexicon, text_dictionary->entry_count, sizeof(text_dictionary->lexicon[0]), qsort_entry_cmp);
|
|
|
|
return (dictionary_t)text_dictionary;
|
|
}
|
|
|
|
/*
 * Release a text dictionary: for each of the first entry_count lexicon
 * entries free the key, every string in its NULL-terminated value array and
 * the array itself; then free the lexicon, the lookup scratch buffer and the
 * descriptor.
 */
void dictionary_text_close(dictionary_t t_dictionary) {
    text_dictionary_desc *dict = (text_dictionary_desc *)t_dictionary;
    size_t idx;

    for (idx = 0; idx != dict->entry_count; ++idx) {
        entry *item = &dict->lexicon[idx];
        ucs4_t **cursor;

        free(item->key);

        /* The value array ends at its first NULL element. */
        cursor = item->value;
        while (*cursor) {
            free(*cursor);
            ++cursor;
        }
        free(item->value);
    }

    free(dict->lexicon);
    free(dict->word_buff);
    free(dict);
}
|
|
|
|
/*
 * Find the longest prefix of `word` that is a key of the dictionary.
 *
 * t_dictionary  handle of a loaded text dictionary
 * word          NUL-terminated UCS-4 text to match
 * maxlen        maximum prefix length to try; 0 means "use ucs4len(word)"
 * match_length  optional (may be NULL); receives the length of the matched
 *               prefix, or 0 when nothing matches
 *
 * Returns the matching entry's value array, or NULL when no prefix matches.
 * The dictionary's scratch buffer (word_buff) is overwritten by the call.
 */
const ucs4_t *const *dictionary_text_match_longest(dictionary_t t_dictionary, const ucs4_t *word, size_t maxlen, size_t *match_length) {
    text_dictionary_desc *text_dictionary = (text_dictionary_desc *)t_dictionary;

    if (text_dictionary->entry_count == 0) {
        /* Report "no match" the same way as the not-found path at the end,
         * so the caller never reads an unset length. */
        if (match_length != NULL)
            *match_length = 0;
        return NULL;
    }

    if (maxlen == 0)
        maxlen = ucs4len(word);

    /* No key is longer than max_length, so longer prefixes cannot match. */
    size_t len = text_dictionary->max_length;
    if (maxlen < len)
        len = maxlen;

    ucs4ncpy(text_dictionary->word_buff, word, len);
    text_dictionary->word_buff[len] = L'\0';

    /* Search key for bsearch(); only the key field is compared. */
    entry buff;
    buff.key = text_dictionary->word_buff;
    buff.value = NULL;

    /* Try the longest prefix first and shorten it by one character per step. */
    for (; len > 0; len--) {
        text_dictionary->word_buff[len] = L'\0';
        entry *brs =
            (entry *)bsearch(&buff, text_dictionary->lexicon, text_dictionary->entry_count, sizeof(text_dictionary->lexicon[0]), qsort_entry_cmp);

        if (brs != NULL) {
            if (match_length != NULL)
                *match_length = len;
            return (const ucs4_t *const *)brs->value;
        }
    }

    if (match_length != NULL)
        *match_length = 0;
    return NULL;
}
|
|
|
|
/*
 * Collect the lengths of all prefixes of `word` that are dictionary keys.
 *
 * Prefixes are tried from the longest candidate (the smaller of ucs4len(word)
 * and the dictionary's max_length) down to one character, so the lengths are
 * written to `match_length` in decreasing order. The caller must supply room
 * for as many elements as there are candidates.
 *
 * Returns the number of lengths stored. The dictionary's scratch buffer
 * (word_buff) is overwritten by the call.
 */
size_t dictionary_text_get_all_match_lengths(dictionary_t t_dictionary, const ucs4_t *word, size_t *match_length) {
    text_dictionary_desc *dict = (text_dictionary_desc *)t_dictionary;
    size_t found = 0;

    if (dict->entry_count == 0)
        return found;

    size_t word_len = ucs4len(word);
    size_t prefix_len = (word_len < dict->max_length) ? word_len : dict->max_length;

    ucs4ncpy(dict->word_buff, word, prefix_len);
    dict->word_buff[prefix_len] = L'\0';

    /* Search key for bsearch(); only its key field is set. */
    entry probe;
    probe.key = dict->word_buff;

    while (prefix_len > 0) {
        /* Cut the scratch copy down to the prefix under test. */
        dict->word_buff[prefix_len] = L'\0';

        entry *hit = (entry *)bsearch(&probe, dict->lexicon, dict->entry_count, sizeof(dict->lexicon[0]), qsort_entry_cmp);
        if (hit != NULL) {
            match_length[found] = prefix_len;
            found++;
        }

        prefix_len--;
    }

    return found;
}
|
|
|
|
/*
 * Copy the key and value pointers of every dictionary entry into the
 * caller-supplied `lexicon` array and return the number of entries.
 *
 * Only pointers are copied: the strings still belong to the dictionary.
 * `lexicon` must have room for entry_count elements.
 */
size_t dictionary_text_get_lexicon(dictionary_t t_dictionary, entry *lexicon) {
    text_dictionary_desc *dict = (text_dictionary_desc *)t_dictionary;
    size_t idx = 0;

    while (idx < dict->entry_count) {
        const entry *src = &dict->lexicon[idx];

        lexicon[idx].key = src->key;
        lexicon[idx].value = src->value;
        ++idx;
    }

    return dict->entry_count;
}
|