Files
ragflow/internal/cpp/opencc/dictionary/datrie.c
Jin Hai 70e9743ef1 RAGFlow go API server (#13240)
# RAGFlow Go Implementation Plan 🚀

This repository tracks the progress of porting RAGFlow to Go. We'll
implement core features and provide performance comparisons between
Python and Go versions.

## Implementation Checklist

- [x] User Management APIs
- [x] Dataset Management Operations
- [x] Retrieval Test
- [x] Chat Management Operations
- [x] Infinity Go SDK

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Yingfeng Zhang <yingfeng.zhang@gmail.com>
2026-03-04 19:17:16 +08:00

251 lines
7.9 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*
* Open Chinese Convert
*
* Copyright 2010 BYVoid <byvoid.kcp@gmail.com>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "datrie.h"
#include <fcntl.h>
#include <unistd.h>
#ifdef __WIN32
/* Todo: Win32 mmap*/
#else
#include <sys/mman.h>
#define MMAP_ENABLED
#endif
/* Records how the dictionary image held in dic_memory was obtained, so that
 * unload_dict can release it with the matching call (munmap() or free()). */
typedef enum { MEMORY_TYPE_MMAP, MEMORY_TYPE_ALLOCATE } memory_type;
/*
 * In-memory state of one opened double-array-trie dictionary.
 *
 * dat and lexicon point INTO dic_memory (they are set by load_dict from
 * offsets inside the loaded file image), so they are only valid while
 * dic_memory is. lexicon_set is a separate heap allocation.
 */
struct _datrie_dictionary {
/* Trie node array, located inside dic_memory. */
const DoubleArrayTrieItem *dat;
/* Number of nodes in dat, as read from the file. */
uint32_t dat_item_count;
/* Lexicon character data, located inside dic_memory. */
ucs4_t *lexicon;
/* Number of entries in lexicon_set, as read from the file. */
uint32_t lexicon_count;
/* Heap-allocated array with one entry per lexicon item; each entry is a
 * heap-allocated, NULL-terminated array of pointers into lexicon. */
ucs4_t ***lexicon_set;
/* The whole dictionary file, either memory-mapped or read into malloc'd memory. */
void *dic_memory;
/* Size of the dictionary file in bytes (length of dic_memory). */
size_t dic_size;
/* How dic_memory was obtained; selects munmap() vs free() on unload. */
memory_type dic_memory_type;
};
typedef struct _datrie_dictionary datrie_dictionary_desc;
/*
 * Fallback loader: reads the whole dictionary file into a malloc'd buffer.
 *
 * Expects datrie_dictionary->dic_size to already hold the file size. On
 * success dic_memory owns a buffer of dic_size bytes filled from the start
 * of fd and 0 is returned. On any failure the buffer is released,
 * dic_memory is set to NULL and -1 is returned, so the descriptor never
 * holds a partially filled image.
 */
static int load_allocate(datrie_dictionary_desc *datrie_dictionary, int fd) {
    size_t filled = 0;

    datrie_dictionary->dic_memory_type = MEMORY_TYPE_ALLOCATE;
    datrie_dictionary->dic_memory = malloc(datrie_dictionary->dic_size);
    if (datrie_dictionary->dic_memory == NULL) {
        /* Memory allocation failed */
        return -1;
    }

    if (lseek(fd, 0, SEEK_SET) == (off_t)-1) {
        goto fail;
    }

    /* read() may legitimately return fewer bytes than requested, so keep
     * reading until the whole file is in memory. */
    while (filled < datrie_dictionary->dic_size) {
        ssize_t got = read(fd, (char *)datrie_dictionary->dic_memory + filled, datrie_dictionary->dic_size - filled);
        if (got <= 0) {
            /* Read failed, or the file ended before dic_size bytes */
            goto fail;
        }
        filled += (size_t)got;
    }
    return 0;

fail:
    free(datrie_dictionary->dic_memory);
    datrie_dictionary->dic_memory = NULL;
    return -1;
}
/*
 * Preferred loader: maps the dictionary file read-only and private.
 *
 * Uses datrie_dictionary->dic_size as the mapping length. Returns 0 with
 * dic_memory pointing at the mapping, or -1 with dic_memory set to NULL when
 * mmap() fails. When MMAP_ENABLED is not defined the function does nothing
 * and returns -1.
 */
static int load_mmap(datrie_dictionary_desc *datrie_dictionary, int fd) {
#ifdef MMAP_ENABLED
    void *mapping = mmap(NULL, datrie_dictionary->dic_size, PROT_READ, MAP_PRIVATE, fd, 0);

    datrie_dictionary->dic_memory_type = MEMORY_TYPE_MMAP;
    if (mapping == MAP_FAILED) {
        /* Creating the memory mapping failed */
        datrie_dictionary->dic_memory = NULL;
        return -1;
    }
    datrie_dictionary->dic_memory = mapping;
    return 0;
#else
    return -1;
#endif
}
/*
 * Reads one uint32_t (in the host's byte order) from base[*offset] and
 * advances *offset past it. Returns -1 without reading when fewer than
 * four bytes remain before size.
 */
static int datrie_read_u32(const char *base, size_t size, size_t *offset, uint32_t *out) {
    if (*offset > size || size - *offset < sizeof(uint32_t)) {
        return -1;
    }
    memcpy(out, base + *offset, sizeof(uint32_t));
    *offset += sizeof(uint32_t);
    return 0;
}

/*
 * Loads a dictionary file into datrie_dictionary and builds lexicon_set.
 *
 * File layout, as consumed below:
 *   "OPENCCDATRIE"                      12-byte magic, no terminator
 *   uint32 lexicon_length               followed by that many ucs4_t
 *   uint32 lexicon_index_length         followed by that many uint32;
 *                                       (uint32_t)-1 ends each group
 *   uint32 lexicon_count
 *   uint32 dat_item_count               followed by the trie items
 *
 * Every section is checked against the file size before it is used, so a
 * truncated or corrupt file is rejected instead of being read out of bounds.
 *
 * Returns 0 on success, -1 on failure. On failure the descriptor is left in
 * a state unload_dict can release: dic_memory is NULL, or it is valid and
 * lexicon_set is either NULL (with lexicon_count 0) or an array whose
 * unfilled slots are NULL.
 */
static int load_dict(datrie_dictionary_desc *datrie_dictionary, FILE *fp) {
    static const char magic[] = "OPENCCDATRIE";
    const size_t header_len = sizeof(magic) - 1;

    /* Start from a known state so that every early return is safe to clean up. */
    datrie_dictionary->dat = NULL;
    datrie_dictionary->dat_item_count = 0;
    datrie_dictionary->lexicon = NULL;
    datrie_dictionary->lexicon_count = 0;
    datrie_dictionary->lexicon_set = NULL;
    datrie_dictionary->dic_memory = NULL;
    datrie_dictionary->dic_size = 0;

    if (fp == NULL) {
        return -1;
    }
    int fd = fileno(fp);
    if (fd == -1 || fseek(fp, 0, SEEK_END) != 0) {
        return -1;
    }
    long file_size = ftell(fp);
    if (file_size < 0) {
        return -1;
    }
    datrie_dictionary->dic_size = (size_t)file_size;

    /* Try mmap first; if that fails, fall back to allocating memory */
    if (load_mmap(datrie_dictionary, fd) == -1) {
        if (load_allocate(datrie_dictionary, fd) == -1) {
            return -1;
        }
    }

    const char *base = (const char *)datrie_dictionary->dic_memory;
    const size_t size = datrie_dictionary->dic_size;

    if (size < header_len || memcmp(base, magic, header_len) != 0) {
        return -1;
    }
    size_t offset = header_len;

    /* Lexicon */
    uint32_t lexicon_length;
    if (datrie_read_u32(base, size, &offset, &lexicon_length) == -1) {
        return -1;
    }
    if (lexicon_length > (size - offset) / sizeof(ucs4_t)) {
        return -1;
    }
    datrie_dictionary->lexicon = (ucs4_t *)((char *)datrie_dictionary->dic_memory + offset);
    offset += (size_t)lexicon_length * sizeof(ucs4_t);

    /* Lexicon index table */
    uint32_t lexicon_index_length;
    if (datrie_read_u32(base, size, &offset, &lexicon_index_length) == -1) {
        return -1;
    }
    if (lexicon_index_length > (size - offset) / sizeof(uint32_t)) {
        return -1;
    }
    const uint32_t *lexicon_index = (const uint32_t *)(base + offset);
    offset += (size_t)lexicon_index_length * sizeof(uint32_t);

    /* Counts and trie items */
    uint32_t lexicon_count;
    uint32_t dat_item_count;
    if (datrie_read_u32(base, size, &offset, &lexicon_count) == -1 ||
        datrie_read_u32(base, size, &offset, &dat_item_count) == -1) {
        return -1;
    }
    if (dat_item_count > (size - offset) / sizeof(DoubleArrayTrieItem)) {
        return -1;
    }
    datrie_dictionary->dat_item_count = dat_item_count;
    datrie_dictionary->dat = (const DoubleArrayTrieItem *)(base + offset);

    /* Build the index table */
    if (lexicon_count > 0) {
        /* calloc so unfilled slots are NULL and can be passed to free(). */
        datrie_dictionary->lexicon_set = (ucs4_t ***)calloc(lexicon_count, sizeof(ucs4_t **));
        if (datrie_dictionary->lexicon_set == NULL) {
            return -1;
        }
    }
    datrie_dictionary->lexicon_count = lexicon_count;

    size_t group_start = 0;
    for (size_t i = 0; i < lexicon_count; i++) {
        /* A group runs up to the next (uint32_t)-1 marker or the end of the table. */
        size_t group_end = group_start;
        while (group_end < lexicon_index_length && lexicon_index[group_end] != (uint32_t)-1) {
            group_end++;
        }
        size_t count = group_end - group_start;

        ucs4_t **group = (ucs4_t **)malloc((count + 1) * sizeof(ucs4_t *));
        if (group == NULL) {
            return -1;
        }
        for (size_t j = 0; j < count; j++) {
            uint32_t word_offset = lexicon_index[group_start + j];
            if (word_offset >= lexicon_length) {
                /* Index points outside the lexicon: corrupt file */
                free(group);
                return -1;
            }
            group[j] = datrie_dictionary->lexicon + word_offset;
        }
        group[count] = NULL;
        datrie_dictionary->lexicon_set[i] = group;

        /* Skip the group and its terminating marker. */
        group_start += count + 1;
    }
    return 0;
}
/*
 * Releases everything load_dict attached to the descriptor: each per-item
 * pointer array, the lexicon_set array itself, and finally the file image
 * (unmapped or freed according to dic_memory_type).
 *
 * Does nothing and returns 0 when dic_memory is NULL. Otherwise returns the
 * result of munmap() for a mapped image, 0 for an allocated image, and -1
 * for an unrecognised memory type. The descriptor itself is not freed.
 */
static int unload_dict(datrie_dictionary_desc *datrie_dictionary) {
    if (datrie_dictionary->dic_memory == NULL) {
        return 0;
    }

    const size_t set_count = datrie_dictionary->lexicon_count;
    for (size_t idx = 0; idx < set_count; idx++) {
        free(datrie_dictionary->lexicon_set[idx]);
    }
    free(datrie_dictionary->lexicon_set);

    switch (datrie_dictionary->dic_memory_type) {
    case MEMORY_TYPE_MMAP:
#ifdef MMAP_ENABLED
        return munmap(datrie_dictionary->dic_memory, datrie_dictionary->dic_size);
#else
        debug_should_not_be_here();
        return 0;
#endif
    case MEMORY_TYPE_ALLOCATE:
        free(datrie_dictionary->dic_memory);
        return 0;
    default:
        return -1;
    }
}
dictionary_t dictionary_datrie_open(const char *filename) {
datrie_dictionary_desc *datrie_dictionary = (datrie_dictionary_desc *)malloc(sizeof(datrie_dictionary_desc));
datrie_dictionary->dat = NULL;
datrie_dictionary->lexicon = NULL;
FILE *fp = fopen(filename, "rb");
if (load_dict(datrie_dictionary, fp) == -1) {
dictionary_datrie_close((dictionary_t)datrie_dictionary);
return (dictionary_t)-1;
}
fclose(fp);
return (dictionary_t)datrie_dictionary;
}
/*
 * Releases a dictionary handle: unloads its contents and then frees the
 * descriptor itself. The descriptor is freed even when unloading fails.
 *
 * Returns 0 on success, or -1 when unload_dict reported an error.
 */
int dictionary_datrie_close(dictionary_t t_dictionary) {
    datrie_dictionary_desc *desc = (datrie_dictionary_desc *)t_dictionary;
    const int status = (unload_dict(desc) == -1) ? -1 : 0;

    free(desc);
    return status;
}
/*
 * Converts a character to the integer that is added to a trie node's base
 * when looking up its child. The conversion is a plain cast to int.
 */
int encode_char(ucs4_t ch) {
    const int code = (int)ch;
    return code;
}
/*
 * Walks the trie from the root (node 0) along the characters of word.
 *
 * The walk stops at the first of: the end of word (a zero character),
 * limit characters consumed (a limit of 0 means no limit), a node whose
 * base is DATRIE_UNUSED, or a character with no matching child.
 *
 * match_pos  if not NULL, receives the number of characters consumed.
 * id         if not NULL, receives the index of the node reached.
 */
void datrie_match(const datrie_dictionary_desc *datrie_dictionary, const ucs4_t *word, size_t *match_pos, size_t *id, size_t limit) {
    size_t node = 0;
    size_t consumed = 0;

    /* The limit is tested before word[consumed] is read, so a buffer that
     * holds exactly limit characters with no terminator is never read past
     * its end. */
    while ((limit == 0 || consumed < limit) && word[consumed] && datrie_dictionary->dat[node].base != DATRIE_UNUSED) {
        /* Widen before adding: a corrupt base value must not overflow int. */
        int64_t next = (int64_t)datrie_dictionary->dat[node].base + (int64_t)encode_char(word[consumed]);
        if (next < 0 || next >= (int64_t)datrie_dictionary->dat_item_count || datrie_dictionary->dat[next].parent != node)
            break;
        node = (size_t)next;
        consumed++;
    }

    if (match_pos)
        *match_pos = consumed;
    if (id)
        *id = node;
}
/*
 * Finds the longest prefix of word that is an entry of the dictionary.
 *
 * maxlen is passed to datrie_match as its limit for the first walk. If the
 * node reached carries no entry (word field is -1), the walk is repeated
 * with a limit one character shorter each time until an entry is found or
 * only one character is left.
 *
 * Returns the NULL-terminated array from lexicon_set for the matched entry
 * and stores the matched length in *match_length (when not NULL). Returns
 * NULL and stores 0 when no prefix matches.
 */
const ucs4_t *const *dictionary_datrie_match_longest(dictionary_t t_dictionary, const ucs4_t *word, size_t maxlen, size_t *match_length) {
    datrie_dictionary_desc *desc = (datrie_dictionary_desc *)t_dictionary;
    size_t matched = 0;
    size_t node = 0;

    datrie_match(desc, word, &matched, &node, maxlen);
    for (;;) {
        if (desc->dat[node].word != -1 || matched <= 1) {
            break;
        }
        /* No entry ends here: back off by one character and walk again. */
        datrie_match(desc, word, &matched, &node, matched - 1);
    }

    const int has_entry = matched != 0 && desc->dat[node].word != -1;

    if (match_length != NULL) {
        *match_length = has_entry ? matched : 0;
    }
    if (!has_entry) {
        return NULL;
    }
    return (const ucs4_t *const *)desc->lexicon_set[desc->dat[node].word];
}
/*
 * Collects the lengths of every prefix of word that is a dictionary entry.
 *
 * Walks the trie from the root along word; each time the node reached
 * carries an entry (word field is not -1), the number of characters
 * consumed so far is appended to match_length, so lengths are stored in
 * increasing order.
 *
 * match_length  output array, written without any bound check. At most one
 *               length is stored per character of word, so the caller must
 *               supply room for as many elements as word has characters.
 *
 * Returns the number of lengths stored.
 */
size_t dictionary_datrie_get_all_match_lengths(dictionary_t t_dictionary, const ucs4_t *word, size_t *match_length) {
    datrie_dictionary_desc *datrie_dictionary = (datrie_dictionary_desc *)t_dictionary;
    size_t found = 0;
    size_t node = 0;

    for (size_t consumed = 0; word[consumed] && datrie_dictionary->dat[node].base != DATRIE_UNUSED; consumed++) {
        /* Widen before adding: a corrupt base value must not overflow int. */
        int64_t next = (int64_t)datrie_dictionary->dat[node].base + (int64_t)encode_char(word[consumed]);
        if (next < 0 || next >= (int64_t)datrie_dictionary->dat_item_count || datrie_dictionary->dat[next].parent != node)
            break;
        node = (size_t)next;
        if (datrie_dictionary->dat[node].word != -1)
            match_length[found++] = consumed + 1;
    }
    return found;
}