mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-05-01 07:47:47 +08:00
RAGFlow go API server (#13240)
# RAGFlow Go Implementation Plan 🚀 This repository tracks the progress of porting RAGFlow to Go. We'll implement core features and provide performance comparisons between Python and Go versions. ## Implementation Checklist - [x] User Management APIs - [x] Dataset Management Operations - [x] Retrieval Test - [x] Chat Management Operations - [x] Infinity Go SDK --------- Signed-off-by: Jin Hai <haijin.chn@gmail.com> Co-authored-by: Yingfeng Zhang <yingfeng.zhang@gmail.com>
This commit is contained in:
94
internal/cpp/opencc/dictionary/abstract.c
Normal file
94
internal/cpp/opencc/dictionary/abstract.c
Normal file
@ -0,0 +1,94 @@
|
||||
/*
|
||||
* Open Chinese Convert
|
||||
*
|
||||
* Copyright 2010 BYVoid <byvoid.kcp@gmail.com>
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "abstract.h"
|
||||
#include "datrie.h"
|
||||
#include "text.h"
|
||||
|
||||
/* Abstract dictionary wrapper: pairs a backend type tag with the
 * backend-specific handle so callers can dispatch without knowing
 * which concrete implementation they hold. */
struct _dictionary {
	opencc_dictionary_type type; /* which backend (text or datrie) owns `dict` */
	dictionary_t dict;           /* backend-specific dictionary handle */
};
typedef struct _dictionary dictionary_desc;
|
||||
|
||||
/* Open a dictionary of the given backend type.
 * Returns an abstract handle, or (dictionary_t)-1 on failure: unknown
 * type, allocation failure, or backend open failure. */
dictionary_t dictionary_open(const char *filename, opencc_dictionary_type type) {
	dictionary_desc *dictionary = (dictionary_desc *)malloc(sizeof(dictionary_desc));
	if (dictionary == NULL)
		return (dictionary_t)-1; /* the original dereferenced an unchecked malloc */
	dictionary->type = type;
	switch (type) {
	case OPENCC_DICTIONARY_TYPE_TEXT:
		dictionary->dict = dictionary_text_open(filename);
		break;
	case OPENCC_DICTIONARY_TYPE_DATRIE:
		dictionary->dict = dictionary_datrie_open(filename);
		break;
	default:
		/* unsupported dictionary format */
		free(dictionary);
		return (dictionary_t)-1;
	}
	/* Both backends signal failure with (dictionary_t)-1.  The original
	 * wrapped that sentinel in a valid-looking descriptor and returned it,
	 * so later calls would dereference -1; propagate the failure instead. */
	if (dictionary->dict == (dictionary_t)-1) {
		free(dictionary);
		return (dictionary_t)-1;
	}
	return dictionary;
}
|
||||
|
||||
/* Return the backend-specific handle wrapped by an abstract dictionary. */
dictionary_t dictionary_get(dictionary_t t_dictionary) {
	dictionary_desc *desc = (dictionary_desc *)t_dictionary;
	return desc->dict;
}
|
||||
|
||||
/* Close the wrapped backend dictionary and free the descriptor itself. */
void dictionary_close(dictionary_t t_dictionary) {
	dictionary_desc *desc = (dictionary_desc *)t_dictionary;
	if (desc->type == OPENCC_DICTIONARY_TYPE_TEXT) {
		dictionary_text_close(desc->dict);
	} else if (desc->type == OPENCC_DICTIONARY_TYPE_DATRIE) {
		dictionary_datrie_close(desc->dict);
	} else {
		/* descriptor holds a type dictionary_open() never produces */
		debug_should_not_be_here();
	}
	free(desc);
}
|
||||
|
||||
/* Dispatch a longest-prefix lookup to the concrete backend.
 * Returns the NULL-terminated value list of the longest matching prefix,
 * with *match_length set by the backend; (const ucs4_t *const *)-1 only on
 * a corrupted descriptor. */
const ucs4_t *const *dictionary_match_longest(dictionary_t t_dictionary, const ucs4_t *word, size_t maxlen, size_t *match_length) {
	dictionary_desc *desc = (dictionary_desc *)t_dictionary;
	if (desc->type == OPENCC_DICTIONARY_TYPE_TEXT)
		return dictionary_text_match_longest(desc->dict, word, maxlen, match_length);
	if (desc->type == OPENCC_DICTIONARY_TYPE_DATRIE)
		return dictionary_datrie_match_longest(desc->dict, word, maxlen, match_length);
	debug_should_not_be_here();
	return (const ucs4_t *const *)-1;
}
|
||||
|
||||
/* Dispatch an all-prefix-lengths lookup to the concrete backend.
 * Fills match_length[] with every matching prefix length of `word` and
 * returns the count; (size_t)-1 only on a corrupted descriptor. */
size_t dictionary_get_all_match_lengths(dictionary_t t_dictionary, const ucs4_t *word, size_t *match_length) {
	dictionary_desc *desc = (dictionary_desc *)t_dictionary;
	if (desc->type == OPENCC_DICTIONARY_TYPE_TEXT)
		return dictionary_text_get_all_match_lengths(desc->dict, word, match_length);
	if (desc->type == OPENCC_DICTIONARY_TYPE_DATRIE)
		return dictionary_datrie_get_all_match_lengths(desc->dict, word, match_length);
	debug_should_not_be_here();
	return (size_t)-1;
}
|
||||
45
internal/cpp/opencc/dictionary/abstract.h
Normal file
45
internal/cpp/opencc/dictionary/abstract.h
Normal file
@ -0,0 +1,45 @@
|
||||
/*
|
||||
* Open Chinese Convert
|
||||
*
|
||||
* Copyright 2010 BYVoid <byvoid.kcp@gmail.com>
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/* Guard renamed from __OPENCC_...: identifiers starting with a double
 * underscore are reserved for the implementation (C11 7.1.3). */
#ifndef OPENCC_DICTIONARY_ABSTRACT_H_
#define OPENCC_DICTIONARY_ABSTRACT_H_

#include "../utils.h"

/* One lexicon entry: a UCS-4 key and a NULL-terminated array of UCS-4
 * translation values. */
struct _entry
{
	ucs4_t * key;
	ucs4_t ** value;
};
typedef struct _entry entry;

/* Opaque handle to any dictionary backend. */
typedef void * dictionary_t;

/* Open a dictionary file of the given backend type.
 * Returns (dictionary_t)-1 on failure. */
dictionary_t dictionary_open(const char * filename, opencc_dictionary_type type);

/* Release a dictionary returned by dictionary_open(). */
void dictionary_close(dictionary_t t_dictionary);

/* Return the backend-specific handle wrapped by an abstract dictionary. */
dictionary_t dictionary_get(dictionary_t t_dictionary);

/* Longest-prefix match of `word` (at most maxlen code points; 0 = no limit).
 * Returns the NULL-terminated value list of the longest matching prefix and
 * sets *match_length, or returns NULL with *match_length = 0 when no prefix
 * matches. */
const ucs4_t * const * dictionary_match_longest(dictionary_t t_dictionary, const ucs4_t * word,
		size_t maxlen, size_t * match_length);

/* Write every matching prefix length of `word` into match_length[] and
 * return the number of lengths written. */
size_t dictionary_get_all_match_lengths(dictionary_t t_dictionary, const ucs4_t * word,
		size_t * match_length);

#endif /* OPENCC_DICTIONARY_ABSTRACT_H_ */
|
||||
250
internal/cpp/opencc/dictionary/datrie.c
Normal file
250
internal/cpp/opencc/dictionary/datrie.c
Normal file
@ -0,0 +1,250 @@
|
||||
/*
|
||||
* Open Chinese Convert
|
||||
*
|
||||
* Copyright 2010 BYVoid <byvoid.kcp@gmail.com>
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "datrie.h"
#include <fcntl.h>
#include <unistd.h>

#ifdef __WIN32
/* Todo: Win32 mmap*/
#else
#include <sys/mman.h>
/* POSIX mmap() is available: dictionary files can be memory-mapped
 * instead of read into allocated memory. */
#define MMAP_ENABLED
#endif

/* How the raw dictionary image was obtained — and thus how it must be
 * released (munmap vs free) in unload_dict(). */
typedef enum { MEMORY_TYPE_MMAP, MEMORY_TYPE_ALLOCATE } memory_type;
|
||||
|
||||
/* In-memory view of a compiled double-array trie dictionary.  Most
 * pointers alias the raw image in dic_memory; only lexicon_set is a
 * separate heap structure built by load_dict(). */
struct _datrie_dictionary {
	const DoubleArrayTrieItem *dat; /* trie nodes (points into dic_memory) */
	uint32_t dat_item_count;        /* number of nodes in dat */
	ucs4_t *lexicon;                /* flat UCS-4 word pool (points into dic_memory) */
	uint32_t lexicon_count;         /* number of lexicon entries */

	ucs4_t ***lexicon_set;          /* per-entry NULL-terminated value lists (heap) */
	void *dic_memory;               /* raw dictionary image (mmap'd or malloc'd) */
	size_t dic_size;                /* size of dic_memory in bytes */
	memory_type dic_memory_type;    /* how dic_memory was obtained */
};
typedef struct _datrie_dictionary datrie_dictionary_desc;
|
||||
|
||||
/* Read the whole dictionary file into malloc'd memory (fallback when mmap
 * is unavailable or failed).  Returns 0 on success, -1 on failure; any
 * memory already allocated is left for unload_dict() to release. */
static int load_allocate(datrie_dictionary_desc *datrie_dictionary, int fd) {
	datrie_dictionary->dic_memory_type = MEMORY_TYPE_ALLOCATE;
	datrie_dictionary->dic_memory = malloc(datrie_dictionary->dic_size);
	if (datrie_dictionary->dic_memory == NULL) {
		/* allocation failed */
		return -1;
	}
	lseek(fd, 0, SEEK_SET);
	/* POSIX read() may return fewer bytes than requested; the original
	 * issued a single read and only checked for -1, so a short read left
	 * the tail of the image uninitialized.  Loop until everything is in. */
	size_t total = 0;
	char *dst = (char *)datrie_dictionary->dic_memory;
	while (total < datrie_dictionary->dic_size) {
		ssize_t n = read(fd, dst + total, datrie_dictionary->dic_size - total);
		if (n <= 0) {
			/* read error or unexpected end of file */
			return -1;
		}
		total += (size_t)n;
	}
	return 0;
}
|
||||
|
||||
/* Map the whole dictionary file read-only.  Returns 0 on success, -1 when
 * mmap fails or is unavailable on this platform. */
static int load_mmap(datrie_dictionary_desc *datrie_dictionary, int fd) {
#ifdef MMAP_ENABLED
	datrie_dictionary->dic_memory_type = MEMORY_TYPE_MMAP;
	datrie_dictionary->dic_memory = mmap(NULL, datrie_dictionary->dic_size, PROT_READ, MAP_PRIVATE, fd, 0);
	if (datrie_dictionary->dic_memory == MAP_FAILED) {
		/* mapping failed; normalize to NULL so the caller can fall back
		 * to plain allocation and unload_dict() stays safe */
		datrie_dictionary->dic_memory = NULL;
		return -1;
	}
	return 0;
#else
	return -1;
#endif
}
|
||||
|
||||
/* Parse a compiled dictionary image into *datrie_dictionary.  On-disk
 * layout (all integer fields 4-byte aligned):
 *   "OPENCCDATRIE" | u32 lexicon_length | ucs4 lexicon[lexicon_length] |
 *   u32 index_length | u32 index[index_length] | u32 lexicon_count |
 *   u32 dat_item_count | DoubleArrayTrieItem dat[dat_item_count]
 * Returns 0 on success, -1 on any failure (caller closes/unloads). */
static int load_dict(datrie_dictionary_desc *datrie_dictionary, FILE *fp) {
	int fd = fileno(fp);

	fseek(fp, 0, SEEK_END);
	long file_size = ftell(fp);
	if (file_size < 0)
		return -1; /* ftell failed; original stored -1 as a huge size_t */
	datrie_dictionary->dic_size = (size_t)file_size;

	/* Try mmap first; fall back to reading into allocated memory. */
	if (load_mmap(datrie_dictionary, fd) == -1) {
		if (load_allocate(datrie_dictionary, fd) == -1) {
			return -1;
		}
	}

	/* Cursor is char*: arithmetic on void* is a GNU extension, not ISO C. */
	char *base = (char *)datrie_dictionary->dic_memory;

	size_t header_len = strlen("OPENCCDATRIE");

	if (strncmp(base, "OPENCCDATRIE", header_len) != 0) {
		return -1; /* not a compiled dictionary image */
	}

	size_t offset = 0;

	offset += header_len * sizeof(char);

	/* lexicon: flat UCS-4 word pool */
	uint32_t lexicon_length = *((uint32_t *)(base + offset));
	offset += sizeof(uint32_t);

	datrie_dictionary->lexicon = (ucs4_t *)(base + offset);
	offset += lexicon_length * sizeof(ucs4_t);

	/* lexicon index: offsets into the pool; (uint32_t)-1 separates entries */
	uint32_t lexicon_index_length = *((uint32_t *)(base + offset));
	offset += sizeof(uint32_t);

	uint32_t *lexicon_index = (uint32_t *)(base + offset);
	offset += lexicon_index_length * sizeof(uint32_t);

	datrie_dictionary->lexicon_count = *((uint32_t *)(base + offset));
	offset += sizeof(uint32_t);

	datrie_dictionary->dat_item_count = *((uint32_t *)(base + offset));
	offset += sizeof(uint32_t);

	datrie_dictionary->dat = (DoubleArrayTrieItem *)(base + offset);

	/* Build lexicon_set: one NULL-terminated list of value pointers per entry. */
	datrie_dictionary->lexicon_set = (ucs4_t ***)malloc(datrie_dictionary->lexicon_count * sizeof(ucs4_t **));
	if (datrie_dictionary->lexicon_set == NULL) {
		/* zero the count so unload_dict() does not walk a NULL array */
		datrie_dictionary->lexicon_count = 0;
		return -1;
	}
	size_t i, last = 0;
	for (i = 0; i < datrie_dictionary->lexicon_count; i++) {
		size_t count, j;
		/* an entry's values run until the (uint32_t)-1 separator */
		for (j = last; j < lexicon_index_length; j++) {
			if (lexicon_index[j] == (uint32_t)-1)
				break;
		}
		count = j - last;

		datrie_dictionary->lexicon_set[i] = (ucs4_t **)malloc((count + 1) * sizeof(ucs4_t *));
		for (j = 0; j < count; j++) {
			datrie_dictionary->lexicon_set[i][j] = datrie_dictionary->lexicon + lexicon_index[last + j];
		}
		datrie_dictionary->lexicon_set[i][count] = NULL;
		last += j + 1; /* skip past the separator */
	}

	return 0;
}
|
||||
|
||||
/* Release everything load_dict() built: the per-entry value lists and the
 * raw dictionary image (munmap or free depending on how it was obtained).
 * Returns 0 on success, -1 (or munmap's result) on failure.
 * NOTE(review): assumes lexicon_set/lexicon_count are consistent whenever
 * dic_memory is non-NULL — verify all early-failure paths in load_dict. */
static int unload_dict(datrie_dictionary_desc *datrie_dictionary) {
	if (datrie_dictionary->dic_memory != NULL) {
		size_t i;
		for (i = 0; i < datrie_dictionary->lexicon_count; i++) {
			free(datrie_dictionary->lexicon_set[i]);
		}
		free(datrie_dictionary->lexicon_set);

		if (MEMORY_TYPE_MMAP == datrie_dictionary->dic_memory_type) {
#ifdef MMAP_ENABLED
			return munmap(datrie_dictionary->dic_memory, datrie_dictionary->dic_size);
#else
			/* an mmap'd image cannot exist when MMAP_ENABLED was never defined */
			debug_should_not_be_here();
#endif
		} else if (MEMORY_TYPE_ALLOCATE == datrie_dictionary->dic_memory_type) {
			free(datrie_dictionary->dic_memory);
		} else {
			/* unknown memory type: refuse to guess how to release it */
			return -1;
		}
	}
	return 0;
}
|
||||
|
||||
dictionary_t dictionary_datrie_open(const char *filename) {
|
||||
datrie_dictionary_desc *datrie_dictionary = (datrie_dictionary_desc *)malloc(sizeof(datrie_dictionary_desc));
|
||||
datrie_dictionary->dat = NULL;
|
||||
datrie_dictionary->lexicon = NULL;
|
||||
|
||||
FILE *fp = fopen(filename, "rb");
|
||||
|
||||
if (load_dict(datrie_dictionary, fp) == -1) {
|
||||
dictionary_datrie_close((dictionary_t)datrie_dictionary);
|
||||
return (dictionary_t)-1;
|
||||
}
|
||||
|
||||
fclose(fp);
|
||||
|
||||
return (dictionary_t)datrie_dictionary;
|
||||
}
|
||||
|
||||
/* Release a datrie dictionary.  The descriptor is freed even when
 * unloading the image fails; returns 0 on success, -1 on failure. */
int dictionary_datrie_close(dictionary_t t_dictionary) {
	datrie_dictionary_desc *desc = (datrie_dictionary_desc *)t_dictionary;

	int rc = unload_dict(desc);
	free(desc);

	return (rc == -1) ? -1 : 0;
}
|
||||
|
||||
int encode_char(ucs4_t ch) { return (int)ch; }
|
||||
|
||||
/* Walk the double-array trie along `word`, stopping at the first character
 * with no valid transition, at the string terminator, or after `limit`
 * characters (limit == 0 means no limit).  On return *match_pos holds the
 * number of characters consumed and *id the index of the last node reached;
 * either out-pointer may be NULL to skip it. */
void datrie_match(const datrie_dictionary_desc *datrie_dictionary, const ucs4_t *word, size_t *match_pos, size_t *id, size_t limit) {
	size_t i, p;
	for (i = 0, p = 0; word[p] && (limit == 0 || p < limit) && datrie_dictionary->dat[i].base != DATRIE_UNUSED; p++) {
		int k = encode_char(word[p]);
		int j = datrie_dictionary->dat[i].base + k;
		/* a child slot is valid only if it is in range and points back to i
		 * (j < 0 is checked first, so the signed/unsigned compare is safe) */
		if (j < 0 || j >= datrie_dictionary->dat_item_count || datrie_dictionary->dat[j].parent != i)
			break;
		i = j;
	}
	if (match_pos)
		*match_pos = p;
	if (id)
		*id = i;
}
|
||||
|
||||
/* Longest-prefix lookup in the datrie backend.  Walks as far as possible,
 * then backs off one character at a time until the reached node actually
 * ends a word.  Returns the entry's NULL-terminated value list and sets
 * *match_length, or returns NULL with *match_length = 0 when nothing
 * matches. */
const ucs4_t *const *dictionary_datrie_match_longest(dictionary_t t_dictionary, const ucs4_t *word, size_t maxlen, size_t *match_length) {
	datrie_dictionary_desc *datrie_dictionary = (datrie_dictionary_desc *)t_dictionary;

	size_t pos, item;
	datrie_match(datrie_dictionary, word, &pos, &item, maxlen);

	/* back off until the node we stopped at ends a word (word != -1) */
	while (datrie_dictionary->dat[item].word == -1 && pos > 1)
		datrie_match(datrie_dictionary, word, &pos, &item, pos - 1);

	if (pos == 0 || datrie_dictionary->dat[item].word == -1) {
		if (match_length != NULL)
			*match_length = 0;
		return NULL;
	}

	if (match_length != NULL)
		*match_length = pos;

	return (const ucs4_t *const *)datrie_dictionary->lexicon_set[datrie_dictionary->dat[item].word];
}
|
||||
|
||||
/* Record every prefix length of `word` that ends a dictionary word, in
 * increasing order, into match_length[].  Returns the number of lengths
 * written.  The caller must provide enough room in match_length[]. */
size_t dictionary_datrie_get_all_match_lengths(dictionary_t t_dictionary, const ucs4_t *word, size_t *match_length) {
	datrie_dictionary_desc *datrie_dictionary = (datrie_dictionary_desc *)t_dictionary;

	size_t rscnt = 0;

	/* single walk down the trie; every word-ending node passed is a match */
	size_t i, p;
	for (i = 0, p = 0; word[p] && datrie_dictionary->dat[i].base != DATRIE_UNUSED; p++) {
		int k = encode_char(word[p]);
		int j = datrie_dictionary->dat[i].base + k;
		/* child valid only if in range and pointing back to i */
		if (j < 0 || j >= datrie_dictionary->dat_item_count || datrie_dictionary->dat[j].parent != i)
			break;
		i = j;

		if (datrie_dictionary->dat[i].word != -1)
			match_length[rscnt++] = p + 1;
	}

	return rscnt;
}
|
||||
45
internal/cpp/opencc/dictionary/datrie.h
Normal file
45
internal/cpp/opencc/dictionary/datrie.h
Normal file
@ -0,0 +1,45 @@
|
||||
/*
|
||||
* Open Chinese Convert
|
||||
*
|
||||
* Copyright 2010 BYVoid <byvoid.kcp@gmail.com>
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/* Guard renamed from __OPENCC_...: identifiers starting with a double
 * underscore are reserved for the implementation (C11 7.1.3). */
#ifndef OPENCC_DICTIONARY_DATRIE_H_
#define OPENCC_DICTIONARY_DATRIE_H_

#include "abstract.h"

/* Marks a trie node with no outgoing transitions. */
#define DATRIE_UNUSED -1

/* One node of the on-disk double-array trie. */
typedef struct
{
	int base;   /* base offset for child transitions; DATRIE_UNUSED if none */
	int parent; /* index of the parent node (validates child ownership) */
	int word;   /* lexicon entry index when a word ends here, else -1 */
} DoubleArrayTrieItem;

/* Open a compiled datrie dictionary file.
 * Returns (dictionary_t)-1 on failure. */
dictionary_t dictionary_datrie_open(const char * filename);

/* Release a dictionary returned by dictionary_datrie_open().
 * Returns 0 on success, -1 on failure. */
int dictionary_datrie_close(dictionary_t t_dictionary);

/* Longest-prefix match of `word` (maxlen 0 = unlimited); NULL-terminated
 * value list, or NULL when no prefix matches. */
const ucs4_t * const * dictionary_datrie_match_longest(dictionary_t t_dictionary, const ucs4_t * word,
		size_t maxlen, size_t * match_length);

/* Write every matching prefix length into match_length[]; returns the count. */
size_t dictionary_datrie_get_all_match_lengths(dictionary_t t_dictionary, const ucs4_t * word,
		size_t * match_length);

/* Map a UCS-4 code point to its transition code (currently the identity). */
int encode_char(ucs4_t ch);

#endif /* OPENCC_DICTIONARY_DATRIE_H_ */
|
||||
232
internal/cpp/opencc/dictionary/text.c
Normal file
232
internal/cpp/opencc/dictionary/text.c
Normal file
@ -0,0 +1,232 @@
|
||||
/*
|
||||
* Open Chinese Convert
|
||||
*
|
||||
* Copyright 2010 BYVoid <byvoid.kcp@gmail.com>
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "text.h"
#include "../encoding.h"

/* Initial capacity for growable arrays (entries, value lists); doubled on demand. */
#define INITIAL_DICTIONARY_SIZE 1024
/* Maximum bytes of one dictionary line read by fgets(). */
#define ENTRY_BUFF_SIZE 128
/* Same capacity in machine words.  Expansion parenthesized: the original
 * `ENTRY_BUFF_SIZE / sizeof(size_t)` would bind wrongly inside a larger
 * expression (e.g. `x * ENTRY_WBUFF_SIZE`). */
#define ENTRY_WBUFF_SIZE (ENTRY_BUFF_SIZE / sizeof(size_t))
|
||||
|
||||
/* In-memory text-format dictionary: an array of entries kept sorted by
 * key (qsort_entry_cmp order) and searched with bsearch(). */
struct _text_dictionary {
	size_t entry_count; /* number of valid entries in lexicon */
	size_t max_length;  /* length (code points) of the longest key */
	entry *lexicon;     /* entries sorted by key */
	ucs4_t *word_buff;  /* scratch buffer of max_length+1 code points for lookups */
};
typedef struct _text_dictionary text_dictionary_desc;
|
||||
|
||||
int qsort_entry_cmp(const void *a, const void *b) { return ucs4cmp(((entry *)a)->key, ((entry *)b)->key); }
|
||||
|
||||
/* Parse one dictionary line of the form "key\tvalue[ value...]\n" into
 * *entry_i.  The key and each value are converted from UTF-8 to UCS-4 and
 * heap-allocated; entry_i->value becomes a NULL-terminated array.
 * Returns 0 on success, -1 on malformed input (all allocations rolled back). */
int parse_entry(const char *buff, entry *entry_i) {
	size_t length;
	const char *pbuff;

	/* key: everything up to the first tab */
	for (pbuff = buff; *pbuff != '\t' && *pbuff != '\0'; ++pbuff)
		;
	if (*pbuff == '\0')
		return -1;
	length = pbuff - buff;

	ucs4_t *ucs4_buff;
	ucs4_buff = utf8_to_ucs4(buff, length);
	if (ucs4_buff == (ucs4_t *)-1)
		return -1;
	/* `length` counts UTF-8 bytes, an upper bound on the UCS-4 length,
	 * so (length + 1) code points is always enough room */
	entry_i->key = (ucs4_t *)malloc((length + 1) * sizeof(ucs4_t));
	ucs4cpy(entry_i->key, ucs4_buff);
	free(ucs4_buff);

	/* values: space-separated tokens until end of line */
	size_t value_i, value_count = INITIAL_DICTIONARY_SIZE;
	entry_i->value = (ucs4_t **)malloc(value_count * sizeof(ucs4_t *));

	for (value_i = 0; *pbuff != '\0' && *pbuff != '\n'; ++value_i) {
		if (value_i >= value_count) {
			value_count += value_count; /* grow geometrically */
			entry_i->value = (ucs4_t **)realloc(entry_i->value, value_count * sizeof(ucs4_t *));
		}

		for (buff = ++pbuff; *pbuff != ' ' && *pbuff != '\0' && *pbuff != '\n'; ++pbuff)
			;
		length = pbuff - buff;
		ucs4_buff = utf8_to_ucs4(buff, length);
		if (ucs4_buff == (ucs4_t *)-1) {
			/* conversion failed: roll back everything allocated so far */
			ssize_t i;
			for (i = value_i - 1; i >= 0; --i)
				free(entry_i->value[i]);
			free(entry_i->value);
			free(entry_i->key);
			return -1;
		}

		entry_i->value[value_i] = (ucs4_t *)malloc((length + 1) * sizeof(ucs4_t));
		ucs4cpy(entry_i->value[value_i], ucs4_buff);
		free(ucs4_buff);
	}

	/* Resize to exactly the slots used PLUS the NULL terminator.  The
	 * original realloc'd to `value_count` and then wrote value[value_i]:
	 * whenever value_i == value_count that write was out of bounds. */
	ucs4_t **resized = (ucs4_t **)realloc(entry_i->value, (value_i + 1) * sizeof(ucs4_t *));
	if (resized == NULL) {
		/* keep the invariant rather than write past the old block */
		ssize_t i;
		for (i = value_i - 1; i >= 0; --i)
			free(entry_i->value[i]);
		free(entry_i->value);
		free(entry_i->key);
		return -1;
	}
	entry_i->value = resized;
	entry_i->value[value_i] = NULL;

	return 0;
}
|
||||
|
||||
dictionary_t dictionary_text_open(const char *filename) {
|
||||
text_dictionary_desc *text_dictionary;
|
||||
text_dictionary = (text_dictionary_desc *)malloc(sizeof(text_dictionary_desc));
|
||||
text_dictionary->entry_count = INITIAL_DICTIONARY_SIZE;
|
||||
text_dictionary->max_length = 0;
|
||||
text_dictionary->lexicon = (entry *)malloc(sizeof(entry) * text_dictionary->entry_count);
|
||||
text_dictionary->word_buff = NULL;
|
||||
|
||||
static char buff[ENTRY_BUFF_SIZE];
|
||||
|
||||
FILE *fp = fopen(filename, "rb");
|
||||
if (fp == NULL) {
|
||||
dictionary_text_close((dictionary_t)text_dictionary);
|
||||
return (dictionary_t)-1;
|
||||
}
|
||||
|
||||
size_t i = 0;
|
||||
while (fgets(buff, ENTRY_BUFF_SIZE, fp)) {
|
||||
if (i >= text_dictionary->entry_count) {
|
||||
text_dictionary->entry_count += text_dictionary->entry_count;
|
||||
text_dictionary->lexicon = (entry *)realloc(text_dictionary->lexicon, sizeof(entry) * text_dictionary->entry_count);
|
||||
}
|
||||
|
||||
if (parse_entry(buff, text_dictionary->lexicon + i) == -1) {
|
||||
text_dictionary->entry_count = i;
|
||||
dictionary_text_close((dictionary_t)text_dictionary);
|
||||
return (dictionary_t)-1;
|
||||
}
|
||||
|
||||
size_t length = ucs4len(text_dictionary->lexicon[i].key);
|
||||
if (length > text_dictionary->max_length)
|
||||
text_dictionary->max_length = length;
|
||||
|
||||
i++;
|
||||
}
|
||||
|
||||
fclose(fp);
|
||||
|
||||
text_dictionary->entry_count = i;
|
||||
text_dictionary->lexicon = (entry *)realloc(text_dictionary->lexicon, sizeof(entry) * text_dictionary->entry_count);
|
||||
text_dictionary->word_buff = (ucs4_t *)malloc(sizeof(ucs4_t) * (text_dictionary->max_length + 1));
|
||||
|
||||
qsort(text_dictionary->lexicon, text_dictionary->entry_count, sizeof(text_dictionary->lexicon[0]), qsort_entry_cmp);
|
||||
|
||||
return (dictionary_t)text_dictionary;
|
||||
}
|
||||
|
||||
/* Free a text dictionary: every key, every value string, each value
 * array, the entry array, the lookup scratch buffer, and the descriptor.
 * Only the first entry_count entries are assumed valid (open() lowers
 * entry_count before closing a partially-loaded dictionary). */
void dictionary_text_close(dictionary_t t_dictionary) {
	text_dictionary_desc *text_dictionary = (text_dictionary_desc *)t_dictionary;

	size_t i;
	for (i = 0; i < text_dictionary->entry_count; ++i) {
		free(text_dictionary->lexicon[i].key);

		/* value is a NULL-terminated array of heap-allocated strings */
		ucs4_t **j;
		for (j = text_dictionary->lexicon[i].value; *j; ++j) {
			free(*j);
		}
		free(text_dictionary->lexicon[i].value);
	}

	free(text_dictionary->lexicon);
	free(text_dictionary->word_buff);
	free(text_dictionary);
}
|
||||
|
||||
/* Longest-prefix lookup in the text backend: copy up to min(maxlen,
 * max_length) code points of `word` into the scratch buffer, then binary-
 * search progressively shorter prefixes until one is found.  Returns the
 * entry's NULL-terminated value list and sets *match_length, or NULL with
 * *match_length = 0 when no prefix matches.
 * NOTE(review): uses the shared word_buff scratch — not thread-safe. */
const ucs4_t *const *dictionary_text_match_longest(dictionary_t t_dictionary, const ucs4_t *word, size_t maxlen, size_t *match_length) {
	text_dictionary_desc *text_dictionary = (text_dictionary_desc *)t_dictionary;

	if (text_dictionary->entry_count == 0)
		return NULL;

	/* maxlen == 0 means "no limit" */
	if (maxlen == 0)
		maxlen = ucs4len(word);
	size_t len = text_dictionary->max_length;
	if (maxlen < len)
		len = maxlen;

	ucs4ncpy(text_dictionary->word_buff, word, len);
	text_dictionary->word_buff[len] = L'\0';

	/* search key aliases the scratch buffer; truncating the buffer
	 * shortens the key in place on each iteration */
	entry buff;
	buff.key = text_dictionary->word_buff;

	for (; len > 0; len--) {
		text_dictionary->word_buff[len] = L'\0';
		entry *brs =
			(entry *)bsearch(&buff, text_dictionary->lexicon, text_dictionary->entry_count, sizeof(text_dictionary->lexicon[0]), qsort_entry_cmp);

		if (brs != NULL) {
			if (match_length != NULL)
				*match_length = len;
			return (const ucs4_t *const *)brs->value;
		}
	}

	if (match_length != NULL)
		*match_length = 0;
	return NULL;
}
|
||||
|
||||
/* Record every prefix length of `word` (up to max_length) present in the
 * dictionary into match_length[], longest first, and return the count.
 * The caller must provide enough room in match_length[].
 * NOTE(review): uses the shared word_buff scratch — not thread-safe. */
size_t dictionary_text_get_all_match_lengths(dictionary_t t_dictionary, const ucs4_t *word, size_t *match_length) {
	text_dictionary_desc *text_dictionary = (text_dictionary_desc *)t_dictionary;

	size_t rscnt = 0;

	if (text_dictionary->entry_count == 0)
		return rscnt;

	/* no prefix longer than the longest key can match */
	size_t length = ucs4len(word);
	size_t len = text_dictionary->max_length;
	if (length < len)
		len = length;

	ucs4ncpy(text_dictionary->word_buff, word, len);
	text_dictionary->word_buff[len] = L'\0';

	/* search key aliases the scratch buffer; each iteration truncates it */
	entry buff;
	buff.key = text_dictionary->word_buff;

	for (; len > 0; len--) {
		text_dictionary->word_buff[len] = L'\0';
		entry *brs =
			(entry *)bsearch(&buff, text_dictionary->lexicon, text_dictionary->entry_count, sizeof(text_dictionary->lexicon[0]), qsort_entry_cmp);

		if (brs != NULL)
			match_length[rscnt++] = len;
	}

	return rscnt;
}
|
||||
|
||||
/* Shallow-copy all entries into the caller's lexicon[] (pointers are
 * shared, not duplicated) and return the entry count.  The caller must
 * provide room for entry_count entries. */
size_t dictionary_text_get_lexicon(dictionary_t t_dictionary, entry *lexicon) {
	text_dictionary_desc *td = (text_dictionary_desc *)t_dictionary;

	size_t n = td->entry_count;
	for (size_t idx = 0; idx < n; idx++)
		lexicon[idx] = td->lexicon[idx]; /* struct copy: key and value pointers */

	return n;
}
|
||||
36
internal/cpp/opencc/dictionary/text.h
Normal file
36
internal/cpp/opencc/dictionary/text.h
Normal file
@ -0,0 +1,36 @@
|
||||
/*
|
||||
* Open Chinese Convert
|
||||
*
|
||||
* Copyright 2010 BYVoid <byvoid.kcp@gmail.com>
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/* Guard renamed from __OPENCC_...: identifiers starting with a double
 * underscore are reserved for the implementation (C11 7.1.3). */
#ifndef OPENCC_DICTIONARY_TEXT_H_
#define OPENCC_DICTIONARY_TEXT_H_

#include "abstract.h"

/* Open a text-format dictionary file ("key\tvalue[ value...]" per line).
 * Returns (dictionary_t)-1 on failure. */
dictionary_t dictionary_text_open(const char * filename);

/* Release a dictionary returned by dictionary_text_open(). */
void dictionary_text_close(dictionary_t t_dictionary);

/* Longest-prefix match of `word` (maxlen 0 = unlimited); NULL-terminated
 * value list, or NULL when no prefix matches. */
const ucs4_t * const * dictionary_text_match_longest(dictionary_t t_dictionary, const ucs4_t * word,
		size_t maxlen, size_t * match_length);

/* Write every matching prefix length into match_length[]; returns the count. */
size_t dictionary_text_get_all_match_lengths(dictionary_t t_dictionary, const ucs4_t * word,
		size_t * match_length);

/* Shallow-copy all entries into lexicon[]; returns the entry count. */
size_t dictionary_text_get_lexicon(dictionary_t t_dictionary, entry * lexicon);

#endif /* OPENCC_DICTIONARY_TEXT_H_ */
|
||||
Reference in New Issue
Block a user