mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-04-30 23:37:49 +08:00
RAGFlow go API server (#13240)
# RAGFlow Go Implementation Plan 🚀 This repository tracks the progress of porting RAGFlow to Go. We'll implement core features and provide performance comparisons between Python and Go versions. ## Implementation Checklist - [x] User Management APIs - [x] Dataset Management Operations - [x] Retrieval Test - [x] Chat Management Operations - [x] Infinity Go SDK --------- Signed-off-by: Jin Hai <haijin.chn@gmail.com> Co-authored-by: Yingfeng Zhang <yingfeng.zhang@gmail.com>
This commit is contained in:
289
internal/cpp/opencc/config_reader.c
Normal file
289
internal/cpp/opencc/config_reader.c
Normal file
@ -0,0 +1,289 @@
|
||||
/*
|
||||
* Open Chinese Convert
|
||||
*
|
||||
* Copyright 2010 BYVoid <byvoid.kcp@gmail.com>
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "config_reader.h"
|
||||
#include "dictionary_set.h"
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#define BUFFER_SIZE 8192
|
||||
#define DICTIONARY_MAX_COUNT 1024
|
||||
#define CONFIG_DICT_TYPE_OCD "OCD"
|
||||
#define CONFIG_DICT_TYPE_TEXT "TEXT"
|
||||
|
||||
typedef struct {
|
||||
opencc_dictionary_type dict_type;
|
||||
char *file_name;
|
||||
size_t index;
|
||||
size_t stamp;
|
||||
} dictionary_buffer;
|
||||
|
||||
struct _config_desc {
|
||||
char *title;
|
||||
char *description;
|
||||
dictionary_set_t dictionary_set;
|
||||
char *home_dir;
|
||||
dictionary_buffer dicts[DICTIONARY_MAX_COUNT];
|
||||
size_t dicts_count;
|
||||
size_t stamp;
|
||||
};
|
||||
typedef struct _config_desc config_desc;
|
||||
|
||||
static config_error errnum = CONFIG_ERROR_VOID;
|
||||
|
||||
static int qsort_dictionary_buffer_cmp(const void *a, const void *b) {
|
||||
if (((dictionary_buffer *)a)->index < ((dictionary_buffer *)b)->index)
|
||||
return -1;
|
||||
if (((dictionary_buffer *)a)->index > ((dictionary_buffer *)b)->index)
|
||||
return 1;
|
||||
return ((dictionary_buffer *)a)->stamp < ((dictionary_buffer *)b)->stamp ? -1 : 1;
|
||||
}
|
||||
|
||||
/*
 * Load every collected dictionary entry into config->dictionary_set.
 *
 * Entries are sorted by (index, stamp). One group is created up front and a
 * further group is created each time the index rises above the highest index
 * seen so far (starting from 0). Always returns 0; the result of
 * dictionary_group_load() is not inspected.
 */
static int load_dictionary(config_desc *config) {
    size_t pos;
    size_t highest_index = 0;
    dictionary_group_t group;

    if (config->dicts_count == 0)
        return 0;

    qsort(config->dicts, config->dicts_count, sizeof(config->dicts[0]), qsort_dictionary_buffer_cmp);

    group = dictionary_set_new_group(config->dictionary_set);

    for (pos = 0; pos < config->dicts_count; pos++) {
        dictionary_buffer *entry = &config->dicts[pos];

        if (entry->index > highest_index) {
            highest_index = entry->index;
            group = dictionary_set_new_group(config->dictionary_set);
        }

        dictionary_group_load(group, entry->file_name, config->home_dir, entry->dict_type);
    }

    return 0;
}
|
||||
|
||||
/*
 * Record one dictionary entry of the form "<TYPE> <file name>".
 *
 * config  - parser state that receives the entry.
 * index   - number taken from the property key.
 * dictstr - property value; must start with "OCD" or "TEXT".
 *
 * Returns 0 on success. Returns -1 with errnum set to
 * CONFIG_ERROR_INVALID_DICT_TYPE for an unknown type prefix, or to
 * CONFIG_ERROR_PARSE when the fixed-size entry table is already full.
 */
static int parse_add_dict(config_desc *config, size_t index, const char *dictstr) {
    const char *pstr = dictstr;
    opencc_dictionary_type dict_type;
    dictionary_buffer *entry;

    /* dicts[] cannot grow: refuse the entry instead of writing past its end. */
    if (config->dicts_count >= DICTIONARY_MAX_COUNT) {
        errnum = CONFIG_ERROR_PARSE;
        return -1;
    }

    /* Step over the type token (it ends at the first space). */
    while (*pstr != '\0' && *pstr != ' ')
        pstr++;

    /* Each prefix is compared with its own length. */
    if (strncmp(dictstr, CONFIG_DICT_TYPE_OCD, sizeof(CONFIG_DICT_TYPE_OCD) - 1) == 0) {
        dict_type = OPENCC_DICTIONARY_TYPE_DATRIE;
    } else if (strncmp(dictstr, CONFIG_DICT_TYPE_TEXT, sizeof(CONFIG_DICT_TYPE_TEXT) - 1) == 0) {
        dict_type = OPENCC_DICTIONARY_TYPE_TEXT;
    } else {
        errnum = CONFIG_ERROR_INVALID_DICT_TYPE;
        return -1;
    }

    /* Skip the blanks between the type token and the file name. */
    while (*pstr == ' ' || *pstr == '\t')
        pstr++;

    entry = &config->dicts[config->dicts_count++];
    entry->dict_type = dict_type;
    entry->file_name = mstrcpy(pstr);
    entry->index = index;
    entry->stamp = config->stamp++;

    return 0;
}
|
||||
|
||||
/*
 * Apply one "key = value" pair to the configuration.
 *
 * Keys beginning with "dict" add a dictionary entry; the digits after the
 * prefix (0 when none can be read) become the entry index. "title" and
 * "description" replace the stored strings. Any other key sets errnum to
 * CONFIG_ERROR_NO_PROPERTY and yields -1.
 */
static int parse_property(config_desc *config, const char *key, const char *value) {
    if (strncmp(key, "dict", 4) == 0) {
        int dict_index = 0;

        sscanf(key + 4, "%d", &dict_index);
        return parse_add_dict(config, dict_index, value);
    }

    if (strcmp(key, "title") == 0) {
        free(config->title);
        config->title = mstrcpy(value);
        return 0;
    }

    if (strcmp(key, "description") == 0) {
        free(config->description);
        config->description = mstrcpy(value);
        return 0;
    }

    errnum = CONFIG_ERROR_NO_PROPERTY;
    return -1;
}
|
||||
|
||||
/*
 * Split a trimmed "key = value" line.
 *
 * line  - text to split; the key ends at the first blank or '='.
 * key   - receives a newly allocated key string on success.
 * value - receives a newly allocated value string on success.
 *
 * Returns 0 on success; the caller then owns *key and *value. Returns -1
 * when the line has no '=' or nothing follows it; in that case both outputs
 * are NULL and nothing is left for the caller to release.
 */
static int parse_line(const char *line, char **key, char **value) {
    const char *line_begin = line;
    size_t key_len;

    *key = NULL;
    *value = NULL;

    while (*line != '\0' && (*line != ' ' && *line != '\t' && *line != '='))
        line++;

    key_len = line - line_begin;

    while (*line != '\0' && *line != '=')
        line++;

    if (*line == '\0')
        return -1;

    assert(*line == '=');

    *key = mstrncpy(line_begin, key_len);

    /* Move past '=' and any blanks in front of the value. */
    line++;
    while (*line == ' ' || *line == '\t')
        line++;

    if (*line == '\0') {
        free(*key);
        /* Clear the pointer so the caller cannot free it a second time. */
        *key = NULL;
        return -1;
    }

    *value = mstrcpy(line);

    return 0;
}
|
||||
|
||||
/*
 * Trim a line in place.
 *
 * Skips leading spaces and tabs, cuts the text at the first '\n' or '\r'
 * (or at the terminator) and removes spaces and tabs in front of that cut.
 *
 * Returns a pointer into str at the first kept character. The scan never
 * steps in front of that character, so an empty line is handled without
 * forming a pointer before the start of the buffer.
 */
static char *parse_trim(char *str) {
    char *end;

    while (*str == ' ' || *str == '\t')
        str++;

    end = str;
    while (*end != '\0' && *end != '\n' && *end != '\r')
        end++;

    while (end > str && (end[-1] == ' ' || end[-1] == '\t'))
        end--;

    *end = '\0';
    return str;
}
|
||||
|
||||
/*
 * Read a configuration file into config.
 *
 * filename  - tried as given first, then as "<home_path>/<filename>".
 * home_path - copied into config->home_dir.
 *
 * Lines that are empty or start with ';' or '#' after trimming are skipped.
 * Every other line must be "key = value" and name a known property.
 *
 * Returns 0 on success and -1 on failure with errnum set:
 * CONFIG_ERROR_CANNOT_ACCESS_CONFIG_FILE when the file cannot be opened (also
 * used when memory for the path strings cannot be obtained, since no
 * dedicated code exists), CONFIG_ERROR_PARSE for a malformed line, or the
 * code chosen by parse_property().
 *
 * The line buffer lives on the stack so concurrent calls do not share it.
 * A physical line of BUFFER_SIZE - 1 characters or more is read by fgets()
 * in several pieces.
 */
static int parse(config_desc *config, const char *filename, const char *home_path) {
    char buff[BUFFER_SIZE];
    size_t home_len = strlen(home_path);
    FILE *fp = fopen(filename, "rb");

    if (!fp) {
        char *pkg_filename = (char *)malloc(strlen(filename) + home_len + 2);

        if (pkg_filename == NULL) {
            errnum = CONFIG_ERROR_CANNOT_ACCESS_CONFIG_FILE;
            return -1;
        }

        sprintf(pkg_filename, "%s/%s", home_path, filename);
        /* NOTE(review): diagnostic output on stdout from library code; consider removing. */
        printf("pkg_filename %s\n", pkg_filename);
        fp = fopen(pkg_filename, "rb");
        if (!fp) {
            free(pkg_filename);
            errnum = CONFIG_ERROR_CANNOT_ACCESS_CONFIG_FILE;
            return -1;
        }
        free(pkg_filename);
    }

    /* Drop any earlier copy (NULL on the first call) before storing the new one. */
    free(config->home_dir);
    config->home_dir = (char *)malloc(home_len + 1);
    if (config->home_dir == NULL) {
        fclose(fp);
        errnum = CONFIG_ERROR_CANNOT_ACCESS_CONFIG_FILE;
        return -1;
    }
    memcpy(config->home_dir, home_path, home_len + 1);

    while (fgets(buff, BUFFER_SIZE, fp) != NULL) {
        char *trimed_buff = parse_trim(buff);
        char *key = NULL, *value = NULL;
        int status;

        if (*trimed_buff == ';' || *trimed_buff == '#' || *trimed_buff == '\0') {
            /* Comment line or empty line */
            continue;
        }

        if (parse_line(trimed_buff, &key, &value) == -1) {
            /*
             * parse_line() hands nothing back on failure: either it did not
             * allocate, or it already released the key itself. Freeing key
             * here would release that memory a second time.
             */
            fclose(fp);
            errnum = CONFIG_ERROR_PARSE;
            return -1;
        }

        status = parse_property(config, key, value);

        free(key);
        free(value);

        if (status == -1) {
            fclose(fp);
            return -1;
        }
    }

    fclose(fp);
    return 0;
}
|
||||
|
||||
/*
 * Build and return the dictionary set described by the configuration.
 *
 * A set created by an earlier call is closed first; a new set is then opened
 * and filled by load_dictionary(). The new set is stored in the configuration
 * and returned.
 */
dictionary_set_t config_get_dictionary_set(config_t t_config) {
    config_desc *config = (config_desc *)t_config;
    dictionary_set_t previous = config->dictionary_set;

    if (previous != NULL)
        dictionary_set_close(previous);

    config->dictionary_set = dictionary_set_open();
    load_dictionary(config);

    return config->dictionary_set;
}
|
||||
|
||||
/* Return the last error code recorded by the configuration reader. */
config_error config_errno(void) {
    return errnum;
}
|
||||
|
||||
void config_perror(const char *spec) {
|
||||
perr(spec);
|
||||
perr("\n");
|
||||
switch (errnum) {
|
||||
case CONFIG_ERROR_VOID:
|
||||
break;
|
||||
case CONFIG_ERROR_CANNOT_ACCESS_CONFIG_FILE:
|
||||
perror(_("Can not access configuration file"));
|
||||
break;
|
||||
case CONFIG_ERROR_PARSE:
|
||||
perr(_("Configuration file parse error"));
|
||||
break;
|
||||
case CONFIG_ERROR_NO_PROPERTY:
|
||||
perr(_("Invalid property"));
|
||||
break;
|
||||
case CONFIG_ERROR_INVALID_DICT_TYPE:
|
||||
perr(_("Invalid dictionary type"));
|
||||
break;
|
||||
default:
|
||||
perr(_("Unknown"));
|
||||
}
|
||||
}
|
||||
|
||||
config_t config_open(const char *filename, const char *home_path) {
|
||||
config_desc *config = (config_desc *)malloc(sizeof(config_desc));
|
||||
|
||||
config->title = NULL;
|
||||
config->description = NULL;
|
||||
config->home_dir = NULL;
|
||||
config->dicts_count = 0;
|
||||
config->stamp = 0;
|
||||
config->dictionary_set = NULL;
|
||||
|
||||
if (parse(config, filename, home_path) == -1) {
|
||||
config_close((config_t)config);
|
||||
return (config_t)-1;
|
||||
}
|
||||
|
||||
return (config_t)config;
|
||||
}
|
||||
|
||||
/*
 * Release a configuration handle.
 *
 * Frees the file names of all collected entries, the title, description and
 * home directory strings, and the descriptor. config->dictionary_set is not
 * closed here.
 *
 * NULL and the (config_t)-1 failure value of config_open() are ignored.
 */
void config_close(config_t t_config) {
    config_desc *config = (config_desc *)t_config;
    size_t i;

    if (t_config == NULL || t_config == (config_t)-1)
        return;

    for (i = 0; i < config->dicts_count; i++)
        free(config->dicts[i].file_name);

    free(config->title);
    free(config->description);
    free(config->home_dir);
    free(config);
}
|
||||
46
internal/cpp/opencc/config_reader.h
Normal file
46
internal/cpp/opencc/config_reader.h
Normal file
@ -0,0 +1,46 @@
|
||||
/*
|
||||
* Open Chinese Convert
|
||||
*
|
||||
* Copyright 2010 BYVoid <byvoid.kcp@gmail.com>
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef __OPENCC_CONFIG_H_
#define __OPENCC_CONFIG_H_

#include "utils.h"
#include "dictionary_set.h"

/* Opaque handle to a parsed configuration. */
typedef void *config_t;

/* Error codes reported by config_errno(). */
typedef enum {
    CONFIG_ERROR_VOID,                      /* no error recorded */
    CONFIG_ERROR_CANNOT_ACCESS_CONFIG_FILE, /* the configuration file could not be opened */
    CONFIG_ERROR_PARSE,                     /* a line could not be split into key and value */
    CONFIG_ERROR_NO_PROPERTY,               /* the key is not a known property */
    CONFIG_ERROR_INVALID_DICT_TYPE,         /* the dictionary type token is not recognised */
} config_error;

/* Parse filename; home_path is the fallback directory. Returns (config_t)-1 on failure. */
config_t config_open(const char *filename, const char *home_path);

/* Release a handle obtained from config_open(). */
void config_close(config_t t_config);

/* Build and return the dictionary set described by the configuration. */
dictionary_set_t config_get_dictionary_set(config_t t_config);

/* Last error recorded by the configuration reader. */
config_error config_errno(void);

/* Report spec followed by a description of the last error. */
void config_perror(const char *spec);

#endif /* __OPENCC_CONFIG_H_ */
|
||||
590
internal/cpp/opencc/converter.c
Normal file
590
internal/cpp/opencc/converter.c
Normal file
@ -0,0 +1,590 @@
|
||||
/*
|
||||
* Open Chinese Convert
|
||||
*
|
||||
* Copyright 2010 BYVoid <byvoid.kcp@gmail.com>
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "converter.h"
|
||||
#include "dictionary_set.h"
|
||||
#include "encoding.h"
|
||||
#include "utils.h"
|
||||
|
||||
#define DELIMITER ' '
|
||||
#define SEGMENT_MAXIMUM_LENGTH 0
|
||||
#define SEGMENT_SHORTEST_PATH 1
|
||||
#define SEGMENT_METHOD SEGMENT_SHORTEST_PATH
|
||||
|
||||
#if SEGMENT_METHOD == SEGMENT_SHORTEST_PATH
|
||||
|
||||
#define OPENCC_SP_SEG_DEFAULT_BUFFER_SIZE 1024
|
||||
|
||||
/* Scratch arrays used by the shortest-path segmenter (sp_seg). */
typedef struct {
    /* TRUE once the arrays below have been allocated. */
    int initialized;
    /* Size requested for the arrays in the last sp_seg_set_buffer_size() call. */
    size_t buffer_size;
    /* Match lengths reported for one input position (buffer_size + 1 slots). */
    size_t *match_length;
    /* min_len[p]: fewest segments found so far that end at offset p. */
    size_t *min_len;
    /* parent[p]: start offset of the segment that ends at offset p. */
    size_t *parent;
    /* End offsets of the chosen segments, in input order. */
    size_t *path;
} spseg_buffer_desc;
|
||||
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
#if SEGMENT_METHOD == SEGMENT_SHORTEST_PATH
|
||||
spseg_buffer_desc spseg_buffer;
|
||||
#endif
|
||||
dictionary_set_t dictionary_set;
|
||||
dictionary_group_t current_dictionary_group;
|
||||
opencc_conversion_mode conversion_mode;
|
||||
} converter_desc;
|
||||
static converter_error errnum = CONVERTER_ERROR_VOID;
|
||||
|
||||
#if SEGMENT_METHOD == SEGMENT_SHORTEST_PATH
|
||||
/*
 * Release the work arrays of a shortest-path buffer.
 *
 * The pointers are cleared afterwards, so calling this again (or freeing the
 * members elsewhere) cannot release the same memory twice.
 */
static void sp_seg_buffer_free(spseg_buffer_desc *ossb) {
    free(ossb->match_length);
    ossb->match_length = NULL;

    free(ossb->min_len);
    ossb->min_len = NULL;

    free(ossb->parent);
    ossb->parent = NULL;

    free(ossb->path);
    ossb->path = NULL;
}
|
||||
|
||||
/*
 * (Re)allocate the work arrays for buffer_size entries.
 *
 * Arrays from an earlier call are released first. match_length gets one slot
 * more than the other three arrays. Allocation results are not checked.
 */
static void sp_seg_set_buffer_size(spseg_buffer_desc *ossb, size_t buffer_size) {
    const size_t slot = sizeof(size_t);

    if (ossb->initialized == TRUE)
        sp_seg_buffer_free(ossb);

    ossb->buffer_size = buffer_size;

    ossb->match_length = (size_t *)malloc((buffer_size + 1) * slot);
    ossb->min_len = (size_t *)malloc(buffer_size * slot);
    ossb->parent = (size_t *)malloc(buffer_size * slot);
    ossb->path = (size_t *)malloc(buffer_size * slot);

    ossb->initialized = TRUE;
}
|
||||
|
||||
/* Copy one character from the input cursor to the output cursor and advance both. */
static void sp_seg_copy_char(ucs4_t **inbuf, size_t *inbuf_left, ucs4_t **outbuf, size_t *outbuf_left) {
    **outbuf = **inbuf;
    (*outbuf)++;
    (*outbuf_left)--;
    (*inbuf)++;
    (*inbuf_left)--;
}

/* Append one character at the output cursor. */
static void sp_seg_put_char(ucs4_t **outbuf, size_t *outbuf_left, ucs4_t ch) {
    **outbuf = ch;
    (*outbuf)++;
    (*outbuf_left)--;
}

/* Append a 0-terminated string at the output cursor. */
static void sp_seg_put_string(ucs4_t **outbuf, size_t *outbuf_left, const ucs4_t *text) {
    for (; *text; text++)
        sp_seg_put_char(outbuf, outbuf_left, *text);
}

/*
 * Shortest-path segmentation of the next `length` input characters.
 *
 * The span is cut into the fewest dictionary matches (single characters are
 * always allowed) and each piece is written according to the conversion
 * mode. Both cursors and both remaining-space counters are advanced.
 *
 * Returns the number of input characters consumed, or (size_t)-1 with errnum
 * set to CONVERTER_ERROR_OUTBUF when the output buffer cannot take the first
 * piece.
 */
static size_t sp_seg(converter_desc *converter, ucs4_t **inbuf, size_t *inbuf_left, ucs4_t **outbuf, size_t *outbuf_left, size_t length) {
    dictionary_group_t group = converter->current_dictionary_group;

    /* A span of one character needs no path search. */
    if (length == 1) {
        const ucs4_t *const *candidates = dictionary_group_match_longest(group, *inbuf, 1, NULL);
        size_t match_len = 1;

        if (converter->conversion_mode == OPENCC_CONVERSION_FAST) {
            if (candidates == NULL) {
                sp_seg_copy_char(inbuf, inbuf_left, outbuf, outbuf_left);
            } else {
                const ucs4_t *replacement = candidates[0];

                /* Not enough room left for the replacement text. */
                if (ucs4len(replacement) > *outbuf_left) {
                    errnum = CONVERTER_ERROR_OUTBUF;
                    return (size_t)-1;
                }

                sp_seg_put_string(outbuf, outbuf_left, replacement);

                *inbuf += match_len;
                *inbuf_left -= match_len;
            }
        } else if (converter->conversion_mode == OPENCC_CONVERSION_LIST_CANDIDATES) {
            if (candidates == NULL) {
                sp_seg_copy_char(inbuf, inbuf_left, outbuf, outbuf_left);
            } else {
                size_t c;

                for (c = 0; candidates[c] != NULL; c++) {
                    const ucs4_t *replacement = candidates[c];
                    int show_delimiter = candidates[c + 1] != NULL ? 1 : 0;

                    /* Not enough room left for this candidate. */
                    if (ucs4len(replacement) + show_delimiter > *outbuf_left) {
                        errnum = CONVERTER_ERROR_OUTBUF;
                        return (size_t)-1;
                    }

                    sp_seg_put_string(outbuf, outbuf_left, replacement);

                    if (show_delimiter)
                        sp_seg_put_char(outbuf, outbuf_left, DELIMITER);
                }

                *inbuf += match_len;
                *inbuf_left -= match_len;
            }
        } else if (converter->conversion_mode == OPENCC_CONVERSION_SEGMENT_ONLY) {
            if (candidates == NULL) {
                sp_seg_copy_char(inbuf, inbuf_left, outbuf, outbuf_left);
            } else {
                size_t c;

                /* Not enough room left for the segment plus its delimiter. */
                if (match_len + 1 > *outbuf_left) {
                    errnum = CONVERTER_ERROR_OUTBUF;
                    return (size_t)-1;
                }

                for (c = 0; c < match_len; c++)
                    sp_seg_copy_char(inbuf, inbuf_left, outbuf, outbuf_left);
            }
            /* One character of space must be guaranteed for the delimiter. */
            sp_seg_put_char(outbuf, outbuf_left, DELIMITER);
        } else {
            debug_should_not_be_here();
        }

        return match_len;
    }

    /* Make sure the work arrays can hold length + 1 entries. */
    spseg_buffer_desc *ossb = &(converter->spseg_buffer);
    size_t buffer_size_need = length + 1;
    if (ossb->initialized == FALSE || ossb->buffer_size < buffer_size_need)
        sp_seg_set_buffer_size(&(converter->spseg_buffer), buffer_size_need);

    size_t pos, slot;

    for (pos = 0; pos <= length; pos++)
        ossb->min_len[pos] = INFINITY_INT;

    ossb->min_len[0] = ossb->parent[0] = 0;

    for (pos = 0; pos < length; pos++) {
        /* Collect every match length available at this offset. */
        size_t match_count = dictionary_group_get_all_match_lengths(group, (*inbuf) + pos, ossb->match_length);

        /* A step of one character is always a candidate. */
        if (ossb->match_length[0] != 1)
            ossb->match_length[match_count++] = 1;

        /* Dynamic programming over the shortest split. */
        for (slot = 0; slot < match_count; slot++) {
            size_t step = ossb->match_length[slot];
            ossb->match_length[slot] = 0;

            if (step > 1 && ossb->min_len[pos] + 1 <= ossb->min_len[pos + step]) {
                ossb->min_len[pos + step] = ossb->min_len[pos] + 1;
                ossb->parent[pos + step] = pos;
            } else if (step == 1 && ossb->min_len[pos] + 1 < ossb->min_len[pos + step]) {
                ossb->min_len[pos + step] = ossb->min_len[pos] + 1;
                ossb->parent[pos + step] = pos;
            }
        }
    }

    /* Walk the parent links backwards to obtain the segment end offsets. */
    for (pos = length, slot = ossb->min_len[length]; pos != 0; pos = ossb->parent[pos])
        ossb->path[--slot] = pos;

    size_t inbuf_left_start = *inbuf_left;
    size_t begin, end;
    size_t seg;

    /* Convert segment by segment along the chosen path. */
    for (seg = begin = 0; seg < ossb->min_len[length]; seg++) {
        size_t match_len;
        const ucs4_t *const *candidates;

        end = ossb->path[seg];
        candidates = dictionary_group_match_longest(group, *inbuf, end - begin, &match_len);

        if (candidates == NULL) {
            /* No dictionary entry: pass one character through unchanged. */
            sp_seg_copy_char(inbuf, inbuf_left, outbuf, outbuf_left);
        } else if (converter->conversion_mode == OPENCC_CONVERSION_FAST) {
            const ucs4_t *replacement = candidates[0];

            /* Not enough room left for the replacement text. */
            if (ucs4len(replacement) > *outbuf_left) {
                if (inbuf_left_start - *inbuf_left > 0)
                    break;
                errnum = CONVERTER_ERROR_OUTBUF;
                return (size_t)-1;
            }

            sp_seg_put_string(outbuf, outbuf_left, replacement);

            *inbuf += match_len;
            *inbuf_left -= match_len;
        } else if (converter->conversion_mode == OPENCC_CONVERSION_LIST_CANDIDATES) {
            size_t c;

            for (c = 0; candidates[c] != NULL; c++) {
                const ucs4_t *replacement = candidates[c];
                int show_delimiter = candidates[c + 1] != NULL ? 1 : 0;

                /* Not enough room left for this candidate. */
                if (ucs4len(replacement) + show_delimiter > *outbuf_left) {
                    /* This break leaves only the candidate loop. */
                    if (inbuf_left_start - *inbuf_left > 0)
                        break;
                    errnum = CONVERTER_ERROR_OUTBUF;
                    return (size_t)-1;
                }

                sp_seg_put_string(outbuf, outbuf_left, replacement);

                if (show_delimiter)
                    sp_seg_put_char(outbuf, outbuf_left, DELIMITER);
            }

            *inbuf += match_len;
            *inbuf_left -= match_len;
        } else if (converter->conversion_mode == OPENCC_CONVERSION_SEGMENT_ONLY) {
            size_t c;

            /* Not enough room left for the segment plus its delimiter. */
            if (match_len + 1 > *outbuf_left) {
                if (inbuf_left_start - *inbuf_left > 0)
                    break;
                errnum = CONVERTER_ERROR_OUTBUF;
                return (size_t)-1;
            }

            for (c = 0; c < match_len; c++)
                sp_seg_copy_char(inbuf, inbuf_left, outbuf, outbuf_left);

            sp_seg_put_char(outbuf, outbuf_left, DELIMITER);
        } else {
            debug_should_not_be_here();
        }

        begin = end;
    }

    return inbuf_left_start - *inbuf_left;
}
|
||||
|
||||
/*
 * Segment and convert the input with the shortest-path method.
 *
 * The input is scanned for runs of overlapping dictionary matches. Each time
 * the scan reaches the end of a run, that run is handed to sp_seg(). The
 * final run is flushed after the scan. In segment-only mode the output
 * cursor is stepped back by one position before returning.
 *
 * Returns the number of input characters consumed, or (size_t)-1 on error
 * (errnum is CONVERTER_ERROR_OUTBUF when nothing could be written).
 */
static size_t segment(converter_desc *converter, ucs4_t **inbuf, size_t *inbuf_left, ucs4_t **outbuf, size_t *outbuf_left) {
    const ucs4_t *text = *inbuf;
    size_t inbuf_left_start = *inbuf_left;
    size_t pos;
    size_t run_start = 0;
    size_t run_end = 0;
    size_t done;

    for (pos = 0; text[pos] && *inbuf_left > 0 && *outbuf_left > 0; pos++) {
        size_t match_len;

        if (pos != 0 && pos == run_end) {
            /* The current run is complete: resolve it with the shortest-path search. */
            done = sp_seg(converter, inbuf, inbuf_left, outbuf, outbuf_left, run_end - run_start);
            if (done == (size_t)-1)
                return (size_t)-1;
            if (done == 0) {
                if (inbuf_left_start - *inbuf_left > 0)
                    return inbuf_left_start - *inbuf_left;
                /* Out of space. */
                errnum = CONVERTER_ERROR_OUTBUF;
                return (size_t)-1;
            }
            run_start = pos;
        }

        dictionary_group_match_longest(converter->current_dictionary_group, text + pos, 0, &match_len);

        if (match_len == 0)
            match_len = 1;

        /* Extend the run to cover the longest match starting here. */
        if (pos + match_len > run_end)
            run_end = pos + match_len;
    }

    if (*inbuf_left > 0 && *outbuf_left > 0) {
        done = sp_seg(converter, inbuf, inbuf_left, outbuf, outbuf_left, run_end - run_start);
        if (done == (size_t)-1)
            return (size_t)-1;
        if (done == 0) {
            if (inbuf_left_start - *inbuf_left > 0)
                return inbuf_left_start - *inbuf_left;
            /* Out of space. */
            errnum = CONVERTER_ERROR_OUTBUF;
            return (size_t)-1;
        }
    }

    if (converter->conversion_mode == OPENCC_CONVERSION_SEGMENT_ONLY) {
        (*outbuf)--;
        (*outbuf_left)++;
    }

    return inbuf_left_start - *inbuf_left;
}
|
||||
|
||||
#endif
|
||||
|
||||
#if SEGMENT_METHOD == SEGMENT_MAXIMUM_LENGTH
|
||||
/* Copy one character from the input cursor to the output cursor and advance both. */
static void maxlen_copy_char(ucs4_t **inbuf, size_t *inbuf_left, ucs4_t **outbuf, size_t *outbuf_left) {
    **outbuf = **inbuf;
    (*outbuf)++;
    (*outbuf_left)--;
    (*inbuf)++;
    (*inbuf_left)--;
}

/* Append one character at the output cursor. */
static void maxlen_put_char(ucs4_t **outbuf, size_t *outbuf_left, ucs4_t ch) {
    **outbuf = ch;
    (*outbuf)++;
    (*outbuf_left)--;
}

/* Append a 0-terminated string at the output cursor. */
static void maxlen_put_string(ucs4_t **outbuf, size_t *outbuf_left, const ucs4_t *text) {
    for (; *text; text++)
        maxlen_put_char(outbuf, outbuf_left, *text);
}

/*
 * Segment and convert the input with forward maximum matching.
 *
 * At each position the longest dictionary match is taken and written
 * according to the conversion mode; unmatched characters are copied. In
 * segment-only mode the output cursor is stepped back by one position before
 * returning.
 *
 * Returns the number of input characters consumed, or (size_t)-1 with errnum
 * set to CONVERTER_ERROR_OUTBUF when not even the first piece fits.
 */
static size_t segment(converter_desc *converter, ucs4_t **inbuf, size_t *inbuf_left, ucs4_t **outbuf, size_t *outbuf_left) {
    size_t inbuf_left_start = *inbuf_left;

    while (**inbuf && *inbuf_left > 0 && *outbuf_left > 0) {
        size_t match_len;
        const ucs4_t *const *candidates = dictionary_group_match_longest(converter->current_dictionary_group, *inbuf, *inbuf_left, &match_len);

        if (converter->conversion_mode == OPENCC_CONVERSION_FAST) {
            if (candidates == NULL) {
                maxlen_copy_char(inbuf, inbuf_left, outbuf, outbuf_left);
            } else {
                const ucs4_t *replacement = candidates[0];

                /* Not enough room left for the replacement text. */
                if (ucs4len(replacement) > *outbuf_left) {
                    if (inbuf_left_start - *inbuf_left > 0)
                        break;
                    errnum = CONVERTER_ERROR_OUTBUF;
                    return (size_t)-1;
                }

                maxlen_put_string(outbuf, outbuf_left, replacement);

                *inbuf += match_len;
                *inbuf_left -= match_len;
            }
        } else if (converter->conversion_mode == OPENCC_CONVERSION_LIST_CANDIDATES) {
            if (candidates == NULL) {
                maxlen_copy_char(inbuf, inbuf_left, outbuf, outbuf_left);
            } else {
                size_t c;

                for (c = 0; candidates[c] != NULL; c++) {
                    const ucs4_t *replacement = candidates[c];
                    int show_delimiter = candidates[c + 1] != NULL ? 1 : 0;

                    /* Not enough room left for this candidate. */
                    if (ucs4len(replacement) + show_delimiter > *outbuf_left) {
                        /* This break leaves only the candidate loop. */
                        if (inbuf_left_start - *inbuf_left > 0)
                            break;
                        errnum = CONVERTER_ERROR_OUTBUF;
                        return (size_t)-1;
                    }

                    maxlen_put_string(outbuf, outbuf_left, replacement);

                    if (show_delimiter)
                        maxlen_put_char(outbuf, outbuf_left, DELIMITER);
                }

                *inbuf += match_len;
                *inbuf_left -= match_len;
            }
        } else if (converter->conversion_mode == OPENCC_CONVERSION_SEGMENT_ONLY) {
            if (candidates == NULL) {
                maxlen_copy_char(inbuf, inbuf_left, outbuf, outbuf_left);
            } else {
                size_t c;

                /* Not enough room left for the segment plus its delimiter. */
                if (match_len + 1 > *outbuf_left) {
                    if (inbuf_left_start - *inbuf_left > 0)
                        break;
                    errnum = CONVERTER_ERROR_OUTBUF;
                    return (size_t)-1;
                }

                for (c = 0; c < match_len; c++)
                    maxlen_copy_char(inbuf, inbuf_left, outbuf, outbuf_left);
            }
            maxlen_put_char(outbuf, outbuf_left, DELIMITER);
        } else {
            debug_should_not_be_here();
        }
    }

    if (converter->conversion_mode == OPENCC_CONVERSION_SEGMENT_ONLY) {
        (*outbuf)--;
        (*outbuf_left)++;
    }

    return inbuf_left_start - *inbuf_left;
}
|
||||
#endif
|
||||
|
||||
/*
 * Convert text through every dictionary group of the assigned set.
 *
 * inbuf / inbuf_left   - input cursor and remaining input length; advanced by
 *                        what the first group consumed.
 * outbuf / outbuf_left - output cursor and remaining capacity; advanced past
 *                        the final result.
 *
 * With exactly one group the text is segmented straight into outbuf. With
 * several groups the passes alternate between a temporary buffer and outbuf,
 * each pass reading what the previous one wrote; the result is copied into
 * outbuf if the last pass ended in the temporary buffer.
 *
 * Returns the number of input characters consumed by the first pass, or
 * (size_t)-1 on error. errnum is CONVERTER_ERROR_NODICT when no dictionary
 * set is assigned. A failure to allocate the temporary buffer is reported as
 * CONVERTER_ERROR_OUTBUF, the closest code available.
 */
size_t converter_convert(converter_t t_converter, ucs4_t **inbuf, size_t *inbuf_left, ucs4_t **outbuf, size_t *outbuf_left) {
    converter_desc *converter = (converter_desc *)t_converter;

    if (converter->dictionary_set == NULL) {
        errnum = CONVERTER_ERROR_NODICT;
        return (size_t)-1;
    }

    if (dictionary_set_count_group(converter->dictionary_set) == 1) {
        /* Only one group: write directly to the caller's buffer. */
        return segment(converter, inbuf, inbuf_left, outbuf, outbuf_left);
    }

    /* Run the chain of dictionary groups. */
    size_t inbuf_size = *inbuf_left;
    size_t outbuf_size = *outbuf_left;
    size_t retval = (size_t)-1;
    size_t cinbuf_left, coutbuf_left;
    size_t coutbuf_delta = 0;
    size_t i, cur;
    ucs4_t *tmpbuf;
    ucs4_t *orig_outbuf = *outbuf;
    ucs4_t *cinbuf, *coutbuf;

    /* Reject sizes whose byte count would wrap around. */
    if (outbuf_size > (size_t)-1 / sizeof(ucs4_t)) {
        errnum = CONVERTER_ERROR_OUTBUF;
        return (size_t)-1;
    }

    tmpbuf = (ucs4_t *)malloc(sizeof(ucs4_t) * outbuf_size);
    /* malloc(0) may legitimately return NULL, so only a non-empty request can fail. */
    if (tmpbuf == NULL && outbuf_size > 0) {
        errnum = CONVERTER_ERROR_OUTBUF;
        return (size_t)-1;
    }

    cinbuf_left = inbuf_size;
    coutbuf_left = outbuf_size;
    cinbuf = *inbuf;
    coutbuf = tmpbuf;

    for (i = cur = 0; i < dictionary_set_count_group(converter->dictionary_set); ++i, cur = 1 - cur) {
        if (i > 0) {
            /* Feed the previous pass's output into this pass and swap buffers. */
            cinbuf_left = coutbuf_delta;
            coutbuf_left = outbuf_size;
            if (cur == 1) {
                cinbuf = tmpbuf;
                coutbuf = orig_outbuf;
            } else {
                cinbuf = orig_outbuf;
                coutbuf = tmpbuf;
            }
        }

        converter->current_dictionary_group = dictionary_set_get_group(converter->dictionary_set, i);

        size_t ret = segment(converter, &cinbuf, &cinbuf_left, &coutbuf, &coutbuf_left);
        if (ret == (size_t)-1) {
            free(tmpbuf);
            return (size_t)-1;
        }
        coutbuf_delta = outbuf_size - coutbuf_left;
        if (i == 0) {
            /* Only the first pass reads the caller's input. */
            retval = ret;
            *inbuf = cinbuf;
            *inbuf_left = cinbuf_left;
        }
    }

    if (cur == 1 && coutbuf_delta > 0) {
        /* The final result sits in the temporary buffer. */
        memcpy(*outbuf, tmpbuf, coutbuf_delta * sizeof(ucs4_t));
    }

    *outbuf += coutbuf_delta;
    *outbuf_left = coutbuf_left;
    free(tmpbuf);

    return retval;
}
|
||||
|
||||
/*
 * Attach a dictionary set to the converter.
 *
 * When the set contains at least one group, group 0 becomes the current
 * group; otherwise the current group is left as it was.
 */
void converter_assign_dictionary(converter_t t_converter, dictionary_set_t dictionary_set) {
    converter_desc *converter = (converter_desc *)t_converter;

    converter->dictionary_set = dictionary_set;

    if (dictionary_set_count_group(converter->dictionary_set) > 0) {
        converter->current_dictionary_group = dictionary_set_get_group(converter->dictionary_set, 0);
    }
}
|
||||
|
||||
converter_t converter_open(void) {
|
||||
converter_desc *converter = (converter_desc *)malloc(sizeof(converter_desc));
|
||||
|
||||
converter->dictionary_set = NULL;
|
||||
converter->current_dictionary_group = NULL;
|
||||
|
||||
#if SEGMENT_METHOD == SEGMENT_SHORTEST_PATH
|
||||
converter->spseg_buffer.initialized = FALSE;
|
||||
converter->spseg_buffer.match_length = converter->spseg_buffer.min_len = converter->spseg_buffer.parent = converter->spseg_buffer.path = NULL;
|
||||
|
||||
sp_seg_set_buffer_size(&converter->spseg_buffer, OPENCC_SP_SEG_DEFAULT_BUFFER_SIZE);
|
||||
#endif
|
||||
|
||||
return (converter_t)converter;
|
||||
}
|
||||
|
||||
/*
 * Destroy a converter: release the shortest-path work arrays (when that
 * method is compiled in) and then the descriptor itself. The assigned
 * dictionary set is not closed here.
 */
void converter_close(converter_t t_converter) {
    converter_desc *self = (converter_desc *)t_converter;

#if SEGMENT_METHOD == SEGMENT_SHORTEST_PATH
    sp_seg_buffer_free(&self->spseg_buffer);
#endif

    free(self);
}
|
||||
|
||||
/* Select the output style used by subsequent conversions. */
void converter_set_conversion_mode(converter_t t_converter, opencc_conversion_mode conversion_mode) {
    ((converter_desc *)t_converter)->conversion_mode = conversion_mode;
}
|
||||
|
||||
/* Return the last error code recorded by the converter. */
converter_error converter_errno(void) {
    return errnum;
}
|
||||
|
||||
void converter_perror(const char *spec) {
|
||||
perr(spec);
|
||||
perr("\n");
|
||||
switch (errnum) {
|
||||
case CONVERTER_ERROR_VOID:
|
||||
break;
|
||||
case CONVERTER_ERROR_NODICT:
|
||||
perr(_("No dictionary loaded"));
|
||||
break;
|
||||
case CONVERTER_ERROR_OUTBUF:
|
||||
perr(_("Output buffer not enough for one segment"));
|
||||
break;
|
||||
default:
|
||||
perr(_("Unknown"));
|
||||
}
|
||||
}
|
||||
48
internal/cpp/opencc/converter.h
Normal file
48
internal/cpp/opencc/converter.h
Normal file
@ -0,0 +1,48 @@
|
||||
/*
|
||||
* Open Chinese Convert
|
||||
*
|
||||
* Copyright 2010 BYVoid <byvoid.kcp@gmail.com>
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef __CONVERTER_H_
#define __CONVERTER_H_

#include "dictionary_set.h"

/* Opaque handle to a converter. */
typedef void *converter_t;

/* Error codes reported by converter_errno(). */
typedef enum {
    CONVERTER_ERROR_VOID,   /* no error recorded */
    CONVERTER_ERROR_NODICT, /* no dictionary set has been assigned */
    CONVERTER_ERROR_OUTBUF, /* the output buffer cannot hold one segment */
} converter_error;

/* Attach a dictionary set to the converter. */
void converter_assign_dictionary(converter_t t_converter, dictionary_set_t dictionary_set);

/* Create a converter with no dictionary assigned. */
converter_t converter_open(void);

/* Destroy a converter created by converter_open(). */
void converter_close(converter_t t_converter);

/*
 * Convert from *inbuf into *outbuf, advancing both cursors and their
 * remaining counts. Returns the number of input characters consumed, or
 * (size_t)-1 on error.
 */
size_t converter_convert(converter_t t_converter, ucs4_t **inbuf, size_t *inbuf_left,
                         ucs4_t **outbuf, size_t *outbuf_left);

/* Select the output style used by subsequent conversions. */
void converter_set_conversion_mode(converter_t t_converter, opencc_conversion_mode conversion_mode);

/* Last error recorded by the converter. */
converter_error converter_errno(void);

/* Report spec followed by a description of the last error. */
void converter_perror(const char *spec);

#endif /* __CONVERTER_H_ */
|
||||
94
internal/cpp/opencc/dictionary/abstract.c
Normal file
94
internal/cpp/opencc/dictionary/abstract.c
Normal file
@ -0,0 +1,94 @@
|
||||
/*
|
||||
* Open Chinese Convert
|
||||
*
|
||||
* Copyright 2010 BYVoid <byvoid.kcp@gmail.com>
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "abstract.h"
|
||||
#include "datrie.h"
|
||||
#include "text.h"
|
||||
|
||||
/*
 * Type-tagged wrapper around a concrete dictionary backend.
 * `type` records which backend produced `dict` and selects the text or
 * datrie implementation in every dispatching function of this file;
 * `dict` is the handle returned by dictionary_text_open() or
 * dictionary_datrie_open().
 */
struct _dictionary {
|
||||
opencc_dictionary_type type;
|
||||
dictionary_t dict;
|
||||
};
|
||||
typedef struct _dictionary dictionary_desc;
|
||||
|
||||
/*
 * Open a dictionary file with the backend selected by `type` and wrap the
 * backend handle in a type-tagged descriptor.
 *
 * filename: path passed unchanged to the backend opener.
 * type:     OPENCC_DICTIONARY_TYPE_TEXT or OPENCC_DICTIONARY_TYPE_DATRIE.
 *
 * Returns the wrapper handle (release it with dictionary_close()), or
 * (dictionary_t)-1 when the type is not supported, the wrapper cannot be
 * allocated, or the backend opener itself reports (dictionary_t)-1.
 */
dictionary_t dictionary_open(const char *filename, opencc_dictionary_type type) {
    dictionary_desc *dictionary = (dictionary_desc *)malloc(sizeof(dictionary_desc));
    if (dictionary == NULL)
        return (dictionary_t)-1;

    dictionary->type = type;
    switch (type) {
    case OPENCC_DICTIONARY_TYPE_TEXT:
        dictionary->dict = dictionary_text_open(filename);
        break;
    case OPENCC_DICTIONARY_TYPE_DATRIE:
        dictionary->dict = dictionary_datrie_open(filename);
        break;
    default:
        /* TODO: dictionary format not supported */
        free(dictionary);
        return (dictionary_t)-1;
    }

    /*
     * Both backends signal failure with (dictionary_t)-1. Propagate it rather
     * than handing out a wrapper whose inner handle is invalid: such a wrapper
     * would crash on first use and be leaked by callers testing for -1.
     */
    if (dictionary->dict == (dictionary_t)-1) {
        free(dictionary);
        return (dictionary_t)-1;
    }

    return dictionary;
}
|
||||
|
||||
/*
 * Return the backend handle stored inside a wrapper created by
 * dictionary_open(). The wrapper itself is not modified.
 */
dictionary_t dictionary_get(dictionary_t t_dictionary) {
    const dictionary_desc *wrapper = (const dictionary_desc *)t_dictionary;
    dictionary_t backend = wrapper->dict;

    return backend;
}
|
||||
|
||||
/*
 * Close the backend dictionary held by the wrapper (dispatching on its type
 * tag) and then free the wrapper itself.
 */
void dictionary_close(dictionary_t t_dictionary) {
    dictionary_desc *wrapper = (dictionary_desc *)t_dictionary;
    opencc_dictionary_type kind = wrapper->type;

    if (kind == OPENCC_DICTIONARY_TYPE_TEXT) {
        dictionary_text_close(wrapper->dict);
    } else if (kind == OPENCC_DICTIONARY_TYPE_DATRIE) {
        dictionary_datrie_close(wrapper->dict);
    } else {
        debug_should_not_be_here();
    }

    free(wrapper);
}
|
||||
|
||||
/*
 * Forward a longest-prefix lookup to the backend selected by the wrapper's
 * type tag and return whatever the backend returns.
 *
 * For an unknown type tag, debug_should_not_be_here() is invoked and the
 * sentinel (const ucs4_t *const *)-1 is returned.
 */
const ucs4_t *const *dictionary_match_longest(dictionary_t t_dictionary, const ucs4_t *word, size_t maxlen, size_t *match_length) {
    dictionary_desc *wrapper = (dictionary_desc *)t_dictionary;

    if (wrapper->type == OPENCC_DICTIONARY_TYPE_TEXT)
        return dictionary_text_match_longest(wrapper->dict, word, maxlen, match_length);

    if (wrapper->type == OPENCC_DICTIONARY_TYPE_DATRIE)
        return dictionary_datrie_match_longest(wrapper->dict, word, maxlen, match_length);

    debug_should_not_be_here();
    return (const ucs4_t *const *)-1;
}
|
||||
|
||||
/*
 * Forward an "all matching prefix lengths" query to the backend selected by
 * the wrapper's type tag; the backend fills `match_length` and its count is
 * returned unchanged.
 *
 * For an unknown type tag, debug_should_not_be_here() is invoked and
 * (size_t)-1 is returned.
 */
size_t dictionary_get_all_match_lengths(dictionary_t t_dictionary, const ucs4_t *word, size_t *match_length) {
    dictionary_desc *wrapper = (dictionary_desc *)t_dictionary;

    if (wrapper->type == OPENCC_DICTIONARY_TYPE_TEXT)
        return dictionary_text_get_all_match_lengths(wrapper->dict, word, match_length);

    if (wrapper->type == OPENCC_DICTIONARY_TYPE_DATRIE)
        return dictionary_datrie_get_all_match_lengths(wrapper->dict, word, match_length);

    debug_should_not_be_here();
    return (size_t)-1;
}
|
||||
45
internal/cpp/opencc/dictionary/abstract.h
Normal file
45
internal/cpp/opencc/dictionary/abstract.h
Normal file
@ -0,0 +1,45 @@
|
||||
/*
|
||||
* Open Chinese Convert
|
||||
*
|
||||
* Copyright 2010 BYVoid <byvoid.kcp@gmail.com>
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef __OPENCC_DICTIONARY_ABSTRACT_H_
|
||||
#define __OPENCC_DICTIONARY_ABSTRACT_H_
|
||||
|
||||
#include "../utils.h"
|
||||
|
||||
/*
 * One dictionary record: a key string and its candidate replacement strings.
 * NOTE(review): the text backend stores `key` as a zero-terminated ucs4_t
 * string and `value` as a NULL-terminated array of such strings -- confirm
 * that every producer of `entry` follows the same layout.
 */
struct _entry
|
||||
{
|
||||
ucs4_t * key;
|
||||
ucs4_t ** value;
|
||||
};
|
||||
typedef struct _entry entry;
|
||||
|
||||
typedef void * dictionary_t;
|
||||
|
||||
dictionary_t dictionary_open(const char * filename, opencc_dictionary_type type);
|
||||
|
||||
void dictionary_close(dictionary_t t_dictionary);
|
||||
|
||||
dictionary_t dictionary_get(dictionary_t t_dictionary);
|
||||
|
||||
const ucs4_t * const * dictionary_match_longest(dictionary_t t_dictionary, const ucs4_t * word,
|
||||
size_t maxlen, size_t * match_length);
|
||||
|
||||
size_t dictionary_get_all_match_lengths(dictionary_t t_dictionary, const ucs4_t * word,
|
||||
size_t * match_length);
|
||||
|
||||
#endif /* __OPENCC_DICTIONARY_ABSTRACT_H_ */
|
||||
250
internal/cpp/opencc/dictionary/datrie.c
Normal file
250
internal/cpp/opencc/dictionary/datrie.c
Normal file
@ -0,0 +1,250 @@
|
||||
/*
|
||||
* Open Chinese Convert
|
||||
*
|
||||
* Copyright 2010 BYVoid <byvoid.kcp@gmail.com>
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "datrie.h"
|
||||
#include <fcntl.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#ifdef __WIN32
|
||||
/* Todo: Win32 mmap*/
|
||||
#else
|
||||
#include <sys/mman.h>
|
||||
#define MMAP_ENABLED
|
||||
#endif
|
||||
|
||||
typedef enum { MEMORY_TYPE_MMAP, MEMORY_TYPE_ALLOCATE } memory_type;
|
||||
|
||||
/*
 * In-memory state of a loaded double-array-trie dictionary.
 *
 * dat, lexicon      point into the file image held in dic_memory (set up by
 *                   load_dict()); they are not separately allocated.
 * dat_item_count    number of trie items, used as the upper bound for node
 *                   indices during matching.
 * lexicon_count     number of rows in lexicon_set.
 * lexicon_set       heap-allocated table; each row is a NULL-terminated array
 *                   of pointers into lexicon. Freed by unload_dict().
 * dic_memory        the file image, obtained via mmap() or malloc().
 * dic_size          size of the file image in bytes.
 * dic_memory_type   records which of the two was used, so unload_dict() can
 *                   release dic_memory with munmap() or free().
 */
struct _datrie_dictionary {
|
||||
const DoubleArrayTrieItem *dat;
|
||||
uint32_t dat_item_count;
|
||||
ucs4_t *lexicon;
|
||||
uint32_t lexicon_count;
|
||||
|
||||
ucs4_t ***lexicon_set;
|
||||
void *dic_memory;
|
||||
size_t dic_size;
|
||||
memory_type dic_memory_type;
|
||||
};
|
||||
typedef struct _datrie_dictionary datrie_dictionary_desc;
|
||||
|
||||
/*
 * Load the whole dictionary file into a malloc'ed buffer.
 *
 * Expects datrie_dictionary->dic_size to hold the file size already. On
 * success dic_memory points at a buffer containing exactly dic_size bytes
 * read from the start of `fd` and 0 is returned.
 *
 * Returns -1 when the buffer cannot be allocated, the descriptor cannot be
 * rewound, or fewer than dic_size bytes can be read. On every failure path
 * dic_memory is left NULL, so no partially filled buffer is exposed.
 */
static int load_allocate(datrie_dictionary_desc *datrie_dictionary, int fd) {
    datrie_dictionary->dic_memory_type = MEMORY_TYPE_ALLOCATE;
    datrie_dictionary->dic_memory = malloc(datrie_dictionary->dic_size);
    if (datrie_dictionary->dic_memory == NULL) {
        /* memory allocation failed */
        return -1;
    }

    int ok = (lseek(fd, 0, SEEK_SET) != (off_t)-1);
    char *cursor = (char *)datrie_dictionary->dic_memory;
    size_t remaining = datrie_dictionary->dic_size;

    /* read() may legitimately return fewer bytes than requested: keep going. */
    while (ok && remaining > 0) {
        ssize_t got = read(fd, cursor, remaining);
        if (got <= 0) {
            /* read error, or end of file before dic_size bytes arrived */
            ok = 0;
        } else {
            cursor += got;
            remaining -= (size_t)got;
        }
    }

    if (!ok) {
        free(datrie_dictionary->dic_memory);
        datrie_dictionary->dic_memory = NULL;
        return -1;
    }
    return 0;
}
|
||||
|
||||
/*
 * Map the dictionary file read-only into memory.
 *
 * Expects datrie_dictionary->dic_size to hold the file size. On success
 * dic_memory points at the private mapping and 0 is returned; if the mapping
 * cannot be created dic_memory is set to NULL and -1 is returned. When
 * MMAP_ENABLED is not defined the function always returns -1 and touches
 * nothing.
 */
static int load_mmap(datrie_dictionary_desc *datrie_dictionary, int fd) {
#ifdef MMAP_ENABLED
    void *mapping;

    datrie_dictionary->dic_memory_type = MEMORY_TYPE_MMAP;
    mapping = mmap(NULL, datrie_dictionary->dic_size, PROT_READ, MAP_PRIVATE, fd, 0);
    if (mapping != MAP_FAILED) {
        datrie_dictionary->dic_memory = mapping;
        return 0;
    }

    /* the mapping could not be created */
    datrie_dictionary->dic_memory = NULL;
    return -1;
#else
    return -1;
#endif
}
|
||||
|
||||
static int load_dict(datrie_dictionary_desc *datrie_dictionary, FILE *fp) {
|
||||
int fd = fileno(fp);
|
||||
|
||||
fseek(fp, 0, SEEK_END);
|
||||
datrie_dictionary->dic_size = ftell(fp);
|
||||
|
||||
/* 首先嘗試mmap,如果失敗嘗試申請內存 */
|
||||
if (load_mmap(datrie_dictionary, fd) == -1) {
|
||||
if (load_allocate(datrie_dictionary, fd) == -1) {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
size_t header_len = strlen("OPENCCDATRIE");
|
||||
|
||||
if (strncmp((const char *)datrie_dictionary->dic_memory, "OPENCCDATRIE", header_len) != 0) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
size_t offset = 0;
|
||||
|
||||
offset += header_len * sizeof(char);
|
||||
|
||||
/* 詞彙表 */
|
||||
uint32_t lexicon_length = *((uint32_t *)(datrie_dictionary->dic_memory + offset));
|
||||
offset += sizeof(uint32_t);
|
||||
|
||||
datrie_dictionary->lexicon = (ucs4_t *)(datrie_dictionary->dic_memory + offset);
|
||||
offset += lexicon_length * sizeof(ucs4_t);
|
||||
|
||||
/* 詞彙索引表 */
|
||||
uint32_t lexicon_index_length = *((uint32_t *)(datrie_dictionary->dic_memory + offset));
|
||||
offset += sizeof(uint32_t);
|
||||
|
||||
uint32_t *lexicon_index = (uint32_t *)(datrie_dictionary->dic_memory + offset);
|
||||
offset += lexicon_index_length * sizeof(uint32_t);
|
||||
|
||||
datrie_dictionary->lexicon_count = *((uint32_t *)(datrie_dictionary->dic_memory + offset));
|
||||
offset += sizeof(uint32_t);
|
||||
|
||||
datrie_dictionary->dat_item_count = *((uint32_t *)(datrie_dictionary->dic_memory + offset));
|
||||
offset += sizeof(uint32_t);
|
||||
|
||||
datrie_dictionary->dat = (DoubleArrayTrieItem *)(datrie_dictionary->dic_memory + offset);
|
||||
|
||||
/* 構造索引表 */
|
||||
datrie_dictionary->lexicon_set = (ucs4_t ***)malloc(datrie_dictionary->lexicon_count * sizeof(ucs4_t **));
|
||||
size_t i, last = 0;
|
||||
for (i = 0; i < datrie_dictionary->lexicon_count; i++) {
|
||||
size_t count, j;
|
||||
for (j = last; j < lexicon_index_length; j++) {
|
||||
if (lexicon_index[j] == (uint32_t)-1)
|
||||
break;
|
||||
}
|
||||
count = j - last;
|
||||
|
||||
datrie_dictionary->lexicon_set[i] = (ucs4_t **)malloc((count + 1) * sizeof(ucs4_t *));
|
||||
for (j = 0; j < count; j++) {
|
||||
datrie_dictionary->lexicon_set[i][j] = datrie_dictionary->lexicon + lexicon_index[last + j];
|
||||
}
|
||||
datrie_dictionary->lexicon_set[i][count] = NULL;
|
||||
last += j + 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Release everything load_dict() attached to the descriptor: the rows of
 * lexicon_set, the table itself, and the file image (unmapped or freed
 * according to dic_memory_type). Does nothing when dic_memory is NULL.
 *
 * Returns 0 on success, the result of munmap() for a mapped image, or -1
 * when dic_memory_type holds an unknown value. The descriptor itself is not
 * freed here.
 */
static int unload_dict(datrie_dictionary_desc *datrie_dictionary) {
    if (datrie_dictionary->dic_memory == NULL)
        return 0;

    size_t row = 0;
    while (row < datrie_dictionary->lexicon_count) {
        free(datrie_dictionary->lexicon_set[row]);
        row++;
    }
    free(datrie_dictionary->lexicon_set);

    switch (datrie_dictionary->dic_memory_type) {
    case MEMORY_TYPE_MMAP:
#ifdef MMAP_ENABLED
        return munmap(datrie_dictionary->dic_memory, datrie_dictionary->dic_size);
#else
        debug_should_not_be_here();
        break;
#endif
    case MEMORY_TYPE_ALLOCATE:
        free(datrie_dictionary->dic_memory);
        break;
    default:
        return -1;
    }

    return 0;
}
|
||||
|
||||
dictionary_t dictionary_datrie_open(const char *filename) {
|
||||
datrie_dictionary_desc *datrie_dictionary = (datrie_dictionary_desc *)malloc(sizeof(datrie_dictionary_desc));
|
||||
datrie_dictionary->dat = NULL;
|
||||
datrie_dictionary->lexicon = NULL;
|
||||
|
||||
FILE *fp = fopen(filename, "rb");
|
||||
|
||||
if (load_dict(datrie_dictionary, fp) == -1) {
|
||||
dictionary_datrie_close((dictionary_t)datrie_dictionary);
|
||||
return (dictionary_t)-1;
|
||||
}
|
||||
|
||||
fclose(fp);
|
||||
|
||||
return (dictionary_t)datrie_dictionary;
|
||||
}
|
||||
|
||||
/*
 * Release a datrie dictionary: unload its contents, then free the
 * descriptor. The descriptor is freed even when unloading fails.
 *
 * Returns 0 on success, -1 when unload_dict() reported -1.
 */
int dictionary_datrie_close(dictionary_t t_dictionary) {
    datrie_dictionary_desc *desc = (datrie_dictionary_desc *)t_dictionary;
    int result = (unload_dict(desc) == -1) ? -1 : 0;

    free(desc);
    return result;
}
|
||||
|
||||
/* Convert a character to the int code used to index trie transitions. */
int encode_char(ucs4_t ch) {
    int code = (int)ch;
    return code;
}
|
||||
|
||||
void datrie_match(const datrie_dictionary_desc *datrie_dictionary, const ucs4_t *word, size_t *match_pos, size_t *id, size_t limit) {
|
||||
size_t i, p;
|
||||
for (i = 0, p = 0; word[p] && (limit == 0 || p < limit) && datrie_dictionary->dat[i].base != DATRIE_UNUSED; p++) {
|
||||
int k = encode_char(word[p]);
|
||||
int j = datrie_dictionary->dat[i].base + k;
|
||||
if (j < 0 || j >= datrie_dictionary->dat_item_count || datrie_dictionary->dat[j].parent != i)
|
||||
break;
|
||||
i = j;
|
||||
}
|
||||
if (match_pos)
|
||||
*match_pos = p;
|
||||
if (id)
|
||||
*id = i;
|
||||
}
|
||||
|
||||
/*
 * Find the longest prefix of `word` (at most `maxlen` characters, 0 meaning
 * unlimited) that is a complete dictionary word.
 *
 * The trie is first walked as far as possible; while the item reached carries
 * no word (word == -1) the walk is repeated with a limit one shorter, down to
 * a single character.
 *
 * Returns the NULL-terminated value row for the matched word and stores the
 * matched length in *match_length (when non-NULL). Returns NULL and stores 0
 * when no prefix matches.
 */
const ucs4_t *const *dictionary_datrie_match_longest(dictionary_t t_dictionary, const ucs4_t *word, size_t maxlen, size_t *match_length) {
    datrie_dictionary_desc *trie = (datrie_dictionary_desc *)t_dictionary;
    size_t matched, node;

    datrie_match(trie, word, &matched, &node, maxlen);
    for (;;) {
        if (trie->dat[node].word != -1 || matched <= 1)
            break;
        /* No word ends here: retry with the prefix shortened by one. */
        datrie_match(trie, word, &matched, &node, matched - 1);
    }

    int word_id = trie->dat[node].word;
    int found = (matched != 0 && word_id != -1);

    if (match_length != NULL)
        *match_length = found ? matched : 0;

    if (!found)
        return NULL;
    return (const ucs4_t *const *)trie->lexicon_set[word_id];
}
|
||||
|
||||
/*
 * Collect the lengths of all prefixes of `word` that are complete dictionary
 * words, in increasing order.
 *
 * match_length: caller-provided output array; one length is written for each
 *               trie item along the path that carries a word (word != -1).
 *
 * Returns the number of lengths written.
 */
size_t dictionary_datrie_get_all_match_lengths(dictionary_t t_dictionary, const ucs4_t *word, size_t *match_length) {
    datrie_dictionary_desc *trie = (datrie_dictionary_desc *)t_dictionary;
    size_t hits = 0;
    size_t node = 0;
    size_t depth = 0;

    while (word[depth] && trie->dat[node].base != DATRIE_UNUSED) {
        int code = encode_char(word[depth]);
        int child = trie->dat[node].base + code;
        if (child < 0 || child >= trie->dat_item_count || trie->dat[child].parent != node)
            break;

        node = child;
        depth++;

        /* `depth` characters have been consumed once this item is reached. */
        if (trie->dat[node].word != -1)
            match_length[hits++] = depth;
    }

    return hits;
}
|
||||
45
internal/cpp/opencc/dictionary/datrie.h
Normal file
45
internal/cpp/opencc/dictionary/datrie.h
Normal file
@ -0,0 +1,45 @@
|
||||
/*
|
||||
* Open Chinese Convert
|
||||
*
|
||||
* Copyright 2010 BYVoid <byvoid.kcp@gmail.com>
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef __OPENCC_DICTIONARY_DATRIE_H_
|
||||
#define __OPENCC_DICTIONARY_DATRIE_H_
|
||||
|
||||
#include "abstract.h"
|
||||
|
||||
/* Marker stored in DoubleArrayTrieItem.base for an item with no transitions. */
#define DATRIE_UNUSED (-1)
|
||||
|
||||
/*
 * One item of the double-array trie.
 * base:   added to an encoded character to obtain the index of the child
 *         item; DATRIE_UNUSED stops the walk at this item.
 * parent: index of the item this one was reached from; a transition is only
 *         accepted when the child's parent equals the current item.
 * word:   index of the row in the lexicon table for the word ending at this
 *         item, or -1 when no word ends here.
 */
typedef struct
|
||||
{
|
||||
int base;
|
||||
|
||||
int parent;
|
||||
|
||||
int word;
|
||||
|
||||
} DoubleArrayTrieItem;
|
||||
|
||||
dictionary_t dictionary_datrie_open(const char * filename);
|
||||
|
||||
int dictionary_datrie_close(dictionary_t t_dictionary);
|
||||
|
||||
const ucs4_t * const * dictionary_datrie_match_longest(dictionary_t t_dictionary, const ucs4_t * word,
|
||||
size_t maxlen, size_t * match_length);
|
||||
|
||||
size_t dictionary_datrie_get_all_match_lengths(dictionary_t t_dictionary, const ucs4_t * word,
|
||||
size_t * match_length);
|
||||
|
||||
int encode_char(ucs4_t ch);
|
||||
|
||||
#endif /* __OPENCC_DICTIONARY_DATRIE_H_ */
|
||||
232
internal/cpp/opencc/dictionary/text.c
Normal file
232
internal/cpp/opencc/dictionary/text.c
Normal file
@ -0,0 +1,232 @@
|
||||
/*
|
||||
* Open Chinese Convert
|
||||
*
|
||||
* Copyright 2010 BYVoid <byvoid.kcp@gmail.com>
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "text.h"
|
||||
#include "../encoding.h"
|
||||
|
||||
/* Initial capacity, in entries, of the growable lexicon array. */
#define INITIAL_DICTIONARY_SIZE 1024

/* Size in bytes of the buffer used to read one line of a text dictionary. */
#define ENTRY_BUFF_SIZE 128

/* Parenthesised so the macro expands safely inside larger expressions. */
#define ENTRY_WBUFF_SIZE (ENTRY_BUFF_SIZE / sizeof(size_t))
|
||||
|
||||
/*
 * In-memory state of a text dictionary.
 * entry_count: number of entries in `lexicon`.
 * max_length:  length, in characters, of the longest key.
 * lexicon:     heap-allocated array of entries, sorted by key once loading
 *              has finished so it can be searched with bsearch().
 * word_buff:   scratch buffer of max_length + 1 characters used by the match
 *              functions to build lookup keys; because it is shared, lookups
 *              on the same dictionary are not reentrant.
 */
struct _text_dictionary {
|
||||
size_t entry_count;
|
||||
size_t max_length;
|
||||
entry *lexicon;
|
||||
ucs4_t *word_buff;
|
||||
};
|
||||
typedef struct _text_dictionary text_dictionary_desc;
|
||||
|
||||
int qsort_entry_cmp(const void *a, const void *b) { return ucs4cmp(((entry *)a)->key, ((entry *)b)->key); }
|
||||
|
||||
/*
 * Undo a partially parsed entry: free the first `value_count` value strings,
 * the value array and the key.
 */
static void parse_entry_rollback(entry *entry_i, size_t value_count) {
    size_t i;
    for (i = 0; i < value_count; i++)
        free(entry_i->value[i]);
    free(entry_i->value);
    free(entry_i->key);
}

/*
 * Parse one line of a text dictionary into `entry_i`.
 *
 * Line format: the key, a TAB, then one or more values separated by single
 * spaces, ended by a newline or the end of the string. Key and values are
 * converted from UTF-8 with utf8_to_ucs4().
 *
 * On success entry_i->key is a heap-allocated string and entry_i->value a
 * heap-allocated, NULL-terminated array of heap-allocated strings, sized to
 * the number of values actually found; 0 is returned.
 *
 * Returns -1 when the line has no TAB, a conversion fails, or memory runs
 * out. Nothing allocated here is left behind in that case and the contents
 * of *entry_i must not be used.
 */
int parse_entry(const char *buff, entry *entry_i) {
    enum { INITIAL_VALUE_CAPACITY = 8 };
    size_t length;
    const char *pbuff;

    /* Parse the key */
    for (pbuff = buff; *pbuff != '\t' && *pbuff != '\0'; ++pbuff)
        ;
    if (*pbuff == '\0')
        return -1;
    length = (size_t)(pbuff - buff);

    ucs4_t *ucs4_buff;
    ucs4_buff = utf8_to_ucs4(buff, length);
    if (ucs4_buff == (ucs4_t *)-1)
        return -1;
    /* The byte length is an upper bound for the number of characters. */
    entry_i->key = (ucs4_t *)malloc((length + 1) * sizeof(ucs4_t));
    if (entry_i->key == NULL) {
        free(ucs4_buff);
        return -1;
    }
    ucs4cpy(entry_i->key, ucs4_buff);
    free(ucs4_buff);

    /* Parse the values */
    size_t value_i, value_count = INITIAL_VALUE_CAPACITY;
    entry_i->value = (ucs4_t **)malloc(value_count * sizeof(ucs4_t *));
    if (entry_i->value == NULL) {
        free(entry_i->key);
        return -1;
    }

    for (value_i = 0; *pbuff != '\0' && *pbuff != '\n'; ++value_i) {
        /* Always keep one slot spare for the terminating NULL. */
        if (value_i + 1 >= value_count) {
            size_t grown_count = value_count * 2;
            ucs4_t **grown = (ucs4_t **)realloc(entry_i->value, grown_count * sizeof(ucs4_t *));
            if (grown == NULL) {
                parse_entry_rollback(entry_i, value_i);
                return -1;
            }
            entry_i->value = grown;
            value_count = grown_count;
        }

        /* Step over the separator (TAB or space) and scan one value. */
        for (buff = ++pbuff; *pbuff != ' ' && *pbuff != '\0' && *pbuff != '\n'; ++pbuff)
            ;
        length = (size_t)(pbuff - buff);
        ucs4_buff = utf8_to_ucs4(buff, length);
        if (ucs4_buff == (ucs4_t *)-1) {
            /* An error occurred: roll back the allocations made so far. */
            parse_entry_rollback(entry_i, value_i);
            return -1;
        }

        entry_i->value[value_i] = (ucs4_t *)malloc((length + 1) * sizeof(ucs4_t));
        if (entry_i->value[value_i] == NULL) {
            free(ucs4_buff);
            parse_entry_rollback(entry_i, value_i);
            return -1;
        }
        ucs4cpy(entry_i->value[value_i], ucs4_buff);
        free(ucs4_buff);
    }

    entry_i->value[value_i] = NULL;

    /* Give back unused slots; if shrinking fails the larger array stays valid. */
    ucs4_t **trimmed = (ucs4_t **)realloc(entry_i->value, (value_i + 1) * sizeof(ucs4_t *));
    if (trimmed != NULL)
        entry_i->value = trimmed;

    return 0;
}
|
||||
|
||||
dictionary_t dictionary_text_open(const char *filename) {
|
||||
text_dictionary_desc *text_dictionary;
|
||||
text_dictionary = (text_dictionary_desc *)malloc(sizeof(text_dictionary_desc));
|
||||
text_dictionary->entry_count = INITIAL_DICTIONARY_SIZE;
|
||||
text_dictionary->max_length = 0;
|
||||
text_dictionary->lexicon = (entry *)malloc(sizeof(entry) * text_dictionary->entry_count);
|
||||
text_dictionary->word_buff = NULL;
|
||||
|
||||
static char buff[ENTRY_BUFF_SIZE];
|
||||
|
||||
FILE *fp = fopen(filename, "rb");
|
||||
if (fp == NULL) {
|
||||
dictionary_text_close((dictionary_t)text_dictionary);
|
||||
return (dictionary_t)-1;
|
||||
}
|
||||
|
||||
size_t i = 0;
|
||||
while (fgets(buff, ENTRY_BUFF_SIZE, fp)) {
|
||||
if (i >= text_dictionary->entry_count) {
|
||||
text_dictionary->entry_count += text_dictionary->entry_count;
|
||||
text_dictionary->lexicon = (entry *)realloc(text_dictionary->lexicon, sizeof(entry) * text_dictionary->entry_count);
|
||||
}
|
||||
|
||||
if (parse_entry(buff, text_dictionary->lexicon + i) == -1) {
|
||||
text_dictionary->entry_count = i;
|
||||
dictionary_text_close((dictionary_t)text_dictionary);
|
||||
return (dictionary_t)-1;
|
||||
}
|
||||
|
||||
size_t length = ucs4len(text_dictionary->lexicon[i].key);
|
||||
if (length > text_dictionary->max_length)
|
||||
text_dictionary->max_length = length;
|
||||
|
||||
i++;
|
||||
}
|
||||
|
||||
fclose(fp);
|
||||
|
||||
text_dictionary->entry_count = i;
|
||||
text_dictionary->lexicon = (entry *)realloc(text_dictionary->lexicon, sizeof(entry) * text_dictionary->entry_count);
|
||||
text_dictionary->word_buff = (ucs4_t *)malloc(sizeof(ucs4_t) * (text_dictionary->max_length + 1));
|
||||
|
||||
qsort(text_dictionary->lexicon, text_dictionary->entry_count, sizeof(text_dictionary->lexicon[0]), qsort_entry_cmp);
|
||||
|
||||
return (dictionary_t)text_dictionary;
|
||||
}
|
||||
|
||||
/*
 * Free a text dictionary: every entry's key, value strings and value array,
 * then the lexicon array, the scratch buffer and the descriptor itself.
 * Each entry's value array must be NULL-terminated.
 */
void dictionary_text_close(dictionary_t t_dictionary) {
    text_dictionary_desc *dict = (text_dictionary_desc *)t_dictionary;
    size_t n = 0;

    while (n < dict->entry_count) {
        entry *item = &dict->lexicon[n];
        size_t v;

        free(item->key);
        for (v = 0; item->value[v] != NULL; v++)
            free(item->value[v]);
        free(item->value);

        n++;
    }

    free(dict->lexicon);
    free(dict->word_buff);
    free(dict);
}
|
||||
|
||||
/*
 * Find the longest prefix of `word` that is a key of the dictionary.
 *
 * maxlen:       maximum prefix length to consider; 0 means the whole word.
 * match_length: if non-NULL, receives the length of the matched prefix, or 0
 *               when nothing matches (including when the dictionary is empty).
 *
 * Candidate prefixes are tried from the longest (bounded by the longest key)
 * down to one character, each with a binary search over the sorted lexicon.
 * The dictionary's shared scratch buffer is used to build the candidates.
 *
 * Returns the NULL-terminated value array of the matching entry, or NULL.
 */
const ucs4_t *const *dictionary_text_match_longest(dictionary_t t_dictionary, const ucs4_t *word, size_t maxlen, size_t *match_length) {
    text_dictionary_desc *text_dictionary = (text_dictionary_desc *)t_dictionary;

    if (text_dictionary->entry_count == 0) {
        /* Report "no match" explicitly so the output is never left unset. */
        if (match_length != NULL)
            *match_length = 0;
        return NULL;
    }

    if (maxlen == 0)
        maxlen = ucs4len(word);
    size_t len = text_dictionary->max_length;
    if (maxlen < len)
        len = maxlen;

    ucs4ncpy(text_dictionary->word_buff, word, len);
    text_dictionary->word_buff[len] = L'\0';

    entry buff;
    buff.key = text_dictionary->word_buff;

    for (; len > 0; len--) {
        /* Truncate the candidate to `len` characters before searching. */
        text_dictionary->word_buff[len] = L'\0';
        entry *brs =
            (entry *)bsearch(&buff, text_dictionary->lexicon, text_dictionary->entry_count, sizeof(text_dictionary->lexicon[0]), qsort_entry_cmp);

        if (brs != NULL) {
            if (match_length != NULL)
                *match_length = len;
            return (const ucs4_t *const *)brs->value;
        }
    }

    if (match_length != NULL)
        *match_length = 0;
    return NULL;
}
|
||||
|
||||
/*
 * Collect the lengths of all prefixes of `word` that are keys of the
 * dictionary, from the longest candidate down to one character.
 *
 * match_length: caller-provided output array receiving the lengths in
 *               decreasing order.
 *
 * Returns the number of lengths written (0 for an empty dictionary).
 */
size_t dictionary_text_get_all_match_lengths(dictionary_t t_dictionary, const ucs4_t *word, size_t *match_length) {
    text_dictionary_desc *dict = (text_dictionary_desc *)t_dictionary;
    size_t found = 0;

    if (dict->entry_count == 0)
        return found;

    /* No prefix longer than the longest key can match. */
    size_t word_len = ucs4len(word);
    size_t probe_len = (word_len < dict->max_length) ? word_len : dict->max_length;

    ucs4ncpy(dict->word_buff, word, probe_len);
    dict->word_buff[probe_len] = L'\0';

    entry probe;
    probe.key = dict->word_buff;

    while (probe_len > 0) {
        dict->word_buff[probe_len] = L'\0';
        entry *hit = (entry *)bsearch(&probe, dict->lexicon, dict->entry_count, sizeof(dict->lexicon[0]), qsort_entry_cmp);

        if (hit != NULL)
            match_length[found++] = probe_len;

        probe_len--;
    }

    return found;
}
|
||||
|
||||
/*
 * Copy the key and value pointers of every entry into the caller-provided
 * array `lexicon`, which must have room for all entries. Only the pointers
 * are copied; the strings stay owned by the dictionary.
 *
 * Returns the number of entries.
 */
size_t dictionary_text_get_lexicon(dictionary_t t_dictionary, entry *lexicon) {
    text_dictionary_desc *dict = (text_dictionary_desc *)t_dictionary;
    const entry *src = dict->lexicon;
    size_t n = 0;

    while (n < dict->entry_count) {
        lexicon[n].key = src[n].key;
        lexicon[n].value = src[n].value;
        n++;
    }

    return dict->entry_count;
}
|
||||
36
internal/cpp/opencc/dictionary/text.h
Normal file
36
internal/cpp/opencc/dictionary/text.h
Normal file
@ -0,0 +1,36 @@
|
||||
/*
|
||||
* Open Chinese Convert
|
||||
*
|
||||
* Copyright 2010 BYVoid <byvoid.kcp@gmail.com>
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef __OPENCC_DICTIONARY_TEXT_H_
|
||||
#define __OPENCC_DICTIONARY_TEXT_H_
|
||||
|
||||
#include "abstract.h"
|
||||
|
||||
dictionary_t dictionary_text_open(const char * filename);
|
||||
|
||||
void dictionary_text_close(dictionary_t t_dictionary);
|
||||
|
||||
const ucs4_t * const * dictionary_text_match_longest(dictionary_t t_dictionary, const ucs4_t * word,
|
||||
size_t maxlen, size_t * match_length);
|
||||
|
||||
size_t dictionary_text_get_all_match_lengths(dictionary_t t_dictionary, const ucs4_t * word,
|
||||
size_t * match_length);
|
||||
|
||||
size_t dictionary_text_get_lexicon(dictionary_t t_dictionary, entry * lexicon);
|
||||
|
||||
#endif /* __OPENCC_DICTIONARY_TEXT_H_ */
|
||||
177
internal/cpp/opencc/dictionary_group.c
Normal file
177
internal/cpp/opencc/dictionary_group.c
Normal file
@ -0,0 +1,177 @@
|
||||
/*
|
||||
* Open Chinese Convert
|
||||
*
|
||||
* Copyright 2010 BYVoid <byvoid.kcp@gmail.com>
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "dictionary_group.h"
|
||||
|
||||
#define DICTIONARY_MAX_COUNT 128
|
||||
|
||||
/*
 * A group of dictionaries that are queried together.
 * count: number of slots of `dicts` in use.
 * dicts: handles returned by dictionary_open(), in load order; the array has
 *        a fixed capacity of DICTIONARY_MAX_COUNT.
 */
struct _dictionary_group {
|
||||
size_t count;
|
||||
dictionary_t dicts[DICTIONARY_MAX_COUNT];
|
||||
};
|
||||
typedef struct _dictionary_group dictionary_group_desc;
|
||||
|
||||
static dictionary_error errnum = DICTIONARY_ERROR_VOID;
|
||||
|
||||
dictionary_group_t dictionary_group_open(void) {
|
||||
dictionary_group_desc *dictionary_group = (dictionary_group_desc *)malloc(sizeof(dictionary_group_desc));
|
||||
|
||||
dictionary_group->count = 0;
|
||||
|
||||
return dictionary_group;
|
||||
}
|
||||
|
||||
/*
 * Close every dictionary in the group, in load order, then free the group.
 */
void dictionary_group_close(dictionary_group_t t_dictionary) {
    dictionary_group_desc *group = (dictionary_group_desc *)t_dictionary;
    size_t slot = 0;

    while (slot < group->count) {
        dictionary_close(group->dicts[slot]);
        slot++;
    }

    free(group);
}
|
||||
|
||||
/*
 * Load a dictionary file and append it to the group.
 *
 * filename:  dictionary path; tried as given first.
 * home_path: directory tried as "<home_path>/<filename>" when the plain path
 *            cannot be opened; may be NULL to disable the fallback.
 * type:      dictionary format passed on to dictionary_open().
 *
 * Returns 0 on success. Returns -1 and sets the module error code to
 *   DICTIONARY_ERROR_INVALID_INDEX           when the group already holds
 *                                            DICTIONARY_MAX_COUNT dictionaries,
 *   DICTIONARY_ERROR_CANNOT_ACCESS_DICTFILE  when neither path can be opened
 *                                            (or the fallback path cannot be
 *                                            built),
 *   DICTIONARY_ERROR_INVALID_DICT            when dictionary_open() fails.
 */
int dictionary_group_load(dictionary_group_t t_dictionary, const char *filename, const char *home_path, opencc_dictionary_type type) {
    dictionary_group_desc *dictionary_group = (dictionary_group_desc *)t_dictionary;
    dictionary_t dictionary;

    /* Refuse to write past the end of the fixed-size dicts array. */
    if (dictionary_group->count >= DICTIONARY_MAX_COUNT) {
        errnum = DICTIONARY_ERROR_INVALID_INDEX;
        return -1;
    }

    /* The stream is only used to probe readability; close it before opening. */
    FILE *fp = fopen(filename, "rb");
    if (fp) {
        fclose(fp);
        dictionary = dictionary_open(filename, type);
    } else {
        if (home_path == NULL) {
            errnum = DICTIONARY_ERROR_CANNOT_ACCESS_DICTFILE;
            return -1;
        }

        /* +2: one byte for the '/' separator, one for the terminating NUL. */
        size_t new_filename_size = strlen(filename) + strlen(home_path) + 2;
        char *new_filename = (char *)malloc(sizeof(char) * new_filename_size);
        if (new_filename == NULL) {
            errnum = DICTIONARY_ERROR_CANNOT_ACCESS_DICTFILE;
            return -1;
        }
        snprintf(new_filename, new_filename_size, "%s/%s", home_path, filename);

        fp = fopen(new_filename, "rb");
        if (!fp) {
            free(new_filename);
            errnum = DICTIONARY_ERROR_CANNOT_ACCESS_DICTFILE;
            return -1;
        }
        fclose(fp);

        dictionary = dictionary_open(new_filename, type);
        free(new_filename);
    }

    if (dictionary == (dictionary_t)-1) {
        errnum = DICTIONARY_ERROR_INVALID_DICT;
        return -1;
    }
    dictionary_group->dicts[dictionary_group->count++] = dictionary;
    return 0;
}
|
||||
|
||||
/*
 * Return the dictionary stored at position `index` of the group.
 *
 * Returns (dictionary_t)-1 and sets the module error code to
 * DICTIONARY_ERROR_INVALID_INDEX when `index` is not below the number of
 * loaded dictionaries.
 */
dictionary_t dictionary_group_get_dictionary(dictionary_group_t t_dictionary, size_t index) {
    dictionary_group_desc *group = (dictionary_group_desc *)t_dictionary;

    if (index < group->count)
        return group->dicts[index];

    errnum = DICTIONARY_ERROR_INVALID_INDEX;
    return (dictionary_t)-1;
}
|
||||
|
||||
/* Return the number of dictionaries currently loaded in the group. */
size_t dictionary_group_count(dictionary_group_t t_dictionary) {
    const dictionary_group_desc *group = (const dictionary_group_desc *)t_dictionary;
    size_t loaded = group->count;

    return loaded;
}
|
||||
|
||||
/*
 * Query every dictionary of the group for its longest match of `word` and
 * return the result with the greatest length; on equal lengths the earlier
 * dictionary wins.
 *
 * match_length: if non-NULL, receives the winning length (0 when nothing
 *               matched).
 *
 * Returns the value array of the best match, NULL when no dictionary
 * matches, or (const ucs4_t *const *)-1 with the module error code set to
 * DICTIONARY_ERROR_NODICT when the group is empty.
 */
const ucs4_t *const *dictionary_group_match_longest(dictionary_group_t t_dictionary, const ucs4_t *word, size_t maxlen, size_t *match_length) {
    dictionary_group_desc *group = (dictionary_group_desc *)t_dictionary;

    if (group->count == 0) {
        errnum = DICTIONARY_ERROR_NODICT;
        return (const ucs4_t *const *)-1;
    }

    const ucs4_t *const *best = NULL;
    size_t best_length = 0;
    size_t slot;

    /* Look the word up in each dictionary in turn, keeping the longest match. */
    for (slot = 0; slot < group->count; slot++) {
        size_t current_length;
        const ucs4_t *const *hit = dictionary_match_longest(group->dicts[slot], word, maxlen, &current_length);

        if (hit == NULL || current_length <= best_length)
            continue;

        best_length = current_length;
        best = hit;
    }

    if (match_length != NULL)
        *match_length = best_length;

    return best;
}
|
||||
|
||||
/*
 * Collect the distinct prefix lengths of `word` that match in any dictionary
 * of the group.
 *
 * match_length: caller-provided output array. Each dictionary appends its
 *               lengths; from the second dictionary on, the accumulated
 *               lengths are sorted and de-duplicated in place.
 *
 * Returns the number of lengths stored, or (size_t)-1 with the module error
 * code set to DICTIONARY_ERROR_NODICT when the group is empty.
 */
size_t dictionary_group_get_all_match_lengths(dictionary_group_t t_dictionary, const ucs4_t *word, size_t *match_length) {
    dictionary_group_desc *dictionary_group = (dictionary_group_desc *)t_dictionary;

    if (dictionary_group->count == 0) {
        errnum = DICTIONARY_ERROR_NODICT;
        return (size_t)-1;
    }

    size_t rscnt = 0;
    size_t i;
    for (i = 0; i < dictionary_group->count; i++) {
        size_t retval;
        retval = dictionary_get_all_match_lengths(dictionary_group->dicts[i], word, match_length + rscnt);
        rscnt += retval;

        /* Remove duplicate lengths */
        if (i > 0 && rscnt > 1) {
            /*
             * NOTE(review): the array holds size_t elements while the
             * comparator is named qsort_int_cmp -- confirm it compares
             * size_t-sized values on every supported platform.
             */
            qsort(match_length, rscnt, sizeof(match_length[0]), qsort_int_cmp);

            /* size_t indices: they are compared with and assigned to rscnt. */
            size_t j, k;
            for (j = 0, k = 1; k < rscnt; k++) {
                if (match_length[k] != match_length[j])
                    match_length[++j] = match_length[k];
            }
            rscnt = j + 1;
        }
    }
    return rscnt;
}
|
||||
|
||||
/* Return the error code most recently recorded by this module. */
dictionary_error dictionary_errno(void) {
    dictionary_error current = errnum;
    return current;
}
|
||||
|
||||
void dictionary_perror(const char *spec) {
|
||||
perr(spec);
|
||||
perr("\n");
|
||||
switch (errnum) {
|
||||
case DICTIONARY_ERROR_VOID:
|
||||
break;
|
||||
case DICTIONARY_ERROR_NODICT:
|
||||
perr(_("No dictionary loaded"));
|
||||
break;
|
||||
case DICTIONARY_ERROR_CANNOT_ACCESS_DICTFILE:
|
||||
perror(_("Can not open dictionary file"));
|
||||
break;
|
||||
case DICTIONARY_ERROR_INVALID_DICT:
|
||||
perror(_("Invalid dictionary file"));
|
||||
break;
|
||||
case DICTIONARY_ERROR_INVALID_INDEX:
|
||||
perror(_("Invalid dictionary index"));
|
||||
break;
|
||||
default:
|
||||
perr(_("Unknown"));
|
||||
}
|
||||
}
|
||||
57
internal/cpp/opencc/dictionary_group.h
Normal file
57
internal/cpp/opencc/dictionary_group.h
Normal file
@ -0,0 +1,57 @@
|
||||
/*
|
||||
* Open Chinese Convert
|
||||
*
|
||||
* Copyright 2010 BYVoid <byvoid.kcp@gmail.com>
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef __DICTIONARY_GROUP_H_
|
||||
#define __DICTIONARY_GROUP_H_
|
||||
|
||||
#include "utils.h"
|
||||
#include "dictionary/abstract.h"
|
||||
|
||||
typedef void * dictionary_group_t;
|
||||
|
||||
/* Error codes reported by dictionary_errno(). */
typedef enum {
    DICTIONARY_ERROR_VOID = 0,                   /* no error recorded */
    DICTIONARY_ERROR_NODICT = 1,                 /* the group holds no dictionary */
    DICTIONARY_ERROR_CANNOT_ACCESS_DICTFILE = 2, /* dictionary file could not be opened */
    DICTIONARY_ERROR_INVALID_DICT = 3,           /* dictionary file is not valid */
    DICTIONARY_ERROR_INVALID_INDEX = 4,          /* dictionary index is not valid */
} dictionary_error;
|
||||
|
||||
dictionary_group_t dictionary_group_open(void);
|
||||
|
||||
void dictionary_group_close(dictionary_group_t t_dictionary);
|
||||
|
||||
int dictionary_group_load(dictionary_group_t t_dictionary, const char * filename, const char* home_dir,
|
||||
opencc_dictionary_type type);
|
||||
|
||||
const ucs4_t * const * dictionary_group_match_longest(dictionary_group_t t_dictionary, const ucs4_t * word,
|
||||
size_t maxlen, size_t * match_length);
|
||||
|
||||
size_t dictionary_group_get_all_match_lengths(dictionary_group_t t_dictionary, const ucs4_t * word,
|
||||
size_t * match_length);
|
||||
|
||||
dictionary_t dictionary_group_get_dictionary(dictionary_group_t t_dictionary, size_t index);
|
||||
|
||||
size_t dictionary_group_count(dictionary_group_t t_dictionary);
|
||||
|
||||
dictionary_error dictionary_errno(void);
|
||||
|
||||
void dictionary_perror(const char * spec);
|
||||
|
||||
#endif /* __DICTIONARY_GROUP_H_ */
|
||||
73
internal/cpp/opencc/dictionary_set.c
Normal file
73
internal/cpp/opencc/dictionary_set.c
Normal file
@ -0,0 +1,73 @@
|
||||
/*
|
||||
* Open Chinese Convert
|
||||
*
|
||||
* Copyright 2010 BYVoid <byvoid.kcp@gmail.com>
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "dictionary_set.h"
|
||||
|
||||
#define DICTIONARY_GROUP_MAX_COUNT 128
|
||||
|
||||
struct _dictionary_set {
|
||||
size_t count;
|
||||
dictionary_group_t groups[DICTIONARY_GROUP_MAX_COUNT];
|
||||
};
|
||||
typedef struct _dictionary_set dictionary_set_desc;
|
||||
|
||||
dictionary_set_t dictionary_set_open(void) {
|
||||
dictionary_set_desc *dictionary_set = (dictionary_set_desc *)malloc(sizeof(dictionary_set_desc));
|
||||
|
||||
dictionary_set->count = 0;
|
||||
|
||||
return dictionary_set;
|
||||
}
|
||||
|
||||
/* Close every group held by the set, in slot order, then free the set itself. */
void dictionary_set_close(dictionary_set_t t_dictionary) {
    dictionary_set_desc *set = (dictionary_set_desc *)t_dictionary;
    size_t slot = 0;

    while (slot < set->count) {
        dictionary_group_close(set->groups[slot]);
        slot++;
    }

    free(set);
}
|
||||
|
||||
/*
 * Open a new dictionary group, append it to the set and return it.
 *
 * Returns (dictionary_group_t)-1 when all DICTIONARY_GROUP_MAX_COUNT slots
 * are already in use.
 */
dictionary_group_t dictionary_set_new_group(dictionary_set_t t_dictionary) {
    dictionary_set_desc *dictionary_set = (dictionary_set_desc *)t_dictionary;

    /* Refuse only when the array is really full. Testing with `>=` keeps the
     * last slot usable and still stops if the count is ever out of range. */
    if (dictionary_set->count >= DICTIONARY_GROUP_MAX_COUNT) {
        return (dictionary_group_t)-1;
    }

    dictionary_group_t group = dictionary_group_open();
    dictionary_set->groups[dictionary_set->count++] = group;

    return group;
}
|
||||
|
||||
/*
 * Return the group stored at `index`, or (dictionary_group_t)-1 when `index`
 * is not below the number of groups in the set.
 */
dictionary_group_t dictionary_set_get_group(dictionary_set_t t_dictionary, size_t index) {
    dictionary_set_desc *dictionary_set = (dictionary_set_desc *)t_dictionary;

    /* `index` is unsigned, so only the upper bound needs checking. */
    if (index >= dictionary_set->count) {
        return (dictionary_group_t)-1;
    }

    return dictionary_set->groups[index];
}
|
||||
|
||||
/* Return how many groups the set currently holds. */
size_t dictionary_set_count_group(dictionary_set_t t_dictionary) {
    const dictionary_set_desc *set = (const dictionary_set_desc *)t_dictionary;

    return set->count;
}
|
||||
37
internal/cpp/opencc/dictionary_set.h
Normal file
37
internal/cpp/opencc/dictionary_set.h
Normal file
@ -0,0 +1,37 @@
|
||||
/*
|
||||
* Open Chinese Convert
|
||||
*
|
||||
* Copyright 2010 BYVoid <byvoid.kcp@gmail.com>
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef __DICTIONARY_SET_H_
|
||||
#define __DICTIONARY_SET_H_
|
||||
|
||||
#include "utils.h"
|
||||
#include "dictionary_group.h"
|
||||
|
||||
typedef void * dictionary_set_t;
|
||||
|
||||
dictionary_set_t dictionary_set_open(void);
|
||||
|
||||
void dictionary_set_close(dictionary_set_t t_dictionary);
|
||||
|
||||
dictionary_group_t dictionary_set_new_group(dictionary_set_t t_dictionary);
|
||||
|
||||
dictionary_group_t dictionary_set_get_group(dictionary_set_t t_dictionary, size_t index);
|
||||
|
||||
size_t dictionary_set_count_group(dictionary_set_t t_dictionary);
|
||||
|
||||
#endif /* __DICTIONARY_SET_H_ */
|
||||
230
internal/cpp/opencc/encoding.c
Normal file
230
internal/cpp/opencc/encoding.c
Normal file
@ -0,0 +1,230 @@
|
||||
/*
|
||||
* Open Chinese Convert
|
||||
*
|
||||
* Copyright 2010 BYVoid <byvoid.kcp@gmail.com>
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "encoding.h"
|
||||
#include "opencc.h"
|
||||
|
||||
/* Number of elements initially allocated for a conversion output buffer. */
#define INITIAL_BUFF_SIZE 1024
/* Value (0 or 1) of bit `pos` of `byte`. */
#define GET_BIT(byte, pos) (((byte) >> (pos)) & 1)
/* Mask with the low `length` bits set; the argument is parenthesized so that
 * compound expressions expand correctly. */
#define BITMASK(length) ((1 << (length)) - 1)
|
||||
|
||||
ucs4_t *utf8_to_ucs4(const char *utf8, size_t length) {
|
||||
if (length == 0)
|
||||
length = (size_t)-1;
|
||||
size_t i;
|
||||
for (i = 0; i < length && utf8[i] != '\0'; i++)
|
||||
;
|
||||
length = i;
|
||||
|
||||
size_t freesize = INITIAL_BUFF_SIZE;
|
||||
ucs4_t *ucs4 = (ucs4_t *)malloc(sizeof(ucs4_t) * freesize);
|
||||
ucs4_t *pucs4 = ucs4;
|
||||
|
||||
for (i = 0; i < length; i++) {
|
||||
ucs4_t byte[4] = {0};
|
||||
if (GET_BIT(utf8[i], 7) == 0) {
|
||||
/* U-00000000 - U-0000007F */
|
||||
/* 0xxxxxxx */
|
||||
byte[0] = utf8[i] & BITMASK(7);
|
||||
} else if (GET_BIT(utf8[i], 5) == 0) {
|
||||
/* U-00000080 - U-000007FF */
|
||||
/* 110xxxxx 10xxxxxx */
|
||||
if (i + 1 >= length)
|
||||
goto err;
|
||||
|
||||
byte[0] = (utf8[i + 1] & BITMASK(6)) + ((utf8[i] & BITMASK(2)) << 6);
|
||||
byte[1] = (utf8[i] >> 2) & BITMASK(3);
|
||||
|
||||
i += 1;
|
||||
} else if (GET_BIT(utf8[i], 4) == 0) {
|
||||
/* U-00000800 - U-0000FFFF */
|
||||
/* 1110xxxx 10xxxxxx 10xxxxxx */
|
||||
if (i + 2 >= length)
|
||||
goto err;
|
||||
|
||||
byte[0] = (utf8[i + 2] & BITMASK(6)) + ((utf8[i + 1] & BITMASK(2)) << 6);
|
||||
byte[1] = ((utf8[i + 1] >> 2) & BITMASK(4)) + ((utf8[i] & BITMASK(4)) << 4);
|
||||
|
||||
i += 2;
|
||||
} else if (GET_BIT(utf8[i], 3) == 0) {
|
||||
/* U-00010000 - U-001FFFFF */
|
||||
/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
|
||||
if (i + 3 >= length)
|
||||
goto err;
|
||||
|
||||
byte[0] = (utf8[i + 3] & BITMASK(6)) + ((utf8[i + 2] & BITMASK(2)) << 6);
|
||||
byte[1] = ((utf8[i + 2] >> 2) & BITMASK(4)) + ((utf8[i + 1] & BITMASK(4)) << 4);
|
||||
byte[2] = ((utf8[i + 1] >> 4) & BITMASK(2)) + ((utf8[i] & BITMASK(3)) << 2);
|
||||
|
||||
i += 3;
|
||||
} else if (GET_BIT(utf8[i], 2) == 0) {
|
||||
/* U-00200000 - U-03FFFFFF */
|
||||
/* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
|
||||
if (i + 4 >= length)
|
||||
goto err;
|
||||
|
||||
byte[0] = (utf8[i + 4] & BITMASK(6)) + ((utf8[i + 3] & BITMASK(2)) << 6);
|
||||
byte[1] = ((utf8[i + 3] >> 2) & BITMASK(4)) + ((utf8[i + 2] & BITMASK(4)) << 4);
|
||||
byte[2] = ((utf8[i + 2] >> 4) & BITMASK(2)) + ((utf8[i + 1] & BITMASK(6)) << 2);
|
||||
byte[3] = utf8[i] & BITMASK(2);
|
||||
i += 4;
|
||||
} else if (GET_BIT(utf8[i], 2) == 0) {
|
||||
/* U-04000000 - U-7FFFFFFF */
|
||||
/* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
|
||||
if (i + 5 >= length)
|
||||
goto err;
|
||||
|
||||
byte[0] = (utf8[i + 5] & BITMASK(6)) + ((utf8[i + 4] & BITMASK(2)) << 6);
|
||||
byte[1] = ((utf8[i + 4] >> 2) & BITMASK(4)) + ((utf8[i + 3] & BITMASK(4)) << 4);
|
||||
byte[2] = ((utf8[i + 3] >> 4) & BITMASK(2)) + ((utf8[i + 2] & BITMASK(6)) << 2);
|
||||
byte[3] = (utf8[i + 1] & BITMASK(6)) + ((utf8[i] & BITMASK(1)) << 6);
|
||||
i += 5;
|
||||
} else
|
||||
goto err;
|
||||
|
||||
if (freesize == 0) {
|
||||
freesize = pucs4 - ucs4;
|
||||
ucs4 = (ucs4_t *)realloc(ucs4, sizeof(ucs4_t) * (freesize + freesize));
|
||||
pucs4 = ucs4 + freesize;
|
||||
}
|
||||
|
||||
*pucs4 = (byte[3] << 24) + (byte[2] << 16) + (byte[1] << 8) + byte[0];
|
||||
|
||||
pucs4++;
|
||||
freesize--;
|
||||
}
|
||||
|
||||
length = (pucs4 - ucs4 + 1);
|
||||
ucs4 = (ucs4_t *)realloc(ucs4, sizeof(ucs4_t) * length);
|
||||
ucs4[length - 1] = 0;
|
||||
return ucs4;
|
||||
|
||||
err:
|
||||
free(ucs4);
|
||||
return (ucs4_t *)-1;
|
||||
}
|
||||
|
||||
char *ucs4_to_utf8(const ucs4_t *ucs4, size_t length) {
|
||||
if (length == 0)
|
||||
length = (size_t)-1;
|
||||
size_t i;
|
||||
for (i = 0; i < length && ucs4[i] != 0; i++)
|
||||
;
|
||||
length = i;
|
||||
|
||||
size_t freesize = INITIAL_BUFF_SIZE;
|
||||
char *utf8 = (char *)malloc(sizeof(char) * freesize);
|
||||
char *putf8 = utf8;
|
||||
|
||||
for (i = 0; i < length; i++) {
|
||||
if ((ssize_t)freesize - 6 <= 0) {
|
||||
freesize = putf8 - utf8;
|
||||
utf8 = (char *)realloc(utf8, sizeof(char) * (freesize + freesize));
|
||||
putf8 = utf8 + freesize;
|
||||
}
|
||||
|
||||
ucs4_t c = ucs4[i];
|
||||
ucs4_t byte[4] = {(c >> 0) & BITMASK(8), (c >> 8) & BITMASK(8), (c >> 16) & BITMASK(8), (c >> 24) & BITMASK(8)};
|
||||
|
||||
size_t delta = 0;
|
||||
|
||||
if (c <= 0x7F) {
|
||||
/* U-00000000 - U-0000007F */
|
||||
/* 0xxxxxxx */
|
||||
putf8[0] = byte[0] & BITMASK(7);
|
||||
delta = 1;
|
||||
} else if (c <= 0x7FF) {
|
||||
/* U-00000080 - U-000007FF */
|
||||
/* 110xxxxx 10xxxxxx */
|
||||
putf8[1] = 0x80 + (byte[0] & BITMASK(6));
|
||||
putf8[0] = 0xC0 + ((byte[0] >> 6) & BITMASK(2)) + ((byte[1] & BITMASK(3)) << 2);
|
||||
delta = 2;
|
||||
} else if (c <= 0xFFFF) {
|
||||
/* U-00000800 - U-0000FFFF */
|
||||
/* 1110xxxx 10xxxxxx 10xxxxxx */
|
||||
putf8[2] = 0x80 + (byte[0] & BITMASK(6));
|
||||
putf8[1] = 0x80 + ((byte[0] >> 6) & BITMASK(2)) + ((byte[1] & BITMASK(4)) << 2);
|
||||
putf8[0] = 0xE0 + ((byte[1] >> 4) & BITMASK(4));
|
||||
delta = 3;
|
||||
} else if (c <= 0x1FFFFF) {
|
||||
/* U-00010000 - U-001FFFFF */
|
||||
/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
|
||||
putf8[3] = 0x80 + (byte[0] & BITMASK(6));
|
||||
putf8[2] = 0x80 + ((byte[0] >> 6) & BITMASK(2)) + ((byte[1] & BITMASK(4)) << 2);
|
||||
putf8[1] = 0x80 + ((byte[1] >> 4) & BITMASK(4)) + ((byte[2] & BITMASK(2)) << 4);
|
||||
putf8[0] = 0xF0 + ((byte[2] >> 2) & BITMASK(3));
|
||||
delta = 4;
|
||||
} else if (c <= 0x3FFFFFF) {
|
||||
/* U-00200000 - U-03FFFFFF */
|
||||
/* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
|
||||
putf8[4] = 0x80 + (byte[0] & BITMASK(6));
|
||||
putf8[3] = 0x80 + ((byte[0] >> 6) & BITMASK(2)) + ((byte[1] & BITMASK(4)) << 2);
|
||||
putf8[2] = 0x80 + ((byte[1] >> 4) & BITMASK(4)) + ((byte[2] & BITMASK(2)) << 4);
|
||||
putf8[1] = 0x80 + ((byte[2] >> 2) & BITMASK(6));
|
||||
putf8[0] = 0xF8 + (byte[3] & BITMASK(2));
|
||||
delta = 5;
|
||||
|
||||
} else if (c <= 0x7FFFFFFF) {
|
||||
/* U-04000000 - U-7FFFFFFF */
|
||||
/* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
|
||||
putf8[5] = 0x80 + (byte[0] & BITMASK(6));
|
||||
putf8[4] = 0x80 + ((byte[0] >> 6) & BITMASK(2)) + ((byte[1] & BITMASK(4)) << 2);
|
||||
putf8[3] = 0x80 + ((byte[1] >> 4) & BITMASK(4)) + ((byte[2] & BITMASK(2)) << 4);
|
||||
putf8[2] = 0x80 + ((byte[2] >> 2) & BITMASK(6));
|
||||
putf8[1] = 0x80 + (byte[3] & BITMASK(6));
|
||||
putf8[0] = 0xFC + ((byte[3] >> 6) & BITMASK(1));
|
||||
delta = 6;
|
||||
} else {
|
||||
free(utf8);
|
||||
return (char *)-1;
|
||||
}
|
||||
|
||||
putf8 += delta;
|
||||
freesize -= delta;
|
||||
}
|
||||
|
||||
length = (putf8 - utf8 + 1);
|
||||
utf8 = (char *)realloc(utf8, sizeof(char) * length);
|
||||
utf8[length - 1] = '\0';
|
||||
return utf8;
|
||||
}
|
||||
|
||||
/* Count the elements of `str` that precede its terminating zero. */
size_t ucs4len(const ucs4_t *str) {
    size_t count = 0;

    while (str[count] != 0) {
        count++;
    }
    return count;
}
|
||||
|
||||
/*
 * Compare two zero-terminated ucs4_t strings element by element.
 * Returns 0 when they are equal, otherwise the difference between the first
 * pair of differing elements (`src` minus `dst`) converted to int.
 */
int ucs4cmp(const ucs4_t *src, const ucs4_t *dst) {
    size_t pos;

    for (pos = 0;; pos++) {
        int diff = src[pos] - dst[pos];
        if (diff != 0 || dst[pos] == 0) {
            return diff;
        }
    }
}
|
||||
|
||||
/* Copy the zero-terminated string `src`, terminator included, into `dest`. */
void ucs4cpy(ucs4_t *dest, const ucs4_t *src) {
    size_t pos;

    for (pos = 0; src[pos] != 0; pos++) {
        dest[pos] = src[pos];
    }
    dest[pos] = 0;
}
|
||||
|
||||
/*
 * Copy elements from `src` to `dest` until `len` elements have been copied or
 * the terminating zero of `src` is reached. No terminator is written.
 */
void ucs4ncpy(ucs4_t *dest, const ucs4_t *src, size_t len) {
    size_t pos;

    for (pos = 0; src[pos] != 0 && pos < len; pos++) {
        dest[pos] = src[pos];
    }
}
|
||||
36
internal/cpp/opencc/encoding.h
Normal file
36
internal/cpp/opencc/encoding.h
Normal file
@ -0,0 +1,36 @@
|
||||
/*
|
||||
* Open Chinese Convert
|
||||
*
|
||||
* Copyright 2010 BYVoid <byvoid.kcp@gmail.com>
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef __OPENCC_ENCODING_H_
|
||||
#define __OPENCC_ENCODING_H_
|
||||
|
||||
#include "utils.h"
|
||||
|
||||
ucs4_t * utf8_to_ucs4(const char * utf8, size_t length);
|
||||
|
||||
char * ucs4_to_utf8(const ucs4_t * ucs4, size_t length);
|
||||
|
||||
size_t ucs4len(const ucs4_t * str);
|
||||
|
||||
int ucs4cmp(const ucs4_t * str1, const ucs4_t * str2);
|
||||
|
||||
void ucs4cpy(ucs4_t * dest, const ucs4_t * src);
|
||||
|
||||
void ucs4ncpy(ucs4_t * dest, const ucs4_t * src, size_t len);
|
||||
|
||||
#endif /* __OPENCC_ENCODING_H_ */
|
||||
219
internal/cpp/opencc/opencc.c
Normal file
219
internal/cpp/opencc/opencc.c
Normal file
@ -0,0 +1,219 @@
|
||||
/*
|
||||
* Open Chinese Convert
|
||||
*
|
||||
* Copyright 2010 BYVoid <byvoid.kcp@gmail.com>
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "opencc.h"
|
||||
#include "config_reader.h"
|
||||
#include "converter.h"
|
||||
#include "dictionary_set.h"
|
||||
#include "encoding.h"
|
||||
#include "utils.h"
|
||||
|
||||
typedef struct {
|
||||
dictionary_set_t dictionary_set;
|
||||
converter_t converter;
|
||||
} opencc_desc;
|
||||
|
||||
static opencc_error errnum = OPENCC_ERROR_VOID;
|
||||
static int lib_initialized = FALSE;
|
||||
|
||||
static void lib_initialize(void) { lib_initialized = TRUE; }
|
||||
|
||||
/*
 * Forward a conversion request to the instance's converter.
 * Returns whatever converter_convert() returns; when that is (size_t)-1 the
 * module error code is set to OPENCC_ERROR_CONVERTER.
 */
size_t opencc_convert(opencc_t t_opencc, ucs4_t **inbuf, size_t *inbuf_left, ucs4_t **outbuf, size_t *outbuf_left) {
    opencc_desc *desc;
    size_t converted;

    if (!lib_initialized) {
        lib_initialize();
    }

    desc = (opencc_desc *)t_opencc;
    converted = converter_convert(desc->converter, inbuf, inbuf_left, outbuf, outbuf_left);

    if (converted != (size_t)-1) {
        return converted;
    }

    errnum = OPENCC_ERROR_CONVERTER;
    return converted;
}
|
||||
|
||||
/*
 * Convert a UTF-8 string and return the result as a newly allocated UTF-8
 * string (release with free()).
 *
 * `length` limits how many bytes of `inbuf` are read; (size_t)-1, or any
 * value beyond the string's length, means the whole string. A `length` of 0
 * yields an empty string.
 *
 * Returns (char *)-1 on failure. The module error code is set to
 * OPENCC_ERROR_ENCODIND for encoding failures and is set by opencc_convert()
 * for converter failures; it is left unchanged when an allocation fails.
 */
char *opencc_convert_utf8(opencc_t t_opencc, const char *inbuf, size_t length) {
    char *out = NULL;        /* start of the output allocation */
    ucs4_t *winbuf = NULL;   /* input decoded to ucs4_t */
    ucs4_t *woutbuf = NULL;  /* scratch buffer for one conversion pass */

    if (!lib_initialized)
        lib_initialize();

    const size_t inbuf_len = strlen(inbuf);
    if (length == (size_t)-1 || length > inbuf_len)
        length = inbuf_len;

    /* Nothing to convert. Handling this here also keeps utf8_to_ucs4() from
     * reading a zero length as "whole string". */
    if (length == 0) {
        out = (char *)malloc(1);
        if (out == NULL)
            return (char *)-1;
        out[0] = '\0';
        return out;
    }

    /* Convert the input data to a ucs4_t string. */
    winbuf = utf8_to_ucs4(inbuf, length);
    if (winbuf == (ucs4_t *)-1) {
        /* Input conversion failed. */
        errnum = OPENCC_ERROR_ENCODIND;
        return (char *)-1;
    }

    /* Output UTF-8 buffer: out_cap bytes allocated (terminator included),
     * out_used bytes of text written so far. */
    size_t out_cap = length + 1;
    size_t out_used = 0;

    /* Conversion scratch buffer. */
    const size_t wbufsize = length + 64;

    out = (char *)malloc(sizeof(char) * out_cap);
    woutbuf = (ucs4_t *)malloc(sizeof(ucs4_t) * (wbufsize + 1));
    if (out == NULL || woutbuf == NULL)
        goto fail;
    out[0] = '\0';

    ucs4_t *pinbuf = winbuf;
    ucs4_t *poutbuf = woutbuf;
    size_t inbuf_left = ucs4len(winbuf);
    size_t outbuf_left = wbufsize;

    while (inbuf_left > 0) {
        size_t retval = opencc_convert(t_opencc, &pinbuf, &inbuf_left, &poutbuf, &outbuf_left);
        if (retval == (size_t)-1)
            goto fail;

        *poutbuf = L'\0';

        char *ubuff = ucs4_to_utf8(woutbuf, (size_t)-1);
        if (ubuff == (char *)-1) {
            errnum = OPENCC_ERROR_ENCODIND;
            goto fail;
        }

        size_t ubuff_len = strlen(ubuff);

        /* Grow until this chunk and the terminator both fit. */
        if (ubuff_len > out_cap - out_used - 1) {
            size_t new_cap = out_cap;
            char *grown;

            while (ubuff_len > new_cap - out_used - 1) {
                if (new_cap > (size_t)-1 / 2) {
                    free(ubuff);
                    goto fail;
                }
                new_cap += new_cap;
            }
            grown = (char *)realloc(out, sizeof(char) * new_cap);
            if (grown == NULL) {
                free(ubuff);
                goto fail;
            }
            out = grown;
            out_cap = new_cap;
        }

        memcpy(out + out_used, ubuff, ubuff_len);
        free(ubuff);

        out_used += ubuff_len;
        out[out_used] = '\0';

        /* Reuse the scratch buffer for the next pass. */
        outbuf_left = wbufsize;
        poutbuf = woutbuf;
    }

    free(winbuf);
    free(woutbuf);

    /* Trim to the exact size; if the shrink fails the larger block is still
     * a valid result. */
    char *trimmed = (char *)realloc(out, sizeof(char) * (out_used + 1));
    return trimmed != NULL ? trimmed : out;

fail:
    /* Free the allocations themselves, never a write cursor inside them. */
    free(out);
    free(winbuf);
    free(woutbuf);
    return (char *)-1;
}
|
||||
|
||||
opencc_t opencc_open(const char *config_file, const char *home_path) {
|
||||
if (!lib_initialized)
|
||||
lib_initialize();
|
||||
|
||||
opencc_desc *opencc;
|
||||
opencc = (opencc_desc *)malloc(sizeof(opencc_desc));
|
||||
|
||||
opencc->dictionary_set = NULL;
|
||||
opencc->converter = converter_open();
|
||||
converter_set_conversion_mode(opencc->converter, OPENCC_CONVERSION_FAST);
|
||||
|
||||
/* 加載默認辭典 */
|
||||
int retval;
|
||||
if (config_file == NULL)
|
||||
retval = 0;
|
||||
else {
|
||||
config_t config = config_open(config_file, home_path);
|
||||
|
||||
if (config == (config_t)-1) {
|
||||
errnum = OPENCC_ERROR_CONFIG;
|
||||
return (opencc_t)-1;
|
||||
}
|
||||
|
||||
opencc->dictionary_set = config_get_dictionary_set(config);
|
||||
converter_assign_dictionary(opencc->converter, opencc->dictionary_set);
|
||||
|
||||
config_close(config);
|
||||
}
|
||||
|
||||
return (opencc_t)opencc;
|
||||
}
|
||||
|
||||
/*
 * Destroy an opencc instance: close its converter, close its dictionary set
 * if it has one, and free the descriptor. Always returns 0.
 */
int opencc_close(opencc_t t_opencc) {
    opencc_desc *desc;

    if (!lib_initialized) {
        lib_initialize();
    }

    desc = (opencc_desc *)t_opencc;

    converter_close(desc->converter);
    if (NULL != desc->dictionary_set) {
        dictionary_set_close(desc->dictionary_set);
    }
    free(desc);

    return 0;
}
|
||||
|
||||
/* Set the conversion mode of the instance's converter. */
void opencc_set_conversion_mode(opencc_t t_opencc, opencc_conversion_mode conversion_mode) {
    opencc_desc *desc;

    if (!lib_initialized) {
        lib_initialize();
    }

    desc = (opencc_desc *)t_opencc;
    converter_set_conversion_mode(desc->converter, conversion_mode);
}
|
||||
|
||||
opencc_error opencc_errno(void) {
|
||||
if (!lib_initialized)
|
||||
lib_initialize();
|
||||
|
||||
return errnum;
|
||||
}
|
||||
|
||||
void opencc_perror(const char *spec) {
|
||||
if (!lib_initialized)
|
||||
lib_initialize();
|
||||
|
||||
perr(spec);
|
||||
perr("\n");
|
||||
switch (errnum) {
|
||||
case OPENCC_ERROR_VOID:
|
||||
break;
|
||||
case OPENCC_ERROR_DICTLOAD:
|
||||
dictionary_perror(_("Dictionary loading error"));
|
||||
break;
|
||||
case OPENCC_ERROR_CONFIG:
|
||||
config_perror(_("Configuration error"));
|
||||
break;
|
||||
case OPENCC_ERROR_CONVERTER:
|
||||
converter_perror(_("Converter error"));
|
||||
break;
|
||||
case OPENCC_ERROR_ENCODIND:
|
||||
perr(_("Encoding error"));
|
||||
break;
|
||||
default:
|
||||
perr(_("Unknown"));
|
||||
}
|
||||
perr("\n");
|
||||
}
|
||||
116
internal/cpp/opencc/opencc.h
Normal file
116
internal/cpp/opencc/opencc.h
Normal file
@ -0,0 +1,116 @@
|
||||
/*
|
||||
* Open Chinese Convert
|
||||
*
|
||||
* Copyright 2010 BYVoid <byvoid.kcp@gmail.com>
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef __OPENCC_H_
|
||||
#define __OPENCC_H_
|
||||
|
||||
#include "opencc_types.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Headers from C standard library
|
||||
*/
|
||||
|
||||
/* Macros */
|
||||
#define OPENCC_DEFAULT_CONFIG_SIMP_TO_TRAD "zhs2zht.ini"
|
||||
#define OPENCC_DEFAULT_CONFIG_TRAD_TO_SIMP "zht2zhs.ini"
|
||||
|
||||
/**
|
||||
* opencc_open:
|
||||
* @config_file: Location of configuration file.
|
||||
* @returns: A description pointer of the newly allocated instance of opencc.
|
||||
*
|
||||
* Make an instance of opencc.
|
||||
*
|
||||
* Note: Leave config_file to NULL if you do not want to load any configuration file.
|
||||
*
|
||||
*/
|
||||
opencc_t opencc_open(const char *config_file, const char *home_path);
|
||||
|
||||
/**
|
||||
* opencc_close:
|
||||
* @od: The description pointer.
|
||||
* @returns: 0 on success or non-zero number on failure.
|
||||
*
|
||||
* Destroy an instance of opencc.
|
||||
*
|
||||
*/
|
||||
int opencc_close(opencc_t od);
|
||||
|
||||
/**
|
||||
* opencc_convert:
|
||||
* @od: The opencc description pointer.
|
||||
* @inbuf: The pointer to the wide character string of the input buffer.
|
||||
* @inbufleft: The maximum number of characters in *inbuf to convert.
|
||||
* @outbuf: The pointer to the wide character string of the output buffer.
|
||||
* @outbufleft: The size of output buffer.
|
||||
*
|
||||
* @returns: The number of characters of the input buffer that were converted.
|
||||
*
|
||||
* Convert string from *inbuf to *outbuf.
|
||||
*
|
||||
* Note: Don't forget to assign **outbuf to L'\0' after called.
|
||||
*
|
||||
*/
|
||||
size_t opencc_convert(opencc_t od, ucs4_t **inbuf, size_t *inbufleft, ucs4_t **outbuf, size_t *outbufleft);
|
||||
|
||||
/**
|
||||
* opencc_convert_utf8:
|
||||
* @od: The opencc description pointer.
|
||||
* @inbuf: The UTF-8 encoded string.
|
||||
* @length: The maximum number of characters in inbuf to convert.
|
||||
*
|
||||
* @returns: The newly allocated UTF-8 string converted from inbuf.
|
||||
*
|
||||
* Convert UTF-8 string from inbuf. This function returns a newly allocated
|
||||
* c-style string via malloc(), which stores the converted string.
|
||||
* DON'T FORGET TO CALL free() to recycle memory.
|
||||
*
|
||||
*/
|
||||
char *opencc_convert_utf8(opencc_t t_opencc, const char *inbuf, size_t length);
|
||||
|
||||
void opencc_set_conversion_mode(opencc_t t_opencc, opencc_conversion_mode conversion_mode);
|
||||
|
||||
/**
|
||||
* opencc_errno:
|
||||
*
|
||||
* @returns: The error number.
|
||||
*
|
||||
* Return an opencc_error which describes the last error that occurred or
|
||||
* OPENCC_ERROR_VOID
|
||||
*
|
||||
*/
|
||||
opencc_error opencc_errno(void);
|
||||
|
||||
/**
|
||||
* opencc_perror:
|
||||
* @spec Prefix message.
|
||||
*
|
||||
* Print the error message to stderr.
|
||||
*
|
||||
*/
|
||||
void opencc_perror(const char *spec);
|
||||
|
||||
#ifdef __cplusplus
|
||||
};
|
||||
#endif
|
||||
|
||||
#endif /* __OPENCC_H_ */
|
||||
59
internal/cpp/opencc/opencc_types.h
Normal file
59
internal/cpp/opencc/opencc_types.h
Normal file
@ -0,0 +1,59 @@
|
||||
/*
|
||||
* Open Chinese Convert
|
||||
*
|
||||
* Copyright 2010 BYVoid <byvoid.kcp@gmail.com>
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef __OPENCC_TYPES_H_
|
||||
#define __OPENCC_TYPES_H_
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
/* Opaque handle to an opencc instance. */
typedef void *opencc_t;

/* One UCS-4 code unit. */
typedef uint32_t ucs4_t;

/* Error codes reported by opencc_errno(). */
typedef enum _opencc_error {
    OPENCC_ERROR_VOID = 0,      /* no error recorded */
    OPENCC_ERROR_DICTLOAD = 1,  /* dictionary loading error */
    OPENCC_ERROR_CONFIG = 2,    /* configuration error */
    OPENCC_ERROR_ENCODIND = 3,  /* encoding error */
    OPENCC_ERROR_CONVERTER = 4, /* converter error */
} opencc_error;
|
||||
|
||||
/* Kinds of dictionary a dictionary group can load. */
typedef enum _opencc_dictionary_type {
    OPENCC_DICTIONARY_TYPE_TEXT = 0,
    OPENCC_DICTIONARY_TYPE_DATRIE = 1,
} opencc_dictionary_type;
|
||||
|
||||
/* Conversion modes accepted by opencc_set_conversion_mode(). */
typedef enum _opencc_conversion_mode {
    OPENCC_CONVERSION_FAST = 0,
    OPENCC_CONVERSION_SEGMENT_ONLY = 1,
    OPENCC_CONVERSION_LIST_CANDIDATES = 2,
} opencc_conversion_mode;
|
||||
|
||||
#ifdef __cplusplus
|
||||
};
|
||||
#endif
|
||||
|
||||
#endif /* __OPENCC_TYPES_H_ */
|
||||
80
internal/cpp/opencc/openccxx.cpp
Normal file
80
internal/cpp/opencc/openccxx.cpp
Normal file
@ -0,0 +1,80 @@
|
||||
#include "openccxx.h"
|
||||
#include "opencc.h"
|
||||
#include "utils.h"
|
||||
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
|
||||
// Build a converter with the default traditional-to-simplified configuration,
// resolving it against `home_dir`. The result of open() is not checked here;
// on failure `od` stays (opencc_t)-1 and convert() returns -1.
OpenCC::OpenCC(const std::string &home_dir)
    : config_file(mstrcpy(OPENCC_DEFAULT_CONFIG_TRAD_TO_SIMP)), od((opencc_t)-1) {
    open(config_file, home_dir.c_str());
}
|
||||
|
||||
// Close the opencc handle if one is open and release the configuration name.
OpenCC::~OpenCC() {
    const bool is_open = (od != (opencc_t)-1);

    if (is_open) {
        opencc_close(od);
    }
    free(config_file);
}
|
||||
|
||||
int OpenCC::open(const char *config_file, const char *home_dir) {
|
||||
if (od != (opencc_t)-1)
|
||||
opencc_close(od);
|
||||
od = opencc_open(config_file, home_dir);
|
||||
return (od == (opencc_t)-1) ? (-1) : (0);
|
||||
}
|
||||
|
||||
long OpenCC::convert(const std::string &in, std::string &out, long length) {
|
||||
if (od == (opencc_t)-1)
|
||||
return -1;
|
||||
|
||||
if (length == -1)
|
||||
length = in.length();
|
||||
|
||||
char *outbuf = opencc_convert_utf8(od, in.c_str(), length);
|
||||
|
||||
if (outbuf == (char *)-1)
|
||||
return -1;
|
||||
|
||||
out = outbuf;
|
||||
free(outbuf);
|
||||
|
||||
return length;
|
||||
}
|
||||
|
||||
/**
|
||||
* Warning:
|
||||
* This method can be used only if wchar_t is encoded in UCS4 on your platform.
|
||||
*/
|
||||
long OpenCC::convert(const std::wstring &in, std::wstring &out, long length) {
|
||||
if (od == (opencc_t)-1)
|
||||
return -1;
|
||||
|
||||
size_t inbuf_left = in.length();
|
||||
if (length >= 0 && length < (long)inbuf_left)
|
||||
inbuf_left = length;
|
||||
|
||||
const ucs4_t *inbuf = (const ucs4_t *)in.c_str();
|
||||
long count = 0;
|
||||
|
||||
while (inbuf_left != 0) {
|
||||
size_t retval;
|
||||
size_t outbuf_left;
|
||||
ucs4_t *outbuf;
|
||||
|
||||
/* occupy space */
|
||||
outbuf_left = inbuf_left + 64;
|
||||
out.resize(count + outbuf_left);
|
||||
outbuf = (ucs4_t *)out.c_str() + count;
|
||||
|
||||
retval = opencc_convert(od, (ucs4_t **)&inbuf, &inbuf_left, &outbuf, &outbuf_left);
|
||||
if (retval == (size_t)-1)
|
||||
return -1;
|
||||
count += retval;
|
||||
}
|
||||
|
||||
/* set the zero termination and shrink the size */
|
||||
out.resize(count + 1);
|
||||
out[count] = L'\0';
|
||||
|
||||
return count;
|
||||
}
|
||||
20
internal/cpp/opencc/openccxx.h
Normal file
20
internal/cpp/opencc/openccxx.h
Normal file
@ -0,0 +1,20 @@
|
||||
#pragma once
|
||||
|
||||
#include "opencc_types.h"
|
||||
#include <string>
|
||||
|
||||
class OpenCC {
|
||||
public:
|
||||
OpenCC(const std::string &home_dir);
|
||||
virtual ~OpenCC();
|
||||
|
||||
int open(const char *config_file, const char *home_dir);
|
||||
|
||||
long convert(const std::string &in, std::string &out, long length = -1);
|
||||
|
||||
long convert(const std::wstring &in, std::wstring &out, long length = -1);
|
||||
|
||||
private:
|
||||
char *config_file;
|
||||
opencc_t od;
|
||||
};
|
||||
36
internal/cpp/opencc/utils.c
Normal file
36
internal/cpp/opencc/utils.c
Normal file
@ -0,0 +1,36 @@
|
||||
/*
|
||||
* Open Chinese Convert
|
||||
*
|
||||
* Copyright 2010 BYVoid <byvoid.kcp@gmail.com>
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "utils.h"
|
||||
|
||||
/* Writes str to stderr exactly as given; no newline is appended. */
void perr(const char *str) {
    fputs(str, stderr);
}
|
||||
|
||||
/*
 * qsort()-compatible comparator that orders int elements ascending.
 *
 * a, b: pointers to the two int elements being compared.
 * Returns -1, 0 or 1 when *a is less than, equal to, or greater than *b.
 *
 * The result is computed from two comparisons instead of a subtraction,
 * because `*a - *b` overflows (undefined behaviour, wrong sign in practice)
 * for operands such as INT_MAX and -1.
 */
int qsort_int_cmp(const void *a, const void *b) {
    const int lhs = *(const int *)a;
    const int rhs = *(const int *)b;

    return (lhs > rhs) - (lhs < rhs);
}
|
||||
|
||||
/*
 * Returns a newly malloc()-allocated copy of the NUL-terminated string str.
 *
 * The caller owns the returned buffer and releases it with free().
 * Returns NULL when the allocation fails (previously the NULL pointer was
 * passed straight to strcpy()).
 */
char *mstrcpy(const char *str) {
    const size_t size = strlen(str) + 1; /* include the terminator */
    char *strbuf = (char *)malloc(size);

    if (strbuf == NULL) {
        return NULL;
    }
    memcpy(strbuf, str, size);
    return strbuf;
}
|
||||
|
||||
/*
 * Returns a newly malloc()-allocated, NUL-terminated copy of at most the
 * first n characters of str.
 *
 * The buffer is always n + 1 bytes long; when str is shorter than n the
 * remainder is zero-filled by strncpy(). The caller owns the result and
 * releases it with free().
 *
 * Returns NULL when n + 1 would wrap around or when the allocation fails.
 */
char *mstrncpy(const char *str, size_t n) {
    char *strbuf;

    if (n == (size_t)-1) {
        return NULL; /* n + 1 would overflow to 0 */
    }

    strbuf = (char *)malloc(n + 1);
    if (strbuf == NULL) {
        return NULL;
    }

    strncpy(strbuf, str, n);
    strbuf[n] = '\0'; /* strncpy() does not terminate when str has >= n chars */
    return strbuf;
}
|
||||
71
internal/cpp/opencc/utils.h
Normal file
71
internal/cpp/opencc/utils.h
Normal file
@ -0,0 +1,71 @@
|
||||
/*
|
||||
* Open Chinese Convert
|
||||
*
|
||||
* Copyright 2010 BYVoid <byvoid.kcp@gmail.com>
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef __OPENCC_UTILS_H_
|
||||
#define __OPENCC_UTILS_H_
|
||||
|
||||
#include <assert.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "opencc_types.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/* Boolean constants: FALSE evaluates to 0 and TRUE to 1. */
#define FALSE (0)
#define TRUE (!(0))
/*
 * All bits of an unsigned int set, shifted right by one (UINT_MAX / 2).
 * The expression has type unsigned int.
 */
#define INFINITY_INT ((~0U) >> 1)
|
||||
|
||||
/*
 * Fallback byte-order tags. Each one is defined here only when a macro of
 * the same name has not already been provided (for example by a system
 * header), so the concrete values may differ between platforms.
 */
#ifndef BIG_ENDIAN
#define BIG_ENDIAN (0)
#endif

#ifndef LITTLE_ENDIAN
#define LITTLE_ENDIAN (1)
#endif
|
||||
|
||||
/*
 * Message translation hook.
 * With ENABLE_GETTEXT defined, _(STRING) looks STRING up via dgettext() in
 * the PACKAGE_NAME domain; otherwise it expands to STRING unchanged.
 */
#ifdef ENABLE_GETTEXT
#include <libintl.h>
#include <locale.h>
#define _(STRING) dgettext(PACKAGE_NAME, STRING)
#else
#define _(STRING) STRING
#endif
|
||||
|
||||
/*
 * Marks a code path that must never execute: prints the source location to
 * stderr and then trips assert(0). When NDEBUG is defined the assert is
 * compiled out, so only the message is printed and execution continues.
 */
#define debug_should_not_be_here()                                          \
    do {                                                                    \
        fprintf(stderr, "Should not be here %s: %d\n", __FILE__, __LINE__); \
        assert(0);                                                          \
    } while (0)
|
||||
|
||||
/* Writes str to stderr without appending a newline. */
void perr(const char *str);

/* Comparator for qsort() over int elements; a and b point to the two ints. */
int qsort_int_cmp(const void *a, const void *b);

/*
 * Returns a malloc()-allocated copy of the NUL-terminated string str.
 * The caller releases the result with free().
 */
char *mstrcpy(const char *str);

/*
 * Returns a malloc()-allocated, NUL-terminated copy of at most the first n
 * characters of str. The caller releases the result with free().
 */
char *mstrncpy(const char *str, size_t n);
|
||||
|
||||
#ifdef __cplusplus
|
||||
};
|
||||
#endif
|
||||
|
||||
#endif /* __OPENCC_UTILS_H_ */
|
||||
Reference in New Issue
Block a user