// Copyright(C) 2023 InfiniFlow, Inc. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "tokenizer.h" #include #include const CharType ALLOW_CHR = 0; /// < regular term const CharType DELIMITER_CHR = 1; /// < delimiter const CharType SPACE_CHR = 2; /// < space term const CharType UNITE_CHR = 3; /// < united term CharTypeTable::CharTypeTable(bool use_def_delim) { memset(char_type_table_, 0, BYTE_MAX); // if use_def_delim is set, all the characters are allows if (!use_def_delim) return; // set the lower 4 bit to record default char type for (uint8_t i = 0; i < BYTE_MAX; i++) { if (std::isalnum(i) || i > 127) continue; else if (std::isspace(i)) char_type_table_[i] = SPACE_CHR; else char_type_table_[i] = DELIMITER_CHR; } } void CharTypeTable::SetConfig(const TokenizeConfig &conf) { // set the higher 4 bit to record user defined option type std::string str; // why need to copy? str = conf.divides_; if (!str.empty()) { for (unsigned int j = 0; j < str.length(); j++) { char_type_table_[(uint8_t)str[j]] = DELIMITER_CHR; } } str = conf.unites_; if (!str.empty()) { for (unsigned int j = 0; j < str.length(); j++) { char_type_table_[(uint8_t)str[j]] = UNITE_CHR; } } str = conf.allows_; if (!str.empty()) { for (unsigned int j = 0; j < str.length(); j++) { char_type_table_[(uint8_t)str[j]] = ALLOW_CHR; } } } void Tokenizer::SetConfig(const TokenizeConfig &conf) { table_.SetConfig(conf); } void Tokenizer::Tokenize(const std::string &input) { input_ = (std::string *)&input; input_cursor_ = 0; } bool Tokenizer::NextToken() { while (input_cursor_ < input_->length() && table_.GetType(input_->at(input_cursor_)) == SPACE_CHR) { input_cursor_++; } if (input_cursor_ == input_->length()) return false; output_buffer_cursor_ = 0; if (output_buffer_cursor_ >= output_buffer_size_) { GrowOutputBuffer(); } token_start_cursor_ = input_cursor_; output_buffer_[output_buffer_cursor_++] = input_->at(input_cursor_); if (table_.GetType(input_->at(input_cursor_)) == DELIMITER_CHR) { ++input_cursor_; is_delimiter_ = true; return true; } else { ++input_cursor_; is_delimiter_ = false; while (input_cursor_ < input_->length()) { CharType cur_type = table_.GetType(input_->at(input_cursor_)); if (cur_type == SPACE_CHR || cur_type == DELIMITER_CHR) { return true; } else if (cur_type == ALLOW_CHR) { if (output_buffer_cursor_ >= output_buffer_size_) { GrowOutputBuffer(); } output_buffer_[output_buffer_cursor_++] = input_->at(input_cursor_++); } else { ++input_cursor_; } } return true; } } bool Tokenizer::GrowOutputBuffer() { output_buffer_size_ *= 2; output_buffer_ = std::make_unique(output_buffer_size_); return true; } bool Tokenizer::Tokenize(const std::string &input_string, TermList &special_terms, TermList &prim_terms) { special_terms.clear(); prim_terms.clear(); size_t len = input_string.length(); if (len == 0) return false; Term t; TermList::iterator it; unsigned int word_off = 0, char_off = 0; char cur_char; CharType cur_type; for (char_off = 0; char_off < len;) // char_off++ ) // char_off is always incremented inside { cur_type = table_.GetType(input_string.at(char_off)); if (cur_type == ALLOW_CHR || cur_type == UNITE_CHR) { it = prim_terms.insert(prim_terms.end(), t); do { cur_char = input_string.at(char_off); cur_type = table_.GetType(cur_char); if (cur_type == ALLOW_CHR) { it->text_ += cur_char; } else if (cur_type == SPACE_CHR || cur_type == DELIMITER_CHR) { break; } char_off++; } while (char_off < len); if (it->text_.length() == 0) { prim_terms.erase(it); continue; // char_off--; } it->word_offset_ = word_off++; // char_off--; } else if (cur_type == DELIMITER_CHR) { it = special_terms.insert(special_terms.end(), t); do { cur_char = input_string.at(char_off); cur_type = table_.GetType(cur_char); if (cur_type == DELIMITER_CHR) it->text_ += cur_char; else break; char_off++; } while (char_off < len); it->word_offset_ = word_off++; // char_off--; } else char_off++; } return true; } bool Tokenizer::Tokenize(const std::string &input_string, TermList &prim_terms) { prim_terms.clear(); size_t len = input_string.length(); if (len == 0) return false; Term t; TermList::iterator it; unsigned int word_off = 0, char_off = 0; char cur_char; CharType cur_type; for (char_off = 0; char_off < len;) // char_off++ ) { cur_type = table_.GetType(input_string.at(char_off)); if (cur_type == ALLOW_CHR || cur_type == UNITE_CHR) { it = prim_terms.insert(prim_terms.end(), t); // it->begin_ = char_off; do { cur_char = input_string.at(char_off); cur_type = table_.GetType(cur_char); if (cur_type == ALLOW_CHR) { it->text_ += cur_char; } else if (cur_type == SPACE_CHR || cur_type == DELIMITER_CHR) { break; } char_off++; } while (char_off < len); if (it->text_.length() == 0) { prim_terms.erase(it); continue; // char_off--; } it->word_offset_ = word_off++; // char_off--; } else if (cur_type == DELIMITER_CHR) { if (((char_off + 1) < len) && table_.GetType(input_string.at(char_off + 1)) != DELIMITER_CHR) { word_off++; } char_off++; } else char_off++; } return true; } bool Tokenizer::TokenizeWhite(const std::string &input_string, TermList &raw_terms) { raw_terms.clear(); size_t len = input_string.length(); if (len == 0) return false; Term t; TermList::iterator it; unsigned int word_off = 0, char_off = 0; char cur_char; CharType cur_type; // CharType cur_type, preType; for (char_off = 0; char_off < len;) // char_off++ ) { cur_type = table_.GetType(input_string.at(char_off)); if (cur_type == ALLOW_CHR || cur_type == UNITE_CHR) { it = raw_terms.insert(raw_terms.end(), t); // it->begin_ = char_off; do { cur_char = input_string.at(char_off); cur_type = table_.GetType(cur_char); if (cur_type == ALLOW_CHR) { it->text_ += cur_char; } else if (cur_type == SPACE_CHR || cur_type == DELIMITER_CHR) { break; } char_off++; } while (char_off < len); if (it->text_.length() == 0) { raw_terms.erase(it); continue; // char_off--; } it->word_offset_ = word_off++; // char_off--; } else if (cur_type == DELIMITER_CHR) { it = raw_terms.insert(raw_terms.end(), t); do { cur_char = input_string.at(char_off); cur_type = table_.GetType(cur_char); if (cur_type == DELIMITER_CHR) it->text_ += cur_char; else break; char_off++; } while (char_off < len); it->word_offset_ = word_off++; // char_off--; } else { // SPACE_CHR nothing to do char_off++; } } return true; }