mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-05-25 10:26:59 +08:00
### What problem does this PR solve? Update chunk/metadata cli ### Type of change - [ ] Refactoring
486 lines
13 KiB
Go
486 lines
13 KiB
Go
//
|
|
// Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
//
|
|
|
|
package cli
|
|
|
|
import (
|
|
"strings"
|
|
"unicode"
|
|
)
|
|
|
|
// Lexer is a byte-oriented scanner over a single input string. It keeps a
// one-byte window (ch) plus the indices needed to slice tokens out of input.
type Lexer struct {
	// input is the complete text being scanned.
	input string

	// pos is the index of ch within input; readPos is the index of the byte
	// that the next readChar call will load.
	pos, readPos int

	// ch is the byte currently under examination. readChar sets it to 0 once
	// the input is exhausted, which NextToken reports as end of input.
	ch byte
}
|
|
|
|
// NewLexer creates a new lexer for the given input
|
|
func NewLexer(input string) *Lexer {
|
|
l := &Lexer{input: input}
|
|
l.readChar()
|
|
return l
|
|
}
|
|
|
|
func (l *Lexer) readChar() {
|
|
if l.readPos >= len(l.input) {
|
|
l.ch = 0
|
|
} else {
|
|
l.ch = l.input[l.readPos]
|
|
}
|
|
l.pos = l.readPos
|
|
l.readPos++
|
|
}
|
|
|
|
func (l *Lexer) peekChar() byte {
|
|
if l.readPos >= len(l.input) {
|
|
return 0
|
|
}
|
|
return l.input[l.readPos]
|
|
}
|
|
|
|
func (l *Lexer) peekToken() string {
|
|
// Skip whitespace starting from readPos
|
|
skipPos := l.readPos
|
|
for skipPos < len(l.input) && (l.input[skipPos] == ' ' || l.input[skipPos] == '\t' || l.input[skipPos] == '\n' || l.input[skipPos] == '\r') {
|
|
skipPos++
|
|
}
|
|
|
|
// Read identifier starting from skipPos
|
|
start := skipPos
|
|
for skipPos < len(l.input) && (isLetter(l.input[skipPos]) || isDigit(l.input[skipPos]) || l.input[skipPos] == '_' || l.input[skipPos] == '-' || l.input[skipPos] == '.') {
|
|
skipPos++
|
|
}
|
|
|
|
return l.input[start:skipPos]
|
|
}
|
|
|
|
func (l *Lexer) skipWhitespace() {
|
|
for l.ch == ' ' || l.ch == '\t' || l.ch == '\n' || l.ch == '\r' {
|
|
l.readChar()
|
|
}
|
|
}
|
|
|
|
// NextToken returns the next token from the input
|
|
func (l *Lexer) NextToken() Token {
|
|
var tok Token
|
|
|
|
l.skipWhitespace()
|
|
|
|
switch l.ch {
|
|
case ';':
|
|
tok = newToken(TokenSemicolon, l.ch)
|
|
l.readChar()
|
|
case ',':
|
|
tok = newToken(TokenComma, l.ch)
|
|
l.readChar()
|
|
case '/':
|
|
tok = newToken(TokenSlash, l.ch)
|
|
l.readChar()
|
|
case '-':
|
|
tok = newToken(TokenDash, l.ch)
|
|
l.readChar()
|
|
case '\'':
|
|
tok.Type = TokenQuotedString
|
|
tok.Value = l.readQuotedString('\'')
|
|
case '"':
|
|
tok.Type = TokenQuotedString
|
|
tok.Value = l.readQuotedString('"')
|
|
case '\\':
|
|
// Meta command: backslash followed by command name
|
|
tok.Type = TokenIdentifier
|
|
tok.Value = l.readMetaCommand()
|
|
case 0:
|
|
tok.Type = TokenEOF
|
|
tok.Value = ""
|
|
default:
|
|
if isLetter(l.ch) || l.ch == '_' {
|
|
ident := l.readIdentifier()
|
|
return l.lookupIdent(ident)
|
|
} else if isDigit(l.ch) {
|
|
tok.Value, tok.Type = l.readNumber()
|
|
return tok
|
|
}
|
|
|
|
tok = newToken(TokenIllegal, l.ch)
|
|
l.readChar()
|
|
}
|
|
|
|
return tok
|
|
}
|
|
|
|
func (l *Lexer) readMetaCommand() string {
|
|
start := l.pos
|
|
l.readChar() // consume backslash
|
|
for isLetter(l.ch) || l.ch == '?' {
|
|
l.readChar()
|
|
}
|
|
return l.input[start:l.pos]
|
|
}
|
|
|
|
func newToken(tokenType int, ch byte) Token {
|
|
return Token{Type: tokenType, Value: string(ch)}
|
|
}
|
|
|
|
func (l *Lexer) readIdentifier() string {
|
|
start := l.pos
|
|
for isLetter(l.ch) || isDigit(l.ch) || l.ch == '_' || l.ch == '-' || l.ch == '.' {
|
|
l.readChar()
|
|
}
|
|
return l.input[start:l.pos]
|
|
}
|
|
|
|
func (l *Lexer) readNumber() (string, int) {
|
|
start := l.pos
|
|
tokenType := TokenInteger
|
|
|
|
// Read integer part
|
|
for isDigit(l.ch) {
|
|
l.readChar()
|
|
}
|
|
|
|
// If encountering a decimal point followed by a digit, read as float
|
|
if l.ch == '.' && isDigit(l.peekChar()) {
|
|
tokenType = TokenFloat
|
|
l.readChar() // Consume '.'
|
|
for isDigit(l.ch) {
|
|
l.readChar()
|
|
}
|
|
}
|
|
|
|
return l.input[start:l.pos], tokenType
|
|
}
|
|
|
|
func (l *Lexer) readQuotedString(quote byte) string {
|
|
l.readChar() // skip opening quote
|
|
start := l.pos
|
|
for l.ch != quote && l.ch != 0 {
|
|
l.readChar()
|
|
}
|
|
str := l.input[start:l.pos]
|
|
if l.ch == quote {
|
|
l.readChar() // skip closing quote
|
|
}
|
|
return str
|
|
}
|
|
|
|
func (l *Lexer) lookupIdent(ident string) Token {
|
|
upper := strings.ToUpper(ident)
|
|
switch upper {
|
|
case "LOGIN":
|
|
return Token{Type: TokenLogin, Value: ident}
|
|
case "LOGOUT":
|
|
return Token{Type: TokenLogout, Value: ident}
|
|
case "REGISTER":
|
|
return Token{Type: TokenRegister, Value: ident}
|
|
case "LIST":
|
|
return Token{Type: TokenList, Value: ident}
|
|
case "SERVICES":
|
|
return Token{Type: TokenServices, Value: ident}
|
|
case "SHOW":
|
|
return Token{Type: TokenShow, Value: ident}
|
|
case "CREATE":
|
|
return Token{Type: TokenCreate, Value: ident}
|
|
case "SERVICE":
|
|
return Token{Type: TokenService, Value: ident}
|
|
case "SHUTDOWN":
|
|
return Token{Type: TokenShutdown, Value: ident}
|
|
case "STARTUP":
|
|
return Token{Type: TokenStartup, Value: ident}
|
|
case "RESTART":
|
|
return Token{Type: TokenRestart, Value: ident}
|
|
case "USERS":
|
|
return Token{Type: TokenUsers, Value: ident}
|
|
case "DROP":
|
|
return Token{Type: TokenDrop, Value: ident}
|
|
case "USER":
|
|
return Token{Type: TokenUser, Value: ident}
|
|
case "ALTER":
|
|
return Token{Type: TokenAlter, Value: ident}
|
|
case "ACTIVE":
|
|
return Token{Type: TokenActive, Value: ident}
|
|
case "ADMIN":
|
|
return Token{Type: TokenAdmin, Value: ident}
|
|
case "ADD":
|
|
return Token{Type: TokenAdd, Value: ident}
|
|
case "DELETE":
|
|
return Token{Type: TokenDelete, Value: ident}
|
|
case "PASSWORD":
|
|
return Token{Type: TokenPassword, Value: ident}
|
|
case "DATASET":
|
|
// Check if followed by TABLE for compound token
|
|
if strings.ToUpper(l.peekToken()) == "TABLE" {
|
|
// Skip whitespace to TABLE
|
|
for l.ch == ' ' || l.ch == '\t' || l.ch == '\n' || l.ch == '\r' {
|
|
l.readChar()
|
|
}
|
|
// Skip past TABLE
|
|
for isLetter(l.ch) || isDigit(l.ch) || l.ch == '_' || l.ch == '-' || l.ch == '.' {
|
|
l.readChar()
|
|
}
|
|
return Token{Type: TokenDatasetTable, Value: "DATASET TABLE"}
|
|
}
|
|
return Token{Type: TokenDataset, Value: ident}
|
|
case "DATASETS":
|
|
return Token{Type: TokenDatasets, Value: ident}
|
|
case "OF":
|
|
return Token{Type: TokenOf, Value: ident}
|
|
case "AGENTS":
|
|
return Token{Type: TokenAgents, Value: ident}
|
|
case "ROLE":
|
|
return Token{Type: TokenRole, Value: ident}
|
|
case "ROLES":
|
|
return Token{Type: TokenRoles, Value: ident}
|
|
case "DESCRIPTION":
|
|
return Token{Type: TokenDescription, Value: ident}
|
|
case "GRANT":
|
|
return Token{Type: TokenGrant, Value: ident}
|
|
case "REVOKE":
|
|
return Token{Type: TokenRevoke, Value: ident}
|
|
case "ALL":
|
|
return Token{Type: TokenAll, Value: ident}
|
|
case "PERMISSION":
|
|
return Token{Type: TokenPermission, Value: ident}
|
|
case "TO":
|
|
return Token{Type: TokenTo, Value: ident}
|
|
case "FROM":
|
|
return Token{Type: TokenFrom, Value: ident}
|
|
case "FOR":
|
|
return Token{Type: TokenFor, Value: ident}
|
|
case "RESOURCES":
|
|
return Token{Type: TokenResources, Value: ident}
|
|
case "ON":
|
|
return Token{Type: TokenOn, Value: ident}
|
|
case "SET":
|
|
return Token{Type: TokenSet, Value: ident}
|
|
case "UNSET":
|
|
return Token{Type: TokenUnset, Value: ident}
|
|
case "RESET":
|
|
return Token{Type: TokenReset, Value: ident}
|
|
case "VERSION":
|
|
return Token{Type: TokenVersion, Value: ident}
|
|
case "VAR":
|
|
return Token{Type: TokenVar, Value: ident}
|
|
case "VARS":
|
|
return Token{Type: TokenVars, Value: ident}
|
|
case "CONFIGS":
|
|
return Token{Type: TokenConfigs, Value: ident}
|
|
case "ENVS":
|
|
return Token{Type: TokenEnvs, Value: ident}
|
|
case "KEY":
|
|
return Token{Type: TokenKey, Value: ident}
|
|
case "KEYS":
|
|
return Token{Type: TokenKeys, Value: ident}
|
|
case "GENERATE":
|
|
return Token{Type: TokenGenerate, Value: ident}
|
|
case "MODEL":
|
|
return Token{Type: TokenModel, Value: ident}
|
|
case "MODELS":
|
|
return Token{Type: TokenModels, Value: ident}
|
|
case "PROVIDER":
|
|
return Token{Type: TokenProvider, Value: ident}
|
|
case "PROVIDERS":
|
|
return Token{Type: TokenProviders, Value: ident}
|
|
case "DEFAULT":
|
|
return Token{Type: TokenDefault, Value: ident}
|
|
case "CHATS":
|
|
return Token{Type: TokenChats, Value: ident}
|
|
case "CHAT":
|
|
return Token{Type: TokenChat, Value: ident}
|
|
case "MESSAGE":
|
|
return Token{Type: TokenMessage, Value: ident}
|
|
case "IMAGE":
|
|
return Token{Type: TokenImage, Value: ident}
|
|
case "VIDEO":
|
|
return Token{Type: TokenVideo, Value: ident}
|
|
case "AUDIO":
|
|
return Token{Type: TokenAudio, Value: ident}
|
|
case "THINK":
|
|
return Token{Type: TokenThink, Value: ident}
|
|
case "EFFORT":
|
|
return Token{Type: TokenEffort, Value: ident}
|
|
case "VERBOSITY":
|
|
return Token{Type: TokenVerbosity, Value: ident}
|
|
case "NONE":
|
|
return Token{Type: TokenNone, Value: ident}
|
|
case "MINIMAL":
|
|
return Token{Type: TokenMinimal, Value: ident}
|
|
case "LOW":
|
|
return Token{Type: TokenLow, Value: ident}
|
|
case "MEDIUM":
|
|
return Token{Type: TokenMedium, Value: ident}
|
|
case "HIGH":
|
|
return Token{Type: TokenHigh, Value: ident}
|
|
case "MAX":
|
|
return Token{Type: TokenMax, Value: ident}
|
|
case "STREAM":
|
|
return Token{Type: TokenStream, Value: ident}
|
|
case "LS":
|
|
return Token{Type: TokenLS, Value: ident}
|
|
case "CAT":
|
|
return Token{Type: TokenCat, Value: ident}
|
|
case "FILES":
|
|
return Token{Type: TokenFiles, Value: ident}
|
|
case "AS":
|
|
return Token{Type: TokenAs, Value: ident}
|
|
case "PARSE":
|
|
return Token{Type: TokenParse, Value: ident}
|
|
case "IMPORT":
|
|
return Token{Type: TokenImport, Value: ident}
|
|
case "INTO":
|
|
return Token{Type: TokenInto, Value: ident}
|
|
case "WITH":
|
|
return Token{Type: TokenWith, Value: ident}
|
|
case "PARSER":
|
|
return Token{Type: TokenParser, Value: ident}
|
|
case "PIPELINE":
|
|
return Token{Type: TokenPipeline, Value: ident}
|
|
case "GET":
|
|
return Token{Type: TokenGet, Value: ident}
|
|
case "SEARCH":
|
|
return Token{Type: TokenSearch, Value: ident}
|
|
case "CURRENT":
|
|
return Token{Type: TokenCurrent, Value: ident}
|
|
case "VISION":
|
|
return Token{Type: TokenVision, Value: ident}
|
|
case "EMBEDDING":
|
|
return Token{Type: TokenEmbedding, Value: ident}
|
|
case "RERANK":
|
|
return Token{Type: TokenRerank, Value: ident}
|
|
case "ASR":
|
|
return Token{Type: TokenASR, Value: ident}
|
|
case "TTS":
|
|
return Token{Type: TokenTTS, Value: ident}
|
|
case "EMBED":
|
|
return Token{Type: TokenEmbed, Value: ident}
|
|
case "TEXT":
|
|
return Token{Type: TokenText, Value: ident}
|
|
case "QUERY":
|
|
return Token{Type: TokenQuery, Value: ident}
|
|
case "TOP":
|
|
return Token{Type: TokenTop, Value: ident}
|
|
case "DIMENSION":
|
|
return Token{Type: TokenDimension, Value: ident}
|
|
case "OCR":
|
|
return Token{Type: TokenOCR, Value: ident}
|
|
case "DOC_PARSE":
|
|
return Token{Type: TokenDocParse, Value: ident}
|
|
case "ASYNC":
|
|
return Token{Type: TokenAsync, Value: ident}
|
|
case "SYNC":
|
|
return Token{Type: TokenSync, Value: ident}
|
|
case "BENCHMARK":
|
|
return Token{Type: TokenBenchmark, Value: ident}
|
|
case "PING":
|
|
return Token{Type: TokenPing, Value: ident}
|
|
case "TOKEN":
|
|
return Token{Type: TokenToken, Value: ident}
|
|
case "TOKENS":
|
|
return Token{Type: TokenTokens, Value: ident}
|
|
case "INDEX":
|
|
return Token{Type: TokenIndex, Value: ident}
|
|
case "VECTOR":
|
|
return Token{Type: TokenVector, Value: ident}
|
|
case "SIZE":
|
|
return Token{Type: TokenSize, Value: ident}
|
|
case "METADATA":
|
|
return Token{Type: TokenMetadata, Value: ident}
|
|
case "TABLE":
|
|
return Token{Type: TokenTable, Value: ident}
|
|
case "AVAILABLE":
|
|
return Token{Type: TokenAvailable, Value: ident}
|
|
case "SUPPORTED":
|
|
return Token{Type: TokenSupported, Value: ident}
|
|
case "NAME":
|
|
return Token{Type: TokenName, Value: ident}
|
|
case "BALANCE":
|
|
return Token{Type: TokenBalance, Value: ident}
|
|
case "INSTANCE":
|
|
return Token{Type: TokenInstance, Value: ident}
|
|
case "INSTANCES":
|
|
return Token{Type: TokenInstances, Value: ident}
|
|
case "DISABLE":
|
|
return Token{Type: TokenDisable, Value: ident}
|
|
case "ENABLE":
|
|
return Token{Type: TokenEnable, Value: ident}
|
|
case "INSERT":
|
|
return Token{Type: TokenInsert, Value: ident}
|
|
case "FILE":
|
|
return Token{Type: TokenFile, Value: ident}
|
|
case "USE":
|
|
return Token{Type: TokenUse, Value: ident}
|
|
case "CHECK":
|
|
return Token{Type: TokenCheck, Value: ident}
|
|
case "UPDATE":
|
|
return Token{Type: TokenUpdate, Value: ident}
|
|
case "REMOVE":
|
|
return Token{Type: TokenRemove, Value: ident}
|
|
case "CHUNK":
|
|
return Token{Type: TokenChunk, Value: ident}
|
|
case "CHUNKS":
|
|
return Token{Type: TokenChunks, Value: ident}
|
|
case "DOCUMENT":
|
|
return Token{Type: TokenDocument, Value: ident}
|
|
case "DOCUMENTS":
|
|
return Token{Type: TokenDocuments, Value: ident}
|
|
case "TAGS":
|
|
return Token{Type: TokenTag, Value: ident}
|
|
case "REGION":
|
|
return Token{Type: TokenRegion, Value: ident}
|
|
case "URL":
|
|
return Token{Type: TokenURL, Value: ident}
|
|
case "TASK":
|
|
return Token{Type: TokenTask, Value: ident}
|
|
case "TASKS":
|
|
return Token{Type: TokenTasks, Value: ident}
|
|
case "LOG":
|
|
return Token{Type: TokenLog, Value: ident}
|
|
case "LEVEL":
|
|
return Token{Type: TokenLevel, Value: ident}
|
|
case "DEBUG":
|
|
return Token{Type: TokenDebug, Value: ident}
|
|
case "INFO":
|
|
return Token{Type: TokenInfo, Value: ident}
|
|
case "WARN":
|
|
return Token{Type: TokenWarn, Value: ident}
|
|
case "ERROR":
|
|
return Token{Type: TokenError, Value: ident}
|
|
case "FATAL":
|
|
return Token{Type: TokenFatal, Value: ident}
|
|
case "PANIC":
|
|
return Token{Type: TokenPanic, Value: ident}
|
|
case "PARAM":
|
|
return Token{Type: TokenParam, Value: ident}
|
|
case "PLAY":
|
|
return Token{Type: TokenPlay, Value: ident}
|
|
case "FORMAT":
|
|
return Token{Type: TokenFormat, Value: ident}
|
|
case "SAVE":
|
|
return Token{Type: TokenSave, Value: ident}
|
|
default:
|
|
return Token{Type: TokenIdentifier, Value: ident}
|
|
}
|
|
}
|
|
|
|
// isLetter reports whether ch is an ASCII letter (A-Z or a-z).
//
// The lexer walks the input one byte at a time, so ch may be a fragment of a
// multi-byte UTF-8 sequence. Such bytes (>= 0x80) are rejected up front:
// converting one to a rune would reinterpret it as a Latin-1 code point, and
// unicode.IsLetter would then accept, for example, 0xC3 (the lead byte of
// "é") while rejecting its continuation byte, cutting the character in half.
func isLetter(ch byte) bool {
	return ch <= unicode.MaxASCII && unicode.IsLetter(rune(ch))
}
|
|
|
|
// isDigit reports whether ch is a decimal digit. For single-byte values
// unicode.IsDigit accepts exactly '0' through '9'.
func isDigit(ch byte) bool {
	r := rune(ch)
	return unicode.IsDigit(r)
}
|