mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-05-02 00:07:47 +08:00
RAGFlow go API server (#13240)
# RAGFlow Go Implementation Plan 🚀 This repository tracks the progress of porting RAGFlow to Go. We'll implement core features and provide performance comparisons between Python and Go versions. ## Implementation Checklist - [x] User Management APIs - [x] Dataset Management Operations - [x] Retrieval Test - [x] Chat Management Operations - [x] Infinity Go SDK --------- Signed-off-by: Jin Hai <haijin.chn@gmail.com> Co-authored-by: Yingfeng Zhang <yingfeng.zhang@gmail.com>
This commit is contained in:
957
internal/cpp/re2/regexp.cc
Normal file
957
internal/cpp/re2/regexp.cc
Normal file
@ -0,0 +1,957 @@
|
||||
// Copyright 2006 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Regular expression representation.
|
||||
// Tested by parse_test.cc
|
||||
|
||||
#include "re2/regexp.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <map>
|
||||
#include <mutex>
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "re2/pod_array.h"
|
||||
#include "re2/stringpiece.h"
|
||||
#include "re2/walker-inl.h"
|
||||
#include "util/logging.h"
|
||||
#include "util/mutex.h"
|
||||
#include "util/utf.h"
|
||||
#include "util/util.h"
|
||||
|
||||
#ifdef min
|
||||
#undef min
|
||||
#endif
|
||||
#ifdef max
|
||||
#undef max
|
||||
#endif
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Constructor. Allocates vectors as appropriate for operator.
|
||||
// Initializes a node for operator `op` with the given parse flags.
// The node starts with one reference (ref_ = 1), no sub-expressions
// (nsub_ = 0) and a zero-filled operator-specific payload.
Regexp::Regexp(RegexpOp op, ParseFlags parse_flags)
    : op_(static_cast<uint8_t>(op)),
      simple_(false),
      parse_flags_(static_cast<uint16_t>(parse_flags)),
      ref_(1),
      nsub_(0),
      down_(nullptr) {
    subone_ = nullptr;
    // Clear the whole payload union so every variant reads as zero/null.
    memset(arguments.the_union_, 0, sizeof(arguments.the_union_));
}
|
||||
|
||||
// Destructor. Assumes already cleaned up children.
|
||||
// Private: use Decref() instead of delete to destroy Regexps.
|
||||
// Can't call Decref on the sub-Regexps here because
|
||||
// that could cause arbitrarily deep recursion, so
|
||||
// we require Decref() to have handled them for us.
|
||||
Regexp::~Regexp() {
    // Sub-expressions must already have been released (Destroy() resets
    // nsub_ to 0 before deleting a node); complain if any are left.
    if (nsub_ > 0)
        LOG(DFATAL) << "Regexp not destroyed.";

    // Free whatever the operator-specific payload owns; other operators
    // own nothing that needs releasing here.
    if (op_ == kRegexpCapture) {
        delete arguments.capture.name_;
    } else if (op_ == kRegexpLiteralString) {
        delete[] arguments.literal_string.runes_;
    } else if (op_ == kRegexpCharClass) {
        // The finished class has its own Delete(); the builder is a plain
        // heap object.
        if (arguments.char_class.cc_ != nullptr)
            arguments.char_class.cc_->Delete();
        delete arguments.char_class.ccb_;
    }
}
|
||||
|
||||
// If it's possible to destroy this regexp without recurring,
|
||||
// do so and return true. Else return false.
|
||||
bool Regexp::QuickDestroy() {
|
||||
if (nsub_ == 0) {
|
||||
delete this;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Similar to EmptyStorage in re2.cc.
|
||||
struct RefStorage {
|
||||
Mutex ref_mutex;
|
||||
std::map<Regexp *, int> ref_map;
|
||||
};
|
||||
alignas(RefStorage) static char ref_storage[sizeof(RefStorage)];
|
||||
|
||||
// Returns the mutex stored inside ref_storage.
static inline Mutex *ref_mutex() {
    RefStorage *storage = reinterpret_cast<RefStorage *>(ref_storage);
    return &storage->ref_mutex;
}
|
||||
|
||||
static inline std::map<Regexp *, int> *ref_map() { return &reinterpret_cast<RefStorage *>(ref_storage)->ref_map; }
|
||||
|
||||
int Regexp::Ref() {
|
||||
if (ref_ < kMaxRef)
|
||||
return ref_;
|
||||
|
||||
MutexLock l(ref_mutex());
|
||||
return (*ref_map())[this];
|
||||
}
|
||||
|
||||
// Increments reference count, returns object as convenience.
|
||||
Regexp *Regexp::Incref() {
    // Common case: the in-object counter still has room.
    if (ref_ < kMaxRef - 1) {
        ref_++;
        return this;
    }

    // The counter is about to saturate (or already has). Make sure the
    // overflow storage has been constructed, exactly once.
    static std::once_flag ref_once;
    std::call_once(ref_once, []() { (void)new (ref_storage) RefStorage; });

    // From here on the real count is kept in the overflow map.
    MutexLock l(ref_mutex());
    if (ref_ != kMaxRef) {
        // overflowing now
        (*ref_map())[this] = kMaxRef;
        ref_ = kMaxRef;
    } else {
        // already overflowed
        (*ref_map())[this]++;
    }
    return this;
}
|
||||
|
||||
// Decrements reference count and deletes this object if count reaches 0.
|
||||
void Regexp::Decref() {
|
||||
if (ref_ == kMaxRef) {
|
||||
// Ref count is stored in overflow map.
|
||||
MutexLock l(ref_mutex());
|
||||
int r = (*ref_map())[this] - 1;
|
||||
if (r < kMaxRef) {
|
||||
ref_ = static_cast<uint16_t>(r);
|
||||
ref_map()->erase(this);
|
||||
} else {
|
||||
(*ref_map())[this] = r;
|
||||
}
|
||||
return;
|
||||
}
|
||||
ref_--;
|
||||
if (ref_ == 0)
|
||||
Destroy();
|
||||
}
|
||||
|
||||
// Deletes this object; the ref count has reached 0.
|
||||
void Regexp::Destroy() {
|
||||
if (QuickDestroy())
|
||||
return;
|
||||
|
||||
// Handle recursive Destroy with explicit stack
|
||||
// to avoid arbitrarily deep recursion on process stack [sigh].
|
||||
down_ = NULL;
|
||||
Regexp *stack = this;
|
||||
while (stack != NULL) {
|
||||
Regexp *re = stack;
|
||||
stack = re->down_;
|
||||
if (re->ref_ != 0)
|
||||
LOG(DFATAL) << "Bad reference count " << re->ref_;
|
||||
if (re->nsub_ > 0) {
|
||||
Regexp **subs = re->sub();
|
||||
for (int i = 0; i < re->nsub_; i++) {
|
||||
Regexp *sub = subs[i];
|
||||
if (sub == NULL)
|
||||
continue;
|
||||
if (sub->ref_ == kMaxRef)
|
||||
sub->Decref();
|
||||
else
|
||||
--sub->ref_;
|
||||
if (sub->ref_ == 0 && !sub->QuickDestroy()) {
|
||||
sub->down_ = stack;
|
||||
stack = sub;
|
||||
}
|
||||
}
|
||||
if (re->nsub_ > 1)
|
||||
delete[] subs;
|
||||
re->nsub_ = 0;
|
||||
}
|
||||
delete re;
|
||||
}
|
||||
}
|
||||
|
||||
// Appends rune `r` to this literal-string node, growing the rune buffer
// as needed.
void Regexp::AddRuneToString(Rune r) {
    DCHECK(op_ == kRegexpLiteralString);

    const int count = arguments.literal_string.nrunes_;
    const bool at_power_of_two = count >= 8 && (count & (count - 1)) == 0;
    if (count == 0) {
        // First rune: start with room for 8.
        arguments.literal_string.runes_ = new Rune[8];
    } else if (at_power_of_two) {
        // Buffer is full (sizes are 8, 16, 32, ...): double it and carry
        // the existing runes over.
        Rune *previous = arguments.literal_string.runes_;
        Rune *grown = new Rune[count * 2];
        for (int k = 0; k < count; ++k)
            grown[k] = previous[k];
        arguments.literal_string.runes_ = grown;
        delete[] previous;
    }

    arguments.literal_string.runes_[count] = r;
    arguments.literal_string.nrunes_ = count + 1;
}
|
||||
|
||||
// Creates a new kRegexpHaveMatch node carrying `match_id`.
Regexp *Regexp::HaveMatch(int match_id, ParseFlags flags) {
    Regexp *node = new Regexp(kRegexpHaveMatch, flags);
    node->arguments.match_id_ = match_id;
    return node;
}
|
||||
|
||||
// Wraps `sub` in a Star/Plus/Quest node (`op`), collapsing directly
// stacked repetition operators that carry the same flags.
Regexp *Regexp::StarPlusOrQuest(RegexpOp op, Regexp *sub, ParseFlags flags) {
    const RegexpOp inner = sub->op();
    if (flags == sub->parse_flags()) {
        // Squash **, ++ and ??.
        if (inner == op)
            return sub;

        // Squash *+, *?, +*, +?, ?* and ?+. They all squash to *.
        // If sub is already a Star there is nothing to rewrite.
        if (inner == kRegexpStar)
            return sub;

        if (inner == kRegexpPlus || inner == kRegexpQuest) {
            // Rewrite sub to Star around sub's own operand.
            Regexp *star = new Regexp(kRegexpStar, flags);
            star->AllocSub(1);
            star->sub()[0] = sub->sub()[0]->Incref();
            sub->Decref();  // We didn't consume the reference after all.
            return star;
        }
    }

    // General case: a fresh node with sub as its only operand.
    Regexp *node = new Regexp(op, flags);
    node->AllocSub(1);
    node->sub()[0] = sub;
    return node;
}
|
||||
|
||||
// Builds a kRegexpPlus node over `sub` via StarPlusOrQuest().
Regexp *Regexp::Plus(Regexp *sub, ParseFlags flags) {
    return StarPlusOrQuest(kRegexpPlus, sub, flags);
}
|
||||
|
||||
// Builds a kRegexpStar node over `sub` via StarPlusOrQuest().
Regexp *Regexp::Star(Regexp *sub, ParseFlags flags) {
    return StarPlusOrQuest(kRegexpStar, sub, flags);
}
|
||||
|
||||
// Builds a kRegexpQuest node over `sub` via StarPlusOrQuest().
Regexp *Regexp::Quest(Regexp *sub, ParseFlags flags) {
    return StarPlusOrQuest(kRegexpQuest, sub, flags);
}
|
||||
|
||||
// Builds a concatenation or alternation (`op`) of the `nsub` expressions
// in `sub`. When `can_factor` is set and `op` is an alternation, the
// operands are first run through FactorAlternation().
Regexp *Regexp::ConcatOrAlternate(RegexpOp op, Regexp **sub, int nsub, ParseFlags flags, bool can_factor) {
    // A single operand needs no wrapper node.
    if (nsub == 1)
        return sub[0];

    // No operands: an empty alternation becomes NoMatch, anything else
    // becomes EmptyMatch.
    if (nsub == 0)
        return new Regexp(op == kRegexpAlternate ? kRegexpNoMatch : kRegexpEmptyMatch, flags);

    PODArray<Regexp *> scratch;
    if (can_factor && op == kRegexpAlternate) {
        // Going to edit sub; make a copy so we don't step on caller.
        scratch = PODArray<Regexp *>(nsub);
        memmove(scratch.data(), sub, nsub * sizeof sub[0]);
        sub = scratch.data();
        nsub = FactorAlternation(sub, nsub, flags);
        if (nsub == 1)
            return sub[0];
    }

    if (nsub > kMaxNsub) {
        // Too many subexpressions to fit in a single Regexp.
        // Make a two-level tree: every chunk but the last holds kMaxNsub
        // operands, the last one holds the remainder.
        const int nchunks = (nsub + kMaxNsub - 1) / kMaxNsub;
        Regexp *parent = new Regexp(op, flags);
        parent->AllocSub(nchunks);
        Regexp **slots = parent->sub();
        int start = 0;
        for (int c = 0; c < nchunks; ++c) {
            const int len = (c + 1 < nchunks) ? kMaxNsub : nsub - start;
            slots[c] = ConcatOrAlternate(op, sub + start, len, flags, false);
            start += kMaxNsub;
        }
        return parent;
    }

    // Ordinary case: one node holding all operands in order.
    Regexp *node = new Regexp(op, flags);
    node->AllocSub(nsub);
    Regexp **dst = node->sub();
    Regexp **src = sub;
    Regexp **const src_end = sub + nsub;
    while (src != src_end)
        *dst++ = *src++;
    return node;
}
|
||||
|
||||
// Concatenation of the `nsub` expressions in `sub`; never factored.
Regexp *Regexp::Concat(Regexp **sub, int nsub, ParseFlags flags) {
    const bool can_factor = false;
    return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, can_factor);
}
|
||||
|
||||
// Alternation of the `nsub` expressions in `sub`, with factoring enabled.
Regexp *Regexp::Alternate(Regexp **sub, int nsub, ParseFlags flags) {
    const bool can_factor = true;
    return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, can_factor);
}
|
||||
|
||||
// Alternation of the `nsub` expressions in `sub`, with factoring disabled.
Regexp *Regexp::AlternateNoFactor(Regexp **sub, int nsub, ParseFlags flags) {
    const bool can_factor = false;
    return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, can_factor);
}
|
||||
|
||||
// Creates a capture node with index `cap` whose only operand is `sub`.
Regexp *Regexp::Capture(Regexp *sub, ParseFlags flags, int cap) {
    Regexp *group = new Regexp(kRegexpCapture, flags);
    group->arguments.capture.cap_ = cap;
    group->AllocSub(1);
    group->sub()[0] = sub;
    return group;
}
|
||||
|
||||
// Creates a repeat node over `sub` recording the bounds `min` and `max`.
Regexp *Regexp::Repeat(Regexp *sub, ParseFlags flags, int min, int max) {
    Regexp *rep = new Regexp(kRegexpRepeat, flags);
    rep->arguments.repeat.min_ = min;
    rep->arguments.repeat.max_ = max;
    rep->AllocSub(1);
    rep->sub()[0] = sub;
    return rep;
}
|
||||
|
||||
// Creates a single-rune literal node.
Regexp *Regexp::NewLiteral(Rune rune, ParseFlags flags) {
    Regexp *lit = new Regexp(kRegexpLiteral, flags);
    lit->arguments.rune_ = rune;
    return lit;
}
|
||||
|
||||
// Creates a node matching the `nrunes` runes in `runes`: an empty match
// for no runes, a plain literal for one, a literal string otherwise.
Regexp *Regexp::LiteralString(Rune *runes, int nrunes, ParseFlags flags) {
    if (nrunes <= 0)
        return new Regexp(kRegexpEmptyMatch, flags);
    if (nrunes == 1)
        return NewLiteral(runes[0], flags);

    Regexp *str = new Regexp(kRegexpLiteralString, flags);
    Rune *const stop = runes + nrunes;
    for (Rune *cur = runes; cur != stop; ++cur)
        str->AddRuneToString(*cur);
    return str;
}
|
||||
|
||||
// Creates a character-class node that stores `cc` as its class.
Regexp *Regexp::NewCharClass(CharClass *cc, ParseFlags flags) {
    Regexp *node = new Regexp(kRegexpCharClass, flags);
    node->arguments.char_class.cc_ = cc;
    return node;
}
|
||||
|
||||
// Exchanges the complete object representations of *this and *that.
void Regexp::Swap(Regexp *that) {
    // Regexp is not trivially copyable, so we cannot freely copy it with
    // memmove(3), but swapping objects like so is safe for our purposes.
    const size_t nbytes = sizeof *this;
    char scratch[sizeof *this];
    void *mine = reinterpret_cast<void *>(this);
    void *theirs = reinterpret_cast<void *>(that);

    // Classic three-step exchange through the stack buffer.
    memmove(scratch, mine, nbytes);
    memmove(mine, theirs, nbytes);
    memmove(theirs, scratch, nbytes);
}
|
||||
|
||||
// Tests equality of all top-level structure but not subregexps.
|
||||
// Compares only the top-level nodes of `a` and `b`: the operator plus
// whatever per-operator data (flags, runes, bounds, capture index/name,
// character class ranges) distinguishes two nodes of that operator.
// Sub-expressions are NOT compared; for Concat/Alternate only the number
// of operands is checked. Both pointers must be non-null.
static bool TopEqual(Regexp *a, Regexp *b) {
    if (a->op() != b->op())
        return false;

    switch (a->op()) {
        case kRegexpNoMatch:
        case kRegexpEmptyMatch:
        case kRegexpAnyChar:
        case kRegexpAnyByte:
        case kRegexpBeginLine:
        case kRegexpEndLine:
        case kRegexpWordBoundary:
        case kRegexpNoWordBoundary:
        case kRegexpBeginText:
            // Nothing beyond the operator itself to compare.
            return true;

        case kRegexpEndText:
            // The parse flags remember whether it's \z or (?-m:$),
            // which matters when testing against PCRE.
            return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == 0;

        case kRegexpLiteral:
            return a->rune() == b->rune() && ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0;

        case kRegexpLiteralString:
            return a->nrunes() == b->nrunes() && ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0 &&
                   memcmp(a->runes(), b->runes(), a->nrunes() * sizeof a->runes()[0]) == 0;

        case kRegexpAlternate:
        case kRegexpConcat:
            return a->nsub() == b->nsub();

        case kRegexpStar:
        case kRegexpPlus:
        case kRegexpQuest:
            return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0;

        case kRegexpRepeat:
            return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0 && a->min() == b->min() && a->max() == b->max();

        case kRegexpCapture:
            if (a->cap() != b->cap())
                return false;
            // Names are heap-allocated strings, so comparing the pointers
            // would only ever match a node against itself.
            if (a->name() == NULL || b->name() == NULL) {
                // One pointer is null, so the other must be null as well.
                return a->name() == b->name();
            }
            // Neither pointer is null: compare the names themselves.
            return *a->name() == *b->name();

        case kRegexpHaveMatch:
            return a->match_id() == b->match_id();

        case kRegexpCharClass: {
            // Same rune count, same number of ranges, identical range bytes.
            CharClass *acc = a->cc();
            CharClass *bcc = b->cc();
            return acc->size() == bcc->size() && acc->end() - acc->begin() == bcc->end() - bcc->begin() &&
                   memcmp(acc->begin(), bcc->begin(), (acc->end() - acc->begin()) * sizeof acc->begin()[0]) == 0;
        }
    }

    LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op();
    return false;
}
|
||||
|
||||
// Returns whether the trees rooted at `a` and `b` are structurally equal.
// Two null pointers are equal; a null and a non-null pointer are not.
bool Regexp::Equal(Regexp *a, Regexp *b) {
    if (a == nullptr || b == nullptr)
        return a == b;

    if (!TopEqual(a, b))
        return false;

    // Fast path:
    // return without allocating vector if there are no subregexps.
    switch (a->op()) {
        default:
            return true;

        case kRegexpAlternate:
        case kRegexpConcat:
        case kRegexpStar:
        case kRegexpPlus:
        case kRegexpQuest:
        case kRegexpRepeat:
        case kRegexpCapture:
            break;
    }

    // Committed to doing real work.
    // `pending` holds pairs of regexps (left, then right) still waiting
    // to be compared. The regexps are only equal if all the pairs end up
    // being equal.
    std::vector<Regexp *> pending;

    for (;;) {
        // Invariant: TopEqual(a, b) == true.
        const RegexpOp op = a->op();
        const bool has_one_sub = op == kRegexpStar || op == kRegexpPlus || op == kRegexpQuest || op == kRegexpRepeat ||
                                 op == kRegexpCapture;
        const bool has_many_subs = op == kRegexpAlternate || op == kRegexpConcat;

        if (has_one_sub) {
            Regexp *left = a->sub()[0];
            Regexp *right = b->sub()[0];
            if (!TopEqual(left, right))
                return false;
            // Rather than pushing the pair and popping it straight back,
            // descend into it directly.
            a = left;
            b = right;
            continue;
        }

        if (has_many_subs) {
            const int count = a->nsub();
            for (int i = 0; i < count; ++i) {
                Regexp *left = a->sub()[i];
                Regexp *right = b->sub()[i];
                if (!TopEqual(left, right))
                    return false;
                pending.push_back(left);
                pending.push_back(right);
            }
        }

        // Take the next outstanding pair, if any.
        const size_t n = pending.size();
        if (n == 0)
            break;

        DCHECK_GE(n, 2);
        b = pending[n - 1];
        a = pending[n - 2];
        pending.resize(n - 2);
    }

    return true;
}
|
||||
|
||||
// Keep in sync with enum RegexpStatusCode in regexp.h
|
||||
// Human-readable message for each status code, indexed by the code's
// numeric value (see RegexpStatus::CodeText below).
static const char *const kErrorStrings[] = {
    "no error",                             // 0
    "unexpected error",                     // 1
    "invalid escape sequence",              // 2
    "invalid character class",              // 3
    "invalid character class range",        // 4
    "missing ]",                            // 5
    "missing )",                            // 6
    "unexpected )",                         // 7
    "trailing \\",                          // 8
    "no argument for repetition operator",  // 9
    "invalid repetition size",              // 10
    "bad repetition operator",              // 11
    "invalid perl operator",                // 12
    "invalid UTF-8",                        // 13
    "invalid named capture group",          // 14
};
|
||||
|
||||
std::string RegexpStatus::CodeText(enum RegexpStatusCode code) {
|
||||
if (code < 0 || code >= arraysize(kErrorStrings))
|
||||
code = kRegexpInternalError;
|
||||
return kErrorStrings[code];
|
||||
}
|
||||
|
||||
// Returns the message for this status: the code text alone, or
// "<code text>: <error argument>" when an error argument is present.
std::string RegexpStatus::Text() const {
    std::string text = CodeText(code_);
    if (!error_arg_.empty()) {
        text.append(": ");
        text.append(error_arg_.data(), error_arg_.size());
    }
    return text;
}
|
||||
|
||||
// Makes this status a copy of `status` (code and error argument).
void RegexpStatus::Copy(const RegexpStatus &status) {
    error_arg_ = status.error_arg_;
    code_ = status.code_;
}
|
||||
|
||||
// Placeholder value type for walkers that produce no per-node result
// (Walker<void> doesn't exist).
using Ignored = int;
|
||||
|
||||
// Walker subclass to count capturing parens in regexp.
|
||||
class NumCapturesWalker : public Regexp::Walker<Ignored> {
public:
    NumCapturesWalker() : ncapture_(0) {}

    // Number of capture nodes counted so far.
    int ncapture() { return ncapture_; }

    // Counts `re` if it is a capture node; the walk value is passed
    // through unchanged and `stop` is never set.
    virtual Ignored PreVisit(Regexp *re, Ignored ignored, bool *stop) {
        const bool is_capture = (re->op() == kRegexpCapture);
        if (is_capture)
            ++ncapture_;
        return ignored;
    }

    virtual Ignored ShortVisit(Regexp *re, Ignored ignored) {
        // Should never be called: we use Walk(), not WalkExponential().
#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
        LOG(DFATAL) << "NumCapturesWalker::ShortVisit called";
#endif
        return ignored;
    }

private:
    int ncapture_;  // running count of capture nodes

    // Not copyable.
    NumCapturesWalker(const NumCapturesWalker &) = delete;
    NumCapturesWalker &operator=(const NumCapturesWalker &) = delete;
};
|
||||
|
||||
int Regexp::NumCaptures() {
|
||||
NumCapturesWalker w;
|
||||
w.Walk(this, 0);
|
||||
return w.ncapture();
|
||||
}
|
||||
|
||||
// Walker class to build map of named capture groups and their indices.
|
||||
class NamedCapturesWalker : public Regexp::Walker<Ignored> {
public:
    NamedCapturesWalker() : map_(nullptr) {}
    ~NamedCapturesWalker() { delete map_; }

    // Hands the map (possibly null if no named group was seen) over to
    // the caller; the walker no longer owns it afterwards.
    std::map<std::string, int> *TakeMap() {
        std::map<std::string, int> *taken = map_;
        map_ = nullptr;
        return taken;
    }

    virtual Ignored PreVisit(Regexp *re, Ignored ignored, bool *stop) {
        // Only named capture groups are of interest.
        if (re->op() != kRegexpCapture || re->name() == nullptr)
            return ignored;

        // Allocate map once we find a name.
        if (map_ == nullptr)
            map_ = new std::map<std::string, int>;

        // Record first occurrence of each name: insert() leaves an
        // existing entry alone, so if the same name appears multiple
        // times only the first one visited counts.
        map_->insert(std::map<std::string, int>::value_type(*re->name(), re->cap()));
        return ignored;
    }

    virtual Ignored ShortVisit(Regexp *re, Ignored ignored) {
        // Should never be called: we use Walk(), not WalkExponential().
#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
        LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called";
#endif
        return ignored;
    }

private:
    std::map<std::string, int> *map_;  // name -> capture index; lazily created

    // Not copyable.
    NamedCapturesWalker(const NamedCapturesWalker &) = delete;
    NamedCapturesWalker &operator=(const NamedCapturesWalker &) = delete;
};
|
||||
|
||||
std::map<std::string, int> *Regexp::NamedCaptures() {
|
||||
NamedCapturesWalker w;
|
||||
w.Walk(this, 0);
|
||||
return w.TakeMap();
|
||||
}
|
||||
|
||||
// Walker class to build map from capture group indices to their names.
|
||||
class CaptureNamesWalker : public Regexp::Walker<Ignored> {
public:
    CaptureNamesWalker() : map_(nullptr) {}
    ~CaptureNamesWalker() { delete map_; }

    // Hands the map (possibly null if no named group was seen) over to
    // the caller; the walker no longer owns it afterwards.
    std::map<int, std::string> *TakeMap() {
        std::map<int, std::string> *taken = map_;
        map_ = nullptr;
        return taken;
    }

    virtual Ignored PreVisit(Regexp *re, Ignored ignored, bool *stop) {
        // Only named capture groups are of interest.
        if (re->op() != kRegexpCapture || re->name() == nullptr)
            return ignored;

        // Allocate map once we find a name.
        if (map_ == nullptr)
            map_ = new std::map<int, std::string>;

        (*map_)[re->cap()] = *re->name();
        return ignored;
    }

    virtual Ignored ShortVisit(Regexp *re, Ignored ignored) {
        // Should never be called: we use Walk(), not WalkExponential().
#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
        LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called";
#endif
        return ignored;
    }

private:
    std::map<int, std::string> *map_;  // capture index -> name; lazily created

    // Not copyable.
    CaptureNamesWalker(const CaptureNamesWalker &) = delete;
    CaptureNamesWalker &operator=(const CaptureNamesWalker &) = delete;
};
|
||||
|
||||
std::map<int, std::string> *Regexp::CaptureNames() {
|
||||
CaptureNamesWalker w;
|
||||
w.Walk(this, 0);
|
||||
return w.TakeMap();
|
||||
}
|
||||
|
||||
void ConvertRunesToBytes(bool latin1, Rune *runes, int nrunes, std::string *bytes) {
|
||||
if (latin1) {
|
||||
bytes->resize(nrunes);
|
||||
for (int i = 0; i < nrunes; i++)
|
||||
(*bytes)[i] = static_cast<char>(runes[i]);
|
||||
} else {
|
||||
bytes->resize(nrunes * UTFmax); // worst case
|
||||
char *p = &(*bytes)[0];
|
||||
for (int i = 0; i < nrunes; i++)
|
||||
p += runetochar(p, &runes[i]);
|
||||
bytes->resize(p - &(*bytes)[0]);
|
||||
bytes->shrink_to_fit();
|
||||
}
|
||||
}
|
||||
|
||||
// Determines whether regexp matches must be anchored
|
||||
// with a fixed string prefix. If so, returns the prefix and
|
||||
// the regexp that remains after the prefix. The prefix might
|
||||
// be ASCII case-insensitive.
|
||||
bool Regexp::RequiredPrefix(std::string *prefix, bool *foldcase, Regexp **suffix) {
|
||||
prefix->clear();
|
||||
*foldcase = false;
|
||||
*suffix = NULL;
|
||||
|
||||
// No need for a walker: the regexp must be of the form
|
||||
// 1. some number of ^ anchors
|
||||
// 2. a literal char or string
|
||||
// 3. the rest
|
||||
if (op_ != kRegexpConcat)
|
||||
return false;
|
||||
int i = 0;
|
||||
while (i < nsub_ && sub()[i]->op_ == kRegexpBeginText)
|
||||
i++;
|
||||
if (i == 0 || i >= nsub_)
|
||||
return false;
|
||||
Regexp *re = sub()[i];
|
||||
if (re->op_ != kRegexpLiteral && re->op_ != kRegexpLiteralString)
|
||||
return false;
|
||||
i++;
|
||||
if (i < nsub_) {
|
||||
for (int j = i; j < nsub_; j++)
|
||||
sub()[j]->Incref();
|
||||
*suffix = Concat(sub() + i, nsub_ - i, parse_flags());
|
||||
} else {
|
||||
*suffix = new Regexp(kRegexpEmptyMatch, parse_flags());
|
||||
}
|
||||
|
||||
bool latin1 = (re->parse_flags() & Latin1) != 0;
|
||||
Rune *runes = re->op_ == kRegexpLiteral ? &re->arguments.rune_ : re->arguments.literal_string.runes_;
|
||||
int nrunes = re->op_ == kRegexpLiteral ? 1 : re->arguments.literal_string.nrunes_;
|
||||
ConvertRunesToBytes(latin1, runes, nrunes, prefix);
|
||||
*foldcase = (re->parse_flags() & FoldCase) != 0;
|
||||
return true;
|
||||
}
|
||||
|
||||
// Determines whether regexp matches must be unanchored
|
||||
// with a fixed string prefix. If so, returns the prefix.
|
||||
// The prefix might be ASCII case-insensitive.
|
||||
bool Regexp::RequiredPrefixForAccel(std::string *prefix, bool *foldcase) {
|
||||
prefix->clear();
|
||||
*foldcase = false;
|
||||
|
||||
// No need for a walker: the regexp must either begin with or be
|
||||
// a literal char or string. We "see through" capturing groups,
|
||||
// but make no effort to glue multiple prefix fragments together.
|
||||
Regexp *re = op_ == kRegexpConcat && nsub_ > 0 ? sub()[0] : this;
|
||||
while (re->op_ == kRegexpCapture) {
|
||||
re = re->sub()[0];
|
||||
if (re->op_ == kRegexpConcat && re->nsub_ > 0)
|
||||
re = re->sub()[0];
|
||||
}
|
||||
if (re->op_ != kRegexpLiteral && re->op_ != kRegexpLiteralString)
|
||||
return false;
|
||||
|
||||
bool latin1 = (re->parse_flags() & Latin1) != 0;
|
||||
Rune *runes = re->op_ == kRegexpLiteral ? &re->arguments.rune_ : re->arguments.literal_string.runes_;
|
||||
int nrunes = re->op_ == kRegexpLiteral ? 1 : re->arguments.literal_string.nrunes_;
|
||||
ConvertRunesToBytes(latin1, runes, nrunes, prefix);
|
||||
*foldcase = (re->parse_flags() & FoldCase) != 0;
|
||||
return true;
|
||||
}
|
||||
|
||||
// Character class builder is a balanced binary tree (STL set)
|
||||
// containing non-overlapping, non-abutting RuneRanges.
|
||||
// The less-than operator used in the tree treats two
|
||||
// ranges as equal if they overlap at all, so that
|
||||
// lookups for a particular Rune are possible.
|
||||
|
||||
// Starts out as the empty class: no runes and no ASCII letters recorded
// in either case bitmap.
CharClassBuilder::CharClassBuilder() {
    upper_ = 0;
    lower_ = 0;
    nrunes_ = 0;
}
|
||||
|
||||
// Add lo-hi to the class; return whether class got bigger.
|
||||
bool CharClassBuilder::AddRange(Rune lo, Rune hi) {
    // An inverted range adds nothing.
    if (hi < lo)
        return false;

    if (lo <= 'z' && hi >= 'A') {
        // Overlaps some alpha, maybe not all.
        // Update bitmaps telling which ASCII letters are in the set:
        // bit k of a bitmap stands for the k-th letter after `base`.
        auto letter_bits = [](Rune first, Rune last, Rune base) {
            return ((1 << (last - first + 1)) - 1) << (first - base);
        };

        const Rune upper_lo = std::max<Rune>(lo, 'A');
        const Rune upper_hi = std::min<Rune>(hi, 'Z');
        if (upper_lo <= upper_hi)
            upper_ |= letter_bits(upper_lo, upper_hi, 'A');

        const Rune lower_lo = std::max<Rune>(lo, 'a');
        const Rune lower_hi = std::min<Rune>(hi, 'z');
        if (lower_lo <= lower_hi)
            lower_ |= letter_bits(lower_lo, lower_hi, 'a');
    }

    // Check whether lo, hi is already in the class.
    {
        iterator covering = ranges_.find(RuneRange(lo, lo));
        if (covering != end() && covering->lo <= lo && hi <= covering->hi)
            return false;
    }

    // Look for a range abutting lo on the left.
    // If it exists, take it out and increase our range.
    if (lo > 0) {
        iterator left = ranges_.find(RuneRange(lo - 1, lo - 1));
        if (left != end()) {
            const RuneRange old = *left;
            ranges_.erase(left);
            nrunes_ -= old.hi - old.lo + 1;
            lo = old.lo;
            if (old.hi > hi)
                hi = old.hi;
        }
    }

    // Look for a range abutting hi on the right.
    // If it exists, take it out and increase our range.
    if (hi < Runemax) {
        iterator right = ranges_.find(RuneRange(hi + 1, hi + 1));
        if (right != end()) {
            const RuneRange old = *right;
            ranges_.erase(right);
            nrunes_ -= old.hi - old.lo + 1;
            hi = old.hi;
        }
    }

    // Look for ranges between lo and hi. Take them out.
    // This is only safe because the set has no overlapping ranges.
    // We've already removed any ranges abutting lo and hi, so
    // any that overlap [lo, hi] must be contained within it.
    iterator inside;
    while ((inside = ranges_.find(RuneRange(lo, hi))) != end()) {
        nrunes_ -= inside->hi - inside->lo + 1;
        ranges_.erase(inside);
    }

    // Finally, add [lo, hi].
    ranges_.insert(RuneRange(lo, hi));
    nrunes_ += hi - lo + 1;
    return true;
}
|
||||
|
||||
// Adds every range of `cc` to this class.
void CharClassBuilder::AddCharClass(CharClassBuilder *cc) {
    iterator cur = cc->begin();
    while (cur != cc->end()) {
        AddRange(cur->lo, cur->hi);
        ++cur;
    }
}
|
||||
|
||||
// Returns whether rune `r` is in the class, by looking up the
// one-rune range [r, r] in the range set.
bool CharClassBuilder::Contains(Rune r) {
    iterator hit = ranges_.find(RuneRange(r, r));
    return hit != end();
}
|
||||
|
||||
// Does the character class behave the same on A-Z as on a-z?
|
||||
// True when the upper-case and lower-case letter bitmaps agree on every
// bit selected by AlphaMask.
bool CharClassBuilder::FoldsASCII() {
    const auto differing = (upper_ ^ lower_) & AlphaMask;
    return differing == 0;
}
|
||||
|
||||
// Returns a newly allocated builder with the same ranges, letter
// bitmaps and rune count as this one. The caller owns the result.
CharClassBuilder *CharClassBuilder::Copy() {
    CharClassBuilder *dup = new CharClassBuilder;
    iterator cur = begin();
    while (cur != end()) {
        dup->ranges_.insert(RuneRange(cur->lo, cur->hi));
        ++cur;
    }
    dup->upper_ = upper_;
    dup->lower_ = lower_;
    dup->nrunes_ = nrunes_;
    return dup;
}
|
||||
|
||||
// Removes every rune greater than `r` from the class.
void CharClassBuilder::RemoveAbove(Rune r) {
    // Nothing lies above Runemax.
    if (r >= Runemax)
        return;

    // Trim the ASCII letter bitmaps: drop all letters when r is below the
    // first one, otherwise keep only the bits for letters <= r.
    if (r < 'a')
        lower_ = 0;
    else if (r < 'z')
        lower_ &= AlphaMask >> ('z' - r);

    if (r < 'A')
        upper_ = 0;
    else if (r < 'Z')
        upper_ &= AlphaMask >> ('Z' - r);

    // Repeatedly pull out a range overlapping [r + 1, Runemax]; if it also
    // reaches down to r or below, put back the part that survives.
    iterator hit;
    while ((hit = ranges_.find(RuneRange(r + 1, Runemax))) != end()) {
        RuneRange cut = *hit;
        ranges_.erase(hit);
        nrunes_ -= cut.hi - cut.lo + 1;
        if (cut.lo <= r) {
            cut.hi = r;
            ranges_.insert(cut);
            nrunes_ += cut.hi - cut.lo + 1;
        }
    }
}
|
||||
|
||||
void CharClassBuilder::Negate() {
|
||||
// Build up negation and then copy in.
|
||||
// Could edit ranges in place, but C++ won't let me.
|
||||
std::vector<RuneRange> v;
|
||||
v.reserve(ranges_.size() + 1);
|
||||
|
||||
// In negation, first range begins at 0, unless
|
||||
// the current class begins at 0.
|
||||
iterator it = begin();
|
||||
if (it == end()) {
|
||||
v.push_back(RuneRange(0, Runemax));
|
||||
} else {
|
||||
int nextlo = 0;
|
||||
if (it->lo == 0) {
|
||||
nextlo = it->hi + 1;
|
||||
++it;
|
||||
}
|
||||
for (; it != end(); ++it) {
|
||||
v.push_back(RuneRange(nextlo, it->lo - 1));
|
||||
nextlo = it->hi + 1;
|
||||
}
|
||||
if (nextlo <= Runemax)
|
||||
v.push_back(RuneRange(nextlo, Runemax));
|
||||
}
|
||||
|
||||
ranges_.clear();
|
||||
for (size_t i = 0; i < v.size(); i++)
|
||||
ranges_.insert(v[i]);
|
||||
|
||||
upper_ = AlphaMask & ~upper_;
|
||||
lower_ = AlphaMask & ~lower_;
|
||||
nrunes_ = Runemax + 1 - nrunes_;
|
||||
}
|
||||
|
||||
// Character class is a sorted list of ranges.
|
||||
// The ranges are allocated in the same block as the header,
|
||||
// necessitating a special allocator and Delete method.
|
||||
|
||||
// Allocates an empty CharClass with room for `maxranges` ranges. Header
// and range array share one uint8_t[] allocation, so the object must be
// released with Delete(), not with `delete`.
CharClass *CharClass::New(size_t maxranges) {
    const size_t header_bytes = sizeof(CharClass);
    const size_t range_bytes = maxranges * sizeof(RuneRange);
    uint8_t *block = new uint8_t[header_bytes + range_bytes];

    // The header sits at the start of the block; the ranges follow it.
    CharClass *cc = reinterpret_cast<CharClass *>(block);
    cc->ranges_ = reinterpret_cast<RuneRange *>(block + header_bytes);
    cc->nranges_ = 0;
    cc->folds_ascii_ = false;
    cc->nrunes_ = 0;
    return cc;
}
|
||||
|
||||
// Frees this object as the uint8_t[] block it was allocated as in New().
void CharClass::Delete() {
    uint8_t *block = reinterpret_cast<uint8_t *>(this);
    delete[] block;
}
|
||||
|
||||
// Returns a newly allocated class holding the complement of this one
// over [0, Runemax]. This object is left unchanged.
CharClass *CharClass::Negate() {
    // The complement has at most one more range than the original.
    CharClass *inverse = CharClass::New(static_cast<size_t>(nranges_ + 1));
    inverse->folds_ascii_ = folds_ascii_;
    inverse->nrunes_ = Runemax + 1 - nrunes_;

    int count = 0;    // ranges written so far
    int next_lo = 0;  // first rune not yet accounted for
    for (CharClass::iterator cur = begin(); cur != end(); ++cur) {
        // Emit the gap in front of this range, unless the range starts
        // exactly where the previous one left off.
        if (cur->lo != next_lo)
            inverse->ranges_[count++] = RuneRange(next_lo, cur->lo - 1);
        next_lo = cur->hi + 1;
    }
    // Trailing gap up to Runemax, if any.
    if (next_lo <= Runemax)
        inverse->ranges_[count++] = RuneRange(next_lo, Runemax);
    inverse->nranges_ = count;
    return inverse;
}
|
||||
|
||||
// Returns whether rune `r` falls inside one of the ranges, using a
// binary search over the range array.
bool CharClass::Contains(Rune r) const {
    int first = 0;         // index of the first candidate range
    int count = nranges_;  // number of candidate ranges left
    while (count > 0) {
        const int half = count / 2;
        const RuneRange &mid = ranges_[first + half];
        if (mid.hi < r) {
            // r lies above mid: continue in the upper part.
            first += half + 1;
            count -= half + 1;
        } else if (r < mid.lo) {
            // r lies below mid: continue in the lower part.
            count = half;
        } else {
            // mid.lo <= r && r <= mid.hi
            return true;
        }
    }
    return false;
}
|
||||
|
||||
// Returns a newly allocated CharClass holding a copy of the builder's
// ranges (in iteration order), rune count and ASCII-folding flag.
CharClass *CharClassBuilder::GetCharClass() {
    CharClass *frozen = CharClass::New(ranges_.size());

    int count = 0;
    iterator cur = begin();
    while (cur != end()) {
        frozen->ranges_[count] = *cur;
        ++count;
        ++cur;
    }
    frozen->nranges_ = count;
    DCHECK_LE(count, static_cast<int>(ranges_.size()));

    frozen->nrunes_ = nrunes_;
    frozen->folds_ascii_ = FoldsASCII();
    return frozen;
}
|
||||
|
||||
} // namespace re2
|
||||
Reference in New Issue
Block a user