mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-03-06 08:06:43 +08:00
# RAGFlow Go Implementation Plan 🚀 This repository tracks the progress of porting RAGFlow to Go. We'll implement core features and provide performance comparisons between Python and Go versions. ## Implementation Checklist - [x] User Management APIs - [x] Dataset Management Operations - [x] Retrieval Test - [x] Chat Management Operations - [x] Infinity Go SDK --------- Signed-off-by: Jin Hai <haijin.chn@gmail.com> Co-authored-by: Yingfeng Zhang <yingfeng.zhang@gmail.com>
630 lines
22 KiB
C++
630 lines
22 KiB
C++
// Copyright 2006 The RE2 Authors. All Rights Reserved.
|
|
// Use of this source code is governed by a BSD-style
|
|
// license that can be found in the LICENSE file.
|
|
|
|
// Rewrite POSIX and other features in re
|
|
// to use simple extended regular expression features.
|
|
// Also sort and simplify character classes.
|
|
|
|
#include <string>
|
|
|
|
#include "re2/pod_array.h"
|
|
#include "re2/regexp.h"
|
|
#include "re2/walker-inl.h"
|
|
#include "util/logging.h"
|
|
#include "util/utf.h"
|
|
#include "util/util.h"
|
|
|
|
namespace re2 {
|
|
|
|
// Parses the regexp src and then simplifies it and sets *dst to the
|
|
// string representation of the simplified form. Returns true on success.
|
|
// Returns false and sets *error (if error != NULL) on error.
|
|
bool Regexp::SimplifyRegexp(const StringPiece &src, ParseFlags flags, std::string *dst, RegexpStatus *status) {
|
|
Regexp *re = Parse(src, flags, status);
|
|
if (re == NULL)
|
|
return false;
|
|
Regexp *sre = re->Simplify();
|
|
re->Decref();
|
|
if (sre == NULL) {
|
|
if (status) {
|
|
status->set_code(kRegexpInternalError);
|
|
status->set_error_arg(src);
|
|
}
|
|
return false;
|
|
}
|
|
*dst = sre->ToString();
|
|
sre->Decref();
|
|
return true;
|
|
}
|
|
|
|
// Assuming the simple_ flags on the children are accurate,
|
|
// is this Regexp* simple?
|
|
bool Regexp::ComputeSimple() {
|
|
Regexp **subs;
|
|
switch (op_) {
|
|
case kRegexpNoMatch:
|
|
case kRegexpEmptyMatch:
|
|
case kRegexpLiteral:
|
|
case kRegexpLiteralString:
|
|
case kRegexpBeginLine:
|
|
case kRegexpEndLine:
|
|
case kRegexpBeginText:
|
|
case kRegexpWordBoundary:
|
|
case kRegexpNoWordBoundary:
|
|
case kRegexpEndText:
|
|
case kRegexpAnyChar:
|
|
case kRegexpAnyByte:
|
|
case kRegexpHaveMatch:
|
|
return true;
|
|
case kRegexpConcat:
|
|
case kRegexpAlternate:
|
|
// These are simple as long as the subpieces are simple.
|
|
subs = sub();
|
|
for (int i = 0; i < nsub_; i++)
|
|
if (!subs[i]->simple())
|
|
return false;
|
|
return true;
|
|
case kRegexpCharClass:
|
|
// Simple as long as the char class is not empty, not full.
|
|
if (arguments.char_class.ccb_ != NULL)
|
|
return !arguments.char_class.ccb_->empty() && !arguments.char_class.ccb_->full();
|
|
return !arguments.char_class.cc_->empty() && !arguments.char_class.cc_->full();
|
|
case kRegexpCapture:
|
|
subs = sub();
|
|
return subs[0]->simple();
|
|
case kRegexpStar:
|
|
case kRegexpPlus:
|
|
case kRegexpQuest:
|
|
subs = sub();
|
|
if (!subs[0]->simple())
|
|
return false;
|
|
switch (subs[0]->op_) {
|
|
case kRegexpStar:
|
|
case kRegexpPlus:
|
|
case kRegexpQuest:
|
|
case kRegexpEmptyMatch:
|
|
case kRegexpNoMatch:
|
|
return false;
|
|
default:
|
|
break;
|
|
}
|
|
return true;
|
|
case kRegexpRepeat:
|
|
return false;
|
|
}
|
|
LOG(DFATAL) << "Case not handled in ComputeSimple: " << op_;
|
|
return false;
|
|
}
|
|
|
|
// Walker subclass used by Simplify.
|
|
// Coalesces runs of star/plus/quest/repeat of the same literal along with any
|
|
// occurrences of that literal into repeats of that literal. It also works for
|
|
// char classes, any char and any byte.
|
|
// PostVisit creates the coalesced result, which should then be simplified.
|
|
class CoalesceWalker : public Regexp::Walker<Regexp *> {
|
|
public:
|
|
CoalesceWalker() {}
|
|
virtual Regexp *PostVisit(Regexp *re, Regexp *parent_arg, Regexp *pre_arg, Regexp **child_args, int nchild_args);
|
|
virtual Regexp *Copy(Regexp *re);
|
|
virtual Regexp *ShortVisit(Regexp *re, Regexp *parent_arg);
|
|
|
|
private:
|
|
// These functions are declared inside CoalesceWalker so that
|
|
// they can edit the private fields of the Regexps they construct.
|
|
|
|
// Returns true if r1 and r2 can be coalesced. In particular, ensures that
|
|
// the parse flags are consistent. (They will not be checked again later.)
|
|
static bool CanCoalesce(Regexp *r1, Regexp *r2);
|
|
|
|
// Coalesces *r1ptr and *r2ptr. In most cases, the array elements afterwards
|
|
// will be empty match and the coalesced op. In other cases, where part of a
|
|
// literal string was removed to be coalesced, the array elements afterwards
|
|
// will be the coalesced op and the remainder of the literal string.
|
|
static void DoCoalesce(Regexp **r1ptr, Regexp **r2ptr);
|
|
|
|
CoalesceWalker(const CoalesceWalker &) = delete;
|
|
CoalesceWalker &operator=(const CoalesceWalker &) = delete;
|
|
};
|
|
|
|
// Walker subclass used by Simplify.
|
|
// The simplify walk is purely post-recursive: given the simplified children,
|
|
// PostVisit creates the simplified result.
|
|
// The child_args are simplified Regexp*s.
|
|
class SimplifyWalker : public Regexp::Walker<Regexp *> {
|
|
public:
|
|
SimplifyWalker() {}
|
|
virtual Regexp *PreVisit(Regexp *re, Regexp *parent_arg, bool *stop);
|
|
virtual Regexp *PostVisit(Regexp *re, Regexp *parent_arg, Regexp *pre_arg, Regexp **child_args, int nchild_args);
|
|
virtual Regexp *Copy(Regexp *re);
|
|
virtual Regexp *ShortVisit(Regexp *re, Regexp *parent_arg);
|
|
|
|
private:
|
|
// These functions are declared inside SimplifyWalker so that
|
|
// they can edit the private fields of the Regexps they construct.
|
|
|
|
// Creates a concatenation of two Regexp, consuming refs to re1 and re2.
|
|
// Caller must Decref return value when done with it.
|
|
static Regexp *Concat2(Regexp *re1, Regexp *re2, Regexp::ParseFlags flags);
|
|
|
|
// Simplifies the expression re{min,max} in terms of *, +, and ?.
|
|
// Returns a new regexp. Does not edit re. Does not consume reference to re.
|
|
// Caller must Decref return value when done with it.
|
|
static Regexp *SimplifyRepeat(Regexp *re, int min, int max, Regexp::ParseFlags parse_flags);
|
|
|
|
// Simplifies a character class by expanding any named classes
|
|
// into rune ranges. Does not edit re. Does not consume ref to re.
|
|
// Caller must Decref return value when done with it.
|
|
static Regexp *SimplifyCharClass(Regexp *re);
|
|
|
|
SimplifyWalker(const SimplifyWalker &) = delete;
|
|
SimplifyWalker &operator=(const SimplifyWalker &) = delete;
|
|
};
|
|
|
|
// Simplifies a regular expression, returning a new regexp.
|
|
// The new regexp uses traditional Unix egrep features only,
|
|
// plus the Perl (?:) non-capturing parentheses.
|
|
// Otherwise, no POSIX or Perl additions. The new regexp
|
|
// captures exactly the same subexpressions (with the same indices)
|
|
// as the original.
|
|
// Does not edit current object.
|
|
// Caller must Decref() return value when done with it.
|
|
|
|
Regexp *Regexp::Simplify() {
|
|
CoalesceWalker cw;
|
|
Regexp *cre = cw.Walk(this, NULL);
|
|
if (cre == NULL)
|
|
return NULL;
|
|
if (cw.stopped_early()) {
|
|
cre->Decref();
|
|
return NULL;
|
|
}
|
|
SimplifyWalker sw;
|
|
Regexp *sre = sw.Walk(cre, NULL);
|
|
cre->Decref();
|
|
if (sre == NULL)
|
|
return NULL;
|
|
if (sw.stopped_early()) {
|
|
sre->Decref();
|
|
return NULL;
|
|
}
|
|
return sre;
|
|
}
|
|
|
|
#define Simplify DontCallSimplify // Avoid accidental recursion
|
|
|
|
// Utility function for PostVisit implementations that compares re->sub() with
|
|
// child_args to determine whether any child_args changed. In the common case,
|
|
// where nothing changed, calls Decref() for all child_args and returns false,
|
|
// so PostVisit must return re->Incref(). Otherwise, returns true.
|
|
static bool ChildArgsChanged(Regexp *re, Regexp **child_args) {
|
|
for (int i = 0; i < re->nsub(); i++) {
|
|
Regexp *sub = re->sub()[i];
|
|
Regexp *newsub = child_args[i];
|
|
if (newsub != sub)
|
|
return true;
|
|
}
|
|
for (int i = 0; i < re->nsub(); i++) {
|
|
Regexp *newsub = child_args[i];
|
|
newsub->Decref();
|
|
}
|
|
return false;
|
|
}
|
|
|
|
Regexp *CoalesceWalker::Copy(Regexp *re) { return re->Incref(); }
|
|
|
|
Regexp *CoalesceWalker::ShortVisit(Regexp *re, Regexp *parent_arg) {
|
|
// Should never be called: we use Walk(), not WalkExponential().
|
|
#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
|
|
LOG(DFATAL) << "CoalesceWalker::ShortVisit called";
|
|
#endif
|
|
return re->Incref();
|
|
}
|
|
|
|
Regexp *CoalesceWalker::PostVisit(Regexp *re, Regexp *parent_arg, Regexp *pre_arg, Regexp **child_args, int nchild_args) {
|
|
if (re->nsub() == 0)
|
|
return re->Incref();
|
|
|
|
if (re->op() != kRegexpConcat) {
|
|
if (!ChildArgsChanged(re, child_args))
|
|
return re->Incref();
|
|
|
|
// Something changed. Build a new op.
|
|
Regexp *nre = new Regexp(re->op(), re->parse_flags());
|
|
nre->AllocSub(re->nsub());
|
|
Regexp **nre_subs = nre->sub();
|
|
for (int i = 0; i < re->nsub(); i++)
|
|
nre_subs[i] = child_args[i];
|
|
// Repeats and Captures have additional data that must be copied.
|
|
if (re->op() == kRegexpRepeat) {
|
|
nre->arguments.repeat.min_ = re->min();
|
|
nre->arguments.repeat.max_ = re->max();
|
|
} else if (re->op() == kRegexpCapture) {
|
|
nre->arguments.capture.cap_ = re->cap();
|
|
}
|
|
return nre;
|
|
}
|
|
|
|
bool can_coalesce = false;
|
|
for (int i = 0; i < re->nsub(); i++) {
|
|
if (i + 1 < re->nsub() && CanCoalesce(child_args[i], child_args[i + 1])) {
|
|
can_coalesce = true;
|
|
break;
|
|
}
|
|
}
|
|
if (!can_coalesce) {
|
|
if (!ChildArgsChanged(re, child_args))
|
|
return re->Incref();
|
|
|
|
// Something changed. Build a new op.
|
|
Regexp *nre = new Regexp(re->op(), re->parse_flags());
|
|
nre->AllocSub(re->nsub());
|
|
Regexp **nre_subs = nre->sub();
|
|
for (int i = 0; i < re->nsub(); i++)
|
|
nre_subs[i] = child_args[i];
|
|
return nre;
|
|
}
|
|
|
|
for (int i = 0; i < re->nsub(); i++) {
|
|
if (i + 1 < re->nsub() && CanCoalesce(child_args[i], child_args[i + 1]))
|
|
DoCoalesce(&child_args[i], &child_args[i + 1]);
|
|
}
|
|
// Determine how many empty matches were left by DoCoalesce.
|
|
int n = 0;
|
|
for (int i = n; i < re->nsub(); i++) {
|
|
if (child_args[i]->op() == kRegexpEmptyMatch)
|
|
n++;
|
|
}
|
|
// Build a new op.
|
|
Regexp *nre = new Regexp(re->op(), re->parse_flags());
|
|
nre->AllocSub(re->nsub() - n);
|
|
Regexp **nre_subs = nre->sub();
|
|
for (int i = 0, j = 0; i < re->nsub(); i++) {
|
|
if (child_args[i]->op() == kRegexpEmptyMatch) {
|
|
child_args[i]->Decref();
|
|
continue;
|
|
}
|
|
nre_subs[j] = child_args[i];
|
|
j++;
|
|
}
|
|
return nre;
|
|
}
|
|
|
|
bool CoalesceWalker::CanCoalesce(Regexp *r1, Regexp *r2) {
|
|
// r1 must be a star/plus/quest/repeat of a literal, char class, any char or
|
|
// any byte.
|
|
if ((r1->op() == kRegexpStar || r1->op() == kRegexpPlus || r1->op() == kRegexpQuest || r1->op() == kRegexpRepeat) &&
|
|
(r1->sub()[0]->op() == kRegexpLiteral || r1->sub()[0]->op() == kRegexpCharClass || r1->sub()[0]->op() == kRegexpAnyChar ||
|
|
r1->sub()[0]->op() == kRegexpAnyByte)) {
|
|
// r2 must be a star/plus/quest/repeat of the same literal, char class,
|
|
// any char or any byte.
|
|
if ((r2->op() == kRegexpStar || r2->op() == kRegexpPlus || r2->op() == kRegexpQuest || r2->op() == kRegexpRepeat) &&
|
|
Regexp::Equal(r1->sub()[0], r2->sub()[0]) &&
|
|
// The parse flags must be consistent.
|
|
((r1->parse_flags() & Regexp::NonGreedy) == (r2->parse_flags() & Regexp::NonGreedy))) {
|
|
return true;
|
|
}
|
|
// ... OR an occurrence of that literal, char class, any char or any byte
|
|
if (Regexp::Equal(r1->sub()[0], r2)) {
|
|
return true;
|
|
}
|
|
// ... OR a literal string that begins with that literal.
|
|
if (r1->sub()[0]->op() == kRegexpLiteral && r2->op() == kRegexpLiteralString && r2->runes()[0] == r1->sub()[0]->rune() &&
|
|
// The parse flags must be consistent.
|
|
((r1->sub()[0]->parse_flags() & Regexp::FoldCase) == (r2->parse_flags() & Regexp::FoldCase))) {
|
|
return true;
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
void CoalesceWalker::DoCoalesce(Regexp **r1ptr, Regexp **r2ptr) {
|
|
Regexp *r1 = *r1ptr;
|
|
Regexp *r2 = *r2ptr;
|
|
|
|
Regexp *nre = Regexp::Repeat(r1->sub()[0]->Incref(), r1->parse_flags(), 0, 0);
|
|
|
|
switch (r1->op()) {
|
|
case kRegexpStar:
|
|
nre->arguments.repeat.min_ = 0;
|
|
nre->arguments.repeat.max_ = -1;
|
|
break;
|
|
|
|
case kRegexpPlus:
|
|
nre->arguments.repeat.min_ = 1;
|
|
nre->arguments.repeat.max_ = -1;
|
|
break;
|
|
|
|
case kRegexpQuest:
|
|
nre->arguments.repeat.min_ = 0;
|
|
nre->arguments.repeat.max_ = 1;
|
|
break;
|
|
|
|
case kRegexpRepeat:
|
|
nre->arguments.repeat.min_ = r1->min();
|
|
nre->arguments.repeat.max_ = r1->max();
|
|
break;
|
|
|
|
default:
|
|
nre->Decref();
|
|
LOG(DFATAL) << "DoCoalesce failed: r1->op() is " << r1->op();
|
|
return;
|
|
}
|
|
|
|
switch (r2->op()) {
|
|
case kRegexpStar:
|
|
nre->arguments.repeat.max_ = -1;
|
|
goto LeaveEmpty;
|
|
|
|
case kRegexpPlus:
|
|
nre->arguments.repeat.min_++;
|
|
nre->arguments.repeat.max_ = -1;
|
|
goto LeaveEmpty;
|
|
|
|
case kRegexpQuest:
|
|
if (nre->max() != -1)
|
|
nre->arguments.repeat.max_++;
|
|
goto LeaveEmpty;
|
|
|
|
case kRegexpRepeat:
|
|
nre->arguments.repeat.min_ += r2->min();
|
|
if (r2->max() == -1)
|
|
nre->arguments.repeat.max_ = -1;
|
|
else if (nre->max() != -1)
|
|
nre->arguments.repeat.max_ += r2->max();
|
|
goto LeaveEmpty;
|
|
|
|
case kRegexpLiteral:
|
|
case kRegexpCharClass:
|
|
case kRegexpAnyChar:
|
|
case kRegexpAnyByte:
|
|
nre->arguments.repeat.min_++;
|
|
if (nre->max() != -1)
|
|
nre->arguments.repeat.max_++;
|
|
goto LeaveEmpty;
|
|
|
|
LeaveEmpty:
|
|
*r1ptr = new Regexp(kRegexpEmptyMatch, Regexp::NoParseFlags);
|
|
*r2ptr = nre;
|
|
break;
|
|
|
|
case kRegexpLiteralString: {
|
|
Rune r = r1->sub()[0]->rune();
|
|
// Determine how much of the literal string is removed.
|
|
// We know that we have at least one rune. :)
|
|
int n = 1;
|
|
while (n < r2->nrunes() && r2->runes()[n] == r)
|
|
n++;
|
|
nre->arguments.repeat.min_ += n;
|
|
if (nre->max() != -1)
|
|
nre->arguments.repeat.max_ += n;
|
|
if (n == r2->nrunes())
|
|
goto LeaveEmpty;
|
|
*r1ptr = nre;
|
|
*r2ptr = Regexp::LiteralString(&r2->runes()[n], r2->nrunes() - n, r2->parse_flags());
|
|
break;
|
|
}
|
|
|
|
default:
|
|
nre->Decref();
|
|
LOG(DFATAL) << "DoCoalesce failed: r2->op() is " << r2->op();
|
|
return;
|
|
}
|
|
|
|
r1->Decref();
|
|
r2->Decref();
|
|
}
|
|
|
|
Regexp *SimplifyWalker::Copy(Regexp *re) { return re->Incref(); }
|
|
|
|
Regexp *SimplifyWalker::ShortVisit(Regexp *re, Regexp *parent_arg) {
|
|
// Should never be called: we use Walk(), not WalkExponential().
|
|
#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
|
|
LOG(DFATAL) << "SimplifyWalker::ShortVisit called";
|
|
#endif
|
|
return re->Incref();
|
|
}
|
|
|
|
Regexp *SimplifyWalker::PreVisit(Regexp *re, Regexp *parent_arg, bool *stop) {
|
|
if (re->simple()) {
|
|
*stop = true;
|
|
return re->Incref();
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
Regexp *SimplifyWalker::PostVisit(Regexp *re, Regexp *parent_arg, Regexp *pre_arg, Regexp **child_args, int nchild_args) {
|
|
switch (re->op()) {
|
|
case kRegexpNoMatch:
|
|
case kRegexpEmptyMatch:
|
|
case kRegexpLiteral:
|
|
case kRegexpLiteralString:
|
|
case kRegexpBeginLine:
|
|
case kRegexpEndLine:
|
|
case kRegexpBeginText:
|
|
case kRegexpWordBoundary:
|
|
case kRegexpNoWordBoundary:
|
|
case kRegexpEndText:
|
|
case kRegexpAnyChar:
|
|
case kRegexpAnyByte:
|
|
case kRegexpHaveMatch:
|
|
// All these are always simple.
|
|
re->simple_ = true;
|
|
return re->Incref();
|
|
|
|
case kRegexpConcat:
|
|
case kRegexpAlternate: {
|
|
// These are simple as long as the subpieces are simple.
|
|
if (!ChildArgsChanged(re, child_args)) {
|
|
re->simple_ = true;
|
|
return re->Incref();
|
|
}
|
|
Regexp *nre = new Regexp(re->op(), re->parse_flags());
|
|
nre->AllocSub(re->nsub());
|
|
Regexp **nre_subs = nre->sub();
|
|
for (int i = 0; i < re->nsub(); i++)
|
|
nre_subs[i] = child_args[i];
|
|
nre->simple_ = true;
|
|
return nre;
|
|
}
|
|
|
|
case kRegexpCapture: {
|
|
Regexp *newsub = child_args[0];
|
|
if (newsub == re->sub()[0]) {
|
|
newsub->Decref();
|
|
re->simple_ = true;
|
|
return re->Incref();
|
|
}
|
|
Regexp *nre = new Regexp(kRegexpCapture, re->parse_flags());
|
|
nre->AllocSub(1);
|
|
nre->sub()[0] = newsub;
|
|
nre->arguments.capture.cap_ = re->cap();
|
|
nre->simple_ = true;
|
|
return nre;
|
|
}
|
|
|
|
case kRegexpStar:
|
|
case kRegexpPlus:
|
|
case kRegexpQuest: {
|
|
Regexp *newsub = child_args[0];
|
|
// Special case: repeat the empty string as much as
|
|
// you want, but it's still the empty string.
|
|
if (newsub->op() == kRegexpEmptyMatch)
|
|
return newsub;
|
|
|
|
// These are simple as long as the subpiece is simple.
|
|
if (newsub == re->sub()[0]) {
|
|
newsub->Decref();
|
|
re->simple_ = true;
|
|
return re->Incref();
|
|
}
|
|
|
|
// These are also idempotent if flags are constant.
|
|
if (re->op() == newsub->op() && re->parse_flags() == newsub->parse_flags())
|
|
return newsub;
|
|
|
|
Regexp *nre = new Regexp(re->op(), re->parse_flags());
|
|
nre->AllocSub(1);
|
|
nre->sub()[0] = newsub;
|
|
nre->simple_ = true;
|
|
return nre;
|
|
}
|
|
|
|
case kRegexpRepeat: {
|
|
Regexp *newsub = child_args[0];
|
|
// Special case: repeat the empty string as much as
|
|
// you want, but it's still the empty string.
|
|
if (newsub->op() == kRegexpEmptyMatch)
|
|
return newsub;
|
|
|
|
Regexp *nre = SimplifyRepeat(newsub, re->arguments.repeat.min_, re->arguments.repeat.max_, re->parse_flags());
|
|
newsub->Decref();
|
|
nre->simple_ = true;
|
|
return nre;
|
|
}
|
|
|
|
case kRegexpCharClass: {
|
|
Regexp *nre = SimplifyCharClass(re);
|
|
nre->simple_ = true;
|
|
return nre;
|
|
}
|
|
}
|
|
|
|
LOG(ERROR) << "Simplify case not handled: " << re->op();
|
|
return re->Incref();
|
|
}
|
|
|
|
// Creates a concatenation of two Regexp, consuming refs to re1 and re2.
|
|
// Returns a new Regexp, handing the ref to the caller.
|
|
Regexp *SimplifyWalker::Concat2(Regexp *re1, Regexp *re2, Regexp::ParseFlags parse_flags) {
|
|
Regexp *re = new Regexp(kRegexpConcat, parse_flags);
|
|
re->AllocSub(2);
|
|
Regexp **subs = re->sub();
|
|
subs[0] = re1;
|
|
subs[1] = re2;
|
|
return re;
|
|
}
|
|
|
|
// Simplifies the expression re{min,max} in terms of *, +, and ?.
|
|
// Returns a new regexp. Does not edit re. Does not consume reference to re.
|
|
// Caller must Decref return value when done with it.
|
|
// The result will *not* necessarily have the right capturing parens
|
|
// if you call ToString() and re-parse it: (x){2} becomes (x)(x),
|
|
// but in the Regexp* representation, both (x) are marked as $1.
|
|
Regexp *SimplifyWalker::SimplifyRepeat(Regexp *re, int min, int max, Regexp::ParseFlags f) {
|
|
// x{n,} means at least n matches of x.
|
|
if (max == -1) {
|
|
// Special case: x{0,} is x*
|
|
if (min == 0)
|
|
return Regexp::Star(re->Incref(), f);
|
|
|
|
// Special case: x{1,} is x+
|
|
if (min == 1)
|
|
return Regexp::Plus(re->Incref(), f);
|
|
|
|
// General case: x{4,} is xxxx+
|
|
PODArray<Regexp *> nre_subs(min);
|
|
for (int i = 0; i < min - 1; i++)
|
|
nre_subs[i] = re->Incref();
|
|
nre_subs[min - 1] = Regexp::Plus(re->Incref(), f);
|
|
return Regexp::Concat(nre_subs.data(), min, f);
|
|
}
|
|
|
|
// Special case: (x){0} matches only empty string.
|
|
if (min == 0 && max == 0)
|
|
return new Regexp(kRegexpEmptyMatch, f);
|
|
|
|
// Special case: x{1} is just x.
|
|
if (min == 1 && max == 1)
|
|
return re->Incref();
|
|
|
|
// General case: x{n,m} means n copies of x and m copies of x?.
|
|
// The machine will do less work if we nest the final m copies,
|
|
// so that x{2,5} = xx(x(x(x)?)?)?
|
|
|
|
// Build leading prefix: xx. Capturing only on the last one.
|
|
Regexp *nre = NULL;
|
|
if (min > 0) {
|
|
PODArray<Regexp *> nre_subs(min);
|
|
for (int i = 0; i < min; i++)
|
|
nre_subs[i] = re->Incref();
|
|
nre = Regexp::Concat(nre_subs.data(), min, f);
|
|
}
|
|
|
|
// Build and attach suffix: (x(x(x)?)?)?
|
|
if (max > min) {
|
|
Regexp *suf = Regexp::Quest(re->Incref(), f);
|
|
for (int i = min + 1; i < max; i++)
|
|
suf = Regexp::Quest(Concat2(re->Incref(), suf, f), f);
|
|
if (nre == NULL)
|
|
nre = suf;
|
|
else
|
|
nre = Concat2(nre, suf, f);
|
|
}
|
|
|
|
if (nre == NULL) {
|
|
// Some degenerate case, like min > max, or min < max < 0.
|
|
// This shouldn't happen, because the parser rejects such regexps.
|
|
LOG(DFATAL) << "Malformed repeat " << re->ToString() << " " << min << " " << max;
|
|
return new Regexp(kRegexpNoMatch, f);
|
|
}
|
|
|
|
return nre;
|
|
}
|
|
|
|
// Simplifies a character class.
|
|
// Caller must Decref return value when done with it.
|
|
Regexp *SimplifyWalker::SimplifyCharClass(Regexp *re) {
|
|
CharClass *cc = re->cc();
|
|
|
|
// Special cases
|
|
if (cc->empty())
|
|
return new Regexp(kRegexpNoMatch, re->parse_flags());
|
|
if (cc->full())
|
|
return new Regexp(kRegexpAnyChar, re->parse_flags());
|
|
|
|
return re->Incref();
|
|
}
|
|
|
|
} // namespace re2
|