RAGFlow go API server (#13240)

# RAGFlow Go Implementation Plan 🚀

This repository tracks the progress of porting RAGFlow to Go. We'll implement core features and provide performance comparisons between the Python and Go versions.

## Implementation Checklist

- [x] User Management APIs
- [x] Dataset Management Operations
- [x] Retrieval Test
- [x] Chat Management Operations
- [x] Infinity Go SDK

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Yingfeng Zhang <yingfeng.zhang@gmail.com>
2026-05-02 00:07:47 +08:00 · 2026-03-04 19:17:16 +08:00
parent 2508c46c8f
commit 70e9743ef1
257 changed files with 80490 additions and 6 deletions
--- a/internal/cpp/re2/regexp.cc
+++ b/internal/cpp/re2/regexp.cc
@@ -0,0 +1,957 @@
+// Copyright 2006 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Regular expression representation.
+// Tested by parse_test.cc
+
+#include "re2/regexp.h"
+
+#include <algorithm>
+#include <map>
+#include <mutex>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+#include <string>
+#include <vector>
+
+#include "re2/pod_array.h"
+#include "re2/stringpiece.h"
+#include "re2/walker-inl.h"
+#include "util/logging.h"
+#include "util/mutex.h"
+#include "util/utf.h"
+#include "util/util.h"
+
+#ifdef min
+#undef min
+#endif
+#ifdef max
+#undef max
+#endif
+
+namespace re2 {
+
+// Constructor.  Records the operator and parse flags and starts the node
+// with a reference count of one, no subexpressions and zeroed
+// operator-specific arguments.
+Regexp::Regexp(RegexpOp op, ParseFlags parse_flags)
+    : op_(static_cast<uint8_t>(op)),
+      simple_(false),
+      parse_flags_(static_cast<uint16_t>(parse_flags)),
+      ref_(1),
+      nsub_(0),
+      down_(NULL) {
+    // Clear the operator-specific argument storage and the single-sub slot.
+    memset(arguments.the_union_, 0, sizeof arguments.the_union_);
+    subone_ = NULL;
+}
+
+// Destructor.  Expects the children to have been released already.
+// Private: destroy Regexps through Decref(), never with delete.
+// Calling Decref on the sub-Regexps from here could recurse
+// arbitrarily deep, so they must have been dealt with before
+// this runs.
+Regexp::~Regexp() {
+    if (nsub_ > 0) {
+        LOG(DFATAL) << "Regexp not destroyed.";
+    }
+
+    // Release the operator-specific payload, if this operator owns one.
+    if (op_ == kRegexpCapture) {
+        delete arguments.capture.name_;
+    } else if (op_ == kRegexpLiteralString) {
+        delete[] arguments.literal_string.runes_;
+    } else if (op_ == kRegexpCharClass) {
+        if (arguments.char_class.cc_)
+            arguments.char_class.cc_->Delete();
+        delete arguments.char_class.ccb_;
+    }
+}
+
+// Destroys this regexp on the spot when that needs no recursion, i.e.
+// when it has no subexpressions.  Returns true if the object was deleted,
+// false if it was left untouched.
+bool Regexp::QuickDestroy() {
+    if (nsub_ != 0)
+        return false;
+
+    delete this;
+    return true;
+}
+
+// Similar to EmptyStorage in re2.cc.
+// Holds the overflow reference counts: once a Regexp's ref_ field reaches
+// kMaxRef, its count is kept in ref_map, and every access to that map in
+// this file is made while holding ref_mutex.
+struct RefStorage {
+    Mutex ref_mutex;                 // held around accesses to ref_map
+    std::map<Regexp *, int> ref_map; // Regexp -> overflowed reference count
+};
+// Raw, suitably aligned bytes for a RefStorage.  The object itself is
+// constructed in place (placement new) the first time Incref() needs it.
+alignas(RefStorage) static char ref_storage[sizeof(RefStorage)];
+
+// Returns the mutex stored inside the ref_storage bytes.
+static inline Mutex *ref_mutex() {
+    RefStorage *storage = reinterpret_cast<RefStorage *>(ref_storage);
+    return &storage->ref_mutex;
+}
+
+// Returns the overflow reference-count map stored inside the ref_storage bytes.
+static inline std::map<Regexp *, int> *ref_map() {
+    RefStorage *storage = reinterpret_cast<RefStorage *>(ref_storage);
+    return &storage->ref_map;
+}
+
+// Returns the current reference count.  Counts below kMaxRef live in ref_;
+// otherwise the count is read from the overflow map under its mutex.
+int Regexp::Ref() {
+    if (ref_ >= kMaxRef) {
+        MutexLock l(ref_mutex());
+        return (*ref_map())[this];
+    }
+    return ref_;
+}
+
+// Increments reference count, returns object as convenience.
+// Counts that no longer fit in ref_ are tracked in the overflow map.
+Regexp *Regexp::Incref() {
+    // Common case: the in-object counter still has room.
+    if (ref_ < kMaxRef - 1) {
+        ref_++;
+        return this;
+    }
+
+    // Make sure the overflow storage has been constructed exactly once.
+    static std::once_flag ref_once;
+    std::call_once(ref_once, []() { (void)new (ref_storage) RefStorage; });
+
+    // Store ref count in overflow map.
+    MutexLock l(ref_mutex());
+    if (ref_ != kMaxRef) {
+        // overflowing now
+        (*ref_map())[this] = kMaxRef;
+        ref_ = kMaxRef;
+    } else {
+        // already overflowed
+        (*ref_map())[this]++;
+    }
+    return this;
+}
+
+// Decrements reference count and deletes this object if count reaches 0.
+void Regexp::Decref() {
+    if (ref_ != kMaxRef) {
+        // Count lives in the object itself.
+        ref_--;
+        if (ref_ == 0)
+            Destroy();
+        return;
+    }
+
+    // Ref count is stored in overflow map.
+    MutexLock l(ref_mutex());
+    std::map<Regexp *, int> *overflow = ref_map();
+    const int remaining = (*overflow)[this] - 1;
+    if (remaining >= kMaxRef) {
+        (*overflow)[this] = remaining;
+        return;
+    }
+
+    // The count fits in ref_ again: move it back and drop the map entry.
+    ref_ = static_cast<uint16_t>(remaining);
+    overflow->erase(this);
+}
+
+// Deletes this object; ref count has reached 0.
+// Subexpressions lose one reference each; those that drop to zero are
+// destroyed too, without recursing on the process stack.
+void Regexp::Destroy() {
+    // Nodes without subexpressions are deleted directly.
+    if (QuickDestroy())
+        return;
+
+    // Handle recursive Destroy with explicit stack
+    // to avoid arbitrarily deep recursion on process stack [sigh].
+    // The stack is an intrusive linked list threaded through down_.
+    down_ = NULL;
+    Regexp *stack = this;
+    while (stack != NULL) {
+        // Pop the next node to destroy.
+        Regexp *re = stack;
+        stack = re->down_;
+        if (re->ref_ != 0)
+            LOG(DFATAL) << "Bad reference count " << re->ref_;
+        if (re->nsub_ > 0) {
+            Regexp **subs = re->sub();
+            for (int i = 0; i < re->nsub_; i++) {
+                Regexp *sub = subs[i];
+                if (sub == NULL)
+                    continue;
+                // Overflowed counts must go through Decref() (overflow map);
+                // ordinary counts are decremented in place so that a count
+                // reaching zero is handled here rather than recursively.
+                if (sub->ref_ == kMaxRef)
+                    sub->Decref();
+                else
+                    --sub->ref_;
+                // Children that cannot be deleted on the spot are pushed
+                // onto the explicit stack for a later iteration.
+                if (sub->ref_ == 0 && !sub->QuickDestroy()) {
+                    sub->down_ = stack;
+                    stack = sub;
+                }
+            }
+            // Only nodes with more than one sub free the sub array here.
+            if (re->nsub_ > 1)
+                delete[] subs;
+            // Mark the children as handled so the destructor does not complain.
+            re->nsub_ = 0;
+        }
+        delete re;
+    }
+}
+
+// Appends rune r to this literal string, growing the rune buffer as needed.
+// The buffer starts with room for 8 runes and doubles whenever the current
+// length is a power of two that is at least 8.
+void Regexp::AddRuneToString(Rune r) {
+    DCHECK(op_ == kRegexpLiteralString);
+    auto &count = arguments.literal_string.nrunes_;
+    Rune *&buf = arguments.literal_string.runes_;
+
+    if (count == 0) {
+        // start with 8
+        buf = new Rune[8];
+    } else if (count >= 8 && (count & (count - 1)) == 0) {
+        // double on powers of two
+        Rune *grown = new Rune[count * 2];
+        std::copy(buf, buf + count, grown);
+        delete[] buf;
+        buf = grown;
+    }
+
+    buf[count++] = r;
+}
+
+// Creates a kRegexpHaveMatch node carrying the given match id.
+Regexp *Regexp::HaveMatch(int match_id, ParseFlags flags) {
+    Regexp *node = new Regexp(kRegexpHaveMatch, flags);
+    node->arguments.match_id_ = match_id;
+    return node;
+}
+
+// Builds sub*, sub+ or sub? (selected by op), collapsing directly nested
+// repetition operators that carry the same parse flags.
+Regexp *Regexp::StarPlusOrQuest(RegexpOp op, Regexp *sub, ParseFlags flags) {
+    const RegexpOp subop = sub->op();
+    const bool same_flags = flags == sub->parse_flags();
+
+    // Squash **, ++ and ??.
+    if (same_flags && op == subop)
+        return sub;
+
+    // Squash *+, *?, +*, +?, ?* and ?+. They all squash to *, so because
+    // op is Star/Plus/Quest, we just have to check that sub's op is too.
+    const bool sub_is_repetition = subop == kRegexpStar || subop == kRegexpPlus || subop == kRegexpQuest;
+    if (same_flags && sub_is_repetition) {
+        // If sub is Star, no need to rewrite it.
+        if (subop == kRegexpStar)
+            return sub;
+
+        // Rewrite sub to Star.
+        Regexp *star = new Regexp(kRegexpStar, flags);
+        star->AllocSub(1);
+        star->sub()[0] = sub->sub()[0]->Incref();
+        sub->Decref(); // We didn't consume the reference after all.
+        return star;
+    }
+
+    // No squashing applies: wrap sub in a fresh node.
+    Regexp *wrapped = new Regexp(op, flags);
+    wrapped->AllocSub(1);
+    wrapped->sub()[0] = sub;
+    return wrapped;
+}
+
+// Builds sub+ via StarPlusOrQuest.
+Regexp *Regexp::Plus(Regexp *sub, ParseFlags flags) {
+    return StarPlusOrQuest(kRegexpPlus, sub, flags);
+}
+
+// Builds sub* via StarPlusOrQuest.
+Regexp *Regexp::Star(Regexp *sub, ParseFlags flags) {
+    return StarPlusOrQuest(kRegexpStar, sub, flags);
+}
+
+// Builds sub? via StarPlusOrQuest.
+Regexp *Regexp::Quest(Regexp *sub, ParseFlags flags) {
+    return StarPlusOrQuest(kRegexpQuest, sub, flags);
+}
+
+// Builds a concatenation or alternation (selected by op) of the nsub
+// regexps in sub.  With can_factor set, alternations are first run through
+// FactorAlternation on a private copy of the array.
+Regexp *Regexp::ConcatOrAlternate(RegexpOp op, Regexp **sub, int nsub, ParseFlags flags, bool can_factor) {
+    // A single operand stands for itself.
+    if (nsub == 1)
+        return sub[0];
+
+    // No operands: an empty alternation matches nothing,
+    // an empty concatenation matches the empty string.
+    if (nsub == 0) {
+        const RegexpOp empty_op = (op == kRegexpAlternate) ? kRegexpNoMatch : kRegexpEmptyMatch;
+        return new Regexp(empty_op, flags);
+    }
+
+    PODArray<Regexp *> subcopy;
+    if (can_factor && op == kRegexpAlternate) {
+        // Going to edit sub; make a copy so we don't step on caller.
+        subcopy = PODArray<Regexp *>(nsub);
+        memmove(subcopy.data(), sub, nsub * sizeof sub[0]);
+        sub = subcopy.data();
+        nsub = FactorAlternation(sub, nsub, flags);
+        if (nsub == 1)
+            return sub[0];
+    }
+
+    Regexp *re = new Regexp(op, flags);
+
+    if (nsub <= kMaxNsub) {
+        // Everything fits in one node: copy the operands straight in.
+        re->AllocSub(nsub);
+        Regexp **dst = re->sub();
+        for (int k = 0; k < nsub; k++)
+            dst[k] = sub[k];
+        return re;
+    }
+
+    // Too many subexpressions to fit in a single Regexp.
+    // Make a two-level tree.  Two levels gets us to 65535^2.
+    const int nbigsub = (nsub + kMaxNsub - 1) / kMaxNsub;
+    const int last = nbigsub - 1;
+    re->AllocSub(nbigsub);
+    Regexp **groups = re->sub();
+    // All groups but the last hold exactly kMaxNsub operands ...
+    for (int g = 0; g < last; g++)
+        groups[g] = ConcatOrAlternate(op, sub + g * kMaxNsub, kMaxNsub, flags, false);
+    // ... and the last group takes whatever remains.
+    groups[last] = ConcatOrAlternate(op, sub + last * kMaxNsub, nsub - last * kMaxNsub, flags, false);
+    return re;
+}
+
+// Concatenation of the given regexps (no factoring).
+Regexp *Regexp::Concat(Regexp **sub, int nsub, ParseFlags flags) {
+    return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, false);
+}
+
+// Alternation of the given regexps, with factoring enabled.
+Regexp *Regexp::Alternate(Regexp **sub, int nsub, ParseFlags flags) {
+    return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, true);
+}
+
+// Alternation of the given regexps, with factoring disabled.
+Regexp *Regexp::AlternateNoFactor(Regexp **sub, int nsub, ParseFlags flags) {
+    return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, false);
+}
+
+// Wraps sub in a capture node with capture index cap.
+Regexp *Regexp::Capture(Regexp *sub, ParseFlags flags, int cap) {
+    Regexp *group = new Regexp(kRegexpCapture, flags);
+    group->arguments.capture.cap_ = cap;
+    group->AllocSub(1);
+    group->sub()[0] = sub;
+    return group;
+}
+
+// Wraps sub in a repeat node with the given min and max counts.
+Regexp *Regexp::Repeat(Regexp *sub, ParseFlags flags, int min, int max) {
+    Regexp *rep = new Regexp(kRegexpRepeat, flags);
+    rep->arguments.repeat.min_ = min;
+    rep->arguments.repeat.max_ = max;
+    rep->AllocSub(1);
+    rep->sub()[0] = sub;
+    return rep;
+}
+
+// Creates a literal node for a single rune.
+Regexp *Regexp::NewLiteral(Rune rune, ParseFlags flags) {
+    Regexp *lit = new Regexp(kRegexpLiteral, flags);
+    lit->arguments.rune_ = rune;
+    return lit;
+}
+
+// Creates a node matching the given rune sequence: an empty match for no
+// runes, a single literal for one rune, a literal string otherwise.
+Regexp *Regexp::LiteralString(Rune *runes, int nrunes, ParseFlags flags) {
+    if (nrunes <= 0)
+        return new Regexp(kRegexpEmptyMatch, flags);
+    if (nrunes == 1)
+        return NewLiteral(runes[0], flags);
+
+    Regexp *str = new Regexp(kRegexpLiteralString, flags);
+    for (Rune *p = runes; p != runes + nrunes; ++p)
+        str->AddRuneToString(*p);
+    return str;
+}
+
+// Creates a character class node that stores the given CharClass pointer.
+Regexp *Regexp::NewCharClass(CharClass *cc, ParseFlags flags) {
+    Regexp *node = new Regexp(kRegexpCharClass, flags);
+    node->arguments.char_class.cc_ = cc;
+    return node;
+}
+
+// Exchanges the raw bytes of *this and *that through a temporary buffer.
+void Regexp::Swap(Regexp *that) {
+    // Regexp is not trivially copyable, so we cannot freely copy it with
+    // memmove(3), but swapping objects like so is safe for our purposes.
+    const size_t nbytes = sizeof *this;
+    char scratch[sizeof *this];
+    void *lhs = reinterpret_cast<void *>(this);
+    void *rhs = reinterpret_cast<void *>(that);
+    memmove(scratch, lhs, nbytes);
+    memmove(lhs, rhs, nbytes);
+    memmove(rhs, scratch, nbytes);
+}
+
+// Tests equality of all top-level structure but not subregexps.
+// Returns true when a and b have the same operator and the same
+// operator-specific attributes (rune(s), repeat bounds, capture index and
+// name, character class contents, relevant parse flags).  Children are not
+// inspected; Regexp::Equal does that.  Both pointers must be non-NULL.
+static bool TopEqual(Regexp *a, Regexp *b) {
+    if (a->op() != b->op())
+        return false;
+
+    switch (a->op()) {
+        case kRegexpNoMatch:
+        case kRegexpEmptyMatch:
+        case kRegexpAnyChar:
+        case kRegexpAnyByte:
+        case kRegexpBeginLine:
+        case kRegexpEndLine:
+        case kRegexpWordBoundary:
+        case kRegexpNoWordBoundary:
+        case kRegexpBeginText:
+            return true;
+
+        case kRegexpEndText:
+            // The parse flags remember whether it's \z or (?-m:$),
+            // which matters when testing against PCRE.
+            return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == 0;
+
+        case kRegexpLiteral:
+            return a->rune() == b->rune() && ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0;
+
+        case kRegexpLiteralString:
+            return a->nrunes() == b->nrunes() && ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0 &&
+                   memcmp(a->runes(), b->runes(), a->nrunes() * sizeof a->runes()[0]) == 0;
+
+        case kRegexpAlternate:
+        case kRegexpConcat:
+            return a->nsub() == b->nsub();
+
+        case kRegexpStar:
+        case kRegexpPlus:
+        case kRegexpQuest:
+            return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0;
+
+        case kRegexpRepeat:
+            return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0 && a->min() == b->min() && a->max() == b->max();
+
+        case kRegexpCapture:
+            if (a->cap() != b->cap())
+                return false;
+            if (a->name() == NULL || b->name() == NULL) {
+                // One pointer is null, so the groups only match if the
+                // other one is null (unnamed) as well.
+                return a->name() == b->name();
+            }
+            // Both groups are named: compare the names themselves, not the
+            // addresses of the separately allocated strings.
+            return *a->name() == *b->name();
+
+        case kRegexpHaveMatch:
+            return a->match_id() == b->match_id();
+
+        case kRegexpCharClass: {
+            CharClass *acc = a->cc();
+            CharClass *bcc = b->cc();
+            return acc->size() == bcc->size() && acc->end() - acc->begin() == bcc->end() - bcc->begin() &&
+                   memcmp(acc->begin(), bcc->begin(), (acc->end() - acc->begin()) * sizeof acc->begin()[0]) == 0;
+        }
+    }
+
+    // Every known operator returns above; reaching here means a bad op value.
+    LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op();
+    return false;
+}
+
+// Returns whether the regexp trees rooted at a and b are structurally equal.
+// Two NULL pointers compare equal; a NULL and a non-NULL pointer do not.
+// The comparison is iterative (explicit stack) rather than recursive.
+bool Regexp::Equal(Regexp *a, Regexp *b) {
+    if (a == NULL || b == NULL)
+        return a == b;
+
+    if (!TopEqual(a, b))
+        return false;
+
+    // Fast path:
+    // return without allocating vector if there are no subregexps.
+    switch (a->op()) {
+        case kRegexpAlternate:
+        case kRegexpConcat:
+        case kRegexpStar:
+        case kRegexpPlus:
+        case kRegexpQuest:
+        case kRegexpRepeat:
+        case kRegexpCapture:
+            break;
+
+        default:
+            return true;
+    }
+
+    // Committed to doing real work.
+    // The stack (vector) has pairs of regexps waiting to
+    // be compared.  The regexps are only equal if
+    // all the pairs end up being equal.
+    std::vector<Regexp *> stk;
+
+    for (;;) {
+        // Invariant: TopEqual(a, b) == true.
+        Regexp *a2;
+        Regexp *b2;
+        switch (a->op()) {
+            default:
+                // Leaf operator: nothing below it to compare.
+                break;
+            case kRegexpAlternate:
+            case kRegexpConcat:
+                // TopEqual already established that both have the same nsub().
+                // Each child pair is checked at the top level now and queued
+                // so that its own children get compared later.
+                for (int i = 0; i < a->nsub(); i++) {
+                    a2 = a->sub()[i];
+                    b2 = b->sub()[i];
+                    if (!TopEqual(a2, b2))
+                        return false;
+                    stk.push_back(a2);
+                    stk.push_back(b2);
+                }
+                break;
+
+            case kRegexpStar:
+            case kRegexpPlus:
+            case kRegexpQuest:
+            case kRegexpRepeat:
+            case kRegexpCapture:
+                // Single-child operators: descend directly into the child.
+                a2 = a->sub()[0];
+                b2 = b->sub()[0];
+                if (!TopEqual(a2, b2))
+                    return false;
+                // Really:
+                //   stk.push_back(a2);
+                //   stk.push_back(b2);
+                //   break;
+                // but faster to assign directly and loop.
+                a = a2;
+                b = b2;
+                continue;
+        }
+
+        // Done with this pair; pop the next pending pair, if any.
+        size_t n = stk.size();
+        if (n == 0)
+            break;
+
+        DCHECK_GE(n, 2);
+        a = stk[n - 2];
+        b = stk[n - 1];
+        stk.resize(n - 2);
+    }
+
+    return true;
+}
+
+// Human-readable messages, indexed by status code value.
+// Keep in sync with enum RegexpStatusCode in regexp.h:
+// RegexpStatus::CodeText() uses the code directly as an index into this
+// table, so entry order must follow the enum's values.
+static const char *kErrorStrings[] = {
+    "no error",
+    "unexpected error",
+    "invalid escape sequence",
+    "invalid character class",
+    "invalid character class range",
+    "missing ]",
+    "missing )",
+    "unexpected )",
+    "trailing \\",
+    "no argument for repetition operator",
+    "invalid repetition size",
+    "bad repetition operator",
+    "invalid perl operator",
+    "invalid UTF-8",
+    "invalid named capture group",
+};
+
+// Returns the message for the given status code.  Codes outside the
+// kErrorStrings table are reported as kRegexpInternalError.
+std::string RegexpStatus::CodeText(enum RegexpStatusCode code) {
+    const bool in_table = code >= 0 && code < arraysize(kErrorStrings);
+    if (!in_table)
+        code = kRegexpInternalError;
+    return kErrorStrings[code];
+}
+
+// Returns the message for this status: the code text alone, or
+// "<code text>: <error argument>" when an error argument is present.
+std::string RegexpStatus::Text() const {
+    std::string text = CodeText(code_);
+    if (!error_arg_.empty()) {
+        text.append(": ");
+        text.append(error_arg_.data(), error_arg_.size());
+    }
+    return text;
+}
+
+void RegexpStatus::Copy(const RegexpStatus &status) {
+    code_ = status.code_;
+    error_arg_ = status.error_arg_;
+}
+
+// Placeholder value type for the walkers below, whose results are unused.
+using Ignored = int; // Walker<void> doesn't exist
+
+// Walker subclass to count capturing parens in regexp.
+class NumCapturesWalker : public Regexp::Walker<Ignored> {
+public:
+    NumCapturesWalker() : ncapture_(0) {}
+
+    // Number of capture nodes visited so far.
+    int ncapture() { return ncapture_; }
+
+    // Counts the node if it is a capture; the walk value is passed through.
+    Ignored PreVisit(Regexp *re, Ignored ignored, bool *stop) override {
+        const bool is_capture = re->op() == kRegexpCapture;
+        if (is_capture)
+            ++ncapture_;
+        return ignored;
+    }
+
+    Ignored ShortVisit(Regexp *re, Ignored ignored) override {
+        // Should never be called: we use Walk(), not WalkExponential().
+#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+        LOG(DFATAL) << "NumCapturesWalker::ShortVisit called";
+#endif
+        return ignored;
+    }
+
+private:
+    // Not copyable.
+    NumCapturesWalker(const NumCapturesWalker &) = delete;
+    NumCapturesWalker &operator=(const NumCapturesWalker &) = delete;
+
+    int ncapture_;
+};
+
+// Returns the number of capture nodes found by walking this regexp.
+int Regexp::NumCaptures() {
+    NumCapturesWalker counter;
+    counter.Walk(this, 0);
+    return counter.ncapture();
+}
+
+// Walker class to build map of named capture groups and their indices.
+class NamedCapturesWalker : public Regexp::Walker<Ignored> {
+public:
+    NamedCapturesWalker() : map_(NULL) {}
+    ~NamedCapturesWalker() { delete map_; }
+
+    // Hands the map (possibly NULL) over to the caller and forgets it.
+    std::map<std::string, int> *TakeMap() {
+        std::map<std::string, int> *taken = map_;
+        map_ = NULL;
+        return taken;
+    }
+
+    Ignored PreVisit(Regexp *re, Ignored ignored, bool *stop) override {
+        // Only named capture groups are of interest.
+        if (re->op() != kRegexpCapture || re->name() == NULL)
+            return ignored;
+
+        // Allocate map once we find a name.
+        if (map_ == NULL)
+            map_ = new std::map<std::string, int>;
+
+        // Record first occurrence of each name.
+        // (The rule is that if you have the same name
+        // multiple times, only the leftmost one counts.)
+        map_->emplace(*re->name(), re->cap());
+        return ignored;
+    }
+
+    Ignored ShortVisit(Regexp *re, Ignored ignored) override {
+        // Should never be called: we use Walk(), not WalkExponential().
+#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+        LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called";
+#endif
+        return ignored;
+    }
+
+private:
+    // Not copyable.
+    NamedCapturesWalker(const NamedCapturesWalker &) = delete;
+    NamedCapturesWalker &operator=(const NamedCapturesWalker &) = delete;
+
+    std::map<std::string, int> *map_;
+};
+
+// Returns a newly allocated map from capture name to index, or NULL when
+// the regexp has no named capture groups.
+std::map<std::string, int> *Regexp::NamedCaptures() {
+    NamedCapturesWalker collector;
+    collector.Walk(this, 0);
+    return collector.TakeMap();
+}
+
+// Walker class to build map from capture group indices to their names.
+class CaptureNamesWalker : public Regexp::Walker<Ignored> {
+public:
+    CaptureNamesWalker() : map_(NULL) {}
+    ~CaptureNamesWalker() { delete map_; }
+
+    // Hands the map (possibly NULL) over to the caller and forgets it.
+    std::map<int, std::string> *TakeMap() {
+        std::map<int, std::string> *taken = map_;
+        map_ = NULL;
+        return taken;
+    }
+
+    Ignored PreVisit(Regexp *re, Ignored ignored, bool *stop) override {
+        // Only named capture groups are of interest.
+        if (re->op() != kRegexpCapture || re->name() == NULL)
+            return ignored;
+
+        // Allocate map once we find a name.
+        if (map_ == NULL)
+            map_ = new std::map<int, std::string>;
+
+        std::string &slot = (*map_)[re->cap()];
+        slot = *re->name();
+        return ignored;
+    }
+
+    Ignored ShortVisit(Regexp *re, Ignored ignored) override {
+        // Should never be called: we use Walk(), not WalkExponential().
+#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+        LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called";
+#endif
+        return ignored;
+    }
+
+private:
+    // Not copyable.
+    CaptureNamesWalker(const CaptureNamesWalker &) = delete;
+    CaptureNamesWalker &operator=(const CaptureNamesWalker &) = delete;
+
+    std::map<int, std::string> *map_;
+};
+
+// Returns a newly allocated map from capture index to name, or NULL when
+// the regexp has no named capture groups.
+std::map<int, std::string> *Regexp::CaptureNames() {
+    CaptureNamesWalker collector;
+    collector.Walk(this, 0);
+    return collector.TakeMap();
+}
+
+// Writes the byte encoding of runes[0..nrunes) into *bytes, replacing its
+// previous contents.  With latin1 set each rune becomes one char;
+// otherwise each rune is encoded with runetochar().
+void ConvertRunesToBytes(bool latin1, Rune *runes, int nrunes, std::string *bytes) {
+    if (latin1) {
+        bytes->resize(nrunes);
+        for (int k = 0; k < nrunes; k++)
+            (*bytes)[k] = static_cast<char>(runes[k]);
+        return;
+    }
+
+    bytes->resize(nrunes * UTFmax); // worst case
+    char *const start = &(*bytes)[0];
+    char *out = start;
+    for (int k = 0; k < nrunes; k++)
+        out += runetochar(out, &runes[k]);
+    // Trim to the bytes actually written.
+    bytes->resize(out - start);
+    bytes->shrink_to_fit();
+}
+
+// Determines whether regexp matches must be anchored
+// with a fixed string prefix.  If so, returns the prefix and
+// the regexp that remains after the prefix.  The prefix might
+// be ASCII case-insensitive.
+// On failure returns false with *prefix empty, *foldcase false and
+// *suffix NULL.
+bool Regexp::RequiredPrefix(std::string *prefix, bool *foldcase, Regexp **suffix) {
+    // Start from the "no prefix" answer so every early return is consistent.
+    prefix->clear();
+    *foldcase = false;
+    *suffix = NULL;
+
+    // No need for a walker: the regexp must be of the form
+    // 1. some number of ^ anchors
+    // 2. a literal char or string
+    // 3. the rest
+    if (op_ != kRegexpConcat)
+        return false;
+
+    // Skip the leading ^ anchors; there must be at least one of them
+    // and something must follow.
+    int pos = 0;
+    for (; pos < nsub_ && sub()[pos]->op_ == kRegexpBeginText; pos++) {
+    }
+    if (pos == 0 || pos >= nsub_)
+        return false;
+
+    Regexp *lit = sub()[pos];
+    const bool single_rune = lit->op_ == kRegexpLiteral;
+    if (!single_rune && lit->op_ != kRegexpLiteralString)
+        return false;
+
+    // Build the suffix from whatever follows the literal.
+    const int rest = pos + 1;
+    if (rest >= nsub_) {
+        *suffix = new Regexp(kRegexpEmptyMatch, parse_flags());
+    } else {
+        // Each remaining sub gets an extra reference before being
+        // handed to Concat.
+        for (int j = rest; j < nsub_; j++)
+            sub()[j]->Incref();
+        *suffix = Concat(sub() + rest, nsub_ - rest, parse_flags());
+    }
+
+    Rune *runes = single_rune ? &lit->arguments.rune_ : lit->arguments.literal_string.runes_;
+    int nrunes = single_rune ? 1 : lit->arguments.literal_string.nrunes_;
+    const bool latin1 = (lit->parse_flags() & Latin1) != 0;
+    ConvertRunesToBytes(latin1, runes, nrunes, prefix);
+    *foldcase = (lit->parse_flags() & FoldCase) != 0;
+    return true;
+}
+
+// Determines whether regexp matches must be unanchored
+// with a fixed string prefix.  If so, returns the prefix.
+// The prefix might be ASCII case-insensitive.
+// On failure returns false with *prefix empty and *foldcase false.
+bool Regexp::RequiredPrefixForAccel(std::string *prefix, bool *foldcase) {
+    prefix->clear();
+    *foldcase = false;
+
+    // No need for a walker: the regexp must either begin with or be
+    // a literal char or string. We "see through" capturing groups,
+    // but make no effort to glue multiple prefix fragments together.
+    Regexp *lit = this;
+    if (op_ == kRegexpConcat && nsub_ > 0)
+        lit = sub()[0];
+    while (lit->op_ == kRegexpCapture) {
+        lit = lit->sub()[0];
+        if (lit->op_ == kRegexpConcat && lit->nsub_ > 0)
+            lit = lit->sub()[0];
+    }
+
+    const bool single_rune = lit->op_ == kRegexpLiteral;
+    if (!single_rune && lit->op_ != kRegexpLiteralString)
+        return false;
+
+    Rune *runes = single_rune ? &lit->arguments.rune_ : lit->arguments.literal_string.runes_;
+    int nrunes = single_rune ? 1 : lit->arguments.literal_string.nrunes_;
+    const bool latin1 = (lit->parse_flags() & Latin1) != 0;
+    ConvertRunesToBytes(latin1, runes, nrunes, prefix);
+    *foldcase = (lit->parse_flags() & FoldCase) != 0;
+    return true;
+}
+
+// Character class builder is a balanced binary tree (STL set)
+// containing non-overlapping, non-abutting RuneRanges.
+// The less-than operator used in the tree treats two
+// ranges as equal if they overlap at all, so that
+// lookups for a particular Rune are possible.
+
+// Starts out as the empty class: no runes, both ASCII letter bitmaps clear.
+CharClassBuilder::CharClassBuilder() {
+    upper_ = 0;
+    lower_ = 0;
+    nrunes_ = 0;
+}
+
+// Add lo-hi to the class; return whether class got bigger.
+// An inverted range (hi < lo) is ignored.  Existing ranges that overlap or
+// abut [lo, hi] are merged into a single range, and nrunes_ plus the
+// upper_/lower_ ASCII letter bitmaps are kept up to date.
+bool CharClassBuilder::AddRange(Rune lo, Rune hi) {
+    if (hi < lo)
+        return false;
+
+    if (lo <= 'z' && hi >= 'A') {
+        // Overlaps some alpha, maybe not all.
+        // Update bitmaps telling which ASCII letters are in the set.
+        // Bit i of upper_ stands for 'A' + i, bit i of lower_ for 'a' + i.
+        Rune lo1 = std::max<Rune>(lo, 'A');
+        Rune hi1 = std::min<Rune>(hi, 'Z');
+        if (lo1 <= hi1)
+            upper_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'A');
+
+        lo1 = std::max<Rune>(lo, 'a');
+        hi1 = std::min<Rune>(hi, 'z');
+        if (lo1 <= hi1)
+            lower_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'a');
+    }
+
+    { // Check whether lo, hi is already in the class.
+        // find() returns the stored range overlapping the probe, if any.
+        iterator it = ranges_.find(RuneRange(lo, lo));
+        if (it != end() && it->lo <= lo && hi <= it->hi)
+            return false;
+    }
+
+    // Look for a range abutting lo on the left.
+    // If it exists, take it out and increase our range.
+    if (lo > 0) {
+        iterator it = ranges_.find(RuneRange(lo - 1, lo - 1));
+        if (it != end()) {
+            lo = it->lo;
+            if (it->hi > hi)
+                hi = it->hi;
+            nrunes_ -= it->hi - it->lo + 1;
+            ranges_.erase(it);
+        }
+    }
+
+    // Look for a range abutting hi on the right.
+    // If it exists, take it out and increase our range.
+    if (hi < Runemax) {
+        iterator it = ranges_.find(RuneRange(hi + 1, hi + 1));
+        if (it != end()) {
+            hi = it->hi;
+            nrunes_ -= it->hi - it->lo + 1;
+            ranges_.erase(it);
+        }
+    }
+
+    // Look for ranges between lo and hi.  Take them out.
+    // This is only safe because the set has no overlapping ranges.
+    // We've already removed any ranges abutting lo and hi, so
+    // any that overlap [lo, hi] must be contained within it.
+    for (;;) {
+        iterator it = ranges_.find(RuneRange(lo, hi));
+        if (it == end())
+            break;
+        nrunes_ -= it->hi - it->lo + 1;
+        ranges_.erase(it);
+    }
+
+    // Finally, add [lo, hi].
+    nrunes_ += hi - lo + 1;
+    ranges_.insert(RuneRange(lo, hi));
+    return true;
+}
+
+// Adds every range of *cc to this class.
+void CharClassBuilder::AddCharClass(CharClassBuilder *cc) {
+    for (const RuneRange &range : *cc)
+        AddRange(range.lo, range.hi);
+}
+
+// Returns whether rune r belongs to the class.
+bool CharClassBuilder::Contains(Rune r) {
+    const RuneRange probe(r, r);
+    return ranges_.find(probe) != end();
+}
+
+// Does the character class behave the same on A-Z as on a-z?
+// True when the upper-case and lower-case letter bitmaps agree on every
+// bit selected by AlphaMask.
+bool CharClassBuilder::FoldsASCII() {
+    return (upper_ & AlphaMask) == (lower_ & AlphaMask);
+}
+
+// Returns a newly allocated builder with the same ranges, letter bitmaps
+// and rune count as this one.
+CharClassBuilder *CharClassBuilder::Copy() {
+    CharClassBuilder *dup = new CharClassBuilder;
+    for (const RuneRange &range : *this)
+        dup->ranges_.insert(RuneRange(range.lo, range.hi));
+    dup->upper_ = upper_;
+    dup->lower_ = lower_;
+    dup->nrunes_ = nrunes_;
+    return dup;
+}
+
+// Removes every rune greater than r from the class, truncating a range
+// that straddles r so that it ends at r.  Nothing to do when r >= Runemax.
+void CharClassBuilder::RemoveAbove(Rune r) {
+    if (r >= Runemax)
+        return;
+
+    // Update the ASCII letter bitmaps first.
+    // NOTE(review): the shifts presumably rely on AlphaMask having exactly
+    // the 26 low bits set, so that shifting drops the letters above r --
+    // confirm against the definition in regexp.h.
+    if (r < 'z') {
+        if (r < 'a')
+            lower_ = 0;
+        else
+            lower_ &= AlphaMask >> ('z' - r);
+    }
+
+    if (r < 'Z') {
+        if (r < 'A')
+            upper_ = 0;
+        else
+            upper_ &= AlphaMask >> ('Z' - r);
+    }
+
+    // Repeatedly pull out a range overlapping [r + 1, Runemax] until none
+    // is left.
+    for (;;) {
+
+        iterator it = ranges_.find(RuneRange(r + 1, Runemax));
+        if (it == end())
+            break;
+        RuneRange rr = *it;
+        ranges_.erase(it);
+        nrunes_ -= rr.hi - rr.lo + 1;
+        // If the range started at or below r, keep the part up to r.
+        if (rr.lo <= r) {
+            rr.hi = r;
+            ranges_.insert(rr);
+            nrunes_ += rr.hi - rr.lo + 1;
+        }
+    }
+}
+
+// Replaces the class with its complement over [0, Runemax], updating the
+// ASCII letter bitmaps and the rune count accordingly.
+void CharClassBuilder::Negate() {
+    // Build up negation and then copy in.
+    // Could edit ranges in place, but C++ won't let me.
+    std::vector<RuneRange> v;
+    v.reserve(ranges_.size() + 1);
+
+    // In negation, first range begins at 0, unless
+    // the current class begins at 0.
+    iterator it = begin();
+    if (it == end()) {
+        // Empty class: the complement is everything.
+        v.push_back(RuneRange(0, Runemax));
+    } else {
+        // nextlo is the first rune not yet covered by a processed range.
+        int nextlo = 0;
+        if (it->lo == 0) {
+            nextlo = it->hi + 1;
+            ++it;
+        }
+        // Each gap between consecutive ranges becomes a range of the result.
+        for (; it != end(); ++it) {
+            v.push_back(RuneRange(nextlo, it->lo - 1));
+            nextlo = it->hi + 1;
+        }
+        // Trailing gap after the last range, if it did not end at Runemax.
+        if (nextlo <= Runemax)
+            v.push_back(RuneRange(nextlo, Runemax));
+    }
+
+    ranges_.clear();
+    for (size_t i = 0; i < v.size(); i++)
+        ranges_.insert(v[i]);
+
+    upper_ = AlphaMask & ~upper_;
+    lower_ = AlphaMask & ~lower_;
+    nrunes_ = Runemax + 1 - nrunes_;
+}
+
+// Character class is a sorted list of ranges.
+// The ranges are allocated in the same block as the header,
+// necessitating a special allocator and Delete method.
+
+// Allocates an empty CharClass with room for maxranges ranges placed
+// directly behind the header in one byte array.
+CharClass *CharClass::New(size_t maxranges) {
+    const size_t header_bytes = sizeof(CharClass);
+    const size_t range_bytes = maxranges * sizeof(RuneRange);
+    uint8_t *block = new uint8_t[header_bytes + range_bytes];
+
+    CharClass *cc = reinterpret_cast<CharClass *>(block);
+    cc->ranges_ = reinterpret_cast<RuneRange *>(block + header_bytes);
+    cc->nranges_ = 0;
+    cc->nrunes_ = 0;
+    cc->folds_ascii_ = false;
+    return cc;
+}
+
+// Frees the byte array that CharClass::New allocated for this object.
+void CharClass::Delete() {
+    delete[] reinterpret_cast<uint8_t *>(this);
+}
+
+// Returns a newly allocated CharClass holding the complement of this one
+// over [0, Runemax].
+CharClass *CharClass::Negate() {
+    CharClass *neg = CharClass::New(static_cast<size_t>(nranges_ + 1));
+    neg->folds_ascii_ = folds_ascii_;
+    neg->nrunes_ = Runemax + 1 - nrunes_;
+
+    int count = 0;
+    int nextlo = 0; // first rune not yet covered by a processed range
+    for (CharClass::iterator it = begin(); it != end(); ++it) {
+        // Emit the gap in front of this range, if there is one.
+        if (it->lo != nextlo)
+            neg->ranges_[count++] = RuneRange(nextlo, it->lo - 1);
+        nextlo = it->hi + 1;
+    }
+    // Trailing gap after the last range.
+    if (nextlo <= Runemax)
+        neg->ranges_[count++] = RuneRange(nextlo, Runemax);
+    neg->nranges_ = count;
+    return neg;
+}
+
+// Returns whether rune r lies in one of the ranges, using binary search
+// over the sorted range array.
+bool CharClass::Contains(Rune r) const {
+    RuneRange *window = ranges_;
+    int remaining = nranges_;
+    while (remaining > 0) {
+        const int mid = remaining / 2;
+        const RuneRange &probe = window[mid];
+        if (probe.hi < r) {
+            // r is above the probe: continue in the upper half.
+            window += mid + 1;
+            remaining -= mid + 1;
+            continue;
+        }
+        if (r < probe.lo) {
+            // r is below the probe: continue in the lower half.
+            remaining = mid;
+            continue;
+        }
+        // probe.lo <= r && r <= probe.hi
+        return true;
+    }
+    return false;
+}
+
+// Returns a newly allocated CharClass holding a copy of this builder's
+// ranges, rune count and ASCII-folding property.
+CharClass *CharClassBuilder::GetCharClass() {
+    CharClass *out = CharClass::New(ranges_.size());
+    int count = 0;
+    for (const RuneRange &range : *this)
+        out->ranges_[count++] = range;
+    out->nranges_ = count;
+    DCHECK_LE(count, static_cast<int>(ranges_.size()));
+    out->folds_ascii_ = FoldsASCII();
+    out->nrunes_ = nrunes_;
+    return out;
+}
+
+} // namespace re2