ragflow/internal/cpp/re2/regexp.h

// Copyright 2006 The RE2 Authors.  All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#ifndef RE2_REGEXP_H_
#define RE2_REGEXP_H_

// --- SPONSORED LINK --------------------------------------------------
// If you want to use this library for regular expression matching,
// you should use re2/re2.h, which provides a class RE2 that
// mimics the PCRE interface provided by PCRE's C++ wrappers.
// This header describes the low-level interface used to implement RE2
// and may change in backwards-incompatible ways from time to time.
// In contrast, RE2's interface will not.
// ---------------------------------------------------------------------

// Regular expression library: parsing, execution, and manipulation
// of regular expressions.
//
// Any operation that traverses the Regexp structures should be written
// using Regexp::Walker (see walker-inl.h), not recursively, because deeply nested
// regular expressions such as x++++++++++++++++++++... might cause recursive
// traversals to overflow the stack.
//
// It is the caller's responsibility to provide appropriate mutual exclusion
// around manipulation of the regexps.  RE2 does this.
//
// PARSING
//
// Regexp::Parse parses regular expressions encoded in UTF-8.
// The default syntax is POSIX extended regular expressions,
// with the following changes:
//
//   1.  Backreferences (optional in POSIX EREs) are not supported.
//         (Supporting them precludes the use of DFA-based
//          matching engines.)
//
//   2.  Collating elements and collation classes are not supported.
//         (No one has needed or wanted them.)
//
// The exact syntax accepted can be modified by passing flags to
// Regexp::Parse.  In particular, many of the basic Perl additions
// are available.  The flags are documented below (search for LikePerl).
//
// If parsed with the flag Regexp::Latin1, both the regular expression
// and the input to the matching routines are assumed to be encoded in
// Latin-1, not UTF-8.
//
// EXECUTION
//
// Once Regexp has parsed a regular expression, it provides methods
// to search text using that regular expression.  These methods are
// implemented via calling out to other regular expression libraries.
// (Let's call them the sublibraries.)
//
// To call a sublibrary, Regexp does not simply prepare a
// string version of the regular expression and hand it to the
// sublibrary.  Instead, Regexp prepares, from its own parsed form, the
// corresponding internal representation used by the sublibrary.
// This has the drawback of needing to know the internal representation
// used by the sublibrary, but it has two important benefits:
//
//   1. The syntax and meaning of regular expressions is guaranteed
//      to be that used by Regexp's parser, not the syntax expected
//      by the sublibrary.  Regexp might accept a restricted or
//      expanded syntax for regular expressions as compared with
//      the sublibrary.  As long as Regexp can translate from its
//      internal form into the sublibrary's, clients need not know
//      exactly which sublibrary they are using.
//
//   2. The sublibrary parsers are bypassed.  For whatever reason,
//      sublibrary regular expression parsers often have security
//      problems.  For example, plan9grep's regular expression parser
//      has a buffer overflow in its handling of large character
//      classes, and PCRE's parser has had buffer overflow problems
//      in the past.  Security-team requires sandboxing of sublibrary
//      regular expression parsers.  Avoiding the sublibrary parsers
//      avoids the sandbox.
//
// The execution methods we use now are provided by the compiled form,
// Prog, described in prog.h
//
// MANIPULATION
//
// Unlike other regular expression libraries, Regexp makes its parsed
// form accessible to clients, so that client code can analyze the
// parsed regular expressions.

#include <map>
#include <set>
#include <stddef.h>
#include <stdint.h>
#include <string>

#include "re2/stringpiece.h"
#include "util/logging.h"
#include "util/utf.h"
#include "util/util.h"

namespace re2 {

// Keep in sync with string list kOpcodeNames[] in testing/dump.cc
enum RegexpOp {
    // Matches no strings.
    kRegexpNoMatch = 1,

    // Matches empty string.
    kRegexpEmptyMatch,

    // Matches rune_.
    kRegexpLiteral,

    // Matches runes_.
    kRegexpLiteralString,

    // Matches concatenation of sub_[0..nsub-1].
    kRegexpConcat,
    // Matches union of sub_[0..nsub-1].
    kRegexpAlternate,

    // Matches sub_[0] zero or more times.
    kRegexpStar,
    // Matches sub_[0] one or more times.
    kRegexpPlus,
    // Matches sub_[0] zero or one times.
    kRegexpQuest,

    // Matches sub_[0] at least min_ times, at most max_ times.
    // max_ == -1 means no upper limit.
    kRegexpRepeat,

    // Parenthesized (capturing) subexpression.  Index is cap_.
    // Optionally, capturing name is name_.
    kRegexpCapture,

    // Matches any character.
    kRegexpAnyChar,

    // Matches any byte [sic].
    kRegexpAnyByte,

    // Matches empty string at beginning of line.
    kRegexpBeginLine,
    // Matches empty string at end of line.
    kRegexpEndLine,

    // Matches word boundary "\b".
    kRegexpWordBoundary,
    // Matches not-a-word boundary "\B".
    kRegexpNoWordBoundary,

    // Matches empty string at beginning of text.
    kRegexpBeginText,
    // Matches empty string at end of text.
    kRegexpEndText,

    // Matches character class given by cc_.
    kRegexpCharClass,

    // Forces match of entire expression right now,
    // with match ID match_id_ (used by RE2::Set).
    kRegexpHaveMatch,

    kMaxRegexpOp = kRegexpHaveMatch,
};

// Keep in sync with string list in regexp.cc
enum RegexpStatusCode {
    // No error
    kRegexpSuccess = 0,

    // Unexpected error
    kRegexpInternalError,

    // Parse errors
    kRegexpBadEscape,         // bad escape sequence
    kRegexpBadCharClass,      // bad character class
    kRegexpBadCharRange,      // bad character class range
    kRegexpMissingBracket,    // missing closing ]
    kRegexpMissingParen,      // missing closing )
    kRegexpUnexpectedParen,   // unexpected closing )
    kRegexpTrailingBackslash, // at end of regexp
    kRegexpRepeatArgument,    // repeat argument missing, e.g. "*"
    kRegexpRepeatSize,        // bad repetition argument
    kRegexpRepeatOp,          // bad repetition operator
    kRegexpBadPerlOp,         // bad perl operator
    kRegexpBadUTF8,           // invalid UTF-8 in regexp
    kRegexpBadNamedCapture,   // bad named capture
};

// Error status for certain operations.
class RegexpStatus {
public:
    RegexpStatus() : code_(kRegexpSuccess), tmp_(NULL) {}
    ~RegexpStatus() { delete tmp_; }

    void set_code(RegexpStatusCode code) { code_ = code; }
    void set_error_arg(const StringPiece &error_arg) { error_arg_ = error_arg; }
    void set_tmp(std::string *tmp) {
        delete tmp_;
        tmp_ = tmp;
    }
    RegexpStatusCode code() const { return code_; }
    const StringPiece &error_arg() const { return error_arg_; }
    bool ok() const { return code() == kRegexpSuccess; }

    // Copies state from status.
    void Copy(const RegexpStatus &status);

    // Returns text equivalent of code, e.g.:
    //   "Bad character class"
    static std::string CodeText(RegexpStatusCode code);

    // Returns text describing error, e.g.:
    //   "Bad character class: [z-a]"
    std::string Text() const;

private:
    RegexpStatusCode code_; // Kind of error
    StringPiece error_arg_; // Piece of regexp containing syntax error.
    std::string *tmp_;      // Temporary storage, possibly where error_arg_ is.

    RegexpStatus(const RegexpStatus &) = delete;
    RegexpStatus &operator=(const RegexpStatus &) = delete;
};

// Compiled form; see prog.h
class Prog;

struct RuneRange {
    RuneRange() : lo(0), hi(0) {}
    RuneRange(int l, int h) : lo(l), hi(h) {}
    Rune lo;
    Rune hi;
};

// Less-than on RuneRanges treats a == b if they overlap at all.
// This lets us look in a set to find the range covering a particular Rune.
struct RuneRangeLess {
    bool operator()(const RuneRange &a, const RuneRange &b) const { return a.hi < b.lo; }
};

class CharClassBuilder;

class CharClass {
public:
    void Delete();

    typedef RuneRange *iterator;
    iterator begin() { return ranges_; }
    iterator end() { return ranges_ + nranges_; }

    int size() { return nrunes_; }
    bool empty() { return nrunes_ == 0; }
    bool full() { return nrunes_ == Runemax + 1; }
    bool FoldsASCII() { return folds_ascii_; }

    bool Contains(Rune r) const;
    CharClass *Negate();

private:
    CharClass();  // not implemented
    ~CharClass(); // not implemented
    static CharClass *New(size_t maxranges);

    friend class CharClassBuilder;

    bool folds_ascii_;
    int nrunes_;
    RuneRange *ranges_;
    int nranges_;

    CharClass(const CharClass &) = delete;
    CharClass &operator=(const CharClass &) = delete;
};

class Regexp {
public:
    // Flags for parsing.  Can be ORed together.
    enum ParseFlags {
        NoParseFlags = 0,
        FoldCase = 1 << 0, // Fold case during matching (case-insensitive).
        Literal = 1 << 1,  // Treat s as literal string instead of a regexp.
        ClassNL = 1 << 2,  // Allow char classes like [^a-z] and \D and \s
                           // and [[:space:]] to match newline.
        DotNL = 1 << 3,    // Allow . to match newline.
        MatchNL = ClassNL | DotNL,
        OneLine = 1 << 4,        // Treat ^ and $ as only matching at beginning and
                                 // end of text, not around embedded newlines.
                                 // (Perl's default)
        Latin1 = 1 << 5,         // Regexp and text are in Latin1, not UTF-8.
        NonGreedy = 1 << 6,      // Repetition operators are non-greedy by default.
        PerlClasses = 1 << 7,    // Allow Perl character classes like \d.
        PerlB = 1 << 8,          // Allow Perl's \b and \B.
        PerlX = 1 << 9,          // Perl extensions:
                                 //   non-capturing parens - (?: )
                                 //   non-greedy operators - *? +? ?? {}?
                                 //   flag edits - (?i) (?-i) (?i: )
                                 //     i - FoldCase
                                 //     m - !OneLine
                                 //     s - DotNL
                                 //     U - NonGreedy
                                 //   line ends: \A \z
                                 //   \Q and \E to disable/enable metacharacters
                                 //   (?P<name>expr) for named captures
                                 //   \C to match any single byte
        UnicodeGroups = 1 << 10, // Allow \p{Han} for Unicode Han group
                                 //   and \P{Han} for its negation.
        NeverNL = 1 << 11,       // Never match NL, even if the regexp mentions
                                 //   it explicitly.
        NeverCapture = 1 << 12,  // Parse all parens as non-capturing.

        // As close to Perl as we can get.
        LikePerl = ClassNL | OneLine | PerlClasses | PerlB | PerlX | UnicodeGroups,

        // Internal use only.
        WasDollar = 1 << 13, // on kRegexpEndText: was $ in regexp text
        AllParseFlags = (1 << 14) - 1,
    };

    // Get.  No set, Regexps are logically immutable once created.
    RegexpOp op() { return static_cast<RegexpOp>(op_); }
    int nsub() { return nsub_; }
    bool simple() { return simple_ != 0; }
    ParseFlags parse_flags() { return static_cast<ParseFlags>(parse_flags_); }
    int Ref(); // For testing.

    Regexp **sub() {
        if (nsub_ <= 1)
            return &subone_;
        else
            return submany_;
    }

    int min() {
        DCHECK_EQ(op_, kRegexpRepeat);
        return arguments.repeat.min_;
    }
    int max() {
        DCHECK_EQ(op_, kRegexpRepeat);
        return arguments.repeat.max_;
    }
    Rune rune() {
        DCHECK_EQ(op_, kRegexpLiteral);
        return arguments.rune_;
    }
    CharClass *cc() {
        DCHECK_EQ(op_, kRegexpCharClass);
        return arguments.char_class.cc_;
    }
    int cap() {
        DCHECK_EQ(op_, kRegexpCapture);
        return arguments.capture.cap_;
    }
    const std::string *name() {
        DCHECK_EQ(op_, kRegexpCapture);
        return arguments.capture.name_;
    }
    Rune *runes() {
        DCHECK_EQ(op_, kRegexpLiteralString);
        return arguments.literal_string.runes_;
    }
    int nrunes() {
        DCHECK_EQ(op_, kRegexpLiteralString);
        return arguments.literal_string.nrunes_;
    }
    int match_id() {
        DCHECK_EQ(op_, kRegexpHaveMatch);
        return arguments.match_id_;
    }

    // Increments reference count, returns object as convenience.
    Regexp *Incref();

    // Decrements reference count and deletes this object if count reaches 0.
    void Decref();

    // Parses string s to produce regular expression, returned.
    // Caller must release return value with re->Decref().
    // On failure, sets *status (if status != NULL) and returns NULL.
    static Regexp *Parse(const StringPiece &s, ParseFlags flags, RegexpStatus *status);

    // Returns a _new_ simplified version of the current regexp.
    // Does not edit the current regexp.
    // Caller must release return value with re->Decref().
    // Simplified means that counted repetition has been rewritten
    // into simpler terms and all Perl/POSIX features have been
    // removed.  The result will capture exactly the same
    // subexpressions the original did, unless formatted with ToString.
    Regexp *Simplify();
    friend class CoalesceWalker;
    friend class SimplifyWalker;

    // Parses the regexp src and then simplifies it and sets *dst to the
    // string representation of the simplified form.  Returns true on success.
    // Returns false and sets *status (if status != NULL) on parse error.
    static bool SimplifyRegexp(const StringPiece &src, ParseFlags flags, std::string *dst, RegexpStatus *status);

    // Returns the number of capturing groups in the regexp.
    int NumCaptures();
    friend class NumCapturesWalker;

    // Returns a map from names to capturing group indices,
    // or NULL if the regexp contains no named capture groups.
    // The caller is responsible for deleting the map.
    std::map<std::string, int> *NamedCaptures();

    // Returns a map from capturing group indices to capturing group
    // names or NULL if the regexp contains no named capture groups. The
    // caller is responsible for deleting the map.
    std::map<int, std::string> *CaptureNames();

    // Returns a string representation of the current regexp,
    // using as few parentheses as possible.
    std::string ToString();

    // Convenience functions.  They consume the passed reference,
    // so in many cases you should use, e.g., Plus(re->Incref(), flags).
    // They do not consume allocated arrays like subs or runes.
    static Regexp *Plus(Regexp *sub, ParseFlags flags);
    static Regexp *Star(Regexp *sub, ParseFlags flags);
    static Regexp *Quest(Regexp *sub, ParseFlags flags);
    static Regexp *Concat(Regexp **subs, int nsubs, ParseFlags flags);
    static Regexp *Alternate(Regexp **subs, int nsubs, ParseFlags flags);
    static Regexp *Capture(Regexp *sub, ParseFlags flags, int cap);
    static Regexp *Repeat(Regexp *sub, ParseFlags flags, int min, int max);
    static Regexp *NewLiteral(Rune rune, ParseFlags flags);
    static Regexp *NewCharClass(CharClass *cc, ParseFlags flags);
    static Regexp *LiteralString(Rune *runes, int nrunes, ParseFlags flags);
    static Regexp *HaveMatch(int match_id, ParseFlags flags);

    // Like Alternate but does not factor out common prefixes.
    static Regexp *AlternateNoFactor(Regexp **subs, int nsubs, ParseFlags flags);

    // Debugging function.  Returns string format for regexp
    // that makes structure clear.  Does NOT use regexp syntax.
    std::string Dump();

    // Helper traversal class, defined fully in walker-inl.h.
    template <typename T>
    class Walker;

    // Compile to Prog.  See prog.h
    // Reverse prog expects to be run over text backward.
    // Construction and execution of prog will
    // stay within approximately max_mem bytes of memory.
    // If max_mem <= 0, a reasonable default is used.
    Prog *CompileToProg(int64_t max_mem);
    Prog *CompileToReverseProg(int64_t max_mem);

    // Whether to expect this library to find exactly the same answer as PCRE
    // when running this regexp.  Most regexps do mimic PCRE exactly, but a few
    // obscure cases behave differently.  Technically this is more a property
    // of the Prog than the Regexp, but the computation is much easier to do
    // on the Regexp.  See mimics_pcre.cc for the exact conditions.
    bool MimicsPCRE();

    // Benchmarking function.
    void NullWalk();

    // Whether every match of this regexp must be anchored and
    // begin with a non-empty fixed string (perhaps after ASCII
    // case-folding).  If so, returns the prefix and the sub-regexp that
    // follows it.
    // Callers should expect *prefix, *foldcase and *suffix to be "zeroed"
    // regardless of the return value.
    bool RequiredPrefix(std::string *prefix, bool *foldcase, Regexp **suffix);

    // Whether every match of this regexp must be unanchored and
    // begin with a non-empty fixed string (perhaps after ASCII
    // case-folding).  If so, returns the prefix.
    // Callers should expect *prefix and *foldcase to be "zeroed"
    // regardless of the return value.
    bool RequiredPrefixForAccel(std::string *prefix, bool *foldcase);

    // Controls the maximum repeat count permitted by the parser.
    // FOR FUZZING ONLY.
    static void FUZZING_ONLY_set_maximum_repeat_count(int i);

private:
    // Constructor allocates vectors as appropriate for operator.
    explicit Regexp(RegexpOp op, ParseFlags parse_flags);

    // Use Decref() instead of delete to release Regexps.
    // This is private to catch deletes at compile time.
    ~Regexp();
    void Destroy();
    bool QuickDestroy();

    // Helpers for Parse.  Listed here so they can edit Regexps.
    class ParseState;

    friend class ParseState;
    friend bool ParseCharClass(StringPiece *s, Regexp **out_re, RegexpStatus *status);

    // Helper for testing [sic].
    friend bool RegexpEqualTestingOnly(Regexp *, Regexp *);

    // Computes whether Regexp is already simple.
    bool ComputeSimple();

    // Constructor that generates a Star, Plus or Quest,
    // squashing the pair if sub is also a Star, Plus or Quest.
    static Regexp *StarPlusOrQuest(RegexpOp op, Regexp *sub, ParseFlags flags);

    // Constructor that generates a concatenation or alternation,
    // enforcing the limit on the number of subexpressions for
    // a particular Regexp.
    static Regexp *ConcatOrAlternate(RegexpOp op, Regexp **subs, int nsubs, ParseFlags flags, bool can_factor);

    // Returns the leading string that re starts with.
    // The returned Rune* points into a piece of re,
    // so it must not be used after the caller calls re->Decref().
    static Rune *LeadingString(Regexp *re, int *nrune, ParseFlags *flags);

    // Removes the first n leading runes from the beginning of re.
    // Edits re in place.
    static void RemoveLeadingString(Regexp *re, int n);

    // Returns the leading regexp in re's top-level concatenation.
    // The returned Regexp* points at re or a sub-expression of re,
    // so it must not be used after the caller calls re->Decref().
    static Regexp *LeadingRegexp(Regexp *re);

    // Removes LeadingRegexp(re) from re and returns the remainder.
    // Might edit re in place.
    static Regexp *RemoveLeadingRegexp(Regexp *re);

    // Simplifies an alternation of literal strings by factoring out
    // common prefixes.
    static int FactorAlternation(Regexp **sub, int nsub, ParseFlags flags);
    friend class FactorAlternationImpl;

    // Is a == b?  Only efficient on regexps that have not been through
    // Simplify yet - the expansion of a kRegexpRepeat will make this
    // take a long time.  Do not call on such regexps, hence private.
    static bool Equal(Regexp *a, Regexp *b);

    // Allocate space for n sub-regexps.
    void AllocSub(int n) {
        DCHECK(n >= 0 && static_cast<uint16_t>(n) == n);
        if (n > 1)
            submany_ = new Regexp *[n];
        nsub_ = static_cast<uint16_t>(n);
    }

    // Add Rune to LiteralString
    void AddRuneToString(Rune r);

    // Swaps this with that, in place.
    void Swap(Regexp *that);

    // Operator.  See description of operators above.
    // uint8_t instead of RegexpOp to control space usage.
    uint8_t op_;

    // Is this regexp structure already simple
    // (has it been returned by Simplify)?
    // uint8_t instead of bool to control space usage.
    uint8_t simple_;

    // Flags saved from parsing and used during execution.
    // (Only FoldCase is used.)
    // uint16_t instead of ParseFlags to control space usage.
    uint16_t parse_flags_;

    // Reference count.  Exists so that SimplifyRegexp can build
    // regexp structures that are dags rather than trees to avoid
    // exponential blowup in space requirements.
    // uint16_t to control space usage.
    // The standard regexp routines will never generate a
    // ref greater than the maximum repeat count (kMaxRepeat),
    // but even so, Incref and Decref consult an overflow map
    // when ref_ reaches kMaxRef.
    uint16_t ref_;
    static const uint16_t kMaxRef = 0xffff;

    // Subexpressions.
    // uint16_t to control space usage.
    // Concat and Alternate handle larger numbers of subexpressions
    // by building concatenation or alternation trees.
    // Other routines should call Concat or Alternate instead of
    // filling in sub() by hand.
    uint16_t nsub_;
    static const uint16_t kMaxNsub = 0xffff;
    union {
        Regexp **submany_; // if nsub_ > 1
        Regexp *subone_;   // if nsub_ == 1
    };

    // Extra space for parse and teardown stacks.
    Regexp *down_;

    // Arguments to operator.  See description of operators above.
    union {
        struct { // Repeat
            int max_;
            int min_;
        } repeat;
        struct { // Capture
            int cap_;
            std::string *name_;
        } capture;
        struct { // LiteralString
            int nrunes_;
            Rune *runes_;
        } literal_string;
        struct { // CharClass
            // These two could be in separate union members,
            // but it wouldn't save any space (there are other two-word structs)
            // and keeping them separate avoids confusion during parsing.
            CharClass *cc_;
            CharClassBuilder *ccb_;
        } char_class;
        Rune rune_;          // Literal
        int match_id_;       // HaveMatch
        void *the_union_[2]; // as big as any other element, for memset
    } arguments;

    Regexp(const Regexp &) = delete;
    Regexp &operator=(const Regexp &) = delete;
};

// Character class set: contains non-overlapping, non-abutting RuneRanges.
typedef std::set<RuneRange, RuneRangeLess> RuneRangeSet;

class CharClassBuilder {
public:
    CharClassBuilder();

    typedef RuneRangeSet::iterator iterator;
    iterator begin() { return ranges_.begin(); }
    iterator end() { return ranges_.end(); }

    int size() { return nrunes_; }
    bool empty() { return nrunes_ == 0; }
    bool full() { return nrunes_ == Runemax + 1; }

    bool Contains(Rune r);
    bool FoldsASCII();
    bool AddRange(Rune lo, Rune hi); // returns whether class changed
    CharClassBuilder *Copy();
    void AddCharClass(CharClassBuilder *cc);
    void Negate();
    void RemoveAbove(Rune r);
    CharClass *GetCharClass();
    void AddRangeFlags(Rune lo, Rune hi, Regexp::ParseFlags parse_flags);

private:
    static const uint32_t AlphaMask = (1 << 26) - 1;
    uint32_t upper_; // bitmap of A-Z
    uint32_t lower_; // bitmap of a-z
    int nrunes_;
    RuneRangeSet ranges_;

    CharClassBuilder(const CharClassBuilder &) = delete;
    CharClassBuilder &operator=(const CharClassBuilder &) = delete;
};

// Bitwise ops on ParseFlags produce ParseFlags.
inline Regexp::ParseFlags operator|(Regexp::ParseFlags a, Regexp::ParseFlags b) {
    return static_cast<Regexp::ParseFlags>(static_cast<int>(a) | static_cast<int>(b));
}

inline Regexp::ParseFlags operator^(Regexp::ParseFlags a, Regexp::ParseFlags b) {
    return static_cast<Regexp::ParseFlags>(static_cast<int>(a) ^ static_cast<int>(b));
}

inline Regexp::ParseFlags operator&(Regexp::ParseFlags a, Regexp::ParseFlags b) {
    return static_cast<Regexp::ParseFlags>(static_cast<int>(a) & static_cast<int>(b));
}

inline Regexp::ParseFlags operator~(Regexp::ParseFlags a) {
    // Attempting to produce a value out of enum's range has undefined behaviour.
    return static_cast<Regexp::ParseFlags>(~static_cast<int>(a) & static_cast<int>(Regexp::AllParseFlags));
}

} // namespace re2

#endif // RE2_REGEXP_H_