// Copyright 2007 The RE2 Authors. All Rights Reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. // Compile regular expression to Prog. // // Prog and Inst are defined in prog.h. // This file's external interface is just Regexp::CompileToProg. // The Compiler class defined in this file is private. #include #include #include #include #include "re2/pod_array.h" #include "re2/prog.h" #include "re2/re2.h" #include "re2/regexp.h" #include "re2/walker-inl.h" #include "util/logging.h" #include "util/utf.h" namespace re2 { // List of pointers to Inst* that need to be filled in (patched). // Because the Inst* haven't been filled in yet, // we can use the Inst* word to hold the list's "next" pointer. // It's kind of sleazy, but it works well in practice. // See http://swtch.com/~rsc/regexp/regexp1.html for inspiration. // // Because the out and out1 fields in Inst are no longer pointers, // we can't use pointers directly here either. Instead, head refers // to inst_[head>>1].out (head&1 == 0) or inst_[head>>1].out1 (head&1 == 1). // head == 0 represents the NULL list. This is okay because instruction #0 // is always the fail instruction, which never appears on a list. struct PatchList { // Returns patch list containing just p. static PatchList Mk(uint32_t p) { return {p, p}; } // Patches all the entries on l to have value p. // Caller must not ever use patch list again. static void Patch(Prog::Inst *inst0, PatchList l, uint32_t p) { while (l.head != 0) { Prog::Inst *ip = &inst0[l.head >> 1]; if (l.head & 1) { l.head = ip->out1(); ip->out1_ = p; } else { l.head = ip->out(); ip->set_out(p); } } } // Appends two patch lists and returns result. static PatchList Append(Prog::Inst *inst0, PatchList l1, PatchList l2) { if (l1.head == 0) return l2; if (l2.head == 0) return l1; Prog::Inst *ip = &inst0[l1.tail >> 1]; if (l1.tail & 1) ip->out1_ = l2.head; else ip->set_out(l2.head); return {l1.head, l2.tail}; } uint32_t head; uint32_t tail; // for constant-time append }; static const PatchList kNullPatchList = {0, 0}; // Compiled program fragment. struct Frag { uint32_t begin; PatchList end; bool nullable; Frag() : begin(0), end(kNullPatchList), nullable(false) {} Frag(uint32_t begin, PatchList end, bool nullable) : begin(begin), end(end), nullable(nullable) {} }; // Input encodings. enum Encoding { kEncodingUTF8 = 1, // UTF-8 (0-10FFFF) kEncodingLatin1, // Latin-1 (0-FF) }; class Compiler : public Regexp::Walker { public: explicit Compiler(); ~Compiler(); // Compiles Regexp to a new Prog. // Caller is responsible for deleting Prog when finished with it. // If reversed is true, compiles for walking over the input // string backward (reverses all concatenations). static Prog *Compile(Regexp *re, bool reversed, int64_t max_mem); // Compiles alternation of all the re to a new Prog. // Each re has a match with an id equal to its index in the vector. static Prog *CompileSet(Regexp *re, RE2::Anchor anchor, int64_t max_mem); // Interface for Regexp::Walker, which helps traverse the Regexp. // The walk is purely post-recursive: given the machines for the // children, PostVisit combines them to create the machine for // the current node. The child_args are Frags. // The Compiler traverses the Regexp parse tree, visiting // each node in depth-first order. It invokes PreVisit before // visiting the node's children and PostVisit after visiting // the children. Frag PreVisit(Regexp *re, Frag parent_arg, bool *stop); Frag PostVisit(Regexp *re, Frag parent_arg, Frag pre_arg, Frag *child_args, int nchild_args); Frag ShortVisit(Regexp *re, Frag parent_arg); Frag Copy(Frag arg); // Given fragment a, returns a+ or a+?; a* or a*?; a? or a?? Frag Plus(Frag a, bool nongreedy); Frag Star(Frag a, bool nongreedy); Frag Quest(Frag a, bool nongreedy); // Given fragment a, returns (a) capturing as \n. Frag Capture(Frag a, int n); // Given fragments a and b, returns ab; a|b Frag Cat(Frag a, Frag b); Frag Alt(Frag a, Frag b); // Returns a fragment that can't match anything. Frag NoMatch(); // Returns a fragment that matches the empty string. Frag Match(int32_t id); // Returns a no-op fragment. Frag Nop(); // Returns a fragment matching the byte range lo-hi. Frag ByteRange(int lo, int hi, bool foldcase); // Returns a fragment matching an empty-width special op. Frag EmptyWidth(EmptyOp op); // Adds n instructions to the program. // Returns the index of the first one. // Returns -1 if no more instructions are available. int AllocInst(int n); // Rune range compiler. // Begins a new alternation. void BeginRange(); // Adds a fragment matching the rune range lo-hi. void AddRuneRange(Rune lo, Rune hi, bool foldcase); void AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase); void AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase); void Add_80_10ffff(); // New suffix that matches the byte range lo-hi, then goes to next. int UncachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, int next); int CachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, int next); // Returns true iff the suffix is cached. bool IsCachedRuneByteSuffix(int id); // Adds a suffix to alternation. void AddSuffix(int id); // Adds a suffix to the trie starting from the given root node. // Returns zero iff allocating an instruction fails. Otherwise, returns // the current root node, which might be different from what was given. int AddSuffixRecursive(int root, int id); // Finds the trie node for the given suffix. Returns a Frag in order to // distinguish between pointing at the root node directly (end.head == 0) // and pointing at an Alt's out1 or out (end.head&1 == 1 or 0, respectively). Frag FindByteRange(int root, int id); // Compares two ByteRanges and returns true iff they are equal. bool ByteRangeEqual(int id1, int id2); // Returns the alternation of all the added suffixes. Frag EndRange(); // Single rune. Frag Literal(Rune r, bool foldcase); void Setup(Regexp::ParseFlags flags, int64_t max_mem, RE2::Anchor anchor); Prog *Finish(Regexp *re); // Returns .* where dot = any byte Frag DotStar(); private: Prog *prog_; // Program being built. bool failed_; // Did we give up compiling? Encoding encoding_; // Input encoding bool reversed_; // Should program run backward over text? PODArray inst_; int ninst_; // Number of instructions used. int max_ninst_; // Maximum number of instructions. int64_t max_mem_; // Total memory budget. std::unordered_map rune_cache_; Frag rune_range_; RE2::Anchor anchor_; // anchor mode for RE2::Set Compiler(const Compiler &) = delete; Compiler &operator=(const Compiler &) = delete; }; Compiler::Compiler() { prog_ = new Prog(); failed_ = false; encoding_ = kEncodingUTF8; reversed_ = false; ninst_ = 0; max_ninst_ = 1; // make AllocInst for fail instruction okay max_mem_ = 0; int fail = AllocInst(1); inst_[fail].InitFail(); max_ninst_ = 0; // Caller must change } Compiler::~Compiler() { delete prog_; } int Compiler::AllocInst(int n) { if (failed_ || ninst_ + n > max_ninst_) { failed_ = true; return -1; } if (ninst_ + n > inst_.size()) { int cap = inst_.size(); if (cap == 0) cap = 8; while (ninst_ + n > cap) cap *= 2; PODArray inst(cap); if (inst_.data() != NULL) memmove(inst.data(), inst_.data(), ninst_ * sizeof inst_[0]); memset(inst.data() + ninst_, 0, (cap - ninst_) * sizeof inst_[0]); inst_ = std::move(inst); } int id = ninst_; ninst_ += n; return id; } // These routines are somewhat hard to visualize in text -- // see http://swtch.com/~rsc/regexp/regexp1.html for // pictures explaining what is going on here. // Returns an unmatchable fragment. Frag Compiler::NoMatch() { return Frag(); } // Is a an unmatchable fragment? static bool IsNoMatch(Frag a) { return a.begin == 0; } // Given fragments a and b, returns fragment for ab. Frag Compiler::Cat(Frag a, Frag b) { if (IsNoMatch(a) || IsNoMatch(b)) return NoMatch(); // Elide no-op. Prog::Inst *begin = &inst_[a.begin]; if (begin->opcode() == kInstNop && a.end.head == (a.begin << 1) && begin->out() == 0) { // in case refs to a somewhere PatchList::Patch(inst_.data(), a.end, b.begin); return b; } // To run backward over string, reverse all concatenations. if (reversed_) { PatchList::Patch(inst_.data(), b.end, a.begin); return Frag(b.begin, a.end, b.nullable && a.nullable); } PatchList::Patch(inst_.data(), a.end, b.begin); return Frag(a.begin, b.end, a.nullable && b.nullable); } // Given fragments for a and b, returns fragment for a|b. Frag Compiler::Alt(Frag a, Frag b) { // Special case for convenience in loops. if (IsNoMatch(a)) return b; if (IsNoMatch(b)) return a; int id = AllocInst(1); if (id < 0) return NoMatch(); inst_[id].InitAlt(a.begin, b.begin); return Frag(id, PatchList::Append(inst_.data(), a.end, b.end), a.nullable || b.nullable); } // When capturing submatches in like-Perl mode, a kOpAlt Inst // treats out_ as the first choice, out1_ as the second. // // For *, +, and ?, if out_ causes another repetition, // then the operator is greedy. If out1_ is the repetition // (and out_ moves forward), then the operator is non-greedy. // Given a fragment for a, returns a fragment for a+ or a+? (if nongreedy) Frag Compiler::Plus(Frag a, bool nongreedy) { int id = AllocInst(1); if (id < 0) return NoMatch(); PatchList pl; if (nongreedy) { inst_[id].InitAlt(0, a.begin); pl = PatchList::Mk(id << 1); } else { inst_[id].InitAlt(a.begin, 0); pl = PatchList::Mk((id << 1) | 1); } PatchList::Patch(inst_.data(), a.end, id); return Frag(a.begin, pl, a.nullable); } // Given a fragment for a, returns a fragment for a* or a*? (if nongreedy) Frag Compiler::Star(Frag a, bool nongreedy) { // When the subexpression is nullable, one Alt isn't enough to guarantee // correct priority ordering within the transitive closure. The simplest // solution is to handle it as (a+)? instead, which adds the second Alt. if (a.nullable) return Quest(Plus(a, nongreedy), nongreedy); int id = AllocInst(1); if (id < 0) return NoMatch(); PatchList pl; if (nongreedy) { inst_[id].InitAlt(0, a.begin); pl = PatchList::Mk(id << 1); } else { inst_[id].InitAlt(a.begin, 0); pl = PatchList::Mk((id << 1) | 1); } PatchList::Patch(inst_.data(), a.end, id); return Frag(id, pl, true); } // Given a fragment for a, returns a fragment for a? or a?? (if nongreedy) Frag Compiler::Quest(Frag a, bool nongreedy) { if (IsNoMatch(a)) return Nop(); int id = AllocInst(1); if (id < 0) return NoMatch(); PatchList pl; if (nongreedy) { inst_[id].InitAlt(0, a.begin); pl = PatchList::Mk(id << 1); } else { inst_[id].InitAlt(a.begin, 0); pl = PatchList::Mk((id << 1) | 1); } return Frag(id, PatchList::Append(inst_.data(), pl, a.end), true); } // Returns a fragment for the byte range lo-hi. Frag Compiler::ByteRange(int lo, int hi, bool foldcase) { int id = AllocInst(1); if (id < 0) return NoMatch(); inst_[id].InitByteRange(lo, hi, foldcase, 0); return Frag(id, PatchList::Mk(id << 1), false); } // Returns a no-op fragment. Sometimes unavoidable. Frag Compiler::Nop() { int id = AllocInst(1); if (id < 0) return NoMatch(); inst_[id].InitNop(0); return Frag(id, PatchList::Mk(id << 1), true); } // Returns a fragment that signals a match. Frag Compiler::Match(int32_t match_id) { int id = AllocInst(1); if (id < 0) return NoMatch(); inst_[id].InitMatch(match_id); return Frag(id, kNullPatchList, false); } // Returns a fragment matching a particular empty-width op (like ^ or $) Frag Compiler::EmptyWidth(EmptyOp empty) { int id = AllocInst(1); if (id < 0) return NoMatch(); inst_[id].InitEmptyWidth(empty, 0); return Frag(id, PatchList::Mk(id << 1), true); } // Given a fragment a, returns a fragment with capturing parens around a. Frag Compiler::Capture(Frag a, int n) { if (IsNoMatch(a)) return NoMatch(); int id = AllocInst(2); if (id < 0) return NoMatch(); inst_[id].InitCapture(2 * n, a.begin); inst_[id + 1].InitCapture(2 * n + 1, 0); PatchList::Patch(inst_.data(), a.end, id + 1); return Frag(id, PatchList::Mk((id + 1) << 1), a.nullable); } // A Rune is a name for a Unicode code point. // Returns maximum rune encoded by UTF-8 sequence of length len. static int MaxRune(int len) { int b; // number of Rune bits in len-byte UTF-8 sequence (len < UTFmax) if (len == 1) b = 7; else b = 8 - (len + 1) + 6 * (len - 1); return (1 << b) - 1; // maximum Rune for b bits. } // The rune range compiler caches common suffix fragments, // which are very common in UTF-8 (e.g., [80-bf]). // The fragment suffixes are identified by their start // instructions. NULL denotes the eventual end match. // The Frag accumulates in rune_range_. Caching common // suffixes reduces the UTF-8 "." from 32 to 24 instructions, // and it reduces the corresponding one-pass NFA from 16 nodes to 8. void Compiler::BeginRange() { rune_cache_.clear(); rune_range_.begin = 0; rune_range_.end = kNullPatchList; } int Compiler::UncachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, int next) { Frag f = ByteRange(lo, hi, foldcase); if (next != 0) { PatchList::Patch(inst_.data(), f.end, next); } else { rune_range_.end = PatchList::Append(inst_.data(), rune_range_.end, f.end); } return f.begin; } static uint64_t MakeRuneCacheKey(uint8_t lo, uint8_t hi, bool foldcase, int next) { return (uint64_t)next << 17 | (uint64_t)lo << 9 | (uint64_t)hi << 1 | (uint64_t)foldcase; } int Compiler::CachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, int next) { uint64_t key = MakeRuneCacheKey(lo, hi, foldcase, next); std::unordered_map::const_iterator it = rune_cache_.find(key); if (it != rune_cache_.end()) return it->second; int id = UncachedRuneByteSuffix(lo, hi, foldcase, next); rune_cache_[key] = id; return id; } bool Compiler::IsCachedRuneByteSuffix(int id) { uint8_t lo = inst_[id].byte_range.lo_; uint8_t hi = inst_[id].byte_range.hi_; bool foldcase = inst_[id].foldcase() != 0; int next = inst_[id].out(); uint64_t key = MakeRuneCacheKey(lo, hi, foldcase, next); return rune_cache_.find(key) != rune_cache_.end(); } void Compiler::AddSuffix(int id) { if (failed_) return; if (rune_range_.begin == 0) { rune_range_.begin = id; return; } if (encoding_ == kEncodingUTF8) { // Build a trie in order to reduce fanout. rune_range_.begin = AddSuffixRecursive(rune_range_.begin, id); return; } int alt = AllocInst(1); if (alt < 0) { rune_range_.begin = 0; return; } inst_[alt].InitAlt(rune_range_.begin, id); rune_range_.begin = alt; } int Compiler::AddSuffixRecursive(int root, int id) { DCHECK(inst_[root].opcode() == kInstAlt || inst_[root].opcode() == kInstByteRange); Frag f = FindByteRange(root, id); if (IsNoMatch(f)) { int alt = AllocInst(1); if (alt < 0) return 0; inst_[alt].InitAlt(root, id); return alt; } int br; if (f.end.head == 0) br = root; else if (f.end.head & 1) br = inst_[f.begin].out1(); else br = inst_[f.begin].out(); if (IsCachedRuneByteSuffix(br)) { // We can't fiddle with cached suffixes, so make a clone of the head. int byterange = AllocInst(1); if (byterange < 0) return 0; inst_[byterange].InitByteRange(inst_[br].lo(), inst_[br].hi(), inst_[br].foldcase(), inst_[br].out()); // Ensure that the parent points to the clone, not to the original. // Note that this could leave the head unreachable except via the cache. br = byterange; if (f.end.head == 0) root = br; else if (f.end.head & 1) inst_[f.begin].out1_ = br; else inst_[f.begin].set_out(br); } int out = inst_[id].out(); if (!IsCachedRuneByteSuffix(id)) { // The head should be the instruction most recently allocated, so free it // instead of leaving it unreachable. DCHECK_EQ(id, ninst_ - 1); inst_[id].out_opcode_ = 0; inst_[id].out1_ = 0; ninst_--; } out = AddSuffixRecursive(inst_[br].out(), out); if (out == 0) return 0; inst_[br].set_out(out); return root; } bool Compiler::ByteRangeEqual(int id1, int id2) { return inst_[id1].lo() == inst_[id2].lo() && inst_[id1].hi() == inst_[id2].hi() && inst_[id1].foldcase() == inst_[id2].foldcase(); } Frag Compiler::FindByteRange(int root, int id) { if (inst_[root].opcode() == kInstByteRange) { if (ByteRangeEqual(root, id)) return Frag(root, kNullPatchList, false); else return NoMatch(); } while (inst_[root].opcode() == kInstAlt) { int out1 = inst_[root].out1(); if (ByteRangeEqual(out1, id)) return Frag(root, PatchList::Mk((root << 1) | 1), false); // CharClass is a sorted list of ranges, so if out1 of the root Alt wasn't // what we're looking for, then we can stop immediately. Unfortunately, we // can't short-circuit the search in reverse mode. if (!reversed_) return NoMatch(); int out = inst_[root].out(); if (inst_[out].opcode() == kInstAlt) root = out; else if (ByteRangeEqual(out, id)) return Frag(root, PatchList::Mk(root << 1), false); else return NoMatch(); } LOG(DFATAL) << "should never happen"; return NoMatch(); } Frag Compiler::EndRange() { return rune_range_; } // Converts rune range lo-hi into a fragment that recognizes // the bytes that would make up those runes in the current // encoding (Latin 1 or UTF-8). // This lets the machine work byte-by-byte even when // using multibyte encodings. void Compiler::AddRuneRange(Rune lo, Rune hi, bool foldcase) { switch (encoding_) { default: case kEncodingUTF8: AddRuneRangeUTF8(lo, hi, foldcase); break; case kEncodingLatin1: AddRuneRangeLatin1(lo, hi, foldcase); break; } } void Compiler::AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase) { // Latin-1 is easy: runes *are* bytes. if (lo > hi || lo > 0xFF) return; if (hi > 0xFF) hi = 0xFF; AddSuffix(UncachedRuneByteSuffix(static_cast(lo), static_cast(hi), foldcase, 0)); } void Compiler::Add_80_10ffff() { // The 80-10FFFF (Runeself-Runemax) rune range occurs frequently enough // (for example, for /./ and /[^a-z]/) that it is worth simplifying: by // permitting overlong encodings in E0 and F0 sequences and code points // over 10FFFF in F4 sequences, the size of the bytecode and the number // of equivalence classes are reduced significantly. int id; if (reversed_) { // Prefix factoring matters, but we don't have to handle it here // because the rune range trie logic takes care of that already. id = UncachedRuneByteSuffix(0xC2, 0xDF, false, 0); id = UncachedRuneByteSuffix(0x80, 0xBF, false, id); AddSuffix(id); id = UncachedRuneByteSuffix(0xE0, 0xEF, false, 0); id = UncachedRuneByteSuffix(0x80, 0xBF, false, id); id = UncachedRuneByteSuffix(0x80, 0xBF, false, id); AddSuffix(id); id = UncachedRuneByteSuffix(0xF0, 0xF4, false, 0); id = UncachedRuneByteSuffix(0x80, 0xBF, false, id); id = UncachedRuneByteSuffix(0x80, 0xBF, false, id); id = UncachedRuneByteSuffix(0x80, 0xBF, false, id); AddSuffix(id); } else { // Suffix factoring matters - and we do have to handle it here. int cont1 = UncachedRuneByteSuffix(0x80, 0xBF, false, 0); id = UncachedRuneByteSuffix(0xC2, 0xDF, false, cont1); AddSuffix(id); int cont2 = UncachedRuneByteSuffix(0x80, 0xBF, false, cont1); id = UncachedRuneByteSuffix(0xE0, 0xEF, false, cont2); AddSuffix(id); int cont3 = UncachedRuneByteSuffix(0x80, 0xBF, false, cont2); id = UncachedRuneByteSuffix(0xF0, 0xF4, false, cont3); AddSuffix(id); } } void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) { if (lo > hi) return; // Pick off 80-10FFFF as a common special case. if (lo == 0x80 && hi == 0x10ffff) { Add_80_10ffff(); return; } // Split range into same-length sized ranges. for (int i = 1; i < UTFmax; i++) { Rune max = MaxRune(i); if (lo <= max && max < hi) { AddRuneRangeUTF8(lo, max, foldcase); AddRuneRangeUTF8(max + 1, hi, foldcase); return; } } // ASCII range is always a special case. if (hi < Runeself) { AddSuffix(UncachedRuneByteSuffix(static_cast(lo), static_cast(hi), foldcase, 0)); return; } // Split range into sections that agree on leading bytes. for (int i = 1; i < UTFmax; i++) { uint32_t m = (1 << (6 * i)) - 1; // last i bytes of a UTF-8 sequence if ((lo & ~m) != (hi & ~m)) { if ((lo & m) != 0) { AddRuneRangeUTF8(lo, lo | m, foldcase); AddRuneRangeUTF8((lo | m) + 1, hi, foldcase); return; } if ((hi & m) != m) { AddRuneRangeUTF8(lo, (hi & ~m) - 1, foldcase); AddRuneRangeUTF8(hi & ~m, hi, foldcase); return; } } } // Finally. Generate byte matching equivalent for lo-hi. uint8_t ulo[UTFmax], uhi[UTFmax]; int n = runetochar(reinterpret_cast(ulo), &lo); int m = runetochar(reinterpret_cast(uhi), &hi); (void)m; // USED(m) DCHECK_EQ(n, m); // The logic below encodes this thinking: // // 1. When we have built the whole suffix, we know that it cannot // possibly be a suffix of anything longer: in forward mode, nothing // else can occur before the leading byte; in reverse mode, nothing // else can occur after the last continuation byte or else the leading // byte would have to change. Thus, there is no benefit to caching // the first byte of the suffix whereas there is a cost involved in // cloning it if it begins a common prefix, which is fairly likely. // // 2. Conversely, the last byte of the suffix cannot possibly be a // prefix of anything because next == 0, so we will never want to // clone it, but it is fairly likely to be a common suffix. Perhaps // more so in reverse mode than in forward mode because the former is // "converging" towards lower entropy, but caching is still worthwhile // for the latter in cases such as 80-BF. // // 3. Handling the bytes between the first and the last is less // straightforward and, again, the approach depends on whether we are // "converging" towards lower entropy: in forward mode, a single byte // is unlikely to be part of a common suffix whereas a byte range // is more likely so; in reverse mode, a byte range is unlikely to // be part of a common suffix whereas a single byte is more likely // so. The same benefit versus cost argument applies here. int id = 0; if (reversed_) { for (int i = 0; i < n; i++) { // In reverse UTF-8 mode: cache the leading byte; don't cache the last // continuation byte; cache anything else iff it's a single byte (XX-XX). if (i == 0 || (ulo[i] == uhi[i] && i != n - 1)) id = CachedRuneByteSuffix(ulo[i], uhi[i], false, id); else id = UncachedRuneByteSuffix(ulo[i], uhi[i], false, id); } } else { for (int i = n - 1; i >= 0; i--) { // In forward UTF-8 mode: don't cache the leading byte; cache the last // continuation byte; cache anything else iff it's a byte range (XX-YY). if (i == n - 1 || (ulo[i] < uhi[i] && i != 0)) id = CachedRuneByteSuffix(ulo[i], uhi[i], false, id); else id = UncachedRuneByteSuffix(ulo[i], uhi[i], false, id); } } AddSuffix(id); } // Should not be called. Frag Compiler::Copy(Frag arg) { // We're using WalkExponential; there should be no copying. failed_ = true; LOG(DFATAL) << "Compiler::Copy called!"; return NoMatch(); } // Visits a node quickly; called once WalkExponential has // decided to cut this walk short. Frag Compiler::ShortVisit(Regexp *re, Frag) { failed_ = true; return NoMatch(); } // Called before traversing a node's children during the walk. Frag Compiler::PreVisit(Regexp *re, Frag, bool *stop) { // Cut off walk if we've already failed. if (failed_) *stop = true; return Frag(); // not used by caller } Frag Compiler::Literal(Rune r, bool foldcase) { switch (encoding_) { default: return Frag(); case kEncodingLatin1: return ByteRange(r, r, foldcase); case kEncodingUTF8: { if (r < Runeself) // Make common case fast. return ByteRange(r, r, foldcase); uint8_t buf[UTFmax]; int n = runetochar(reinterpret_cast(buf), &r); Frag f = ByteRange((uint8_t)buf[0], buf[0], false); for (int i = 1; i < n; i++) f = Cat(f, ByteRange((uint8_t)buf[i], buf[i], false)); return f; } } } // Called after traversing the node's children during the walk. // Given their frags, build and return the frag for this re. Frag Compiler::PostVisit(Regexp *re, Frag, Frag, Frag *child_frags, int nchild_frags) { // If a child failed, don't bother going forward, especially // since the child_frags might contain Frags with NULLs in them. if (failed_) return NoMatch(); // Given the child fragments, return the fragment for this node. switch (re->op()) { case kRegexpRepeat: // Should not see; code at bottom of function will print error break; case kRegexpNoMatch: return NoMatch(); case kRegexpEmptyMatch: return Nop(); case kRegexpHaveMatch: { Frag f = Match(re->match_id()); if (anchor_ == RE2::ANCHOR_BOTH) { // Append \z or else the subexpression will effectively be unanchored. // Complemented by the UNANCHORED case in CompileSet(). f = Cat(EmptyWidth(kEmptyEndText), f); } return f; } case kRegexpConcat: { Frag f = child_frags[0]; for (int i = 1; i < nchild_frags; i++) f = Cat(f, child_frags[i]); return f; } case kRegexpAlternate: { Frag f = child_frags[0]; for (int i = 1; i < nchild_frags; i++) f = Alt(f, child_frags[i]); return f; } case kRegexpStar: return Star(child_frags[0], (re->parse_flags() & Regexp::NonGreedy) != 0); case kRegexpPlus: return Plus(child_frags[0], (re->parse_flags() & Regexp::NonGreedy) != 0); case kRegexpQuest: return Quest(child_frags[0], (re->parse_flags() & Regexp::NonGreedy) != 0); case kRegexpLiteral: return Literal(re->rune(), (re->parse_flags() & Regexp::FoldCase) != 0); case kRegexpLiteralString: { // Concatenation of literals. if (re->nrunes() == 0) return Nop(); Frag f; for (int i = 0; i < re->nrunes(); i++) { Frag f1 = Literal(re->runes()[i], (re->parse_flags() & Regexp::FoldCase) != 0); if (i == 0) f = f1; else f = Cat(f, f1); } return f; } case kRegexpAnyChar: BeginRange(); AddRuneRange(0, Runemax, false); return EndRange(); case kRegexpAnyByte: return ByteRange(0x00, 0xFF, false); case kRegexpCharClass: { CharClass *cc = re->cc(); if (cc->empty()) { // This can't happen. failed_ = true; LOG(DFATAL) << "No ranges in char class"; return NoMatch(); } // ASCII case-folding optimization: if the char class // behaves the same on A-Z as it does on a-z, // discard any ranges wholly contained in A-Z // and mark the other ranges as foldascii. // This reduces the size of a program for // (?i)abc from 3 insts per letter to 1 per letter. bool foldascii = cc->FoldsASCII(); // Character class is just a big OR of the different // character ranges in the class. BeginRange(); for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i) { // ASCII case-folding optimization (see above). if (foldascii && 'A' <= i->lo && i->hi <= 'Z') continue; // If this range contains all of A-Za-z or none of it, // the fold flag is unnecessary; don't bother. bool fold = foldascii; if ((i->lo <= 'A' && 'z' <= i->hi) || i->hi < 'A' || 'z' < i->lo || ('Z' < i->lo && i->hi < 'a')) fold = false; AddRuneRange(i->lo, i->hi, fold); } return EndRange(); } case kRegexpCapture: // If this is a non-capturing parenthesis -- (?:foo) -- // just use the inner expression. if (re->cap() < 0) return child_frags[0]; return Capture(child_frags[0], re->cap()); case kRegexpBeginLine: return EmptyWidth(reversed_ ? kEmptyEndLine : kEmptyBeginLine); case kRegexpEndLine: return EmptyWidth(reversed_ ? kEmptyBeginLine : kEmptyEndLine); case kRegexpBeginText: return EmptyWidth(reversed_ ? kEmptyEndText : kEmptyBeginText); case kRegexpEndText: return EmptyWidth(reversed_ ? kEmptyBeginText : kEmptyEndText); case kRegexpWordBoundary: return EmptyWidth(kEmptyWordBoundary); case kRegexpNoWordBoundary: return EmptyWidth(kEmptyNonWordBoundary); } failed_ = true; LOG(DFATAL) << "Missing case in Compiler: " << re->op(); return NoMatch(); } // Is this regexp required to start at the beginning of the text? // Only approximate; can return false for complicated regexps like (\Aa|\Ab), // but handles (\A(a|b)). Could use the Walker to write a more exact one. static bool IsAnchorStart(Regexp **pre, int depth) { Regexp *re = *pre; Regexp *sub; // The depth limit makes sure that we don't overflow // the stack on a deeply nested regexp. As the comment // above says, IsAnchorStart is conservative, so returning // a false negative is okay. The exact limit is somewhat arbitrary. if (re == NULL || depth >= 4) return false; switch (re->op()) { default: break; case kRegexpConcat: if (re->nsub() > 0) { sub = re->sub()[0]->Incref(); if (IsAnchorStart(&sub, depth + 1)) { PODArray subcopy(re->nsub()); subcopy[0] = sub; // already have reference for (int i = 1; i < re->nsub(); i++) subcopy[i] = re->sub()[i]->Incref(); *pre = Regexp::Concat(subcopy.data(), re->nsub(), re->parse_flags()); re->Decref(); return true; } sub->Decref(); } break; case kRegexpCapture: sub = re->sub()[0]->Incref(); if (IsAnchorStart(&sub, depth + 1)) { *pre = Regexp::Capture(sub, re->parse_flags(), re->cap()); re->Decref(); return true; } sub->Decref(); break; case kRegexpBeginText: *pre = Regexp::LiteralString(NULL, 0, re->parse_flags()); re->Decref(); return true; } return false; } // Is this regexp required to start at the end of the text? // Only approximate; can return false for complicated regexps like (a\z|b\z), // but handles ((a|b)\z). Could use the Walker to write a more exact one. static bool IsAnchorEnd(Regexp **pre, int depth) { Regexp *re = *pre; Regexp *sub; // The depth limit makes sure that we don't overflow // the stack on a deeply nested regexp. As the comment // above says, IsAnchorEnd is conservative, so returning // a false negative is okay. The exact limit is somewhat arbitrary. if (re == NULL || depth >= 4) return false; switch (re->op()) { default: break; case kRegexpConcat: if (re->nsub() > 0) { sub = re->sub()[re->nsub() - 1]->Incref(); if (IsAnchorEnd(&sub, depth + 1)) { PODArray subcopy(re->nsub()); subcopy[re->nsub() - 1] = sub; // already have reference for (int i = 0; i < re->nsub() - 1; i++) subcopy[i] = re->sub()[i]->Incref(); *pre = Regexp::Concat(subcopy.data(), re->nsub(), re->parse_flags()); re->Decref(); return true; } sub->Decref(); } break; case kRegexpCapture: sub = re->sub()[0]->Incref(); if (IsAnchorEnd(&sub, depth + 1)) { *pre = Regexp::Capture(sub, re->parse_flags(), re->cap()); re->Decref(); return true; } sub->Decref(); break; case kRegexpEndText: *pre = Regexp::LiteralString(NULL, 0, re->parse_flags()); re->Decref(); return true; } return false; } void Compiler::Setup(Regexp::ParseFlags flags, int64_t max_mem, RE2::Anchor anchor) { if (flags & Regexp::Latin1) encoding_ = kEncodingLatin1; max_mem_ = max_mem; if (max_mem <= 0) { max_ninst_ = 100000; // more than enough } else if (static_cast(max_mem) <= sizeof(Prog)) { // No room for anything. max_ninst_ = 0; } else { int64_t m = (max_mem - sizeof(Prog)) / sizeof(Prog::Inst); // Limit instruction count so that inst->id() fits nicely in an int. // SparseArray also assumes that the indices (inst->id()) are ints. // The call to WalkExponential uses 2*max_ninst_ below, // and other places in the code use 2 or 3 * prog->size(). // Limiting to 2^24 should avoid overflow in those places. // (The point of allowing more than 32 bits of memory is to // have plenty of room for the DFA states, not to use it up // on the program.) if (m >= 1 << 24) m = 1 << 24; // Inst imposes its own limit (currently bigger than 2^24 but be safe). if (m > Prog::Inst::kMaxInst) m = Prog::Inst::kMaxInst; max_ninst_ = static_cast(m); } anchor_ = anchor; } // Compiles re, returning program. // Caller is responsible for deleting prog_. // If reversed is true, compiles a program that expects // to run over the input string backward (reverses all concatenations). // The reversed flag is also recorded in the returned program. Prog *Compiler::Compile(Regexp *re, bool reversed, int64_t max_mem) { Compiler c; c.Setup(re->parse_flags(), max_mem, RE2::UNANCHORED /* unused */); c.reversed_ = reversed; // Simplify to remove things like counted repetitions // and character classes like \d. Regexp *sre = re->Simplify(); if (sre == NULL) return NULL; // Record whether prog is anchored, removing the anchors. // (They get in the way of other optimizations.) bool is_anchor_start = IsAnchorStart(&sre, 0); bool is_anchor_end = IsAnchorEnd(&sre, 0); // Generate fragment for entire regexp. Frag all = c.WalkExponential(sre, Frag(), 2 * c.max_ninst_); sre->Decref(); if (c.failed_) return NULL; // Success! Finish by putting Match node at end, and record start. // Turn off c.reversed_ (if it is set) to force the remaining concatenations // to behave normally. c.reversed_ = false; all = c.Cat(all, c.Match(0)); c.prog_->set_reversed(reversed); if (c.prog_->reversed()) { c.prog_->set_anchor_start(is_anchor_end); c.prog_->set_anchor_end(is_anchor_start); } else { c.prog_->set_anchor_start(is_anchor_start); c.prog_->set_anchor_end(is_anchor_end); } c.prog_->set_start(all.begin); if (!c.prog_->anchor_start()) { // Also create unanchored version, which starts with a .*? loop. all = c.Cat(c.DotStar(), all); } c.prog_->set_start_unanchored(all.begin); // Hand ownership of prog_ to caller. return c.Finish(re); } Prog *Compiler::Finish(Regexp *re) { if (failed_) return NULL; if (prog_->start() == 0 && prog_->start_unanchored() == 0) { // No possible matches; keep Fail instruction only. ninst_ = 1; } // Hand off the array to Prog. prog_->inst_ = std::move(inst_); prog_->size_ = ninst_; prog_->Optimize(); prog_->Flatten(); prog_->ComputeByteMap(); if (!prog_->reversed()) { std::string prefix; bool prefix_foldcase; if (re->RequiredPrefixForAccel(&prefix, &prefix_foldcase)) prog_->ConfigurePrefixAccel(prefix, prefix_foldcase); } // Record remaining memory for DFA. if (max_mem_ <= 0) { prog_->set_dfa_mem(1 << 20); } else { int64_t m = max_mem_ - sizeof(Prog); m -= prog_->size_ * sizeof(Prog::Inst); // account for inst_ if (prog_->CanBitState()) m -= prog_->size_ * sizeof(uint16_t); // account for list_heads_ if (m < 0) m = 0; prog_->set_dfa_mem(m); } Prog *p = prog_; prog_ = NULL; return p; } // Converts Regexp to Prog. Prog *Regexp::CompileToProg(int64_t max_mem) { return Compiler::Compile(this, false, max_mem); } Prog *Regexp::CompileToReverseProg(int64_t max_mem) { return Compiler::Compile(this, true, max_mem); } Frag Compiler::DotStar() { return Star(ByteRange(0x00, 0xff, false), true); } // Compiles RE set to Prog. Prog *Compiler::CompileSet(Regexp *re, RE2::Anchor anchor, int64_t max_mem) { Compiler c; c.Setup(re->parse_flags(), max_mem, anchor); Regexp *sre = re->Simplify(); if (sre == NULL) return NULL; Frag all = c.WalkExponential(sre, Frag(), 2 * c.max_ninst_); sre->Decref(); if (c.failed_) return NULL; c.prog_->set_anchor_start(true); c.prog_->set_anchor_end(true); if (anchor == RE2::UNANCHORED) { // Prepend .* or else the expression will effectively be anchored. // Complemented by the ANCHOR_BOTH case in PostVisit(). all = c.Cat(c.DotStar(), all); } c.prog_->set_start(all.begin); c.prog_->set_start_unanchored(all.begin); Prog *prog = c.Finish(re); if (prog == NULL) return NULL; // Make sure DFA has enough memory to operate, // since we're not going to fall back to the NFA. bool dfa_failed = false; StringPiece sp = "hello, world"; prog->SearchDFA(sp, sp, Prog::kAnchored, Prog::kManyMatch, NULL, &dfa_failed, NULL); if (dfa_failed) { delete prog; return NULL; } return prog; } Prog *Prog::CompileSet(Regexp *re, RE2::Anchor anchor, int64_t max_mem) { return Compiler::CompileSet(re, anchor, max_mem); } } // namespace re2