mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-03-06 08:06:43 +08:00
# RAGFlow Go Implementation Plan 🚀 This repository tracks the progress of porting RAGFlow to Go. We'll implement core features and provide performance comparisons between Python and Go versions. ## Implementation Checklist - [x] User Management APIs - [x] Dataset Management Operations - [x] Retrieval Test - [x] Chat Management Operations - [x] Infinity Go SDK --------- Signed-off-by: Jin Hai <haijin.chn@gmail.com> Co-authored-by: Yingfeng Zhang <yingfeng.zhang@gmail.com>
1222 lines
41 KiB
C++
1222 lines
41 KiB
C++
// Copyright 2007 The RE2 Authors. All Rights Reserved.
|
|
// Use of this source code is governed by a BSD-style
|
|
// license that can be found in the LICENSE file.
|
|
|
|
// Compile regular expression to Prog.
|
|
//
|
|
// Prog and Inst are defined in prog.h.
|
|
// This file's external interface is just Regexp::CompileToProg.
|
|
// The Compiler class defined in this file is private.
|
|
|
|
#include <stdint.h>
|
|
#include <string.h>
|
|
#include <unordered_map>
|
|
#include <utility>
|
|
|
|
#include "re2/pod_array.h"
|
|
#include "re2/prog.h"
|
|
#include "re2/re2.h"
|
|
#include "re2/regexp.h"
|
|
#include "re2/walker-inl.h"
|
|
#include "util/logging.h"
|
|
#include "util/utf.h"
|
|
|
|
namespace re2 {
|
|
|
|
// List of pointers to Inst* that need to be filled in (patched).
|
|
// Because the Inst* haven't been filled in yet,
|
|
// we can use the Inst* word to hold the list's "next" pointer.
|
|
// It's kind of sleazy, but it works well in practice.
|
|
// See http://swtch.com/~rsc/regexp/regexp1.html for inspiration.
|
|
//
|
|
// Because the out and out1 fields in Inst are no longer pointers,
|
|
// we can't use pointers directly here either. Instead, head refers
|
|
// to inst_[head>>1].out (head&1 == 0) or inst_[head>>1].out1 (head&1 == 1).
|
|
// head == 0 represents the NULL list. This is okay because instruction #0
|
|
// is always the fail instruction, which never appears on a list.
|
|
struct PatchList {
|
|
// Returns patch list containing just p.
|
|
static PatchList Mk(uint32_t p) { return {p, p}; }
|
|
|
|
// Patches all the entries on l to have value p.
|
|
// Caller must not ever use patch list again.
|
|
static void Patch(Prog::Inst *inst0, PatchList l, uint32_t p) {
|
|
while (l.head != 0) {
|
|
Prog::Inst *ip = &inst0[l.head >> 1];
|
|
if (l.head & 1) {
|
|
l.head = ip->out1();
|
|
ip->out1_ = p;
|
|
} else {
|
|
l.head = ip->out();
|
|
ip->set_out(p);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Appends two patch lists and returns result.
|
|
static PatchList Append(Prog::Inst *inst0, PatchList l1, PatchList l2) {
|
|
if (l1.head == 0)
|
|
return l2;
|
|
if (l2.head == 0)
|
|
return l1;
|
|
Prog::Inst *ip = &inst0[l1.tail >> 1];
|
|
if (l1.tail & 1)
|
|
ip->out1_ = l2.head;
|
|
else
|
|
ip->set_out(l2.head);
|
|
return {l1.head, l2.tail};
|
|
}
|
|
|
|
uint32_t head;
|
|
uint32_t tail; // for constant-time append
|
|
};
|
|
|
|
// The empty patch list: a head of 0 means there are no holes to patch.
static const PatchList kNullPatchList = {0, 0};
|
|
|
|
// Compiled program fragment.
|
|
struct Frag {
|
|
uint32_t begin;
|
|
PatchList end;
|
|
bool nullable;
|
|
|
|
Frag() : begin(0), end(kNullPatchList), nullable(false) {}
|
|
Frag(uint32_t begin, PatchList end, bool nullable) : begin(begin), end(end), nullable(nullable) {}
|
|
};
|
|
|
|
// Encodings of the input text that the compiled program will scan.
enum Encoding {
  kEncodingUTF8 = 1,    // UTF-8 (0-10FFFF)
  kEncodingLatin1 = 2,  // Latin-1 (0-FF)
};
|
|
|
|
// Compiles a Regexp parse tree into a Prog by walking the tree and
// combining the program fragments (Frags) built for each node's children.
class Compiler : public Regexp::Walker<Frag> {
 public:
  explicit Compiler();
  ~Compiler();

  // Compiles re into a newly allocated Prog, which the caller must delete
  // when finished with it. When reversed is true the program is compiled
  // for walking over the input string backward (every concatenation is
  // reversed).
  static Prog *Compile(Regexp *re, bool reversed, int64_t max_mem);

  // Compiles the alternation of all the re into a newly allocated Prog.
  // Each re has a match with an id equal to its index in the vector.
  static Prog *CompileSet(Regexp *re, RE2::Anchor anchor, int64_t max_mem);

  // Regexp::Walker interface, which helps traverse the Regexp.
  // The Compiler visits each node of the parse tree in depth-first order,
  // invoking PreVisit before the node's children are visited and PostVisit
  // afterwards. The walk is purely post-recursive: PostVisit receives the
  // machines (Frags) built for the children in child_args and combines
  // them into the machine for the current node.
  Frag PreVisit(Regexp *re, Frag parent_arg, bool *stop);
  Frag PostVisit(Regexp *re, Frag parent_arg, Frag pre_arg, Frag *child_args, int nchild_args);
  Frag ShortVisit(Regexp *re, Frag parent_arg);
  Frag Copy(Frag arg);

  // Repetition operators applied to fragment a:
  // a+ or a+?; a* or a*?; a? or a?? (the second form when nongreedy).
  Frag Plus(Frag a, bool nongreedy);
  Frag Star(Frag a, bool nongreedy);
  Frag Quest(Frag a, bool nongreedy);

  // Wraps fragment a as (a), capturing as \n.
  Frag Capture(Frag a, int n);

  // Combines fragments a and b into ab and a|b respectively.
  Frag Cat(Frag a, Frag b);
  Frag Alt(Frag a, Frag b);

  // Returns a fragment that can't match anything.
  Frag NoMatch();

  // Returns a fragment that matches the empty string.
  Frag Match(int32_t id);

  // Returns a no-op fragment.
  Frag Nop();

  // Returns a fragment matching the byte range lo-hi.
  Frag ByteRange(int lo, int hi, bool foldcase);

  // Returns a fragment matching an empty-width special op.
  Frag EmptyWidth(EmptyOp op);

  // Reserves n consecutive instructions in the program and returns the
  // index of the first one, or -1 if no more instructions are available.
  int AllocInst(int n);

  // ---- Rune range compiler ----

  // Begins a new alternation.
  void BeginRange();

  // Adds a fragment matching the rune range lo-hi to the alternation.
  void AddRuneRange(Rune lo, Rune hi, bool foldcase);
  void AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase);
  void AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase);
  void Add_80_10ffff();

  // Creates (or, for the cached variant, possibly reuses) a suffix that
  // matches the byte range lo-hi and then goes to next.
  int UncachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, int next);
  int CachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, int next);

  // Returns true iff the suffix is cached.
  bool IsCachedRuneByteSuffix(int id);

  // Adds a suffix to the alternation.
  void AddSuffix(int id);

  // Adds a suffix to the trie starting from the given root node.
  // Returns zero iff allocating an instruction fails. Otherwise, returns
  // the current root node, which might be different from what was given.
  int AddSuffixRecursive(int root, int id);

  // Finds the trie node for the given suffix. The result is a Frag so that
  // it can distinguish between pointing at the root node directly
  // (end.head == 0) and pointing at an Alt's out1 or out
  // (end.head&1 == 1 or 0, respectively).
  Frag FindByteRange(int root, int id);

  // Compares two ByteRanges and returns true iff they are equal.
  bool ByteRangeEqual(int id1, int id2);

  // Returns the alternation of all the added suffixes.
  Frag EndRange();

  // Returns a fragment matching the single rune r.
  Frag Literal(Rune r, bool foldcase);

  void Setup(Regexp::ParseFlags flags, int64_t max_mem, RE2::Anchor anchor);
  Prog *Finish(Regexp *re);

  // Returns .* where dot = any byte.
  Frag DotStar();

 private:
  Prog *prog_;         // Program being built.
  bool failed_;        // Did we give up compiling?
  Encoding encoding_;  // Input encoding.
  bool reversed_;      // Should program run backward over text?

  PODArray<Prog::Inst> inst_;  // Instruction storage; grown by AllocInst.
  int ninst_;                  // Number of instructions used.
  int max_ninst_;              // Maximum number of instructions.

  int64_t max_mem_;  // Total memory budget.

  std::unordered_map<uint64_t, int> rune_cache_;  // Cached byte suffixes.
  Frag rune_range_;  // Alternation being accumulated by the range compiler.

  RE2::Anchor anchor_;  // Anchor mode for RE2::Set.

  // Not copyable.
  Compiler(const Compiler &) = delete;
  Compiler &operator=(const Compiler &) = delete;
};
|
|
|
|
// Creates a compiler holding a fresh, empty Prog whose instruction #0 is
// the fail instruction.
//
// Every scalar member is given a defined value here. In particular anchor_
// is initialized to RE2::UNANCHORED so that PostVisit never reads an
// indeterminate value if it runs before Setup() has been called.
// The initializers are listed in member declaration order; inst_,
// rune_cache_ and rune_range_ are default-constructed.
Compiler::Compiler()
    : prog_(new Prog()),
      failed_(false),
      encoding_(kEncodingUTF8),
      reversed_(false),
      ninst_(0),
      max_ninst_(1),  // make AllocInst for fail instruction okay
      max_mem_(0),
      anchor_(RE2::UNANCHORED) {
  // Reserve instruction #0 as the fail instruction. Index 0 doubles as the
  // "null" value of patch lists and fragments, so it must never be a real
  // instruction target.
  const int fail = AllocInst(1);
  inst_[fail].InitFail();

  // No further instructions may be allocated until the limit is raised.
  max_ninst_ = 0;  // Caller must change
}
|
|
|
|
// Deletes whatever Prog prog_ still points at (deleting a null pointer is a
// no-op).
Compiler::~Compiler() {
  delete prog_;
}
|
|
|
|
int Compiler::AllocInst(int n) {
|
|
if (failed_ || ninst_ + n > max_ninst_) {
|
|
failed_ = true;
|
|
return -1;
|
|
}
|
|
|
|
if (ninst_ + n > inst_.size()) {
|
|
int cap = inst_.size();
|
|
if (cap == 0)
|
|
cap = 8;
|
|
while (ninst_ + n > cap)
|
|
cap *= 2;
|
|
PODArray<Prog::Inst> inst(cap);
|
|
if (inst_.data() != NULL)
|
|
memmove(inst.data(), inst_.data(), ninst_ * sizeof inst_[0]);
|
|
memset(inst.data() + ninst_, 0, (cap - ninst_) * sizeof inst_[0]);
|
|
inst_ = std::move(inst);
|
|
}
|
|
int id = ninst_;
|
|
ninst_ += n;
|
|
return id;
|
|
}
|
|
|
|
// These routines are somewhat hard to visualize in text --
|
|
// see http://swtch.com/~rsc/regexp/regexp1.html for
|
|
// pictures explaining what is going on here.
|
|
|
|
// Returns an unmatchable fragment.
|
|
Frag Compiler::NoMatch() { return Frag(); }
|
|
|
|
// Is a an unmatchable fragment?
|
|
static bool IsNoMatch(Frag a) { return a.begin == 0; }
|
|
|
|
// Given fragments a and b, returns fragment for ab.
|
|
Frag Compiler::Cat(Frag a, Frag b) {
|
|
if (IsNoMatch(a) || IsNoMatch(b))
|
|
return NoMatch();
|
|
|
|
// Elide no-op.
|
|
Prog::Inst *begin = &inst_[a.begin];
|
|
if (begin->opcode() == kInstNop && a.end.head == (a.begin << 1) && begin->out() == 0) {
|
|
// in case refs to a somewhere
|
|
PatchList::Patch(inst_.data(), a.end, b.begin);
|
|
return b;
|
|
}
|
|
|
|
// To run backward over string, reverse all concatenations.
|
|
if (reversed_) {
|
|
PatchList::Patch(inst_.data(), b.end, a.begin);
|
|
return Frag(b.begin, a.end, b.nullable && a.nullable);
|
|
}
|
|
|
|
PatchList::Patch(inst_.data(), a.end, b.begin);
|
|
return Frag(a.begin, b.end, a.nullable && b.nullable);
|
|
}
|
|
|
|
// Given fragments for a and b, returns fragment for a|b.
|
|
Frag Compiler::Alt(Frag a, Frag b) {
|
|
// Special case for convenience in loops.
|
|
if (IsNoMatch(a))
|
|
return b;
|
|
if (IsNoMatch(b))
|
|
return a;
|
|
|
|
int id = AllocInst(1);
|
|
if (id < 0)
|
|
return NoMatch();
|
|
|
|
inst_[id].InitAlt(a.begin, b.begin);
|
|
return Frag(id, PatchList::Append(inst_.data(), a.end, b.end), a.nullable || b.nullable);
|
|
}
|
|
|
|
// When capturing submatches in like-Perl mode, a kOpAlt Inst
|
|
// treats out_ as the first choice, out1_ as the second.
|
|
//
|
|
// For *, +, and ?, if out_ causes another repetition,
|
|
// then the operator is greedy. If out1_ is the repetition
|
|
// (and out_ moves forward), then the operator is non-greedy.
|
|
|
|
// Given a fragment for a, returns a fragment for a+ or a+? (if nongreedy)
|
|
Frag Compiler::Plus(Frag a, bool nongreedy) {
|
|
int id = AllocInst(1);
|
|
if (id < 0)
|
|
return NoMatch();
|
|
PatchList pl;
|
|
if (nongreedy) {
|
|
inst_[id].InitAlt(0, a.begin);
|
|
pl = PatchList::Mk(id << 1);
|
|
} else {
|
|
inst_[id].InitAlt(a.begin, 0);
|
|
pl = PatchList::Mk((id << 1) | 1);
|
|
}
|
|
PatchList::Patch(inst_.data(), a.end, id);
|
|
return Frag(a.begin, pl, a.nullable);
|
|
}
|
|
|
|
// Given a fragment for a, returns a fragment for a* or a*? (if nongreedy)
|
|
Frag Compiler::Star(Frag a, bool nongreedy) {
|
|
// When the subexpression is nullable, one Alt isn't enough to guarantee
|
|
// correct priority ordering within the transitive closure. The simplest
|
|
// solution is to handle it as (a+)? instead, which adds the second Alt.
|
|
if (a.nullable)
|
|
return Quest(Plus(a, nongreedy), nongreedy);
|
|
|
|
int id = AllocInst(1);
|
|
if (id < 0)
|
|
return NoMatch();
|
|
PatchList pl;
|
|
if (nongreedy) {
|
|
inst_[id].InitAlt(0, a.begin);
|
|
pl = PatchList::Mk(id << 1);
|
|
} else {
|
|
inst_[id].InitAlt(a.begin, 0);
|
|
pl = PatchList::Mk((id << 1) | 1);
|
|
}
|
|
PatchList::Patch(inst_.data(), a.end, id);
|
|
return Frag(id, pl, true);
|
|
}
|
|
|
|
// Given a fragment for a, returns a fragment for a? or a?? (if nongreedy)
|
|
Frag Compiler::Quest(Frag a, bool nongreedy) {
|
|
if (IsNoMatch(a))
|
|
return Nop();
|
|
int id = AllocInst(1);
|
|
if (id < 0)
|
|
return NoMatch();
|
|
PatchList pl;
|
|
if (nongreedy) {
|
|
inst_[id].InitAlt(0, a.begin);
|
|
pl = PatchList::Mk(id << 1);
|
|
} else {
|
|
inst_[id].InitAlt(a.begin, 0);
|
|
pl = PatchList::Mk((id << 1) | 1);
|
|
}
|
|
return Frag(id, PatchList::Append(inst_.data(), pl, a.end), true);
|
|
}
|
|
|
|
// Returns a fragment for the byte range lo-hi.
|
|
Frag Compiler::ByteRange(int lo, int hi, bool foldcase) {
|
|
int id = AllocInst(1);
|
|
if (id < 0)
|
|
return NoMatch();
|
|
inst_[id].InitByteRange(lo, hi, foldcase, 0);
|
|
return Frag(id, PatchList::Mk(id << 1), false);
|
|
}
|
|
|
|
// Returns a no-op fragment. Sometimes unavoidable.
|
|
Frag Compiler::Nop() {
|
|
int id = AllocInst(1);
|
|
if (id < 0)
|
|
return NoMatch();
|
|
inst_[id].InitNop(0);
|
|
return Frag(id, PatchList::Mk(id << 1), true);
|
|
}
|
|
|
|
// Returns a fragment that signals a match.
|
|
Frag Compiler::Match(int32_t match_id) {
|
|
int id = AllocInst(1);
|
|
if (id < 0)
|
|
return NoMatch();
|
|
inst_[id].InitMatch(match_id);
|
|
return Frag(id, kNullPatchList, false);
|
|
}
|
|
|
|
// Returns a fragment matching a particular empty-width op (like ^ or $)
|
|
Frag Compiler::EmptyWidth(EmptyOp empty) {
|
|
int id = AllocInst(1);
|
|
if (id < 0)
|
|
return NoMatch();
|
|
inst_[id].InitEmptyWidth(empty, 0);
|
|
return Frag(id, PatchList::Mk(id << 1), true);
|
|
}
|
|
|
|
// Given a fragment a, returns a fragment with capturing parens around a.
|
|
Frag Compiler::Capture(Frag a, int n) {
|
|
if (IsNoMatch(a))
|
|
return NoMatch();
|
|
int id = AllocInst(2);
|
|
if (id < 0)
|
|
return NoMatch();
|
|
inst_[id].InitCapture(2 * n, a.begin);
|
|
inst_[id + 1].InitCapture(2 * n + 1, 0);
|
|
PatchList::Patch(inst_.data(), a.end, id + 1);
|
|
|
|
return Frag(id, PatchList::Mk((id + 1) << 1), a.nullable);
|
|
}
|
|
|
|
// A Rune is a name for a Unicode code point.
// Returns the maximum rune encoded by a UTF-8 sequence of length len
// (intended for len < UTFmax).
static int MaxRune(int len) {
  // Number of Rune bits in a len-byte sequence: a single byte carries 7;
  // otherwise the leading byte carries 8 - (len + 1) and each of the
  // len - 1 continuation bytes carries 6.
  const int bits = (len == 1) ? 7 : 8 - (len + 1) + 6 * (len - 1);
  return (1 << bits) - 1;  // maximum Rune for that many bits
}
|
|
|
|
// The rune range compiler caches common suffix fragments,
|
|
// which are very common in UTF-8 (e.g., [80-bf]).
|
|
// The fragment suffixes are identified by their start
|
|
// instructions. NULL denotes the eventual end match.
|
|
// The Frag accumulates in rune_range_. Caching common
|
|
// suffixes reduces the UTF-8 "." from 32 to 24 instructions,
|
|
// and it reduces the corresponding one-pass NFA from 16 nodes to 8.
|
|
|
|
// Begins a new alternation: forgets all cached suffixes and resets the
// entry point and exit list of the fragment being accumulated.
void Compiler::BeginRange() {
  rune_range_.end = kNullPatchList;
  rune_range_.begin = 0;
  rune_cache_.clear();
}
|
|
|
|
int Compiler::UncachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, int next) {
|
|
Frag f = ByteRange(lo, hi, foldcase);
|
|
if (next != 0) {
|
|
PatchList::Patch(inst_.data(), f.end, next);
|
|
} else {
|
|
rune_range_.end = PatchList::Append(inst_.data(), rune_range_.end, f.end);
|
|
}
|
|
return f.begin;
|
|
}
|
|
|
|
// Packs the identifying properties of a byte suffix into one cache key:
// bit 0 holds foldcase, bits 1-8 hold hi, bits 9-16 hold lo and the
// remaining high bits hold next.
static uint64_t MakeRuneCacheKey(uint8_t lo, uint8_t hi, bool foldcase, int next) {
  const uint64_t next_bits = static_cast<uint64_t>(next) << 17;
  const uint64_t lo_bits = static_cast<uint64_t>(lo) << 9;
  const uint64_t hi_bits = static_cast<uint64_t>(hi) << 1;
  const uint64_t fold_bit = static_cast<uint64_t>(foldcase);
  return next_bits | lo_bits | hi_bits | fold_bit;
}
|
|
|
|
int Compiler::CachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, int next) {
|
|
uint64_t key = MakeRuneCacheKey(lo, hi, foldcase, next);
|
|
std::unordered_map<uint64_t, int>::const_iterator it = rune_cache_.find(key);
|
|
if (it != rune_cache_.end())
|
|
return it->second;
|
|
int id = UncachedRuneByteSuffix(lo, hi, foldcase, next);
|
|
rune_cache_[key] = id;
|
|
return id;
|
|
}
|
|
|
|
bool Compiler::IsCachedRuneByteSuffix(int id) {
|
|
uint8_t lo = inst_[id].byte_range.lo_;
|
|
uint8_t hi = inst_[id].byte_range.hi_;
|
|
bool foldcase = inst_[id].foldcase() != 0;
|
|
int next = inst_[id].out();
|
|
|
|
uint64_t key = MakeRuneCacheKey(lo, hi, foldcase, next);
|
|
return rune_cache_.find(key) != rune_cache_.end();
|
|
}
|
|
|
|
void Compiler::AddSuffix(int id) {
|
|
if (failed_)
|
|
return;
|
|
|
|
if (rune_range_.begin == 0) {
|
|
rune_range_.begin = id;
|
|
return;
|
|
}
|
|
|
|
if (encoding_ == kEncodingUTF8) {
|
|
// Build a trie in order to reduce fanout.
|
|
rune_range_.begin = AddSuffixRecursive(rune_range_.begin, id);
|
|
return;
|
|
}
|
|
|
|
int alt = AllocInst(1);
|
|
if (alt < 0) {
|
|
rune_range_.begin = 0;
|
|
return;
|
|
}
|
|
inst_[alt].InitAlt(rune_range_.begin, id);
|
|
rune_range_.begin = alt;
|
|
}
|
|
|
|
int Compiler::AddSuffixRecursive(int root, int id) {
|
|
DCHECK(inst_[root].opcode() == kInstAlt || inst_[root].opcode() == kInstByteRange);
|
|
|
|
Frag f = FindByteRange(root, id);
|
|
if (IsNoMatch(f)) {
|
|
int alt = AllocInst(1);
|
|
if (alt < 0)
|
|
return 0;
|
|
inst_[alt].InitAlt(root, id);
|
|
return alt;
|
|
}
|
|
|
|
int br;
|
|
if (f.end.head == 0)
|
|
br = root;
|
|
else if (f.end.head & 1)
|
|
br = inst_[f.begin].out1();
|
|
else
|
|
br = inst_[f.begin].out();
|
|
|
|
if (IsCachedRuneByteSuffix(br)) {
|
|
// We can't fiddle with cached suffixes, so make a clone of the head.
|
|
int byterange = AllocInst(1);
|
|
if (byterange < 0)
|
|
return 0;
|
|
inst_[byterange].InitByteRange(inst_[br].lo(), inst_[br].hi(), inst_[br].foldcase(), inst_[br].out());
|
|
|
|
// Ensure that the parent points to the clone, not to the original.
|
|
// Note that this could leave the head unreachable except via the cache.
|
|
br = byterange;
|
|
if (f.end.head == 0)
|
|
root = br;
|
|
else if (f.end.head & 1)
|
|
inst_[f.begin].out1_ = br;
|
|
else
|
|
inst_[f.begin].set_out(br);
|
|
}
|
|
|
|
int out = inst_[id].out();
|
|
if (!IsCachedRuneByteSuffix(id)) {
|
|
// The head should be the instruction most recently allocated, so free it
|
|
// instead of leaving it unreachable.
|
|
DCHECK_EQ(id, ninst_ - 1);
|
|
inst_[id].out_opcode_ = 0;
|
|
inst_[id].out1_ = 0;
|
|
ninst_--;
|
|
}
|
|
|
|
out = AddSuffixRecursive(inst_[br].out(), out);
|
|
if (out == 0)
|
|
return 0;
|
|
|
|
inst_[br].set_out(out);
|
|
return root;
|
|
}
|
|
|
|
bool Compiler::ByteRangeEqual(int id1, int id2) {
|
|
return inst_[id1].lo() == inst_[id2].lo() && inst_[id1].hi() == inst_[id2].hi() && inst_[id1].foldcase() == inst_[id2].foldcase();
|
|
}
|
|
|
|
// Searches the trie level starting at root for a ByteRange equal to the one
// at instruction id. On success the returned Frag's begin is the node that
// leads to the match and end.head says how: 0 means begin is the matching
// ByteRange itself, otherwise the low bit selects the out1 (1) or out (0)
// slot of the Alt at begin. Returns NoMatch() when nothing matches.
Frag Compiler::FindByteRange(int root, int id) {
  if (inst_[root].opcode() == kInstByteRange) {
    return ByteRangeEqual(root, id) ? Frag(root, kNullPatchList, false)
                                    : NoMatch();
  }

  for (int node = root; inst_[node].opcode() == kInstAlt;) {
    const int second = inst_[node].out1();
    if (ByteRangeEqual(second, id))
      return Frag(node, PatchList::Mk((node << 1) | 1), false);

    // CharClass is a sorted list of ranges, so if out1 of the root Alt wasn't
    // what we're looking for, then we can stop immediately. Unfortunately, we
    // can't short-circuit the search in reverse mode.
    if (!reversed_)
      return NoMatch();

    const int first = inst_[node].out();
    if (inst_[first].opcode() == kInstAlt) {
      // Keep walking down the chain of Alts.
      node = first;
      continue;
    }
    if (ByteRangeEqual(first, id))
      return Frag(node, PatchList::Mk(node << 1), false);
    return NoMatch();
  }

  LOG(DFATAL) << "should never happen";
  return NoMatch();
}
|
|
|
|
// Returns the alternation of all the suffixes added since BeginRange().
Frag Compiler::EndRange() {
  const Frag accumulated = rune_range_;
  return accumulated;
}
|
|
|
|
// Converts rune range lo-hi into a fragment that recognizes
|
|
// the bytes that would make up those runes in the current
|
|
// encoding (Latin 1 or UTF-8).
|
|
// This lets the machine work byte-by-byte even when
|
|
// using multibyte encodings.
|
|
|
|
void Compiler::AddRuneRange(Rune lo, Rune hi, bool foldcase) {
|
|
switch (encoding_) {
|
|
default:
|
|
case kEncodingUTF8:
|
|
AddRuneRangeUTF8(lo, hi, foldcase);
|
|
break;
|
|
case kEncodingLatin1:
|
|
AddRuneRangeLatin1(lo, hi, foldcase);
|
|
break;
|
|
}
|
|
}
|
|
|
|
// Adds the rune range lo-hi for Latin-1 input. Latin-1 is easy: runes *are*
// bytes. Ranges that are empty or lie entirely above 0xFF add nothing; an
// upper bound above 0xFF is clamped to 0xFF.
void Compiler::AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase) {
  const bool empty = lo > hi;
  const bool out_of_range = lo > 0xFF;
  if (empty || out_of_range)
    return;

  const Rune clamped_hi = (hi > 0xFF) ? 0xFF : hi;
  const int suffix = UncachedRuneByteSuffix(static_cast<uint8_t>(lo),
                                            static_cast<uint8_t>(clamped_hi),
                                            foldcase, 0);
  AddSuffix(suffix);
}
|
|
|
|
void Compiler::Add_80_10ffff() {
|
|
// The 80-10FFFF (Runeself-Runemax) rune range occurs frequently enough
|
|
// (for example, for /./ and /[^a-z]/) that it is worth simplifying: by
|
|
// permitting overlong encodings in E0 and F0 sequences and code points
|
|
// over 10FFFF in F4 sequences, the size of the bytecode and the number
|
|
// of equivalence classes are reduced significantly.
|
|
int id;
|
|
if (reversed_) {
|
|
// Prefix factoring matters, but we don't have to handle it here
|
|
// because the rune range trie logic takes care of that already.
|
|
id = UncachedRuneByteSuffix(0xC2, 0xDF, false, 0);
|
|
id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
|
|
AddSuffix(id);
|
|
|
|
id = UncachedRuneByteSuffix(0xE0, 0xEF, false, 0);
|
|
id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
|
|
id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
|
|
AddSuffix(id);
|
|
|
|
id = UncachedRuneByteSuffix(0xF0, 0xF4, false, 0);
|
|
id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
|
|
id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
|
|
id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
|
|
AddSuffix(id);
|
|
} else {
|
|
// Suffix factoring matters - and we do have to handle it here.
|
|
int cont1 = UncachedRuneByteSuffix(0x80, 0xBF, false, 0);
|
|
id = UncachedRuneByteSuffix(0xC2, 0xDF, false, cont1);
|
|
AddSuffix(id);
|
|
|
|
int cont2 = UncachedRuneByteSuffix(0x80, 0xBF, false, cont1);
|
|
id = UncachedRuneByteSuffix(0xE0, 0xEF, false, cont2);
|
|
AddSuffix(id);
|
|
|
|
int cont3 = UncachedRuneByteSuffix(0x80, 0xBF, false, cont2);
|
|
id = UncachedRuneByteSuffix(0xF0, 0xF4, false, cont3);
|
|
AddSuffix(id);
|
|
}
|
|
}
|
|
|
|
// Adds the rune range lo-hi for UTF-8 input. The range is split recursively
// until every piece encodes to sequences of one length that differ only in
// ways a chain of byte ranges can express; each piece then becomes one
// suffix of the alternation.
void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) {
  if (lo > hi)
    return;

  // Pick off 80-10FFFF as a common special case.
  if (lo == 0x80 && hi == 0x10ffff) {
    Add_80_10ffff();
    return;
  }

  // Split range into same-length sized ranges.
  for (int len = 1; len < UTFmax; len++) {
    const Rune max = MaxRune(len);
    if (lo <= max && max < hi) {
      AddRuneRangeUTF8(lo, max, foldcase);
      AddRuneRangeUTF8(max + 1, hi, foldcase);
      return;
    }
  }

  // ASCII range is always a special case.
  if (hi < Runeself) {
    const int ascii = UncachedRuneByteSuffix(static_cast<uint8_t>(lo),
                                             static_cast<uint8_t>(hi),
                                             foldcase, 0);
    AddSuffix(ascii);
    return;
  }

  // Split range into sections that agree on leading bytes.
  for (int ncont = 1; ncont < UTFmax; ncont++) {
    const uint32_t mask = (1 << (6 * ncont)) - 1;  // last ncont bytes of a UTF-8 sequence
    if ((lo & ~mask) == (hi & ~mask))
      continue;

    if ((lo & mask) != 0) {
      AddRuneRangeUTF8(lo, lo | mask, foldcase);
      AddRuneRangeUTF8((lo | mask) + 1, hi, foldcase);
      return;
    }
    if ((hi & mask) != mask) {
      AddRuneRangeUTF8(lo, (hi & ~mask) - 1, foldcase);
      AddRuneRangeUTF8(hi & ~mask, hi, foldcase);
      return;
    }
  }

  // Finally. Generate byte matching equivalent for lo-hi.
  uint8_t ulo[UTFmax], uhi[UTFmax];
  const int nlo = runetochar(reinterpret_cast<char *>(ulo), &lo);
  const int nhi = runetochar(reinterpret_cast<char *>(uhi), &hi);
  (void)nhi;  // only used by the DCHECK below
  DCHECK_EQ(nlo, nhi);

  // The logic below encodes this thinking:
  //
  // 1. When we have built the whole suffix, we know that it cannot
  // possibly be a suffix of anything longer: in forward mode, nothing
  // else can occur before the leading byte; in reverse mode, nothing
  // else can occur after the last continuation byte or else the leading
  // byte would have to change. Thus, there is no benefit to caching
  // the first byte of the suffix whereas there is a cost involved in
  // cloning it if it begins a common prefix, which is fairly likely.
  //
  // 2. Conversely, the last byte of the suffix cannot possibly be a
  // prefix of anything because next == 0, so we will never want to
  // clone it, but it is fairly likely to be a common suffix. Perhaps
  // more so in reverse mode than in forward mode because the former is
  // "converging" towards lower entropy, but caching is still worthwhile
  // for the latter in cases such as 80-BF.
  //
  // 3. Handling the bytes between the first and the last is less
  // straightforward and, again, the approach depends on whether we are
  // "converging" towards lower entropy: in forward mode, a single byte
  // is unlikely to be part of a common suffix whereas a byte range
  // is more likely so; in reverse mode, a byte range is unlikely to
  // be part of a common suffix whereas a single byte is more likely
  // so. The same benefit versus cost argument applies here.
  int id = 0;
  if (reversed_) {
    for (int i = 0; i < nlo; i++) {
      // In reverse UTF-8 mode: cache the leading byte; don't cache the last
      // continuation byte; cache anything else iff it's a single byte (XX-XX).
      const bool cache = (i == 0) || (ulo[i] == uhi[i] && i != nlo - 1);
      id = cache ? CachedRuneByteSuffix(ulo[i], uhi[i], false, id)
                 : UncachedRuneByteSuffix(ulo[i], uhi[i], false, id);
    }
  } else {
    for (int i = nlo - 1; i >= 0; i--) {
      // In forward UTF-8 mode: don't cache the leading byte; cache the last
      // continuation byte; cache anything else iff it's a byte range (XX-YY).
      const bool cache = (i == nlo - 1) || (ulo[i] < uhi[i] && i != 0);
      id = cache ? CachedRuneByteSuffix(ulo[i], uhi[i], false, id)
                 : UncachedRuneByteSuffix(ulo[i], uhi[i], false, id);
    }
  }
  AddSuffix(id);
}
|
|
|
|
// Should not be called.
|
|
Frag Compiler::Copy(Frag arg) {
|
|
// We're using WalkExponential; there should be no copying.
|
|
failed_ = true;
|
|
LOG(DFATAL) << "Compiler::Copy called!";
|
|
return NoMatch();
|
|
}
|
|
|
|
// Visits a node quickly; called once WalkExponential has
|
|
// decided to cut this walk short.
|
|
Frag Compiler::ShortVisit(Regexp *re, Frag) {
|
|
failed_ = true;
|
|
return NoMatch();
|
|
}
|
|
|
|
// Called before traversing a node's children during the walk.
|
|
Frag Compiler::PreVisit(Regexp *re, Frag, bool *stop) {
|
|
// Cut off walk if we've already failed.
|
|
if (failed_)
|
|
*stop = true;
|
|
|
|
return Frag(); // not used by caller
|
|
}
|
|
|
|
// Returns a fragment matching the single rune r in the current encoding.
// For UTF-8, a rune of Runeself or above becomes a concatenation of
// single-byte ranges, one per encoded byte, with case folding disabled.
// An unrecognized encoding yields the unmatchable fragment.
Frag Compiler::Literal(Rune r, bool foldcase) {
  if (encoding_ == kEncodingLatin1)
    return ByteRange(r, r, foldcase);

  if (encoding_ != kEncodingUTF8)
    return Frag();

  if (r < Runeself)  // Make common case fast.
    return ByteRange(r, r, foldcase);

  uint8_t buf[UTFmax];
  const int nbytes = runetochar(reinterpret_cast<char *>(buf), &r);

  Frag seq = ByteRange(buf[0], buf[0], false);
  for (int i = 1; i < nbytes; i++) {
    const Frag next_byte = ByteRange(buf[i], buf[i], false);
    seq = Cat(seq, next_byte);
  }
  return seq;
}
|
|
|
|
// Called after traversing the node's children during the walk.
|
|
// Given their frags, build and return the frag for this re.
|
|
Frag Compiler::PostVisit(Regexp *re, Frag, Frag, Frag *child_frags, int nchild_frags) {
|
|
// If a child failed, don't bother going forward, especially
|
|
// since the child_frags might contain Frags with NULLs in them.
|
|
if (failed_)
|
|
return NoMatch();
|
|
|
|
// Given the child fragments, return the fragment for this node.
|
|
switch (re->op()) {
|
|
case kRegexpRepeat:
|
|
// Should not see; code at bottom of function will print error
|
|
break;
|
|
|
|
case kRegexpNoMatch:
|
|
return NoMatch();
|
|
|
|
case kRegexpEmptyMatch:
|
|
return Nop();
|
|
|
|
case kRegexpHaveMatch: {
|
|
Frag f = Match(re->match_id());
|
|
if (anchor_ == RE2::ANCHOR_BOTH) {
|
|
// Append \z or else the subexpression will effectively be unanchored.
|
|
// Complemented by the UNANCHORED case in CompileSet().
|
|
f = Cat(EmptyWidth(kEmptyEndText), f);
|
|
}
|
|
return f;
|
|
}
|
|
|
|
case kRegexpConcat: {
|
|
Frag f = child_frags[0];
|
|
for (int i = 1; i < nchild_frags; i++)
|
|
f = Cat(f, child_frags[i]);
|
|
return f;
|
|
}
|
|
|
|
case kRegexpAlternate: {
|
|
Frag f = child_frags[0];
|
|
for (int i = 1; i < nchild_frags; i++)
|
|
f = Alt(f, child_frags[i]);
|
|
return f;
|
|
}
|
|
|
|
case kRegexpStar:
|
|
return Star(child_frags[0], (re->parse_flags() & Regexp::NonGreedy) != 0);
|
|
|
|
case kRegexpPlus:
|
|
return Plus(child_frags[0], (re->parse_flags() & Regexp::NonGreedy) != 0);
|
|
|
|
case kRegexpQuest:
|
|
return Quest(child_frags[0], (re->parse_flags() & Regexp::NonGreedy) != 0);
|
|
|
|
case kRegexpLiteral:
|
|
return Literal(re->rune(), (re->parse_flags() & Regexp::FoldCase) != 0);
|
|
|
|
case kRegexpLiteralString: {
|
|
// Concatenation of literals.
|
|
if (re->nrunes() == 0)
|
|
return Nop();
|
|
Frag f;
|
|
for (int i = 0; i < re->nrunes(); i++) {
|
|
Frag f1 = Literal(re->runes()[i], (re->parse_flags() & Regexp::FoldCase) != 0);
|
|
if (i == 0)
|
|
f = f1;
|
|
else
|
|
f = Cat(f, f1);
|
|
}
|
|
return f;
|
|
}
|
|
|
|
case kRegexpAnyChar:
|
|
BeginRange();
|
|
AddRuneRange(0, Runemax, false);
|
|
return EndRange();
|
|
|
|
case kRegexpAnyByte:
|
|
return ByteRange(0x00, 0xFF, false);
|
|
|
|
case kRegexpCharClass: {
|
|
CharClass *cc = re->cc();
|
|
if (cc->empty()) {
|
|
// This can't happen.
|
|
failed_ = true;
|
|
LOG(DFATAL) << "No ranges in char class";
|
|
return NoMatch();
|
|
}
|
|
|
|
// ASCII case-folding optimization: if the char class
|
|
// behaves the same on A-Z as it does on a-z,
|
|
// discard any ranges wholly contained in A-Z
|
|
// and mark the other ranges as foldascii.
|
|
// This reduces the size of a program for
|
|
// (?i)abc from 3 insts per letter to 1 per letter.
|
|
bool foldascii = cc->FoldsASCII();
|
|
|
|
// Character class is just a big OR of the different
|
|
// character ranges in the class.
|
|
BeginRange();
|
|
for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i) {
|
|
// ASCII case-folding optimization (see above).
|
|
if (foldascii && 'A' <= i->lo && i->hi <= 'Z')
|
|
continue;
|
|
|
|
// If this range contains all of A-Za-z or none of it,
|
|
// the fold flag is unnecessary; don't bother.
|
|
bool fold = foldascii;
|
|
if ((i->lo <= 'A' && 'z' <= i->hi) || i->hi < 'A' || 'z' < i->lo || ('Z' < i->lo && i->hi < 'a'))
|
|
fold = false;
|
|
|
|
AddRuneRange(i->lo, i->hi, fold);
|
|
}
|
|
return EndRange();
|
|
}
|
|
|
|
case kRegexpCapture:
|
|
// If this is a non-capturing parenthesis -- (?:foo) --
|
|
// just use the inner expression.
|
|
if (re->cap() < 0)
|
|
return child_frags[0];
|
|
return Capture(child_frags[0], re->cap());
|
|
|
|
case kRegexpBeginLine:
|
|
return EmptyWidth(reversed_ ? kEmptyEndLine : kEmptyBeginLine);
|
|
|
|
case kRegexpEndLine:
|
|
return EmptyWidth(reversed_ ? kEmptyBeginLine : kEmptyEndLine);
|
|
|
|
case kRegexpBeginText:
|
|
return EmptyWidth(reversed_ ? kEmptyEndText : kEmptyBeginText);
|
|
|
|
case kRegexpEndText:
|
|
return EmptyWidth(reversed_ ? kEmptyBeginText : kEmptyEndText);
|
|
|
|
case kRegexpWordBoundary:
|
|
return EmptyWidth(kEmptyWordBoundary);
|
|
|
|
case kRegexpNoWordBoundary:
|
|
return EmptyWidth(kEmptyNonWordBoundary);
|
|
}
|
|
failed_ = true;
|
|
LOG(DFATAL) << "Missing case in Compiler: " << re->op();
|
|
return NoMatch();
|
|
}
|
|
|
|
// Is this regexp required to start at the beginning of the text?
|
|
// Only approximate; can return false for complicated regexps like (\Aa|\Ab),
|
|
// but handles (\A(a|b)). Could use the Walker to write a more exact one.
|
|
static bool IsAnchorStart(Regexp **pre, int depth) {
|
|
Regexp *re = *pre;
|
|
Regexp *sub;
|
|
// The depth limit makes sure that we don't overflow
|
|
// the stack on a deeply nested regexp. As the comment
|
|
// above says, IsAnchorStart is conservative, so returning
|
|
// a false negative is okay. The exact limit is somewhat arbitrary.
|
|
if (re == NULL || depth >= 4)
|
|
return false;
|
|
switch (re->op()) {
|
|
default:
|
|
break;
|
|
case kRegexpConcat:
|
|
if (re->nsub() > 0) {
|
|
sub = re->sub()[0]->Incref();
|
|
if (IsAnchorStart(&sub, depth + 1)) {
|
|
PODArray<Regexp *> subcopy(re->nsub());
|
|
subcopy[0] = sub; // already have reference
|
|
for (int i = 1; i < re->nsub(); i++)
|
|
subcopy[i] = re->sub()[i]->Incref();
|
|
*pre = Regexp::Concat(subcopy.data(), re->nsub(), re->parse_flags());
|
|
re->Decref();
|
|
return true;
|
|
}
|
|
sub->Decref();
|
|
}
|
|
break;
|
|
case kRegexpCapture:
|
|
sub = re->sub()[0]->Incref();
|
|
if (IsAnchorStart(&sub, depth + 1)) {
|
|
*pre = Regexp::Capture(sub, re->parse_flags(), re->cap());
|
|
re->Decref();
|
|
return true;
|
|
}
|
|
sub->Decref();
|
|
break;
|
|
case kRegexpBeginText:
|
|
*pre = Regexp::LiteralString(NULL, 0, re->parse_flags());
|
|
re->Decref();
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
// Is this regexp required to start at the end of the text?
|
|
// Only approximate; can return false for complicated regexps like (a\z|b\z),
|
|
// but handles ((a|b)\z). Could use the Walker to write a more exact one.
|
|
static bool IsAnchorEnd(Regexp **pre, int depth) {
|
|
Regexp *re = *pre;
|
|
Regexp *sub;
|
|
// The depth limit makes sure that we don't overflow
|
|
// the stack on a deeply nested regexp. As the comment
|
|
// above says, IsAnchorEnd is conservative, so returning
|
|
// a false negative is okay. The exact limit is somewhat arbitrary.
|
|
if (re == NULL || depth >= 4)
|
|
return false;
|
|
switch (re->op()) {
|
|
default:
|
|
break;
|
|
case kRegexpConcat:
|
|
if (re->nsub() > 0) {
|
|
sub = re->sub()[re->nsub() - 1]->Incref();
|
|
if (IsAnchorEnd(&sub, depth + 1)) {
|
|
PODArray<Regexp *> subcopy(re->nsub());
|
|
subcopy[re->nsub() - 1] = sub; // already have reference
|
|
for (int i = 0; i < re->nsub() - 1; i++)
|
|
subcopy[i] = re->sub()[i]->Incref();
|
|
*pre = Regexp::Concat(subcopy.data(), re->nsub(), re->parse_flags());
|
|
re->Decref();
|
|
return true;
|
|
}
|
|
sub->Decref();
|
|
}
|
|
break;
|
|
case kRegexpCapture:
|
|
sub = re->sub()[0]->Incref();
|
|
if (IsAnchorEnd(&sub, depth + 1)) {
|
|
*pre = Regexp::Capture(sub, re->parse_flags(), re->cap());
|
|
re->Decref();
|
|
return true;
|
|
}
|
|
sub->Decref();
|
|
break;
|
|
case kRegexpEndText:
|
|
*pre = Regexp::LiteralString(NULL, 0, re->parse_flags());
|
|
re->Decref();
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
// Initializes per-compilation settings from the caller's options.
//
//   flags   - parse flags; if Regexp::Latin1 is set, the encoding is
//             switched to Latin-1 (otherwise encoding_ is not touched here).
//   max_mem - memory budget in bytes; a value <= 0 selects a fixed
//             instruction limit instead of one derived from the budget.
//   anchor  - anchoring mode, stored in anchor_.
void Compiler::Setup(Regexp::ParseFlags flags, int64_t max_mem, RE2::Anchor anchor) {
    anchor_ = anchor;
    max_mem_ = max_mem;
    if (flags & Regexp::Latin1) {
        encoding_ = kEncodingLatin1;
    }

    // No budget given: use a fixed limit that is more than enough.
    if (max_mem <= 0) {
        max_ninst_ = 100000;
        return;
    }

    // The budget does not even cover the Prog object: no room for anything.
    if (static_cast<size_t>(max_mem) <= sizeof(Prog)) {
        max_ninst_ = 0;
        return;
    }

    int64_t budget = (max_mem - sizeof(Prog)) / sizeof(Prog::Inst);

    // Cap the instruction count so that inst->id() fits comfortably in an
    // int; SparseArray also assumes the indices (inst->id()) are ints.
    // WalkExponential is called with 2*max_ninst_, and other code uses
    // 2 or 3 * prog->size(), so capping at 2^24 should avoid overflow there.
    // (Memory beyond 32 bits is meant to leave room for DFA states, not to
    // be spent on the program itself.)
    if (budget >= 1 << 24) {
        budget = 1 << 24;
    }

    // Inst has its own limit (currently larger than 2^24, but be safe).
    if (budget > Prog::Inst::kMaxInst) {
        budget = Prog::Inst::kMaxInst;
    }

    max_ninst_ = static_cast<int>(budget);
}
|
|
|
|
// Compiles re, returning program.
|
|
// Caller is responsible for deleting prog_.
|
|
// If reversed is true, compiles a program that expects
|
|
// to run over the input string backward (reverses all concatenations).
|
|
// The reversed flag is also recorded in the returned program.
|
|
Prog *Compiler::Compile(Regexp *re, bool reversed, int64_t max_mem) {
|
|
Compiler c;
|
|
c.Setup(re->parse_flags(), max_mem, RE2::UNANCHORED /* unused */);
|
|
c.reversed_ = reversed;
|
|
|
|
// Simplify to remove things like counted repetitions
|
|
// and character classes like \d.
|
|
Regexp *sre = re->Simplify();
|
|
if (sre == NULL)
|
|
return NULL;
|
|
|
|
// Record whether prog is anchored, removing the anchors.
|
|
// (They get in the way of other optimizations.)
|
|
bool is_anchor_start = IsAnchorStart(&sre, 0);
|
|
bool is_anchor_end = IsAnchorEnd(&sre, 0);
|
|
|
|
// Generate fragment for entire regexp.
|
|
Frag all = c.WalkExponential(sre, Frag(), 2 * c.max_ninst_);
|
|
sre->Decref();
|
|
if (c.failed_)
|
|
return NULL;
|
|
|
|
// Success! Finish by putting Match node at end, and record start.
|
|
// Turn off c.reversed_ (if it is set) to force the remaining concatenations
|
|
// to behave normally.
|
|
c.reversed_ = false;
|
|
all = c.Cat(all, c.Match(0));
|
|
|
|
c.prog_->set_reversed(reversed);
|
|
if (c.prog_->reversed()) {
|
|
c.prog_->set_anchor_start(is_anchor_end);
|
|
c.prog_->set_anchor_end(is_anchor_start);
|
|
} else {
|
|
c.prog_->set_anchor_start(is_anchor_start);
|
|
c.prog_->set_anchor_end(is_anchor_end);
|
|
}
|
|
|
|
c.prog_->set_start(all.begin);
|
|
if (!c.prog_->anchor_start()) {
|
|
// Also create unanchored version, which starts with a .*? loop.
|
|
all = c.Cat(c.DotStar(), all);
|
|
}
|
|
c.prog_->set_start_unanchored(all.begin);
|
|
|
|
// Hand ownership of prog_ to caller.
|
|
return c.Finish(re);
|
|
}
|
|
|
|
// Finalizes the program under construction and releases it to the caller.
//
// Returns NULL if compilation has failed. Otherwise this moves the
// instruction array into prog_, runs Optimize(), Flatten() and
// ComputeByteMap() on it, configures prefix acceleration for non-reversed
// programs when re reports a required prefix, records the memory left for
// the DFA, and returns prog_ after clearing the member.
Prog *Compiler::Finish(Regexp *re) {
    if (failed_) {
        return NULL;
    }

    // Both entry points at instruction 0 means no match is possible;
    // keep only the Fail instruction.
    const bool no_match_possible = prog_->start() == 0 && prog_->start_unanchored() == 0;
    if (no_match_possible) {
        ninst_ = 1;
    }

    // Hand the instruction array over to the Prog.
    prog_->inst_ = std::move(inst_);
    prog_->size_ = ninst_;

    prog_->Optimize();
    prog_->Flatten();
    prog_->ComputeByteMap();

    // Prefix acceleration is only set up for forward programs.
    if (!prog_->reversed()) {
        std::string prefix;
        bool prefix_foldcase;
        if (re->RequiredPrefixForAccel(&prefix, &prefix_foldcase)) {
            prog_->ConfigurePrefixAccel(prefix, prefix_foldcase);
        }
    }

    // Record how much memory remains for the DFA.
    if (max_mem_ > 0) {
        int64_t remaining = max_mem_ - sizeof(Prog);
        remaining -= prog_->size_ * sizeof(Prog::Inst); // account for inst_
        if (prog_->CanBitState()) {
            remaining -= prog_->size_ * sizeof(uint16_t); // account for list_heads_
        }
        if (remaining < 0) {
            remaining = 0;
        }
        prog_->set_dfa_mem(remaining);
    } else {
        // No budget was given: use a fixed allowance.
        prog_->set_dfa_mem(1 << 20);
    }

    // Detach the program from this Compiler before returning it.
    Prog *finished = prog_;
    prog_ = NULL;
    return finished;
}
|
|
|
|
// Converts Regexp to Prog.
|
|
Prog *Regexp::CompileToProg(int64_t max_mem) { return Compiler::Compile(this, false, max_mem); }
|
|
|
|
// Converts this Regexp to a reversed Prog by delegating to Compiler::Compile
// with reversed set to true. max_mem is passed through unchanged.
Prog *Regexp::CompileToReverseProg(int64_t max_mem) {
    const bool reversed = true;
    return Compiler::Compile(this, reversed, max_mem);
}
|
|
|
|
// Returns a fragment that loops over any single byte (0x00 through 0xff,
// with the fold argument off). Callers prepend it to build an unanchored
// program.
// NOTE(review): the `true` passed to Star presumably selects the non-greedy
// form, matching the ".*?" wording used at the call site - confirm against
// Star's declaration.
Frag Compiler::DotStar() {
    Frag any_byte = ByteRange(0x00, 0xff, false);
    return Star(any_byte, true);
}
|
|
|
|
// Compiles RE set to Prog.
|
|
Prog *Compiler::CompileSet(Regexp *re, RE2::Anchor anchor, int64_t max_mem) {
|
|
Compiler c;
|
|
c.Setup(re->parse_flags(), max_mem, anchor);
|
|
|
|
Regexp *sre = re->Simplify();
|
|
if (sre == NULL)
|
|
return NULL;
|
|
|
|
Frag all = c.WalkExponential(sre, Frag(), 2 * c.max_ninst_);
|
|
sre->Decref();
|
|
if (c.failed_)
|
|
return NULL;
|
|
|
|
c.prog_->set_anchor_start(true);
|
|
c.prog_->set_anchor_end(true);
|
|
|
|
if (anchor == RE2::UNANCHORED) {
|
|
// Prepend .* or else the expression will effectively be anchored.
|
|
// Complemented by the ANCHOR_BOTH case in PostVisit().
|
|
all = c.Cat(c.DotStar(), all);
|
|
}
|
|
c.prog_->set_start(all.begin);
|
|
c.prog_->set_start_unanchored(all.begin);
|
|
|
|
Prog *prog = c.Finish(re);
|
|
if (prog == NULL)
|
|
return NULL;
|
|
|
|
// Make sure DFA has enough memory to operate,
|
|
// since we're not going to fall back to the NFA.
|
|
bool dfa_failed = false;
|
|
StringPiece sp = "hello, world";
|
|
prog->SearchDFA(sp, sp, Prog::kAnchored, Prog::kManyMatch, NULL, &dfa_failed, NULL);
|
|
if (dfa_failed) {
|
|
delete prog;
|
|
return NULL;
|
|
}
|
|
|
|
return prog;
|
|
}
|
|
|
|
// Public entry point for compiling a regexp set: forwards all arguments
// unchanged to Compiler::CompileSet and returns its result.
Prog *Prog::CompileSet(Regexp *re, RE2::Anchor anchor, int64_t max_mem) {
    Prog *compiled = Compiler::CompileSet(re, anchor, max_mem);
    return compiled;
}
|
|
|
|
} // namespace re2
|