Hello community, here is the log from the commit of package re2 for openSUSE:Factory checked in at 2017-03-20 17:09:54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Comparing /work/SRC/openSUSE:Factory/re2 (Old) and /work/SRC/openSUSE:Factory/.re2.new (New) ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "re2" Mon Mar 20 17:09:54 2017 rev:5 rq:480937 version:MACRO Changes: -------- --- /work/SRC/openSUSE:Factory/re2/re2.changes 2017-01-17 14:36:30.556200139 +0100 +++ /work/SRC/openSUSE:Factory/.re2.new/re2.changes 2017-03-20 17:09:56.178705796 +0100 @@ -1,0 +2,6 @@ +Thu Mar 16 11:38:30 UTC 2017 - [email protected] + +- Update to version 2017-03-01 + * No upstream changelog available + +------------------------------------------------------------------- Old: ---- 2017-01-01.tar.gz New: ---- 2017-03-01.tar.gz ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Other differences: ------------------ ++++++ re2.spec ++++++ --- /var/tmp/diff_new_pack.IebK0D/_old 2017-03-20 17:09:56.782620523 +0100 +++ /var/tmp/diff_new_pack.IebK0D/_new 2017-03-20 17:09:56.786619958 +0100 @@ -16,7 +16,7 @@ # -%global longver 2017-01-01 +%global longver 2017-03-01 %global shortver %(echo %{longver}|sed 's|-||g') %define libname libre2-0 Name: re2 ++++++ 2017-01-01.tar.gz -> 2017-03-01.tar.gz ++++++ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/re2-2017-01-01/re2/compile.cc new/re2-2017-03-01/re2/compile.cc --- old/re2-2017-01-01/re2/compile.cc 2016-12-13 12:22:01.000000000 +0100 +++ new/re2-2017-03-01/re2/compile.cc 2017-02-28 01:11:21.000000000 +0100 @@ -10,8 +10,7 @@ #include <stdint.h> #include <string.h> -#include <sys/types.h> -#include <map> +#include <unordered_map> #include <utility> #include "util/logging.h" @@ -246,7 +245,7 @@ int64_t max_mem_; // Total memory budget. - std::map<uint64_t, int> rune_cache_; + std::unordered_map<uint64_t, int> rune_cache_; Frag rune_range_; RE2::Anchor anchor_; // anchor mode for RE2::Set @@ -496,7 +495,7 @@ int Compiler::CachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, int next) { uint64_t key = MakeRuneCacheKey(lo, hi, foldcase, next); - std::map<uint64_t, int>::const_iterator it = rune_cache_.find(key); + std::unordered_map<uint64_t, int>::const_iterator it = rune_cache_.find(key); if (it != rune_cache_.end()) return it->second; int id = UncachedRuneByteSuffix(lo, hi, foldcase, next); @@ -1211,7 +1210,7 @@ if (max_mem_ <= 0) { prog_->set_dfa_mem(1<<20); } else { - int64_t m = max_mem_ - sizeof(Prog) - inst_len_*sizeof(Prog::Inst); + int64_t m = max_mem_ - sizeof(Prog) - prog_->size_*sizeof(Prog::Inst); if (m < 0) m = 0; prog_->set_dfa_mem(m); diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/re2-2017-01-01/re2/dfa.cc new/re2-2017-03-01/re2/dfa.cc --- old/re2-2017-01-01/re2/dfa.cc 2016-12-13 12:22:01.000000000 +0100 +++ new/re2-2017-03-01/re2/dfa.cc 2017-02-28 01:11:21.000000000 +0100 @@ -25,7 +25,6 @@ #include <stdint.h> #include <stdio.h> #include <string.h> -#include <sys/types.h> #include <algorithm> #include <atomic> #include <map> diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/re2-2017-01-01/re2/fuzzing/re2_fuzzer.cc new/re2-2017-03-01/re2/fuzzing/re2_fuzzer.cc --- old/re2-2017-01-01/re2/fuzzing/re2_fuzzer.cc 2016-12-13 12:22:01.000000000 +0100 +++ new/re2-2017-03-01/re2/fuzzing/re2_fuzzer.cc 2017-02-28 01:11:21.000000000 +0100 @@ -20,6 +20,12 @@ if (!re.ok()) return; + // Don't waste time fuzzing high-size programs. + // (They can cause bug reports due to fuzzer timeouts.) + int size = re.ProgramSize(); + if (size > 9999) + return; + // Don't waste time fuzzing high-fanout programs. // (They can also cause bug reports due to fuzzer timeouts.) std::map<int, int> histogram; @@ -50,7 +56,7 @@ // Entry point for libFuzzer. extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { - if (size == 0 || size > 1000000) + if (size == 0 || size > 1024) return 0; // The one-at-a-time hash by Bob Jenkins. @@ -86,15 +92,5 @@ StringPiece text(ptr, len); Test(pattern, options, text); - for (int i = 2; i <= 4; i++) { - if (len < i) - break; - - int frac = len / i; - pattern = StringPiece(ptr, frac); - text = StringPiece(ptr + frac, len - frac); - Test(pattern, options, text); - } - return 0; } diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/re2-2017-01-01/re2/parse.cc new/re2-2017-03-01/re2/parse.cc --- old/re2-2017-01-01/re2/parse.cc 2016-12-13 12:22:01.000000000 +0100 +++ new/re2-2017-03-01/re2/parse.cc 2017-02-28 01:11:21.000000000 +0100 @@ -42,6 +42,13 @@ namespace re2 { +// Reduce the maximum repeat count by an order of magnitude when fuzzing. +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION +static const int kMaxRepeat = 100; +#else +static const int kMaxRepeat = 1000; +#endif + // Regular expression parse state. // The list of parsed regexps so far is maintained as a vector of // Regexp pointers called the stack. Left parenthesis and vertical @@ -473,6 +480,23 @@ Regexp::ParseFlags fl = flags_; if (nongreedy) fl = fl ^ NonGreedy; + + // Squash **, ++ and ??. Regexp::Star() et al. handle this too, but + // they're mostly for use during simplification, not during parsing. + if (op == stacktop_->op() && fl == stacktop_->parse_flags()) + return true; + + // Squash *+, *?, +*, +?, ?* and ?+. They all squash to *, so because + // op is a repeat, we just have to check that stacktop_->op() is too, + // then adjust stacktop_. + if ((stacktop_->op() == kRegexpStar || + stacktop_->op() == kRegexpPlus || + stacktop_->op() == kRegexpQuest) && + fl == stacktop_->parse_flags()) { + stacktop_->op_ = kRegexpStar; + return true; + } + Regexp* re = new Regexp(op, fl); re->AllocSub(1); re->down_ = stacktop_->down_; @@ -541,7 +565,7 @@ bool Regexp::ParseState::PushRepetition(int min, int max, const StringPiece& s, bool nongreedy) { - if ((max != -1 && max < min) || min > 1000 || max > 1000) { + if ((max != -1 && max < min) || min > kMaxRepeat || max > kMaxRepeat) { status_->set_code(kRegexpRepeatSize); status_->set_error_arg(s); return false; @@ -564,7 +588,7 @@ stacktop_ = re; if (min >= 2 || max >= 2) { RepetitionWalker w; - if (w.Walk(stacktop_, 1000) == 0) { + if (w.Walk(stacktop_, kMaxRepeat) == 0) { status_->set_code(kRegexpRepeatSize); status_->set_error_arg(s); return false; diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/re2-2017-01-01/re2/prog.cc new/re2-2017-03-01/re2/prog.cc --- old/re2-2017-01-01/re2/prog.cc 2016-12-13 12:22:01.000000000 +0100 +++ new/re2-2017-03-01/re2/prog.cc 2017-02-28 01:11:21.000000000 +0100 @@ -459,7 +459,7 @@ // Don't repeat the work for \b and \B. bool marked_word_boundaries = false; - for (int id = 0; id < static_cast<int>(size()); id++) { + for (int id = 0; id < size(); id++) { Inst* ip = inst(id); if (ip->opcode() == kInstByteRange) { int lo = ip->lo(); @@ -521,23 +521,68 @@ } } +// Prog::Flatten() implements a graph rewriting algorithm. +// +// The overall process is similar to epsilon removal, but retains some epsilon +// transitions: those from Capture and EmptyWidth instructions; and those from +// nullable subexpressions. (The latter avoids quadratic blowup in transitions +// in the worst case.) It might be best thought of as Alt instruction elision. +// +// In conceptual terms, it divides the Prog into "trees" of instructions, then +// traverses the "trees" in order to produce "lists" of instructions. A "tree" +// is one or more instructions that grow from one "root" instruction to one or +// more "leaf" instructions; if a "tree" has exactly one instruction, then the +// "root" is also the "leaf". In most cases, a "root" is the successor of some +// "leaf" (i.e. the "leaf" instruction's out() returns the "root" instruction) +// and is considered a "successor root". A "leaf" can be a ByteRange, Capture, +// EmptyWidth or Match instruction. However, this is insufficient for handling +// nested nullable subexpressions correctly, so in some cases, a "root" is the +// dominator of the instructions reachable from some "successor root" (i.e. it +// has an unreachable predecessor) and is considered a "dominator root". Since +// only Alt instructions can be "dominator roots" (other instructions would be +// "leaves"), only Alt instructions require their predecessors to be computed. +// +// Dividing the Prog into "trees" comprises two passes: marking the "successor +// roots" and the predecessors; and marking the "dominator roots". Walking the +// Prog in its entirety causes the "successor roots" to be marked in a certain +// order during the first pass. Iteration over the "successor roots" occurs in +// reverse order of their marking during the second pass; by working backwards +// and flooding the graph no further than "leaves" and already marked "roots", +// it becomes possible to mark "dominator roots" without doing excessive work. +// +// Traversing the "trees" is just iterating over the "roots" in order of their +// marking and flooding the graph no further than "leaves" and "roots". When a +// "leaf" is reached, the instruction is copied with its successor remapped to +// its "root" number. When a "root" is reached, a Nop instruction is generated +// with its successor remapped similarly. As each "list" is produced, its last +// instruction is marked as such. After all of the "lists" have been produced, +// a pass over their instructions remaps their successors to bytecode offsets. void Prog::Flatten() { if (did_flatten_) return; did_flatten_ = true; - // Scratch structures. It's important that these are reused by EmitList() - // because we call it in a loop and it would thrash the heap otherwise. - SparseSet q(size()); + // Scratch structures. It's important that these are reused by functions + // that we call in loops because they would thrash the heap otherwise. + SparseSet reachable(size()); std::vector<int> stk; stk.reserve(size()); - // First pass: Marks "roots". + // First pass: Marks "successor roots" and predecessors. // Builds the mapping from inst-ids to root-ids. SparseArray<int> rootmap(size()); - MarkRoots(&rootmap, &q, &stk); + SparseArray<int> predmap(size()); + std::vector<std::vector<int>> predvec; + MarkSuccessors(&rootmap, &predmap, &predvec, &reachable, &stk); + + // Second pass: Marks "dominator roots". + for (SparseArray<int>::const_iterator i = rootmap.end() - 1; + i != rootmap.begin(); + --i) { + MarkDominator(i->index(), &rootmap, &predmap, &predvec, &reachable, &stk); + } - // Second pass: Emits "lists". Remaps outs to root-ids. + // Third pass: Emits "lists". Remaps outs to root-ids. // Builds the mapping from root-ids to flat-ids. std::vector<int> flatmap(rootmap.size()); std::vector<Inst> flat; @@ -546,7 +591,7 @@ i != rootmap.end(); ++i) { flatmap[i->value()] = static_cast<int>(flat.size()); - EmitList(i->index(), &rootmap, &flat, &q, &stk); + EmitList(i->index(), &rootmap, &flat, &reachable, &stk); flat.back().set_last(); } @@ -554,7 +599,7 @@ for (int i = 0; i < kNumInst; i++) inst_count_[i] = 0; - // Third pass: Remaps outs to flat-ids. + // Fourth pass: Remaps outs to flat-ids. // Counts instructions by opcode. for (int id = 0; id < static_cast<int>(flat.size()); id++) { Inst* ip = &flat[id]; @@ -586,8 +631,10 @@ memmove(inst_, flat.data(), size_ * sizeof *inst_); } -void Prog::MarkRoots(SparseArray<int>* rootmap, SparseSet* q, - std::vector<int>* stk) { +void Prog::MarkSuccessors(SparseArray<int>* rootmap, + SparseArray<int>* predmap, + std::vector<std::vector<int>>* predvec, + SparseSet* reachable, std::vector<int>* stk) { // Mark the kInstFail instruction. rootmap->set_new(0, rootmap->size()); @@ -597,16 +644,16 @@ if (!rootmap->has_index(start())) rootmap->set_new(start(), rootmap->size()); - q->clear(); + reachable->clear(); stk->clear(); stk->push_back(start_unanchored()); while (!stk->empty()) { int id = stk->back(); stk->pop_back(); Loop: - if (q->contains(id)) + if (reachable->contains(id)) continue; - q->insert_new(id); + reachable->insert_new(id); Inst* ip = inst(id); switch (ip->opcode()) { @@ -616,6 +663,14 @@ case kInstAltMatch: case kInstAlt: + // Mark this instruction as a predecessor of each out. + for (int out : {ip->out(), ip->out1()}) { + if (!predmap->has_index(out)) { + predmap->set_new(out, static_cast<int>(predvec->size())); + predvec->emplace_back(); + } + (*predvec)[predmap->get_existing(out)].emplace_back(id); + } stk->push_back(ip->out1()); id = ip->out(); goto Loop; @@ -623,7 +678,7 @@ case kInstByteRange: case kInstCapture: case kInstEmptyWidth: - // Mark the out of this instruction. + // Mark the out of this instruction as a "root". if (!rootmap->has_index(ip->out())) rootmap->set_new(ip->out(), rootmap->size()); id = ip->out(); @@ -640,19 +695,83 @@ } } +void Prog::MarkDominator(int root, SparseArray<int>* rootmap, + SparseArray<int>* predmap, + std::vector<std::vector<int>>* predvec, + SparseSet* reachable, std::vector<int>* stk) { + reachable->clear(); + stk->clear(); + stk->push_back(root); + while (!stk->empty()) { + int id = stk->back(); + stk->pop_back(); + Loop: + if (reachable->contains(id)) + continue; + reachable->insert_new(id); + + if (id != root && rootmap->has_index(id)) { + // We reached another "tree" via epsilon transition. + continue; + } + + Inst* ip = inst(id); + switch (ip->opcode()) { + default: + LOG(DFATAL) << "unhandled opcode: " << ip->opcode(); + break; + + case kInstAltMatch: + case kInstAlt: + stk->push_back(ip->out1()); + id = ip->out(); + goto Loop; + + case kInstByteRange: + case kInstCapture: + case kInstEmptyWidth: + break; + + case kInstNop: + id = ip->out(); + goto Loop; + + case kInstMatch: + case kInstFail: + break; + } + } + + for (SparseSet::const_iterator i = reachable->begin(); + i != reachable->end(); + ++i) { + int id = *i; + if (predmap->has_index(id)) { + for (int pred : (*predvec)[predmap->get_existing(id)]) { + if (!reachable->contains(pred)) { + // id has a predecessor that cannot be reached from root! + // Therefore, id must be a "root" too - mark it as such. + if (!rootmap->has_index(id)) + rootmap->set_new(id, rootmap->size()); + } + } + } + } +} + void Prog::EmitList(int root, SparseArray<int>* rootmap, - std::vector<Inst>* flat, SparseSet* q, - std::vector<int>* stk) { - q->clear(); + std::vector<Inst>* flat, + SparseSet* reachable, std::vector<int>* stk) { + reachable->clear(); stk->clear(); stk->push_back(root); while (!stk->empty()) { int id = stk->back(); stk->pop_back(); Loop: - if (q->contains(id)) + if (reachable->contains(id)) continue; - q->insert_new(id); + reachable->insert_new(id); if (id != root && rootmap->has_index(id)) { // We reached another "tree" via epsilon transition. Emit a kInstNop diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/re2-2017-01-01/re2/prog.h new/re2-2017-03-01/re2/prog.h --- old/re2-2017-01-01/re2/prog.h 2016-12-13 12:22:01.000000000 +0100 +++ new/re2-2017-03-01/re2/prog.h 2017-02-28 01:11:21.000000000 +0100 @@ -332,16 +332,28 @@ // operation in the sense that the old instructions are lost. void Flatten(); - // Marks the "roots" in the Prog: the outs of kInstByteRange, kInstCapture - // and kInstEmptyWidth instructions. - void MarkRoots(SparseArray<int>* rootmap, SparseSet* q, - std::vector<int>* stk); + // Walks the Prog; the "successor roots" or predecessors of the reachable + // instructions are marked in rootmap or predmap/predvec, respectively. + // reachable and stk are preallocated scratch structures. + void MarkSuccessors(SparseArray<int>* rootmap, + SparseArray<int>* predmap, + std::vector<std::vector<int>>* predvec, + SparseSet* reachable, std::vector<int>* stk); - // Emits one "list" via "tree" traversal from the given "root" instruction. - // The new instructions are appended to the given vector. + // Walks the Prog from the given "root" instruction; the "dominator root" + // of the reachable instructions (if such exists) is marked in rootmap. + // reachable and stk are preallocated scratch structures. + void MarkDominator(int root, SparseArray<int>* rootmap, + SparseArray<int>* predmap, + std::vector<std::vector<int>>* predvec, + SparseSet* reachable, std::vector<int>* stk); + + // Walks the Prog from the given "root" instruction; the reachable + // instructions are emitted in "list" form and appended to flat. + // reachable and stk are preallocated scratch structures. void EmitList(int root, SparseArray<int>* rootmap, - std::vector<Inst>* flat, SparseSet* q, - std::vector<int>* stk); + std::vector<Inst>* flat, + SparseSet* reachable, std::vector<int>* stk); private: friend class Compiler; diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/re2-2017-01-01/re2/re2.cc new/re2-2017-03-01/re2/re2.cc --- old/re2-2017-01-01/re2/re2.cc 2016-12-13 12:22:01.000000000 +0100 +++ new/re2-2017-03-01/re2/re2.cc 2017-02-28 01:11:21.000000000 +0100 @@ -399,7 +399,13 @@ const char* lastend = NULL; string out; int count = 0; +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + // Iterate just once when fuzzing. Otherwise, we easily get bogged down + // and coverage is unlikely to improve despite significant expense. + while (p == str->data()) { +#else while (p <= ep) { +#endif if (!re.Match(*str, static_cast<size_t>(p - str->data()), str->size(), UNANCHORED, vec, nvec)) break; diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/re2-2017-01-01/re2/re2.h new/re2-2017-03-01/re2/re2.h --- old/re2-2017-01-01/re2/re2.h 2016-12-13 12:22:01.000000000 +0100 +++ new/re2-2017-03-01/re2/re2.h 2017-02-28 01:11:21.000000000 +0100 @@ -181,7 +181,6 @@ #include <stddef.h> #include <stdint.h> -#include <sys/types.h> #include <algorithm> #include <map> #include <mutex> diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/re2-2017-01-01/re2/regexp.cc new/re2-2017-03-01/re2/regexp.cc --- old/re2-2017-01-01/re2/regexp.cc 2016-12-13 12:22:01.000000000 +0100 +++ new/re2-2017-03-01/re2/regexp.cc 2017-02-28 01:11:21.000000000 +0100 @@ -190,31 +190,45 @@ return re; } -Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) { - if (sub->op() == kRegexpPlus && sub->parse_flags() == flags) +Regexp* Regexp::StarPlusOrQuest(RegexpOp op, Regexp* sub, ParseFlags flags) { + // Squash **, ++ and ??. + if (op == sub->op() && flags == sub->parse_flags()) return sub; - Regexp* re = new Regexp(kRegexpPlus, flags); + + // Squash *+, *?, +*, +?, ?* and ?+. They all squash to *, so because + // op is Star/Plus/Quest, we just have to check that sub->op() is too. + if ((sub->op() == kRegexpStar || + sub->op() == kRegexpPlus || + sub->op() == kRegexpQuest) && + flags == sub->parse_flags()) { + // If sub is Star, no need to rewrite it. + if (sub->op() == kRegexpStar) + return sub; + + // Rewrite sub to Star. + Regexp* re = new Regexp(kRegexpStar, flags); + re->AllocSub(1); + re->sub()[0] = sub->sub()[0]->Incref(); + sub->Decref(); // We didn't consume the reference after all. + return re; + } + + Regexp* re = new Regexp(op, flags); re->AllocSub(1); re->sub()[0] = sub; return re; } +Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) { + return StarPlusOrQuest(kRegexpPlus, sub, flags); +} + Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) { - if (sub->op() == kRegexpStar && sub->parse_flags() == flags) - return sub; - Regexp* re = new Regexp(kRegexpStar, flags); - re->AllocSub(1); - re->sub()[0] = sub; - return re; + return StarPlusOrQuest(kRegexpStar, sub, flags); } Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) { - if (sub->op() == kRegexpQuest && sub->parse_flags() == flags) - return sub; - Regexp* re = new Regexp(kRegexpQuest, flags); - re->AllocSub(1); - re->sub()[0] = sub; - return re; + return StarPlusOrQuest(kRegexpQuest, sub, flags); } Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub, diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/re2-2017-01-01/re2/regexp.h new/re2-2017-03-01/re2/regexp.h --- old/re2-2017-01-01/re2/regexp.h 2016-12-13 12:22:01.000000000 +0100 +++ new/re2-2017-03-01/re2/regexp.h 2017-02-28 01:11:21.000000000 +0100 @@ -460,6 +460,10 @@ // Computes whether Regexp is already simple. bool ComputeSimple(); + // Constructor that generates a Star, Plus or Quest, + // squashing the pair if sub is also a Star, Plus or Quest. + static Regexp* StarPlusOrQuest(RegexpOp op, Regexp* sub, ParseFlags flags); + // Constructor that generates a concatenation or alternation, // enforcing the limit on the number of subexpressions for // a particular Regexp. @@ -528,7 +532,7 @@ // exponential blowup in space requirements. // uint16_t to control space usage. // The standard regexp routines will never generate a - // ref greater than the maximum repeat count (1000), + // ref greater than the maximum repeat count (kMaxRepeat), // but even so, Incref and Decref consult an overflow map // when ref_ reaches kMaxRef. uint16_t ref_; diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/re2-2017-01-01/re2/stringpiece.cc new/re2-2017-03-01/re2/stringpiece.cc --- old/re2-2017-01-01/re2/stringpiece.cc 2016-12-13 12:22:01.000000000 +0100 +++ new/re2-2017-03-01/re2/stringpiece.cc 2017-02-28 01:11:21.000000000 +0100 @@ -8,7 +8,7 @@ #include "util/util.h" -using re2::StringPiece; +namespace re2 { const StringPiece::size_type StringPiece::npos; // initialized in stringpiece.h @@ -61,3 +61,5 @@ o.write(p.data(), p.size()); return o; } + +} // namespace re2 diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/re2-2017-01-01/re2/stringpiece.h new/re2-2017-03-01/re2/stringpiece.h --- old/re2-2017-01-01/re2/stringpiece.h 2016-12-13 12:22:01.000000000 +0100 +++ new/re2-2017-03-01/re2/stringpiece.h 2017-02-28 01:11:21.000000000 +0100 @@ -115,24 +115,28 @@ StringPiece substr(size_type pos = 0, size_type n = npos) const; int compare(const StringPiece& x) const { - int r = memcmp(data_, x.data_, std::min(size_, x.size_)); - if (r == 0) { - if (size_ < x.size_) r = -1; - else if (size_ > x.size_) r = +1; + size_type min_size = std::min(size(), x.size()); + if (min_size > 0) { + int r = memcmp(data(), x.data(), min_size); + if (r < 0) return -1; + if (r > 0) return 1; } - return r; + if (size() < x.size()) return -1; + if (size() > x.size()) return 1; + return 0; } // Does "this" start with "x"? bool starts_with(const StringPiece& x) const { - return size_ >= x.size_ && - memcmp(data_, x.data_, x.size_) == 0; + return x.empty() || + (size() >= x.size() && memcmp(data(), x.data(), x.size()) == 0); } // Does "this" end with "x"? bool ends_with(const StringPiece& x) const { - return size_ >= x.size_ && - memcmp(data_ + size_ - x.size_, x.data_, x.size_) == 0; + return x.empty() || + (size() >= x.size() && + memcmp(data() + (size() - x.size()), x.data(), x.size()) == 0); } bool contains(const StringPiece& s) const { @@ -150,8 +154,10 @@ }; inline bool operator==(const StringPiece& x, const StringPiece& y) { - return x.size() == y.size() && - memcmp(x.data(), y.data(), x.size()) == 0; + StringPiece::size_type len = x.size(); + if (len != y.size()) return false; + return x.data() == y.data() || len == 0 || + memcmp(x.data(), y.data(), len) == 0; } inline bool operator!=(const StringPiece& x, const StringPiece& y) { @@ -159,8 +165,9 @@ } inline bool operator<(const StringPiece& x, const StringPiece& y) { - int r = memcmp(x.data(), y.data(), std::min(x.size(), y.size())); - return ((r < 0) || ((r == 0) && (x.size() < y.size()))); + StringPiece::size_type min_size = std::min(x.size(), y.size()); + int r = min_size == 0 ? 0 : memcmp(x.data(), y.data(), min_size); + return (r < 0) || (r == 0 && x.size() < y.size()); } inline bool operator>(const StringPiece& x, const StringPiece& y) { @@ -175,9 +182,9 @@ return !(x < y); } -} // namespace re2 - // Allow StringPiece to be logged. -std::ostream& operator<<(std::ostream& o, const re2::StringPiece& p); +std::ostream& operator<<(std::ostream& o, const StringPiece& p); + +} // namespace re2 #endif // RE2_STRINGPIECE_H_ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/re2-2017-01-01/re2/testing/compile_test.cc new/re2-2017-03-01/re2/testing/compile_test.cc --- old/re2-2017-01-01/re2/testing/compile_test.cc 2016-12-13 12:22:01.000000000 +0100 +++ new/re2-2017-03-01/re2/testing/compile_test.cc 2017-02-28 01:11:21.000000000 +0100 @@ -316,4 +316,44 @@ reverse); } +TEST(TestCompile, Bug35237384) { + // Bug in the compiler caused inefficient bytecode to be generated for + // nested nullable subexpressions. + + string forward; + + Dump("a**{3,}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL); + EXPECT_EQ("3+ byte [61-61] -> 3\n" + "4. nop -> 5\n" + "5+ byte [61-61] -> 5\n" + "6. nop -> 7\n" + "7+ byte [61-61] -> 7\n" + "8. match! 0\n", + forward); + + Dump("(a*|b*)*{3,}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL); + EXPECT_EQ("3+ nop -> 6\n" + "4+ nop -> 8\n" + "5. nop -> 21\n" + "6+ byte [61-61] -> 6\n" + "7. nop -> 3\n" + "8+ byte [62-62] -> 8\n" + "9. nop -> 3\n" + "10+ byte [61-61] -> 10\n" + "11. nop -> 21\n" + "12+ byte [62-62] -> 12\n" + "13. nop -> 21\n" + "14+ byte [61-61] -> 14\n" + "15. nop -> 18\n" + "16+ byte [62-62] -> 16\n" + "17. nop -> 18\n" + "18+ nop -> 14\n" + "19+ nop -> 16\n" + "20. match! 0\n" + "21+ nop -> 10\n" + "22+ nop -> 12\n" + "23. nop -> 18\n", + forward); +} + } // namespace re2 diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/re2-2017-01-01/re2/testing/parse_test.cc new/re2-2017-03-01/re2/testing/parse_test.cc --- old/re2-2017-01-01/re2/testing/parse_test.cc 2016-12-13 12:22:01.000000000 +0100 +++ new/re2-2017-03-01/re2/testing/parse_test.cc 2017-02-28 01:11:21.000000000 +0100 @@ -115,6 +115,17 @@ { "ab|cd", "alt{str{ab}str{cd}}" }, { "a(b|c)d", "cat{lit{a}cap{cc{0x62-0x63}}lit{d}}" }, + // Test squashing of **, ++, ?? et cetera. + { "(?:(?:a)*)*", "star{lit{a}}" }, + { "(?:(?:a)+)+", "plus{lit{a}}" }, + { "(?:(?:a)?)?", "que{lit{a}}" }, + { "(?:(?:a)*)+", "star{lit{a}}" }, + { "(?:(?:a)*)?", "star{lit{a}}" }, + { "(?:(?:a)+)*", "star{lit{a}}" }, + { "(?:(?:a)+)?", "star{lit{a}}" }, + { "(?:(?:a)?)*", "star{lit{a}}" }, + { "(?:(?:a)?)+", "star{lit{a}}" }, + // Test flattening. { "(?:a)", "lit{a}" }, { "(?:ab)(?:cd)", "str{abcd}" }, diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/re2-2017-01-01/re2/testing/simplify_test.cc new/re2-2017-03-01/re2/testing/simplify_test.cc --- old/re2-2017-01-01/re2/testing/simplify_test.cc 2016-12-13 12:22:01.000000000 +0100 +++ new/re2-2017-03-01/re2/testing/simplify_test.cc 2017-02-28 01:11:21.000000000 +0100 @@ -231,6 +231,17 @@ // capture. The new capture must have the cap of the old capture. // Failure to copy it results in cap=0 -> ToString() logs a fatal error. { "(a*aab)", "(aa+b)" }, + + // Test squashing of **, ++, ?? et cetera. + { "(?:(?:a){0,}){0,}", "a*" }, + { "(?:(?:a){1,}){1,}", "a+" }, + { "(?:(?:a){0,1}){0,1}", "a?" }, + { "(?:(?:a){0,}){1,}", "a*" }, + { "(?:(?:a){0,}){0,1}", "a*" }, + { "(?:(?:a){1,}){0,}", "a*" }, + { "(?:(?:a){1,}){0,1}", "a*" }, + { "(?:(?:a){0,1}){0,}", "a*" }, + { "(?:(?:a){0,1}){1,}", "a*" }, }; TEST(TestSimplify, SimpleRegexps) { diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/re2-2017-01-01/re2/testing/tester.cc new/re2-2017-03-01/re2/testing/tester.cc --- old/re2-2017-01-01/re2/testing/tester.cc 2016-12-13 12:22:01.000000000 +0100 +++ new/re2-2017-03-01/re2/testing/tester.cc 2017-02-28 01:11:21.000000000 +0100 @@ -7,7 +7,6 @@ #include <stddef.h> #include <stdint.h> #include <string.h> -#include <sys/types.h> #include <string> #include "util/util.h" diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/re2-2017-01-01/util/sparse_array.h new/re2-2017-03-01/util/sparse_array.h --- old/re2-2017-01-01/util/sparse_array.h 2016-12-13 12:22:01.000000000 +0100 +++ new/re2-2017-03-01/util/sparse_array.h 2017-02-28 01:11:21.000000000 +0100 @@ -273,7 +273,7 @@ iterator SetInternal(bool allow_overwrite, int i, U&& v) { // NOLINT DebugCheckInvariants(); if (static_cast<uint32_t>(i) >= static_cast<uint32_t>(max_size_)) { - assert(!"illegal index"); + assert(false && "illegal index"); // Semantically, end() would be better here, but we already know // the user did something stupid, so begin() insulates them from // dereferencing an invalid pointer. diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/re2-2017-01-01/util/sparse_set.h new/re2-2017-03-01/util/sparse_set.h --- old/re2-2017-01-01/util/sparse_set.h 2016-12-13 12:22:01.000000000 +0100 +++ new/re2-2017-03-01/util/sparse_set.h 2017-02-28 01:11:21.000000000 +0100 @@ -132,7 +132,7 @@ iterator InsertInternal(bool allow_existing, int i) { DebugCheckInvariants(); if (static_cast<uint32_t>(i) >= static_cast<uint32_t>(max_size_)) { - assert(!"illegal index"); + assert(false && "illegal index"); // Semantically, end() would be better here, but we already know // the user did something stupid, so begin() insulates them from // dereferencing an invalid pointer.
