This revision was automatically updated to reflect the committed changes.
sammccall marked an inline comment as done.
Closed by commit rCTE343774: [clangd] Dex: FALSE iterator, peephole 
optimizations, fix AND bug (authored by sammccall, committed by ).

Changed prior to commit:
  https://reviews.llvm.org/D52789?vs=167972&id=168281#toc

Repository:
  rCTE Clang Tools Extra

https://reviews.llvm.org/D52789

Files:
  clangd/index/dex/Iterator.cpp
  clangd/index/dex/Iterator.h
  unittests/clangd/DexTests.cpp

Index: clangd/index/dex/Iterator.h
===================================================================
--- clangd/index/dex/Iterator.h
+++ clangd/index/dex/Iterator.h
@@ -98,8 +98,16 @@
     return Iterator.dump(OS);
   }
 
+  /// Inspect iterator type, used internally for optimizing query trees.
+  enum class Kind { And, Or, True, False, Other };
+  Kind kind() const { return MyKind; }
+
+protected:
+  Iterator(Kind MyKind = Kind::Other) : MyKind(MyKind) {}
+
 private:
   virtual llvm::raw_ostream &dump(llvm::raw_ostream &OS) const = 0;
+  Kind MyKind;
 };
 
 /// Advances the iterator until it is exhausted. Returns pairs of document IDs
@@ -150,6 +158,9 @@
   /// containing all items in range [0, Size) in an efficient manner.
   std::unique_ptr<Iterator> all() const;
 
+  /// Returns FALSE Iterator which iterates over no documents.
+  std::unique_ptr<Iterator> none() const;
+
   /// Returns BOOST iterator which multiplies the score of each item by given
   /// factor. Boosting can be used as a computationally inexpensive filtering.
   /// Users can return significantly more items using consumeAndBoost() and
Index: clangd/index/dex/Iterator.cpp
===================================================================
--- clangd/index/dex/Iterator.cpp
+++ clangd/index/dex/Iterator.cpp
@@ -8,6 +8,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "Iterator.h"
+#include "llvm/Support/Casting.h"
 #include <algorithm>
 #include <cassert>
 #include <numeric>
@@ -26,9 +27,11 @@
 class AndIterator : public Iterator {
 public:
   explicit AndIterator(std::vector<std::unique_ptr<Iterator>> AllChildren)
-      : Children(std::move(AllChildren)) {
+      : Iterator(Kind::And), Children(std::move(AllChildren)) {
     assert(!Children.empty() && "AND iterator should have at least one child.");
     // Establish invariants.
+    for (const auto &Child : Children)
+      ReachedEnd |= Child->reachedEnd();
     sync();
     // When children are sorted by the estimateSize(), sync() calls are more
     // effective. Each sync() starts with the first child and makes sure all
@@ -122,6 +125,7 @@
   /// update the field, rather than traversing the whole subtree in each
   /// reachedEnd() call.
   bool ReachedEnd = false;
+  friend Corpus; // For optimizations.
 };
 
 /// Implements Iterator over the union of other iterators.
@@ -133,7 +137,7 @@
 class OrIterator : public Iterator {
 public:
   explicit OrIterator(std::vector<std::unique_ptr<Iterator>> AllChildren)
-      : Children(std::move(AllChildren)) {
+      : Iterator(Kind::Or), Children(std::move(AllChildren)) {
     assert(!Children.empty() && "OR iterator should have at least one child.");
   }
 
@@ -208,14 +212,15 @@
 
   // FIXME(kbobyrev): Would storing Children in min-heap be faster?
   std::vector<std::unique_ptr<Iterator>> Children;
+  friend Corpus; // For optimizations.
 };
 
 /// TrueIterator handles PostingLists which contain all items of the index. It
 /// stores size of the virtual posting list, and all operations are performed
 /// in O(1).
 class TrueIterator : public Iterator {
 public:
-  explicit TrueIterator(DocID Size) : Size(Size) {}
+  explicit TrueIterator(DocID Size) : Iterator(Kind::True), Size(Size) {}
 
   bool reachedEnd() const override { return Index >= Size; }
 
@@ -251,12 +256,35 @@
   DocID Size;
 };
 
+/// FalseIterator yields no results.
+class FalseIterator : public Iterator {
+public:
+  FalseIterator() : Iterator(Kind::False) {}
+  bool reachedEnd() const override { return true; }
+  void advance() override { assert(false); }
+  void advanceTo(DocID ID) override { assert(false); }
+  DocID peek() const override {
+    assert(false);
+    return 0;
+  }
+  float consume() override {
+    assert(false);
+    return 1;
+  }
+  size_t estimateSize() const override { return 0; }
+
+private:
+  llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override {
+    return OS << "false";
+  }
+};
+
 /// Boost iterator is a wrapper around its child which multiplies scores of
 /// each retrieved item by a given factor.
 class BoostIterator : public Iterator {
 public:
   BoostIterator(std::unique_ptr<Iterator> Child, float Factor)
-      : Child(move(Child)), Factor(Factor) {}
+      : Child(std::move(Child)), Factor(Factor) {}
 
   bool reachedEnd() const override { return Child->reachedEnd(); }
 
@@ -286,7 +314,7 @@
 class LimitIterator : public Iterator {
 public:
   LimitIterator(std::unique_ptr<Iterator> Child, size_t Limit)
-      : Child(move(Child)), Limit(Limit), ItemsLeft(Limit) {}
+      : Child(std::move(Child)), Limit(Limit), ItemsLeft(Limit) {}
 
   bool reachedEnd() const override {
     return ItemsLeft == 0 || Child->reachedEnd();
@@ -331,30 +359,86 @@
 
 std::unique_ptr<Iterator>
 Corpus::intersect(std::vector<std::unique_ptr<Iterator>> Children) const {
-  // If there is exactly one child, pull it one level up: AND(Child) -> Child.
-  return Children.size() == 1 ? std::move(Children.front())
-                              : llvm::make_unique<AndIterator>(move(Children));
+  std::vector<std::unique_ptr<Iterator>> RealChildren;
+  for (auto &Child : Children) {
+    switch (Child->kind()) {
+    case Iterator::Kind::True:
+      break; // No effect, drop the iterator.
+    case Iterator::Kind::False:
+      return std::move(Child); // Intersection is empty.
+    case Iterator::Kind::And: {
+      // Inline nested AND into parent AND.
+      auto &NewChildren = static_cast<AndIterator *>(Child.get())->Children;
+      std::move(NewChildren.begin(), NewChildren.end(),
+                std::back_inserter(RealChildren));
+      break;
+    }
+    default:
+      RealChildren.push_back(std::move(Child));
+    }
+  }
+  switch (RealChildren.size()) {
+  case 0:
+    return all();
+  case 1:
+    return std::move(RealChildren.front());
+  default:
+    return llvm::make_unique<AndIterator>(std::move(RealChildren));
+  }
 }
 
 std::unique_ptr<Iterator>
 Corpus::unionOf(std::vector<std::unique_ptr<Iterator>> Children) const {
-  // If there is exactly one child, pull it one level up: OR(Child) -> Child.
-  return Children.size() == 1 ? std::move(Children.front())
-                              : llvm::make_unique<OrIterator>(move(Children));
+  std::vector<std::unique_ptr<Iterator>> RealChildren;
+  for (auto &Child : Children) {
+    switch (Child->kind()) {
+    case Iterator::Kind::False:
+      break; // No effect, drop the iterator.
+    case Iterator::Kind::Or: {
+      // Inline nested OR into parent OR.
+      auto &NewChildren = static_cast<OrIterator *>(Child.get())->Children;
+      std::move(NewChildren.begin(), NewChildren.end(),
+                std::back_inserter(RealChildren));
+      break;
+    }
+    case Iterator::Kind::True:
+      // Don't return all(), which would discard sibling boosts.
+    default:
+      RealChildren.push_back(std::move(Child));
+    }
+  }
+  switch (RealChildren.size()) {
+  case 0:
+    return none();
+  case 1:
+    return std::move(RealChildren.front());
+  default:
+    return llvm::make_unique<OrIterator>(std::move(RealChildren));
+  }
 }
 
 std::unique_ptr<Iterator> Corpus::all() const {
   return llvm::make_unique<TrueIterator>(Size);
 }
 
+std::unique_ptr<Iterator> Corpus::none() const {
+  return llvm::make_unique<FalseIterator>();
+}
+
 std::unique_ptr<Iterator> Corpus::boost(std::unique_ptr<Iterator> Child,
                                         float Factor) const {
-  return llvm::make_unique<BoostIterator>(move(Child), Factor);
+  if (Factor == 1)
+    return Child;
+  if (Child->kind() == Iterator::Kind::False)
+    return Child;
+  return llvm::make_unique<BoostIterator>(std::move(Child), Factor);
 }
 
 std::unique_ptr<Iterator> Corpus::limit(std::unique_ptr<Iterator> Child,
                                         size_t Limit) const {
-  return llvm::make_unique<LimitIterator>(move(Child), Limit);
+  if (Child->kind() == Iterator::Kind::False)
+    return Child;
+  return llvm::make_unique<LimitIterator>(std::move(Child), Limit);
 }
 
 } // namespace dex
Index: unittests/clangd/DexTests.cpp
===================================================================
--- unittests/clangd/DexTests.cpp
+++ unittests/clangd/DexTests.cpp
@@ -109,6 +109,18 @@
   EXPECT_TRUE(And->reachedEnd());
 }
 
+TEST(DexIterators, AndEmpty) {
+  Corpus C{10000};
+  const PostingList L1({1});
+  const PostingList L2({2});
+  // These iterators are empty, but the optimizer can't tell.
+  auto Empty1 = C.intersect(L1.iterator(), L2.iterator());
+  auto Empty2 = C.intersect(L1.iterator(), L2.iterator());
+  // And syncs iterators on construction, and used to fail on empty children.
+  auto And = C.intersect(std::move(Empty1), std::move(Empty2));
+  EXPECT_TRUE(And->reachedEnd());
+}
+
 TEST(DexIterators, OrTwoLists) {
   Corpus C{10000};
   const PostingList L0({0, 5, 7, 10, 42, 320, 9000});
@@ -270,18 +282,8 @@
 }
 
 TEST(DexIterators, True) {
-  Corpus C{0};
-  auto TrueIterator = C.all();
-  EXPECT_TRUE(TrueIterator->reachedEnd());
-  EXPECT_THAT(consumeIDs(*TrueIterator), ElementsAre());
-
-  C = Corpus{7};
-  const PostingList L0({1, 2, 5, 7});
-  TrueIterator = C.all();
-  EXPECT_THAT(TrueIterator->peek(), 0);
-  auto AndIterator = C.intersect(L0.iterator(), move(TrueIterator));
-  EXPECT_FALSE(AndIterator->reachedEnd());
-  EXPECT_THAT(consumeIDs(*AndIterator), ElementsAre(1, 2, 5));
+  EXPECT_TRUE(Corpus{0}.all()->reachedEnd());
+  EXPECT_THAT(consumeIDs(*Corpus{4}.all()), ElementsAre(0, 1, 2, 3));
 }
 
 TEST(DexIterators, Boost) {
@@ -313,6 +315,38 @@
   EXPECT_THAT(ElementBoost, 3);
 }
 
+TEST(DexIterators, Optimizations) {
+  Corpus C{5};
+  const PostingList L1({1});
+  const PostingList L2({2});
+  const PostingList L3({3});
+
+  // empty and/or yield true/false
+  EXPECT_EQ(llvm::to_string(*C.intersect()), "true");
+  EXPECT_EQ(llvm::to_string(*C.unionOf()), "false");
+
+  // true/false inside and/or short-circuit
+  EXPECT_EQ(llvm::to_string(*C.intersect(L1.iterator(), C.all())), "[1]");
+  EXPECT_EQ(llvm::to_string(*C.intersect(L1.iterator(), C.none())), "false");
+  // Not optimized to avoid breaking boosts.
+  EXPECT_EQ(llvm::to_string(*C.unionOf(L1.iterator(), C.all())),
+            "(| [1] true)");
+  EXPECT_EQ(llvm::to_string(*C.unionOf(L1.iterator(), C.none())), "[1]");
+
+  // and/or nested inside and/or are flattened
+  EXPECT_EQ(llvm::to_string(*C.intersect(
+                L1.iterator(), C.intersect(L1.iterator(), L1.iterator()))),
+            "(& [1] [1] [1])");
+  EXPECT_EQ(llvm::to_string(*C.unionOf(
+                L1.iterator(), C.unionOf(L2.iterator(), L3.iterator()))),
+            "(| [1] [2] [3])");
+
+  // optimizations combine over multiple levels
+  EXPECT_EQ(llvm::to_string(*C.intersect(
+                C.intersect(L1.iterator(), C.intersect()), C.unionOf(C.all()))),
+            "[1]");
+}
+
 //===----------------------------------------------------------------------===//
 // Search token tests.
 //===----------------------------------------------------------------------===//
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to