sammccall created this revision.
sammccall added a reviewer: kbobyrev.
Herald added subscribers: cfe-commits, kadircet, arphaman, jkorous, MaskRay, 
ioeric, ilya-biryukov.

This uses a bitmap representation instead of a list if the density of
the list is high enough (at least 1 in 32, which is the breakeven point
sizewise).

Experimenting with the LLVM index, this saves about 3% of total posting
list size, which isn't worth the complexity.

However it should also improve iterator performance somewhat:

- advance is within a constant factor (find next set bit, average step is 
bounded)
- advanceTo is constant time instead of log(n) with random accesses

If the posting lists that are dense are also commonly used in queries
(seems likely for common trigrams) then this may be worth doing for
latency reasons.
I'm uploading this so Kirill can experiment with benchmarks.


Repository:
  rCTE Clang Tools Extra

https://reviews.llvm.org/D51689

Files:
  clangd/index/dex/DexIndex.cpp
  clangd/index/dex/Iterator.cpp
  clangd/index/dex/Iterator.h

Index: clangd/index/dex/Iterator.h
===================================================================
--- clangd/index/dex/Iterator.h
+++ clangd/index/dex/Iterator.h
@@ -32,6 +32,7 @@
 #define LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_ITERATOR_H
 
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/BitVector.h"
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
 #include <memory>
@@ -44,20 +45,6 @@
 /// Symbol position in the list of all index symbols sorted by a pre-computed
 /// symbol quality.
 using DocID = uint32_t;
-/// Contains sorted sequence of DocIDs all of which belong to symbols matching
-/// certain criteria, i.e. containing a Search Token. PostingLists are values
-/// for the inverted index.
-// FIXME(kbobyrev): Posting lists for incomplete trigrams (one/two symbols) are
-// likely to be very dense and hence require special attention so that the index
-// doesn't use too much memory. Possible solution would be to construct
-// compressed posting lists which consist of ranges of DocIDs instead of
-// distinct DocIDs. A special case would be the empty query: for that case
-// TrueIterator should be implemented - an iterator which doesn't actually store
-// any PostingList within itself, but "contains" all DocIDs in range
-// [0, IndexSize).
-using PostingList = std::vector<DocID>;
-/// Immutable reference to PostingList object.
-using PostingListRef = llvm::ArrayRef<DocID>;
 
 /// Iterator is the interface for Query Tree node. The simplest type of Iterator
 /// is DocumentIterator which is simply a wrapper around PostingList iterator
@@ -131,11 +118,6 @@
 /// to acquire preliminary scores of requested items.
 std::vector<std::pair<DocID, float>> consume(Iterator &It);
 
-/// Returns a document iterator over given PostingList.
-///
-/// DocumentIterator returns DEFAULT_BOOST_SCORE for each processed item.
-std::unique_ptr<Iterator> create(PostingListRef Documents);
-
 /// Returns AND Iterator which performs the intersection of the PostingLists of
 /// its children.
 ///
@@ -200,6 +182,33 @@
   Children.push_back(move(Head));
 }
 
+/// Contains sorted sequence of DocIDs all of which belong to symbols matching
+/// certain criteria, i.e. containing a Search Token. PostingLists are values
+/// for the inverted index.
+class PostingList {
+public:
+  PostingList() : Representation(Null) {}
+  PostingList(std::vector<DocID> Docs);
+  /// Returns a document iterator over given PostingList.
+  std::unique_ptr<Iterator> iterator() const;
+
+  PostingList(PostingList&&);
+  PostingList &operator=(PostingList&&);
+  ~PostingList();
+
+  size_t bytes() const;
+
+private:
+  enum Rep { Null, Dense, Sparse } Representation;
+  union {
+    struct {
+      llvm::BitVector Bitmap;
+      size_t Count;
+    } DenseRep;
+    std::vector<DocID> SparseRep;
+  };
+};
+
 } // namespace dex
 } // namespace clangd
 } // namespace clang
Index: clangd/index/dex/Iterator.cpp
===================================================================
--- clangd/index/dex/Iterator.cpp
+++ clangd/index/dex/Iterator.cpp
@@ -18,12 +18,11 @@
 
 namespace {
 
-/// Implements Iterator over a PostingList. DocumentIterator is the most basic
-/// iterator: it doesn't have any children (hence it is the leaf of iterator
-/// tree) and is simply a wrapper around PostingList::const_iterator.
-class DocumentIterator : public Iterator {
+/// Implements Iterator over a sparse PostingList.
+/// This is a leaf iterator which simply wraps a list of DocIDs.
+class SparseIterator : public Iterator {
 public:
-  DocumentIterator(PostingListRef Documents)
+  SparseIterator(llvm::ArrayRef<DocID> Documents)
       : Documents(Documents), Index(std::begin(Documents)) {}
 
   bool reachedEnd() const override { return Index == std::end(Documents); }
@@ -74,8 +73,52 @@
     return OS;
   }
 
-  PostingListRef Documents;
-  PostingListRef::const_iterator Index;
+  llvm::ArrayRef<DocID> Documents;
+  llvm::ArrayRef<DocID>::iterator Index;
+};
+
+/// Implements Iterator over a dense PostingList.
+/// This is a leaf iterator over a BitVector with one bit per possible DocID.
+class DenseIterator : public Iterator {
+public:
+  DenseIterator(const llvm::BitVector &Bits, size_t Count)
+      : Bits(Bits), Index(Bits.find_first()), Count(Count) {}
+
+  bool reachedEnd() const override { return Index == -1; }
+
+  /// Advances cursor to the next item.
+  void advance() override {
+    assert(!reachedEnd() && "DENSE iterator can't advance() at the end.");
+    Index = Bits.find_next(Index);
+  }
+
+  /// Applies binary search to advance cursor to the next item with DocID equal
+  /// or higher than the given one.
+  void advanceTo(DocID ID) override {
+    assert(!reachedEnd() && "DENSE iterator can't advance() at the end.");
+    Index = Bits.find_next(ID);
+  }
+
+  DocID peek() const override {
+    assert(!reachedEnd() && "DENSE iterator can't peek() at the end.");
+    return Index;
+  }
+
+  float consume() override {
+    assert(!reachedEnd() && "DENSE iterator can't consume() at the end.");
+    return DEFAULT_BOOST_SCORE;
+  }
+
+  size_t estimateSize() const override { return Count; }
+
+private:
+  llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override {
+    return OS << "(dense)\n";
+  }
+
+  const llvm::BitVector &Bits;
+  int Index; // Invariant: Index == -1 || Bits[Index]
+  size_t Count;
 };
 
 /// Implements Iterator over the intersection of other iterators.
@@ -396,10 +439,6 @@
   return Result;
 }
 
-std::unique_ptr<Iterator> create(PostingListRef Documents) {
-  return llvm::make_unique<DocumentIterator>(Documents);
-}
-
 std::unique_ptr<Iterator>
 createAnd(std::vector<std::unique_ptr<Iterator>> Children) {
   return llvm::make_unique<AndIterator>(move(Children));
@@ -424,6 +463,77 @@
   return llvm::make_unique<LimitIterator>(move(Child), Limit);
 }
 
+std::unique_ptr<Iterator> PostingList::iterator() const {
+  switch(Representation) {
+    case Dense:
+      return llvm::make_unique<DenseIterator>(DenseRep.Bitmap, DenseRep.Count);
+    case Sparse:
+      return llvm::make_unique<SparseIterator>(SparseRep);
+    case Null:
+      assert(false && "iterator() on null posting list");
+      return llvm::make_unique<SparseIterator>(llvm::ArrayRef<DocID>{});
+  }
+}
+
+PostingList::PostingList(std::vector<DocID> Docs) {
+  if (Docs.empty() || sizeof(DocID) * Docs.size() < Docs.back() / CHAR_BIT) {
+    Representation = Sparse;
+    new (&SparseRep) decltype(SparseRep)(std::move(Docs));
+  } else {
+    Representation = Dense;
+    new (&DenseRep) decltype(DenseRep);
+    DenseRep.Count = Docs.size();
+    DenseRep.Bitmap.resize(Docs.back() + 1);
+    for (DocID D : Docs)
+      DenseRep.Bitmap.set(D);
+  }
+};
+
+PostingList::~PostingList() {
+  switch (Representation) {
+    case Sparse:
+      delete &SparseRep;
+      break;
+    case Dense:
+      delete &DenseRep;
+      break;
+    case Null:
+      break;
+  }
+}
+
+PostingList::PostingList(PostingList &&Other) {
+  Representation = Other.Representation;
+  switch (Representation) {
+    case Sparse:
+      new (&SparseRep) decltype(SparseRep)(std::move(Other.SparseRep));
+      break;
+    case Dense:
+      new (&DenseRep) decltype(DenseRep)(std::move(Other.DenseRep));
+      break;
+    case Null:
+      break;
+  }
+  Other.Representation = Null;
+}
+
+PostingList &PostingList::operator=(PostingList &&Other) {
+  this->~PostingList();
+  new(this) PostingList(std::move(Other));
+  return *this;
+}
+
+size_t PostingList::bytes() const {
+  switch(Representation) {
+  case Sparse:
+    return SparseRep.size() * sizeof(DocID);
+  case Dense:
+    return DenseRep.Bitmap.getMemorySize();
+  case Null:
+    return 0;
+  }
+}
+
 } // namespace dex
 } // namespace clangd
 } // namespace clang
Index: clangd/index/dex/DexIndex.cpp
===================================================================
--- clangd/index/dex/DexIndex.cpp
+++ clangd/index/dex/DexIndex.cpp
@@ -50,11 +50,15 @@
             });
 
   // Populate TempInvertedIndex with posting lists for index symbols.
+  llvm::DenseMap<Token, std::vector<DocID>> TempInvertedIndex;
   for (DocID SymbolRank = 0; SymbolRank < Symbols.size(); ++SymbolRank) {
     const auto *Sym = Symbols[SymbolRank];
     for (const auto &Token : generateSearchTokens(*Sym))
-      InvertedIndex[Token].push_back(SymbolRank);
+      TempInvertedIndex[Token].push_back(SymbolRank);
   }
+  InvertedIndex.reserve(InvertedIndex.size());
+  for (auto &Pair : TempInvertedIndex)
+    InvertedIndex.try_emplace(Pair.first, std::move(Pair.second));
 
   vlog("Built DexIndex with estimated memory usage {0} bytes.",
        estimateMemoryUsage());
@@ -80,7 +84,7 @@
   for (const auto &Trigram : TrigramTokens) {
     const auto It = InvertedIndex.find(Trigram);
     if (It != InvertedIndex.end())
-      TrigramIterators.push_back(create(It->second));
+      TrigramIterators.push_back(It->second.iterator());
   }
   if (!TrigramIterators.empty())
     TopLevelChildren.push_back(createAnd(move(TrigramIterators)));
@@ -90,7 +94,7 @@
   for (const auto &Scope : Req.Scopes) {
     const auto It = InvertedIndex.find(Token(Token::Kind::Scope, Scope));
     if (It != InvertedIndex.end())
-      ScopeIterators.push_back(create(It->second));
+      ScopeIterators.push_back(It->second.iterator());
   }
   // Add OR iterator for scopes if there are any Scope Iterators.
   if (!ScopeIterators.empty())
@@ -154,10 +158,8 @@
       LookupTable.size() * sizeof(std::pair<SymbolID, const Symbol *>);
   Bytes += SymbolQuality.size() * sizeof(std::pair<const Symbol *, float>);
   Bytes += InvertedIndex.size() * sizeof(Token);
-
-  for (const auto &P : InvertedIndex) {
-    Bytes += P.second.size() * sizeof(DocID);
-  }
+  for (const auto &P : InvertedIndex)
+    Bytes += P.second.bytes();
   return Bytes;
 }
 
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to