[PATCH] D52300: [clangd] Implement VByte PostingList compression

Kirill Bobyrev via Phabricator via cfe-commits Thu, 20 Sep 2018 06:31:03 -0700

kbobyrev updated this revision to Diff 166278.
kbobyrev added a comment.

- Update unit tests with iterator tree string representation to comply with the 
new format
- Don't mark constructor `explicit` (previously it only had one parameter)
- Fix `Limits` explanation comment (`ID > Limits[I]` -> `ID >= Limits[I]`)



https://reviews.llvm.org/D52300

Files:
  clang-tools-extra/clangd/CMakeLists.txt
  clang-tools-extra/clangd/index/dex/Dex.cpp
  clang-tools-extra/clangd/index/dex/PostingList.cpp
  clang-tools-extra/clangd/index/dex/PostingList.h
  clang-tools-extra/clangd/index/dex/fuzzer/CMakeLists.txt
  clang-tools-extra/clangd/index/dex/fuzzer/VByteFuzzer.cpp
  clang-tools-extra/unittests/clangd/DexTests.cpp

Index: clang-tools-extra/unittests/clangd/DexTests.cpp
===================================================================
--- clang-tools-extra/unittests/clangd/DexTests.cpp
+++ clang-tools-extra/unittests/clangd/DexTests.cpp
@@ -262,13 +262,14 @@
   const PostingList L4({0, 1, 5});
   const PostingList L5({});
 
-  EXPECT_EQ(llvm::to_string(*(L0.iterator())), "[4]");
+  EXPECT_EQ(llvm::to_string(*(L0.iterator())), "[4 ...]");
 
   auto Nested =
       createAnd(createAnd(L1.iterator(), L2.iterator()),
                 createOr(L3.iterator(), L4.iterator(), L5.iterator()));
 
-  EXPECT_EQ(llvm::to_string(*Nested), "(& (| [5] [1] [END]) (& [1] [1]))");
+  EXPECT_EQ(llvm::to_string(*Nested),
+            "(& (| [... 5] [... 1 ...] [END]) (& [1 ...] [1 ...]))");
 }
 
 TEST(DexIterators, Limit) {
Index: clang-tools-extra/clangd/index/dex/fuzzer/VByteFuzzer.cpp
===================================================================
--- /dev/null
+++ clang-tools-extra/clangd/index/dex/fuzzer/VByteFuzzer.cpp
@@ -0,0 +1,64 @@
+//===-- VByteFuzzer.cpp - Fuzz VByte Posting List encoding ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file implements a function that runs clangd on a single input.
+/// This function is then linked into the Fuzzer library.
+///
+//===----------------------------------------------------------------------===//
+
+#include "../Iterator.h"
+#include "../PostingList.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cstdint>
+#include <vector>
+
+using DocID = clang::clangd::dex::DocID;
+
+/// Transform raw byte sequence into list of DocIDs.
+std::vector<DocID> generateDocuments(uint8_t *Data, size_t Size) {
+  std::vector<DocID> Result;
+  DocID ID = 0;
+  for (size_t I = 0; I < Size; ++I) {
+    size_t Offset = I % 4;
+    if (Offset == 0 && I != 0) {
+      ID = 0;
+      Result.push_back(ID);
+    }
+    ID |= (Data[I] << Offset);
+  }
+  if (Size > 4 && Size % 4 != 0)
+    Result.push_back(ID);
+  return Result;
+}
+
+/// This fuzzer checks that compressed PostingList contains can be successfully
+/// decoded into the original sequence.
+extern "C" int LLVMFuzzerTestOneInput(uint8_t *Data, size_t Size) {
+  if (Size == 0)
+    return 0;
+  const auto OriginalDocuments = generateDocuments(Data, Size);
+  if (OriginalDocuments.empty())
+    return 0;
+  // Ensure that given sequence of DocIDs is sorted.
+  for (size_t I = 1; I < OriginalDocuments.size(); ++I)
+    if (OriginalDocuments[I] <= OriginalDocuments[I - 1])
+      return 0;
+  const clang::clangd::dex::PostingList List(OriginalDocuments);
+  const auto DecodedDocuments = clang::clangd::dex::consume(*List.iterator());
+  // Compare decoded sequence against the original PostingList contents.
+  if (DecodedDocuments.size() != OriginalDocuments.size())
+    LLVM_BUILTIN_TRAP;
+  for (size_t I = 0; I < DecodedDocuments.size(); ++I)
+    if (DecodedDocuments[I].first != OriginalDocuments[I])
+      LLVM_BUILTIN_TRAP;
+  return 0;
+}
Index: clang-tools-extra/clangd/index/dex/fuzzer/CMakeLists.txt
===================================================================
--- /dev/null
+++ clang-tools-extra/clangd/index/dex/fuzzer/CMakeLists.txt
@@ -0,0 +1,19 @@
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/..)
+
+set(LLVM_LINK_COMPONENTS Support)
+
+if(LLVM_USE_SANITIZE_COVERAGE)
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=fuzzer")
+endif()
+
+add_clang_executable(clangd-vbyte-fuzzer
+  EXCLUDE_FROM_ALL
+  VByteFuzzer.cpp
+  )
+
+target_link_libraries(clangd-vbyte-fuzzer
+  PRIVATE
+  clangBasic
+  clangDaemon
+  ${LLVM_LIB_FUZZING_ENGINE}
+  )
Index: clang-tools-extra/clangd/index/dex/PostingList.h
===================================================================
--- clang-tools-extra/clangd/index/dex/PostingList.h
+++ clang-tools-extra/clangd/index/dex/PostingList.h
@@ -6,13 +6,19 @@
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
-//
-// This defines posting list interface: a storage for identifiers of symbols
-// which can be characterized by a specific feature (such as fuzzy-find trigram,
-// scope, type or any other Search Token). Posting lists can be traversed in
-// order using an iterator and are values for inverted index, which maps search
-// tokens to corresponding posting lists.
-//
+///
+/// \file
+/// This defines posting list interface: a storage for identifiers of symbols
+/// which can be characterized by a specific feature (such as fuzzy-find
+/// trigram, scope, type or any other Search Token). Posting lists can be
+/// traversed in order using an iterator and are values for inverted index,
+/// which maps search tokens to corresponding posting lists.
+///
+/// In order to decrease size of Index in-memory representation, Variable Byte
+/// Encoding (VByte) is used for PostingLists compression. An overview of VByte
+/// algorithm can be found in "Introduction to Information Retrieval" book:
+/// https://nlp.stanford.edu/IR-book/html/htmledition/variable-byte-codes-1.html
+///
 //===----------------------------------------------------------------------===//
 
 #ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_POSTINGLIST_H
@@ -29,20 +35,43 @@
 
 class Iterator;
 
+/// Chunk is a fixed-width piece of PostingList which contains the first DocID
+/// in uncompressed format (Head) and delta-encoded Payload. It can be
+/// decompressed upon request.
+struct Chunk {
+  // Keep sizeof(Chunk) == 32.
+  static constexpr size_t PayloadSize = 32 - sizeof(DocID) - sizeof(uint8_t);
+
+  std::vector<DocID> decompress() const;
+
+  /// The first element of
+  DocID Head;
+  /// VByte-encoded deltas.
+  std::array<uint8_t, PayloadSize> Payload = std::array<uint8_t, PayloadSize>();
+  /// Number of elements encoded into Payload + 1.
+  uint8_t Size;
+};
+
 /// PostingList is the storage of DocIDs which can be inserted to the Query
-/// Tree as a leaf by constructing Iterator over the PostingList object.
-// FIXME(kbobyrev): Use VByte algorithm to compress underlying data.
+/// Tree as a leaf by constructing Iterator over the PostingList object. DocIDs
+/// are stored in underlying chunks. While avoiding compression would reflect
+/// positively on the Index performance, current Dex implementation has a large
+/// performance gap compared to MemIndex which allows memory consumption
+/// reduction at the cost of some performance.
 class PostingList {
 public:
-  explicit PostingList(const std::vector<DocID> &&Documents)
-      : Documents(std::move(Documents)) {}
+  explicit PostingList(const std::vector<DocID> &Documents);
 
+  /// Constructs DocumentIterator over given posting list. DocumentIterator will
+  /// go through the chunks and decompress them on-the-fly when necessary.
   std::unique_ptr<Iterator> iterator() const;
 
-  size_t bytes() const { return Documents.size() * sizeof(DocID); }
+  /// Returns in-memory size.
+  size_t bytes() const { return Chunks.size() * sizeof(Chunk); }
 
 private:
-  const std::vector<DocID> Documents;
+  const std::vector<Chunk> Chunks;
+  size_t Size;
 };
 
 } // namespace dex
Index: clang-tools-extra/clangd/index/dex/PostingList.cpp
===================================================================
--- clang-tools-extra/clangd/index/dex/PostingList.cpp
+++ clang-tools-extra/clangd/index/dex/PostingList.cpp
@@ -9,74 +9,264 @@
 
 #include "PostingList.h"
 #include "Iterator.h"
+#include <queue>
 
 namespace clang {
 namespace clangd {
 namespace dex {
 
 namespace {
 
-/// Implements Iterator over std::vector<DocID>. This is the most basic
-/// iterator and is simply a wrapper around
-/// std::vector<DocID>::const_iterator.
-class PlainIterator : public Iterator {
+/// Implements iterator of PostingList chunks. This requires iterating over two
+/// levels: the first level iterator iterates over the chunks and decompresses
+/// them on-the-fly when the contents of chunk are to be seen.
+class ChunkIterator : public Iterator {
 public:
-  explicit PlainIterator(llvm::ArrayRef<DocID> Documents)
-      : Documents(Documents), Index(std::begin(Documents)) {}
+  ChunkIterator(const std::vector<Chunk> &Chunks, size_t Size)
+      : Chunks(Chunks), Size(Size), ChunkIndex(begin(Chunks)) {
+    if (!Chunks.empty()) {
+      DecompressedChunk = ChunkIndex->decompress();
+      InnerIndex = begin(DecompressedChunk);
+    }
+  }
 
-  bool reachedEnd() const override { return Index == std::end(Documents); }
+  bool reachedEnd() const override { return ChunkIndex == end(Chunks); }
 
   /// Advances cursor to the next item.
   void advance() override {
     assert(!reachedEnd() &&
            "Posting List iterator can't advance() at the end.");
-    ++Index;
+    if (++InnerIndex != end(DecompressedChunk))
+      return; // Return if current chunk is not exhausted.
+    ++ChunkIndex;
+    if (reachedEnd())
+      return; // Can't advance to the next chunk at the end.
+    // Decompress next chunk and reset inner iterator.
+    DecompressedChunk = ChunkIndex->decompress();
+    InnerIndex = begin(DecompressedChunk);
   }
 
   /// Applies binary search to advance cursor to the next item with DocID
   /// equal or higher than the given one.
   void advanceTo(DocID ID) override {
     assert(!reachedEnd() &&
            "Posting List iterator can't advance() at the end.");
-    // If current ID is beyond requested one, iterator is already in the right
-    // state.
-    if (peek() < ID)
-      Index = std::lower_bound(Index, std::end(Documents), ID);
+    if (ID <= peek())
+      return;
+    // If current chunk doesn't contain needed element, find the chunk which
+    // does.
+    if ((ChunkIndex != end(Chunks) - 1) && ((ChunkIndex + 1)->Head <= ID)) {
+      // Find the next chunk with Head >= ID.
+      ChunkIndex = std::lower_bound(
+          ChunkIndex, end(Chunks), ID,
+          [](const Chunk &C, const DocID ID) { return C.Head < ID; });
+      if (reachedEnd())
+        return;
+      // Look for ID in the previous chunk if the current Head > ID and
+      // therefore needed position is either in previous Chunk or in the
+      // beginning of the current chunk.
+      if (ChunkIndex != begin(Chunks) && ID < ChunkIndex->Head)
+        --ChunkIndex;
+      DecompressedChunk = ChunkIndex->decompress();
+      InnerIndex = begin(DecompressedChunk);
+    }
+    // Try to find ID within current chunk.
+    InnerIndex = std::lower_bound(InnerIndex, std::end(DecompressedChunk), ID);
+    // Return if the position was found in current chunk.
+    if (InnerIndex != std::end(DecompressedChunk))
+      return;
+    // Otherwise, the iterator should point to the first element of the next
+    // chunk (if there is any).
+    ++ChunkIndex;
+    if (!reachedEnd())
+      DecompressedChunk = ChunkIndex->decompress();
+    InnerIndex = begin(DecompressedChunk);
   }
 
   DocID peek() const override {
-    assert(!reachedEnd() &&
-           "Posting List iterator can't peek() at the end.");
-    return *Index;
+    assert(!reachedEnd() && "Posting List iterator can't peek() at the end.");
+    return *InnerIndex;
   }
 
   float consume() override {
     assert(!reachedEnd() &&
            "Posting List iterator can't consume() at the end.");
     return DEFAULT_BOOST_SCORE;
   }
 
-  size_t estimateSize() const override { return Documents.size(); }
+  size_t estimateSize() const override { return Size; }
 
 private:
   llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override {
     OS << '[';
-    if (Index != std::end(Documents))
-      OS << *Index;
-    else
-      OS << "END";
+    if (ChunkIndex != begin(Chunks) || InnerIndex != begin(DecompressedChunk))
+      OS << "... ";
+    OS << (reachedEnd() ? "END" : std::to_string(*InnerIndex));
+    if (!reachedEnd() && InnerIndex < std::end(DecompressedChunk) - 1)
+      OS << " ...";
     OS << ']';
     return OS;
   }
 
-  llvm::ArrayRef<DocID> Documents;
-  llvm::ArrayRef<DocID>::const_iterator Index;
+  const std::vector<Chunk> &Chunks;
+  // Cache information about PostingList size.
+  size_t Size;
+  // Iterator over chunks.
+  std::vector<Chunk>::const_iterator ChunkIndex;
+  std::vector<DocID> DecompressedChunk;
+  // Iterator over DecompressedChunk.
+  std::vector<DocID>::iterator InnerIndex;
 };
 
+/// Single-byte masks used for VByte compression bit manipulations.
+constexpr uint8_t SevenBytesMask = 0x7f;  // 0b01111111
+constexpr uint8_t FourBytesMask = 0xf;    // 0b00001111
+constexpr uint8_t ContinuationBit = 0x80; // 0b10000000
+
+/// Fills chunk with the maximum number of bits available.
+Chunk createChunk(DocID Head, std::queue<uint8_t> &Payload,
+                  size_t DocumentsCount, size_t MeaningfulBytes) {
+  assert(DocumentsCount > 0 && "Can't create chunk without Head.");
+  Chunk Result;
+  Result.Head = Head;
+  Result.Size = DocumentsCount;
+  for (size_t I = 0; I < MeaningfulBytes; ++I) {
+    Result.Payload[I] = Payload.front();
+    Payload.pop();
+  }
+  return Result;
+}
+
+/// Byte offsets of Payload contents within DocID.
+const size_t Offsets[] = {0, 7, 7 * 2, 7 * 3, 7 * 4};
+
+/// Use Variable-length Byte (VByte) delta encoding to compress sorted list of
+/// DocIDs. The compression stores deltas (differences) between subsequent
+/// DocIDs and encodes these deltas utilizing the least possible number of
+/// bytes.
+///
+/// Each encoding byte consists of two parts: the first bit (continuation bit)
+/// indicates whether this is the last byte of current encoding and seven bytes
+/// a piece of DocID (payload). DocID contains 32 bits and therefore it takes
+/// up to 5 bytes to encode it (4 full 7-bit payloads and one 4-bit payload),
+/// but in practice it is expected that gaps (deltas) between subsequent DocIDs
+/// are not large enough to require 5 bytes. In very dense posting lists (with
+/// average gaps less than 128) this representation would be 4 times more
+/// efficient than raw DocID array.
+///
+/// PostingList encoding example:
+///
+/// DocIDs    42            47        7000
+/// gaps                    5         6958
+/// Encoding  (raw number)  10000101  10110110 00101110
+std::vector<Chunk> encodeStream(llvm::ArrayRef<DocID> Documents) {
+  // Masks are used to perform bit manipulations over DocID. Each mask
+  // represents
+  static const std::vector<DocID> Masks = {
+      SevenBytesMask,                              // First 7 bytes: 0b1111111
+      SevenBytesMask << 7U,                        // Next 7 bytes
+      SevenBytesMask << 7U * 2,                    // ...
+      SevenBytesMask << 7U * 3,                    // ...
+      static_cast<DocID>(FourBytesMask << 7U * 4), // 4 last bytes
+  };
+
+  // These limits are used to calculate the width of DocID encoding: when
+  // ID >= Limits[I], it takes at least I + 1 bytes.
+  static const DocID Limits[] = {
+      1U << 7U,
+      1U << 7U * 2,
+      1U << 7U * 3,
+      1U << 7U * 4,
+  };
+
+  std::vector<Chunk> Result;
+  std::queue<uint8_t> Payload;
+  size_t HeadIndex = 0;
+  // Keep track of the last Payload size which doesn't exceed the limit.
+  size_t LastEncodingEnd = 0;
+  for (size_t I = 0; I < Documents.size(); ++I) {
+    // Don't encode Heads.
+    if (HeadIndex == I)
+      continue;
+    const DocID Delta = Documents[I] - Documents[I - 1];
+    // Encode Delta.
+    for (size_t I = 0; I < Masks.size(); ++I) {
+      uint8_t Encoding = (Delta & Masks[I]) >> Offsets[I];
+      bool HasNextByte = I < Masks.size() - 1 ? Delta >= Limits[I] : false;
+      // If !HasNextByte, mark the end of encoding stream.
+      Payload.push(!HasNextByte ? Encoding | ContinuationBit : Encoding);
+      if (!HasNextByte)
+        break;
+    }
+    if (Payload.size() <= Chunk::PayloadSize)
+      LastEncodingEnd = Payload.size();
+    // Read stream until Payload overflows.
+    if (Payload.size() < Chunk::PayloadSize)
+      continue;
+    // If Payload contains exactly Chunk::PayloadSize bytes, use all of them to
+    // fill the next Chunk. Otherwise, use the last valid size.
+    Result.push_back(createChunk(Documents[HeadIndex], Payload,
+                                 Payload.size() == Chunk::PayloadSize
+                                     ? I - HeadIndex + 1
+                                     : I - HeadIndex,
+                                 LastEncodingEnd));
+    // Next head is the next item if Payload contains exactly Chunk::PayloadSize
+    // bytes, otherwise it is the current item.
+    HeadIndex = Payload.empty() ? I + 1 : I;
+    // If overflow happened, Payload contains encoding of the next Head: discard
+    // it.
+    while (!Payload.empty())
+      Payload.pop();
+    LastEncodingEnd = 0;
+  }
+  // Add another chunk if there are some bytes left in Payload or if there's a
+  // trailing Head.
+  if (!Payload.empty() || HeadIndex + 1 == Documents.size())
+    Result.push_back(createChunk(Documents[HeadIndex], Payload,
+                                 Documents.size() - HeadIndex,
+                                 LastEncodingEnd));
+  return Result;
+}
+
 } // namespace
 
+std::vector<DocID> Chunk::decompress() const {
+  std::vector<DocID> Result{Head};
+  if (Size == 1)
+    return Result;
+  // Store sum of Head and all deltas to keep track of the last ID.
+  DocID Current = Head;
+  DocID Delta = 0;
+  uint8_t Length = 0;
+  // Decode bytes from Payload into Delta.
+  for (const auto &Byte : Payload) {
+    assert(Length <= 5 && "Can't decode sequences longer than 5 bytes");
+    // Write meaningful bits to the correct place in the document decoding.
+    Delta |= (Byte & (Length < 4 ? SevenBytesMask : FourBytesMask))
+             << Offsets[Length];
+    ++Length;
+    // Add document decoding to the result as soon as END marker is seen.
+    if ((Byte & ContinuationBit) != 0) {
+      Current += Delta;
+      Result.push_back(Current);
+      Length = 0;
+      Delta = 0;
+    }
+    // Stop when all meaningful bytes are decoded.
+    if (Result.size() == Size)
+      break;
+  }
+  assert(Result.size() == 1 ||
+         Length == 0 &&
+             "Unterminated byte sequence at the end of input stream.");
+  return Result;
+}
+
+PostingList::PostingList(const std::vector<DocID> &Documents)
+    : Chunks(encodeStream(Documents)), Size(Documents.size()) {}
+
 std::unique_ptr<Iterator> PostingList::iterator() const {
-  return llvm::make_unique<PlainIterator>(Documents);
+  return llvm::make_unique<ChunkIterator>(Chunks, Size);
 }
 
 } // namespace dex
Index: clang-tools-extra/clangd/index/dex/Dex.cpp
===================================================================
--- clang-tools-extra/clangd/index/dex/Dex.cpp
+++ clang-tools-extra/clangd/index/dex/Dex.cpp
@@ -122,8 +122,8 @@
 
   // Convert lists of items to posting lists.
   for (const auto &TokenToPostingList : TempInvertedIndex)
-    InvertedIndex.insert({TokenToPostingList.first,
-                          PostingList(move(TokenToPostingList.second))});
+    InvertedIndex.insert(
+        {TokenToPostingList.first, PostingList(TokenToPostingList.second)});
 
   vlog("Built Dex with estimated memory usage {0} bytes.",
        estimateMemoryUsage());
Index: clang-tools-extra/clangd/CMakeLists.txt
===================================================================
--- clang-tools-extra/clangd/CMakeLists.txt
+++ clang-tools-extra/clangd/CMakeLists.txt
@@ -76,6 +76,7 @@
 add_subdirectory(tool)
 add_subdirectory(indexer)
 add_subdirectory(index/dex/dexp)
+add_subdirectory(index/dex/fuzzer)
 
 if (LLVM_INCLUDE_BENCHMARKS)
   add_subdirectory(benchmarks)

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D52300: [clangd] Implement VByte PostingList compression

Reply via email to