kbobyrev updated this revision to Diff 160576.
kbobyrev added a comment.

Don't separate the logic for "long" and "short" queries: 
https://reviews.llvm.org/D50517 (https://reviews.llvm.org/rCTE339548) 
introduced incomplete trigrams which can be used on for "short" queries, too.


https://reviews.llvm.org/D50337

Files:
  clang-tools-extra/clangd/CMakeLists.txt
  clang-tools-extra/clangd/index/dex/DexIndex.cpp
  clang-tools-extra/clangd/index/dex/DexIndex.h
  clang-tools-extra/clangd/index/dex/Token.h
  clang-tools-extra/unittests/clangd/CMakeLists.txt
  clang-tools-extra/unittests/clangd/DexIndexTests.cpp
  clang-tools-extra/unittests/clangd/IndexTests.cpp
  clang-tools-extra/unittests/clangd/TestIndexOperations.cpp
  clang-tools-extra/unittests/clangd/TestIndexOperations.h

Index: clang-tools-extra/unittests/clangd/TestIndexOperations.cpp
===================================================================
--- /dev/null
+++ clang-tools-extra/unittests/clangd/TestIndexOperations.cpp
@@ -0,0 +1,89 @@
+//===-- IndexHelpers.cpp ----------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "TestIndexOperations.h"
+
+namespace clang {
+namespace clangd {
+
+Symbol symbol(llvm::StringRef QName) {
+  Symbol Sym;
+  Sym.ID = SymbolID(QName.str());
+  size_t Pos = QName.rfind("::");
+  if (Pos == llvm::StringRef::npos) {
+    Sym.Name = QName;
+    Sym.Scope = "";
+  } else {
+    Sym.Name = QName.substr(Pos + 2);
+    Sym.Scope = QName.substr(0, Pos + 2);
+  }
+  return Sym;
+}
+
+// Create a slab of symbols with the given qualified names as both IDs and
+// names. The life time of the slab is managed by the returned shared pointer.
+// If \p WeakSymbols is provided, it will be pointed to the managed object in
+// the returned shared pointer.
+std::shared_ptr<std::vector<const Symbol *>>
+generateSymbols(std::vector<std::string> QualifiedNames,
+                std::weak_ptr<SlabAndPointers> *WeakSymbols) {
+  SymbolSlab::Builder Slab;
+  for (llvm::StringRef QName : QualifiedNames)
+    Slab.insert(symbol(QName));
+
+  auto Storage = std::make_shared<SlabAndPointers>();
+  Storage->Slab = std::move(Slab).build();
+  for (const auto &Sym : Storage->Slab)
+    Storage->Pointers.push_back(&Sym);
+  if (WeakSymbols)
+    *WeakSymbols = Storage;
+  auto *Pointers = &Storage->Pointers;
+  return {std::move(Storage), Pointers};
+}
+
+// Create a slab of symbols with IDs and names [Begin, End], otherwise identical
+// to the `generateSymbols` above.
+std::shared_ptr<std::vector<const Symbol *>>
+generateNumSymbols(int Begin, int End,
+                   std::weak_ptr<SlabAndPointers> *WeakSymbols) {
+  std::vector<std::string> Names;
+  for (int i = Begin; i <= End; i++)
+    Names.push_back(std::to_string(i));
+  return generateSymbols(Names, WeakSymbols);
+}
+
+std::string getQualifiedName(const Symbol &Sym) {
+  return (Sym.Scope + Sym.Name).str();
+}
+
+std::vector<std::string> match(const SymbolIndex &I,
+                               const FuzzyFindRequest &Req, bool *Incomplete) {
+  std::vector<std::string> Matches;
+  bool IsIncomplete = I.fuzzyFind(Req, [&](const Symbol &Sym) {
+    Matches.push_back(clang::clangd::getQualifiedName(Sym));
+  });
+  if (Incomplete)
+    *Incomplete = IsIncomplete;
+  return Matches;
+}
+
+// Returns qualified names of symbols with any of IDs in the index.
+std::vector<std::string> lookup(const SymbolIndex &I,
+                                llvm::ArrayRef<SymbolID> IDs) {
+  LookupRequest Req;
+  Req.IDs.insert(IDs.begin(), IDs.end());
+  std::vector<std::string> Results;
+  I.lookup(Req, [&](const Symbol &Sym) {
+    Results.push_back(getQualifiedName(Sym));
+  });
+  return Results;
+}
+
+} // namespace clangd
+} // namespace clang
Index: clang-tools-extra/unittests/clangd/TestIndexOperations.h
===================================================================
--- /dev/null
+++ clang-tools-extra/unittests/clangd/TestIndexOperations.h
@@ -0,0 +1,57 @@
+//===-- IndexHelpers.h ------------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLS_EXTRA_UNITTESTS_CLANGD_INDEXTESTCOMMON_H
+#define LLVM_CLANG_TOOLS_EXTRA_UNITTESTS_CLANGD_INDEXTESTCOMMON_H
+
+#include "index/Index.h"
+#include "index/Merge.h"
+#include "index/dex/DexIndex.h"
+#include "index/dex/Iterator.h"
+#include "index/dex/Token.h"
+#include "index/dex/Trigram.h"
+
+namespace clang {
+namespace clangd {
+
+Symbol symbol(llvm::StringRef QName);
+
+struct SlabAndPointers {
+  SymbolSlab Slab;
+  std::vector<const Symbol *> Pointers;
+};
+
+// Create a slab of symbols with the given qualified names as both IDs and
+// names. The life time of the slab is managed by the returned shared pointer.
+// If \p WeakSymbols is provided, it will be pointed to the managed object in
+// the returned shared pointer.
+std::shared_ptr<std::vector<const Symbol *>>
+generateSymbols(std::vector<std::string> QualifiedNames,
+                std::weak_ptr<SlabAndPointers> *WeakSymbols = nullptr);
+
+// Create a slab of symbols with IDs and names [Begin, End], otherwise identical
+// to the `generateSymbols` above.
+std::shared_ptr<std::vector<const Symbol *>>
+generateNumSymbols(int Begin, int End,
+                   std::weak_ptr<SlabAndPointers> *WeakSymbols = nullptr);
+
+std::string getQualifiedName(const Symbol &Sym);
+
+std::vector<std::string> match(const SymbolIndex &I,
+                               const FuzzyFindRequest &Req,
+                               bool *Incomplete = nullptr);
+
+// Returns qualified names of symbols with any of IDs in the index.
+std::vector<std::string> lookup(const SymbolIndex &I,
+                                llvm::ArrayRef<SymbolID> IDs);
+
+} // namespace clangd
+} // namespace clang
+
+#endif
Index: clang-tools-extra/unittests/clangd/IndexTests.cpp
===================================================================
--- clang-tools-extra/unittests/clangd/IndexTests.cpp
+++ clang-tools-extra/unittests/clangd/IndexTests.cpp
@@ -7,33 +7,20 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "TestIndexOperations.h"
 #include "index/Index.h"
 #include "index/MemIndex.h"
 #include "index/Merge.h"
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
 
-using testing::UnorderedElementsAre;
 using testing::Pointee;
+using testing::UnorderedElementsAre;
 
 namespace clang {
 namespace clangd {
 namespace {
 
-Symbol symbol(llvm::StringRef QName) {
-  Symbol Sym;
-  Sym.ID = SymbolID(QName.str());
-  size_t Pos = QName.rfind("::");
-  if (Pos == llvm::StringRef::npos) {
-    Sym.Name = QName;
-    Sym.Scope = "";
-  } else {
-    Sym.Name = QName.substr(Pos + 2);
-    Sym.Scope = QName.substr(0, Pos + 2);
-  }
-  return Sym;
-}
-
 MATCHER_P(Named, N, "") { return arg.Name == N; }
 
 TEST(SymbolSlab, FindAndIterate) {
@@ -52,59 +39,6 @@
     EXPECT_THAT(*S.find(SymbolID(Sym)), Named(Sym));
 }
 
-struct SlabAndPointers {
-  SymbolSlab Slab;
-  std::vector<const Symbol *> Pointers;
-};
-
-// Create a slab of symbols with the given qualified names as both IDs and
-// names. The life time of the slab is managed by the returned shared pointer.
-// If \p WeakSymbols is provided, it will be pointed to the managed object in
-// the returned shared pointer.
-std::shared_ptr<std::vector<const Symbol *>>
-generateSymbols(std::vector<std::string> QualifiedNames,
-                std::weak_ptr<SlabAndPointers> *WeakSymbols = nullptr) {
-  SymbolSlab::Builder Slab;
-  for (llvm::StringRef QName : QualifiedNames)
-    Slab.insert(symbol(QName));
-
-  auto Storage = std::make_shared<SlabAndPointers>();
-  Storage->Slab = std::move(Slab).build();
-  for (const auto &Sym : Storage->Slab)
-    Storage->Pointers.push_back(&Sym);
-  if (WeakSymbols)
-    *WeakSymbols = Storage;
-  auto *Pointers = &Storage->Pointers;
-  return {std::move(Storage), Pointers};
-}
-
-// Create a slab of symbols with IDs and names [Begin, End], otherwise identical
-// to the `generateSymbols` above.
-std::shared_ptr<std::vector<const Symbol *>>
-generateNumSymbols(int Begin, int End,
-                   std::weak_ptr<SlabAndPointers> *WeakSymbols = nullptr) {
-  std::vector<std::string> Names;
-  for (int i = Begin; i <= End; i++)
-    Names.push_back(std::to_string(i));
-  return generateSymbols(Names, WeakSymbols);
-}
-
-std::string getQualifiedName(const Symbol &Sym) {
-  return (Sym.Scope + Sym.Name).str();
-}
-
-std::vector<std::string> match(const SymbolIndex &I,
-                               const FuzzyFindRequest &Req,
-                               bool *Incomplete = nullptr) {
-  std::vector<std::string> Matches;
-  bool IsIncomplete = I.fuzzyFind(Req, [&](const Symbol &Sym) {
-    Matches.push_back(getQualifiedName(Sym));
-  });
-  if (Incomplete)
-    *Incomplete = IsIncomplete;
-  return Matches;
-}
-
 TEST(MemIndexTest, MemIndexSymbolsRecycled) {
   MemIndex I;
   std::weak_ptr<SlabAndPointers> Symbols;
@@ -212,18 +146,6 @@
   EXPECT_THAT(match(I, Req), UnorderedElementsAre("ns::ABC", "ns::abc"));
 }
 
-// Returns qualified names of symbols with any of IDs in the index.
-std::vector<std::string> lookup(const SymbolIndex &I,
-                                llvm::ArrayRef<SymbolID> IDs) {
-  LookupRequest Req;
-  Req.IDs.insert(IDs.begin(), IDs.end());
-  std::vector<std::string> Results;
-  I.lookup(Req, [&](const Symbol &Sym) {
-    Results.push_back(getQualifiedName(Sym));
-  });
-  return Results;
-}
-
 TEST(MemIndexTest, Lookup) {
   MemIndex I;
   I.build(generateSymbols({"ns::abc", "ns::xyz"}));
@@ -269,7 +191,7 @@
 TEST(MergeTest, Merge) {
   Symbol L, R;
   L.ID = R.ID = SymbolID("hello");
-  L.Name = R.Name = "Foo";                    // same in both
+  L.Name = R.Name = "Foo";                           // same in both
   L.CanonicalDeclaration.FileURI = "file:///left.h"; // differs
   R.CanonicalDeclaration.FileURI = "file:///right.h";
   L.References = 1;
Index: clang-tools-extra/unittests/clangd/DexIndexTests.cpp
===================================================================
--- clang-tools-extra/unittests/clangd/DexIndexTests.cpp
+++ clang-tools-extra/unittests/clangd/DexIndexTests.cpp
@@ -7,6 +7,10 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "TestIndexOperations.h"
+#include "index/Index.h"
+#include "index/Merge.h"
+#include "index/dex/DexIndex.h"
 #include "index/dex/Iterator.h"
 #include "index/dex/Token.h"
 #include "index/dex/Trigram.h"
@@ -17,11 +21,13 @@
 #include <string>
 #include <vector>
 
+using ::testing::ElementsAre;
+using ::testing::UnorderedElementsAre;
+
 namespace clang {
 namespace clangd {
 namespace dex {
-
-using ::testing::ElementsAre;
+namespace {
 
 TEST(DexIndexIterators, DocumentIterator) {
   const PostingList L = {4, 7, 8, 20, 42, 100};
@@ -333,6 +339,223 @@
                            "hij", "ijk", "jkl", "klm"}));
 }
 
+TEST(DexIndex, Lookup) {
+  DexIndex I;
+  I.build(generateSymbols({"ns::abc", "ns::xyz"}));
+  EXPECT_THAT(lookup(I, SymbolID("ns::abc")), UnorderedElementsAre("ns::abc"));
+  EXPECT_THAT(lookup(I, {SymbolID("ns::abc"), SymbolID("ns::xyz")}),
+              UnorderedElementsAre("ns::abc", "ns::xyz"));
+  EXPECT_THAT(lookup(I, {SymbolID("ns::nonono"), SymbolID("ns::xyz")}),
+              UnorderedElementsAre("ns::xyz"));
+  EXPECT_THAT(lookup(I, SymbolID("ns::nonono")), UnorderedElementsAre());
+}
+
+// FIXME(kbobyrev): Use 3+ symbols long names so that trigram index is used.
+TEST(MergeDexIndex, Lookup) {
+  DexIndex I, J;
+  I.build(generateSymbols({"ns::A", "ns::B"}));
+  J.build(generateSymbols({"ns::B", "ns::C"}));
+  EXPECT_THAT(lookup(*mergeIndex(&I, &J), SymbolID("ns::A")),
+              UnorderedElementsAre("ns::A"));
+  EXPECT_THAT(lookup(*mergeIndex(&I, &J), SymbolID("ns::B")),
+              UnorderedElementsAre("ns::B"));
+  EXPECT_THAT(lookup(*mergeIndex(&I, &J), SymbolID("ns::C")),
+              UnorderedElementsAre("ns::C"));
+  EXPECT_THAT(
+      lookup(*mergeIndex(&I, &J), {SymbolID("ns::A"), SymbolID("ns::B")}),
+      UnorderedElementsAre("ns::A", "ns::B"));
+  EXPECT_THAT(
+      lookup(*mergeIndex(&I, &J), {SymbolID("ns::A"), SymbolID("ns::C")}),
+      UnorderedElementsAre("ns::A", "ns::C"));
+  EXPECT_THAT(lookup(*mergeIndex(&I, &J), SymbolID("ns::D")),
+              UnorderedElementsAre());
+  EXPECT_THAT(lookup(*mergeIndex(&I, &J), {}), UnorderedElementsAre());
+}
+
+// FIXME(kbobyrev): Add more tests on DexIndex? Right now, it's mostly a wrapper
+// around DexIndex, adopting tests from IndexTests.cpp sounds reasonable.
+// However, most of these tests use short names (<3 symbols) for symbol lookups,
+// which currently are dispatched to DexIndex and hence just duplicating these
+// tests while substituting DexIndex with DexIndex is not a viable solution.
+TEST(DexIndex, FuzzyFind) {
+  DexIndex Index;
+  Index.build(generateSymbols(
+      {"ns::ABC", "ns::BCD", "::ABC", "ns::nested::ABC", "other::ABC"}));
+  FuzzyFindRequest Req;
+  Req.Query = "ABC";
+  Req.Scopes = {"ns::"};
+  EXPECT_THAT(match(Index, Req), UnorderedElementsAre("ns::ABC"));
+  Req.Scopes = {"ns::", "ns::nested::"};
+  EXPECT_THAT(match(Index, Req),
+              UnorderedElementsAre("ns::ABC", "ns::nested::ABC"));
+}
+
+TEST(DexIndexTest, FuzzyMatchQ) {
+  DexIndex I;
+  I.build(
+      generateSymbols({"LaughingOutLoud", "LionPopulation", "LittleOldLady"}));
+  FuzzyFindRequest Req;
+  Req.Query = "lol";
+  Req.MaxCandidateCount = 2;
+  EXPECT_THAT(match(I, Req),
+              UnorderedElementsAre("LaughingOutLoud", "LittleOldLady"));
+}
+
+TEST(DexIndexTest, DexIndexSymbolsRecycled) {
+  DexIndex I;
+  std::weak_ptr<SlabAndPointers> Symbols;
+  I.build(generateNumSymbols(0, 10, &Symbols));
+  FuzzyFindRequest Req;
+  Req.Query = "7";
+  EXPECT_THAT(match(I, Req), UnorderedElementsAre("7"));
+
+  EXPECT_FALSE(Symbols.expired());
+  // Release old symbols.
+  I.build(generateNumSymbols(0, 0));
+  EXPECT_TRUE(Symbols.expired());
+}
+
+// FIXME(kbobyrev): This test is different for DexIndex and MemIndex: while
+// MemIndex manages response deduplication, DexIndex simply returns all matched
+// symbols which means there might be equivalent symbols in the response.
+// Before drop-in replacement of MemIndex with DexIndex happens, FileIndex
+// should handle deduplication instead.
+TEST(DexIndexTest, DexIndexDeduplicate) {
+  auto Symbols = generateNumSymbols(0, 10);
+
+  // Inject duplicates.
+  auto Sym = symbol("7");
+  Symbols->push_back(&Sym);
+  Symbols->push_back(&Sym);
+  Symbols->push_back(&Sym);
+
+  FuzzyFindRequest Req;
+  Req.Query = "7";
+  DexIndex I;
+  I.build(std::move(Symbols));
+  auto Matches = match(I, Req);
+  EXPECT_EQ(Matches.size(), 4u);
+}
+
+TEST(DexIndexTest, DexIndexLimitedNumMatches) {
+  DexIndex I;
+  I.build(generateNumSymbols(0, 100));
+  FuzzyFindRequest Req;
+  Req.Query = "5";
+  Req.MaxCandidateCount = 3;
+  bool Incomplete;
+  auto Matches = match(I, Req, &Incomplete);
+  EXPECT_EQ(Matches.size(), Req.MaxCandidateCount);
+  EXPECT_TRUE(Incomplete);
+}
+
+TEST(DexIndexTest, FuzzyMatch) {
+  DexIndex I;
+  I.build(
+      generateSymbols({"LaughingOutLoud", "LionPopulation", "LittleOldLady"}));
+  FuzzyFindRequest Req;
+  Req.Query = "lol";
+  Req.MaxCandidateCount = 2;
+  EXPECT_THAT(match(I, Req),
+              UnorderedElementsAre("LaughingOutLoud", "LittleOldLady"));
+}
+
+TEST(DexIndexTest, MatchQualifiedNamesWithoutSpecificScope) {
+  DexIndex I;
+  I.build(generateSymbols({"a::y1", "b::y2", "y3"}));
+  FuzzyFindRequest Req;
+  Req.Query = "y";
+  EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::y1", "b::y2", "y3"));
+}
+
+TEST(DexIndexTest, MatchQualifiedNamesWithGlobalScope) {
+  DexIndex I;
+  I.build(generateSymbols({"a::y1", "b::y2", "y3"}));
+  FuzzyFindRequest Req;
+  Req.Query = "y";
+  Req.Scopes = {""};
+  EXPECT_THAT(match(I, Req), UnorderedElementsAre("y3"));
+}
+
+TEST(DexIndexTest, MatchQualifiedNamesWithOneScope) {
+  DexIndex I;
+  I.build(generateSymbols({"a::y1", "a::y2", "a::x", "b::y2", "y3"}));
+  FuzzyFindRequest Req;
+  Req.Query = "y";
+  Req.Scopes = {"a::"};
+  EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::y1", "a::y2"));
+}
+
+TEST(DexIndexTest, MatchQualifiedNamesWithMultipleScopes) {
+  DexIndex I;
+  I.build(generateSymbols({"a::y1", "a::y2", "a::x", "b::y3", "y3"}));
+  FuzzyFindRequest Req;
+  Req.Query = "y";
+  Req.Scopes = {"a::", "b::"};
+  EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::y1", "a::y2", "b::y3"));
+}
+
+TEST(DexIndexTest, NoMatchNestedScopes) {
+  DexIndex I;
+  I.build(generateSymbols({"a::y1", "a::b::y2"}));
+  FuzzyFindRequest Req;
+  Req.Query = "y";
+  Req.Scopes = {"a::"};
+  EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::y1"));
+}
+
+TEST(DexIndexTest, IgnoreCases) {
+  DexIndex I;
+  I.build(generateSymbols({"ns::ABC", "ns::abc"}));
+  FuzzyFindRequest Req;
+  Req.Query = "AB";
+  Req.Scopes = {"ns::"};
+  EXPECT_THAT(match(I, Req), UnorderedElementsAre("ns::ABC", "ns::abc"));
+}
+
+TEST(DexIndexTest, Lookup) {
+  DexIndex I;
+  I.build(generateSymbols({"ns::abc", "ns::xyz"}));
+  EXPECT_THAT(lookup(I, SymbolID("ns::abc")), UnorderedElementsAre("ns::abc"));
+  EXPECT_THAT(lookup(I, {SymbolID("ns::abc"), SymbolID("ns::xyz")}),
+              UnorderedElementsAre("ns::abc", "ns::xyz"));
+  EXPECT_THAT(lookup(I, {SymbolID("ns::nonono"), SymbolID("ns::xyz")}),
+              UnorderedElementsAre("ns::xyz"));
+  EXPECT_THAT(lookup(I, SymbolID("ns::nonono")), UnorderedElementsAre());
+}
+
+TEST(DexMergeIndexTest, Lookup) {
+  DexIndex I, J;
+  I.build(generateSymbols({"ns::A", "ns::B"}));
+  J.build(generateSymbols({"ns::B", "ns::C"}));
+  EXPECT_THAT(lookup(*mergeIndex(&I, &J), SymbolID("ns::A")),
+              UnorderedElementsAre("ns::A"));
+  EXPECT_THAT(lookup(*mergeIndex(&I, &J), SymbolID("ns::B")),
+              UnorderedElementsAre("ns::B"));
+  EXPECT_THAT(lookup(*mergeIndex(&I, &J), SymbolID("ns::C")),
+              UnorderedElementsAre("ns::C"));
+  EXPECT_THAT(
+      lookup(*mergeIndex(&I, &J), {SymbolID("ns::A"), SymbolID("ns::B")}),
+      UnorderedElementsAre("ns::A", "ns::B"));
+  EXPECT_THAT(
+      lookup(*mergeIndex(&I, &J), {SymbolID("ns::A"), SymbolID("ns::C")}),
+      UnorderedElementsAre("ns::A", "ns::C"));
+  EXPECT_THAT(lookup(*mergeIndex(&I, &J), SymbolID("ns::D")),
+              UnorderedElementsAre());
+  EXPECT_THAT(lookup(*mergeIndex(&I, &J), {}), UnorderedElementsAre());
+}
+
+TEST(DexMergeIndexTest, FuzzyFind) {
+  DexIndex I, J;
+  I.build(generateSymbols({"ns::A", "ns::B"}));
+  J.build(generateSymbols({"ns::B", "ns::C"}));
+  FuzzyFindRequest Req;
+  Req.Scopes = {"ns::"};
+  EXPECT_THAT(match(*mergeIndex(&I, &J), Req),
+              UnorderedElementsAre("ns::A", "ns::B", "ns::C"));
+}
+
+} // namespace
 } // namespace dex
 } // namespace clangd
 } // namespace clang
Index: clang-tools-extra/unittests/clangd/CMakeLists.txt
===================================================================
--- clang-tools-extra/unittests/clangd/CMakeLists.txt
+++ clang-tools-extra/unittests/clangd/CMakeLists.txt
@@ -30,6 +30,7 @@
   SyncAPI.cpp
   TUSchedulerTests.cpp
   TestFS.cpp
+  TestIndexOperations.cpp
   TestTU.cpp
   ThreadingTests.cpp
   TraceTests.cpp
Index: clang-tools-extra/clangd/index/dex/Token.h
===================================================================
--- clang-tools-extra/clangd/index/dex/Token.h
+++ clang-tools-extra/clangd/index/dex/Token.h
@@ -22,9 +22,9 @@
 #ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_DEX_TOKEN_H
 #define LLVM_CLANG_TOOLS_EXTRA_CLANGD_DEX_TOKEN_H
 
+#include "../Index.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/Support/raw_ostream.h"
-
 #include <string>
 #include <vector>
 
Index: clang-tools-extra/clangd/index/dex/DexIndex.cpp
===================================================================
--- /dev/null
+++ clang-tools-extra/clangd/index/dex/DexIndex.cpp
@@ -0,0 +1,250 @@
+//===--- DexIndex.cpp - Dex Symbol Index Implementation ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DexIndex.h"
+#include "../../FuzzyMatch.h"
+#include "../../Logger.h"
+#include <algorithm>
+
+namespace clang {
+namespace clangd {
+namespace dex {
+
+namespace {
+
+// Applies Callback to each symbol from the Scores in the order of decreasing
+// symbol quality.
+void useCallback(llvm::function_ref<void(const Symbol &)> Callback,
+                 std::vector<std::pair<float, const Symbol *>> &Scores) {
+  // First, sort retrieved symbols by the cumulative score to apply callbacks
+  // in the order of descending score.
+  std::sort(begin(Scores), end(Scores),
+            [](const std::pair<float, const Symbol *> &LHS,
+               const std::pair<float, const Symbol *> &RHS) {
+              return LHS.first > RHS.first;
+            });
+
+  for (size_t CandidateRank = 0; CandidateRank < Scores.size(); ++CandidateRank)
+    Callback(*Scores[CandidateRank].second);
+}
+
+// Determines whether query contains enough letters to construct a trigram.
+bool isShortQuery(std::string Query) {
+  // Apply fuzzy matching text segmentation.
+  std::vector<CharRole> Roles(Query.size());
+  calculateRoles(Query, llvm::makeMutableArrayRef(Roles.data(), Query.size()));
+
+  unsigned ValidSymbols = 0;
+  for (const auto &Role : Roles) {
+    if (Role == Head || Role == Tail)
+      ++ValidSymbols;
+    if (ValidSymbols >= 3)
+      return true;
+  }
+
+  return false;
+}
+
+// Returns the tokens which are given symbol's characteristics. Currently, the
+// generated tokens only contain fuzzy matching trigrams and symbol's scope,
+// but in the future this will also return path proximity tokens and other
+// types of tokens such as symbol type (if applicable).
+// Returns the tokens which are given symbols's characteristics. For example,
+// trigrams and scopes.
+// FIXME(kbobyrev): Support more token types:
+// * Path proximity
+// * Types
+std::vector<Token> generateSearchTokens(const Symbol &Sym) {
+  std::vector<Token> Result = generateIdentifierTrigrams(Sym.Name);
+  Result.push_back(Token(Token::Kind::Scope, Sym.Scope));
+  return Result;
+}
+
+} // namespace
+
+void DexIndex::build(std::shared_ptr<std::vector<const Symbol *>> Syms) {
+  llvm::DenseMap<SymbolID, const Symbol *> TempLookupTable;
+  llvm::DenseMap<const Symbol *, float> TempSymbolQuality;
+  for (const Symbol *Sym : *Syms) {
+    TempLookupTable[Sym->ID] = Sym;
+    TempSymbolQuality[Sym] = quality(*Sym);
+  }
+
+  // Symbols are sorted by symbol qualities so that items in the posting lists
+  // are stored in the descending order of symbol quality.
+  // FIXME(kbobyrev): Should we also store symbol quality? It is used later in
+  // fuzzy matching stage, too. If measuring symbol quality gets expensive, this
+  // might be more efficient.
+  std::sort(begin(*Syms), end(*Syms),
+            [&](const Symbol *LHS, const Symbol *RHS) {
+              return TempSymbolQuality[LHS] > TempSymbolQuality[RHS];
+            });
+  llvm::DenseMap<Token, PostingList> TempInvertedIndex;
+  // Populate TempInvertedIndex with posting lists for index symbols.
+  for (DocID SymbolRank = 0; SymbolRank < Syms->size(); ++SymbolRank) {
+    const auto *Sym = (*Syms)[SymbolRank];
+    for (const auto &Token : generateSearchTokens(*Sym))
+      TempInvertedIndex[Token].push_back(SymbolRank);
+  }
+
+  {
+    std::lock_guard<std::mutex> Lock(Mutex);
+
+    // Replace outdated index with the new one.
+    LookupTable = std::move(TempLookupTable);
+    Symbols = std::move(Syms);
+    InvertedIndex = std::move(TempInvertedIndex);
+    SymbolQuality = std::move(TempSymbolQuality);
+  }
+}
+
+bool DexIndex::fuzzyFind(
+    const FuzzyFindRequest &Req,
+    llvm::function_ref<void(const Symbol &)> Callback) const {
+  assert(!StringRef(Req.Query).contains("::") &&
+         "There must be no :: in query.");
+  FuzzyMatcher Filter(Req.Query);
+  bool More = false;
+  // The separation of long queries and short queries is temporary and will be
+  // removed once incomplete trigrams are generated from both identifiers and
+  // queries.
+  More = isShortQuery(Req.Query) ? fuzzyFindLongQuery(Callback, Req)
+                                 : fuzzyFindShortQuery(Callback, Req);
+  return More;
+}
+
+/// Handles fuzzy matching requests for which trigrams can not be generated.
+/// This approach currently does not use trigram index and query iterators and
+/// will be revisited in the future.
+// FIXME(kbobyrev): Implement short queries (<3 length) using posting lists and
+// incomplete trigrams. Unigram and bigram posting lists are likely to be very
+// dense, which is why posting list compression should be also implemented in
+// that approach.
+bool DexIndex::fuzzyFindShortQuery(
+    llvm::function_ref<void(const Symbol &)> Callback,
+    const FuzzyFindRequest &Req) const {
+  bool More = false;
+  std::vector<std::pair<float, const Symbol *>> Scores;
+  FuzzyMatcher Filter(Req.Query);
+
+  {
+    std::lock_guard<std::mutex> Lock(Mutex);
+
+    // FIXME(kbobyrev): This can be very expensive. We should first filter
+    // symbols by scopes and types using query iterators.
+    for (DocID ID = 0; ID < Symbols->size(); ++ID) {
+      const auto *Sym = (*Symbols)[ID];
+      const llvm::Optional<float> Score = Filter.match(Sym->Name);
+      // Match scopes from the query.
+      if (!Req.Scopes.empty() && !llvm::is_contained(Req.Scopes, Sym->Scope))
+        continue;
+      if (Req.RestrictForCodeCompletion && !Sym->IsIndexedForCodeCompletion)
+        continue;
+      if (Score) {
+        if (Scores.size() == Req.MaxCandidateCount) {
+          More = true;
+          break;
+        }
+        Scores.push_back(
+            std::make_pair(*Score * SymbolQuality.find(Sym)->second, Sym));
+      }
+    }
+    useCallback(Callback, Scores);
+  }
+  return More;
+}
+
+/// Constructs iterators over tokens extracted from the query and exhausts it
+/// while applying Callback to each symbol in the order of decreasing quality
+/// of the matched symbols.
+bool DexIndex::fuzzyFindLongQuery(
+    llvm::function_ref<void(const Symbol &)> Callback,
+    const FuzzyFindRequest &Req) const {
+  bool More = false;
+  std::vector<DocID> SymbolDocIDs;
+  // FIXME(kbobyrev): Scoring all matched symbols and then processing
+  // MaxCandidateCount items is rather expensive, this should be replaced by
+  // boosting and two-stage filtering at some point as proposed in Dex design
+  // document.
+  std::vector<std::pair<float, const Symbol *>> Scores;
+  FuzzyMatcher Filter(Req.Query);
+
+  {
+    std::lock_guard<std::mutex> Lock(Mutex);
+
+    // Generate query trigrams and construct AND iterator over all query
+    // trigrams.
+    const auto TrigramTokens = generateIdentifierTrigrams(Req.Query);
+    std::vector<std::unique_ptr<Iterator>> TrigramIterators;
+    for (const auto &Trigram : TrigramTokens) {
+      const auto It = InvertedIndex.find(Trigram);
+      if (It != InvertedIndex.end())
+        TrigramIterators.push_back(create(It->second));
+    }
+
+    std::vector<std::unique_ptr<Iterator>> TopLevelChildren;
+    TopLevelChildren.push_back(createAnd(move(TrigramIterators)));
+
+    // Add OR iterator for scopes if the request contains scopes.
+    if (!Req.Scopes.empty()) {
+      // Generate scope tokens for search query.
+      std::vector<std::unique_ptr<Iterator>> ScopeIterators;
+      for (const auto &Scope : Req.Scopes) {
+        const auto It = InvertedIndex.find(Token(Token::Kind::Scope, Scope));
+        if (It != InvertedIndex.end())
+          ScopeIterators.push_back(create(It->second));
+      }
+      TopLevelChildren.push_back(createOr(move(ScopeIterators)));
+    }
+
+    auto QueryIterator = createAnd(move(TopLevelChildren));
+    // FIXME(kbobyrev): Limit the total number of consumed symbols using
+    // Req.MaxCandidateCount. That would require properly implementing Limit
+    // iterator.
+    SymbolDocIDs = consume(*QueryIterator, Req.MaxCandidateCount);
+    More = SymbolDocIDs.size() == Req.MaxCandidateCount &&
+           !QueryIterator->reachedEnd();
+
+    // Populate scores.
+    // FIXME(kbobyrev): A more generic way of populating scores would be
+    // passing a callback to iterator consumer, otherwise SymbolDocIDs are
+    // stored but not actually used elsewhere.
+    for (size_t SymbolIdx = 0; SymbolIdx < SymbolDocIDs.size(); ++SymbolIdx) {
+      const auto Sym = (*Symbols)[SymbolDocIDs[SymbolIdx]];
+      const auto Score = Filter.match(Sym->Name);
+      assert(Score.hasValue() && "Matched symbol has all Fuzzy Matching "
+                                 "trigrams, it should match the query");
+      Scores.push_back(
+          std::make_pair(*Score * SymbolQuality.find(Sym)->second, Sym));
+    }
+
+    useCallback(Callback, Scores);
+  }
+
+  return More;
+}
+
+void DexIndex::lookup(const LookupRequest &Req,
+                      llvm::function_ref<void(const Symbol &)> Callback) const {
+  for (const auto &ID : Req.IDs) {
+    auto I = LookupTable.find(ID);
+    if (I != LookupTable.end())
+      Callback(*I->second);
+  }
+}
+
+void DexIndex::findOccurrences(
+    const OccurrencesRequest &Req,
+    llvm::function_ref<void(const SymbolOccurrence &)> Callback) const {
+  log("findOccurrences is not implemented.");
+}
+
+} // namespace dex
+} // namespace clangd
+} // namespace clang
Index: clang-tools-extra/clangd/index/dex/DexIndex.h
===================================================================
--- /dev/null
+++ clang-tools-extra/clangd/index/dex/DexIndex.h
@@ -0,0 +1,81 @@
+//===--- DexIndex.h - Dex Symbol Index Implementation -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This defines Dex - a symbol index implementation based on query iterators
+// over symbol tokens, such as fuzzy matching trigrams, scopes, types, etc.
+// While consuming more memory and having longer build stage due to
+// preprocessing, Dex will have substantially lower latency. It will also allow
+// efficient symbol searching which is crucial for operations like code
+// completion, and can be very important for a number of different code
+// transformations which will be eventually supported by Clangd.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_DEXINDEX_H
+#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_DEXINDEX_H
+
+#include "../Index.h"
+#include "../MemIndex.h"
+#include "Iterator.h"
+#include "Token.h"
+#include "Trigram.h"
+#include <mutex>
+
+namespace clang {
+namespace clangd {
+namespace dex {
+
+/// In-memory Dex trigram-based index implementation.
+// FIXME(kbobyrev): Introduce serialization and deserialization of the symbol
+// index so that it can be loaded from the disk. Since static index is not
+// changed frequently, it's safe to assume that it has to be built only once
+// (when the clangd process starts). Therefore, it can be easier to store built
+// index on disk and then load it if available.
+class DexIndex : public SymbolIndex {
+public:
+  /// \brief (Re-)Build index for `Symbols`. All symbol pointers must remain
+  /// accessible as long as `Symbols` is kept alive.
+  void build(std::shared_ptr<std::vector<const Symbol *>> Symbols);
+
+  bool
+  fuzzyFind(const FuzzyFindRequest &Req,
+            llvm::function_ref<void(const Symbol &)> Callback) const override;
+
+  virtual void
+  lookup(const LookupRequest &Req,
+         llvm::function_ref<void(const Symbol &)> Callback) const override;
+
+  void findOccurrences(const OccurrencesRequest &Req,
+                       llvm::function_ref<void(const SymbolOccurrence &)>
+                           Callback) const override;
+
+private:
+  bool fuzzyFindLongQuery(llvm::function_ref<void(const Symbol &)> Callback,
+                          const FuzzyFindRequest &Req) const;
+  bool fuzzyFindShortQuery(llvm::function_ref<void(const Symbol &)> Callback,
+                           const FuzzyFindRequest &Req) const;
+
+  std::shared_ptr<std::vector<const Symbol *>> Symbols;
+  llvm::DenseMap<SymbolID, const Symbol *> LookupTable;
+  llvm::DenseMap<const Symbol *, float> SymbolQuality;
+  mutable std::mutex Mutex;
+  // Inverted index is a mapping from the search token to the posting list,
+  // which contains all items which can be characterized by such search token.
+  // For example, if the search token is scope "std::", the corresponding
+  // posting list would contain all indices of symbols defined in namespace std.
+  // Inverted index is used to retrieve posting lists which are processed during
+  // the fuzzyFind process.
+  llvm::DenseMap<Token, PostingList> InvertedIndex;
+};
+
+} // namespace dex
+} // namespace clangd
+} // namespace clang
+
+#endif
Index: clang-tools-extra/clangd/CMakeLists.txt
===================================================================
--- clang-tools-extra/clangd/CMakeLists.txt
+++ clang-tools-extra/clangd/CMakeLists.txt
@@ -43,6 +43,7 @@
   index/SymbolCollector.cpp
   index/SymbolYAML.cpp
 
+  index/dex/DexIndex.cpp
   index/dex/Iterator.cpp
   index/dex/Trigram.cpp
 
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to