[PATCH] D51481: [clangd] Implement proximity path boosting for Dex

Kirill Bobyrev via Phabricator via cfe-commits Wed, 05 Sep 2018 02:30:41 -0700

kbobyrev updated this revision to Diff 163987.
kbobyrev marked 14 inline comments as done.
kbobyrev added a comment.


- Rebase on top of master
- Address suggestions Eric wrote here
- Address suggestions from the offline discussion by reusing `Quality.cpp` 
infrastructure for path proximity boosting calculation


https://reviews.llvm.org/D51481

Files:
  clang-tools-extra/clangd/CMakeLists.txt
  clang-tools-extra/clangd/FileDistance.h
  clang-tools-extra/clangd/Quality.cpp
  clang-tools-extra/clangd/index/SymbolYAML.cpp
  clang-tools-extra/clangd/index/SymbolYAML.h
  clang-tools-extra/clangd/index/dex/DexIndex.cpp
  clang-tools-extra/clangd/index/dex/DexIndex.h
  clang-tools-extra/clangd/index/dex/Iterator.cpp
  clang-tools-extra/clangd/index/dex/Token.cpp
  clang-tools-extra/clangd/index/dex/Token.h
  clang-tools-extra/clangd/tool/ClangdMain.cpp
  clang-tools-extra/unittests/clangd/DexIndexTests.cpp
  clang-tools-extra/unittests/clangd/TestIndex.cpp
  clang-tools-extra/unittests/clangd/TestIndex.h

Index: clang-tools-extra/unittests/clangd/TestIndex.h
===================================================================
--- clang-tools-extra/unittests/clangd/TestIndex.h
+++ clang-tools-extra/unittests/clangd/TestIndex.h
@@ -39,6 +39,13 @@
                                const FuzzyFindRequest &Req,
                                bool *Incomplete = nullptr);
 
+// Performs fuzzy matching-based symbol lookup given a query and an index.
+// Incomplete is set true if more items than requested can be retrieved, false
+// otherwise. Returns filename URIs of matched symbols canonical declarations.
+std::vector<std::string> matchDeclURIs(const SymbolIndex &I,
+                                       const FuzzyFindRequest &Req,
+                                       bool *Incomplete = nullptr);
+
 // Returns qualified names of symbols with any of IDs in the index.
 std::vector<std::string> lookup(const SymbolIndex &I,
                                 llvm::ArrayRef<SymbolID> IDs);
Index: clang-tools-extra/unittests/clangd/TestIndex.cpp
===================================================================
--- clang-tools-extra/unittests/clangd/TestIndex.cpp
+++ clang-tools-extra/unittests/clangd/TestIndex.cpp
@@ -55,6 +55,18 @@
   return Matches;
 }
 
+std::vector<std::string> matchDeclURIs(const SymbolIndex &I,
+                                       const FuzzyFindRequest &Req,
+                                       bool *Incomplete) {
+  std::vector<std::string> Matches;
+  bool IsIncomplete = I.fuzzyFind(Req, [&](const Symbol &Sym) {
+    Matches.push_back(Sym.CanonicalDeclaration.FileURI);
+  });
+  if (Incomplete)
+    *Incomplete = IsIncomplete;
+  return Matches;
+}
+
 // Returns qualified names of symbols with any of IDs in the index.
 std::vector<std::string> lookup(const SymbolIndex &I,
                                 llvm::ArrayRef<SymbolID> IDs) {
Index: clang-tools-extra/unittests/clangd/DexIndexTests.cpp
===================================================================
--- clang-tools-extra/unittests/clangd/DexIndexTests.cpp
+++ clang-tools-extra/unittests/clangd/DexIndexTests.cpp
@@ -7,6 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "FuzzyMatch.h"
+#include "TestFS.h"
 #include "TestIndex.h"
 #include "index/Index.h"
 #include "index/Merge.h"
@@ -30,6 +32,12 @@
 namespace dex {
 namespace {
 
+std::vector<std::string> URISchemes = {"unittest"};
+
+//===----------------------------------------------------------------------===//
+// Query iterator tests.
+//===----------------------------------------------------------------------===//
+
 std::vector<DocID> consumeIDs(Iterator &It) {
   auto IDAndScore = consume(It);
   std::vector<DocID> IDs(IDAndScore.size());
@@ -325,15 +333,38 @@
   EXPECT_THAT(ElementBoost, 3);
 }
 
+//===----------------------------------------------------------------------===//
+// Search token tests.
+//===----------------------------------------------------------------------===//
+
 testing::Matcher<std::vector<Token>>
-trigramsAre(std::initializer_list<std::string> Trigrams) {
+tokensAre(std::initializer_list<std::string> Strings, Token::Kind Kind) {
   std::vector<Token> Tokens;
-  for (const auto &Symbols : Trigrams) {
-    Tokens.push_back(Token(Token::Kind::Trigram, Symbols));
+  for (const auto &TokenData : Strings) {
+    Tokens.push_back(Token(Kind, TokenData));
   }
   return testing::UnorderedElementsAreArray(Tokens);
 }
 
+testing::Matcher<std::vector<Token>>
+trigramsAre(std::initializer_list<std::string> Trigrams) {
+  return tokensAre(Trigrams, Token::Kind::Trigram);
+}
+
+testing::Matcher<std::vector<Token>>
+pathsAre(std::initializer_list<std::string> Paths) {
+  return tokensAre(Paths, Token::Kind::ProximityURI);
+}
+
+testing::Matcher<std::vector<Token>>
+proximityPathsAre(std::initializer_list<std::string> ProximityPaths) {
+  std::vector<Token> Result;
+  for (const auto &Path : ProximityPaths) {
+    Result.emplace_back(Token::Kind::ProximityURI, Path);
+  }
+  return testing::UnorderedElementsAreArray(Result);
+}
+
 TEST(DexIndexTrigrams, IdentifierTrigrams) {
   EXPECT_THAT(generateIdentifierTrigrams("X86"),
               trigramsAre({"x86", "x$$", "x8$"}));
@@ -408,8 +439,32 @@
                            "hij", "ijk", "jkl", "klm"}));
 }
 
+TEST(DexSearchTokens, SymbolPath) {
+  EXPECT_THAT(generatePathProximityTokens(
+                  "unittest:///clang-tools-extra/clangd/index/Token.h"),
+              pathsAre({"unittest:///clang-tools-extra/clangd/index/Token.h",
+                        "unittest:///clang-tools-extra/clangd/index",
+                        "unittest:///clang-tools-extra/clangd",
+                        "unittest:///clang-tools-extra", "unittest:///"}));
+}
+
+TEST(DexSearchTokens, QueryProximityDistances) {
+  llvm::SmallString<30> Path(testRoot());
+  llvm::sys::path::append(Path, "/a/b/c/d/e/f/g.h");
+  EXPECT_THAT(generateQueryPathProximityTokens(Path, URISchemes),
+              proximityPathsAre({{"unittest:///a/b/c/d/e/f/g.h"},
+                                 {"unittest:///a/b/c/d/e/f"},
+                                 {"unittest:///a/b/c/d/e"},
+                                 {"unittest:///a/b/c/d"},
+                                 {"unittest:///a/b/c"}}));
+}
+
+//===----------------------------------------------------------------------===//
+// Index tests.
+//===----------------------------------------------------------------------===//
+
 TEST(DexIndex, Lookup) {
-  auto I = DexIndex::build(generateSymbols({"ns::abc", "ns::xyz"}));
+  auto I = DexIndex::build(generateSymbols({"ns::abc", "ns::xyz"}), URISchemes);
   EXPECT_THAT(lookup(*I, SymbolID("ns::abc")), UnorderedElementsAre("ns::abc"));
   EXPECT_THAT(lookup(*I, {SymbolID("ns::abc"), SymbolID("ns::xyz")}),
               UnorderedElementsAre("ns::abc", "ns::xyz"));
@@ -421,7 +476,8 @@
 TEST(DexIndex, FuzzyFind) {
   auto Index = DexIndex::build(
       generateSymbols({"ns::ABC", "ns::BCD", "::ABC", "ns::nested::ABC",
-                       "other::ABC", "other::A"}));
+                       "other::ABC", "other::A"}),
+      URISchemes);
   FuzzyFindRequest Req;
   Req.Query = "ABC";
   Req.Scopes = {"ns::"};
@@ -443,7 +499,8 @@
 
 TEST(DexIndexTest, FuzzyMatchQ) {
   auto I = DexIndex::build(
-      generateSymbols({"LaughingOutLoud", "LionPopulation", "LittleOldLady"}));
+      generateSymbols({"LaughingOutLoud", "LionPopulation", "LittleOldLady"}),
+      URISchemes);
   FuzzyFindRequest Req;
   Req.Query = "lol";
   Req.MaxCandidateCount = 2;
@@ -461,12 +518,12 @@
                                  symbol("2") /* duplicate */};
   FuzzyFindRequest Req;
   Req.Query = "2";
-  DexIndex I(Symbols);
+  DexIndex I(Symbols, URISchemes);
   EXPECT_THAT(match(I, Req), ElementsAre("2", "2"));
 }
 
 TEST(DexIndexTest, DexIndexLimitedNumMatches) {
-  auto I = DexIndex::build(generateNumSymbols(0, 100));
+  auto I = DexIndex::build(generateNumSymbols(0, 100), URISchemes);
   FuzzyFindRequest Req;
   Req.Query = "5";
   Req.MaxCandidateCount = 3;
@@ -478,73 +535,132 @@
 
 TEST(DexIndexTest, FuzzyMatch) {
   auto I = DexIndex::build(
-      generateSymbols({"LaughingOutLoud", "LionPopulation", "LittleOldLady"}));
+      generateSymbols({"LaughingOutLoud", "LionPopulation", "LittleOldLady"}),
+      URISchemes);
   FuzzyFindRequest Req;
   Req.Query = "lol";
   Req.MaxCandidateCount = 2;
   EXPECT_THAT(match(*I, Req),
               UnorderedElementsAre("LaughingOutLoud", "LittleOldLady"));
 }
 
 TEST(DexIndexTest, MatchQualifiedNamesWithoutSpecificScope) {
-  auto I = DexIndex::build(generateSymbols({"a::y1", "b::y2", "y3"}));
+  auto I =
+      DexIndex::build(generateSymbols({"a::y1", "b::y2", "y3"}), URISchemes);
   FuzzyFindRequest Req;
   Req.Query = "y";
   EXPECT_THAT(match(*I, Req), UnorderedElementsAre("a::y1", "b::y2", "y3"));
 }
 
 TEST(DexIndexTest, MatchQualifiedNamesWithGlobalScope) {
-  auto I = DexIndex::build(generateSymbols({"a::y1", "b::y2", "y3"}));
+  auto I =
+      DexIndex::build(generateSymbols({"a::y1", "b::y2", "y3"}), URISchemes);
   FuzzyFindRequest Req;
   Req.Query = "y";
   Req.Scopes = {""};
   EXPECT_THAT(match(*I, Req), UnorderedElementsAre("y3"));
 }
 
 TEST(DexIndexTest, MatchQualifiedNamesWithOneScope) {
   auto I = DexIndex::build(
-      generateSymbols({"a::y1", "a::y2", "a::x", "b::y2", "y3"}));
+      generateSymbols({"a::y1", "a::y2", "a::x", "b::y2", "y3"}), URISchemes);
   FuzzyFindRequest Req;
   Req.Query = "y";
   Req.Scopes = {"a::"};
   EXPECT_THAT(match(*I, Req), UnorderedElementsAre("a::y1", "a::y2"));
 }
 
 TEST(DexIndexTest, MatchQualifiedNamesWithMultipleScopes) {
   auto I = DexIndex::build(
-      generateSymbols({"a::y1", "a::y2", "a::x", "b::y3", "y3"}));
+      generateSymbols({"a::y1", "a::y2", "a::x", "b::y3", "y3"}), URISchemes);
   FuzzyFindRequest Req;
   Req.Query = "y";
   Req.Scopes = {"a::", "b::"};
   EXPECT_THAT(match(*I, Req), UnorderedElementsAre("a::y1", "a::y2", "b::y3"));
 }
 
 TEST(DexIndexTest, NoMatchNestedScopes) {
-  auto I = DexIndex::build(generateSymbols({"a::y1", "a::b::y2"}));
+  auto I = DexIndex::build(generateSymbols({"a::y1", "a::b::y2"}), URISchemes);
   FuzzyFindRequest Req;
   Req.Query = "y";
   Req.Scopes = {"a::"};
   EXPECT_THAT(match(*I, Req), UnorderedElementsAre("a::y1"));
 }
 
 TEST(DexIndexTest, IgnoreCases) {
-  auto I = DexIndex::build(generateSymbols({"ns::ABC", "ns::abc"}));
+  auto I = DexIndex::build(generateSymbols({"ns::ABC", "ns::abc"}), URISchemes);
   FuzzyFindRequest Req;
   Req.Query = "AB";
   Req.Scopes = {"ns::"};
   EXPECT_THAT(match(*I, Req), UnorderedElementsAre("ns::ABC", "ns::abc"));
 }
 
 TEST(DexIndexTest, Lookup) {
-  auto I = DexIndex::build(generateSymbols({"ns::abc", "ns::xyz"}));
+  auto I = DexIndex::build(generateSymbols({"ns::abc", "ns::xyz"}), URISchemes);
   EXPECT_THAT(lookup(*I, SymbolID("ns::abc")), UnorderedElementsAre("ns::abc"));
   EXPECT_THAT(lookup(*I, {SymbolID("ns::abc"), SymbolID("ns::xyz")}),
               UnorderedElementsAre("ns::abc", "ns::xyz"));
   EXPECT_THAT(lookup(*I, {SymbolID("ns::nonono"), SymbolID("ns::xyz")}),
               UnorderedElementsAre("ns::xyz"));
   EXPECT_THAT(lookup(*I, SymbolID("ns::nonono")), UnorderedElementsAre());
 }
 
+TEST(DexIndexTest, ProximityPathsBoosting) {
+  // All strings should be stored in local variables to ensure their lifetime is
+  // the same as Slab's.
+  std::string Name = "abc";
+
+  // Populate Symbol Slab: generate two symbols with the same characteristics
+  // and hence quality, but different CanonicalDeclaration file.
+  SymbolSlab::Builder Builder;
+
+  Symbol RootSymbol;
+  RootSymbol.Name = Name;
+  llvm::SmallString<30> RootPath(testRoot());
+  llvm::sys::path::append(RootPath, "file.h");
+  auto RootURI = URI::create(RootPath, "unittest");
+  assert(RootURI);
+  std::string RootFilename = RootURI->toString();
+  RootSymbol.CanonicalDeclaration.FileURI = RootFilename;
+  RootSymbol.ID = SymbolID("RootSymbol");
+  Builder.insert(RootSymbol);
+
+  Symbol CloseSymbol;
+  CloseSymbol.Name = Name;
+  llvm::SmallString<30> ClosePath(testRoot());
+  llvm::sys::path::append(ClosePath, "/a/b/c/d/e/f/file.h");
+  auto CloseFileURI = URI::create(ClosePath, "unittest");
+  assert(CloseFileURI);
+  std::string CloseFileName = CloseFileURI->toString();
+  CloseSymbol.CanonicalDeclaration.FileURI = CloseFileName;
+  CloseSymbol.ID = SymbolID("CloseSymbol");
+  Builder.insert(CloseSymbol);
+
+  auto I = DexIndex::build(std::move(Builder).build(), URISchemes);
+
+  FuzzyFindRequest Req;
+  Req.Query = "abc";
+  // Return only one element, because the order of callback calls is not
+  // specified: items with lower score can be matched first. Therefore, to check
+  // whether the necessary item has higher score, test should only retrieve one
+  // element (with highest cumulative score).
+  Req.MaxCandidateCount = 1;
+
+  // FuzzyFind request comes from the file which is far from the root: expect
+  // CloseSymbol to come out.
+  llvm::SmallString<30> RequestPath(testRoot());
+  llvm::sys::path::append(RequestPath, "/a/b/c/d/e/f/file.h");
+  Req.ProximityPaths = {RequestPath.str()};
+  EXPECT_THAT(matchDeclURIs(*I, Req), ElementsAre(CloseFileURI->toString()));
+
+  // FuzzyFind request comes from the file which is close to the root: expect
+  // RootSymbol to come out.
+  RequestPath = testRoot();
+  llvm::sys::path::append(RequestPath, "file.h");
+  Req.ProximityPaths = {RequestPath.str()};
+  EXPECT_THAT(matchDeclURIs(*I, Req), ElementsAre(RootURI->toString()));
+}
+
 } // namespace
 } // namespace dex
 } // namespace clangd
Index: clang-tools-extra/clangd/tool/ClangdMain.cpp
===================================================================
--- clang-tools-extra/clangd/tool/ClangdMain.cpp
+++ clang-tools-extra/clangd/tool/ClangdMain.cpp
@@ -284,8 +284,8 @@
     // Load the index asynchronously. Meanwhile SwapIndex returns no results.
     SwapIndex *Placeholder;
     StaticIdx.reset(Placeholder = new SwapIndex(llvm::make_unique<MemIndex>()));
-    runAsync<void>([Placeholder] {
-      if (auto Idx = loadIndex(YamlSymbolFile))
+    runAsync<void>([Placeholder, Opts] {
+      if (auto Idx = loadIndex(YamlSymbolFile, Opts.URISchemes, UseDex))
         Placeholder->reset(std::move(Idx));
     });
   }
Index: clang-tools-extra/clangd/index/dex/Token.h
===================================================================
--- clang-tools-extra/clangd/index/dex/Token.h
+++ clang-tools-extra/clangd/index/dex/Token.h
@@ -41,22 +41,31 @@
   /// Kind specifies Token type which defines semantics for the internal
   /// representation. Each Kind has different representation stored in Data
   /// field.
+  // FIXME(kbobyrev): Storing Data hash would be more efficient than storing raw
+  // strings. For example, PathURI store URIs of each directory and its parents,
+  // which induces a lot of overhead because these paths tend to be long and
+  // each parent directory is a prefix.
   enum class Kind {
     /// Represents trigram used for fuzzy search of unqualified symbol names.
     ///
     /// Data contains 3 bytes with trigram contents.
     Trigram,
     /// Scope primitives, e.g. "symbol belongs to namespace foo::bar".
     ///
-    /// Data stroes full scope name , e.g. "foo::bar::baz::" or "" (for global
+    /// Data stroes full scope name, e.g. "foo::bar::baz::" or "" (for global
     /// scope).
     Scope,
+    /// Path Proximity URI to symbol declaration.
+    ///
+    /// Data stores path URI of symbol declaration file or its parent.
+    ///
+    /// Example: "file:///path/to/clang-tools-extra/clangd/index/SymbolIndex.h"
+    /// and some amount of its parents.
+    ProximityURI,
     /// Internal Token type for invalid/special tokens, e.g. empty tokens for
     /// llvm::DenseMap.
     Sentinel,
     /// FIXME(kbobyrev): Add other Token Kinds
-    /// * Path with full or relative path to the directory in which symbol is
-    ///   defined
     /// * Type with qualified type name or its USR
   };
 
@@ -81,6 +90,17 @@
   }
 };
 
+/// Returns Search Token for a number of parent directories of given Path.
+/// Should be used within the index build process.
+std::vector<Token> generatePathProximityTokens(llvm::StringRef Path);
+
+/// Returns Search Token and its distance from the absolute path for the first
+/// URI scheme allows valid conversions from AbsolutePath. The first token of
+/// result is always the AbsolutePath converted to a valid scheme.
+std::vector<Token>
+generateQueryPathProximityTokens(llvm::StringRef AbsolutePath,
+                                 llvm::ArrayRef<std::string> URISchemes);
+
 } // namespace dex
 } // namespace clangd
 } // namespace clang
Index: clang-tools-extra/clangd/index/dex/Token.cpp
===================================================================
--- /dev/null
+++ clang-tools-extra/clangd/index/dex/Token.cpp
@@ -0,0 +1,74 @@
+//===--- Token.cpp - Symbol Search primitive --------------------*- C++ -*-===//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Token.h"
+#include "URI.h"
+#include "llvm/Support/Path.h"
+#include <cassert>
+
+namespace clang {
+namespace clangd {
+namespace dex {
+
+std::vector<Token> generatePathProximityTokens(llvm::StringRef URIPath) {
+  std::vector<Token> Result;
+  // Don't generate any tokens for empty paths.
+  if (URIPath.empty())
+    return Result;
+  auto ParsedURI = URI::parse(URIPath);
+  assert(ParsedURI &&
+         "Non-empty argument of generateProximityURIs() should be a valid "
+         "URI.");
+  const auto Scheme = ParsedURI->scheme();
+  const auto Authority = ParsedURI->authority();
+  StringRef Path = ParsedURI->body();
+  // FIXME(kbobyrev): Currently, this is a heuristic which defines the maximum
+  // size of resulting vector. Some projects might want to have higher limit if
+  // the file hierarchy is deeper. For the generic case, it would be useful to
+  // calculate Limit in the index build stage by calculating the maximum depth
+  // of the project source tree at runtime.
+  size_t Limit = 5;
+  // Insert original URI before the loop: this would save a redundant iteration
+  // with a URI parse.
+  Result.emplace_back(Token::Kind::ProximityURI, ParsedURI->toString());
+  while (!Path.empty() && --Limit > 0) {
+    // FIXME(kbobyrev): Parsing and encoding path to URIs is not necessary and
+    // could be optimized.
+    Path = llvm::sys::path::parent_path(Path, llvm::sys::path::Style::posix);
+    URI TokenURI(Scheme, Authority, Path);
+    Result.emplace_back(Token::Kind::ProximityURI, TokenURI.toString());
+  }
+  return Result;
+}
+
+std::vector<Token>
+generateQueryPathProximityTokens(llvm::StringRef AbsolutePath,
+                                 llvm::ArrayRef<std::string> URISchemes) {
+  std::vector<Token> Result;
+  for (const auto Scheme : URISchemes) {
+    auto URI = URI::create(AbsolutePath, Scheme);
+    // For some paths, conversion to different URI schemes is impossible. These
+    // should be just skipped.
+    if (!URI) {
+      // Ignore the error.
+      llvm::consumeError(URI.takeError());
+      continue;
+    }
+    for (const auto ProximityPath :
+         generatePathProximityTokens(URI->toString()))
+      Result.emplace_back(ProximityPath);
+    // As soon as one valid URI scheme for given absolute path is found, the
+    // token generation process should be stopped.
+    break;
+  }
+  return Result;
+}
+
+} // namespace dex
+} // namespace clangd
+} // namespace clang
Index: clang-tools-extra/clangd/index/dex/Iterator.cpp
===================================================================
--- clang-tools-extra/clangd/index/dex/Iterator.cpp
+++ clang-tools-extra/clangd/index/dex/Iterator.cpp
@@ -392,7 +392,7 @@
 std::vector<std::pair<DocID, float>> consume(Iterator &It) {
   std::vector<std::pair<DocID, float>> Result;
   for (; !It.reachedEnd(); It.advance())
-    Result.emplace_back(It.peek(), It.consume());
+    Result.push_back({It.peek(), It.consume()});
   return Result;
 }
 
Index: clang-tools-extra/clangd/index/dex/DexIndex.h
===================================================================
--- clang-tools-extra/clangd/index/dex/DexIndex.h
+++ clang-tools-extra/clangd/index/dex/DexIndex.h
@@ -39,22 +39,26 @@
 class DexIndex : public SymbolIndex {
 public:
   // All symbols must outlive this index.
-  template <typename Range> DexIndex(Range &&Symbols) {
+  template <typename Range>
+  DexIndex(Range &&Symbols, llvm::ArrayRef<std::string> URISchemes)
+      : URISchemes(URISchemes) {
     for (auto &&Sym : Symbols)
       this->Symbols.push_back(&Sym);
     buildIndex();
   }
   // Symbols are owned by BackingData, Index takes ownership.
   template <typename Range, typename Payload>
-  DexIndex(Range &&Symbols, Payload &&BackingData)
-      : DexIndex(std::forward<Range>(Symbols)) {
+  DexIndex(Range &&Symbols, Payload &&BackingData,
+           llvm::ArrayRef<std::string> URISchemes)
+      : DexIndex(std::forward<Range>(Symbols), URISchemes) {
     KeepAlive = std::shared_ptr<void>(
         std::make_shared<Payload>(std::move(BackingData)), nullptr);
   }
 
   /// Builds an index from a slab. The index takes ownership of the slab.
-  static std::unique_ptr<SymbolIndex> build(SymbolSlab Slab) {
-    return llvm::make_unique<DexIndex>(Slab, std::move(Slab));
+  static std::unique_ptr<SymbolIndex>
+  build(SymbolSlab Slab, llvm::ArrayRef<std::string> URISchemes) {
+    return llvm::make_unique<DexIndex>(Slab, std::move(Slab), URISchemes);
   }
 
   bool
@@ -72,21 +76,25 @@
 private:
   void buildIndex();
 
+  /// Stores symbols sorted in the descending order of symbol quality..
   std::vector<const Symbol *> Symbols;
+  /// SymbolQuality[I] is the quality of Symbols[I].
+  std::vector<float> SymbolQuality;
   llvm::DenseMap<SymbolID, const Symbol *> LookupTable;
-  llvm::DenseMap<const Symbol *, float> SymbolQuality;
-  // Inverted index is a mapping from the search token to the posting list,
-  // which contains all items which can be characterized by such search token.
-  // For example, if the search token is scope "std::", the corresponding
-  // posting list would contain all indices of symbols defined in namespace std.
-  // Inverted index is used to retrieve posting lists which are processed during
-  // the fuzzyFind process.
+  /// Inverted index is a mapping from the search token to the posting list,
+  /// which contains all items which can be characterized by such search token.
+  /// For example, if the search token is scope "std::", the corresponding
+  /// posting list would contain all indices of symbols defined in namespace
+  /// std. Inverted index is used to retrieve posting lists which are processed
+  /// during the fuzzyFind process.
   llvm::DenseMap<Token, PostingList> InvertedIndex;
   std::shared_ptr<void> KeepAlive; // poor man's move-only std::any
+
+  const std::vector<std::string> URISchemes;
 };
 
 } // namespace dex
 } // namespace clangd
 } // namespace clang
 
-#endif
+#endif // LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_DEXINDEX_H
Index: clang-tools-extra/clangd/index/dex/DexIndex.cpp
===================================================================
--- clang-tools-extra/clangd/index/dex/DexIndex.cpp
+++ clang-tools-extra/clangd/index/dex/DexIndex.cpp
@@ -8,8 +8,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "DexIndex.h"
+#include "../../FileDistance.h"
 #include "../../FuzzyMatch.h"
 #include "../../Logger.h"
+#include "../../Quality.h"
 #include <algorithm>
 #include <queue>
 
@@ -26,28 +28,39 @@
 // Returns the tokens which are given symbols's characteristics. For example,
 // trigrams and scopes.
 // FIXME(kbobyrev): Support more token types:
-// * Path proximity
 // * Types
+// * Namespace proximity
 std::vector<Token> generateSearchTokens(const Symbol &Sym) {
   std::vector<Token> Result = generateIdentifierTrigrams(Sym.Name);
   Result.emplace_back(Token::Kind::Scope, Sym.Scope);
+  for (const auto &ProximityToken :
+       generatePathProximityTokens(Sym.CanonicalDeclaration.FileURI))
+    Result.emplace_back(ProximityToken);
   return Result;
 }
 
 } // namespace
 
 void DexIndex::buildIndex() {
-  for (const Symbol *Sym : Symbols) {
+  std::vector<std::pair<float, const Symbol *>> ScoredSymbols(Symbols.size());
+
+  for (size_t I = 0; I < Symbols.size(); ++I) {
+    const Symbol *Sym = Symbols[I];
     LookupTable[Sym->ID] = Sym;
-    SymbolQuality[Sym] = quality(*Sym);
+    ScoredSymbols[I] = {static_cast<float>(quality(*Sym)), Sym};
   }
 
   // Symbols are sorted by symbol qualities so that items in the posting lists
   // are stored in the descending order of symbol quality.
-  std::sort(begin(Symbols), end(Symbols),
-            [&](const Symbol *LHS, const Symbol *RHS) {
-              return SymbolQuality[LHS] > SymbolQuality[RHS];
-            });
+  std::sort(begin(ScoredSymbols), end(ScoredSymbols));
+
+  // SymbolQuality was empty up until now.
+  SymbolQuality.resize(Symbols.size());
+  // Populate internal storage using Symbol + Score pairs.
+  for (size_t I = 0; I < ScoredSymbols.size(); ++I) {
+    SymbolQuality[I] = ScoredSymbols[I].first;
+    Symbols[I] = ScoredSymbols[I].second;
+  }
 
   // Populate TempInvertedIndex with posting lists for index symbols.
   for (DocID SymbolRank = 0; SymbolRank < Symbols.size(); ++SymbolRank) {
@@ -96,6 +109,40 @@
   if (!ScopeIterators.empty())
     TopLevelChildren.push_back(createOr(move(ScopeIterators)));
 
+  // Add proximity paths boosting.
+  std::vector<std::unique_ptr<Iterator>> BoostingIterators;
+  // Try to build BOOST iterator for each Proximity Path provided by
+  // FuzzyFindRequest. Boosting factor should depend on the distance to the
+  // Proximity Path: the closer processed path is, the higher boosting factor.
+  SymbolRelevanceSignals PathProximitySignals;
+  for (const auto &ProximityPath : Req.ProximityPaths) {
+    const auto PathProximityTokens =
+        generateQueryPathProximityTokens(ProximityPath, URISchemes);
+    llvm::StringMap<SourceParams> URIToParams;
+    // Use default distance parameters for the first URI in ProximityURIs, which
+    // corresponds to the query path URI.
+    URIToParams[PathProximityTokens.front().Data] = SourceParams();
+    URIDistance DistanceCalculator(URIToParams);
+    PathProximitySignals.FileProximityMatch = &DistanceCalculator;
+    for (const auto &ProximityToken : PathProximityTokens) {
+      const auto It = InvertedIndex.find(ProximityToken);
+      if (It != InvertedIndex.end()) {
+        // FIXME(kbobyrev): Append LIMIT on top of every BOOST iterator.
+        PathProximitySignals.SymbolURI = ProximityToken.Data;
+        float BoostFactor = 1 + PathProximitySignals.evaluate();
+        BoostingIterators.push_back(
+            createBoost(create(It->second), BoostFactor));
+      }
+    }
+  }
+  // Boosting iterators do not actually filter symbols. In order to preserve
+  // the validity of resulting query, TRUE iterator should be added along
+  // BOOSTs.
+  if (!BoostingIterators.empty()) {
+    BoostingIterators.push_back(createTrue(Symbols.size()));
+    TopLevelChildren.push_back(createOr(move(BoostingIterators)));
+  }
+
   // Use TRUE iterator if both trigrams and scopes from the query are not
   // present in the symbol index.
   auto QueryIterator = TopLevelChildren.empty()
@@ -106,32 +153,37 @@
   // FIXME(kbobyrev): Pre-scoring retrieval threshold should be adjusted as
   // using 100x of the requested number might not be good in practice, e.g.
   // when the requested number of items is small.
-  const unsigned ItemsToRetrieve = 100 * Req.MaxCandidateCount;
+  const size_t ItemsToRetrieve = 100 * Req.MaxCandidateCount;
   auto Root = createLimit(move(QueryIterator), ItemsToRetrieve);
-  // FIXME(kbobyrev): Add boosting to the query and utilize retrieved
-  // boosting scores.
-  std::vector<std::pair<DocID, float>> SymbolDocIDs = consume(*Root);
-
-  // Retrieve top Req.MaxCandidateCount items.
-  std::priority_queue<std::pair<float, const Symbol *>> Top;
-  for (const auto &P : SymbolDocIDs) {
-    const DocID SymbolDocID = P.first;
+
+  std::vector<std::pair<DocID, float>> IDAndScores = consume(*Root);
+
+  auto Compare = [](const std::pair<DocID, float> &LHS,
+                    const std::pair<DocID, float> &RHS) {
+    return LHS.second > RHS.second;
+  };
+  TopN<std::pair<DocID, float>, decltype(Compare)> Top(Req.MaxCandidateCount,
+                                                       Compare);
+  for (const auto &IDAndScore : IDAndScores) {
+    const DocID SymbolDocID = IDAndScore.first;
     const auto *Sym = Symbols[SymbolDocID];
     const llvm::Optional<float> Score = Filter.match(Sym->Name);
     if (!Score)
       continue;
-    // Multiply score by a negative factor so that Top stores items with the
-    // highest actual score.
-    Top.emplace(-(*Score) * SymbolQuality.find(Sym)->second, Sym);
-    if (Top.size() > Req.MaxCandidateCount) {
+    // Combine Fuzzy Matching score, precomputed symbol quality and boosting
+    // score for a cumulative final symbol score.
+    const float FinalScore =
+        (*Score) * SymbolQuality[SymbolDocID] * IDAndScore.second;
+    // If Top.push(...) returns true, it means that it had to pop an item. In
+    // this case, it is possible to retrieve more symbols.
+    if (Top.push({SymbolDocID, FinalScore}))
       More = true;
-      Top.pop();
-    }
   }
 
-  // Apply callback to the top Req.MaxCandidateCount items.
-  for (; !Top.empty(); Top.pop())
-    Callback(*Top.top().second);
+  // Apply callback to the top Req.MaxCandidateCount items in the descending
+  // order of cumulative score.
+  for (const auto &Item : std::move(Top).items())
+    Callback(*Symbols[Item.first]);
   return More;
 }
 
Index: clang-tools-extra/clangd/index/SymbolYAML.h
===================================================================
--- clang-tools-extra/clangd/index/SymbolYAML.h
+++ clang-tools-extra/clangd/index/SymbolYAML.h
@@ -45,6 +45,7 @@
 // The size of global symbols should be relatively small, so that all symbols
 // can be managed in memory.
 std::unique_ptr<SymbolIndex> loadIndex(llvm::StringRef SymbolFilename,
+                                       llvm::ArrayRef<std::string> URISchemes,
                                        bool UseDex = true);
 
 } // namespace clangd
Index: clang-tools-extra/clangd/index/SymbolYAML.cpp
===================================================================
--- clang-tools-extra/clangd/index/SymbolYAML.cpp
+++ clang-tools-extra/clangd/index/SymbolYAML.cpp
@@ -185,6 +185,7 @@
 }
 
 std::unique_ptr<SymbolIndex> loadIndex(llvm::StringRef SymbolFilename,
+                                       llvm::ArrayRef<std::string> URISchemes,
                                        bool UseDex) {
   trace::Span OverallTracer("LoadIndex");
   auto Buffer = llvm::MemoryBuffer::getFile(SymbolFilename);
@@ -209,7 +210,7 @@
   if (!Slab)
     return nullptr;
   trace::Span Tracer("BuildIndex");
-  return UseDex ? dex::DexIndex::build(std::move(*Slab))
+  return UseDex ? dex::DexIndex::build(std::move(*Slab), URISchemes)
                 : MemIndex::build(std::move(*Slab), RefSlab());
 }
 
Index: clang-tools-extra/clangd/Quality.cpp
===================================================================
--- clang-tools-extra/clangd/Quality.cpp
+++ clang-tools-extra/clangd/Quality.cpp
@@ -6,6 +6,7 @@
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
+
 #include "Quality.h"
 #include "FileDistance.h"
 #include "URI.h"
@@ -302,7 +303,7 @@
     return {0.f, 0u};
   unsigned Distance = D->distance(SymbolURI);
   // Assume approximately default options are used for sensible scoring.
-  return {std::exp(Distance * -0.4f / FileDistanceOptions().UpCost), Distance};
+  return {std::exp(Distance * -.4f / FileDistanceOptions().UpCost), Distance};
 }
 
 float SymbolRelevanceSignals::evaluate() const {
Index: clang-tools-extra/clangd/FileDistance.h
===================================================================
--- clang-tools-extra/clangd/FileDistance.h
+++ clang-tools-extra/clangd/FileDistance.h
@@ -37,6 +37,9 @@
 //
 //===----------------------------------------------------------------------===//
 
+#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_FILEDISTANCE_H
+#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_FILEDISTANCE_H
+
 #include "URI.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/DenseMapInfo.h"
@@ -107,3 +110,5 @@
 
 } // namespace clangd
 } // namespace clang
+
+#endif // LLVM_CLANG_TOOLS_EXTRA_CLANGD_FILEDISTANCE_H
Index: clang-tools-extra/clangd/CMakeLists.txt
===================================================================
--- clang-tools-extra/clangd/CMakeLists.txt
+++ clang-tools-extra/clangd/CMakeLists.txt
@@ -48,6 +48,7 @@
 
   index/dex/DexIndex.cpp
   index/dex/Iterator.cpp
+  index/dex/Token.cpp
   index/dex/Trigram.cpp
 
   LINK_LIBS

_______________________________________________
cfe-commits mailing list
[email protected]
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D51481: [clangd] Implement proximity path boosting for Dex

Reply via email to