kbobyrev updated this revision to Diff 163987.
kbobyrev marked 14 inline comments as done.
kbobyrev added a comment.
- Rebase on top of master
- Address suggestions Eric wrote here
- Address suggestions from the offline discussion by reusing `Quality.cpp`
infrastructure for path proximity boosting calculation
https://reviews.llvm.org/D51481
Files:
clang-tools-extra/clangd/CMakeLists.txt
clang-tools-extra/clangd/FileDistance.h
clang-tools-extra/clangd/Quality.cpp
clang-tools-extra/clangd/index/SymbolYAML.cpp
clang-tools-extra/clangd/index/SymbolYAML.h
clang-tools-extra/clangd/index/dex/DexIndex.cpp
clang-tools-extra/clangd/index/dex/DexIndex.h
clang-tools-extra/clangd/index/dex/Iterator.cpp
clang-tools-extra/clangd/index/dex/Token.cpp
clang-tools-extra/clangd/index/dex/Token.h
clang-tools-extra/clangd/tool/ClangdMain.cpp
clang-tools-extra/unittests/clangd/DexIndexTests.cpp
clang-tools-extra/unittests/clangd/TestIndex.cpp
clang-tools-extra/unittests/clangd/TestIndex.h
Index: clang-tools-extra/unittests/clangd/TestIndex.h
===================================================================
--- clang-tools-extra/unittests/clangd/TestIndex.h
+++ clang-tools-extra/unittests/clangd/TestIndex.h
@@ -39,6 +39,13 @@
const FuzzyFindRequest &Req,
bool *Incomplete = nullptr);
+// Performs fuzzy matching-based symbol lookup given a query and an index.
+// Incomplete is set true if more items than requested can be retrieved, false
+// otherwise. Returns filename URIs of matched symbols canonical declarations.
+std::vector<std::string> matchDeclURIs(const SymbolIndex &I,
+ const FuzzyFindRequest &Req,
+ bool *Incomplete = nullptr);
+
// Returns qualified names of symbols with any of IDs in the index.
std::vector<std::string> lookup(const SymbolIndex &I,
llvm::ArrayRef<SymbolID> IDs);
Index: clang-tools-extra/unittests/clangd/TestIndex.cpp
===================================================================
--- clang-tools-extra/unittests/clangd/TestIndex.cpp
+++ clang-tools-extra/unittests/clangd/TestIndex.cpp
@@ -55,6 +55,18 @@
return Matches;
}
+std::vector<std::string> matchDeclURIs(const SymbolIndex &I,
+ const FuzzyFindRequest &Req,
+ bool *Incomplete) {
+ std::vector<std::string> Matches;
+ bool IsIncomplete = I.fuzzyFind(Req, [&](const Symbol &Sym) {
+ Matches.push_back(Sym.CanonicalDeclaration.FileURI);
+ });
+ if (Incomplete)
+ *Incomplete = IsIncomplete;
+ return Matches;
+}
+
// Returns qualified names of symbols with any of IDs in the index.
std::vector<std::string> lookup(const SymbolIndex &I,
llvm::ArrayRef<SymbolID> IDs) {
Index: clang-tools-extra/unittests/clangd/DexIndexTests.cpp
===================================================================
--- clang-tools-extra/unittests/clangd/DexIndexTests.cpp
+++ clang-tools-extra/unittests/clangd/DexIndexTests.cpp
@@ -7,6 +7,8 @@
//
//===----------------------------------------------------------------------===//
+#include "FuzzyMatch.h"
+#include "TestFS.h"
#include "TestIndex.h"
#include "index/Index.h"
#include "index/Merge.h"
@@ -30,6 +32,12 @@
namespace dex {
namespace {
+std::vector<std::string> URISchemes = {"unittest"};
+
+//===----------------------------------------------------------------------===//
+// Query iterator tests.
+//===----------------------------------------------------------------------===//
+
std::vector<DocID> consumeIDs(Iterator &It) {
auto IDAndScore = consume(It);
std::vector<DocID> IDs(IDAndScore.size());
@@ -325,15 +333,38 @@
EXPECT_THAT(ElementBoost, 3);
}
+//===----------------------------------------------------------------------===//
+// Search token tests.
+//===----------------------------------------------------------------------===//
+
testing::Matcher<std::vector<Token>>
-trigramsAre(std::initializer_list<std::string> Trigrams) {
+tokensAre(std::initializer_list<std::string> Strings, Token::Kind Kind) {
std::vector<Token> Tokens;
- for (const auto &Symbols : Trigrams) {
- Tokens.push_back(Token(Token::Kind::Trigram, Symbols));
+ for (const auto &TokenData : Strings) {
+ Tokens.push_back(Token(Kind, TokenData));
}
return testing::UnorderedElementsAreArray(Tokens);
}
+testing::Matcher<std::vector<Token>>
+trigramsAre(std::initializer_list<std::string> Trigrams) {
+ return tokensAre(Trigrams, Token::Kind::Trigram);
+}
+
+testing::Matcher<std::vector<Token>>
+pathsAre(std::initializer_list<std::string> Paths) {
+ return tokensAre(Paths, Token::Kind::ProximityURI);
+}
+
+testing::Matcher<std::vector<Token>>
+proximityPathsAre(std::initializer_list<std::string> ProximityPaths) {
+ std::vector<Token> Result;
+ for (const auto &Path : ProximityPaths) {
+ Result.emplace_back(Token::Kind::ProximityURI, Path);
+ }
+ return testing::UnorderedElementsAreArray(Result);
+}
+
TEST(DexIndexTrigrams, IdentifierTrigrams) {
EXPECT_THAT(generateIdentifierTrigrams("X86"),
trigramsAre({"x86", "x$$", "x8$"}));
@@ -408,8 +439,32 @@
"hij", "ijk", "jkl", "klm"}));
}
+TEST(DexSearchTokens, SymbolPath) {
+ EXPECT_THAT(generatePathProximityTokens(
+ "unittest:///clang-tools-extra/clangd/index/Token.h"),
+ pathsAre({"unittest:///clang-tools-extra/clangd/index/Token.h",
+ "unittest:///clang-tools-extra/clangd/index",
+ "unittest:///clang-tools-extra/clangd",
+ "unittest:///clang-tools-extra", "unittest:///"}));
+}
+
+TEST(DexSearchTokens, QueryProximityDistances) {
+ llvm::SmallString<30> Path(testRoot());
+ llvm::sys::path::append(Path, "/a/b/c/d/e/f/g.h");
+ EXPECT_THAT(generateQueryPathProximityTokens(Path, URISchemes),
+ proximityPathsAre({{"unittest:///a/b/c/d/e/f/g.h"},
+ {"unittest:///a/b/c/d/e/f"},
+ {"unittest:///a/b/c/d/e"},
+ {"unittest:///a/b/c/d"},
+ {"unittest:///a/b/c"}}));
+}
+
+//===----------------------------------------------------------------------===//
+// Index tests.
+//===----------------------------------------------------------------------===//
+
TEST(DexIndex, Lookup) {
- auto I = DexIndex::build(generateSymbols({"ns::abc", "ns::xyz"}));
+ auto I = DexIndex::build(generateSymbols({"ns::abc", "ns::xyz"}), URISchemes);
EXPECT_THAT(lookup(*I, SymbolID("ns::abc")), UnorderedElementsAre("ns::abc"));
EXPECT_THAT(lookup(*I, {SymbolID("ns::abc"), SymbolID("ns::xyz")}),
UnorderedElementsAre("ns::abc", "ns::xyz"));
@@ -421,7 +476,8 @@
TEST(DexIndex, FuzzyFind) {
auto Index = DexIndex::build(
generateSymbols({"ns::ABC", "ns::BCD", "::ABC", "ns::nested::ABC",
- "other::ABC", "other::A"}));
+ "other::ABC", "other::A"}),
+ URISchemes);
FuzzyFindRequest Req;
Req.Query = "ABC";
Req.Scopes = {"ns::"};
@@ -443,7 +499,8 @@
TEST(DexIndexTest, FuzzyMatchQ) {
auto I = DexIndex::build(
- generateSymbols({"LaughingOutLoud", "LionPopulation", "LittleOldLady"}));
+ generateSymbols({"LaughingOutLoud", "LionPopulation", "LittleOldLady"}),
+ URISchemes);
FuzzyFindRequest Req;
Req.Query = "lol";
Req.MaxCandidateCount = 2;
@@ -461,12 +518,12 @@
symbol("2") /* duplicate */};
FuzzyFindRequest Req;
Req.Query = "2";
- DexIndex I(Symbols);
+ DexIndex I(Symbols, URISchemes);
EXPECT_THAT(match(I, Req), ElementsAre("2", "2"));
}
TEST(DexIndexTest, DexIndexLimitedNumMatches) {
- auto I = DexIndex::build(generateNumSymbols(0, 100));
+ auto I = DexIndex::build(generateNumSymbols(0, 100), URISchemes);
FuzzyFindRequest Req;
Req.Query = "5";
Req.MaxCandidateCount = 3;
@@ -478,73 +535,132 @@
TEST(DexIndexTest, FuzzyMatch) {
auto I = DexIndex::build(
- generateSymbols({"LaughingOutLoud", "LionPopulation", "LittleOldLady"}));
+ generateSymbols({"LaughingOutLoud", "LionPopulation", "LittleOldLady"}),
+ URISchemes);
FuzzyFindRequest Req;
Req.Query = "lol";
Req.MaxCandidateCount = 2;
EXPECT_THAT(match(*I, Req),
UnorderedElementsAre("LaughingOutLoud", "LittleOldLady"));
}
TEST(DexIndexTest, MatchQualifiedNamesWithoutSpecificScope) {
- auto I = DexIndex::build(generateSymbols({"a::y1", "b::y2", "y3"}));
+ auto I =
+ DexIndex::build(generateSymbols({"a::y1", "b::y2", "y3"}), URISchemes);
FuzzyFindRequest Req;
Req.Query = "y";
EXPECT_THAT(match(*I, Req), UnorderedElementsAre("a::y1", "b::y2", "y3"));
}
TEST(DexIndexTest, MatchQualifiedNamesWithGlobalScope) {
- auto I = DexIndex::build(generateSymbols({"a::y1", "b::y2", "y3"}));
+ auto I =
+ DexIndex::build(generateSymbols({"a::y1", "b::y2", "y3"}), URISchemes);
FuzzyFindRequest Req;
Req.Query = "y";
Req.Scopes = {""};
EXPECT_THAT(match(*I, Req), UnorderedElementsAre("y3"));
}
TEST(DexIndexTest, MatchQualifiedNamesWithOneScope) {
auto I = DexIndex::build(
- generateSymbols({"a::y1", "a::y2", "a::x", "b::y2", "y3"}));
+ generateSymbols({"a::y1", "a::y2", "a::x", "b::y2", "y3"}), URISchemes);
FuzzyFindRequest Req;
Req.Query = "y";
Req.Scopes = {"a::"};
EXPECT_THAT(match(*I, Req), UnorderedElementsAre("a::y1", "a::y2"));
}
TEST(DexIndexTest, MatchQualifiedNamesWithMultipleScopes) {
auto I = DexIndex::build(
- generateSymbols({"a::y1", "a::y2", "a::x", "b::y3", "y3"}));
+ generateSymbols({"a::y1", "a::y2", "a::x", "b::y3", "y3"}), URISchemes);
FuzzyFindRequest Req;
Req.Query = "y";
Req.Scopes = {"a::", "b::"};
EXPECT_THAT(match(*I, Req), UnorderedElementsAre("a::y1", "a::y2", "b::y3"));
}
TEST(DexIndexTest, NoMatchNestedScopes) {
- auto I = DexIndex::build(generateSymbols({"a::y1", "a::b::y2"}));
+ auto I = DexIndex::build(generateSymbols({"a::y1", "a::b::y2"}), URISchemes);
FuzzyFindRequest Req;
Req.Query = "y";
Req.Scopes = {"a::"};
EXPECT_THAT(match(*I, Req), UnorderedElementsAre("a::y1"));
}
TEST(DexIndexTest, IgnoreCases) {
- auto I = DexIndex::build(generateSymbols({"ns::ABC", "ns::abc"}));
+ auto I = DexIndex::build(generateSymbols({"ns::ABC", "ns::abc"}), URISchemes);
FuzzyFindRequest Req;
Req.Query = "AB";
Req.Scopes = {"ns::"};
EXPECT_THAT(match(*I, Req), UnorderedElementsAre("ns::ABC", "ns::abc"));
}
TEST(DexIndexTest, Lookup) {
- auto I = DexIndex::build(generateSymbols({"ns::abc", "ns::xyz"}));
+ auto I = DexIndex::build(generateSymbols({"ns::abc", "ns::xyz"}), URISchemes);
EXPECT_THAT(lookup(*I, SymbolID("ns::abc")), UnorderedElementsAre("ns::abc"));
EXPECT_THAT(lookup(*I, {SymbolID("ns::abc"), SymbolID("ns::xyz")}),
UnorderedElementsAre("ns::abc", "ns::xyz"));
EXPECT_THAT(lookup(*I, {SymbolID("ns::nonono"), SymbolID("ns::xyz")}),
UnorderedElementsAre("ns::xyz"));
EXPECT_THAT(lookup(*I, SymbolID("ns::nonono")), UnorderedElementsAre());
}
+TEST(DexIndexTest, ProximityPathsBoosting) {
+ // All strings should be stored in local variables to ensure their lifetime is
+ // the same as Slab's.
+ std::string Name = "abc";
+
+ // Populate Symbol Slab: generate two symbols with the same characteristics
+ // and hence quality, but different CanonicalDeclaration file.
+ SymbolSlab::Builder Builder;
+
+ Symbol RootSymbol;
+ RootSymbol.Name = Name;
+ llvm::SmallString<30> RootPath(testRoot());
+ llvm::sys::path::append(RootPath, "file.h");
+ auto RootURI = URI::create(RootPath, "unittest");
+ assert(RootURI);
+ std::string RootFilename = RootURI->toString();
+ RootSymbol.CanonicalDeclaration.FileURI = RootFilename;
+ RootSymbol.ID = SymbolID("RootSymbol");
+ Builder.insert(RootSymbol);
+
+ Symbol CloseSymbol;
+ CloseSymbol.Name = Name;
+ llvm::SmallString<30> ClosePath(testRoot());
+ llvm::sys::path::append(ClosePath, "/a/b/c/d/e/f/file.h");
+ auto CloseFileURI = URI::create(ClosePath, "unittest");
+ assert(CloseFileURI);
+ std::string CloseFileName = CloseFileURI->toString();
+ CloseSymbol.CanonicalDeclaration.FileURI = CloseFileName;
+ CloseSymbol.ID = SymbolID("CloseSymbol");
+ Builder.insert(CloseSymbol);
+
+ auto I = DexIndex::build(std::move(Builder).build(), URISchemes);
+
+ FuzzyFindRequest Req;
+ Req.Query = "abc";
+ // Return only one element, because the order of callback calls is not
+ // specified: items with lower score can be matched first. Therefore, to check
+ // whether the necessary item has higher score, test should only retrieve one
+ // element (with highest cumulative score).
+ Req.MaxCandidateCount = 1;
+
+ // FuzzyFind request comes from the file which is far from the root: expect
+ // CloseSymbol to come out.
+ llvm::SmallString<30> RequestPath(testRoot());
+ llvm::sys::path::append(RequestPath, "/a/b/c/d/e/f/file.h");
+ Req.ProximityPaths = {RequestPath.str()};
+ EXPECT_THAT(matchDeclURIs(*I, Req), ElementsAre(CloseFileURI->toString()));
+
+ // FuzzyFind request comes from the file which is close to the root: expect
+ // RootSymbol to come out.
+ RequestPath = testRoot();
+ llvm::sys::path::append(RequestPath, "file.h");
+ Req.ProximityPaths = {RequestPath.str()};
+ EXPECT_THAT(matchDeclURIs(*I, Req), ElementsAre(RootURI->toString()));
+}
+
} // namespace
} // namespace dex
} // namespace clangd
Index: clang-tools-extra/clangd/tool/ClangdMain.cpp
===================================================================
--- clang-tools-extra/clangd/tool/ClangdMain.cpp
+++ clang-tools-extra/clangd/tool/ClangdMain.cpp
@@ -284,8 +284,8 @@
// Load the index asynchronously. Meanwhile SwapIndex returns no results.
SwapIndex *Placeholder;
StaticIdx.reset(Placeholder = new SwapIndex(llvm::make_unique<MemIndex>()));
- runAsync<void>([Placeholder] {
- if (auto Idx = loadIndex(YamlSymbolFile))
+ runAsync<void>([Placeholder, Opts] {
+ if (auto Idx = loadIndex(YamlSymbolFile, Opts.URISchemes, UseDex))
Placeholder->reset(std::move(Idx));
});
}
Index: clang-tools-extra/clangd/index/dex/Token.h
===================================================================
--- clang-tools-extra/clangd/index/dex/Token.h
+++ clang-tools-extra/clangd/index/dex/Token.h
@@ -41,22 +41,31 @@
/// Kind specifies Token type which defines semantics for the internal
/// representation. Each Kind has different representation stored in Data
/// field.
+ // FIXME(kbobyrev): Storing Data hash would be more efficient than storing raw
+ // strings. For example, PathURI store URIs of each directory and its parents,
+ // which induces a lot of overhead because these paths tend to be long and
+ // each parent directory is a prefix.
enum class Kind {
/// Represents trigram used for fuzzy search of unqualified symbol names.
///
/// Data contains 3 bytes with trigram contents.
Trigram,
/// Scope primitives, e.g. "symbol belongs to namespace foo::bar".
///
- /// Data stroes full scope name , e.g. "foo::bar::baz::" or "" (for global
+ /// Data stroes full scope name, e.g. "foo::bar::baz::" or "" (for global
/// scope).
Scope,
+ /// Path Proximity URI to symbol declaration.
+ ///
+ /// Data stores path URI of symbol declaration file or its parent.
+ ///
+ /// Example: "file:///path/to/clang-tools-extra/clangd/index/SymbolIndex.h"
+ /// and some amount of its parents.
+ ProximityURI,
/// Internal Token type for invalid/special tokens, e.g. empty tokens for
/// llvm::DenseMap.
Sentinel,
/// FIXME(kbobyrev): Add other Token Kinds
- /// * Path with full or relative path to the directory in which symbol is
- /// defined
/// * Type with qualified type name or its USR
};
@@ -81,6 +90,17 @@
}
};
+/// Returns Search Token for a number of parent directories of given Path.
+/// Should be used within the index build process.
+std::vector<Token> generatePathProximityTokens(llvm::StringRef Path);
+
+/// Returns Search Token and its distance from the absolute path for the first
+/// URI scheme allows valid conversions from AbsolutePath. The first token of
+/// result is always the AbsolutePath converted to a valid scheme.
+std::vector<Token>
+generateQueryPathProximityTokens(llvm::StringRef AbsolutePath,
+ llvm::ArrayRef<std::string> URISchemes);
+
} // namespace dex
} // namespace clangd
} // namespace clang
Index: clang-tools-extra/clangd/index/dex/Token.cpp
===================================================================
--- /dev/null
+++ clang-tools-extra/clangd/index/dex/Token.cpp
@@ -0,0 +1,74 @@
+//===--- Token.cpp - Symbol Search primitive --------------------*- C++ -*-===//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Token.h"
+#include "URI.h"
+#include "llvm/Support/Path.h"
+#include <cassert>
+
+namespace clang {
+namespace clangd {
+namespace dex {
+
+std::vector<Token> generatePathProximityTokens(llvm::StringRef URIPath) {
+ std::vector<Token> Result;
+ // Don't generate any tokens for empty paths.
+ if (URIPath.empty())
+ return Result;
+ auto ParsedURI = URI::parse(URIPath);
+ assert(ParsedURI &&
+ "Non-empty argument of generateProximityURIs() should be a valid "
+ "URI.");
+ const auto Scheme = ParsedURI->scheme();
+ const auto Authority = ParsedURI->authority();
+ StringRef Path = ParsedURI->body();
+ // FIXME(kbobyrev): Currently, this is a heuristic which defines the maximum
+ // size of resulting vector. Some projects might want to have higher limit if
+ // the file hierarchy is deeper. For the generic case, it would be useful to
+ // calculate Limit in the index build stage by calculating the maximum depth
+ // of the project source tree at runtime.
+ size_t Limit = 5;
+ // Insert original URI before the loop: this would save a redundant iteration
+ // with a URI parse.
+ Result.emplace_back(Token::Kind::ProximityURI, ParsedURI->toString());
+ while (!Path.empty() && --Limit > 0) {
+ // FIXME(kbobyrev): Parsing and encoding path to URIs is not necessary and
+ // could be optimized.
+ Path = llvm::sys::path::parent_path(Path, llvm::sys::path::Style::posix);
+ URI TokenURI(Scheme, Authority, Path);
+ Result.emplace_back(Token::Kind::ProximityURI, TokenURI.toString());
+ }
+ return Result;
+}
+
+std::vector<Token>
+generateQueryPathProximityTokens(llvm::StringRef AbsolutePath,
+ llvm::ArrayRef<std::string> URISchemes) {
+ std::vector<Token> Result;
+ for (const auto Scheme : URISchemes) {
+ auto URI = URI::create(AbsolutePath, Scheme);
+ // For some paths, conversion to different URI schemes is impossible. These
+ // should be just skipped.
+ if (!URI) {
+ // Ignore the error.
+ llvm::consumeError(URI.takeError());
+ continue;
+ }
+ for (const auto ProximityPath :
+ generatePathProximityTokens(URI->toString()))
+ Result.emplace_back(ProximityPath);
+ // As soon as one valid URI scheme for given absolute path is found, the
+ // token generation process should be stopped.
+ break;
+ }
+ return Result;
+}
+
+} // namespace dex
+} // namespace clangd
+} // namespace clang
Index: clang-tools-extra/clangd/index/dex/Iterator.cpp
===================================================================
--- clang-tools-extra/clangd/index/dex/Iterator.cpp
+++ clang-tools-extra/clangd/index/dex/Iterator.cpp
@@ -392,7 +392,7 @@
std::vector<std::pair<DocID, float>> consume(Iterator &It) {
std::vector<std::pair<DocID, float>> Result;
for (; !It.reachedEnd(); It.advance())
- Result.emplace_back(It.peek(), It.consume());
+ Result.push_back({It.peek(), It.consume()});
return Result;
}
Index: clang-tools-extra/clangd/index/dex/DexIndex.h
===================================================================
--- clang-tools-extra/clangd/index/dex/DexIndex.h
+++ clang-tools-extra/clangd/index/dex/DexIndex.h
@@ -39,22 +39,26 @@
class DexIndex : public SymbolIndex {
public:
// All symbols must outlive this index.
- template <typename Range> DexIndex(Range &&Symbols) {
+ template <typename Range>
+ DexIndex(Range &&Symbols, llvm::ArrayRef<std::string> URISchemes)
+ : URISchemes(URISchemes) {
for (auto &&Sym : Symbols)
this->Symbols.push_back(&Sym);
buildIndex();
}
// Symbols are owned by BackingData, Index takes ownership.
template <typename Range, typename Payload>
- DexIndex(Range &&Symbols, Payload &&BackingData)
- : DexIndex(std::forward<Range>(Symbols)) {
+ DexIndex(Range &&Symbols, Payload &&BackingData,
+ llvm::ArrayRef<std::string> URISchemes)
+ : DexIndex(std::forward<Range>(Symbols), URISchemes) {
KeepAlive = std::shared_ptr<void>(
std::make_shared<Payload>(std::move(BackingData)), nullptr);
}
/// Builds an index from a slab. The index takes ownership of the slab.
- static std::unique_ptr<SymbolIndex> build(SymbolSlab Slab) {
- return llvm::make_unique<DexIndex>(Slab, std::move(Slab));
+ static std::unique_ptr<SymbolIndex>
+ build(SymbolSlab Slab, llvm::ArrayRef<std::string> URISchemes) {
+ return llvm::make_unique<DexIndex>(Slab, std::move(Slab), URISchemes);
}
bool
@@ -72,21 +76,25 @@
private:
void buildIndex();
+ /// Stores symbols sorted in the descending order of symbol quality..
std::vector<const Symbol *> Symbols;
+ /// SymbolQuality[I] is the quality of Symbols[I].
+ std::vector<float> SymbolQuality;
llvm::DenseMap<SymbolID, const Symbol *> LookupTable;
- llvm::DenseMap<const Symbol *, float> SymbolQuality;
- // Inverted index is a mapping from the search token to the posting list,
- // which contains all items which can be characterized by such search token.
- // For example, if the search token is scope "std::", the corresponding
- // posting list would contain all indices of symbols defined in namespace std.
- // Inverted index is used to retrieve posting lists which are processed during
- // the fuzzyFind process.
+ /// Inverted index is a mapping from the search token to the posting list,
+ /// which contains all items which can be characterized by such search token.
+ /// For example, if the search token is scope "std::", the corresponding
+ /// posting list would contain all indices of symbols defined in namespace
+ /// std. Inverted index is used to retrieve posting lists which are processed
+ /// during the fuzzyFind process.
llvm::DenseMap<Token, PostingList> InvertedIndex;
std::shared_ptr<void> KeepAlive; // poor man's move-only std::any
+
+ const std::vector<std::string> URISchemes;
};
} // namespace dex
} // namespace clangd
} // namespace clang
-#endif
+#endif // LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_DEXINDEX_H
Index: clang-tools-extra/clangd/index/dex/DexIndex.cpp
===================================================================
--- clang-tools-extra/clangd/index/dex/DexIndex.cpp
+++ clang-tools-extra/clangd/index/dex/DexIndex.cpp
@@ -8,8 +8,10 @@
//===----------------------------------------------------------------------===//
#include "DexIndex.h"
+#include "../../FileDistance.h"
#include "../../FuzzyMatch.h"
#include "../../Logger.h"
+#include "../../Quality.h"
#include <algorithm>
#include <queue>
@@ -26,28 +28,39 @@
// Returns the tokens which are given symbols's characteristics. For example,
// trigrams and scopes.
// FIXME(kbobyrev): Support more token types:
-// * Path proximity
// * Types
+// * Namespace proximity
std::vector<Token> generateSearchTokens(const Symbol &Sym) {
std::vector<Token> Result = generateIdentifierTrigrams(Sym.Name);
Result.emplace_back(Token::Kind::Scope, Sym.Scope);
+ for (const auto &ProximityToken :
+ generatePathProximityTokens(Sym.CanonicalDeclaration.FileURI))
+ Result.emplace_back(ProximityToken);
return Result;
}
} // namespace
void DexIndex::buildIndex() {
- for (const Symbol *Sym : Symbols) {
+ std::vector<std::pair<float, const Symbol *>> ScoredSymbols(Symbols.size());
+
+ for (size_t I = 0; I < Symbols.size(); ++I) {
+ const Symbol *Sym = Symbols[I];
LookupTable[Sym->ID] = Sym;
- SymbolQuality[Sym] = quality(*Sym);
+ ScoredSymbols[I] = {static_cast<float>(quality(*Sym)), Sym};
}
// Symbols are sorted by symbol qualities so that items in the posting lists
// are stored in the descending order of symbol quality.
- std::sort(begin(Symbols), end(Symbols),
- [&](const Symbol *LHS, const Symbol *RHS) {
- return SymbolQuality[LHS] > SymbolQuality[RHS];
- });
+ std::sort(begin(ScoredSymbols), end(ScoredSymbols));
+
+ // SymbolQuality was empty up until now.
+ SymbolQuality.resize(Symbols.size());
+ // Populate internal storage using Symbol + Score pairs.
+ for (size_t I = 0; I < ScoredSymbols.size(); ++I) {
+ SymbolQuality[I] = ScoredSymbols[I].first;
+ Symbols[I] = ScoredSymbols[I].second;
+ }
// Populate TempInvertedIndex with posting lists for index symbols.
for (DocID SymbolRank = 0; SymbolRank < Symbols.size(); ++SymbolRank) {
@@ -96,6 +109,40 @@
if (!ScopeIterators.empty())
TopLevelChildren.push_back(createOr(move(ScopeIterators)));
+ // Add proximity paths boosting.
+ std::vector<std::unique_ptr<Iterator>> BoostingIterators;
+ // Try to build BOOST iterator for each Proximity Path provided by
+ // FuzzyFindRequest. Boosting factor should depend on the distance to the
+ // Proximity Path: the closer processed path is, the higher boosting factor.
+ SymbolRelevanceSignals PathProximitySignals;
+ for (const auto &ProximityPath : Req.ProximityPaths) {
+ const auto PathProximityTokens =
+ generateQueryPathProximityTokens(ProximityPath, URISchemes);
+ llvm::StringMap<SourceParams> URIToParams;
+ // Use default distance parameters for the first URI in ProximityURIs, which
+ // corresponds to the query path URI.
+ URIToParams[PathProximityTokens.front().Data] = SourceParams();
+ URIDistance DistanceCalculator(URIToParams);
+ PathProximitySignals.FileProximityMatch = &DistanceCalculator;
+ for (const auto &ProximityToken : PathProximityTokens) {
+ const auto It = InvertedIndex.find(ProximityToken);
+ if (It != InvertedIndex.end()) {
+ // FIXME(kbobyrev): Append LIMIT on top of every BOOST iterator.
+ PathProximitySignals.SymbolURI = ProximityToken.Data;
+ float BoostFactor = 1 + PathProximitySignals.evaluate();
+ BoostingIterators.push_back(
+ createBoost(create(It->second), BoostFactor));
+ }
+ }
+ }
+ // Boosting iterators do not actually filter symbols. In order to preserve
+ // the validity of resulting query, TRUE iterator should be added along
+ // BOOSTs.
+ if (!BoostingIterators.empty()) {
+ BoostingIterators.push_back(createTrue(Symbols.size()));
+ TopLevelChildren.push_back(createOr(move(BoostingIterators)));
+ }
+
// Use TRUE iterator if both trigrams and scopes from the query are not
// present in the symbol index.
auto QueryIterator = TopLevelChildren.empty()
@@ -106,32 +153,37 @@
// FIXME(kbobyrev): Pre-scoring retrieval threshold should be adjusted as
// using 100x of the requested number might not be good in practice, e.g.
// when the requested number of items is small.
- const unsigned ItemsToRetrieve = 100 * Req.MaxCandidateCount;
+ const size_t ItemsToRetrieve = 100 * Req.MaxCandidateCount;
auto Root = createLimit(move(QueryIterator), ItemsToRetrieve);
- // FIXME(kbobyrev): Add boosting to the query and utilize retrieved
- // boosting scores.
- std::vector<std::pair<DocID, float>> SymbolDocIDs = consume(*Root);
-
- // Retrieve top Req.MaxCandidateCount items.
- std::priority_queue<std::pair<float, const Symbol *>> Top;
- for (const auto &P : SymbolDocIDs) {
- const DocID SymbolDocID = P.first;
+
+ std::vector<std::pair<DocID, float>> IDAndScores = consume(*Root);
+
+ auto Compare = [](const std::pair<DocID, float> &LHS,
+ const std::pair<DocID, float> &RHS) {
+ return LHS.second > RHS.second;
+ };
+ TopN<std::pair<DocID, float>, decltype(Compare)> Top(Req.MaxCandidateCount,
+ Compare);
+ for (const auto &IDAndScore : IDAndScores) {
+ const DocID SymbolDocID = IDAndScore.first;
const auto *Sym = Symbols[SymbolDocID];
const llvm::Optional<float> Score = Filter.match(Sym->Name);
if (!Score)
continue;
- // Multiply score by a negative factor so that Top stores items with the
- // highest actual score.
- Top.emplace(-(*Score) * SymbolQuality.find(Sym)->second, Sym);
- if (Top.size() > Req.MaxCandidateCount) {
+ // Combine Fuzzy Matching score, precomputed symbol quality and boosting
+ // score for a cumulative final symbol score.
+ const float FinalScore =
+ (*Score) * SymbolQuality[SymbolDocID] * IDAndScore.second;
+ // If Top.push(...) returns true, it means that it had to pop an item. In
+ // this case, it is possible to retrieve more symbols.
+ if (Top.push({SymbolDocID, FinalScore}))
More = true;
- Top.pop();
- }
}
- // Apply callback to the top Req.MaxCandidateCount items.
- for (; !Top.empty(); Top.pop())
- Callback(*Top.top().second);
+ // Apply callback to the top Req.MaxCandidateCount items in the descending
+ // order of cumulative score.
+ for (const auto &Item : std::move(Top).items())
+ Callback(*Symbols[Item.first]);
return More;
}
Index: clang-tools-extra/clangd/index/SymbolYAML.h
===================================================================
--- clang-tools-extra/clangd/index/SymbolYAML.h
+++ clang-tools-extra/clangd/index/SymbolYAML.h
@@ -45,6 +45,7 @@
// The size of global symbols should be relatively small, so that all symbols
// can be managed in memory.
std::unique_ptr<SymbolIndex> loadIndex(llvm::StringRef SymbolFilename,
+ llvm::ArrayRef<std::string> URISchemes,
bool UseDex = true);
} // namespace clangd
Index: clang-tools-extra/clangd/index/SymbolYAML.cpp
===================================================================
--- clang-tools-extra/clangd/index/SymbolYAML.cpp
+++ clang-tools-extra/clangd/index/SymbolYAML.cpp
@@ -185,6 +185,7 @@
}
std::unique_ptr<SymbolIndex> loadIndex(llvm::StringRef SymbolFilename,
+ llvm::ArrayRef<std::string> URISchemes,
bool UseDex) {
trace::Span OverallTracer("LoadIndex");
auto Buffer = llvm::MemoryBuffer::getFile(SymbolFilename);
@@ -209,7 +210,7 @@
if (!Slab)
return nullptr;
trace::Span Tracer("BuildIndex");
- return UseDex ? dex::DexIndex::build(std::move(*Slab))
+ return UseDex ? dex::DexIndex::build(std::move(*Slab), URISchemes)
: MemIndex::build(std::move(*Slab), RefSlab());
}
Index: clang-tools-extra/clangd/Quality.cpp
===================================================================
--- clang-tools-extra/clangd/Quality.cpp
+++ clang-tools-extra/clangd/Quality.cpp
@@ -6,6 +6,7 @@
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
+
#include "Quality.h"
#include "FileDistance.h"
#include "URI.h"
@@ -302,7 +303,7 @@
return {0.f, 0u};
unsigned Distance = D->distance(SymbolURI);
// Assume approximately default options are used for sensible scoring.
- return {std::exp(Distance * -0.4f / FileDistanceOptions().UpCost), Distance};
+ return {std::exp(Distance * -.4f / FileDistanceOptions().UpCost), Distance};
}
float SymbolRelevanceSignals::evaluate() const {
Index: clang-tools-extra/clangd/FileDistance.h
===================================================================
--- clang-tools-extra/clangd/FileDistance.h
+++ clang-tools-extra/clangd/FileDistance.h
@@ -37,6 +37,9 @@
//
//===----------------------------------------------------------------------===//
+#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_FILEDISTANCE_H
+#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_FILEDISTANCE_H
+
#include "URI.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
@@ -107,3 +110,5 @@
} // namespace clangd
} // namespace clang
+
+#endif // LLVM_CLANG_TOOLS_EXTRA_CLANGD_FILEDISTANCE_H
Index: clang-tools-extra/clangd/CMakeLists.txt
===================================================================
--- clang-tools-extra/clangd/CMakeLists.txt
+++ clang-tools-extra/clangd/CMakeLists.txt
@@ -48,6 +48,7 @@
index/dex/DexIndex.cpp
index/dex/Iterator.cpp
+ index/dex/Token.cpp
index/dex/Trigram.cpp
LINK_LIBS
_______________________________________________
cfe-commits mailing list
[email protected]
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits