kbobyrev updated this revision to Diff 388461.
kbobyrev added a comment.

Add improvements for identifier trigram generation and make query & id
generators consistent with each other.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D113995/new/

https://reviews.llvm.org/D113995

Files:
  clang-tools-extra/clangd/index/dex/Trigram.cpp
  clang-tools-extra/clangd/unittests/DexTests.cpp

Index: clang-tools-extra/clangd/unittests/DexTests.cpp
===================================================================
--- clang-tools-extra/clangd/unittests/DexTests.cpp
+++ clang-tools-extra/clangd/unittests/DexTests.cpp
@@ -386,30 +386,35 @@
               trigramsAre({"c", "cl", "cla", "lan", "ang", "ngd"}));
 
   EXPECT_THAT(identifierTrigramTokens("abc_def"),
-              trigramsAre({"a", "ab", "ad", "abc", "abd", "ade", "bcd", "bde",
-                           "cde", "def"}));
+              trigramsAre({"a", "d", "ab", "ad", "de", "abc", "abd", "ade",
+                           "bcd", "bde", "cde", "def"}));
 
   EXPECT_THAT(identifierTrigramTokens("a_b_c_d_e_"),
-              trigramsAre({"a", "a_", "ab", "abc", "bcd", "cde"}));
+              trigramsAre({"a", "b", "ab", "abc", "bcd", "cde"}));
 
   EXPECT_THAT(identifierTrigramTokens("unique_ptr"),
-              trigramsAre({"u", "un", "up", "uni", "unp", "upt", "niq", "nip",
-                           "npt", "iqu", "iqp", "ipt", "que", "qup", "qpt",
-                           "uep", "ept", "ptr"}));
+              trigramsAre({"u",   "p",   "un",  "up",  "pt",  "uni", "unp",
+                           "upt", "niq", "nip", "npt", "iqu", "iqp", "ipt",
+                           "que", "qup", "qpt", "uep", "ept", "ptr"}));
 
-  EXPECT_THAT(
-      identifierTrigramTokens("TUDecl"),
-      trigramsAre({"t", "tu", "td", "tud", "tde", "ude", "dec", "ecl"}));
+  EXPECT_THAT(identifierTrigramTokens("TUDecl"),
+              trigramsAre({"t", "d", "tu", "td", "de", "tud", "tde", "ude",
+                           "dec", "ecl"}));
 
   EXPECT_THAT(identifierTrigramTokens("IsOK"),
-              trigramsAre({"i", "is", "io", "iso", "iok", "sok"}));
+              trigramsAre({"i", "o", "is", "ok", "io", "iso", "iok", "sok"}));
 
-  EXPECT_THAT(
-      identifierTrigramTokens("abc_defGhij__klm"),
-      trigramsAre({"a",   "ab",  "ad",  "abc", "abd", "ade", "adg", "bcd",
-                   "bde", "bdg", "cde", "cdg", "def", "deg", "dgh", "dgk",
-                   "efg", "egh", "egk", "fgh", "fgk", "ghi", "ghk", "gkl",
-                   "hij", "hik", "hkl", "ijk", "ikl", "jkl", "klm"}));
+  EXPECT_THAT(identifierTrigramTokens("_pb"),
+              trigramsAre({"_", "_p", "p", "pb"}));
+  EXPECT_THAT(identifierTrigramTokens("__pb"),
+              trigramsAre({"_", "_p", "p", "pb"}));
+
+  EXPECT_THAT(identifierTrigramTokens("abc_defGhij__klm"),
+              trigramsAre({"a",   "d",   "ab",  "ad",  "de",  "abc", "abd",
+                           "ade", "adg", "bcd", "bde", "bdg", "cde", "cdg",
+                           "def", "deg", "dgh", "dgk", "efg", "egh", "egk",
+                           "fgh", "fgk", "ghi", "ghk", "gkl", "hij", "hik",
+                           "hkl", "ijk", "ikl", "jkl", "klm"}));
 }
 
 TEST(DexTrigrams, QueryTrigrams) {
@@ -419,8 +424,16 @@
 
   EXPECT_THAT(generateQueryTrigrams(""), trigramsAre({}));
   EXPECT_THAT(generateQueryTrigrams("_"), trigramsAre({"_"}));
-  EXPECT_THAT(generateQueryTrigrams("__"), trigramsAre({"__"}));
-  EXPECT_THAT(generateQueryTrigrams("___"), trigramsAre({}));
+  EXPECT_THAT(generateQueryTrigrams("__"), trigramsAre({"_"}));
+  EXPECT_THAT(generateQueryTrigrams("___"), trigramsAre({"_"}));
+
+  EXPECT_THAT(generateQueryTrigrams("m_"), trigramsAre({"m"}));
+
+  EXPECT_THAT(generateQueryTrigrams("p_b"), trigramsAre({"pb"}));
+  EXPECT_THAT(generateQueryTrigrams("pb_"), trigramsAre({"pb"}));
+  EXPECT_THAT(generateQueryTrigrams("_p"), trigramsAre({"_p"}));
+  EXPECT_THAT(generateQueryTrigrams("_pb_"), trigramsAre({"pb"}));
+  EXPECT_THAT(generateQueryTrigrams("__pb"), trigramsAre({"pb"}));
 
   EXPECT_THAT(generateQueryTrigrams("X86"), trigramsAre({"x86"}));
 
@@ -525,25 +538,45 @@
 }
 
 TEST(DexTest, ShortQuery) {
-  auto I = Dex::build(generateSymbols({"OneTwoThreeFour"}), RefSlab(),
+  auto I = Dex::build(generateSymbols({"_OneTwoFourSix"}), RefSlab(),
                       RelationSlab());
   FuzzyFindRequest Req;
   Req.AnyScope = true;
   bool Incomplete;
 
-  EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("OneTwoThreeFour"));
+  EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("_OneTwoFourSix"));
   EXPECT_FALSE(Incomplete) << "Empty string is not a short query";
 
-  Req.Query = "t";
+  Req.Query = "o";
+  EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("_OneTwoFourSix"));
+  EXPECT_TRUE(Incomplete) << "Using first head as unigram";
+
+  Req.Query = "_o";
+  EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("_OneTwoFourSix"));
+  EXPECT_TRUE(Incomplete) << "Using delimiter and first head as bigram";
+
+  Req.Query = "on";
+  EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("_OneTwoFourSix"));
+  EXPECT_TRUE(Incomplete) << "Using first head and tail as bigram";
+
+  Req.Query = "ot";
+  EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("_OneTwoFourSix"));
+  EXPECT_TRUE(Incomplete) << "Using first two heads as bigram";
+
+  Req.Query = "tw";
+  EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("_OneTwoFourSix"));
+  EXPECT_TRUE(Incomplete) << "Using first two second head and tail as bigram";
+
+  Req.Query = "tf";
   EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre());
   EXPECT_TRUE(Incomplete) << "Short queries have different semantics";
 
-  Req.Query = "tt";
+  Req.Query = "fo";
   EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre());
   EXPECT_TRUE(Incomplete) << "Short queries have different semantics";
 
-  Req.Query = "ttf";
-  EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("OneTwoThreeFour"));
+  Req.Query = "tfs";
+  EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("_OneTwoFourSix"));
   EXPECT_FALSE(Incomplete) << "3-char string is not a short query";
 }
 
Index: clang-tools-extra/clangd/index/dex/Trigram.cpp
===================================================================
--- clang-tools-extra/clangd/index/dex/Trigram.cpp
+++ clang-tools-extra/clangd/index/dex/Trigram.cpp
@@ -12,8 +12,10 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringExtras.h"
 #include <cctype>
+#include <limits>
 #include <queue>
 #include <string>
 
@@ -25,7 +27,7 @@
 template <typename Func>
 static void identifierTrigrams(llvm::StringRef Identifier, Func Out) {
   // Apply fuzzy matching text segmentation.
-  std::vector<CharRole> Roles(Identifier.size());
+  llvm::SmallVector<CharRole> Roles(Identifier.size());
   calculateRoles(Identifier,
                  llvm::makeMutableArrayRef(Roles.data(), Identifier.size()));
 
@@ -39,7 +41,7 @@
   //
   // Next stores tuples of three indices in the presented order, if a variant is
   // not available then 0 is stored.
-  std::vector<std::array<unsigned, 3>> Next(LowercaseIdentifier.size());
+  llvm::SmallVector<std::array<unsigned, 3>> Next(LowercaseIdentifier.size());
   unsigned NextTail = 0, NextHead = 0;
   for (int I = LowercaseIdentifier.size() - 1; I >= 0; --I) {
     Next[I] = {{NextTail, NextHead}};
@@ -66,16 +68,50 @@
       }
     }
   }
-  // Emit short-query trigrams: FooBar -> f, fo, fb.
-  if (!LowercaseIdentifier.empty())
-    Out(Trigram(LowercaseIdentifier[0]));
-  if (LowercaseIdentifier.size() >= 2)
-    Out(Trigram(LowercaseIdentifier[0], LowercaseIdentifier[1]));
-  for (size_t I = 1; I < LowercaseIdentifier.size(); ++I)
-    if (Roles[I] == Head) {
-      Out(Trigram(LowercaseIdentifier[0], LowercaseIdentifier[I]));
+  // Short queries semantics are different: emit incomplete trigrams for them.
+  //
+  // First, the first separator if it is preceding the alphanumeric symbols.
+  size_t FirstSeparator = std::numeric_limits<size_t>::max();
+  for (size_t I = 0; I < LowercaseIdentifier.size(); ++I) {
+    if (Roles[I] == Head || Roles[I] == Tail)
+      break;
+    if (Roles[I] == Separator) {
+      FirstSeparator = I;
+      Out(Trigram(LowercaseIdentifier[I]));
       break;
     }
+  }
+  // Then, find the first (up to) two heads and emit the following incomplete
+  // trigrams (if they exist):
+  //
+  // - First head
+  // - Fist head + subsequent tail
+  // - First separator + first head
+  // - Second head
+  // - First head + second head
+  // - Second head + subsequent tail
+  llvm::SmallVector<size_t, 2> Heads;
+  for (size_t I = 0; I < LowercaseIdentifier.size() && Heads.size() < 2; ++I)
+    if (Roles[I] == Head)
+      Heads.push_back(I);
+  if (!Heads.empty()) {
+    Out(Trigram(LowercaseIdentifier[Heads[0]]));
+    if (FirstSeparator != std::numeric_limits<size_t>::max())
+      Out(Trigram(LowercaseIdentifier[FirstSeparator],
+                  LowercaseIdentifier[Heads[0]]));
+    size_t NextIndex = Heads[0] + 1;
+    if (NextIndex < LowercaseIdentifier.size() && Roles[NextIndex] == Tail)
+      Out(Trigram(LowercaseIdentifier[Heads[0]],
+                  LowercaseIdentifier[NextIndex]));
+  }
+  if (Heads.size() == 2) {
+    Out(Trigram(LowercaseIdentifier[Heads[1]]));
+    Out(Trigram(LowercaseIdentifier[Heads[0]], LowercaseIdentifier[Heads[1]]));
+    size_t NextIndex = Heads.back() + 1;
+    if (NextIndex < LowercaseIdentifier.size() && Roles[NextIndex] == Tail)
+      Out(Trigram(LowercaseIdentifier[Heads[1]],
+                  LowercaseIdentifier[NextIndex]));
+  }
 }
 
 void generateIdentifierTrigrams(llvm::StringRef Identifier,
@@ -101,17 +137,31 @@
 std::vector<Token> generateQueryTrigrams(llvm::StringRef Query) {
   if (Query.empty())
     return {};
-  std::string LowercaseQuery = Query.lower();
-  if (Query.size() < 3) // short-query trigrams only
-    return {Token(Token::Kind::Trigram, LowercaseQuery)};
 
   // Apply fuzzy matching text segmentation.
-  std::vector<CharRole> Roles(Query.size());
+  llvm::SmallVector<CharRole> Roles(Query.size());
   calculateRoles(Query, llvm::makeMutableArrayRef(Roles.data(), Query.size()));
 
+  std::string LowercaseQuery = Query.lower();
+
+  unsigned ValidSymbols =
+      llvm::count_if(Roles, [](CharRole R) { return R == Head || R == Tail; });
+  // For queries with very few letters, emulate what generateIdentifierTrigrams
+  // outputs for the beginning of the Identifier.
+  if (ValidSymbols < 3) {
+    // If a bigram can't be formed, we might prepend a separator.
+    std::string Letters = Roles.front() == Separator && ValidSymbols < 2
+                              ? std::string(1, Query.front())
+                              : "";
+    for (unsigned I = 0; I < LowercaseQuery.size(); ++I)
+      if (Roles[I] == Head || Roles[I] == Tail)
+        Letters += LowercaseQuery[I];
+    return {Token(Token::Kind::Trigram, Letters)};
+  }
+
   llvm::DenseSet<Token> UniqueTrigrams;
   std::string Chars;
-  for (unsigned I = 0; I < Query.size(); ++I) {
+  for (unsigned I = 0; I < LowercaseQuery.size(); ++I) {
     if (Roles[I] != Head && Roles[I] != Tail)
       continue; // Skip delimiters.
     Chars.push_back(LowercaseQuery[I]);
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to