[PATCH] D44003: [clangd] Fix unintentionally loose fuzzy matching, and the tests masking it.

Sam McCall via Phabricator via cfe-commits Fri, 02 Mar 2018 04:49:19 -0800

sammccall created this revision.
Herald added subscribers: cfe-commits, ioeric, jkorous-apple, ilya-biryukov, 
klimek.
sammccall updated this revision to Diff 136719.
sammccall added a comment.


test tweaks


The intent was that [ar] doesn't match "FooBar"; the first character must match
a Head character (hard requirement, not just a low score).
This matches VSCode, and was "tested" but the tests were defective.

The tests expected matches("FooBar") to fail for lack of a match. But instead
it fails because the string should be annotated - matches("FooB[ar]").
This patch makes matches("FooBar") ignore annotations, as was intended.

Fixing the code to reject weak matches for the first char causes problems:

- [bre] no longer matches "HTMLBRElement". We allow matching against an 
uppercase char even if we don't think it's head. Only do this if there's at 
least one lowercase, to avoid triggering on MACROS
- [print] no longer matches "sprintf". This is hard to fix without false 
positives (e.g. [int] vs "sprintf"]) This patch leaves this case broken. A 
future patch will add a dictionary providing custom segmentation to common 
names from the standard library.

Fixed a couple of index tests that indirectly relied on broken fuzzy matching.
Added const in a couple of missing places for consistency with new code.


Repository:
  rCTE Clang Tools Extra

https://reviews.llvm.org/D44003

Files:
  clangd/FuzzyMatch.cpp
  clangd/FuzzyMatch.h
  unittests/clangd/FuzzyMatchTests.cpp
  unittests/clangd/IndexTests.cpp

Index: unittests/clangd/IndexTests.cpp
===================================================================
--- unittests/clangd/IndexTests.cpp
+++ unittests/clangd/IndexTests.cpp
@@ -116,14 +116,6 @@
   EXPECT_TRUE(Symbols.expired());
 }
 
-TEST(MemIndexTest, MemIndexMatchSubstring) {
-  MemIndex I;
-  I.build(generateNumSymbols(5, 25));
-  FuzzyFindRequest Req;
-  Req.Query = "5";
-  EXPECT_THAT(match(I, Req), UnorderedElementsAre("5", "15", "25"));
-}
-
 TEST(MemIndexTest, MemIndexDeduplicate) {
   auto Symbols = generateNumSymbols(0, 10);
 
@@ -166,46 +158,46 @@
 
 TEST(MemIndexTest, MatchQualifiedNamesWithoutSpecificScope) {
   MemIndex I;
-  I.build(generateSymbols({"a::xyz", "b::yz", "yz"}));
+  I.build(generateSymbols({"a::y1", "b::y2", "y3"}));
   FuzzyFindRequest Req;
   Req.Query = "y";
-  EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::xyz", "b::yz", "yz"));
+  EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::y1", "b::y2", "y3"));
 }
 
 TEST(MemIndexTest, MatchQualifiedNamesWithGlobalScope) {
   MemIndex I;
-  I.build(generateSymbols({"a::xyz", "b::yz", "yz"}));
+  I.build(generateSymbols({"a::y1", "b::y2", "y3"}));
   FuzzyFindRequest Req;
   Req.Query = "y";
   Req.Scopes = {""};
-  EXPECT_THAT(match(I, Req), UnorderedElementsAre("yz"));
+  EXPECT_THAT(match(I, Req), UnorderedElementsAre("y3"));
 }
 
 TEST(MemIndexTest, MatchQualifiedNamesWithOneScope) {
   MemIndex I;
-  I.build(generateSymbols({"a::xyz", "a::yy", "a::xz", "b::yz", "yz"}));
+  I.build(generateSymbols({"a::y1", "a::y2", "a::x", "b::y2", "y3"}));
   FuzzyFindRequest Req;
   Req.Query = "y";
   Req.Scopes = {"a"};
-  EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::xyz", "a::yy"));
+  EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::y1", "a::y2"));
 }
 
 TEST(MemIndexTest, MatchQualifiedNamesWithMultipleScopes) {
   MemIndex I;
-  I.build(generateSymbols({"a::xyz", "a::yy", "a::xz", "b::yz", "yz"}));
+  I.build(generateSymbols({"a::y1", "a::y2", "a::x", "b::y3", "y3"}));
   FuzzyFindRequest Req;
   Req.Query = "y";
   Req.Scopes = {"a", "b"};
-  EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::xyz", "a::yy", "b::yz"));
+  EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::y1", "a::y2", "b::y3"));
 }
 
 TEST(MemIndexTest, NoMatchNestedScopes) {
   MemIndex I;
-  I.build(generateSymbols({"a::xyz", "a::b::yy"}));
+  I.build(generateSymbols({"a::y1", "a::b::y2"}));
   FuzzyFindRequest Req;
   Req.Query = "y";
   Req.Scopes = {"a"};
-  EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::xyz"));
+  EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::y1"));
 }
 
 TEST(MemIndexTest, IgnoreCases) {
Index: unittests/clangd/FuzzyMatchTests.cpp
===================================================================
--- unittests/clangd/FuzzyMatchTests.cpp
+++ unittests/clangd/FuzzyMatchTests.cpp
@@ -20,16 +20,27 @@
 using testing::Not;
 
 struct ExpectedMatch {
-  ExpectedMatch(StringRef Annotated) : Word(Annotated), Annotated(Annotated) {
+  // Annotations are optional, and will not be asserted if absent.
+  ExpectedMatch(StringRef Match) : Word(Match), Annotated(Match) {
     for (char C : "[]")
       Word.erase(std::remove(Word.begin(), Word.end(), C), Word.end());
+    if (Word.size() == Annotated->size())
+      Annotated = None;
+  }
+  bool accepts(StringRef ActualAnnotated) const {
+    return !Annotated || ActualAnnotated == *Annotated;
   }
   std::string Word;
-  StringRef Annotated;
+
+  friend raw_ostream &operator<<(raw_ostream &OS, const ExpectedMatch &M) {
+    return OS << "'" << M.Word;
+    if (M.Annotated)
+      OS << "' as " << *M.Annotated;
+  }
+
+private:
+  Optional<StringRef> Annotated;
 };
-raw_ostream &operator<<(raw_ostream &OS, const ExpectedMatch &M) {
-  return OS << "'" << M.Word << "' as " << M.Annotated;
-}
 
 struct MatchesMatcher : public testing::MatcherInterface<StringRef> {
   ExpectedMatch Candidate;
@@ -47,7 +58,7 @@
     FuzzyMatcher Matcher(Pattern);
     auto Result = Matcher.match(Candidate.Word);
     auto AnnotatedMatch = Matcher.dumpLast(*OS << "\n");
-    return Result && AnnotatedMatch == Candidate.Annotated;
+    return Result && Candidate.accepts(AnnotatedMatch);
   }
 };
 
@@ -122,6 +133,7 @@
   EXPECT_THAT("sl", matches("[S]Visual[L]oggerLogsList"));
   EXPECT_THAT("sllll", matches("[S]Visua[lL]ogger[L]ogs[L]ist"));
   EXPECT_THAT("Three", matches("H[T]ML[HRE]l[e]ment"));
+  EXPECT_THAT("b", Not(matches("NDEBUG")));
   EXPECT_THAT("Three", matches("[Three]"));
   EXPECT_THAT("fo", Not(matches("barfoo")));
   EXPECT_THAT("fo", matches("bar_[fo]o"));
@@ -151,8 +163,9 @@
   EXPECT_THAT("g", matches("zz[G]roup"));
 
   EXPECT_THAT("aaaa", matches("_a_[aaaa]")); // Prefer consecutive.
-  EXPECT_THAT("printf", matches("s[printf]"));
-  EXPECT_THAT("str", matches("o[str]eam"));
+  // These would ideally match, but would need special segmentation rules.
+  EXPECT_THAT("printf", Not(matches("s[printf]")));
+  EXPECT_THAT("str", Not(matches("o[str]eam")));
 }
 
 struct RankMatcher : public testing::MatcherInterface<StringRef> {
@@ -188,9 +201,8 @@
         llvm::raw_string_ostream Info(Buf);
         auto AnnotatedMatch = Matcher.dumpLast(Info);
 
-        if (AnnotatedMatch != Str.Annotated) {
-          *OS << "\nMatched " << Str.Word << " as " << AnnotatedMatch
-              << " instead of " << Str.Annotated << "\n"
+        if (!Str.accepts(AnnotatedMatch)) {
+          *OS << "\nDoesn't match " << Str << ", but " << AnnotatedMatch << "\n"
               << Info.str();
           Ok = false;
         } else if (LastScore && *LastScore < *Score) {
Index: clangd/FuzzyMatch.h
===================================================================
--- clangd/FuzzyMatch.h
+++ clangd/FuzzyMatch.h
@@ -56,23 +56,25 @@
 
   bool init(llvm::StringRef Word);
   void buildGraph();
-  void calculateRoles(const char *Text, CharRole *Out, int N);
-  int skipPenalty(int W, Action Last);
-  int matchBonus(int P, int W, Action Last);
+  void calculateRoles(const char *Text, CharRole *Out, int &Types, int N);
+  bool allowMatch(int P, int W) const;
+  int skipPenalty(int W, Action Last) const;
+  int matchBonus(int P, int W, Action Last) const;
 
   // Pattern data is initialized by the constructor, then constant.
   char Pat[MaxPat];         // Pattern data
   int PatN;                 // Length
   char LowPat[MaxPat];      // Pattern in lowercase
   CharRole PatRole[MaxPat]; // Pattern segmentation info
-  bool CaseSensitive;       // Case-sensitive match if pattern has uppercase
+  int PatTypeSet;           // Bitmask of 1<<CharType
   float ScoreScale;         // Normalizes scores for the pattern length.
 
   // Word data is initialized on each call to match(), mostly by init().
   char Word[MaxWord];         // Word data
   int WordN;                  // Length
   char LowWord[MaxWord];      // Word in lowercase
   CharRole WordRole[MaxWord]; // Word segmentation info
+  int WordTypeSet;            // Bitmask of 1<<CharType
   bool WordContainsPattern;   // Simple substring check
 
   // Cumulative best-match score table.
Index: clangd/FuzzyMatch.cpp
===================================================================
--- clangd/FuzzyMatch.cpp
+++ clangd/FuzzyMatch.cpp
@@ -33,6 +33,8 @@
 //    Legal if the characters match.
 //  - Moving down (consuming a pattern character) is never legal.
 //    Never legal: all pattern characters must match something.
+// Characters are matched case-insensitively.
+// The first pattern character may only match the start of a word segment.
 //
 // The scoring is based on heuristics:
 //  - when matching a character, apply a bonus or penalty depending on the
@@ -74,21 +76,19 @@
 static constexpr int PerfectBonus = 3; // Perfect per-pattern-char score.
 
 FuzzyMatcher::FuzzyMatcher(StringRef Pattern)
-    : PatN(std::min<int>(MaxPat, Pattern.size())), CaseSensitive(false),
+    : PatN(std::min<int>(MaxPat, Pattern.size())),
       ScoreScale(PatN ? float{1} / (PerfectBonus * PatN) : 0), WordN(0) {
   std::copy(Pattern.begin(), Pattern.begin() + PatN, Pat);
-  for (int I = 0; I < PatN; ++I) {
+  for (int I = 0; I < PatN; ++I)
     LowPat[I] = lower(Pat[I]);
-    CaseSensitive |= LowPat[I] != Pat[I];
-  }
   Scores[0][0][Miss] = {0, Miss};
   Scores[0][0][Match] = {AwfulScore, Miss};
   for (int P = 0; P <= PatN; ++P)
     for (int W = 0; W < P; ++W)
       for (Action A : {Miss, Match})
         Scores[P][W][A] = {AwfulScore, Miss};
   if (PatN > 0)
-    calculateRoles(Pat, PatRole, PatN);
+    calculateRoles(Pat, PatRole, PatTypeSet, PatN);
 }
 
 Optional<float> FuzzyMatcher::match(StringRef Word) {
@@ -177,16 +177,21 @@
 template <typename T> static T packedLookup(const uint8_t *Data, int I) {
   return static_cast<T>((Data[I >> 2] >> ((I & 3) * 2)) & 3);
 }
-void FuzzyMatcher::calculateRoles(const char *Text, CharRole *Out, int N) {
+void FuzzyMatcher::calculateRoles(const char *Text, CharRole *Out, int &TypeSet,
+                                  int N) {
   assert(N > 0);
+  CharType Type = packedLookup<CharType>(CharTypes, Text[0]);
+  TypeSet = 1 << Type;
   // Types holds a sliding window of (Prev, Curr, Next) types.
   // Initial value is (Empty, Empty, type of Text[0]).
-  int Types = packedLookup<CharType>(CharTypes, Text[0]);
+  int Types = Type;
   // Rotate slides in the type of the next character.
   auto Rotate = [&](CharType T) { Types = ((Types << 2) | T) & 0x3f; };
   for (int I = 0; I < N - 1; ++I) {
     // For each character, rotate in the next, and look up the role.
-    Rotate(packedLookup<CharType>(CharTypes, Text[I + 1]));
+    Type = packedLookup<CharType>(CharTypes, Text[I + 1]);
+    TypeSet |= 1 << Type;
+    Rotate(Type);
     *Out++ = packedLookup<CharRole>(CharRoles, Types);
   }
   // For the last character, the "next character" is Empty.
@@ -214,7 +219,10 @@
       ++P;
   }
 
-  calculateRoles(Word, WordRole, WordN);
+  // FIXME: some words are hard to tokenize algorithmically.
+  // e.g. vsprintf is V S Print F, and should match [pri] but not [int].
+  // We could add a tokenization dictionary for common stdlib names.
+  calculateRoles(Word, WordRole, WordTypeSet, WordN);
   return true;
 }
 
@@ -247,7 +255,7 @@
                         ? ScoreInfo{MatchMissScore, Match}
                         : ScoreInfo{MissMissScore, Miss};
 
-      if (LowPat[P] != LowWord[W]) { // No match possible.
+      if (!allowMatch(P, W)) {
         Score[Match] = {AwfulScore, Miss};
       } else {
         auto &PreMatch = Scores[P][W];
@@ -261,23 +269,38 @@
   }
 }
 
-int FuzzyMatcher::skipPenalty(int W, Action Last) {
+bool FuzzyMatcher::allowMatch(int P, int W) const {
+  if (LowPat[P] != LowWord[W])
+    return false;
+  // We require a "strong" match for the first pattern character only.
+  if (P > 0)
+    return true;
+  // Obvious "strong match" for first char: match against a word head.
+  // We're banning matches outright, so conservatively accept some other cases
+  // where our segmentation might be wrong:
+  //  - allow matching B in ABCDef (but not in NDEBUG)
+  //  - we'd like to accept print in sprintf, but too many false positives
+  return WordRole[W] != Tail ||
+         (Word[W] != LowWord[W] && WordTypeSet & 1 << Lower);
+}
+
+int FuzzyMatcher::skipPenalty(int W, Action Last) const {
   int S = 0;
   if (WordRole[W] == Head) // Skipping a segment.
     S += 1;
   if (Last == Match) // Non-consecutive match.
     S += 2;          // We'd rather skip a segment than split our match.
   return S;
 }
 
-int FuzzyMatcher::matchBonus(int P, int W, Action Last) {
+int FuzzyMatcher::matchBonus(int P, int W, Action Last) const {
   assert(LowPat[P] == LowWord[W]);
   int S = 1;
   // Bonus: pattern so far is a (case-insensitive) prefix of the word.
   if (P == W) // We can't skip pattern characters, so we must have matched all.
     ++S;
   // Bonus: case matches, or a Head in the pattern aligns with one in the word.
-  if ((Pat[P] == Word[W] && (CaseSensitive || P == W)) ||
+  if ((Pat[P] == Word[W] && ((PatTypeSet & 1 << Upper) || P == W)) ||
       (PatRole[P] == Head && WordRole[W] == Head))
     ++S;
   // Penalty: matching inside a segment (and previous char wasn't matched).
@@ -312,7 +335,7 @@
                               Scores[PatN][WordN][Miss].Score))) {
     OS << "Substring check passed, but all matches are forbidden\n";
   }
-  if (!CaseSensitive)
+  if (!(PatTypeSet & 1 << Upper))
     OS << "Lowercase query, so scoring ignores case\n";
 
   // Traverse Matched table backwards to reconstruct the Pattern/Word mapping.

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D44003: [clangd] Fix unintentionally loose fuzzy matching, and the tests masking it.

Reply via email to