[PATCH] D44003: [clangd] Fix unintentionally loose fuzzy matching, and the tests masking it.

2018-03-05 Thread Sam McCall via Phabricator via cfe-commits
This revision was automatically updated to reflect the committed changes.
Closed by commit rCTE326721: [clangd] Fix unintentionally loose fuzzy matching, 
and the tests masking it. (authored by sammccall, committed by ).

Changed prior to commit:
  https://reviews.llvm.org/D44003?vs=136719=137027#toc

Repository:
  rCTE Clang Tools Extra

https://reviews.llvm.org/D44003

Files:
  clangd/FuzzyMatch.cpp
  clangd/FuzzyMatch.h
  unittests/clangd/FuzzyMatchTests.cpp
  unittests/clangd/IndexTests.cpp

Index: clangd/FuzzyMatch.cpp
===
--- clangd/FuzzyMatch.cpp
+++ clangd/FuzzyMatch.cpp
@@ -33,6 +33,8 @@
 //Legal if the characters match.
 //  - Moving down (consuming a pattern character) is never legal.
 //Never legal: all pattern characters must match something.
+// Characters are matched case-insensitively.
+// The first pattern character may only match the start of a word segment.
 //
 // The scoring is based on heuristics:
 //  - when matching a character, apply a bonus or penalty depending on the
@@ -74,21 +76,19 @@
 static constexpr int PerfectBonus = 3; // Perfect per-pattern-char score.
 
 FuzzyMatcher::FuzzyMatcher(StringRef Pattern)
-: PatN(std::min(MaxPat, Pattern.size())), CaseSensitive(false),
+: PatN(std::min(MaxPat, Pattern.size())),
   ScoreScale(PatN ? float{1} / (PerfectBonus * PatN) : 0), WordN(0) {
   std::copy(Pattern.begin(), Pattern.begin() + PatN, Pat);
-  for (int I = 0; I < PatN; ++I) {
+  for (int I = 0; I < PatN; ++I)
 LowPat[I] = lower(Pat[I]);
-CaseSensitive |= LowPat[I] != Pat[I];
-  }
   Scores[0][0][Miss] = {0, Miss};
   Scores[0][0][Match] = {AwfulScore, Miss};
   for (int P = 0; P <= PatN; ++P)
 for (int W = 0; W < P; ++W)
   for (Action A : {Miss, Match})
 Scores[P][W][A] = {AwfulScore, Miss};
   if (PatN > 0)
-calculateRoles(Pat, PatRole, PatN);
+calculateRoles(Pat, PatRole, PatTypeSet, PatN);
 }
 
 Optional FuzzyMatcher::match(StringRef Word) {
@@ -177,16 +177,21 @@
 template  static T packedLookup(const uint8_t *Data, int I) {
   return static_cast((Data[I >> 2] >> ((I & 3) * 2)) & 3);
 }
-void FuzzyMatcher::calculateRoles(const char *Text, CharRole *Out, int N) {
+void FuzzyMatcher::calculateRoles(const char *Text, CharRole *Out, int ,
+  int N) {
   assert(N > 0);
+  CharType Type = packedLookup(CharTypes, Text[0]);
+  TypeSet = 1 << Type;
   // Types holds a sliding window of (Prev, Curr, Next) types.
   // Initial value is (Empty, Empty, type of Text[0]).
-  int Types = packedLookup(CharTypes, Text[0]);
+  int Types = Type;
   // Rotate slides in the type of the next character.
   auto Rotate = [&](CharType T) { Types = ((Types << 2) | T) & 0x3f; };
   for (int I = 0; I < N - 1; ++I) {
 // For each character, rotate in the next, and look up the role.
-Rotate(packedLookup(CharTypes, Text[I + 1]));
+Type = packedLookup(CharTypes, Text[I + 1]);
+TypeSet |= 1 << Type;
+Rotate(Type);
 *Out++ = packedLookup(CharRoles, Types);
   }
   // For the last character, the "next character" is Empty.
@@ -214,7 +219,10 @@
   ++P;
   }
 
-  calculateRoles(Word, WordRole, WordN);
+  // FIXME: some words are hard to tokenize algorithmically.
+  // e.g. vsprintf is V S Print F, and should match [pri] but not [int].
+  // We could add a tokenization dictionary for common stdlib names.
+  calculateRoles(Word, WordRole, WordTypeSet, WordN);
   return true;
 }
 
@@ -247,7 +255,7 @@
 ? ScoreInfo{MatchMissScore, Match}
 : ScoreInfo{MissMissScore, Miss};
 
-  if (LowPat[P] != LowWord[W]) { // No match possible.
+  if (!allowMatch(P, W)) {
 Score[Match] = {AwfulScore, Miss};
   } else {
 auto  = Scores[P][W];
@@ -261,23 +269,38 @@
   }
 }
 
-int FuzzyMatcher::skipPenalty(int W, Action Last) {
+bool FuzzyMatcher::allowMatch(int P, int W) const {
+  if (LowPat[P] != LowWord[W])
+return false;
+  // We require a "strong" match for the first pattern character only.
+  if (P > 0)
+return true;
+  // Obvious "strong match" for first char: match against a word head.
+  // We're banning matches outright, so conservatively accept some other cases
+  // where our segmentation might be wrong:
+  //  - allow matching B in ABCDef (but not in NDEBUG)
+  //  - we'd like to accept print in sprintf, but too many false positives
+  return WordRole[W] != Tail ||
+ (Word[W] != LowWord[W] && WordTypeSet & 1 << Lower);
+}
+
+int FuzzyMatcher::skipPenalty(int W, Action Last) const {
   int S = 0;
   if (WordRole[W] == Head) // Skipping a segment.
 S += 1;
   if (Last == Match) // Non-consecutive match.
 S += 2;  // We'd rather skip a segment than split our match.
   return S;
 }
 
-int FuzzyMatcher::matchBonus(int P, int W, Action Last) {
+int FuzzyMatcher::matchBonus(int P, int W, Action Last) const {
   assert(LowPat[P] == LowWord[W]);
   

[PATCH] D44003: [clangd] Fix unintentionally loose fuzzy matching, and the tests masking it.

2018-03-05 Thread Ilya Biryukov via Phabricator via cfe-commits
ilya-biryukov accepted this revision.
ilya-biryukov added a comment.
This revision is now accepted and ready to land.

LGTM, but note the unreachable code comment




Comment at: unittests/clangd/FuzzyMatchTests.cpp:35
+
+  friend raw_ostream <<(raw_ostream , const ExpectedMatch ) {
+return OS << "'" << M.Word;

Maybe put operator before `Word` to keep fields grouped together?



Comment at: unittests/clangd/FuzzyMatchTests.cpp:37
+return OS << "'" << M.Word;
+if (M.Annotated)
+  OS << "' as " << *M.Annotated;

This code is unreachable. Probably intended to remove `return`  from the 
previous line?


Repository:
  rCTE Clang Tools Extra

https://reviews.llvm.org/D44003



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D44003: [clangd] Fix unintentionally loose fuzzy matching, and the tests masking it.

2018-03-02 Thread Sam McCall via Phabricator via cfe-commits
sammccall updated this revision to Diff 136719.
sammccall added a comment.

test tweaks


Repository:
  rCTE Clang Tools Extra

https://reviews.llvm.org/D44003

Files:
  clangd/FuzzyMatch.cpp
  clangd/FuzzyMatch.h
  unittests/clangd/FuzzyMatchTests.cpp
  unittests/clangd/IndexTests.cpp

Index: unittests/clangd/IndexTests.cpp
===
--- unittests/clangd/IndexTests.cpp
+++ unittests/clangd/IndexTests.cpp
@@ -116,14 +116,6 @@
   EXPECT_TRUE(Symbols.expired());
 }
 
-TEST(MemIndexTest, MemIndexMatchSubstring) {
-  MemIndex I;
-  I.build(generateNumSymbols(5, 25));
-  FuzzyFindRequest Req;
-  Req.Query = "5";
-  EXPECT_THAT(match(I, Req), UnorderedElementsAre("5", "15", "25"));
-}
-
 TEST(MemIndexTest, MemIndexDeduplicate) {
   auto Symbols = generateNumSymbols(0, 10);
 
@@ -166,46 +158,46 @@
 
 TEST(MemIndexTest, MatchQualifiedNamesWithoutSpecificScope) {
   MemIndex I;
-  I.build(generateSymbols({"a::xyz", "b::yz", "yz"}));
+  I.build(generateSymbols({"a::y1", "b::y2", "y3"}));
   FuzzyFindRequest Req;
   Req.Query = "y";
-  EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::xyz", "b::yz", "yz"));
+  EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::y1", "b::y2", "y3"));
 }
 
 TEST(MemIndexTest, MatchQualifiedNamesWithGlobalScope) {
   MemIndex I;
-  I.build(generateSymbols({"a::xyz", "b::yz", "yz"}));
+  I.build(generateSymbols({"a::y1", "b::y2", "y3"}));
   FuzzyFindRequest Req;
   Req.Query = "y";
   Req.Scopes = {""};
-  EXPECT_THAT(match(I, Req), UnorderedElementsAre("yz"));
+  EXPECT_THAT(match(I, Req), UnorderedElementsAre("y3"));
 }
 
 TEST(MemIndexTest, MatchQualifiedNamesWithOneScope) {
   MemIndex I;
-  I.build(generateSymbols({"a::xyz", "a::yy", "a::xz", "b::yz", "yz"}));
+  I.build(generateSymbols({"a::y1", "a::y2", "a::x", "b::y2", "y3"}));
   FuzzyFindRequest Req;
   Req.Query = "y";
   Req.Scopes = {"a"};
-  EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::xyz", "a::yy"));
+  EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::y1", "a::y2"));
 }
 
 TEST(MemIndexTest, MatchQualifiedNamesWithMultipleScopes) {
   MemIndex I;
-  I.build(generateSymbols({"a::xyz", "a::yy", "a::xz", "b::yz", "yz"}));
+  I.build(generateSymbols({"a::y1", "a::y2", "a::x", "b::y3", "y3"}));
   FuzzyFindRequest Req;
   Req.Query = "y";
   Req.Scopes = {"a", "b"};
-  EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::xyz", "a::yy", "b::yz"));
+  EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::y1", "a::y2", "b::y3"));
 }
 
 TEST(MemIndexTest, NoMatchNestedScopes) {
   MemIndex I;
-  I.build(generateSymbols({"a::xyz", "a::b::yy"}));
+  I.build(generateSymbols({"a::y1", "a::b::y2"}));
   FuzzyFindRequest Req;
   Req.Query = "y";
   Req.Scopes = {"a"};
-  EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::xyz"));
+  EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::y1"));
 }
 
 TEST(MemIndexTest, IgnoreCases) {
Index: unittests/clangd/FuzzyMatchTests.cpp
===
--- unittests/clangd/FuzzyMatchTests.cpp
+++ unittests/clangd/FuzzyMatchTests.cpp
@@ -20,16 +20,27 @@
 using testing::Not;
 
 struct ExpectedMatch {
-  ExpectedMatch(StringRef Annotated) : Word(Annotated), Annotated(Annotated) {
+  // Annotations are optional, and will not be asserted if absent.
+  ExpectedMatch(StringRef Match) : Word(Match), Annotated(Match) {
 for (char C : "[]")
   Word.erase(std::remove(Word.begin(), Word.end(), C), Word.end());
+if (Word.size() == Annotated->size())
+  Annotated = None;
+  }
+  bool accepts(StringRef ActualAnnotated) const {
+return !Annotated || ActualAnnotated == *Annotated;
   }
   std::string Word;
-  StringRef Annotated;
+
+  friend raw_ostream <<(raw_ostream , const ExpectedMatch ) {
+return OS << "'" << M.Word;
+if (M.Annotated)
+  OS << "' as " << *M.Annotated;
+  }
+
+private:
+  Optional Annotated;
 };
-raw_ostream <<(raw_ostream , const ExpectedMatch ) {
-  return OS << "'" << M.Word << "' as " << M.Annotated;
-}
 
 struct MatchesMatcher : public testing::MatcherInterface {
   ExpectedMatch Candidate;
@@ -47,7 +58,7 @@
 FuzzyMatcher Matcher(Pattern);
 auto Result = Matcher.match(Candidate.Word);
 auto AnnotatedMatch = Matcher.dumpLast(*OS << "\n");
-return Result && AnnotatedMatch == Candidate.Annotated;
+return Result && Candidate.accepts(AnnotatedMatch);
   }
 };
 
@@ -122,6 +133,7 @@
   EXPECT_THAT("sl", matches("[S]Visual[L]oggerLogsList"));
   EXPECT_THAT("s", matches("[S]Visua[lL]ogger[L]ogs[L]ist"));
   EXPECT_THAT("Three", matches("H[T]ML[HRE]l[e]ment"));
+  EXPECT_THAT("b", Not(matches("NDEBUG")));
   EXPECT_THAT("Three", matches("[Three]"));
   EXPECT_THAT("fo", Not(matches("barfoo")));
   EXPECT_THAT("fo", matches("bar_[fo]o"));
@@ -151,8 +163,9 @@
   EXPECT_THAT("g", matches("zz[G]roup"));
 
   EXPECT_THAT("", matches("_a_[]")); // Prefer consecutive.
-  EXPECT_THAT("printf", matches("s[printf]"));
-  

[PATCH] D44003: [clangd] Fix unintentionally loose fuzzy matching, and the tests masking it.

2018-03-02 Thread Sam McCall via Phabricator via cfe-commits
sammccall created this revision.
Herald added subscribers: cfe-commits, ioeric, jkorous-apple, ilya-biryukov, 
klimek.
sammccall updated this revision to Diff 136719.
sammccall added a comment.

test tweaks


The intent was that [ar] doesn't match "FooBar"; the first character must match
a Head character (hard requirement, not just a low score).
This matches VSCode, and was "tested" but the tests were defective.

The tests expected matches("FooBar") to fail for lack of a match. But instead
it fails because the string should be annotated - matches("FooB[ar]").
This patch makes matches("FooBar") ignore annotations, as was intended.

Fixing the code to reject weak matches for the first char causes problems:

- [bre] no longer matches "HTMLBRElement". We allow matching against an 
uppercase char even if we don't think it's head. Only do this if there's at 
least one lowercase, to avoid triggering on MACROS
- [print] no longer matches "sprintf". This is hard to fix without false 
positives (e.g. [int] vs "sprintf"]) This patch leaves this case broken. A 
future patch will add a dictionary providing custom segmentation to common 
names from the standard library.

Fixed a couple of index tests that indirectly relied on broken fuzzy matching.
Added const in a couple of missing places for consistency with new code.


Repository:
  rCTE Clang Tools Extra

https://reviews.llvm.org/D44003

Files:
  clangd/FuzzyMatch.cpp
  clangd/FuzzyMatch.h
  unittests/clangd/FuzzyMatchTests.cpp
  unittests/clangd/IndexTests.cpp

Index: unittests/clangd/IndexTests.cpp
===
--- unittests/clangd/IndexTests.cpp
+++ unittests/clangd/IndexTests.cpp
@@ -116,14 +116,6 @@
   EXPECT_TRUE(Symbols.expired());
 }
 
-TEST(MemIndexTest, MemIndexMatchSubstring) {
-  MemIndex I;
-  I.build(generateNumSymbols(5, 25));
-  FuzzyFindRequest Req;
-  Req.Query = "5";
-  EXPECT_THAT(match(I, Req), UnorderedElementsAre("5", "15", "25"));
-}
-
 TEST(MemIndexTest, MemIndexDeduplicate) {
   auto Symbols = generateNumSymbols(0, 10);
 
@@ -166,46 +158,46 @@
 
 TEST(MemIndexTest, MatchQualifiedNamesWithoutSpecificScope) {
   MemIndex I;
-  I.build(generateSymbols({"a::xyz", "b::yz", "yz"}));
+  I.build(generateSymbols({"a::y1", "b::y2", "y3"}));
   FuzzyFindRequest Req;
   Req.Query = "y";
-  EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::xyz", "b::yz", "yz"));
+  EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::y1", "b::y2", "y3"));
 }
 
 TEST(MemIndexTest, MatchQualifiedNamesWithGlobalScope) {
   MemIndex I;
-  I.build(generateSymbols({"a::xyz", "b::yz", "yz"}));
+  I.build(generateSymbols({"a::y1", "b::y2", "y3"}));
   FuzzyFindRequest Req;
   Req.Query = "y";
   Req.Scopes = {""};
-  EXPECT_THAT(match(I, Req), UnorderedElementsAre("yz"));
+  EXPECT_THAT(match(I, Req), UnorderedElementsAre("y3"));
 }
 
 TEST(MemIndexTest, MatchQualifiedNamesWithOneScope) {
   MemIndex I;
-  I.build(generateSymbols({"a::xyz", "a::yy", "a::xz", "b::yz", "yz"}));
+  I.build(generateSymbols({"a::y1", "a::y2", "a::x", "b::y2", "y3"}));
   FuzzyFindRequest Req;
   Req.Query = "y";
   Req.Scopes = {"a"};
-  EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::xyz", "a::yy"));
+  EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::y1", "a::y2"));
 }
 
 TEST(MemIndexTest, MatchQualifiedNamesWithMultipleScopes) {
   MemIndex I;
-  I.build(generateSymbols({"a::xyz", "a::yy", "a::xz", "b::yz", "yz"}));
+  I.build(generateSymbols({"a::y1", "a::y2", "a::x", "b::y3", "y3"}));
   FuzzyFindRequest Req;
   Req.Query = "y";
   Req.Scopes = {"a", "b"};
-  EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::xyz", "a::yy", "b::yz"));
+  EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::y1", "a::y2", "b::y3"));
 }
 
 TEST(MemIndexTest, NoMatchNestedScopes) {
   MemIndex I;
-  I.build(generateSymbols({"a::xyz", "a::b::yy"}));
+  I.build(generateSymbols({"a::y1", "a::b::y2"}));
   FuzzyFindRequest Req;
   Req.Query = "y";
   Req.Scopes = {"a"};
-  EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::xyz"));
+  EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::y1"));
 }
 
 TEST(MemIndexTest, IgnoreCases) {
Index: unittests/clangd/FuzzyMatchTests.cpp
===
--- unittests/clangd/FuzzyMatchTests.cpp
+++ unittests/clangd/FuzzyMatchTests.cpp
@@ -20,16 +20,27 @@
 using testing::Not;
 
 struct ExpectedMatch {
-  ExpectedMatch(StringRef Annotated) : Word(Annotated), Annotated(Annotated) {
+  // Annotations are optional, and will not be asserted if absent.
+  ExpectedMatch(StringRef Match) : Word(Match), Annotated(Match) {
 for (char C : "[]")
   Word.erase(std::remove(Word.begin(), Word.end(), C), Word.end());
+if (Word.size() == Annotated->size())
+  Annotated = None;
+  }
+  bool accepts(StringRef ActualAnnotated) const {
+return !Annotated || ActualAnnotated == *Annotated;
   }
   std::string Word;
-  StringRef Annotated;
+
+  friend raw_ostream