teemperor updated this revision to Diff 65291.
teemperor added a comment.

Ok, so I think I've addressed the points from last meeting in this patch:

It was planned to replace custom hashing with FoldingSetNodeID and a hashmap: 
In this patch I removed all custom hashing. It's now done via LLVM's StringMap 
and strings which brings the same functionality. Using FoldingSetNodeID is not 
possible because it hides it's internal data vector which StringMap doesn't 
support, so I've just used a normal vector instead.

It was also planned to refactor CloneSignatureCache: In this patch I've 
completely removed the cache because it is no longer necessary. Fixing it was 
not an option because we assumed that the statements RecursiveASTVisitor visits 
are the same that Stmt::children() delivers, which is for example for 
LambdaExprs not true (and RecursiveASTVisitor makes no promise about that in 
it's API). The only sensible fix for this was to use Artem's initial suggestion 
to not use RecursiveASTVisitor to traverse statements.

I've also adressed Artems and Vassils comments regarding documentation and code 
style in this patch (Thanks!).


https://reviews.llvm.org/D20795

Files:
  include/clang/Analysis/CloneDetection.h
  include/clang/StaticAnalyzer/Checkers/Checkers.td
  lib/Analysis/CMakeLists.txt
  lib/Analysis/CloneDetection.cpp
  lib/StaticAnalyzer/Checkers/CMakeLists.txt
  lib/StaticAnalyzer/Checkers/CloneChecker.cpp
  test/Analysis/copypaste/test-min-max.cpp
  test/Analysis/copypaste/test-sub-sequences.cpp

Index: test/Analysis/copypaste/test-sub-sequences.cpp
===================================================================
--- /dev/null
+++ test/Analysis/copypaste/test-sub-sequences.cpp
@@ -0,0 +1,28 @@
+// RUN: %clang_cc1 -analyze -std=c++11 -analyzer-checker=alpha.clone.CloneChecker -verify %s
+
+// We test if sub-sequences can match with normal sequences containing only
+// a single statement.
+
+void log2(int a);
+void log();
+
+int max(int a, int b) {
+  log2(a);
+  log(); // expected-warning{{Detected code clone.}}
+  if (a > b)
+    return a;
+  return b;
+}
+
+int maxClone(int a, int b) {
+  log(); // expected-note{{Related code clone is here.}}
+  if (a > b)
+    return a;
+  return b;
+}
+
+// Functions below are not clones and should not be reported.
+
+int foo(int a, int b) { // no-warning
+  return a + b;
+}
Index: test/Analysis/copypaste/test-min-max.cpp
===================================================================
--- /dev/null
+++ test/Analysis/copypaste/test-min-max.cpp
@@ -0,0 +1,41 @@
+// RUN: %clang_cc1 -analyze -std=c++11 -analyzer-checker=alpha.clone.CloneChecker -verify %s
+
+void log();
+
+int max(int a, int b) { // expected-warning{{Detected code clone.}}
+  log();
+  if (a > b)
+    return a;
+  return b;
+}
+
+int maxClone(int x, int y) { // expected-note{{Related code clone is here.}}
+  log();
+  if (x > y)
+    return x;
+  return y;
+}
+
+// False positives below. These clones should not be reported.
+
+// FIXME: Detect different binary operator kinds.
+int min1(int a, int b) { // expected-note{{Related code clone is here.}}
+  log();
+  if (a < b)
+    return a;
+  return b;
+}
+
+// FIXME: Detect different variable patterns.
+int min2(int a, int b) { // expected-note{{Related code clone is here.}}
+  log();
+  if (b > a)
+    return a;
+  return b;
+}
+
+// Functions below are not clones and should not be reported.
+
+int foo(int a, int b) { // no-warning
+  return a + b;
+}
Index: lib/StaticAnalyzer/Checkers/CloneChecker.cpp
===================================================================
--- /dev/null
+++ lib/StaticAnalyzer/Checkers/CloneChecker.cpp
@@ -0,0 +1,85 @@
+//===--- CloneChecker.cpp - Clone detection checker -------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// CloneChecker is a checker that reports clones in the current translation
+/// unit.
+///
+//===----------------------------------------------------------------------===//
+
+#include "ClangSACheckers.h"
+#include "clang/Analysis/CloneDetection.h"
+#include "clang/Basic/Diagnostic.h"
+#include "clang/StaticAnalyzer/Core/Checker.h"
+#include "clang/StaticAnalyzer/Core/CheckerManager.h"
+#include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h"
+
+using namespace clang;
+using namespace ento;
+
+namespace {
+class CloneChecker : public Checker<check::EndOfTranslationUnit> {
+
+public:
+  void checkEndOfTranslationUnit(const TranslationUnitDecl *TU,
+                                 AnalysisManager &Mgr, BugReporter &BR) const;
+};
+} // end anonymous namespace
+
+void CloneChecker::checkEndOfTranslationUnit(const TranslationUnitDecl *TU,
+                                             AnalysisManager &Mgr,
+                                             BugReporter &BR) const {
+
+  int MinComplexity = Mgr.getAnalyzerOptions().getOptionAsInteger(
+      "MinimumCloneComplexity", 10, this);
+
+  assert(MinComplexity >= 0);
+
+  SourceManager &SM = BR.getSourceManager();
+
+  CloneDetector CloneDetector;
+  CloneDetector.analyzeTranslationUnit(const_cast<TranslationUnitDecl *>(TU));
+
+  std::vector<CloneDetector::CloneGroup> CloneGroups;
+  CloneDetector.findClones(CloneGroups, MinComplexity);
+
+  DiagnosticsEngine &DiagEngine = Mgr.getDiagnostic();
+
+  unsigned WarnID = DiagEngine.getCustomDiagID(DiagnosticsEngine::Warning,
+                                               "Detected code clone.");
+
+  unsigned NoteID = DiagEngine.getCustomDiagID(DiagnosticsEngine::Note,
+                                               "Related code clone is here.");
+
+  for (CloneDetector::CloneGroup &Group : CloneGroups) {
+    // For readability reasons we sort the clones by line numbers.
+    std::sort(Group.Sequences.begin(), Group.Sequences.end(),
+              [&SM](const StmtSequence &LHS, const StmtSequence &RHS) {
+                return SM.isBeforeInTranslationUnit(LHS.getStartLoc(),
+                                                    RHS.getStartLoc()) &&
+                       SM.isBeforeInTranslationUnit(LHS.getEndLoc(),
+                                                    RHS.getEndLoc());
+              });
+
+    // We group the clones by printing the first as a warning and all others
+    // as a note.
+    DiagEngine.Report(Group.Sequences.front().getStartLoc(), WarnID);
+    for (unsigned J = 1; J < Group.Sequences.size(); ++J) {
+      DiagEngine.Report(Group.Sequences[J].getStartLoc(), NoteID);
+    }
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// Register CloneChecker
+//===----------------------------------------------------------------------===//
+
+void ento::registerCloneChecker(CheckerManager &Mgr) {
+  Mgr.registerChecker<CloneChecker>();
+}
Index: lib/StaticAnalyzer/Checkers/CMakeLists.txt
===================================================================
--- lib/StaticAnalyzer/Checkers/CMakeLists.txt
+++ lib/StaticAnalyzer/Checkers/CMakeLists.txt
@@ -22,6 +22,7 @@
   CheckerDocumentation.cpp
   ChrootChecker.cpp
   ClangCheckers.cpp
+  CloneChecker.cpp
   DeadStoresChecker.cpp
   DebugCheckers.cpp
   DereferenceChecker.cpp
Index: lib/Analysis/CloneDetection.cpp
===================================================================
--- /dev/null
+++ lib/Analysis/CloneDetection.cpp
@@ -0,0 +1,280 @@
+//===--- CloneDetection.cpp - Finds code clones in an AST -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+///  This file implements classes for searching and anlyzing source code clones.
+///
+//===----------------------------------------------------------------------===//
+
+#include "clang/Analysis/CloneDetection.h"
+
+#include "clang/AST/ASTContext.h"
+#include "clang/AST/RecursiveASTVisitor.h"
+#include "clang/AST/Stmt.h"
+#include "llvm/ADT/StringRef.h"
+
+using namespace clang;
+
+StmtSequence::StmtSequence(const CompoundStmt *Stmt, ASTContext &Context,
+                           unsigned StartIndex, unsigned EndIndex)
+    : S(Stmt), Context(&Context), StartIndex(StartIndex), EndIndex(EndIndex) {
+  assert(Stmt && "Stmt must not be a nullptr");
+  assert(StartIndex < EndIndex && "Given array should not be empty");
+  assert(EndIndex <= Stmt->size() && "Given array too big for this Stmt");
+}
+
+StmtSequence::StmtSequence(const Stmt *Stmt, ASTContext &Context)
+    : S(Stmt), Context(&Context), StartIndex(0), EndIndex(0) {}
+
+StmtSequence::StmtSequence()
+    : S(nullptr), Context(nullptr), StartIndex(0), EndIndex(0) {}
+
+bool StmtSequence::contains(const StmtSequence &Other) const {
+  // If both sequences reside in different translation units, they can never
+  // contain each other.
+  if (Context != Other.Context)
+    return false;
+
+  const SourceManager &SM = Context->getSourceManager();
+
+  // Otherwise check if the start and end locations of the current sequence
+  // surround the other sequence.
+  bool StartIsInBounds =
+      SM.isBeforeInTranslationUnit(getStartLoc(), Other.getStartLoc()) ||
+      getStartLoc() == Other.getStartLoc();
+  if (!StartIsInBounds)
+    return false;
+
+  bool EndIsInBounds =
+      SM.isBeforeInTranslationUnit(Other.getEndLoc(), getEndLoc()) ||
+      Other.getEndLoc() == getEndLoc();
+  return EndIsInBounds;
+}
+
+StmtSequence::iterator StmtSequence::begin() const {
+  if (!holdsSequence()) {
+    return &S;
+  }
+  auto CS = cast<CompoundStmt>(S);
+  return CS->body_begin() + StartIndex;
+}
+
+StmtSequence::iterator StmtSequence::end() const {
+  if (!holdsSequence()) {
+    return &S + 1;
+  }
+  auto CS = cast<CompoundStmt>(S);
+  return CS->body_begin() + EndIndex;
+}
+
+SourceLocation StmtSequence::getStartLoc() const {
+  return front()->getLocStart();
+}
+
+SourceLocation StmtSequence::getEndLoc() const { return back()->getLocEnd(); }
+
+namespace {
+/// Traverses all statements in an AST and collects their relevant data.
+/// The results are stored in a CloneDetector object. Each statement is only
+/// visited a fixed amount of times during this process.
+class DataCollectVisitor : public RecursiveASTVisitor<DataCollectVisitor> {
+
+  CloneDetector &CD;
+  ASTContext &Context;
+
+  /// \brief Collects the relevant data of all statements in the given statement
+  /// tree and stores it in the CloneDetector.
+  ///
+  /// \param S The root of the given statement tree.
+  /// \return The CloneSignature of the root statement.
+  CloneDetector::CloneSignature CollectData(const Stmt *S) {
+    // Create an empty signature that will be filled in this method.
+    CloneDetector::CloneSignature Signature;
+
+    // The only relevant data for now is the class of the statement.
+    // TODO: Collect statement class specific data.
+    Signature.Data.push_back(S->getStmtClass());
+
+    // Storage for the signatures of the direct child statements. This is only
+    // needed if the current statemnt is a CompoundStmt.
+    std::vector<CloneDetector::CloneSignature> ChildSignatures;
+    const CompoundStmt *CS = dyn_cast<const CompoundStmt>(S);
+
+    // The signature of a statement includes the signatures of its children.
+    // Therefore we create the signatures for every child and add them to the
+    // current signature.
+    for (const Stmt *Child : S->children()) {
+      // Some statements like 'if' can have nullptr children that we will skip.
+      if (!Child)
+        continue;
+
+      // Recursive call to create the signature of the child statement. This
+      // will also create and store all clone groups in this child statement.
+      auto ChildSignature = CollectData(Child);
+
+      // Add the collected data to the signature of the current statement.
+      Signature.add(ChildSignature);
+
+      // If the current statement is a CompoundStatement, we need to store the
+      // signature for the generation of the sub-sequences.
+      if (CS)
+        ChildSignatures.push_back(ChildSignature);
+    }
+
+    // If the current statement is a CompoundStmt, we also need to create the
+    // clone groups from the sub-sequences inside the children.
+    if (CS)
+      HandleSubSequences(CS, ChildSignatures);
+
+    // Save the signature for the current statement in the CloneDetector object.
+    CD.add(StmtSequence(S, Context), Signature);
+
+    return Signature;
+  }
+
+  /// \brief Adds all possible sub-sequences in the child array of the given
+  ///        CompoundStmt to the CloneDetector.
+  /// \param CS The given CompoundStmt.
+  /// \param ChildSignatures A list of calculated signatures for each child in
+  ///                        the given CompoundStmt.
+  void HandleSubSequences(
+      const CompoundStmt *CS,
+      std::vector<CloneDetector::CloneSignature> ChildSignatures) {
+
+    // FIXME: This function has quadratic runtime right now. Check if skipping
+    // this function for too long CompoundStmts is an option.
+
+    // The length of the sub-sequence. We don't need to handle sequences with
+    // the length 1 as they are already handled in CollectData().
+    for (unsigned Length = 2; Length <= CS->size(); ++Length) {
+      // The start index in the body of the CompoundStmt. We increase the
+      // position until the end of the sub-sequence reaches the end of the
+      // CompoundStmt body.
+      for (unsigned Pos = 0; Pos <= CS->size() - Length; ++Pos) {
+        // Create an empty signature and add the signatures of all selected
+        // child statements to it.
+        CloneDetector::CloneSignature SubSignature;
+
+        for (unsigned i = Pos; i < Pos + Length; ++i) {
+          SubSignature.add(ChildSignatures[i]);
+        }
+
+        // Save the signature together with the information about what children
+        // sequence we selected.
+        CD.add(StmtSequence(CS, Context, Pos, Pos + Length), SubSignature);
+      }
+    }
+  }
+
+public:
+  explicit DataCollectVisitor(CloneDetector &CD, ASTContext &Context)
+      : CD(CD), Context(Context) {}
+
+  bool VisitFunctionDecl(FunctionDecl *D) {
+    // If we found a function, we start the clone search on its body statement.
+    if (D->hasBody())
+      CollectData(D->getBody());
+    return true;
+  }
+};
+} // end anonymous namespace
+
+void CloneDetector::analyzeTranslationUnit(TranslationUnitDecl *TU) {
+  DataCollectVisitor visitor(*this, TU->getASTContext());
+  visitor.TraverseDecl(TU);
+}
+
+void CloneDetector::add(const StmtSequence &S,
+                        const CloneSignature &Signature) {
+  // StringMap only works with StringRefs, so we create one for our data vector.
+  auto &Data = Signature.Data;
+  StringRef DataRef = StringRef(reinterpret_cast<const char *>(Data.data()),
+                                Data.size() * sizeof(unsigned));
+
+  // Search with the help of the signature if we already have encountered a
+  // clone of the given StmtSequence.
+  auto I = CloneGroupIndexes.find(DataRef);
+  if (I == CloneGroupIndexes.end()) {
+    // We haven't found an existing clone group, so we create a new clone group
+    // for this StmtSequence and store the index of it in our search map.
+    CloneGroupIndexes[DataRef] = CloneGroups.size();
+    CloneGroups.emplace_back(S, Signature);
+    return;
+  }
+
+  // We have found an existing clone group and can expand it with the given
+  // StmtSequence.
+  CloneGroups[I->getValue()].Sequences.push_back(S);
+}
+
+namespace {
+/// \brief Returns true if and only if \p Stmt contains at least one other
+/// sequence in the \p Group.
+bool StmtContainsAnyInGroup(StmtSequence &Stmt,
+                            CloneDetector::CloneGroup &Group) {
+  for (StmtSequence &GroupStmt : Group.Sequences) {
+    if (Stmt.contains(GroupStmt))
+      return true;
+  }
+  return false;
+}
+
+/// \brief Returns true if and only if all sequences in \p OtherGroup are
+/// contained by a sequence in \p Group.
+bool GroupContains(CloneDetector::CloneGroup &Group,
+                   CloneDetector::CloneGroup &OtherGroup) {
+  // We have less sequences in the current group than we have in the other,
+  // so we will never fulfill the requirement for returning true. This is only
+  // possible because we know that a sequence in Group can contain at most
+  // one sequence in OtherGroup.
+  if (Group.Sequences.size() < OtherGroup.Sequences.size())
+    return false;
+
+  for (StmtSequence &Stmt : Group.Sequences) {
+    if (!StmtContainsAnyInGroup(Stmt, OtherGroup))
+      return false;
+  }
+  return true;
+}
+} // end anonymous namespace
+
+void CloneDetector::findClones(std::vector<CloneGroup> &Result,
+                               unsigned MinGroupComplexity) {
+  // Add every valid clone group that fulfills the complexity requirement.
+  for (const CloneGroup &Group : CloneGroups) {
+    if (Group.IsValid() && Group.Signature.Complexity >= MinGroupComplexity) {
+      Result.push_back(Group);
+    }
+  }
+
+  std::vector<unsigned> IndexesToRemove;
+
+  // Compare every group in the result with the rest. If one groups contains
+  // another group, we only need to return the bigger group.
+  // Note: This doesn't scale well, so if possible avoid calling any heavy
+  // function from this loop to minimize the performance impact.
+  for (unsigned i = 0; i < Result.size(); ++i) {
+    for (unsigned j = 0; j < Result.size(); ++j) {
+      // Don't compare a group with itself.
+      if (i == j)
+        continue;
+
+      if (GroupContains(Result[j], Result[i])) {
+        IndexesToRemove.push_back(i);
+        break;
+      }
+    }
+  }
+
+  // Erasing a list of indexes from the vector should be done with decreasing
+  // indexes. As IndexesToRemove is constructed with increasing values, we just
+  // reverse iterate over it to get the desired order.
+  for (auto I = IndexesToRemove.rbegin(); I != IndexesToRemove.rend(); ++I) {
+    Result.erase(Result.begin() + *I);
+  }
+}
Index: lib/Analysis/CMakeLists.txt
===================================================================
--- lib/Analysis/CMakeLists.txt
+++ lib/Analysis/CMakeLists.txt
@@ -9,6 +9,7 @@
   CFGReachabilityAnalysis.cpp
   CFGStmtMap.cpp
   CallGraph.cpp
+  CloneDetection.cpp
   CocoaConventions.cpp
   Consumed.cpp
   CodeInjector.cpp
Index: include/clang/StaticAnalyzer/Checkers/Checkers.td
===================================================================
--- include/clang/StaticAnalyzer/Checkers/Checkers.td
+++ include/clang/StaticAnalyzer/Checkers/Checkers.td
@@ -77,6 +77,8 @@
 def LLVM : Package<"llvm">;
 def Debug : Package<"debug">;
 
+def CloneDetectionAlpha : Package<"clone">, InPackage<Alpha>, Hidden;
+
 //===----------------------------------------------------------------------===//
 // Core Checkers.
 //===----------------------------------------------------------------------===//
@@ -657,3 +659,17 @@
   DescFile<"DebugCheckers.cpp">;
 
 } // end "debug"
+
+
+//===----------------------------------------------------------------------===//
+// Clone Detection
+//===----------------------------------------------------------------------===//
+
+let ParentPackage = CloneDetectionAlpha in {
+
+def CloneChecker : Checker<"CloneChecker">,
+  HelpText<"Reports similar pieces of code.">,
+  DescFile<"CloneChecker.cpp">;
+
+} // end "clone"
+
Index: include/clang/Analysis/CloneDetection.h
===================================================================
--- /dev/null
+++ include/clang/Analysis/CloneDetection.h
@@ -0,0 +1,234 @@
+//===--- CloneDetection.h - Finds code clones in an AST ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// /file
+/// This file defines classes for searching and anlyzing source code clones.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_AST_CLONEDETECTION_H
+#define LLVM_CLANG_AST_CLONEDETECTION_H
+
+#include "clang/Basic/SourceLocation.h"
+#include "llvm/ADT/StringMap.h"
+
+#include <vector>
+
+namespace clang {
+
+class Stmt;
+class ASTContext;
+class CompoundStmt;
+class TranslationUnitDecl;
+
+/// \brief Identifies a list of statements.
+///
+/// Can either identify a single arbitrary Stmt object, a continuous sequence of
+/// child statements inside a CompoundStmt or no statements at all.
+class StmtSequence {
+  /// If this object identifies a sequence of statements inside a CompoundStmt,
+  /// S points to this CompoundStmt. If this object only identifies a single
+  /// Stmt, then S is a pointer to this Stmt.
+  const Stmt *S;
+
+  /// The related ASTContext for S.
+  ASTContext *Context;
+
+  /// If EndIndex is non-zero, then S is a CompoundStmt and this StmtSequence
+  /// instance is representing the CompoundStmt children inside the array
+  /// [StartIndex, EndIndex).
+  unsigned StartIndex;
+  unsigned EndIndex;
+
+public:
+  /// \brief Constructs a StmtSequence holding multiple statements.
+  ///
+  /// The resulting StmtSequence identifies a continuous sequence of statements
+  /// in the body of the given CompoundStmt. Which statements of the body should
+  /// be identified needs to be specified by providing a start and end index
+  /// that describe a non-empty sub-array in the body of the given CompoundStmt.
+  ///
+  /// \param Stmt A CompoundStmt that contains all statements in its body.
+  /// \param Context The ASTContext for the given CompoundStmt.
+  /// \param StartIndex The inclusive start index in the children array of
+  ///                   \p Stmt
+  /// \param EndIndex The exclusive end index in the children array of \p Stmt.
+  StmtSequence(const CompoundStmt *Stmt, ASTContext &Context,
+               unsigned StartIndex, unsigned EndIndex);
+
+  /// \brief Constructs a StmtSequence holding a single statement.
+  ///
+  /// \param Stmt An arbitrary Stmt.
+  /// \param Context The ASTContext for the given Stmt.
+  StmtSequence(const Stmt *Stmt, ASTContext &Context);
+
+  /// \brief Constructs an empty StmtSequence.
+  StmtSequence();
+
+  typedef const Stmt *const *iterator;
+
+  /// Returns an iterator pointing to the first statement in this sequence.
+  iterator begin() const;
+
+  /// Returns an iterator pointing behind the last statement in this sequence.
+  iterator end() const;
+
+  /// Returns the first statement in this sequence.
+  ///
+  /// This method should only be called on a non-empty StmtSequence object.
+  const Stmt *front() const {
+    assert(!empty());
+    return begin()[0];
+  }
+
+  /// Returns the last statement in this sequence.
+  ///
+  /// This method should only be called on a non-empty StmtSequence object.
+  const Stmt *back() const {
+    assert(!empty());
+    return begin()[size() - 1];
+  }
+
+  /// Returns the number of statements this object holds.
+  unsigned size() const {
+    if (holdsSequence())
+      return EndIndex - StartIndex;
+    if (S == nullptr)
+      return 0;
+    return 1;
+  }
+
+  /// Returns true if and only if this StmtSequence contains no statements.
+  bool empty() const { return size() == 0; }
+
+  /// Returns the related ASTContext for the stored Stmts.
+  ASTContext &getASTContext() const {
+    assert(Context);
+    return *Context;
+  }
+
+  /// Returns true if this objects holds a list of statements.
+  bool holdsSequence() const { return EndIndex != 0; }
+
+  /// Returns the start sourcelocation of the first statement in this sequence.
+  ///
+  /// This method should only be called on a non-empty StmtSequence object.
+  SourceLocation getStartLoc() const;
+
+  /// Returns the end sourcelocation of the last statement in this sequence.
+  ///
+  /// This method should only be called on a non-empty StmtSequence object.
+  SourceLocation getEndLoc() const;
+
+  bool operator==(const StmtSequence &Other) const {
+    return std::tie(S, StartIndex, EndIndex) ==
+           std::tie(Other.S, Other.StartIndex, Other.EndIndex);
+  }
+
+  bool operator!=(const StmtSequence &Other) const {
+    return std::tie(S, StartIndex, EndIndex) !=
+           std::tie(Other.S, Other.StartIndex, Other.EndIndex);
+  }
+
+  /// Returns true if and only if this sequence covers a source range that
+  /// contains the source range of the given sequence \p Other.
+  ///
+  /// This method should only be called on a non-empty StmtSequence object
+  /// and passed a non-empty StmtSequence object.
+  bool contains(const StmtSequence &Other) const;
+};
+
+/// \brief Searches for clones in source code.
+///
+/// First, this class needs a translation unit which is passed via
+/// \p analyzeTranslationUnit . It will then generate and store search data
+/// for all statements inside the given translation unit.
+/// Afterwards the generated data can be used to find code clones by calling
+/// \p findClones .
+///
+/// This class only searches for clones in exectuable source code
+/// (e.g. function bodies). Other clones (e.g. cloned comments or declarations)
+/// are not supported.
+class CloneDetector {
+public:
+  /// Holds the data about a StmtSequence that is needed during the search for
+  /// code clones.
+  struct CloneSignature {
+    /// \brief Holds all relevant data of a StmtSequence.
+    ///
+    /// If this variable is equal for two different StmtSequences, then they can
+    /// be considered clones of each other.
+    std::vector<unsigned> Data;
+
+    /// \brief The complexity of the StmtSequence.
+    ///
+    /// This scalar value serves as a simple way of filtering clones that are
+    /// too small to be reported. A greater value indicates that the related
+    /// StmtSequence is probably more interesting to the user.
+    unsigned Complexity;
+
+    /// \brief Creates an empty CloneSignature without any data.
+    CloneSignature() : Complexity(1) {}
+
+    CloneSignature(const std::vector<unsigned> &Data, unsigned Complexity)
+        : Data(Data), Complexity(Complexity) {}
+
+    /// \brief Adds the data from the given CloneSignature to this one.
+    void add(const CloneSignature &Other) {
+      Data.insert(Data.end(), Other.Data.begin(), Other.Data.end());
+      Complexity += Other.Complexity;
+    }
+  };
+
+  /// Holds group of StmtSequences that are clones of each other and a single
+  /// CloneSignature that all stored StmtSequences have in common.
+  struct CloneGroup {
+    std::vector<StmtSequence> Sequences;
+    CloneSignature Signature;
+
+    CloneGroup(const StmtSequence &Seq, const CloneSignature &Signature)
+        : Signature(Signature) {
+      Sequences.push_back(Seq);
+    }
+
+    /// \brief Returns false if and only if this group should be skipped when
+    ///        searching for clones.
+    bool IsValid() const {
+      // A clone group with only one member makes no sense, so we skip them.
+      return Sequences.size() > 1;
+    }
+  };
+
+  /// \brief Generates and stores search data for all statements in the given
+  /// translation unit.
+  void analyzeTranslationUnit(TranslationUnitDecl *TU);
+
+  /// \brief Stores the CloneSignature to allow future querying.
+  void add(const StmtSequence &S, const CloneSignature &Signature);
+
+  /// \brief Searches the provided statements for clones.
+  ///
+  /// \param Result Output parameter that is filled with a list of found
+  ///               clone groups. Each group contains multiple StmtSequences
+  ///               that were identified to be clones of each other.
+  /// \param MinGroupComplexity Only return clones which have at least this
+  ///                           complexity value.
+  void findClones(std::vector<CloneGroup> &Result, unsigned MinGroupComplexity);
+
+private:
+  /// Stores all found clone groups including invalid groups with only a single
+  /// statement.
+  std::vector<CloneGroup> CloneGroups;
+  /// Maps search data to its related index in the \p CloneGroups vector.
+  llvm::StringMap<std::size_t> CloneGroupIndexes;
+};
+
+} // end namespace clang
+
+#endif // LLVM_CLANG_AST_CLONEDETECTION_H
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to