[cfe-commits] [PATCH] Initial version of formatting library

Daniel Jasper Thu, 25 Oct 2012 06:14:24 -0700

Hi klimek,

This formatting library will be used by a stand-alone clang-format tool and can 
also be used when writing other refactorings.


Manuel's original design document:
https://docs.google.com/a/google.com/document/d/1gpckL2U_6QuU9YW2L1ABsc4Fcogn5UngKk7fE5dDOoA/edit

This is still far from being finished or useful for formatting real code, but I
think it should be checked in to get as much feedback as possible and to
collaborate on certain parts.

http://llvm-reviews.chandlerc.com/D80

Files:
  include/clang/Format/Format.h
  lib/CMakeLists.txt
  lib/Format/CMakeLists.txt
  lib/Format/Format.cpp
  lib/Format/Makefile
  lib/Makefile
  unittests/CMakeLists.txt
  unittests/Format/CMakeLists.txt
  unittests/Format/FormatTest.cpp
  unittests/Format/Makefile

Index: include/clang/Format/Format.h
===================================================================
--- /dev/null
+++ include/clang/Format/Format.h
@@ -0,0 +1,46 @@
+//===--- Format.h - Format C++ code -----------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//  This is EXPERIMENTAL code under heavy development. It is not yet in a state
+//  where it can be used to format real code.
+//
+//  Various functions to configurably format source code.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_FORMAT_FORMAT_H
+#define LLVM_CLANG_FORMAT_FORMAT_H
+
+#include "clang/Frontend/FrontendAction.h"
+#include "clang/Tooling/Refactoring.h"
+
+namespace clang {
+
+class Lexer;
+class SourceManager;
+
+namespace format {
+
+/// \brief Describes a span of source text by its start offset and its length.
+struct CodeRange {
+  unsigned Offset;
+  unsigned Length;
+
+  CodeRange(unsigned Offset, unsigned Length)
+    : Offset(Offset), Length(Length) {}
+};
+
+/// \brief Reformats the tokens lexed from \c Lex; \c Ranges is not yet honored.
+tooling::Replacements reformat(Lexer &Lex, SourceManager &Sources,
+                               std::vector<CodeRange> Ranges);
+
+} // end namespace format
+} // end namespace clang
+
+#endif // LLVM_CLANG_FORMAT_FORMAT_H
Index: lib/CMakeLists.txt
===================================================================
--- lib/CMakeLists.txt
+++ lib/CMakeLists.txt
@@ -16,3 +16,4 @@
 add_subdirectory(FrontendTool)
 add_subdirectory(Tooling)
 add_subdirectory(StaticAnalyzer)
+add_subdirectory(Format)
Index: lib/Format/CMakeLists.txt
===================================================================
--- /dev/null
+++ lib/Format/CMakeLists.txt
@@ -0,0 +1,23 @@
+set(LLVM_LINK_COMPONENTS support)
+
+add_clang_library(clangFormat
+  Format.cpp
+  )
+
+add_dependencies(clangFormat
+  ClangAttrClasses
+  ClangAttrList
+  ClangDeclNodes
+  ClangDiagnosticCommon
+  ClangDiagnosticFrontend
+  ClangStmtNodes
+  )
+
+target_link_libraries(clangFormat
+  clangBasic
+  clangFrontend
+  clangAST
+  clangASTMatchers
+  clangRewriteCore
+  clangRewriteFrontend
+  )
Index: lib/Format/Format.cpp
===================================================================
--- /dev/null
+++ lib/Format/Format.cpp
@@ -0,0 +1,305 @@
+//===--- Format.cpp - Format C++ code -------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//  This is EXPERIMENTAL code under heavy development. It is not yet in a state
+//  where it can be used to format real code.
+//
+//  Implements Format.h.
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Format/Format.h"
+
+#include "clang/Basic/SourceManager.h"
+#include "clang/Lex/Lexer.h"
+
+namespace clang {
+namespace format {
+
+// An unbreakable unit of tokens; every counter starts out as zero.
+// All characters between entities will be up for reformatting.
+struct UnbreakableEntity {
+  UnbreakableEntity() : Length(0), NewlinesBefore(0), WhiteSpaceLength(0) {}
+  unsigned Length;
+  std::vector<Token> Tokens;
+  unsigned NewlinesBefore;
+
+  unsigned WhiteSpaceLength;
+  SourceLocation WhiteSpaceStart;
+};
+
+struct FormatToken { // A token plus the tok::unknown run lexed before it.
+  Token Tok;
+
+  unsigned NewlinesBefore; // Preceding tok::unknown tokens containing '\n'.
+  unsigned WhiteSpaceLength; // Total length of preceding tok::unknown tokens.
+  SourceLocation WhiteSpaceStart; // Where that run starts (else Tok's start).
+};
+
+class Formatter {
+public:
+  Formatter(Lexer &Lex, SourceManager &Sources,
+            const std::vector<CodeRange> &Ranges) // NOTE(review): unused.
+      : Lex(Lex), Sources(Sources), EndOfFile(false) {}
+
+  /// \brief Lexes all tokens, attaching preceding tok::unknown tokens to each
+  /// as whitespace, then formats them and returns the replacements.
+  tooling::Replacements format() {
+    Lex.SetKeepWhitespaceMode(true);
+
+    FormatToken NextToken;
+    NextToken.WhiteSpaceLength = 0;
+
+    // Read token stream and turn tokens into FormatTokens.
+    while (!EndOfFile) {
+      NextToken.Tok = getNextToken();
+      if (NextToken.WhiteSpaceLength == 0) {
+        NextToken.WhiteSpaceStart = NextToken.Tok.getLocation();
+        NextToken.NewlinesBefore = 0;
+      }
+      if (NextToken.Tok.getKind() == tok::unknown) {
+        StringRef Data(Sources.getCharacterData(NextToken.Tok.getLocation()),
+                       NextToken.Tok.getLength());
+        if (std::find(Data.begin(), Data.end(), '\n') != Data.end())
+          ++NextToken.NewlinesBefore;
+        NextToken.WhiteSpaceLength += NextToken.Tok.getLength();
+        continue;
+      }
+      Tokens.push_back(NextToken);
+      NextToken.WhiteSpaceLength = 0;
+    }
+
+    splitAndFormatContinuations();
+
+    return Replaces;
+  }
+
+private:
+  /// \brief Split token stream into continuations, i.e. something that we'd
+  /// put on a single line if we didn't have a column limit.
+  void splitAndFormatContinuations() {
+    unsigned Level = 0; // Nesting depth; callees indent by Level * 2.
+    unsigned ParenLevel = 0;
+    unsigned ContinuationStart = 0;
+    std::vector<bool> IsCompound; // One entry per level: was it opened by '{'?
+    IsCompound.push_back(true);
+    for (unsigned i = 0; i < Tokens.size(); i++) {
+      if (Tokens[i].Tok.getKind() == tok::l_paren) {
+        ++ParenLevel;
+      } else if (Tokens[i].Tok.getKind() == tok::r_paren) {
+        --ParenLevel; // NOTE(review): wraps around on an unmatched ')'.
+      } else if (ParenLevel == 0) {
+        if (Tokens[i].Tok.getKind() == tok::l_brace ||
+            Tokens[i].Tok.getKind() == tok::r_brace ||
+            Tokens[i].Tok.getKind() == tok::semi) {
+          if (Tokens[i].Tok.getKind() == tok::r_brace) {
+            --Level; // NOTE(review): wraps around on an unmatched '}'.
+            IsCompound.pop_back();
+            addNewline(ContinuationStart, Level);
+            formatContinuation(ContinuationStart, i, Level);
+            // Also leave every brace-less if/for/while level this '}' ends.
+            while (!IsCompound.back()) {
+              --Level;
+              IsCompound.pop_back();
+            }
+          } else {
+            addNewline(ContinuationStart, Level);
+            formatContinuation(ContinuationStart, i, Level);
+          }
+          // A ';' ends every enclosing brace-less if/for/while body.
+          while (Tokens[i].Tok.getKind() == tok::semi && !IsCompound.back()) {
+            --Level;
+            IsCompound.pop_back();
+          }
+
+          if (Tokens[i].Tok.getKind() == tok::l_brace) {
+            ++Level;
+            IsCompound.push_back(true);
+          }
+
+          ContinuationStart = i + 1;
+        }
+        // Otherwise: a token following an if/for/while header that is no '{'.
+        else if (i != ContinuationStart) {
+          if (isIfForOrWhile(Tokens[ContinuationStart].Tok)) {
+            addNewline(ContinuationStart, Level);
+            formatContinuation(ContinuationStart, i - 1, Level);
+            ++Level;
+            IsCompound.push_back(false);
+            ContinuationStart = i;
+          }
+        }
+      }
+    }
+  }
+
+  // The current state when indenting a continuation.
+  struct IndentState {
+    unsigned ParenLevel; // Number of currently open parentheses.
+    unsigned Column; // Column reached after the last token added.
+    std::vector<unsigned> Indent; // Per paren level: indent for a new line.
+    std::vector<unsigned> UsedIndent; // Per paren level: indent last applied.
+  };
+
+  // Append the token at 'Index' to 'State'; a 'DryRun' emits no replacements.
+  void addToken(unsigned Index, bool Newline, bool DryRun, IndentState &State) {
+    if (Tokens[Index].Tok.getKind() == tok::l_paren) {
+      State.UsedIndent.push_back(State.UsedIndent.back());
+      State.Indent.push_back(State.UsedIndent.back() + 4);
+      ++State.ParenLevel;
+    }
+    if (Newline) {
+      if (!DryRun)
+        setWhitespace(Tokens[Index], 1, State.Indent[State.ParenLevel]);
+      State.Column = State.Indent[State.ParenLevel] +
+          Tokens[Index].Tok.getLength();
+      State.UsedIndent[State.ParenLevel] = State.Indent[State.ParenLevel];
+    } else {
+      bool Space = spaceRequiredBetween(Tokens[Index - 1].Tok,
+                                        Tokens[Index].Tok);
+      if (!DryRun)
+        setWhitespace(Tokens[Index], 0, Space ? 1 : 0);
+      if (Tokens[Index - 1].Tok.getKind() == tok::l_paren)
+        State.Indent[State.ParenLevel] = State.Column;
+      State.Column += Tokens[Index].Tok.getLength() + (Space ? 1 : 0);
+    }
+    if (Tokens[Index].Tok.getKind() == tok::r_paren) {
+      --State.ParenLevel;
+      State.Indent.pop_back();
+      State.UsedIndent.pop_back(); // Keep both per-level stacks in sync.
+    }
+  }
+
+  // Returns true if a line break is permitted directly after 'Tok'.
+  bool canBreakAfter(Token Tok) {
+    return Tok.is(tok::comma) || Tok.is(tok::semi) || Tok.is(tok::l_paren);
+  }
+
+  // Calculate the number of lines needed to format the remaining part of the
+  // continuation starting in the state 'State'. If 'NewLine' is set, a new line
+  // will be added after the previous token.
+  // 'Index' - 1 is the token appended by this call; 'EndIndex' is the last
+  // token belonging to the continuation. 'StopAt' is used for optimization. If
+  // we can determine that we'll definitely need more than 'StopAt' additional
+  // lines, we already know of a better solution.
+  int numLines(IndentState State, bool NewLine, unsigned Index,
+               unsigned EndIndex, int StopAt) {
+    count++;
+
+    // We are at the end of the continuation, so we don't need any more lines.
+    if (Index > EndIndex)
+      return 0;
+    // Dry-run the token at 'Index' - 1 on this by-value copy of the state.
+    addToken(Index - 1, NewLine, true, State);
+    if (NewLine)
+      --StopAt;
+
+    // Exceeding 80 columns is bad.
+    if (State.Column > 80)
+      return 10000;
+    // The 'StopAt' budget is used up; report the same penalty value.
+    if (StopAt < 1)
+      return 10000;
+    // Try the next token without and, where allowed, with a preceding break.
+    int NoBreak = numLines(State, false, Index + 1, EndIndex, StopAt);
+    if (!canBreakAfter(Tokens[Index - 1].Tok))
+      return NoBreak + (NewLine ? 1 : 0);
+    int Break = numLines(State, true, Index + 1, EndIndex,
+                         std::min(StopAt, NoBreak));
+    return std::min(NoBreak, Break) + (NewLine ? 1 : 0);
+  }
+
+  // Lays out tokens StartIndex+1..EndIndex of one continuation at 'Level'.
+  void formatContinuation(unsigned StartIndex, unsigned EndIndex,
+                          unsigned Level) {
+    count = 0;
+    IndentState State;
+    State.ParenLevel = 0;
+    State.Column = Level * 2 + Tokens[StartIndex].Tok.getLength();
+    State.UsedIndent.push_back(Level * 2);
+    State.Indent.push_back(Level * 2 + 4);
+    for (unsigned Next = StartIndex + 1; Next <= EndIndex; ++Next) {
+      bool Break = Tokens[Next].NewlinesBefore > 0; // Keep existing breaks.
+      if (!Break) {
+        int Unbroken = numLines(State, false, Next + 1, EndIndex, 100000);
+        Break = numLines(State, true, Next + 1, EndIndex, 100000) < Unbroken;
+      }
+      addToken(Next, Break, false, State);
+    }
+    llvm::outs() << "Tried combinations: " << count << "\n";
+  }
+
+  // Replaces the whitespace before 'Tok' with NewLines '\n' and Spaces ' '.
+  void setWhitespace(const FormatToken& Tok, unsigned NewLines,
+                     unsigned Spaces) {
+    std::string Text = std::string(NewLines, '\n') + std::string(Spaces, ' ');
+    Replaces.insert(tooling::Replacement(Sources, Tok.WhiteSpaceStart,
+                                         Tok.WhiteSpaceLength, Text));
+  }
+
+  // Returns true if 'Tok' is a raw identifier spelled "for", "while" or "if".
+  bool isIfForOrWhile(Token Tok) {
+    if (!Tok.is(tok::raw_identifier))
+      return false;
+    StringRef Id(Sources.getCharacterData(Tok.getLocation()), Tok.getLength());
+    return Id == "if" || Id == "for" || Id == "while";
+  }
+
+  // Decides whether a single space separates two adjacent tokens.
+  bool spaceRequiredBetween(Token Left, Token Right) {
+    // Never put a space on either side of '.' or ':'.
+    if (Left.is(tok::period) || Right.is(tok::period) ||
+        Left.is(tok::colon) || Right.is(tok::colon))
+      return false;
+    // No space between '++' and an identifier, nor after '('.
+    if ((Left.is(tok::plusplus) && Right.is(tok::raw_identifier)) ||
+        Left.is(tok::l_paren))
+      return false;
+    // No space in front of ')', ';' or ','.
+    if (Right.is(tok::r_paren) || Right.is(tok::semi) || Right.is(tok::comma))
+      return false;
+    // A '(' gets a leading space only after if/for/while.
+    return !Right.is(tok::l_paren) || isIfForOrWhile(Left);
+  }
+
+  Token getNextToken() { // Also stores LexFromRawLexer's result in EndOfFile.
+    Token Next;
+    EndOfFile = Lex.LexFromRawLexer(Next);
+    return Next;
+  }
+
+  /// \brief Starts token \c Index on its own line, indented for \c Level.
+  void addNewline(unsigned Index, unsigned Level) {
+    const FormatToken &Tok = Tokens[Index];
+    if (!Tok.WhiteSpaceStart.isValid())
+      return;
+    // Keep the recorded newline count, but force one after the first token.
+    bool ForceBreak = Tok.NewlinesBefore == 0 && Index != 0;
+    setWhitespace(Tok, ForceBreak ? 1 : Tok.NewlinesBefore, Level * 2);
+  }
+
+  Lexer &Lex;
+  SourceManager &Sources;
+  bool EndOfFile; // Set from Lex.LexFromRawLexer() in getNextToken().
+  tooling::Replacements Replaces; // Filled by setWhitespace().
+  std::vector<UnbreakableEntity> Entities; // NOTE(review): never referenced.
+  std::vector<FormatToken> Tokens; // Every token that is not tok::unknown.
+
+  // Count number of tried states visited when formatting a continuation.
+  unsigned int count;
+};
+
+// Entry point declared in Format.h: runs a one-shot Formatter over \c Lex.
+tooling::Replacements reformat(Lexer &Lex, SourceManager &Sources,
+                               std::vector<CodeRange> Ranges) {
+  return Formatter(Lex, Sources, Ranges).format();
+}
+
+}  // namespace format
+}  // namespace clang
Index: lib/Format/Makefile
===================================================================
--- /dev/null
+++ lib/Format/Makefile
@@ -0,0 +1,13 @@
+##===- clang/lib/Format/Makefile ----------------------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+CLANG_LEVEL := ../..
+LIBRARYNAME := clangFormat
+
+include $(CLANG_LEVEL)/Makefile
Index: lib/Makefile
===================================================================
--- lib/Makefile
+++ lib/Makefile
@@ -10,7 +10,7 @@
 
 PARALLEL_DIRS = Headers Basic Lex Parse AST ASTMatchers Sema CodeGen Analysis \
                 StaticAnalyzer Edit Rewrite ARCMigrate Serialization Frontend \
-                FrontendTool Tooling Driver
+                FrontendTool Tooling Driver Format
 
 include $(CLANG_LEVEL)/Makefile
 
Index: unittests/CMakeLists.txt
===================================================================
--- unittests/CMakeLists.txt
+++ unittests/CMakeLists.txt
@@ -15,3 +15,4 @@
 add_subdirectory(Lex)
 add_subdirectory(Frontend)
 add_subdirectory(Tooling)
+add_subdirectory(Format)
Index: unittests/Format/CMakeLists.txt
===================================================================
--- /dev/null
+++ unittests/Format/CMakeLists.txt
@@ -0,0 +1,17 @@
+set(LLVM_LINK_COMPONENTS
+  ${LLVM_TARGETS_TO_BUILD}
+  asmparser
+  support
+  mc
+  )
+
+add_clang_unittest(FormatTests
+  FormatTest.cpp
+  )
+
+target_link_libraries(FormatTests
+  clangAST
+  clangFormat
+  clangTooling
+  clangRewriteCore
+  )
Index: unittests/Format/FormatTest.cpp
===================================================================
--- /dev/null
+++ unittests/Format/FormatTest.cpp
@@ -0,0 +1,110 @@
+//===- unittest/Format/FormatTest.cpp - Formatting unit tests -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "../Tooling/RewriterTestContext.h"
+#include "clang/Lex/Lexer.h"
+#include "clang/Format/Format.h"
+#include "gtest/gtest.h"
+
+namespace clang {
+namespace format {
+
+class FormatTest : public ::testing::Test { // Shared helper for all tests.
+protected: // format(): runs reformat() on in-memory Code, returns the result.
+  std::string format(llvm::StringRef Code, unsigned offset, unsigned length) {
+    RewriterTestContext Context;
+    FileID ID = Context.createInMemoryFile("input.cc", Code);
+    std::vector<CodeRange> Ranges(1, CodeRange(offset, length));
+    Lexer Lex(ID, Context.Sources.getBuffer(ID), Context.Sources,
+              LangOptions());
+    tooling::Replacements Replace = reformat(Lex, Context.Sources, Ranges);
+    EXPECT_TRUE(applyAllReplacements(Replace, Context.Rewrite));
+    llvm::outs() << Context.getRewrittenText(ID) << "\n"; // Echo the result.
+    return Context.getRewrittenText(ID);
+  }
+};
+
+TEST_F(FormatTest, DoesNotChangeCorrectlyFormatedCode) {
+  EXPECT_EQ(";", format(";", 0, 1));
+}
+
+TEST_F(FormatTest, FormatsGlobalStatementsAt0) {
+  EXPECT_EQ("int i;", format("  int i;", 0, 1));
+  EXPECT_EQ("\nint i;", format(" \n\t \r  int i;", 0, 1));
+  EXPECT_EQ("int i;\nint j;", format("    int i; int j;", 0, 1));
+  EXPECT_EQ("int i;\nint j;", format("    int i;\n  int j;", 0, 1));
+}
+
+TEST_F(FormatTest, FormatsContinuationsAtFirstFormat) {
+  EXPECT_EQ("int\n    i;", format("int\ni;", 0, 1));
+}
+
+TEST_F(FormatTest, FormatsNestedBlockStatements) {
+  EXPECT_EQ("{\n  {\n    {\n    }\n  }\n}", format("{{{}}}", 0, 1));
+}
+
+TEST_F(FormatTest, FormatsForLoop) {
+  EXPECT_EQ("for (int i = 0; i < 10; ++i);",
+            format("for(int i=0;i<10;++i);", 0 , 1));
+  EXPECT_EQ("for (int i = 0;\n     i < 10;\n     ++i);",
+            format("for(int i=0;\ni<10;\n++i);", 0 , 1));
+}
+
+TEST_F(FormatTest, FormatsWhileLoop) {
+  EXPECT_EQ("while (true) {\n}", format("while(true){}", 0, 1));
+}
+
+TEST_F(FormatTest, FormatsNestedCall) {
+  EXPECT_EQ("Method(1,\n"
+            "       2(\n"
+            "           3));",
+            format("Method(1,\n2(\n3));", 0, 1));
+  EXPECT_EQ("Method(1(2,\n"
+            "         3()));", format("Method(1(2,\n3()));", 0, 1));
+}
+
+TEST_F(FormatTest, FormatsAwesomeMethodCall) {
+  EXPECT_EQ(
+      "SomeLongMethodName(SomeReallyLongMethod(CallOtherReallyLongMethod(\n"
+      "    parameter, parameter, parameter)), SecondLongCall(some_parameter));",
+      format(
+          "SomeLongMethodName(SomeReallyLongMethod(CallOtherReallyLongMethod(\n"
+          "parameter , parameter, parameter)), SecondLongCall("
+          "some_parameter) );", 0, 1));
+  EXPECT_EQ(
+      "SomeLongMethodName(SomeReallyLongMethod(CallOtherReallyLongMethod(\n"
+      "    parameter, parameter, parameter)), SecondLongCall(some_parameter));",
+      format(
+          "SomeLongMethodName(SomeReallyLongMethod(CallOtherReallyLongMethod("
+          "parameter,parameter,parameter)),SecondLongCall("
+          "some_parameter) );", 0, 1));
+}
+
+TEST_F(FormatTest, FormatsFunctionDefinition) {
+  EXPECT_EQ(
+      "void f(int a, int b, int c, int d, int e, int f, int g,"
+      " int h, int j, int f,\n       int c, int ddddddddddddd) {\n}",
+      format("void f(int a, int b, int c, int d, int e, int f, int g,"
+        "int h, int j, int f, int c, int ddddddddddddd) {}", 0, 1));
+}
+
+TEST_F(FormatTest, FormatIfWithoutCompountStatement) {
+  EXPECT_EQ(
+      "if (true)\n  f();\ng();",
+      format("if (true) f(); g();", 0, 1));
+  EXPECT_EQ(
+      "if (a)\n  if (b)\n    if (c)\n      g();\nh();",
+      format("if(a)if(b)if(c)g();h();", 0, 1));
+  EXPECT_EQ(
+      "if (a)\n  if (b) {\n    f();\n  }\ng();",
+      format("if(a)if(b) {f();}g();", 0, 1));
+}
+
+} // end namespace format
+} // end namespace clang
Index: unittests/Format/Makefile
===================================================================
--- /dev/null
+++ unittests/Format/Makefile
@@ -0,0 +1,19 @@
+##===- unittests/Format/Makefile ---------------------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+CLANG_LEVEL = ../..
+TESTNAME = Format
+include $(CLANG_LEVEL)/../../Makefile.config
+LINK_COMPONENTS := $(TARGETS_TO_BUILD) asmparser support mc
+USEDLIBS = clangFormat.a clangTooling.a clangFrontend.a clangSerialization.a \
+           clangDriver.a clangParse.a clangRewriteCore.a \
+           clangRewriteFrontend.a clangSema.a clangAnalysis.a clangEdit.a \
+           clangAST.a clangASTMatchers.a clangLex.a clangBasic.a
+
+include $(CLANG_LEVEL)/unittests/Makefile

_______________________________________________
cfe-commits mailing list
[email protected]
http://lists.cs.uiuc.edu/mailman/listinfo/cfe-commits

[cfe-commits] [PATCH] Initial version of formatting library

Reply via email to