[PATCH] D76291: [Support] Fix formatted_raw_ostream for UTF-8

Oliver Stannard (Linaro) via Phabricator via cfe-commits Thu, 02 Jul 2020 03:11:09 -0700

ostannard updated this revision to Diff 275029.
ostannard marked 5 inline comments as done.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D76291/new/

https://reviews.llvm.org/D76291

Files:
  clang/test/Analysis/checker-plugins.c
  llvm/include/llvm/Support/FormattedStream.h
  llvm/lib/Support/FormattedStream.cpp
  llvm/test/MC/ARM/lsl-zero.s
  llvm/unittests/Support/formatted_raw_ostream_test.cpp

Index: llvm/unittests/Support/formatted_raw_ostream_test.cpp
===================================================================
--- llvm/unittests/Support/formatted_raw_ostream_test.cpp
+++ llvm/unittests/Support/formatted_raw_ostream_test.cpp
@@ -29,4 +29,143 @@
   }
 }
 
+TEST(formatted_raw_ostreamTest, Test_LineColumn) {
+  // Test tracking of line and column numbers in a stream.
+  SmallString<128> A;
+  raw_svector_ostream B(A);
+  formatted_raw_ostream C(B);
+
+  EXPECT_EQ(0U, C.getLine());
+  EXPECT_EQ(0U, C.getColumn());
+
+  C << "a";
+  EXPECT_EQ(0U, C.getLine());
+  EXPECT_EQ(1U, C.getColumn());
+
+  C << "bcdef";
+  EXPECT_EQ(0U, C.getLine());
+  EXPECT_EQ(6U, C.getColumn());
+
+  // '\n' increments line number, sets column to zero.
+  C << "\n";
+  EXPECT_EQ(1U, C.getLine());
+  EXPECT_EQ(0U, C.getColumn());
+
+  // '\r' sets column to zero without changing line number.
+  C << "foo\r";
+  EXPECT_EQ(1U, C.getLine());
+  EXPECT_EQ(0U, C.getColumn());
+
+  // '\t' advances column to the next multiple of 8.
+  // FIXME: If the column number is already a multiple of 8 this will do
+  // nothing. Is this behaviour correct?
+  C << "1\t";
+  EXPECT_EQ(8U, C.getColumn());
+  C << "\t";
+  EXPECT_EQ(8U, C.getColumn());
+  C << "1234567\t";
+  EXPECT_EQ(16U, C.getColumn());
+  EXPECT_EQ(1U, C.getLine());
+}
+
+TEST(formatted_raw_ostreamTest, Test_Flush) {
+  // Flushing the buffer causes the characters in the buffer to be scanned
+  // before the buffer is emptied, so line and column numbers will still be
+  // tracked properly.
+  SmallString<128> A;
+  raw_svector_ostream B(A);
+  B.SetBufferSize(32);
+  formatted_raw_ostream C(B);
+
+  C << "\nabc";
+  EXPECT_EQ(4U, C.GetNumBytesInBuffer());
+  C.flush();
+  EXPECT_EQ(1U, C.getLine());
+  EXPECT_EQ(3U, C.getColumn());
+  EXPECT_EQ(0U, C.GetNumBytesInBuffer());
+}
+
+TEST(formatted_raw_ostreamTest, Test_UTF8) {
+  SmallString<128> A;
+  raw_svector_ostream B(A);
+  B.SetBufferSize(32);
+  formatted_raw_ostream C(B);
+
+  // U+00A0 Non-breaking space: encoded as two bytes, but only one column wide.
+  C << u8"\u00a0";
+  EXPECT_EQ(0U, C.getLine());
+  EXPECT_EQ(1U, C.getColumn());
+  EXPECT_EQ(2U, C.GetNumBytesInBuffer());
+
+  // U+2468 CIRCLED DIGIT NINE: encoded as three bytes, but only one column
+  // wide.
+  C << u8"\u2468";
+  EXPECT_EQ(0U, C.getLine());
+  EXPECT_EQ(2U, C.getColumn());
+  EXPECT_EQ(5U, C.GetNumBytesInBuffer());
+
+  // U+00010000 LINEAR B SYLLABLE B008 A: encoded as four bytes, but only one
+  // column wide.
+  C << u8"\U00010000";
+  EXPECT_EQ(0U, C.getLine());
+  EXPECT_EQ(3U, C.getColumn());
+  EXPECT_EQ(9U, C.GetNumBytesInBuffer());
+
+  // U+55B5, CJK character, encodes as three bytes, takes up two columns.
+  C << u8"\u55b5";
+  EXPECT_EQ(0U, C.getLine());
+  EXPECT_EQ(5U, C.getColumn());
+  EXPECT_EQ(12U, C.GetNumBytesInBuffer());
+
+  // U+200B, zero-width space, encoded as three bytes but has no effect on the
+  // column or line number.
+  C << u8"\u200b";
+  EXPECT_EQ(0U, C.getLine());
+  EXPECT_EQ(5U, C.getColumn());
+  EXPECT_EQ(15U, C.GetNumBytesInBuffer());
+}
+
+TEST(formatted_raw_ostreamTest, Test_UTF8Buffered) {
+  SmallString<128> A;
+  raw_svector_ostream B(A);
+  B.SetBufferSize(4);
+  formatted_raw_ostream C(B);
+
+  // U+2468 encodes as three bytes, so will cause the buffer to be flushed after
+  // the first byte (4 byte buffer, 3 bytes already written). We need to save
+  // the first part of the UTF-8 encoding until after the buffer is cleared and
+  // the remaining two bytes are written, at which point we can check the
+  // display width. In this case the display width is 1, so we end at column 4,
+  // with 6 bytes written in total, 2 of which are in the buffer.
+  C << u8"123\u2468";
+  EXPECT_EQ(0U, C.getLine());
+  EXPECT_EQ(4U, C.getColumn());
+  EXPECT_EQ(2U, C.GetNumBytesInBuffer());
+  C.flush();
+  EXPECT_EQ(6U, A.size());
+
+  // Same as above, but with a CJK character which displays as two columns.
+  C << u8"123\u55b5";
+  EXPECT_EQ(0U, C.getLine());
+  EXPECT_EQ(9U, C.getColumn());
+  EXPECT_EQ(2U, C.GetNumBytesInBuffer());
+  C.flush();
+  EXPECT_EQ(12U, A.size());
+}
+
+TEST(formatted_raw_ostreamTest, Test_UTF8TinyBuffer) {
+  SmallString<128> A;
+  raw_svector_ostream B(A);
+  B.SetBufferSize(1);
+  formatted_raw_ostream C(B);
+
+  // The stream has a one-byte buffer, so it gets flushed multiple times while
+  // printing a single Unicode character.
+  C << u8"\u2468";
+  EXPECT_EQ(0U, C.getLine());
+  EXPECT_EQ(1U, C.getColumn());
+  EXPECT_EQ(0U, C.GetNumBytesInBuffer());
+  C.flush();
+  EXPECT_EQ(3U, A.size());
+}
 }
Index: llvm/test/MC/ARM/lsl-zero.s
===================================================================
--- llvm/test/MC/ARM/lsl-zero.s
+++ llvm/test/MC/ARM/lsl-zero.s
@@ -1,6 +1,6 @@
-// RUN: llvm-mc -triple=thumbv7 -show-encoding < %s 2>&1 | FileCheck --check-prefix=CHECK --check-prefix=CHECK-NONARM --check-prefix=CHECK-THUMBV7 %s
-// RUN: llvm-mc -triple=thumbv8 -show-encoding < %s 2>&1 | FileCheck --check-prefix=CHECK --check-prefix=CHECK-NONARM --check-prefix=CHECK-THUMBV8 %s
-// RUN: llvm-mc -triple=armv7 -show-encoding < %s 2>&1 | FileCheck --check-prefix=CHECK --check-prefix=CHECK-ARM %s
+// RUN: llvm-mc -triple=thumbv7 -show-encoding < %s 2>/dev/null | FileCheck --check-prefix=CHECK --check-prefix=CHECK-NONARM --check-prefix=CHECK-THUMBV7 %s
+// RUN: llvm-mc -triple=thumbv8 -show-encoding < %s 2>/dev/null | FileCheck --check-prefix=CHECK --check-prefix=CHECK-NONARM --check-prefix=CHECK-THUMBV8 %s
+// RUN: llvm-mc -triple=armv7 -show-encoding < %s 2>/dev/null | FileCheck --check-prefix=CHECK --check-prefix=CHECK-ARM %s
 
         // lsl #0 is actually mov, so here we check that it behaves the same as
         // mov with regards to the permitted registers and how it behaves in an
Index: llvm/lib/Support/FormattedStream.cpp
===================================================================
--- llvm/lib/Support/FormattedStream.cpp
+++ llvm/lib/Support/FormattedStream.cpp
@@ -11,7 +11,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/Unicode.h"
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
 
@@ -19,16 +21,22 @@
 
 /// UpdatePosition - Examine the given char sequence and figure out which
 /// column we end up in after output, and how many line breaks are contained.
-///
-static void UpdatePosition(std::pair<unsigned, unsigned> &Position, const char *Ptr, size_t Size) {
+/// This assumes that the input string is well-formed UTF-8, and takes into
+/// account Unicode characters which render as multiple columns wide.
+void formatted_raw_ostream::UpdatePosition(const char *Ptr, size_t Size) {
   unsigned &Column = Position.first;
   unsigned &Line = Position.second;
 
-  // Keep track of the current column and line by scanning the string for
-  // special characters
-  for (const char *End = Ptr + Size; Ptr != End; ++Ptr) {
-    ++Column;
-    switch (*Ptr) {
+  auto ProcessUTF8CodePoint = [&Line, &Column](StringRef CP) {
+    int Width = sys::unicode::columnWidthUTF8(CP);
+    if (Width != sys::unicode::ErrorNonPrintableCharacter)
+      Column += Width;
+
+    // The only special whitespace characters we care about are single-byte.
+    if (CP.size() > 1)
+      return;
+
+    switch (CP[0]) {
     case '\n':
       Line += 1;
       LLVM_FALLTHROUGH;
@@ -40,6 +48,46 @@
       Column += (8 - (Column & 0x7)) & 0x7;
       break;
     }
+  };
+
+  // If we have a partial UTF-8 sequence from the previous buffer, check that
+  // first.
+  if (PartialUTF8Char.size()) {
+    size_t BytesFromBuffer =
+        getNumBytesForUTF8(PartialUTF8Char[0]) - PartialUTF8Char.size();
+    if (Size < BytesFromBuffer) {
+      // If we still don't have enough bytes for a complete code point, just
+      // append what we have.
+      PartialUTF8Char.append(StringRef(Ptr, Size));
+      return;
+    } else {
+      // The first few bytes from the buffer will complete the code point.
+      // Concatenate them and process their effect on the line and column
+      // numbers.
+      PartialUTF8Char.append(StringRef(Ptr, BytesFromBuffer));
+      ProcessUTF8CodePoint(PartialUTF8Char);
+      PartialUTF8Char.clear();
+      Ptr += BytesFromBuffer;
+      Size -= BytesFromBuffer;
+    }
+  }
+
+  // Now scan the rest of the buffer.
+  unsigned NumBytes;
+  for (const char *End = Ptr + Size; Ptr < End; Ptr += NumBytes) {
+    NumBytes = getNumBytesForUTF8(*Ptr);
+
+    // The buffer might end part way through a UTF-8 code unit sequence for a
+    // Unicode scalar value if it got flushed. If this happens, we can't know
+    // the display width until we see the rest of the code point. Stash the
+    // bytes we do have, so that we can reconstruct the whole code point later,
+    // even if the buffer is being flushed.
+    if ((End - Ptr) < NumBytes) {
+      PartialUTF8Char = StringRef(Ptr, End - Ptr);
+      return;
+    }
+
+    ProcessUTF8CodePoint(StringRef(Ptr, NumBytes));
   }
 }
 
@@ -52,9 +100,9 @@
   if (Ptr <= Scanned && Scanned <= Ptr + Size)
     // Scan all characters added since our last scan to determine the new
     // column.
-    UpdatePosition(Position, Scanned, Size - (Scanned - Ptr));
+    UpdatePosition(Scanned, Size - (Scanned - Ptr));
   else
-    UpdatePosition(Position, Ptr, Size);
+    UpdatePosition(Ptr, Size);
 
   // Update the scanning pointer.
   Scanned = Ptr + Size;
Index: llvm/include/llvm/Support/FormattedStream.h
===================================================================
--- llvm/include/llvm/Support/FormattedStream.h
+++ llvm/include/llvm/Support/FormattedStream.h
@@ -14,6 +14,7 @@
 #ifndef LLVM_SUPPORT_FORMATTEDSTREAM_H
 #define LLVM_SUPPORT_FORMATTEDSTREAM_H
 
+#include "llvm/ADT/SmallString.h"
 #include "llvm/Support/raw_ostream.h"
 #include <utility>
 
@@ -21,8 +22,11 @@
 
 /// formatted_raw_ostream - A raw_ostream that wraps another one and keeps track
 /// of line and column position, allowing padding out to specific column
-/// boundaries and querying the number of lines written to the stream.
-///
+/// boundaries and querying the number of lines written to the stream. This
+/// assumes that the contents of the stream are valid UTF-8 encoded text. This
+/// doesn't attempt to handle everything Unicode can do (combining characters,
+/// right-to-left markers, etc), but should cover the cases likely to appear in
+/// source code or diagnostic messages.
 class formatted_raw_ostream : public raw_ostream {
   /// TheStream - The real stream we output to. We set it to be
   /// unbuffered, since we're already doing our own buffering.
@@ -40,6 +44,14 @@
   ///
   const char *Scanned;
 
+  /// PartialUTF8Char - Either empty or a prefix of a UTF-8 code unit sequence
+  /// for a Unicode scalar value which should be prepended to the buffer for the
+  /// next call to ComputePosition. This is needed when the buffer is flushed
+  /// while it ends part-way through the UTF-8 encoding of a Unicode scalar
+  /// value, so that we can compute the display width of the character once we
+  /// have the rest of it.
+  SmallString<4> PartialUTF8Char;
+
   void write_impl(const char *Ptr, size_t Size) override;
 
   /// current_pos - Return the current position within the stream,
@@ -52,10 +64,16 @@
   }
 
   /// ComputePosition - Examine the given output buffer and figure out the new
-  /// position after output.
-  ///
+  /// position after output. This is safe to call multiple times on the same
+  /// buffer, as it records the most recently scanned character and resumes from
+  /// there when the buffer has not been flushed.
   void ComputePosition(const char *Ptr, size_t size);
 
+  /// UpdatePosition - Scan the characters in [Ptr, Ptr+Size), and update the
+  /// line and column numbers. Unlike ComputePosition, this must be called
+  /// exactly once on each region of the buffer.
+  void UpdatePosition(const char *Ptr, size_t Size);
+
   void setStream(raw_ostream &Stream) {
     releaseStream();
 
@@ -105,11 +123,17 @@
   /// \param NewCol - The column to move to.
   formatted_raw_ostream &PadToColumn(unsigned NewCol);
 
-  /// getColumn - Return the column number
-  unsigned getColumn() { return Position.first; }
+  unsigned getColumn() {
+    // Calculate current position, taking buffer contents into account.
+    ComputePosition(getBufferStart(), GetNumBytesInBuffer());
+    return Position.first;
+  }
 
-  /// getLine - Return the line number
-  unsigned getLine() { return Position.second; }
+  unsigned getLine() {
+    // Calculate current position, taking buffer contents into account.
+    ComputePosition(getBufferStart(), GetNumBytesInBuffer());
+    return Position.second;
+  }
 
   raw_ostream &resetColor() override {
     TheStream->resetColor();
Index: clang/test/Analysis/checker-plugins.c
===================================================================
--- clang/test/Analysis/checker-plugins.c
+++ clang/test/Analysis/checker-plugins.c
@@ -116,4 +116,5 @@
 // RUN:   2>&1 | FileCheck %s -check-prefix=CHECK-CHECKER-OPTION-HELP
 
 // CHECK-CHECKER-OPTION-HELP: example.MyChecker:ExampleOption  (bool) This is an
-// CHECK-CHECKER-OPTION-HELP-SAME: example checker opt. (default: false)
+// CHECK-CHECKER-OPTION-HELP-SAME: example checker opt. (default:
+// CHECK-CHECKER-OPTION-HELP-NEXT: false)

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D76291: [Support] Fix formatted_raw_ostream for UTF-8

Reply via email to