[PATCH] D76291: [Support] Fix formatted_raw_ostream for UTF-8

Oliver Stannard (Linaro) via Phabricator via cfe-commits Wed, 18 Mar 2020 04:20:43 -0700

ostannard updated this revision to Diff 251030.
ostannard added a comment.

- I managed to reproduce the lsl-zero.s test failure on Windows. This turned
out to be because the test merges stdout and stderr, which can result in them
being interleaved in ways which break the test. The test doesn't check anything
in stderr, so I fixed it by redirecting stderr to /dev/null.
- The unit test was failing when built with MSVC, fixed by explicitly using 
UTF-8 string literals.



Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D76291/new/

https://reviews.llvm.org/D76291

Files:
  clang/test/Analysis/checker-plugins.c
  llvm/include/llvm/Support/FormattedStream.h
  llvm/lib/Support/FormattedStream.cpp
  llvm/test/MC/ARM/lsl-zero.s
  llvm/unittests/Support/formatted_raw_ostream_test.cpp

Index: llvm/unittests/Support/formatted_raw_ostream_test.cpp
===================================================================
--- llvm/unittests/Support/formatted_raw_ostream_test.cpp
+++ llvm/unittests/Support/formatted_raw_ostream_test.cpp
@@ -29,4 +29,143 @@
   }
 }
 
+TEST(formatted_raw_ostreamTest, Test_LineColumn) {
+  // Test tracking of line and column numbers in a stream.
+  SmallString<128> A;
+  raw_svector_ostream B(A);
+  formatted_raw_ostream C(B);
+
+  EXPECT_EQ(0U, C.getLine());
+  EXPECT_EQ(0U, C.getColumn());
+
+  C << "a";
+  EXPECT_EQ(0U, C.getLine());
+  EXPECT_EQ(1U, C.getColumn());
+
+  C << "bcdef";
+  EXPECT_EQ(0U, C.getLine());
+  EXPECT_EQ(6U, C.getColumn());
+
+  // '\n' increments line number, sets column to zero.
+  C << "\n";
+  EXPECT_EQ(1U, C.getLine());
+  EXPECT_EQ(0U, C.getColumn());
+
+  // '\r' sets the column to zero without changing the line number.
+  C << "foo\r";
+  EXPECT_EQ(1U, C.getLine());
+  EXPECT_EQ(0U, C.getColumn());
+
+  // '\t' advances column to the next multiple of 8.
+  // FIXME: If the column number is already a multiple of 8 this will do
+  // nothing, is this behaviour correct?
+  C << "1\t";
+  EXPECT_EQ(8U, C.getColumn());
+  C << "\t";
+  EXPECT_EQ(8U, C.getColumn());
+  C << "1234567\t";
+  EXPECT_EQ(16U, C.getColumn());
+  EXPECT_EQ(1U, C.getLine());
+}
+
+TEST(formatted_raw_ostreamTest, Test_Flush) {
+  // Flushing the buffer causes the characters in the buffer to be scanned
+  // before the buffer is emptied, so line and column numbers will still be
+  // tracked properly.
+  SmallString<128> A;
+  raw_svector_ostream B(A);
+  B.SetBufferSize(32);
+  formatted_raw_ostream C(B);
+
+  C << "\nabc";
+  EXPECT_EQ(4U, C.GetNumBytesInBuffer());
+  C.flush();
+  EXPECT_EQ(1U, C.getLine());
+  EXPECT_EQ(3U, C.getColumn());
+  EXPECT_EQ(0U, C.GetNumBytesInBuffer());
+}
+
+TEST(formatted_raw_ostreamTest, Test_UTF8) {
+  SmallString<128> A;
+  raw_svector_ostream B(A);
+  B.SetBufferSize(32);
+  formatted_raw_ostream C(B);
+
+  // U+00A0 Non-breaking space: encoded as two bytes, but only one column wide.
+  C << u8"\u00a0";
+  EXPECT_EQ(0U, C.getLine());
+  EXPECT_EQ(1U, C.getColumn());
+  EXPECT_EQ(2U, C.GetNumBytesInBuffer());
+
+  // U+2468 CIRCLED DIGIT NINE: encoded as three bytes, but only one column
+  // wide.
+  C << u8"\u2468";
+  EXPECT_EQ(0U, C.getLine());
+  EXPECT_EQ(2U, C.getColumn());
+  EXPECT_EQ(5U, C.GetNumBytesInBuffer());
+
+  // U+00010000 LINEAR B SYLLABLE B008 A: encoded as four bytes, but only one
+  // column wide.
+  C << u8"\U00010000";
+  EXPECT_EQ(0U, C.getLine());
+  EXPECT_EQ(3U, C.getColumn());
+  EXPECT_EQ(9U, C.GetNumBytesInBuffer());
+
+  // U+55B5, CJK character, encodes as three bytes, takes up two columns.
+  C << u8"\u55b5";
+  EXPECT_EQ(0U, C.getLine());
+  EXPECT_EQ(5U, C.getColumn());
+  EXPECT_EQ(12U, C.GetNumBytesInBuffer());
+
+  // U+200B, zero-width space, encoded as three bytes but has no effect on the
+  // column or line number.
+  C << u8"\u200b";
+  EXPECT_EQ(0U, C.getLine());
+  EXPECT_EQ(5U, C.getColumn());
+  EXPECT_EQ(15U, C.GetNumBytesInBuffer());
+}
+
+TEST(formatted_raw_ostreamTest, Test_UTF8Buffered) {
+  SmallString<128> A;
+  raw_svector_ostream B(A);
+  B.SetBufferSize(4);
+  formatted_raw_ostream C(B);
+
+  // This character encodes as three bytes, so will cause the buffer to be
+  // flushed after the first byte (4 byte buffer, 3 bytes already written). We
+  // need to save the first part of the UTF-8 encoding until after the buffer is
+  // cleared and the remaining two bytes are written, at which point we can
+  // check the display width. In this case the display width is 1, so we end at
+  // column 4, with 6 bytes written in total, 2 of which are in the buffer.
+  C << u8"123\u2468";
+  EXPECT_EQ(0U, C.getLine());
+  EXPECT_EQ(4U, C.getColumn());
+  EXPECT_EQ(2U, C.GetNumBytesInBuffer());
+  C.flush();
+  EXPECT_EQ(6U, A.size());
+
+  // Same as above, but with a CJK character which displays as two columns.
+  C << u8"123\u55b5";
+  EXPECT_EQ(0U, C.getLine());
+  EXPECT_EQ(9U, C.getColumn());
+  EXPECT_EQ(2U, C.GetNumBytesInBuffer());
+  C.flush();
+  EXPECT_EQ(12U, A.size());
+}
+
+TEST(formatted_raw_ostreamTest, Test_UTF8TinyBuffer) {
+  SmallString<128> A;
+  raw_svector_ostream B(A);
+  B.SetBufferSize(1);
+  formatted_raw_ostream C(B);
+
+  // The stream has a one-byte buffer, so it gets flushed multiple times while
+  // printing a single Unicode character.
+  C << u8"\u2468";
+  EXPECT_EQ(0U, C.getLine());
+  EXPECT_EQ(1U, C.getColumn());
+  EXPECT_EQ(0U, C.GetNumBytesInBuffer());
+  C.flush();
+  EXPECT_EQ(3U, A.size());
+}
 }
Index: llvm/test/MC/ARM/lsl-zero.s
===================================================================
--- llvm/test/MC/ARM/lsl-zero.s
+++ llvm/test/MC/ARM/lsl-zero.s
@@ -1,6 +1,6 @@
-// RUN: llvm-mc -triple=thumbv7 -show-encoding < %s 2>&1 | FileCheck --check-prefix=CHECK --check-prefix=CHECK-NONARM --check-prefix=CHECK-THUMBV7 %s
-// RUN: llvm-mc -triple=thumbv8 -show-encoding < %s 2>&1 | FileCheck --check-prefix=CHECK --check-prefix=CHECK-NONARM --check-prefix=CHECK-THUMBV8 %s
-// RUN: llvm-mc -triple=armv7 -show-encoding < %s 2>&1 | FileCheck --check-prefix=CHECK --check-prefix=CHECK-ARM %s
+// RUN: llvm-mc -triple=thumbv7 -show-encoding < %s 2>/dev/null | FileCheck --check-prefix=CHECK --check-prefix=CHECK-NONARM --check-prefix=CHECK-THUMBV7 %s
+// RUN: llvm-mc -triple=thumbv8 -show-encoding < %s 2>/dev/null | FileCheck --check-prefix=CHECK --check-prefix=CHECK-NONARM --check-prefix=CHECK-THUMBV8 %s
+// RUN: llvm-mc -triple=armv7 -show-encoding < %s 2>/dev/null | FileCheck --check-prefix=CHECK --check-prefix=CHECK-ARM %s
 
         // lsl #0 is actually mov, so here we check that it behaves the same as
         // mov with regards to the permitted registers and how it behaves in an
Index: llvm/lib/Support/FormattedStream.cpp
===================================================================
--- llvm/lib/Support/FormattedStream.cpp
+++ llvm/lib/Support/FormattedStream.cpp
@@ -11,7 +11,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/Locale.h"
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
 
@@ -19,16 +21,24 @@
 
 /// UpdatePosition - Examine the given char sequence and figure out which
 /// column we end up in after output, and how many line breaks are contained.
-///
-static void UpdatePosition(std::pair<unsigned, unsigned> &Position, const char *Ptr, size_t Size) {
+/// This assumes that the input string is well-formed UTF-8, and takes into
+/// account Unicode characters which render as multiple columns wide.
+void formatted_raw_ostream::UpdatePosition(const char *Ptr, size_t Size) {
   unsigned &Column = Position.first;
   unsigned &Line = Position.second;
 
-  // Keep track of the current column and line by scanning the string for
-  // special characters
-  for (const char *End = Ptr + Size; Ptr != End; ++Ptr) {
-    ++Column;
-    switch (*Ptr) {
+  auto ProcessCodePoint = [&Line, &Column](StringRef CP) {
+    int Width = sys::locale::columnWidth(CP);
+    // columnWidth returns -1 for non-printing characters.
+    if (Width != -1)
+      Column += Width;
+
+    // If this code point is a multi-byte sequence, it can't be any of the
+    // special single-byte whitespace characters handled below.
+    if (CP.size() > 1)
+      return;
+
+    switch (CP[0]) {
     case '\n':
       Line += 1;
       LLVM_FALLTHROUGH;
@@ -40,6 +50,46 @@
       Column += (8 - (Column & 0x7)) & 0x7;
       break;
     }
+  };
+
+  // If we have a partial UTF-8 sequence from the previous buffer, check that
+  // first.
+  if (PartialUTF8Char.size()) {
+    size_t NumBytes = getNumBytesForUTF8(PartialUTF8Char[0]);
+    if (NumBytes > (PartialUTF8Char.size() + Size)) {
+      // If we still don't have enough bytes for a complete code point, just
+      // append what we have.
+      PartialUTF8Char.append(StringRef(Ptr, Size));
+      return;
+    } else {
+      // The first few bytes from the buffer will complete the code point.
+      // Concatenate them and process their effect on the line and column
+      // numbers.
+      size_t BytesFromBuffer = NumBytes - PartialUTF8Char.size();
+      PartialUTF8Char.append(StringRef(Ptr, BytesFromBuffer));
+      ProcessCodePoint(PartialUTF8Char);
+      PartialUTF8Char.clear();
+      Ptr += BytesFromBuffer;
+      Size -= BytesFromBuffer;
+    }
+  }
+
+  // Now scan the rest of the buffer.
+  unsigned NumBytes;
+  for (const char *End = Ptr + Size; Ptr < End; Ptr += NumBytes) {
+    NumBytes = getNumBytesForUTF8(*Ptr);
+
+    // The buffer might end part way through a UTF-8 code unit sequence for a
+    // Unicode scalar value if it got flushed. If this happens, we can't know
+    // the display width until we see the rest of the code point. Stash the
+    // bytes we do have, so that we can reconstruct the whole code point later,
+    // even if the buffer is being flushed.
+    if ((Ptr + NumBytes) > End) {
+      PartialUTF8Char = StringRef(Ptr, End - Ptr);
+      return;
+    }
+
+    ProcessCodePoint(StringRef(Ptr, NumBytes));
   }
 }
 
@@ -52,9 +102,9 @@
   if (Ptr <= Scanned && Scanned <= Ptr + Size)
     // Scan all characters added since our last scan to determine the new
     // column.
-    UpdatePosition(Position, Scanned, Size - (Scanned - Ptr));
+    UpdatePosition(Scanned, Size - (Scanned - Ptr));
   else
-    UpdatePosition(Position, Ptr, Size);
+    UpdatePosition(Ptr, Size);
 
   // Update the scanning pointer.
   Scanned = Ptr + Size;
Index: llvm/include/llvm/Support/FormattedStream.h
===================================================================
--- llvm/include/llvm/Support/FormattedStream.h
+++ llvm/include/llvm/Support/FormattedStream.h
@@ -14,6 +14,7 @@
 #ifndef LLVM_SUPPORT_FORMATTEDSTREAM_H
 #define LLVM_SUPPORT_FORMATTEDSTREAM_H
 
+#include "llvm/ADT/SmallString.h"
 #include "llvm/Support/raw_ostream.h"
 #include <utility>
 
@@ -40,6 +41,14 @@
   ///
   const char *Scanned;
 
+  /// PartialUTF8Char - Either empty or a prefix of a UTF-8 code unit sequence
+  /// for a Unicode scalar value which should be prepended to the buffer for the
+  /// next call to ComputePosition. This is needed when the buffer is flushed
+  /// when it ends part-way through the UTF-8 encoding of a Unicode scalar
+  /// value, so that we can compute the display width of the character once we
+  /// have the rest of it.
+  SmallString<4> PartialUTF8Char;
+
   void write_impl(const char *Ptr, size_t Size) override;
 
   /// current_pos - Return the current position within the stream,
@@ -52,10 +61,16 @@
   }
 
   /// ComputePosition - Examine the given output buffer and figure out the new
-  /// position after output.
-  ///
+  /// position after output. This is safe to call multiple times on the same
+  /// buffer, as it records the most recently scanned character and resumes from
+  /// there when the buffer has not been flushed.
   void ComputePosition(const char *Ptr, size_t size);
 
+  /// UpdatePosition - scan the characters in [Ptr, Ptr+Size), and update the
+  /// line and column numbers. Unlike ComputePosition, this must be called
+  /// exactly once on each region of the buffer.
+  void UpdatePosition(const char *Ptr, size_t Size);
+
   void setStream(raw_ostream &Stream) {
     releaseStream();
 
@@ -105,11 +120,17 @@
   /// \param NewCol - The column to move to.
   formatted_raw_ostream &PadToColumn(unsigned NewCol);
 
-  /// getColumn - Return the column number
-  unsigned getColumn() { return Position.first; }
+  unsigned getColumn() {
+    // Calculate current position, taking buffer contents into account.
+    ComputePosition(getBufferStart(), GetNumBytesInBuffer());
+    return Position.first;
+  }
 
-  /// getLine - Return the line number
-  unsigned getLine() { return Position.second; }
+  unsigned getLine() {
+    // Calculate current position, taking buffer contents into account.
+    ComputePosition(getBufferStart(), GetNumBytesInBuffer());
+    return Position.second;
+  }
 
   raw_ostream &resetColor() override {
     TheStream->resetColor();
Index: clang/test/Analysis/checker-plugins.c
===================================================================
--- clang/test/Analysis/checker-plugins.c
+++ clang/test/Analysis/checker-plugins.c
@@ -116,4 +116,5 @@
 // RUN:   2>&1 | FileCheck %s -check-prefix=CHECK-CHECKER-OPTION-HELP
 
 // CHECK-CHECKER-OPTION-HELP: example.MyChecker:ExampleOption  (bool) This is an
-// CHECK-CHECKER-OPTION-HELP-SAME: example checker opt. (default: false)
+// CHECK-CHECKER-OPTION-HELP-SAME: example checker opt. (default:
+// CHECK-CHECKER-OPTION-HELP-NEXT: false)

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D76291: [Support] Fix formatted_raw_ostream for UTF-8

Reply via email to