[PATCH] D76291: [Support] Fix formatted_raw_ostream for UTF-8

Oliver Stannard (Linaro) via Phabricator via cfe-commits Tue, 17 Mar 2020 09:07:19 -0700

ostannard created this revision.
ostannard added reviewers: MaskRay, thakis, hoyFB, benlangmuir.
Herald added subscribers: cfe-commits, dexonsmith, hiraditya.
Herald added a project: clang.


- The getLine and getColumn functions need to update the position, or they will 
return stale data for buffered streams. This fixes a bug in the clang 
-analyzer-checker-option-help option, which was not wrapping the help text 
correctly when stdout is not a TTY.
- If the stream contains multi-byte UTF-8 sequences, then the whole sequence 
needs to be considered to be a single character. This has the edge case that 
the buffer might fill up and be flushed part way through a character.
- If the stream contains East Asian wide characters, these will be rendered 
twice as wide as other characters, so we need to increase the column count to 
match.

This doesn't attempt to handle everything unicode can do (combining characters, 
right-to-left markers, ...), but hopefully covers most things likely to be 
common in messages and source code we might want to print.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D76291

Files:
  clang/test/Analysis/checker-plugins.c
  llvm/include/llvm/Support/FormattedStream.h
  llvm/lib/Support/FormattedStream.cpp
  llvm/unittests/Support/formatted_raw_ostream_test.cpp

Index: llvm/unittests/Support/formatted_raw_ostream_test.cpp
===================================================================
--- llvm/unittests/Support/formatted_raw_ostream_test.cpp
+++ llvm/unittests/Support/formatted_raw_ostream_test.cpp
@@ -29,4 +29,144 @@
   }
 }
 
+TEST(formatted_raw_ostreamTest, Test_LineColumn) {
+  // Test tracking of line and column numbers in a stream.
+  SmallString<128> A;
+  raw_svector_ostream B(A);
+  formatted_raw_ostream C(B);
+
+  EXPECT_EQ(0U, C.getLine());
+  EXPECT_EQ(0U, C.getColumn());
+
+  C << "a";
+  EXPECT_EQ(0U, C.getLine());
+  EXPECT_EQ(1U, C.getColumn());
+
+  C << "bcdef";
+  EXPECT_EQ(0U, C.getLine());
+  EXPECT_EQ(6U, C.getColumn());
+
+  // '\n' increments line number, sets column to zero.
+  C << "\n";
+  EXPECT_EQ(1U, C.getLine());
+  EXPECT_EQ(0U, C.getColumn());
+
+  // '\r sets coulmn to zero without changing line number
+  C << "foo\r";
+  EXPECT_EQ(1U, C.getLine());
+  EXPECT_EQ(0U, C.getColumn());
+
+  // '\t' advances column to the next multiple of 8.
+  // FIXME: If the column number is already a multiple of 8 this will do
+  // nothing, is this behaviour correct?
+  C << "1\t";
+  EXPECT_EQ(8U, C.getColumn());
+  C << "\t";
+  EXPECT_EQ(8U, C.getColumn());
+  C << "1234567\t";
+  EXPECT_EQ(16U, C.getColumn());
+  EXPECT_EQ(1U, C.getLine());
+}
+
+TEST(formatted_raw_ostreamTest, Test_Flush) {
+  // Flushing the buffer causes the charcters in the buffer to be scanned
+  // before the buffer is emptied, so line and column numbers will still be
+  // tracked properly.
+  SmallString<128> A;
+  raw_svector_ostream B(A);
+  B.SetBufferSize(32);
+  formatted_raw_ostream C(B);
+
+  C << "\nabc";
+  EXPECT_EQ(4U, C.GetNumBytesInBuffer());
+  C.flush();
+  EXPECT_EQ(1U, C.getLine());
+  EXPECT_EQ(3U, C.getColumn());
+  EXPECT_EQ(0U, C.GetNumBytesInBuffer());
+}
+
+TEST(formatted_raw_ostreamTest, Test_UTF8) {
+  SmallString<128> A;
+  raw_svector_ostream B(A);
+  B.SetBufferSize(32);
+  formatted_raw_ostream C(B);
+
+  // U+00A0 Non-breaking space: encoded as two bytes, but only one column wide.
+  C << "\u00a0";
+  EXPECT_EQ(0U, C.getLine());
+  EXPECT_EQ(1U, C.getColumn());
+  EXPECT_EQ(2U, C.GetNumBytesInBuffer());
+
+  // U+2468 CIRCLED DIGIT NINE: encoded as three bytes, but only one column
+  // wide.
+  C << "\u2468";
+  EXPECT_EQ(0U, C.getLine());
+  EXPECT_EQ(2U, C.getColumn());
+  EXPECT_EQ(5U, C.GetNumBytesInBuffer());
+
+  // U+00010000 LINEAR B SYLLABLE B008 A: encoded as four bytes, but only one
+  // column wide.
+  C << "\U00010000";
+  EXPECT_EQ(0U, C.getLine());
+  EXPECT_EQ(3U, C.getColumn());
+  EXPECT_EQ(9U, C.GetNumBytesInBuffer());
+
+  // U+55B5, chinese character, encodes as three bytes, takes up two columns.
+  C << "\u55b5";
+  EXPECT_EQ(0U, C.getLine());
+  EXPECT_EQ(5U, C.getColumn());
+  EXPECT_EQ(12U, C.GetNumBytesInBuffer());
+
+  // U+200B, zero-width space, encoded as three bytes but has no effect on the
+  // column or line number.
+  C << "\u200b";
+  EXPECT_EQ(0U, C.getLine());
+  EXPECT_EQ(5U, C.getColumn());
+  EXPECT_EQ(15U, C.GetNumBytesInBuffer());
+}
+
+TEST(formatted_raw_ostreamTest, Test_UTF8Buffered) {
+  SmallString<128> A;
+  raw_svector_ostream B(A);
+  B.SetBufferSize(4);
+  formatted_raw_ostream C(B);
+
+  // This character encodes as three bytes, so will cause the buffer to be
+  // flushed after the first byte (4 byte buffer, 3 bytes already written). We
+  // need to save the first part of the UTF-8 encoding until after the buffer is
+  // cleared and the remaining two bytes are written, at which point we can
+  // check the display width. In this case the display width is 1, so we end at
+  // column 4, with 6 bytes written into total, 2 of which are in the buffer.
+  C << "123\u2468";
+  EXPECT_EQ(0U, C.getLine());
+  EXPECT_EQ(4U, C.getColumn());
+  EXPECT_EQ(2U, C.GetNumBytesInBuffer());
+  C.flush();
+  EXPECT_EQ(6U, A.size());
+
+  // Same as above, but with a chinese character which displays as two columns.
+  C << "123\u55b5";
+  EXPECT_EQ(0U, C.getLine());
+  EXPECT_EQ(9U, C.getColumn());
+  EXPECT_EQ(2U, C.GetNumBytesInBuffer());
+  C.flush();
+  EXPECT_EQ(12U, A.size());
+}
+
+TEST(formatted_raw_ostreamTest, Test_UTF8TinyBuffer) {
+  SmallString<128> A;
+  raw_svector_ostream B(A);
+  B.SetBufferSize(1);
+  formatted_raw_ostream C(B);
+
+  // The stream has a one-byte buffer, so it gets flushed multiple times while
+  // printing a single unicode character.
+  C << "\u2468";
+  EXPECT_EQ(0U, C.getLine());
+  EXPECT_EQ(1U, C.getColumn());
+  EXPECT_EQ(0U, C.GetNumBytesInBuffer());
+  C.flush();
+  EXPECT_EQ(3U, A.size());
+}
+
 }
Index: llvm/lib/Support/FormattedStream.cpp
===================================================================
--- llvm/lib/Support/FormattedStream.cpp
+++ llvm/lib/Support/FormattedStream.cpp
@@ -11,7 +11,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/Locale.h"
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
 
@@ -19,16 +21,24 @@
 
 /// UpdatePosition - Examine the given char sequence and figure out which
 /// column we end up in after output, and how many line breaks are contained.
-///
-static void UpdatePosition(std::pair<unsigned, unsigned> &Position, const char *Ptr, size_t Size) {
+/// This assumes that the input string is well-formed UTF-8, and takes into
+/// account unicode characters which render as multiple columns wide.
+void formatted_raw_ostream::UpdatePosition(const char *Ptr, size_t Size) {
   unsigned &Column = Position.first;
   unsigned &Line = Position.second;
 
-  // Keep track of the current column and line by scanning the string for
-  // special characters
-  for (const char *End = Ptr + Size; Ptr != End; ++Ptr) {
-    ++Column;
-    switch (*Ptr) {
+  auto ProcessCodePoint = [&Line, &Column](StringRef CP){
+    int Width = sys::locale::columnWidth(CP);
+    // columnWidth returns -1 for non-printing characters.
+    if (Width != -1)
+      Column += Width;
+
+    // If this is the final byte of a multi-byte sequence, it can't be any of
+    // the special whitespace characters below.
+    if (CP.size() > 1)
+      return;
+
+    switch (CP[0]) {
     case '\n':
       Line += 1;
       LLVM_FALLTHROUGH;
@@ -40,6 +50,45 @@
       Column += (8 - (Column & 0x7)) & 0x7;
       break;
     }
+  };
+
+  // If we have a partial UTF-8 sequence from the previous buffer, check that first.
+  if (PartialUTF8Char.size()) {
+    size_t NumBytes = getNumBytesForUTF8(PartialUTF8Char[0]);
+    if (NumBytes > (PartialUTF8Char.size() + Size)) {
+      // If we still don't have enough bytes for a complete code point, just
+      // append what we have.
+      PartialUTF8Char.append(StringRef(Ptr, Size));
+      return;
+    } else {
+      // The first few bytes from the buffer will complete the code-point.
+      // Concatenate them and process their effect on the line ane column
+      // numbers.
+      size_t BytesFromBuffer = NumBytes - PartialUTF8Char.size();
+      PartialUTF8Char.append(StringRef(Ptr, BytesFromBuffer));
+      ProcessCodePoint(PartialUTF8Char);
+      PartialUTF8Char.clear();
+      Ptr += BytesFromBuffer;
+      Size -= BytesFromBuffer;
+    }
+  }
+
+  // Now scan the rest of the buffer.
+  unsigned NumBytes;
+  for (const char *End = Ptr + Size; Ptr < End; Ptr += NumBytes) {
+    NumBytes = getNumBytesForUTF8(*Ptr);
+
+    // The buffer might end part way through a UTF-8 code point if it got
+    // flushed. If this happens we can't know the display width until we see the
+    // rest of the code point. Stash the bytes we do have, so that we can
+    // reconstruct the whole code point later, even if the buffer is bsing
+    // flushed.
+    if ((Ptr + NumBytes) > End) {
+      PartialUTF8Char = StringRef(Ptr, End - Ptr);
+      return;
+    }
+
+    ProcessCodePoint(StringRef(Ptr, NumBytes));
   }
 }
 
@@ -52,9 +101,9 @@
   if (Ptr <= Scanned && Scanned <= Ptr + Size)
     // Scan all characters added since our last scan to determine the new
     // column.
-    UpdatePosition(Position, Scanned, Size - (Scanned - Ptr));
+    UpdatePosition(Scanned, Size - (Scanned - Ptr));
   else
-    UpdatePosition(Position, Ptr, Size);
+    UpdatePosition(Ptr, Size);
 
   // Update the scanning pointer.
   Scanned = Ptr + Size;
Index: llvm/include/llvm/Support/FormattedStream.h
===================================================================
--- llvm/include/llvm/Support/FormattedStream.h
+++ llvm/include/llvm/Support/FormattedStream.h
@@ -14,6 +14,7 @@
 #ifndef LLVM_SUPPORT_FORMATTEDSTREAM_H
 #define LLVM_SUPPORT_FORMATTEDSTREAM_H
 
+#include "llvm/ADT/SmallString.h"
 #include "llvm/Support/raw_ostream.h"
 #include <utility>
 
@@ -40,6 +41,13 @@
   ///
   const char *Scanned;
 
+  /// PartialUTF8Char - Either empty or a prefix of a UTF-8 character which
+  /// should be prepended to the buffer for the next call to ComputePosition.
+  /// This is needed when the buffer is flushed when it ends part-way through a
+  /// UTF-8 character, so that we can compute the display width of the character
+  /// once we have the rest of it.
+  SmallString<4> PartialUTF8Char;
+
   void write_impl(const char *Ptr, size_t Size) override;
 
   /// current_pos - Return the current position within the stream,
@@ -52,10 +60,16 @@
   }
 
   /// ComputePosition - Examine the given output buffer and figure out the new
-  /// position after output.
-  ///
+  /// position after output. This is safe to call multiple times on the same
+  /// buffer, as it records the most recently scanned character and resumes from
+  /// there when the buffer has not been flushed.
   void ComputePosition(const char *Ptr, size_t size);
 
+  /// UpdatePosition - scan the characters in [Ptr, Ptr+Size), and update the
+  /// line and column numbers. Unlike ComputePosition, this must be called
+  /// exactly once on each region of the buffer.
+  void UpdatePosition(const char *Ptr, size_t Size);
+
   void setStream(raw_ostream &Stream) {
     releaseStream();
 
@@ -105,11 +119,17 @@
   /// \param NewCol - The column to move to.
   formatted_raw_ostream &PadToColumn(unsigned NewCol);
 
-  /// getColumn - Return the column number
-  unsigned getColumn() { return Position.first; }
+  unsigned getColumn() {
+    // Calculate current position, taking buffer contents into account.
+    ComputePosition(getBufferStart(), GetNumBytesInBuffer());
+    return Position.first;
+  }
 
-  /// getLine - Return the line number
-  unsigned getLine() { return Position.second; }
+  unsigned getLine() {
+    // Calculate current position, taking buffer contents into account.
+    ComputePosition(getBufferStart(), GetNumBytesInBuffer());
+    return Position.second;
+  }
 
   raw_ostream &resetColor() override {
     TheStream->resetColor();
Index: clang/test/Analysis/checker-plugins.c
===================================================================
--- clang/test/Analysis/checker-plugins.c
+++ clang/test/Analysis/checker-plugins.c
@@ -116,4 +116,5 @@
 // RUN:   2>&1 | FileCheck %s -check-prefix=CHECK-CHECKER-OPTION-HELP
 
 // CHECK-CHECKER-OPTION-HELP: example.MyChecker:ExampleOption  (bool) This is an
-// CHECK-CHECKER-OPTION-HELP-SAME: example checker opt. (default: false)
+// CHECK-CHECKER-OPTION-HELP-SAME: example checker opt. (default:
+// CHECK-CHECKER-OPTION-HELP-NEXT: false)

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D76291: [Support] Fix formatted_raw_ostream for UTF-8

Reply via email to