[PATCH] D150843: [clang][Diagnostics] Refactor printableTextForNextCharacter

2023-06-10 Thread Timm Bäder via Phabricator via cfe-commits
This revision was automatically updated to reflect the committed changes.
Closed by commit rG6669dc09a441: [clang][NFC] Refactor 
printableTextForNextCharacter (authored by tbaeder).

Changed prior to commit:
  https://reviews.llvm.org/D150843?vs=529170=530160#toc

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D150843/new/

https://reviews.llvm.org/D150843

Files:
  clang/lib/Frontend/TextDiagnostic.cpp

Index: clang/lib/Frontend/TextDiagnostic.cpp
===
--- clang/lib/Frontend/TextDiagnostic.cpp
+++ clang/lib/Frontend/TextDiagnostic.cpp
@@ -91,73 +91,78 @@
 ///printableTextForNextCharacter.
 ///
 /// \param SourceLine The line of source
-/// \param i Pointer to byte index,
+/// \param I Pointer to byte index,
 /// \param TabStop used to expand tabs
 /// \return pair(printable text, 'true' iff original text was printable)
 ///
 static std::pair, bool>
-printableTextForNextCharacter(StringRef SourceLine, size_t *i,
+printableTextForNextCharacter(StringRef SourceLine, size_t *I,
   unsigned TabStop) {
-  assert(i && "i must not be null");
-  assert(*i expandedTab;
-expandedTab.assign(NumSpaces, ' ');
-return std::make_pair(expandedTab, true);
+SmallString<16> ExpandedTab;
+ExpandedTab.assign(NumSpaces, ' ');
+return std::make_pair(ExpandedTab, true);
   }
 
-  unsigned char const *begin, *end;
-  begin = reinterpret_cast(&*(SourceLine.begin() + *i));
-  end = begin + (SourceLine.size() - *i);
-
-  if (llvm::isLegalUTF8Sequence(begin, end)) {
-llvm::UTF32 c;
-llvm::UTF32 *cptr = 
-unsigned char const *original_begin = begin;
-unsigned char const *cp_end =
-begin + llvm::getNumBytesForUTF8(SourceLine[*i]);
-
-llvm::ConversionResult res = llvm::ConvertUTF8toUTF32(
-, cp_end, , cptr + 1, llvm::strictConversion);
-(void)res;
-assert(llvm::conversionOK == res);
-assert(0 < begin-original_begin
-   && "we must be further along in the string now");
-*i += begin-original_begin;
-
-if (!llvm::sys::locale::isPrint(c)) {
-  // If next character is valid UTF-8, but not printable
-  SmallString<16> expandedCP("");
-  while (c) {
-expandedCP.insert(expandedCP.begin()+3, llvm::hexdigit(c%16));
-c/=16;
-  }
-  while (expandedCP.size() < 8)
-expandedCP.insert(expandedCP.begin()+3, llvm::hexdigit(0));
-  return std::make_pair(expandedCP, false);
-}
-
-// If next character is valid UTF-8, and printable
-return std::make_pair(SmallString<16>(original_begin, cp_end), true);
+  const unsigned char *Begin = SourceLine.bytes_begin() + *I;
 
+  // Fast path for the common ASCII case.
+  if (*Begin < 0x80 && llvm::sys::locale::isPrint(*Begin)) {
+++(*I);
+return std::make_pair(SmallString<16>(Begin, Begin + 1), true);
+  }
+  unsigned CharSize = llvm::getNumBytesForUTF8(*Begin);
+  const unsigned char *End = Begin + CharSize;
+
+  // Convert it to UTF32 and check if it's printable.
+  if (End <= SourceLine.bytes_end() && llvm::isLegalUTF8Sequence(Begin, End)) {
+llvm::UTF32 C;
+llvm::UTF32 *CPtr = 
+
+// Begin and end before conversion.
+unsigned char const *OriginalBegin = Begin;
+llvm::ConversionResult Res = llvm::ConvertUTF8toUTF32(
+, End, , CPtr + 1, llvm::strictConversion);
+(void)Res;
+assert(Res == llvm::conversionOK);
+assert(OriginalBegin < Begin);
+assert((Begin - OriginalBegin) == CharSize);
+
+(*I) += (Begin - OriginalBegin);
+
+// Valid, multi-byte, printable UTF8 character.
+if (llvm::sys::locale::isPrint(C))
+  return std::make_pair(SmallString<16>(OriginalBegin, End), true);
+
+// Valid but not printable.
+SmallString<16> Str("");
+while (C) {
+  Str.insert(Str.begin() + 3, llvm::hexdigit(C % 16));
+  C /= 16;
+}
+while (Str.size() < 8)
+  Str.insert(Str.begin() + 3, llvm::hexdigit(0));
+return std::make_pair(Str, false);
   }
 
-  // If next byte is not valid UTF-8 (and therefore not printable)
-  SmallString<16> expandedByte("");
-  unsigned char byte = SourceLine[*i];
-  expandedByte[1] = llvm::hexdigit(byte / 16);
-  expandedByte[2] = llvm::hexdigit(byte % 16);
-  ++(*i);
-  return std::make_pair(expandedByte, false);
+  // Otherwise, not printable since it's not valid UTF8.
+  SmallString<16> ExpandedByte("");
+  unsigned char Byte = SourceLine[*I];
+  ExpandedByte[1] = llvm::hexdigit(Byte / 16);
+  ExpandedByte[2] = llvm::hexdigit(Byte % 16);
+  ++(*I);
+  return std::make_pair(ExpandedByte, false);
 }
 
 static void expandTabs(std::string , unsigned TabStop) {
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D150843: [clang][Diagnostics] Refactor printableTextForNextCharacter

2023-06-09 Thread Tom Honermann via Phabricator via cfe-commits
tahonermann accepted this revision.
tahonermann added a comment.
This revision is now accepted and ready to land.

> They are NFC, it's just not 100% only a refactoring, since it adds the new 
> ASCII-only case.

Ok, thanks. Changes look good. I noticed one comment that appears to be 
incorrect; please just remove it when committing the changes.




Comment at: clang/lib/Frontend/TextDiagnostic.cpp:128
+
+  // We now know that the next character is a multi-byte character.
+  // Convert it to UTF32 and check if it's printable.

This comment is not correct; `Begin` might still point to a non-printable 
character with a code point value less than 0x80. I suggest just removing the 
comment.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D150843/new/

https://reviews.llvm.org/D150843

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D150843: [clang][Diagnostics] Refactor printableTextForNextCharacter

2023-06-07 Thread Timm Bäder via Phabricator via cfe-commits
tbaeder updated this revision to Diff 529170.
tbaeder marked 2 inline comments as done.

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D150843/new/

https://reviews.llvm.org/D150843

Files:
  clang/lib/Frontend/TextDiagnostic.cpp

Index: clang/lib/Frontend/TextDiagnostic.cpp
===
--- clang/lib/Frontend/TextDiagnostic.cpp
+++ clang/lib/Frontend/TextDiagnostic.cpp
@@ -91,73 +91,79 @@
 ///printableTextForNextCharacter.
 ///
 /// \param SourceLine The line of source
-/// \param i Pointer to byte index,
+/// \param I Pointer to byte index,
 /// \param TabStop used to expand tabs
 /// \return pair(printable text, 'true' iff original text was printable)
 ///
 static std::pair, bool>
-printableTextForNextCharacter(StringRef SourceLine, size_t *i,
+printableTextForNextCharacter(StringRef SourceLine, size_t *I,
   unsigned TabStop) {
-  assert(i && "i must not be null");
-  assert(*i expandedTab;
-expandedTab.assign(NumSpaces, ' ');
-return std::make_pair(expandedTab, true);
+SmallString<16> ExpandedTab;
+ExpandedTab.assign(NumSpaces, ' ');
+return std::make_pair(ExpandedTab, true);
   }
 
-  unsigned char const *begin, *end;
-  begin = reinterpret_cast(&*(SourceLine.begin() + *i));
-  end = begin + (SourceLine.size() - *i);
-
-  if (llvm::isLegalUTF8Sequence(begin, end)) {
-llvm::UTF32 c;
-llvm::UTF32 *cptr = 
-unsigned char const *original_begin = begin;
-unsigned char const *cp_end =
-begin + llvm::getNumBytesForUTF8(SourceLine[*i]);
-
-llvm::ConversionResult res = llvm::ConvertUTF8toUTF32(
-, cp_end, , cptr + 1, llvm::strictConversion);
-(void)res;
-assert(llvm::conversionOK == res);
-assert(0 < begin-original_begin
-   && "we must be further along in the string now");
-*i += begin-original_begin;
-
-if (!llvm::sys::locale::isPrint(c)) {
-  // If next character is valid UTF-8, but not printable
-  SmallString<16> expandedCP("");
-  while (c) {
-expandedCP.insert(expandedCP.begin()+3, llvm::hexdigit(c%16));
-c/=16;
-  }
-  while (expandedCP.size() < 8)
-expandedCP.insert(expandedCP.begin()+3, llvm::hexdigit(0));
-  return std::make_pair(expandedCP, false);
-}
-
-// If next character is valid UTF-8, and printable
-return std::make_pair(SmallString<16>(original_begin, cp_end), true);
+  const unsigned char *Begin = SourceLine.bytes_begin() + *I;
 
+  // Fast path for the common ASCII case.
+  if (*Begin < 0x80 && llvm::sys::locale::isPrint(*Begin)) {
+++(*I);
+return std::make_pair(SmallString<16>(Begin, Begin + 1), true);
+  }
+  unsigned CharSize = llvm::getNumBytesForUTF8(*Begin);
+  const unsigned char *End = Begin + CharSize;
+
+  // We now know that the next character is a multi-byte character.
+  // Convert it to UTF32 and check if it's printable.
+  if (End <= SourceLine.bytes_end() && llvm::isLegalUTF8Sequence(Begin, End)) {
+llvm::UTF32 C;
+llvm::UTF32 *CPtr = 
+
+// Begin and end before conversion.
+unsigned char const *OriginalBegin = Begin;
+llvm::ConversionResult Res = llvm::ConvertUTF8toUTF32(
+, End, , CPtr + 1, llvm::strictConversion);
+(void)Res;
+assert(Res == llvm::conversionOK);
+assert(OriginalBegin < Begin);
+assert((Begin - OriginalBegin) == CharSize);
+
+(*I) += (Begin - OriginalBegin);
+
+// Valid, multi-byte, printable UTF8 character.
+if (llvm::sys::locale::isPrint(C))
+  return std::make_pair(SmallString<16>(OriginalBegin, End), true);
+
+// Valid but not printable.
+SmallString<16> Str("");
+while (C) {
+  Str.insert(Str.begin() + 3, llvm::hexdigit(C % 16));
+  C /= 16;
+}
+while (Str.size() < 8)
+  Str.insert(Str.begin() + 3, llvm::hexdigit(0));
+return std::make_pair(Str, false);
   }
 
-  // If next byte is not valid UTF-8 (and therefore not printable)
-  SmallString<16> expandedByte("");
-  unsigned char byte = SourceLine[*i];
-  expandedByte[1] = llvm::hexdigit(byte / 16);
-  expandedByte[2] = llvm::hexdigit(byte % 16);
-  ++(*i);
-  return std::make_pair(expandedByte, false);
+  // Otherwise, not printable since it's not valid UTF8.
+  SmallString<16> ExpandedByte("");
+  unsigned char Byte = SourceLine[*I];
+  ExpandedByte[1] = llvm::hexdigit(Byte / 16);
+  ExpandedByte[2] = llvm::hexdigit(Byte % 16);
+  ++(*I);
+  return std::make_pair(ExpandedByte, false);
 }
 
 static void expandTabs(std::string , unsigned TabStop) {
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D150843: [clang][Diagnostics] Refactor printableTextForNextCharacter

2023-06-07 Thread Timm Bäder via Phabricator via cfe-commits
tbaeder added a comment.

In D150843#4401189 , @tahonermann 
wrote:

> The summary states that the changes are not quite NFC. In that case, we would 
> ideally have a test that demonstrates the changed behavior. Would adding such 
> a test be challenging?

They are NFC, it's just not 100% only a refactoring, since it adds the new 
ASCII-only case.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D150843/new/

https://reviews.llvm.org/D150843

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D150843: [clang][Diagnostics] Refactor printableTextForNextCharacter

2023-06-06 Thread Tom Honermann via Phabricator via cfe-commits
tahonermann added a comment.

The changes look good to me. I suggested two minor edits to align with 
parameter name changes.

The summary states that the changes are not quite NFC. In that case, we would 
ideally have a test that demonstrates the changed behavior. Would adding such a 
test be challenging?




Comment at: clang/lib/Frontend/TextDiagnostic.cpp:94
 /// \param SourceLine The line of source
 /// \param i Pointer to byte index,
 /// \param TabStop used to expand tabs





Comment at: clang/lib/Frontend/TextDiagnostic.cpp:101
   unsigned TabStop) {
-  assert(i && "i must not be null");
-  assert(*ihttps://reviews.llvm.org/D150843/new/

https://reviews.llvm.org/D150843

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D150843: [clang][Diagnostics] Refactor printableTextForNextCharacter

2023-06-05 Thread Timm Bäder via Phabricator via cfe-commits
tbaeder added a comment.

Ping @tahonermann


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D150843/new/

https://reviews.llvm.org/D150843

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D150843: [clang][Diagnostics] Refactor printableTextForNextCharacter

2023-05-30 Thread Timm Bäder via Phabricator via cfe-commits
tbaeder added a comment.

Ping


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D150843/new/

https://reviews.llvm.org/D150843

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D150843: [clang][Diagnostics] Refactor printableTextForNextCharacter

2023-05-24 Thread Corentin Jabot via Phabricator via cfe-commits
cor3ntin added a comment.

Give some time for @tahonermann to have an opportunity to review again, but 
otherwise this looks good to me. Thanks!




Comment at: clang/lib/Frontend/TextDiagnostic.cpp:124-128
+  if (CharSize == 1 && llvm::isLegalUTF8Sequence(Begin, End) &&
+  llvm::sys::locale::isPrint(*Begin)) {
+++(*I);
+return std::make_pair(SmallString<16>(Begin, End), true);
+  }

tbaeder wrote:
> tbaeder wrote:
> > cor3ntin wrote:
> > > tbaeder wrote:
> > > > cor3ntin wrote:
> > > > > this could be simplified : the common case for ascii could be just 
> > > > > looking at `isPrint(*Begin);` (which implies  CharSize == 1 and  
> > > > > llvm::isLegalUTF8Sequence(Begin, End))
> > > > > So you could do it before computing CharSize
> > > > This is not true in my testing fwiw.
> > > ```
> > > const unsigned char *Begin = SourceLine.bytes_begin() + *I;
> > > 
> > >   // Fast path for the common ASCII case.
> > >   if (*Begin < 0x80 && llvm::sys::locale::isPrint(*Begin)) {
> > > ++(*I);
> > > return std::make_pair(SmallString<16>(Begin, Begin + 1), true);
> > >   }
> > > ```
> > > seems to work fine locally. Note that I'm not sure `*Begin` is always 
> > > valid - it seems to be, but we might want an assert to check that 
> > > SourceLine is not empty. 
> > This function is only ever called in a `while (I < SourceLine.size())` 
> > loop. I've thought about refactoring this into a helper struct that keeps 
> > the index separate from the calling function to simplify callers.
> Oh also, there are two asserts at the beginning of the function to ensure `I` 
> is valid.
you are right, if, `SourceLine` is empty, that first asset would always fire


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D150843/new/

https://reviews.llvm.org/D150843

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D150843: [clang][Diagnostics] Refactor printableTextForNextCharacter

2023-05-24 Thread Timm Bäder via Phabricator via cfe-commits
tbaeder updated this revision to Diff 525094.

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D150843/new/

https://reviews.llvm.org/D150843

Files:
  clang/lib/Frontend/TextDiagnostic.cpp

Index: clang/lib/Frontend/TextDiagnostic.cpp
===
--- clang/lib/Frontend/TextDiagnostic.cpp
+++ clang/lib/Frontend/TextDiagnostic.cpp
@@ -96,68 +96,74 @@
 /// \return pair(printable text, 'true' iff original text was printable)
 ///
 static std::pair, bool>
-printableTextForNextCharacter(StringRef SourceLine, size_t *i,
+printableTextForNextCharacter(StringRef SourceLine, size_t *I,
   unsigned TabStop) {
-  assert(i && "i must not be null");
-  assert(*i expandedTab;
-expandedTab.assign(NumSpaces, ' ');
-return std::make_pair(expandedTab, true);
+SmallString<16> ExpandedTab;
+ExpandedTab.assign(NumSpaces, ' ');
+return std::make_pair(ExpandedTab, true);
   }
 
-  unsigned char const *begin, *end;
-  begin = reinterpret_cast(&*(SourceLine.begin() + *i));
-  end = begin + (SourceLine.size() - *i);
-
-  if (llvm::isLegalUTF8Sequence(begin, end)) {
-llvm::UTF32 c;
-llvm::UTF32 *cptr = 
-unsigned char const *original_begin = begin;
-unsigned char const *cp_end =
-begin + llvm::getNumBytesForUTF8(SourceLine[*i]);
-
-llvm::ConversionResult res = llvm::ConvertUTF8toUTF32(
-, cp_end, , cptr + 1, llvm::strictConversion);
-(void)res;
-assert(llvm::conversionOK == res);
-assert(0 < begin-original_begin
-   && "we must be further along in the string now");
-*i += begin-original_begin;
-
-if (!llvm::sys::locale::isPrint(c)) {
-  // If next character is valid UTF-8, but not printable
-  SmallString<16> expandedCP("");
-  while (c) {
-expandedCP.insert(expandedCP.begin()+3, llvm::hexdigit(c%16));
-c/=16;
-  }
-  while (expandedCP.size() < 8)
-expandedCP.insert(expandedCP.begin()+3, llvm::hexdigit(0));
-  return std::make_pair(expandedCP, false);
-}
-
-// If next character is valid UTF-8, and printable
-return std::make_pair(SmallString<16>(original_begin, cp_end), true);
+  const unsigned char *Begin = SourceLine.bytes_begin() + *I;
 
+  // Fast path for the common ASCII case.
+  if (*Begin < 0x80 && llvm::sys::locale::isPrint(*Begin)) {
+++(*I);
+return std::make_pair(SmallString<16>(Begin, Begin + 1), true);
+  }
+  unsigned CharSize = llvm::getNumBytesForUTF8(*Begin);
+  const unsigned char *End = Begin + CharSize;
+
+  // We now know that the next character is a multi-byte character.
+  // Convert it to UTF32 and check if it's printable.
+  if (End <= SourceLine.bytes_end() && llvm::isLegalUTF8Sequence(Begin, End)) {
+llvm::UTF32 C;
+llvm::UTF32 *CPtr = 
+
+// Begin and end before conversion.
+unsigned char const *OriginalBegin = Begin;
+llvm::ConversionResult Res = llvm::ConvertUTF8toUTF32(
+, End, , CPtr + 1, llvm::strictConversion);
+(void)Res;
+assert(Res == llvm::conversionOK);
+assert(OriginalBegin < Begin);
+assert((Begin - OriginalBegin) == CharSize);
+
+(*I) += (Begin - OriginalBegin);
+
+// Valid, multi-byte, printable UTF8 character.
+if (llvm::sys::locale::isPrint(C))
+  return std::make_pair(SmallString<16>(OriginalBegin, End), true);
+
+// Valid but not printable.
+SmallString<16> Str("");
+while (C) {
+  Str.insert(Str.begin() + 3, llvm::hexdigit(C % 16));
+  C /= 16;
+}
+while (Str.size() < 8)
+  Str.insert(Str.begin() + 3, llvm::hexdigit(0));
+return std::make_pair(Str, false);
   }
 
-  // If next byte is not valid UTF-8 (and therefore not printable)
-  SmallString<16> expandedByte("");
-  unsigned char byte = SourceLine[*i];
-  expandedByte[1] = llvm::hexdigit(byte / 16);
-  expandedByte[2] = llvm::hexdigit(byte % 16);
-  ++(*i);
-  return std::make_pair(expandedByte, false);
+  // Otherwise, not printable since it's not valid UTF8.
+  SmallString<16> ExpandedByte("");
+  unsigned char Byte = SourceLine[*I];
+  ExpandedByte[1] = llvm::hexdigit(Byte / 16);
+  ExpandedByte[2] = llvm::hexdigit(Byte % 16);
+  ++(*I);
+  return std::make_pair(ExpandedByte, false);
 }
 
 static void expandTabs(std::string , unsigned TabStop) {
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D150843: [clang][Diagnostics] Refactor printableTextForNextCharacter

2023-05-24 Thread Timm Bäder via Phabricator via cfe-commits
tbaeder added inline comments.



Comment at: clang/lib/Frontend/TextDiagnostic.cpp:124-128
+  if (CharSize == 1 && llvm::isLegalUTF8Sequence(Begin, End) &&
+  llvm::sys::locale::isPrint(*Begin)) {
+++(*I);
+return std::make_pair(SmallString<16>(Begin, End), true);
+  }

tbaeder wrote:
> cor3ntin wrote:
> > tbaeder wrote:
> > > cor3ntin wrote:
> > > > this could be simplified : the common case for ascii could be just 
> > > > looking at `isPrint(*Begin);` (which implies  CharSize == 1 and  
> > > > llvm::isLegalUTF8Sequence(Begin, End))
> > > > So you could do it before computing CharSize
> > > This is not true in my testing fwiw.
> > ```
> > const unsigned char *Begin = SourceLine.bytes_begin() + *I;
> > 
> >   // Fast path for the common ASCII case.
> >   if (*Begin < 0x80 && llvm::sys::locale::isPrint(*Begin)) {
> > ++(*I);
> > return std::make_pair(SmallString<16>(Begin, Begin + 1), true);
> >   }
> > ```
> > seems to work fine locally. Note that I'm not sure `*Begin` is always valid 
> > - it seems to be, but we might want an assert to check that SourceLine is 
> > not empty. 
> This function is only ever called in a `while (I < SourceLine.size())` loop. 
> I've thought about refactoring this into a helper struct that keeps the index 
> separate from the calling function to simplify callers.
Oh also, there are two asserts at the beginning of the function to ensure `I` 
is valid.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D150843/new/

https://reviews.llvm.org/D150843

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D150843: [clang][Diagnostics] Refactor printableTextForNextCharacter

2023-05-24 Thread Timm Bäder via Phabricator via cfe-commits
tbaeder added inline comments.



Comment at: clang/lib/Frontend/TextDiagnostic.cpp:124-128
+  if (CharSize == 1 && llvm::isLegalUTF8Sequence(Begin, End) &&
+  llvm::sys::locale::isPrint(*Begin)) {
+++(*I);
+return std::make_pair(SmallString<16>(Begin, End), true);
+  }

cor3ntin wrote:
> tbaeder wrote:
> > cor3ntin wrote:
> > > this could be simplified : the common case for ascii could be just 
> > > looking at `isPrint(*Begin);` (which implies  CharSize == 1 and  
> > > llvm::isLegalUTF8Sequence(Begin, End))
> > > So you could do it before computing CharSize
> > This is not true in my testing fwiw.
> ```
> const unsigned char *Begin = SourceLine.bytes_begin() + *I;
> 
>   // Fast path for the common ASCII case.
>   if (*Begin < 0x80 && llvm::sys::locale::isPrint(*Begin)) {
> ++(*I);
> return std::make_pair(SmallString<16>(Begin, Begin + 1), true);
>   }
> ```
> seems to work fine locally. Note that I'm not sure `*Begin` is always valid - 
> it seems to be, but we might want an assert to check that SourceLine is not 
> empty. 
This function is only ever called in a `while (I < SourceLine.size())` loop. 
I've thought about refactoring this into a helper struct that keeps the index 
separate from the calling function to simplify callers.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D150843/new/

https://reviews.llvm.org/D150843

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D150843: [clang][Diagnostics] Refactor printableTextForNextCharacter

2023-05-24 Thread Corentin Jabot via Phabricator via cfe-commits
cor3ntin added inline comments.



Comment at: clang/lib/Frontend/TextDiagnostic.cpp:124-128
+  if (CharSize == 1 && llvm::isLegalUTF8Sequence(Begin, End) &&
+  llvm::sys::locale::isPrint(*Begin)) {
+++(*I);
+return std::make_pair(SmallString<16>(Begin, End), true);
+  }

tbaeder wrote:
> cor3ntin wrote:
> > this could be simplified : the common case for ascii could be just looking 
> > at `isPrint(*Begin);` (which implies  CharSize == 1 and  
> > llvm::isLegalUTF8Sequence(Begin, End))
> > So you could do it before computing CharSize
> This is not true in my testing fwiw.
```
const unsigned char *Begin = SourceLine.bytes_begin() + *I;

  // Fast path for the common ASCII case.
  if (*Begin < 0x80 && llvm::sys::locale::isPrint(*Begin)) {
++(*I);
return std::make_pair(SmallString<16>(Begin, Begin + 1), true);
  }
```
seems to work fine locally. Note that I'm not sure `*Begin` is always valid - 
it seems to be, but we might want an assert to check that SourceLine is not 
empty. 


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D150843/new/

https://reviews.llvm.org/D150843

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D150843: [clang][Diagnostics] Refactor printableTextForNextCharacter

2023-05-23 Thread Timm Bäder via Phabricator via cfe-commits
tbaeder added inline comments.



Comment at: clang/lib/Frontend/TextDiagnostic.cpp:124-128
+  if (CharSize == 1 && llvm::isLegalUTF8Sequence(Begin, End) &&
+  llvm::sys::locale::isPrint(*Begin)) {
+++(*I);
+return std::make_pair(SmallString<16>(Begin, End), true);
+  }

cor3ntin wrote:
> this could be simplified : the common case for ascii could be just looking at 
> `isPrint(*Begin);` (which implies  CharSize == 1 and  
> llvm::isLegalUTF8Sequence(Begin, End))
> So you could do it before computing CharSize
This is not true in my testing fwiw.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D150843/new/

https://reviews.llvm.org/D150843

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D150843: [clang][Diagnostics] Refactor printableTextForNextCharacter

2023-05-23 Thread Timm Bäder via Phabricator via cfe-commits
tbaeder updated this revision to Diff 525010.
tbaeder marked 2 inline comments as done.

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D150843/new/

https://reviews.llvm.org/D150843

Files:
  clang/lib/Frontend/TextDiagnostic.cpp

Index: clang/lib/Frontend/TextDiagnostic.cpp
===
--- clang/lib/Frontend/TextDiagnostic.cpp
+++ clang/lib/Frontend/TextDiagnostic.cpp
@@ -96,68 +96,75 @@
 /// \return pair(printable text, 'true' iff original text was printable)
 ///
 static std::pair, bool>
-printableTextForNextCharacter(StringRef SourceLine, size_t *i,
+printableTextForNextCharacter(StringRef SourceLine, size_t *I,
   unsigned TabStop) {
-  assert(i && "i must not be null");
-  assert(*i expandedTab;
-expandedTab.assign(NumSpaces, ' ');
-return std::make_pair(expandedTab, true);
+SmallString<16> ExpandedTab;
+ExpandedTab.assign(NumSpaces, ' ');
+return std::make_pair(ExpandedTab, true);
   }
 
-  unsigned char const *begin, *end;
-  begin = reinterpret_cast(&*(SourceLine.begin() + *i));
-  end = begin + (SourceLine.size() - *i);
-
-  if (llvm::isLegalUTF8Sequence(begin, end)) {
-llvm::UTF32 c;
-llvm::UTF32 *cptr = 
-unsigned char const *original_begin = begin;
-unsigned char const *cp_end =
-begin + llvm::getNumBytesForUTF8(SourceLine[*i]);
-
-llvm::ConversionResult res = llvm::ConvertUTF8toUTF32(
-, cp_end, , cptr + 1, llvm::strictConversion);
-(void)res;
-assert(llvm::conversionOK == res);
-assert(0 < begin-original_begin
-   && "we must be further along in the string now");
-*i += begin-original_begin;
-
-if (!llvm::sys::locale::isPrint(c)) {
-  // If next character is valid UTF-8, but not printable
-  SmallString<16> expandedCP("");
-  while (c) {
-expandedCP.insert(expandedCP.begin()+3, llvm::hexdigit(c%16));
-c/=16;
-  }
-  while (expandedCP.size() < 8)
-expandedCP.insert(expandedCP.begin()+3, llvm::hexdigit(0));
-  return std::make_pair(expandedCP, false);
-}
-
-// If next character is valid UTF-8, and printable
-return std::make_pair(SmallString<16>(original_begin, cp_end), true);
+  const unsigned char *Begin = SourceLine.bytes_begin() + *I;
+  unsigned CharSize = llvm::getNumBytesForUTF8(*Begin);
 
+  // Fast path for the common ASCII case.
+  if (CharSize == 1 && llvm::isLegalUTF8Sequence(Begin, Begin + CharSize) &&
+  llvm::sys::locale::isPrint(*Begin)) {
+++(*I);
+return std::make_pair(SmallString<16>(Begin, Begin + CharSize), true);
+  }
+  const unsigned char *End = Begin + CharSize;
+
+  // We now know that the next character is a multi-byte character.
+  // Convert it to UTF32 and check if it's printable.
+  if (End < SourceLine.bytes_end() && llvm::isLegalUTF8Sequence(Begin, End)) {
+llvm::UTF32 C;
+llvm::UTF32 *CPtr = 
+
+// Begin and end before conversion.
+unsigned char const *OriginalBegin = Begin;
+llvm::ConversionResult Res = llvm::ConvertUTF8toUTF32(
+, End, , CPtr + 1, llvm::strictConversion);
+(void)Res;
+assert(Res == llvm::conversionOK);
+assert(OriginalBegin < Begin);
+assert((Begin - OriginalBegin) == CharSize);
+
+(*I) += (Begin - OriginalBegin);
+
+// Valid, multi-byte, printable UTF8 character.
+if (llvm::sys::locale::isPrint(C))
+  return std::make_pair(SmallString<16>(OriginalBegin, End), true);
+
+// Valid but not printable.
+SmallString<16> Str("");
+while (C) {
+  Str.insert(Str.begin() + 3, llvm::hexdigit(C % 16));
+  C /= 16;
+}
+while (Str.size() < 8)
+  Str.insert(Str.begin() + 3, llvm::hexdigit(0));
+return std::make_pair(Str, false);
   }
 
-  // If next byte is not valid UTF-8 (and therefore not printable)
-  SmallString<16> expandedByte("");
-  unsigned char byte = SourceLine[*i];
-  expandedByte[1] = llvm::hexdigit(byte / 16);
-  expandedByte[2] = llvm::hexdigit(byte % 16);
-  ++(*i);
-  return std::make_pair(expandedByte, false);
+  // Otherwise, not printable since it's not valid UTF8.
+  SmallString<16> ExpandedByte("");
+  unsigned char Byte = SourceLine[*I];
+  ExpandedByte[1] = llvm::hexdigit(Byte / 16);
+  ExpandedByte[2] = llvm::hexdigit(Byte % 16);
+  ++(*I);
+  return std::make_pair(ExpandedByte, false);
 }
 
 static void expandTabs(std::string , unsigned TabStop) {
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D150843: [clang][Diagnostics] Refactor printableTextForNextCharacter

2023-05-19 Thread Timm Bäder via Phabricator via cfe-commits
tbaeder added inline comments.



Comment at: clang/lib/Frontend/TextDiagnostic.cpp:121
+  unsigned CharSize = llvm::getNumBytesForUTF8(*Begin);
+  const unsigned char *End = Begin + CharSize;
 

tahonermann wrote:
> This could assign `End` to a location past the end of the source line when 
> the source line ends with an ill-formed code unit sequence (e.g., a truncated 
> 4-byte sequence). Constructing such a pointer is likely to get one in trouble.
If we use `bytes_end()` (unconditionally) we will check the entire source line 
every time we call this function (which always happens in a loop), which is 
what I was trying to avoid.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D150843/new/

https://reviews.llvm.org/D150843

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D150843: [clang][Diagnostics] Refactor printableTextForNextCharacter

2023-05-18 Thread Tom Honermann via Phabricator via cfe-commits
tahonermann requested changes to this revision.
tahonermann added a comment.
This revision now requires changes to proceed.

Splitting this into two patches (one to do the renames, another to perform the 
other changes) would simplify review.




Comment at: clang/lib/Frontend/TextDiagnostic.cpp:118-119
 
-  unsigned char const *begin, *end;
-  begin = reinterpret_cast(&*(SourceLine.begin() + *i));
-  end = begin + (SourceLine.size() - *i);
-
-  if (llvm::isLegalUTF8Sequence(begin, end)) {
-llvm::UTF32 c;
-llvm::UTF32 *cptr = 
-unsigned char const *original_begin = begin;
-unsigned char const *cp_end =
-begin + llvm::getNumBytesForUTF8(SourceLine[*i]);
-
-llvm::ConversionResult res = llvm::ConvertUTF8toUTF32(
-, cp_end, , cptr + 1, llvm::strictConversion);
-(void)res;
-assert(llvm::conversionOK == res);
-assert(0 < begin-original_begin
-   && "we must be further along in the string now");
-*i += begin-original_begin;
-
-if (!llvm::sys::locale::isPrint(c)) {
-  // If next character is valid UTF-8, but not printable
-  SmallString<16> expandedCP("");
-  while (c) {
-expandedCP.insert(expandedCP.begin()+3, llvm::hexdigit(c%16));
-c/=16;
-  }
-  while (expandedCP.size() < 8)
-expandedCP.insert(expandedCP.begin()+3, llvm::hexdigit(0));
-  return std::make_pair(expandedCP, false);
-}
+  const unsigned char *Begin =
+  reinterpret_cast(&*(SourceLine.begin())) + (*I);
+  unsigned CharSize = llvm::getNumBytesForUTF8(*Begin);





Comment at: clang/lib/Frontend/TextDiagnostic.cpp:120
-  begin = reinterpret_cast(&*(SourceLine.begin() + *i));
-  end = begin + (SourceLine.size() - *i);
-

tbaeder wrote:
> I don't know what this computation of `end` means, but from the debug output 
> I added, it meant that a call to this function always converted the entire 
> line. I think.
It looks to me like an overly complicated way of asking for 
`SourceLine.bytes_end()`.



Comment at: clang/lib/Frontend/TextDiagnostic.cpp:121
+  unsigned CharSize = llvm::getNumBytesForUTF8(*Begin);
+  const unsigned char *End = Begin + CharSize;
 

This could assign `End` to a location past the end of the source line when the 
source line ends with an ill-formed code unit sequence (e.g., a truncated 
4-byte sequence). Constructing such a pointer is likely to get one in trouble.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D150843/new/

https://reviews.llvm.org/D150843

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D150843: [clang][Diagnostics] Refactor printableTextForNextCharacter

2023-05-18 Thread Corentin Jabot via Phabricator via cfe-commits
cor3ntin added a comment.

This generally looks good to me but i have a couple remarks




Comment at: clang/lib/Frontend/TextDiagnostic.cpp:124-128
+  if (CharSize == 1 && llvm::isLegalUTF8Sequence(Begin, End) &&
+  llvm::sys::locale::isPrint(*Begin)) {
+++(*I);
+return std::make_pair(SmallString<16>(Begin, End), true);
+  }

this could be simplified : the common case for ascii could be just looking at 
`isPrint(*Begin);` (which implies  CharSize == 1 and  
llvm::isLegalUTF8Sequence(Begin, End))
So you could do it before computing CharSize



Comment at: clang/lib/Frontend/TextDiagnostic.cpp:132
+  // Convert it to UTF32 and check if it's printable.
+  if (llvm::isLegalUTF8Sequence(Begin, End)) {
+llvm::UTF32 C;

I think we should check that `CharSize` is not greater than `Begin + 
SourceLine.size() - *i`  or something along those lines otherwise we may 
overflow SourceLine.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D150843/new/

https://reviews.llvm.org/D150843

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D150843: [clang][Diagnostics] Refactor printableTextForNextCharacter

2023-05-17 Thread Timm Bäder via Phabricator via cfe-commits
tbaeder added inline comments.



Comment at: clang/lib/Frontend/TextDiagnostic.cpp:120
-  begin = reinterpret_cast(&*(SourceLine.begin() + *i));
-  end = begin + (SourceLine.size() - *i);
-

I don't know what this computation of `end` means, but from the debug output I 
added, it meant that a call to this function always converted the entire line. 
I think.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D150843/new/

https://reviews.llvm.org/D150843

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D150843: [clang][Diagnostics] Refactor printableTextForNextCharacter

2023-05-17 Thread Timm Bäder via Phabricator via cfe-commits
tbaeder created this revision.
tbaeder added reviewers: aaron.ballman, tahonermann.
Herald added a project: All.
tbaeder requested review of this revision.
Herald added a project: clang.
Herald added a subscriber: cfe-commits.

Rename parameters and local variables to start with an uppercase characters.

This is not 100% NFC, since it adds an easy/fast path for 1 byte printable 
characters and changes how much text is converted to UTF32.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D150843

Files:
  clang/lib/Frontend/TextDiagnostic.cpp

Index: clang/lib/Frontend/TextDiagnostic.cpp
===
--- clang/lib/Frontend/TextDiagnostic.cpp
+++ clang/lib/Frontend/TextDiagnostic.cpp
@@ -96,68 +96,76 @@
 /// \return pair(printable text, 'true' iff original text was printable)
 ///
 static std::pair, bool>
-printableTextForNextCharacter(StringRef SourceLine, size_t *i,
+printableTextForNextCharacter(StringRef SourceLine, size_t *I,
   unsigned TabStop) {
-  assert(i && "i must not be null");
-  assert(*i expandedTab;
-expandedTab.assign(NumSpaces, ' ');
-return std::make_pair(expandedTab, true);
+SmallString<16> ExpandedTab;
+ExpandedTab.assign(NumSpaces, ' ');
+return std::make_pair(ExpandedTab, true);
   }
 
-  unsigned char const *begin, *end;
-  begin = reinterpret_cast(&*(SourceLine.begin() + *i));
-  end = begin + (SourceLine.size() - *i);
-
-  if (llvm::isLegalUTF8Sequence(begin, end)) {
-llvm::UTF32 c;
-llvm::UTF32 *cptr = 
-unsigned char const *original_begin = begin;
-unsigned char const *cp_end =
-begin + llvm::getNumBytesForUTF8(SourceLine[*i]);
-
-llvm::ConversionResult res = llvm::ConvertUTF8toUTF32(
-, cp_end, , cptr + 1, llvm::strictConversion);
-(void)res;
-assert(llvm::conversionOK == res);
-assert(0 < begin-original_begin
-   && "we must be further along in the string now");
-*i += begin-original_begin;
-
-if (!llvm::sys::locale::isPrint(c)) {
-  // If next character is valid UTF-8, but not printable
-  SmallString<16> expandedCP("");
-  while (c) {
-expandedCP.insert(expandedCP.begin()+3, llvm::hexdigit(c%16));
-c/=16;
-  }
-  while (expandedCP.size() < 8)
-expandedCP.insert(expandedCP.begin()+3, llvm::hexdigit(0));
-  return std::make_pair(expandedCP, false);
-}
+  const unsigned char *Begin =
+  reinterpret_cast(&*(SourceLine.begin())) + (*I);
+  unsigned CharSize = llvm::getNumBytesForUTF8(*Begin);
+  const unsigned char *End = Begin + CharSize;
 
-// If next character is valid UTF-8, and printable
-return std::make_pair(SmallString<16>(original_begin, cp_end), true);
+  // Fast path for the common ASCII case.
+  if (CharSize == 1 && llvm::isLegalUTF8Sequence(Begin, End) &&
+  llvm::sys::locale::isPrint(*Begin)) {
+++(*I);
+return std::make_pair(SmallString<16>(Begin, End), true);
+  }
 
+  // We now know that the next character is a multi-byte character.
+  // Convert it to UTF32 and check if it's printable.
+  if (llvm::isLegalUTF8Sequence(Begin, End)) {
+llvm::UTF32 C;
+llvm::UTF32 *CPtr = 
+
+// Begin and end before conversion.
+unsigned char const *OriginalBegin = Begin;
+llvm::ConversionResult Res = llvm::ConvertUTF8toUTF32(
+, End, , CPtr + 1, llvm::strictConversion);
+(void)Res;
+assert(Res == llvm::conversionOK);
+assert(OriginalBegin < Begin);
+assert((Begin - OriginalBegin) == CharSize);
+
+(*I) += (Begin - OriginalBegin);
+
+// Valid, multi-byte, printable UTF8 character.
+if (llvm::sys::locale::isPrint(C))
+  return std::make_pair(SmallString<16>(OriginalBegin, End), true);
+
+// Valid but not printable.
+SmallString<16> Str("");
+while (C) {
+  Str.insert(Str.begin() + 3, llvm::hexdigit(C % 16));
+  C /= 16;
+}
+while (Str.size() < 8)
+  Str.insert(Str.begin() + 3, llvm::hexdigit(0));
+return std::make_pair(Str, false);
   }
 
-  // If next byte is not valid UTF-8 (and therefore not printable)
-  SmallString<16> expandedByte("");
-  unsigned char byte = SourceLine[*i];
-  expandedByte[1] = llvm::hexdigit(byte / 16);
-  expandedByte[2] = llvm::hexdigit(byte % 16);
-  ++(*i);
-  return std::make_pair(expandedByte, false);
+  // Otherwise, not printable since it's not valid UTF8.
+  SmallString<16> ExpandedByte("");
+  unsigned char Byte = SourceLine[*I];
+  ExpandedByte[1] = llvm::hexdigit(Byte / 16);
+  ExpandedByte[2] = llvm::hexdigit(Byte % 16);
+  ++(*I);
+  return std::make_pair(ExpandedByte, false);
 }
 
 static void expandTabs(std::string , unsigned TabStop) {
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits