https://github.com/igorkudrin updated https://github.com/llvm/llvm-project/pull/70898
>From 37ab3fff62b1a3aa373fd513745b1c2b91b1b865 Mon Sep 17 00:00:00 2001 From: Igor Kudrin <ikud...@accesssoftek.com> Date: Tue, 7 Nov 2023 18:42:02 -0800 Subject: [PATCH] [YAMLParser] Unfold multi-line scalar values Long scalar values can be split into multiple lines to improve readability. The rules are described in Section 6.5. "Line Folding", https://yaml.org/spec/1.2.2/#65-line-folding. In addition, for flow scalar styles, the Spec states that "All leading and trailing white space characters on each line are excluded from the content", https://yaml.org/spec/1.2.2/#73-flow-scalar-styles. The patch implements these unfolding rules for double-quoted, single-quoted, and plain scalars. --- llvm/include/llvm/Support/YAMLParser.h | 9 +- llvm/lib/Support/YAMLParser.cpp | 373 ++++++++++++---------- llvm/test/YAMLParser/spec-05-13.test | 2 +- llvm/test/YAMLParser/spec-05-14.test | 2 +- llvm/test/YAMLParser/spec-09-01.test | 4 +- llvm/test/YAMLParser/spec-09-02.test | 18 +- llvm/test/YAMLParser/spec-09-03.test | 6 +- llvm/test/YAMLParser/spec-09-04.test | 2 +- llvm/test/YAMLParser/spec-09-05.test | 6 +- llvm/test/YAMLParser/spec-09-07.test | 4 +- llvm/test/YAMLParser/spec-09-08.test | 8 +- llvm/test/YAMLParser/spec-09-09.test | 6 +- llvm/test/YAMLParser/spec-09-10.test | 2 +- llvm/test/YAMLParser/spec-09-11.test | 4 +- llvm/test/YAMLParser/spec-09-13.test | 4 +- llvm/test/YAMLParser/spec-09-16.test | 8 +- llvm/test/YAMLParser/spec-09-17.test | 2 +- llvm/test/YAMLParser/spec-10-02.test | 6 +- llvm/test/YAMLParser/spec1.2-07-05.test | 2 +- llvm/test/YAMLParser/spec1.2-07-06.test | 2 +- llvm/test/YAMLParser/spec1.2-07-09.test | 2 +- llvm/test/YAMLParser/spec1.2-07-12.test | 2 +- llvm/unittests/Support/YAMLParserTest.cpp | 102 ++++++ 23 files changed, 376 insertions(+), 200 deletions(-) diff --git a/llvm/include/llvm/Support/YAMLParser.h b/llvm/include/llvm/Support/YAMLParser.h index f4767641647c217..9d95a1e13a0dff4 100644 --- a/llvm/include/llvm/Support/YAMLParser.h +++ b/llvm/include/llvm/Support/YAMLParser.h @@ -240,9 +240,14 @@ class ScalarNode final : public Node { private: StringRef Value; - StringRef unescapeDoubleQuoted(StringRef UnquotedValue, - StringRef::size_type Start, + StringRef getDoubleQuotedValue(StringRef UnquotedValue, SmallVectorImpl<char> &Storage) const; + + static StringRef getSingleQuotedValue(StringRef RawValue, + SmallVectorImpl<char> &Storage); + + static StringRef getPlainValue(StringRef RawValue, + SmallVectorImpl<char> &Storage); }; /// A block scalar node is an opaque datum that can be presented as a diff --git a/llvm/lib/Support/YAMLParser.cpp b/llvm/lib/Support/YAMLParser.cpp index b47cb3ae3b44a75..fdd0ed6e682eb5e 100644 --- a/llvm/lib/Support/YAMLParser.cpp +++ b/llvm/lib/Support/YAMLParser.cpp @@ -2030,184 +2030,229 @@ bool Node::failed() const { } StringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const { - // TODO: Handle newlines properly. We need to remove leading whitespace. - if (Value[0] == '"') { // Double quoted. - // Pull off the leading and trailing "s. - StringRef UnquotedValue = Value.substr(1, Value.size() - 2); - // Search for characters that would require unescaping the value. - StringRef::size_type i = UnquotedValue.find_first_of("\\\r\n"); - if (i != StringRef::npos) - return unescapeDoubleQuoted(UnquotedValue, i, Storage); + if (Value[0] == '"') + return getDoubleQuotedValue(Value, Storage); + if (Value[0] == '\'') + return getSingleQuotedValue(Value, Storage); + return getPlainValue(Value, Storage); +} + +/// parseScalarValue - A common parsing routine for all flow scalar styles. +/// It handles line break characters by itself, adds regular content characters +/// to the result, and forwards escaped sequences to the provided routine for +/// the style-specific processing. +/// +/// \param UnquotedValue - An input value without quotation marks. +/// \param Storage - A storage for the result if the input value is multiline or +/// contains escaped characters. +/// \param LookupChars - A set of special characters to search in the input +/// string. Should include line break characters and the escape character +/// specific for the processing scalar style, if any. +/// \param UnescapeCallback - This is called when the escape character is found +/// in the input. +/// \returns - The unfolded and unescaped value. +static StringRef +parseScalarValue(StringRef UnquotedValue, SmallVectorImpl<char> &Storage, + StringRef LookupChars, + std::function<StringRef(StringRef, SmallVectorImpl<char> &)> + UnescapeCallback) { + size_t I = UnquotedValue.find_first_of(LookupChars); + if (I == StringRef::npos) return UnquotedValue; - } else if (Value[0] == '\'') { // Single quoted. - // Pull off the leading and trailing 's. - StringRef UnquotedValue = Value.substr(1, Value.size() - 2); - StringRef::size_type i = UnquotedValue.find('\''); - if (i != StringRef::npos) { - // We're going to need Storage. - Storage.clear(); - Storage.reserve(UnquotedValue.size()); - for (; i != StringRef::npos; i = UnquotedValue.find('\'')) { - StringRef Valid(UnquotedValue.begin(), i); - llvm::append_range(Storage, Valid); - Storage.push_back('\''); - UnquotedValue = UnquotedValue.substr(i + 2); + + Storage.clear(); + Storage.reserve(UnquotedValue.size()); + char LastNewLineAddedAs = '\0'; + for (; I != StringRef::npos; I = UnquotedValue.find_first_of(LookupChars)) { + if (UnquotedValue[I] != '\r' && UnquotedValue[I] != '\n') { + llvm::append_range(Storage, UnquotedValue.take_front(I)); + UnquotedValue = UnescapeCallback(UnquotedValue.drop_front(I), Storage); + LastNewLineAddedAs = '\0'; + continue; + } + if (size_t LastNonSWhite = UnquotedValue.find_last_not_of(" \t", I); + LastNonSWhite != StringRef::npos) { + llvm::append_range(Storage, UnquotedValue.take_front(LastNonSWhite + 1)); + Storage.push_back(' '); + LastNewLineAddedAs = ' '; + } else { + // Note: we can't just check if the last character in Storage is ' ', + // '\n', or something else; that would give a wrong result for double + // quoted values containing an escaped space character before a new-line + // character. + switch (LastNewLineAddedAs) { + case ' ': + assert(!Storage.empty() && Storage.back() == ' '); + Storage.back() = '\n'; + LastNewLineAddedAs = '\n'; + break; + case '\n': + assert(!Storage.empty() && Storage.back() == '\n'); + Storage.push_back('\n'); + break; + default: + Storage.push_back(' '); + LastNewLineAddedAs = ' '; + break; } - llvm::append_range(Storage, UnquotedValue); - return StringRef(Storage.begin(), Storage.size()); } - return UnquotedValue; + // Handle Windows-style EOL + if (UnquotedValue.substr(I, 2) == "\r\n") + I++; + UnquotedValue = UnquotedValue.drop_front(I + 1).ltrim(" \t"); } - // Plain. - // Trim whitespace ('b-char' and 's-white'). - // NOTE: Alternatively we could change the scanner to not include whitespace - // here in the first place. - return Value.rtrim("\x0A\x0D\x20\x09"); + llvm::append_range(Storage, UnquotedValue); + return StringRef(Storage.begin(), Storage.size()); } -StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue - , StringRef::size_type i - , SmallVectorImpl<char> &Storage) - const { - // Use Storage to build proper value. - Storage.clear(); - Storage.reserve(UnquotedValue.size()); - for (; i != StringRef::npos; i = UnquotedValue.find_first_of("\\\r\n")) { - // Insert all previous chars into Storage. - StringRef Valid(UnquotedValue.begin(), i); - llvm::append_range(Storage, Valid); - // Chop off inserted chars. - UnquotedValue = UnquotedValue.substr(i); - - assert(!UnquotedValue.empty() && "Can't be empty!"); - - // Parse escape or line break. +StringRef +ScalarNode::getDoubleQuotedValue(StringRef RawValue, + SmallVectorImpl<char> &Storage) const { + assert(RawValue.size() >= 2 && RawValue.front() == '"' && + RawValue.back() == '"'); + StringRef UnquotedValue = RawValue.substr(1, RawValue.size() - 2); + + auto UnescapeFunc = [this](StringRef UnquotedValue, + SmallVectorImpl<char> &Storage) { + assert(UnquotedValue.take_front(1) == "\\"); + if (UnquotedValue.size() == 1) { + Token T; + T.Range = UnquotedValue; + setError("Unrecognized escape code", T); + Storage.clear(); + return StringRef(); + } + UnquotedValue = UnquotedValue.drop_front(1); switch (UnquotedValue[0]) { + default: { + Token T; + T.Range = UnquotedValue.take_front(1); + setError("Unrecognized escape code", T); + Storage.clear(); + return StringRef(); + } case '\r': + // Shrink the Windows-style EOL. + if (UnquotedValue.size() >= 2 && UnquotedValue[1] == '\n') + UnquotedValue = UnquotedValue.drop_front(1); + [[fallthrough]]; case '\n': - Storage.push_back('\n'); - if ( UnquotedValue.size() > 1 - && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n')) - UnquotedValue = UnquotedValue.substr(1); - UnquotedValue = UnquotedValue.substr(1); + return UnquotedValue.drop_front(1).ltrim(" \t"); + case '0': + Storage.push_back(0x00); break; - default: - if (UnquotedValue.size() == 1) { - Token T; - T.Range = StringRef(UnquotedValue.begin(), 1); - setError("Unrecognized escape code", T); - return ""; - } - UnquotedValue = UnquotedValue.substr(1); - switch (UnquotedValue[0]) { - default: { - Token T; - T.Range = StringRef(UnquotedValue.begin(), 1); - setError("Unrecognized escape code", T); - return ""; - } - case '\r': - // Shrink the Windows-style EOL. - if (UnquotedValue.size() >= 2 && UnquotedValue[1] == '\n') - UnquotedValue = UnquotedValue.drop_front(1); - [[fallthrough]]; - case '\n': - UnquotedValue = UnquotedValue.drop_front(1).ltrim(" \t"); - continue; - case '0': - Storage.push_back(0x00); - break; - case 'a': - Storage.push_back(0x07); - break; - case 'b': - Storage.push_back(0x08); - break; - case 't': - case 0x09: - Storage.push_back(0x09); - break; - case 'n': - Storage.push_back(0x0A); - break; - case 'v': - Storage.push_back(0x0B); - break; - case 'f': - Storage.push_back(0x0C); - break; - case 'r': - Storage.push_back(0x0D); - break; - case 'e': - Storage.push_back(0x1B); - break; - case ' ': - Storage.push_back(0x20); - break; - case '"': - Storage.push_back(0x22); - break; - case '/': - Storage.push_back(0x2F); - break; - case '\\': - Storage.push_back(0x5C); - break; - case 'N': - encodeUTF8(0x85, Storage); - break; - case '_': - encodeUTF8(0xA0, Storage); + case 'a': + Storage.push_back(0x07); + break; + case 'b': + Storage.push_back(0x08); + break; + case 't': + case 0x09: + Storage.push_back(0x09); + break; + case 'n': + Storage.push_back(0x0A); + break; + case 'v': + Storage.push_back(0x0B); + break; + case 'f': + Storage.push_back(0x0C); + break; + case 'r': + Storage.push_back(0x0D); + break; + case 'e': + Storage.push_back(0x1B); + break; + case ' ': + Storage.push_back(0x20); + break; + case '"': + Storage.push_back(0x22); + break; + case '/': + Storage.push_back(0x2F); + break; + case '\\': + Storage.push_back(0x5C); + break; + case 'N': + encodeUTF8(0x85, Storage); + break; + case '_': + encodeUTF8(0xA0, Storage); + break; + case 'L': + encodeUTF8(0x2028, Storage); + break; + case 'P': + encodeUTF8(0x2029, Storage); + break; + case 'x': { + if (UnquotedValue.size() < 3) + // TODO: Report error. break; - case 'L': - encodeUTF8(0x2028, Storage); + unsigned int UnicodeScalarValue; + if (UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue)) + // TODO: Report error. + UnicodeScalarValue = 0xFFFD; + encodeUTF8(UnicodeScalarValue, Storage); + return UnquotedValue.drop_front(3); + } + case 'u': { + if (UnquotedValue.size() < 5) + // TODO: Report error. break; - case 'P': - encodeUTF8(0x2029, Storage); + unsigned int UnicodeScalarValue; + if (UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue)) + // TODO: Report error. + UnicodeScalarValue = 0xFFFD; + encodeUTF8(UnicodeScalarValue, Storage); + return UnquotedValue.drop_front(5); + } + case 'U': { + if (UnquotedValue.size() < 9) + // TODO: Report error. break; - case 'x': { - if (UnquotedValue.size() < 3) - // TODO: Report error. - break; - unsigned int UnicodeScalarValue; - if (UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue)) - // TODO: Report error. - UnicodeScalarValue = 0xFFFD; - encodeUTF8(UnicodeScalarValue, Storage); - UnquotedValue = UnquotedValue.substr(2); - break; - } - case 'u': { - if (UnquotedValue.size() < 5) - // TODO: Report error. - break; - unsigned int UnicodeScalarValue; - if (UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue)) - // TODO: Report error. - UnicodeScalarValue = 0xFFFD; - encodeUTF8(UnicodeScalarValue, Storage); - UnquotedValue = UnquotedValue.substr(4); - break; - } - case 'U': { - if (UnquotedValue.size() < 9) - // TODO: Report error. - break; - unsigned int UnicodeScalarValue; - if (UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue)) - // TODO: Report error. - UnicodeScalarValue = 0xFFFD; - encodeUTF8(UnicodeScalarValue, Storage); - UnquotedValue = UnquotedValue.substr(8); - break; - } - } - UnquotedValue = UnquotedValue.substr(1); + unsigned int UnicodeScalarValue; + if (UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue)) + // TODO: Report error. + UnicodeScalarValue = 0xFFFD; + encodeUTF8(UnicodeScalarValue, Storage); + return UnquotedValue.drop_front(9); } - } - llvm::append_range(Storage, UnquotedValue); - return StringRef(Storage.begin(), Storage.size()); + } + return UnquotedValue.drop_front(1); + }; + + return parseScalarValue(UnquotedValue, Storage, "\\\r\n", UnescapeFunc); +} + +StringRef ScalarNode::getSingleQuotedValue(StringRef RawValue, + SmallVectorImpl<char> &Storage) { + assert(RawValue.size() >= 2 && RawValue.front() == '\'' && + RawValue.back() == '\''); + StringRef UnquotedValue = RawValue.substr(1, RawValue.size() - 2); + + auto UnescapeFunc = [](StringRef UnquotedValue, + SmallVectorImpl<char> &Storage) { + assert(UnquotedValue.take_front(2) == "''"); + Storage.push_back('\''); + return UnquotedValue.drop_front(2); + }; + + return parseScalarValue(UnquotedValue, Storage, "'\r\n", UnescapeFunc); +} + +StringRef ScalarNode::getPlainValue(StringRef RawValue, + SmallVectorImpl<char> &Storage) { + // Trim trailing whitespace ('b-char' and 's-white'). + // NOTE: Alternatively we could change the scanner to not include whitespace + // here in the first place. + RawValue = RawValue.rtrim("\r\n \t"); + return parseScalarValue(RawValue, Storage, "\r\n", nullptr); } Node *KeyValueNode::getKey() { diff --git a/llvm/test/YAMLParser/spec-05-13.test b/llvm/test/YAMLParser/spec-05-13.test index e7ec42a4aaa80d7..b2367a373ee454a 100644 --- a/llvm/test/YAMLParser/spec-05-13.test +++ b/llvm/test/YAMLParser/spec-05-13.test @@ -1,5 +1,5 @@ # RUN: yaml-bench -canonical %s | FileCheck %s --strict-whitespace -# CHECK: "Text containing \n both space and\t\n \ttab\tcharacters" +# CHECK: "Text containing both space and tab\tcharacters" "Text containing both space and diff --git a/llvm/test/YAMLParser/spec-05-14.test b/llvm/test/YAMLParser/spec-05-14.test index 984f3721312ab63..87d699dbc027b8d 100644 --- a/llvm/test/YAMLParser/spec-05-14.test +++ b/llvm/test/YAMLParser/spec-05-14.test @@ -6,4 +6,4 @@ \ \_ \N \L \P \ \x41 \u0041 \U00000041" -# CHECK: !!str "Fun with \\\n\" \a \b \e \f \n \r \t \v \0 \_ \N \L \P A A A" +# CHECK: !!str "Fun with \\ \" \a \b \e \f \n \r \t \v \0 \_ \N \L \P A A A" diff --git a/llvm/test/YAMLParser/spec-09-01.test b/llvm/test/YAMLParser/spec-09-01.test index 2b5a6f31166ddf1..e552e7ca264404c 100644 --- a/llvm/test/YAMLParser/spec-09-01.test +++ b/llvm/test/YAMLParser/spec-09-01.test @@ -4,8 +4,8 @@ # CHECK-NEXT: : !!map { # CHECK-NEXT: ? !!str "also simple" # CHECK-NEXT: : !!str "value", -# CHECK-NEXT: ? !!str "not a\n simple key" -# CHECK-NEXT: : !!str "any\n value", +# CHECK-NEXT: ? !!str "not a simple key" +# CHECK-NEXT: : !!str "any value", # CHECK-NEXT: }, # CHECK-NEXT: } diff --git a/llvm/test/YAMLParser/spec-09-02.test b/llvm/test/YAMLParser/spec-09-02.test index 51ea61dd23273d3..99c836bf0047536 100644 --- a/llvm/test/YAMLParser/spec-09-02.test +++ b/llvm/test/YAMLParser/spec-09-02.test @@ -1,12 +1,24 @@ # RUN: yaml-bench -canonical %s 2>&1 | FileCheck %s --strict-whitespace -# CHECK: "as space\n trimmed \n specific\L\n escaped\t\n none" +# CHECK: "as space trimmed\nspecific\L escaped\t none" ## Note: The example was originally taken from Spec 1.1, but the parsing rules ## have been changed since then. -## * The paragraph-separator character '\u2029' is excluded from line-break +## * The line-separator character '\u2028' is no longer considered a line-break +## character, so the line "...specific\u2028\nescaped..." is now parsed as +## "...specific\L escaped...". +## * The paragraph-separator character '\u2029' is also excluded from line-break ## characters, so the original sequence "escaped\t\\\u2029" is no longer -## considered valid. This is replaced by "escaped\t\\\n" in the test source. +## considered valid. This is replaced by "escaped\t\\\n" in the test source, +# so the output has changed as well. ## See https://yaml.org/spec/1.2.2/ext/changes/ for details. +## +## Note 2: Different parsers handle this corner case example differently. +## * https://github.com/yaml/libyaml: +## "as space trimmed\nspecific\L\nescaped\t\nnone" +## * https://github.com/yaml/yaml-reference-parser (parser-1.2): +## "as space trimmed\nspecific\L escaped\t none" +## * https://github.com/yaml/yaml-reference-parser (parser-1.3): +## "as space trimmed\nspecific
 escaped\t none" "as space trimmed diff --git a/llvm/test/YAMLParser/spec-09-03.test b/llvm/test/YAMLParser/spec-09-03.test index c656058b7ff8b3e..f067d1366f06918 100644 --- a/llvm/test/YAMLParser/spec-09-03.test +++ b/llvm/test/YAMLParser/spec-09-03.test @@ -1,8 +1,8 @@ # RUN: yaml-bench -canonical %s | FileCheck %s --strict-whitespace # CHECK: !!seq [ -# CHECK-NEXT: !!str "\n last", -# CHECK-NEXT: !!str " \t\n last", -# CHECK-NEXT: !!str " \tfirst\n last", +# CHECK-NEXT: !!str " last", +# CHECK-NEXT: !!str " last", +# CHECK-NEXT: !!str " \tfirst last", # CHECK-NEXT: ] - " diff --git a/llvm/test/YAMLParser/spec-09-04.test b/llvm/test/YAMLParser/spec-09-04.test index e4f77ea83c7ac5f..79af877b38c8361 100644 --- a/llvm/test/YAMLParser/spec-09-04.test +++ b/llvm/test/YAMLParser/spec-09-04.test @@ -1,5 +1,5 @@ # RUN: yaml-bench -canonical %s | FileCheck %s --strict-whitespace -# CHECK: "first\n \tinner 1\t\n inner 2 last" +# CHECK: "first inner 1 inner 2 last" "first inner 1 diff --git a/llvm/test/YAMLParser/spec-09-05.test b/llvm/test/YAMLParser/spec-09-05.test index 5eb5b22f421d64b..4a748e609f1d692 100644 --- a/llvm/test/YAMLParser/spec-09-05.test +++ b/llvm/test/YAMLParser/spec-09-05.test @@ -1,8 +1,8 @@ # RUN: yaml-bench -canonical %s | FileCheck %s --strict-whitespace # CHECK: !!seq [ -# CHECK-NEXT: !!str "first\n \t", -# CHECK-NEXT: !!str "first\n \tlast", -# CHECK-NEXT: !!str "first\n inner\n \tlast", +# CHECK-NEXT: !!str "first ", +# CHECK-NEXT: !!str "first\nlast", +# CHECK-NEXT: !!str "first inner \tlast", # CHECK-NEXT: ] - "first diff --git a/llvm/test/YAMLParser/spec-09-07.test b/llvm/test/YAMLParser/spec-09-07.test index 71007e79b79d208..f397e2ca5f41672 100644 --- a/llvm/test/YAMLParser/spec-09-07.test +++ b/llvm/test/YAMLParser/spec-09-07.test @@ -4,8 +4,8 @@ # CHECK-NEXT: : !!map { # CHECK-NEXT: ? !!str "also simple" # CHECK-NEXT: : !!str "value", -# CHECK-NEXT: ? !!str "not a\n simple key" -# CHECK-NEXT: : !!str "any\n value", +# CHECK-NEXT: ? !!str "not a simple key" +# CHECK-NEXT: : !!str "any value", # CHECK-NEXT: }, # CHECK-NEXT: } diff --git a/llvm/test/YAMLParser/spec-09-08.test b/llvm/test/YAMLParser/spec-09-08.test index 5d1f13b0e31dfc0..7ed436ecb7cea7d 100644 --- a/llvm/test/YAMLParser/spec-09-08.test +++ b/llvm/test/YAMLParser/spec-09-08.test @@ -1,5 +1,11 @@ # RUN: yaml-bench -canonical %s | FileCheck %s --strict-whitespace -# CHECK: "as space\t\n trimmed \n \n specific\L\n none" +# CHECK: "as space trimmed\nspecific\L none" + +## Note: The parsing rules were changed in version 1.2 and the line-separator +## character is no longer considered a line-break character. The example is +## taken from Spec 1.1 and is now parsed as "..\L .." instead of "..\L\n.." as +## in the original edition. +## See https://yaml.org/spec/1.2.2/ext/changes/ for details. 'as space trimmed diff --git a/llvm/test/YAMLParser/spec-09-09.test b/llvm/test/YAMLParser/spec-09-09.test index 181971bd1349530..4910b66c24b1c9b 100644 --- a/llvm/test/YAMLParser/spec-09-09.test +++ b/llvm/test/YAMLParser/spec-09-09.test @@ -1,8 +1,8 @@ # RUN: yaml-bench -canonical %s | FileCheck %s --strict-whitespace # CHECK: !!seq [ -# CHECK-NEXT: !!str "\n last", -# CHECK-NEXT: !!str " \t\n last", -# CHECK-NEXT: !!str " \tfirst\n last", +# CHECK-NEXT: !!str " last", +# CHECK-NEXT: !!str " last", +# CHECK-NEXT: !!str " \tfirst last", # CHECK-NEXT: ] - ' diff --git a/llvm/test/YAMLParser/spec-09-10.test b/llvm/test/YAMLParser/spec-09-10.test index f75834fa4dda544..3e21afe22d349f1 100644 --- a/llvm/test/YAMLParser/spec-09-10.test +++ b/llvm/test/YAMLParser/spec-09-10.test @@ -1,5 +1,5 @@ # RUN: yaml-bench -canonical %s | FileCheck %s --strict-whitespace -# CHECK: "first\n \tinner\t\n last" +# CHECK: "first inner last" 'first inner diff --git a/llvm/test/YAMLParser/spec-09-11.test b/llvm/test/YAMLParser/spec-09-11.test index b1f8f45f954af22..62bc1927998b3e0 100644 --- a/llvm/test/YAMLParser/spec-09-11.test +++ b/llvm/test/YAMLParser/spec-09-11.test @@ -1,7 +1,7 @@ # RUN: yaml-bench -canonical %s | FileCheck %s --strict-whitespace # CHECK: !!seq [ -# CHECK-NEXT: !!str "first\n \t", -# CHECK-NEXT: !!str "first\n\n \tlast", +# CHECK-NEXT: !!str "first ", +# CHECK-NEXT: !!str "first\nlast", # CHECK-NEXT: ] - 'first diff --git a/llvm/test/YAMLParser/spec-09-13.test b/llvm/test/YAMLParser/spec-09-13.test index 015f38951ebbd64..f2a5f49ea0c6632 100644 --- a/llvm/test/YAMLParser/spec-09-13.test +++ b/llvm/test/YAMLParser/spec-09-13.test @@ -4,8 +4,8 @@ # CHECK-NEXT: : !!map { # CHECK-NEXT: ? !!str "also simple" # CHECK-NEXT: : !!str "value", -# CHECK-NEXT: ? !!str "not a\n simple key" -# CHECK-NEXT: : !!str "any\n value", +# CHECK-NEXT: ? !!str "not a simple key" +# CHECK-NEXT: : !!str "any value", # CHECK-NEXT: }, # CHECK-NEXT: } diff --git a/llvm/test/YAMLParser/spec-09-16.test b/llvm/test/YAMLParser/spec-09-16.test index b1f52ce194f11af..b6c92e3ec63c17d 100644 --- a/llvm/test/YAMLParser/spec-09-16.test +++ b/llvm/test/YAMLParser/spec-09-16.test @@ -1,5 +1,11 @@ # RUN: yaml-bench -canonical %s | FileCheck %s --strict-whitespace -# CHECK: "as space\t\n trimmed \n\n specific\L\n none" +# CHECK: "as space trimmed\nspecific\L none" + +## Note: The parsing rules were changed in version 1.2 and the line-separator +## character is no longer considered a line-break character. The example is +## taken from Spec 1.1 and is now parsed as "..\L .." instead of "..\L\n.." as +## in the original edition. +## See https://yaml.org/spec/1.2.2/ext/changes/ for details. as space trimmed diff --git a/llvm/test/YAMLParser/spec-09-17.test b/llvm/test/YAMLParser/spec-09-17.test index 425925774d92fd1..06f1db21202753b 100644 --- a/llvm/test/YAMLParser/spec-09-17.test +++ b/llvm/test/YAMLParser/spec-09-17.test @@ -1,5 +1,5 @@ # RUN: yaml-bench -canonical %s | FileCheck %s --strict-whitespace -# CHECK: "first line \n \n more line" +# CHECK: "first line\nmore line" first line diff --git a/llvm/test/YAMLParser/spec-10-02.test b/llvm/test/YAMLParser/spec-10-02.test index 9adddc9237d51de..2fd91040af26ccd 100644 --- a/llvm/test/YAMLParser/spec-10-02.test +++ b/llvm/test/YAMLParser/spec-10-02.test @@ -1,8 +1,8 @@ # RUN: yaml-bench -canonical %s | FileCheck %s --strict-whitespace # CHECK: !!seq [ -# CHECK-NEXT: !!str "double\n quoted", -# CHECK-NEXT: !!str "single\n quoted", -# CHECK-NEXT: !!str "plain\n text", +# CHECK-NEXT: !!str "double quoted", +# CHECK-NEXT: !!str "single quoted", +# CHECK-NEXT: !!str "plain text", # CHECK-NEXT: !!seq [ # CHECK-NEXT: !!str "nested", # CHECK-NEXT: ], diff --git a/llvm/test/YAMLParser/spec1.2-07-05.test b/llvm/test/YAMLParser/spec1.2-07-05.test index f923f68d04295f9..a273e79acef6551 100644 --- a/llvm/test/YAMLParser/spec1.2-07-05.test +++ b/llvm/test/YAMLParser/spec1.2-07-05.test @@ -1,5 +1,5 @@ # RUN: yaml-bench -canonical %s | FileCheck %s --strict-whitespace -# CHECK: "folded \nto a space,\t\n \nto a line feed, or \t \tnon-content" +# CHECK: "folded to a space,\nto a line feed, or \t \tnon-content" "folded to a space, diff --git a/llvm/test/YAMLParser/spec1.2-07-06.test b/llvm/test/YAMLParser/spec1.2-07-06.test index 8982c1ed2a7b18d..7008bbcf1516c5a 100644 --- a/llvm/test/YAMLParser/spec1.2-07-06.test +++ b/llvm/test/YAMLParser/spec1.2-07-06.test @@ -1,5 +1,5 @@ # RUN: yaml-bench -canonical %s | FileCheck %s --strict-whitespace -# CHECK: " 1st non-empty\n 2nd non-empty \n\t3rd non-empty " +# CHECK: " 1st non-empty\n2nd non-empty 3rd non-empty " " 1st non-empty diff --git a/llvm/test/YAMLParser/spec1.2-07-09.test b/llvm/test/YAMLParser/spec1.2-07-09.test index 38d541973bc43fc..6a71f8c8ad890e7 100644 --- a/llvm/test/YAMLParser/spec1.2-07-09.test +++ b/llvm/test/YAMLParser/spec1.2-07-09.test @@ -1,5 +1,5 @@ # RUN: yaml-bench -canonical %s | FileCheck %s --strict-whitespace -# CHECK: " 1st non-empty\n\n 2nd non-empty \n\t3rd non-empty " +# CHECK: " 1st non-empty\n2nd non-empty 3rd non-empty " ' 1st non-empty diff --git a/llvm/test/YAMLParser/spec1.2-07-12.test b/llvm/test/YAMLParser/spec1.2-07-12.test index 84d986e29d510c5..b5d0cb91f3023d4 100644 --- a/llvm/test/YAMLParser/spec1.2-07-12.test +++ b/llvm/test/YAMLParser/spec1.2-07-12.test @@ -1,5 +1,5 @@ # RUN: yaml-bench -canonical %s | FileCheck %s --strict-whitespace -# CHECK: "1st non-empty\n\n 2nd non-empty \n\t3rd non-empty" +# CHECK: "1st non-empty\n2nd non-empty 3rd non-empty" 1st non-empty diff --git a/llvm/unittests/Support/YAMLParserTest.cpp b/llvm/unittests/Support/YAMLParserTest.cpp index 247e70756861df1..7bd11e748155d8b 100644 --- a/llvm/unittests/Support/YAMLParserTest.cpp +++ b/llvm/unittests/Support/YAMLParserTest.cpp @@ -441,4 +441,106 @@ TEST(YAMLParser, ParsesBools) { expectCannotParseBool("0"); } +// Checks that the given string can be parsed into an expected scalar value. +static void expectCanParseScalar(StringRef Input, StringRef Expected) { + SourceMgr SM; + yaml::Stream Stream(Input, SM); + yaml::Node *Root = Stream.begin()->getRoot(); + ASSERT_NE(Root, nullptr); + auto *ScalarNode = dyn_cast<yaml::ScalarNode>(Root); + ASSERT_NE(ScalarNode, nullptr); + SmallVector<char> Storage; + StringRef Result = ScalarNode->getValue(Storage); + EXPECT_EQ(Result, Expected); +} + +TEST(YAMLParser, UnfoldsScalarValue) { + // Double-quoted values + expectCanParseScalar("\"\"", ""); + expectCanParseScalar("\" \t\t \t\t \"", " \t\t \t\t "); + expectCanParseScalar("\"\n\"", " "); + expectCanParseScalar("\"\r\"", " "); + expectCanParseScalar("\"\r\n\"", " "); + expectCanParseScalar("\"\n\n\"", "\n"); + expectCanParseScalar("\"\r\r\"", "\n"); + expectCanParseScalar("\"\n\r\"", "\n"); + expectCanParseScalar("\"\r\n\r\n\"", "\n"); + expectCanParseScalar("\"\n\n\n\"", "\n\n"); + expectCanParseScalar("\"\r\r\r\"", "\n\n"); + expectCanParseScalar("\"\r\n\r\n\r\n\"", "\n\n"); + expectCanParseScalar("\" \t \t \n\t \t \t\r \t \t \"", "\n"); + expectCanParseScalar("\" \t A \t \n \t B \t \"", " \t A B \t "); + expectCanParseScalar("\" \t \\ \r\r\t \\ \t \"", " \t \n \t "); + expectCanParseScalar("\"A\nB\"", "A B"); + expectCanParseScalar("\"A\rB\"", "A B"); + expectCanParseScalar("\"A\r\nB\"", "A B"); + expectCanParseScalar("\"A\n\nB\"", "A\nB"); + expectCanParseScalar("\"A\r\rB\"", "A\nB"); + expectCanParseScalar("\"A\n\rB\"", "A\nB"); + expectCanParseScalar("\"A\r\n\r\nB\"", "A\nB"); + expectCanParseScalar("\"A\n\n\nB\"", "A\n\nB"); + expectCanParseScalar("\"A\r\r\rB\"", "A\n\nB"); + expectCanParseScalar("\"A\r\n\r\n\r\nB\"", "A\n\nB"); + expectCanParseScalar("\"A \t \t \n\t \t \t B\"", "A B"); + expectCanParseScalar("\"A \t \t \n\t \t \t\r \t \t B\"", "A\nB"); + expectCanParseScalar("\"A \t \t \n\t \t \t\r\n \t \r \t B\"", "A\n\nB"); + expectCanParseScalar("\"A\\\rB\"", "AB"); + expectCanParseScalar("\"A\\\nB\"", "AB"); + expectCanParseScalar("\"A\\\r\nB\"", "AB"); + expectCanParseScalar("\"A \t \\\rB\"", "A \t B"); + expectCanParseScalar("\"A \t\\\nB\"", "A \tB"); + expectCanParseScalar("\"A\t \\\r\nB\"", "A\t B"); + expectCanParseScalar("\"A\\\r\rB\"", "A B"); + expectCanParseScalar("\"A\\\n\nB\"", "A B"); + expectCanParseScalar("\"A\\\r\n\r\nB\"", "A B"); + expectCanParseScalar("\"A\\\r\r\rB\"", "A\nB"); + expectCanParseScalar("\"A\\\n\n\nB\"", "A\nB"); + expectCanParseScalar("\"A\\\r\n\r\n\r\nB\"", "A\nB"); + expectCanParseScalar("\"A\r\\ \rB\"", "A B"); + // Single-quoted values + expectCanParseScalar("''", ""); + expectCanParseScalar("' \t\t \t\t '", " \t\t \t\t "); + expectCanParseScalar("'\n'", " "); + expectCanParseScalar("'\r'", " "); + expectCanParseScalar("'\r\n'", " "); + expectCanParseScalar("'\n\n'", "\n"); + expectCanParseScalar("'\r\r'", "\n"); + expectCanParseScalar("'\n\r'", "\n"); + expectCanParseScalar("'\r\n\r\n'", "\n"); + expectCanParseScalar("'\n\n\n'", "\n\n"); + expectCanParseScalar("'\r\r\r'", "\n\n"); + expectCanParseScalar("'\r\n\r\n\r\n'", "\n\n"); + expectCanParseScalar("' \t \t \n\t \t \t\r \t \t '", "\n"); + expectCanParseScalar("' \t A \t \n \t B \t '", " \t A B \t "); + expectCanParseScalar("'A\nB'", "A B"); + expectCanParseScalar("'A\rB'", "A B"); + expectCanParseScalar("'A\r\nB'", "A B"); + expectCanParseScalar("'A\n\nB'", "A\nB"); + expectCanParseScalar("'A\r\rB'", "A\nB"); + expectCanParseScalar("'A\n\rB'", "A\nB"); + expectCanParseScalar("'A\r\n\r\nB'", "A\nB"); + expectCanParseScalar("'A\n\n\nB'", "A\n\nB"); + expectCanParseScalar("'A\r\r\rB'", "A\n\nB"); + expectCanParseScalar("'A\r\n\r\n\r\nB'", "A\n\nB"); + expectCanParseScalar("'A \t \t \n\t \t \t B'", "A B"); + expectCanParseScalar("'A \t \t \n\t \t \t\r \t \t B'", "A\nB"); + expectCanParseScalar("'A \t \t \n\t \t \t\r\n \t \r \t B'", "A\n\nB"); + // Plain values + expectCanParseScalar("A \t \r \n \t \r\n \t\r\r\t ", "A"); + expectCanParseScalar("A \t \n \t B", "A B"); + expectCanParseScalar("A\nB", "A B"); + expectCanParseScalar("A\rB", "A B"); + expectCanParseScalar("A\r\nB", "A B"); + expectCanParseScalar("A\n\nB", "A\nB"); + expectCanParseScalar("A\r\rB", "A\nB"); + expectCanParseScalar("A\n\rB", "A\nB"); + expectCanParseScalar("A\r\n\r\nB", "A\nB"); + expectCanParseScalar("A\n\n\nB", "A\n\nB"); + expectCanParseScalar("A\r\r\rB", "A\n\nB"); + expectCanParseScalar("A\r\n\r\n\r\nB", "A\n\nB"); + expectCanParseScalar("A \t \t \n\t \t \t B", "A B"); + expectCanParseScalar("A \t \t \n\t \t \t\r \t \t B", "A\nB"); + expectCanParseScalar("A \t \t \n\t \t \t\r\n \t \r \t B", "A\n\nB"); +} + } // end namespace llvm _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits