This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new c0d9ca3be14c [SPARK-45400][SQL][DOCS] Refer to the unescaping rules from expression descriptions c0d9ca3be14c is described below commit c0d9ca3be14cb0ec8d8f9920d3ecc4aac3cf5adc Author: Max Gekk <max.g...@gmail.com> AuthorDate: Thu Oct 5 22:22:29 2023 +0300 [SPARK-45400][SQL][DOCS] Refer to the unescaping rules from expression descriptions ### What changes were proposed in this pull request? In the PR, I propose to refer to the unescaping rules added by https://github.com/apache/spark/pull/43152 from expression descriptions like in `Like`, see <img width="1057" alt="Screenshot 2023-10-05 at 19 15 17" src="https://github.com/apache/spark/assets/1580697/6a332b50-f2c8-4549-848a-61519c9f964e"> ### Why are the changes needed? To improve user experience w/ Spark SQL. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Manually generated docs and checked by eyes. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #43203 from MaxGekk/link-to-escape-doc. Authored-by: Max Gekk <max.g...@gmail.com> Signed-off-by: Max Gekk <max.g...@gmail.com> --- docs/sql-ref-literals.md | 2 + .../catalyst/expressions/regexpExpressions.scala | 70 ++++++++++++++-------- 2 files changed, 47 insertions(+), 25 deletions(-) diff --git a/docs/sql-ref-literals.md b/docs/sql-ref-literals.md index e9447af71c54..2a02a22bd6f0 100644 --- a/docs/sql-ref-literals.md +++ b/docs/sql-ref-literals.md @@ -62,6 +62,8 @@ The following escape sequences are recognized in regular string literals (withou - `\_` -> `\_`; - `\<other char>` -> `<other char>`, skip the slash and leave the character as is. +The unescaping rules above can be turned off by setting the SQL config `spark.sql.parser.escapedStringLiterals` to `true`. 
+ #### Examples ```sql diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index 69d90296d7ff..87ea8b5a102a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -77,7 +77,7 @@ abstract class StringRegexExpression extends BinaryExpression } } -// scalastyle:off line.contains.tab +// scalastyle:off line.contains.tab line.size.limit /** * Simple RegEx pattern matching function */ @@ -92,11 +92,14 @@ abstract class StringRegexExpression extends BinaryExpression _ matches any one character in the input (similar to . in posix regular expressions)\ % matches zero or more characters in the input (similar to .* in posix regular expressions)<br><br> - Since Spark 2.0, string literals are unescaped in our SQL parser. For example, in order - to match "\abc", the pattern should be "\\abc".<br><br> + Since Spark 2.0, string literals are unescaped in our SQL parser, see the unescaping + rules at <a href="https://spark.apache.org/docs/latest/sql-ref-literals.html#string-literal">String Literal</a>. + For example, in order to match "\abc", the pattern should be "\\abc".<br><br> When SQL config 'spark.sql.parser.escapedStringLiterals' is enabled, it falls back to Spark 1.6 behavior regarding string literal parsing. For example, if the config is - enabled, the pattern to match "\abc" should be "\abc". + enabled, the pattern to match "\abc" should be "\abc".<br><br> + It's recommended to use a raw string literal (with the `r` prefix) to avoid escaping + special characters in the pattern string if any exist. * escape - a character added since Spark 3.0. The default escape character is the '\'. 
If an escape character precedes a special symbol or another escape character, the following character is matched literally. It is invalid to escape any other character. @@ -121,7 +124,7 @@ abstract class StringRegexExpression """, since = "1.0.0", group = "predicate_funcs") -// scalastyle:on line.contains.tab +// scalastyle:on line.contains.tab line.size.limit case class Like(left: Expression, right: Expression, escapeChar: Char) extends StringRegexExpression { @@ -207,11 +210,14 @@ case class Like(left: Expression, right: Expression, escapeChar: Char) _ matches any one character in the input (similar to . in posix regular expressions)<br><br> % matches zero or more characters in the input (similar to .* in posix regular expressions)<br><br> - Since Spark 2.0, string literals are unescaped in our SQL parser. For example, in order - to match "\abc", the pattern should be "\\abc".<br><br> + Since Spark 2.0, string literals are unescaped in our SQL parser, see the unescaping + rules at <a href="https://spark.apache.org/docs/latest/sql-ref-literals.html#string-literal">String Literal</a>. + For example, in order to match "\abc", the pattern should be "\\abc".<br><br> When SQL config 'spark.sql.parser.escapedStringLiterals' is enabled, it falls back to Spark 1.6 behavior regarding string literal parsing. For example, if the config is - enabled, the pattern to match "\abc" should be "\abc". + enabled, the pattern to match "\abc" should be "\abc".<br><br> + It's recommended to use a raw string literal (with the `r` prefix) to avoid escaping + special characters in the pattern string if any exist. * escape - a character added since Spark 3.0. The default escape character is the '\'. If an escape character precedes a special symbol or another escape character, the following character is matched literally. It is invalid to escape any other character. 
@@ -412,7 +418,7 @@ case class NotLikeAny(child: Expression, patterns: Seq[UTF8String]) extends Like copy(child = newChild) } -// scalastyle:off line.contains.tab +// scalastyle:off line.contains.tab line.size.limit @ExpressionDescription( usage = "_FUNC_(str, regexp) - Returns true if `str` matches `regexp`, or false otherwise.", arguments = """ @@ -421,12 +427,14 @@ case class NotLikeAny(child: Expression, patterns: Seq[UTF8String]) extends Like * regexp - a string expression. The regex string should be a Java regular expression. Since Spark 2.0, string literals (including regex patterns) are unescaped in our SQL - parser. For example, to match "\abc", a regular expression for `regexp` can be - "^\\abc$". + parser, see the unescaping rules at <a href="https://spark.apache.org/docs/latest/sql-ref-literals.html#string-literal">String Literal</a>. + For example, to match "\abc", a regular expression for `regexp` can be "^\\abc$". There is a SQL config 'spark.sql.parser.escapedStringLiterals' that can be used to fall back to the Spark 1.6 behavior regarding string literal parsing. For example, - if the config is enabled, the `regexp` that can match "\abc" is "^\abc$". + if the config is enabled, the `regexp` that can match "\abc" is "^\abc$".<br><br> + It's recommended to use a raw string literal (with the `r` prefix) to avoid escaping + special characters in the pattern string if any exist. """, examples = """ Examples: @@ -444,7 +452,7 @@ case class NotLikeAny(child: Expression, patterns: Seq[UTF8String]) extends Like """, since = "1.0.0", group = "predicate_funcs") -// scalastyle:on line.contains.tab +// scalastyle:on line.contains.tab line.size.limit case class RLike(left: Expression, right: Expression) extends StringRegexExpression { override def escape(v: String): String = v @@ -573,11 +581,13 @@ case class StringSplit(str: Expression, regex: Expression, limit: Expression) * regexp - a string representing a regular expression. 
The regex string should be a Java regular expression.<br><br> Since Spark 2.0, string literals (including regex patterns) are unescaped in our SQL - parser. For example, to match "\abc", a regular expression for `regexp` can be - "^\\abc$".<br><br> + parser, see the unescaping rules at <a href="https://spark.apache.org/docs/latest/sql-ref-literals.html#string-literal">String Literal</a>. + For example, to match "\abc", a regular expression for `regexp` can be "^\\abc$".<br><br> There is a SQL config 'spark.sql.parser.escapedStringLiterals' that can be used to fall back to the Spark 1.6 behavior regarding string literal parsing. For example, - if the config is enabled, the `regexp` that can match "\abc" is "^\abc$". + if the config is enabled, the `regexp` that can match "\abc" is "^\abc$".<br><br> + It's recommended to use a raw string literal (with the `r` prefix) to avoid escaping + special characters in the pattern string if any exist. * rep - a string expression to replace matched substrings. * position - a positive integer literal that indicates the position within `str` to begin searching. The default is 1. If position is greater than the number of characters in `str`, the result is `str`. @@ -774,6 +784,7 @@ abstract class RegExpExtractBase * * NOTE: this expression is not THREAD-SAFE, as it has some internal mutable status. */ +// scalastyle:off line.size.limit @ExpressionDescription( usage = """ _FUNC_(str, regexp[, idx]) - Extract the first string in the `str` that matches the `regexp` @@ -785,11 +796,13 @@ abstract class RegExpExtractBase * regexp - a string representing a regular expression. The regex string should be a Java regular expression.<br><br> Since Spark 2.0, string literals (including regex patterns) are unescaped in our SQL - parser. 
For example, to match "\abc", a regular expression for `regexp` can be - "^\\abc$".<br><br> + parser, see the unescaping rules at <a href="https://spark.apache.org/docs/latest/sql-ref-literals.html#string-literal">String Literal</a>. + For example, to match "\abc", a regular expression for `regexp` can be "^\\abc$".<br><br> There is a SQL config 'spark.sql.parser.escapedStringLiterals' that can be used to fall back to the Spark 1.6 behavior regarding string literal parsing. For example, - if the config is enabled, the `regexp` that can match "\abc" is "^\abc$". + if the config is enabled, the `regexp` that can match "\abc" is "^\abc$".<br><br> + It's recommended to use a raw string literal (with the `r` prefix) to avoid escaping + special characters in the pattern string if any exist. * idx - an integer expression that represents the group index. The regex may contain multiple groups. `idx` indicates which regex group to extract. The group index should be non-negative. The minimum value of `idx` is 0, which means matching the entire @@ -803,6 +816,7 @@ abstract class RegExpExtractBase """, since = "1.5.0", group = "string_funcs") +// scalastyle:on line.size.limit case class RegExpExtract(subject: Expression, regexp: Expression, idx: Expression) extends RegExpExtractBase { def this(s: Expression, r: Expression) = this(s, r, Literal(1)) @@ -866,6 +880,7 @@ case class RegExpExtract(subject: Expression, regexp: Expression, idx: Expressio * * NOTE: this expression is not THREAD-SAFE, as it has some internal mutable status. */ +// scalastyle:off line.size.limit @ExpressionDescription( usage = """ _FUNC_(str, regexp[, idx]) - Extract all strings in the `str` that match the `regexp` @@ -877,11 +892,13 @@ case class RegExpExtract(subject: Expression, regexp: Expression, idx: Expressio * regexp - a string representing a regular expression. 
The regex string should be a Java regular expression.<br><br> Since Spark 2.0, string literals (including regex patterns) are unescaped in our SQL - parser. For example, to match "\abc", a regular expression for `regexp` can be - "^\\abc$".<br><br> + parser, see the unescaping rules at <a href="https://spark.apache.org/docs/latest/sql-ref-literals.html#string-literal">String Literal</a>. + For example, to match "\abc", a regular expression for `regexp` can be "^\\abc$".<br><br> There is a SQL config 'spark.sql.parser.escapedStringLiterals' that can be used to fall back to the Spark 1.6 behavior regarding string literal parsing. For example, - if the config is enabled, the `regexp` that can match "\abc" is "^\abc$". + if the config is enabled, the `regexp` that can match "\abc" is "^\abc$".<br><br> + It's recommended to use a raw string literal (with the `r` prefix) to avoid escaping + special characters in the pattern string if any exist. * idx - an integer expression that represents the group index. The regex may contain multiple groups. `idx` indicates which regex group to extract. The group index should be non-negative. The minimum value of `idx` is 0, which means matching the entire @@ -895,6 +912,7 @@ case class RegExpExtract(subject: Expression, regexp: Expression, idx: Expressio """, since = "3.1.0", group = "string_funcs") +// scalastyle:on line.size.limit case class RegExpExtractAll(subject: Expression, regexp: Expression, idx: Expression) extends RegExpExtractBase { def this(s: Expression, r: Expression) = this(s, r, Literal(1)) @@ -1047,11 +1065,13 @@ case class RegExpSubStr(left: Expression, right: Expression) * regexp - a string representing a regular expression. The regex string should be a Java regular expression.<br><br> Since Spark 2.0, string literals (including regex patterns) are unescaped in our SQL - parser. 
For example, to match "\abc", a regular expression for `regexp` can be - "^\\abc$".<br><br> + parser, see the unescaping rules at <a href="https://spark.apache.org/docs/latest/sql-ref-literals.html#string-literal">String Literal</a>. + For example, to match "\abc", a regular expression for `regexp` can be "^\\abc$".<br><br> There is a SQL config 'spark.sql.parser.escapedStringLiterals' that can be used to fall back to the Spark 1.6 behavior regarding string literal parsing. For example, - if the config is enabled, the `regexp` that can match "\abc" is "^\abc$". + if the config is enabled, the `regexp` that can match "\abc" is "^\abc$".<br><br> + It's recommended to use a raw string literal (with the `r` prefix) to avoid escaping + special characters in the pattern string if any exist. """, examples = """ Examples: --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org