MaxGekk commented on code in PR #41038:
URL: https://github.com/apache/spark/pull/41038#discussion_r1189592412
##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala:
##########
@@ -79,10 +80,38 @@ object StringUtils extends Logging {
private[this] val falseStrings =
Set("f", "false", "n", "no", "0").map(UTF8String.fromString)
- private[spark] def orderStringsBySimilarity(
+ private[spark] def orderSuggestedIdentifiersBySimilarity(
baseString: String,
testStrings: Seq[String]): Seq[String] = {
- testStrings.sortBy(LevenshteinDistance.getDefaultInstance.apply(_,
baseString))
+ // This method is used to generate suggested list of candidates closest to
`baseString` from the
+ // list of `testStrings`. Spark uses it to clarify error message in case a
query refers to non
+ // existent column or attribute. The `baseString` could be single part or
multi part and this
+ // method will try to match suggestions.
+ // Note that identifiers from `testStrings` could represent columns or
attributes from different
+ // catalogs, schemas or tables. We preserve suggested identifier prefix
and reconstruct
+ // multi-part identifier after ordering if there are more than one unique
prefix in a list. This
+ // will also reconstruct multi-part identifier for the cases of nested
columnns. E.g. for a
Review Comment:
```suggestion
// will also reconstruct multi-part identifier for the cases of nested columns. E.g. for a
```
##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala:
##########
@@ -79,10 +80,38 @@ object StringUtils extends Logging {
private[this] val falseStrings =
Set("f", "false", "n", "no", "0").map(UTF8String.fromString)
- private[spark] def orderStringsBySimilarity(
+ private[spark] def orderSuggestedIdentifiersBySimilarity(
baseString: String,
testStrings: Seq[String]): Seq[String] = {
- testStrings.sortBy(LevenshteinDistance.getDefaultInstance.apply(_,
baseString))
+ // This method is used to generate suggested list of candidates closest to
`baseString` from the
+ // list of `testStrings`. Spark uses it to clarify error message in case a
query refers to non
+ // existent column or attribute. The `baseString` could be single part or
multi part and this
+ // method will try to match suggestions.
+ // Note that identifiers from `testStrings` could represent columns or
attributes from different
+ // catalogs, schemas or tables. We preserve suggested identifier prefix
and reconstruct
+ // multi-part identifier after ordering if there are more than one unique
prefix in a list. This
+ // will also reconstruct multi-part identifier for the cases of nested
columnns. E.g. for a
+ // table `t` with columns `a`, `b`, `c.d` (nested) and requested column
`d` we will create
+ // prefixes `t`, `t`, and `t.c`. Since there is more than one distinct
prefix we will return
+ // sorted suggestions as multi-part identifiers => (`t`.`c`.`d`, `t`.`a`,
`t`.`b`).
+ val multiPart = UnresolvedAttribute.parseAttributeName(baseString).size > 1
+ if (multiPart) {
+ testStrings.sortBy(LevenshteinDistance.getDefaultInstance.apply(_,
baseString))
+ } else {
+ val split = testStrings.map { ident =>
+ val parts = UnresolvedAttribute.parseAttributeName(ident)
+ (parts.take(parts.size - 1).map(quoteIfNeeded).mkString("."),
quoteIfNeeded(parts.last))
Review Comment:
```suggestion
val parts = UnresolvedAttribute.parseAttributeName(ident).map(quoteIfNeeded)
(parts.init.mkString("."), parts.last)
```
##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala:
##########
@@ -79,10 +80,38 @@ object StringUtils extends Logging {
private[this] val falseStrings =
Set("f", "false", "n", "no", "0").map(UTF8String.fromString)
- private[spark] def orderStringsBySimilarity(
+ private[spark] def orderSuggestedIdentifiersBySimilarity(
baseString: String,
testStrings: Seq[String]): Seq[String] = {
- testStrings.sortBy(LevenshteinDistance.getDefaultInstance.apply(_,
baseString))
+ // This method is used to generate suggested list of candidates closest to
`baseString` from the
+ // list of `testStrings`. Spark uses it to clarify error message in case a
query refers to non
+ // existent column or attribute. The `baseString` could be single part or
multi part and this
+ // method will try to match suggestions.
+ // Note that identifiers from `testStrings` could represent columns or
attributes from different
+ // catalogs, schemas or tables. We preserve suggested identifier prefix
and reconstruct
+ // multi-part identifier after ordering if there are more than one unique
prefix in a list. This
+ // will also reconstruct multi-part identifier for the cases of nested
columnns. E.g. for a
Review Comment:
```suggestion
// will also reconstruct multi-part identifier for the cases of nested columns. E.g. for a
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]