cloud-fan commented on code in PR #37483:
URL: https://github.com/apache/spark/pull/37483#discussion_r947512752
##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala:
##########
@@ -2469,14 +2469,76 @@ case class Encode(value: Expression, charset:
Expression)
newLeft: Expression, newRight: Expression): Encode = copy(value = newLeft,
charset = newRight)
}
+object ToBinary {
+ def isValidBase64(srcString: UTF8String) : Boolean = {
+ // We use RFC4648. The valid base64 string should contain zero or more
groups of 4 symbols plus
+ // a last group consisting of 2-4 valid symbols and optional padding.
+ // Last group should contain at least 2 valid symbols and up to 2 padding
characters `=`.
+ // Valid symbols include - (A-Za-z0-9+/). Each group might contain
an arbitrary number of
+ // whitespace characters, which are ignored.
+ // If padding is present - last group should include exactly 4 symbols.
+ // Examples:
+ // "abcd" - Valid, single group of 4 valid symbols
+ // "abc d" - Valid, single group of 4 valid symbols, whitespace is
skipped
+ // "abc?" - Invalid, group contains invalid symbol `?`
+ // "abcdA" - Invalid, last group should contain at least 2 valid
symbols
+ // "abcdAE" - Valid, a group of 4 valid symbols and a group of 2
valid symbols
+ // "abcdAE==" - Valid, last group includes 2 padding symbols and total
number of symbols
+ // in a group is 4.
+ // "abcdAE=" - Invalid, last group includes padding symbols, therefore
it should have
+ // exactly 4 symbols but contains only 3.
+ // "ab==tm+1" - Invalid, nothing should be after padding.
+ var position = 0
+ var padSize = 0
+ var invalid = false // The string is detected to be invalid.
+ val validation = srcString.toString.map {
+ case c
+ if !invalid &&
+ ((c >= '0' && c <= '9')
+ || (c >= 'A' && c <= 'Z')
+ || (c >= 'a' && c <= 'z')
+ || c == '/' || c == '+') =>
+ position += 1
+ if (padSize != 0) { // Valid character after padding.
+ invalid = true
+ false
+ } else { // A group should have at least 2 valid symbols.
+ position % 4 != 1
Review Comment:
Should we update `invalid` in this branch if `srcString` is invalid?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]