Jackie-Jiang commented on a change in pull request #6458:
URL: https://github.com/apache/incubator-pinot/pull/6458#discussion_r560494650
##########
File path:
pinot-common/src/main/java/org/apache/pinot/common/function/scalar/StringFunctions.java
##########
@@ -228,4 +267,86 @@ public static String chr(int codepoint) {
char[] result = Character.toChars(codepoint);
return new String(result);
}
+
+ /**
+ * @see StandardCharsets#UTF_8#encode(String)
+ * @param input
+ * @return bytes
+ */
+ @ScalarFunction
+ public static byte[] to_utf8(String input) {
+ return input.getBytes(StandardCharsets.UTF_8);
+ }
+
+ /**
+ * see Normalizer#normalize(String, Form)
+ * @param input
+ * @return transforms string with NFC normalization form.
+ */
+ @ScalarFunction
+ public static String normalize(String input) {
+ return Normalizer.normalize(input, Normalizer.Form.NFC);
+ }
+
+ /**
+ * see Normalizer#normalize(String, Form)
+ * @param input
+ * @param form
+ * @return transforms string with the specified normalization form
+ */
+ @ScalarFunction
+ public static String normalize(String input, String form) {
+ Normalizer.Form targetForm = Normalizer.Form.valueOf(form);
+ return Normalizer.normalize(input, targetForm);
+ }
+
+ /**
+ * see String#split(String)
+ * @param input
+ * @param delimiter
+ * @return splits string on specified delimiter and returns an array.
+ */
+ @ScalarFunction
+ public static String[] split(String input, String delimiter) {
+ return input.split(delimiter);
+ }
+
+ /**
+ * see String#replaceAll(String, String)
+ * @param input
+ * @param search
+ * @return removes all instances of search from string
+ */
+ @ScalarFunction
+ public static String replace(String input, String search) {
Review comment:
This method does not `replace` anything; it removes every match of `search`, so it should be named `remove`.
##########
File path:
pinot-common/src/main/java/org/apache/pinot/common/function/scalar/StringFunctions.java
##########
@@ -228,4 +267,86 @@ public static String chr(int codepoint) {
char[] result = Character.toChars(codepoint);
return new String(result);
}
+
+ /**
+ * @see StandardCharsets#UTF_8#encode(String)
+ * @param input
+ * @return bytes
+ */
+ @ScalarFunction
+ public static byte[] to_utf8(String input) {
+ return input.getBytes(StandardCharsets.UTF_8);
+ }
+
+ /**
+ * see Normalizer#normalize(String, Form)
+ * @param input
+ * @return transforms string with NFC normalization form.
+ */
+ @ScalarFunction
+ public static String normalize(String input) {
+ return Normalizer.normalize(input, Normalizer.Form.NFC);
+ }
+
+ /**
+ * see Normalizer#normalize(String, Form)
+ * @param input
+ * @param form
+ * @return transforms string with the specified normalization form
+ */
+ @ScalarFunction
+ public static String normalize(String input, String form) {
+ Normalizer.Form targetForm = Normalizer.Form.valueOf(form);
+ return Normalizer.normalize(input, targetForm);
+ }
+
+ /**
+ * see String#split(String)
+ * @param input
+ * @param delimiter
+ * @return splits string on specified delimiter and returns an array.
+ */
+ @ScalarFunction
+ public static String[] split(String input, String delimiter) {
+ return input.split(delimiter);
+ }
+
+ /**
+ * see String#replaceAll(String, String)
+ * @param input
+ * @param search
+ * @return removes all instances of search from string
+ */
+ @ScalarFunction
+ public static String replace(String input, String search) {
+ return input.replaceAll(search, "");
+ }
+
+ /**
+ * @param input1
+ * @param input2
+ * @return returns the Hamming distance of input1 and input2, note that the two strings must have the same length.
+ */
+ @ScalarFunction
+ public static int hammingDistance(String input1, String input2) {
+ if (input1.length() != input2.length()) {
+ return -1;
+ }
+ int distance = 0;
+ for (int i = 0; i < input1.length(); i++) {
+ if (input1.charAt(i) != input2.charAt(i)) distance++;
Review comment:
(format) Put braces around the body of the `if` statement.
##########
File path:
pinot-common/src/main/java/org/apache/pinot/common/function/scalar/StringFunctions.java
##########
@@ -228,4 +267,86 @@ public static String chr(int codepoint) {
char[] result = Character.toChars(codepoint);
return new String(result);
}
+
+ /**
+ * @see StandardCharsets#UTF_8#encode(String)
+ * @param input
+ * @return bytes
+ */
+ @ScalarFunction
+ public static byte[] to_utf8(String input) {
+ return input.getBytes(StandardCharsets.UTF_8);
+ }
+
+ /**
+ * see Normalizer#normalize(String, Form)
+ * @param input
+ * @return transforms string with NFC normalization form.
+ */
+ @ScalarFunction
+ public static String normalize(String input) {
+ return Normalizer.normalize(input, Normalizer.Form.NFC);
+ }
+
+ /**
+ * see Normalizer#normalize(String, Form)
+ * @param input
+ * @param form
+ * @return transforms string with the specified normalization form
+ */
+ @ScalarFunction
+ public static String normalize(String input, String form) {
+ Normalizer.Form targetForm = Normalizer.Form.valueOf(form);
+ return Normalizer.normalize(input, targetForm);
+ }
+
+ /**
+ * see String#split(String)
+ * @param input
+ * @param delimiter
+ * @return splits string on specified delimiter and returns an array.
+ */
+ @ScalarFunction
+ public static String[] split(String input, String delimiter) {
+ return input.split(delimiter);
Review comment:
Do we want the delimiter to be matched as a regex (which is what `String#split` does)? If not, use `StringUtils` instead.
##########
File path:
pinot-common/src/main/java/org/apache/pinot/common/function/scalar/StringFunctions.java
##########
@@ -228,4 +267,86 @@ public static String chr(int codepoint) {
char[] result = Character.toChars(codepoint);
return new String(result);
}
+
+ /**
+ * @see StandardCharsets#UTF_8#encode(String)
+ * @param input
+ * @return bytes
+ */
+ @ScalarFunction
+ public static byte[] to_utf8(String input) {
Review comment:
Don't put an underscore in the function name, or it won't match calls that
spell the name without underscores or with underscores in different places,
e.g. `toUtf8` or `to_utf_8`.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]