>From Hussain Towaileb <[email protected]>:

Hussain Towaileb has uploaded this change for review. ( 
https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/17986 )


Change subject: Return null on string functions for invalid unicode sequence
......................................................................

Return null on string functions for invalid unicode sequence

Change-Id: I67c04de2144f740fd63e85ecbd4efded544db62c
---
A 
asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/invalid-unicode/test.000.query.sqlpp
M 
asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/AbstractUnaryStringStringEval.java
M 
asterixdb/asterix-om/src/main/java/org/apache/asterix/om/exceptions/ExceptionUtil.java
M 
asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringLengthDescriptor.java
M 
asterixdb/asterix-common/src/main/java/org/apache/asterix/common/exceptions/ErrorCode.java
M asterixdb/asterix-app/src/test/resources/runtimets/only_sqlpp.xml
M 
hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
M 
asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringToCodePointDescriptor.java
A 
asterixdb/asterix-app/src/test/resources/runtimets/results/string/invalid-unicode/result.000.adm
M 
asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/AbstractBinaryStringEval.java
M asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml
M asterixdb/asterix-common/src/main/resources/asx_errormsg/en.properties
M 
hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java
13 files changed, 134 insertions(+), 24 deletions(-)



  git pull ssh://asterix-gerrit.ics.uci.edu:29418/asterixdb 
refs/changes/86/17986/1

diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/only_sqlpp.xml 
b/asterixdb/asterix-app/src/test/resources/runtimets/only_sqlpp.xml
index 334dd52..66fe0fa 100644
--- a/asterixdb/asterix-app/src/test/resources/runtimets/only_sqlpp.xml
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/only_sqlpp.xml
@@ -19,5 +19,23 @@
  !-->
 <test-suite xmlns="urn:xml.testframework.asterix.apache.org" 
ResultOffsetPath="results" QueryOffsetPath="queries_sqlpp" 
QueryFileExtension=".sqlpp">
   <test-group name="failed">
+    <test-case FilePath="string" check-warnings="true">
+      <compilation-unit name="invalid-unicode">
+        <output-dir compare="Text">invalid-unicode</output-dir>
+        <expected-warn>ASX0060: Function 'string-length' failed to evaluate 
because: Decoding error: got a low surrogate without a leading high 
surrogate</expected-warn>
+        <expected-warn>ASX0060: Function 'string-to-codepoint' failed to 
evaluate because: Decoding error: got a low surrogate without a leading high 
surrogate</expected-warn>
+        <expected-warn>ASX0060: Function 'trim' failed to evaluate because: 
Decoding error: got a low surrogate without a leading high 
surrogate</expected-warn>
+        <expected-warn>ASX0060: Function 'trim' failed to evaluate because: 
Decoding error: got a low surrogate without a leading high 
surrogate</expected-warn>
+        <expected-warn>ASX0060: Function 'rtrim' failed to evaluate because: 
Decoding error: got a low surrogate without a leading high 
surrogate</expected-warn>
+        <expected-warn>ASX0060: Function 'rtrim' failed to evaluate because: 
Decoding error: got a low surrogate without a leading high 
surrogate</expected-warn>
+        <expected-warn>ASX0060: Function 'ltrim' failed to evaluate because: 
Decoding error: got a low surrogate without a leading high 
surrogate</expected-warn>
+        <expected-warn>ASX0060: Function 'ltrim' failed to evaluate because: 
Decoding error: got a low surrogate without a leading high 
surrogate</expected-warn>
+        <expected-warn>ASX0060: Function 'reverse' failed to evaluate because: 
Decoding error: got a low surrogate without a leading high 
surrogate</expected-warn>
+        <expected-warn>ASX0060: Function 'lowercase' failed to evaluate 
because: Decoding error: got a low surrogate without a leading high 
surrogate</expected-warn>
+        <expected-warn>ASX0060: Function 'uppercase' failed to evaluate 
because: Decoding error: got a low surrogate without a leading high 
surrogate</expected-warn>
+        <expected-warn>ASX0060: Function 'initcap' failed to evaluate because: 
Decoding error: got a low surrogate without a leading high 
surrogate</expected-warn>
+        <expected-warn>ASX0060: Function 'get-job-param' failed to evaluate 
because: Decoding error: got a low surrogate without a leading high 
surrogate</expected-warn>
+      </compilation-unit>
+    </test-case>
   </test-group>
 </test-suite>
diff --git 
a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/invalid-unicode/test.000.query.sqlpp
 
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/invalid-unicode/test.000.query.sqlpp
new file mode 100644
index 0000000..897a163
--- /dev/null
+++ 
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/invalid-unicode/test.000.query.sqlpp
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+// param max-warnings:json=1000
+
+[
+ string_length("xxxxxxxxxx x??\uDEAD"),
+ string_to_codepoint("xxxxxxxxxx x??\uDEAD"),
+ trim("xxxxxxxxxx x??\uDEAD"),
+ ltrim("xxxxxxxxxx x??\uDEAD"),
+ rtrim("xxxxxxxxxx x??\uDEAD"),
+ trim("xxxxxxxxxx x??\uDEAD", "x"),
+ ltrim("xxxxxxxxxx x??\uDEAD", "x"),
+ rtrim("xxxxxxxxxx x??\uDEAD", "x"),
+ reverse("xxxxxxxxxx x??\uDEAD"),
+ lowercase("xxxxxxxxxx x??\uDEAD"),
+ uppercase("xxxxxxxxxx x??\uDEAD"),
+ initcap("xxxxxxxxxx x??\uDEAD"),
+ get_job_param("xxxxxxxxxx x??\uDEAD")
+];
diff --git 
a/asterixdb/asterix-app/src/test/resources/runtimets/results/string/invalid-unicode/result.000.adm
 
b/asterixdb/asterix-app/src/test/resources/runtimets/results/string/invalid-unicode/result.000.adm
new file mode 100644
index 0000000..f9e668e
--- /dev/null
+++ 
b/asterixdb/asterix-app/src/test/resources/runtimets/results/string/invalid-unicode/result.000.adm
@@ -0,0 +1 @@
+[ null, null, null, null, null, null, null, null, null, null, null, null, null 
]
\ No newline at end of file
diff --git 
a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml 
b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml
index 0ab9672..c7295ef 100644
--- a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml
@@ -11324,6 +11324,12 @@
         <output-dir compare="Text">substring-after-5</output-dir>
       </compilation-unit>
     </test-case>
+    <test-case FilePath="string" check-warnings="true">
+      <compilation-unit name="invalid-unicode-string-length">
+        <output-dir compare="Text">invalid-unicode-string-length</output-dir>
+        <expected-warn>ASX60: Cannot find dataverse with name 
testUnknown</expected-warn>
+      </compilation-unit>
+    </test-case>
   </test-group>
   <test-group name="subquery">
     <test-case FilePath="subquery">
diff --git 
a/asterixdb/asterix-common/src/main/java/org/apache/asterix/common/exceptions/ErrorCode.java
 
b/asterixdb/asterix-common/src/main/java/org/apache/asterix/common/exceptions/ErrorCode.java
index b0826e8..4910343 100644
--- 
a/asterixdb/asterix-common/src/main/java/org/apache/asterix/common/exceptions/ErrorCode.java
+++ 
b/asterixdb/asterix-common/src/main/java/org/apache/asterix/common/exceptions/ErrorCode.java
@@ -87,6 +87,7 @@
     PARQUET_CONTAINS_OVERFLOWED_BIGINT(57),
     UNEXPECTED_ERROR_ENCOUNTERED(58),
     INVALID_PARQUET_FILE(59),
+    FUNCTION_EVALUATION_FAILED(60),

     UNSUPPORTED_JRE(100),

diff --git 
a/asterixdb/asterix-common/src/main/resources/asx_errormsg/en.properties 
b/asterixdb/asterix-common/src/main/resources/asx_errormsg/en.properties
index 0bf523a..074245c 100644
--- a/asterixdb/asterix-common/src/main/resources/asx_errormsg/en.properties
+++ b/asterixdb/asterix-common/src/main/resources/asx_errormsg/en.properties
@@ -94,6 +94,7 @@
 57 = Parquet file(s) contain unsigned integer that is larger than the '%1$s' 
range
 58 = Error encountered: %1$s
 59 = Invalid Parquet file: %1$s. Reason: %2$s
+60 = Function '%1$s' failed to evaluate because: %2$s

 100 = Unsupported JRE: %1$s

diff --git 
a/asterixdb/asterix-om/src/main/java/org/apache/asterix/om/exceptions/ExceptionUtil.java
 
b/asterixdb/asterix-om/src/main/java/org/apache/asterix/om/exceptions/ExceptionUtil.java
index ccb3a8d..928a6b5 100644
--- 
a/asterixdb/asterix-om/src/main/java/org/apache/asterix/om/exceptions/ExceptionUtil.java
+++ 
b/asterixdb/asterix-om/src/main/java/org/apache/asterix/om/exceptions/ExceptionUtil.java
@@ -143,6 +143,14 @@
         warnInvalidValue(ctx, srcLoc, fid, argIdx, argValue, 
ErrorCode.NEGATIVE_VALUE);
     }

+    public static void warnStringFunctionFailed(IEvaluatorContext ctx, 
SourceLocation srcLoc, FunctionIdentifier fid,
+            String errMsg) {
+        if (ctx.getWarningCollector().shouldWarn()) {
+            ctx.getWarningCollector()
+                    .warn(Warning.of(srcLoc, 
ErrorCode.FUNCTION_EVALUATION_FAILED, fid.getName(), errMsg));
+        }
+    }
+
     private static void warnInvalidValue(IEvaluatorContext ctx, SourceLocation 
srcLoc, FunctionIdentifier fid,
             int argIdx, double argValue, ErrorCode errorCode) {
         IWarningCollector warningCollector = ctx.getWarningCollector();
diff --git 
a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/AbstractBinaryStringEval.java
 
b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/AbstractBinaryStringEval.java
index 2fc8654..d5083ed 100644
--- 
a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/AbstractBinaryStringEval.java
+++ 
b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/AbstractBinaryStringEval.java
@@ -35,6 +35,7 @@
 import org.apache.hyracks.data.std.primitive.VoidPointable;
 import org.apache.hyracks.data.std.util.ArrayBackedValueStorage;
 import org.apache.hyracks.dataflow.common.data.accessors.IFrameTupleReference;
+import org.apache.hyracks.util.string.UTF8StringUtil;

 public abstract class AbstractBinaryStringEval implements IScalarEvaluator {

@@ -99,13 +100,19 @@
             return;
         }

-        // Sets StringUTF8Pointables.
-        leftStringPointable.set(bytes0, offset0 + 1, len0 - 1);
-        rightStringPointable.set(bytes1, offset1 + 1, len1 - 1);
-
-        // The actual processing.
         try {
+            UTF8StringUtil.validateUnicodeSequence(bytes0, offset0 + 1, len0);
+            UTF8StringUtil.validateUnicodeSequence(bytes1, offset1 + 1, len1);
+
+            // Sets StringUTF8Pointables.
+            leftStringPointable.set(bytes0, offset0 + 1, len0 - 1);
+            rightStringPointable.set(bytes1, offset1 + 1, len1 - 1);
+
+            // The actual processing.
             process(leftStringPointable, rightStringPointable, 
resultPointable);
+        } catch (IllegalArgumentException ex) {
+            PointableHelper.setNull(resultPointable);
+            ExceptionUtil.warnStringFunctionFailed(ctx, sourceLoc, funcID, 
ex.getMessage());
         } catch (IOException e) {
             throw HyracksDataException.create(e);
         }
diff --git 
a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/AbstractUnaryStringStringEval.java
 
b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/AbstractUnaryStringStringEval.java
index 5efe529..f3c3ce2 100644
--- 
a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/AbstractUnaryStringStringEval.java
+++ 
b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/AbstractUnaryStringStringEval.java
@@ -37,6 +37,7 @@
 import org.apache.hyracks.data.std.util.GrowableArray;
 import org.apache.hyracks.data.std.util.UTF8StringBuilder;
 import org.apache.hyracks.dataflow.common.data.accessors.IFrameTupleReference;
+import org.apache.hyracks.util.string.UTF8StringUtil;

 abstract class AbstractUnaryStringStringEval implements IScalarEvaluator {

@@ -64,26 +65,31 @@

     @Override
     public void evaluate(IFrameTupleReference tuple, IPointable 
resultPointable) throws HyracksDataException {
-        resultStorage.reset();
-        argEval.evaluate(tuple, argPtr);
-
-        if (PointableHelper.checkAndSetMissingOrNull(resultPointable, argPtr)) 
{
-            return;
-        }
-
-        byte[] argBytes = argPtr.getByteArray();
-        int offset = argPtr.getStartOffset();
-        byte inputTypeTag = argBytes[offset];
-        if (inputTypeTag != ATypeTag.SERIALIZED_STRING_TYPE_TAG) {
-            PointableHelper.setNull(resultPointable);
-            ExceptionUtil.warnTypeMismatch(ctx, sourceLoc, funcID, 
argBytes[offset], 0, ATypeTag.STRING);
-            return;
-        }
-        stringPtr.set(argBytes, offset + 1, argPtr.getLength() - 1);
-        resultArray.reset();
         try {
+            resultStorage.reset();
+            argEval.evaluate(tuple, argPtr);
+
+            if (PointableHelper.checkAndSetMissingOrNull(resultPointable, 
argPtr)) {
+                return;
+            }
+
+            byte[] argBytes = argPtr.getByteArray();
+            int offset = argPtr.getStartOffset();
+            byte inputTypeTag = argBytes[offset];
+            if (inputTypeTag != ATypeTag.SERIALIZED_STRING_TYPE_TAG) {
+                PointableHelper.setNull(resultPointable);
+                ExceptionUtil.warnTypeMismatch(ctx, sourceLoc, funcID, 
argBytes[offset], 0, ATypeTag.STRING);
+                return;
+            }
+            UTF8StringUtil.validateUnicodeSequence(argBytes, offset + 1, 
argBytes.length);
+            stringPtr.set(argBytes, offset + 1, argPtr.getLength() - 1);
+            resultArray.reset();
+
             process(stringPtr, resultPointable);
             writeResult(resultPointable);
+        } catch (IllegalArgumentException ex) {
+            PointableHelper.setNull(resultPointable);
+            ExceptionUtil.warnStringFunctionFailed(ctx, sourceLoc, funcID, 
ex.getMessage());
         } catch (IOException e) {
             throw HyracksDataException.create(e);
         }
diff --git 
a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringLengthDescriptor.java
 
b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringLengthDescriptor.java
index 47caf14..3379242 100644
--- 
a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringLengthDescriptor.java
+++ 
b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringLengthDescriptor.java
@@ -85,10 +85,14 @@
                                         ATypeTag.STRING);
                                 return;
                             }
+                            UTF8StringUtil.validateUnicodeSequence(serString, 
offset + 1, serString.length);
                             int len = 
UTF8StringUtil.getNumCodePoint(serString, offset + 1);
                             result.setValue(len);
                             int64Serde.serialize(result, out);
                             resultPointable.set(resultStorage);
+                        } catch (IllegalArgumentException ex) {
+                            PointableHelper.setNull(resultPointable);
+                            ExceptionUtil.warnStringFunctionFailed(ctx, 
sourceLoc, getIdentifier(), ex.getMessage());
                         } catch (IOException e1) {
                             throw HyracksDataException.create(e1);
                         }
diff --git 
a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringToCodePointDescriptor.java
 
b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringToCodePointDescriptor.java
index 2f6a223..c0ddfb4 100644
--- 
a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringToCodePointDescriptor.java
+++ 
b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringToCodePointDescriptor.java
@@ -93,6 +93,7 @@
                                         ATypeTag.STRING);
                                 return;
                             }
+                            UTF8StringUtil.validateUnicodeSequence(serString, 
offset + 1, serString.length);
                             int len = UTF8StringUtil.getUTFLength(serString, 
offset + 1);

                             int start = offset + 1 + 
UTF8StringUtil.getNumBytesToStoreLength(len);
@@ -109,6 +110,9 @@
                             }
                             listBuilder.write(out, true);
                             result.set(resultStorage);
+                        } catch (IllegalArgumentException ex) {
+                            PointableHelper.setNull(result);
+                            ExceptionUtil.warnStringFunctionFailed(ctx, 
sourceLoc, getIdentifier(), ex.getMessage());
                         } catch (IOException e1) {
                             throw HyracksDataException.create(e1);
                         }
diff --git 
a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java
 
b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java
index 49f6221..a05b531 100644
--- 
a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java
+++ 
b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java
@@ -678,9 +678,9 @@
             endIndex = startIndex;
             int cursorIndex = startIndex;
             while (cursorIndex < srcUtfLen) {
-                int codePioint = srcPtr.codePointAt(srcStart + cursorIndex);
+                int codePoint = srcPtr.codePointAt(srcStart + cursorIndex);
                 cursorIndex += srcPtr.codePointSize(srcStart + cursorIndex);
-                if (!codePointSet.contains(codePioint)) {
+                if (!codePointSet.contains(codePoint)) {
                     endIndex = cursorIndex;
                 }
             }
diff --git 
a/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
 
b/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
index 3eb8687..b3746ad 100644
--- 
a/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
+++ 
b/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
@@ -119,6 +119,15 @@
         return c1;
     }

+    public static void validateUnicodeSequence(byte[] b, int offset, int size) 
throws IllegalArgumentException {
+        for (int i = offset; i < size; i++) {
+            if (Character.isLowSurrogate(charAt(b, i))) {
+                // In this case, the index s doesn't point to a correct 
position
+                throw new 
IllegalArgumentException(LOW_SURROGATE_WITHOUT_HIGH_SURROGATE);
+            }
+        }
+    }
+
     public static int codePointSize(byte[] b, int s) {
         char c1 = charAt(b, s);
         int size1 = charSize(b, s);

--
To view, visit https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/17986
To unsubscribe, or for help writing mail filters, visit 
https://asterix-gerrit.ics.uci.edu/settings

Gerrit-Project: asterixdb
Gerrit-Branch: neo
Gerrit-Change-Id: I67c04de2144f740fd63e85ecbd4efded544db62c
Gerrit-Change-Number: 17986
Gerrit-PatchSet: 1
Gerrit-Owner: Hussain Towaileb <[email protected]>
Gerrit-MessageType: newchange

Reply via email to