>From Hussain Towaileb <[email protected]>: Hussain Towaileb has uploaded this change for review. ( https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/17986 )
Change subject: Return null on string functions for invalid unicode sequence ...................................................................... Return null on string functions for invalid unicode sequence Change-Id: I67c04de2144f740fd63e85ecbd4efded544db62c --- A asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/invalid-unicode/test.000.query.sqlpp M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/AbstractUnaryStringStringEval.java M asterixdb/asterix-om/src/main/java/org/apache/asterix/om/exceptions/ExceptionUtil.java M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringLengthDescriptor.java M asterixdb/asterix-common/src/main/java/org/apache/asterix/common/exceptions/ErrorCode.java M asterixdb/asterix-app/src/test/resources/runtimets/only_sqlpp.xml M hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringToCodePointDescriptor.java A asterixdb/asterix-app/src/test/resources/runtimets/results/string/invalid-unicode/result.000.adm M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/AbstractBinaryStringEval.java M asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml M asterixdb/asterix-common/src/main/resources/asx_errormsg/en.properties M hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java 13 files changed, 134 insertions(+), 24 deletions(-) git pull ssh://asterix-gerrit.ics.uci.edu:29418/asterixdb refs/changes/86/17986/1 diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/only_sqlpp.xml b/asterixdb/asterix-app/src/test/resources/runtimets/only_sqlpp.xml index 334dd52..66fe0fa 100644 --- a/asterixdb/asterix-app/src/test/resources/runtimets/only_sqlpp.xml +++ b/asterixdb/asterix-app/src/test/resources/runtimets/only_sqlpp.xml @@ -19,5 +19,23 @@ !--> <test-suite xmlns="urn:xml.testframework.asterix.apache.org" ResultOffsetPath="results" QueryOffsetPath="queries_sqlpp" QueryFileExtension=".sqlpp"> <test-group name="failed"> + <test-case FilePath="string" check-warnings="true"> + <compilation-unit name="invalid-unicode"> + <output-dir compare="Text">invalid-unicode</output-dir> + <expected-warn>ASX0060: Function 'string-length' failed to evaluate because: Decoding error: got a low surrogate without a leading high surrogate</expected-warn> + <expected-warn>ASX0060: Function 'string-to-codepoint' failed to evaluate because: Decoding error: got a low surrogate without a leading high surrogate</expected-warn> + <expected-warn>ASX0060: Function 'trim' failed to evaluate because: Decoding error: got a low surrogate without a leading high surrogate</expected-warn> + <expected-warn>ASX0060: Function 'trim' failed to evaluate because: Decoding error: got a low surrogate without a leading high surrogate</expected-warn> + <expected-warn>ASX0060: Function 'rtrim' failed to evaluate because: Decoding error: got a low surrogate without a leading high surrogate</expected-warn> + <expected-warn>ASX0060: Function 'rtrim' failed to evaluate because: Decoding error: got a low surrogate without a leading high surrogate</expected-warn> + <expected-warn>ASX0060: Function 'ltrim' failed to evaluate because: Decoding error: got a low surrogate without a leading high surrogate</expected-warn> + <expected-warn>ASX0060: Function 'ltrim' failed to evaluate because: Decoding error: got a low surrogate without a leading high surrogate</expected-warn> + <expected-warn>ASX0060: Function 'reverse' failed to evaluate because: Decoding error: got a low surrogate without a leading high surrogate</expected-warn> + <expected-warn>ASX0060: Function 'lowercase' failed to evaluate because: Decoding error: got a low surrogate without a leading high surrogate</expected-warn> + <expected-warn>ASX0060: Function 'uppercase' failed to evaluate because: Decoding error: got a low surrogate without a leading high surrogate</expected-warn> + <expected-warn>ASX0060: Function 'initcap' failed to evaluate because: Decoding error: got a low surrogate without a leading high surrogate</expected-warn> + <expected-warn>ASX0060: Function 'get-job-param' failed to evaluate because: Decoding error: got a low surrogate without a leading high surrogate</expected-warn> + </compilation-unit> + </test-case> </test-group> </test-suite> diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/invalid-unicode/test.000.query.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/invalid-unicode/test.000.query.sqlpp new file mode 100644 index 0000000..897a163 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/invalid-unicode/test.000.query.sqlpp @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +// param max-warnings:json=1000 + +[ + string_length("xxxxxxxxxx x??\uDEAD"), + string_to_codepoint("xxxxxxxxxx x??\uDEAD"), + trim("xxxxxxxxxx x??\uDEAD"), + ltrim("xxxxxxxxxx x??\uDEAD"), + rtrim("xxxxxxxxxx x??\uDEAD"), + trim("xxxxxxxxxx x??\uDEAD", "x"), + ltrim("xxxxxxxxxx x??\uDEAD", "x"), + rtrim("xxxxxxxxxx x??\uDEAD", "x"), + reverse("xxxxxxxxxx x??\uDEAD"), + lowercase("xxxxxxxxxx x??\uDEAD"), + uppercase("xxxxxxxxxx x??\uDEAD"), + initcap("xxxxxxxxxx x??\uDEAD"), + get_job_param("xxxxxxxxxx x??\uDEAD") +]; diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/string/invalid-unicode/result.000.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/string/invalid-unicode/result.000.adm new file mode 100644 index 0000000..f9e668e --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/string/invalid-unicode/result.000.adm @@ -0,0 +1 @@ +[ null, null, null, null, null, null, null, null, null, null, null, null, null ] \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml index 0ab9672..c7295ef 100644 --- a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml +++ b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml @@ -11324,6 +11324,12 @@ <output-dir compare="Text">substring-after-5</output-dir> </compilation-unit> </test-case> + <test-case FilePath="string" check-warnings="true"> + <compilation-unit name="invalid-unicode-string-length"> + <output-dir compare="Text">invalid-unicode-string-length</output-dir> + <expected-warn>ASX60: Cannot find dataverse with name testUnknown</expected-warn> + </compilation-unit> + </test-case> </test-group> <test-group name="subquery"> <test-case FilePath="subquery"> diff --git a/asterixdb/asterix-common/src/main/java/org/apache/asterix/common/exceptions/ErrorCode.java b/asterixdb/asterix-common/src/main/java/org/apache/asterix/common/exceptions/ErrorCode.java index b0826e8..4910343 100644 --- a/asterixdb/asterix-common/src/main/java/org/apache/asterix/common/exceptions/ErrorCode.java +++ b/asterixdb/asterix-common/src/main/java/org/apache/asterix/common/exceptions/ErrorCode.java @@ -87,6 +87,7 @@ PARQUET_CONTAINS_OVERFLOWED_BIGINT(57), UNEXPECTED_ERROR_ENCOUNTERED(58), INVALID_PARQUET_FILE(59), + FUNCTION_EVALUATION_FAILED(60), UNSUPPORTED_JRE(100), diff --git a/asterixdb/asterix-common/src/main/resources/asx_errormsg/en.properties b/asterixdb/asterix-common/src/main/resources/asx_errormsg/en.properties index 0bf523a..074245c 100644 --- a/asterixdb/asterix-common/src/main/resources/asx_errormsg/en.properties +++ b/asterixdb/asterix-common/src/main/resources/asx_errormsg/en.properties @@ -94,6 +94,7 @@ 57 = Parquet file(s) contain unsigned integer that is larger than the '%1$s' range 58 = Error encountered: %1$s 59 = Invalid Parquet file: %1$s. Reason: %2$s +60 = Function '%1$s' failed to evaluate because: %2$s 100 = Unsupported JRE: %1$s diff --git a/asterixdb/asterix-om/src/main/java/org/apache/asterix/om/exceptions/ExceptionUtil.java b/asterixdb/asterix-om/src/main/java/org/apache/asterix/om/exceptions/ExceptionUtil.java index ccb3a8d..928a6b5 100644 --- a/asterixdb/asterix-om/src/main/java/org/apache/asterix/om/exceptions/ExceptionUtil.java +++ b/asterixdb/asterix-om/src/main/java/org/apache/asterix/om/exceptions/ExceptionUtil.java @@ -143,6 +143,14 @@ warnInvalidValue(ctx, srcLoc, fid, argIdx, argValue, ErrorCode.NEGATIVE_VALUE); } + public static void warnStringFunctionFailed(IEvaluatorContext ctx, SourceLocation srcLoc, FunctionIdentifier fid, + String errMsg) { + if (ctx.getWarningCollector().shouldWarn()) { + ctx.getWarningCollector() + .warn(Warning.of(srcLoc, ErrorCode.FUNCTION_EVALUATION_FAILED, fid.getName(), errMsg)); + } + } + private static void warnInvalidValue(IEvaluatorContext ctx, SourceLocation srcLoc, FunctionIdentifier fid, int argIdx, double argValue, ErrorCode errorCode) { IWarningCollector warningCollector = ctx.getWarningCollector(); diff --git a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/AbstractBinaryStringEval.java b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/AbstractBinaryStringEval.java index 2fc8654..d5083ed 100644 --- a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/AbstractBinaryStringEval.java +++ b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/AbstractBinaryStringEval.java @@ -35,6 +35,7 @@ import org.apache.hyracks.data.std.primitive.VoidPointable; import org.apache.hyracks.data.std.util.ArrayBackedValueStorage; import org.apache.hyracks.dataflow.common.data.accessors.IFrameTupleReference; +import org.apache.hyracks.util.string.UTF8StringUtil; public abstract class AbstractBinaryStringEval implements IScalarEvaluator { @@ -99,13 +100,19 @@ return; } - // Sets StringUTF8Pointables. - leftStringPointable.set(bytes0, offset0 + 1, len0 - 1); - rightStringPointable.set(bytes1, offset1 + 1, len1 - 1); - - // The actual processing. try { + UTF8StringUtil.validateUnicodeSequence(bytes0, offset0 + 1, len0); + UTF8StringUtil.validateUnicodeSequence(bytes1, offset1 + 1, len1); + + // Sets StringUTF8Pointables. + leftStringPointable.set(bytes0, offset0 + 1, len0 - 1); + rightStringPointable.set(bytes1, offset1 + 1, len1 - 1); + + // The actual processing. process(leftStringPointable, rightStringPointable, resultPointable); + } catch (IllegalArgumentException ex) { + PointableHelper.setNull(resultPointable); + ExceptionUtil.warnStringFunctionFailed(ctx, sourceLoc, funcID, ex.getMessage()); } catch (IOException e) { throw HyracksDataException.create(e); } diff --git a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/AbstractUnaryStringStringEval.java b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/AbstractUnaryStringStringEval.java index 5efe529..f3c3ce2 100644 --- a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/AbstractUnaryStringStringEval.java +++ b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/AbstractUnaryStringStringEval.java @@ -37,6 +37,7 @@ import org.apache.hyracks.data.std.util.GrowableArray; import org.apache.hyracks.data.std.util.UTF8StringBuilder; import org.apache.hyracks.dataflow.common.data.accessors.IFrameTupleReference; +import org.apache.hyracks.util.string.UTF8StringUtil; abstract class AbstractUnaryStringStringEval implements IScalarEvaluator { @@ -64,26 +65,31 @@ @Override public void evaluate(IFrameTupleReference tuple, IPointable resultPointable) throws HyracksDataException { - resultStorage.reset(); - argEval.evaluate(tuple, argPtr); - - if (PointableHelper.checkAndSetMissingOrNull(resultPointable, argPtr)) { - return; - } - - byte[] argBytes = argPtr.getByteArray(); - int offset = argPtr.getStartOffset(); - byte inputTypeTag = argBytes[offset]; - if (inputTypeTag != ATypeTag.SERIALIZED_STRING_TYPE_TAG) { - PointableHelper.setNull(resultPointable); - ExceptionUtil.warnTypeMismatch(ctx, sourceLoc, funcID, argBytes[offset], 0, ATypeTag.STRING); - return; - } - stringPtr.set(argBytes, offset + 1, argPtr.getLength() - 1); - resultArray.reset(); try { + resultStorage.reset(); + argEval.evaluate(tuple, argPtr); + + if (PointableHelper.checkAndSetMissingOrNull(resultPointable, argPtr)) { + return; + } + + byte[] argBytes = argPtr.getByteArray(); + int offset = argPtr.getStartOffset(); + byte inputTypeTag = argBytes[offset]; + if (inputTypeTag != ATypeTag.SERIALIZED_STRING_TYPE_TAG) { + PointableHelper.setNull(resultPointable); + ExceptionUtil.warnTypeMismatch(ctx, sourceLoc, funcID, argBytes[offset], 0, ATypeTag.STRING); + return; + } + UTF8StringUtil.validateUnicodeSequence(argBytes, offset + 1, argBytes.length); + stringPtr.set(argBytes, offset + 1, argPtr.getLength() - 1); + resultArray.reset(); + process(stringPtr, resultPointable); writeResult(resultPointable); + } catch (IllegalArgumentException ex) { + PointableHelper.setNull(resultPointable); + ExceptionUtil.warnStringFunctionFailed(ctx, sourceLoc, funcID, ex.getMessage()); } catch (IOException e) { throw HyracksDataException.create(e); } diff --git a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringLengthDescriptor.java b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringLengthDescriptor.java index 47caf14..3379242 100644 --- a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringLengthDescriptor.java +++ b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringLengthDescriptor.java @@ -85,10 +85,14 @@ ATypeTag.STRING); return; } + UTF8StringUtil.validateUnicodeSequence(serString, offset + 1, serString.length); int len = UTF8StringUtil.getNumCodePoint(serString, offset + 1); result.setValue(len); int64Serde.serialize(result, out); resultPointable.set(resultStorage); + } catch (IllegalArgumentException ex) { + PointableHelper.setNull(resultPointable); + ExceptionUtil.warnStringFunctionFailed(ctx, sourceLoc, getIdentifier(), ex.getMessage()); } catch (IOException e1) { throw HyracksDataException.create(e1); } diff --git a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringToCodePointDescriptor.java b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringToCodePointDescriptor.java index 2f6a223..c0ddfb4 100644 --- a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringToCodePointDescriptor.java +++ b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringToCodePointDescriptor.java @@ -93,6 +93,7 @@ ATypeTag.STRING); return; } + UTF8StringUtil.validateUnicodeSequence(serString, offset + 1, serString.length); int len = UTF8StringUtil.getUTFLength(serString, offset + 1); int start = offset + 1 + UTF8StringUtil.getNumBytesToStoreLength(len); @@ -109,6 +110,9 @@ } listBuilder.write(out, true); result.set(resultStorage); + } catch (IllegalArgumentException ex) { + PointableHelper.setNull(result); + ExceptionUtil.warnStringFunctionFailed(ctx, sourceLoc, getIdentifier(), ex.getMessage()); } catch (IOException e1) { throw HyracksDataException.create(e1); } diff --git a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java index 49f6221..a05b531 100644 --- a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java +++ b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java @@ -678,9 +678,9 @@ endIndex = startIndex; int cursorIndex = startIndex; while (cursorIndex < srcUtfLen) { - int codePioint = srcPtr.codePointAt(srcStart + cursorIndex); + int codePoint = srcPtr.codePointAt(srcStart + cursorIndex); cursorIndex += srcPtr.codePointSize(srcStart + cursorIndex); - if (!codePointSet.contains(codePioint)) { + if (!codePointSet.contains(codePoint)) { endIndex = cursorIndex; } } diff --git a/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java b/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java index 3eb8687..b3746ad 100644 --- a/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java +++ b/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java @@ -119,6 +119,15 @@ return c1; } + public static void validateUnicodeSequence(byte[] b, int offset, int size) throws IllegalArgumentException { + for (int i = offset; i < size; i++) { + if (Character.isLowSurrogate(charAt(b, i))) { + // In this case, the index s doesn't point to a correct position + throw new IllegalArgumentException(LOW_SURROGATE_WITHOUT_HIGH_SURROGATE); + } + } + } + public static int codePointSize(byte[] b, int s) { char c1 = charAt(b, s); int size1 = charSize(b, s); -- To view, visit https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/17986 To unsubscribe, or for help writing mail filters, visit https://asterix-gerrit.ics.uci.edu/settings Gerrit-Project: asterixdb Gerrit-Branch: neo Gerrit-Change-Id: I67c04de2144f740fd63e85ecbd4efded544db62c Gerrit-Change-Number: 17986 Gerrit-PatchSet: 1 Gerrit-Owner: Hussain Towaileb <[email protected]> Gerrit-MessageType: newchange
