This is an automated email from the ASF dual-hosted git repository. mblow pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/asterixdb.git
commit d072c591ffb21710a357aba3374bcf60aab09f33 Author: Hussain Towaileb <[email protected]> AuthorDate: Wed Nov 29 02:32:04 2023 +0300 [ASTERIXDB-3321][FUN]: Return null and warn for string functions for invalid unicode sequence Change-Id: I67c04de2144f740fd63e85ecbd4efded544db62c Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/17986 Integration-Tests: Jenkins <[email protected]> Tested-by: Jenkins <[email protected]> Reviewed-by: Murtadha Hubail <[email protected]> --- .../string/invalid-unicode/test.000.query.sqlpp | 34 ++++++++++++++++++++++ .../results/string/invalid-unicode/result.000.adm | 1 + .../test/resources/runtimets/testsuite_sqlpp.xml | 16 ++++++++++ .../asterix/common/exceptions/ErrorCode.java | 1 + .../src/main/resources/asx_errormsg/en.properties | 1 + .../asterix/om/exceptions/ExceptionUtil.java | 8 +++++ .../functions/AbstractBinaryStringEval.java | 4 +++ .../functions/AbstractUnaryStringStringEval.java | 4 +++ .../functions/StringLengthDescriptor.java | 4 +++ .../functions/StringToCodePointDescriptor.java | 4 +++ .../data/std/primitive/UTF8StringPointable.java | 19 ++++++------ .../util/exceptions/UTF8EncodingException.java | 25 ++++++++++++++++ .../apache/hyracks/util/string/UTF8StringUtil.java | 8 +++-- 13 files changed, 117 insertions(+), 12 deletions(-) diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/invalid-unicode/test.000.query.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/invalid-unicode/test.000.query.sqlpp new file mode 100644 index 0000000000..a533d7efd2 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/invalid-unicode/test.000.query.sqlpp @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +// param max-warnings:json=1000 + +[ + string_length("\uDEAD x \uDEAD"), + string_to_codepoint("\uDEAD x \uDEAD"), + trim("\uDEAD x \uDEAD"), + ltrim("\uDEAD x \uDEAD"), + rtrim("\uDEAD x \uDEAD"), + trim("\uDEAD x \uDEAD", "x"), + ltrim("\uDEAD x \uDEAD", "x"), + rtrim("\uDEAD x \uDEAD", "x"), + reverse("\uDEAD x \uDEAD"), + position("\uDEAD x \uDEAD", "x"), + position1("\uDEAD x \uDEAD", "x") +]; diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/string/invalid-unicode/result.000.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/string/invalid-unicode/result.000.adm new file mode 100644 index 0000000000..a9fc274160 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/string/invalid-unicode/result.000.adm @@ -0,0 +1 @@ +[ null, null, null, null, null, null, null, null, null, null, null ] \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml index dc31f7e14a..9a3abae1d0 100644 --- a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml +++ b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml @@ -11341,6 +11341,22 @@ <output-dir compare="Text">substring-after-5</output-dir> </compilation-unit> </test-case> + <test-case FilePath="string" check-warnings="true"> + <compilation-unit name="invalid-unicode"> + <output-dir compare="Text">invalid-unicode</output-dir> + <expected-warn>ASX0060: Function 'string-length' failed to evaluate because: Decoding error: got a low surrogate without a leading high surrogate</expected-warn> + <expected-warn>ASX0060: Function 'string-to-codepoint' failed to evaluate because: Decoding error: got a low surrogate without a leading high surrogate</expected-warn> + <expected-warn>ASX0060: Function 'trim' failed to evaluate because: Decoding error: got a low surrogate without a leading high surrogate</expected-warn> + <expected-warn>ASX0060: Function 'trim' failed to evaluate because: Decoding error: got a low surrogate without a leading high surrogate</expected-warn> + <expected-warn>ASX0060: Function 'rtrim' failed to evaluate because: Decoding error: got a low surrogate without a leading high surrogate</expected-warn> + <expected-warn>ASX0060: Function 'rtrim' failed to evaluate because: Decoding error: got a low surrogate without a leading high surrogate</expected-warn> + <expected-warn>ASX0060: Function 'ltrim' failed to evaluate because: Decoding error: got a low surrogate without a leading high surrogate</expected-warn> + <expected-warn>ASX0060: Function 'ltrim' failed to evaluate because: Decoding error: got a low surrogate without a leading high surrogate</expected-warn> + <expected-warn>ASX0060: Function 'reverse' failed to evaluate because: Decoding error: got a low surrogate without a leading high surrogate</expected-warn> + <expected-warn>ASX0060: Function 'position' failed to evaluate because: Decoding error: got a low surrogate without a leading high surrogate</expected-warn> + <expected-warn>ASX0060: Function 'position1' failed to evaluate because: Decoding error: got a low surrogate without a leading high surrogate</expected-warn> + </compilation-unit> + </test-case> </test-group> <test-group name="subquery"> <test-case FilePath="subquery"> diff --git a/asterixdb/asterix-common/src/main/java/org/apache/asterix/common/exceptions/ErrorCode.java b/asterixdb/asterix-common/src/main/java/org/apache/asterix/common/exceptions/ErrorCode.java index b0826e81f5..491034353e 100644 --- a/asterixdb/asterix-common/src/main/java/org/apache/asterix/common/exceptions/ErrorCode.java +++ b/asterixdb/asterix-common/src/main/java/org/apache/asterix/common/exceptions/ErrorCode.java @@ -87,6 +87,7 @@ public enum ErrorCode implements IError { PARQUET_CONTAINS_OVERFLOWED_BIGINT(57), UNEXPECTED_ERROR_ENCOUNTERED(58), INVALID_PARQUET_FILE(59), + FUNCTION_EVALUATION_FAILED(60), UNSUPPORTED_JRE(100), diff --git a/asterixdb/asterix-common/src/main/resources/asx_errormsg/en.properties b/asterixdb/asterix-common/src/main/resources/asx_errormsg/en.properties index 0bf523a82d..074245c95a 100644 --- a/asterixdb/asterix-common/src/main/resources/asx_errormsg/en.properties +++ b/asterixdb/asterix-common/src/main/resources/asx_errormsg/en.properties @@ -94,6 +94,7 @@ 57 = Parquet file(s) contain unsigned integer that is larger than the '%1$s' range 58 = Error encountered: %1$s 59 = Invalid Parquet file: %1$s. Reason: %2$s +60 = Function '%1$s' failed to evaluate because: %2$s 100 = Unsupported JRE: %1$s diff --git a/asterixdb/asterix-om/src/main/java/org/apache/asterix/om/exceptions/ExceptionUtil.java b/asterixdb/asterix-om/src/main/java/org/apache/asterix/om/exceptions/ExceptionUtil.java index ccb3a8d9cb..928a6b5195 100644 --- a/asterixdb/asterix-om/src/main/java/org/apache/asterix/om/exceptions/ExceptionUtil.java +++ b/asterixdb/asterix-om/src/main/java/org/apache/asterix/om/exceptions/ExceptionUtil.java @@ -143,6 +143,14 @@ public final class ExceptionUtil { warnInvalidValue(ctx, srcLoc, fid, argIdx, argValue, ErrorCode.NEGATIVE_VALUE); } + public static void warnStringFunctionFailed(IEvaluatorContext ctx, SourceLocation srcLoc, FunctionIdentifier fid, + String errMsg) { + if (ctx.getWarningCollector().shouldWarn()) { + ctx.getWarningCollector() + .warn(Warning.of(srcLoc, ErrorCode.FUNCTION_EVALUATION_FAILED, fid.getName(), errMsg)); + } + } + private static void warnInvalidValue(IEvaluatorContext ctx, SourceLocation srcLoc, FunctionIdentifier fid, int argIdx, double argValue, ErrorCode errorCode) { IWarningCollector warningCollector = ctx.getWarningCollector(); diff --git a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/AbstractBinaryStringEval.java b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/AbstractBinaryStringEval.java index 2fc8654da6..9de704ba87 100644 --- a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/AbstractBinaryStringEval.java +++ b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/AbstractBinaryStringEval.java @@ -35,6 +35,7 @@ import org.apache.hyracks.data.std.primitive.UTF8StringPointable; import org.apache.hyracks.data.std.primitive.VoidPointable; import org.apache.hyracks.data.std.util.ArrayBackedValueStorage; import org.apache.hyracks.dataflow.common.data.accessors.IFrameTupleReference; +import org.apache.hyracks.util.exceptions.UTF8EncodingException; public abstract class AbstractBinaryStringEval implements IScalarEvaluator { @@ -106,6 +107,9 @@ public abstract class AbstractBinaryStringEval implements IScalarEvaluator { // The actual processing. try { process(leftStringPointable, rightStringPointable, resultPointable); + } catch (UTF8EncodingException ex) { + PointableHelper.setNull(resultPointable); + ExceptionUtil.warnStringFunctionFailed(ctx, sourceLoc, funcID, ex.getMessage()); } catch (IOException e) { throw HyracksDataException.create(e); } diff --git a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/AbstractUnaryStringStringEval.java b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/AbstractUnaryStringStringEval.java index 5efe529e72..7a60aae3f3 100644 --- a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/AbstractUnaryStringStringEval.java +++ b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/AbstractUnaryStringStringEval.java @@ -37,6 +37,7 @@ import org.apache.hyracks.data.std.util.ArrayBackedValueStorage; import org.apache.hyracks.data.std.util.GrowableArray; import org.apache.hyracks.data.std.util.UTF8StringBuilder; import org.apache.hyracks.dataflow.common.data.accessors.IFrameTupleReference; +import org.apache.hyracks.util.exceptions.UTF8EncodingException; abstract class AbstractUnaryStringStringEval implements IScalarEvaluator { @@ -84,6 +85,9 @@ abstract class AbstractUnaryStringStringEval implements IScalarEvaluator { try { process(stringPtr, resultPointable); writeResult(resultPointable); + } catch (UTF8EncodingException ex) { + PointableHelper.setNull(resultPointable); + ExceptionUtil.warnStringFunctionFailed(ctx, sourceLoc, funcID, ex.getMessage()); } catch (IOException e) { throw HyracksDataException.create(e); } diff --git a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringLengthDescriptor.java b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringLengthDescriptor.java index 47caf14ea1..d9c9ecb66a 100644 --- a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringLengthDescriptor.java +++ b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringLengthDescriptor.java @@ -41,6 +41,7 @@ import org.apache.hyracks.data.std.api.IPointable; import org.apache.hyracks.data.std.primitive.VoidPointable; import org.apache.hyracks.data.std.util.ArrayBackedValueStorage; import org.apache.hyracks.dataflow.common.data.accessors.IFrameTupleReference; +import org.apache.hyracks.util.exceptions.UTF8EncodingException; import org.apache.hyracks.util.string.UTF8StringUtil; @MissingNullInOutFunction @@ -89,6 +90,9 @@ public class StringLengthDescriptor extends AbstractScalarFunctionDynamicDescrip result.setValue(len); int64Serde.serialize(result, out); resultPointable.set(resultStorage); + } catch (UTF8EncodingException ex) { + PointableHelper.setNull(resultPointable); + ExceptionUtil.warnStringFunctionFailed(ctx, sourceLoc, getIdentifier(), ex.getMessage()); } catch (IOException e1) { throw HyracksDataException.create(e1); } diff --git a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringToCodePointDescriptor.java b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringToCodePointDescriptor.java index 2f6a2231b2..d4f5368c9d 100644 --- a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringToCodePointDescriptor.java +++ b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringToCodePointDescriptor.java @@ -43,6 +43,7 @@ import org.apache.hyracks.data.std.api.IPointable; import org.apache.hyracks.data.std.primitive.VoidPointable; import org.apache.hyracks.data.std.util.ArrayBackedValueStorage; import org.apache.hyracks.dataflow.common.data.accessors.IFrameTupleReference; +import org.apache.hyracks.util.exceptions.UTF8EncodingException; import org.apache.hyracks.util.string.UTF8StringUtil; @MissingNullInOutFunction @@ -109,6 +110,9 @@ public class StringToCodePointDescriptor extends AbstractScalarFunctionDynamicDe } listBuilder.write(out, true); result.set(resultStorage); + } catch (UTF8EncodingException ex) { + PointableHelper.setNull(result); + ExceptionUtil.warnStringFunctionFailed(ctx, sourceLoc, getIdentifier(), ex.getMessage()); } catch (IOException e1) { throw HyracksDataException.create(e1); } diff --git a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java index 49f6221e4f..4acc8234b4 100644 --- a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java +++ b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java @@ -20,6 +20,7 @@ package org.apache.hyracks.data.std.primitive; import static org.apache.hyracks.util.string.UTF8StringUtil.HIGH_SURROGATE_WITHOUT_LOW_SURROGATE; import static org.apache.hyracks.util.string.UTF8StringUtil.LOW_SURROGATE_WITHOUT_HIGH_SURROGATE; +import static org.apache.hyracks.util.string.UTF8StringUtil.MALFORMED_BYTES; import java.io.IOException; import java.nio.charset.Charset; @@ -35,6 +36,7 @@ import org.apache.hyracks.data.std.api.IPointable; import org.apache.hyracks.data.std.api.IPointableFactory; import org.apache.hyracks.data.std.util.GrowableArray; import org.apache.hyracks.data.std.util.UTF8StringBuilder; +import org.apache.hyracks.util.exceptions.UTF8EncodingException; import org.apache.hyracks.util.string.UTF8StringUtil; import com.fasterxml.jackson.databind.JsonNode; @@ -136,7 +138,7 @@ public final class UTF8StringPointable extends AbstractPointable implements IHas } if (byteIdx != utf8Length) { - throw new IllegalArgumentException("Decoding error: malformed bytes"); + throw new UTF8EncodingException(MALFORMED_BYTES); } } @@ -317,7 +319,7 @@ public final class UTF8StringPointable extends AbstractPointable implements IHas return startMatchPos; } else { if (prevHighSurrogate) { - throw new IllegalArgumentException(HIGH_SURROGATE_WITHOUT_LOW_SURROGATE); + throw new UTF8EncodingException(HIGH_SURROGATE_WITHOUT_LOW_SURROGATE); } return codePointCount; } @@ -333,7 +335,7 @@ public final class UTF8StringPointable extends AbstractPointable implements IHas codePointCount++; prevHighSurrogate = false; } else { - throw new IllegalArgumentException(LOW_SURROGATE_WITHOUT_HIGH_SURROGATE); + throw new UTF8EncodingException(LOW_SURROGATE_WITHOUT_HIGH_SURROGATE); } } else { codePointCount++; @@ -678,9 +680,9 @@ public final class UTF8StringPointable extends AbstractPointable implements IHas endIndex = startIndex; int cursorIndex = startIndex; while (cursorIndex < srcUtfLen) { - int codePioint = srcPtr.codePointAt(srcStart + cursorIndex); + int codePoint = srcPtr.codePointAt(srcStart + cursorIndex); cursorIndex += srcPtr.codePointSize(srcStart + cursorIndex); - if (!codePointSet.contains(codePioint)) { + if (!codePointSet.contains(codePoint)) { endIndex = cursorIndex; } } @@ -739,9 +741,8 @@ public final class UTF8StringPointable extends AbstractPointable implements IHas cursorIndex--; if (UTF8StringUtil.isCharStart(srcPtr.bytes, cursorIndex)) { ch = UTF8StringUtil.charAt(srcPtr.bytes, cursorIndex); - if (Character.isHighSurrogate(ch) == false) { - throw new IllegalArgumentException( - "Decoding Error: no corresponding high surrogate found for the following low surrogate"); + if (!Character.isHighSurrogate(ch)) { + throw new UTF8EncodingException(LOW_SURROGATE_WITHOUT_HIGH_SURROGATE); } charSize += UTF8StringUtil.charSize(srcPtr.bytes, cursorIndex); @@ -749,7 +750,7 @@ public final class UTF8StringPointable extends AbstractPointable implements IHas } } } else if (Character.isHighSurrogate(ch)) { - throw new IllegalArgumentException("Decoding Error: get a high surrogate without low surrogate"); + throw new UTF8EncodingException(HIGH_SURROGATE_WITHOUT_LOW_SURROGATE); } builder.appendUtf8StringPointable(srcPtr, cursorIndex, charSize); diff --git a/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/exceptions/UTF8EncodingException.java b/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/exceptions/UTF8EncodingException.java new file mode 100644 index 0000000000..3853a1f7a4 --- /dev/null +++ b/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/exceptions/UTF8EncodingException.java @@ -0,0 +1,25 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.hyracks.util.exceptions; + +public class UTF8EncodingException extends IllegalArgumentException { + public UTF8EncodingException(String s) { + super(s); + } +} diff --git a/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java b/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java index 3eb868712b..15638f9610 100644 --- a/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java +++ b/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java @@ -28,6 +28,7 @@ import java.io.UTFDataFormatException; import java.lang.ref.SoftReference; import org.apache.hyracks.util.encoding.VarLenIntEncoderDecoder; +import org.apache.hyracks.util.exceptions.UTF8EncodingException; /** * A helper package to operate the UTF8String in Hyracks. @@ -35,6 +36,7 @@ import org.apache.hyracks.util.encoding.VarLenIntEncoderDecoder; */ public class UTF8StringUtil { + public static final String MALFORMED_BYTES = "Decoding error: malformed bytes"; public static final String LOW_SURROGATE_WITHOUT_HIGH_SURROGATE = "Decoding error: got a low surrogate without a leading high surrogate"; public static final String HIGH_SURROGATE_WITHOUT_LOW_SURROGATE = @@ -101,7 +103,7 @@ public class UTF8StringUtil { if (Character.isLowSurrogate(c1)) { // In this case, the index s doesn't point to a correct position - throw new IllegalArgumentException(LOW_SURROGATE_WITHOUT_HIGH_SURROGATE); + throw new UTF8EncodingException(LOW_SURROGATE_WITHOUT_HIGH_SURROGATE); } if (Character.isHighSurrogate(c1)) { @@ -112,7 +114,7 @@ public class UTF8StringUtil { if (Character.isLowSurrogate(c2)) { return Character.toCodePoint(c1, c2); } else { - throw new IllegalArgumentException(HIGH_SURROGATE_WITHOUT_LOW_SURROGATE); + throw new UTF8EncodingException(HIGH_SURROGATE_WITHOUT_LOW_SURROGATE); } } @@ -124,7 +126,7 @@ public class UTF8StringUtil { int size1 = charSize(b, s); if (Character.isLowSurrogate(c1)) { - throw new IllegalArgumentException(LOW_SURROGATE_WITHOUT_HIGH_SURROGATE); + throw new UTF8EncodingException(LOW_SURROGATE_WITHOUT_HIGH_SURROGATE); } if (Character.isHighSurrogate(c1)) {
