This is an automated email from the ASF dual-hosted git repository. mblow pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/asterixdb.git
commit 8a64bf87cea9113a6b5ed0715c5e8d6ad8ff56c4 Author: Hussain Towaileb <[email protected]> AuthorDate: Wed Jul 8 23:04:52 2020 +0300 [ASTERIXDB-2757] Properly handle include/exclude pattern translation - user model changes: no - storage format changes: no - interface changes: no Details: - Handles properly handling the translation of a wildcard pattern to a regex to avoid potential failure in the query. - Added and updated some test cases. Change-Id: I661c1fad9b2c2692fa231e48f00eb5cf9d79ad5e Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/7143 Tested-by: Jenkins <[email protected]> Integration-Tests: Jenkins <[email protected]> Reviewed-by: Ali Alsuliman <[email protected]> --- .../include-exclude/include-11/test.000.ddl.sqlpp | 39 +++++++ .../include-11/test.001.query.sqlpp | 21 ++++ .../include-exclude/include-11/test.099.ddl.sqlpp | 20 ++++ .../include-exclude/include-12/test.000.ddl.sqlpp | 41 +++++++ .../include-12/test.001.query.sqlpp | 21 ++++ .../include-exclude/include-12/test.099.ddl.sqlpp | 20 ++++ .../s3/include-exclude/include-10/result.001.adm | 1 + .../s3/include-exclude/include-11/result.001.adm | 1 + .../s3/include-exclude/include-12/result.001.adm | 1 + .../runtimets/testsuite_external_dataset.xml | 11 +- .../record/reader/aws/AwsS3InputStreamFactory.java | 4 +- .../asterix/external/util/ExternalDataUtils.java | 119 ++++++++++++--------- .../evaluators/functions/StringEvaluatorUtils.java | 4 +- 13 files changed, 247 insertions(+), 56 deletions(-) diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/include-exclude/include-11/test.000.ddl.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/include-exclude/include-11/test.000.ddl.sqlpp new file mode 100644 index 0000000..e4cf69b --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/include-exclude/include-11/test.000.ddl.sqlpp @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +drop dataverse test if exists; +create dataverse test; +use test; + +drop type test if exists; +create type test as open { +}; + +drop dataset test if exists; +create external dataset test(test) using S3 ( +("accessKeyId"="dummyAccessKey"), +("secretAccessKey"="dummySecretKey"), +("region"="us-west-2"), +("serviceEndpoint"="http://localhost:8001"), +("container"="include-exclude"), +("definition"="data/mixed/"), +("format"="csv"), +("header"=false), +("include"="*.[a-c][a-z][a-z**||\\\\&&--~~]") +); \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/include-exclude/include-11/test.001.query.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/include-exclude/include-11/test.001.query.sqlpp new file mode 100644 index 0000000..3306d5c --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/include-exclude/include-11/test.001.query.sqlpp @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +use test; +select count(*) as `count` from test; \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/include-exclude/include-11/test.099.ddl.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/include-exclude/include-11/test.099.ddl.sqlpp new file mode 100644 index 0000000..548e632 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/include-exclude/include-11/test.099.ddl.sqlpp @@ -0,0 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +drop dataverse test if exists; \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/include-exclude/include-12/test.000.ddl.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/include-exclude/include-12/test.000.ddl.sqlpp new file mode 100644 index 0000000..47fbaef --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/include-exclude/include-12/test.000.ddl.sqlpp @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + + // This test case matches nothing for "include", but has extreme cases and complicated pattern, expected is to not fail + +drop dataverse test if exists; +create dataverse test; +use test; + +drop type test if exists; +create type test as open { +}; + +drop dataset test if exists; +create external dataset test(test) using S3 ( +("accessKeyId"="dummyAccessKey"), +("secretAccessKey"="dummySecretKey"), +("region"="us-west-2"), +("serviceEndpoint"="http://localhost:8001"), +("container"="include-exclude"), +("definition"="data/mixed/"), +("format"="csv"), +("header"=false), +("include"="[][!][^]]]]*[![*a-zA--&&^$||0-9B$\\*&&]*&&[^a-b||0--9][[[") +); \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/include-exclude/include-12/test.001.query.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/include-exclude/include-12/test.001.query.sqlpp new file mode 100644 index 0000000..3306d5c --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/include-exclude/include-12/test.001.query.sqlpp @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +use test; +select count(*) as `count` from test; \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/include-exclude/include-12/test.099.ddl.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/include-exclude/include-12/test.099.ddl.sqlpp new file mode 100644 index 0000000..548e632 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/include-exclude/include-12/test.099.ddl.sqlpp @@ -0,0 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +drop dataverse test if exists; \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/aws/s3/include-exclude/include-10/result.001.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/aws/s3/include-exclude/include-10/result.001.adm new file mode 100644 index 0000000..c1a0ea2 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/aws/s3/include-exclude/include-10/result.001.adm @@ -0,0 +1 @@ +{ "count": 0 } \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/aws/s3/include-exclude/include-11/result.001.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/aws/s3/include-exclude/include-11/result.001.adm new file mode 100644 index 0000000..95204aa --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/aws/s3/include-exclude/include-11/result.001.adm @@ -0,0 +1 @@ +{ "count": 6 } \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/aws/s3/include-exclude/include-12/result.001.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/aws/s3/include-exclude/include-12/result.001.adm new file mode 100644 index 0000000..c1a0ea2 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/aws/s3/include-exclude/include-12/result.001.adm @@ -0,0 +1 @@ +{ "count": 0 } \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset.xml b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset.xml index ad17afe..3a4f03e 100644 --- a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset.xml +++ b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset.xml @@ -232,7 +232,16 @@ <test-case FilePath="external-dataset"> <compilation-unit name="aws/s3/include-exclude/include-10"> <output-dir compare="Text">aws/s3/include-exclude/include-10</output-dir> - <expected-error>Invalid pattern *[abc][.*</expected-error> + </compilation-unit> + </test-case> + <test-case FilePath="external-dataset"> + <compilation-unit name="aws/s3/include-exclude/include-11"> + <output-dir compare="Text">aws/s3/include-exclude/include-11</output-dir> + </compilation-unit> + </test-case> + <test-case FilePath="external-dataset"> + <compilation-unit name="aws/s3/include-exclude/include-12"> + <output-dir compare="Text">aws/s3/include-exclude/include-12</output-dir> </compilation-unit> </test-case> </test-group> diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/aws/AwsS3InputStreamFactory.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/aws/AwsS3InputStreamFactory.java index 67a7c93..f3a36ff 100644 --- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/aws/AwsS3InputStreamFactory.java +++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/aws/AwsS3InputStreamFactory.java @@ -103,10 +103,10 @@ public class AwsS3InputStreamFactory implements IInputStreamFactory { for (Map.Entry<String, String> entry : configuration.entrySet()) { if (entry.getKey().startsWith(KEY_INCLUDE)) { pattern = entry.getValue(); - includeMatchers.add(Pattern.compile(ExternalDataUtils.wildcardToRegex(pattern)).matcher("")); + includeMatchers.add(Pattern.compile(ExternalDataUtils.patternToRegex(pattern)).matcher("")); } else if (entry.getKey().startsWith(KEY_EXCLUDE)) { pattern = entry.getValue(); - excludeMatchers.add(Pattern.compile(ExternalDataUtils.wildcardToRegex(pattern)).matcher("")); + excludeMatchers.add(Pattern.compile(ExternalDataUtils.patternToRegex(pattern)).matcher("")); } } } catch (PatternSyntaxException ex) { diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataUtils.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataUtils.java index 1300ac3..fc31286 100644 --- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataUtils.java +++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataUtils.java @@ -23,10 +23,12 @@ import static org.apache.asterix.external.util.ExternalDataConstants.KEY_ESCAPE; import static org.apache.asterix.external.util.ExternalDataConstants.KEY_QUOTE; import static org.apache.asterix.external.util.ExternalDataConstants.KEY_RECORD_END; import static org.apache.asterix.external.util.ExternalDataConstants.KEY_RECORD_START; +import static org.apache.asterix.runtime.evaluators.functions.StringEvaluatorUtils.RESERVED_REGEX_CHARS; import java.net.URI; import java.net.URISyntaxException; import java.util.ArrayList; +import java.util.Arrays; import java.util.EnumMap; import java.util.List; import java.util.Map; @@ -498,76 +500,91 @@ public class ExternalDataUtils { /** * Converts the wildcard to proper regex * - * @param wildcard wildcard pattern to convert + * @param pattern wildcard pattern to convert * * @return regex expression */ - public static String wildcardToRegex(String wildcard) { - StringBuilder builder = new StringBuilder(wildcard.length()); - builder.append('^'); + public static String patternToRegex(String pattern) { + int charPosition = 0; + int patternLength = pattern.length(); + StringBuilder stuffBuilder = new StringBuilder(); + StringBuilder result = new StringBuilder(); - // This keeps an eye on the presence inside or outside a sequence, everything inside a sequence is a literal - // e.g ("*" ===> ".*" while "[*]" ===> "[\*]" - boolean outsideBracketSequence = true; + while (charPosition < patternLength) { + char c = pattern.charAt(charPosition); + charPosition++; - for (int i = 0; i < wildcard.length(); i++) { - char c = wildcard.charAt(i); switch (c) { case '*': - builder.append(outsideBracketSequence ? "." : "\\").append(c); + result.append(".*"); break; case '?': - builder.append(outsideBracketSequence ? "." : "\\?"); + result.append("."); break; case '[': - if (outsideBracketSequence) { - outsideBracketSequence = false; - builder.append(c); - if (i + 1 < wildcard.length()) { - if (wildcard.charAt(i + 1) == '!') { - i++; - builder.append('^'); - } - } - } else { - // escape the open bracket "[" if we are already inside a bracket sequence - builder.append("\\").append(c); + int closingBracketPosition = charPosition; + if (closingBracketPosition < patternLength && pattern.charAt(closingBracketPosition) == '!') { + closingBracketPosition++; } - break; - case ']': - if (outsideBracketSequence) { - // escape if we are outside bracket sequence - builder.append("\\").append(c); + + // 2 cases can happen here: + // 1- Empty character class [] which is invalid for java, so treat ] as literal and find another + // closing bracket, if no closing bracket is found, the whole thing is a literal + // 2- Negated empty class [!] converted to [^] which is invalid for java, so treat ] as literal and + // find another closing bracket, if no closing bracket is found, the whole thing is a literal + if (closingBracketPosition < patternLength && pattern.charAt(closingBracketPosition) == ']') { + closingBracketPosition++; + } + + // No [] and [!] cases, search for the closing bracket + while (closingBracketPosition < patternLength && pattern.charAt(closingBracketPosition) != ']') { + closingBracketPosition++; + } + + // No closing bracket found (or [] or [!]), escape the opening bracket, treat it as literals + if (closingBracketPosition >= patternLength) { + result.append("\\["); } else { - // Inside bracket, close it and mark as outside bracket - outsideBracketSequence = true; - builder.append(c); + // Found closing bracket, get the stuff in between the found the character class ("[" and "]") + String stuff = pattern.substring(charPosition, closingBracketPosition); + + stuffBuilder.setLength(0); + int stuffCharPos = 0; + + // If first character in the character class is "!" then convert it to "^" + if (stuff.charAt(0) == '!') { + stuffBuilder.append('^'); + stuffCharPos++; // ignore first character when escaping metacharacters next step + } + + for (; stuffCharPos < stuff.length(); stuffCharPos++) { + char stuffChar = stuff.charAt(stuffCharPos); + if (stuffChar != '-' && Arrays.binarySearch(RESERVED_REGEX_CHARS, stuffChar) >= 0) { + stuffBuilder.append("\\"); + } + stuffBuilder.append(stuffChar); + } + + String stuffEscaped = stuffBuilder.toString(); + + // Escape the set operations + stuffEscaped = stuffEscaped.replace("&&", "\\&\\&").replace("~~", "\\~\\~") + .replace("||", "\\|\\|").replace("--", "\\-\\-"); + + result.append("[").append(stuffEscaped).append("]"); + charPosition = closingBracketPosition + 1; } break; - // escape special regexp-characters - case '(': - case ')': - case '$': - case '^': - case '.': - case '{': - case '}': - case '|': - case '+': - case '=': - case '<': - case '>': - case '!': - case '\\': - builder.append("\\").append(c); - break; default: - builder.append(c); + if (Arrays.binarySearch(RESERVED_REGEX_CHARS, c) >= 0) { + result.append("\\"); + } + result.append(c); break; } } - builder.append('$'); - return builder.toString(); + + return result.toString(); } public static class AwsS3 { diff --git a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringEvaluatorUtils.java b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringEvaluatorUtils.java index 81ce832..492d64c 100644 --- a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringEvaluatorUtils.java +++ b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringEvaluatorUtils.java @@ -60,8 +60,8 @@ public final class StringEvaluatorUtils { return destString; } - static final char[] RESERVED_REGEX_CHARS = new char[] { '\\', '(', ')', '[', ']', '{', '}', '.', '^', '$', '*', '|', - '+', '?', '<', '>', '-', '=', '!' }; + public static final char[] RESERVED_REGEX_CHARS = new char[] { '\\', '(', ')', '[', ']', '{', '}', '.', '^', '$', + '*', '|', '+', '?', '<', '>', '-', '=', '!' }; static { Arrays.sort(RESERVED_REGEX_CHARS);
