This is an automated email from the ASF dual-hosted git repository. mblow pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/asterixdb.git
commit 7952d127854a2ef4750df897f39a957257f88a9f Merge: 9f38f8df67 fc90675063 Author: Michael Blow <[email protected]> AuthorDate: Tue Jun 13 19:11:15 2023 -0400 Merge branch 'gerrit/neo' into 'master' Change-Id: I76febe15393ba442f007d36655cd68e2a6a75238 .../non-pure-function.00.ddl.sqlpp | 30 ++++ .../non-pure-function.01.update.sqlpp | 24 +++ .../non-pure-function.02.query.sqlpp | 26 +++ .../non-pure-function.03.query.sqlpp | 27 ++++ .../non-pure-function.04.query.sqlpp | 28 ++++ .../non-pure-function.05.query.sqlpp | 27 ++++ .../non-pure-function.06.query.sqlpp | 28 ++++ .../non-pure-function/non-pure-function.02.adm | 1 + .../non-pure-function/non-pure-function.03.adm | 1 + .../non-pure-function/non-pure-function.04.adm | 1 + .../non-pure-function/non-pure-function.05.adm | 1 + .../non-pure-function/non-pure-function.06.adm | 1 + .../test/resources/runtimets/testsuite_sqlpp.xml | 5 + asterixdb/asterix-external-data/pom.xml | 10 ++ .../asterix/external/util/google/gcs/GCSUtils.java | 8 +- asterixdb/asterix-server/pom.xml | 87 ++++------ asterixdb/pom.xml | 22 ++- .../appended-resources/supplemental-models.xml | 180 ++++++++++++--------- ...bc8e88873f0c1e42723640536866d3df_COPYRIGHT.txt} | 0 ...bc7348d6be23d2a9daaacd6b8424b8c1_COPYRIGHT.txt} | 4 +- ...7db1c3a9d42f1ae39fb35cfbb54a8742_COPYRIGHT.txt} | 4 +- ...5b7e93858a39ad4a46ba4d0e17a0aefe_COPYRIGHT.txt} | 4 +- ...-sdk-for-java_azure-identity_1.9.0_LICENSE.txt} | 0 ...e-sdk-for-java_azure-identity_1.9.0_NOTICE.txt} | 141 +++++++++++++--- ...or-java_azure-storage-blob_12.22.0_LICENSE.txt} | 0 ...for-java_azure-storage-blob_12.22.0_NOTICE.txt} | 45 ++++-- .../rules/InlineAssignIntoAggregateRule.java | 31 ++-- .../hyracks/control/common/config/OptionTypes.java | 4 + hyracks-fullstack/hyracks/hyracks-hdfs/pom.xml | 8 +- hyracks-fullstack/pom.xml | 15 ++ 30 files changed, 571 insertions(+), 192 deletions(-) diff --cc asterixdb/asterix-external-data/pom.xml index 466bbb11f5,0698b8a20f..253754efb0 --- a/asterixdb/asterix-external-data/pom.xml +++ b/asterixdb/asterix-external-data/pom.xml @@@ -550,16 -551,11 +555,21 @@@ <groupId>org.eclipse.jetty</groupId> <artifactId>jetty-util-ajax</artifactId> </dependency> + <!-- Manually included to avoid CVE-2023-1370 --> + <dependency> + <groupId>net.minidev</groupId> + <artifactId>json-smart</artifactId> + </dependency> + <dependency> + <groupId>org.apache.iceberg</groupId> + <artifactId>iceberg-core</artifactId> + <version>1.1.0</version> + </dependency> + <dependency> + <groupId>org.apache.avro</groupId> + <artifactId>avro</artifactId> + <version>1.11.1</version> + </dependency> </dependencies> <!-- apply patch for HADOOP-17225 to workaround CVE-2019-10172 --> <repositories> diff --cc asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/google/gcs/GCSUtils.java index 3efb041dae,0000000000..6183a88143 mode 100644,000000..100644 --- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/google/gcs/GCSUtils.java +++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/google/gcs/GCSUtils.java @@@ -1,242 -1,0 +1,242 @@@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.asterix.external.util.google.gcs; + +import static org.apache.asterix.common.exceptions.ErrorCode.EXTERNAL_SOURCE_ERROR; +import static org.apache.asterix.common.exceptions.ErrorCode.INVALID_PARAM_VALUE_ALLOWED_VALUE; +import static org.apache.asterix.common.exceptions.ErrorCode.PARAM_NOT_ALLOWED_IF_PARAM_IS_PRESENT; +import static org.apache.asterix.external.util.ExternalDataUtils.getPrefix; +import static org.apache.asterix.external.util.ExternalDataUtils.validateIncludeExclude; +import static org.apache.asterix.external.util.google.gcs.GCSConstants.APPLICATION_DEFAULT_CREDENTIALS_FIELD_NAME; +import static org.apache.asterix.external.util.google.gcs.GCSConstants.ENDPOINT_FIELD_NAME; +import static org.apache.asterix.external.util.google.gcs.GCSConstants.HADOOP_AUTH_SERVICE_ACCOUNT_JSON_KEY_FILE; +import static org.apache.asterix.external.util.google.gcs.GCSConstants.HADOOP_AUTH_SERVICE_ACCOUNT_JSON_KEY_FILE_PATH; +import static org.apache.asterix.external.util.google.gcs.GCSConstants.HADOOP_AUTH_TYPE; +import static org.apache.asterix.external.util.google.gcs.GCSConstants.HADOOP_AUTH_UNAUTHENTICATED; +import static org.apache.asterix.external.util.google.gcs.GCSConstants.HADOOP_ENDPOINT; +import static org.apache.asterix.external.util.google.gcs.GCSConstants.HADOOP_GCS_PROTOCOL; +import static org.apache.asterix.external.util.google.gcs.GCSConstants.JSON_CREDENTIALS_FIELD_NAME; +import static org.apache.hyracks.api.util.ExceptionUtils.getMessageOrToString; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.function.BiPredicate; +import java.util.regex.Matcher; + +import org.apache.asterix.common.exceptions.CompilationException; +import org.apache.asterix.common.exceptions.ErrorCode; +import org.apache.asterix.external.input.record.reader.abstracts.AbstractExternalInputStreamFactory.IncludeExcludeMatcher; +import org.apache.asterix.external.util.ExternalDataConstants; +import org.apache.asterix.external.util.ExternalDataUtils; +import org.apache.asterix.external.util.HDFSUtils; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hyracks.api.exceptions.IWarningCollector; +import org.apache.hyracks.api.exceptions.SourceLocation; +import org.apache.hyracks.api.exceptions.Warning; + +import com.google.api.gax.paging.Page; +import com.google.auth.oauth2.GoogleCredentials; +import com.google.cloud.BaseServiceException; ++import com.google.cloud.NoCredentials; +import com.google.cloud.storage.Blob; +import com.google.cloud.storage.Storage; +import com.google.cloud.storage.StorageOptions; + +public class GCSUtils { + private GCSUtils() { + throw new AssertionError("do not instantiate"); + + } + + /** + * Builds the client using the provided configuration + * + * @param configuration properties + * @return clientasterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataUtils.java + * @throws CompilationException CompilationException + */ + public static Storage buildClient(Map<String, String> configuration) throws CompilationException { + String applicationDefaultCredentials = configuration.get(APPLICATION_DEFAULT_CREDENTIALS_FIELD_NAME); + String jsonCredentials = configuration.get(JSON_CREDENTIALS_FIELD_NAME); + String endpoint = configuration.get(ENDPOINT_FIELD_NAME); + + StorageOptions.Builder builder = StorageOptions.newBuilder(); + + // default credentials provider + if (applicationDefaultCredentials != null) { + // only "true" value is allowed + if (!applicationDefaultCredentials.equalsIgnoreCase("true")) { + throw new CompilationException(INVALID_PARAM_VALUE_ALLOWED_VALUE, + APPLICATION_DEFAULT_CREDENTIALS_FIELD_NAME, "true"); + } + + // no other authentication parameters are allowed + if (jsonCredentials != null) { + throw new CompilationException(PARAM_NOT_ALLOWED_IF_PARAM_IS_PRESENT, JSON_CREDENTIALS_FIELD_NAME, + APPLICATION_DEFAULT_CREDENTIALS_FIELD_NAME); + } + + try { + builder.setCredentials(GoogleCredentials.getApplicationDefault()); + } catch (IOException ex) { + throw CompilationException.create(EXTERNAL_SOURCE_ERROR, getMessageOrToString(ex)); + } - } - - // json credentials - if (jsonCredentials != null) { ++ } else if (jsonCredentials != null) { + try (InputStream credentialsStream = new ByteArrayInputStream(jsonCredentials.getBytes())) { + builder.setCredentials(GoogleCredentials.fromStream(credentialsStream)); + } catch (IOException ex) { + throw new CompilationException(EXTERNAL_SOURCE_ERROR, getMessageOrToString(ex)); + } ++ } else { ++ builder.setCredentials(NoCredentials.getInstance()); + } + + if (endpoint != null) { + builder.setHost(endpoint); + } + + return builder.build().getService(); + } + + /** + * Validate external dataset properties + * + * @param configuration properties + * @throws CompilationException Compilation exception + */ + public static void validateProperties(Map<String, String> configuration, SourceLocation srcLoc, + IWarningCollector collector) throws CompilationException { + + // check if the format property is present + if (configuration.get(ExternalDataConstants.KEY_FORMAT) == null) { + throw new CompilationException(ErrorCode.PARAMETERS_REQUIRED, srcLoc, ExternalDataConstants.KEY_FORMAT); + } + + validateIncludeExclude(configuration); + String container = configuration.get(ExternalDataConstants.CONTAINER_NAME_FIELD_NAME); + + try { + Storage.BlobListOption limitOption = Storage.BlobListOption.pageSize(1); + Storage.BlobListOption prefixOption = Storage.BlobListOption.prefix(getPrefix(configuration)); + Storage storage = buildClient(configuration); + Page<Blob> items = storage.list(container, limitOption, prefixOption); + + if (!items.iterateAll().iterator().hasNext() && collector.shouldWarn()) { + Warning warning = Warning.of(srcLoc, ErrorCode.EXTERNAL_SOURCE_CONFIGURATION_RETURNED_NO_FILES); + collector.warn(warning); + } + } catch (CompilationException ex) { + throw ex; + } catch (Exception ex) { + throw new CompilationException(ErrorCode.EXTERNAL_SOURCE_ERROR, getMessageOrToString(ex)); + } + } + + public static List<Blob> listItems(Map<String, String> configuration, IncludeExcludeMatcher includeExcludeMatcher, + IWarningCollector warningCollector) throws CompilationException { + // Prepare to retrieve the objects + List<Blob> filesOnly = new ArrayList<>(); + String container = configuration.get(ExternalDataConstants.CONTAINER_NAME_FIELD_NAME); + Storage gcs = buildClient(configuration); + Storage.BlobListOption options = Storage.BlobListOption.prefix(ExternalDataUtils.getPrefix(configuration)); + Page<Blob> items; + + try { + items = gcs.list(container, options); + } catch (BaseServiceException ex) { + throw new CompilationException(ErrorCode.EXTERNAL_SOURCE_ERROR, getMessageOrToString(ex)); + } + + // Collect the paths to files only + collectAndFilterFiles(items, includeExcludeMatcher.getPredicate(), includeExcludeMatcher.getMatchersList(), + filesOnly); + + // Warn if no files are returned + if (filesOnly.isEmpty() && warningCollector.shouldWarn()) { + Warning warning = Warning.of(null, ErrorCode.EXTERNAL_SOURCE_CONFIGURATION_RETURNED_NO_FILES); + warningCollector.warn(warning); + } + + return filesOnly; + } + + /** + * Excludes paths ending with "/" as that's a directory indicator, we need to return the files only + * + * @param items List of returned objects + */ + private static void collectAndFilterFiles(Page<Blob> items, BiPredicate<List<Matcher>, String> predicate, + List<Matcher> matchers, List<Blob> filesOnly) { + for (Blob item : items.iterateAll()) { + // skip folders + if (item.getName().endsWith("/")) { + continue; + } + + // No filter, add file + if (predicate.test(matchers, item.getName())) { + filesOnly.add(item); + } + } + } + + /** + * Builds the client using the provided configuration + * + * @param configuration properties + * @param numberOfPartitions number of partitions in the cluster + */ + public static void configureHdfsJobConf(JobConf conf, Map<String, String> configuration, int numberOfPartitions) { + String jsonCredentials = configuration.get(JSON_CREDENTIALS_FIELD_NAME); + String endpoint = configuration.get(ENDPOINT_FIELD_NAME); + + // disable caching FileSystem + HDFSUtils.disableHadoopFileSystemCache(conf, HADOOP_GCS_PROTOCOL); + + // TODO(htowaileb): needs further testing, recommended to disable by gcs-hadoop team + conf.set(GCSConstants.HADOOP_SUPPORT_COMPRESSED, ExternalDataConstants.FALSE); + + // TODO(htowaileb): needs further testing + // set number of threads + // conf.set(GCSConstants.HADOOP_MAX_REQUESTS_PER_BATCH, String.valueOf(numberOfPartitions)); + // conf.set(GCSConstants.HADOOP_BATCH_THREADS, String.valueOf(numberOfPartitions)); + + // authentication method + // TODO(htowaileb): find a way to pass the content instead of the path to keyfile, this line is temporary + Path credentials = Path.of("credentials.json"); + if (jsonCredentials == null) { + // anonymous access + conf.set(HADOOP_AUTH_TYPE, HADOOP_AUTH_UNAUTHENTICATED); + } else { + // TODO(htowaileb) need to pass the file content + conf.set(HADOOP_AUTH_TYPE, HADOOP_AUTH_SERVICE_ACCOUNT_JSON_KEY_FILE); + conf.set(HADOOP_AUTH_SERVICE_ACCOUNT_JSON_KEY_FILE_PATH, credentials.toAbsolutePath().toString()); + } + + // set endpoint if provided, default is https://storage.googleapis.com/ + if (endpoint != null) { + conf.set(HADOOP_ENDPOINT, endpoint); + } + } +} diff --cc asterixdb/asterix-server/pom.xml index e068a2aa2d,0dcfbdec5e..6e279b836d --- a/asterixdb/asterix-server/pom.xml +++ b/asterixdb/asterix-server/pom.xml @@@ -676,14 -518,7 +658,15 @@@ <aliasUrl>https://raw.githubusercontent.com/googleapis/gapic-generator-java/v2.13.0/java-common-protos/LICENSE</aliasUrl> <aliasUrl>https://raw.githubusercontent.com/googleapis/google-api-java-client/v2.1.2/LICENSE</aliasUrl> <aliasUrl>https://raw.githubusercontent.com/grpc/grpc-java/v1.52.1/LICENSE</aliasUrl> + <aliasUrl>https://raw.githubusercontent.com/reactor/reactor-netty/v1.0.28/LICENSE</aliasUrl> + <aliasUrl>https://raw.githubusercontent.com/googleapis/java-core/v2.8.0/LICENSE</aliasUrl> + <aliasUrl>https://raw.githubusercontent.com/google/gson/gson-parent-2.9.0/LICENSE</aliasUrl> + <aliasUrl>https://raw.githubusercontent.com/allegro/json-avro-converter/json-avro-converter-0.2.15/LICENSE.md</aliasUrl> + <aliasUrl>https://raw.githubusercontent.com/airlift/aircompressor/0.21/license.txt</aliasUrl> + <aliasUrl>https://raw.githubusercontent.com/apache/orc/v1.8.0/LICENSE</aliasUrl> + <aliasUrl>https://raw.githubusercontent.com/RoaringBitmap/RoaringBitmap/0.9.39/LICENSE</aliasUrl> + <aliasUrl>https://raw.githubusercontent.com/JetBrains/java-annotations/master/LICENSE.txt</aliasUrl> + <aliasUrl>https://raw.githubusercontent.com/awslabs/aws-crt-java/v0.21.10/LICENSE</aliasUrl> </aliasUrls> <metric>1</metric> </license> diff --cc asterixdb/pom.xml index 47544bbc8b,33ca4a80d9..681e9a8236 --- a/asterixdb/pom.xml +++ b/asterixdb/pom.xml @@@ -87,10 -87,12 +87,12 @@@ <hadoop.version>3.3.4</hadoop.version> <jacoco.version>0.7.6.201602180812</jacoco.version> <log4j.version>2.19.0</log4j.version> - <awsjavasdk.version>2.17.218</awsjavasdk.version> + <awsjavasdk.version>2.20.37</awsjavasdk.version> <parquet.version>1.12.3</parquet.version> <hadoop-awsjavasdk.version>1.12.402</hadoop-awsjavasdk.version> - <azureblobjavasdk.version>12.14.2</azureblobjavasdk.version> + <azureblobjavasdk.version>12.22.0</azureblobjavasdk.version> + <azurecommonjavasdk.version>12.21.0</azurecommonjavasdk.version> + <azureidentity.version>1.9.0</azureidentity.version> <azuredatalakejavasdk.version>12.7.2</azuredatalakejavasdk.version> <gcsjavasdk.version>2.17.2</gcsjavasdk.version> <hadoop-azuresdk.version>8.6.6</hadoop-azuresdk.version>
