abhishekrb19 commented on code in PR #13627: URL: https://github.com/apache/druid/pull/13627#discussion_r1067766205
########## server/src/main/java/org/apache/druid/catalog/model/table/LocalInputSourceDefn.java: ########## @@ -0,0 +1,281 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.catalog.model.table; + +import com.google.common.base.Strings; +import org.apache.druid.catalog.model.CatalogUtils; +import org.apache.druid.catalog.model.ColumnSpec; +import org.apache.druid.catalog.model.table.BaseTableFunction.Parameter; +import org.apache.druid.catalog.model.table.TableFunction.ParameterDefn; +import org.apache.druid.catalog.model.table.TableFunction.ParameterType; +import org.apache.druid.data.input.InputSource; +import org.apache.druid.data.input.impl.LocalInputSource; +import org.apache.druid.java.util.common.IAE; +import org.apache.druid.utils.CollectionUtils; + +import java.io.File; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +/** + * Definition for a {@link LocalInputSource}. 
+ */ +public class LocalInputSourceDefn extends FormattedInputSourceDefn +{ + public static final String TYPE_KEY = LocalInputSource.TYPE_KEY; + + /** + * Base directory for file or filter operations. If not provided, + * then the server's current working directory is assumed, which is + * typically valid only for sample data. + */ + public static final String BASE_DIR_PARAMETER = "baseDir"; + + // Note name "fileFilter", not "filter". These properties mix in with Review Comment: Is this comment stale? Asking because the property below is named "filter", not "fileFilter" ########## extensions-core/s3-extensions/pom.xml: ########## @@ -114,7 +120,6 @@ <dependency> <groupId>com.amazonaws</groupId> <artifactId>aws-java-sdk-sts</artifactId> - <version>${aws.sdk.version}</version> Review Comment: Is this change intentional? ########## extensions-core/s3-extensions/src/main/java/org/apache/druid/catalog/model/table/S3InputSourceDefn.java: ########## @@ -0,0 +1,326 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.druid.catalog.model.table; + +import org.apache.druid.catalog.model.CatalogUtils; +import org.apache.druid.catalog.model.ColumnSpec; +import org.apache.druid.catalog.model.table.BaseTableFunction.Parameter; +import org.apache.druid.catalog.model.table.TableFunction.ParameterDefn; +import org.apache.druid.catalog.model.table.TableFunction.ParameterType; +import org.apache.druid.data.input.InputSource; +import org.apache.druid.data.input.impl.CloudObjectLocation; +import org.apache.druid.data.input.s3.S3InputSource; +import org.apache.druid.java.util.common.IAE; +import org.apache.druid.storage.s3.S3StorageDruidModule; +import org.apache.druid.utils.CollectionUtils; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * Catalog definition for the S3 input source. + * <p> + * The catalog entry contains a serialized S3 input source, with some simplifying variations. + * The catalog entry can define the {@code bucket} table property which is the (single) bucket + * value to use when creating the list of objects: the catalog provides the bucket, the table + * function provides the list of objects. (Done this way since defining two correlated lists + * in SQL is awkward, and credentials make the most sense when working with a single bucket.) + * <p> + * The ad-hoc function allows the various ways to specify the objects, but not the configuration + * parameters. If the user wishes to use such parameters, they should be defined in a catalog + * entry, since providing maps in SQL is awkward. + * <p> + * The partial table function can be of various forms: + * <ul> + * <li>Fully define the table, which means providing the full set of S3 properties and not + * providing the table-level {@code bucket} property. This form is complete and doesn't need + * a table function. 
If used with a table function, the function provides the {@code glob} + * parameter (if not already provided in the table spec.)</li> + * <li>Partially define the table: using URIs with the {@code glob} to be provided later, or + * by using the {@code bucket} table property. The table function provides the {@code objects} + * parameter to specify the specific objects. This form provides both the format and the list + * of columns.</li> + * <li>Partially define the table as a connection: provide only the {@code bucket} property, + * and omit both the format and the columns. The table function requests the {@code objects} + * and the {@code format}. The user must provide the list of columns.</li> + * </ul> + * + * @see {@link S3InputSource} for details on the meaning of the various properties, and the rules + * about valid combinations + */ +public class S3InputSourceDefn extends FormattedInputSourceDefn +{ + public static final String URIS_PARAMETER = "uris"; + public static final String PREFIXES_PARAMETER = "prefixes"; + public static final String BUCKET_PARAMETER = "bucket"; + public static final String PATHS_PARAMETER = "paths"; + public static final String ACCESS_KEY_ID_PARAMETER = "accessKeyId"; + public static final String SECRET_ACCESS_KEY_PARAMETER = "secretAccessKey"; + public static final String ASSUME_ROLE_ARN_PARAMETER = "assumeRoleArn"; + + /** + * The {@code objectGlob} property exists in S3, but is not documented. The corresponding + * function parameter also exists, but is not documented. + */ + public static final String OBJECT_GLOB_PARAMETER = "objectGlob"; + + /** + * External data table spec property that let's the user define one bucket in the catalog, + * so that the corresponding table function can supply just the relative path names within + * that bucket. That is, if the user sets this, Druid will generate the {@code objects} + * field from this entry and files provided in the table function. 
+ */ + public static final String BUCKET_PROPERTY = "bucket"; + + private static final ParameterDefn URI_PARAM_DEFN = new Parameter(URIS_PARAMETER, ParameterType.VARCHAR_ARRAY, true); + private static final ParameterDefn PREFIXES_PARAM_DEFN = new Parameter(PREFIXES_PARAMETER, ParameterType.VARCHAR_ARRAY, true); + private static final ParameterDefn BUCKET_PARAM_DEFN = new Parameter(BUCKET_PARAMETER, ParameterType.VARCHAR, true); + private static final ParameterDefn PATHS_PARAM_DEFN = new Parameter(PATHS_PARAMETER, ParameterType.VARCHAR_ARRAY, true); + private static final ParameterDefn OBJECT_GLOB_PARAM_DEFN = new Parameter(OBJECT_GLOB_PARAMETER, ParameterType.VARCHAR, true); + private static final List<ParameterDefn> SECURITY_PARAMS = Arrays.asList( + new Parameter(ACCESS_KEY_ID_PARAMETER, ParameterType.VARCHAR, true), + new Parameter(SECRET_ACCESS_KEY_PARAMETER, ParameterType.VARCHAR, true), + new Parameter(ASSUME_ROLE_ARN_PARAMETER, ParameterType.VARCHAR, true) + ); + + // Field names in the S3InputSource + private static final String URIS_FIELD = "uris"; + private static final String PREFIXES_FIELD = "prefixes"; + private static final String OBJECTS_FIELD = "objects"; + private static final String OBJECT_GLOB_FIELD = "objectGlob"; + private static final String PROPERTIES_FIELD = "properties"; + private static final String ACCESS_KEY_ID_FIELD = "accessKeyId"; + private static final String SECRET_ACCESS_KEY_FIELD = "secretAccessKey"; + private static final String ASSUME_ROLE_ARN_FIELD = "assumeRoleArn"; + + @Override + public String typeValue() + { + return S3StorageDruidModule.SCHEME; + } + + @Override + protected Class<? 
extends InputSource> inputSourceClass() + { + return S3InputSource.class; + } + + @Override + public void validate(ResolvedExternalTable table) + { + final boolean hasFormat = table.inputFormatMap != null; + final boolean hasColumns = !CollectionUtils.isNullOrEmpty(table.resolvedTable().spec().columns()); + + if (hasFormat && !hasColumns) { + throw new IAE( + "An external S3 table with a format must also provide the corresponding columns" + ); + } + + // The user can either provide a bucket, or can provide one of the valid items. + final String bucket = table.resolvedTable().stringProperty(BUCKET_PROPERTY); + final boolean hasBucket = bucket != null; + final Map<String, Object> sourceMap = table.inputSourceMap; + final boolean hasUris = sourceMap.containsKey(URIS_FIELD); + final boolean hasPrefix = sourceMap.containsKey(PREFIXES_FIELD); + final boolean hasObjects = sourceMap.containsKey(OBJECTS_FIELD); + final boolean hasGlob = sourceMap.containsKey(OBJECT_GLOB_FIELD); + if (hasBucket) { + if (hasUris || hasPrefix || hasObjects) { + throw new IAE( + "Provide either the %s property, or one of the S3 input source fields %s, %s or %s, but not both.", + BUCKET_PROPERTY, + URIS_FIELD, + PREFIXES_FIELD, + OBJECTS_FIELD + ); + } + if (hasGlob) { + throw new IAE( + "The %s property cannot be provided when the the %s property is set", Review Comment: nit: repeated "the" twice ```suggestion "The %s property cannot be provided when the %s property is set", ``` ########## extensions-core/s3-extensions/src/main/java/org/apache/druid/catalog/model/table/S3InputSourceDefn.java: ########## @@ -0,0 +1,326 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.catalog.model.table; + +import org.apache.druid.catalog.model.CatalogUtils; +import org.apache.druid.catalog.model.ColumnSpec; +import org.apache.druid.catalog.model.table.BaseTableFunction.Parameter; +import org.apache.druid.catalog.model.table.TableFunction.ParameterDefn; +import org.apache.druid.catalog.model.table.TableFunction.ParameterType; +import org.apache.druid.data.input.InputSource; +import org.apache.druid.data.input.impl.CloudObjectLocation; +import org.apache.druid.data.input.s3.S3InputSource; +import org.apache.druid.java.util.common.IAE; +import org.apache.druid.storage.s3.S3StorageDruidModule; +import org.apache.druid.utils.CollectionUtils; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * Catalog definition for the S3 input source. + * <p> + * The catalog entry contains a serialized S3 input source, with some simplifying variations. + * The catalog entry can define the {@code bucket} table property which is the (single) bucket + * value to use when creating the list of objects: the catalog provides the bucket, the table + * function provides the list of objects. (Done this way since defining two correlated lists + * in SQL is awkward, and credentials make the most sense when working with a single bucket.) 
+ * <p> + * The ad-hoc function allows the various ways to specify the objects, but not the configuration + * parameters. If the user wishes to use such parameters, they should be defined in a catalog + * entry, since providing maps in SQL is awkward. + * <p> + * The partial table function can be of various forms: + * <ul> + * <li>Fully define the table, which means providing the full set of S3 properties and not + * providing the table-level {@code bucket} property. This form is complete and doesn't need + * a table function. If used with a table function, the function provides the {@code glob} + * parameter (if not already provided in the table spec.)</li> + * <li>Partially define the table: using URIs with the {@code glob} to be provided later, or + * by using the {@code bucket} table property. The table function provides the {@code objects} + * parameter to specify the specific objects. This form provides both the format and the list + * of columns.</li> + * <li>Partially define the table as a connection: provide only the {@code bucket} property, + * and omit both the format and the columns. The table function requests the {@code objects} + * and the {@code format}. The user must provide the list of columns.</li> + * </ul> + * + * @see {@link S3InputSource} for details on the meaning of the various properties, and the rules + * about valid combinations + */ +public class S3InputSourceDefn extends FormattedInputSourceDefn +{ + public static final String URIS_PARAMETER = "uris"; + public static final String PREFIXES_PARAMETER = "prefixes"; + public static final String BUCKET_PARAMETER = "bucket"; + public static final String PATHS_PARAMETER = "paths"; + public static final String ACCESS_KEY_ID_PARAMETER = "accessKeyId"; + public static final String SECRET_ACCESS_KEY_PARAMETER = "secretAccessKey"; + public static final String ASSUME_ROLE_ARN_PARAMETER = "assumeRoleArn"; + + /** + * The {@code objectGlob} property exists in S3, but is not documented. 
The corresponding + * function parameter also exists, but is not documented. + */ + public static final String OBJECT_GLOB_PARAMETER = "objectGlob"; + + /** + * External data table spec property that let's the user define one bucket in the catalog, Review Comment: nit: let's -> lets -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
