zachjsh commented on code in PR #13627: URL: https://github.com/apache/druid/pull/13627#discussion_r1067559789
########## server/src/main/java/org/apache/druid/catalog/model/table/HttpInputSourceDefn.java: ########## @@ -0,0 +1,304 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.catalog.model.table; + +import com.google.common.collect.ImmutableMap; +import org.apache.druid.catalog.model.CatalogUtils; +import org.apache.druid.catalog.model.ColumnSpec; +import org.apache.druid.catalog.model.table.BaseTableFunction.Parameter; +import org.apache.druid.catalog.model.table.TableFunction.ParameterDefn; +import org.apache.druid.catalog.model.table.TableFunction.ParameterType; +import org.apache.druid.data.input.InputSource; +import org.apache.druid.data.input.impl.HttpInputSource; +import org.apache.druid.java.util.common.IAE; +import org.apache.druid.java.util.common.ISE; +import org.apache.druid.metadata.DefaultPasswordProvider; +import org.apache.druid.metadata.EnvironmentVariablePasswordProvider; +import org.apache.druid.utils.CollectionUtils; + +import java.net.URI; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import 
java.util.stream.Collectors; + +/** + * Definition of an HTTP input source. + * <p> + * Provides a parameterized form where the user defines a value for the + * {@code uriTemplate} table property in the table spec, then provides the partial URLs + * in a table function to use for that one query. The final URIs are created by combining + * the template and the arguments. Example: + * <ul> + * <li>{@code uriTemplate} property: "http://example.com/data/kttm-{}.json"</li> + * <li>{@code uris} function argument: "22-Nov-21, 22-Nov-22"</li> + * </ul> + * <p> + * When the template is used, the format is optional: it can be provided either with + * the table spec or at runtime, depending on what the user wants to accomplish. In the + * above, where the ".json" is encoded in the template, it makes sense to include the format + * with the spec. If the template was "http://example.com/data/{}", and the data comes in + * multiple formats, it might make sense to specify the format in the query. In this case, + * the table spec acts more like a connection. + * <p> + * If the template is not used, then the {@code uris} property must be provided in the + * table spec, along with the corresponding format. + * <p> + * The above semantics make a bit more sense when we realize that the spec can also + * provide a user name and password. When those are provided, then the input source must + * name a single site: the one for which the credentials are valid. Given this, the only + * table spec that makes sense is one where the URI is defined: either as a template or + * explicitly. + * <p> + * When used as an ad-hoc function, the user specifies the uris and optional user name + * and password: the template is not available (or useful) in the ad-hoc case. + * <p> + * Table function parameters are cleaned up relative to the input source field names to + * make them a bit easier to use. 
+ */ +public class HttpInputSourceDefn extends FormattedInputSourceDefn +{ + public static final String TYPE_KEY = HttpInputSource.TYPE_KEY; + + // Catalog properties that map to fields in the HttpInputSource. See + // that class for the meaning of these properties. + + public static final String URI_TEMPLATE_PROPERTY = "uriTemplate"; + + public static final String URIS_PARAMETER = "uris"; + + // Note, cannot be the simpler "user" since USER is a reserved word in SQL + // and we don't want to require users to quote "user" each time it is used. + public static final String USER_PARAMETER = "userName"; + public static final String PASSWORD_PARAMETER = "password"; + public static final String PASSWORD_ENV_VAR_PARAMETER = "passwordEnvVar"; + + private static final List<ParameterDefn> URI_PARAMS = Arrays.asList( + new Parameter(URIS_PARAMETER, ParameterType.VARCHAR_ARRAY, true) + ); + + private static final List<ParameterDefn> USER_PWD_PARAMS = Arrays.asList( + new Parameter(USER_PARAMETER, ParameterType.VARCHAR, true), + new Parameter(PASSWORD_PARAMETER, ParameterType.VARCHAR, true), + new Parameter(PASSWORD_ENV_VAR_PARAMETER, ParameterType.VARCHAR, true) + ); + + // Field names in the HttpInputSource + private static final String URIS_FIELD = "uris"; + private static final String PASSWORD_FIELD = "httpAuthenticationPassword"; + private static final String USERNAME_FIELD = "httpAuthenticationUsername"; + + @Override + public String typeValue() + { + return TYPE_KEY; + } + + @Override + protected Class<? 
extends InputSource> inputSourceClass() + { + return HttpInputSource.class; + } + + @Override + public void validate(ResolvedExternalTable table) + { + final Map<String, Object> sourceMap = table.inputSourceMap; + final boolean hasUri = sourceMap.containsKey(URIS_FIELD); + final String uriTemplate = table.resolvedTable().stringProperty(URI_TEMPLATE_PROPERTY); + final boolean hasTemplate = uriTemplate != null; + final boolean hasFormat = table.inputFormatMap != null; + final boolean hasColumns = !CollectionUtils.isNullOrEmpty(table.resolvedTable().spec().columns()); + + if (!hasUri && !hasTemplate) { + throw new IAE( + "External HTTP tables must provide either a URI or a %s property", + URI_TEMPLATE_PROPERTY + ); + } + if (hasUri && hasTemplate) { + throw new IAE( + "External HTTP tables must provide only one of a URI or a %s property", + URI_TEMPLATE_PROPERTY + ); + } + if (hasUri && !hasFormat) { + throw new IAE( + "An external HTTP table with a URI must also provide the corresponding format" + ); + } + if (hasUri && !hasColumns) { + throw new IAE( + "An external HTTP table with a URI must also provide the corresponding columns" + ); + } + if (hasTemplate) { + + // Verify the template + templateMatcher(uriTemplate); + + // Patch in a dummy URI so that validation of the rest of the fields + // will pass. + try { + sourceMap.put( + URIS_FIELD, + Collections.singletonList(new URI("https://bogus.com/file")) Review Comment: Was this just added for debugging? Should remove? ########## server/src/main/java/org/apache/druid/catalog/model/table/ExternalTableSpec.java: ########## @@ -28,13 +28,13 @@ /** * Catalog form of an external table specification used to pass along the three * components needed for an external table in MSQ ingest. Just like - * {@code ExternalTableSource}, except that the parameters are not required - * to be non-null. + * {@code ExternalTableSource}, except that the class is visible in this + * module. 
*/ public class ExternalTableSpec { - @Nullable public final InputSource inputSource; - @Nullable public final InputFormat inputFormat; + public final InputSource inputSource; Review Comment: Are these fields guaranteed to be non-null now? ########## server/src/main/java/org/apache/druid/catalog/model/table/ExternalTableDefn.java: ########## @@ -19,254 +19,291 @@ package org.apache.druid.catalog.model.table; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.google.common.collect.ImmutableMap; -import com.google.common.collect.ImmutableSet; -import org.apache.druid.catalog.model.CatalogUtils; -import org.apache.druid.catalog.model.ColumnDefn; +import com.fasterxml.jackson.core.type.TypeReference; +import com.google.common.annotations.VisibleForTesting; import org.apache.druid.catalog.model.ColumnSpec; -import org.apache.druid.catalog.model.Columns; -import org.apache.druid.catalog.model.ModelProperties; +import org.apache.druid.catalog.model.ModelProperties.ObjectPropertyDefn; import org.apache.druid.catalog.model.ModelProperties.PropertyDefn; -import org.apache.druid.catalog.model.ParameterizedDefn; -import org.apache.druid.catalog.model.PropertyAttributes; import org.apache.druid.catalog.model.ResolvedTable; import org.apache.druid.catalog.model.TableDefn; -import org.apache.druid.catalog.model.table.InputFormats.InputFormatDefn; +import org.apache.druid.catalog.model.TableDefnRegistry; import org.apache.druid.data.input.InputFormat; import org.apache.druid.data.input.InputSource; -import org.apache.druid.java.util.common.IAE; -import org.apache.druid.java.util.common.ISE; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; +import java.util.Arrays; import java.util.Map; -import java.util.Set; -import java.util.stream.Collectors; /** - * Definition of an external input source, primarily for ingestion. + * Definition of an external table, primarily for ingestion. 
* The components are derived from those for Druid ingestion: an * input source, a format and a set of columns. Also provides * properties, as do all table definitions. + * + * <h4>Partial Tables and Connections</h4> + * + * An input source is a template for an external table. The input + * source says how to get data, and optionally the format and structure of that data. + * Since Druid never ingests the same data twice, the actual external table needs + * details that says which data to read on any specific ingestion. Thus, an external + * table is usually a "partial table": all the information that remains constant + * across ingestions, but without the information that changes. The changing + * information is typically the list of files (or objects or URLs) to ingest. + * <p> + * The pattern is:<br> + * {@code external table spec + parameters --> external table} + * <p> + * Since an input source is a parameterized (partial) external table, we can reuse + * the table metadata structures and APIs, avoiding the need to have a separate (but + * otherwise identical) structure for external tables. + * + * An external table can be thought of as a "connection", though Druid does not + * use that term. When used as a connection, the external table spec will omit the + * format. Instead, the format will also be provided at ingest time, along with the + * list of tables (or objects.) + * <p> + * To keep all this straight, we adopt the following terms: + * <dl> + * <dt>External table spec</dt> + * <dd>The JSON serialized version of an external table which can be partial or + * complete. The spec is a named entry in the Druid catalog</dd> + * <dt>Complete spec</dt> + * <dd>An external table spec that provides all information needed to access an + * external table. Each use identifies the same set of data. Useful if MSQ is used + * to query an external data source. 
A complete spec can be referenced as a + * first-class table in a {@code FROM} clause in an MSQ query.</dd> + * <dt>Partial spec</dt> + * <dd>An external table spec that omits some information. That information must + * be provided at query time in the form of a {@code TABLE} function. If the partial + * spec includes a format, then it is essentially a <i>partial table</i>. If it + * omits the format, then it is essentially a <i>connection</i>.</dd> + * <dt>Completed table</dt> + * <dd>The full external table that results from a partial spec and a set of SQL + * table function parameters.</dd> + * <dt>Ad-hoc table</dt> + * <dd>Users can define an external table using the generic {@code EXTERN} function + * or one of the input-source-specific functions. In this case, there is no + * catalog entry: all information comes from the SQL table function</dd> + * <dt>Partial table function</dt> + * <dd>The SQL table function used to "complete" a partial spec. The function + * defines parameters to fill in the missing information. The function is generated + * on demand and has the same name as the catalog entry for the partial spec. + * The function will include parameters for format if the catalog spec does not + * specify a format. Else, the format parameters are omitted and the completed + * table uses the format provided in the catalog spec.</dd> + * <dt>Ad-hoc table function</dt> + * <dd>The SQL table function used to create an ad-hoc external table. The function + * has a name defined by the {@link InputFormatDefn}, and has parameters for all + * supported formats: the user must specify all input source and format properties.</dd> + * </dl> + * + * <h4>External Table Structure</h4> + * + * The external table is generic: it represents all valid combinations of input + * sources and formats. 
Rather than have a different table definition for each, we + * instead split out input sources and formats into their own definitions, and those + * definitions are integrated and used by this external table class. As a result, + * the {@code properties} field will contain the {@code source} property which has + * the JSON serialized form of the input source (minus items to be parameterized.) + * <p> + * Similarly, if the external table also defines a format (rather than requiring the + * format at ingest time), then the {@code format} property holds the JSON-serialized + * form of the input format, minus columns. The columns can be provided in the spec, + * in the {@code columns} field. The {@link InputFormatDefn} converts the columns to + * the form needed by the input format. + * <p> + * Druid's input sources all require formats. However, some sources may not actually + * need the format. A JDBC input source for example, needs no format. In other cases, + * there may be a subset of formats. Each {@link InputSourceDefn} is responsible for + * working out which formats (if any) are required. This class is agnostic about whether + * the format is supplied. (Remember that, when used as a connection, the external table + * will provide no format until ingest time.) + * <p> + * By contrast, the input source is always required. + * + * <h4>Data Formats and Conversions</h4> + * + * Much of the code here handles conversion of an external table specification to + * the form needed by SQL. Since SQL is not visible here, we instead create an + * instance of {@link ExternalTableSpec} which holds the input source, input format + * and row signature in the form required by SQL. + * <p> + * This class handles table specifications in three forms: + * <ol> + * <li>From a fully-defined table specification, converted to a {@code ExternalTableSpec} + * by the {@link #convert(ResolvedTable)} function.</li> + * <li>From a fully-defined set of arguments to a SQL table function. 
The + * {@link InputSourceDefn#adHocTableFn()} method provides the function definition which + * handles the conversion.</li> + * <li>From a partially-defined table specification in the catalog, augmented by + * parameters passed from a SQL function. The {@link #tableFn(ResolvedTable)} method + * creates the required function by caching the table spec. That function then combines + * the parameters to produce the required {@code ExternalTableSpec}.</li> + * </ol> + * <p> + * To handle these formats, and the need to adjust JSON, conversion to an + * {@code ExternalTableSpec} occurs in multiple steps: + * <ul> + * <li>When using a table spec, the serialized JSON is first converted to a generic + * Java map: one for the input source, another for the format.</li> + * <li>When using a SQL function, the SQL arguments are converted (if needed) and + * written into a Java map. If the function references an existing table spec: then + * the JSON map is first populated with the deserialized spec.</li> + * <li>Validation and/or adjustments are made to the Java map. Adjustments are + * those described elsewhere in this Javadoc.</li> + * <li>The column specifications from either SQL or the table spec are converted to + * a list of column names, and placed into the Java map for the input format.</li> + * <li>The maps are converted to the {@link InputSource} or {@link InputFormat} + * objects using a Jackson conversion.</li> + * </ul> + * The actual conversions are handled in the {@link InputFormatDefn} and + * {@link InputFormatDefn} classes, either directly (for a fully-defined table + * function) or starting here (for other use cases). + * + * <h4>Property and Parameter Names</h4> + * + * Pay careful attention to names: the names may be different in each of the above + * cases: + * <ul> + * <li>The table specification stores the input source and input format specs using + * the names defined by the classes themselves. 
That is, the table spec holds a string + * that represents the Jackson-serialized form of those classes. In some cases, the + * JSON can be a subset: some sources and formats have obscure checks, or options which + * are not available via this path. The code that does conversions will adjust the JSON + * prior to conversion. Each JSON object has a type field: the value of that type + * field must match that defined in the Jackson annotations for the corresponding + * class.</li> + * <li>SQL table functions use argument names that are typically selected for user + * convenience, and may not be the same as the JSON field name. For example, a field + * name may be a SQL reserved word, or may be overly long, or may be obscure. The code + * for each input source and input format definition does the needed conversion.</li> + * <li>Each input source and input format has a type. The input format type is given, + * in SQL by the {@code format} property. The format type name is typically the same + * as the JSON type name, but need not be.</li> + * </ul> + * + * <h4>Extensions</h4> + * + * This class is designed to work both with "well known" Druid input sources and formats, + * and those defined in an extension. For extension-defined sources and formats to work, + * the extension must define an {@link InputSourceDefn} or {@link InputFormatDefn} which + * are put into the {@link TableDefnRegistry} and thus available to this class. The result + * is that this class is ignorant of the actual details of sources and formats: it instead + * delegates to the input source and input format definitions for that work. * <p> - * The external table implements the mechanism for parameterized tables, - * but does not implement the {@link ParameterizedDefn} interface itself. - * Tables which are parameterized implement that interface to expose - * methods defined here. 
+ * Input sources and input formats defined in an extension are considered "ephemeral": + * they can go away if the corresponding extension is removed from the system. In that + * case, any table functions defined by those extensions are no longer available, and + * any SQL statements that use those functions will no longer work. The catalog may contain + * an external table spec that references those definitions. Such specs will continue to + * reside in the catalog, and can be retrieved, but they will fail any query that attempts + * to reference them. */ -public abstract class ExternalTableDefn extends TableDefn +public class ExternalTableDefn extends TableDefn { - public static final String EXTERNAL_COLUMN_TYPE = "extern"; - - public abstract static class FormattedExternalTableDefn extends ExternalTableDefn - { - public static final String FORMAT_PROPERTY = "format"; - - private final Map<String, InputFormatDefn> formats; - - public FormattedExternalTableDefn( - final String name, - final String typeValue, - final List<PropertyDefn<?>> properties, - final List<ColumnDefn> columnDefns, - final List<InputFormatDefn> formats - ) - { - super( - name, - typeValue, - addFormatProperties(properties, formats), - columnDefns - ); - ImmutableMap.Builder<String, InputFormatDefn> builder = ImmutableMap.builder(); - for (InputFormatDefn format : formats) { - builder.put(format.typeTag(), format); - } - this.formats = builder.build(); - } - - /** - * Add format properties to the base set, in the order of the formats, - * in the order defined by the format. Allow same-named properties across - * formats, as long as the types are the same. 
- */ - private static List<PropertyDefn<?>> addFormatProperties( - final List<PropertyDefn<?>> properties, - final List<InputFormatDefn> formats - ) - { - List<PropertyDefn<?>> toAdd = new ArrayList<>(); - PropertyDefn<?> formatProp = new ModelProperties.StringPropertyDefn(FORMAT_PROPERTY, PropertyAttributes.SQL_FN_PARAM); - toAdd.add(formatProp); - Map<String, PropertyDefn<?>> formatProps = new HashMap<>(); - for (InputFormatDefn format : formats) { - for (PropertyDefn<?> prop : format.properties()) { - PropertyDefn<?> existing = formatProps.putIfAbsent(prop.name(), prop); - if (existing == null) { - toAdd.add(prop); - } else if (existing.getClass() != prop.getClass()) { - throw new ISE( - "Format %s, property %s of class %s conflicts with another format property of class %s", - format.name(), - prop.name(), - prop.getClass().getSimpleName(), - existing.getClass().getSimpleName() - ); - } - } - } - return CatalogUtils.concatLists(properties, toAdd); - } - - @Override - protected InputFormat convertFormat(ResolvedTable table) - { - return formatDefn(table).convert(table); - } + /** + * Identifier for external tables. + */ + public static final String TABLE_TYPE = "extern"; - protected InputFormatDefn formatDefn(ResolvedTable table) - { - String formatTag = table.stringProperty(FORMAT_PROPERTY); - if (formatTag == null) { - throw new IAE("%s property must be set", FORMAT_PROPERTY); - } - InputFormatDefn formatDefn = formats.get(formatTag); - if (formatDefn == null) { - throw new IAE( - "Format type [%s] for property %s is not valid", - formatTag, - FORMAT_PROPERTY - ); - } - return formatDefn; - } + /** + * Column type for external tables. + */ + public static final String EXTERNAL_COLUMN_TYPE = "extern"; Review Comment: What is this used for? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
