This is an automated email from the ASF dual-hosted git repository. tmarshall pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git
commit 0511b44f9236c655e695185b33f412ec65a80a07 Author: norbert.luksa <norbert.lu...@cloudera.com> AuthorDate: Thu Dec 19 09:32:34 2019 +0100 IMPALA-8046: Support CREATE TABLE from an ORC file Impala supports creating a table using the schema of a file. However, only Parquet is supported currently. This commit adds support for creating tables from ORC files The change relies on the ORC Java API with version 1.5 or greater, because of a bug in earlier versions. Therefore, ORC is listed as an external dependency, instead of relying on Hive's ORC version (from Hive3, Hive also lists it as a dependency). Also, the commit performs a little clean-up on the ParquetHelper class, renaming it to ParquetSchemaExtractor and removing outdated comments. To create a table from an ORC file, run: CREATE TABLE tablename LIKE ORC '/path/to/file' Tests: * Added analysis tests for primitive and complex types. * Added e2e tests for creating tables from ORC files. Change-Id: I77cd84cda2ed86516937a67eb320fd41e3f1cf2d Reviewed-on: http://gerrit.cloudera.org:8080/14811 Reviewed-by: Impala Public Jenkins <impala-public-jenk...@cloudera.com> Tested-by: Impala Public Jenkins <impala-public-jenk...@cloudera.com> --- bin/impala-config.sh | 1 + fe/pom.xml | 26 +++ .../impala/analysis/CreateTableLikeFileStmt.java | 14 +- .../apache/impala/analysis/OrcSchemaExtractor.java | 200 +++++++++++++++++++++ ...quetHelper.java => ParquetSchemaExtractor.java} | 38 ++-- .../org/apache/impala/common/FileSystemUtil.java | 8 + .../org/apache/impala/util/FileAnalysisUtil.java | 50 ++++++ .../org/apache/impala/analysis/AnalyzeDDLTest.java | 74 ++++++-- impala-parent/pom.xml | 1 + shaded-deps/pom.xml | 1 + .../QueryTest/create-table-like-file-orc.test | 89 +++++++++ .../queries/QueryTest/create-table-like-file.test | 37 ---- .../queries/QueryTest/create-table-like-table.test | 27 +++ tests/common/skip.py | 3 +- tests/metadata/test_ddl.py | 7 + 15 files changed, 495 insertions(+), 81 deletions(-) diff --git 
a/bin/impala-config.sh b/bin/impala-config.sh index 9848505..4758da4 100755 --- a/bin/impala-config.sh +++ b/bin/impala-config.sh @@ -190,6 +190,7 @@ export IMPALA_AVRO_JAVA_VERSION=1.8.2-cdh6.x-SNAPSHOT export IMPALA_LLAMA_MINIKDC_VERSION=1.0.0 export IMPALA_KITE_VERSION=1.0.0-cdh6.x-SNAPSHOT export IMPALA_KUDU_JAVA_VERSION=1.11.0-cdh6.x-SNAPSHOT +export IMPALA_ORC_JAVA_VERSION=1.6.2 # When IMPALA_(CDH_COMPONENT)_URL are overridden, they may contain '$(platform_label)' # which will be substituted for the CDH platform label in bootstrap_toolchain.py diff --git a/fe/pom.xml b/fe/pom.xml index 046851d..d75d1c2 100644 --- a/fe/pom.xml +++ b/fe/pom.xml @@ -306,6 +306,22 @@ under the License. </dependency> <dependency> + <groupId>org.apache.orc</groupId> + <artifactId>orc-core</artifactId> + <version>${orc.version}</version> + <exclusions> + <exclusion> + <groupId>org.apache.hadoop</groupId> + <artifactId>hadoop-common</artifactId> + </exclusion> + <exclusion> + <groupId>org.apache.hive</groupId> + <artifactId>*</artifactId> + </exclusion> + </exclusions> + </dependency> + + <dependency> <groupId>org.apache.hbase</groupId> <artifactId>hbase-client</artifactId> <version>${hbase.version}</version> @@ -713,6 +729,7 @@ under the License. <exclude>org.apache.kudu:*</exclude> <exclude>org.apache.sentry:*</exclude> <exclude>org.apache.parquet:*</exclude> + <exclude>org.apache.orc:*</exclude> </excludes> <includes> <!-- hadoop-yarn-common depends on some Jetty utilities. --> @@ -725,6 +742,7 @@ under the License. <include>org.apache.kudu:*:${kudu.version}</include> <include>org.apache.sentry:*:${sentry.version}</include> <include>org.apache.parquet:*:${parquet.version}</include> + <include>org.apache.orc:*:${orc.version}</include> </includes> </bannedDependencies> </rules> @@ -946,6 +964,14 @@ under the License. 
<groupId>org.apache.ant</groupId> <artifactId>*</artifactId> </exclusion> + <exclusion> + <groupId>orc</groupId> + <artifactId>*</artifactId> + </exclusion> + <exclusion> + <groupId>org.apache.orc</groupId> + <artifactId>*</artifactId> + </exclusion> </exclusions> </dependency> diff --git a/fe/src/main/java/org/apache/impala/analysis/CreateTableLikeFileStmt.java b/fe/src/main/java/org/apache/impala/analysis/CreateTableLikeFileStmt.java index 5053572..2d034b6 100644 --- a/fe/src/main/java/org/apache/impala/analysis/CreateTableLikeFileStmt.java +++ b/fe/src/main/java/org/apache/impala/analysis/CreateTableLikeFileStmt.java @@ -26,6 +26,7 @@ import org.apache.impala.catalog.HdfsCompression; import org.apache.impala.catalog.HdfsFileFormat; import org.apache.impala.common.AnalysisException; import org.apache.impala.common.Pair; +import org.apache.impala.compat.MetastoreShim; import org.apache.impala.thrift.THdfsFileFormat; @@ -71,11 +72,18 @@ public class CreateTableLikeFileStmt extends CreateTableStmt { schemaLocation_.analyze(analyzer, Privilege.ALL, FsAction.READ); switch (schemaFileFormat_) { case PARQUET: - getColumnDefs().addAll(ParquetHelper.extractParquetSchema(schemaLocation_)); + getColumnDefs().addAll(ParquetSchemaExtractor.extract(schemaLocation_)); + break; + case ORC: + if (MetastoreShim.getMajorVersion() < 3) { + throw new AnalysisException("Creating table like ORC file is unsupported for " + + "Hive with version < 3"); + } + getColumnDefs().addAll(OrcSchemaExtractor.extract(schemaLocation_)); break; default: - throw new AnalysisException("Unsupported file type for schema inference: " - + schemaFileFormat_); + throw new AnalysisException("Unsupported file type for schema inference: " + + schemaFileFormat_); } super.analyze(analyzer); } diff --git a/fe/src/main/java/org/apache/impala/analysis/OrcSchemaExtractor.java b/fe/src/main/java/org/apache/impala/analysis/OrcSchemaExtractor.java new file mode 100644 index 0000000..9515ff4 --- /dev/null +++ 
b/fe/src/main/java/org/apache/impala/analysis/OrcSchemaExtractor.java @@ -0,0 +1,200 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.impala.analysis; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.hadoop.fs.Path; +import org.apache.impala.catalog.MapType; +import org.apache.impala.catalog.ScalarType; +import org.apache.impala.catalog.StructField; +import org.apache.impala.catalog.ArrayType; +import org.apache.impala.catalog.StructType; +import org.apache.impala.catalog.Type; +import org.apache.impala.common.AnalysisException; +import org.apache.impala.common.FileSystemUtil; +import org.apache.impala.util.FileAnalysisUtil; +import org.apache.orc.OrcFile; +import org.apache.orc.OrcFile.ReaderOptions; +import org.apache.orc.Reader; +import org.apache.orc.TypeDescription; +import org.apache.orc.TypeDescription.Category; + +import com.google.common.base.Preconditions; + +/** + * Provides a helper function (extract()) which extracts the Impala schema from a given + * ORC file. 
Details of the ORC types: + * https://orc.apache.org/docs/types.html + */ +public class OrcSchemaExtractor { + private final static String ERROR_MSG = + "Failed to convert ORC type\n%s\nto an Impala %s type:\n%s\n"; + + /** + * Validates the path and loads the ORC schema of the file. The ORC schema is also an + * ORC type (TypeDescription), represented as a struct. + */ + private static TypeDescription loadOrcSchema(Path pathToFile) throws AnalysisException { + FileAnalysisUtil.CheckIfFile(pathToFile); + Reader reader = null; + try { + reader = OrcFile.createReader(pathToFile, + new ReaderOptions(FileSystemUtil.getConfiguration())); + } catch (IOException e) { + // OrcFile.createReader throws IOException in case of any failure, including trying + // to open a non-ORC file. + throw new AnalysisException("Failed to open file as an ORC file: " + e); + } + return reader.getSchema(); + } + + /** + * Converts a primitive ORC type to an Impala Type. + */ + static private Type convertPrimitiveOrcType(TypeDescription type) { + Category category = type.getCategory(); + Preconditions.checkState(category.isPrimitive()); + switch (category) { + case BINARY: return Type.STRING; + case BOOLEAN: return Type.BOOLEAN; + case BYTE: return Type.TINYINT; + case CHAR: return ScalarType.createCharType(type.getMaxLength()); + case DATE: return Type.DATE; + case DECIMAL: + return ScalarType.createDecimalType(type.getPrecision(), type.getScale()); + case DOUBLE: return Type.DOUBLE; + case FLOAT: return Type.FLOAT; + case INT: return Type.INT; + case LONG: return Type.BIGINT; + case SHORT: return Type.SMALLINT; + case STRING: return Type.STRING; + case TIMESTAMP: return Type.TIMESTAMP; + case VARCHAR: return ScalarType.createVarcharType(type.getMaxLength()); + default: + Preconditions.checkState(false, + "Unexpected ORC primitive type: " + category.getName()); + return null; + } + } + + /** + * Converts an ORC list type to an Impala array Type. 
An ORC list contains one child, + * the TypeDescription of the elements. + */ + private static ArrayType convertArray(TypeDescription listType) + throws AnalysisException { + Preconditions.checkState(listType.getChildren().size() == 1); + return new ArrayType(convertOrcType(listType.getChildren().get(0))); + } + + /** + * Converts an ORC map type to an Impala map Type. An ORC map contains two children, + * the TypeDescriptions for the keys and values. + */ + private static MapType convertMap(TypeDescription mapType) throws AnalysisException { + // ORC maps have two children, one for the keys, one for the values. + Preconditions.checkState(mapType.getChildren().size() == 2); + + TypeDescription key = mapType.getChildren().get(0); + TypeDescription value = mapType.getChildren().get(1); + + if (!key.getCategory().isPrimitive()) { + throw new AnalysisException(String.format(ERROR_MSG, mapType.toString(), "MAP", + "The key type of the MAP type must be primitive.")); + } + + return new MapType(convertOrcType(key), convertOrcType(value)); + } + + /** + * Converts an ORC struct type to an Impala struct Type. + */ + private static StructType convertStruct(TypeDescription structType) + throws AnalysisException { + List<StructField> structFields = new ArrayList<>(); + List<String> fieldNames = structType.getFieldNames(); + List<TypeDescription> subTypes = structType.getChildren(); + Preconditions.checkState(subTypes.size() == fieldNames.size()); + for (int i = 0; i < subTypes.size(); i++) { + StructField f = new StructField(fieldNames.get(i), convertOrcType(subTypes.get(i))); + structFields.add(f); + } + return new StructType(structFields); + } + + /** + * Converts a non-primitive ORC type to an Impala Type. 
+ */ + static private Type convertComplexOrcType(TypeDescription type) + throws AnalysisException { + Category category = type.getCategory(); + Preconditions.checkState(!category.isPrimitive()); + + switch (category) { + case LIST: return convertArray(type); + case MAP: return convertMap(type); + case STRUCT: return convertStruct(type); + case UNION: + throw new AnalysisException( + "Unsupported ORC type UNION for field " + category.getName()); + default: + Preconditions.checkState(false, + "Unexpected ORC primitive type: " + category.getName()); + return null; + } + } + + /** + * Converts an ORC type to an Impala Type. + */ + static private Type convertOrcType(TypeDescription type) throws AnalysisException { + if (type.getCategory().isPrimitive()) { + return convertPrimitiveOrcType(type); + } else { + return convertComplexOrcType(type); + } + } + + /** + * Parses an ORC file stored in HDFS and returns the corresponding Impala schema. + * This fails with an analysis exception if any errors occur reading the file, + * parsing the ORC schema, or if the ORC types cannot be represented in Impala. + */ + static public List<ColumnDef> extract(HdfsUri location) throws AnalysisException { + List<ColumnDef> schema = new ArrayList<>(); + TypeDescription orcSchema = loadOrcSchema(location.getPath()); // Returns a STRUCT. 
+ List<TypeDescription> subTypes = orcSchema.getChildren(); + List<String> fieldNames = orcSchema.getFieldNames(); + Preconditions.checkState(subTypes.size() == fieldNames.size()); + for (int i = 0; i < subTypes.size(); i++) { + TypeDescription orcType = subTypes.get(i); + Type type = convertOrcType(orcType); + Preconditions.checkNotNull(type); + String colName = fieldNames.get(i); + Map<ColumnDef.Option, Object> option = new HashMap<>(); + option.put(ColumnDef.Option.COMMENT, "Inferred from ORC file."); + schema.add(new ColumnDef(colName, new TypeDef(type), option)); + } + return schema; + } +} diff --git a/fe/src/main/java/org/apache/impala/analysis/ParquetHelper.java b/fe/src/main/java/org/apache/impala/analysis/ParquetSchemaExtractor.java similarity index 92% rename from fe/src/main/java/org/apache/impala/analysis/ParquetHelper.java rename to fe/src/main/java/org/apache/impala/analysis/ParquetSchemaExtractor.java index dc55a34..0e2d15a 100644 --- a/fe/src/main/java/org/apache/impala/analysis/ParquetHelper.java +++ b/fe/src/main/java/org/apache/impala/analysis/ParquetSchemaExtractor.java @@ -25,14 +25,14 @@ import java.util.List; import java.util.Map; import com.google.common.base.Preconditions; -import org.apache.hadoop.fs.FileSystem; + import org.apache.hadoop.fs.Path; +import org.apache.parquet.format.converter.ParquetMetadataConverter; import org.apache.parquet.hadoop.metadata.ParquetMetadata; import org.apache.parquet.hadoop.ParquetFileReader; import org.apache.parquet.schema.PrimitiveType; import org.apache.parquet.schema.LogicalTypeAnnotation; import org.apache.parquet.schema.LogicalTypeAnnotation.*; - import org.apache.impala.catalog.ArrayType; import org.apache.impala.catalog.MapType; import org.apache.impala.catalog.ScalarType; @@ -41,16 +41,13 @@ import org.apache.impala.catalog.StructType; import org.apache.impala.catalog.Type; import org.apache.impala.common.AnalysisException; import org.apache.impala.common.FileSystemUtil; +import 
org.apache.impala.util.FileAnalysisUtil; /** - * Provides extractParquetSchema() to extract a schema - * from a parquet file. - * - * Because Parquet's Java package changed between Parquet 1.5 - * and 1.9, a second copy of this file, with "org.apache.parquet." replaced - * with "org.apache.org.apache.parquet." is generated by the build system. + * Provides a helper function (extract()) which extracts the Impala schema from a given + * Parquet file. */ -class ParquetHelper { +class ParquetSchemaExtractor { private final static String ERROR_MSG = "Failed to convert Parquet type\n%s\nto an Impala %s type:\n%s\n"; @@ -61,21 +58,11 @@ class ParquetHelper { */ private static org.apache.parquet.schema.MessageType loadParquetSchema(Path pathToFile) throws AnalysisException { - try { - FileSystem fs = pathToFile.getFileSystem(FileSystemUtil.getConfiguration()); - if (!fs.isFile(pathToFile)) { - throw new AnalysisException("Cannot infer schema, path is not a file: " + - pathToFile); - } - } catch (IOException e) { - throw new AnalysisException("Failed to connect to filesystem:" + e); - } catch (IllegalArgumentException e) { - throw new AnalysisException(e.getMessage()); - } + FileAnalysisUtil.CheckIfFile(pathToFile); ParquetMetadata readFooter = null; try { readFooter = ParquetFileReader.readFooter(FileSystemUtil.getConfiguration(), - pathToFile); + pathToFile, ParquetMetadataConverter.NO_FILTER); } catch (FileNotFoundException e) { throw new AnalysisException("File not found: " + e); } catch (IOException e) { @@ -95,7 +82,8 @@ class ParquetHelper { * Converts a "primitive" Parquet type to an Impala type. * A primitive type is a non-nested type with no annotations. 
*/ - private static Type convertPrimitiveParquetType(org.apache.parquet.schema.Type parquetType) + private static Type convertPrimitiveParquetType( + org.apache.parquet.schema.Type parquetType) throws AnalysisException { Preconditions.checkState(parquetType.isPrimitive()); PrimitiveType prim = parquetType.asPrimitiveType(); @@ -339,9 +327,9 @@ class ParquetHelper { * This fails with an analysis exception if any errors occur reading the file, * parsing the Parquet schema, or if the Parquet types cannot be represented in Impala. */ - static List<ColumnDef> extractParquetSchema(HdfsUri location) - throws AnalysisException { - org.apache.parquet.schema.MessageType parquetSchema = loadParquetSchema(location.getPath()); + static List<ColumnDef> extract(HdfsUri location) throws AnalysisException { + org.apache.parquet.schema.MessageType parquetSchema = + loadParquetSchema(location.getPath()); List<org.apache.parquet.schema.Type> fields = parquetSchema.getFields(); List<ColumnDef> schema = new ArrayList<>(); diff --git a/fe/src/main/java/org/apache/impala/common/FileSystemUtil.java b/fe/src/main/java/org/apache/impala/common/FileSystemUtil.java index 7eccd13..f77fd55 100644 --- a/fe/src/main/java/org/apache/impala/common/FileSystemUtil.java +++ b/fe/src/main/java/org/apache/impala/common/FileSystemUtil.java @@ -599,6 +599,14 @@ public class FileSystemUtil { } /** + * Returns true if the path 'p' is a file, false if not. Throws if path does not exist. + */ + public static boolean isFile(Path p) throws IOException, FileNotFoundException { + FileSystem fs = getFileSystemForPath(p); + return fs.getFileStatus(p).isFile(); + } + + /** * Return the path of 'path' relative to the startPath. This may * differ from simply the file name in the case of recursive listings. 
*/ diff --git a/fe/src/main/java/org/apache/impala/util/FileAnalysisUtil.java b/fe/src/main/java/org/apache/impala/util/FileAnalysisUtil.java new file mode 100644 index 0000000..d4abb5e --- /dev/null +++ b/fe/src/main/java/org/apache/impala/util/FileAnalysisUtil.java @@ -0,0 +1,50 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.impala.util; + +import java.io.FileNotFoundException; +import java.io.IOException; + +import org.apache.hadoop.fs.Path; +import org.apache.impala.common.AnalysisException; +import org.apache.impala.common.FileSystemUtil; + +/** + * Provides common utilities for OrcSchemaExtractor and ParquetSchemaExtractor. + */ +public class FileAnalysisUtil { + + /** + * Throws if the given path is not a file. 
+ */ + public static void CheckIfFile(Path pathToFile) throws AnalysisException { + try { + if (!FileSystemUtil.isFile(pathToFile)) { + throw new AnalysisException("Cannot infer schema, path is not a file: " + + pathToFile); + } + } catch (FileNotFoundException e) { + throw new AnalysisException("Cannot infer schema, path does not exist: " + + pathToFile); + } catch (IOException e) { + throw new AnalysisException("Failed to connect to filesystem:" + e); + } catch (IllegalArgumentException e) { + throw new AnalysisException(e.getMessage()); + } + } +} diff --git a/fe/src/test/java/org/apache/impala/analysis/AnalyzeDDLTest.java b/fe/src/test/java/org/apache/impala/analysis/AnalyzeDDLTest.java index dff57a2..8a680b8 100644 --- a/fe/src/test/java/org/apache/impala/analysis/AnalyzeDDLTest.java +++ b/fe/src/test/java/org/apache/impala/analysis/AnalyzeDDLTest.java @@ -54,9 +54,8 @@ import org.apache.impala.thrift.TBackendGflags; import org.apache.impala.thrift.TDescribeTableParams; import org.apache.impala.thrift.TQueryOptions; import org.apache.impala.util.MetaStoreUtil; -import org.junit.AfterClass; import org.junit.Assert; -import org.junit.BeforeClass; +import org.junit.Assume; import org.junit.Test; import com.google.common.base.Joiner; @@ -1994,27 +1993,27 @@ public class AnalyzeDDLTest extends FrontendTestBase { "Database does not exist: database_DNE"); // check invalid paths - AnalysisError("create table if not exists functional.zipcode_incomes like parquet " - + "'/test-warehouse'", + AnalysisError("create table if not exists functional.zipcode_incomes like parquet " + + "'/test-warehouse'", "Cannot infer schema, path is not a file: hdfs://localhost:20500/test-warehouse"); AnalysisError("create table newtbl_DNE like parquet 'foobar'", "URI path must be absolute: foobar"); AnalysisError("create table newtbl_DNE like parquet '/not/a/file/path'", - "Cannot infer schema, path is not a file: " - + "hdfs://localhost:20500/not/a/file/path"); - AnalysisError("create 
table if not exists functional.zipcode_incomes like parquet " - + "'file:///tmp/foobar'", - "Cannot infer schema, path is not a file: file:/tmp/foobar"); + "Cannot infer schema, path does not exist: " + + "hdfs://localhost:20500/not/a/file/path"); + AnalysisError("create table if not exists functional.zipcode_incomes like parquet " + + "'file:///tmp/foobar'", + "Cannot infer schema, path does not exist: file:/tmp/foobar"); // check valid paths with bad file contents - AnalysisError("create table database_DNE.newtbl_DNE like parquet " - + "'/test-warehouse/zipcode_incomes_rc/000000_0'", - "File is not a parquet file: " - + "hdfs://localhost:20500/test-warehouse/zipcode_incomes_rc/000000_0"); + AnalysisError("create table database_DNE.newtbl_DNE like parquet " + + "'/test-warehouse/zipcode_incomes_rc/000000_0'", + "File is not a parquet file: " + + "hdfs://localhost:20500/test-warehouse/zipcode_incomes_rc/000000_0"); // this is a decimal file without annotations - AnalysisError("create table if not exists functional.zipcode_incomes like parquet " - + "'/test-warehouse/schemas/malformed_decimal_tiny.parquet'", + AnalysisError("create table if not exists functional.zipcode_incomes like parquet " + + "'/test-warehouse/schemas/malformed_decimal_tiny.parquet'", "Unsupported parquet type FIXED_LEN_BYTE_ARRAY for field c1"); // Invalid file format @@ -2024,6 +2023,51 @@ public class AnalyzeDDLTest extends FrontendTestBase { BackendConfig.INSTANCE.setZOrderSortUnlocked(false); + + } + + @Test + public void TestCreateTableLikeFileOrc() throws AnalysisException { + Assume.assumeTrue( + "Skipping this test; CREATE TABLE LIKE ORC is only supported when running " + + "against Hive-3 or greater", TestUtils.getHiveMajorVersion() >= 3); + + AnalysisError("create table database_DNE.newtbl_DNE like ORC " + + "'/test-warehouse/schemas/alltypestiny.parquet'", + "Failed to open file as an ORC file: org.apache.orc.FileFormatException: " + + "Malformed ORC file " + + 
"hdfs://localhost:20500/test-warehouse/schemas/alltypestiny.parquet" + + ". Invalid postscript."); + + // Inferring primitive and complex types + AnalyzesOk("create table if not exists newtbl_DNE like orc " + + "'/test-warehouse/alltypestiny_orc_def/year=2009/month=1/000000_0'"); + AnalyzesOk("create table if not exists newtbl_DNE like orc " + + "'/test-warehouse/functional_orc_def.db/complextypes_fileformat/000000_0'"); + + // check invalid paths + AnalysisError("create table if not exists functional.zipcode_incomes like ORC " + + "'/test-warehouse'", + "Cannot infer schema, path is not a file: hdfs://localhost:20500/test-warehouse"); + AnalysisError("create table newtbl_DNE like ORC 'foobar'", + "URI path must be absolute: foobar"); + AnalysisError("create table newtbl_DNE like ORC '/not/a/file/path'", + "Cannot infer schema, path does not exist: " + + "hdfs://localhost:20500/not/a/file/path"); + AnalysisError("create table if not exists functional.zipcode_incomes like ORC " + + "'file:///tmp/foobar'", + "Cannot infer schema, path does not exist: file:/tmp/foobar"); + } + + @Test + public void TestCreateTableLikeFileOrcWithHive2() throws AnalysisException { + // Testing if error is thrown when trying to create table like orc file with Hive-2. + Assume.assumeTrue(TestUtils.getHiveMajorVersion() < 3); + + // Inferring primitive and complex types + AnalysisError("create table if not exists newtbl_DNE like orc " + + "'/test-warehouse/alltypestiny_orc_def/year=2009/month=1/000000_0'", + "Creating table like ORC file is unsupported for Hive with version < 3"); } @Test diff --git a/impala-parent/pom.xml b/impala-parent/pom.xml index cbb6c7b..f8cc667 100644 --- a/impala-parent/pom.xml +++ b/impala-parent/pom.xml @@ -38,6 +38,7 @@ under the License. 
<postgres.jdbc.version>${env.IMPALA_POSTGRES_JDBC_DRIVER_VERSION}</postgres.jdbc.version> <sentry.version>${env.IMPALA_SENTRY_VERSION}</sentry.version> <hbase.version>${env.IMPALA_HBASE_VERSION}</hbase.version> + <orc.version>${env.IMPALA_ORC_JAVA_VERSION}</orc.version> <parquet.version>${env.IMPALA_PARQUET_VERSION}</parquet.version> <kite.version>${env.IMPALA_KITE_VERSION}</kite.version> <knox.version>${env.IMPALA_KNOX_VERSION}</knox.version> diff --git a/shaded-deps/pom.xml b/shaded-deps/pom.xml index 5870894..eefd73b 100644 --- a/shaded-deps/pom.xml +++ b/shaded-deps/pom.xml @@ -94,6 +94,7 @@ the same dependencies <include>org/apache/hadoop/hive/serde2/**</include> <include>org/apache/hive/service/rpc/thrift/**</include> <include>org/apache/hive/common/HiveVersionAnnotation.class</include> + <include>org/apache/orc/**</include> <include>com/google/**</include> </includes> </filter> diff --git a/testdata/workloads/functional-query/queries/QueryTest/create-table-like-file-orc.test b/testdata/workloads/functional-query/queries/QueryTest/create-table-like-file-orc.test new file mode 100644 index 0000000..71901ca --- /dev/null +++ b/testdata/workloads/functional-query/queries/QueryTest/create-table-like-file-orc.test @@ -0,0 +1,89 @@ +==== +---- QUERY +create table $DATABASE.temp_decimal_table_orc like ORC +'$FILESYSTEM_PREFIX/test-warehouse/decimal_tiny_orc_def/000000_0' +---- RESULTS +'Table has been created.' +==== +---- QUERY +describe $DATABASE.temp_decimal_table_orc +---- RESULTS +'c1','decimal(10,4)','Inferred from ORC file.' +'c2','decimal(15,5)','Inferred from ORC file.' +'c3','decimal(1,1)','Inferred from ORC file.' +---- TYPES +STRING, STRING, STRING +==== +---- QUERY +create table $DATABASE.temp_chars_table like ORC +'$FILESYSTEM_PREFIX/test-warehouse/chars_tiny_orc_def/000000_0' +---- RESULTS +'Table has been created.' +==== +---- QUERY +describe $DATABASE.temp_chars_table +---- RESULTS +'cs','char(5)','Inferred from ORC file.' 
+'cl','char(140)','Inferred from ORC file.' +'vc','varchar(32)','Inferred from ORC file.' +---- TYPES +STRING, STRING, STRING +==== +---- QUERY +create table $DATABASE.like_zipcodes_file_orc like ORC +'$FILESYSTEM_PREFIX/test-warehouse/zipcode_incomes_orc_def/000000_0' +---- RESULTS +'Table has been created.' +==== +---- QUERY +describe $DATABASE.like_zipcodes_file_orc +---- RESULTS +'id','string','Inferred from ORC file.' +'zip','string','Inferred from ORC file.' +'description1','string','Inferred from ORC file.' +'description2','string','Inferred from ORC file.' +'income','int','Inferred from ORC file.' +---- TYPES +STRING, STRING, STRING +==== +---- QUERY +create table $DATABASE.like_alltypestiny_file_orc like ORC +'$FILESYSTEM_PREFIX/test-warehouse/alltypestiny_orc_def/year=2009/month=1/000000_0' +---- RESULTS +'Table has been created.' +==== +---- QUERY +describe $DATABASE.like_alltypestiny_file_orc +---- RESULTS +'id','int','Inferred from ORC file.' +'bool_col','boolean','Inferred from ORC file.' +'tinyint_col','tinyint','Inferred from ORC file.' +'smallint_col','smallint','Inferred from ORC file.' +'int_col','int','Inferred from ORC file.' +'bigint_col','bigint','Inferred from ORC file.' +'float_col','float','Inferred from ORC file.' +'double_col','double','Inferred from ORC file.' +'date_string_col','string','Inferred from ORC file.' +'string_col','string','Inferred from ORC file.' +'timestamp_col','timestamp','Inferred from ORC file.' +---- TYPES +STRING, STRING, STRING +==== +---- QUERY +create table allcomplextypes_clone_orc like ORC +'$FILESYSTEM_PREFIX/test-warehouse/complextypestbl_orc_def/nullable.orc' +---- RESULTS +'Table has been created.' +==== +---- QUERY +describe allcomplextypes_clone_orc +---- RESULTS +'id','bigint','Inferred from ORC file.' +'int_array','array<int>','Inferred from ORC file.' +'int_array_array','array<array<int>>','Inferred from ORC file.' +'int_map','map<string,int>','Inferred from ORC file.' 
+'int_map_array','array<map<string,int>>','Inferred from ORC file.' +'nested_struct','struct<\n a:int,\n b:array<int>,\n c:struct<\n d:array<array<struct<\n e:int,\n f:string\n >>>\n >,\n g:map<string,struct<\n h:struct<\n i:array<double>\n >\n >>\n>','Inferred from ORC file.' +---- TYPES +STRING, STRING, STRING +==== diff --git a/testdata/workloads/functional-query/queries/QueryTest/create-table-like-file.test b/testdata/workloads/functional-query/queries/QueryTest/create-table-like-file.test index 7a80602..fd81aee 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/create-table-like-file.test +++ b/testdata/workloads/functional-query/queries/QueryTest/create-table-like-file.test @@ -106,43 +106,6 @@ describe $DATABASE.like_alltypestiny_file STRING, STRING, STRING ==== ---- QUERY -drop table if exists allcomplextypes_clone ----- RESULTS -'Table does not exist.' -==== ----- QUERY -create table allcomplextypes_clone like functional.allcomplextypes -stored as parquet ----- RESULTS -'Table has been created.' 
-==== ----- QUERY -describe allcomplextypes_clone ----- RESULTS -'id','int','' -'int_array_col','array<int>','' -'array_array_col','array<array<int>>','' -'map_array_col','array<map<string,int>>','' -'struct_array_col','array<struct<\n f1:bigint,\n f2:string\n>>','' -'int_map_col','map<string,int>','' -'array_map_col','map<string,array<int>>','' -'map_map_col','map<string,map<string,int>>','' -'struct_map_col','map<string,struct<\n f1:bigint,\n f2:string\n>>','' -'int_struct_col','struct<\n f1:int,\n f2:int\n>','' -'complex_struct_col','struct<\n f1:int,\n f2:array<int>,\n f3:map<string,int>\n>','' -'nested_struct_col','struct<\n f1:int,\n f2:struct<\n f11:bigint,\n f12:struct<\n f21:bigint\n >\n >\n>','' -'complex_nested_struct_col','struct<\n f1:int,\n f2:array<struct<\n f11:bigint,\n f12:map<string,struct<\n f21:bigint\n >>\n >>\n>','' -'year','int','' -'month','int','' ----- TYPES -STRING, STRING, STRING -==== ----- QUERY -drop table allcomplextypes_clone ----- RESULTS -'Table has been dropped.' -==== ----- QUERY drop table if exists $DATABASE.temp_legacy_table ---- RESULTS 'Table does not exist.' diff --git a/testdata/workloads/functional-query/queries/QueryTest/create-table-like-table.test b/testdata/workloads/functional-query/queries/QueryTest/create-table-like-table.test index ee16c37..456f499 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/create-table-like-table.test +++ b/testdata/workloads/functional-query/queries/QueryTest/create-table-like-table.test @@ -251,3 +251,30 @@ describe formatted sortbytest_override; ---- TYPES STRING,STRING,STRING ==== +---- QUERY +create table allcomplextypes_clone like functional.allcomplextypes +stored as parquet +---- RESULTS +'Table has been created.' 
+==== +---- QUERY +describe allcomplextypes_clone +---- RESULTS +'id','int','' +'int_array_col','array<int>','' +'array_array_col','array<array<int>>','' +'map_array_col','array<map<string,int>>','' +'struct_array_col','array<struct<\n f1:bigint,\n f2:string\n>>','' +'int_map_col','map<string,int>','' +'array_map_col','map<string,array<int>>','' +'map_map_col','map<string,map<string,int>>','' +'struct_map_col','map<string,struct<\n f1:bigint,\n f2:string\n>>','' +'int_struct_col','struct<\n f1:int,\n f2:int\n>','' +'complex_struct_col','struct<\n f1:int,\n f2:array<int>,\n f3:map<string,int>\n>','' +'nested_struct_col','struct<\n f1:int,\n f2:struct<\n f11:bigint,\n f12:struct<\n f21:bigint\n >\n >\n>','' +'complex_nested_struct_col','struct<\n f1:int,\n f2:array<struct<\n f11:bigint,\n f12:map<string,struct<\n f21:bigint\n >>\n >>\n>','' +'year','int','' +'month','int','' +---- TYPES +STRING, STRING, STRING +==== \ No newline at end of file diff --git a/tests/common/skip.py b/tests/common/skip.py index 2ab4250..3729649 100644 --- a/tests/common/skip.py +++ b/tests/common/skip.py @@ -227,7 +227,8 @@ class SkipIfHive2: create_external_kudu_table = pytest.mark.skipif(HIVE_MAJOR_VERSION == 2, reason="Hive 2 does not support creating external.table.purge Kudu tables." 
" See IMPALA-9092 for details.") - + orc = pytest.mark.skipif(HIVE_MAJOR_VERSION <= 2, + reason="CREATE TABLE LIKE ORC is only supported with Hive version >= 3") class SkipIfCatalogV2: """Expose decorators as methods so that is_catalog_v2_cluster() can be evaluated lazily diff --git a/tests/metadata/test_ddl.py b/tests/metadata/test_ddl.py index dbb82f9..25f7032 100644 --- a/tests/metadata/test_ddl.py +++ b/tests/metadata/test_ddl.py @@ -294,6 +294,13 @@ class TestDdlStatements(TestDdlBase): self.run_test_case('QueryTest/create-table-like-file', vector, use_db=unique_database, multiple_impalad=self._use_multiple_impalad(vector)) + @SkipIfHive2.orc + @UniqueDatabase.parametrize(sync_ddl=True) + def test_create_table_like_file_orc(self, vector, unique_database): + vector.get_value('exec_option')['abort_on_error'] = False + self.run_test_case('QueryTest/create-table-like-file-orc', vector, + use_db=unique_database, multiple_impalad=self._use_multiple_impalad(vector)) + @UniqueDatabase.parametrize(sync_ddl=True) def test_create_table_as_select(self, vector, unique_database): vector.get_value('exec_option')['abort_on_error'] = False