This is an automated email from the ASF dual-hosted git repository. boroknagyz pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git
commit 46e03392c38299731d937898c4177b7baedc8f7d Author: Fredy Wijaya <fwij...@cloudera.com> AuthorDate: Sat Feb 23 12:56:10 2019 -0800 IMPALA-7645: Add a query option to set the default table file format This patch adds a query option "DEFAULT_FILE_FORMAT" to allow setting the default table file format. Below is the list of supported "DEFAULT_FILE_FORMAT" query option values: - TEXT - RC_FILE - SEQUENCE_FILE - AVRO - PARQUET - KUDU - ORC For backward compatibility, the default table file format remains as TEXT. Testing: - Ran all FE tests - Added BE test - Added E2E tests - Ran test_ddl.py and test_set.py Change-Id: Ic857c38076d973ad749a41fecd1b470c7881db5e Reviewed-on: http://gerrit.cloudera.org:8080/12568 Reviewed-by: Impala Public Jenkins <impala-public-jenk...@cloudera.com> Tested-by: Impala Public Jenkins <impala-public-jenk...@cloudera.com> --- be/src/service/query-options-test.cc | 2 ++ be/src/service/query-options.cc | 22 ++++++++++++++ be/src/service/query-options.h | 5 ++-- common/thrift/ImpalaInternalService.thrift | 4 +++ common/thrift/ImpalaService.thrift | 3 ++ fe/src/main/cup/sql-parser.cup | 5 ++-- .../java/org/apache/impala/analysis/TableDef.java | 18 ++++++++---- .../functional-query/queries/QueryTest/set.test | 8 +++++ tests/metadata/test_ddl.py | 34 ++++++++++++++++++++++ 9 files changed, 92 insertions(+), 9 deletions(-) diff --git a/be/src/service/query-options-test.cc b/be/src/service/query-options-test.cc index b18b58e..1be2b81 100644 --- a/be/src/service/query-options-test.cc +++ b/be/src/service/query-options-test.cc @@ -211,6 +211,8 @@ TEST(QueryOptions, SetEnumOptions) { (THREE_LEVEL, TWO_LEVEL, TWO_LEVEL_THEN_THREE_LEVEL)), true); TestEnumCase(options, CASE(compression_codec, THdfsCompression, (NONE, GZIP, BZIP2, DEFAULT, SNAPPY, SNAPPY_BLOCKED)), false); + TestEnumCase(options, CASE(default_file_format, THdfsFileFormat, + (TEXT, RC_FILE, SEQUENCE_FILE, AVRO, PARQUET, KUDU, ORC)), true); #undef CASE #undef ENTRIES #undef ENTRY diff --git a/be/src/service/query-options.cc b/be/src/service/query-options.cc index 5720eac..9618d18 100644 --- a/be/src/service/query-options.cc +++ b/be/src/service/query-options.cc @@ -765,6 +765,28 @@ Status impala::SetQueryOption(const string& key, const string& value, query_options->__set_num_rows_produced_limit(num_rows_produced_limit); break; } + case TImpalaQueryOptions::DEFAULT_FILE_FORMAT: { + if (iequals(value, "TEXT") || iequals(value, "0")) { + query_options->__set_default_file_format(THdfsFileFormat::TEXT); + } else if (iequals(value, "RC_FILE") || iequals(value, "1")) { + query_options->__set_default_file_format(THdfsFileFormat::RC_FILE); + } else if (iequals(value, "SEQUENCE_FILE") || iequals(value, "2")) { + query_options->__set_default_file_format(THdfsFileFormat::SEQUENCE_FILE); + } else if (iequals(value, "AVRO") || iequals(value, "3")) { + query_options->__set_default_file_format(THdfsFileFormat::AVRO); + } else if (iequals(value, "PARQUET") || iequals(value, "4")) { + query_options->__set_default_file_format(THdfsFileFormat::PARQUET); + } else if (iequals(value, "KUDU") || iequals(value, "5")) { + query_options->__set_default_file_format(THdfsFileFormat::KUDU); + } else if (iequals(value, "ORC") || iequals(value, "6")) { + query_options->__set_default_file_format(THdfsFileFormat::ORC); + } else { + return Status(Substitute("Invalid default_file_format '$0'. Valid values are " + "TEXT, RC_FILE, SEQUENCE_FILE, AVRO, PARQUET, KUDU " + "and ORC.", value)); + } + break; + } default: if (IsRemovedQueryOption(key)) { LOG(WARNING) << "Ignoring attempt to set removed query option '" << key << "'"; diff --git a/be/src/service/query-options.h b/be/src/service/query-options.h index 7fee4d1..4ead0e5 100644 --- a/be/src/service/query-options.h +++ b/be/src/service/query-options.h @@ -41,7 +41,7 @@ typedef std::unordered_map<string, beeswax::TQueryOptionLevel::type> // the DCHECK. #define QUERY_OPTS_TABLE\ DCHECK_EQ(_TImpalaQueryOptions_VALUES_TO_NAMES.size(),\ - TImpalaQueryOptions::PLANNER_TESTCASE_MODE + 1);\ + TImpalaQueryOptions::DEFAULT_FILE_FORMAT + 1);\ REMOVED_QUERY_OPT_FN(abort_on_default_limit_exceeded, ABORT_ON_DEFAULT_LIMIT_EXCEEDED)\ QUERY_OPT_FN(abort_on_error, ABORT_ON_ERROR, TQueryOptionLevel::REGULAR)\ REMOVED_QUERY_OPT_FN(allow_unsupported_formats, ALLOW_UNSUPPORTED_FORMATS)\ @@ -151,7 +151,8 @@ typedef std::unordered_map<string, beeswax::TQueryOptionLevel::type> QUERY_OPT_FN(num_rows_produced_limit, NUM_ROWS_PRODUCED_LIMIT,\ TQueryOptionLevel::ADVANCED)\ QUERY_OPT_FN(\ - planner_testcase_mode, PLANNER_TESTCASE_MODE, TQueryOptionLevel::DEVELOPMENT) + planner_testcase_mode, PLANNER_TESTCASE_MODE, TQueryOptionLevel::DEVELOPMENT)\ + QUERY_OPT_FN(default_file_format, DEFAULT_FILE_FORMAT, TQueryOptionLevel::REGULAR) ; /// Enforce practical limits on some query options to avoid undesired query state. diff --git a/common/thrift/ImpalaInternalService.thrift b/common/thrift/ImpalaInternalService.thrift index 5238848..65de009 100644 --- a/common/thrift/ImpalaInternalService.thrift +++ b/common/thrift/ImpalaInternalService.thrift @@ -323,6 +323,10 @@ struct TQueryOptions { // See comment in ImpalaService.thrift 78: optional bool planner_testcase_mode = false; + + // See comment in ImpalaService.thrift. + 79: optional CatalogObjects.THdfsFileFormat default_file_format = + CatalogObjects.THdfsFileFormat.TEXT; } // Impala currently has two types of sessions: Beeswax and HiveServer2 diff --git a/common/thrift/ImpalaService.thrift b/common/thrift/ImpalaService.thrift index 71fc732..7c46bd8 100644 --- a/common/thrift/ImpalaService.thrift +++ b/common/thrift/ImpalaService.thrift @@ -377,6 +377,9 @@ enum TImpalaQueryOptions { // debugging a testcase. Should not be set in user clusters. If set, a warning // is emitted in the query runtime profile. PLANNER_TESTCASE_MODE = 77 + + // Specifies the default table file format. + DEFAULT_FILE_FORMAT = 78 } // The summary of a DML statement. diff --git a/fe/src/main/cup/sql-parser.cup b/fe/src/main/cup/sql-parser.cup index 9d80ced..3216600 100644 --- a/fe/src/main/cup/sql-parser.cup +++ b/fe/src/main/cup/sql-parser.cup @@ -1319,7 +1319,7 @@ create_tbl_stmt ::= if (tbl_def.isExternal()) { parser.parseError("external", SqlParserSymbols.KW_EXTERNAL); } - tbl_def.setOptions(new TableDef.Options(comment)); + tbl_def.setOptions(new TableDef.Options(comment, parser.getQueryOptions())); RESULT = new CreateTableDataSrcStmt(new CreateTableStmt(tbl_def), data_src_name, init_string); :} @@ -1400,8 +1400,9 @@ tbl_options ::= {: CreateTableStmt.unescapeProperties(serde_props); CreateTableStmt.unescapeProperties(tbl_props); + RESULT = new TableDef.Options(sort_cols, comment, row_format, serde_props, - file_format, location, cache_op, tbl_props); + file_format, location, cache_op, tbl_props, parser.getQueryOptions()); :} ; diff --git a/fe/src/main/java/org/apache/impala/analysis/TableDef.java b/fe/src/main/java/org/apache/impala/analysis/TableDef.java index 0c79083..e9e5399 100644 --- a/fe/src/main/java/org/apache/impala/analysis/TableDef.java +++ b/fe/src/main/java/org/apache/impala/analysis/TableDef.java @@ -37,6 +37,7 @@ import org.apache.impala.common.FileSystemUtil; import org.apache.impala.thrift.TAccessEvent; import org.apache.impala.thrift.TCatalogObjectType; import org.apache.impala.thrift.THdfsFileFormat; +import org.apache.impala.thrift.TQueryOptions; import org.apache.impala.util.MetaStoreUtil; import com.google.common.base.Preconditions; @@ -126,22 +127,29 @@ class TableDef { Options(List<String> sortCols, String comment, RowFormat rowFormat, Map<String, String> serdeProperties, THdfsFileFormat fileFormat, HdfsUri location, - HdfsCachingOp cachingOp, Map<String, String> tblProperties) { + HdfsCachingOp cachingOp, Map<String, String> tblProperties, + TQueryOptions queryOptions) { this.sortCols = sortCols; this.comment = comment; this.rowFormat = rowFormat; Preconditions.checkNotNull(serdeProperties); this.serdeProperties = serdeProperties; - this.fileFormat = fileFormat == null ? THdfsFileFormat.TEXT : fileFormat; + // The file format passed via STORED AS <file format> has a higher precedence than + // the one set in query options. + this.fileFormat = (fileFormat != null) ? + fileFormat : queryOptions.getDefault_file_format(); this.location = location; this.cachingOp = cachingOp; Preconditions.checkNotNull(tblProperties); this.tblProperties = tblProperties; } - public Options(String comment) { - this(ImmutableList.<String>of(), comment, RowFormat.DEFAULT_ROW_FORMAT, - new HashMap<>(), THdfsFileFormat.TEXT, null, null, new HashMap<>()); + public Options(String comment, TQueryOptions queryOptions) { + // Passing null to file format so that it uses the file format from the query option + // if specified, otherwise it will use the default file format, which is TEXT. + this(ImmutableList.of(), comment, RowFormat.DEFAULT_ROW_FORMAT, + new HashMap<>(), /*file format*/ null, null, null, new HashMap<>(), + queryOptions); } } diff --git a/testdata/workloads/functional-query/queries/QueryTest/set.test b/testdata/workloads/functional-query/queries/QueryTest/set.test index cabc356..aa74ee1 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/set.test +++ b/testdata/workloads/functional-query/queries/QueryTest/set.test @@ -25,6 +25,7 @@ set all; 'PARQUET_FILE_SIZE','0','ADVANCED' 'REQUEST_POOL','','REGULAR' 'SYNC_DDL','0','REGULAR' +'DEFAULT_FILE_FORMAT','TEXT','REGULAR' ---- TYPES STRING, STRING, STRING ==== @@ -50,6 +51,7 @@ set all; 'PARQUET_FILE_SIZE','0','ADVANCED' 'REQUEST_POOL','','REGULAR' 'SYNC_DDL','0','REGULAR' +'DEFAULT_FILE_FORMAT','TEXT','REGULAR' ---- TYPES STRING, STRING, STRING ==== @@ -75,6 +77,7 @@ set all; 'PARQUET_FILE_SIZE','0','ADVANCED' 'REQUEST_POOL','','REGULAR' 'SYNC_DDL','0','REGULAR' +'DEFAULT_FILE_FORMAT','TEXT','REGULAR' ---- TYPES STRING, STRING, STRING ==== @@ -228,3 +231,8 @@ set SCAN_NODE_CODEGEN_THRESHOLD = "foo"; set max_io_buffers="foo"; ---- RESULTS ==== +---- QUERY +set default_file_format=foo; +---- CATCH +Invalid default_file_format 'foo'. Valid values are TEXT, RC_FILE, SEQUENCE_FILE, AVRO, PARQUET, KUDU and ORC. +==== \ No newline at end of file diff --git a/tests/metadata/test_ddl.py b/tests/metadata/test_ddl.py index 1bf0e7d..af078d2 100644 --- a/tests/metadata/test_ddl.py +++ b/tests/metadata/test_ddl.py @@ -689,6 +689,40 @@ class TestDdlStatements(TestDdlBase): self.run_test_case('QueryTest/partition-ddl-predicates-hdfs-only', vector, use_db=unique_database, multiple_impalad=self._use_multiple_impalad(vector)) + def test_create_table_file_format(self, vector, unique_database): + # When default_file_format query option is not specified, the default table file + # format is TEXT. + text_table = "{0}.text_tbl".format(unique_database) + self.execute_query_expect_success( + self.client, "create table {0}(i int)".format(text_table)) + result = self.execute_query_expect_success( + self.client, "show create table {0}".format(text_table)) + assert any("TEXTFILE" in x for x in result.data) + + self.execute_query_expect_failure( + self.client, "create table {0}.foobar_tbl".format(unique_database), + {"default_file_format": "foobar"}) + + parquet_table = "{0}.parquet_tbl".format(unique_database) + self.execute_query_expect_success( + self.client, "create table {0}(i int)".format(parquet_table), + {"default_file_format": "parquet"}) + result = self.execute_query_expect_success( + self.client, "show create table {0}".format(parquet_table)) + assert any("PARQUET" in x for x in result.data) + + # The table created should still be ORC even though the default_file_format query + # option is set to parquet. + orc_table = "{0}.orc_tbl".format(unique_database) + self.execute_query_expect_success( + self.client, + "create table {0}(i int) stored as orc".format(orc_table), + {"default_file_format": "parquet"}) + result = self.execute_query_expect_success( + self.client, "show create table {0}".format(orc_table)) + assert any("ORC" in x for x in result.data) + + # IMPALA-2002: Tests repeated adding/dropping of .jar and .so in the lib cache. class TestLibCache(TestDdlBase): @classmethod