This is an automated email from the ASF dual-hosted git repository.

boroknagyz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 46e03392c38299731d937898c4177b7baedc8f7d
Author: Fredy Wijaya <fwij...@cloudera.com>
AuthorDate: Sat Feb 23 12:56:10 2019 -0800

    IMPALA-7645: Add a query option to set the default table file format
    
    This patch adds a query option "DEFAULT_FILE_FORMAT" to allow setting
    the default table file format. Below is the list of supported
    "DEFAULT_FILE_FORMAT" query option values:
    - TEXT
    - RC_FILE
    - SEQUENCE_FILE
    - AVRO
    - PARQUET
    - KUDU
    - ORC
    
    For backward compatibility, the default table file format remains as
    TEXT.
    
    Testing:
    - Ran all FE tests
    - Added BE test
    - Added E2E tests
    - Ran test_ddl.py and test_set.py
    
    Change-Id: Ic857c38076d973ad749a41fecd1b470c7881db5e
    Reviewed-on: http://gerrit.cloudera.org:8080/12568
    Reviewed-by: Impala Public Jenkins <impala-public-jenk...@cloudera.com>
    Tested-by: Impala Public Jenkins <impala-public-jenk...@cloudera.com>
---
 be/src/service/query-options-test.cc               |  2 ++
 be/src/service/query-options.cc                    | 22 ++++++++++++++
 be/src/service/query-options.h                     |  5 ++--
 common/thrift/ImpalaInternalService.thrift         |  4 +++
 common/thrift/ImpalaService.thrift                 |  3 ++
 fe/src/main/cup/sql-parser.cup                     |  5 ++--
 .../java/org/apache/impala/analysis/TableDef.java  | 18 ++++++++----
 .../functional-query/queries/QueryTest/set.test    |  8 +++++
 tests/metadata/test_ddl.py                         | 34 ++++++++++++++++++++++
 9 files changed, 92 insertions(+), 9 deletions(-)

diff --git a/be/src/service/query-options-test.cc 
b/be/src/service/query-options-test.cc
index b18b58e..1be2b81 100644
--- a/be/src/service/query-options-test.cc
+++ b/be/src/service/query-options-test.cc
@@ -211,6 +211,8 @@ TEST(QueryOptions, SetEnumOptions) {
       (THREE_LEVEL, TWO_LEVEL, TWO_LEVEL_THEN_THREE_LEVEL)), true);
   TestEnumCase(options, CASE(compression_codec, THdfsCompression,
       (NONE, GZIP, BZIP2, DEFAULT, SNAPPY, SNAPPY_BLOCKED)), false);
+  TestEnumCase(options, CASE(default_file_format, THdfsFileFormat,
+      (TEXT, RC_FILE, SEQUENCE_FILE, AVRO, PARQUET, KUDU, ORC)), true);
 #undef CASE
 #undef ENTRIES
 #undef ENTRY
diff --git a/be/src/service/query-options.cc b/be/src/service/query-options.cc
index 5720eac..9618d18 100644
--- a/be/src/service/query-options.cc
+++ b/be/src/service/query-options.cc
@@ -765,6 +765,28 @@ Status impala::SetQueryOption(const string& key, const 
string& value,
         query_options->__set_num_rows_produced_limit(num_rows_produced_limit);
         break;
       }
+      case TImpalaQueryOptions::DEFAULT_FILE_FORMAT: {
+        if (iequals(value, "TEXT") || iequals(value, "0")) {
+          query_options->__set_default_file_format(THdfsFileFormat::TEXT);
+        } else if (iequals(value, "RC_FILE") || iequals(value, "1")) {
+          query_options->__set_default_file_format(THdfsFileFormat::RC_FILE);
+        } else if (iequals(value, "SEQUENCE_FILE") || iequals(value, "2")) {
+          
query_options->__set_default_file_format(THdfsFileFormat::SEQUENCE_FILE);
+        } else if (iequals(value, "AVRO") || iequals(value, "3")) {
+          query_options->__set_default_file_format(THdfsFileFormat::AVRO);
+        } else if (iequals(value, "PARQUET") || iequals(value, "4")) {
+          query_options->__set_default_file_format(THdfsFileFormat::PARQUET);
+        } else if (iequals(value, "KUDU") || iequals(value, "5")) {
+          query_options->__set_default_file_format(THdfsFileFormat::KUDU);
+        } else if (iequals(value, "ORC") || iequals(value, "6")) {
+          query_options->__set_default_file_format(THdfsFileFormat::ORC);
+        } else {
+          return Status(Substitute("Invalid default_file_format '$0'. Valid 
values are "
+                                   "TEXT, RC_FILE, SEQUENCE_FILE, AVRO, 
PARQUET, KUDU "
+                                   "and ORC.", value));
+        }
+        break;
+      }
       default:
         if (IsRemovedQueryOption(key)) {
           LOG(WARNING) << "Ignoring attempt to set removed query option '" << 
key << "'";
diff --git a/be/src/service/query-options.h b/be/src/service/query-options.h
index 7fee4d1..4ead0e5 100644
--- a/be/src/service/query-options.h
+++ b/be/src/service/query-options.h
@@ -41,7 +41,7 @@ typedef std::unordered_map<string, 
beeswax::TQueryOptionLevel::type>
 // the DCHECK.
 #define QUERY_OPTS_TABLE\
   DCHECK_EQ(_TImpalaQueryOptions_VALUES_TO_NAMES.size(),\
-      TImpalaQueryOptions::PLANNER_TESTCASE_MODE + 1);\
+      TImpalaQueryOptions::DEFAULT_FILE_FORMAT + 1);\
   REMOVED_QUERY_OPT_FN(abort_on_default_limit_exceeded, 
ABORT_ON_DEFAULT_LIMIT_EXCEEDED)\
   QUERY_OPT_FN(abort_on_error, ABORT_ON_ERROR, TQueryOptionLevel::REGULAR)\
   REMOVED_QUERY_OPT_FN(allow_unsupported_formats, ALLOW_UNSUPPORTED_FORMATS)\
@@ -151,7 +151,8 @@ typedef std::unordered_map<string, 
beeswax::TQueryOptionLevel::type>
   QUERY_OPT_FN(num_rows_produced_limit, NUM_ROWS_PRODUCED_LIMIT,\
       TQueryOptionLevel::ADVANCED)\
   QUERY_OPT_FN(\
-      planner_testcase_mode, PLANNER_TESTCASE_MODE, 
TQueryOptionLevel::DEVELOPMENT)
+      planner_testcase_mode, PLANNER_TESTCASE_MODE, 
TQueryOptionLevel::DEVELOPMENT)\
+  QUERY_OPT_FN(default_file_format, DEFAULT_FILE_FORMAT, 
TQueryOptionLevel::REGULAR)
   ;
 
 /// Enforce practical limits on some query options to avoid undesired query 
state.
diff --git a/common/thrift/ImpalaInternalService.thrift 
b/common/thrift/ImpalaInternalService.thrift
index 5238848..65de009 100644
--- a/common/thrift/ImpalaInternalService.thrift
+++ b/common/thrift/ImpalaInternalService.thrift
@@ -323,6 +323,10 @@ struct TQueryOptions {
 
   // See comment in ImpalaService.thrift
   78: optional bool planner_testcase_mode = false;
+
+  // See comment in ImpalaService.thrift.
+  79: optional CatalogObjects.THdfsFileFormat default_file_format =
+      CatalogObjects.THdfsFileFormat.TEXT;
 }
 
 // Impala currently has two types of sessions: Beeswax and HiveServer2
diff --git a/common/thrift/ImpalaService.thrift 
b/common/thrift/ImpalaService.thrift
index 71fc732..7c46bd8 100644
--- a/common/thrift/ImpalaService.thrift
+++ b/common/thrift/ImpalaService.thrift
@@ -377,6 +377,9 @@ enum TImpalaQueryOptions {
   // debugging a testcase. Should not be set in user clusters. If set, a 
warning
   // is emitted in the query runtime profile.
   PLANNER_TESTCASE_MODE = 77
+
+  // Specifies the default table file format.
+  DEFAULT_FILE_FORMAT = 78
 }
 
 // The summary of a DML statement.
diff --git a/fe/src/main/cup/sql-parser.cup b/fe/src/main/cup/sql-parser.cup
index 9d80ced..3216600 100644
--- a/fe/src/main/cup/sql-parser.cup
+++ b/fe/src/main/cup/sql-parser.cup
@@ -1319,7 +1319,7 @@ create_tbl_stmt ::=
     if (tbl_def.isExternal()) {
       parser.parseError("external", SqlParserSymbols.KW_EXTERNAL);
     }
-    tbl_def.setOptions(new TableDef.Options(comment));
+    tbl_def.setOptions(new TableDef.Options(comment, 
parser.getQueryOptions()));
     RESULT = new CreateTableDataSrcStmt(new CreateTableStmt(tbl_def),
         data_src_name, init_string);
   :}
@@ -1400,8 +1400,9 @@ tbl_options ::=
   {:
     CreateTableStmt.unescapeProperties(serde_props);
     CreateTableStmt.unescapeProperties(tbl_props);
+
     RESULT = new TableDef.Options(sort_cols, comment, row_format, serde_props,
-        file_format, location, cache_op, tbl_props);
+        file_format, location, cache_op, tbl_props, parser.getQueryOptions());
   :}
   ;
 
diff --git a/fe/src/main/java/org/apache/impala/analysis/TableDef.java 
b/fe/src/main/java/org/apache/impala/analysis/TableDef.java
index 0c79083..e9e5399 100644
--- a/fe/src/main/java/org/apache/impala/analysis/TableDef.java
+++ b/fe/src/main/java/org/apache/impala/analysis/TableDef.java
@@ -37,6 +37,7 @@ import org.apache.impala.common.FileSystemUtil;
 import org.apache.impala.thrift.TAccessEvent;
 import org.apache.impala.thrift.TCatalogObjectType;
 import org.apache.impala.thrift.THdfsFileFormat;
+import org.apache.impala.thrift.TQueryOptions;
 import org.apache.impala.util.MetaStoreUtil;
 
 import com.google.common.base.Preconditions;
@@ -126,22 +127,29 @@ class TableDef {
 
     Options(List<String> sortCols, String comment, RowFormat rowFormat,
         Map<String, String> serdeProperties, THdfsFileFormat fileFormat, 
HdfsUri location,
-        HdfsCachingOp cachingOp, Map<String, String> tblProperties) {
+        HdfsCachingOp cachingOp, Map<String, String> tblProperties,
+        TQueryOptions queryOptions) {
       this.sortCols = sortCols;
       this.comment = comment;
       this.rowFormat = rowFormat;
       Preconditions.checkNotNull(serdeProperties);
       this.serdeProperties = serdeProperties;
-      this.fileFormat = fileFormat == null ? THdfsFileFormat.TEXT : fileFormat;
+      // The file format passed via STORED AS <file format> has a higher 
precedence than
+      // the one set in query options.
+      this.fileFormat = (fileFormat != null) ?
+          fileFormat : queryOptions.getDefault_file_format();
       this.location = location;
       this.cachingOp = cachingOp;
       Preconditions.checkNotNull(tblProperties);
       this.tblProperties = tblProperties;
     }
 
-    public Options(String comment) {
-      this(ImmutableList.<String>of(), comment, RowFormat.DEFAULT_ROW_FORMAT,
-          new HashMap<>(), THdfsFileFormat.TEXT, null, null, new HashMap<>());
+    public Options(String comment, TQueryOptions queryOptions) {
+      // Passing null to file format so that it uses the file format from the 
query option
+      // if specified, otherwise it will use the default file format, which is 
TEXT.
+      this(ImmutableList.of(), comment, RowFormat.DEFAULT_ROW_FORMAT,
+          new HashMap<>(), /*file format*/ null, null, null, new HashMap<>(),
+          queryOptions);
     }
   }
 
diff --git a/testdata/workloads/functional-query/queries/QueryTest/set.test 
b/testdata/workloads/functional-query/queries/QueryTest/set.test
index cabc356..aa74ee1 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/set.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/set.test
@@ -25,6 +25,7 @@ set all;
 'PARQUET_FILE_SIZE','0','ADVANCED'
 'REQUEST_POOL','','REGULAR'
 'SYNC_DDL','0','REGULAR'
+'DEFAULT_FILE_FORMAT','TEXT','REGULAR'
 ---- TYPES
 STRING, STRING, STRING
 ====
@@ -50,6 +51,7 @@ set all;
 'PARQUET_FILE_SIZE','0','ADVANCED'
 'REQUEST_POOL','','REGULAR'
 'SYNC_DDL','0','REGULAR'
+'DEFAULT_FILE_FORMAT','TEXT','REGULAR'
 ---- TYPES
 STRING, STRING, STRING
 ====
@@ -75,6 +77,7 @@ set all;
 'PARQUET_FILE_SIZE','0','ADVANCED'
 'REQUEST_POOL','','REGULAR'
 'SYNC_DDL','0','REGULAR'
+'DEFAULT_FILE_FORMAT','TEXT','REGULAR'
 ---- TYPES
 STRING, STRING, STRING
 ====
@@ -228,3 +231,8 @@ set SCAN_NODE_CODEGEN_THRESHOLD = "foo";
 set max_io_buffers="foo";
 ---- RESULTS
 ====
+---- QUERY
+set default_file_format=foo;
+---- CATCH
+Invalid default_file_format 'foo'. Valid values are TEXT, RC_FILE, 
SEQUENCE_FILE, AVRO, PARQUET, KUDU and ORC.
+====
\ No newline at end of file
diff --git a/tests/metadata/test_ddl.py b/tests/metadata/test_ddl.py
index 1bf0e7d..af078d2 100644
--- a/tests/metadata/test_ddl.py
+++ b/tests/metadata/test_ddl.py
@@ -689,6 +689,40 @@ class TestDdlStatements(TestDdlBase):
       self.run_test_case('QueryTest/partition-ddl-predicates-hdfs-only', 
vector,
           use_db=unique_database, 
multiple_impalad=self._use_multiple_impalad(vector))
 
+  def test_create_table_file_format(self, vector, unique_database):
+    # When default_file_format query option is not specified, the default 
table file
+    # format is TEXT.
+    text_table = "{0}.text_tbl".format(unique_database)
+    self.execute_query_expect_success(
+        self.client, "create table {0}(i int)".format(text_table))
+    result = self.execute_query_expect_success(
+        self.client, "show create table {0}".format(text_table))
+    assert any("TEXTFILE" in x for x in result.data)
+
+    self.execute_query_expect_failure(
+        self.client, "create table {0}.foobar_tbl".format(unique_database),
+        {"default_file_format": "foobar"})
+
+    parquet_table = "{0}.parquet_tbl".format(unique_database)
+    self.execute_query_expect_success(
+        self.client, "create table {0}(i int)".format(parquet_table),
+        {"default_file_format": "parquet"})
+    result = self.execute_query_expect_success(
+        self.client, "show create table {0}".format(parquet_table))
+    assert any("PARQUET" in x for x in result.data)
+
+    # The table created should still be ORC even though the 
default_file_format query
+    # option is set to parquet.
+    orc_table = "{0}.orc_tbl".format(unique_database)
+    self.execute_query_expect_success(
+        self.client,
+        "create table {0}(i int) stored as orc".format(orc_table),
+        {"default_file_format": "parquet"})
+    result = self.execute_query_expect_success(
+      self.client, "show create table {0}".format(orc_table))
+    assert any("ORC" in x for x in result.data)
+
+
 # IMPALA-2002: Tests repeated adding/dropping of .jar and .so in the lib cache.
 class TestLibCache(TestDdlBase):
   @classmethod

Reply via email to