IMPALA-6240: [DOCS] Document PARQUET_ARRAY_RESOLUTION query option

Cherry-picks: not for 2.x
Change-Id: I12696b609609ea16c05d8b7e84b2bae0be6d6cb5
Reviewed-on: http://gerrit.cloudera.org:8080/9534
Reviewed-by: Alex Behm <alex.b...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/e096233a
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/e096233a
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/e096233a

Branch: refs/heads/master
Commit: e096233a25d65c1078a153630547f9551f1cf15e
Parents: c7a58b8
Author: Alex Rodoni <arod...@cloudera.com>
Authored: Wed Mar 7 12:08:44 2018 -0800
Committer: Impala Public Jenkins <impala-public-jenk...@gerrit.cloudera.org>
Committed: Fri Mar 9 23:37:57 2018 +0000

----------------------------------------------------------------------
 docs/impala.ditamap                             |   1 +
 docs/impala_keydefs.ditamap                     |   1 +
 docs/topics/impala_parquet_array_resolution.xml | 206 +++++++++++++++++++
 3 files changed, 208 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/e096233a/docs/impala.ditamap
----------------------------------------------------------------------
diff --git a/docs/impala.ditamap b/docs/impala.ditamap
index d9a42d7..0f010a2 100644
--- a/docs/impala.ditamap
+++ b/docs/impala.ditamap
@@ -207,6 +207,7 @@ under the License.
           <topicref rev="2.5.0" 
href="topics/impala_optimize_partition_key_scans.xml"/>
           <topicref href="topics/impala_parquet_compression_codec.xml"/>
           <topicref rev="2.6.0 IMPALA-2069" 
href="topics/impala_parquet_annotate_strings_utf8.xml"/>
+          <topicref rev="2.9.0 IMPALA-4725" 
href="topics/impala_parquet_array_resolution.xml"/>
           <topicref rev="2.6.0 IMPALA-2835" 
href="topics/impala_parquet_fallback_schema_resolution.xml"/>
           <topicref href="topics/impala_parquet_file_size.xml"/>
           <topicref rev="2.6.0 IMPALA-3286" 
href="topics/impala_prefetch_mode.xml"/>

http://git-wip-us.apache.org/repos/asf/impala/blob/e096233a/docs/impala_keydefs.ditamap
----------------------------------------------------------------------
diff --git a/docs/impala_keydefs.ditamap b/docs/impala_keydefs.ditamap
index 81881a8..35dd0b8 100644
--- a/docs/impala_keydefs.ditamap
+++ b/docs/impala_keydefs.ditamap
@@ -10798,6 +10798,7 @@ under the License.
   <keydef href="topics/impala_optimize_partition_key_scans.xml" 
keys="optimize_partition_key_scans"/>
   <keydef href="topics/impala_parquet_compression_codec.xml" 
keys="parquet_compression_codec"/>
   <keydef href="topics/impala_parquet_annotate_strings_utf8.xml" 
keys="parquet_annotate_strings_utf8"/>
+  <keydef href="topics/impala_parquet_array_resolution.xml" 
keys="parquet_array_resolution"/>
   <keydef href="topics/impala_parquet_fallback_schema_resolution.xml" 
keys="parquet_fallback_schema_resolution"/>
   <keydef href="topics/impala_parquet_file_size.xml" keys="parquet_file_size"/>
   <keydef href="topics/impala_prefetch_mode.xml" keys="prefetch_mode"/>

http://git-wip-us.apache.org/repos/asf/impala/blob/e096233a/docs/topics/impala_parquet_array_resolution.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_parquet_array_resolution.xml 
b/docs/topics/impala_parquet_array_resolution.xml
new file mode 100644
index 0000000..62c78d2
--- /dev/null
+++ b/docs/topics/impala_parquet_array_resolution.xml
@@ -0,0 +1,206 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="parquet_array_resolution" rev="2.9.0 IMPALA-4725">
+
+  <title>
+    PARQUET_ARRAY_RESOLUTION Query Option (<keyword keyref="impala29"/> or higher only)
+  </title>
+
+  <titlealts audience="PDF">
+
+    <navtitle>PARQUET_ARRAY_RESOLUTION</navtitle>
+
+  </titlealts>
+
+  <prolog>
+    <metadata>
+      <data name="Category" value="Impala"/>
+      <data name="Category" value="Impala Query Options"/>
+      <data name="Category" value="Parquet"/>
+      <data name="Category" value="Developers"/>
+      <data name="Category" value="Data Analysts"/>
+    </metadata>
+  </prolog>
+
+  <conbody>
+
+    <p rev="parquet_array_resolution">
+      The <codeph>PARQUET_ARRAY_RESOLUTION</codeph> query option controls the
+      behavior of the index-based resolution for nested arrays in Parquet.
+    </p>
+
+    <p>
+      In Parquet, you can represent an array using a 2-level or 3-level
+      representation. The modern, standard representation is 3-level. The legacy
+      2-level scheme is supported for compatibility with older Parquet files.
+      However, there is no reliable metadata within Parquet files to indicate
+      which encoding was used. It is even possible to have mixed encodings within
+      the same file if there are multiple arrays. The
+      <codeph>PARQUET_ARRAY_RESOLUTION</codeph> option controls the process of
+      resolution, that is, how every column/field reference from a query is
+      matched to a column in the Parquet file.</p>
+
+    <p>
+      The supported values for the query option are:
+    </p>
+
+    <ul>
+      <li>
+        <codeph>THREE_LEVEL</codeph>: Assumes arrays are encoded with the 3-level
+        representation, and does not attempt the 2-level resolution.
+      </li>
+
+      <li>
+        <codeph>TWO_LEVEL</codeph>: Assumes arrays are encoded with the 2-level
+        representation, and does not attempt the 3-level resolution.
+      </li>
+
+      <li>
+        <codeph>TWO_LEVEL_THEN_THREE_LEVEL</codeph>: First tries to resolve
+        assuming a 2-level representation, and if unsuccessful, tries a 3-level
+        representation.
+      </li>
+    </ul>
+
+    <p>
+      All of the above options resolve arrays encoded with a single level.
+    </p>
+
+    <p>
+      A failure to resolve a column/field reference in a query with a given array
+      resolution policy does not necessarily result in a warning or error returned
+      by the query. A mismatch might be treated like a missing column (returns
+      NULL values), and it is not possible to reliably distinguish the 'bad
+      resolution' and 'legitimately missing column' cases.
+    </p>
+
+    <p>
+      The name-based policy generally does not have the problem of ambiguous
+      array representations. You specify to use the name-based policy by setting
+      the <codeph>PARQUET_FALLBACK_SCHEMA_RESOLUTION</codeph> query option to
+      <codeph>NAME</codeph>.
+    </p>
+
+    <p>
+      <b>Type:</b> Enum of <codeph>THREE_LEVEL</codeph>,
+      <codeph>TWO_LEVEL</codeph>, <codeph>TWO_LEVEL_THEN_THREE_LEVEL</codeph>
+    </p>
+
+    <p>
+      <b>Default:</b> <codeph>THREE_LEVEL</codeph>
+    </p>
+
+    <p conref="../shared/impala_common.xml#common/added_in_290"/>
+
+    <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+    <p>
+      EXAMPLE A: The following Parquet schema of a file can be interpreted as
+      either a 2-level or a 3-level representation:
+    </p>
+
+<codeblock>
+ParquetSchemaExampleA {
+  optional group single_element_groups (LIST) {
+    repeated group single_element_group {
+      required int64 count;
+    }
+  }
+}
+</codeblock>
+
+    <p>
+      The following table schema corresponds to a 2-level interpretation:
+    </p>
+
+<codeblock>
+CREATE TABLE t (col1 array&lt;struct&lt;f1: bigint>>) STORED AS PARQUET;
+</codeblock>
+
+    <p>
+      Successful query with a 2-level interpretation:
+    </p>
+
+<codeblock>
+SET PARQUET_ARRAY_RESOLUTION=TWO_LEVEL;
+SELECT ITEM.f1 FROM t.col1;
+</codeblock>
+
+    <p>
+      The following table schema corresponds to a 3-level interpretation:
+    </p>
+
+<codeblock>
+CREATE TABLE t (col1 array&lt;bigint>) STORED AS PARQUET;
+</codeblock>
+
+    <p>
+      Successful query with a 3-level interpretation:
+    </p>
+
+<codeblock>
+SET PARQUET_ARRAY_RESOLUTION=THREE_LEVEL;
+SELECT ITEM FROM t.col1;
+</codeblock>
+
+    <p>
+      EXAMPLE B: The following Parquet schema of a file can only be
+      successfully interpreted as a 2-level representation:
+    </p>
+
+<codeblock>
+ParquetSchemaExampleB {
+  required group list_of_ints (LIST) {
+    repeated int32 list_of_ints_tuple;
+  }
+}
+</codeblock>
+
+    <p>
+      The following table schema corresponds to a 2-level interpretation:
+    </p>
+
+<codeblock>
+CREATE TABLE t (col1 array&lt;int>) STORED AS PARQUET;
+</codeblock>
+
+    <p>
+      Successful query with a 2-level interpretation:
+    </p>
+
+<codeblock>
+SET PARQUET_ARRAY_RESOLUTION=TWO_LEVEL;
+SELECT ITEM FROM t.col1;
+</codeblock>
+
+    <p>
+      Unsuccessful query with a 3-level interpretation. The query returns
+      <codeph>NULL</codeph>s as if the column was missing in the file:
+    </p>
+
+<codeblock>
+SET PARQUET_ARRAY_RESOLUTION=THREE_LEVEL;
+SELECT ITEM FROM t.col1;
+</codeblock>
+
+  </conbody>
+
+</concept>

Reply via email to