http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3be0f122/docs/topics/impala_optimize_partition_key_scans.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_optimize_partition_key_scans.xml b/docs/topics/impala_optimize_partition_key_scans.xml new file mode 100644 index 0000000..2cae1cf --- /dev/null +++ b/docs/topics/impala_optimize_partition_key_scans.xml @@ -0,0 +1,180 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept rev="2.5.0 IMPALA-2499" id="optimize_partition_key_scans"> + + <title>OPTIMIZE_PARTITION_KEY_SCANS Query Option (<keyword keyref="impala25"/> or higher only)</title> + <titlealts audience="PDF"><navtitle>OPTIMIZE_PARTITION_KEY_SCANS</navtitle></titlealts> + <prolog> + <metadata> + <data name="Category" value="Impala"/> + <data name="Category" value="Impala Query Options"/> + <data name="Category" value="Querying"/> + <data name="Category" value="Performance"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> + </metadata> + </prolog> + + <conbody> + + <p rev="2.5.0 IMPALA-2499"> + <indexterm audience="Cloudera">OPTIMIZE_PARTITION_KEY_SCANS query option</indexterm> + Enables a fast code path for queries that apply simple aggregate functions to partition key + columns: <codeph>MIN(<varname>key_column</varname>)</codeph>, <codeph>MAX(<varname>key_column</varname>)</codeph>, + or <codeph>COUNT(DISTINCT <varname>key_column</varname>)</codeph>. + </p> + + <p conref="../shared/impala_common.xml#common/type_boolean"/> + <p conref="../shared/impala_common.xml#common/default_false_0"/> + + <note conref="../shared/impala_common.xml#common/one_but_not_true"/> + + <p conref="../shared/impala_common.xml#common/added_in_250"/> + + <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/> + + <p> + This optimization speeds up common <q>introspection</q> operations when using queries + to calculate the cardinality and range for partition key columns. + </p> + + <p> + This optimization does not apply if the queries contain any <codeph>WHERE</codeph>, + <codeph>GROUP BY</codeph>, or <codeph>HAVING</codeph> clause. The relevant queries + should only compute the minimum, maximum, or number of distinct values for the + partition key columns across the whole table. + </p> + + <p> + This optimization is enabled by a query option because it skips some consistency checks + and therefore can return slightly different partition values if partitions are in the + process of being added, dropped, or loaded outside of Impala. Queries might exhibit different + behavior depending on the setting of this option in the following cases: + </p> + + <ul> + <li> + <p> + If files are removed from a partition using HDFS or other non-Impala operations, + there is a period until the next <codeph>REFRESH</codeph> of the table where regular + queries fail at run time because they detect the missing files. With this optimization + enabled, queries that evaluate only the partition key column values (not the contents of + the partition itself) succeed, and treat the partition as if it still exists. + </p> + </li> + <li> + <p> + If a partition contains any data files, but the data files do not contain any rows, + a regular query considers that the partition does not exist. With this optimization + enabled, the partition is treated as if it exists. 
+ </p> + <p> + If the partition includes no files at all, this optimization does not change the query + behavior: the partition is considered to not exist whether or not this optimization is enabled. + </p> + </li> + </ul> + + <p conref="../shared/impala_common.xml#common/example_blurb"/> + + <p> + The following example shows initial schema setup and the default behavior of queries that + return just the partition key column for a table: + </p> + +<codeblock> +-- Make a partitioned table with 3 partitions. +create table t1 (s string) partitioned by (year int); +insert into t1 partition (year=2015) values ('last year'); +insert into t1 partition (year=2016) values ('this year'); +insert into t1 partition (year=2017) values ('next year'); + +-- Regardless of the option setting, this query must read the +-- data files to know how many rows to return for each year value. +explain select year from t1; ++-----------------------------------------------------+ +| Explain String | ++-----------------------------------------------------+ +| Estimated Per-Host Requirements: Memory=0B VCores=0 | +| | +| F00:PLAN FRAGMENT [UNPARTITIONED] | +| 00:SCAN HDFS [key_cols.t1] | +| partitions=3/3 files=4 size=40B | +| table stats: 3 rows total | +| column stats: all | +| hosts=3 per-host-mem=unavailable | +| tuple-ids=0 row-size=4B cardinality=3 | ++-----------------------------------------------------+ + +-- The aggregation operation means the query does not need to read +-- the data within each partition: the result set contains exactly 1 row +-- per partition, derived from the partition key column value. +-- By default, Impala still includes a 'scan' operation in the query. +explain select distinct year from t1; ++------------------------------------------------------------------------------------+ +| Explain String | ++------------------------------------------------------------------------------------+ +| Estimated Per-Host Requirements: Memory=0B VCores=0 | +| | +| 01:AGGREGATE [FINALIZE] | +| | group by: year | +| | | +| 00:SCAN HDFS [key_cols.t1] | +| partitions=0/0 files=0 size=0B | ++------------------------------------------------------------------------------------+ +</codeblock> + + <p> + The following examples show how the plan is made more efficient when the + <codeph>OPTIMIZE_PARTITION_KEY_SCANS</codeph> option is enabled: + </p> + +<codeblock> +set optimize_partition_key_scans=1; +OPTIMIZE_PARTITION_KEY_SCANS set to 1 + +-- The aggregation operation is turned into a UNION internally, +-- with constant values known in advance based on the metadata +-- for the partitioned table. +explain select distinct year from t1; ++-----------------------------------------------------+ +| Explain String | ++-----------------------------------------------------+ +| Estimated Per-Host Requirements: Memory=0B VCores=0 | +| | +| F00:PLAN FRAGMENT [UNPARTITIONED] | +| 01:AGGREGATE [FINALIZE] | +| | group by: year | +| | hosts=1 per-host-mem=unavailable | +| | tuple-ids=1 row-size=4B cardinality=3 | +| | | +| 00:UNION | +| constant-operands=3 | +| hosts=1 per-host-mem=unavailable | +| tuple-ids=0 row-size=4B cardinality=3 | ++-----------------------------------------------------+ + +-- The same optimization applies to other aggregation queries +-- that only return values based on partition key columns: +-- MIN, MAX, COUNT(DISTINCT), and so on. 
+explain select min(year) from t1; ++-----------------------------------------------------+ +| Explain String | ++-----------------------------------------------------+ +| Estimated Per-Host Requirements: Memory=0B VCores=0 | +| | +| F00:PLAN FRAGMENT [UNPARTITIONED] | +| 01:AGGREGATE [FINALIZE] | +| | output: min(year) | +| | hosts=1 per-host-mem=unavailable | +| | tuple-ids=1 row-size=4B cardinality=1 | +| | | +| 00:UNION | +| constant-operands=3 | +| hosts=1 per-host-mem=unavailable | +| tuple-ids=0 row-size=4B cardinality=3 | ++-----------------------------------------------------+ +</codeblock> + + </conbody> +</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3be0f122/docs/topics/impala_order_by.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_order_by.xml b/docs/topics/impala_order_by.xml new file mode 100644 index 0000000..2db141d --- /dev/null +++ b/docs/topics/impala_order_by.xml @@ -0,0 +1,318 @@ +<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept id="order_by"> + + <title>ORDER BY Clause</title> + <prolog> + <metadata> + <data name="Category" value="Impala"/> + <data name="Category" value="SQL"/> + <data name="Category" value="Querying"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> + </metadata> + </prolog> + + <conbody> + + <p> + The familiar <codeph>ORDER BY</codeph> clause of a <codeph>SELECT</codeph> statement sorts the result set + based on the values from one or more columns. + </p> + + <p> + For distributed queries, this is a relatively expensive operation, because the entire result set must be + produced and transferred to one node before the sorting can happen. This can require more memory capacity + than a query without <codeph>ORDER BY</codeph>. Even if the query takes approximately the same time to finish + with or without the <codeph>ORDER BY</codeph> clause, subjectively it can appear slower because no results + are available until all processing is finished, rather than results coming back gradually as rows matching + the <codeph>WHERE</codeph> clause are found. Therefore, if you only need the first N results from the sorted + result set, also include the <codeph>LIMIT</codeph> clause, which reduces network overhead and the memory + requirement on the coordinator node. + </p> + + <note> + <p rev="1.4.0 obwl"> + In Impala 1.4.0 and higher, the <codeph>LIMIT</codeph> clause is now optional (rather than required) for + queries that use the <codeph>ORDER BY</codeph> clause. Impala automatically uses a temporary disk work area + to perform the sort if the sort operation would otherwise exceed the Impala memory limit for a particular + DataNode. + </p> + </note> + + <p conref="../shared/impala_common.xml#common/syntax_blurb"/> + + <p> + The full syntax for the <codeph>ORDER BY</codeph> clause is: + </p> + +<codeblock rev="1.2.1">ORDER BY <varname>col_ref</varname> [, <varname>col_ref</varname> ...] [ASC | DESC] [NULLS FIRST | NULLS LAST] + +col_ref ::= <varname>column_name</varname> | <varname>integer_literal</varname> +</codeblock> + + <p> + Although the most common usage is <codeph>ORDER BY <varname>column_name</varname></codeph>, you can also + specify <codeph>ORDER BY 1</codeph> to sort by the first column of the result set, <codeph>ORDER BY + 2</codeph> to sort by the second column, and so on. The number must be a numeric literal, not some other kind + of constant expression. (If the argument is some other expression, even a <codeph>STRING</codeph> value, the + query succeeds but the order of results is undefined.) + </p> + + <p> + <codeph>ORDER BY <varname>column_number</varname></codeph> can only be used when the query explicitly lists + the columns in the <codeph>SELECT</codeph> list, not with <codeph>SELECT *</codeph> queries. + </p> + + <p> + <b>Ascending and descending sorts:</b> + </p> + + <p> + The default sort order (the same as using the <codeph>ASC</codeph> keyword) puts the smallest values at the + start of the result set, and the largest values at the end. 
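+ For example, using a hypothetical table and columns (not part of the other examples in this topic),
+ the following two queries are equivalent and both return the smallest salaries first:
+<codeblock>SELECT name, salary FROM employees ORDER BY salary;
+SELECT name, salary FROM employees ORDER BY 2;
+</codeblock>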
Specifying the <codeph>DESC</codeph> keyword + reverses that order. + </p> + + <p> + <b>Sort order for NULL values:</b> + </p> + + <p rev="1.2.1"> + See <xref href="impala_literals.xml#null"/> for details about how <codeph>NULL</codeph> values are positioned + in the sorted result set, and how to use the <codeph>NULLS FIRST</codeph> and <codeph>NULLS LAST</codeph> + clauses. (The sort position for <codeph>NULL</codeph> values in <codeph>ORDER BY ... DESC</codeph> queries is + changed in Impala 1.2.1 and higher to be more standards-compliant, and the <codeph>NULLS FIRST</codeph> and + <codeph>NULLS LAST</codeph> keywords are new in Impala 1.2.1.) + </p> + + <p rev="obwl" conref="../shared/impala_common.xml#common/order_by_limit"/> + + <!-- Good to show an example of cases where ORDER BY does and doesn't work with complex types. --> + <p conref="../shared/impala_common.xml#common/complex_types_blurb"/> + + <p rev="2.3.0"> + In <keyword keyref="impala23_full"/> and higher, the complex data types <codeph>STRUCT</codeph>, + <codeph>ARRAY</codeph>, and <codeph>MAP</codeph> are available. These columns cannot + be referenced directly in the <codeph>ORDER BY</codeph> clause. + When you query a complex type column, you use join notation to <q>unpack</q> the elements + of the complex type, and within the join query you can include an <codeph>ORDER BY</codeph> + clause to control the order in the result set of the scalar elements from the complex type. + See <xref href="impala_complex_types.xml#complex_types"/> for details about Impala support for complex types. + </p> + + <p> + The following query shows how a complex type column cannot be directly used in an <codeph>ORDER BY</codeph> clause: + </p> + +<codeblock>CREATE TABLE games (id BIGINT, score ARRAY <BIGINT>) STORED AS PARQUET; +...use LOAD DATA to load externally created Parquet files into the table... +SELECT id FROM games ORDER BY score DESC; +ERROR: AnalysisException: ORDER BY expression 'score' with complex type 'ARRAY<BIGINT>' is not supported. +</codeblock> + + <p conref="../shared/impala_common.xml#common/example_blurb"/> + + <p> + The following query retrieves the user ID and score, only for scores greater than one million, + with the highest scores for each user listed first. + Because the individual array elements are now represented as separate rows in the result set, + they can be used in the <codeph>ORDER BY</codeph> clause, referenced using the <codeph>ITEM</codeph> + pseudocolumn that represents each array element. + </p> + +<codeblock>SELECT id, item FROM games, games.score + WHERE item > 1000000 +ORDER BY id, item desc; +</codeblock> + + <p> + The following queries use similar <codeph>ORDER BY</codeph> techniques with variations of the <codeph>GAMES</codeph> + table, where the complex type is an <codeph>ARRAY</codeph> containing <codeph>STRUCT</codeph> or <codeph>MAP</codeph> + elements to represent additional details about each game that was played. + For an array of structures, the fields of the structure are referenced as <codeph>ITEM.<varname>field_name</varname></codeph>. + For an array of maps, the keys and values within each array element are referenced as <codeph>ITEM.KEY</codeph> + and <codeph>ITEM.VALUE</codeph>. + </p> + +<codeblock>CREATE TABLE games2 (id BIGINT, play array < struct <game_name: string, score: BIGINT, high_score: boolean> >) STORED AS PARQUET +...use LOAD DATA to load externally created Parquet files into the table... 
+SELECT id, item.game_name, item.score FROM games2, games2.play + WHERE item.score > 1000000 +ORDER BY id, item.score DESC; + +CREATE TABLE games3 (id BIGINT, play ARRAY < MAP <STRING, BIGINT> >) STORED AS PARQUET; +...use LOAD DATA to load externally created Parquet files into the table... +SELECT id, info.key AS k, info.value AS v from games3, games3.play AS plays, games3.play.item AS info + WHERE info.KEY = 'score' AND info.VALUE > 1000000 +ORDER BY id, info.value desc; +</codeblock> + + <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/> + + <p> + Although the <codeph>LIMIT</codeph> clause is now optional on <codeph>ORDER BY</codeph> queries, if your + query only needs some number of rows that you can predict in advance, use the <codeph>LIMIT</codeph> clause + to reduce unnecessary processing. For example, if the query has a clause <codeph>LIMIT 10</codeph>, each data + node sorts its portion of the relevant result set and only returns 10 rows to the coordinator node. The + coordinator node picks the 10 highest or lowest row values out of this small intermediate result set. + </p> + + <p> + If an <codeph>ORDER BY</codeph> clause is applied to an early phase of query processing, such as a subquery + or a view definition, Impala ignores the <codeph>ORDER BY</codeph> clause. To get ordered results from a + subquery or view, apply an <codeph>ORDER BY</codeph> clause to the outermost or final <codeph>SELECT</codeph> + level. + </p> + + <p> + <codeph>ORDER BY</codeph> is often used in combination with <codeph>LIMIT</codeph> to perform <q>top-N</q> + queries: + </p> + +<codeblock>SELECT user_id AS "Top 10 Visitors", SUM(page_views) FROM web_stats + GROUP BY page_views, user_id + ORDER BY SUM(page_views) DESC LIMIT 10; +</codeblock> + + <p> + <codeph>ORDER BY</codeph> is sometimes used in combination with <codeph>OFFSET</codeph> and + <codeph>LIMIT</codeph> to paginate query results, although it is relatively inefficient to issue multiple + queries like this against the large tables typically used with Impala: + </p> + +<codeblock>SELECT page_title AS "Page 1 of search results", page_url FROM search_content + WHERE LOWER(page_title) LIKE '%game%' + ORDER BY page_title LIMIT 10 OFFSET 0; +SELECT page_title AS "Page 2 of search results", page_url FROM search_content + WHERE LOWER(page_title) LIKE '%game%' + ORDER BY page_title LIMIT 10 OFFSET 10; +SELECT page_title AS "Page 3 of search results", page_url FROM search_content + WHERE LOWER(page_title) LIKE '%game%' + ORDER BY page_title LIMIT 10 OFFSET 20; +</codeblock> + + <p conref="../shared/impala_common.xml#common/internals_blurb"/> + + <p> + Impala sorts the intermediate results of an <codeph>ORDER BY</codeph> clause in memory whenever practical. In + a cluster of N DataNodes, each node sorts roughly 1/Nth of the result set, the exact proportion varying + depending on how the data matching the query is distributed in HDFS. + </p> + + <p> + If the size of the sorted intermediate result set on any DataNode would cause the query to exceed the Impala + memory limit, Impala sorts as much as practical in memory, then writes partially sorted data to disk. (This + technique is known in industry terminology as <q>external sorting</q> and <q>spilling to disk</q>.) As each + 8 MB batch of data is written to disk, Impala frees the corresponding memory to sort a new 8 MB batch of + data.
When all the data has been processed, a final merge sort operation is performed to correctly order the + in-memory and on-disk results as the result set is transmitted back to the coordinator node. When external + sorting becomes necessary, Impala requires approximately 60 MB of RAM at a minimum for the buffers needed to + read, write, and sort the intermediate results. If more RAM is available on the DataNode, Impala will use + the additional RAM to minimize the amount of disk I/O for sorting. + </p> + + <p> + This external sort technique is used as appropriate on each DataNode (possibly including the coordinator + node) to sort the portion of the result set that is processed on that node. When the sorted intermediate + results are sent back to the coordinator node to produce the final result set, the coordinator node uses a + merge sort technique to produce a final sorted result set without using any extra resources on the + coordinator node. + </p> + + <p rev="obwl"> + <b>Configuration for disk usage:</b> + </p> + + <p rev="obwl" conref="../shared/impala_common.xml#common/order_by_scratch_dir"/> + +<!-- Here is actually the more logical place to collect all those examples, move them from SELECT and cross-reference to here. --> + +<!-- <p rev="obwl" conref="../shared/impala_common.xml#common/restrictions_blurb"/> --> + + <p rev="obwl" conref="../shared/impala_common.xml#common/insert_sort_blurb"/> + + <p rev="obwl" conref="../shared/impala_common.xml#common/order_by_view_restriction"/> + + <p> + With the lifting of the requirement to include a <codeph>LIMIT</codeph> clause in every <codeph>ORDER + BY</codeph> query (in Impala 1.4 and higher): + </p> + + <ul> + <li> + <p> + Now the use of scratch disk space raises the possibility of an <q>out of disk space</q> error on a + particular DataNode, as opposed to the previous possibility of an <q>out of memory</q> error. Make sure + to keep at least 1 GB free on the filesystem used for temporary sorting work. + </p> + </li> + + <li> + <p> + The query options + <xref href="impala_default_order_by_limit.xml#default_order_by_limit">DEFAULT_ORDER_BY_LIMIT</xref> and + <xref href="impala_abort_on_default_limit_exceeded.xml#abort_on_default_limit_exceeded">ABORT_ON_DEFAULT_LIMIT_EXCEEDED</xref>, + which formerly controlled the behavior of <codeph>ORDER BY</codeph> queries with no limit specified, are + now ignored. + </p> + </li> + </ul> + + <p rev="obwl" conref="../shared/impala_common.xml#common/null_sorting_change"/> +<codeblock>[localhost:21000] > create table numbers (x int); +[localhost:21000] > insert into numbers values (1), (null), (2), (null), (3); +[localhost:21000] > select x from numbers order by x nulls first; ++------+ +| x | ++------+ +| NULL | +| NULL | +| 1 | +| 2 | +| 3 | ++------+ +[localhost:21000] > select x from numbers order by x desc nulls first; ++------+ +| x | ++------+ +| NULL | +| NULL | +| 3 | +| 2 | +| 1 | ++------+ +[localhost:21000] > select x from numbers order by x nulls last; ++------+ +| x | ++------+ +| 1 | +| 2 | +| 3 | +| NULL | +| NULL | ++------+ +[localhost:21000] > select x from numbers order by x desc nulls last; ++------+ +| x | ++------+ +| 3 | +| 2 | +| 1 | +| NULL | +| NULL | ++------+ +</codeblock> + + <p rev="obwl" conref="../shared/impala_common.xml#common/related_info"/> + + <p rev="obwl"> + See <xref href="impala_select.xml#select"/> for further examples of queries with the <codeph>ORDER + BY</codeph> clause. 
+ </p> + + <p> + Analytic functions use the <codeph>ORDER BY</codeph> clause in a different context to define the sequence in + which rows are analyzed. See <xref href="impala_analytic_functions.xml#analytic_functions"/> for details. + </p> + </conbody> +</concept> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3be0f122/docs/topics/impala_parquet.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_parquet.xml b/docs/topics/impala_parquet.xml new file mode 100644 index 0000000..b22fa84 --- /dev/null +++ b/docs/topics/impala_parquet.xml @@ -0,0 +1,1160 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept id="parquet"> + + <title>Using the Parquet File Format with Impala Tables</title> + <titlealts audience="PDF"><navtitle>Parquet Data Files</navtitle></titlealts> + <prolog> + <metadata> + <data name="Category" value="Impala"/> + <data name="Category" value="File Formats"/> + <data name="Category" value="Parquet"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> + <data name="Category" value="Tables"/> + <data name="Category" value="Schemas"/> + </metadata> + </prolog> + + <conbody> + + <p> + <indexterm audience="Cloudera">Parquet support in Impala</indexterm> + Impala helps you to create, manage, and query Parquet tables. Parquet is a column-oriented binary file format + intended to be highly efficient for the types of large-scale queries that Impala is best at. Parquet is + especially good for queries scanning particular columns within a table, for example to query <q>wide</q> + tables with many columns, or to perform aggregation operations such as <codeph>SUM()</codeph> and + <codeph>AVG()</codeph> that need to process most or all of the values from a column. Each data file contains + the values for a set of rows (the <q>row group</q>). Within a data file, the values from each column are + organized so that they are all adjacent, enabling good compression for the values from that column. Queries + against a Parquet table can retrieve and analyze these values from any column quickly and with minimal I/O. + </p> + + <table> + <title>Parquet Format Support in Impala</title> + <tgroup cols="5"> + <colspec colname="1" colwidth="10*"/> + <colspec colname="2" colwidth="10*"/> + <colspec colname="3" colwidth="20*"/> + <colspec colname="4" colwidth="30*"/> + <colspec colname="5" colwidth="30*"/> + <thead> + <row> + <entry> + File Type + </entry> + <entry> + Format + </entry> + <entry> + Compression Codecs + </entry> + <entry> + Impala Can CREATE? + </entry> + <entry> + Impala Can INSERT? + </entry> + </row> + </thead> + <tbody> + <row conref="impala_file_formats.xml#file_formats/parquet_support"> + <entry/> + </row> + </tbody> + </tgroup> + </table> + + <p outputclass="toc inpage"/> + + <p audience="integrated"> + For general information about using Parquet with other CDH components, + see <xref href="cdh_ig_parquet.xml#parquet_format"/>. 
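+ As a quick illustration of the access pattern Parquet is designed for (the table and column names below
+ are hypothetical), an aggregation query reads only the columns it references from each row group,
+ skipping the rest of a wide table:
+<codeblock>-- Only the REGION and SALES column data is read; the data for all other columns is skipped.
+SELECT region, SUM(sales) FROM wide_fact_table GROUP BY region;
+</codeblock>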
+ </p> + + </conbody> + + + <concept id="parquet_ddl"> + + <title>Creating Parquet Tables in Impala</title> + + <conbody> + + <p> + To create a table named <codeph>PARQUET_TABLE</codeph> that uses the Parquet format, you would use a + command like the following, substituting your own table name, column names, and data types: + </p> + +<codeblock>[impala-host:21000] > create table <varname>parquet_table_name</varname> (x INT, y STRING) STORED AS PARQUET;</codeblock> + +<!-- +<note> +Formerly, the <codeph>STORED AS</codeph> clause required the keyword <codeph>PARQUETFILE</codeph>. +In Impala 1.2.2 and higher, you can use <codeph>STORED AS PARQUET</codeph>. +This <codeph>PARQUET</codeph> keyword is recommended for new code. +</note> +--> + + <p> + Or, to clone the column names and data types of an existing table: + </p> + +<codeblock>[impala-host:21000] > create table <varname>parquet_table_name</varname> LIKE <varname>other_table_name</varname> STORED AS PARQUET;</codeblock> + + <p rev="1.4.0"> + In Impala 1.4.0 and higher, you can derive column definitions from a raw Parquet data file, even without an + existing Impala table. For example, you can create an external table pointing to an HDFS directory, and + base the column definitions on one of the files in that directory: + </p> + +<codeblock rev="1.4.0">CREATE EXTERNAL TABLE ingest_existing_files LIKE PARQUET '/user/etl/destination/datafile1.dat' + STORED AS PARQUET + LOCATION '/user/etl/destination'; +</codeblock> + + <p> + Or, you can refer to an existing data file and create a new empty table with suitable column definitions. + Then you can use <codeph>INSERT</codeph> to create new data files or <codeph>LOAD DATA</codeph> to transfer + existing data files into the new table. + </p> + +<codeblock rev="1.4.0">CREATE TABLE columns_from_data_file LIKE PARQUET '/user/etl/destination/datafile1.dat' + STORED AS PARQUET; +</codeblock> + + <p> + The default properties of the newly created table are the same as for any other <codeph>CREATE + TABLE</codeph> statement. For example, the default file format is text; if you want the new table to use + the Parquet file format, include the <codeph>STORED AS PARQUET</codeph> file also. + </p> + + <p> + In this example, the new table is partitioned by year, month, and day. These partition key columns are not + part of the data file, so you specify them in the <codeph>CREATE TABLE</codeph> statement: + </p> + +<codeblock rev="1.4.0">CREATE TABLE columns_from_data_file LIKE PARQUET '/user/etl/destination/datafile1.dat' + PARTITION (year INT, month TINYINT, day TINYINT) + STORED AS PARQUET; +</codeblock> + + <p rev="1.4.0"> + See <xref href="impala_create_table.xml#create_table"/> for more details about the <codeph>CREATE TABLE + LIKE PARQUET</codeph> syntax. + </p> + + <p> + Once you have created a table, to insert data into that table, use a command similar to the following, + again with your own table names: + </p> + + <!-- To do: + Opportunity for another example showing CTAS technique. + --> + +<codeblock>[impala-host:21000] > insert overwrite table <varname>parquet_table_name</varname> select * from <varname>other_table_name</varname>;</codeblock> + + <p> + If the Parquet table has a different number of columns or different column names than the other table, + specify the names of columns from the other table rather than <codeph>*</codeph> in the + <codeph>SELECT</codeph> statement. 
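+ For example, a minimal sketch where <varname>col1</varname> and <varname>col2</varname> stand for whichever
+ columns of the source table correspond to the columns of the Parquet table:
+<codeblock>[impala-host:21000] > insert overwrite table <varname>parquet_table_name</varname> select <varname>col1</varname>, <varname>col2</varname> from <varname>other_table_name</varname>;</codeblock>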
+ </p> + + </conbody> + + </concept> + + <concept id="parquet_etl"> + + <title>Loading Data into Parquet Tables</title> + <prolog> + <metadata> + <data name="Category" value="ETL"/> + </metadata> + </prolog> + + <conbody> + + <p> + Choose from the following techniques for loading data into Parquet tables, depending on whether the + original data is already in an Impala table, or exists as raw data files outside Impala. + </p> + + <p> + If you already have data in an Impala or Hive table, perhaps in a different file format or partitioning + scheme, you can transfer the data to a Parquet table using the Impala <codeph>INSERT...SELECT</codeph> + syntax. You can convert, filter, repartition, and do other things to the data as part of this same + <codeph>INSERT</codeph> statement. See <xref href="#parquet_compression"/> for some examples showing how to + insert data into Parquet tables. + </p> + + <p conref="../shared/impala_common.xml#common/insert_hints"/> + + <p conref="../shared/impala_common.xml#common/insert_parquet_blocksize"/> + + <draft-comment translate="no"> +Add an example here. +</draft-comment> + + <p> + Avoid the <codeph>INSERT...VALUES</codeph> syntax for Parquet tables, because + <codeph>INSERT...VALUES</codeph> produces a separate tiny data file for each + <codeph>INSERT...VALUES</codeph> statement, and the strength of Parquet is in its handling of data + (compressing, parallelizing, and so on) in <ph rev="parquet_block_size">large</ph> chunks. + </p> + + <p> + If you have one or more Parquet data files produced outside of Impala, you can quickly make the data + queryable through Impala by one of the following methods: + </p> + + <ul> + <li> + The <codeph>LOAD DATA</codeph> statement moves a single data file or a directory full of data files into + the data directory for an Impala table. It does no validation or conversion of the data. The original + data files must be somewhere in HDFS, not the local filesystem. + <draft-comment translate="no"> +Add an example here. +</draft-comment> + </li> + + <li> + The <codeph>CREATE TABLE</codeph> statement with the <codeph>LOCATION</codeph> clause creates a table + where the data continues to reside outside the Impala data directory. The original data files must be + somewhere in HDFS, not the local filesystem. For extra safety, if the data is intended to be long-lived + and reused by other applications, you can use the <codeph>CREATE EXTERNAL TABLE</codeph> syntax so that + the data files are not deleted by an Impala <codeph>DROP TABLE</codeph> statement. + <draft-comment translate="no"> +Add an example here. +</draft-comment> + </li> + + <li> + If the Parquet table already exists, you can copy Parquet data files directly into it, then use the + <codeph>REFRESH</codeph> statement to make Impala recognize the newly added data. Remember to preserve + the block size of the Parquet data files by using the <codeph>hadoop distcp -pb</codeph> command rather + than a <codeph>-put</codeph> or <codeph>-cp</codeph> operation on the Parquet files. See + <xref href="#parquet_compression_multiple"/> for an example of this kind of operation. + </li> + </ul> + + <note conref="../shared/impala_common.xml#common/restrictions_nonimpala_parquet"/> + + <p> + Recent versions of Sqoop can produce Parquet output files using the <codeph>--as-parquetfile</codeph> + option. 
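+ A sketch of such a Sqoop invocation follows; the connection string, user name, table name, and target
+ directory are placeholders, and the available options depend on your Sqoop version and source database:
+<codeblock>$ sqoop import --connect jdbc:mysql://<varname>db_host</varname>/<varname>source_db</varname> \
+    --username <varname>etl_user</varname> --table <varname>source_table</varname> \
+    --as-parquetfile --target-dir /user/etl/destination
+</codeblock>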
+ </p> + + <p conref="../shared/impala_common.xml#common/sqoop_timestamp_caveat"/> + + <p> + If the data exists outside Impala and is in some other format, combine both of the preceding techniques. + First, use a <codeph>LOAD DATA</codeph> or <codeph>CREATE EXTERNAL TABLE ... LOCATION</codeph> statement to + bring the data into an Impala table that uses the appropriate file format. Then, use an + <codeph>INSERT...SELECT</codeph> statement to copy the data to the Parquet table, converting to Parquet + format as part of the process. + </p> + + <draft-comment translate="no"> +Add an example here. +</draft-comment> + + <p> + Loading data into Parquet tables is a memory-intensive operation, because the incoming data is buffered + until it reaches <ph rev="parquet_block_size">one data block</ph> in size, then that chunk of data is + organized and compressed in memory before being written out. The memory consumption can be larger when + inserting data into partitioned Parquet tables, because a separate data file is written for each + combination of partition key column values, potentially requiring several + <ph rev="parquet_block_size">large</ph> chunks to be manipulated in memory at once. + </p> + + <p> + When inserting into a partitioned Parquet table, Impala redistributes the data among the nodes to reduce + memory consumption. You might still need to temporarily increase the memory dedicated to Impala during the + insert operation, or break up the load operation into several <codeph>INSERT</codeph> statements, or both. + </p> + + <note> + All the preceding techniques assume that the data you are loading matches the structure of the destination + table, including column order, column names, and partition layout. To transform or reorganize the data, + start by loading the data into a Parquet table that matches the underlying structure of the data, then use + one of the table-copying techniques such as <codeph>CREATE TABLE AS SELECT</codeph> or <codeph>INSERT ... + SELECT</codeph> to reorder or rename columns, divide the data among multiple partitions, and so on. For + example to take a single comprehensive Parquet data file and load it into a partitioned table, you would + use an <codeph>INSERT ... SELECT</codeph> statement with dynamic partitioning to let Impala create separate + data files with the appropriate partition values; for an example, see + <xref href="impala_insert.xml#insert"/>. + </note> + + </conbody> + + </concept> + + <concept id="parquet_performance"> + + <title>Query Performance for Impala Parquet Tables</title> + <prolog> + <metadata> + <data name="Category" value="Performance"/> + </metadata> + </prolog> + + <conbody> + + <p> + Query performance for Parquet tables depends on the number of columns needed to process the + <codeph>SELECT</codeph> list and <codeph>WHERE</codeph> clauses of the query, the way data is divided into + <ph rev="parquet_block_size">large data files with block size equal to file size</ph>, the reduction in I/O + by reading the data for each column in compressed format, which data files can be skipped (for partitioned + tables), and the CPU overhead of decompressing the data for each column. + </p> + + <p> + For example, the following is an efficient query for a Parquet table: +<codeblock>select avg(income) from census_data where state = 'CA';</codeblock> + The query processes only 2 columns out of a large number of total columns. 
If the table is partitioned by + the <codeph>STATE</codeph> column, it is even more efficient because the query only has to read and decode + 1 column from each data file, and it can read only the data files in the partition directory for the state + <codeph>'CA'</codeph>, skipping the data files for all the other states, which will be physically located + in other directories. + </p> + + <p> + The following is a relatively inefficient query for a Parquet table: +<codeblock>select * from census_data;</codeblock> + Impala would have to read the entire contents of each <ph rev="parquet_block_size">large</ph> data file, + and decompress the contents of each column for each row group, negating the I/O optimizations of the + column-oriented format. This query might still be faster for a Parquet table than a table with some other + file format, but it does not take advantage of the unique strengths of Parquet data files. + </p> + + <p> + Impala can optimize queries on Parquet tables, especially join queries, better when statistics are + available for all the tables. Issue the <codeph>COMPUTE STATS</codeph> statement for each table after + substantial amounts of data are loaded into or appended to it. See + <xref href="impala_compute_stats.xml#compute_stats"/> for details. + </p> + + <p rev="2.5.0"> + The runtime filtering feature, available in <keyword keyref="impala25_full"/> and higher, works best with Parquet tables. + The per-row filtering aspect only applies to Parquet tables. + See <xref href="impala_runtime_filtering.xml#runtime_filtering"/> for details. + </p> + + <p conref="../shared/impala_common.xml#common/s3_block_splitting"/> + + </conbody> + + <concept id="parquet_partitioning"> + + <title>Partitioning for Parquet Tables</title> + + <conbody> + + <p> + As explained in <xref href="impala_partitioning.xml#partitioning"/>, partitioning is an important + performance technique for Impala generally. This section explains some of the performance considerations + for partitioned Parquet tables. + </p> + + <p> + The Parquet file format is ideal for tables containing many columns, where most queries only refer to a + small subset of the columns. As explained in <xref href="#parquet_data_files"/>, the physical layout of + Parquet data files lets Impala read only a small fraction of the data for many queries. The performance + benefits of this approach are amplified when you use Parquet tables in combination with partitioning. + Impala can skip the data files for certain partitions entirely, based on the comparisons in the + <codeph>WHERE</codeph> clause that refer to the partition key columns. For example, queries on + partitioned tables often analyze data for time intervals based on columns such as <codeph>YEAR</codeph>, + <codeph>MONTH</codeph>, and/or <codeph>DAY</codeph>, or for geographic regions. Remember that Parquet + data files use a <ph rev="parquet_block_size">large</ph> block size, so when deciding how finely to + partition the data, try to find a granularity where each partition contains + <ph rev="parquet_block_size">256 MB</ph> or more of data, rather than creating a large number of smaller + files split among many partitions. + </p> + + <p> + Inserting into a partitioned Parquet table can be a resource-intensive operation, because each Impala + node could potentially be writing a separate data file to HDFS for each combination of different values + for the partition key columns. The large number of simultaneous open files could exceed the HDFS + <q>transceivers</q> limit. 
To avoid exceeding this limit, consider the following techniques: + </p> + + <ul> + <li> + Load different subsets of data using separate <codeph>INSERT</codeph> statements with specific values + for the <codeph>PARTITION</codeph> clause, such as <codeph>PARTITION (year=2010)</codeph>. + </li> + + <li> + Increase the <q>transceivers</q> value for HDFS, sometimes spelled <q>xcievers</q> (sic). The property + value in the <filepath>hdfs-site.xml</filepath> configuration file is +<!-- Old name, now deprecated: <codeph>dfs.datanode.max.xcievers</codeph>. --> + <codeph>dfs.datanode.max.transfer.threads</codeph>. For example, if you were loading 12 years of data + partitioned by year, month, and day, even a value of 4096 might not be high enough. This + <xref href="http://blog.cloudera.com/blog/2012/03/hbase-hadoop-xceivers/" scope="external" format="html">blog + post</xref> explores the considerations for setting this value higher or lower, using HBase examples + for illustration. + </li> + + <li> + Use the <codeph>COMPUTE STATS</codeph> statement to collect + <xref href="impala_perf_stats.xml#perf_column_stats">column statistics</xref> on the source table from + which data is being copied, so that the Impala query can estimate the number of different values in the + partition key columns and distribute the work accordingly. + </li> + </ul> + + </conbody> + + </concept> + + </concept> + + <concept id="parquet_compression"> + + <title>Snappy and GZip Compression for Parquet Data Files</title> + <prolog> + <metadata> + <data name="Category" value="Snappy"/> + <data name="Category" value="Gzip"/> + <data name="Category" value="Compression"/> + </metadata> + </prolog> + + <conbody> + + <p> + <indexterm audience="Cloudera">COMPRESSION_CODEC query option</indexterm> + When Impala writes Parquet data files using the <codeph>INSERT</codeph> statement, the underlying + compression is controlled by the <codeph>COMPRESSION_CODEC</codeph> query option. (Prior to Impala 2.0, the + query option name was <codeph>PARQUET_COMPRESSION_CODEC</codeph>.) The allowed values for this query option + are <codeph>snappy</codeph> (the default), <codeph>gzip</codeph>, and <codeph>none</codeph>. The option + value is not case-sensitive. If the option is set to an unrecognized value, all kinds of queries will fail + due to the invalid option setting, not just queries involving Parquet tables. + </p> + + </conbody> + + <concept id="parquet_snappy"> + + <title>Example of Parquet Table with Snappy Compression</title> + + <conbody> + + <p> + <indexterm audience="Cloudera">compression</indexterm> + By default, the underlying data files for a Parquet table are compressed with Snappy. The combination of + fast compression and decompression makes it a good choice for many data sets. 
To ensure Snappy + compression is used, for example after experimenting with other compression codecs, set the + <codeph>COMPRESSION_CODEC</codeph> query option to <codeph>snappy</codeph> before inserting the data: + </p> + +<codeblock>[localhost:21000] > create database parquet_compression; +[localhost:21000] > use parquet_compression; +[localhost:21000] > create table parquet_snappy like raw_text_data; +[localhost:21000] > set COMPRESSION_CODEC=snappy; +[localhost:21000] > insert into parquet_snappy select * from raw_text_data; +Inserted 1000000000 rows in 181.98s +</codeblock> + + </conbody> + + </concept> + + <concept id="parquet_gzip"> + + <title>Example of Parquet Table with GZip Compression</title> + + <conbody> + + <p> + If you need more intensive compression (at the expense of more CPU cycles for uncompressing during + queries), set the <codeph>COMPRESSION_CODEC</codeph> query option to <codeph>gzip</codeph> before + inserting the data: + </p> + +<codeblock>[localhost:21000] > create table parquet_gzip like raw_text_data; +[localhost:21000] > set COMPRESSION_CODEC=gzip; +[localhost:21000] > insert into parquet_gzip select * from raw_text_data; +Inserted 1000000000 rows in 1418.24s +</codeblock> + + </conbody> + + </concept> + + <concept id="parquet_none"> + + <title>Example of Uncompressed Parquet Table</title> + + <conbody> + + <p> + If your data compresses very poorly, or you want to avoid the CPU overhead of compression and + decompression entirely, set the <codeph>COMPRESSION_CODEC</codeph> query option to <codeph>none</codeph> + before inserting the data: + </p> + +<codeblock>[localhost:21000] > create table parquet_none like raw_text_data; +[localhost:21000] > set COMPRESSION_CODEC=none; +[localhost:21000] > insert into parquet_none select * from raw_text_data; +Inserted 1000000000 rows in 146.90s +</codeblock> + + </conbody> + + </concept> + + <concept id="parquet_compression_examples"> + + <title>Examples of Sizes and Speeds for Compressed Parquet Tables</title> + + <conbody> + + <p> + Here are some examples showing differences in data sizes and query speeds for 1 billion rows of synthetic + data, compressed with each kind of codec. As always, run similar tests with realistic data sets of your + own. The actual compression ratios, and relative insert and query speeds, will vary depending on the + characteristics of the actual data. + </p> + + <p> + In this case, switching from Snappy to GZip compression shrinks the data by an additional 40% or so, + while switching from Snappy compression to no compression expands the data also by about 40%: + </p> + +<codeblock>$ hdfs dfs -du -h /user/hive/warehouse/parquet_compression.db +23.1 G /user/hive/warehouse/parquet_compression.db/parquet_snappy +13.5 G /user/hive/warehouse/parquet_compression.db/parquet_gzip +32.8 G /user/hive/warehouse/parquet_compression.db/parquet_none +</codeblock> + + <p> + Because Parquet data files are typically <ph rev="parquet_block_size">large</ph>, each directory will + have a different number of data files and the row groups will be arranged differently. + </p> + + <p> + At the same time, the less agressive the compression, the faster the data can be decompressed. In this + case using a table with a billion rows, a query that evaluates all the values for a particular column + runs faster with no compression than with Snappy compression, and faster with Snappy compression than + with Gzip compression. 
Query performance depends on several other factors, so as always, run your own + benchmarks with your own data to determine the ideal tradeoff between data size, CPU efficiency, and + speed of insert and query operations. + </p> + +<codeblock>[localhost:21000] > desc parquet_snappy; +Query finished, fetching results ... ++-----------+---------+---------+ +| name | type | comment | ++-----------+---------+---------+ +| id | int | | +| val | int | | +| zfill | string | | +| name | string | | +| assertion | boolean | | ++-----------+---------+---------+ +Returned 5 row(s) in 0.14s +[localhost:21000] > select avg(val) from parquet_snappy; +Query finished, fetching results ... ++-----------------+ +| _c0 | ++-----------------+ +| 250000.93577915 | ++-----------------+ +Returned 1 row(s) in 4.29s +[localhost:21000] > select avg(val) from parquet_gzip; +Query finished, fetching results ... ++-----------------+ +| _c0 | ++-----------------+ +| 250000.93577915 | ++-----------------+ +Returned 1 row(s) in 6.97s +[localhost:21000] > select avg(val) from parquet_none; +Query finished, fetching results ... ++-----------------+ +| _c0 | ++-----------------+ +| 250000.93577915 | ++-----------------+ +Returned 1 row(s) in 3.67s +</codeblock> + + </conbody> + + </concept> + + <concept id="parquet_compression_multiple"> + + <title>Example of Copying Parquet Data Files</title> + + <conbody> + + <p> + Here is a final example, to illustrate how the data files using the various compression codecs are all + compatible with each other for read operations. The metadata about the compression format is written into + each data file, and can be decoded during queries regardless of the <codeph>COMPRESSION_CODEC</codeph> + setting in effect at the time. In this example, we copy data files from the + <codeph>PARQUET_SNAPPY</codeph>, <codeph>PARQUET_GZIP</codeph>, and <codeph>PARQUET_NONE</codeph> tables + used in the previous examples, each containing 1 billion rows, all to the data directory of a new table + <codeph>PARQUET_EVERYTHING</codeph>. A couple of sample queries demonstrate that the new table now + contains 3 billion rows featuring a variety of compression codecs for the data files. + </p> + + <p> + First, we create the table in Impala so that there is a destination directory in HDFS to put the data + files: + </p> + +<codeblock>[localhost:21000] > create table parquet_everything like parquet_snappy; +Query: create table parquet_everything like parquet_snappy +</codeblock> + + <p> + Then in the shell, we copy the relevant data files into the data directory for this new table. Rather + than using <codeph>hdfs dfs -cp</codeph> as with typical files, we use <codeph>hadoop distcp -pb</codeph> + to ensure that the special <ph rev="parquet_block_size"> block size</ph> of the Parquet data files is + preserved. + </p> + +<codeblock>$ hadoop distcp -pb /user/hive/warehouse/parquet_compression.db/parquet_snappy \ + /user/hive/warehouse/parquet_compression.db/parquet_everything +...<varname>MapReduce output</varname>... +$ hadoop distcp -pb /user/hive/warehouse/parquet_compression.db/parquet_gzip \ + /user/hive/warehouse/parquet_compression.db/parquet_everything +...<varname>MapReduce output</varname>... +$ hadoop distcp -pb /user/hive/warehouse/parquet_compression.db/parquet_none \ + /user/hive/warehouse/parquet_compression.db/parquet_everything +...<varname>MapReduce output</varname>... 
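+
+# Optional check (using the same warehouse path as above): confirm that the copied
+# files kept their large block size, as described later in this topic.
+$ hdfs fsck -blocks /user/hive/warehouse/parquet_compression.db/parquet_everything
+...<varname>fsck output, including the average block size</varname>...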
+</codeblock> + + <p> + Back in the <cmdname>impala-shell</cmdname> interpreter, we use the <codeph>REFRESH</codeph> statement to + alert the Impala server to the new data files for this table, then we can run queries demonstrating that + the data files represent 3 billion rows, and the values for one of the numeric columns match what was in + the original smaller tables: + </p> + +<codeblock>[localhost:21000] > refresh parquet_everything; +Query finished, fetching results ... + +Returned 0 row(s) in 0.32s +[localhost:21000] > select count(*) from parquet_everything; +Query finished, fetching results ... ++------------+ +| _c0 | ++------------+ +| 3000000000 | ++------------+ +Returned 1 row(s) in 8.18s +[localhost:21000] > select avg(val) from parquet_everything; +Query finished, fetching results ... ++-----------------+ +| _c0 | ++-----------------+ +| 250000.93577915 | ++-----------------+ +Returned 1 row(s) in 13.35s +</codeblock> + + </conbody> + + </concept> + + </concept> + + <concept rev="2.3.0" id="parquet_complex_types"> + + <title>Parquet Tables for Impala Complex Types</title> + + <conbody> + + <p> + In <keyword keyref="impala23_full"/> and higher, Impala supports the complex types + <codeph>ARRAY</codeph>, <codeph>STRUCT</codeph>, and <codeph>MAP</codeph> + See <xref href="impala_complex_types.xml#complex_types"/> for details. + Because these data types are currently supported only for the Parquet file format, + if you plan to use them, become familiar with the performance and storage aspects + of Parquet first. + </p> + + </conbody> + + </concept> + + <concept id="parquet_interop"> + + <title>Exchanging Parquet Data Files with Other Hadoop Components</title> + <prolog> + <metadata> + <data name="Category" value="Hadoop"/> + </metadata> + </prolog> + + <conbody> + + <p> + Starting in CDH 4.5, you can read and write Parquet data files from other CDH components. + <ph audience="integrated">See <xref href="cdh_ig_parquet.xml#parquet_format"/> for details.</ph> + </p> + +<!-- These couple of paragraphs reused in the release notes 'incompatible changes' section. --> + +<!-- But conbodydiv tag too restrictive, can't have just paragraphs and codeblocks inside. --> + +<!-- So I will physically copy the info for the time being. --> + +<!-- <conbodydiv id="upgrade_parquet_metadata"> --> + + <p> + Previously, it was not possible to create Parquet data through Impala and reuse that table within Hive. Now + that Parquet support is available for Hive in CDH 4.5, reusing existing Impala Parquet data files in Hive + requires updating the table metadata. Use the following command if you are already running Impala 1.1.1 or + higher: + </p> + +<codeblock>ALTER TABLE <varname>table_name</varname> SET FILEFORMAT PARQUET; +</codeblock> + + <p> + If you are running a level of Impala that is older than 1.1.1, do the metadata update through Hive: + </p> + +<codeblock>ALTER TABLE <varname>table_name</varname> SET SERDE 'parquet.hive.serde.ParquetHiveSerDe'; +ALTER TABLE <varname>table_name</varname> SET FILEFORMAT + INPUTFORMAT "parquet.hive.DeprecatedParquetInputFormat" + OUTPUTFORMAT "parquet.hive.DeprecatedParquetOutputFormat"; +</codeblock> + + <p> + Impala 1.1.1 and higher can reuse Parquet data files created by Hive, without any action required. + </p> + +<!-- </conbodydiv> --> + + <p rev="2.2.0"> + Impala supports the scalar data types that you can encode in a Parquet data file, but not composite or + nested types such as maps or arrays. 
In <keyword keyref="impala22_full"/> and higher, Impala can query Parquet data + files that include composite or nested types, as long as the query only refers to columns with scalar + types. +<!-- TK: could include an example here, but would require setup in Hive or Pig or something. --> + </p> + + <p> + If you copy Parquet data files between nodes, or even between different directories on the same node, make + sure to preserve the block size by using the command <codeph>hadoop distcp -pb</codeph>. To verify that the + block size was preserved, issue the command <codeph>hdfs fsck -blocks + <varname>HDFS_path_of_impala_table_dir</varname></codeph> and check that the average block size is at or + near <ph rev="parquet_block_size">256 MB (or whatever other size is defined by the + <codeph>PARQUET_FILE_SIZE</codeph> query option).</ph>. (The <codeph>hadoop distcp</codeph> operation + typically leaves some directories behind, with names matching <filepath>_distcp_logs_*</filepath>, that you + can delete from the destination directory afterward.) +<!-- The Apache wiki page keeps disappearing, even though Google still points to it as of Nov. 11/2014. --> +<!-- Now there is a 'distcp2' guide: http://hadoop.apache.org/docs/r1.2.1/distcp2.html but I haven't tried that so let's play it safe for now and hide the link. --> +<!-- See the <xref href="http://hadoop.apache.org/docs/r0.19.0/distcp.html" scope="external" format="html">Hadoop DistCP Guide</xref> for details. --> + Issue the command <cmdname>hadoop distcp</cmdname> for details about <cmdname>distcp</cmdname> command + syntax. + </p> + +<!-- Sample commands/output for when the 'distcp' business is expanded into a tutorial later. Part of + a1730.halxg.cloudera.com:/home/jrussell/jdr/mixed_format_partitions.log. +<codeblock>$ hdfs fsck -blocks /user/impala/warehouse/parquet_compression.db/parquet_everything +Connecting to namenode via http://a1730.halxg.cloudera.com:50070 +FSCK started by jrussell (auth:SIMPLE) from /10.20.198.130 for path /user/impala/warehouse/parquet_compression.db/parquet_everything at Fri Aug 23 11:35:37 PDT 2013 +............................................................................Status: HEALTHY + Total size: 74504481213 B + Total dirs: 1 + Total files: 76 + Total blocks (validated): 76 (avg. block size 980322121 B) + Minimally replicated blocks: 76 (100.0 %) + Over-replicated blocks: 0 (0.0 %) + Under-replicated blocks: 0 (0.0 %) + Mis-replicated blocks: 0 (0.0 %) + Default replication factor: 3 + Average block replication: 3.0 + Corrupt blocks: 0 + Missing replicas: 0 (0.0 %) + Number of data-nodes: 4 + Number of racks: 1 +FSCK ended at Fri Aug 23 11:35:37 PDT 2013 in 8 milliseconds + + +The filesystem under path '/user/impala/warehouse/parquet_compression.db/parquet_everything' is HEALTHY +</codeblock> +--> + + <p conref="../shared/impala_common.xml#common/impala_parquet_encodings_caveat"/> + <p conref="../shared/impala_common.xml#common/parquet_tools_blurb"/> + + </conbody> + + </concept> + + <concept id="parquet_data_files"> + + <title>How Parquet Data Files Are Organized</title> + <prolog> + <metadata> + <data name="Category" value="Concepts"/> + </metadata> + </prolog> + + <conbody> + + <p> + Although Parquet is a column-oriented file format, do not expect to find one data file for each column. + Parquet keeps all the data for a row within the same data file, to ensure that the columns for a row are + always available on the same node for processing. 
What Parquet does is to set a large HDFS block size and a + matching maximum data file size, to ensure that I/O and network transfer requests apply to large batches of + data. + </p> + + <p> + Within that data file, the data for a set of rows is rearranged so that all the values from the first + column are organized in one contiguous block, then all the values from the second column, and so on. + Putting the values from the same column next to each other lets Impala use effective compression techniques + on the values in that column. + </p> + + <note> + <p> + Impala <codeph>INSERT</codeph> statements write Parquet data files using an HDFS block size + <ph rev="parquet_block_size">that matches the data file size</ph>, to ensure that each data file is + represented by a single HDFS block, and the entire file can be processed on a single node without + requiring any remote reads. + </p> + + <p> + If you create Parquet data files outside of Impala, such as through a MapReduce or Pig job, ensure that + the HDFS block size is greater than or equal to the file size, so that the <q>one file per block</q> + relationship is maintained. Set the <codeph>dfs.block.size</codeph> or the <codeph>dfs.blocksize</codeph> + property large enough that each file fits within a single HDFS block, even if that size is larger than + the normal HDFS block size. + </p> + + <p> + If the block size is reset to a lower value during a file copy, you will see lower performance for + queries involving those files, and the <codeph>PROFILE</codeph> statement will reveal that some I/O is + being done suboptimally, through remote reads. See + <xref href="impala_parquet.xml#parquet_compression_multiple"/> for an example showing how to preserve the + block size when copying Parquet data files. + </p> + </note> + + <p> + When Impala retrieves or tests the data for a particular column, it opens all the data files, but only + reads the portion of each file containing the values for that column. The column values are stored + consecutively, minimizing the I/O required to process the values within a single column. If other columns + are named in the <codeph>SELECT</codeph> list or <codeph>WHERE</codeph> clauses, the data for all columns + in the same row is available within that same data file. + </p> + + <p> + If an <codeph>INSERT</codeph> statement brings in less than <ph rev="parquet_block_size">one Parquet + block's worth</ph> of data, the resulting data file is smaller than ideal. Thus, if you do split up an ETL + job to use multiple <codeph>INSERT</codeph> statements, try to keep the volume of data for each + <codeph>INSERT</codeph> statement to approximately <ph rev="parquet_block_size">256 MB, or a multiple of + 256 MB</ph>. + </p> + + </conbody> + + <concept id="parquet_encoding"> + + <title>RLE and Dictionary Encoding for Parquet Data Files</title> + + <conbody> + + <p> + Parquet uses some automatic compression techniques, such as run-length encoding (RLE) and dictionary + encoding, based on analysis of the actual data values. Once the data values are encoded in a compact + form, the encoded data can optionally be further compressed using a compression algorithm. Parquet data + files created by Impala can use Snappy, GZip, or no compression; the Parquet spec also allows LZO + compression, but currently Impala does not support LZO-compressed Parquet files. 
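+ If you want to confirm which encodings and compression codec a particular data file actually uses,
+ a utility such as <cmdname>parquet-tools</cmdname> can print the file metadata. A rough sketch follows;
+ the file path is hypothetical, and the output layout varies by <cmdname>parquet-tools</cmdname> version:
+<codeblock>$ parquet-tools meta /user/hive/warehouse/parquet_compression.db/parquet_snappy/<varname>data_file</varname>.parq
+...<varname>per-column metadata, including the codec (for example SNAPPY) and encodings such as PLAIN_DICTIONARY and RLE</varname>...
+</codeblock>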
+ </p>
+
+ <p>
+ RLE and dictionary encoding are compression techniques that Impala applies automatically to groups of
+ Parquet data values, in addition to any Snappy or GZip compression applied to the entire data files.
+ These automatic optimizations can save you time and planning that are normally needed for a traditional
+ data warehouse. For example, dictionary encoding reduces the need to create numeric IDs as abbreviations
+ for longer string values.
+ </p>
+
+ <p>
+ Run-length encoding condenses sequences of repeated data values. For example, if many consecutive rows
+ all contain the same value for a country code, those repeating values can be represented by the value
+ followed by a count of how many times it appears consecutively.
+ </p>
+
+ <p>
+ Dictionary encoding takes the different values present in a column, and represents each one in compact
+ 2-byte form rather than the original value, which could be several bytes. (Additional compression is
+ applied to the compacted values, for extra space savings.) This type of encoding applies when the number
+ of different values for a column is less than 2**16 (65,536). It does not apply to columns of data type
+ <codeph>BOOLEAN</codeph>, which are already very short. <codeph>TIMESTAMP</codeph> columns sometimes have
+ a unique value for each row, in which case they can quickly exceed the 2**16 limit on distinct values.
+ The 2**16 limit on different values within a column is reset for each data file, so if several different
+ data files each contained 10,000 different city names, the city name column in each data file could still
+ be condensed using dictionary encoding.
+ </p>
+
+ </conbody>
+
+ </concept>
+
+ </concept>
+
+ <concept rev="1.4.0" id="parquet_compacting">
+
+ <title>Compacting Data Files for Parquet Tables</title>
+
+ <conbody>
+
+ <p>
+ If you reuse existing table structures or ETL processes for Parquet tables, you might encounter a <q>many
+ small files</q> situation, which is suboptimal for query efficiency. For example, statements like these
+ might produce inefficiently organized data files:
+ </p>
+
+<codeblock>-- In an N-node cluster, each node produces a data file
+-- for the INSERT operation. If you have less than
+-- N GB of data to copy, some files are likely to be
+-- much smaller than the <ph rev="parquet_block_size">default Parquet</ph> block size.
+insert into parquet_table select * from text_table;
+
+-- Even if this operation involves an overall large amount of data,
+-- when split up by year/month/day, each partition might only
+-- receive a small amount of data. Then the data files for
+-- the partition might be divided between the N nodes in the cluster.
+-- A multi-gigabyte copy operation might produce files of only
+-- a few MB each.
+insert into partitioned_parquet_table partition (year, month, day)
+  select year, month, day, url, referer, user_agent, http_code, response_time
+  from web_stats;
+</codeblock>
+
+ <p>
+ Here are techniques to help you produce large data files in Parquet <codeph>INSERT</codeph> operations, and
+ to compact existing too-small data files:
+ </p>
+
+ <ul>
+ <li>
+ <p>
+ When inserting into a partitioned Parquet table, use statically partitioned <codeph>INSERT</codeph>
+ statements where the partition key values are specified as constant values. Ideally, use a separate
+ <codeph>INSERT</codeph> statement for each partition.
+ </p>
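+ <p>
+ For example, a sketch of this technique, with hypothetical table and column names:
+ </p>
+<codeblock>-- Dynamic partitioning: one statement writes files for many partitions
+-- at once, often leaving each partition with several small files.
+insert into sales_parquet partition (year, month)
+  select amount, region, year, month from sales_staging;
+
+-- Static partitioning: each statement targets a single partition,
+-- concentrating that partition's data into fewer, larger files.
+insert into sales_parquet partition (year=2016, month=1)
+  select amount, region from sales_staging where year = 2016 and month = 1;
+insert into sales_parquet partition (year=2016, month=2)
+  select amount, region from sales_staging where year = 2016 and month = 2;
+</codeblock>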
+ </li>
+
+ <li>
+ <p conref="../shared/impala_common.xml#common/num_nodes_tip"/>
+ </li>
+
+ <li>
+ <p>
+ Be prepared to reduce the number of partition key columns from what you are used to with traditional
+ analytic database systems.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ Do not expect Impala-written Parquet files to fill up the entire Parquet block size. Impala estimates
+ on the conservative side when figuring out how much data to write to each Parquet file. Typically, the volume
+ of uncompressed data in memory is substantially reduced on disk by the compression and encoding
+ techniques in the Parquet file format.
+<!--
+ Impala reserves <ph rev="parquet_block_size">1 GB</ph> of memory to buffer the data before writing,
+ but the actual data file might be smaller, in the hundreds of megabytes.
+ -->
+ The final data file size varies depending on the compressibility of the data. Therefore, it is not an
+ indication of a problem if <ph rev="parquet_block_size">256 MB</ph> of text data is turned into 2
+ Parquet data files, each less than <ph rev="parquet_block_size">256 MB</ph>.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ If you accidentally end up with a table with many small data files, consider using one or more of the
+ preceding techniques and copying all the data into a new Parquet table, either through <codeph>CREATE
+ TABLE AS SELECT</codeph> or <codeph>INSERT ... SELECT</codeph> statements.
+ </p>
+
+ <p>
+ To avoid rewriting queries to change table names, you can adopt a convention of always running
+ important queries against a view. Changing the view definition immediately switches any subsequent
+ queries to use the new underlying tables:
+ </p>
+<codeblock>create view production_table as select * from table_with_many_small_files;
+-- CTAS or INSERT...SELECT all the data into a more efficient layout...
+alter view production_table as select * from table_with_few_big_files;
+select * from production_table where c1 = 100 and c2 < 50 and ...;
+</codeblock>
+ </li>
+ </ul>
+
+ </conbody>
+
+ </concept>
+
+ <concept rev="1.4.0" id="parquet_schema_evolution">
+
+ <title>Schema Evolution for Parquet Tables</title>
+
+ <conbody>
+
+ <p>
+ Schema evolution refers to using the statement <codeph>ALTER TABLE ... REPLACE COLUMNS</codeph> to change
+ the names, data types, or number of columns in a table. You can perform schema evolution for Parquet tables
+ as follows:
+ </p>
+
+ <ul>
+ <li>
+ <p>
+ The Impala <codeph>ALTER TABLE</codeph> statement never changes any data files in the tables. From the
+ Impala side, schema evolution involves interpreting the same data files in terms of a new table
+ definition. Some types of schema changes make sense and are represented correctly. Other types of
+ changes cannot be represented in a sensible way, and produce special result values or conversion errors
+ during queries.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ The <codeph>INSERT</codeph> statement always creates data using the latest table definition. You might
+ end up with data files with different numbers of columns or internal data representations if you do a
+ sequence of <codeph>INSERT</codeph> and <codeph>ALTER TABLE ... REPLACE COLUMNS</codeph> statements.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ If you use <codeph>ALTER TABLE ... REPLACE COLUMNS</codeph> to define additional columns at the end,
+ when the original data files are used in a query, these final columns are considered to be all
+ <codeph>NULL</codeph> values.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ If you use <codeph>ALTER TABLE ...
REPLACE COLUMNS</codeph> to define fewer columns than before, when + the original data files are used in a query, the unused columns still present in the data file are + ignored. + </p> + </li> + + <li> + <p> + Parquet represents the <codeph>TINYINT</codeph>, <codeph>SMALLINT</codeph>, and <codeph>INT</codeph> + types the same internally, all stored in 32-bit integers. + </p> + <ul> + <li> + That means it is easy to promote a <codeph>TINYINT</codeph> column to <codeph>SMALLINT</codeph> or + <codeph>INT</codeph>, or a <codeph>SMALLINT</codeph> column to <codeph>INT</codeph>. The numbers are + represented exactly the same in the data file, and the columns being promoted would not contain any + out-of-range values. + </li> + + <li> + <p> + If you change any of these column types to a smaller type, any values that are out-of-range for the + new type are returned incorrectly, typically as negative numbers. + </p> + </li> + + <li> + <p> + You cannot change a <codeph>TINYINT</codeph>, <codeph>SMALLINT</codeph>, or <codeph>INT</codeph> + column to <codeph>BIGINT</codeph>, or the other way around. Although the <codeph>ALTER + TABLE</codeph> succeeds, any attempt to query those columns results in conversion errors. + </p> + </li> + + <li> + <p> + Any other type conversion for columns produces a conversion error during queries. For example, + <codeph>INT</codeph> to <codeph>STRING</codeph>, <codeph>FLOAT</codeph> to <codeph>DOUBLE</codeph>, + <codeph>TIMESTAMP</codeph> to <codeph>STRING</codeph>, <codeph>DECIMAL(9,0)</codeph> to + <codeph>DECIMAL(5,2)</codeph>, and so on. + </p> + </li> + </ul> + </li> + </ul> + + <p rev="2.6.0 IMPALA-2835 CDH-33330"> + You might find that you have Parquet files where the columns do not line up in the same + order as in your Impala table. For example, you might have a Parquet file that was part of + a table with columns <codeph>C1,C2,C3,C4</codeph>, and now you want to reuse the same + Parquet file in a table with columns <codeph>C4,C2</codeph>. By default, Impala expects the + columns in the data file to appear in the same order as the columns defined for the table, + making it impractical to do some kinds of file reuse or schema evolution. In <keyword keyref="impala26_full"/> + and higher, the query option <codeph>PARQUET_FALLBACK_SCHEMA_RESOLUTION=name</codeph> lets Impala + resolve columns by name, and therefore handle out-of-order or extra columns in the data file. + For example: + +<codeblock conref="../shared/impala_common.xml#common/parquet_fallback_schema_resolution_example"/> + + See <xref href="impala_parquet_fallback_schema_resolution.xml#parquet_fallback_schema_resolution"/> + for more details. + </p> + + </conbody> + + </concept> + + <concept id="parquet_data_types"> + + <title>Data Type Considerations for Parquet Tables</title> + + <conbody> + + <p> + The Parquet format defines a set of data types whose names differ from the names of the corresponding + Impala data types. If you are preparing Parquet files using other Hadoop components such as Pig or + MapReduce, you might need to work with the type names defined by Parquet. The following figure lists the + Parquet-defined types and the equivalent types in Impala. 
+ </p> + + <p> + <b>Primitive types:</b> + </p> + +<codeblock>BINARY -> STRING +BOOLEAN -> BOOLEAN +DOUBLE -> DOUBLE +FLOAT -> FLOAT +INT32 -> INT +INT64 -> BIGINT +INT96 -> TIMESTAMP +</codeblock> + + <p> + <b>Logical types:</b> + </p> + +<codeblock>BINARY + OriginalType UTF8 -> STRING +BINARY + OriginalType DECIMAL -> DECIMAL +</codeblock> + + <p rev="2.3.0"> + <b>Complex types:</b> + </p> + + <p rev="2.3.0"> + For the complex types (<codeph>ARRAY</codeph>, <codeph>MAP</codeph>, and <codeph>STRUCT</codeph>) + available in <keyword keyref="impala23_full"/> and higher, Impala only supports queries + against those types in Parquet tables. + </p> + + </conbody> + + </concept> + +</concept> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3be0f122/docs/topics/impala_parquet_annotate_strings_utf8.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_parquet_annotate_strings_utf8.xml b/docs/topics/impala_parquet_annotate_strings_utf8.xml new file mode 100644 index 0000000..b603b2c --- /dev/null +++ b/docs/topics/impala_parquet_annotate_strings_utf8.xml @@ -0,0 +1,50 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept id="parquet_annotate_strings_utf8" rev="2.6.0 IMPALA-2069"> + + <title>PARQUET_ANNOTATE_STRINGS_UTF8 Query Option (<keyword keyref="impala26"/> or higher only)</title> + <prolog> + <metadata> + <data name="Category" value="Impala"/> + <data name="Category" value="Impala Query Options"/> + <data name="Category" value="Parquet"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> + </metadata> + </prolog> + + <conbody> + + <p rev="2.6.0 IMPALA-2069"> + <indexterm audience="Cloudera">PARQUET_ANNOTATE_STRINGS_UTF8 query option</indexterm> + Causes Impala <codeph>INSERT</codeph> and <codeph>CREATE TABLE AS SELECT</codeph> statements + to write Parquet files that use the UTF-8 annotation for <codeph>STRING</codeph> columns. + </p> + + <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/> + <p> + By default, Impala represents a <codeph>STRING</codeph> column in Parquet as an unannotated binary field. + </p> + <p> + Impala always uses the UTF-8 annotation when writing <codeph>CHAR</codeph> and <codeph>VARCHAR</codeph> + columns to Parquet files. An alternative to using the query option is to cast <codeph>STRING</codeph> + values to <codeph>VARCHAR</codeph>. + </p> + <p> + This option is to help make Impala-written data more interoperable with other data processing engines. + Impala itself currently does not support all operations on UTF-8 data. + Although data processed by Impala is typically represented in ASCII, it is valid to designate the + data as UTF-8 when storing on disk, because ASCII is a subset of UTF-8. 
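+ As a sketch (with hypothetical table and column names), either approach produces UTF-8-annotated
+ string data in the resulting Parquet files:
+<codeblock>-- Approach 1: enable the query option before writing the table.
+set PARQUET_ANNOTATE_STRINGS_UTF8=1;
+create table events_parquet stored as parquet as select * from events_text;
+
+-- Approach 2: cast STRING values to VARCHAR, which Impala always
+-- writes with the UTF-8 annotation.
+create table events_parquet2 stored as parquet as
+  select cast(event_name as varchar(256)) as event_name from events_text;
+</codeblock>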
+ </p> + <p conref="../shared/impala_common.xml#common/type_boolean"/> + <p conref="../shared/impala_common.xml#common/default_false_0"/> + + <p conref="../shared/impala_common.xml#common/added_in_260"/> + + <p conref="../shared/impala_common.xml#common/related_info"/> + <p> + <xref href="impala_parquet.xml#parquet"/> + </p> + + </conbody> +</concept> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3be0f122/docs/topics/impala_parquet_compression_codec.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_parquet_compression_codec.xml b/docs/topics/impala_parquet_compression_codec.xml new file mode 100644 index 0000000..7132727 --- /dev/null +++ b/docs/topics/impala_parquet_compression_codec.xml @@ -0,0 +1,26 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept id="parquet_compression_codec"> + + <title>PARQUET_COMPRESSION_CODEC Query Option</title> + <titlealts audience="PDF"><navtitle>PARQUET_COMPRESSION_CODEC</navtitle></titlealts> + <prolog> + <metadata> + <data name="Category" value="Impala"/> + <data name="Category" value="Parquet"/> + <data name="Category" value="File Formats"/> + <data name="Category" value="Impala Query Options"/> + <data name="Category" value="Deprecated Features"/> + <data name="Category" value="Compression"/> + </metadata> + </prolog> + + <conbody> + + <p> + <indexterm audience="Cloudera">PARQUET_COMPRESSION_CODEC query option</indexterm> + Deprecated. Use <codeph>COMPRESSION_CODEC</codeph> in Impala 2.0 and later. See + <xref href="impala_compression_codec.xml#compression_codec"/> for details. + </p> + </conbody> +</concept> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3be0f122/docs/topics/impala_parquet_fallback_schema_resolution.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_parquet_fallback_schema_resolution.xml b/docs/topics/impala_parquet_fallback_schema_resolution.xml new file mode 100644 index 0000000..e9c9327 --- /dev/null +++ b/docs/topics/impala_parquet_fallback_schema_resolution.xml @@ -0,0 +1,49 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept id="parquet_fallback_schema_resolution" rev="2.6.0 IMPALA-2835 CDH-33330"> + + <title>PARQUET_FALLBACK_SCHEMA_RESOLUTION Query Option (<keyword keyref="impala26"/> or higher only)</title> + <titlealts audience="PDF"><navtitle>PARQUET_FALLBACK_SCHEMA_RESOLUTION</navtitle></titlealts> + <prolog> + <metadata> + <data name="Category" value="Impala"/> + <data name="Category" value="Impala Query Options"/> + <data name="Category" value="Parquet"/> + <data name="Category" value="Schemas"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> + </metadata> + </prolog> + + <conbody> + + <p rev="2.6.0 IMPALA-2835 CDH-33330"> + <indexterm audience="Cloudera">PARQUET_FALLBACK_SCHEMA_RESOLUTION query option</indexterm> + Allows Impala to look up columns within Parquet files by column name, rather than column order, + when necessary. + </p> + + <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/> + <p> + By default, Impala looks up columns within a Parquet file based on + the order of columns in the table. + The <codeph>name</codeph> setting for this option enables behavior for Impala queries + similar to the Hive setting <codeph>parquet.column.index.access=false</codeph>. 
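+ For example, a minimal sketch of switching between the two resolution modes within a session:
+<codeblock>-- Default behavior: match columns in the data file by position.
+set PARQUET_FALLBACK_SCHEMA_RESOLUTION=position;
+
+-- Match columns by name, for data files with reordered or extra columns.
+set PARQUET_FALLBACK_SCHEMA_RESOLUTION=name;
+</codeblock>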
+ It also allows Impala to query Parquet files created by Hive with the + <codeph>parquet.column.index.access=false</codeph> setting in effect. + </p> + + <p> + <b>Type:</b> integer or string. + Allowed values are 0 or <codeph>position</codeph> (default), 1 or <codeph>name</codeph>. + </p> + + <p conref="../shared/impala_common.xml#common/added_in_260"/> + + <p conref="../shared/impala_common.xml#common/related_info"/> + <p> + <xref href="impala_parquet.xml#parquet_schema_evolution"/> + </p> + + </conbody> +</concept> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3be0f122/docs/topics/impala_parquet_file_size.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_parquet_file_size.xml b/docs/topics/impala_parquet_file_size.xml new file mode 100644 index 0000000..7019e93 --- /dev/null +++ b/docs/topics/impala_parquet_file_size.xml @@ -0,0 +1,86 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept rev="parquet_block_size" id="parquet_file_size"> + + <title>PARQUET_FILE_SIZE Query Option</title> + <titlealts audience="PDF"><navtitle>PARQUET_FILE_SIZE</navtitle></titlealts> + <prolog> + <metadata> + <data name="Category" value="Impala"/> + <data name="Category" value="Parquet"/> + <data name="Category" value="ETL"/> + <data name="Category" value="File Formats"/> + <data name="Category" value="Impala Query Options"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> + </metadata> + </prolog> + + <conbody> + + <p> + <indexterm audience="Cloudera">PARQUET_FILE_SIZE query option</indexterm> + Specifies the maximum size of each Parquet data file produced by Impala <codeph>INSERT</codeph> statements. + </p> + + <p conref="../shared/impala_common.xml#common/syntax_blurb"/> + + <p> + Specify the size in bytes, or with a trailing <codeph>m</codeph> or <codeph>g</codeph> character to indicate + megabytes or gigabytes. For example: + </p> + +<codeblock>-- 128 megabytes. +set PARQUET_FILE_SIZE=134217728 +INSERT OVERWRITE parquet_table SELECT * FROM text_table; + +-- 512 megabytes. +set PARQUET_FILE_SIZE=512m; +INSERT OVERWRITE parquet_table SELECT * FROM text_table; + +-- 1 gigabyte. +set PARQUET_FILE_SIZE=1g; +INSERT OVERWRITE parquet_table SELECT * FROM text_table; +</codeblock> + + <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/> + + <p> + With tables that are small or finely partitioned, the default Parquet block size (formerly 1 GB, now 256 MB + in Impala 2.0 and later) could be much larger than needed for each data file. For <codeph>INSERT</codeph> + operations into such tables, you can increase parallelism by specifying a smaller + <codeph>PARQUET_FILE_SIZE</codeph> value, resulting in more HDFS blocks that can be processed by different + nodes. +<!-- Reducing the file size also reduces the memory required to buffer each block before writing it to disk. --> + </p> + + <p> + <b>Type:</b> numeric, with optional unit specifier + </p> + + <note type="important"> + <p> + Currently, the maximum value for this setting is 1 gigabyte (<codeph>1g</codeph>). + Setting a value higher than 1 gigabyte could result in errors during + an <codeph>INSERT</codeph> operation. 
+ </p>
+ </note>
+
+ <p>
+ <b>Default:</b> 0 (produces files with a target size of 256 MB; files might be larger for very wide tables)
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/isilon_blurb"/>
+ <p conref="../shared/impala_common.xml#common/isilon_block_size_caveat"/>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ For information about the Parquet file format, and how the number and size of data files affect query
+ performance, see <xref href="impala_parquet.xml#parquet"/>.
+ </p>
+
+<!-- Examples actually folded into Syntax earlier. <p conref="../shared/impala_common.xml#common/example_blurb"/> -->
+
+ </conbody>
+</concept>