http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3c2c8f12/docs/topics/impala_query_options.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_query_options.xml b/docs/topics/impala_query_options.xml index 1011746..7ed6ac2 100644 --- a/docs/topics/impala_query_options.xml +++ b/docs/topics/impala_query_options.xml @@ -38,30 +38,32 @@ </p> <note rev="2.0.0"> - In Impala 2.0 and later, you can set query options directly through the JDBC and ODBC interfaces by using the - <codeph>SET</codeph> statement. Formerly, <codeph>SET</codeph> was only available as a command within the - <cmdname>impala-shell</cmdname> interpreter. + <p rev="2.0.0"> + In Impala 2.0 and later, you can set query options directly through the JDBC and ODBC interfaces by using the + <codeph>SET</codeph> statement. Formerly, <codeph>SET</codeph> was only available as a command within the + <cmdname>impala-shell</cmdname> interpreter. + </p> </note> <!-- This is the list including defaults from the pre-release 1.2 impala-shell: - ABORT_ON_DEFAULT_LIMIT_EXCEEDED: 0 - ABORT_ON_ERROR: 0 - ALLOW_UNSUPPORTED_FORMATS: 0 - BATCH_SIZE: 0 - DEBUG_ACTION: - DEFAULT_ORDER_BY_LIMIT: -1 - DISABLE_CODEGEN: 0 - HBASE_CACHE_BLOCKS: 0 - HBASE_CACHING: 0 - MAX_ERRORS: 0 - MAX_IO_BUFFERS: 0 - MAX_SCAN_RANGE_LENGTH: 0 - MEM_LIMIT: 0 - NUM_NODES: 0 - NUM_SCANNER_THREADS: 0 - PARQUET_COMPRESSION_CODEC: SNAPPY - PARQUET_FILE_SIZE: 0 - SUPPORT_START_OVER: false + ABORT_ON_DEFAULT_LIMIT_EXCEEDED: 0 + ABORT_ON_ERROR: 0 + ALLOW_UNSUPPORTED_FORMATS: 0 + BATCH_SIZE: 0 + DEBUG_ACTION: + DEFAULT_ORDER_BY_LIMIT: -1 + DISABLE_CODEGEN: 0 + HBASE_CACHE_BLOCKS: 0 + HBASE_CACHING: 0 + MAX_ERRORS: 0 + MAX_IO_BUFFERS: 0 + MAX_SCAN_RANGE_LENGTH: 0 + MEM_LIMIT: 0 + NUM_NODES: 0 + NUM_SCANNER_THREADS: 0 + PARQUET_COMPRESSION_CODEC: SNAPPY + PARQUET_FILE_SIZE: 0 + SUPPORT_START_OVER: false --> <p outputclass="toc"/>
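The note above describes issuing SET through JDBC and ODBC as well as in impala-shell. As a point of reference only (not part of the changed topic), a minimal hedged sketch of the syntax, using illustrative option names and values, looks like this in an impala-shell session; in Impala 2.0 and later the same statement text can be submitted as an ordinary statement over a JDBC or ODBC connection:

<codeblock>[localhost:21000] > set;                     -- list all query options and their current values
[localhost:21000] > set mem_limit=2g;        -- 2g is an illustrative value; M and G suffixes are accepted
[localhost:21000] > set query_timeout_s=60;  -- cancel queries that sit idle for more than 60 seconds
</codeblock>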
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3c2c8f12/docs/topics/impala_query_timeout_s.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_query_timeout_s.xml b/docs/topics/impala_query_timeout_s.xml index 41f2918..2afa14f 100644 --- a/docs/topics/impala_query_timeout_s.xml +++ b/docs/topics/impala_query_timeout_s.xml @@ -2,18 +2,21 @@ <!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> <concept rev="2.0.0" id="query_timeout_s"> - <title>QUERY_TIMEOUT_S Query Option</title> + <title>QUERY_TIMEOUT_S Query Option (CDH 5.2 or higher only)</title> + <titlealts audience="PDF"><navtitle>QUERY_TIMEOUT_S</navtitle></titlealts> <prolog> <metadata> <data name="Category" value="Impala"/> <data name="Category" value="Impala Query Options"/> <data name="Category" value="Querying"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> </metadata> </prolog> <conbody> - <p> + <p rev="2.0.0"> <indexterm audience="Cloudera">QUERY_TIMEOUT_S query option</indexterm> Sets the idle query timeout value for the session, in seconds. Queries that sit idle for longer than the timeout value are automatically cancelled. If the system administrator specified the @@ -30,7 +33,7 @@ <!-- Don't have a compelling example to show at this time because the 'idle' aspect only applies when the client is careless and leaves the query open. Can't easily demonstrate in impala-shell. - <p conref="/Content/impala_common_xi44078.xml#common/example_blurb"/> + <p conref="../shared/impala_common.xml#common/example_blurb"/> --> <p> @@ -42,6 +45,8 @@ <codeph>--idle_query_timeout</codeph> value) </p> + <p conref="../shared/impala_common.xml#common/added_in_20"/> + <p conref="../shared/impala_common.xml#common/related_info"/> <p> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3c2c8f12/docs/topics/impala_real.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_real.xml b/docs/topics/impala_real.xml index e6430e3..12ef5aa 100644 --- a/docs/topics/impala_real.xml +++ b/docs/topics/impala_real.xml @@ -3,7 +3,7 @@ <concept id="real"> <title>REAL Data Type</title> - <titlealts><navtitle>REAL</navtitle></titlealts> + <titlealts audience="PDF"><navtitle>REAL</navtitle></titlealts> <prolog> <metadata> <data name="Category" value="Impala"/> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3c2c8f12/docs/topics/impala_refresh.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_refresh.xml b/docs/topics/impala_refresh.xml index ee022d5..9c790f0 100644 --- a/docs/topics/impala_refresh.xml +++ b/docs/topics/impala_refresh.xml @@ -3,7 +3,7 @@ <concept id="refresh"> <title>REFRESH Statement</title> - <titlealts><navtitle>REFRESH</navtitle></titlealts> + <titlealts audience="PDF"><navtitle>REFRESH</navtitle></titlealts> <prolog> <metadata> <data name="Category" value="Impala"/> @@ -14,6 +14,8 @@ <data name="Category" value="Metastore"/> <data name="Category" value="ETL"/> <data name="Category" value="Ingest"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> </metadata> </prolog> @@ -30,7 +32,7 @@ <p conref="../shared/impala_common.xml#common/syntax_blurb"/> -<codeblock>REFRESH [<varname>db_name</varname>.]<varname>table_name</varname></codeblock> +<codeblock rev="IMPALA-1683 CDH-43732">REFRESH 
[<varname>db_name</varname>.]<varname>table_name</varname> [PARTITION (<varname>key_col1</varname>=<varname>val1</varname> [, <varname>key_col2</varname>=<varname>val2</varname>...])]</codeblock> <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/> @@ -52,6 +54,15 @@ </li> </ul> + <note rev="2.3.0"> + <p rev="2.3.0"> + In CDH 5.5 / Impala 2.3 and higher, the syntax <codeph>ALTER TABLE <varname>table_name</varname> RECOVER PARTITIONS</codeph> + is a faster alternative to <codeph>REFRESH</codeph> when the only change to the table data is the addition of + new partition directories through Hive or manual HDFS operations. + See <xref href="impala_alter_table.xml#alter_table"/> for details. + </p> + </note> + <p> You only need to issue the <codeph>REFRESH</codeph> statement on the node to which you connect to issue queries. The coordinator node divides the work among all the Impala nodes in a cluster, and sends read @@ -73,31 +84,23 @@ <note> <p rev="1.2"> - In Impala 1.2 and higher, the catalog service broadcasts any changed metadata as a result of Impala + The catalog service broadcasts any changed metadata as a result of Impala <codeph>ALTER TABLE</codeph>, <codeph>INSERT</codeph> and <codeph>LOAD DATA</codeph> statements to all Impala nodes. Thus, the <codeph>REFRESH</codeph> statement is only required if you load data through Hive or by manipulating data files in HDFS directly. See <xref href="impala_components.xml#intro_catalogd"/> for more information on the catalog service. </p> <p rev="1.2.1"> - In Impala 1.2.1 and higher, another way to avoid inconsistency across nodes is to enable the + Another way to avoid inconsistency across nodes is to enable the <codeph>SYNC_DDL</codeph> query option before performing a DDL statement or an <codeph>INSERT</codeph> or <codeph>LOAD DATA</codeph>. </p> - <p> - The functionality of the <codeph>REFRESH</codeph> statement has changed in Impala 1.1 and higher. Now the - table name is a required parameter. To flush the metadata for all tables, use the + <p rev="1.1"> + The table name is a required parameter. To flush the metadata for all tables, use the <codeph><xref href="impala_invalidate_metadata.xml#invalidate_metadata">INVALIDATE METADATA</xref></codeph> command. </p> - <draft-comment translate="no"> Almost-identical wording here, under INVALIDATE METADATA, and in Release Notes :: New Features. Makes sense to conref. </draft-comment> - <p> - Because <codeph>REFRESH <varname>table_name</varname></codeph> only works for tables that Impala is already - aware of, when you create a new table in the Hive shell, you must enter <codeph>INVALIDATE - METADATA</codeph> with no table parameter before you can see the new table in - <cmdname>impala-shell</cmdname>. Once the table is known to Impala, you can issue <codeph>REFRESH - <varname>table_name</varname></codeph> as needed after you add more data files for that table. - </p> + <p conref="../shared/impala_common.xml#common/invalidate_then_refresh"/> </note> <p conref="../shared/impala_common.xml#common/refresh_vs_invalidate"/> @@ -116,7 +119,7 @@ </li> <li> - <b>and</b> the change is made to a database to which clients such as the Impala shell or ODBC directly + <b>and</b> the change is made to a metastore database to which clients such as the Impala shell or ODBC directly connect. </li> </ul> @@ -139,7 +142,7 @@ <li> Impalad - through <codeph>CREATE TABLE</codeph>, <codeph>ALTER TABLE</codeph>, and <codeph>INSERT</codeph> - operations. 
<ph rev="1.2">In Impala 1.2 and higher, such changes are propagated to all Impala nodes by the + operations. <ph rev="1.2">Such changes are propagated to all Impala nodes by the Impala catalog service.</ph> </li> </ul> @@ -150,9 +153,101 @@ delay later, for example if the next reference to the table is during a benchmark test. </p> - <p conref="../shared/impala_common.xml#common/sync_ddl_blurb"/> + <p rev="IMPALA-1683 CDH-43732"> + <b>Refreshing a single partition:</b> + </p> - <p conref="../shared/impala_common.xml#common/example_blurb"/> + <p rev="IMPALA-1683 CDH-43732"> + In CDH 5.9 / Impala 2.7 and higher, the <codeph>REFRESH</codeph> statement can apply to a single partition at a time, + rather than the whole table. Include the optional <codeph>PARTITION (<varname>partition_spec</varname>)</codeph> + clause and specify values for each of the partition key columns. + </p> + + <p rev="IMPALA-1683 CDH-43732"> + The following examples show how to make Impala aware of data added to a single partition, after data is loaded into + a partition's data directory using some mechanism outside Impala, such as Hive or Spark. The partition can be one that + Impala created and is already aware of, or a new partition created through Hive. + </p> + +<codeblock rev="IMPALA-1683 CDH-43732"><![CDATA[ +impala> create table p (x int) partitioned by (y int); +impala> insert into p (x,y) values (1,2), (2,2), (2,1); +impala> show partitions p; ++-------+-------+--------+------+... +| y | #Rows | #Files | Size |... ++-------+-------+--------+------+... +| 1 | -1 | 1 | 2B |... +| 2 | -1 | 1 | 4B |... +| Total | -1 | 2 | 6B |... ++-------+-------+--------+------+... + +-- ... Data is inserted into one of the partitions by some external mechanism ... +beeline> insert into p partition (y = 1) values(1000); + +impala> refresh p partition (y=1); +impala> select x from p where y=1; ++------+ +| x | ++------+ +| 2 | <- Original data created by Impala +| 1000 | <- Additional data inserted through Beeline ++------+ +]]> +</codeblock> + + <p rev="IMPALA-1683 CDH-43732"> + The same applies for tables with more than one partition key column. + The <codeph>PARTITION</codeph> clause of the <codeph>REFRESH</codeph> + statement must include all the partition key columns. + </p> + +<codeblock rev="IMPALA-1683 CDH-43732"><![CDATA[ +impala> create table p2 (x int) partitioned by (y int, z int); +impala> insert into p2 (x,y,z) values (0,0,0), (1,2,3), (2,2,3); +impala> show partitions p2; ++-------+---+-------+--------+------+... +| y | z | #Rows | #Files | Size |... ++-------+---+-------+--------+------+... +| 0 | 0 | -1 | 1 | 2B |... +| 2 | 3 | -1 | 1 | 4B |... +| Total | | -1 | 2 | 6B |... ++-------+---+-------+--------+------+... + +-- ... Data is inserted into one of the partitions by some external mechanism ... +beeline> insert into p2 partition (y = 2, z = 3) values(1000); + +impala> refresh p2 partition (y=2, z=3); +impala> select x from p where y=2 and z = 3; ++------+ +| x | ++------+ +| 1 | <- Original data created by Impala +| 2 | <- Original data created by Impala +| 1000 | <- Additional data inserted through Beeline ++------+ +]]> +</codeblock> + + <p rev="IMPALA-1683 CDH-43732"> + The following examples show how specifying a nonexistent partition does not cause any error, + and the order of the partition key columns does not have to match the column order in the table. + The partition spec must include all the partition key columns; specifying an incomplete set of + columns does cause an error. 
+ </p> + +<codeblock rev="IMPALA-1683 CDH-43732"><![CDATA[ +-- Partition doesn't exist. +refresh p2 partition (y=0, z=3); +refresh p2 partition (y=0, z=-1) +-- Key columns specified in a different order than the table definition. +refresh p2 partition (z=1, y=0) +-- Incomplete partition spec causes an error. +refresh p2 partition (y=0) +ERROR: AnalysisException: Items in partition spec must exactly match the partition columns in the table definition: default.p2 (1 vs 2) +]]> +</codeblock> + + <p conref="../shared/impala_common.xml#common/sync_ddl_blurb"/> <p conref="../shared/impala_common.xml#common/example_blurb"/> @@ -174,24 +269,15 @@ </p> <p> - <b>Related impalad options:</b> - </p> - - <p> - In Impala 1.0, the <codeph>-r</codeph> option of <cmdname>impala-shell</cmdname> issued - <codeph>REFRESH</codeph> to reload metadata for all tables. - </p> - - <p> - In Impala 1.1 and higher, this option issues <codeph>INVALIDATE METADATA</codeph> because - <codeph>REFRESH</codeph> now requires a table name parameter. Due to the expense of reloading the metadata - for all tables, the <cmdname>impala-shell</cmdname> <codeph>-r</codeph> option is not recommended for - day-to-day use in a production environment. + <b>Related impala-shell options:</b> </p> - <p rev="1.2"> - In Impala 1.2 and higher, the <codeph>-r</codeph> option is needed even less frequently, because metadata - changes caused by SQL statements in Impala are automatically broadcast to all Impala nodes. + <p rev="1.1"> + The <cmdname>impala-shell</cmdname> option <codeph>-r</codeph> issues an <codeph>INVALIDATE METADATA</codeph> statement + when starting up the shell, effectively performing a <codeph>REFRESH</codeph> of all tables. + Due to the expense of reloading the metadata for all tables, the <cmdname>impala-shell</cmdname> <codeph>-r</codeph> + option is not recommended for day-to-day use in a production environment. (This option was mainly intended as a workaround + for synchronization issues in very old Impala versions.) </p> <p conref="../shared/impala_common.xml#common/permissions_blurb"/> @@ -207,6 +293,10 @@ but subsequent statements such as <codeph>SELECT</codeph> or <codeph>SHOW TABLE STATS</codeph> could fail. </p> + <p rev="IMPALA-1683 CDH-43732"> + All HDFS and Sentry permissions and privileges are the same whether you refresh the entire table + or a single partition. + </p> <p conref="../shared/impala_common.xml#common/hdfs_blurb"/> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3c2c8f12/docs/topics/impala_request_pool.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_request_pool.xml b/docs/topics/impala_request_pool.xml index cf2a811..a820edd 100644 --- a/docs/topics/impala_request_pool.xml +++ b/docs/topics/impala_request_pool.xml @@ -3,6 +3,7 @@ <concept rev="1.3.0" id="request_pool"> <title>REQUEST_POOL Query Option</title> + <titlealts audience="PDF"><navtitle>REQUEST_POOL</navtitle></titlealts> <prolog> <metadata> <data name="Category" value="Impala"/> @@ -10,7 +11,8 @@ <data name="Category" value="Impala Query Options"/> <data name="Category" value="Admission Control"/> <data name="Category" value="YARN"/> - <data name="Category" value="Llama"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> </metadata> </prolog> @@ -18,16 +20,8 @@ <p> <indexterm audience="Cloudera">REQUEST_POOL query option</indexterm> - The pool or queue name that queries should be submitted to. 
Only applies when you enable the Impala admission - control feature (CDH 4 or CDH 5; see <xref href="impala_admission.xml#admission_control"/>), or the YARN - resource management feature (CDH 5 only; see - <xref href="impala_resource_management.xml#resource_management"/>). Specifies the name of the pool used by - requests from Impala to the resource manager. - </p> - - <p> - Formerly known as <codeph>YARN_POOL</codeph> during the CDH 5 beta period. Renamed to reflect that it can be - used both with YARN and with the lightweight admission control feature introduced in Impala 1.3. + The pool or queue name that queries should be submitted to. Only applies when you enable the Impala admission control feature. + Specifies the name of the pool used by requests from Impala to the resource manager. </p> <p> @@ -39,7 +33,11 @@ in the Impala configuration file) </p> -<!-- Worth adding a couple of related info links here. --> + <p conref="../shared/impala_common.xml#common/related_info"/> + <p> + <xref href="impala_admission.xml"/> + </p> + </conbody> </concept> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3c2c8f12/docs/topics/impala_reservation_request_timeout.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_reservation_request_timeout.xml b/docs/topics/impala_reservation_request_timeout.xml index 0316e44..0e01f83 100644 --- a/docs/topics/impala_reservation_request_timeout.xml +++ b/docs/topics/impala_reservation_request_timeout.xml @@ -3,6 +3,7 @@ <concept rev="1.2" id="reservation_request_timeout"> <title>RESERVATION_REQUEST_TIMEOUT Query Option (CDH 5 only)</title> + <titlealts audience="PDF"><navtitle>RESERVATION_REQUEST_TIMEOUT</navtitle></titlealts> <prolog> <metadata> <data name="Category" value="Impala"/> @@ -10,11 +11,15 @@ <data name="Category" value="Resource Management"/> <data name="Category" value="YARN"/> <data name="Category" value="Llama"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> </metadata> </prolog> <conbody> + <note conref="../shared/impala_common.xml#common/llama_query_options_obsolete"/> + <p> <indexterm audience="Cloudera">RESERVATION_REQUEST_TIMEOUT query option</indexterm> Maximum number of milliseconds Impala will wait for a reservation to be completely granted or denied. Used in @@ -29,7 +34,5 @@ <b>Default:</b> 300000 (5 minutes) </p> -<!-- Worth adding a couple of related info links here. --> - </conbody> </concept> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3c2c8f12/docs/topics/impala_resource_management.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_resource_management.xml b/docs/topics/impala_resource_management.xml index 3fdbb15..baadee1 100644 --- a/docs/topics/impala_resource_management.xml +++ b/docs/topics/impala_resource_management.xml @@ -3,7 +3,16 @@ <concept rev="1.2" id="resource_management"> <title>Resource Management for Impala</title> - + <prolog> + <metadata> + <data name="Category" value="Impala"/> + <data name="Category" value="YARN"/> + <data name="Category" value="Resource Management"/> + <data name="Category" value="Administrators"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> + </metadata> + </prolog> <conbody> @@ -14,7 +23,316 @@ that run jobs from many Hadoop components. 
</p> - + <p outputclass="toc inpage"/> + </conbody> + + <concept audience="Cloudera" id="llama"> + <!-- Hiding the whole concept now that Llama is desupported. --> + + <title>The Llama Daemon</title> + <prolog> + <metadata> + <data name="Category" value="Llama"/> + </metadata> + </prolog> + + <conbody> + +<!-- Copied from http://www.cloudera.com/content/cloudera-content/cloudera-docs/CDH5/latest/CDH5-Installation-Guide/cdh_ig_install_llama.html - turn into a conref in both places. --> + + <p> + Llama is a system that mediates resource management between Impala and Hadoop YARN. Llama enables + Impala to reserve, use, and release resource allocations in a Hadoop cluster. Llama is only required if + resource management is enabled in Impala. + </p> + + <p id="p_b2y_msl_jp"> + By default, YARN allocates resources bit-by-bit as needed by MapReduce jobs. Impala needs all resources + available at the same time, so that intermediate results can be exchanged between cluster nodes, and + queries do not stall partway through waiting for new resources to be allocated. Llama is the intermediary + process that ensures all requested resources are available before each Impala query actually begins. + </p> + +<!-- Original URL: http://www.cloudera.com/content/cloudera-content/cloudera-docs/CDH5/latest/CDH5-Installation-Guide/cdh_ig_llama_installation.html --> + +<!-- Highly likely to be removed per https://jira.cloudera.com/browse/CDH-21352, + so commenting out for now. Feels like there should be some 'installing llama via CM' topic + even if it says not to worry. + + <p> + For Llama installation instructions, see + <xref href="http://www.cloudera.com/documentation/enterprise/latest/topics/cdh_ig_llama_installation.html" scope="external" format="html">Llama + Installation</xref>. + </p> +--> + +<!-- Original URL: http://www.cloudera.com/content/cloudera-content/cloudera-docs/CM5/latest/Cloudera-Manager-Managing-Clusters/cm_mc_impala_service.html --> + + <p> + For management through Cloudera Manager, see + <xref href="http://www.cloudera.com/documentation/enterprise/latest/topics/admin_llama.html" scope="external" format="html">The + Impala Llama ApplicationMaster</xref>. + </p> + </conbody> + </concept> + +<!-- Hiding more content per MJ feedback. + <concept id="rm_resource_estimates"> + + <title>Controlling Resource Estimation Behavior</title> + + <conbody> + + <p> + By default, Impala consults the table statistics and column statistics for each table in a query, and uses + those figures to construct estimates of needed resources for each query. See + <xref href="impala_compute_stats.xml#compute_stats"/> for the statement to collect table and column + statistics for a table. + </p> + + <p> + In CDH 5.7 / Impala 2.5 and higher, the preferred way to avoid overcommitting memory in a high-concurrency + or multitenant scenario is to use Impala admission control together with dynamic resource pools. + You can specify a <uicontrol>Default Query Memory Limit</uicontrol> setting, with a different value for each + pool, and Impala uses that value to calculate how many queries can safely run within a specified + cluster-wide aggregate memory size. + See <xref href="impala_admission.xml#admission_control"/> for details. + </p> + + <p> + To avoid problems with inaccurate or missing statistics, which can lead to inaccurate estimates of resource + consumption, Impala allows you to set default estimates for CPU and memory resource consumption. + When the query is complete, those resources are returned to YARN as + normal. 
To enable this feature, use the command-line option <codeph>-rm_always_use_defaults</codeph> when + starting <cmdname>impalad</cmdname>, and optionally + <codeph>-rm_default_memory=<varname>size</varname></codeph> and <codeph>-rm_default_cpu_cores</codeph>. + See <xref href="impala_resource_management.xml#rm_options"/> for details about each option. + </p> + </conbody> + </concept> + + <concept id="rm_checking"> + + <title>Checking Resource Estimates and Actual Usage</title> + + <conbody> + + <p> + To make resource usage easier to verify, the output of the <codeph>EXPLAIN</codeph> SQL statement now + includes information about estimated memory usage, whether table and column statistics are available for + each table, and the number of virtual cores that a query will use. You can get this information through the + <codeph>EXPLAIN</codeph> statement without actually running the query. The extra information requires + setting the query option <codeph>EXPLAIN_LEVEL=verbose</codeph>; see + <xref href="impala_explain.xml#explain"/> for details. The same extended information is shown at the start + of the output from the <codeph>PROFILE</codeph> statement in <cmdname>impala-shell</cmdname>. The detailed + profile information is only available after running the query. You can take appropriate actions (gathering + statistics, adjusting query options) if you find that queries fail or run with suboptimal performance when + resource management is enabled. + </p> + </conbody> + </concept> +--> + + <concept id="rm_enforcement"> + + <title>How Resource Limits Are Enforced</title> + <prolog> + <metadata> + <data name="Category" value="Concepts"/> + </metadata> + </prolog> + + <conbody> + + <ul> + <li> + If Cloudera Manager Static Partitioning is used, it creates a cgroup in which Impala runs. + This cgroup limits CPU, network, and IO according to the static partitioning policy. + </li> + + <li> + Limits on memory usage are enforced by Impala's process memory limit (the <codeph>MEM_LIMIT</codeph> + query option setting). The admission control feature checks this setting to decide how many queries + can be safely run at the same time. Then the Impala daemon enforces the limit by activating the + spill-to-disk mechanism when necessary, or cancelling a query altogether if the limit is exceeded at runtime. + </li> + </ul> + </conbody> + </concept> + +<!-- + <concept id="rm_enable"> + + <title>Enabling Resource Management for Impala</title> + <prolog> + <metadata> + <data name="Category" value="Configuring"/> + <data name="Category" value="Starting and Stopping"/> + </metadata> + </prolog> + + <conbody> + + <p> + To enable resource management for Impala, first you <xref href="#rm_cdh_prereqs">set up the YARN + service for your CDH cluster</xref>. Then you <xref href="#rm_options">add startup options and customize + resource management settings</xref> for the Impala services. + </p> </conbody> + + <concept id="rm_cdh_prereqs"> + + <title>Required CDH Setup for Resource Management with Impala</title> + + <conbody> + + <p> + YARN is the general-purpose service that manages resources for many Hadoop components within a CDH + cluster. + </p> + + <p> + For information about setting up the YARN service, see the instructions for + <xref href="http://www.cloudera.com/documentation/enterprise/latest/topics/cm_mc_yarn_service.html" scope="external" format="html">Cloudera + Manager</xref>. 
+ </p> + </conbody> + </concept> + + <concept id="rm_options"> + + <title>impalad Startup Options for Resource Management</title> + + <conbody> + + <p id="resource_management_impalad_options"> + The following startup options for <cmdname>impalad</cmdname> enable resource management and customize its + parameters for your cluster configuration: + <ul> + <li> + <codeph>-enable_rm</codeph>: Whether to enable resource management or not, either + <codeph>true</codeph> or <codeph>false</codeph>. The default is <codeph>false</codeph>. None of the + other resource management options have any effect unless <codeph>-enable_rm</codeph> is turned on. + </li> + + <li> + <codeph>-cgroup_hierarchy_path</codeph>: Path where YARN will create cgroups for granted + resources. Impala assumes that the cgroup for an allocated container is created in the path + '<varname>cgroup_hierarchy_path</varname> + <varname>container_id</varname>'. + </li> + + <li rev="1.4.0"> + <codeph>-rm_always_use_defaults</codeph>: If this Boolean option is enabled, Impala ignores computed + estimates and always obtains the default memory and CPU allocation settings at the start of the + query. These default estimates are approximately 2 CPUs and 4 GB of memory, possibly varying slightly + depending on cluster size, workload, and so on. Cloudera recommends enabling + <codeph>-rm_always_use_defaults</codeph> whenever resource management is used, and relying on these + default values (that is, leaving out the two following options). + </li> + + <li rev="1.4.0"> + <codeph>-rm_default_memory=<varname>size</varname></codeph>: Optionally sets the default estimate for + memory usage for each query. You can use suffixes such as M and G for megabytes and gigabytes, the + same as with the <xref href="impala_mem_limit.xml#mem_limit">MEM_LIMIT</xref> query option. Only has + an effect when <codeph>-rm_always_use_defaults</codeph> is also enabled. + </li> + + <li rev="1.4.0"> + <codeph>-rm_default_cpu_cores</codeph>: Optionally sets the default estimate for number of virtual + CPU cores for each query. Only has an effect when <codeph>-rm_always_use_defaults</codeph> is also + enabled. + </li> + </ul> + </p> + + </conbody> + </concept> +--> + + <concept id="rm_query_options"> + + <title>impala-shell Query Options for Resource Management</title> + <prolog> + <metadata> + <data name="Category" value="Impala Query Options"/> + </metadata> + </prolog> + + <conbody> + + <p> + Before issuing SQL statements through the <cmdname>impala-shell</cmdname> interpreter, you can use the + <codeph>SET</codeph> command to configure the following parameters related to resource management: + </p> + + <ul id="ul_nzt_twf_jp"> + <li> + <xref href="impala_explain_level.xml#explain_level"/> + </li> + + <li> + <xref href="impala_mem_limit.xml#mem_limit"/> + </li> + +<!-- Not supported in CDH 5.5 / Impala 2.3 and up. + <li> + <xref href="impala_reservation_request_timeout.xml#reservation_request_timeout"/> + </li> + + <li> + <xref href="impala_v_cpu_cores.xml#v_cpu_cores"/> + </li> +--> + </ul> + </conbody> + </concept> + +<!-- Parent topic is going away, so former subtopic is hoisted up a level. </concept> +--> + + <concept id="rm_limitations"> + <title>Limitations of Resource Management for Impala</title> + + <conbody> + +<!-- Conditionalizing some content here with audience="Cloudera" because there are already some XML comments + inside the list, so not practical to enclose the whole thing in XML comments. 
--> + + <p audience="Cloudera"> + Currently, Impala in CDH 5 has the following limitations for resource management of Impala queries: + </p> + + <ul audience="Cloudera"> + <li> + Table statistics are required, and column statistics are highly valuable, for Impala to produce accurate + estimates of how much memory to request from YARN. See + <xref href="impala_perf_stats.xml#perf_table_stats"/> and + <xref href="impala_perf_stats.xml#perf_column_stats"/> for instructions on gathering both kinds of + statistics, and <xref href="impala_explain.xml#explain"/> for the extended <codeph>EXPLAIN</codeph> + output where you can check that statistics are available for a specific table and set of columns. + </li> + + <li> + If the Impala estimate of required memory is lower than is actually required for a query, Impala + dynamically expands the amount of requested memory. +<!-- Impala will cancel the query when it exceeds the requested memory size. --> + Queries might still be cancelled if the reservation expansion fails, for example if there are + insufficient remaining resources for that pool, or the expansion request takes long enough that it + exceeds the query timeout interval, or because of YARN preemption. +<!-- This could happen in some cases with complex queries, even when table and column statistics are available. --> + You can see the actual memory usage after a failed query by issuing a <codeph>PROFILE</codeph> command in + <cmdname>impala-shell</cmdname>. Specify a larger memory figure with the <codeph>MEM_LIMIT</codeph> + query option and re-try the query. + </li> + </ul> + + <p rev="2.0.0"> + The <codeph>MEM_LIMIT</codeph> query option, and the other resource-related query options, are settable + through the ODBC or JDBC interfaces in Impala 2.0 and higher. This is a former limitation that is now + lifted. + </p> + </conbody> + </concept> +</concept> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3c2c8f12/docs/topics/impala_revoke.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_revoke.xml b/docs/topics/impala_revoke.xml index 312d2b0..2a590c1 100644 --- a/docs/topics/impala_revoke.xml +++ b/docs/topics/impala_revoke.xml @@ -3,21 +3,25 @@ <concept rev="2.0.0" id="revoke"> <title>REVOKE Statement (CDH 5.2 or higher only)</title> - <titlealts><navtitle>REVOKE (CDH 5.2 or higher only)</navtitle></titlealts> + <titlealts audience="PDF"><navtitle>REVOKE</navtitle></titlealts> <prolog> <metadata> <data name="Category" value="Impala"/> <data name="Category" value="DDL"/> <data name="Category" value="SQL"/> + <data name="Category" value="Security"/> <data name="Category" value="Sentry"/> <data name="Category" value="Roles"/> + <data name="Category" value="Administrators"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> <!-- Consider whether to go deeper into categories like Security for the Sentry-related statements. --> </metadata> </prolog> <conbody> - <p> + <p rev="2.0.0"> <indexterm audience="Cloudera">REVOKE statement</indexterm> <!-- Copied from Sentry docs. Turn into conref. I did some rewording for clarity. --> The <codeph>REVOKE</codeph> statement revokes roles or privileges on a specified object from groups. Only @@ -43,8 +47,8 @@ object_type ::= TABLE | DATABASE | SERVER | URI <p rev="2.3.0 collevelauth"> The ability to grant or revoke <codeph>SELECT</codeph> privilege on specific columns is available - in CDH 5.5 / Impala 2.3 and higher. 
<!--See <xref href="sg_hive_sql.xml#concept_c2q_4qx_p4/col_level_auth_sentry"/> - for details.--> + in CDH 5.5 / Impala 2.3 and higher. See <xref href="sg_hive_sql.xml#concept_c2q_4qx_p4/col_level_auth_sentry"/> + for details. </p> <p conref="../shared/impala_common.xml#common/privileges_blurb"/> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3c2c8f12/docs/topics/impala_s3.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_s3.xml b/docs/topics/impala_s3.xml index 6cb7834..04788eb 100644 --- a/docs/topics/impala_s3.xml +++ b/docs/topics/impala_s3.xml @@ -4,24 +4,793 @@ <title>Using Impala with the Amazon S3 Filesystem</title> <titlealts audience="PDF"><navtitle>S3 Tables</navtitle></titlealts> - + <prolog> + <metadata> + <data name="Category" value="Impala"/> + <data name="Category" value="Amazon"/> + <data name="Category" value="S3"/> + <data name="Category" value="Data Analysts"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Querying"/> + <data name="Category" value="Preview Features"/> + </metadata> + </prolog> <conbody> - + <note conref="../shared/impala_common.xml#common/s3_production"/> <p rev="2.2.0"> - + <indexterm audience="Cloudera">S3 with Impala</indexterm> + + <indexterm audience="Cloudera">Amazon S3 with Impala</indexterm> You can use Impala to query data residing on the Amazon S3 filesystem. This capability allows convenient access to a storage system that is remotely managed, accessible from anywhere, and integrated with various cloud-based services. Impala can query files in any supported file format from S3. The S3 storage location can be for an entire table, or individual partitions in a partitioned table. </p> - + <p> + The default Impala tables use data files stored on HDFS, which are ideal for bulk loads and queries using + full-table scans. In contrast, queries against S3 data are less performant, making S3 suitable for holding + <q>cold</q> data that is only queried occasionally, while more frequently accessed <q>hot</q> data resides in + HDFS. In a partitioned table, you can set the <codeph>LOCATION</codeph> attribute for individual partitions + to put some partitions on HDFS and others on S3, typically depending on the age of the data. + </p> + + <p outputclass="toc inpage"/> + + </conbody> + + <concept id="s3_sql"> + <title>How Impala SQL Statements Work with S3</title> + <conbody> + <p> + Impala SQL statements work with data on S3 as follows: + </p> + <ul> + <li> + <p> + The <xref href="impala_create_table.xml#create_table"/> + or <xref href="impala_alter_table.xml#alter_table"/> statements + can specify that a table resides on the S3 filesystem by + encoding an <codeph>s3a://</codeph> prefix for the <codeph>LOCATION</codeph> + property. <codeph>ALTER TABLE</codeph> can also set the <codeph>LOCATION</codeph> + property for an individual partition, so that some data in a table resides on + S3 and other data in the same table resides on HDFS. + </p> + </li> + <li> + <p> + Once a table or partition is designated as residing on S3, the <xref href="impala_select.xml#select"/> + statement transparently accesses the data files from the appropriate storage layer. + </p> + </li> + <li> + <p> + If the S3 table is an internal table, the <xref href="impala_drop_table.xml#drop_table"/> statement + removes the corresponding data files from S3 when the table is dropped. 
+ </p> + </li> + <li> + <p> + The <xref href="impala_truncate_table.xml#truncate_table"/> statement always removes the corresponding + data files from S3 when the table is truncated. + </p> + </li> + <li> + <p> + The <xref href="impala_load_data.xml#load_data"/> can move data files residing in HDFS into + an S3 table. + </p> + </li> + <li> + <p> + The <xref href="impala_insert.xml#insert"/> statement, or the <codeph>CREATE TABLE AS SELECT</codeph> + form of the <codeph>CREATE TABLE</codeph> statement, can copy data from an HDFS table or another S3 + table into an S3 table. The <xref href="impala_s3_skip_insert_staging.xml#s3_skip_insert_staging"/> + query option chooses whether or not to use a fast code path for these write operations to S3, + with the tradeoff of potential inconsistency in the case of a failure during the statement. + </p> + </li> + </ul> + <p> + For usage information about Impala SQL statements with S3 tables, see <xref href="impala_s3.xml#s3_ddl"/> + and <xref href="impala_s3.xml#s3_dml"/>. + </p> + </conbody> + </concept> + + <concept id="s3_creds"> + + <title>Specifying Impala Credentials to Access Data in S3</title> + + <conbody> + + <p> + <indexterm audience="Cloudera">fs.s3a.access.key configuration setting</indexterm> + <indexterm audience="Cloudera">fs.s3a.secret.key configuration setting</indexterm> + <indexterm audience="Cloudera">access.key configuration setting</indexterm> + <indexterm audience="Cloudera">secret.key configuration setting</indexterm> + To allow Impala to access data in S3, specify values for the following configuration settings in your + <filepath>core-site.xml</filepath> file: + </p> + +<!-- Normally I would turn this example into CDATA notation to avoid all the < and > entities. + However, then I couldn't use the <varname> tag inside the same example. --> +<codeblock> +<property> +<name>fs.s3a.access.key</name> +<value><varname>your_access_key</varname></value> +</property> +<property> +<name>fs.s3a.secret.key</name> +<value><varname>your_secret_key</varname></value> +</property> +</codeblock> + + <p> + As of CDH 5.8, these settings do not have corresponding controls in the Cloudera Manager user interface. + Specify them in the <uicontrol>HDFS Client Advanced Configuration Snippet (Safety Valve) for + core-site.xml</uicontrol> field. After specifying the credentials, restart both the Impala and Hive + services. (Restarting Hive is required because Impala queries, <codeph>CREATE TABLE</codeph> statements, + and so on go through the Hive metastore.) + </p> + +<!-- + <p rev="CDH-39914 IMPALA-3306"> + In CDH 5.8 / Impala 2.6 and higher, you can specify the S3 access key and secret key through + configuration settings for the <cmdname>impalad</cmdname> daemon. + Rather than specifying the keys themselves on the command line or in startup scripts, + you specify the commands to retrieve the keys as <cmdname>impalad</cmdname> + startup options. For clusters not managed by Cloudera Manager, use the + <codeph>- + -s3a_access_key_cmd</codeph> and <codeph>- + -s3a_secret_key_cmd</codeph> + startup options for the <cmdname>impalad</cmdname> daemon. + For clusters managed by Cloudera Manager, set the + <codeph>s3a_access_key_cmd</codeph> and <codeph>s3a_secret_key_cmd</codeph> + configuration settings and restart the Impala and Hive services. + (Restarting Hive is required because Impala queries, <codeph>CREATE TABLE</codeph> statements, + and so on go through the Hive metastore.) 
+ </p> +--> + + <note type="important"> +<!-- + <ul> + <li> + <p rev="CDH-39914 IMPALA-3306"> + The <codeph>s3a_access_key_cmd</codeph> and <codeph>s3a_secret_key_cmd</codeph> settings + for <cmdname>impalad</cmdname> only allow Impala to access S3. You must still include the credentials in the + client <filepath>hdfs-site.xml</filepath> configuration file to allow S3 access for the Hive metastore, + <codeph>hadoop fs</codeph> command, and so on. + </p> + </li> + <li> +--> + <p> + Although you can specify the access key ID and secret key as part of the <codeph>s3a://</codeph> URL in the + <codeph>LOCATION</codeph> attribute, doing so makes this sensitive information visible in many places, such + as <codeph>DESCRIBE FORMATTED</codeph> output and Impala log files. Therefore, specify this information + centrally in the <filepath>core-site.xml</filepath> file, and restrict read access to that file to only + trusted users. + </p> +<!-- + </li> +--> + <!-- Overriding with a new first list bullet following clarification by Sailesh. + <li> + <p rev="CDH-39914 IMPALA-3306"> + Prior to CDH 5.8 / Impala 2.6, an alternative way to specify the keys was by + including the fields <codeph>fs.s3a.access.key</codeph> and <codeph>fs.s3a.secret.key</codeph> + in a configuration file such as <filepath>core-site.xml</filepath> or <filepath>hdfs-site.xml</filepath>. + With the enhanced S3 key management in CDH 5.8 / Impala 2.6 and higher, if you are upgrading from + an earlier release where you used Impala with S3, remove the S3 keys from any copies of those files. + </p> + </li> + --> +<!-- + </ul> +--> + </note> + + </conbody> + + </concept> + + <concept id="s3_etl"> + + <title>Loading Data into S3 for Impala Queries</title> + <prolog> + <metadata> + <data name="Category" value="ETL"/> + <data name="Category" value="Ingest"/> + </metadata> + </prolog> + + <conbody> + + <p> + If your ETL pipeline involves moving data into S3 and then querying through Impala, + you can either use Impala DML statements to create, move, or copy the data, or + use the same data loading techniques as you would for non-Impala data. + </p> + + </conbody> + + <concept id="s3_dml" rev="2.6.0 CDH-39913 IMPALA-1878"> + <title>Using Impala DML Statements for S3 Data</title> + <conbody> + <p conref="../shared/impala_common.xml#common/s3_dml"/> + <p conref="../shared/impala_common.xml#common/s3_dml_performance"/> + </conbody> + </concept> + + <concept id="s3_manual_etl"> + <title>Manually Loading Data into Impala Tables on S3</title> + <conbody> + <p> + As an alternative, or on earlier Impala releases without DML support for S3, + you can use the Amazon-provided methods to bring data files into S3 for querying through Impala. See + <xref href="http://aws.amazon.com/s3/" scope="external" format="html">the Amazon S3 web site</xref> for + details. + </p> + + <note type="important"> + <p conref="../shared/impala_common.xml#common/s3_drop_table_purge"/> + </note> + + <p> + Alternative file creation techniques (less compatible with the <codeph>PURGE</codeph> clause) include: + </p> + + <ul> + <li> + The <xref href="https://console.aws.amazon.com/s3/home" scope="external" format="html">Amazon AWS / S3 + web interface</xref> to upload from a web browser. + </li> + + <li> + The <xref href="http://aws.amazon.com/cli/" scope="external" format="html">Amazon AWS CLI</xref> to + manipulate files from the command line. 
+ </li> + + <li> + Other S3-enabled software, such as + <xref href="http://s3tools.org/s3cmd" scope="external" format="html">the S3Tools client software</xref>. + </li> + </ul> + + <p> + After you upload data files to a location already mapped to an Impala table or partition, or if you delete + files in S3 from such a location, issue the <codeph>REFRESH <varname>table_name</varname></codeph> + statement to make Impala aware of the new set of data files. + </p> + + </conbody> + </concept> + + </concept> + + <concept id="s3_ddl"> + + <title>Creating Impala Databases, Tables, and Partitions for Data Stored on S3</title> + <prolog> + <metadata> + <data name="Category" value="Databases"/> + </metadata> + </prolog> + + <conbody> + + <p> + Impala reads data for a table or partition from S3 based on the <codeph>LOCATION</codeph> attribute for the + table or partition. Specify the S3 details in the <codeph>LOCATION</codeph> clause of a <codeph>CREATE + TABLE</codeph> or <codeph>ALTER TABLE</codeph> statement. The notation for the <codeph>LOCATION</codeph> + clause is <codeph>s3a://<varname>bucket_name</varname>/<varname>path/to/file</varname></codeph>. The + filesystem prefix is always <codeph>s3a://</codeph> because Impala does not support the <codeph>s3://</codeph> or + <codeph>s3n://</codeph> prefixes. + </p> + + <p> + For a partitioned table, either specify a separate <codeph>LOCATION</codeph> clause for each new partition, + or specify a base <codeph>LOCATION</codeph> for the table and set up a directory structure in S3 to mirror + the way Impala partitioned tables are structured in HDFS. Although, strictly speaking, S3 filenames do not + have directory paths, Impala treats S3 filenames with <codeph>/</codeph> characters the same as HDFS + pathnames that include directories. + </p> + + <p> + You point a nonpartitioned table or an individual partition at S3 by specifying a single directory + path in S3, which could be any arbitrary directory. To replicate the structure of an entire Impala + partitioned table or database in S3 requires more care, with directories and subdirectories nested and + named to match the equivalent directory tree in HDFS. Consider setting up an empty staging area if + necessary in HDFS, and recording the complete directory structure so that you can replicate it in S3. + <!-- + Or, specify an S3 location for an entire database, after which all tables and partitions created inside that + database automatically inherit the database <codeph>LOCATION</codeph> and create new S3 directories + underneath the database directory. + --> + </p> + + <p> + For convenience when working with multiple tables with data files stored in S3, you can create a database + with a <codeph>LOCATION</codeph> attribute pointing to an S3 path. + Specify a URL of the form <codeph>s3a://<varname>bucket</varname>/<varname>root/path/for/database</varname></codeph> + for the <codeph>LOCATION</codeph> attribute of the database. + Any tables created inside that database + automatically create directories underneath the one specified by the database + <codeph>LOCATION</codeph> attribute. + </p> + + <p> + For example, the following session creates a partitioned table where only a single partition resides on S3. + The partitions for years 2013 and 2014 are located on HDFS. The partition for year 2015 includes a + <codeph>LOCATION</codeph> attribute with an <codeph>s3a://</codeph> URL, and so refers to data residing on + S3, under a specific path underneath the bucket <codeph>impala-demo</codeph>. 
+ </p> + +<codeblock>[localhost:21000] > create database db_on_hdfs; +[localhost:21000] > use db_on_hdfs; +[localhost:21000] > create table mostly_on_hdfs (x int) partitioned by (year int); +[localhost:21000] > alter table mostly_on_hdfs add partition (year=2013); +[localhost:21000] > alter table mostly_on_hdfs add partition (year=2014); +[localhost:21000] > alter table mostly_on_hdfs add partition (year=2015) + > location 's3a://impala-demo/dir1/dir2/dir3/t1'; +</codeblock> + + <p> + The following session creates a database and two partitioned tables residing entirely on S3, one + partitioned by a single column and the other partitioned by multiple columns. Because a + <codeph>LOCATION</codeph> attribute with an <codeph>s3a://</codeph> URL is specified for the database, the + tables inside that database are automatically created on S3 underneath the database directory. To see the + names of the associated subdirectories, including the partition key values, we use an S3 client tool to + examine how the directory structure is organized on S3. For example, Impala partition directories such as + <codeph>month=1</codeph> do not include leading zeroes, which sometimes appear in partition directories created + through Hive. + </p> + +<codeblock>[localhost:21000] > create database db_on_s3 location 's3a://impala-demo/dir1/dir2/dir3'; +[localhost:21000] > use db_on_s3; + +[localhost:21000] > create table partitioned_on_s3 (x int) partitioned by (year int); +[localhost:21000] > alter table partitioned_on_s3 add partition (year=2013); +[localhost:21000] > alter table partitioned_on_s3 add partition (year=2014); +[localhost:21000] > alter table partitioned_on_s3 add partition (year=2015); + +[localhost:21000] > !aws s3 ls s3://impala-demo/dir1/dir2/dir3 --recursive; +2015-03-17 13:56:34 0 dir1/dir2/dir3/ +2015-03-17 16:43:28 0 dir1/dir2/dir3/partitioned_on_s3/ +2015-03-17 16:43:49 0 dir1/dir2/dir3/partitioned_on_s3/year=2013/ +2015-03-17 16:43:53 0 dir1/dir2/dir3/partitioned_on_s3/year=2014/ +2015-03-17 16:43:58 0 dir1/dir2/dir3/partitioned_on_s3/year=2015/ + +[localhost:21000] > create table partitioned_multiple_keys (x int) + > partitioned by (year smallint, month tinyint, day tinyint); +[localhost:21000] > alter table partitioned_multiple_keys + > add partition (year=2015,month=1,day=1); +[localhost:21000] > alter table partitioned_multiple_keys + > add partition (year=2015,month=1,day=31); +[localhost:21000] > alter table partitioned_multiple_keys + > add partition (year=2015,month=2,day=28); + +[localhost:21000] > !aws s3 ls s3://impala-demo/dir1/dir2/dir3 --recursive; +2015-03-17 13:56:34 0 dir1/dir2/dir3/ +2015-03-17 16:47:13 0 dir1/dir2/dir3/partitioned_multiple_keys/ +2015-03-17 16:47:44 0 dir1/dir2/dir3/partitioned_multiple_keys/year=2015/month=1/day=1/ +2015-03-17 16:47:50 0 dir1/dir2/dir3/partitioned_multiple_keys/year=2015/month=1/day=31/ +2015-03-17 16:47:57 0 dir1/dir2/dir3/partitioned_multiple_keys/year=2015/month=2/day=28/ +2015-03-17 16:43:28 0 dir1/dir2/dir3/partitioned_on_s3/ +2015-03-17 16:43:49 0 dir1/dir2/dir3/partitioned_on_s3/year=2013/ +2015-03-17 16:43:53 0 dir1/dir2/dir3/partitioned_on_s3/year=2014/ +2015-03-17 16:43:58 0 dir1/dir2/dir3/partitioned_on_s3/year=2015/ +</codeblock> + + <p> + The <codeph>CREATE DATABASE</codeph> and <codeph>CREATE TABLE</codeph> statements create the associated + directory paths if they do not already exist. 
You can specify multiple levels of directories, and the + <codeph>CREATE</codeph> statement creates all appropriate levels, similar to using <codeph>mkdir + -p</codeph>. + </p> + + <p> + Use the standard S3 file upload methods to actually put the data files into the right locations. You can + also put the directory paths and data files in place before creating the associated Impala databases or + tables, and Impala automatically uses the data from the appropriate location after the associated databases + and tables are created. + </p> + + <p> + You can switch whether an existing table or partition points to data in HDFS or S3. For example, if you + have an Impala table or partition pointing to data files in HDFS or S3, and you later transfer those data + files to the other filesystem, use an <codeph>ALTER TABLE</codeph> statement to adjust the + <codeph>LOCATION</codeph> attribute of the corresponding table or partition to reflect that change. Because + Impala does not have an <codeph>ALTER DATABASE</codeph> statement, this location-switching technique is not + practical for entire databases that have a custom <codeph>LOCATION</codeph> attribute. + </p> </conbody> + </concept> + <concept id="s3_internal_external"> + + <title>Internal and External Tables Located on S3</title> + + <conbody> + + <p> + Just as with tables located on HDFS storage, you can designate S3-based tables as either internal (managed + by Impala) or external, by using the syntax <codeph>CREATE TABLE</codeph> or <codeph>CREATE EXTERNAL + TABLE</codeph> respectively. When you drop an internal table, the files associated with the table are + removed, even if they are on S3 storage. When you drop an external table, the files associated with the + table are left alone, and are still available for access by other tools or components. See + <xref href="impala_tables.xml#tables"/> for details. + </p> + + <p> + If the data on S3 is intended to be long-lived and accessed by other tools in addition to Impala, create + any associated S3 tables with the <codeph>CREATE EXTERNAL TABLE</codeph> syntax, so that the files are not + deleted from S3 when the table is dropped. + </p> + + <p> + If the data on S3 is only needed for querying by Impala and can be safely discarded once the Impala + workflow is complete, create the associated S3 tables using the <codeph>CREATE TABLE</codeph> syntax, so + that dropping the table also deletes the corresponding data files on S3. + </p> + + <p> + For example, this session creates a table in S3 with the same column layout as a table in HDFS, then + examines the S3 table and queries some data from it. The table in S3 works the same as a table in HDFS as + far as the expected file format of the data, table and column statistics, and other table properties. The + only indication that it is not an HDFS table is the <codeph>s3a://</codeph> URL in the + <codeph>LOCATION</codeph> property. Many data files can reside in the S3 directory, and their combined + contents form the table data. Because the data in this example is uploaded after the table is created, a + <codeph>REFRESH</codeph> statement prompts Impala to update its cached information about the data files. 
+ </p> + +<codeblock>[localhost:21000] > create table usa_cities_s3 like usa_cities location 's3a://impala-demo/usa_cities'; +[localhost:21000] > desc usa_cities_s3; ++-------+----------+---------+ +| name | type | comment | ++-------+----------+---------+ +| id | smallint | | +| city | string | | +| state | string | | ++-------+----------+---------+ + +-- Now from a web browser, upload the same data file(s) to S3 as in the HDFS table, +-- under the relevant bucket and path. If you already have the data in S3, you would +-- point the table LOCATION at an existing path. + +[localhost:21000] > refresh usa_cities_s3; +[localhost:21000] > select count(*) from usa_cities_s3; ++----------+ +| count(*) | ++----------+ +| 289 | ++----------+ +[localhost:21000] > select distinct state from sample_data_s3 limit 5; ++----------------------+ +| state | ++----------------------+ +| Louisiana | +| Minnesota | +| Georgia | +| Alaska | +| Ohio | ++----------------------+ +[localhost:21000] > desc formatted usa_cities_s3; ++------------------------------+------------------------------+---------+ +| name | type | comment | ++------------------------------+------------------------------+---------+ +| # col_name | data_type | comment | +| | NULL | NULL | +| id | smallint | NULL | +| city | string | NULL | +| state | string | NULL | +| | NULL | NULL | +| # Detailed Table Information | NULL | NULL | +| Database: | s3_testing | NULL | +| Owner: | jrussell | NULL | +| CreateTime: | Mon Mar 16 11:36:25 PDT 2015 | NULL | +| LastAccessTime: | UNKNOWN | NULL | +| Protect Mode: | None | NULL | +| Retention: | 0 | NULL | +| Location: | s3a://impala-demo/usa_cities | NULL | +| Table Type: | MANAGED_TABLE | NULL | +... ++------------------------------+------------------------------+---------+ +</codeblock> + +<!-- Cut out unnecessary output, makes the example too wide. +| Table Parameters: | NULL | NULL | +| | COLUMN_STATS_ACCURATE | false | +| | numFiles | 0 | +| | numRows | -1 | +| | rawDataSize | -1 | +| | totalSize | 0 | +| | transient_lastDdlTime | 1426528176 | +| | NULL | NULL | +| # Storage Information | NULL | NULL | +| SerDe Library: | org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe | NULL | +| InputFormat: | org.apache.hadoop.mapred.TextInputFormat | NULL | +| OutputFormat: | org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat | NULL | +| Compressed: | No | NULL | +| Num Buckets: | 0 | NULL | +| Bucket Columns: | [] | NULL | +| Sort Columns: | [] | NULL | +--> + + <p> + In this case, we have already uploaded a Parquet file with a million rows of data to the + <codeph>sample_data</codeph> directory underneath the <codeph>impala-demo</codeph> bucket on S3. This + session creates a table with matching column settings pointing to the corresponding location in S3, then + queries the table. Because the data is already in place on S3 when the table is created, no + <codeph>REFRESH</codeph> statement is required. 
+ </p> + +<codeblock>[localhost:21000] > create table sample_data_s3 + > (id int, id bigint, val int, zerofill string, + > name string, assertion boolean, city string, state string) + > stored as parquet location 's3a://impala-demo/sample_data'; +[localhost:21000] > select count(*) from sample_data_s3;; ++----------+ +| count(*) | ++----------+ +| 1000000 | ++----------+ +[localhost:21000] > select count(*) howmany, assertion from sample_data_s3 group by assertion; ++---------+-----------+ +| howmany | assertion | ++---------+-----------+ +| 667149 | true | +| 332851 | false | ++---------+-----------+ +</codeblock> + + </conbody> + + </concept> + + <concept id="s3_queries"> + + <title>Running and Tuning Impala Queries for Data Stored on S3</title> + + <conbody> + + <p> + Once the appropriate <codeph>LOCATION</codeph> attributes are set up at the table or partition level, you + query data stored in S3 exactly the same as data stored on HDFS or in HBase: + </p> + + <ul> + <li> + Queries against S3 data support all the same file formats as for HDFS data. + </li> + + <li> + Tables can be unpartitioned or partitioned. For partitioned tables, either manually construct paths in S3 + corresponding to the HDFS directories representing partition key values, or use <codeph>ALTER TABLE ... + ADD PARTITION</codeph> to set up the appropriate paths in S3. + </li> + + <li> + HDFS and HBase tables can be joined to S3 tables, or S3 tables can be joined with each other. + </li> + + <li> + Authorization using the Sentry framework to control access to databases, tables, or columns works the + same whether the data is in HDFS or in S3. + </li> + + <li> + The <cmdname>catalogd</cmdname> daemon caches metadata for both HDFS and S3 tables. Use + <codeph>REFRESH</codeph> and <codeph>INVALIDATE METADATA</codeph> for S3 tables in the same situations + where you would issue those statements for HDFS tables. + </li> + + <li> + Queries against S3 tables are subject to the same kinds of admission control and resource management as + HDFS tables. + </li> + + <li> + Metadata about S3 tables is stored in the same metastore database as for HDFS tables. + </li> + + <li> + You can set up views referring to S3 tables, the same as for HDFS tables. + </li> + + <li> + The <codeph>COMPUTE STATS</codeph>, <codeph>SHOW TABLE STATS</codeph>, and <codeph>SHOW COLUMN + STATS</codeph> statements work for S3 tables also. + </li> + </ul> + + </conbody> + + <concept id="s3_performance"> + + <title>Understanding and Tuning Impala Query Performance for S3 Data</title> + <prolog> + <metadata> + <data name="Category" value="Performance"/> + </metadata> + </prolog> + + <conbody> + + <p> + Although Impala queries for data stored in S3 might be less performant than queries against the + equivalent data stored in HDFS, you can still do some tuning. Here are techniques you can use to + interpret explain plans and profiles for queries against S3 data, and tips to achieve the best + performance possible for such queries. + </p> + + <p> + All else being equal, performance is expected to be lower for queries running against data on S3 rather + than HDFS. The actual mechanics of the <codeph>SELECT</codeph> statement are somewhat different when the + data is in S3. Although the work is still distributed across the datanodes of the cluster, Impala might + parallelize the work for a distributed query differently for data on HDFS and S3. 
+
+ </conbody>
+
+ <concept id="s3_performance">
+
+ <title>Understanding and Tuning Impala Query Performance for S3 Data</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Performance"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ Although Impala queries for data stored in S3 might be less performant than queries against the
+ equivalent data stored in HDFS, you can still do some tuning. Here are techniques you can use to
+ interpret explain plans and profiles for queries against S3 data, and tips to achieve the best
+ performance possible for such queries.
+ </p>
+
+ <p>
+ All else being equal, performance is expected to be lower for queries running against data on S3 rather
+ than HDFS. The actual mechanics of the <codeph>SELECT</codeph> statement are somewhat different when the
+ data is in S3. Although the work is still distributed across the datanodes of the cluster, Impala might
+ parallelize the work for a distributed query differently for data on HDFS and S3. S3 does not have the
+ same notion of blocks as HDFS, so Impala uses heuristics to determine how to split up large S3 files for
+ processing in parallel. Because all hosts can access any S3 data file with equal efficiency, the
+ distribution of work might be different than for HDFS data, where the data blocks are physically read
+ using short-circuit local reads by hosts that contain the appropriate block replicas. Although the I/O to
+ read the S3 data might be spread evenly across the hosts of the cluster, the fact that all data is
+ initially retrieved across the network means that the overall query performance is likely to be lower for
+ S3 data than for HDFS data.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/s3_block_splitting"/>
+
+ <p conref="../shared/impala_common.xml#common/s3_dml_performance"/>
+
+ <p>
+ When optimizing aspects of complex queries such as the join order, Impala treats tables on HDFS and
+ S3 the same way. Therefore, follow all the same tuning recommendations for S3 tables as for HDFS ones,
+ such as using the <codeph>COMPUTE STATS</codeph> statement to help Impala construct accurate estimates of
+ row counts and cardinality. See <xref href="impala_performance.xml#performance"/> for details.
+ </p>
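+
+ <p>
+ As a quick illustrative sketch (the table name <codeph>sales_s3</codeph> is a hypothetical S3-backed
+ table, not part of any standard demo schema), the statistics statements are identical to what you would
+ run for an HDFS-backed table:
+ </p>
+
+<codeblock>[localhost:21000] > compute stats sales_s3;
+[localhost:21000] > show table stats sales_s3;
+[localhost:21000] > show column stats sales_s3;
+</codeblock>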
+
+ <p>
+ In query profile reports, the numbers for <codeph>BytesReadLocal</codeph>,
+ <codeph>BytesReadShortCircuit</codeph>, <codeph>BytesReadDataNodeCached</codeph>, and
+ <codeph>BytesReadRemoteUnexpected</codeph> are blank because those metrics come from HDFS.
+ If you do see any indications that a query against an S3 table performed <q>remote read</q>
+ operations, do not be alarmed. That is expected because, by definition, all the I/O for S3 tables involves
+ remote reads.
+ </p>
+
+ </conbody>
+
+ </concept>
+
+ </concept>
+
+ <concept id="s3_restrictions">
+
+ <title>Restrictions on Impala Support for S3</title>
+
+ <conbody>
+
+ <p>
+ Impala requires that the default filesystem for the cluster be HDFS. You cannot use S3 as the only
+ filesystem in the cluster.
+ </p>
+
+ <p rev="2.6.0 CDH-39913 IMPALA-1878">
+ Prior to CDH 5.8 / Impala 2.6, Impala could not perform DML operations (<codeph>INSERT</codeph>,
+ <codeph>LOAD DATA</codeph>, or <codeph>CREATE TABLE AS SELECT</codeph>) where the destination is a table
+ or partition located on an S3 filesystem. This restriction is lifted in CDH 5.8 / Impala 2.6 and higher.
+ </p>
+
+ <p>
+ Impala does not support the old <codeph>s3://</codeph> block-based and <codeph>s3n://</codeph> filesystem
+ schemes, only <codeph>s3a://</codeph>.
+ </p>
+
+ <p>
+ Although S3 is often used to store JSON-formatted data, the current Impala support for S3 does not include
+ directly querying JSON data. For Impala queries, use data files in one of the file formats listed in
+ <xref href="impala_file_formats.xml#file_formats"/>. If you have data in JSON format, you can prepare a
+ flattened version of that data for querying by Impala as part of your ETL cycle.
+ </p>
+
+ <p>
+ You cannot use the <codeph>ALTER TABLE ... SET CACHED</codeph> statement for tables or partitions that are
+ located in S3.
+ </p>
+
+ </conbody>
+
+ </concept>
+
+ <concept id="s3_best_practices" rev="2.6.0 CDH-33310 CDH-39913 IMPALA-1878">
+ <title>Best Practices for Using Impala with S3</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Guidelines"/>
+ <data name="Category" value="Best Practices"/>
+ </metadata>
+ </prolog>
+ <conbody>
+ <p>
+ The following guidelines represent best practices derived from testing and field experience with Impala on S3:
+ </p>
+ <ul>
+ <li>
+ <p>
+ Any reference to an S3 location must be fully qualified. (This rule applies when
+ S3 is not designated as the default filesystem.)
+ </p>
+ </li>
+ <li>
+ <p>
+ Set the safety valve <codeph>fs.s3a.connection.maximum</codeph> to 1500 for <cmdname>impalad</cmdname>.
+ </p>
+ </li>
+ <li>
+ <p>
+ Set the safety valve <codeph>fs.s3a.block.size</codeph> to 134217728
+ (128 MB in bytes) if most Parquet files queried by Impala were written by Hive
+ or ParquetMR jobs. Set the block size to 268435456 (256 MB in bytes) if most Parquet
+ files queried by Impala were written by Impala.
+ </p>
+ </li>
+ <li>
+ <p>
+ <codeph>DROP TABLE ... PURGE</codeph> is much faster than the default <codeph>DROP TABLE</codeph>.
+ The same applies to <codeph>ALTER TABLE ... DROP PARTITION PURGE</codeph>
+ versus the default <codeph>DROP PARTITION</codeph> operation.
+ However, due to the eventually consistent nature of S3, the files for that
+ table or partition could remain for some unbounded time when using <codeph>PURGE</codeph>.
+ The default <codeph>DROP TABLE/PARTITION</codeph> is slow because Impala copies the files to the HDFS trash folder,
+ and Impala waits until all the data is moved. <codeph>DROP TABLE/PARTITION ... PURGE</codeph> is a
+ fast delete operation, and the Impala statement finishes quickly even though the change might not
+ have propagated fully throughout S3.
+ </p>
+ </li>
+ <li>
+ <p>
+ <codeph>INSERT</codeph> statements are faster than <codeph>INSERT OVERWRITE</codeph> for S3.
+ The query option <codeph>S3_SKIP_INSERT_STAGING</codeph>, which is set to <codeph>true</codeph> by default,
+ skips the staging step for regular <codeph>INSERT</codeph> (but not <codeph>INSERT OVERWRITE</codeph>).
+ This makes the operation much faster, but consistency is not guaranteed: if a node fails during execution, the
+ table could end up with inconsistent data. Set this option to <codeph>false</codeph> if stronger
+ consistency is required; however, this setting makes <codeph>INSERT</codeph> operations slower.
+ See the sketch following this list for one way to toggle the option around a single statement.
+ </p>
+ </li>
+ <li>
+ <p>
+ Too many files in a table can make metadata loading and updating slow on S3.
+ If too many requests are made to S3, S3 has a back-off mechanism and
+ responds more slowly than usual. You might have many small files because of:
+ </p>
+ <ul>
+ <li>
+ <p>
+ Too many partitions due to over-granular partitioning. Prefer partitions with
+ many megabytes of data, so that even a query against a single partition can
+ be parallelized effectively.
+ </p>
+ </li>
+ <li>
+ <p>
+ Many small <codeph>INSERT</codeph> queries. Prefer bulk
+ <codeph>INSERT</codeph>s so that more data is written to fewer
+ files.
+ </p>
+ </li>
+ </ul>
+ </li>
+ </ul>
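+
+ <p>
+ The following sketch shows one way a session might apply the <codeph>S3_SKIP_INSERT_STAGING</codeph>
+ guideline above when a particular statement needs the stronger consistency guarantee. The table names
+ (<codeph>sales_s3</codeph> and <codeph>staging_sales</codeph>) are hypothetical placeholders.
+ </p>
+
+<codeblock>[localhost:21000] > set S3_SKIP_INSERT_STAGING=false;
+[localhost:21000] > insert into sales_s3 partition (year=2015)
+ > select id, amount from staging_sales;
+[localhost:21000] > set S3_SKIP_INSERT_STAGING=true;
+</codeblock>
+
+ </conbody>
+ </concept>
+</concept>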
