Repository: incubator-impala Updated Branches: refs/heads/doc_prototype 4bf483c44 -> bb88fdc0a
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/bb88fdc0/docs/topics/impala_replica_preference.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_replica_preference.xml b/docs/topics/impala_replica_preference.xml new file mode 100644 index 0000000..fcf93cc --- /dev/null +++ b/docs/topics/impala_replica_preference.xml @@ -0,0 +1,48 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept id="replica_preference" rev="2.7.0"> + + <title>REPLICA_PREFERENCE Query Option (CDH 5.9 or higher only)</title> + <titlealts audience="PDF"><navtitle>REPLICA_PREFERENCE</navtitle></titlealts> + <prolog> + <metadata> + <data name="Category" value="Impala"/> + <data name="Category" value="Impala Query Options"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> + </metadata> + </prolog> + + <conbody> + + <p rev="2.7.0"> + <indexterm audience="Cloudera">REPLICA_PREFERENCE query option</indexterm> + </p> + + <p> + The <codeph>REPLICA_PREFERENCE</codeph> query option + lets you spread the load more evenly if hotspots and bottlenecks persist, by allowing hosts to do local reads, + or even remote reads, to retrieve the data for cached blocks if Impala can determine that it would be + too expensive to do all such processing on a particular host. + </p> + + <p> + <b>Type:</b> numeric (0, 3, 5) + or corresponding mnemonic strings (<codeph>CACHE_LOCAL</codeph>, <codeph>DISK_LOCAL</codeph>, <codeph>REMOTE</codeph>). + The gaps in the numeric sequence are to accommodate other intermediate + values that might be added in the future. 
+ </p> + + <p> + <b>Default:</b> 0 (equivalent to <codeph>CACHE_LOCAL</codeph>) + </p> + + <p conref="../shared/impala_common.xml#common/added_in_270"/> + + <p conref="../shared/impala_common.xml#common/related_info"/> + <p> + <xref href="impala_perf_hdfs_caching.xml#hdfs_caching"/>, <xref href="impala_schedule_random_replica.xml#schedule_random_replica"/> + </p> + + </conbody> +</concept> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/bb88fdc0/docs/topics/impala_rm_initial_mem.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_rm_initial_mem.xml b/docs/topics/impala_rm_initial_mem.xml new file mode 100644 index 0000000..cfd77cf --- /dev/null +++ b/docs/topics/impala_rm_initial_mem.xml @@ -0,0 +1,29 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept id="rm_initial_mem" rev="2.5.0"> + + <title>RM_INITIAL_MEM Query Option</title> + <prolog> + <metadata> + <data name="Category" value="Impala"/> + <data name="Category" value="Impala Query Options"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> + </metadata> + </prolog> + + <conbody> + + <p rev="2.5.0"> + <indexterm audience="Cloudera">RM_INITIAL_MEM query option</indexterm> + </p> + + <p> + <b>Type:</b> + </p> + + <p> + <b>Default:</b> + </p> + </conbody> +</concept> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/bb88fdc0/docs/topics/impala_runtime_bloom_filter_size.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_runtime_bloom_filter_size.xml b/docs/topics/impala_runtime_bloom_filter_size.xml new file mode 100644 index 0000000..e372d4f --- /dev/null +++ b/docs/topics/impala_runtime_bloom_filter_size.xml @@ -0,0 +1,93 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept 
id="runtime_bloom_filter_size" rev="2.5.0"> + + <title>RUNTIME_BLOOM_FILTER_SIZE Query Option (CDH 5.7 or higher only)</title> + <titlealts audience="PDF"><navtitle>RUNTIME_BLOOM_FILTER_SIZE</navtitle></titlealts> + <prolog> + <metadata> + <data name="Category" value="Impala"/> + <data name="Category" value="Impala Query Options"/> + <data name="Category" value="Performance"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> + </metadata> + </prolog> + + <conbody> + + <p rev="2.5.0"> + <indexterm audience="Cloudera">RUNTIME_BLOOM_FILTER_SIZE query option</indexterm> + Size (in bytes) of Bloom filter data structure used by the runtime filtering + feature. + </p> + + <note type="important"> + <p rev="2.6.0 CDH-41184 IMPALA-3007"> + In CDH 5.8 / Impala 2.6 and higher, this query option only applies as a fallback, when statistics + are not available. By default, Impala estimates the optimal size of the Bloom filter structure + regardless of the setting for this option. (This is a change from the original behavior in + CDH 5.7 / Impala 2.5.) + </p> + <p rev="2.6.0 CDH-41184 IMPALA-3007"> + In CDH 5.8 / Impala 2.6 and higher, when the value of this query option is used for query planning, + it is constrained by the minimum and maximum sizes specified by the + <codeph>RUNTIME_FILTER_MIN_SIZE</codeph> and <codeph>RUNTIME_FILTER_MAX_SIZE</codeph> query options. + The filter size is adjusted upward or downward if necessary to fit within the minimum/maximum range. 
+ </p> + </note> + + <p conref="../shared/impala_common.xml#common/type_integer"/> + + <p> + <b>Default:</b> 1048576 (1 MB) + </p> + + <p> + <b>Maximum:</b> 16 MB + </p> + + <p conref="../shared/impala_common.xml#common/added_in_250"/> + + <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/> + + <p> + This setting affects optimizations for large and complex queries, such + as dynamic partition pruning for partitioned tables, and join optimization + for queries that join large tables. + Larger filters are more effective at handling + higher cardinality input sets, but consume more memory per filter. + <!-- + See <xref href="impala_runtime_filtering.xml#runtime_filtering"/> for details about when to change + the setting for this query option. + --> + </p> + + <p> + If your query filters on high-cardinality columns (for example, millions of different values) + and you do not get the expected speedup from the runtime filtering mechanism, consider + doing some benchmarks with a higher value for <codeph>RUNTIME_BLOOM_FILTER_SIZE</codeph>. + The extra memory devoted to the Bloom filter data structures can help make the filtering + more accurate. + </p> + + <p conref="../shared/impala_common.xml#common/runtime_filtering_option_caveat"/> + + <p> + Because the effectiveness of this setting depends so much on query characteristics and data distribution, + you typically only use it for specific queries that need some extra tuning, and the ideal value depends + on the query. Consider setting this query option immediately before the expensive query and + unsetting it immediately afterward. 
+ </p> + + <p conref="../shared/impala_common.xml#common/related_info"/> + <p> + <xref href="impala_runtime_filtering.xml"/>, + <!-- , <xref href="impala_partitioning.xml#dynamic_partition_pruning"/> --> + <xref href="impala_runtime_filter_mode.xml#runtime_filter_mode"/>, + <xref href="impala_runtime_filter_min_size.xml#runtime_filter_min_size"/>, + <xref href="impala_runtime_filter_max_size.xml#runtime_filter_max_size"/> + </p> + + </conbody> +</concept> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/bb88fdc0/docs/topics/impala_runtime_filter_max_size.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_runtime_filter_max_size.xml b/docs/topics/impala_runtime_filter_max_size.xml new file mode 100644 index 0000000..9f729b1 --- /dev/null +++ b/docs/topics/impala_runtime_filter_max_size.xml @@ -0,0 +1,51 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept id="runtime_filter_max_size" rev="2.6.0 IMPALA-3480 CDH-41184"> + + <title>RUNTIME_FILTER_MAX_SIZE Query Option (CDH 5.8 or higher only)</title> + <titlealts audience="PDF"><navtitle>RUNTIME_FILTER_MAX_SIZE</navtitle></titlealts> + <prolog> + <metadata> + <data name="Category" value="Impala"/> + <data name="Category" value="Impala Query Options"/> + <data name="Category" value="Performance"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> + </metadata> + </prolog> + + <conbody> + + <p rev="2.6.0 IMPALA-3480"> + <indexterm audience="Cloudera">RUNTIME_FILTER_MAX_SIZE query option</indexterm> + The <codeph>RUNTIME_FILTER_MAX_SIZE</codeph> query option + adjusts the settings for the runtime filtering feature. + This option defines the maximum size for a filter, + no matter what the estimates produced by the planner are. 
+ This value also overrides any higher number specified for the + <codeph>RUNTIME_BLOOM_FILTER_SIZE</codeph> query option. + Filter sizes are rounded up to the nearest power of two. + </p> + + <p conref="../shared/impala_common.xml#common/type_integer"/> + + <p> + <b>Default:</b> 0 (meaning use the value from the corresponding <cmdname>impalad</cmdname> startup option) + </p> + + <p conref="../shared/impala_common.xml#common/added_in_260"/> + + <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/> + + <p conref="../shared/impala_common.xml#common/runtime_filtering_option_caveat"/> + + <p conref="../shared/impala_common.xml#common/related_info"/> + <p> + <xref href="impala_runtime_filtering.xml"/>, + <xref href="impala_runtime_filter_mode.xml#runtime_filter_mode"/>, + <xref href="impala_runtime_filter_min_size.xml#runtime_filter_min_size"/>, + <xref href="impala_runtime_bloom_filter_size.xml#runtime_bloom_filter_size"/> + </p> + + </conbody> +</concept> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/bb88fdc0/docs/topics/impala_runtime_filter_min_size.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_runtime_filter_min_size.xml b/docs/topics/impala_runtime_filter_min_size.xml new file mode 100644 index 0000000..ec152f6 --- /dev/null +++ b/docs/topics/impala_runtime_filter_min_size.xml @@ -0,0 +1,51 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept id="runtime_filter_min_size" rev="2.6.0 IMPALA-3480 CDH-41184"> + + <title>RUNTIME_FILTER_MIN_SIZE Query Option (CDH 5.8 or higher only)</title> + <titlealts audience="PDF"><navtitle>RUNTIME_FILTER_MIN_SIZE</navtitle></titlealts> + <prolog> + <metadata> + <data name="Category" value="Impala"/> + <data name="Category" value="Impala Query Options"/> + <data name="Category" value="Performance"/> + <data name="Category" value="Developers"/> + <data name="Category" 
value="Data Analysts"/> + </metadata> + </prolog> + + <conbody> + + <p rev="2.6.0 IMPALA-3480"> + <indexterm audience="Cloudera">RUNTIME_FILTER_MIN_SIZE query option</indexterm> + The <codeph>RUNTIME_FILTER_MIN_SIZE</codeph> query option + adjusts the settings for the runtime filtering feature. + This option defines the minimum size for a filter, + no matter what the estimates produced by the planner are. + This value also overrides any lower number specified for the + <codeph>RUNTIME_BLOOM_FILTER_SIZE</codeph> query option. + Filter sizes are rounded up to the nearest power of two. + </p> + + <p conref="../shared/impala_common.xml#common/type_integer"/> + + <p> + <b>Default:</b> 0 (meaning use the value from the corresponding <cmdname>impalad</cmdname> startup option) + </p> + + <p conref="../shared/impala_common.xml#common/added_in_260"/> + + <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/> + + <p conref="../shared/impala_common.xml#common/runtime_filtering_option_caveat"/> + + <p conref="../shared/impala_common.xml#common/related_info"/> + <p> + <xref href="impala_runtime_filtering.xml"/>, + <xref href="impala_runtime_filter_mode.xml#runtime_filter_mode"/>, + <xref href="impala_runtime_filter_max_size.xml#runtime_filter_max_size"/>, + <xref href="impala_runtime_bloom_filter_size.xml#runtime_bloom_filter_size"/> + </p> + + </conbody> +</concept> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/bb88fdc0/docs/topics/impala_runtime_filter_mode.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_runtime_filter_mode.xml b/docs/topics/impala_runtime_filter_mode.xml new file mode 100644 index 0000000..2494621 --- /dev/null +++ b/docs/topics/impala_runtime_filter_mode.xml @@ -0,0 +1,77 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept id="runtime_filter_mode" rev="2.5.0"> + + <title>RUNTIME_FILTER_MODE Query 
Option (CDH 5.7 or higher only)</title> + <titlealts audience="PDF"><navtitle>RUNTIME_FILTER_MODE</navtitle></titlealts> + <prolog> + <metadata> + <data name="Category" value="Impala"/> + <data name="Category" value="Impala Query Options"/> + <data name="Category" value="Performance"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> + </metadata> + </prolog> + + <conbody> + + <p rev="2.5.0"> + <indexterm audience="Cloudera">RUNTIME_FILTER_MODE query option</indexterm> + </p> + + <p> + The <codeph>RUNTIME_FILTER_MODE</codeph> query option + adjusts the settings for the runtime filtering feature. + It turns this feature on and off, and controls how + extensively the filters are transmitted between hosts. + </p> + + <p> + <b>Type:</b> numeric (0, 1, 2) + or corresponding mnemonic strings (<codeph>OFF</codeph>, <codeph>LOCAL</codeph>, <codeph>GLOBAL</codeph>). + </p> + + <p rev="2.6.0 CDH-41184"> + <b>Default:</b> 2 (equivalent to <codeph>GLOBAL</codeph>); formerly was 1 / <codeph>LOCAL</codeph>, in CDH 5.7 / Impala 2.5 + </p> + + <p conref="../shared/impala_common.xml#common/added_in_250"/> + + <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/> + + <p rev="2.6.0 CDH-41184"> + In CDH 5.8 / Impala 2.6 and higher, the default is <codeph>GLOBAL</codeph>. + This setting is recommended for a wide variety of workloads, to provide best + performance with <q>out of the box</q> settings. + </p> + + <p rev="2.6.0 CDH-41184"> + The lowest setting of <codeph>LOCAL</codeph> does a similar level of optimization + (such as partition pruning) as in earlier Impala releases. + This setting was the default in CDH 5.7 / Impala 2.5, + to allow for a period of post-upgrade testing for existing workloads. + This setting is suitable for workloads with non-performance-critical queries, + or if the coordinator node is under heavy CPU or memory pressure. 
+ </p> + + <p> + You might change the setting to <codeph>OFF</codeph> if your workload contains + many queries involving partitioned tables or joins that do not experience a performance + increase from the runtime filters feature. If the overhead of producing the runtime filters + outweighs the performance benefit for queries, you can turn the feature off entirely. + </p> + + <p conref="../shared/impala_common.xml#common/related_info"/> + <p> + <xref href="impala_partitioning.xml#partitioning"/> for details about runtime filtering. + <xref href="impala_disable_row_runtime_filtering.xml#disable_row_runtime_filtering"/>, + <xref href="impala_runtime_bloom_filter_size.xml#runtime_bloom_filter_size"/>, + <xref href="impala_runtime_filter_wait_time_ms.xml#runtime_filter_wait_time_ms"/>, + and + <xref href="impala_max_num_runtime_filters.xml#max_num_runtime_filters"/> + for tuning options for runtime filtering. + </p> + + </conbody> +</concept> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/bb88fdc0/docs/topics/impala_runtime_filter_wait_time_ms.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_runtime_filter_wait_time_ms.xml b/docs/topics/impala_runtime_filter_wait_time_ms.xml new file mode 100644 index 0000000..222d65c --- /dev/null +++ b/docs/topics/impala_runtime_filter_wait_time_ms.xml @@ -0,0 +1,47 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept id="runtime_filter_wait_time_ms" rev="2.5.0"> + + <title>RUNTIME_FILTER_WAIT_TIME_MS Query Option (CDH 5.7 or higher only)</title> + <titlealts audience="PDF"><navtitle>RUNTIME_FILTER_WAIT_TIME_MS</navtitle></titlealts> + <prolog> + <metadata> + <data name="Category" value="Impala"/> + <data name="Category" value="Impala Query Options"/> + <data name="Category" value="Performance"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> + 
</metadata> + </prolog> + + <conbody> + + <p rev="2.5.0"> + <indexterm audience="Cloudera">RUNTIME_FILTER_WAIT_TIME_MS query option</indexterm> + The <codeph>RUNTIME_FILTER_WAIT_TIME_MS</codeph> query option + adjusts the settings for the runtime filtering feature. + It specifies a time in milliseconds that each scan node waits for + runtime filters to be produced by other plan fragments. + </p> + + <p conref="../shared/impala_common.xml#common/type_integer"/> + + <p> + <b>Default:</b> 0 (meaning use the value from the corresponding <cmdname>impalad</cmdname> startup option) + </p> + + <p conref="../shared/impala_common.xml#common/added_in_250"/> + + <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/> + + <p conref="../shared/impala_common.xml#common/runtime_filtering_option_caveat"/> + + <p conref="../shared/impala_common.xml#common/related_info"/> + <p> + <xref href="impala_runtime_filtering.xml"/>, + <xref href="impala_runtime_filter_mode.xml#runtime_filter_mode"/> + <!-- , <xref href="impala_partitioning.xml#dynamic_partition_pruning"/> --> + </p> + + </conbody> +</concept> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/bb88fdc0/docs/topics/impala_runtime_filtering.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_runtime_filtering.xml b/docs/topics/impala_runtime_filtering.xml new file mode 100644 index 0000000..4c69e0c --- /dev/null +++ b/docs/topics/impala_runtime_filtering.xml @@ -0,0 +1,506 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept id="runtime_filtering" rev="2.5.0"> + + <title id="runtime_filters">Runtime Filtering for Impala Queries</title> + <titlealts audience="PDF"><navtitle>Runtime Filtering</navtitle></titlealts> + <prolog> + <metadata> + <data name="Category" value="Impala"/> + <data name="Category" value="SQL"/> + <data name="Category" value="Querying"/> + <data name="Category" 
value="Performance"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> + </metadata> + </prolog> + + <conbody> + + <p rev="2.5.0"> + <indexterm audience="Cloudera">runtime filtering</indexterm> + <term>Runtime filtering</term> is a wide-ranging optimization feature available in + CDH 5.7 / Impala 2.5 and higher. When only a fraction of the data in a table is + needed for a query against a partitioned table or to evaluate a join condition, + Impala determines the appropriate conditions while the query is running, and + broadcasts that information to all the <cmdname>impalad</cmdname> nodes that are reading the table + so that they can avoid unnecessary I/O to read partition data, and avoid + unnecessary network transmission by sending only the subset of rows that match the join keys + across the network. + </p> + + <p> + This feature is primarily used to optimize queries against large partitioned tables + (under the name <term>dynamic partition pruning</term>) and joins of large tables. + The information in this section includes concepts, internals, and troubleshooting + information for the entire runtime filtering feature. + For specific tuning steps for partitioned tables, + <!-- and join queries, --> + see + <xref href="impala_partitioning.xml#dynamic_partition_pruning"/>. + <!-- and <xref href="impala_joins.xml#joins"/>. --> + </p> + + <note type="important" rev="2.6.0 CDH-41184"> + <p rev="2.6.0 CDH-41184"> + When this feature made its debut in CDH 5.7 / Impala 2.5, + the default setting was <codeph>RUNTIME_FILTER_MODE=LOCAL</codeph>. + Now the default is <codeph>RUNTIME_FILTER_MODE=GLOBAL</codeph> in CDH 5.8 / Impala 2.6 and higher, + which enables more wide-ranging and ambitious query optimization without requiring you to + explicitly set any query options. 
+ </p> + </note> + + <p outputclass="toc inpage"/> + + </conbody> + + <concept id="runtime_filtering_concepts"> + <title>Background Information for Runtime Filtering</title> + <conbody> + <p> + To understand how runtime filtering works at a detailed level, you must + be familiar with some terminology from the field of distributed database technology: + </p> + <ul> + <li> + <p> + What a <term>plan fragment</term> is. + Impala decomposes each query into smaller units of work that are distributed across the cluster. + Wherever possible, a data block is read, filtered, and aggregated by plan fragments executing + on the same host. For some operations, such as joins and combining intermediate results into + a final result set, data is transmitted across the network from one DataNode to another. + </p> + </li> + <li> + <p> + What <codeph>SCAN</codeph> and <codeph>HASH JOIN</codeph> plan nodes are, and their role in computing query results: + </p> + <p> + In the Impala query plan, a <term>scan node</term> performs the I/O to read from the underlying data files. + Although this is an expensive operation from the traditional database perspective, Hadoop clusters and Impala are + optimized to do this kind of I/O in a highly parallel fashion. The major potential cost savings come from using + the columnar Parquet format (where Impala can avoid reading data for unneeded columns) and partitioned tables + (where Impala can avoid reading data for unneeded partitions). + </p> + <p> + Most Impala joins use the + <xref href="https://en.wikipedia.org/wiki/Hash_join" scope="external" format="html"><term>hash join</term></xref> + mechanism. (It is only fairly recently that Impala + started using the nested-loop join technique, for certain kinds of non-equijoin queries.) + In a hash join, when evaluating join conditions from two tables, Impala constructs a hash table in memory with all + the different column values from the table on one side of the join. 
+ Then, for each row from the table on the other side of the join, Impala tests whether the relevant column values + are in this hash table or not. + </p> + <p> + A <term>hash join node</term> constructs such an in-memory hash table, then performs the comparisons to + identify which rows match the relevant join conditions + and should be included in the result set (or at least sent on to the subsequent intermediate stage of + query processing). Because some of the input for a hash join might be transmitted across the network from another host, + it is especially important from a performance perspective to prune out ahead of time any data that is known to be + irrelevant. + </p> + <p> + The more distinct values are in the columns used as join keys, the larger the in-memory hash table and + thus the more memory required to process the query. + </p> + </li> + <li> + <p> + The difference between a <term>broadcast join</term> and a <term>shuffle join</term>. + (The Hadoop notion of a shuffle join is sometimes referred to in Impala as a <term>partitioned join</term>.) + In a broadcast join, the table from one side of the join (typically the smaller table) + is sent in its entirety to all the hosts involved in the query. Then each host can compare its + portion of the data from the other (larger) table against the full set of possible join keys. + In a shuffle join, there is no obvious <q>smaller</q> table, and so the contents of both tables + are divided up, and corresponding portions of the data are transmitted to each host involved in the query. + See <xref href="impala_hints.xml#hints"/> for information about how these different kinds of + joins are processed. + </p> + </li> + <li> + <p> + The notion of the build phase and probe phase when Impala processes a join query. 
+ The <term>build phase</term> is where the rows containing the join key columns, typically for the smaller table, + are transmitted across the network and built into an in-memory hash table data structure on one or + more destination nodes. + The <term>probe phase</term> is where data is read locally (typically from the larger table) and the join key columns + are compared to the values in the in-memory hash table. + The corresponding input sources (tables, subqueries, and so on) for these + phases are referred to as the <term>build side</term> and the <term>probe side</term>. + </p> + </li> + <li> + <p> + How to set Impala query options: interactively within an <cmdname>impala-shell</cmdname> session through + the <codeph>SET</codeph> command, for a JDBC or ODBC application through the <codeph>SET</codeph> statement, or + globally for all <cmdname>impalad</cmdname> daemons through the <codeph>default_query_options</codeph> configuration + setting. + </p> + </li> + </ul> + </conbody> + </concept> + + <concept id="runtime_filtering_internals"> + <title>Runtime Filtering Internals</title> + <conbody> + <p> + The <term>filter</term> that is transmitted between plan fragments is essentially a list + of values for join key columns. When this list of values is transmitted in time to a scan node, + Impala can filter out non-matching values immediately after reading them, rather than transmitting + the raw data to another host to compare against the in-memory hash table on that host. + This data structure is implemented as a <term>Bloom filter</term>, which uses a probability-based + algorithm to determine all possible matching values. (The probability-based aspect means that the + filter might include some non-matching values, but if so, that does not cause any inaccuracy + in the final results.) + </p> + <p> + There are different kinds of filters to match the different kinds of joins (partitioned and broadcast). 
+ A broadcast filter is a complete list of relevant values that can be immediately evaluated by a scan node. + A partitioned filter is a partial list of relevant values (based on the data processed by one host in the + cluster); all the partitioned filters must be combined into one (by the coordinator node) before the + scan nodes can use the results to accurately filter the data as it is read from storage. + </p> + <p> + Broadcast filters are also classified as local or global. With a local broadcast filter, the information + in the filter is used by a subsequent query fragment that is running on the same host that produced the filter. + A non-local broadcast filter must be transmitted across the network to a query fragment that is running on a + different host. Impala designates 3 hosts to each produce non-local broadcast filters, to guard against the + possibility of a single slow host taking too long. Depending on the setting of the <codeph>RUNTIME_FILTER_MODE</codeph> query option + (<codeph>LOCAL</codeph> or <codeph>GLOBAL</codeph>), Impala either uses a conservative optimization + strategy where filters are only consumed on the same host that produced them, or a more aggressive strategy + where filters are eligible to be transmitted across the network. + </p> + + <note rev="2.6.0 CDH-41184 IMPALA-3333"> + In CDH 5.8 / Impala 2.6 and higher, the default for runtime filtering is the <codeph>GLOBAL</codeph> setting. + </note> + + </conbody> + </concept> + + <concept id="runtime_filtering_file_formats"> + <title>File Format Considerations for Runtime Filtering</title> + <conbody> + <p> + Parquet tables get the most benefit from + the runtime filtering optimizations. Runtime filtering can speed up + join queries against partitioned or unpartitioned Parquet tables, + and single-table queries against partitioned Parquet tables. + See <xref href="impala_parquet.xml#parquet"/> for information about + using Parquet tables with Impala. 
+ </p> + <p> + For other file formats (text, Avro, RCFile, and SequenceFile), + runtime filtering speeds up queries against partitioned tables only. + Because partitioned tables can use a mixture of formats, Impala produces + the filters in all cases, even if they are not ultimately used to + optimize the query. + </p> + </conbody> + </concept> + + <concept id="runtime_filtering_timing"> + <title>Wait Intervals for Runtime Filters</title> + <conbody> + <p> + Because it takes time to produce runtime filters, especially for + partitioned filters that must be combined by the coordinator node, + there is a time interval above which it is more efficient for + the scan nodes to go ahead and construct their intermediate result sets, + even if that intermediate data is larger than optimal. If it only takes + a few seconds to produce the filters, it is worth the extra time if pruning + the unnecessary data can save minutes in the overall query time. + You can specify the maximum wait time in milliseconds using the + <codeph>RUNTIME_FILTER_WAIT_TIME_MS</codeph> query option. + </p> + <p> + By default, each scan node waits for up to 1 second (1000 milliseconds) + for filters to arrive. If all filters have not arrived within the + specified interval, the scan node proceeds, using whatever filters + did arrive to help avoid reading unnecessary data. If a filter arrives + after the scan node begins reading data, the scan node applies that + filter to the data that is read after the filter arrives, but not to + the data that was already read. + </p> + <p> + If the cluster is relatively busy and your workload contains many + resource-intensive or long-running queries, consider increasing the wait time + so that complicated queries do not miss opportunities for optimization. + If the cluster is lightly loaded and your workload contains many small queries + taking only a few seconds, consider decreasing the wait time to avoid the + 1 second delay for each query. 
+ </p> + </conbody> + </concept> + + + <concept id="runtime_filtering_query_options"> + <title>Query Options for Runtime Filtering</title> + <conbody> + <p> + See the following sections for information about the query options that control runtime filtering: + </p> + <ul> + <li> + <p> + The first query option adjusts the <q>sensitivity</q> of this feature. + <ph rev="2.6.0 CDH-41184 IMPALA-3333">By default, it is set to the highest level (<codeph>GLOBAL</codeph>). + (This default applies to CDH 5.8 / Impala 2.6 and higher. + In previous releases, the default was <codeph>LOCAL</codeph>.)</ph> + </p> + <ul> + <li> + <p> + <xref href="impala_runtime_filter_mode.xml#runtime_filter_mode"/> + </p> + </li> + </ul> + </li> + <li> + <p> + The other query options are tuning knobs that you typically only adjust after doing + performance testing, and that you might want to change only for the duration of a single + expensive query: + </p> + <ul> + <li> + <p> + <xref href="impala_max_num_runtime_filters.xml#max_num_runtime_filters"/> + </p> + </li> + <li> + <p> + <xref href="impala_disable_row_runtime_filtering.xml#disable_row_runtime_filtering"/> + </p> + </li> + <li> + <p rev="2.6.0 IMPALA-3480"> + <xref href="impala_runtime_filter_max_size.xml#runtime_filter_max_size"/> + </p> + </li> + <li> + <p rev="2.6.0 IMPALA-3480"> + <xref href="impala_runtime_filter_min_size.xml#runtime_filter_min_size"/> + </p> + </li> + <li> + <p rev="2.6.0 IMPALA-3007"> + <xref href="impala_runtime_bloom_filter_size.xml#runtime_bloom_filter_size"/>; + in CDH 5.8 / Impala 2.6 and higher, this setting acts as a fallback when + statistics are not available, rather than as a directive. 
+ </p> + </li> + </ul> + </li> + </ul> + </conbody> + </concept> + + <concept id="runtime_filtering_explain_plan"> + <title>Runtime Filtering and Query Plans</title> + <conbody> + <p> + In the same way the query plan displayed by the + <codeph>EXPLAIN</codeph> statement includes information + about predicates used by each plan fragment, it also + includes annotations showing whether a plan fragment + produces or consumes a runtime filter. + A plan fragment that produces a filter includes an + annotation such as + <codeph>runtime filters: <varname>filter_id</varname> <- <varname>table</varname>.<varname>column</varname></codeph>, + while a plan fragment that consumes a filter includes an annotation such as + <codeph>runtime filters: <varname>filter_id</varname> -> <varname>table</varname>.<varname>column</varname></codeph>. + </p> + + <p> + The following example shows a query that uses a single runtime filter (labelled <codeph>RF00</codeph>) + to prune the partitions that are scanned in one stage of the query, based on evaluating the + result set of a subquery: + </p> + +<codeblock conref="../shared/impala_common.xml#common/simple_dpp_example"/> + + <p> + The query profile (displayed by the <codeph>PROFILE</codeph> command in <cmdname>impala-shell</cmdname>) + contains both the <codeph>EXPLAIN</codeph> plan and more detailed information about the internal + workings of the query. The profile output includes a section labelled the <q>filter routing table</q>, + with information about each filter based on its ID. 
+ </p> + </conbody> + </concept> + + <concept id="runtime_filtering_queries"> + <title>Examples of Queries that Benefit from Runtime Filtering</title> + <conbody> + + <p> + In this example, Impala would normally do extra work to interpret the columns + <codeph>C1</codeph>, <codeph>C2</codeph>, <codeph>C3</codeph>, and <codeph>ID</codeph> + for each row in <codeph>HUGE_T1</codeph>, before checking the <codeph>ID</codeph> + value against the in-memory hash table constructed from all the <codeph>TINY_T2.ID</codeph> + values. By producing a filter containing all the <codeph>TINY_T2.ID</codeph> values + even before the query starts scanning the <codeph>HUGE_T1</codeph> table, Impala + can skip the unnecessary work to parse the column info as soon as it determines + that an <codeph>ID</codeph> value does not match any of the values from the other table. + </p> + + <p> + The example shows <codeph>COMPUTE STATS</codeph> statements for both the tables (even + though that is a one-time operation after loading data into those tables) because + Impala relies on up-to-date statistics to + determine which one has more distinct <codeph>ID</codeph> values than the other. + That information lets Impala make effective decisions about which table to use to + construct the in-memory hash table, and which table to read from disk and + compare against the entries in the hash table. + </p> + +<codeblock rev="2.6.0 CDH-41184"> +COMPUTE STATS huge_t1; +COMPUTE STATS tiny_t2; +SELECT c1, c2, c3 FROM huge_t1 JOIN tiny_t2 WHERE huge_t1.id = tiny_t2.id; +</codeblock> + +<!-- The greater-than comparison prevents runtime filtering from applying. Comment out for now; + put back if the example can be reworked in a way that does produce some filters. + <p> + In this example, <codeph>T1</codeph> is a table partitioned by year. 
The subquery + on <codeph>T2</codeph> produces a single value with the <codeph>MIN(year)</codeph> result, + and transmits that value as a filter to the plan fragments that are reading from <codeph>T1</codeph>. + Any non-matching partitions in <codeph>T1</codeph> are skipped. + </p> + +<codeblock> +select c1 from t1 where year > (select min(year) from t2); +</codeblock> +--> + + <p> + In this example, <codeph>T1</codeph> is a table partitioned by year. The subquery + on <codeph>T2</codeph> produces multiple values, and transmits those values as a filter to the plan + fragments that are reading from <codeph>T1</codeph>. Any non-matching partitions in <codeph>T1</codeph> + are skipped. + </p> + +<codeblock rev="2.6.0 CDH-41184"> +select c1 from t1 where year in (select distinct year from t2); +</codeblock> + + <p> + Now the <codeph>WHERE</codeph> clause contains an additional test that does not apply to + the partition key column. + A filter on a column that is not a partition key is called a per-row filter. + Because per-row filters only apply for Parquet, <codeph>T1</codeph> must be a Parquet table. + </p> + + <p> + The subqueries result in two filters being transmitted to + the scan nodes that read from <codeph>T1</codeph>. The filter on <codeph>YEAR</codeph> helps the query eliminate + entire partitions based on non-matching years. The filter on <codeph>C2</codeph> lets Impala discard + rows with non-matching <codeph>C2</codeph> values immediately after reading them. Without runtime filtering, + Impala would have to keep the non-matching values in memory, assemble <codeph>C1</codeph>, <codeph>C2</codeph>, + and <codeph>C3</codeph> into rows in the intermediate result set, and transmit all the intermediate rows + back to the coordinator node, where they would be eliminated only at the very end of the query. 
+ </p> + +<codeblock rev="2.6.0 CDH-41184"> +select c1, c2, c3 from t1 + where year in (select distinct year from t2) + and c2 in (select other_column from t3); +</codeblock> + + <p> + This example involves a broadcast join. + The fact that the <codeph>ON</codeph> clause would + return a small number of matching rows (because there + are not very many rows in <codeph>TINY_T2</codeph>) + means that the corresponding filter is very selective. + Therefore, runtime filtering will probably be effective + in optimizing this query. + </p> + +<codeblock rev="2.6.0 CDH-41184"> +select c1 from huge_t1 join [broadcast] tiny_t2 + on huge_t1.id = tiny_t2.id + where huge_t1.year in (select distinct year from tiny_t2) + and c2 in (select other_column from t3); +</codeblock> + + <p> + This example involves a shuffle or partitioned join. + Assume that most rows in <codeph>HUGE_T1</codeph> + have a corresponding row in <codeph>HUGE_T2</codeph>. + The fact that the <codeph>ON</codeph> clause could + return a large number of matching rows means that + the corresponding filter would not be very selective. + Therefore, runtime filtering might be less effective + in optimizing this query. + </p> + +<codeblock rev="2.6.0 CDH-41184"> +select c1 from huge_t1 join [shuffle] huge_t2 + on huge_t1.id = huge_t2.id + where huge_t1.year in (select distinct year from huge_t2) + and c2 in (select other_column from t3); +</codeblock> + + </conbody> + </concept> + + <concept id="runtime_filtering_tuning"> + <title>Tuning and Troubleshooting Queries that Use Runtime Filtering</title> + <conbody> + <p> + These tuning and troubleshooting procedures apply to queries that are + resource-intensive enough, long-running enough, and frequent enough + that you can devote special attention to optimizing them individually. 
+ </p> + + <p> + Use the <codeph>EXPLAIN</codeph> statement and examine the <codeph>runtime filters:</codeph> + lines to determine whether runtime filters are being applied to the <codeph>WHERE</codeph> predicates + and join clauses that you expect. For example, runtime filtering does not apply to queries that use + the nested loop join mechanism due to non-equijoin operators. + </p> + + <p> + Make sure statistics are up-to-date for all tables involved in the queries. + Use the <codeph>COMPUTE STATS</codeph> statement after loading data into non-partitioned tables, + and <codeph>COMPUTE INCREMENTAL STATS</codeph> after adding new partitions to partitioned tables. + </p> + + <p> + If join queries involving large tables use unique columns as the join keys, + for example joining a primary key column with a foreign key column, the overhead of + producing and transmitting the filter might outweigh the performance benefit because + not much data could be pruned during the early stages of the query. + For such queries, consider setting the query option <codeph>RUNTIME_FILTER_MODE=OFF</codeph>. + </p> + + </conbody> + </concept> + + <concept id="runtime_filtering_limits"> + <title>Limitations and Restrictions for Runtime Filtering</title> + <conbody> + <p> + The runtime filtering feature is most effective for the Parquet file formats. + For other file formats, filtering only applies for partitioned tables. + See <xref href="impala_runtime_filtering.xml#runtime_filtering_file_formats"/>. + </p> + + <!-- To do: check if this restriction is lifted in 5.8 / 2.6. --> + <p rev="IMPALA-3054"> + When the spill-to-disk mechanism is activated on a particular host during a query, + that host does not produce any filters while processing that query. + This limitation does not affect the correctness of results; it only reduces the + amount of optimization that can be applied to the query. 
+ </p> + + </conbody> + </concept> + + +</concept> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/bb88fdc0/docs/topics/impala_s3_skip_insert_staging.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_s3_skip_insert_staging.xml b/docs/topics/impala_s3_skip_insert_staging.xml new file mode 100644 index 0000000..a9cceb5 --- /dev/null +++ b/docs/topics/impala_s3_skip_insert_staging.xml @@ -0,0 +1,77 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept id="s3_skip_insert_staging" rev="2.6.0 IMPALA-3452 CDH-39913"> + + <title>S3_SKIP_INSERT_STAGING Query Option (CDH 5.8 or higher only)</title> + <titlealts audience="PDF"><navtitle>S3_SKIP_INSERT_STAGING</navtitle></titlealts> + <prolog> + <metadata> + <data name="Category" value="Impala"/> + <data name="Category" value="Impala Query Options"/> + <data name="Category" value="Amazon"/> + <data name="Category" value="S3"/> + <data name="Category" value="Performance"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> + </metadata> + </prolog> + + <conbody> + + <p rev="2.6.0 IMPALA-3452 CDH-39913"> + <indexterm audience="Cloudera">IMPALA_S3_SKIP_INSERT_STAGING query option</indexterm> + </p> + + <p> + Speeds up <codeph>INSERT</codeph> operations on tables or partitions residing on the + Amazon S3 filesystem. The tradeoff is the possibility of inconsistent data left behind + if an error occurs partway through the operation. + </p> + + <p> + By default, Impala write operations to S3 tables and partitions involve a two-stage process. + Impala writes intermediate files to S3, then (because S3 does not provide a <q>rename</q> + operation) those intermediate files are copied to their final location, making the process + more expensive than on a filesystem that supports renaming or moving files. 
+ This query option makes Impala skip the intermediate files, and instead write the + new data directly to the final destination. + </p> + + <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/> + + <note type="important"> + <p> + If a host that is participating in the <codeph>INSERT</codeph> operation fails partway through + the query, you might be left with a table or partition that contains some but not all of the + expected data files. Therefore, this option is most appropriate for a development or test + environment where you have the ability to reconstruct the table if a problem during + <codeph>INSERT</codeph> leaves the data in an inconsistent state. + </p> + </note> + + <p> + The timing of file deletion during an <codeph>INSERT OVERWRITE</codeph> operation + makes it impractical to write new files to S3 and delete the old files in a single operation. + Therefore, this query option only affects regular <codeph>INSERT</codeph> statements that add + to the existing data in a table, not <codeph>INSERT OVERWRITE</codeph> statements. + Use <codeph>TRUNCATE TABLE</codeph> if you need to remove all contents from an S3 table + before performing a fast <codeph>INSERT</codeph> with this option enabled. + </p> + + <p> + Performance improvements with this option enabled can be substantial. The speed increase + might be more noticeable for non-partitioned tables than for partitioned tables. 
+ </p> + + <p conref="../shared/impala_common.xml#common/type_boolean"/> + <p conref="../shared/impala_common.xml#common/default_true_1"/> + + <p conref="../shared/impala_common.xml#common/added_in_260"/> + + <p conref="../shared/impala_common.xml#common/related_info"/> + <p> + <xref href="impala_s3.xml#s3"/> + </p> + + </conbody> +</concept> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/bb88fdc0/docs/topics/impala_scan_node_codegen_threshold.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_scan_node_codegen_threshold.xml b/docs/topics/impala_scan_node_codegen_threshold.xml new file mode 100644 index 0000000..8080edd --- /dev/null +++ b/docs/topics/impala_scan_node_codegen_threshold.xml @@ -0,0 +1,75 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept id="scan_node_codegen_threshold" rev="2.5.0 IMPALA-1755"> + + <title>SCAN_NODE_CODEGEN_THRESHOLD Query Option (CDH 5.7 or higher only)</title> + <titlealts audience="PDF"><navtitle>SCAN_NODE_CODEGEN_THRESHOLD</navtitle></titlealts> + <prolog> + <metadata> + <data name="Category" value="Impala"/> + <data name="Category" value="Impala Query Options"/> + <data name="Category" value="Performance"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> + </metadata> + </prolog> + + <conbody> + + <p rev="2.5.0 IMPALA-1755"> + <indexterm audience="Cloudera">SCAN_NODE_CODEGEN_THRESHOLD query option</indexterm> + The <codeph>SCAN_NODE_CODEGEN_THRESHOLD</codeph> query option + adjusts the aggressiveness of the code generation optimization process + when performing I/O read operations. It can help to work around performance problems + for queries where the table is small and the <codeph>WHERE</codeph> clause is complicated. 
+ </p> + + <p conref="../shared/impala_common.xml#common/type_integer"/> + + <p> + <b>Default:</b> 1800000 (1.8 million) + </p> + + <p conref="../shared/impala_common.xml#common/added_in_250"/> + + <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/> + + <p> + This query option is intended mainly for the case where a query with a very complicated + <codeph>WHERE</codeph> clause, such as an <codeph>IN</codeph> operator with thousands + of entries, is run against a small table, especially a small table using Parquet format. + The code generation phase can become the dominant factor in the query response time, + making the query take several seconds even though there is relatively little work to do. + In this case, increase the value of this option to a much larger amount, anything up to + the maximum for a 32-bit integer. + </p> + + <p> + Because this option only affects the code generation phase for the portion of the + query that performs I/O (the <term>scan nodes</term> within the query plan), it + lets you continue to keep code generation enabled for other queries, and other parts + of the same query, that can benefit from it. In contrast, the + <codeph>IMPALA_DISABLE_CODEGEN</codeph> query option turns off code generation entirely. + </p> + + <p> + Because of the way the work for queries is divided internally, this option might not + affect code generation for all kinds of queries. If a plan fragment contains a scan + node and some other kind of plan node, code generation still occurs regardless of + this option setting. + </p> + + <p> + To use this option effectively, you should be familiar with reading query profile output + to determine the proportion of time spent in the code generation phase, and whether + code generation is enabled or not for specific plan fragments. 
+ </p> + +<!-- + <p conref="../shared/impala_common.xml#common/related_info"/> + <p> + </p> +--> + + </conbody> +</concept> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/bb88fdc0/docs/topics/impala_schedule_random_replica.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_schedule_random_replica.xml b/docs/topics/impala_schedule_random_replica.xml new file mode 100644 index 0000000..bf8e667 --- /dev/null +++ b/docs/topics/impala_schedule_random_replica.xml @@ -0,0 +1,80 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept id="schedule_random_replica" rev="2.5.0"> + + <title>SCHEDULE_RANDOM_REPLICA Query Option (CDH 5.7 or higher only)</title> + <titlealts audience="PDF"><navtitle>SCHEDULE_RANDOM_REPLICA</navtitle></titlealts> + <prolog> + <metadata> + <data name="Category" value="Impala"/> + <data name="Category" value="Impala Query Options"/> + <data name="Category" value="Performance"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> + </metadata> + </prolog> + + <conbody> + + <p rev="2.5.0"> + <indexterm audience="Cloudera">SCHEDULE_RANDOM_REPLICA query option</indexterm> + </p> + + <p> + The <codeph>SCHEDULE_RANDOM_REPLICA</codeph> query option fine-tunes the algorithm for deciding which host + processes each HDFS data block. It only applies to tables and partitions that are not enabled + for the HDFS caching feature. + </p> + + <p conref="../shared/impala_common.xml#common/type_boolean"/> + + <p conref="../shared/impala_common.xml#common/default_false"/> + + <p conref="../shared/impala_common.xml#common/added_in_250"/> + + <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/> + + <p> + In the presence of HDFS cached replicas, Impala randomizes + which host processes each cached data block. 
+ To ensure that HDFS data blocks are cached on more + than one host, use the <codeph>WITH REPLICATION</codeph> clause along with + the <codeph>CACHED IN</codeph> clause in a + <codeph>CREATE TABLE</codeph> or <codeph>ALTER TABLE</codeph> statement. + Specify a replication value greater than or equal to the HDFS block replication factor. + </p> + + <p> + The <codeph>SCHEDULE_RANDOM_REPLICA</codeph> query option applies to tables and partitions + that <i>do not</i> use HDFS caching. + By default, Impala estimates how much work each host has done for + the query, and selects the host that has the lowest workload. + This algorithm is intended to reduce CPU hotspots arising when the + same host is selected to process multiple data blocks, but hotspots + might still arise for some combinations of queries and data layout. + When the <codeph>SCHEDULE_RANDOM_REPLICA</codeph> option is enabled, + Impala further randomizes the scheduling algorithm for non-HDFS cached blocks, + which can further reduce the chance of CPU hotspots. + </p> + + <p rev="CDH-43739 IMPALA-2979"> + This query option works in conjunction with the work scheduling improvements + in CDH 5.7 / Impala 2.5 and higher. The scheduling improvements + distribute the processing for cached HDFS data blocks to minimize hotspots: + if a data block is cached on more than one host, Impala chooses which host + to process each block based on which host has read the fewest bytes during + the current query. Enable the <codeph>SCHEDULE_RANDOM_REPLICA</codeph> setting if CPU hotspots + still persist because of cases where hosts are <q>tied</q> in terms of + the amount of work done; by default, Impala picks the first eligible host + in this case. 
+ </p> + + <p conref="../shared/impala_common.xml#common/related_info"/> + <p> + <xref href="impala_perf_hdfs_caching.xml#hdfs_caching"/>, + <xref href="impala_scalability.xml#scalability_hotspots"/> + , <xref href="impala_replica_preference.xml#replica_preference"/> + </p> + + </conbody> +</concept> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/bb88fdc0/docs/topics/impala_seq_compression_mode.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_seq_compression_mode.xml b/docs/topics/impala_seq_compression_mode.xml new file mode 100644 index 0000000..cb7cb05 --- /dev/null +++ b/docs/topics/impala_seq_compression_mode.xml @@ -0,0 +1,29 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept id="seq_compression_mode" rev="2.5.0"> + +<!-- This option is for internal use only and might go away without ever being documented. It's hidden in the DITA map. --> + + <title>SEQ_COMPRESSION_MODE Query Option</title> + <prolog> + <metadata> + <data name="Category" value="Impala"/> + <data name="Category" value="Impala Query Options"/> + </metadata> + </prolog> + + <conbody> + + <p rev="2.5.0"> + <indexterm audience="Cloudera">SEQ_COMPRESSION_MODE query option</indexterm> + </p> + + <p> + <b>Type:</b> + </p> + + <p> + <b>Default:</b> + </p> + </conbody> +</concept> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/bb88fdc0/docs/topics/impala_trouble_bad_results.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_trouble_bad_results.xml b/docs/topics/impala_trouble_bad_results.xml new file mode 100644 index 0000000..dc15c5f --- /dev/null +++ b/docs/topics/impala_trouble_bad_results.xml @@ -0,0 +1,25 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept id="trouble_bad_results"> + + <title>Troubleshooting 
Incorrect Results</title> + <prolog> + <metadata> + <data name="Category" value="Impala"/> + <data name="Category" value="Troubleshooting"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> + </metadata> + </prolog> + + <conbody> + + <p> + <indexterm audience="Cloudera">incorrect results</indexterm> + <indexterm audience="Cloudera">bad results</indexterm> + <indexterm audience="Cloudera">wrong results</indexterm> + </p> + + <p outputclass="toc inpage"/> + </conbody> +</concept> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/bb88fdc0/docs/topics/impala_trouble_memory.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_trouble_memory.xml b/docs/topics/impala_trouble_memory.xml new file mode 100644 index 0000000..ae6ab2a --- /dev/null +++ b/docs/topics/impala_trouble_memory.xml @@ -0,0 +1,25 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept id="trouble_memory"> + + <title>Troubleshooting Out-of-Memory Issues</title> + <prolog> + <metadata> + <data name="Category" value="Impala"/> + <data name="Category" value="Troubleshooting"/> + <data name="Category" value="Memory"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> + </metadata> + </prolog> + + <conbody> + + <p> + <indexterm audience="Cloudera">memory errors</indexterm> + <indexterm audience="Cloudera">out-of-memory errors</indexterm> + </p> + + <p outputclass="toc inpage"/> + </conbody> +</concept> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/bb88fdc0/docs/topics/impala_trouble_query_fail.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_trouble_query_fail.xml b/docs/topics/impala_trouble_query_fail.xml new file mode 100644 index 0000000..6b1d1ee --- /dev/null +++ b/docs/topics/impala_trouble_query_fail.xml @@ 
-0,0 +1,24 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept id="trouble_query_fail"> + + <title>Troubleshooting Query Errors</title> + <prolog> + <metadata> + <data name="Category" value="Impala"/> + <data name="Category" value="Troubleshooting"/> + <data name="Category" value="Querying"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> + </metadata> + </prolog> + + <conbody> + + <p> + <indexterm audience="Cloudera">query errors</indexterm> + </p> + + <p outputclass="toc inpage"/> + </conbody> +</concept> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/bb88fdc0/docs/topics/impala_trouble_sql.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_trouble_sql.xml b/docs/topics/impala_trouble_sql.xml new file mode 100644 index 0000000..25d79f9 --- /dev/null +++ b/docs/topics/impala_trouble_sql.xml @@ -0,0 +1,25 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept id="trouble_sql"> + + <title>Troubleshooting SQL Syntax Errors</title> + <prolog> + <metadata> + <data name="Category" value="Impala"/> + <data name="Category" value="SQL"/> + <data name="Category" value="Troubleshooting"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> + </metadata> + </prolog> + + <conbody> + + <p> + <indexterm audience="Cloudera">SQL errors</indexterm> + <indexterm audience="Cloudera">syntax errors</indexterm> + </p> + + <p outputclass="toc inpage"/> + </conbody> +</concept> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/bb88fdc0/docs/topics/impala_trouble_startup.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_trouble_startup.xml b/docs/topics/impala_trouble_startup.xml new file mode 100644 index 0000000..ff4b49e 
--- /dev/null +++ b/docs/topics/impala_trouble_startup.xml @@ -0,0 +1,25 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept id="trouble_startup"> + + <title>Troubleshooting Startup Problems</title> + <prolog> + <metadata> + <data name="Category" value="Impala"/> + <data name="Category" value="Troubleshooting"/> + <data name="Category" value="Starting and Stopping"/> + <data name="Category" value="Administrators"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> + </metadata> + </prolog> + + <conbody> + + <p> + <indexterm audience="Cloudera">startup errors</indexterm> + </p> + + <p outputclass="toc inpage"/> + </conbody> +</concept> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/bb88fdc0/docs/topics/impala_window_functions.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_window_functions.xml b/docs/topics/impala_window_functions.xml new file mode 100644 index 0000000..a577a13 --- /dev/null +++ b/docs/topics/impala_window_functions.xml @@ -0,0 +1,23 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept rev="1.3.0" id="window_functions"> + + <title>Window Functions</title> + <prolog> + <metadata> + <data name="Category" value="Impala"/> + </metadata> + </prolog> + + <conbody> + + <p> + <indexterm audience="Cloudera">window functions</indexterm> + Window functions are a special category of built-in functions. They produce one output value for each input + row, like scalar functions such as <codeph>length()</codeph> or <codeph>substr()</codeph>. Yet like aggregate + functions, they also examine the contents of multiple input rows to compute each output value. + </p> + + <p outputclass="toc inpage"/> + </conbody> +</concept>
