[3/3] incubator-impala git commit: IMPALA-5583: [DOCS] Document default_join_distribution_mode query option

jrussell Mon, 10 Jul 2017 16:09:47 -0700

IMPALA-5583: [DOCS] Document default_join_distribution_mode query option

New page for the query option.


Change-Id: I4ec6213efc46bce0fe07c590841d51c009fb5c84
Reviewed-on: http://gerrit.cloudera.org:8080/7300
Reviewed-by: Mostafa Mokhtar <[email protected]>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/801c32de
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/801c32de
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/801c32de

Branch: refs/heads/master
Commit: 801c32dec3914939c95c2cab07f8628dd627aef5
Parents: db3f323
Author: John Russell <[email protected]>
Authored: Mon Jun 26 15:49:27 2017 -0700
Committer: Impala Public Jenkins <[email protected]>
Committed: Mon Jul 10 23:08:12 2017 +0000

----------------------------------------------------------------------
 docs/impala.ditamap                             |   1 +
 docs/impala_keydefs.ditamap                     |   1 +
 .../impala_default_join_distribution_mode.xml   | 134 +++++++++++++++++++
 3 files changed, 136 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/801c32de/docs/impala.ditamap
----------------------------------------------------------------------
diff --git a/docs/impala.ditamap b/docs/impala.ditamap
index 574602a..b10ddbf 100644
--- a/docs/impala.ditamap
+++ b/docs/impala.ditamap
@@ -176,6 +176,7 @@ under the License.
           <topicref href="topics/impala_batch_size.xml"/>
           <topicref href="topics/impala_compression_codec.xml"/>
           <topicref href="topics/impala_debug_action.xml"/>
+          <topicref rev="2.9.0 IMPALA-5381" href="topics/impala_default_join_distribution_mode.xml"/>
           <topicref href="topics/impala_default_order_by_limit.xml"/>
           <topicref audience="hidden" href="topics/impala_disable_cached_reads.xml"/>
           <topicref href="topics/impala_disable_codegen.xml"/>

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/801c32de/docs/impala_keydefs.ditamap
----------------------------------------------------------------------
diff --git a/docs/impala_keydefs.ditamap b/docs/impala_keydefs.ditamap
index 7c9bb60..378a5bb 100644
--- a/docs/impala_keydefs.ditamap
+++ b/docs/impala_keydefs.ditamap
@@ -10749,6 +10749,7 @@ under the License.
   <keydef href="topics/impala_batch_size.xml" keys="batch_size"/>
   <keydef href="topics/impala_compression_codec.xml" keys="compression_codec"/>
   <keydef href="topics/impala_debug_action.xml" keys="debug_action"/>
+  <keydef href="topics/impala_default_join_distribution_mode.xml" keys="default_join_distribution_mode"/>
   <keydef href="topics/impala_default_order_by_limit.xml" keys="default_order_by_limit"/>
   <keydef href="topics/impala_disable_cached_reads.xml" keys="disable_cached_reads"/>
   <keydef href="topics/impala_disable_codegen.xml" keys="disable_codegen"/>

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/801c32de/docs/topics/impala_default_join_distribution_mode.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_default_join_distribution_mode.xml b/docs/topics/impala_default_join_distribution_mode.xml
new file mode 100644
index 0000000..1b17d50
--- /dev/null
+++ b/docs/topics/impala_default_join_distribution_mode.xml
@@ -0,0 +1,134 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="default_join_distribution_mode" rev="2.9.0 IMPALA-5381 IMPALA-5583">
+
+  <title>DEFAULT_JOIN_DISTRIBUTION_MODE Query Option</title>
+  <titlealts audience="PDF"><navtitle>DEFAULT_JOIN_DISTRIBUTION_MODE</navtitle></titlealts>
+  <prolog>
+    <metadata>
+      <data name="Category" value="Impala"/>
+      <data name="Category" value="Impala Query Options"/>
+      <data name="Category" value="Performance"/>
+      <data name="Category" value="Querying"/>
+      <data name="Category" value="Developers"/>
+      <data name="Category" value="Data Analysts"/>
+    </metadata>
+  </prolog>
+
+  <conbody>
+
+    <p>
+      <indexterm audience="hidden">DEFAULT_JOIN_DISTRIBUTION_MODE query option</indexterm>
+      This option determines the join distribution that Impala uses when any of the tables
+      involved in a join query is missing statistics.
+    </p>
+
+    <p>
+      Impala optimizes join queries based on the presence of table statistics,
+      which are produced by the Impala <codeph>COMPUTE STATS</codeph> statement.
+      By default, when a table involved in the join query does not have statistics,
+      Impala uses the <q>broadcast</q> technique that transmits the entire contents
+      of the table to all executor nodes participating in the query. If one table
+      involved in a join has statistics and the other does not, the table without
+      statistics is broadcast. If both tables are missing statistics, the table
+      that is referenced second in the join order is broadcast. This behavior
+      is appropriate when the table involved is relatively small, but can lead to
+      excessive network, memory, and CPU overhead if the table being broadcast is
+      large.
+    </p>
+
+    <p>
+      Because Impala queries frequently involve very large tables, and suboptimal
+      joins for such tables could result in spilling or out-of-memory errors,
+      the setting <codeph>DEFAULT_JOIN_DISTRIBUTION_MODE=SHUFFLE</codeph> lets you
+      override the default behavior. The shuffle join mechanism divides the corresponding rows
+      of each table involved in a join query using a hashing algorithm, and transmits
+      subsets of the rows to other nodes for processing. Typically, this kind of join is
+      more efficient for joins between large tables of similar size.
+    </p>
+
+    <p>
+      The setting <codeph>DEFAULT_JOIN_DISTRIBUTION_MODE=SHUFFLE</codeph> is
+      recommended when setting up and deploying new clusters, because it is less likely
+      to result in serious consequences such as spilling or out-of-memory errors if
+      the query plan is based on incomplete information. This setting is not the default,
+      to avoid changing the performance characteristics of join queries for clusters that
+      are already tuned for their existing workloads.
+    </p>
+
+    <p conref="../shared/impala_common.xml#common/type_integer"/>
+    <p>
+      The allowed values are <codeph>BROADCAST</codeph> (equivalent to 0)
+      or <codeph>SHUFFLE</codeph> (equivalent to 1).
+    </p>
+
+    <p conref="../shared/impala_common.xml#common/example_blurb"/>
+    <p>
+      The following examples demonstrate appropriate scenarios for each
+      setting of this query option.
+    </p>
+
+<codeblock>
+-- Create a billion-row table.
+create table big_table stored as parquet
+  as select * from huge_table limit 1e9;
+
+-- For a big table with no statistics, the
+-- shuffle join mechanism is appropriate.
+set default_join_distribution_mode=shuffle;
+
+...join queries involving the big table...
+</codeblock>
+
+<codeblock>
+-- Create a hundred-row table.
+create table tiny_table stored as parquet
+  as select * from huge_table limit 100;
+
+-- For a tiny table with no statistics, the
+-- broadcast join mechanism is appropriate.
+set default_join_distribution_mode=broadcast;
+
+...join queries involving the tiny table...
+</codeblock>
+
+<codeblock>
+compute stats tiny_table;
+compute stats big_table;
+
+-- Once the stats are computed, the query option has
+-- no effect on join queries involving these tables.
+-- Impala can determine the absolute and relative sizes
+-- of each side of the join query by examining the
+-- row size, cardinality, and so on of each table.
+
+...join queries involving both of these tables...
+</codeblock>
+
+    <p conref="../shared/impala_common.xml#common/related_info"/>
+    <p>
+      <xref keyref="compute_stats"/>,
+      <xref keyref="joins"/>,
+      <xref keyref="perf_joins"/>
+    </p>
+
+  </conbody>
+</concept>

[3/3] incubator-impala git commit: IMPALA-5583: [DOCS] Document default_join_distribution_mode query option

Reply via email to