[1/4] incubator-impala git commit: IMPALA-2522: Add doc for sortby() and clustered hints

sailesh Tue, 23 May 2017 11:25:13 -0700

Repository: incubator-impala
Updated Branches:
  refs/heads/master f5ef7e6ae -> 44e8bbffc



IMPALA-2522: Add doc for sortby() and clustered hints

Add CLUSTERED hint.

Update hint syntax in INSERT topic.

Also modernize the hint syntax as shown under INSERT
to include the -- and /* */ formats also. List
the [] style last since it is the least-preferred
option.

Switch to preferring /* */ syntax for hints
instead of using the [ ] notation by default.

Finally, take out references to the SORTBY hint because
it didn't actually make it in. Intent for future is to have a way
to get this behavior without using a hint.
Change-Id: Id3c1da9a87ace361b096fa73d8504b2f54e75bed
Reviewed-on: http://gerrit.cloudera.org:8080/5655
Reviewed-by: John Russell <[email protected]>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/691bbf03
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/691bbf03
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/691bbf03

Branch: refs/heads/master
Commit: 691bbf0345da1a4771a70be07aa1c714017f5c16
Parents: f5ef7e6
Author: John Russell <[email protected]>
Authored: Mon Jan 9 17:21:25 2017 -0800
Committer: Impala Public Jenkins <[email protected]>
Committed: Mon May 22 17:02:33 2017 +0000

----------------------------------------------------------------------
 docs/shared/impala_common.xml | 43 +++++++++++++++++++++++++-------------
 docs/topics/impala_hints.xml  | 42 +++++++++++++++++++++++++------------
 docs/topics/impala_insert.xml | 14 ++++++++++++-
 3 files changed, 71 insertions(+), 28 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/691bbf03/docs/shared/impala_common.xml
----------------------------------------------------------------------
diff --git a/docs/shared/impala_common.xml b/docs/shared/impala_common.xml
index 9cceaf5..e75cd00 100644
--- a/docs/shared/impala_common.xml
+++ b/docs/shared/impala_common.xml
@@ -919,6 +919,12 @@ alter table partitioned_data set tblproperties 
('numRows'='1030000', 'STATS_GENE
         combination with the setting 
<codeph>RUNTIME_FILTER_MODE=GLOBAL</codeph>.
       </p>
 
+      <note id="square_bracket_hint_caveat" rev="IMPALA-2522">
+        The square bracket style of hint is now deprecated and might be 
removed in
+        a future release. For that reason, any newly added hints are not 
available
+        with the square bracket syntax.
+      </note>
+
       <p rev="2.5.0" id="runtime_filtering_option_caveat">
         Because the runtime filtering feature applies mainly to 
resource-intensive
         and long-running queries, only adjust this query option when tuning 
long-running queries
@@ -2570,29 +2576,26 @@ select max(height), avg(height) from census_data where 
age &gt; 20;
         <xref href="../topics/impala_views.xml#views"/> for details.
       </p>
 
-      <p id="insert_hints" rev="1.2.2">
+      <p id="insert_hints">
         When inserting into partitioned tables, especially using the Parquet 
file format, you can include a hint in
         the <codeph>INSERT</codeph> statement to fine-tune the overall 
performance of the operation and its
         resource usage:
         <ul>
-          <li>
-            These hints are available in Impala 1.2.2 and higher.
-          </li>
 
           <li>
-            You would only use these hints if an <codeph>INSERT</codeph> into 
a partitioned Parquet table was
+            You would only use hints if an <codeph>INSERT</codeph> into a 
partitioned Parquet table was
             failing due to capacity limits, or if such an 
<codeph>INSERT</codeph> was succeeding but with
             less-than-optimal performance.
           </li>
 
           <li>
-            To use these hints, put the hint keyword 
<codeph>[SHUFFLE]</codeph> or <codeph>[NOSHUFFLE]</codeph>
+            To use a hint to influence the join order, put the hint keyword 
<codeph>/* +SHUFFLE */</codeph> or <codeph>/* +NOSHUFFLE */</codeph>
             (including the square brackets) after the 
<codeph>PARTITION</codeph> clause, immediately before the
             <codeph>SELECT</codeph> keyword.
           </li>
 
           <li>
-            <codeph>[SHUFFLE]</codeph> selects an execution plan that 
minimizes the number of files being written
+            <codeph>/* +SHUFFLE */</codeph> selects an execution plan that 
reduces the number of files being written
             simultaneously to HDFS, and the number of memory buffers holding 
data for individual partitions. Thus
             it reduces overall resource usage for the <codeph>INSERT</codeph> 
operation, allowing some
             <codeph>INSERT</codeph> operations to succeed that otherwise would 
fail. It does involve some data
@@ -2601,27 +2604,39 @@ select max(height), avg(height) from census_data where 
age &gt; 20;
           </li>
 
           <li>
-            <codeph>[NOSHUFFLE]</codeph> selects an execution plan that might 
be faster overall, but might also
+            <codeph>/* +NOSHUFFLE */</codeph> selects an execution plan that 
might be faster overall, but might also
             produce a larger number of small data files or exceed capacity 
limits, causing the
-            <codeph>INSERT</codeph> operation to fail. Use 
<codeph>[SHUFFLE]</codeph> in cases where an
+            <codeph>INSERT</codeph> operation to fail. Use <codeph>/* +SHUFFLE 
*/</codeph> in cases where an
             <codeph>INSERT</codeph> statement fails or runs inefficiently due 
to all nodes attempting to construct
             data for all partitions.
           </li>
 
           <li>
-            Impala automatically uses the <codeph>[SHUFFLE]</codeph> method if 
any partition key column in the
+            Impala automatically uses the <codeph>/* +SHUFFLE */</codeph> 
method if any partition key column in the
             source table, mentioned in the <codeph>INSERT ... SELECT</codeph> 
query, does not have column
-            statistics. In this case, only the <codeph>[NOSHUFFLE]</codeph> 
hint would have any effect.
+            statistics. In this case, only the <codeph>/* +NOSHUFFLE 
*/</codeph> hint would have any effect.
           </li>
 
           <li>
             If column statistics are available for all partition key columns 
in the source table mentioned in the
-            <codeph>INSERT ... SELECT</codeph> query, Impala chooses whether 
to use the <codeph>[SHUFFLE]</codeph>
-            or <codeph>[NOSHUFFLE]</codeph> technique based on the estimated 
number of distinct values in those
+            <codeph>INSERT ... SELECT</codeph> query, Impala chooses whether 
to use the <codeph>/* +SHUFFLE */</codeph>
+            or <codeph>/* +NOSHUFFLE */</codeph> technique based on the 
estimated number of distinct values in those
             columns and the number of nodes involved in the 
<codeph>INSERT</codeph> operation. In this case, you
-            might need the <codeph>[SHUFFLE]</codeph> or the 
<codeph>[NOSHUFFLE]</codeph> hint to override the
+            might need the <codeph>/* +SHUFFLE */</codeph> or the <codeph>/* 
+NOSHUFFLE */</codeph> hint to override the
             execution plan selected by Impala.
           </li>
+
+          <li rev="IMPALA-2522 2.8.0">
+            In <keyword keyref="impala28_full"/> or higher, you can make the
+            <codeph>INSERT</codeph> operation organize (<q>cluster</q>)
+            the data for each partition to avoid buffering data for multiple 
partitions
+            and reduce the risk of an out-of-memory condition. Specify the 
hint as
+            <codeph>/* +CLUSTERED */</codeph>. This technique is primarily
+            useful for inserts into Parquet tables, where the large block
+            size requires substantial memory to buffer data for multiple
+            output files at once.
+          </li>
+
         </ul>
       </p>
 

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/691bbf03/docs/topics/impala_hints.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_hints.xml b/docs/topics/impala_hints.xml
index 4524c14..18b0dc6 100644
--- a/docs/topics/impala_hints.xml
+++ b/docs/topics/impala_hints.xml
@@ -62,18 +62,20 @@ under the License.
     <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
 
     <p>
-      You can represent the hints as keywords surrounded by 
<codeph>[]</codeph> square brackets; include the
-      brackets in the text of the SQL statement.
+      You can also represent the hints as keywords surrounded by 
<codeph>[]</codeph>
+      square brackets; include the brackets in the text of the SQL statement.
+      <note 
conref="../shared/impala_common.xml#common/square_bracket_hint_caveat"/>
     </p>
 
 <codeblock>SELECT STRAIGHT_JOIN <varname>select_list</varname> FROM
 <varname>join_left_hand_table</varname>
-  JOIN [{BROADCAST|SHUFFLE}]
+  JOIN [{ /* +BROADCAST */ | /* +SHUFFLE */ }]
 <varname>join_right_hand_table</varname>
 <varname>remainder_of_query</varname>;
 
 INSERT <varname>insert_clauses</varname>
-  [{SHUFFLE|NOSHUFFLE}]
+  [{ /* +SHUFFLE */ | /* +NOSHUFFLE */ }]
+  [<ph rev="IMPALA-2522 2.8.0">/* +CLUSTERED */</ph>]
   SELECT <varname>remainder_of_query</varname>;
 </codeblock>
 
@@ -82,6 +84,12 @@ INSERT <varname>insert_clauses</varname>
       either the <codeph>/* */</codeph> or <codeph>--</codeph> notation. 
Specify a <codeph>+</codeph> symbol
       immediately before the hint name. Recently added hints are only 
available using the <codeph>/* */</codeph>
       and <codeph>--</codeph> notation.
+      For clarity, the <codeph>/* */</codeph> and <codeph>--</codeph> styles
+      are used in the syntax and examples throughout this section.
+      With the <codeph>/* */</codeph> or <codeph>--</codeph> notation for
+      hints, specify a <codeph>+</codeph> symbol immediately before the first 
hint name.
+      Multiple hints can be specified separated by commas, for example
+      <codeph>/* +clustered,shuffle */</codeph>
     </p>
 
 <codeblock rev="2.0.0">SELECT STRAIGHT_JOIN <varname>select_list</varname> FROM
@@ -109,6 +117,14 @@ INSERT <varname>insert_clauses</varname>
   /* +{SCHEDULE_CACHE_LOCAL | SCHEDULE_DISK_LOCAL | SCHEDULE_REMOTE}
     [,RANDOM_REPLICA] */
 <varname>remainder_of_query</varname>;</ph>
+
+<ph rev="IMPALA-2522 2.8.0">INSERT <varname>insert_clauses</varname>
+  -- +CLUSTERED
+  SELECT <varname>remainder_of_query</varname>;
+
+INSERT <varname>insert_clauses</varname>
+  /* +CLUSTERED */
+  SELECT <varname>remainder_of_query</varname>;</ph>
 </codeblock>
 
     <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
@@ -138,14 +154,14 @@ INSERT <varname>insert_clauses</varname>
     </p>
 
     <p>
-      The <codeph>[BROADCAST]</codeph> and <codeph>[SHUFFLE]</codeph> hints 
control the execution strategy for join
+      The <codeph>/* +BROADCAST */</codeph> and <codeph>/* +SHUFFLE 
*/</codeph> hints control the execution strategy for join
       queries. Specify one of the following constructs immediately after the 
<codeph>JOIN</codeph> keyword in a
       query:
     </p>
 
     <ul>
       <li>
-        <codeph>[SHUFFLE]</codeph> - Makes that join operation use the 
<q>partitioned</q> technique, which divides
+        <codeph>/* +SHUFFLE */</codeph> - Makes that join operation use the 
<q>partitioned</q> technique, which divides
         up corresponding rows from both tables using a hashing algorithm, 
sending subsets of the rows to other
         nodes for processing. (The keyword <codeph>SHUFFLE</codeph> is used to 
indicate a <q>partitioned join</q>,
         because that type of join is not related to <q>partitioned 
tables</q>.) Since the alternative
@@ -155,7 +171,7 @@ INSERT <varname>insert_clauses</varname>
       </li>
 
       <li>
-        <codeph>[BROADCAST]</codeph> - Makes that join operation use the 
<q>broadcast</q> technique that sends the
+        <codeph>/* +BROADCAST */</codeph> - Makes that join operation use the 
<q>broadcast</q> technique that sends the
         entire contents of the right-hand table to all nodes involved in 
processing the join. This is the default
         mode of operation when table and index statistics are unavailable, so 
you would typically only need it if
         stale metadata caused Impala to mistakenly choose a partitioned join 
operation. Typically, broadcast joins
@@ -235,7 +251,7 @@ INSERT <varname>insert_clauses</varname>
     </p>
 
     <p>
-      In particular, the <codeph>[BROADCAST]</codeph> and 
<codeph>[SHUFFLE]</codeph> hints are expected to be
+      In particular, the <codeph>/* +BROADCAST */</codeph> and <codeph>/* 
+SHUFFLE */</codeph> hints are expected to be
       needed much less frequently in Impala 1.2.2 and higher, because the join 
order optimization feature in
       combination with the <codeph>COMPUTE STATS</codeph> statement now 
automatically choose join order and join
       mechanism without the need to rewrite the query and add hints. See
@@ -264,21 +280,21 @@ INSERT <varname>insert_clauses</varname>
     <p>
       For example, this query joins a large customer table with a small lookup 
table of less than 100 rows. The
       right-hand table can be broadcast efficiently to all nodes involved in 
the join. Thus, you would use the
-      <codeph>[broadcast]</codeph> hint to force a broadcast join strategy:
+      <codeph>/* +broadcast */</codeph> hint to force a broadcast join 
strategy:
     </p>
 
 <codeblock>select straight_join customer.address, state_lookup.state_name
-  from customer join <b>[broadcast]</b> state_lookup
+  from customer join <b>/* +broadcast */</b> state_lookup
   on customer.state_id = state_lookup.state_id;</codeblock>
 
     <p>
       This query joins two large tables of unpredictable size. You might 
benchmark the query with both kinds of
       hints and find that it is more efficient to transmit portions of each 
table to other nodes for processing.
-      Thus, you would use the <codeph>[shuffle]</codeph> hint to force a 
partitioned join strategy:
+      Thus, you would use the <codeph>/* +shuffle */</codeph> hint to force a 
partitioned join strategy:
     </p>
 
 <codeblock>select straight_join weather.wind_velocity, geospatial.altitude
-  from weather join <b>[shuffle]</b> geospatial
+  from weather join <b>/* +shuffle */</b> geospatial
   on weather.lat = geospatial.lat and weather.long = 
geospatial.long;</codeblock>
 
     <p>
@@ -290,7 +306,7 @@ INSERT <varname>insert_clauses</varname>
     </p>
 
 <codeblock>select straight_join t1.name, t2.id, t3.price
-  from t1 join <b>[shuffle]</b> t2 join <b>[broadcast]</b> t3
+  from t1 join <b>/* +shuffle */</b> t2 join <b>/* +broadcast */</b> t3
   on t1.id = t2.id and t2.id = t3.id;</codeblock>
 
     <!-- To do: This is a good place to add more sample output showing before 
and after EXPLAIN plans. -->

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/691bbf03/docs/topics/impala_insert.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_insert.xml b/docs/topics/impala_insert.xml
index 1b7dab0..5a8e9a5 100644
--- a/docs/topics/impala_insert.xml
+++ b/docs/topics/impala_insert.xml
@@ -59,9 +59,21 @@ INSERT { INTO | OVERWRITE } [TABLE] 
<varname>table_name</varname>
 
 partition_clause ::= <varname>col_name</varname> [= 
<varname>constant</varname>] [, <varname>col_name</varname> [= 
<varname>constant</varname>] ...]
 
-hint_clause ::= [SHUFFLE] | [NOSHUFFLE]    (Note: the square brackets are part 
of the syntax.)
+hint_clause ::=
+  <varname>hint_with_dashes</varname> |
+  <varname>hint_with_cstyle_delimiters</varname> |
+  <varname>hint_with_brackets</varname>
+
+hint_with_dashes ::= -- +SHUFFLE | -- +NOSHUFFLE <ph rev="IMPALA-2522 
2.8.0">-- +CLUSTERED</ph>
+
+hint_with_cstyle_comments ::= /* +SHUFFLE */ | /* +NOSHUFFLE */ <ph 
rev="IMPALA-2522 2.8.0">| /* +CLUSTERED */</ph>
+
+hint_with_brackets ::= [SHUFFLE] | [NOSHUFFLE]
+  (With this hint format, the square brackets are part of the syntax.)
 </codeblock>
 
+    <note 
conref="../shared/impala_common.xml#common/square_bracket_hint_caveat"/>
+
     <p>
       <b>Appending or replacing (INTO and OVERWRITE clauses):</b>
     </p>

[1/4] incubator-impala git commit: IMPALA-2522: Add doc for sortby() and clustered hints

Reply via email to