[1/2] incubator-impala git commit: IMPALA-5309: [DOCS] Add TABLESAMPLE clause to SELECT statement

tarmstrong Wed, 23 Aug 2017 23:31:52 -0700

Repository: incubator-impala
Updated Branches:
  refs/heads/master acaf8b9f0 -> bb6b0ce24



IMPALA-5309: [DOCS] Add TABLESAMPLE clause to SELECT statement

Change-Id: Idd7e5b7cfe11c986348bc6c8d1b11921f34df336
Reviewed-on: http://gerrit.cloudera.org:8080/7680
Reviewed-by: Alex Behm <[email protected]>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/88d00b4b
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/88d00b4b
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/88d00b4b

Branch: refs/heads/master
Commit: 88d00b4b09ed4bf727083296dab2cfb843ae1c41
Parents: acaf8b9
Author: John Russell <[email protected]>
Authored: Thu Jul 27 11:56:35 2017 -0700
Committer: Impala Public Jenkins <[email protected]>
Committed: Thu Aug 24 05:54:01 2017 +0000

----------------------------------------------------------------------
 docs/impala.ditamap                |   1 +
 docs/impala_keydefs.ditamap        |   1 +
 docs/shared/impala_common.xml      |   8 +
 docs/topics/impala_hbase.xml       |   3 +
 docs/topics/impala_kudu.xml        |   2 +
 docs/topics/impala_scalability.xml |  42 ++-
 docs/topics/impala_select.xml      |   9 +
 docs/topics/impala_subqueries.xml  |   3 +
 docs/topics/impala_tablesample.xml | 556 ++++++++++++++++++++++++++++++++
 docs/topics/impala_views.xml       |   3 +
 10 files changed, 611 insertions(+), 17 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/88d00b4b/docs/impala.ditamap
----------------------------------------------------------------------
diff --git a/docs/impala.ditamap b/docs/impala.ditamap
index b10ddbf..80514da 100644
--- a/docs/impala.ditamap
+++ b/docs/impala.ditamap
@@ -163,6 +163,7 @@ under the License.
         <topicref href="topics/impala_offset.xml"/>
         <topicref href="topics/impala_union.xml"/>
         <topicref href="topics/impala_subqueries.xml"/>
+        <topicref rev="IMPALA-5309" href="topics/impala_tablesample.xml"/>
         <topicref href="topics/impala_with.xml"/>
         <topicref href="topics/impala_distinct.xml"/>
         <topicref href="topics/impala_hints.xml"/>

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/88d00b4b/docs/impala_keydefs.ditamap
----------------------------------------------------------------------
diff --git a/docs/impala_keydefs.ditamap b/docs/impala_keydefs.ditamap
index 8748d03..cdcaed6 100644
--- a/docs/impala_keydefs.ditamap
+++ b/docs/impala_keydefs.ditamap
@@ -10732,6 +10732,7 @@ under the License.
   <keydef href="topics/impala_joins.xml" keys="joins"/>
   <keydef href="topics/impala_order_by.xml" keys="order_by"/>
   <keydef href="topics/impala_group_by.xml" keys="group_by"/>
+  <keydef rev="IMPALA-5309" href="topics/impala_tablesample.xml" 
keys="tablesample"/>
   <keydef href="topics/impala_having.xml" keys="having"/>
   <keydef href="topics/impala_limit.xml" keys="limit"/>
   <keydef href="topics/impala_offset.xml" keys="offset"/>

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/88d00b4b/docs/shared/impala_common.xml
----------------------------------------------------------------------
diff --git a/docs/shared/impala_common.xml b/docs/shared/impala_common.xml
index d4dfaab..5d79acc 100644
--- a/docs/shared/impala_common.xml
+++ b/docs/shared/impala_common.xml
@@ -765,6 +765,14 @@ select concat('abc','mno','xyz');</codeblock>
         they are primarily used in new SQL syntax topics underneath that 
parent topic.
       </p>
 
+      <p id="tablesample_caveat" rev="IMPALA-5309">
+        The <codeph>TABLESAMPLE</codeph> clause of the <codeph>SELECT</codeph>
+        statement does not apply to a table reference derived from a view, a 
subquery,
+        or anything other than a real base table. This clause only works for 
tables
+        backed by HDFS or HDFS-like data files, therefore it does not apply to 
Kudu or
+        HBase tables.
+      </p>
+
 <codeblock id="parquet_fallback_schema_resolution_example"><![CDATA[
 create database schema_evolution;
 use schema_evolution;

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/88d00b4b/docs/topics/impala_hbase.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_hbase.xml b/docs/topics/impala_hbase.xml
index 2eae975..6c1822f 100644
--- a/docs/topics/impala_hbase.xml
+++ b/docs/topics/impala_hbase.xml
@@ -758,6 +758,9 @@ set hbase_caching=1000;
         <li>
           <p conref="../shared/impala_common.xml#common/hbase_no_load_data"/>
         </li>
+        <li>
+          <p conref="../shared/impala_common.xml#common/tablesample_caveat"/>
+        </li>
       </ul>
     </conbody>
   </concept>

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/88d00b4b/docs/topics/impala_kudu.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_kudu.xml b/docs/topics/impala_kudu.xml
index 3334d6b..08d3559 100644
--- a/docs/topics/impala_kudu.xml
+++ b/docs/topics/impala_kudu.xml
@@ -1378,6 +1378,8 @@ kudu.table_name  | impala::some_database.table_name_demo
         the predicate pushdown for a specific query against a Kudu table.
       </p>
 
+      <p conref="../shared/impala_common.xml#common/tablesample_caveat"/>
+
       <!-- Hide until subtopics are ready to display. -->
       <p outputclass="toc inpage" audience="hidden"/>
 

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/88d00b4b/docs/topics/impala_scalability.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_scalability.xml 
b/docs/topics/impala_scalability.xml
index 2ea7603..7c24fe2 100644
--- a/docs/topics/impala_scalability.xml
+++ b/docs/topics/impala_scalability.xml
@@ -843,23 +843,31 @@ Currently, there is no throttling mechanism for Impala 
I/O.
 </conbody>
 </concept>
 
-<concept id="big_tables">
-<title>Scalability Considerations for Table Layout</title>
-<conbody>
-<p>
-Due to the overhead of retrieving and updating table metadata
-in the metastore database, try to limit the number of columns
-in a table to a maximum of approximately 2000.
-Although Impala can handle wider tables than this, the metastore overhead
-can become significant, leading to query performance that is slower
-than expected based on the actual data volume.
-</p>
-<p>
-To minimize overhead related to the metastore database and Impala query 
planning,
-try to limit the number of partitions for any partitioned table to a few tens 
of thousands.
-</p>
-</conbody>
-</concept>
+  <concept id="big_tables">
+    <title>Scalability Considerations for Table Layout</title>
+    <conbody>
+      <p>
+        Due to the overhead of retrieving and updating table metadata
+        in the metastore database, try to limit the number of columns
+        in a table to a maximum of approximately 2000.
+        Although Impala can handle wider tables than this, the metastore 
overhead
+        can become significant, leading to query performance that is slower
+        than expected based on the actual data volume.
+      </p>
+      <p>
+        To minimize overhead related to the metastore database and Impala 
query planning,
+        try to limit the number of partitions for any partitioned table to a 
few tens of thousands.
+      </p>
+      <p rev="IMPALA-5309">
+        If the volume of data within a table makes it impractical to run 
exploratory
+        queries, consider using the <codeph>TABLESAMPLE</codeph> clause to 
limit query processing
+        to only a percentage of data within the table. This technique reduces 
the overhead
+        for query startup, I/O to read the data, and the amount of network, 
CPU, and memory
+        needed to process intermediate results during the query. See <xref 
keyref="tablesample"/>
+        for details.
+      </p>
+    </conbody>
+  </concept>
 
 <concept rev="" id="kerberos_overhead_cluster_size">
 <title>Kerberos-Related Network Overhead for Large Clusters</title>

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/88d00b4b/docs/topics/impala_select.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_select.xml b/docs/topics/impala_select.xml
index 52954e7..0253712 100644
--- a/docs/topics/impala_select.xml
+++ b/docs/topics/impala_select.xml
@@ -65,6 +65,9 @@ HAVING <codeph>conditions</codeph>
 ORDER BY { <i>column</i> | <i>expression</i> [ASC | DESC] [NULLS FIRST | NULLS 
LAST] [, ...] }
 LIMIT <i>expression</i> [OFFSET <i>expression</i>]
 [UNION [ALL] <i>select_statement</i>] ...]
+
+table_reference := { <varname>table_name</varname> | 
(<varname>subquery</varname>) }
+  <ph rev="IMPALA-5309">[ TABLESAMPLE SYSTEM(<varname>percentage</varname>) 
[REPEATABLE(<varname>seed</varname>)] ]</ph>
 </codeblock>
 
     <p>
@@ -166,6 +169,12 @@ LIMIT <i>expression</i> [OFFSET <i>expression</i>]
         <codeph>LIKE</codeph>, <codeph>IN</codeph>, <codeph>BETWEEN</codeph>, 
and <codeph>COALESCE</codeph>. Impala
         specifically supports built-ins described in <xref 
href="impala_functions.xml#builtins"/>.
       </li>
+
+      <li rev="IMPALA-5309">
+        In <keyword keyref="impala29_full"/> and higher, an optional 
<codeph>TABLESAMPLE</codeph>
+        clause immediately after a table reference, to specify that the query 
only processes a
+        specified percentage of the table data. See <xref 
keyref="tablesample"/> for details.
+      </li>
     </ul>
 
     <p conref="../shared/impala_common.xml#common/ignore_file_extensions"/>

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/88d00b4b/docs/topics/impala_subqueries.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_subqueries.xml 
b/docs/topics/impala_subqueries.xml
index d3d509c..b7dd5af 100644
--- a/docs/topics/impala_subqueries.xml
+++ b/docs/topics/impala_subqueries.xml
@@ -313,6 +313,9 @@ Therefore, this is not an efficient construct to use with 
Impala queries for HBa
           when referring to any column from the outer query block within a 
subquery.
         </p>
       </li>
+      <li>
+        <p conref="../shared/impala_common.xml#common/tablesample_caveat"/>
+      </li>
     </ul>
 
     <p conref="../shared/impala_common.xml#common/complex_types_blurb"/>

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/88d00b4b/docs/topics/impala_tablesample.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_tablesample.xml 
b/docs/topics/impala_tablesample.xml
new file mode 100644
index 0000000..f60c5be
--- /dev/null
+++ b/docs/topics/impala_tablesample.xml
@@ -0,0 +1,556 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="tablesample" rev="IMPALA-5309">
+
+  <title>TABLESAMPLE Clause</title>
+  <prolog>
+    <metadata>
+      <data name="Category" value="Impala"/>
+      <data name="Category" value="SQL"/>
+      <data name="Category" value="Querying"/>
+      <data name="Category" value="Developers"/>
+      <data name="Category" value="Data Analysts"/>
+    </metadata>
+  </prolog>
+
+  <conbody>
+
+    <p>
+      Specify the <codeph>TABLESAMPLE</codeph> clause in cases where you need
+      to explore the data distribution within the table, the table is very 
large,
+      and it is impractical or unnecessary to process all the data from the 
table
+      or selected partitions.
+    </p>
+
+    <p>
+      The clause makes the query process a randomized set of data files from 
the
+      table, so that the total volume of data is greater than or equal to the 
specified
+      percentage of data bytes within that table. (Or the data bytes within 
the set of
+      partitions that remain after partition pruning is performed.)
+    </p>
+
+    <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock>
+  <ph rev="IMPALA-5309">TABLESAMPLE SYSTEM(<varname>percentage</varname>) 
[REPEATABLE(<varname>seed</varname>)]</ph>
+</codeblock>
+
+    <p>
+      The <codeph>TABLESAMPLE</codeph> clause comes immediately after a table 
name or table alias.
+    </p>
+
+    <p>
+      The <codeph>SYSTEM</codeph> keyword represents the sampling method. 
Currently,
+      Impala only supports a single sampling method named 
<codeph>SYSTEM</codeph>.
+    </p>
+
+    <p>
+      The <varname>percentage</varname> argument is an integer literal from 0 
to 100.
+      A percentage of 0 produces an empty result set for a particular table 
reference,
+      while a percentage of 100 uses the entire contents. Because the sampling 
works by
+      selecting a random set of data files, the proportion of sampled data 
from the
+      table may be greater than the specified percentage, based on the number 
and sizes
+      of the underlying data files. See the usage notes for details.
+    </p>
+
+    <p>
+      The optional <codeph>REPEATABLE</codeph> keyword lets you specify an 
arbitrary
+      positive integer seed value that ensures that when the query is run 
again, the
+      sampling selects the same set of data files each time. 
<codeph>REPEATABLE</codeph>
+      does not have a default value. If you omit the 
<codeph>REPEATABLE</codeph> keyword,
+      the random seed is derived from the current time.
+    </p>
+
+    <p conref="../shared/impala_common.xml#common/added_in_290"/>
+
+    <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+    <p>
+      You might use this clause with aggregation queries, such as finding
+      the approximate average, minimum, or maximum where exact precision
+      is not required. You can use these findings to plan the most effective
+      strategy for constructing queries against the full table or designing
+      a partitioning strategy for the data.
+    </p>
+
+    <p>
+      Some other database systems have a <codeph>TABLESAMPLE</codeph> clause.
+      The Impala syntax for this clause is modeled on the syntax for popular
+      relational databases, not the Hive <codeph>TABLESAMPLE</codeph> clause.
+      For example, there is no <codeph>BUCKETS</codeph> keyword as in HiveQL.
+    </p>
+
+    <p>
+      The precision of the <varname>percentage</varname> threshold depends on
+      the number and sizes of the underlying data files. Impala brings in
+      additional data files, one at a time, until the number of bytes exceeds
+      the specified percentage based on the total number of bytes for the
+      entire set of table data. The precision of the percentage threshold is 
higher
+      when the table contains many data files with consistent sizes. See the
+      code listings later in this section for examples.
+    </p>
+
+    <p>
+      When you estimate characteristics of the data distribution based on 
sampling
+      a percentage of the table data, be aware that the data might be unevenly 
distributed
+      between different files. Do not assume that the percentage figure 
reflects the
+      percentage of rows in the table. For example, one file might contain all 
blank values
+      for a <codeph>STRING</codeph> column, while another file contains long 
strings
+      in that column; therefore, one file could contain many more rows than 
another.
+      Likewise, a table created with the <codeph>SORT BY</codeph> clause might
+      contain narrow ranges of values for the sort columns, making it 
impractical to
+      extrapolate the number of distinct values for those columns based on 
sampling
+      only some of the data files.
+    </p>
+
+    <p>
+      Because a sample of the table data might not contain all values for a 
particular
+      column, if the <codeph>TABLESAMPLE</codeph> is used in a join query, the
+      key relationships between the tables might produce incomplete result sets
+      compared to joins using all the table data. For example, if you join 50%
+      of table A with 50% of table B, some values in the join columns might
+      not match between the two tables, even though overall there is a 1:1
+      relationship between the tables.
+    </p>
+
+    <p>
+      The <codeph>REPEATABLE</codeph> keyword makes identical queries use a
+      consistent set of data files when the query is repeated. You specify an
+      arbitrary integer key that acts as a seed value when Impala randomly
+      selects the set of data files to use in the query. This technique
+      lets you verify correctness, examine performance, and so on for queries
+      using the <codeph>TABLESAMPLE</codeph> clause without the sampled data
+      being different each time. The repeatable aspect is reset (that is, the
+      set of selected data files may change) any time the contents of the table
+      change. The statements or operations that can make sampling results
+      non-repeatable are:
+    </p>
+
+    <ul>
+      <li>
+        <codeph>INSERT</codeph>.
+      </li>
+      <li>
+        <codeph>TRUNCATE TABLE</codeph>.
+      </li>
+      <li>
+        <codeph>LOAD DATA</codeph>.
+      </li>
+      <li>
+        <codeph>REFRESH</codeph> or <codeph>INVALIDATE METADATA</codeph>
+        after files are added or removed by a non-Impala mechanism.
+      </li>
+      <li>
+      </li>
+    </ul>
+
+    <p>
+      This clause is similar in some ways to the <codeph>LIMIT</codeph> clause,
+      because both serve to limit the size of the intermediate data and final
+      result set. <codeph>LIMIT 0</codeph> is more efficient than
+      <codeph>TABLESAMPLE SYSTEM(0)</codeph> for verifying that a query can 
execute
+      without producing any results. <codeph>TABLESAMPLE 
SYSTEM(<varname>n</varname>)</codeph>
+      often makes query processing more efficient than using a 
<codeph>LIMIT</codeph> clause
+      by itself, because all phases of query execution use less data overall.
+      If the intent is to retrieve some representative values from the table
+      in an efficient way, you might combine <codeph>TABLESAMPLE</codeph>,
+      <codeph>ORDER BY</codeph>, and <codeph>LIMIT</codeph> clauses within a 
single query.
+    </p>
+
+    <p conref="../shared/impala_common.xml#common/partitioning_blurb"/>
+    <p>
+      When you query a partitioned table, any partition pruning happens
+      before Impala selects the data files to sample. For example, in a
+      table partitioned by year, a query with <codeph>WHERE year = 
2017</codeph>
+      and a <codeph>TABLESAMPLE SYSTEM(10)</codeph> clause would sample
+      data files representing at least 10% of the bytes present in the
+      2017 partition.
+    </p>
+
+    <p conref="../shared/impala_common.xml#common/s3_blurb"/>
+    <p>
+      This clause applies to S3 tables the same way as tables
+      with data files stored on HDFS.
+    </p>
+
+    <p conref="../shared/impala_common.xml#common/adls_blurb"/>
+    <p>
+      This clause applies to ADLS tables the same way as tables
+      with data files stored on HDFS.
+    </p>
+
+    <p conref="../shared/impala_common.xml#common/kudu_blurb"/>
+    <p>
+      This clause does not apply to Kudu tables.
+    </p>
+
+    <p conref="../shared/impala_common.xml#common/hbase_blurb"/>
+    <p>
+      This clause does not apply to HBase tables.
+    </p>
+
+    <p conref="../shared/impala_common.xml#common/performance_blurb"/>
+    <p>
+      From a performance perspective, the <codeph>TABLESAMPLE</codeph>
+      clause is especially valuable for exploratory queries on
+      text, Avro, or other file formats other than Parquet. Text-based
+      or row-oriented file formats must process substantial amounts of
+      redundant data for queries that derive aggregate results such as
+      <codeph>MAX()</codeph>, <codeph>MIN()</codeph>, or <codeph>AVG()</codeph>
+      for a single column. Therefore, you might use 
<codeph>TABLESAMPLE</codeph>
+      early in the ETL pipeline, when data is still in raw text format
+      and has not been converted to Parquet or moved into a partitioned
+      table.
+    </p>
+
+    <p conref="../shared/impala_common.xml#common/restrictions_blurb"/>
+
+    <p>
+      This clause applies only to tables that use a storage layer
+      with underlying raw data files, such as HDFS, Amazon S3,
+      or Microsoft ADLS.
+    </p>
+
+    <p>
+      This clause does not apply to table references that represent views.
+      A query that applies the <codeph>TABLESAMPLE</codeph> clause to a
+      view or a subquery fails with a semantic error.
+    </p>
+
+    <p>
+      Because the sampling works at the level of entire data files, it
+      is by nature coarse-grained. It is possible to specify a small
+      sample percentage but still process a substantial portion of the
+      table data if the table contains relatively few data files, if
+      each data file is very large, or if the data files vary substantially
+      in size. Be sure that you understand the data distribution and physical
+      file layout so that you can verify if the results are suitable for
+      extrapolation. For example, if the table contains only a single data 
file,
+      the <q>sample</q> will consist of all the table data regardless of
+      the percentage you specify. If the table contains data files of
+      1 GiB, 1 GiB, and 1 KiB, when you specify a sampling percentage of
+      50 you would either process slightly more than 50% of the table
+      (1 GiB + 1 KiB) or almost the entire table (1 GiB + 1 GiB),
+      depending on which data files were selected for sampling.
+    </p>
+
+    <p>
+      If data files are added by a non-Impala mechanism, and the
+      table metadata is not updated by a <codeph>REFRESH</codeph>
+      or <codeph>INVALIDATE METADATA</codeph> statement, the
+      <codeph>TABLESAMPLE</codeph> clause does not consider those
+      new files when computing the number of bytes in the table
+      or selecting which files to sample.
+    </p>
+
+    <p>
+      If data files are removed by a non-Impala mechanism, and the
+      table metadata is not updated by a <codeph>REFRESH</codeph>
+      or <codeph>INVALIDATE METADATA</codeph> statement, the
+      query fails if the <codeph>TABLESAMPLE</codeph> clause
+      attempts to reference any of the missing files.
+    </p>
+
+    <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+    <p>
+      The following examples demonstrate the <codeph>TABLESAMPLE</codeph> 
clause.
+      These examples intentionally use very small data sets to illustrate how
+      the number of files, size of each file, and overall size of data in the 
table
+      interact with the percentage specified in the clause.
+    </p>
+
+    <p>
+      These examples use an unpartitioned table, containing several files of 
roughly
+      the same size:
+    </p>
+
+<codeblock><![CDATA[
+create table sample_demo (x int, s string);
+
+insert into sample_demo values (1, 'one');
+insert into sample_demo values (2, 'two');
+insert into sample_demo values (3, 'three');
+insert into sample_demo values (4, 'four');
+insert into sample_demo values (5, 'five');
+
+show files in sample_demo;
++---------------------+------+-----------+
+| Path                | Size | Partition |
++---------------------+------+-----------+
+| 991213608_data.0.   | 7B   |           |
+| 982196806_data.0.   | 6B   |           |
+| _2122096884_data.0. | 8B   |           |
+| _586325431_data.0.  | 6B   |           |
+| 1894746258_data.0.  | 7B   |           |
++---------------------+------+-----------+
+
+show table stats sample_demo;
++-------+--------+------+--------+-------------------------+
+| #Rows | #Files | Size | Format | Location                |
++-------+--------+------+--------+-------------------------+
+| -1    | 5      | 34B  | TEXT   | /tsample.db/sample_demo |
++-------+--------+------+--------+-------------------------+
+</codeblock>
+
+    <p>
+      A query that samples 50% of the table must process at least
+      17 bytes of data. Based on the sizes of the data files,
+      we can predict that each such query uses 3 arbitrary files.
+      Any 1 or 2 files are not enough to reach 50% of the total
+      data in the table (34 bytes), so the query adds more files
+      until it passes the 50% threshold:
+    </p>
+
+<codeblock><![CDATA[
+select distinct x from sample_demo tablesample system(50);
++---+
+| x |
++---+
+| 4 |
+| 1 |
+| 5 |
++---+
+
+select distinct x from sample_demo tablesample system(50);
++---+
+| x |
++---+
+| 5 |
+| 4 |
+| 2 |
++---+
+
+select distinct x from sample_demo tablesample system(50);
++---+
+| x |
++---+
+| 5 |
+| 3 |
+| 2 |
++---+
+</codeblock>
+
+    <p>
+      To help run reproducible experiments, the <codeph>REPEATABLE</codeph>
+      clause causes Impala to choose the same set of files for each query.
+      Although the data set being considered is deterministic, the order
+      of results varies (in the absence of an <codeph>ORDER BY</codeph>
+      clause) because of the way distributed queries are processed:
+    </p>
+
+<codeblock><![CDATA[
+select distinct x from sample_demo
+  tablesample system(50) repeatable (12345);
++---+
+| x |
++---+
+| 3 |
+| 2 |
+| 1 |
++---+
+
+select distinct x from sample_demo
+  tablesample system(50) repeatable (12345);
++---+
+| x |
++---+
+| 2 |
+| 1 |
+| 3 |
++---+
+</codeblock>
+
+    <p>
+      The following examples show how uneven data distribution affects
+      which data is sampled. Adding another data file containing a long
+      string value changes the threshold for 50% of the total data in
+      the table:
+    </p>
+
+<codeblock><![CDATA[
+insert into sample_demo values (1000, 'Boyhood is the longest time in li
+fe for a boy. The last term of the school-year is made of decades, not o
+f weeks, and living through them is like waiting for the millennium. Boo
+th Tarkington');
+
+show files in sample_demo;
++---------------------+------+-----------+
+| Path                | Size | Partition |
++---------------------+------+-----------+
+| 991213608_data.0.   | 7B   |           |
+| 982196806_data.0.   | 6B   |           |
+| _253317650_data.0.  | 196B |           |
+| _2122096884_data.0. | 8B   |           |
+| _586325431_data.0.  | 6B   |           |
+| 1894746258_data.0.  | 7B   |           |
++---------------------+------+-----------+
+
+show table stats sample_demo;
++-------+--------+------+--------+-------------------------+
+| #Rows | #Files | Size | Format | Location                |
++-------+--------+------+--------+-------------------------+
+| -1    | 6      | 230B | TEXT   | /tsample.db/sample_demo |
++-------+--------+------+--------+-------------------------+
+</codeblock>
+
+    <p>
+      Even though the queries do not refer to the <codeph>S</codeph>
+      column containing the long value, all the sampling queries include
+      the data file containing the column value <codeph>X=1000</codeph>,
+      because the query cannot reach the 50% threshold (115 bytes) without
+      including that file. The large file might be considered first, in which
+      case it is the only file processed by the query. Or an arbitrary
+      set of other files might be considered first.
+    </p>
+
+<codeblock><![CDATA[
+select distinct x from sample_demo tablesample system(50);
++------+
+| x    |
++------+
+| 1000 |
+| 3    |
+| 1    |
++------+
+
+select distinct x from sample_demo tablesample system(50);
++------+
+| x    |
++------+
+| 1000 |
++------+
+
+select distinct x from sample_demo tablesample system(50);
++------+
+| x    |
++------+
+| 1000 |
+| 4    |
+| 2    |
+| 1    |
++------+
+</codeblock>
+
+    <p>
+      The following examples demonstrate how the <codeph>TABLESAMPLE</codeph>
+      clause interacts with other table aspects, such as partitioning and file
+      format:
+    </p>
+
+<codeblock><![CDATA[
+create table sample_demo_partitions (x int, s string) partitioned by (n int) 
stored as parquet;
+
+insert into sample_demo_partitions partition (n = 1) select * from sample_demo;
+insert into sample_demo_partitions partition (n = 2) select * from sample_demo;
+insert into sample_demo_partitions partition (n = 3) select * from sample_demo;
+
+show files in sample_demo_partitions;
++--------------------------------+--------+-----------+
+| Path                           | Size   | Partition |
++--------------------------------+--------+-----------+
+| 000000_364262785_data.0.parq   | 1.24KB | n=1       |
+| 000001_973526736_data.0.parq   | 566B   | n=1       |
+| 0000000_1300598134_data.0.parq | 1.24KB | n=2       |
+| 0000001_689099063_data.0.parq  | 568B   | n=2       |
+| 0000000_1861371709_data.0.parq | 1.24KB | n=3       |
+| 0000001_1065507912_data.0.parq | 566B   | n=3       |
++--------------------------------+--------+-----------+
+
+show table stats tablesample_demo_partitioned;
++-------+-------+--------+--------+---------+----------------------------------------------+
+| n     | #Rows | #Files | Size   | Format  | Location                         
            |
++-------+-------+--------+--------+---------+----------------------------------------------+
+| 1     | -1    | 2      | 1.79KB | PARQUET | 
/tsample.db/tablesample_demo_partitioned/n=1 |
+| 2     | -1    | 2      | 1.80KB | PARQUET | 
/tsample.db/tablesample_demo_partitioned/n=2 |
+| 3     | -1    | 2      | 1.79KB | PARQUET | 
/tsample.db/tablesample_demo_partitioned/n=3 |
+| Total | -1    | 6      | 5.39KB |         |                                  
            |
++-------+-------+--------+--------+---------+----------------------------------------------+
+</codeblock>
+
+    <p>
+      If the query does not involve any partition pruning, the
+      sampling applies to the data volume of the entire table:
+    </p>
+
+<codeblock><![CDATA[
+-- 18 rows total.
+select count(*) from sample_demo_partitions;
++----------+
+| count(*) |
++----------+
+| 18       |
++----------+
+
+-- The number of rows per data file is not
+-- perfectly balanced, therefore the count
+-- is different depending on which set of files
+-- is considered.
+select count(*) from sample_demo_partitions
+  tablesample system(75);
++----------+
+| count(*) |
++----------+
+| 14       |
++----------+
+
+select count(*) from sample_demo_partitions
+  tablesample system(75);
++----------+
+| count(*) |
++----------+
+| 16       |
++----------+
+</codeblock>
+
+    <p>
+      If the query only processes certain partitions,
+      the query computes the sampling threshold based on
+      the data size and set of files only from the
+      relevant partitions:
+    </p>
+
+<codeblock><![CDATA[
+select count(*) from sample_demo_partitions
+  tablesample system(50) where n = 1;
++----------+
+| count(*) |
++----------+
+| 6        |
++----------+
+
+select count(*) from sample_demo_partitions
+  tablesample system(50) where n = 1;
++----------+
+| count(*) |
++----------+
+| 2        |
++----------+
+]]>
+</codeblock>
+
+    <p conref="../shared/impala_common.xml#common/related_info"/>
+    <p>
+      <xref href="impala_select.xml#select"/>
+    </p>
+
+  </conbody>
+</concept>

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/88d00b4b/docs/topics/impala_views.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_views.xml b/docs/topics/impala_views.xml
index 09693fe..678f681 100644
--- a/docs/topics/impala_views.xml
+++ b/docs/topics/impala_views.xml
@@ -192,6 +192,9 @@ select * from top_10_days;</codeblock>
       <li rev="1.4.0">
         <p 
conref="../shared/impala_common.xml#common/order_by_view_restriction"/>
       </li>
+      <li>
+        <p conref="../shared/impala_common.xml#common/tablesample_caveat"/>
+      </li>
     </ul>
 
     <p>

[1/2] incubator-impala git commit: IMPALA-5309: [DOCS] Add TABLESAMPLE clause to SELECT statement

Reply via email to