IMPALA-6024: Min sample bytes for COMPUTE STATS TABLESAMPLE

Adds a new query option COMPUTE_STATS_MIN_SAMPLE_SIZE
which is the minimum number of bytes that will be scanned
in COMPUTE STATS TABLESAMPLE, regardless of the user-supplied
sampling percent.

The motivation is to prevent sampling for very small tables
where accurate stats can be obtained cheaply without sampling.

This patch changes COMPUTE STATS TABLESAMPLE to run the regular
COMPUTE STATS if the effective sampling percent is 0% or 100%.
For a 100% sampling rate, the sampling-based stats queries
are more expensive and produce less accurate stats than the
regular COMPUTE STATS.

Default: COMPUTE_STATS_MIN_SAMPLE_SIZE=1GB

Testing:
- added new unit tests and ran them locally

Change-Id: I2cb91a40bec50b599875109c2f7c5bf6f41c2400
Reviewed-on: http://gerrit.cloudera.org:8080/9113
Reviewed-by: Alex Behm <alex.b...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/db0f3810
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/db0f3810
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/db0f3810

Branch: refs/heads/2.x
Commit: db0f3810e9841af9ea1f43c11b613c6d056347dd
Parents: 68b7c8b
Author: Alex Behm <alex.b...@cloudera.com>
Authored: Mon Jan 22 23:07:25 2018 -0800
Committer: Impala Public Jenkins <impala-public-jenk...@gerrit.cloudera.org>
Committed: Fri Feb 2 01:10:15 2018 +0000

----------------------------------------------------------------------
 be/src/service/query-options-test.cc            |  3 +-
 be/src/service/query-options.cc                 | 10 +++
 be/src/service/query-options.h                  |  4 +-
 common/thrift/ImpalaInternalService.thrift      |  4 ++
 common/thrift/ImpalaService.thrift              |  4 ++
 .../impala/analysis/ComputeStatsStmt.java       | 35 ++++++++--
 .../org/apache/impala/catalog/HdfsTable.java    | 16 +++--
 .../org/apache/impala/planner/HdfsScanNode.java |  7 +-
 .../apache/impala/analysis/AnalyzeDDLTest.java  | 67 ++++++++++++++++++--
 .../custom_cluster/test_stats_extrapolation.py  |  4 ++
 10 files changed, 135 insertions(+), 19 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/db0f3810/be/src/service/query-options-test.cc
----------------------------------------------------------------------
diff --git a/be/src/service/query-options-test.cc 
b/be/src/service/query-options-test.cc
index 80c9866..552c218 100644
--- a/be/src/service/query-options-test.cc
+++ b/be/src/service/query-options-test.cc
@@ -141,7 +141,8 @@ TEST(QueryOptions, SetByteOptions) {
       {MAKE_OPTIONDEF(rm_initial_mem),        {-1, I64_MAX}},
       {MAKE_OPTIONDEF(buffer_pool_limit),     {-1, I64_MAX}},
       {MAKE_OPTIONDEF(max_row_size),          {1, ROW_SIZE_LIMIT}},
-      {MAKE_OPTIONDEF(parquet_file_size),     {-1, I32_MAX}}
+      {MAKE_OPTIONDEF(parquet_file_size),     {-1, I32_MAX}},
+      {MAKE_OPTIONDEF(compute_stats_min_sample_size), {-1, I64_MAX}},
   };
   vector<pair<OptionDef<int32_t>, Range<int32_t>>> case_set_i32 {
       {MAKE_OPTIONDEF(runtime_filter_min_size),

http://git-wip-us.apache.org/repos/asf/impala/blob/db0f3810/be/src/service/query-options.cc
----------------------------------------------------------------------
diff --git a/be/src/service/query-options.cc b/be/src/service/query-options.cc
index e3b5a1f..ff2fd4e 100644
--- a/be/src/service/query-options.cc
+++ b/be/src/service/query-options.cc
@@ -603,6 +603,16 @@ Status impala::SetQueryOption(const string& key, const 
string& value,
         query_options->__set_idle_session_timeout(requested_timeout);
         break;
       }
+      case TImpalaQueryOptions::COMPUTE_STATS_MIN_SAMPLE_SIZE: {
+        int64_t min_sample_size;
+        RETURN_IF_ERROR(ParseMemValue(value, "Min sample size", 
&min_sample_size));
+        if (min_sample_size < 0) {
+          return Status(
+              Substitute("Min sample size must be greater or equal to zero: 
$0", value));
+        }
+        query_options->__set_compute_stats_min_sample_size(min_sample_size);
+        break;
+      }
       default:
         // We hit this DCHECK(false) if we forgot to add the corresponding 
entry here
         // when we add a new query option.

http://git-wip-us.apache.org/repos/asf/impala/blob/db0f3810/be/src/service/query-options.h
----------------------------------------------------------------------
diff --git a/be/src/service/query-options.h b/be/src/service/query-options.h
index be3607f..9cdc935 100644
--- a/be/src/service/query-options.h
+++ b/be/src/service/query-options.h
@@ -41,7 +41,7 @@ typedef std::unordered_map<string, 
beeswax::TQueryOptionLevel::type>
 // the DCHECK.
 #define QUERY_OPTS_TABLE\
   DCHECK_EQ(_TImpalaQueryOptions_VALUES_TO_NAMES.size(),\
-      TImpalaQueryOptions::IDLE_SESSION_TIMEOUT + 1);\
+      TImpalaQueryOptions::COMPUTE_STATS_MIN_SAMPLE_SIZE + 1);\
   QUERY_OPT_FN(abort_on_default_limit_exceeded, 
ABORT_ON_DEFAULT_LIMIT_EXCEEDED,\
       TQueryOptionLevel::DEPRECATED)\
   QUERY_OPT_FN(abort_on_error, ABORT_ON_ERROR, TQueryOptionLevel::REGULAR)\
@@ -130,6 +130,8 @@ typedef std::unordered_map<string, 
beeswax::TQueryOptionLevel::type>
       TQueryOptionLevel::ADVANCED)\
   QUERY_OPT_FN(max_row_size, MAX_ROW_SIZE, TQueryOptionLevel::REGULAR)\
   QUERY_OPT_FN(idle_session_timeout, IDLE_SESSION_TIMEOUT, 
TQueryOptionLevel::REGULAR)\
+  QUERY_OPT_FN(compute_stats_min_sample_size, COMPUTE_STATS_MIN_SAMPLE_SIZE,\
+      TQueryOptionLevel::ADVANCED)\
   ;
 
 /// Enforce practical limits on some query options to avoid undesired query 
state.

http://git-wip-us.apache.org/repos/asf/impala/blob/db0f3810/common/thrift/ImpalaInternalService.thrift
----------------------------------------------------------------------
diff --git a/common/thrift/ImpalaInternalService.thrift 
b/common/thrift/ImpalaInternalService.thrift
index 121c551..bf00424 100644
--- a/common/thrift/ImpalaInternalService.thrift
+++ b/common/thrift/ImpalaInternalService.thrift
@@ -292,6 +292,10 @@ struct TQueryOptions {
   // running queries cancelled) by Impala. If 0, idle sessions never expire.
   // The default session timeout is set by the command line flag of the same 
name.
   61: optional i32 idle_session_timeout;
+
+  // Minimum number of bytes that will be scanned in COMPUTE STATS TABLESAMPLE,
+  // regardless of the user-supplied sampling percent. Default value: 1GB
+  62: optional i64 compute_stats_min_sample_size = 1073741824;
 }
 
 // Impala currently has two types of sessions: Beeswax and HiveServer2

http://git-wip-us.apache.org/repos/asf/impala/blob/db0f3810/common/thrift/ImpalaService.thrift
----------------------------------------------------------------------
diff --git a/common/thrift/ImpalaService.thrift 
b/common/thrift/ImpalaService.thrift
index 53f6e81..0f2d3d0 100644
--- a/common/thrift/ImpalaService.thrift
+++ b/common/thrift/ImpalaService.thrift
@@ -292,6 +292,10 @@ enum TImpalaQueryOptions {
   // The time, in seconds, that a session may be idle for before it is closed 
(and all
   // running queries cancelled) by Impala. If 0, idle sessions never expire.
   IDLE_SESSION_TIMEOUT,
+
+  // Minimum number of bytes that will be scanned in COMPUTE STATS TABLESAMPLE,
+  // regardless of the user-supplied sampling percent.
+  COMPUTE_STATS_MIN_SAMPLE_SIZE,
 }
 
 // The summary of a DML statement.

http://git-wip-us.apache.org/repos/asf/impala/blob/db0f3810/fe/src/main/java/org/apache/impala/analysis/ComputeStatsStmt.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/analysis/ComputeStatsStmt.java 
b/fe/src/main/java/org/apache/impala/analysis/ComputeStatsStmt.java
index ed81f89..4e61d86 100644
--- a/fe/src/main/java/org/apache/impala/analysis/ComputeStatsStmt.java
+++ b/fe/src/main/java/org/apache/impala/analysis/ComputeStatsStmt.java
@@ -103,6 +103,7 @@ public class ComputeStatsStmt extends StatementBase {
 
   // Effective sampling percent based on the total number of bytes in the 
files sample.
   // Set to -1 for non-HDFS tables or if TABLESAMPLE was not specified.
+  // We run the regular COMPUTE STATS for 0.0 and 1.0 where sampling has no 
benefit.
   protected double effectiveSamplePerc_ = -1;
 
   // The Null count is not currently being used in optimization or run-time,
@@ -202,7 +203,7 @@ public class ComputeStatsStmt extends StatementBase {
       String colRefSql = ToSqlUtils.getIdentSql(c.getName());
       if (isIncremental_) {
         columnStatsSelectList.add("NDV_NO_FINALIZE(" + colRefSql + ") AS " + 
colRefSql);
-      } else if (effectiveSamplePerc_ > 0) {
+      } else if (isSampling()) {
         columnStatsSelectList.add(String.format("SAMPLED_NDV(%s, %.10f) AS %s",
             colRefSql, effectiveSamplePerc_, colRefSql));
       } else {
@@ -461,7 +462,7 @@ public class ComputeStatsStmt extends StatementBase {
     // Query for getting the per-partition row count and the total row count.
     StringBuilder tableStatsQueryBuilder = new StringBuilder("SELECT ");
     String countSql = "COUNT(*)";
-    if (effectiveSamplePerc_ > 0) {
+    if (isSampling()) {
       // Extrapolate the count based on the effective sampling rate.
       countSql = String.format("ROUND(COUNT(*) / %.10f)", 
effectiveSamplePerc_);
     }
@@ -529,7 +530,8 @@ public class ComputeStatsStmt extends StatementBase {
    * not sampling. If sampling, the returned SQL includes a fixed random seed 
so all
    * child queries generate a consistent sample, even if the user did not 
originally
    * specify REPEATABLE.
-   * No-op if this statement has no TABLESAMPLE clause.
+   * Returns the empty string if this statement has no TABLESAMPLE clause or if
+   * the effective sampling rate is 0.0 or 1.0 (see isSampling()).
    */
   private String analyzeTableSampleClause(Analyzer analyzer) throws 
AnalysisException {
     if (sampleParams_ == null) return "";
@@ -550,9 +552,11 @@ public class ComputeStatsStmt extends StatementBase {
     }
 
     // Compute the sample of files and set 'sampleFileBytes_'.
+    long minSampleBytes = 
analyzer.getQueryOptions().compute_stats_min_sample_size;
+    long samplePerc = sampleParams_.getPercentBytes();
     HdfsTable hdfsTable = (HdfsTable) table_;
     Map<Long, List<FileDescriptor>> sample = hdfsTable.getFilesSample(
-        hdfsTable.getPartitions(), sampleParams_.getPercentBytes(), 
sampleSeed);
+        hdfsTable.getPartitions(), samplePerc, minSampleBytes, sampleSeed);
     long sampleFileBytes = 0;
     for (List<FileDescriptor> fds: sample.values()) {
       for (FileDescriptor fd: fds) sampleFileBytes += fd.getFileLength();
@@ -567,6 +571,17 @@ public class ComputeStatsStmt extends StatementBase {
     }
     Preconditions.checkState(effectiveSamplePerc_ >= 0.0 && 
effectiveSamplePerc_ <= 1.0);
 
+    // Warn if we will ignore TABLESAMPLE and run the regular COMPUTE STATS.
+    if (effectiveSamplePerc_ == 1.0) {
+      Preconditions.checkState(!isSampling());
+      analyzer.addWarning(String.format(
+          "Ignoring TABLESAMPLE because the effective sampling rate is 
100%%.\n" +
+          "The minimum sample size is COMPUTE_STATS_MIN_SAMPLE_SIZE=%s " +
+          "and the table size %s",
+          PrintUtils.printBytes(minSampleBytes), 
PrintUtils.printBytes(totalFileBytes)));
+    }
+    if (!isSampling()) return "";
+
     return " " + sampleParams_.toSql(sampleSeed);
   }
 
@@ -647,6 +662,17 @@ public class ComputeStatsStmt extends StatementBase {
   }
 
   /**
+   * Returns true if this COMPUTE STATS statement should perform sampling.
+   * Returns false if TABLESAMPLE was not specified (effectiveSamplePerc_ == 
-1)
+   * or if the effective sampling percent is 0% or 100% where sampling has no 
benefit.
+   */
+  private boolean isSampling() {
+    Preconditions.checkState(effectiveSamplePerc_ == -1
+        || (effectiveSamplePerc_ >= 0.0 && effectiveSamplePerc_ <= 1.0));
+    return effectiveSamplePerc_ > 0.0 && effectiveSamplePerc_ < 1.0;
+  }
+
+  /**
    * Returns true if the given column should be ignored for the purpose of 
computing
    * column stats. Columns with an invalid/unsupported/complex type are 
ignored.
    * For example, complex types in an HBase-backed table will appear as 
invalid types.
@@ -656,6 +682,7 @@ public class ComputeStatsStmt extends StatementBase {
     return !t.isValid() || !t.isSupported() || t.isComplexType();
   }
 
+  public double getEffectiveSamplingPerc() { return effectiveSamplePerc_; }
   public String getTblStatsQuery() { return tableStatsQueryStr_; }
   public String getColStatsQuery() { return columnStatsQueryStr_; }
 

http://git-wip-us.apache.org/repos/asf/impala/blob/db0f3810/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java 
b/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java
index 04599f5..adc6aef 100644
--- a/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java
+++ b/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java
@@ -88,6 +88,8 @@ import org.apache.impala.util.TResultRowBuilder;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import com.codahale.metrics.Gauge;
+import com.codahale.metrics.Timer;
 import com.google.common.base.Function;
 import com.google.common.base.Preconditions;
 import com.google.common.collect.ImmutableMap;
@@ -95,9 +97,6 @@ import com.google.common.collect.Lists;
 import com.google.common.collect.Maps;
 import com.google.common.collect.Sets;
 
-import com.codahale.metrics.Gauge;
-import com.codahale.metrics.Timer;
-
 /**
  * Internal representation of table-related metadata of a file-resident table 
on a
  * Hadoop filesystem. The table data can be accessed through libHDFS (which is 
more of
@@ -258,7 +257,7 @@ public class HdfsTable extends Table {
   //   - Used for reporting through catalog web UI.
   //   - Stats are reset whenever the table is loaded (due to a metadata 
operation) and
   //   are set when the table is serialized to Thrift.
-  private FileMetadataStats fileMetadataStats_ = new FileMetadataStats();
+  private final FileMetadataStats fileMetadataStats_ = new FileMetadataStats();
 
   private final static Logger LOG = LoggerFactory.getLogger(HdfsTable.class);
 
@@ -2149,16 +2148,18 @@ public class HdfsTable extends Table {
   /**
    * Selects a random sample of files from the given list of partitions such 
that the sum
    * of file sizes is at least 'percentBytes' percent of the total number of 
bytes in
-   * those partitions. The sample is returned as a map from partition id to a 
list of
-   * file descriptors selected from that partition.
+   * those partitions and at least 'minSampleBytes'. The sample is returned as 
a map from
+   * partition id to a list of file descriptors selected from that partition.
    * This function allocates memory proportional to the number of files in 
'inputParts'.
    * Its implementation tries to minimize the constant factor and object 
generation.
    * The given 'randomSeed' is used for random number generation.
    * The 'percentBytes' parameter must be between 0 and 100.
    */
   public Map<Long, List<FileDescriptor>> getFilesSample(
-      Collection<HdfsPartition> inputParts, long percentBytes, long 
randomSeed) {
+      Collection<HdfsPartition> inputParts, long percentBytes, long 
minSampleBytes,
+      long randomSeed) {
     Preconditions.checkState(percentBytes >= 0 && percentBytes <= 100);
+    Preconditions.checkState(minSampleBytes >= 0);
 
     // Conservative max size for Java arrays. The actual maximum varies
     // from JVM version and sometimes between configurations.
@@ -2200,6 +2201,7 @@ public class HdfsTable extends Table {
     int numFilesRemaining = idx;
     double fracPercentBytes = (double) percentBytes / 100;
     long targetBytes = (long) Math.round(totalBytes * fracPercentBytes);
+    targetBytes = Math.max(targetBytes, minSampleBytes);
 
     // Randomly select files until targetBytes has been reached or all files 
have been
     // selected.

http://git-wip-us.apache.org/repos/asf/impala/blob/db0f3810/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java 
b/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java
index 596129b..4bcf112 100644
--- a/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java
@@ -201,7 +201,7 @@ public class HdfsScanNode extends ScanNode {
   // the TupleDescriptor of this scan node map to indices into 
PlanNodes.conjuncts_ and
   // slots in the TupleDescriptors of nested types map to indices into
   // collectionConjuncts_.
-  private Map<SlotDescriptor, List<Integer>> dictionaryFilterConjuncts_ =
+  private final Map<SlotDescriptor, List<Integer>> dictionaryFilterConjuncts_ =
       Maps.newLinkedHashMap();
 
   // Number of partitions that have the row count statistic.
@@ -702,7 +702,10 @@ public class HdfsScanNode extends ScanNode {
       } else {
         randomSeed = System.currentTimeMillis();
       }
-      sampledFiles = tbl_.getFilesSample(partitions_, percentBytes, 
randomSeed);
+      // Pass a minimum sample size of 0 because users cannot set a minimum 
sample size
+      // for scans directly. For compute stats, a minimum sample size can be 
set, and
+      // the sampling percent is adjusted to reflect it.
+      sampledFiles = tbl_.getFilesSample(partitions_, percentBytes, 0, 
randomSeed);
     }
 
     long maxScanRangeLength = 
analyzer.getQueryCtx().client_request.getQuery_options()

http://git-wip-us.apache.org/repos/asf/impala/blob/db0f3810/fe/src/test/java/org/apache/impala/analysis/AnalyzeDDLTest.java
----------------------------------------------------------------------
diff --git a/fe/src/test/java/org/apache/impala/analysis/AnalyzeDDLTest.java 
b/fe/src/test/java/org/apache/impala/analysis/AnalyzeDDLTest.java
index 895d8c5..3083f1f 100644
--- a/fe/src/test/java/org/apache/impala/analysis/AnalyzeDDLTest.java
+++ b/fe/src/test/java/org/apache/impala/analysis/AnalyzeDDLTest.java
@@ -32,6 +32,7 @@ import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.permission.FsAction;
 import org.apache.hadoop.fs.permission.FsPermission;
 import org.apache.impala.catalog.ArrayType;
+import org.apache.impala.catalog.Catalog;
 import org.apache.impala.catalog.CatalogException;
 import org.apache.impala.catalog.ColumnStats;
 import org.apache.impala.catalog.DataSource;
@@ -48,6 +49,7 @@ import org.apache.impala.service.BackendConfig;
 import org.apache.impala.testutil.TestUtils;
 import org.apache.impala.thrift.TBackendGflags;
 import org.apache.impala.thrift.TDescribeTableParams;
+import org.apache.impala.thrift.TQueryOptions;
 import org.apache.impala.util.MetaStoreUtil;
 import org.apache.kudu.ColumnSchema.CompressionAlgorithm;
 import org.apache.kudu.ColumnSchema.Encoding;
@@ -1169,12 +1171,23 @@ public class AnalyzeDDLTest extends FrontendTestBase {
         "Unsupported column option for non-Kudu table: DROP DEFAULT");
   }
 
-  void checkComputeStatsStmt(String stmt) throws AnalysisException {
-    ParseNode parseNode = AnalyzesOk(stmt);
+  ComputeStatsStmt checkComputeStatsStmt(String stmt) throws AnalysisException 
{
+    return checkComputeStatsStmt(stmt, createAnalyzer(Catalog.DEFAULT_DB));
+  }
+
+  ComputeStatsStmt checkComputeStatsStmt(String stmt, Analyzer analyzer)
+      throws AnalysisException {
+    return checkComputeStatsStmt(stmt, analyzer, null);
+  }
+
+  ComputeStatsStmt checkComputeStatsStmt(String stmt, Analyzer analyzer,
+      String expectedWarning) throws AnalysisException {
+    ParseNode parseNode = AnalyzesOk(stmt, analyzer, expectedWarning);
     assertTrue(parseNode instanceof ComputeStatsStmt);
     ComputeStatsStmt parsedStmt = (ComputeStatsStmt)parseNode;
     AnalyzesOk(parsedStmt.getTblStatsQuery());
     AnalyzesOk(parsedStmt.getColStatsQuery());
+    return parsedStmt;
   }
 
   @Test
@@ -1222,9 +1235,55 @@ public class AnalyzeDDLTest extends FrontendTestBase {
       testGflags.setEnable_stats_extrapolation(true);
       BackendConfig.create(testGflags);
 
-      checkComputeStatsStmt("compute stats functional.alltypes tablesample 
system (10)");
+      // Test different COMPUTE_STATS_MIN_SAMPLE_SIZE values.
+      TQueryOptions queryOpts = new TQueryOptions();
+
+      // The default minimum sample size is greater than 'functional.alltypes'.
+      // We expect TABLESAMPLE to be ignored.
+      Preconditions.checkState(
+          queryOpts.compute_stats_min_sample_size == 1024 * 1024 * 1024);
+      ComputeStatsStmt noSamplingStmt = checkComputeStatsStmt(
+          "compute stats functional.alltypes tablesample system (10) 
repeatable(1)",
+          createAnalyzer(queryOpts),
+          "Ignoring TABLESAMPLE because the effective sampling rate is 100%");
+      Assert.assertTrue(noSamplingStmt.getEffectiveSamplingPerc() == 1.0);
+      String tblStatsQuery = noSamplingStmt.getTblStatsQuery().toUpperCase();
+      Assert.assertTrue(!tblStatsQuery.contains("TABLESAMPLE"));
+      Assert.assertTrue(!tblStatsQuery.contains("SAMPLED_NDV"));
+      String colStatsQuery = noSamplingStmt.getColStatsQuery().toUpperCase();
+      Assert.assertTrue(!colStatsQuery.contains("TABLESAMPLE"));
+      Assert.assertTrue(!colStatsQuery.contains("SAMPLED_NDV"));
+
+      // No minimum sample bytes.
+      queryOpts.setCompute_stats_min_sample_size(0);
+      checkComputeStatsStmt("compute stats functional.alltypes tablesample 
system (10)",
+          createAnalyzer(queryOpts));
       checkComputeStatsStmt(
-          "compute stats functional.alltypes tablesample system (55) 
repeatable(1)");
+          "compute stats functional.alltypes tablesample system (55) 
repeatable(1)",
+          createAnalyzer(queryOpts));
+
+      // Sample is adjusted based on the minimum sample bytes.
+      // Assumes that functional.alltypes has 24 files of roughly 20KB each.
+      // The baseline statement with no sampling minimum should select exactly 
one file
+      // and have an effective sampling rate of ~0.04 (1/24).
+      queryOpts.setCompute_stats_min_sample_size(0);
+      ComputeStatsStmt baselineStmt = checkComputeStatsStmt(
+          "compute stats functional.alltypes tablesample system (1) 
repeatable(1)",
+          createAnalyzer(queryOpts));
+      // Approximate validation of effective sampling rate.
+      Assert.assertTrue(baselineStmt.getEffectiveSamplingPerc() > 0.03);
+      Assert.assertTrue(baselineStmt.getEffectiveSamplingPerc() < 0.05);
+      // The adjusted statement with a 100KB minimum should select ~5 files 
and have
+      // an effective sampling rate of ~0.21 (5/24).
+      queryOpts.setCompute_stats_min_sample_size(100 * 1024);
+      ComputeStatsStmt adjustedStmt = checkComputeStatsStmt(
+          "compute stats functional.alltypes tablesample system (1) 
repeatable(1)",
+          createAnalyzer(queryOpts));
+      // Approximate validation to avoid flakiness due to sampling and file 
size
+      // changes. Expect a sample between 4 and 6 of the 24 total files.
+      Assert.assertTrue(adjustedStmt.getEffectiveSamplingPerc() >= 4.0 / 24);
+      Assert.assertTrue(adjustedStmt.getEffectiveSamplingPerc() <= 6.0 / 24);
+
       AnalysisError("compute stats functional.alltypes tablesample system 
(101)",
           "Invalid percent of bytes value '101'. " +
           "The percent of bytes to sample must be between 0 and 100.");

http://git-wip-us.apache.org/repos/asf/impala/blob/db0f3810/tests/custom_cluster/test_stats_extrapolation.py
----------------------------------------------------------------------
diff --git a/tests/custom_cluster/test_stats_extrapolation.py 
b/tests/custom_cluster/test_stats_extrapolation.py
index 6d0e34d..ef0b675 100644
--- a/tests/custom_cluster/test_stats_extrapolation.py
+++ b/tests/custom_cluster/test_stats_extrapolation.py
@@ -50,6 +50,10 @@ class TestStatsExtrapolation(CustomClusterTestSuite):
     COMPUTE STATS TABLESAMPLE computes in-the-right-ballpark stats and 
successfully
     stores them in the HMS."""
 
+    # Since our test tables are small, set the minimum sample size to 0 to 
make sure
+    # we exercise the sampling code paths.
+    self.client.execute("set compute_stats_min_sample_size=0")
+
     # Test partitioned table.
     part_test_tbl = unique_database + ".alltypes"
     self.clone_table("functional.alltypes", part_test_tbl, True, vector)

Reply via email to