This is an automated email from the ASF dual-hosted git repository.
michaelsmith pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git
The following commit(s) were added to refs/heads/master by this push:
new 93c64e7e9 IMPALA-13376: Add docs for AGG_MEM_CORRELATION_FACTOR etc
93c64e7e9 is described below
commit 93c64e7e9a771d72b860d1a6bdccc33d9f46f5ac
Author: Riza Suminto <[email protected]>
AuthorDate: Wed Sep 11 17:48:38 2024 -0700
IMPALA-13376: Add docs for AGG_MEM_CORRELATION_FACTOR etc
This patch adds documentation for the AGG_MEM_CORRELATION_FACTOR and
LARGE_AGG_MEM_THRESHOLD options introduced in Apache Impala 4.4.0.
IMPALA-12548 fixed the behavior of AGG_MEM_CORRELATION_FACTOR: a higher value
lowers the memory estimation, while a lower value results in a higher
memory estimation. The documentation in ImpalaService.thrift, however,
says the opposite. This patch fixes the documentation in the thrift file as well.
Testing:
- Run "make plain-html" in docs/ dir and confirm the output.
- Manually check with comments in
PlannerTest.testAggNodeMaxMemEstimate()
Change-Id: I00956a50fb7616ca3c3ea2fd75fd11239a6bcd90
Reviewed-on: http://gerrit.cloudera.org:8080/21793
Tested-by: Impala Public Jenkins <[email protected]>
Reviewed-by: Michael Smith <[email protected]>
---
common/thrift/ImpalaService.thrift | 2 +-
docs/impala.ditamap | 9 +--
docs/shared/ImpalaVariables.xml | 1 +
docs/shared/impala_common.xml | 8 +++
docs/topics/impala_agg_mem_correlation_factor.xml | 78 +++++++++++++++++++++++
docs/topics/impala_large_agg_mem_threshold.xml | 68 ++++++++++++++++++++
6 files changed, 161 insertions(+), 5 deletions(-)
diff --git a/common/thrift/ImpalaService.thrift
b/common/thrift/ImpalaService.thrift
index a3e4c1c66..e01b9c995 100644
--- a/common/thrift/ImpalaService.thrift
+++ b/common/thrift/ImpalaService.thrift
@@ -847,7 +847,7 @@ enum TImpalaQueryOptions {
// value means there is high correlation between grouping expressions /
columns, while
// low value means there is low correlation between them. High correlation
means
// aggregation node can be scheduled with lower memory estimation (lower
memScale).
- // Setting value 1.0 will result in an equal memory estimate as the default
estimation
+ // Setting value 0.0 will result in an equal memory estimate as the default
estimation
// (no change). Defaults to 0.5.
AGG_MEM_CORRELATION_FACTOR = 163
diff --git a/docs/impala.ditamap b/docs/impala.ditamap
index e022243ba..c050ff04c 100644
--- a/docs/impala.ditamap
+++ b/docs/impala.ditamap
@@ -178,6 +178,7 @@ under the License.
<topicref href="topics/impala_abort_on_error.xml"/>
<topicref href="topics/impala_allow_erasure_coded_files.xml"/>
<topicref href="topics/impala_allow_unsupported_formats.xml"/>
+ <topicref rev="4.4.0"
href="topics/impala_agg_mem_correlation_factor.xml"/>
<topicref href="topics/impala_appx_count_distinct.xml"/>
<topicref href="topics/impala_batch_size.xml"/>
<topicref href="topics/impala_broadcast_bytes_limit.xml"/>
@@ -202,19 +203,19 @@ under the License.
<topicref href="topics/impala_exec_single_node_rows_threshold.xml"/>
<topicref href="topics/impala_exec_time_limit_s.xml"/>
<topicref rev="4.2.0" href="topics/impala_expand_complex_types.xml"/>
- <topicref href="topics/impala_explain_level.xml">
- <topicref rev="2.5.0"
href="topics/impala_max_num_runtime_filters.xml"/>
- </topicref>
+ <topicref href="topics/impala_explain_level.xml"/>
<topicref href="topics/impala_fetch_rows_timeout_ms.xml"/>
- <topicref href="topics/impala_join_rows_produced_limit.xml"/>
<topicref href="topics/impala_hbase_cache_blocks.xml"/>
<topicref href="topics/impala_hbase_caching.xml"/>
<topicref href="topics/impala_idle_session_timeout.xml"/>
+ <topicref href="topics/impala_join_rows_produced_limit.xml"/>
<topicref href="topics/impala_kudu_read_mode.xml"/>
+ <topicref rev="4.4.0"
href="topics/impala_large_agg_mem_threshold.xml"/>
<topicref href="topics/impala_live_progress.xml"/>
<topicref href="topics/impala_live_summary.xml"/>
<topicref href="topics/impala_max_errors.xml"/>
<topicref rev="3.1 IMPALA-6847"
href="topics/impala_max_mem_estimate_for_admission.xml"/>
+ <topicref rev="2.5.0"
href="topics/impala_max_num_runtime_filters.xml"/>
<topicref href="topics/impala_max_result_spooling_mem.xml"/>
<topicref rev="2.10.0 IMPALA-3200"
href="topics/impala_max_row_size.xml"/>
<topicref href="topics/impala_max_scan_range_length.xml"/>
diff --git a/docs/shared/ImpalaVariables.xml b/docs/shared/ImpalaVariables.xml
index d7236f06d..cf25bc00e 100644
--- a/docs/shared/ImpalaVariables.xml
+++ b/docs/shared/ImpalaVariables.xml
@@ -42,6 +42,7 @@ under the License.
The docs included with a distro can refer to the distro release number
by
editing the values here.
<ul>
+ <li><ph id="impala44">Impala 4.4</ph></li>
<li><ph id="impala40">Impala 4.0</ph></li>
<li><ph id="impala34">Impala 3.4</ph></li>
<li><ph id="impala33">Impala 3.3</ph></li>
diff --git a/docs/shared/impala_common.xml b/docs/shared/impala_common.xml
index 091f6c85d..b4c1792f5 100644
--- a/docs/shared/impala_common.xml
+++ b/docs/shared/impala_common.xml
@@ -2479,6 +2479,10 @@ PROFILE;
<b>Type:</b> integer
</p>
+ <p id="type_double">
+ <b>Type:</b> double
+ </p>
+
<p id="default_blurb">
<b>Default:</b>
</p>
@@ -3432,6 +3436,10 @@ flight_num: INT32 SNAPPY DO:83456393
FPO:83488603 SZ:10216514/11474301
needed to represent each value.
</p>
+ <p rev="4.4.0" id="added_in_440">
+ <b>Added in:</b> <keyword keyref="impala44"/>
+ </p>
+
<p rev="4.0.0" id="added_in_400">
<b>Added in:</b> <keyword keyref="impala40"/>
</p>
diff --git a/docs/topics/impala_agg_mem_correlation_factor.xml
b/docs/topics/impala_agg_mem_correlation_factor.xml
new file mode 100644
index 000000000..d9a27f7d1
--- /dev/null
+++ b/docs/topics/impala_agg_mem_correlation_factor.xml
@@ -0,0 +1,78 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements. See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership. The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied. See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="agg_mem_correlation_factor" rev="4.4.0">
+
+ <title>AGG_MEM_CORRELATION_FACTOR Query Option (<keyword keyref="impala44"/>
or higher only)</title>
+ <titlealts audience="PDF">
+ <navtitle>AGG MEM CORRELATION FACTOR</navtitle>
+ </titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Impala Query Options"/>
+ <data name="Category" value="Performance"/>
+ <data name="Category" value="Developers"/>
+ <data name="Category" value="Data Analysts"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p rev="4.4.0">
+ <indexterm audience="hidden">AGG_MEM_CORRELATION_FACTOR query
option</indexterm>
+ Default correlation factor between two or more grouping columns in an
aggregation node.
+ When grouping over multiple columns, the query planner uses this
value to reason
+ about how correlated the columns are. A value close to 1.0 means the columns
are highly
+ correlated, while 0.0 means no correlation. In popular RDBMSs, this
column correlation
+ can usually be measured by using the <codeph>CORR</codeph> function.
+ </p>
+ <p>
+ If both <codeph>AGG_MEM_CORRELATION_FACTOR</codeph> and
+ <codeph>LARGE_AGG_MEM_THRESHOLD</codeph> are set larger than 0, the
planner will
+ switch the memory estimation for an aggregation node from the NDV
multiplication-based
+ algorithm to a correlation-based memory estimation that should yield a lower
+ estimate. Setting a high <codeph>AGG_MEM_CORRELATION_FACTOR</codeph>
will result in
+ a lower memory estimate, but no less than
+ <codeph>LARGE_AGG_MEM_THRESHOLD</codeph>. Setting a low value will
result in a higher
+ memory estimate, but it will not exceed the default NDV
multiplication-based
+ estimate.
+ </p>
+ <p>
+ Users can set this option value to 0.0 to keep the planner using the
default
+ NDV multiplication-based estimation.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/type_double"/>
+
+ <p conref="../shared/impala_common.xml#common/default_blurb"/>
+ <p>
+ <codeph>0.5</codeph>
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/added_in_440"/>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+ <p>
+ <xref href="impala_large_agg_mem_threshold.xml"/>
+ </p>
+
+ </conbody>
+</concept>
diff --git a/docs/topics/impala_large_agg_mem_threshold.xml
b/docs/topics/impala_large_agg_mem_threshold.xml
new file mode 100644
index 000000000..33debbd89
--- /dev/null
+++ b/docs/topics/impala_large_agg_mem_threshold.xml
@@ -0,0 +1,68 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements. See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership. The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied. See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="large_agg_mem_threshold" rev="4.4.0">
+
+ <title>LARGE_AGG_MEM_THRESHOLD Query Option (<keyword keyref="impala44"/> or
higher only)</title>
+ <titlealts audience="PDF">
+ <navtitle>LARGE AGG MEM THRESHOLD</navtitle>
+ </titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Impala Query Options"/>
+ <data name="Category" value="Performance"/>
+ <data name="Category" value="Developers"/>
+ <data name="Category" value="Data Analysts"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p rev="4.4.0">
+ <indexterm audience="hidden">LARGE_AGG_MEM_THRESHOLD query
option</indexterm>
+ The threshold for the query planner to determine whether memory
estimation for an
+ aggregation node is large or not. Together with
<codeph>AGG_MEM_CORRELATION_FACTOR</codeph>,
+ the planner will use this value to lower memory estimation for a large
aggregation node.
+ </p>
+ <p>
+ Users can set this option value to 0 to keep the planner using the
default
+ NDV multiplication-based estimation. See the
<codeph>AGG_MEM_CORRELATION_FACTOR</codeph>
+ documentation for more detail.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/type_integer"/>
+
+ <p conref="../shared/impala_common.xml#common/default_blurb"/>
+ <p>
+ <codeph>536870912</codeph> (512 MB)
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/units_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/added_in_440"/>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+ <p>
+ <xref href="impala_agg_mem_correlation_factor.xml"/>
+ </p>
+
+ </conbody>
+</concept>