IMPALA-5229: huge page-backed buffers with TCMalloc
This commit enables transparent huge pages when we're
allocating via malloc(), not just mmap(). This gives
us the perf benefits of huge pages without the
challenges that the mmap() path presented: the overhead
of mapping and unmapping memory and the difficulty of
reasoning about peak virtual memory consumption.
Also sneak in some cleanup: use rvalue references for
BufferHandle methods where appropriate.
Testing:
Updated backend tests to ensure this combination is covered.
Ran some end-to-end tests and stress tests on my buffer pool
dev branch and all looks good.
Perf:
Compared to current master, this provides a pretty clear perf
benefit: I ran benchmarks on a single daemon with a reasonably
large TPC-H scale factor. Large aggregations are much faster
and everything else is the same (within variance) or slightly
faster.
Report Generated on 2017-04-18
Run Description: "Base: 68f32e52bc42bef578330a4fe0edc5b292891eea vs Ref:
f39d69bcd8bdc7d6d4fb42ef19966a26dea3a29d"
Cluster Name: UNKNOWN
Lab Run Info: UNKNOWN
Impala Version: impalad version 2.9.0-SNAPSHOT RELEASE ()
Baseline Impala Version: impalad version 2.9.0-SNAPSHOT RELEASE (2017-04-06)
+--------------------+-----------------------+---------+------------+------------+----------------+
| Workload           | File Format           | Avg (s) | Delta(Avg) | GeoMean(s) | Delta(GeoMean) |
+--------------------+-----------------------+---------+------------+------------+----------------+
| TARGETED-PERF(_60) | parquet / none / none | 19.30   | -3.05%     | 4.91       | -0.91%         |
+--------------------+-----------------------+---------+------------+------------+----------------+
+--------------------+--------------------------------------------------------+-----------------------+--------+-------------+------------+------------+----------------+-------------+-------+
| Workload | Query |
File Format | Avg(s) | Base Avg(s) | Delta(Avg) | StdDev(%) | Base
StdDev(%) | Num Clients | Iters |
+--------------------+--------------------------------------------------------+-----------------------+--------+-------------+------------+------------+----------------+-------------+-------+
| TARGETED-PERF(_60) | PERF_LIMIT-Q1 |
parquet / none / none | 0.01 | 0.01 | R +22.95% | 6.12% |
2.30% | 1 | 5 |
| TARGETED-PERF(_60) | primitive_topn_bigint |
parquet / none / none | 5.14 | 4.66 | +10.35% | 5.00% | *
13.39% * | 1 | 5 |
| TARGETED-PERF(_60) | primitive_conjunct_ordering_4 |
parquet / none / none | 0.24 | 0.23 | +8.12% | * 12.81% * |
1.76% | 1 | 5 |
| TARGETED-PERF(_60) | primitive_broadcast_join_3 |
parquet / none / none | 7.86 | 7.39 | +6.44% | 1.49% |
1.41% | 1 | 5 |
| TARGETED-PERF(_60) | PERF_STRING-Q6 |
parquet / none / none | 10.53 | 10.30 | +2.24% | 0.61% |
0.30% | 1 | 5 |
| TARGETED-PERF(_60) | primitive_conjunct_ordering_5 |
parquet / none / none | 17.23 | 16.90 | +1.90% | 1.61% |
1.05% | 1 | 5 |
| TARGETED-PERF(_60) | primitive_conjunct_ordering_3 |
parquet / none / none | 3.19 | 3.13 | +1.81% | 1.47% |
0.45% | 1 | 5 |
| TARGETED-PERF(_60) | primitive_filter_bigint_non_selective |
parquet / none / none | 1.08 | 1.06 | +1.60% | 0.26% |
2.07% | 1 | 5 |
| TARGETED-PERF(_60) | PERF_STRING-Q3 |
parquet / none / none | 3.75 | 3.71 | +1.14% | 0.39% |
0.80% | 1 | 5 |
| TARGETED-PERF(_60) | primitive_broadcast_join_2 |
parquet / none / none | 5.15 | 5.09 | +1.11% | 1.18% |
0.89% | 1 | 5 |
| TARGETED-PERF(_60) | PERF_STRING-Q2 |
parquet / none / none | 3.47 | 3.44 | +1.03% | 1.27% |
0.61% | 1 | 5 |
| TARGETED-PERF(_60) | PERF_AGG-Q1 |
parquet / none / none | 2.53 | 2.51 | +1.01% | 1.75% |
1.91% | 1 | 5 |
| TARGETED-PERF(_60) | PERF_STRING-Q7 |
parquet / none / none | 8.37 | 8.31 | +0.81% | 0.49% |
0.58% | 1 | 5 |
| TARGETED-PERF(_60) | primitive_filter_string_non_selective |
parquet / none / none | 1.90 | 1.88 | +0.74% | 1.81% |
0.73% | 1 | 5 |
| TARGETED-PERF(_60) | primitive_filter_string_like |
parquet / none / none | 14.82 | 14.73 | +0.62% | 0.17% |
0.02% | 1 | 5 |
| TARGETED-PERF(_60) | primitive_top-n_all |
parquet / none / none | 40.93 | 40.69 | +0.61% | 0.09% |
0.15% | 1 | 5 |
| TARGETED-PERF(_60) | PERF_STRING-Q4 |
parquet / none / none | 3.76 | 3.74 | +0.60% | 0.79% |
0.53% | 1 | 5 |
| TARGETED-PERF(_60) | primitive_broadcast_join_1 |
parquet / none / none | 2.54 | 2.53 | +0.56% | 1.43% |
0.88% | 1 | 5 |
| TARGETED-PERF(_60) | PERF_AGG-Q2 |
parquet / none / none | 8.90 | 8.85 | +0.56% | 0.80% |
0.81% | 1 | 5 |
| TARGETED-PERF(_60) | primitive_empty_build_join_1 |
parquet / none / none | 23.47 | 23.39 | +0.35% | 0.24% |
0.30% | 1 | 5 |
| TARGETED-PERF(_60) | primitive_filter_bigint_selective |
parquet / none / none | 0.17 | 0.17 | +0.35% | 0.91% |
1.37% | 1 | 5 |
| TARGETED-PERF(_60) | primitive_orderby_all |
parquet / none / none | 32.74 | 32.66 | +0.24% | 0.30% |
0.24% | 1 | 5 |
| TARGETED-PERF(_60) | PERF_AGG-Q5 |
parquet / none / none | 0.63 | 0.63 | +0.02% | 0.42% |
0.79% | 1 | 5 |
| TARGETED-PERF(_60) | primitive_filter_string_selective |
parquet / none / none | 1.94 | 1.94 | -0.08% | 2.31% |
1.13% | 1 | 5 |
| TARGETED-PERF(_60) | primitive_filter_decimal_selective |
parquet / none / none | 1.63 | 1.63 | -0.10% | 0.40% |
0.64% | 1 | 5 |
| TARGETED-PERF(_60) | PERF_STRING-Q5 |
parquet / none / none | 4.63 | 4.64 | -0.13% | 0.54% |
0.67% | 1 | 5 |
| TARGETED-PERF(_60) | primitive_conjunct_ordering_2 |
parquet / none / none | 9.98 | 10.03 | -0.44% | 0.41% |
0.64% | 1 | 5 |
| TARGETED-PERF(_60) | PERF_AGG-Q7 |
parquet / none / none | 2.04 | 2.05 | -0.59% | 2.70% |
2.54% | 1 | 5 |
| TARGETED-PERF(_60) | primitive_orderby_bigint |
parquet / none / none | 5.76 | 5.81 | -0.75% | 0.89% |
0.40% | 1 | 5 |
| TARGETED-PERF(_60) | primitive_filter_in_predicate |
parquet / none / none | 2.26 | 2.27 | -0.84% | 1.72% |
1.91% | 1 | 5 |
| TARGETED-PERF(_60) | primitive_exchange_broadcast |
parquet / none / none | 37.21 | 37.57 | -0.95% | 1.86% |
1.55% | 1 | 5 |
| TARGETED-PERF(_60) | PERF_STRING-Q1 |
parquet / none / none | 3.15 | 3.20 | -1.33% | 0.66% |
0.41% | 1 | 5 |
| TARGETED-PERF(_60) | primitive_groupby_decimal_lowndv.test |
parquet / none / none | 3.82 | 3.87 | -1.34% | 1.12% |
1.45% | 1 | 5 |
| TARGETED-PERF(_60) | primitive_shuffle_join_one_to_many_string_with_groupby |
parquet / none / none | 262.47 | 266.28 | -1.43% | 0.83% |
0.65% | 1 | 5 |
| TARGETED-PERF(_60) | primitive_conjunct_ordering_1 |
parquet / none / none | 0.07 | 0.07 | -1.76% | 0.46% |
3.42% | 1 | 5 |
| TARGETED-PERF(_60) | primitive_groupby_bigint_highndv |
parquet / none / none | 32.43 | 33.08 | -1.97% | 0.55% |
0.89% | 1 | 5 |
| TARGETED-PERF(_60) | primitive_groupby_bigint_lowndv |
parquet / none / none | 3.80 | 3.89 | -2.25% | 0.50% |
0.88% | 1 | 5 |
| TARGETED-PERF(_60) | primitive_groupby_bigint_pk |
parquet / none / none | 111.05 | 113.86 | -2.47% | 0.31% |
0.25% | 1 | 5 |
| TARGETED-PERF(_60) | primitive_shuffle_join_union_all_with_groupby |
parquet / none / none | 52.19 | 53.59 | -2.60% | 0.12% |
0.39% | 1 | 5 |
| TARGETED-PERF(_60) | primitive_filter_decimal_non_selective |
parquet / none / none | 1.64 | 1.69 | -2.77% | 1.37% |
3.05% | 1 | 5 |
| TARGETED-PERF(_60) | PERF_AGG-Q3 |
parquet / none / none | 12.34 | 12.90 | -4.32% | 0.81% |
1.05% | 1 | 5 |
| TARGETED-PERF(_60) | primitive_exchange_shuffle |
parquet / none / none | 80.49 | 85.96 | -6.37% | 3.41% |
0.54% | 1 | 5 |
| TARGETED-PERF(_60) | PERF_AGG-Q6 |
parquet / none / none | 2.01 | 2.27 | -11.49% | 1.05% | *
15.97% * | 1 | 5 |
| TARGETED-PERF(_60) | PERF_AGG-Q4 |
parquet / none / none | 17.69 | 21.46 | -17.59% | 0.44% |
0.10% | 1 | 5 |
| TARGETED-PERF(_60) | primitive_groupby_decimal_highndv |
parquet / none / none | 21.50 | 31.83 | I -32.43% | 2.50% |
0.56% | 1 | 5 |
+--------------------+--------------------------------------------------------+-----------------------+--------+-------------+------------+------------+----------------+-------------+-------+
(R) Regression: TARGETED-PERF(_60) PERF_LIMIT-Q1 [parquet / none / none] (0.01s
-> 0.01s [+22.95%])
+----------+------------+-----+----------+------------+-----------+-----+----------+------------+--------+-------+-----------+
| Operator | % of Query | Avg | Base Avg | Delta(Avg) | StdDev(%) | Max | Base
Max | Delta(Max) | #Hosts | #Rows | Est #Rows |
+----------+------------+-----+----------+------------+-----------+-----+----------+------------+--------+-------+-----------+
+----------+------------+-----+----------+------------+-----------+-----+----------+------------+--------+-------+-----------+
(I) Improvement: TARGETED-PERF(_60) primitive_groupby_decimal_highndv [parquet
/ none / none] (31.83s -> 21.50s [-32.43%])
+--------------+------------+----------+----------+------------+-----------+----------+----------+------------+--------+---------+-----------+
| Operator     | % of Query | Avg      | Base Avg | Delta(Avg) | StdDev(%) | Max      | Base Max | Delta(Max) | #Hosts | #Rows   | Est #Rows |
+--------------+------------+----------+----------+------------+-----------+----------+----------+------------+--------+---------+-----------+
| 01:AGGREGATE | 94.77%     | 20.36s   | 30.60s   | -33.47%    | 2.53%     | 21.20s   | 30.73s   | -31.02%    | 1      | 3.17M   | 2.98M     |
| 00:SCAN HDFS | 3.65%      | 783.14ms | 841.47ms | -6.93%     | 3.69%     | 832.41ms | 861.74ms | -3.40%     | 1      | 360.01M | 360.01M   |
+--------------+------------+----------+----------+------------+-----------+----------+----------+------------+--------+---------+-----------+
(V) Significant Variability: TARGETED-PERF(_60) primitive_conjunct_ordering_4
[parquet / none / none] (1.76% -> 12.81%)
+--------------+------------+-----------+----------------+------------------+--------+-------+-----------+
| Operator     | % of Query | StdDev(%) | Base StdDev(%) | Delta(StdDev(%)) | #Hosts | #Rows | Est #Rows |
+--------------+------------+-----------+----------------+------------------+--------+-------+-----------+
| 03:AGGREGATE | 10.76%     | 18.81%    | 16.87%         | +11.48%          | 1      | 0     | 1         |
| 01:AGGREGATE | 5.02%      | 38.41%    | 14.69%         | +161.46%         | 1      | 0     | 1         |
+--------------+------------+-----------+----------------+------------------+--------+-------+-----------+
Significant perf change detected
Run Description: "Base: 68f32e52bc42bef578330a4fe0edc5b292891eea vs Ref:
f39d69bcd8bdc7d6d4fb42ef19966a26dea3a29d"
Cluster Name: UNKNOWN
Lab Run Info: UNKNOWN
Impala Version: impalad version 2.9.0-SNAPSHOT RELEASE ()
Baseline Impala Version: impalad version 2.9.0-SNAPSHOT RELEASE (2017-04-06)
+-----------+-----------------------+---------+------------+------------+----------------+
| Workload  | File Format           | Avg (s) | Delta(Avg) | GeoMean(s) | Delta(GeoMean) |
+-----------+-----------------------+---------+------------+------------+----------------+
| TPCH(_60) | parquet / none / none | 17.50   | -1.88%     | 12.18      | -1.70%         |
+-----------+-----------------------+---------+------------+------------+----------------+
+-----------+----------+-----------------------+--------+-------------+------------+-----------+----------------+-------------+-------+
| Workload | Query | File Format | Avg(s) | Base Avg(s) |
Delta(Avg) | StdDev(%) | Base StdDev(%) | Num Clients | Iters |
+-----------+----------+-----------------------+--------+-------------+------------+-----------+----------------+-------------+-------+
| TPCH(_60) | TPCH-Q12 | parquet / none / none | 9.63 | 9.31 |
+3.50% | 4.84% | 2.62% | 1 | 5 |
| TPCH(_60) | TPCH-Q14 | parquet / none / none | 8.20 | 7.96 |
+2.99% | 5.30% | 0.30% | 1 | 5 |
| TPCH(_60) | TPCH-Q17 | parquet / none / none | 16.61 | 16.19 |
+2.64% | 1.10% | 0.55% | 1 | 5 |
| TPCH(_60) | TPCH-Q5 | parquet / none / none | 9.77 | 9.70 |
+0.76% | 0.39% | 0.71% | 1 | 5 |
| TPCH(_60) | TPCH-Q1 | parquet / none / none | 28.02 | 27.86 |
+0.59% | 0.61% | 0.55% | 1 | 5 |
| TPCH(_60) | TPCH-Q20 | parquet / none / none | 7.83 | 7.79 |
+0.45% | 1.61% | 1.53% | 1 | 5 |
| TPCH(_60) | TPCH-Q15 | parquet / none / none | 11.02 | 10.97 |
+0.42% | 0.19% | 0.53% | 1 | 5 |
| TPCH(_60) | TPCH-Q6 | parquet / none / none | 4.61 | 4.59 |
+0.40% | 0.44% | 0.98% | 1 | 5 |
| TPCH(_60) | TPCH-Q21 | parquet / none / none | 68.05 | 68.11 |
-0.09% | 0.08% | 0.21% | 1 | 5 |
| TPCH(_60) | TPCH-Q18 | parquet / none / none | 47.03 | 47.10 |
-0.15% | 1.63% | 0.28% | 1 | 5 |
| TPCH(_60) | TPCH-Q11 | parquet / none / none | 2.43 | 2.44 |
-0.58% | 4.45% | 2.23% | 1 | 5 |
| TPCH(_60) | TPCH-Q8 | parquet / none / none | 12.31 | 12.43 |
-0.94% | 0.79% | 0.54% | 1 | 5 |
| TPCH(_60) | TPCH-Q4 | parquet / none / none | 7.94 | 8.04 |
-1.16% | 0.92% | 0.52% | 1 | 5 |
| TPCH(_60) | TPCH-Q19 | parquet / none / none | 11.02 | 11.20 |
-1.67% | 0.70% | 0.39% | 1 | 5 |
| TPCH(_60) | TPCH-Q2 | parquet / none / none | 3.28 | 3.34 |
-1.95% | 0.86% | 2.08% | 1 | 5 |
| TPCH(_60) | TPCH-Q9 | parquet / none / none | 35.53 | 36.37 |
-2.29% | 0.48% | 0.23% | 1 | 5 |
| TPCH(_60) | TPCH-Q3 | parquet / none / none | 12.56 | 13.11 |
-4.19% | 0.99% | 0.81% | 1 | 5 |
| TPCH(_60) | TPCH-Q16 | parquet / none / none | 5.28 | 5.52 |
-4.31% | 0.69% | 0.49% | 1 | 5 |
| TPCH(_60) | TPCH-Q10 | parquet / none / none | 13.85 | 14.55 |
-4.82% | 0.66% | 0.81% | 1 | 5 |
| TPCH(_60) | TPCH-Q7 | parquet / none / none | 41.17 | 44.36 |
-7.20% | 1.40% | 0.29% | 1 | 5 |
| TPCH(_60) | TPCH-Q13 | parquet / none / none | 22.26 | 24.06 |
-7.46% | 0.55% | 0.27% | 1 | 5 |
| TPCH(_60) | TPCH-Q22 | parquet / none / none | 6.57 | 7.35 |
-10.63% | 0.62% | 0.50% | 1 | 5 |
+-----------+----------+-----------------------+--------+-------------+------------+-----------+----------------+-------------+-------+
Change-Id: If84b46a46efed9aee6af41b5f10bf3f4b15889b8
Reviewed-on: http://gerrit.cloudera.org:8080/6687
Reviewed-by: Tim Armstrong <[email protected]>
Tested-by: Impala Public Jenkins
Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/58b206ff
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/58b206ff
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/58b206ff
Branch: refs/heads/master
Commit: 58b206ff0e66e6357aaf81a9e54f660472171c88
Parents: 7fcf1ea
Author: Tim Armstrong <[email protected]>
Authored: Mon Apr 17 23:49:31 2017 -0700
Committer: Impala Public Jenkins <[email protected]>
Committed: Fri Apr 21 21:25:40 2017 +0000
----------------------------------------------------------------------
.../runtime/bufferpool/buffer-allocator-test.cc | 1 -
be/src/runtime/bufferpool/buffer-allocator.cc | 4 +-
be/src/runtime/bufferpool/system-allocator.cc | 70 ++++++++++++++++----
be/src/runtime/bufferpool/system-allocator.h | 3 +
be/src/runtime/row-batch.cc | 4 +-
be/src/runtime/row-batch.h | 4 +-
6 files changed, 67 insertions(+), 19 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/58b206ff/be/src/runtime/bufferpool/buffer-allocator-test.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/bufferpool/buffer-allocator-test.cc
b/be/src/runtime/bufferpool/buffer-allocator-test.cc
index 9887086..167298d 100644
--- a/be/src/runtime/bufferpool/buffer-allocator-test.cc
+++ b/be/src/runtime/bufferpool/buffer-allocator-test.cc
@@ -182,7 +182,6 @@ int main(int argc, char** argv) {
int result = 0;
for (bool mmap : {false, true}) {
for (bool madvise : {false, true}) {
- if (madvise && !mmap) continue; // Not an interesting combination.
std::cerr << "+==================================================" <<
std::endl
<< "| Running tests with mmap=" << mmap << " madvise=" <<
madvise
<< std::endl
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/58b206ff/be/src/runtime/bufferpool/buffer-allocator.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/bufferpool/buffer-allocator.cc
b/be/src/runtime/bufferpool/buffer-allocator.cc
index 0fd88fd..77169ce 100644
--- a/be/src/runtime/bufferpool/buffer-allocator.cc
+++ b/be/src/runtime/bufferpool/buffer-allocator.cc
@@ -43,7 +43,7 @@ class BufferPool::FreeBufferArena : public CacheLineAligned {
/// Add a free buffer to the free lists. May free buffers to the system
allocator
/// if the list becomes full. Caller should not hold 'lock_'
- void AddFreeBuffer(BufferHandle buffer);
+ void AddFreeBuffer(BufferHandle&& buffer);
/// Try to get a free buffer of 'buffer_len' bytes from this arena. Returns
true and
/// sets 'buffer' if found or false if not found. Caller should not hold
'lock_'.
@@ -406,7 +406,7 @@ BufferPool::FreeBufferArena::~FreeBufferArena() {
}
}
-void BufferPool::FreeBufferArena::AddFreeBuffer(BufferHandle buffer) {
+void BufferPool::FreeBufferArena::AddFreeBuffer(BufferHandle&& buffer) {
lock_guard<SpinLock> al(lock_);
PerSizeLists* lists = GetListsForSize(buffer.len());
FreeList* list = &lists->free_buffers;
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/58b206ff/be/src/runtime/bufferpool/system-allocator.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/bufferpool/system-allocator.cc
b/be/src/runtime/bufferpool/system-allocator.cc
index 7598d3c..756170a 100644
--- a/be/src/runtime/bufferpool/system-allocator.cc
+++ b/be/src/runtime/bufferpool/system-allocator.cc
@@ -19,27 +19,40 @@
#include <sys/mman.h>
+#include <gperftools/malloc_extension.h>
+
+#include "gutil/strings/substitute.h"
#include "util/bit-util.h"
+#include "common/names.h"
+
// TODO: IMPALA-5073: this should eventually become the default once we are
confident
// that it is superior to allocating via TCMalloc.
DEFINE_bool(mmap_buffers, false,
- "(Advanced) If true, allocate buffers directly from the operating system
instead of "
- "with TCMalloc.");
+ "(Experimental) If true, allocate buffers directly from the operating
system "
+ "instead of with TCMalloc.");
DEFINE_bool(madvise_huge_pages, true,
- "(Advanced) If true and --mmap_buffers is also "
- "true, advise operating system to back large memory buffers with huge
pages");
+ "(Advanced) If true, advise operating system to back large memory buffers
with huge "
+ "pages");
namespace impala {
-/// This is the huge page size on x86-64. We could parse /proc/meminfo to
programmatically
+/// These are the page sizes on x86-64. We could parse /proc/meminfo to
programmatically
/// get this, but it is unlikely to change unless we port to a different
architecture.
+static int64_t SMALL_PAGE_SIZE = 4LL * 1024;
static int64_t HUGE_PAGE_SIZE = 2LL * 1024 * 1024;
SystemAllocator::SystemAllocator(int64_t min_buffer_len)
: min_buffer_len_(min_buffer_len) {
DCHECK(BitUtil::IsPowerOf2(min_buffer_len));
+#ifndef ADDRESS_SANITIZER
+ // Free() assumes that aggressive decommit is enabled for TCMalloc.
+ size_t aggressive_decommit_enabled;
+ MallocExtension::instance()->GetNumericProperty(
+ "tcmalloc.aggressive_memory_decommit", &aggressive_decommit_enabled);
+ CHECK_EQ(true, aggressive_decommit_enabled);
+#endif
}
Status SystemAllocator::Allocate(int64_t len, BufferPool::BufferHandle*
buffer) {
@@ -51,13 +64,7 @@ Status SystemAllocator::Allocate(int64_t len,
BufferPool::BufferHandle* buffer)
if (FLAGS_mmap_buffers) {
RETURN_IF_ERROR(AllocateViaMMap(len, &buffer_mem));
} else {
- // AddressSanitizer does not instrument mmap(). Use malloc() to preserve
- // instrumentation.
- buffer_mem = reinterpret_cast<uint8_t*>(malloc(len));
- if (buffer_mem == nullptr) {
- return Status(
- TErrorCode::BUFFER_ALLOCATION_FAILED, len, "malloc() failed under
asan");
- }
+ RETURN_IF_ERROR(AllocateViaMalloc(len, &buffer_mem));
}
buffer->Open(buffer_mem, len, CpuInfo::GetCurrentCore());
return Status::OK();
@@ -107,11 +114,50 @@ Status SystemAllocator::AllocateViaMMap(int64_t len,
uint8_t** buffer_mem) {
return Status::OK();
}
+Status SystemAllocator::AllocateViaMalloc(int64_t len, uint8_t** buffer_mem) {
+ bool use_huge_pages = len % HUGE_PAGE_SIZE == 0 && FLAGS_madvise_huge_pages;
+ // Allocate, aligned to the page size that we expect to back the memory
range.
+ // This ensures that it can be backed by a whole pages, rather than parts of
pages.
+ size_t alignment = use_huge_pages ? HUGE_PAGE_SIZE : SMALL_PAGE_SIZE;
+ int rc = posix_memalign(reinterpret_cast<void**>(buffer_mem), alignment,
len);
+ if (rc != 0) {
+ return Status(TErrorCode::BUFFER_ALLOCATION_FAILED, len,
+ Substitute("posix_memalign() failed to allocate buffer: $0",
GetStrErrMsg()));
+ }
+ if (use_huge_pages) {
+#ifdef MADV_HUGEPAGE
+ // According to madvise() docs it may return EAGAIN to signal that we
should retry.
+ do {
+ rc = madvise(*buffer_mem, len, MADV_HUGEPAGE);
+ } while (rc == -1 && errno == EAGAIN);
+ DCHECK(rc == 0) << "madvise(MADV_HUGEPAGE) shouldn't fail" << errno;
+#endif
+ }
+ return Status::OK();
+}
+
void SystemAllocator::Free(BufferPool::BufferHandle&& buffer) {
if (FLAGS_mmap_buffers) {
int rc = munmap(buffer.data(), buffer.len());
DCHECK_EQ(rc, 0) << "Unexpected munmap() error: " << errno;
} else {
+ bool use_huge_pages = buffer.len() % HUGE_PAGE_SIZE == 0 &&
FLAGS_madvise_huge_pages;
+ if (use_huge_pages) {
+ // Undo the madvise so that is isn't a candidate to be newly backed by
huge pages.
+ // We depend on TCMalloc's "aggressive decommit" mode decommitting the
physical
+ // huge pages with madvise(DONTNEED) when we call free(). Otherwise,
this huge
+ // page region may be divvied up and subsequently decommitted in smaller
chunks,
+ // which may not actually release the physical memory, causing Impala
physical
+ // memory usage to exceed the process limit.
+#ifdef MADV_NOHUGEPAGE
+ // According to madvise() docs it may return EAGAIN to signal that we
should retry.
+ int rc;
+ do {
+ rc = madvise(buffer.data(), buffer.len(), MADV_NOHUGEPAGE);
+ } while (rc == -1 && errno == EAGAIN);
+ DCHECK(rc == 0) << "madvise(MADV_NOHUGEPAGE) shouldn't fail" << errno;
+#endif
+ }
free(buffer.data());
}
buffer.Reset(); // Avoid DCHECK in ~BufferHandle().
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/58b206ff/be/src/runtime/bufferpool/system-allocator.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/bufferpool/system-allocator.h
b/be/src/runtime/bufferpool/system-allocator.h
index 33ad525..d57b8df 100644
--- a/be/src/runtime/bufferpool/system-allocator.h
+++ b/be/src/runtime/bufferpool/system-allocator.h
@@ -43,6 +43,9 @@ class SystemAllocator {
/// Allocate 'len' bytes of memory for a buffer via mmap().
Status AllocateViaMMap(int64_t len, uint8_t** buffer_mem);
+ /// Allocate 'len' bytes of memory for a buffer via our malloc
implementation.
+ Status AllocateViaMalloc(int64_t len, uint8_t** buffer_mem);
+
const int64_t min_buffer_len_;
};
}
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/58b206ff/be/src/runtime/row-batch.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/row-batch.cc b/be/src/runtime/row-batch.cc
index 8dfc4ba..8c4ab54 100644
--- a/be/src/runtime/row-batch.cc
+++ b/be/src/runtime/row-batch.cc
@@ -310,8 +310,8 @@ void RowBatch::AddBlock(BufferedBlockMgr::Block* block,
FlushMode flush) {
if (flush == FlushMode::FLUSH_RESOURCES) MarkFlushResources();
}
-void RowBatch::AddBuffer(
- BufferPool::ClientHandle* client, BufferPool::BufferHandle buffer,
FlushMode flush) {
+void RowBatch::AddBuffer(BufferPool::ClientHandle* client,
+ BufferPool::BufferHandle&& buffer, FlushMode flush) {
auxiliary_mem_usage_ += buffer.len();
BufferInfo buffer_info;
buffer_info.client = client;
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/58b206ff/be/src/runtime/row-batch.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/row-batch.h b/be/src/runtime/row-batch.h
index 0bb71d8..cc41adb 100644
--- a/be/src/runtime/row-batch.h
+++ b/be/src/runtime/row-batch.h
@@ -230,8 +230,8 @@ class RowBatch {
/// for further explanation).
/// TODO: IMPALA-4179: after IMPALA-3200, simplify the ownership transfer
model and
/// make it consistent between buffers and I/O buffers.
- void AddBuffer(
- BufferPool::ClientHandle* client, BufferPool::BufferHandle buffer,
FlushMode flush);
+ void AddBuffer(BufferPool::ClientHandle* client, BufferPool::BufferHandle&&
buffer,
+ FlushMode flush);
/// Used by an operator to indicate that it cannot produce more rows until
the
/// resources that it has attached to the row batch are freed or acquired by
an