This is an automated email from the ASF dual-hosted git repository.
alexey pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git
The following commit(s) were added to refs/heads/master by this push:
new e54c80cf9 KUDU-3562 fix integer overflow in available space metrics
e54c80cf9 is described below
commit e54c80cf9ba129b943eed6af3266899753b1b4ac
Author: Alexey Serbin <[email protected]>
AuthorDate: Sun Mar 31 12:48:27 2024 -0700
KUDU-3562 fix integer overflow in available space metrics
This patch addresses KUDU-3562. The problem was in implicit casting
of 64-bit integers down to 32-bit ones when introducing a helper
variable in CalculateAvailableSpace(). A new test scenario is added
to cover the fixed issue and catch regressions in future, if any.
In addition, this patch corrects the type of the related metrics in
disk_failure-itest.cc. It also clarifies on the description and tags
of the --fs_{data_dirs,wal_dir}_available_space_cache_seconds flags.
Change-Id: I974aea822626e4648886388c0de3741ac459f2ec
Reviewed-on: http://gerrit.cloudera.org:8080/21227
Reviewed-by: Abhishek Chennaka <[email protected]>
Tested-by: Abhishek Chennaka <[email protected]>
Reviewed-by: Mahesh Reddy <[email protected]>
---
src/kudu/fs/data_dirs.cc | 8 +-
src/kudu/integration-tests/disk_failure-itest.cc | 8 +-
.../integration-tests/disk_reservation-itest.cc | 92 +++++++++++++++++++++-
src/kudu/server/server_base.cc | 15 ++--
4 files changed, 109 insertions(+), 14 deletions(-)
diff --git a/src/kudu/fs/data_dirs.cc b/src/kudu/fs/data_dirs.cc
index 2198a6c9b..59584237f 100644
--- a/src/kudu/fs/data_dirs.cc
+++ b/src/kudu/fs/data_dirs.cc
@@ -78,18 +78,22 @@ TAG_FLAG(fs_data_dirs_reserved_bytes, runtime);
TAG_FLAG(fs_data_dirs_reserved_bytes, evolving);
DEFINE_int32(fs_data_dirs_available_space_cache_seconds, 10,
- "Number of seconds we cache the available disk space in the block
manager.");
+ "TTL for the cached metric of the available disk space "
+ "in the data directories, in seconds");
DEFINE_validator(fs_data_dirs_available_space_cache_seconds,
[](const char* /*n*/, int32_t v) { return v >= 0; });
TAG_FLAG(fs_data_dirs_available_space_cache_seconds, advanced);
TAG_FLAG(fs_data_dirs_available_space_cache_seconds, evolving);
+TAG_FLAG(fs_data_dirs_available_space_cache_seconds, runtime);
DEFINE_int32(fs_wal_dir_available_space_cache_seconds, 10,
- "Number of seconds we cache the available disk space the WAL
directory.");
+ "TTL for the cached metric of the available disk space "
+ "in the WAL directories, in seconds");
DEFINE_validator(fs_wal_dir_available_space_cache_seconds,
[](const char* /*n*/, int32_t v) { return v >= 0; });
TAG_FLAG(fs_wal_dir_available_space_cache_seconds, advanced);
TAG_FLAG(fs_wal_dir_available_space_cache_seconds, evolving);
+TAG_FLAG(fs_wal_dir_available_space_cache_seconds, runtime);
DEFINE_bool(fs_lock_data_dirs, true,
"Lock the data directories to prevent concurrent usage. "
diff --git a/src/kudu/integration-tests/disk_failure-itest.cc
b/src/kudu/integration-tests/disk_failure-itest.cc
index c2e677068..416d1996b 100644
--- a/src/kudu/integration-tests/disk_failure-itest.cc
+++ b/src/kudu/integration-tests/disk_failure-itest.cc
@@ -57,8 +57,8 @@ METRIC_DECLARE_gauge_int32(num_raft_leaders);
METRIC_DECLARE_gauge_size(num_rowsets_on_disk);
METRIC_DECLARE_gauge_uint64(data_dirs_failed);
METRIC_DECLARE_gauge_uint32(tablets_num_failed);
-METRIC_DECLARE_gauge_uint64(wal_dir_space_available_bytes);
-METRIC_DECLARE_gauge_uint64(data_dirs_space_available_bytes);
+METRIC_DECLARE_gauge_int64(wal_dir_space_available_bytes);
+METRIC_DECLARE_gauge_int64(data_dirs_space_available_bytes);
using kudu::client::sp::shared_ptr;
using kudu::client::KuduClient;
@@ -321,7 +321,7 @@ TEST_P(TabletServerDiskErrorITest, TestFailOnBootstrap) {
// Wait for the cluster to return to a healthy state.
ClusterVerifier v(cluster_.get());
NO_FATALS(v.CheckCluster());
-};
+}
TEST_P(TabletServerDiskErrorITest, TestSpaceAvailableMetrics) {
// Get the wal_dir_space_available_bytes, data_dirs_space_available_bytes
and make sure
@@ -379,7 +379,7 @@ TEST_P(TabletServerDiskErrorITest,
TestSpaceAvailableMetrics) {
ASSERT_OK(get_metrics(&wal_dir_space, &data_dir_space));
ASSERT_NE(wal_dir_space, -1);
ASSERT_EQ(data_dir_space, -1);
-};
+}
TEST_P(TabletServerDiskErrorITest, TestFailDuringScanWorkload) {
// Make one server to be more likely to host leader replicas: its Raft
diff --git a/src/kudu/integration-tests/disk_reservation-itest.cc
b/src/kudu/integration-tests/disk_reservation-itest.cc
index 7bc29f050..519a2dfa7 100644
--- a/src/kudu/integration-tests/disk_reservation-itest.cc
+++ b/src/kudu/integration-tests/disk_reservation-itest.cc
@@ -33,21 +33,24 @@
#include "kudu/mini-cluster/external_mini_cluster.h"
#include "kudu/util/metrics.h"
#include "kudu/util/monotime.h"
+#include "kudu/util/net/net_util.h"
#include "kudu/util/status.h"
#include "kudu/util/test_macros.h"
#include "kudu/util/test_util.h"
+using kudu::cluster::ExternalMiniClusterOptions;
+using kudu::itest::GetInt64Metric;
using std::string;
using std::vector;
using strings::Substitute;
METRIC_DECLARE_entity(server);
+METRIC_DECLARE_gauge_int64(wal_dir_space_available_bytes);
+METRIC_DECLARE_gauge_int64(data_dirs_space_available_bytes);
METRIC_DECLARE_gauge_uint64(data_dirs_full);
namespace kudu {
-using cluster::ExternalMiniClusterOptions;
-
class DiskReservationITest : public ExternalMiniClusterITestBase {
};
@@ -161,4 +164,89 @@ TEST_F(DiskReservationITest, TestWalWriteToFullDiskAborts)
{
workload.StopAndJoin();
}
+// Make sure the metrics on the available space in the WAL and the data
+// directories behave consistently with and without space reservations.
+TEST_F(DiskReservationITest, AvailableSpaceMetrics) {
+ constexpr const char* const kMetricValue = "value";
+
+ // To speed up the test, do not cache the metrics on the available disk
space.
+ const vector<string> ts_flags = {
+ "--fs_data_dirs_available_space_cache_seconds=0",
+ "--fs_wal_dir_available_space_cache_seconds=0",
+ };
+ NO_FATALS(StartCluster(ts_flags, {}, 1));
+
+ auto* ts = cluster_->tablet_server(0);
+ DCHECK_NE(nullptr, ts);
+ const auto& addr = ts->bound_http_hostport();
+
+ auto space_getter_data_dirs = [&](int64_t* available_bytes) {
+ return GetInt64Metric(addr,
+ &METRIC_ENTITY_server,
+ nullptr,
+ &METRIC_data_dirs_space_available_bytes,
+ kMetricValue,
+ available_bytes);
+ };
+ auto space_getter_wal_dir = [&](int64_t* available_bytes) {
+ return GetInt64Metric(addr,
+ &METRIC_ENTITY_server,
+ nullptr,
+ &METRIC_wal_dir_space_available_bytes,
+ kMetricValue,
+ available_bytes);
+ };
+
+ constexpr const char* const kFlagDataReserved =
"fs_data_dirs_reserved_bytes";
+ constexpr const char* const kFlagWalReserved = "fs_wal_dir_reserved_bytes";
+
+ // Make sure metrics capture non-negative numbers if the space reservations
+ // use their default settings: 1% of the available disk space for the WAL
+ // and the data directories.
+ {
+ int64_t data_dirs_space_bytes = -1;
+ ASSERT_OK(space_getter_data_dirs(&data_dirs_space_bytes));
+ ASSERT_GE(data_dirs_space_bytes, 0);
+
+ int64_t wal_dir_space_bytes = -1;
+ ASSERT_OK(space_getter_wal_dir(&wal_dir_space_bytes));
+ ASSERT_GE(wal_dir_space_bytes, 0);
+ }
+
+ // Set space reservation to 0 bytes.
+ ASSERT_OK(cluster_->SetFlag(ts, kFlagDataReserved, "0"));
+ ASSERT_OK(cluster_->SetFlag(ts, kFlagWalReserved, "0"));
+
+ int64_t data_dirs_space_bytes_no_reserve = -1;
+ ASSERT_OK(space_getter_data_dirs(&data_dirs_space_bytes_no_reserve));
+ ASSERT_GE(data_dirs_space_bytes_no_reserve, 0);
+
+ int64_t wal_dir_space_bytes_no_reserve = -1;
+ ASSERT_OK(space_getter_wal_dir(&wal_dir_space_bytes_no_reserve));
+ ASSERT_GE(wal_dir_space_bytes_no_reserve, 0);
+
+ // Set space reservation to 1027 GiB: this is to test for integer overflow
+ // in the related code. Essentially, it would be enough to set the
reservation
+ // to anything beyond 2 GiB. However, to make this test scenario stable upon
+ // concurrent activity at the test node, it's necessary to have a wide margin
+ // for the comparison between the available space metrics for data/WAL
+ // directories with&without space reservation.
+ constexpr const char* const k1027GiB = "1102732853248";
+ ASSERT_OK(cluster_->SetFlag(ts, kFlagDataReserved, k1027GiB));
+ ASSERT_OK(cluster_->SetFlag(ts, kFlagWalReserved, k1027GiB));
+
+ int64_t data_dirs_space_bytes_reserve = -1;
+ ASSERT_OK(space_getter_data_dirs(&data_dirs_space_bytes_reserve));
+ ASSERT_GE(data_dirs_space_bytes_reserve, 0);
+
+ int64_t wal_dir_space_bytes_reserve = -1;
+ ASSERT_OK(space_getter_wal_dir(&wal_dir_space_bytes_reserve));
+ ASSERT_GE(wal_dir_space_bytes_reserve, 0);
+
+ // The available space without reservation should not be less than
+ // the available space with reservation.
+ ASSERT_GE(data_dirs_space_bytes_no_reserve, data_dirs_space_bytes_reserve);
+ ASSERT_GE(wal_dir_space_bytes_no_reserve, wal_dir_space_bytes_reserve);
+}
+
} // namespace kudu
diff --git a/src/kudu/server/server_base.cc b/src/kudu/server/server_base.cc
index 3ba74867e..f2f0568b6 100644
--- a/src/kudu/server/server_base.cc
+++ b/src/kudu/server/server_base.cc
@@ -51,6 +51,7 @@
#include "kudu/fs/fs_report.h"
#include "kudu/gutil/integral_types.h"
#include "kudu/gutil/map-util.h"
+#include "kudu/gutil/port.h"
#include "kudu/gutil/strings/numbers.h"
#include "kudu/gutil/strings/split.h"
#include "kudu/gutil/strings/strcat.h"
@@ -610,15 +611,17 @@ shared_ptr<MemTracker> CreateMemTrackerForServer() {
// Calculates the free space on the given WAL/data directory's disk. Returns
-1 in case of disk
// failure.
-inline int64_t CalculateAvailableSpace(const ServerBaseOptions& options, const
string& dir,
- int64_t flag_reserved_bytes, SpaceInfo*
space_info) {
- if (!options.env->GetSpaceInfo(dir, space_info).ok()) {
+int64_t CalculateAvailableSpace(const ServerBaseOptions& options,
+ const string& dir,
+ int64_t flag_reserved_bytes,
+ SpaceInfo* space_info) {
+ if (PREDICT_FALSE(!options.env->GetSpaceInfo(dir, space_info).ok())) {
return -1;
}
- bool should_reserve_one_percent = flag_reserved_bytes == -1;
- int reserved_bytes = should_reserve_one_percent ?
+ const bool should_reserve_one_percent = flag_reserved_bytes == -1;
+ int64_t reserved_bytes = should_reserve_one_percent ?
space_info->capacity_bytes / 100 : flag_reserved_bytes;
- return std::max(static_cast<int64_t>(0), space_info->free_bytes -
reserved_bytes);
+ return std::max<int64_t>(0, space_info->free_bytes - reserved_bytes);
}
int64_t GetFileCacheCapacity(Env* env) {