This is an automated email from the ASF dual-hosted git repository.
alexey pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git
The following commit(s) were added to refs/heads/master by this push:
new f26fa17 [tools] add --row_count_only and --report_scanner_stats flags
f26fa17 is described below
commit f26fa17968f4b323a42b61ebeace5be9e12057c0
Author: Alexey Serbin <[email protected]>
AuthorDate: Tue Jun 15 16:59:42 2021 -0700
[tools] add --row_count_only and --report_scanner_stats flags
This patch updates `kudu table scan` and `kudu perf table_scan` CLI
tools to introduce two new flags:
* --row_count_only
* --report_scanner_stats
The motivation for this patch is the necessity to:
* have an explicit option to run something similar to
SELECT COUNT(1) FROM <table_name> when using `kudu perf table_scan`
and `kudu table scan` CLI tools
* report on the scanner metrics while running a query like
SELECT COUNT(1) FROM <table_name>
I also added a few generic test scenarios to cover the newly introduced
functionality.
Another notable change in behavior is not polluting the output of the
table rows with scanner stats by default: only the summary is printed
at the very end unless --report_scanner_stats is specified (of course,
scanner stats have been enriched with this patch as well).
A few examples of the output from the updated CLI tools:
kudu table scan 10.17.240.17:9876 \
default.loadgen_auto_d61d513aeec4478d9c4638e434565dee \
--row_count_only
Total count 3000000 cost 0.16833 seconds
kudu perf table_scan 10.17.240.17:9876 \
default.loadgen_auto_d61d513aeec4478d9c4638e434565dee \
--tablets=e1195e0e0e6842b3a9c19ef6de696440 \
--row_count_only \
--report_scanner_stats
T e1195e0e0e6842b3a9c19ef6de696440 scanned 93753 rows in 0.050205518
seconds
NextBatch() calls 1
bytes_read 0
cfile_cache_hit_bytes 0
cfile_cache_miss_bytes 0
cpu_system_nanos 1000000
cpu_user_nanos 38994000
queue_duration_nanos 131807
total_duration_nanos 41545038
Total count 93753 cost 0.051109 seconds
kudu perf table_scan 10.17.240.17:9876 \
default.loadgen_auto_d61d513aeec4478d9c4638e434565dee \
--tablets=e1195e0e0e6842b3a9c19ef6de696440 \
--columns=* \
--report_scanner_stats
T e1195e0e0e6842b3a9c19ef6de696440 scanned 93753 rows in 0.084285734
seconds
NextBatch() calls 6
bytes_read 3262005
cfile_cache_hit_bytes 0
cfile_cache_miss_bytes 3524539
cpu_system_nanos 21998000
cpu_user_nanos 164975000
queue_duration_nanos 339230
total_duration_nanos 51007394
Total count 93753 cost 0.0854584 seconds
Change-Id: Ia64e1a0b26996f0087d2473d15350d590581a69c
Reviewed-on: http://gerrit.cloudera.org:8080/17598
Reviewed-by: Andrew Wong <[email protected]>
Tested-by: Kudu Jenkins
---
src/kudu/tools/kudu-tool-test.cc | 90 +++++++++++++++++++++++++++++++++++-
src/kudu/tools/table_scanner.cc | 37 ++++++++++-----
src/kudu/tools/tool_action_common.cc | 23 +++++++++
src/kudu/tools/tool_action_perf.cc | 2 +
src/kudu/tools/tool_action_table.cc | 8 +++-
5 files changed, 146 insertions(+), 14 deletions(-)
diff --git a/src/kudu/tools/kudu-tool-test.cc b/src/kudu/tools/kudu-tool-test.cc
index ff4f8d9..45ef158 100644
--- a/src/kudu/tools/kudu-tool-test.cc
+++ b/src/kudu/tools/kudu-tool-test.cc
@@ -15,12 +15,12 @@
// specific language governing permissions and limitations
// under the License.
-#include <stdlib.h>
#include <sys/stat.h>
#include <algorithm>
#include <cstdint>
#include <cstdio>
+#include <cstdlib>
#include <fstream>
#include <functional>
#include <initializer_list>
@@ -612,7 +612,7 @@ class ToolTest : public KuduTest {
kDstTableName, args.columns), &dst_lines));
if (args.mode == TableCopyMode::COPY_SCHEMA_ONLY) {
- ASSERT_GT(dst_lines.size(), 1);
+ ASSERT_GE(dst_lines.size(), 1);
ASSERT_STR_CONTAINS(*dst_lines.rbegin(), "Total count 0 ");
} else {
// Rows scanned from source table can be found in destination table.
@@ -2827,6 +2827,49 @@ TEST_F(ToolTest, TestPerfTableScan) {
NO_FATALS(RunScanTableCheck(kTableName, "", 1, 2000, {}, "perf table_scan"));
}
+TEST_F(ToolTest, PerfTableScanCountOnly) {
+ constexpr const char* const kTableName = "perf.table_scan.row_count_only";
+ // Be specific about the number of threads even if it matches the default
+ // value for the --num_threads flag. This is to be explicit about the
expected
+ // number of rows written into the table.
+ NO_FATALS(RunLoadgen(1,
+ {
+ "--num_threads=2",
+ "--num_rows_per_thread=1234",
+ },
+ kTableName));
+
+ // Run table_scan with --row_count_only option
+ {
+ string out;
+ string err;
+ vector<string> out_lines;
+ const auto s = RunTool(
+ Substitute("perf table_scan $0 $1 --row_count_only",
+ cluster_->master()->bound_rpc_addr().ToString(),
kTableName),
+ &out, &err, &out_lines);
+ ASSERT_TRUE(s.ok()) << s.ToString() << ": " << err;
+ ASSERT_EQ(1, out_lines.size()) << out;
+ ASSERT_STR_CONTAINS(out, "Total count 2468 ");
+ }
+
+ // Add the --report_scanner_stats flag as well.
+ {
+ string out;
+ string err;
+ const auto s = RunTool(Substitute(
+ "perf table_scan $0 $1 --row_count_only --report_scanner_stats",
+ cluster_->master()->bound_rpc_addr().ToString(), kTableName),
+ &out, &err);
+ ASSERT_TRUE(s.ok()) << s.ToString() << ": " << err;
+ ASSERT_STR_CONTAINS(out, "bytes_read 0");
+ ASSERT_STR_CONTAINS(out, "cfile_cache_hit_bytes 0");
+ ASSERT_STR_CONTAINS(out, "cfile_cache_miss_bytes 0");
+ ASSERT_STR_CONTAINS(out, "total_duration_nanos ");
+ ASSERT_STR_CONTAINS(out, "Total count 2468 ");
+ }
+}
+
TEST_F(ToolTest, TestPerfTabletScan) {
// Create a table.
constexpr const char* const kTableName = "perf.tablet_scan";
@@ -3858,6 +3901,49 @@ TEST_F(ToolTest, TestScanTableMultiPredicates) {
ASSERT_LE(lines.size(), mid);
}
+TEST_F(ToolTest, TableScanRowCountOnly) {
+ constexpr const char* const kTableName = "kudu.table.scan.row_count_only";
+ // Be specific about the number of threads even if it matches the default
+ // value for the --num_threads flag. This is to be explicit about the
expected
+ // number of rows written into the table.
+ NO_FATALS(RunLoadgen(1,
+ {
+ "--num_threads=2",
+ "--num_rows_per_thread=1234",
+ },
+ kTableName));
+
+ // Run table_scan with --row_count_only option
+ {
+ string out;
+ string err;
+ vector<string> out_lines;
+ const auto s = RunTool(
+ Substitute("table scan $0 $1 --row_count_only",
+ cluster_->master()->bound_rpc_addr().ToString(),
kTableName),
+ &out, &err, &out_lines);
+ ASSERT_TRUE(s.ok()) << s.ToString() << ": " << err;
+ ASSERT_EQ(1, out_lines.size()) << out;
+ ASSERT_STR_CONTAINS(out, "Total count 2468 ");
+ }
+
+ // Add the --report_scanner_stats flag as well.
+ {
+ string out;
+ string err;
+ const auto s = RunTool(
+ Substitute("table scan $0 $1 --row_count_only --report_scanner_stats",
+ cluster_->master()->bound_rpc_addr().ToString(),
kTableName),
+ &out, &err);
+ ASSERT_TRUE(s.ok()) << s.ToString() << ": " << err;
+ ASSERT_STR_CONTAINS(out, "bytes_read 0");
+ ASSERT_STR_CONTAINS(out, "cfile_cache_hit_bytes 0");
+ ASSERT_STR_CONTAINS(out, "cfile_cache_miss_bytes 0");
+ ASSERT_STR_CONTAINS(out, "total_duration_nanos ");
+ ASSERT_STR_CONTAINS(out, "Total count 2468 ");
+ }
+}
+
TEST_P(ToolTestCopyTableParameterized, TestCopyTable) {
for (const auto& arg : GenerateArgs()) {
NO_FATALS(RunCopyTableCheck(arg));
diff --git a/src/kudu/tools/table_scanner.cc b/src/kudu/tools/table_scanner.cc
index f37315e..48d2a8d 100644
--- a/src/kudu/tools/table_scanner.cc
+++ b/src/kudu/tools/table_scanner.cc
@@ -22,6 +22,7 @@
#include <cstdint>
#include <cstring>
#include <initializer_list>
+#include <iomanip>
#include <iostream>
#include <iterator>
#include <map>
@@ -34,6 +35,7 @@
#include <rapidjson/document.h>
#include "kudu/client/client.h"
+#include "kudu/client/resource_metrics.h"
#include "kudu/client/scan_batch.h"
#include "kudu/client/scan_predicate.h"
#include "kudu/client/schema.h"
@@ -77,7 +79,9 @@ using std::endl;
using std::map;
using std::ostream;
using std::ostringstream;
+using std::right;
using std::set;
+using std::setw;
using std::string;
using std::unique_ptr;
using std::vector;
@@ -106,6 +110,8 @@ DEFINE_string(predicates, "",
"For example,\n"
R"*( ["AND", [">=", "col1", "value"], ["NOTNULL",
"col2"]])*""\n"
"The only supported predicate operator is `AND`.");
+DEFINE_bool(report_scanner_stats, false,
+ "Whether to report scanner statistics");
DEFINE_bool(show_values, false,
"Whether to show values of scanned rows.");
DEFINE_string(write_type, "insert",
@@ -479,27 +485,34 @@ Status TableScanner::ScanData(const
std::vector<kudu::client::KuduScanToken*>& t
RETURN_NOT_OK(scanner->Open());
uint64_t count = 0;
+ size_t next_batch_calls = 0;
while (scanner->HasMoreRows()) {
KuduScanBatch batch;
RETURN_NOT_OK(scanner->NextBatch(&batch));
count += batch.NumRows();
total_count_ += batch.NumRows();
+ ++next_batch_calls;
cb(batch);
}
-
sw.stop();
- if (out_) {
+
+ if (FLAGS_report_scanner_stats && out_) {
+ auto& out = *out_;
MutexLock l(output_lock_);
- *out_ << "T " << token->tablet().id() << " scanned count " << count
- << " cost " << sw.elapsed().wall_seconds() << " seconds" << endl;
+ out << Substitute("T $0 scanned $1 rows in $2 seconds\n",
+ token->tablet().id(), count,
sw.elapsed().wall_seconds());
+ const auto& metrics = scanner->GetResourceMetrics();
+ out << setw(32) << right << "NextBatch() calls"
+ << setw(16) << right << next_batch_calls << endl;
+ for (const auto& [k, v] : metrics.Get()) {
+ out << setw(32) << right << k << setw(16) << right << v << endl;
+ }
}
}
-
return Status::OK();
-
}
-void TableScanner::ScanTask(const vector<KuduScanToken *>& tokens, Status*
thread_status) {
+void TableScanner::ScanTask(const vector<KuduScanToken*>& tokens, Status*
thread_status) {
*thread_status = ScanData(tokens, [&](const KuduScanBatch& batch) {
if (out_ && FLAGS_show_values) {
MutexLock l(output_lock_);
@@ -564,10 +577,12 @@ Status TableScanner::StartWork(WorkType type) {
// Set projection if needed.
if (type == WorkType::kScan) {
- bool project_all = FLAGS_columns == "*" ||
- (FLAGS_show_values && FLAGS_columns.empty());
- if (!project_all) {
- vector<string> projected_column_names = Split(FLAGS_columns, ",",
strings::SkipEmpty());
+ const auto project_all = FLAGS_columns == "*" || FLAGS_columns.empty();
+ if (!project_all || FLAGS_row_count_only) {
+ vector<string> projected_column_names;
+ if (!FLAGS_row_count_only && !FLAGS_columns.empty()) {
+ projected_column_names = Split(FLAGS_columns, ",",
strings::SkipEmpty());
+ }
RETURN_NOT_OK(builder.SetProjectedColumnNames(projected_column_names));
}
}
diff --git a/src/kudu/tools/tool_action_common.cc
b/src/kudu/tools/tool_action_common.cc
index 4d9fdfd..332c464 100644
--- a/src/kudu/tools/tool_action_common.cc
+++ b/src/kudu/tools/tool_action_common.cc
@@ -151,6 +151,12 @@ DEFINE_string(sasl_protocol_name,
"servers' service principal name base (e.g. if it's
\"kudu/_HOST\", then "
"sasl_protocol_name must be \"kudu\" to be able to connect.");
+DEFINE_bool(row_count_only, false,
+ "Whether to only count rows instead of reading row cells: yields "
+ "an empty projection for the table");
+
+DECLARE_bool(show_values);
+
bool ValidateTimeoutSettings() {
if (FLAGS_timeout_ms < FLAGS_negotiation_timeout_ms) {
LOG(ERROR) << strings::Substitute(
@@ -164,6 +170,23 @@ bool ValidateTimeoutSettings() {
}
GROUP_FLAG_VALIDATOR(timeout_flags, ValidateTimeoutSettings);
+bool ValidateSchemaProjectionFlags() {
+ if (FLAGS_row_count_only && !FLAGS_columns.empty()) {
+ LOG(ERROR) <<
+ "--row_count_only and --columns flags are conflicting: "
+ "either remove/unset --columns or remove/unset --row_count_only";
+ return false;
+ }
+ if (FLAGS_row_count_only && FLAGS_show_values) {
+ LOG(ERROR) <<
+ "--row_count_only and --show_values flags are conflicting: either "
+ "remove/unset --show_values or remove/unset --row_count_only";
+ return false;
+ }
+ return true;
+}
+GROUP_FLAG_VALIDATOR(schema_projection_flags, ValidateSchemaProjectionFlags);
+
using kudu::client::KuduClient;
using kudu::client::KuduClientBuilder;
using kudu::client::internal::AsyncLeaderMasterRpc;
diff --git a/src/kudu/tools/tool_action_perf.cc
b/src/kudu/tools/tool_action_perf.cc
index 151f11b..c333ab4 100644
--- a/src/kudu/tools/tool_action_perf.cc
+++ b/src/kudu/tools/tool_action_perf.cc
@@ -1052,6 +1052,8 @@ unique_ptr<Mode> BuildPerfMode() {
"or whether there is a long latency tail when scanning different
tables.")
.AddRequiredParameter({ kTableNameArg, "Name of the table to scan"})
.AddOptionalParameter("columns")
+ .AddOptionalParameter("row_count_only")
+ .AddOptionalParameter("report_scanner_stats")
.AddOptionalParameter("fill_cache")
.AddOptionalParameter("num_threads")
.AddOptionalParameter("predicates")
diff --git a/src/kudu/tools/tool_action_table.cc
b/src/kudu/tools/tool_action_table.cc
index d5da37f..005157d 100644
--- a/src/kudu/tools/tool_action_table.cc
+++ b/src/kudu/tools/tool_action_table.cc
@@ -115,6 +115,8 @@ DEFINE_string(upper_bound_type, "EXCLUSIVE_BOUND",
"The type of the upper bound, either inclusive or exclusive. "
"Defaults to exclusive. This flag is case-insensitive.");
+DECLARE_bool(row_count_only);
+DECLARE_bool(show_scanner_stats);
DECLARE_bool(show_values);
DECLARE_string(tables);
@@ -482,7 +484,9 @@ Status ScanTable(const RunnerContext &context) {
const string& table_name = FindOrDie(context.required_args, kTableNameArg);
- FLAGS_show_values = true;
+ if (!FLAGS_row_count_only) {
+ FLAGS_show_values = true;
+ }
TableScanner scanner(client, table_name);
scanner.SetOutput(&cout);
return scanner.StartScan();
@@ -1292,6 +1296,8 @@ unique_ptr<Mode> BuildTableMode() {
"for the --predicates flag on how predicates can be
specified.")
.AddRequiredParameter({ kTableNameArg, "Name of the table to scan"})
.AddOptionalParameter("columns")
+ .AddOptionalParameter("row_count_only")
+ .AddOptionalParameter("report_scanner_stats")
.AddOptionalParameter("fill_cache")
.AddOptionalParameter("num_threads")
.AddOptionalParameter("predicates")