This is an automated email from the ASF dual-hosted git repository.

alexey pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git


The following commit(s) were added to refs/heads/master by this push:
     new f26fa17  [tools] add --row_count_only and --report_scanner_stats flags
f26fa17 is described below

commit f26fa17968f4b323a42b61ebeace5be9e12057c0
Author: Alexey Serbin <[email protected]>
AuthorDate: Tue Jun 15 16:59:42 2021 -0700

    [tools] add --row_count_only and --report_scanner_stats flags
    
    This patch updates `kudu table scan` and `kudu perf table_scan` CLI
    tools to introduce two new flags:
      * --row_count_only
      * --report_scanner_stats
    
    The motivation for this patch is the necessity to:
      * have an explicit option to run something similar to
        SELECT COUNT(1) FROM <table_name> when using `kudu perf table_scan`
        and `kudu table scan` CLI tools
      * report on the scanner metrics while running a query like
        SELECT COUNT(1) FROM <table_name>
    
    I also added a few generic test scenarios to cover the newly introduced
    functionality.
    
    Another notable change in behavior is not polluting the output of the
    table rows with scanner stats by default: only the summary is printed
    in the very end unless --report_scanner_stats is specified (of course,
    scanner stats have been enriched with this patch as well).
    
    A few examples of the output from the updated CLI tools:
    
      kudu table scan 10.17.240.17:9876 \
          default.loadgen_auto_d61d513aeec4478d9c4638e434565dee \
          --row_count_only
      Total count 3000000 cost 0.16833 seconds
    
      kudu perf table_scan 10.17.240.17:9876 \
          default.loadgen_auto_d61d513aeec4478d9c4638e434565dee \
          --tablets=e1195e0e0e6842b3a9c19ef6de696440 \
          --row_count_only \
          --report_scanner_stats
      T e1195e0e0e6842b3a9c19ef6de696440 scanned 93753 rows in 0.050205518 seconds
                     NextBatch() calls               1
                            bytes_read               0
                 cfile_cache_hit_bytes               0
                cfile_cache_miss_bytes               0
                      cpu_system_nanos         1000000
                        cpu_user_nanos        38994000
                  queue_duration_nanos          131807
                  total_duration_nanos        41545038
      Total count 93753 cost 0.051109 seconds
    
      kudu perf table_scan 10.17.240.17:9876 \
          default.loadgen_auto_d61d513aeec4478d9c4638e434565dee \
          --tablets=e1195e0e0e6842b3a9c19ef6de696440 \
          --columns=* \
          --report_scanner_stats
      T e1195e0e0e6842b3a9c19ef6de696440 scanned 93753 rows in 0.084285734 seconds
                     NextBatch() calls               6
                            bytes_read         3262005
                 cfile_cache_hit_bytes               0
                cfile_cache_miss_bytes         3524539
                      cpu_system_nanos        21998000
                        cpu_user_nanos       164975000
                  queue_duration_nanos          339230
                  total_duration_nanos        51007394
      Total count 93753 cost 0.0854584 seconds
    
    Change-Id: Ia64e1a0b26996f0087d2473d15350d590581a69c
    Reviewed-on: http://gerrit.cloudera.org:8080/17598
    Reviewed-by: Andrew Wong <[email protected]>
    Tested-by: Kudu Jenkins
---
 src/kudu/tools/kudu-tool-test.cc     | 90 +++++++++++++++++++++++++++++++++++-
 src/kudu/tools/table_scanner.cc      | 37 ++++++++++-----
 src/kudu/tools/tool_action_common.cc | 23 +++++++++
 src/kudu/tools/tool_action_perf.cc   |  2 +
 src/kudu/tools/tool_action_table.cc  |  8 +++-
 5 files changed, 146 insertions(+), 14 deletions(-)

diff --git a/src/kudu/tools/kudu-tool-test.cc b/src/kudu/tools/kudu-tool-test.cc
index ff4f8d9..45ef158 100644
--- a/src/kudu/tools/kudu-tool-test.cc
+++ b/src/kudu/tools/kudu-tool-test.cc
@@ -15,12 +15,12 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#include <stdlib.h>
 #include <sys/stat.h>
 
 #include <algorithm>
 #include <cstdint>
 #include <cstdio>
+#include <cstdlib>
 #include <fstream>
 #include <functional>
 #include <initializer_list>
@@ -612,7 +612,7 @@ class ToolTest : public KuduTest {
                  kDstTableName, args.columns), &dst_lines));
 
     if (args.mode == TableCopyMode::COPY_SCHEMA_ONLY) {
-      ASSERT_GT(dst_lines.size(), 1);
+      ASSERT_GE(dst_lines.size(), 1);
       ASSERT_STR_CONTAINS(*dst_lines.rbegin(), "Total count 0 ");
     } else {
       // Rows scanned from source table can be found in destination table.
@@ -2827,6 +2827,49 @@ TEST_F(ToolTest, TestPerfTableScan) {
   NO_FATALS(RunScanTableCheck(kTableName, "", 1, 2000, {}, "perf table_scan"));
 }
 
+TEST_F(ToolTest, PerfTableScanCountOnly) {
+  constexpr const char* const kTableName = "perf.table_scan.row_count_only";
+  // Be specific about the number of threads even if it matches the default
+  // value for the --num_threads flag. This is to be explicit about the expected
+  // number of rows written into the table.
+  NO_FATALS(RunLoadgen(1,
+                       {
+                         "--num_threads=2",
+                         "--num_rows_per_thread=1234",
+                       },
+                       kTableName));
+
+  // Run table_scan with --row_count_only option
+  {
+    string out;
+    string err;
+    vector<string> out_lines;
+    const auto s = RunTool(
+        Substitute("perf table_scan $0 $1 --row_count_only",
+                   cluster_->master()->bound_rpc_addr().ToString(), kTableName),
+        &out, &err, &out_lines);
+    ASSERT_TRUE(s.ok()) << s.ToString() << ": " << err;
+    ASSERT_EQ(1, out_lines.size()) << out;
+    ASSERT_STR_CONTAINS(out, "Total count 2468 ");
+  }
+
+  // Add the --report_scanner_stats flag as well.
+  {
+    string out;
+    string err;
+    const auto s = RunTool(Substitute(
+        "perf table_scan $0 $1 --row_count_only --report_scanner_stats",
+        cluster_->master()->bound_rpc_addr().ToString(), kTableName),
+        &out, &err);
+    ASSERT_TRUE(s.ok()) << s.ToString() << ": " << err;
+    ASSERT_STR_CONTAINS(out, "bytes_read               0");
+    ASSERT_STR_CONTAINS(out, "cfile_cache_hit_bytes               0");
+    ASSERT_STR_CONTAINS(out, "cfile_cache_miss_bytes               0");
+    ASSERT_STR_CONTAINS(out, "total_duration_nanos ");
+    ASSERT_STR_CONTAINS(out, "Total count 2468 ");
+  }
+}
+
 TEST_F(ToolTest, TestPerfTabletScan) {
   // Create a table.
   constexpr const char* const kTableName = "perf.tablet_scan";
@@ -3858,6 +3901,49 @@ TEST_F(ToolTest, TestScanTableMultiPredicates) {
   ASSERT_LE(lines.size(), mid);
 }
 
+TEST_F(ToolTest, TableScanRowCountOnly) {
+  constexpr const char* const kTableName = "kudu.table.scan.row_count_only";
+  // Be specific about the number of threads even if it matches the default
+  // value for the --num_threads flag. This is to be explicit about the expected
+  // number of rows written into the table.
+  NO_FATALS(RunLoadgen(1,
+                       {
+                         "--num_threads=2",
+                         "--num_rows_per_thread=1234",
+                       },
+                       kTableName));
+
+  // Run table_scan with --row_count_only option
+  {
+    string out;
+    string err;
+    vector<string> out_lines;
+    const auto s = RunTool(
+        Substitute("table scan $0 $1 --row_count_only",
+                   cluster_->master()->bound_rpc_addr().ToString(), kTableName),
+        &out, &err, &out_lines);
+    ASSERT_TRUE(s.ok()) << s.ToString() << ": " << err;
+    ASSERT_EQ(1, out_lines.size()) << out;
+    ASSERT_STR_CONTAINS(out, "Total count 2468 ");
+  }
+
+  // Add the --report_scanner_stats flag as well.
+  {
+    string out;
+    string err;
+    const auto s = RunTool(
+        Substitute("table scan $0 $1 --row_count_only --report_scanner_stats",
+                   cluster_->master()->bound_rpc_addr().ToString(), kTableName),
+                   &out, &err);
+    ASSERT_TRUE(s.ok()) << s.ToString() << ": " << err;
+    ASSERT_STR_CONTAINS(out, "bytes_read               0");
+    ASSERT_STR_CONTAINS(out, "cfile_cache_hit_bytes               0");
+    ASSERT_STR_CONTAINS(out, "cfile_cache_miss_bytes               0");
+    ASSERT_STR_CONTAINS(out, "total_duration_nanos ");
+    ASSERT_STR_CONTAINS(out, "Total count 2468 ");
+  }
+}
+
 TEST_P(ToolTestCopyTableParameterized, TestCopyTable) {
   for (const auto& arg : GenerateArgs()) {
     NO_FATALS(RunCopyTableCheck(arg));
diff --git a/src/kudu/tools/table_scanner.cc b/src/kudu/tools/table_scanner.cc
index f37315e..48d2a8d 100644
--- a/src/kudu/tools/table_scanner.cc
+++ b/src/kudu/tools/table_scanner.cc
@@ -22,6 +22,7 @@
 #include <cstdint>
 #include <cstring>
 #include <initializer_list>
+#include <iomanip>
 #include <iostream>
 #include <iterator>
 #include <map>
@@ -34,6 +35,7 @@
 #include <rapidjson/document.h>
 
 #include "kudu/client/client.h"
+#include "kudu/client/resource_metrics.h"
 #include "kudu/client/scan_batch.h"
 #include "kudu/client/scan_predicate.h"
 #include "kudu/client/schema.h"
@@ -77,7 +79,9 @@ using std::endl;
 using std::map;
 using std::ostream;
 using std::ostringstream;
+using std::right;
 using std::set;
+using std::setw;
 using std::string;
 using std::unique_ptr;
 using std::vector;
@@ -106,6 +110,8 @@ DEFINE_string(predicates, "",
               "For example,\n"
               R"*(   ["AND", [">=", "col1", "value"], ["NOTNULL", "col2"]])*""\n"
               "The only supported predicate operator is `AND`.");
+DEFINE_bool(report_scanner_stats, false,
+            "Whether to report scanner statistics");
 DEFINE_bool(show_values, false,
             "Whether to show values of scanned rows.");
 DEFINE_string(write_type, "insert",
@@ -479,27 +485,34 @@ Status TableScanner::ScanData(const std::vector<kudu::client::KuduScanToken*>& t
     RETURN_NOT_OK(scanner->Open());
 
     uint64_t count = 0;
+    size_t next_batch_calls = 0;
     while (scanner->HasMoreRows()) {
       KuduScanBatch batch;
       RETURN_NOT_OK(scanner->NextBatch(&batch));
       count += batch.NumRows();
       total_count_ += batch.NumRows();
+      ++next_batch_calls;
       cb(batch);
     }
-
     sw.stop();
-    if (out_) {
+
+    if (FLAGS_report_scanner_stats && out_) {
+      auto& out = *out_;
       MutexLock l(output_lock_);
-      *out_ << "T " << token->tablet().id() << " scanned count " << count
-           << " cost " << sw.elapsed().wall_seconds() << " seconds" << endl;
+      out << Substitute("T $0 scanned $1 rows in $2 seconds\n",
+                        token->tablet().id(), count, sw.elapsed().wall_seconds());
+      const auto& metrics = scanner->GetResourceMetrics();
+      out << setw(32) << right << "NextBatch() calls"
+          << setw(16) << right << next_batch_calls << endl;
+      for (const auto& [k, v] : metrics.Get()) {
+        out << setw(32) << right << k << setw(16) << right << v << endl;
+      }
     }
   }
-
   return Status::OK();
-
 }
 
-void TableScanner::ScanTask(const vector<KuduScanToken *>& tokens, Status* thread_status) {
+void TableScanner::ScanTask(const vector<KuduScanToken*>& tokens, Status* thread_status) {
   *thread_status = ScanData(tokens, [&](const KuduScanBatch& batch) {
     if (out_ && FLAGS_show_values) {
       MutexLock l(output_lock_);
@@ -564,10 +577,12 @@ Status TableScanner::StartWork(WorkType type) {
 
   // Set projection if needed.
   if (type == WorkType::kScan) {
-    bool project_all = FLAGS_columns == "*" ||
-                       (FLAGS_show_values && FLAGS_columns.empty());
-    if (!project_all) {
-      vector<string> projected_column_names = Split(FLAGS_columns, ",", strings::SkipEmpty());
+    const auto project_all = FLAGS_columns == "*" || FLAGS_columns.empty();
+    if (!project_all || FLAGS_row_count_only) {
+      vector<string> projected_column_names;
+      if (!FLAGS_row_count_only && !FLAGS_columns.empty()) {
+        projected_column_names = Split(FLAGS_columns, ",", strings::SkipEmpty());
+      }
       RETURN_NOT_OK(builder.SetProjectedColumnNames(projected_column_names));
     }
   }
diff --git a/src/kudu/tools/tool_action_common.cc b/src/kudu/tools/tool_action_common.cc
index 4d9fdfd..332c464 100644
--- a/src/kudu/tools/tool_action_common.cc
+++ b/src/kudu/tools/tool_action_common.cc
@@ -151,6 +151,12 @@ DEFINE_string(sasl_protocol_name,
               "servers' service principal name base (e.g. if it's \"kudu/_HOST\", then "
               "sasl_protocol_name must be \"kudu\" to be able to connect.");
 
+DEFINE_bool(row_count_only, false,
+            "Whether to only count rows instead of reading row cells: yields "
+            "an empty projection for the table");
+
+DECLARE_bool(show_values);
+
 bool ValidateTimeoutSettings() {
   if (FLAGS_timeout_ms < FLAGS_negotiation_timeout_ms) {
     LOG(ERROR) << strings::Substitute(
@@ -164,6 +170,23 @@ bool ValidateTimeoutSettings() {
 }
 GROUP_FLAG_VALIDATOR(timeout_flags, ValidateTimeoutSettings);
 
+bool ValidateSchemaProjectionFlags() {
+  if (FLAGS_row_count_only && !FLAGS_columns.empty()) {
+    LOG(ERROR) <<
+        "--row_count_only and --columns flags are conflicting: "
+        "either remove/unset --columns or remove/unset --row_count_only";
+    return false;
+  }
+  if (FLAGS_row_count_only && FLAGS_show_values) {
+    LOG(ERROR) <<
+        "--row_count_only and --show_values flags are conflicting: either "
+        "remove/unset --show_values or remove/unset --row_count_only";
+    return false;
+  }
+  return true;
+}
+GROUP_FLAG_VALIDATOR(schema_projection_flags, ValidateSchemaProjectionFlags);
+
 using kudu::client::KuduClient;
 using kudu::client::KuduClientBuilder;
 using kudu::client::internal::AsyncLeaderMasterRpc;
diff --git a/src/kudu/tools/tool_action_perf.cc b/src/kudu/tools/tool_action_perf.cc
index 151f11b..c333ab4 100644
--- a/src/kudu/tools/tool_action_perf.cc
+++ b/src/kudu/tools/tool_action_perf.cc
@@ -1052,6 +1052,8 @@ unique_ptr<Mode> BuildPerfMode() {
           "or whether there is a long latency tail when scanning different tables.")
       .AddRequiredParameter({ kTableNameArg, "Name of the table to scan"})
       .AddOptionalParameter("columns")
+      .AddOptionalParameter("row_count_only")
+      .AddOptionalParameter("report_scanner_stats")
       .AddOptionalParameter("fill_cache")
       .AddOptionalParameter("num_threads")
       .AddOptionalParameter("predicates")
diff --git a/src/kudu/tools/tool_action_table.cc b/src/kudu/tools/tool_action_table.cc
index d5da37f..005157d 100644
--- a/src/kudu/tools/tool_action_table.cc
+++ b/src/kudu/tools/tool_action_table.cc
@@ -115,6 +115,8 @@ DEFINE_string(upper_bound_type, "EXCLUSIVE_BOUND",
               "The type of the upper bound, either inclusive or exclusive. "
               "Defaults to exclusive. This flag is case-insensitive.");
 
+DECLARE_bool(row_count_only);
+DECLARE_bool(show_scanner_stats);
 DECLARE_bool(show_values);
 DECLARE_string(tables);
 
@@ -482,7 +484,9 @@ Status ScanTable(const RunnerContext &context) {
 
   const string& table_name = FindOrDie(context.required_args, kTableNameArg);
 
-  FLAGS_show_values = true;
+  if (!FLAGS_row_count_only) {
+    FLAGS_show_values = true;
+  }
   TableScanner scanner(client, table_name);
   scanner.SetOutput(&cout);
   return scanner.StartScan();
@@ -1292,6 +1296,8 @@ unique_ptr<Mode> BuildTableMode() {
                         "for the --predicates flag on how predicates can be specified.")
       .AddRequiredParameter({ kTableNameArg, "Name of the table to scan"})
       .AddOptionalParameter("columns")
+      .AddOptionalParameter("row_count_only")
+      .AddOptionalParameter("report_scanner_stats")
       .AddOptionalParameter("fill_cache")
       .AddOptionalParameter("num_threads")
       .AddOptionalParameter("predicates")

Reply via email to