Revert 'IMPALA-6338: Fix flaky test_profile_fragment_instances'

There have been several crashes observed in testing recently, and it's
not clear what's going on, so for now revert this moderately risky
change.

Change-Id: I48c11f0817c5190a3a94f8260f3e8ef653357ab3
Reviewed-on: http://gerrit.cloudera.org:8080/9243
Reviewed-by: Alex Behm <alex.b...@cloudera.com>
Tested-by: Thomas Tauber-Marshall <tmarsh...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/1ddb1562
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/1ddb1562
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/1ddb1562

Branch: refs/heads/2.x
Commit: 1ddb15621a2b028a00a1a0a489d2a49224f17a43
Parents: 9e887b0
Author: Thomas Tauber-Marshall <tmarsh...@cloudera.com>
Authored: Wed Feb 7 11:12:47 2018 -0800
Committer: Impala Public Jenkins <impala-public-jenk...@gerrit.cloudera.org>
Committed: Thu Feb 8 07:01:53 2018 +0000

----------------------------------------------------------------------
 be/src/common/status.h                      |  6 ------
 be/src/runtime/coordinator-backend-state.cc | 12 ++++++------
 be/src/runtime/coordinator-backend-state.h  |  6 ++----
 be/src/runtime/coordinator.cc               | 10 ++++------
 be/src/runtime/coordinator.h                |  4 +---
 tests/query_test/test_observability.py      | 10 +++++-----
 6 files changed, 18 insertions(+), 30 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/1ddb1562/be/src/common/status.h
----------------------------------------------------------------------
diff --git a/be/src/common/status.h b/be/src/common/status.h
index f0f91f7..24dba8b 100644
--- a/be/src/common/status.h
+++ b/be/src/common/status.h
@@ -95,13 +95,7 @@ class NODISCARD Status {
   static Status MemLimitExceeded();
   static Status MemLimitExceeded(const std::string& details);
 
-  /// Indicates a 'cancelled' status. CANCELLED should not be reported by a 
fragment
-  /// instance that encounters a problem - instances should return a specific 
error,
-  /// and then the coordinator will initiate cancellation.
-  /// TODO: we use this in some places to indicate things other than query 
cancellation,
-  /// which can be confusing.
   static const Status CANCELLED;
-
   static const Status DEPRECATED_RPC;
 
   /// Copy c'tor makes copy of error detail so Status can be returned by value.

http://git-wip-us.apache.org/repos/asf/impala/blob/1ddb1562/be/src/runtime/coordinator-backend-state.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/coordinator-backend-state.cc 
b/be/src/runtime/coordinator-backend-state.cc
index b238cad..914a3e4 100644
--- a/be/src/runtime/coordinator-backend-state.cc
+++ b/be/src/runtime/coordinator-backend-state.cc
@@ -234,7 +234,7 @@ void Coordinator::BackendState::LogFirstInProgress(
 }
 
 inline bool Coordinator::BackendState::IsDone() const {
-  return num_remaining_instances_ == 0 || (!status_.ok() && 
!status_.IsCancelled());
+  return num_remaining_instances_ == 0 || !status_.ok();
 }
 
 bool Coordinator::BackendState::ApplyExecStatusReport(
@@ -338,8 +338,8 @@ bool Coordinator::BackendState::Cancel() {
   // Nothing to cancel if the exec rpc was not sent
   if (!rpc_sent_) return false;
 
-  // don't cancel if it already finished (for any reason) or cancelled
-  if (IsDone() || status_.IsCancelled()) return false;
+  // don't cancel if it already finished (for any reason)
+  if (IsDone()) return false;
 
   /// If the status is not OK, we still try to cancel - !OK status might mean
   /// communication failure between backend and coordinator, but fragment
@@ -391,10 +391,10 @@ bool Coordinator::BackendState::Cancel() {
 void Coordinator::BackendState::PublishFilter(const TPublishFilterParams& 
rpc_params) {
   DCHECK_EQ(rpc_params.dst_query_id, query_id_);
   {
-    // If the backend is already done or cancelled, it's not waiting for this 
filter, so
-    // we skip sending it in this case.
+    // If the backend is already done, it's not waiting for this filter, so we 
skip
+    // sending it in this case.
     lock_guard<mutex> l(lock_);
-    if (IsDone() || status_.IsCancelled()) return;
+    if (IsDone()) return;
   }
 
   if (fragments_.count(rpc_params.dst_fragment_idx) == 0) return;

http://git-wip-us.apache.org/repos/asf/impala/blob/1ddb1562/be/src/runtime/coordinator-backend-state.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/coordinator-backend-state.h 
b/be/src/runtime/coordinator-backend-state.h
index 860b968..0973ca3 100644
--- a/be/src/runtime/coordinator-backend-state.h
+++ b/be/src/runtime/coordinator-backend-state.h
@@ -219,7 +219,7 @@ class Coordinator::BackendState {
 
   /// If the status indicates an error status, execution has either been 
aborted by the
   /// executing impalad (which then reported the error) or cancellation has 
been
-  /// initiated by the coordinator.
+  /// initiated; either way, execution must not be cancelled.
   Status status_;
 
   /// Used to distinguish between errors reported by a specific fragment 
instance,
@@ -254,9 +254,7 @@ class Coordinator::BackendState {
       const FilterRoutingTable& filter_routing_table,
       TExecQueryFInstancesParams* rpc_params);
 
-  /// Return true if execution at this backend is done. The backend is 
considered done if
-  /// either all instances have completed, or an error (other than cancel) is 
encountered.
-  /// Caller must hold lock_.
+  /// Return true if execution at this backend is done. Caller must hold lock_.
   bool IsDone() const;
 };
 

http://git-wip-us.apache.org/repos/asf/impala/blob/1ddb1562/be/src/runtime/coordinator.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/coordinator.cc b/be/src/runtime/coordinator.cc
index 05ecf9f..7973775 100644
--- a/be/src/runtime/coordinator.cc
+++ b/be/src/runtime/coordinator.cc
@@ -867,9 +867,6 @@ Status Coordinator::GetNext(QueryResultSet* results, int 
max_rows, bool* eos) {
     ReleaseExecResources();
     // wait for all backends to complete before computing the summary
     // TODO: relocate this so GetNext() won't have to wait for backends to 
complete?
-    // Note that doing this here allows us to ensure that a query that 
completes
-    // successfully will have a full runtime profile by the time that Fetch() 
indicates
-    // all of the results have been returned.
     RETURN_IF_ERROR(WaitForBackendCompletion());
     // Release admission control resources after backends are finished.
     ReleaseAdmissionControlResources();
@@ -923,8 +920,10 @@ Status Coordinator::UpdateBackendExecStatus(const 
TReportExecStatusParams& param
         Substitute("Unknown backend index $0 (max known: $1)",
             params.coord_state_idx, backend_states_.size() - 1));
   }
-  // If the query was cancelled, don't process the update.
-  if (query_status_.IsCancelled()) return Status::OK();
+  BackendState* backend_state = backend_states_[params.coord_state_idx];
+  // TODO: return here if returned_all_results_?
+  // TODO: return CANCELLED in that case? Although that makes the cancellation 
propagation
+  // path more irregular.
 
   // TODO: only do this when the sink is done; probably missing a done field
   // in TReportExecStatus for that
@@ -932,7 +931,6 @@ Status Coordinator::UpdateBackendExecStatus(const 
TReportExecStatusParams& param
     UpdateInsertExecStatus(params.insert_exec_status);
   }
 
-  BackendState* backend_state = backend_states_[params.coord_state_idx];
   if (backend_state->ApplyExecStatusReport(params, &exec_summary_, 
&progress_)) {
     // This report made this backend done, so update the status and
     // num_remaining_backends_.

http://git-wip-us.apache.org/repos/asf/impala/blob/1ddb1562/be/src/runtime/coordinator.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/coordinator.h b/be/src/runtime/coordinator.h
index fbbdfa9..d630b9a 100644
--- a/be/src/runtime/coordinator.h
+++ b/be/src/runtime/coordinator.h
@@ -290,9 +290,7 @@ class Coordinator { // NOLINT: The member variables could 
be re-ordered to save
   boost::mutex lock_;
 
   /// Overall status of the entire query; set to the first reported fragment 
error
-  /// status or to CANCELLED, if Cancel() is called. Note that some fragments 
may have
-  /// status CANCELLED even if this is not CANCELLED if cancellation is 
initiated because
-  /// returned_all_results_ is true or an error is encountered.
+  /// status or to CANCELLED, if Cancel() is called.
   Status query_status_;
 
   /// If true, the query is done returning all results.  It is possible that 
the

http://git-wip-us.apache.org/repos/asf/impala/blob/1ddb1562/tests/query_test/test_observability.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_observability.py 
b/tests/query_test/test_observability.py
index a7508a4..85fc4f1 100644
--- a/tests/query_test/test_observability.py
+++ b/tests/query_test/test_observability.py
@@ -124,14 +124,14 @@ class TestObservability(ImpalaTestSuite):
         join (select * from l LIMIT 2000000) b on a.l_orderkey = 
-b.l_orderkey;""")
     # There are 3 scan nodes and each appears in the profile 4 times (for 3 
fragment
     # instances + the averaged fragment).
-    assert results.runtime_profile.count("HDFS_SCAN_NODE") == 12, 
results.runtime_profile
+    assert results.runtime_profile.count("HDFS_SCAN_NODE") == 12
     # There are 3 exchange nodes and each appears in the profile 2 times (for 
1 fragment
     # instance + the averaged fragment).
-    assert results.runtime_profile.count("EXCHANGE_NODE") == 6, 
results.runtime_profile
+    assert results.runtime_profile.count("EXCHANGE_NODE") == 6
     # The following appear only in the root fragment which has 1 instance.
-    assert results.runtime_profile.count("HASH_JOIN_NODE") == 2, 
results.runtime_profile
-    assert results.runtime_profile.count("AGGREGATION_NODE") == 2, 
results.runtime_profile
-    assert results.runtime_profile.count("PLAN_ROOT_SINK") == 2, 
results.runtime_profile
+    assert results.runtime_profile.count("HASH_JOIN_NODE") == 2
+    assert results.runtime_profile.count("AGGREGATION_NODE") == 2
+    assert results.runtime_profile.count("PLAN_ROOT_SINK") == 2
 
   # IMPALA-6399: Run this test serially to avoid a delay over the wait time in 
fetching
   # the profile.

Reply via email to