This is an automated email from the ASF dual-hosted git repository.

lokiore pushed a commit to branch PHOENIX-7562-feature-new
in repository https://gitbox.apache.org/repos/asf/phoenix.git


The following commit(s) were added to refs/heads/PHOENIX-7562-feature-new by 
this push:
     new efe34a5b48 PHOENIX-7765 :- Eliminate spurious ERROR log on first 
cluster-role fetch (#2516)
efe34a5b48 is described below

commit efe34a5b4863d8b0dbbaf1bcb716c771f8848275
Author: Lokesh Khurana <[email protected]>
AuthorDate: Fri Jun 12 15:48:45 2026 -0700

    PHOENIX-7765 :- Eliminate spurious ERROR log on first cluster-role fetch 
(#2516)
    
    GetClusterRoleRecordUtil.getClusterRoleRecord and
    ValidateLastDDLTimestampUtil.validateLastDDLTimestamp call
    ConnectionQueryServicesImpl.getLiveRegionServers(), which returns
    null until refreshLiveRegionServers() runs. The list is only
    auto-populated at CQSI init when LAST_DDL_TIMESTAMP_VALIDATION_ENABLED
    is true (default false), so callers on the default config path always
    NPE on first invocation. The existing catch block already recovers
    (refreshes the list and recurses), but logs an ERROR line per first
    call -- N false errors per CCF cluster init, polluting operator log
    surfaces.
    
    Add inline null-check + refresh + re-fetch at the call site, before
    the NPE could fire. The retry catch block remains for genuine
    exceptions (auth, transport, RPC). First-call path no longer logs
    ERROR; only real failures do.
    
    Verified empirically on FailoverPhoenixConnection2IT: pre-fix the
    output log carried 36 "Error in getting ClusterRoleRecord" lines;
    post-fix it carries 34 lines, and zero of those 34 have a
    NullPointerException in their stack -- the remaining errors are
    legitimate ZK-down / failover / connection-restart scenarios.
    
    Generated-by: Claude Code (Opus 4.7)
---
 .../org/apache/phoenix/util/GetClusterRoleRecordUtil.java   | 13 ++++++++++++-
 .../apache/phoenix/util/ValidateLastDDLTimestampUtil.java   | 12 +++++++++++-
 2 files changed, 23 insertions(+), 2 deletions(-)

diff --git 
a/phoenix-core-client/src/main/java/org/apache/phoenix/util/GetClusterRoleRecordUtil.java
 
b/phoenix-core-client/src/main/java/org/apache/phoenix/util/GetClusterRoleRecordUtil.java
index 5adbedb169..e6ac19d9e9 100644
--- 
a/phoenix-core-client/src/main/java/org/apache/phoenix/util/GetClusterRoleRecordUtil.java
+++ 
b/phoenix-core-client/src/main/java/org/apache/phoenix/util/GetClusterRoleRecordUtil.java
@@ -88,8 +88,19 @@ public class GetClusterRoleRecordUtil {
     Connection conn = getConnection(url, properties);
     PhoenixConnection connection = conn.unwrap(PhoenixConnection.class);
     try (Admin admin = connection.getQueryServices().getAdmin()) {
-      // get all live region servers
+      // Get all live region servers. The list is only auto-populated at CQSI 
init when
+      // LAST_DDL_TIMESTAMP_VALIDATION_ENABLED is true (default false), so on 
the first call
+      // for the default config path the list is null. Refresh inline rather 
than letting the
+      // catch block recover via NPE: keeps the retry path reserved for 
genuine failures
+      // (auth, transport, RPC) and avoids logging an ERROR per first 
invocation.
       List<ServerName> regionServers = 
connection.getQueryServices().getLiveRegionServers();
+      if (regionServers == null || regionServers.isEmpty()) {
+        connection.getQueryServices().refreshLiveRegionServers();
+        regionServers = connection.getQueryServices().getLiveRegionServers();
+        if (regionServers == null || regionServers.isEmpty()) {
+          throw new SQLException("No live region servers available for HA 
group " + haGroupName);
+        }
+      }
       // pick one at random
       ServerName regionServer =
         
regionServers.get(ThreadLocalRandom.current().nextInt(regionServers.size()));
diff --git 
a/phoenix-core-client/src/main/java/org/apache/phoenix/util/ValidateLastDDLTimestampUtil.java
 
b/phoenix-core-client/src/main/java/org/apache/phoenix/util/ValidateLastDDLTimestampUtil.java
index 2ad41e46b3..921421b73e 100644
--- 
a/phoenix-core-client/src/main/java/org/apache/phoenix/util/ValidateLastDDLTimestampUtil.java
+++ 
b/phoenix-core-client/src/main/java/org/apache/phoenix/util/ValidateLastDDLTimestampUtil.java
@@ -99,8 +99,18 @@ public class ValidateLastDDLTimestampUtil {
     }
     String infoString = getInfoString(conn.getTenantId(), tableRefs);
     try (Admin admin = conn.getQueryServices().getAdmin()) {
-      // get all live region servers
+      // Get all live region servers. Although this method is typically 
reached only when
+      // LAST_DDL_TIMESTAMP_VALIDATION_ENABLED is true (which gates CQSI 
init-time refresh),
+      // the gating is enforced by callers, not this method itself. 
Defensively populate the
+      // list inline if null so the retry path is reserved for genuine 
failures.
       List<ServerName> regionServers = 
conn.getQueryServices().getLiveRegionServers();
+      if (regionServers == null || regionServers.isEmpty()) {
+        conn.getQueryServices().refreshLiveRegionServers();
+        regionServers = conn.getQueryServices().getLiveRegionServers();
+        if (regionServers == null || regionServers.isEmpty()) {
+          throw new SQLException("No live region servers available to validate 
DDL timestamp");
+        }
+      }
       // pick one at random
       ServerName regionServer =
         
regionServers.get(ThreadLocalRandom.current().nextInt(regionServers.size()));

Reply via email to