This is an automated email from the ASF dual-hosted git repository.
lokiore pushed a commit to branch PHOENIX-7562-feature-new
in repository https://gitbox.apache.org/repos/asf/phoenix.git
The following commit(s) were added to refs/heads/PHOENIX-7562-feature-new by
this push:
new efe34a5b48 PHOENIX-7765 :- Eliminate spurious ERROR log on first
cluster-role fetch (#2516)
efe34a5b48 is described below
commit efe34a5b4863d8b0dbbaf1bcb716c771f8848275
Author: Lokesh Khurana <[email protected]>
AuthorDate: Fri Jun 12 15:48:45 2026 -0700
PHOENIX-7765 :- Eliminate spurious ERROR log on first cluster-role fetch
(#2516)
GetClusterRoleRecordUtil.getClusterRoleRecord and
ValidateLastDDLTimestampUtil.validateLastDDLTimestamp call
ConnectionQueryServicesImpl.getLiveRegionServers(), which returns
null until refreshLiveRegionServers() runs. The list is only
auto-populated at CQSI init when LAST_DDL_TIMESTAMP_VALIDATION_ENABLED
is true (default false), so callers on the default config path always
NPE on first invocation. The existing catch block already recovers
(refreshes the list and recurses), but logs an ERROR line per first
call -- N false errors per CCF cluster init, polluting operator log
surfaces.
Add inline null-check + refresh + re-fetch at the call site, before
the NPE could fire. The retry catch block remains for genuine
exceptions (auth, transport, RPC). First-call path no longer logs
ERROR; only real failures do.
Verified empirically on FailoverPhoenixConnection2IT: pre-fix the
output log carried 36 "Error in getting ClusterRoleRecord" lines;
post-fix it carries 34 lines, and zero of those 34 have a
NullPointerException in their stack -- the remaining errors are
legitimate ZK-down / failover / connection-restart scenarios.
Generated-by: Claude Code (Opus 4.7)
---
.../org/apache/phoenix/util/GetClusterRoleRecordUtil.java | 13 ++++++++++++-
.../apache/phoenix/util/ValidateLastDDLTimestampUtil.java | 12 +++++++++++-
2 files changed, 23 insertions(+), 2 deletions(-)
diff --git
a/phoenix-core-client/src/main/java/org/apache/phoenix/util/GetClusterRoleRecordUtil.java
b/phoenix-core-client/src/main/java/org/apache/phoenix/util/GetClusterRoleRecordUtil.java
index 5adbedb169..e6ac19d9e9 100644
---
a/phoenix-core-client/src/main/java/org/apache/phoenix/util/GetClusterRoleRecordUtil.java
+++
b/phoenix-core-client/src/main/java/org/apache/phoenix/util/GetClusterRoleRecordUtil.java
@@ -88,8 +88,19 @@ public class GetClusterRoleRecordUtil {
Connection conn = getConnection(url, properties);
PhoenixConnection connection = conn.unwrap(PhoenixConnection.class);
try (Admin admin = connection.getQueryServices().getAdmin()) {
- // get all live region servers
+ // Get all live region servers. The list is only auto-populated at CQSI
init when
+ // LAST_DDL_TIMESTAMP_VALIDATION_ENABLED is true (default false), so on
the first call
+ // for the default config path the list is null. Refresh inline rather
than letting the
+ // catch block recover via NPE: keeps the retry path reserved for
genuine failures
+ // (auth, transport, RPC) and avoids logging an ERROR per first
invocation.
List<ServerName> regionServers =
connection.getQueryServices().getLiveRegionServers();
+ if (regionServers == null || regionServers.isEmpty()) {
+ connection.getQueryServices().refreshLiveRegionServers();
+ regionServers = connection.getQueryServices().getLiveRegionServers();
+ if (regionServers == null || regionServers.isEmpty()) {
+ throw new SQLException("No live region servers available for HA
group " + haGroupName);
+ }
+ }
// pick one at random
ServerName regionServer =
regionServers.get(ThreadLocalRandom.current().nextInt(regionServers.size()));
diff --git
a/phoenix-core-client/src/main/java/org/apache/phoenix/util/ValidateLastDDLTimestampUtil.java
b/phoenix-core-client/src/main/java/org/apache/phoenix/util/ValidateLastDDLTimestampUtil.java
index 2ad41e46b3..921421b73e 100644
---
a/phoenix-core-client/src/main/java/org/apache/phoenix/util/ValidateLastDDLTimestampUtil.java
+++
b/phoenix-core-client/src/main/java/org/apache/phoenix/util/ValidateLastDDLTimestampUtil.java
@@ -99,8 +99,18 @@ public class ValidateLastDDLTimestampUtil {
}
String infoString = getInfoString(conn.getTenantId(), tableRefs);
try (Admin admin = conn.getQueryServices().getAdmin()) {
- // get all live region servers
+ // Get all live region servers. Although this method is typically
reached only when
+ // LAST_DDL_TIMESTAMP_VALIDATION_ENABLED is true (which gates CQSI
init-time refresh),
+ // the gating is enforced by callers, not this method itself.
Defensively populate the
+ // list inline if null so the retry path is reserved for genuine
failures.
List<ServerName> regionServers =
conn.getQueryServices().getLiveRegionServers();
+ if (regionServers == null || regionServers.isEmpty()) {
+ conn.getQueryServices().refreshLiveRegionServers();
+ regionServers = conn.getQueryServices().getLiveRegionServers();
+ if (regionServers == null || regionServers.isEmpty()) {
+ throw new SQLException("No live region servers available to validate
DDL timestamp");
+ }
+ }
// pick one at random
ServerName regionServer =
regionServers.get(ThreadLocalRandom.current().nextInt(regionServers.size()));