ritegarg commented on code in PR #2244:
URL: https://github.com/apache/phoenix/pull/2244#discussion_r2267425563
##########
phoenix-core-client/src/main/java/org/apache/phoenix/jdbc/HAGroupStoreClient.java:
##########
@@ -17,191 +17,640 @@
  */
 package org.apache.phoenix.jdbc;
 
+import java.io.Closeable;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.sql.DriverManager;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.sql.Statement;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.TimeUnit;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.commons.lang3.tuple.Pair;
 import org.apache.curator.framework.recipes.cache.ChildData;
 import org.apache.curator.framework.recipes.cache.PathChildrenCache;
 import org.apache.curator.framework.recipes.cache.PathChildrenCacheListener;
 import org.apache.curator.utils.ZKPaths;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.phoenix.exception.InvalidClusterRoleTransitionException;
+import org.apache.phoenix.exception.StaleHAGroupStoreRecordVersionException;
+import org.apache.phoenix.jdbc.ClusterRoleRecord.ClusterRole;
+import org.apache.phoenix.jdbc.ClusterRoleRecord.RegistryType;
+import org.apache.phoenix.jdbc.HAGroupStoreRecord.HAGroupState;
 import org.apache.phoenix.thirdparty.com.google.common.annotations.VisibleForTesting;
-import org.apache.phoenix.thirdparty.com.google.common.collect.ImmutableList;
+import org.apache.phoenix.thirdparty.com.google.common.base.Preconditions;
+import org.apache.phoenix.util.JDBCUtil;
+import org.apache.zookeeper.data.Stat;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import java.io.Closeable;
-import java.io.IOException;
-import java.util.List;
-import java.util.Objects;
-import java.util.concurrent.ConcurrentHashMap;
-import java.util.concurrent.CountDownLatch;
-import java.util.concurrent.TimeUnit;
-import java.util.stream.Collectors;
+import static org.apache.phoenix.jdbc.PhoenixDatabaseMetaData.CLUSTER_ROLE_1;
+import static org.apache.phoenix.jdbc.PhoenixDatabaseMetaData.CLUSTER_ROLE_2;
+import static org.apache.phoenix.jdbc.PhoenixDatabaseMetaData.CLUSTER_URL_1;
+import static org.apache.phoenix.jdbc.PhoenixDatabaseMetaData.CLUSTER_URL_2;
+import static org.apache.phoenix.jdbc.PhoenixDatabaseMetaData.HA_GROUP_NAME;
+import static org.apache.phoenix.jdbc.PhoenixDatabaseMetaData.POLICY;
+import static org.apache.phoenix.jdbc.PhoenixDatabaseMetaData.SYSTEM_HA_GROUP_NAME;
+import static org.apache.phoenix.jdbc.PhoenixDatabaseMetaData.VERSION;
+import static org.apache.phoenix.jdbc.PhoenixDatabaseMetaData.ZK_URL_1;
+import static org.apache.phoenix.jdbc.PhoenixDatabaseMetaData.ZK_URL_2;
+import static org.apache.phoenix.jdbc.PhoenixHAAdmin.getLocalZkUrl;
+import static org.apache.phoenix.jdbc.PhoenixHAAdmin.toPath;
+import static org.apache.phoenix.query.QueryServices.HA_STORE_AND_FORWARD_MODE_REFRESH_INTERVAL_MS;
+import static org.apache.phoenix.query.QueryServices.HA_SYNC_MODE_REFRESH_INTERVAL_MS;
+import static org.apache.phoenix.query.QueryServicesOptions
+        .DEFAULT_HA_STORE_AND_FORWARD_MODE_REFRESH_INTERVAL_MS;
+import static org.apache.phoenix.query.QueryServicesOptions
+        .DEFAULT_HA_SYNC_MODE_REFRESH_INTERVAL_MS;
+import static org.apache.phoenix.util.PhoenixRuntime.JDBC_PROTOCOL_SEPARATOR;
+import static org.apache.phoenix.util.PhoenixRuntime.JDBC_PROTOCOL_ZK;
 
 /**
- * Write-through cache for HAGroupStore.
+ * Main implementation of HAGroupStoreClient with peer support.
+ * Write-through cache for HAGroupStore based on {@link HAGroupStoreRecord}.
  * Uses {@link PathChildrenCache} from {@link org.apache.curator.framework.CuratorFramework}.
  */
 public class HAGroupStoreClient implements Closeable {
 
+    public static final String ZK_CONSISTENT_HA_NAMESPACE =
+            "phoenix" + ZKPaths.PATH_SEPARATOR + "consistentHA";
     private static final long HA_GROUP_STORE_CLIENT_INITIALIZATION_TIMEOUT_MS = 30000L;
-    private static volatile HAGroupStoreClient haGroupStoreClientInstance;
+    private static final String CACHE_TYPE_LOCAL = "LOCAL";
+    private static final String CACHE_TYPE_PEER = "PEER";
     private PhoenixHAAdmin phoenixHaAdmin;
+    private PhoenixHAAdmin peerPhoenixHaAdmin;
     private static final Logger LOGGER = LoggerFactory.getLogger(HAGroupStoreClient.class);
-    // Map contains <ClusterRole, Map<HAGroupName(String), ClusterRoleRecord>>
-    private final ConcurrentHashMap<ClusterRoleRecord.ClusterRole, ConcurrentHashMap<String, ClusterRoleRecord>> clusterRoleToCRRMap
-            = new ConcurrentHashMap<>();
-    private PathChildrenCache pathChildrenCache;
-    private volatile boolean isHealthy;
+    // Map of <ZKUrl, <HAGroupName, HAGroupStoreClientInstance>>
+    private static final Map<String, ConcurrentHashMap<String, HAGroupStoreClient>> instances =
+            new ConcurrentHashMap<>();
+    // HAGroupName for this instance
+    private final String haGroupName;
+    // PathChildrenCache for current cluster and HAGroupName
+    private PathChildrenCache pathChildrenCache = null;
+    // PathChildrenCache for peer cluster and HAGroupName
+    private PathChildrenCache peerPathChildrenCache = null;
+    // Whether the client is healthy
+    private volatile boolean isHealthy = false;
+    // Configuration
+    private final Configuration conf;
+    // ZK URL for the current cluster and HAGroupName
+    private String zkUrl;
+    // Peer ZK URL for peer cluster and HAGroupName
+    private String peerZKUrl = null;
+    // Peer Custom Event Listener
+    private final PathChildrenCacheListener peerCustomPathChildrenCacheListener;
+    // Wait time for sync mode
+    private final long waitTimeForSyncModeInMs;
+    // Wait time for store and forward mode
+    private final long waitTimeForStoreAndForwardModeInMs;
+    // Policy for the HA group
+    private HighAvailabilityPolicy policy;
+    private ClusterRole clusterRole;
+    private ClusterRole peerClusterRole;
+    private String clusterUrl;
+    private String peerClusterUrl;
+    private long clusterRoleRecordVersion;
 
     /**
      * Creates/gets an instance of HAGroupStoreClient.
      * Can return null instance if unable to initialize.
      *
      * @param conf configuration
+     * @param haGroupName name of the HA group. Only specified HA group is tracked.
+     * @param zkUrl zkUrl to use for the client. Prefer providing this parameter to avoid
+     *              the overhead of getting the local zkUrl from the configuration.
      * @return HAGroupStoreClient instance
      */
-    public static HAGroupStoreClient getInstance(Configuration conf) {
-        if (haGroupStoreClientInstance == null || !haGroupStoreClientInstance.isHealthy) {
+    public static HAGroupStoreClient getInstance(Configuration conf, String haGroupName,
+            String zkUrl) throws SQLException {
+        Preconditions.checkNotNull(haGroupName, "haGroupName cannot be null");
+        String localZkUrl = Objects.toString(zkUrl, getLocalZkUrl(conf));
+        Preconditions.checkNotNull(localZkUrl, "zkUrl cannot be null");
+        HAGroupStoreClient result = instances.getOrDefault(localZkUrl, new ConcurrentHashMap<>())
+                .getOrDefault(haGroupName, null);
+        if (result == null || !result.isHealthy) {
             synchronized (HAGroupStoreClient.class) {
-                if (haGroupStoreClientInstance == null || !haGroupStoreClientInstance.isHealthy) {
-                    haGroupStoreClientInstance = new HAGroupStoreClient(conf, null);
-                    if (!haGroupStoreClientInstance.isHealthy) {
-                        haGroupStoreClientInstance.close();
-                        haGroupStoreClientInstance = null;
+                result = instances.getOrDefault(localZkUrl, new ConcurrentHashMap<>())
+                        .getOrDefault(haGroupName, null);
+                if (result == null || !result.isHealthy) {
+                    result = new HAGroupStoreClient(conf, null, null, haGroupName, zkUrl);
+                    if (!result.isHealthy) {
+                        result.close();
+                        result = null;
+                    } else {
+                        instances.putIfAbsent(localZkUrl, new ConcurrentHashMap<>());
+                        instances.get(localZkUrl).put(haGroupName, result);
                     }
                 }
             }
         }
-        return haGroupStoreClientInstance;
+        return result;
+    }
+
+    /**
+     * Get the list of HAGroupNames from system table.
+     * We can also get the list of HAGroupNames from the system table by providing the zkUrl in
+     * where clause but we need to match the formatted zkUrl with the zkUrl in the system table so
+     * that matching is done correctly.
+     *
+     * @param zkUrl for connecting to Table
+     * @return the list of HAGroupNames
+     * @throws SQLException in case of unexpected error
+     */
+    public static List<String> getHAGroupNames(String zkUrl) throws SQLException {
+        List<String> result = new ArrayList<>();
+        String queryString = String.format("SELECT %s,%s,%s FROM %s", HA_GROUP_NAME, ZK_URL_1,
+                ZK_URL_2, SYSTEM_HA_GROUP_NAME);
+        try (PhoenixConnection conn = (PhoenixConnection) DriverManager.getConnection(
+                JDBC_PROTOCOL_ZK + JDBC_PROTOCOL_SEPARATOR + zkUrl);
+                Statement stmt = conn.createStatement();
+                ResultSet rs = stmt.executeQuery(queryString)) {
+            while (rs.next()) {
+                String zkUrl1 = rs.getString(ZK_URL_1);
+                String zkUrl2 = rs.getString(ZK_URL_2);
+                String formattedZkUrl1 = JDBCUtil.formatUrl(zkUrl1, RegistryType.ZK);
+                String formattedZkUrl2 = JDBCUtil.formatUrl(zkUrl2, RegistryType.ZK);
+                String formattedZkUrl = JDBCUtil.formatUrl(zkUrl, RegistryType.ZK);
+                if (StringUtils.equals(formattedZkUrl1, formattedZkUrl) ||
+                        StringUtils.equals(formattedZkUrl2, formattedZkUrl)) {
+                    result.add(rs.getString(HA_GROUP_NAME));
+                }
+            }
+        }
+        return result;
     }
 
     @VisibleForTesting
-    HAGroupStoreClient(final Configuration conf, final PathChildrenCacheListener pathChildrenCacheListener) {
+    HAGroupStoreClient(final Configuration conf,
+            final PathChildrenCacheListener pathChildrenCacheListener,
+            final PathChildrenCacheListener peerPathChildrenCacheListener,
+            final String haGroupName,
+            final String zkUrl) {
+        this.conf = conf;
+        this.haGroupName = haGroupName;
+        this.zkUrl = zkUrl;
+        this.waitTimeForSyncModeInMs = conf.getLong(HA_SYNC_MODE_REFRESH_INTERVAL_MS,
+                DEFAULT_HA_SYNC_MODE_REFRESH_INTERVAL_MS);
+        this.waitTimeForStoreAndForwardModeInMs = conf.getLong(
+                HA_STORE_AND_FORWARD_MODE_REFRESH_INTERVAL_MS,
+                DEFAULT_HA_STORE_AND_FORWARD_MODE_REFRESH_INTERVAL_MS);
+        // Custom Event Listener
+        this.peerCustomPathChildrenCacheListener = peerPathChildrenCacheListener;
         try {
-            this.phoenixHaAdmin = new PhoenixHAAdmin(conf);
-            final PathChildrenCache pathChildrenCache;
-            pathChildrenCache = new PathChildrenCache(phoenixHaAdmin.getCurator(), ZKPaths.PATH_SEPARATOR, true);
-            final CountDownLatch latch = new CountDownLatch(1);
-            if (pathChildrenCacheListener != null) {
-                pathChildrenCache.getListenable().addListener(pathChildrenCacheListener);
-            } else {
-                pathChildrenCache.getListenable().addListener((client, event) -> {
-                    LOGGER.info("HAGroupStoreClient PathChildrenCache Received event for type {}", event.getType());
-                    final ChildData childData = event.getData();
-                    ClusterRoleRecord eventCRR = extractCRROrNull(childData);
-                    switch (event.getType()) {
-                        case CHILD_ADDED:
-                        case CHILD_UPDATED:
-                            if (eventCRR != null && eventCRR.getHaGroupName() != null) {
-                                updateClusterRoleRecordMap(eventCRR);
-                            }
-                            break;
-                        case CHILD_REMOVED:
-                            // In case of CHILD_REMOVED, we get the old version of data that was just deleted in event.
-                            if (eventCRR != null && eventCRR.getHaGroupName() != null
-                                    && !eventCRR.getHaGroupName().isEmpty()
-                                    && eventCRR.getRole(phoenixHaAdmin.getZkUrl()) != null) {
-                                LOGGER.info("Received CHILD_REMOVED event, Removing CRR {} from existing CRR Map {}", eventCRR, clusterRoleToCRRMap);
-                                final ClusterRoleRecord.ClusterRole role = eventCRR.getRole(phoenixHaAdmin.getZkUrl());
-                                clusterRoleToCRRMap.putIfAbsent(role, new ConcurrentHashMap<>());
-                                clusterRoleToCRRMap.get(role).remove(eventCRR.getHaGroupName());
-                            }
-                            break;
-                        case INITIALIZED:
-                            latch.countDown();
-                            break;
-                        case CONNECTION_LOST:
-                        case CONNECTION_SUSPENDED:
-                            isHealthy = false;
-                            break;
-                        case CONNECTION_RECONNECTED:
-                            isHealthy = true;
-                            break;
-                        default:
-                            LOGGER.warn("Unexpected event type {}, complete event {}", event.getType(), event);
-                    }
-                });
+            // Initialize HAGroupStoreClient attributes
+            initializeHAGroupStoreClientAttributes(haGroupName);
+            // Initialize Phoenix HA Admin
+            this.phoenixHaAdmin = new PhoenixHAAdmin(this.zkUrl, conf, ZK_CONSISTENT_HA_NAMESPACE);
+            // Initialize local cache
+            this.pathChildrenCache = initializePathChildrenCache(phoenixHaAdmin,
+                    pathChildrenCacheListener, CACHE_TYPE_LOCAL);
+            // Initialize ZNode if not present in ZK
+            initializeZNodeIfNeeded();
+            if (this.pathChildrenCache != null) {
+                this.isHealthy = true;
+                // Initialize peer cache
+                maybeInitializePeerPathChildrenCache();
             }
-            pathChildrenCache.start(PathChildrenCache.StartMode.POST_INITIALIZED_EVENT);
-            this.pathChildrenCache = pathChildrenCache;
-            isHealthy = latch.await(HA_GROUP_STORE_CLIENT_INITIALIZATION_TIMEOUT_MS, TimeUnit.MILLISECONDS);
-            buildClusterRoleToCRRMap();
+
         } catch (Exception e) {
-            isHealthy = false;
-            LOGGER.error("Unexpected error occurred while initializing HAGroupStoreClient, marking cache as unhealthy", e);
+            this.isHealthy = false;
+            close();
+            LOGGER.error("Unexpected error occurred while initializing HAGroupStoreClient, "
+                    + "marking cache as unhealthy", e);
         }
     }
 
-    private ClusterRoleRecord extractCRROrNull(final ChildData childData) {
-        if (childData != null) {
-            byte[] data = childData.getData();
-            return ClusterRoleRecord.fromJson(data).orElse(null);
+    /**
+     * Rebuilds the internal cache by querying for all needed data.
+     * This is a blocking operation that does not generate events to listeners.
+     * @param broadcastUpdate whether to broadcast the update to all regionservers
+     *
+     * @throws Exception if rebuild fails or client is not healthy
+     */
+    public void rebuild(boolean broadcastUpdate) throws Exception {
+        if (!isHealthy) {
+            throw new IOException("HAGroupStoreClient is not healthy");
         }
-        return null;
+        LOGGER.info("Rebuilding HAGroupStoreClient for HA group {} with broadcastUpdate {}",
+                haGroupName, broadcastUpdate);
+        if (broadcastUpdate) {
+            // TODO: Add a lock for the row in system table.

Review Comment:
Plan for adding row locking and broadcasting the system table update to every other RS's HAGroupStoreClient instance (a sketch of the hook follows at the end of this comment):
1. Intercept the mutation to the SYSTEM.HA_GROUP table in IndexRegionObserver.
2. Obtain the read and write lock on the row to serialize concurrent mutations.
3. Decide whether the changed attribute is worth broadcasting. For example, we don't need to broadcast an update for clusterRole because the ZNode already contains that state.
    a. If it is broadcast-worthy, we use the same mechanism as MetadataCache invalidation: a new RPC sends the updated attributes to every other HAGroupStoreClient, and each HAGroupStoreClient updates its attributes. This has one retry built in.
    b. Once that completes for all RSs, we let the mutation complete. If it fails for even one RS, we fail the mutation and the operator can retry.
4. Release the lock.

**NOTE**: Need to check whether we must obtain a scan lock as well: a scan might retrieve the locked row while its mutation is in progress, leaving HAGroupStoreClient with inconsistent data.

Currently we have two use cases for the table (both are sketched below as well):
1. During initialization we retrieve the table entry for an HAGroupName via a WHERE clause on HAGroupName, which is the primary key.
2. We can retrieve all the HAGroupNames associated with the local cluster by scanning the table. This output should not change unless we delete the HAGroupName entry.

**NOTE**: Need to handle the delete case.
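
To make steps 1-4 concrete, here is a minimal sketch assuming a RegionObserver hook along the lines of IndexRegionObserver. `HAGroupBroadcastObserver`, `isBroadcastWorthy`, and `broadcastToAllRegionServers` are hypothetical placeholders for the column check and the new MetadataCache-style RPC, and the interplay with row locks already held by the HBase write path is glossed over:

```java
import java.io.IOException;
import java.util.Optional;
import org.apache.hadoop.hbase.client.Mutation;
import org.apache.hadoop.hbase.coprocessor.ObserverContext;
import org.apache.hadoop.hbase.coprocessor.RegionCoprocessor;
import org.apache.hadoop.hbase.coprocessor.RegionCoprocessorEnvironment;
import org.apache.hadoop.hbase.coprocessor.RegionObserver;
import org.apache.hadoop.hbase.regionserver.MiniBatchOperationInProgress;
import org.apache.hadoop.hbase.regionserver.Region;

/** Hypothetical observer, standing in for the planned IndexRegionObserver change. */
public class HAGroupBroadcastObserver implements RegionCoprocessor, RegionObserver {

    @Override
    public Optional<RegionObserver> getRegionObserver() {
        return Optional.of(this);
    }

    @Override
    public void preBatchMutate(ObserverContext<RegionCoprocessorEnvironment> ctx,
            MiniBatchOperationInProgress<Mutation> miniBatchOp) throws IOException {
        Region region = ctx.getEnvironment().getRegion();
        for (int i = 0; i < miniBatchOp.size(); i++) {
            Mutation mutation = miniBatchOp.getOperation(i);
            // Step 2: serialize concurrent writers of the same HA group row. The real
            // hook must cooperate with locks the HBase write path already holds.
            Region.RowLock rowLock = region.getRowLock(mutation.getRow(), false);
            try {
                // Step 3: skip attributes whose state the ZNode already carries,
                // e.g. clusterRole.
                if (isBroadcastWorthy(mutation)) {
                    // Steps 3a/3b: fan the updated attributes out to every RS's
                    // HAGroupStoreClient; a failure on any RS fails the mutation.
                    broadcastToAllRegionServers(mutation);
                }
            } finally {
                // Step 4: always release the row lock.
                rowLock.release();
            }
        }
    }

    // Placeholder: inspect which SYSTEM.HA_GROUP columns the mutation touches.
    private boolean isBroadcastWorthy(Mutation mutation) {
        return true;
    }

    // Placeholder for the new MetadataCache-style invalidation RPC (one retry built
    // in); throwing fails the batch so the operator can retry the mutation.
    private void broadcastToAllRegionServers(Mutation mutation) throws IOException {
    }
}
```

Failing the batch when any RS misses the broadcast keeps every cached HAGroupStoreClient consistent, at the cost of making this write path only as available as the slowest RS, which is the same trade-off the MetadataCache invalidation mechanism makes.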
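For the two read paths, a sketch of what each access pattern boils down to; the table and column names follow the constants used in the diff (assuming SYSTEM_HA_GROUP_NAME resolves to SYSTEM.HA_GROUP), while the exact point-lookup SELECT is an assumption:

```java
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;

public class HAGroupTableAccess {

    // Use case 1: point lookup on the primary key during initialization.
    static void loadGroup(Connection conn, String haGroupName) throws SQLException {
        try (PreparedStatement stmt = conn.prepareStatement(
                "SELECT * FROM SYSTEM.HA_GROUP WHERE HA_GROUP_NAME = ?")) {
            stmt.setString(1, haGroupName);
            try (ResultSet rs = stmt.executeQuery()) {
                // populate this client's attributes from the single matching row
            }
        }
    }

    // Use case 2: full-table scan, filtering by formatted ZK URL on the client side
    // (as getHAGroupNames in the diff does), since the stored URL format may differ.
    static void listGroups(Connection conn) throws SQLException {
        try (PreparedStatement stmt = conn.prepareStatement(
                "SELECT HA_GROUP_NAME, ZK_URL_1, ZK_URL_2 FROM SYSTEM.HA_GROUP");
                ResultSet rs = stmt.executeQuery()) {
            while (rs.next()) {
                // compare formatted ZK_URL_1/ZK_URL_2 against the local ZK URL
            }
        }
    }
}
```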