[
https://issues.apache.org/jira/browse/HBASE-29216?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
YR updated HBASE-29216:
-----------------------
Description:
Recovered replication is stuck when
“hbase.separate.oldlogdir.by.regionserver” is enabled.
The WAL location cannot be found after the configuration is enabled.
The execution logic looks like this
1. Set “hbase.separate.oldlogdir.by.regionserver” to enabled
2. Restart the RegionServer; the write-ahead log (WAL) will then move from
/hbase/WALs/servername/\{wal-filename} to
/hbase/oldWALs/servername/\{wal-filename}
3. WALEntryStream will look up archived logs using
AbstractFSWALProvider.findArchivedLog
To solve this problem, we can try to improve the findArchivedLog method
Some codes
{code:java}
// HRegionServer.java
private void shutdownWAL(final boolean close) {
if (this.walFactory != null) {
try {
if (close) {
walFactory.close(); // here will move wals to oldwals
} else {
walFactory.shutdown();
}
} catch (Throwable e) {
e = e instanceof RemoteException ? ((RemoteException)
e).unwrapRemoteException() : e;
LOG.error("Shutdown / close of WAL failed: " + e);
LOG.debug("Shutdown / close exception details:", e);
}
}
}
{code}
{code:java}
// AbstractFSWALProvider.java:450
public static Path findArchivedLog(Path path, Configuration conf) throws
IOException {
// Returning here skips the server-name lookup below, stalling the replication
if (path.toString().contains(HConstants.HREGION_OLDLOGDIR_NAME)) {
return null;
}
Path walRootDir = CommonFSUtils.getWALRootDir(conf);
FileSystem fs = path.getFileSystem(conf);
// Try finding the log in old dir
Path oldLogDir = new Path(walRootDir, HConstants.HREGION_OLDLOGDIR_NAME);
Path archivedLogLocation = new Path(oldLogDir, path.getName());
if (fs.exists(archivedLogLocation)) {
LOG.info("Log " + path + " was moved to " + archivedLogLocation);
return archivedLogLocation;
}
ServerName serverName = getServerNameFromWALDirectoryName(path);
if (serverName == null) {
LOG.warn("Can not extract server name from path {}, "
+ "give up searching the separated old log dir", path);
return null;
}
// Try finding the log in separate old log dir
oldLogDir = new Path(walRootDir, new
StringBuilder(HConstants.HREGION_OLDLOGDIR_NAME)
.append(Path.SEPARATOR).append(serverName.getServerName()).toString());
archivedLogLocation = new Path(oldLogDir, path.getName());
if (fs.exists(archivedLogLocation)) {
LOG.info("Log " + path + " was moved to " + archivedLogLocation);
return archivedLogLocation;
}
LOG.error("Couldn't locate log: " + path);
return null;
}
{code}
{code:java}
// Some error message
2025-03-23 15:16:19,824 WARN
[RS_CLAIM_REPLICATION_QUEUE-regionserver/regionserver-101:16020-0.replicationSource,peer1-regionserver-130,16020,1742659271913-regionserver-166,16020,1742659271939.replicationSource.wal-reader.regionserver-130%2C16020%2C1742659271913.regionserver-130%2C16020%2C1742659271913.regiongroup-0,peer1-regionserver-130,16020,1742659271913-regionserver-166,16020,1742659271939]
regionserver.WALEntryStream: Failed to open WAL reader for path:
hdfs://snake/hbase/oldWALs/regionserver-130%2C16020%2C1742659271913.regionserver-130%2C16020%2C1742659271913.regiongroup-0.1742659287672
java.io.FileNotFoundException: File does not exist:
/hbase/oldWALs/regionserver-130%2C16020%2C1742659271913.regionserver-130%2C16020%2C1742659271913.regiongroup-0.1742659287672
at
org.apache.hadoop.hdfs.server.namenode.INodeFile.valueOf(INodeFile.java:86)
at
org.apache.hadoop.hdfs.server.namenode.INodeFile.valueOf(INodeFile.java:76)
at
org.apache.hadoop.hdfs.server.namenode.FSDirStatAndListingOp.getBlockLocations(FSDirStatAndListingOp.java:155)
at
org.apache.hadoop.hdfs.server.namenode.FSNamesystem.getBlockLocations(FSNamesystem.java:2143)
at
org.apache.hadoop.hdfs.server.namenode.NameNodeRpcServer.getBlockLocations(NameNodeRpcServer.java:753)
at
org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolServerSideTranslatorPB.getBlockLocations(ClientNamenodeProtocolServerSideTranslatorPB.java:493)
at
org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos$ClientNamenodeProtocol$2.callBlockingMethod(ClientNamenodeProtocolProtos.java)
at
org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:554)
at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1105)
at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:1067)
at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:994)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at
org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:2010)
at org.apache.hadoop.ipc.Server$Handler.run(Server.java:3068) at
sun.reflect.GeneratedConstructorAccessor34.newInstance(Unknown Source)
at
sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at
org.apache.hadoop.ipc.RemoteException.instantiateException(RemoteException.java:106)
at
org.apache.hadoop.ipc.RemoteException.unwrapRemoteException(RemoteException.java:73)
at
org.apache.hadoop.hdfs.DFSClient.callGetBlockLocations(DFSClient.java:1006)
at org.apache.hadoop.hdfs.DFSClient.getLocatedBlocks(DFSClient.java:986)
at org.apache.hadoop.hdfs.DFSClient.getLocatedBlocks(DFSClient.java:974)
at org.apache.hadoop.hdfs.DFSClient.open(DFSClient.java:1160)
at
org.apache.hadoop.hdfs.DistributedFileSystem$4.doCall(DistributedFileSystem.java:381)
at
org.apache.hadoop.hdfs.DistributedFileSystem$4.doCall(DistributedFileSystem.java:376)
at
org.apache.hadoop.fs.FileSystemLinkResolver.resolve(FileSystemLinkResolver.java:81)
at
org.apache.hadoop.hdfs.DistributedFileSystem.open(DistributedFileSystem.java:399)
at org.apache.hadoop.fs.FilterFileSystem.open(FilterFileSystem.java:164)
at org.apache.hadoop.fs.FileSystem.open(FileSystem.java:900)
at
org.apache.hadoop.hbase.regionserver.wal.AbstractProtobufWALReader.open(AbstractProtobufWALReader.java:194)
at
org.apache.hadoop.hbase.regionserver.wal.AbstractProtobufWALReader.init(AbstractProtobufWALReader.java:141)
at
org.apache.hadoop.hbase.wal.WALFactory.createTailingReader(WALFactory.java:464)
at
org.apache.hadoop.hbase.replication.regionserver.WALEntryStream.prepareReader(WALEntryStream.java:267)
at
org.apache.hadoop.hbase.replication.regionserver.WALEntryStream.tryAdvanceEntry(WALEntryStream.java:342)
at
org.apache.hadoop.hbase.replication.regionserver.WALEntryStream.hasNext(WALEntryStream.java:130)
at
org.apache.hadoop.hbase.replication.regionserver.ReplicationSourceWALReader.run(ReplicationSourceWALReader.java:150)
Caused by:
org.apache.hadoop.ipc.RemoteException(java.io.FileNotFoundException): File does
not exist:
/hbase/oldWALs/regionserver-130%2C16020%2C1742659271913.regionserver-130%2C16020%2C1742659271913.regiongroup-0.1742659287672
at
org.apache.hadoop.hdfs.server.namenode.INodeFile.valueOf(INodeFile.java:86)
at
org.apache.hadoop.hdfs.server.namenode.INodeFile.valueOf(INodeFile.java:76)
at
org.apache.hadoop.hdfs.server.namenode.FSDirStatAndListingOp.getBlockLocations(FSDirStatAndListingOp.java:155){code}
was:
Recovered replication stuck , when enabled
“hbase.separate.oldlogdir.by.regionserver”
The WAL location cannot be found after the configuration is enabled.
The execution logic looks like this
1. Set “hbase.separate.oldlogdir.by.regionserver” to enabled
2. Restart the RegionServer, the "write a head log" will from
/hbase/WALs/servername/{wal-filename} moves to
/hbase/oldWALs/servername/{wal-filename}
3. WALEntryStream will find archive logs using
AbstractFSWALProvider.findArchivedLog
To solve this problem, we can try to improve the findArchiveLog method
Some codes
{code:java}
// HRegionServer.java
private void shutdownWAL(final boolean close) {
if (this.walFactory != null) {
try {
if (close) {
walFactory.close(); // here will move wals to oldwals
} else {
walFactory.shutdown();
}
} catch (Throwable e) {
e = e instanceof RemoteException ? ((RemoteException)
e).unwrapRemoteException() : e;
LOG.error("Shutdown / close of WAL failed: " + e);
LOG.debug("Shutdown / close exception details:", e);
}
}
}
{code}
{code:java}
// AbstractFSWALProvider.java:450
public static Path findArchivedLog(Path path, Configuration conf) throws
IOException {
// Here will be return , stuck the replication
if (path.toString().contains(HConstants.HREGION_OLDLOGDIR_NAME)) {
return null;
}
Path walRootDir = CommonFSUtils.getWALRootDir(conf);
FileSystem fs = path.getFileSystem(conf);
// Try finding the log in old dir
Path oldLogDir = new Path(walRootDir, HConstants.HREGION_OLDLOGDIR_NAME);
Path archivedLogLocation = new Path(oldLogDir, path.getName());
if (fs.exists(archivedLogLocation)) {
LOG.info("Log " + path + " was moved to " + archivedLogLocation);
return archivedLogLocation;
}
ServerName serverName = getServerNameFromWALDirectoryName(path);
if (serverName == null) {
LOG.warn("Can not extract server name from path {}, "
+ "give up searching the separated old log dir", path);
return null;
}
// Try finding the log in separate old log dir
oldLogDir = new Path(walRootDir, new
StringBuilder(HConstants.HREGION_OLDLOGDIR_NAME)
.append(Path.SEPARATOR).append(serverName.getServerName()).toString());
archivedLogLocation = new Path(oldLogDir, path.getName());
if (fs.exists(archivedLogLocation)) {
LOG.info("Log " + path + " was moved to " + archivedLogLocation);
return archivedLogLocation;
}
LOG.error("Couldn't locate log: " + path);
return null;
}
{code}
> Recovered replication stuck , when enabled
> “hbase.separate.oldlogdir.by.regionserver”
> ---------------------------------------------------------------------------------------
>
> Key: HBASE-29216
> URL: https://issues.apache.org/jira/browse/HBASE-29216
> Project: HBase
> Issue Type: Bug
> Components: regionserver
> Reporter: YR
> Assignee: YR
> Priority: Minor
> Labels: pull-request-available
>
> Recovered replication is stuck when
> “hbase.separate.oldlogdir.by.regionserver” is enabled.
> The WAL location cannot be found after the configuration is enabled.
> The execution logic looks like this
> 1. Set “hbase.separate.oldlogdir.by.regionserver” to enabled
> 2. Restart the RegionServer; the write-ahead log (WAL) will then move from
> /hbase/WALs/servername/\{wal-filename} to
> /hbase/oldWALs/servername/\{wal-filename}
>
> 3. WALEntryStream will look up archived logs using
> AbstractFSWALProvider.findArchivedLog
> To solve this problem, we can try to improve the findArchivedLog method
> Some codes
> {code:java}
> // HRegionServer.java
> private void shutdownWAL(final boolean close) {
> if (this.walFactory != null) {
> try {
> if (close) {
> walFactory.close(); // here will move wals to oldwals
> } else {
> walFactory.shutdown();
> }
> } catch (Throwable e) {
> e = e instanceof RemoteException ? ((RemoteException)
> e).unwrapRemoteException() : e;
> LOG.error("Shutdown / close of WAL failed: " + e);
> LOG.debug("Shutdown / close exception details:", e);
> }
> }
> }
> {code}
> {code:java}
> // AbstractFSWALProvider.java:450
> public static Path findArchivedLog(Path path, Configuration conf) throws
> IOException {
> // Returning here skips the server-name lookup below, stalling the replication
> if (path.toString().contains(HConstants.HREGION_OLDLOGDIR_NAME)) {
> return null;
> }
> Path walRootDir = CommonFSUtils.getWALRootDir(conf);
> FileSystem fs = path.getFileSystem(conf);
> // Try finding the log in old dir
> Path oldLogDir = new Path(walRootDir, HConstants.HREGION_OLDLOGDIR_NAME);
> Path archivedLogLocation = new Path(oldLogDir, path.getName());
> if (fs.exists(archivedLogLocation)) {
> LOG.info("Log " + path + " was moved to " + archivedLogLocation);
> return archivedLogLocation;
> }
> ServerName serverName = getServerNameFromWALDirectoryName(path);
> if (serverName == null) {
> LOG.warn("Can not extract server name from path {}, "
> + "give up searching the separated old log dir", path);
> return null;
> }
> // Try finding the log in separate old log dir
> oldLogDir = new Path(walRootDir, new
> StringBuilder(HConstants.HREGION_OLDLOGDIR_NAME)
> .append(Path.SEPARATOR).append(serverName.getServerName()).toString());
> archivedLogLocation = new Path(oldLogDir, path.getName());
> if (fs.exists(archivedLogLocation)) {
> LOG.info("Log " + path + " was moved to " + archivedLogLocation);
> return archivedLogLocation;
> }
> LOG.error("Couldn't locate log: " + path);
> return null;
> }
> {code}
> {code:java}
> // Some error message
> 2025-03-23 15:16:19,824 WARN
> [RS_CLAIM_REPLICATION_QUEUE-regionserver/regionserver-101:16020-0.replicationSource,peer1-regionserver-130,16020,1742659271913-regionserver-166,16020,1742659271939.replicationSource.wal-reader.regionserver-130%2C16020%2C1742659271913.regionserver-130%2C16020%2C1742659271913.regiongroup-0,peer1-regionserver-130,16020,1742659271913-regionserver-166,16020,1742659271939]
> regionserver.WALEntryStream: Failed to open WAL reader for path:
> hdfs://snake/hbase/oldWALs/regionserver-130%2C16020%2C1742659271913.regionserver-130%2C16020%2C1742659271913.regiongroup-0.1742659287672
> java.io.FileNotFoundException: File does not exist:
> /hbase/oldWALs/regionserver-130%2C16020%2C1742659271913.regionserver-130%2C16020%2C1742659271913.regiongroup-0.1742659287672
> at
> org.apache.hadoop.hdfs.server.namenode.INodeFile.valueOf(INodeFile.java:86)
> at
> org.apache.hadoop.hdfs.server.namenode.INodeFile.valueOf(INodeFile.java:76)
> at
> org.apache.hadoop.hdfs.server.namenode.FSDirStatAndListingOp.getBlockLocations(FSDirStatAndListingOp.java:155)
> at
> org.apache.hadoop.hdfs.server.namenode.FSNamesystem.getBlockLocations(FSNamesystem.java:2143)
> at
> org.apache.hadoop.hdfs.server.namenode.NameNodeRpcServer.getBlockLocations(NameNodeRpcServer.java:753)
> at
> org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolServerSideTranslatorPB.getBlockLocations(ClientNamenodeProtocolServerSideTranslatorPB.java:493)
> at
> org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos$ClientNamenodeProtocol$2.callBlockingMethod(ClientNamenodeProtocolProtos.java)
> at
> org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:554)
> at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1105)
> at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:1067)
> at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:994)
> at java.security.AccessController.doPrivileged(Native Method)
> at javax.security.auth.Subject.doAs(Subject.java:422)
> at
> org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:2010)
> at org.apache.hadoop.ipc.Server$Handler.run(Server.java:3068) at
> sun.reflect.GeneratedConstructorAccessor34.newInstance(Unknown Source)
> at
> sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
> at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
> at
> org.apache.hadoop.ipc.RemoteException.instantiateException(RemoteException.java:106)
> at
> org.apache.hadoop.ipc.RemoteException.unwrapRemoteException(RemoteException.java:73)
> at
> org.apache.hadoop.hdfs.DFSClient.callGetBlockLocations(DFSClient.java:1006)
> at org.apache.hadoop.hdfs.DFSClient.getLocatedBlocks(DFSClient.java:986)
> at org.apache.hadoop.hdfs.DFSClient.getLocatedBlocks(DFSClient.java:974)
> at org.apache.hadoop.hdfs.DFSClient.open(DFSClient.java:1160)
> at
> org.apache.hadoop.hdfs.DistributedFileSystem$4.doCall(DistributedFileSystem.java:381)
> at
> org.apache.hadoop.hdfs.DistributedFileSystem$4.doCall(DistributedFileSystem.java:376)
> at
> org.apache.hadoop.fs.FileSystemLinkResolver.resolve(FileSystemLinkResolver.java:81)
> at
> org.apache.hadoop.hdfs.DistributedFileSystem.open(DistributedFileSystem.java:399)
> at org.apache.hadoop.fs.FilterFileSystem.open(FilterFileSystem.java:164)
> at org.apache.hadoop.fs.FileSystem.open(FileSystem.java:900)
> at
> org.apache.hadoop.hbase.regionserver.wal.AbstractProtobufWALReader.open(AbstractProtobufWALReader.java:194)
> at
> org.apache.hadoop.hbase.regionserver.wal.AbstractProtobufWALReader.init(AbstractProtobufWALReader.java:141)
> at
> org.apache.hadoop.hbase.wal.WALFactory.createTailingReader(WALFactory.java:464)
> at
> org.apache.hadoop.hbase.replication.regionserver.WALEntryStream.prepareReader(WALEntryStream.java:267)
> at
> org.apache.hadoop.hbase.replication.regionserver.WALEntryStream.tryAdvanceEntry(WALEntryStream.java:342)
> at
> org.apache.hadoop.hbase.replication.regionserver.WALEntryStream.hasNext(WALEntryStream.java:130)
> at
> org.apache.hadoop.hbase.replication.regionserver.ReplicationSourceWALReader.run(ReplicationSourceWALReader.java:150)
> Caused by:
> org.apache.hadoop.ipc.RemoteException(java.io.FileNotFoundException): File
> does not exist:
> /hbase/oldWALs/regionserver-130%2C16020%2C1742659271913.regionserver-130%2C16020%2C1742659271913.regiongroup-0.1742659287672
> at
> org.apache.hadoop.hdfs.server.namenode.INodeFile.valueOf(INodeFile.java:86)
> at
> org.apache.hadoop.hdfs.server.namenode.INodeFile.valueOf(INodeFile.java:76)
> at
> org.apache.hadoop.hdfs.server.namenode.FSDirStatAndListingOp.getBlockLocations(FSDirStatAndListingOp.java:155){code}
--
This message was sent by Atlassian Jira
(v8.20.10#820010)