[
https://issues.apache.org/jira/browse/PHOENIX-3230?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15472097#comment-15472097
]
Samarth Jain commented on PHOENIX-3230:
---------------------------------------
When upgrading with multiple clients (different JVMs), we are running into race
conditions.
Client 1 trying to execute:
{code}
metaConnection = addColumnsIfNotExists(metaConnection,
PhoenixDatabaseMetaData.SYSTEM_CATALOG,
MetaDataProtocol.MIN_SYSTEM_TABLE_TIMESTAMP_4_8_0 - 1,
PhoenixDatabaseMetaData.AUTO_PARTITION_SEQ + " "
+
PVarchar.INSTANCE.getSqlTypeName());
{code}
Client 2 trying to execute:
{code}
createSnapshot(snapshotName,
sysCatalogTableName);
{code}
Client 2 then fails with:
{code}
java.sql.SQLException: org.apache.hadoop.hbase.snapshot.HBaseSnapshotException:
org.apache.hadoop.hbase.snapshot.HBaseSnapshotException: Snapshot {
ss=SNAPSHOT_SYSTEM.CATALOG_null_TO_4.8.0_20160907155025-0700
table=SYSTEM.CATALOG type=FLUSH } had an error. Procedure
SNAPSHOT_SYSTEM.CATALOG_null_TO_4.8.0_20160907155025-0700 { waiting=[]
done=[localhost,58539,1473287287332] }
at
org.apache.hadoop.hbase.master.snapshot.SnapshotManager.isSnapshotDone(SnapshotManager.java:342)
at
org.apache.hadoop.hbase.master.HMaster.isSnapshotDone(HMaster.java:3237)
at
org.apache.hadoop.hbase.protobuf.generated.MasterProtos$MasterService$2.callBlockingMethod(MasterProtos.java:43294)
at org.apache.hadoop.hbase.ipc.RpcServer.call(RpcServer.java:2149)
at org.apache.hadoop.hbase.ipc.CallRunner.run(CallRunner.java:104)
at
org.apache.hadoop.hbase.ipc.FifoRpcScheduler$1.run(FifoRpcScheduler.java:74)
at
java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:471)
at java.util.concurrent.FutureTask.run(FutureTask.java:262)
at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
Caused by:
org.apache.hadoop.hbase.errorhandling.ForeignException$ProxyThrowable via
localhost,58539,1473287287332:org.apache.hadoop.hbase.errorhandling.ForeignException$ProxyThrowable:
org.apache.hadoop.hbase.NotServingRegionException:
SYSTEM.CATALOG,,1473283460590.fe40df52aa069a8d4a3ee52e4b282e5c. is closing
at
org.apache.hadoop.hbase.errorhandling.ForeignExceptionDispatcher.rethrowException(ForeignExceptionDispatcher.java:83)
at
org.apache.hadoop.hbase.master.snapshot.TakeSnapshotHandler.rethrowExceptionIfFailed(TakeSnapshotHandler.java:307)
at
org.apache.hadoop.hbase.master.snapshot.SnapshotManager.isSnapshotDone(SnapshotManager.java:332)
... 10 more
Caused by:
org.apache.hadoop.hbase.errorhandling.ForeignException$ProxyThrowable:
org.apache.hadoop.hbase.NotServingRegionException:
SYSTEM.CATALOG,,1473283460590.fe40df52aa069a8d4a3ee52e4b282e5c. is closing
at
org.apache.hadoop.hbase.regionserver.snapshot.RegionServerSnapshotManager$SnapshotSubprocedurePool.waitForOutstandingTasks(RegionServerSnapshotManager.java:338)
at
org.apache.hadoop.hbase.regionserver.snapshot.FlushSnapshotSubprocedure.flushSnapshot(FlushSnapshotSubprocedure.java:138)
at
org.apache.hadoop.hbase.regionserver.snapshot.FlushSnapshotSubprocedure.insideBarrier(FlushSnapshotSubprocedure.java:157)
at
org.apache.hadoop.hbase.procedure.Subprocedure.call(Subprocedure.java:186)
at
org.apache.hadoop.hbase.procedure.Subprocedure.call(Subprocedure.java:53)
... 4 more
at
org.apache.phoenix.query.ConnectionQueryServicesImpl$13.createSnapshot(ConnectionQueryServicesImpl.java:2597)
at
org.apache.phoenix.query.ConnectionQueryServicesImpl$13.call(ConnectionQueryServicesImpl.java:2337)
at
org.apache.phoenix.query.ConnectionQueryServicesImpl$13.call(ConnectionQueryServicesImpl.java:1)
at
org.apache.phoenix.util.PhoenixContextExecutor.call(PhoenixContextExecutor.java:78)
at
org.apache.phoenix.query.ConnectionQueryServicesImpl.init(ConnectionQueryServicesImpl.java:2272)
at
org.apache.phoenix.jdbc.PhoenixDriver.getConnectionQueryServices(PhoenixDriver.java:232)
at
org.apache.phoenix.jdbc.PhoenixEmbeddedDriver.createConnection(PhoenixEmbeddedDriver.java:147)
at org.apache.phoenix.jdbc.PhoenixDriver.connect(PhoenixDriver.java:202)
at java.sql.DriverManager.getConnection(DriverManager.java:571)
at java.sql.DriverManager.getConnection(DriverManager.java:233)
at
org.apache.phoenix.end2end.PhoenixRuntimeIT.testConnection(PhoenixRuntimeIT.java:150)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at
org.junit.runners.model.FrameworkMethod$1.runReflectiveCall(FrameworkMethod.java:50)
at
org.junit.internal.runners.model.ReflectiveCallable.run(ReflectiveCallable.java:12)
at
org.junit.runners.model.FrameworkMethod.invokeExplosively(FrameworkMethod.java:47)
at
org.junit.internal.runners.statements.InvokeMethod.evaluate(InvokeMethod.java:17)
at
org.junit.internal.runners.statements.RunAfters.evaluate(RunAfters.java:27)
at org.junit.runners.ParentRunner.runLeaf(ParentRunner.java:325)
at
org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:78)
at
org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:57)
at org.junit.runners.ParentRunner$3.run(ParentRunner.java:290)
at org.junit.runners.ParentRunner$1.schedule(ParentRunner.java:71)
at org.junit.runners.ParentRunner.runChildren(ParentRunner.java:288)
at org.junit.runners.ParentRunner.access$000(ParentRunner.java:58)
at org.junit.runners.ParentRunner$2.evaluate(ParentRunner.java:268)
at
org.junit.internal.runners.statements.RunBefores.evaluate(RunBefores.java:26)
at
org.junit.internal.runners.statements.RunAfters.evaluate(RunAfters.java:27)
at org.junit.rules.ExternalResource$1.evaluate(ExternalResource.java:48)
at org.junit.rules.RunRules.evaluate(RunRules.java:20)
at org.junit.runners.ParentRunner.run(ParentRunner.java:363)
at
org.eclipse.jdt.internal.junit4.runner.JUnit4TestReference.run(JUnit4TestReference.java:50)
at
org.eclipse.jdt.internal.junit.runner.TestExecution.run(TestExecution.java:38)
at
org.eclipse.jdt.internal.junit.runner.RemoteTestRunner.runTests(RemoteTestRunner.java:467)
at
org.eclipse.jdt.internal.junit.runner.RemoteTestRunner.runTests(RemoteTestRunner.java:683)
at
org.eclipse.jdt.internal.junit.runner.RemoteTestRunner.run(RemoteTestRunner.java:390)
at
org.eclipse.jdt.internal.junit.runner.RemoteTestRunner.main(RemoteTestRunner.java:197)
Caused by: org.apache.hadoop.hbase.snapshot.HBaseSnapshotException:
org.apache.hadoop.hbase.snapshot.HBaseSnapshotException: Snapshot {
ss=SNAPSHOT_SYSTEM.CATALOG_null_TO_4.8.0_20160907155025-0700
table=SYSTEM.CATALOG type=FLUSH } had an error. Procedure
SNAPSHOT_SYSTEM.CATALOG_null_TO_4.8.0_20160907155025-0700 { waiting=[]
done=[localhost,58539,1473287287332] }
at
org.apache.hadoop.hbase.master.snapshot.SnapshotManager.isSnapshotDone(SnapshotManager.java:342)
at
org.apache.hadoop.hbase.master.HMaster.isSnapshotDone(HMaster.java:3237)
at
org.apache.hadoop.hbase.protobuf.generated.MasterProtos$MasterService$2.callBlockingMethod(MasterProtos.java:43294)
at org.apache.hadoop.hbase.ipc.RpcServer.call(RpcServer.java:2149)
at org.apache.hadoop.hbase.ipc.CallRunner.run(CallRunner.java:104)
at
org.apache.hadoop.hbase.ipc.FifoRpcScheduler$1.run(FifoRpcScheduler.java:74)
at
java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:471)
at java.util.concurrent.FutureTask.run(FutureTask.java:262)
at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
Caused by:
org.apache.hadoop.hbase.errorhandling.ForeignException$ProxyThrowable via
localhost,58539,1473287287332:org.apache.hadoop.hbase.errorhandling.ForeignException$ProxyThrowable:
org.apache.hadoop.hbase.NotServingRegionException:
SYSTEM.CATALOG,,1473283460590.fe40df52aa069a8d4a3ee52e4b282e5c. is closing
at
org.apache.hadoop.hbase.errorhandling.ForeignExceptionDispatcher.rethrowException(ForeignExceptionDispatcher.java:83)
at
org.apache.hadoop.hbase.master.snapshot.TakeSnapshotHandler.rethrowExceptionIfFailed(TakeSnapshotHandler.java:307)
at
org.apache.hadoop.hbase.master.snapshot.SnapshotManager.isSnapshotDone(SnapshotManager.java:332)
... 10 more
Caused by:
org.apache.hadoop.hbase.errorhandling.ForeignException$ProxyThrowable:
org.apache.hadoop.hbase.NotServingRegionException:
SYSTEM.CATALOG,,1473283460590.fe40df52aa069a8d4a3ee52e4b282e5c. is closing
at
org.apache.hadoop.hbase.regionserver.snapshot.RegionServerSnapshotManager$SnapshotSubprocedurePool.waitForOutstandingTasks(RegionServerSnapshotManager.java:338)
at
org.apache.hadoop.hbase.regionserver.snapshot.FlushSnapshotSubprocedure.flushSnapshot(FlushSnapshotSubprocedure.java:138)
at
org.apache.hadoop.hbase.regionserver.snapshot.FlushSnapshotSubprocedure.insideBarrier(FlushSnapshotSubprocedure.java:157)
at
org.apache.hadoop.hbase.procedure.Subprocedure.call(Subprocedure.java:186)
at
org.apache.hadoop.hbase.procedure.Subprocedure.call(Subprocedure.java:53)
... 4 more
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at
sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:57)
at
sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:526)
at
org.apache.hadoop.ipc.RemoteException.instantiateException(RemoteException.java:106)
at
org.apache.hadoop.ipc.RemoteException.unwrapRemoteException(RemoteException.java:95)
at
org.apache.hadoop.hbase.client.RpcRetryingCaller.translateException(RpcRetryingCaller.java:211)
at
org.apache.hadoop.hbase.client.RpcRetryingCaller.translateException(RpcRetryingCaller.java:225)
at
org.apache.hadoop.hbase.client.RpcRetryingCaller.callWithRetries(RpcRetryingCaller.java:123)
at
org.apache.hadoop.hbase.client.RpcRetryingCaller.callWithRetries(RpcRetryingCaller.java:91)
at
org.apache.hadoop.hbase.client.HBaseAdmin.executeCallable(HBaseAdmin.java:3479)
at
org.apache.hadoop.hbase.client.HBaseAdmin.snapshot(HBaseAdmin.java:2796)
at
org.apache.hadoop.hbase.client.HBaseAdmin.snapshot(HBaseAdmin.java:2729)
at
org.apache.hadoop.hbase.client.HBaseAdmin.snapshot(HBaseAdmin.java:2650)
at
org.apache.phoenix.query.ConnectionQueryServicesImpl$13.createSnapshot(ConnectionQueryServicesImpl.java:2593)
... 38 more
Caused by:
org.apache.hadoop.hbase.ipc.RemoteWithExtrasException(org.apache.hadoop.hbase.snapshot.HBaseSnapshotException):
org.apache.hadoop.hbase.snapshot.HBaseSnapshotException: Snapshot {
ss=SNAPSHOT_SYSTEM.CATALOG_null_TO_4.8.0_20160907155025-0700
table=SYSTEM.CATALOG type=FLUSH } had an error. Procedure
SNAPSHOT_SYSTEM.CATALOG_null_TO_4.8.0_20160907155025-0700 { waiting=[]
done=[localhost,58539,1473287287332] }
at
org.apache.hadoop.hbase.master.snapshot.SnapshotManager.isSnapshotDone(SnapshotManager.java:342)
at
org.apache.hadoop.hbase.master.HMaster.isSnapshotDone(HMaster.java:3237)
at
org.apache.hadoop.hbase.protobuf.generated.MasterProtos$MasterService$2.callBlockingMethod(MasterProtos.java:43294)
at org.apache.hadoop.hbase.ipc.RpcServer.call(RpcServer.java:2149)
at org.apache.hadoop.hbase.ipc.CallRunner.run(CallRunner.java:104)
at
org.apache.hadoop.hbase.ipc.FifoRpcScheduler$1.run(FifoRpcScheduler.java:74)
at
java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:471)
at java.util.concurrent.FutureTask.run(FutureTask.java:262)
at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
Caused by:
org.apache.hadoop.hbase.errorhandling.ForeignException$ProxyThrowable via
localhost,58539,1473287287332:org.apache.hadoop.hbase.errorhandling.ForeignException$ProxyThrowable:
org.apache.hadoop.hbase.NotServingRegionException:
SYSTEM.CATALOG,,1473283460590.fe40df52aa069a8d4a3ee52e4b282e5c. is closing
at
org.apache.hadoop.hbase.errorhandling.ForeignExceptionDispatcher.rethrowException(ForeignExceptionDispatcher.java:83)
at
org.apache.hadoop.hbase.master.snapshot.TakeSnapshotHandler.rethrowExceptionIfFailed(TakeSnapshotHandler.java:307)
at
org.apache.hadoop.hbase.master.snapshot.SnapshotManager.isSnapshotDone(SnapshotManager.java:332)
... 10 more
Caused by:
org.apache.hadoop.hbase.errorhandling.ForeignException$ProxyThrowable:
org.apache.hadoop.hbase.NotServingRegionException:
SYSTEM.CATALOG,,1473283460590.fe40df52aa069a8d4a3ee52e4b282e5c. is closing
at
org.apache.hadoop.hbase.regionserver.snapshot.RegionServerSnapshotManager$SnapshotSubprocedurePool.waitForOutstandingTasks(RegionServerSnapshotManager.java:338)
at
org.apache.hadoop.hbase.regionserver.snapshot.FlushSnapshotSubprocedure.flushSnapshot(FlushSnapshotSubprocedure.java:138)
at
org.apache.hadoop.hbase.regionserver.snapshot.FlushSnapshotSubprocedure.insideBarrier(FlushSnapshotSubprocedure.java:157)
at
org.apache.hadoop.hbase.procedure.Subprocedure.call(Subprocedure.java:186)
at
org.apache.hadoop.hbase.procedure.Subprocedure.call(Subprocedure.java:53)
... 4 more
at org.apache.hadoop.hbase.ipc.RpcClient.call(RpcClient.java:1489)
at
org.apache.hadoop.hbase.ipc.RpcClient.callBlockingMethod(RpcClient.java:1691)
at
org.apache.hadoop.hbase.ipc.RpcClient$BlockingRpcChannelImplementation.callBlockingMethod(RpcClient.java:1750)
at
org.apache.hadoop.hbase.protobuf.generated.MasterProtos$MasterService$BlockingStub.isSnapshotDone(MasterProtos.java:45813)
at
org.apache.hadoop.hbase.client.HConnectionManager$HConnectionImplementation$5.isSnapshotDone(HConnectionManager.java:2154)
at
org.apache.hadoop.hbase.client.HBaseAdmin$25.call(HBaseAdmin.java:2799)
at
org.apache.hadoop.hbase.client.HBaseAdmin$25.call(HBaseAdmin.java:2796)
at
org.apache.hadoop.hbase.client.RpcRetryingCaller.callWithRetries(RpcRetryingCaller.java:115)
... 44 more
{code}
Two possible solutions:
1) I don't think we need to modify HTable metadata every time we add a column
to the table, unless a new column family is being added or some other
HTable-level property is being modified/added. This may solve the above race
condition, but I am not sure it is the perfect solution.
2) Use ZooKeeper to get a distributed lock that basically makes sure only one
JVM is able to run the upgrade code.
[~jamestaylor], WDYT?
> SYSTEM.CATALOG get restored from snapshot with multi-client connection
> ----------------------------------------------------------------------
>
> Key: PHOENIX-3230
> URL: https://issues.apache.org/jira/browse/PHOENIX-3230
> Project: Phoenix
> Issue Type: Bug
> Reporter: Mujtaba Chohan
> Assignee: Samarth Jain
> Fix For: 4.8.1
>
>
> If two separate Phoenix connections try to upgrade Phoenix from v4.7 to 4.8.1,
> the second connection fails with the following exception. This happens even
> if the second connection is a couple of seconds apart but within the upgrade
> window. This is likely to happen in a situation where a pool of client
> machines all get upgraded to the latest Phoenix version. After this exception,
> all clients will cease to work with an undefined column exception due to the
> restore/aborted upgrade.
> {noformat}
> WARN query.ConnectionQueryServicesImpl: Table already modified at this
> timestamp, so assuming add of these columns already done: IS_NAMESPACE_MAPPED
> BOOLEAN
> WARN query.ConnectionQueryServicesImpl: Table already modified at this
> timestamp, so assuming add of these columns already done: AUTO_PARTITION_SEQ
> VARCHAR
> WARN query.ConnectionQueryServicesImpl: Table already modified at this
> timestamp, so assuming add of these columns already done: APPEND_ONLY_SCHEMA
> BOOLEAN
> WARN query.ConnectionQueryServicesImpl: Starting restore of SYSTEM.CATALOG
> using snapshot SNAPSHOT_SYSTEM.CATALOG_4.7.x_TO_4.8.0_20160831114048-0700
> because upgrade failed
> 16/08/31 11:41:05 WARN query.ConnectionQueryServicesImpl: Successfully
> restored SYSTEM.CATALOG using snapshot
> SNAPSHOT_SYSTEM.CATALOG_4.7.x_TO_4.8.0_20160831114048-0700
> 16/08/31 11:41:09 WARN query.ConnectionQueryServicesImpl: Successfully
> restored and enabled SYSTEM.CATALOG using snapshot
> SNAPSHOT_SYSTEM.CATALOG_4.7.x_TO_4.8.0_20160831114048-0700
> Error: ERROR 504 (42703): Undefined column. columnName=IS_NAMESPACE_MAPPED
> (state=42703,code=504)
> org.apache.phoenix.schema.ColumnNotFoundException: ERROR 504 (42703):
> Undefined column. columnName=IS_NAMESPACE_MAPPED
> at org.apache.phoenix.schema.PTableImpl.getColumn(PTableImpl.java:693)
> at
> org.apache.phoenix.compile.FromCompiler$SingleTableColumnResolver.resolveColumn(FromCompiler.java:449)
> at
> org.apache.phoenix.compile.UpsertCompiler.compile(UpsertCompiler.java:418)
> at
> org.apache.phoenix.jdbc.PhoenixStatement$ExecutableUpsertStatement.compilePlan(PhoenixStatement.java:590)
> at
> org.apache.phoenix.jdbc.PhoenixStatement$ExecutableUpsertStatement.compilePlan(PhoenixStatement.java:578)
> at
> org.apache.phoenix.jdbc.PhoenixStatement$2.call(PhoenixStatement.java:333)
> at
> org.apache.phoenix.jdbc.PhoenixStatement$2.call(PhoenixStatement.java:328)
> at org.apache.phoenix.call.CallRunner.run(CallRunner.java:53)
> at
> org.apache.phoenix.jdbc.PhoenixStatement.executeMutation(PhoenixStatement.java:326)
> at
> org.apache.phoenix.jdbc.PhoenixStatement.execute(PhoenixStatement.java:247)
> at
> org.apache.phoenix.jdbc.PhoenixPreparedStatement.execute(PhoenixPreparedStatement.java:172)
> at
> org.apache.phoenix.jdbc.PhoenixPreparedStatement.execute(PhoenixPreparedStatement.java:177)
> at
> org.apache.phoenix.schema.MetaDataClient.createTableInternal(MetaDataClient.java:2275)
> at
> org.apache.phoenix.schema.MetaDataClient.createTable(MetaDataClient.java:920)
> at
> org.apache.phoenix.compile.CreateTableCompiler$2.execute(CreateTableCompiler.java:193)
> at
> org.apache.phoenix.jdbc.PhoenixStatement$2.call(PhoenixStatement.java:340)
> at
> org.apache.phoenix.jdbc.PhoenixStatement$2.call(PhoenixStatement.java:328)
> at org.apache.phoenix.call.CallRunner.run(CallRunner.java:53)
> at
> org.apache.phoenix.jdbc.PhoenixStatement.executeMutation(PhoenixStatement.java:326)
> at
> org.apache.phoenix.jdbc.PhoenixStatement.executeUpdate(PhoenixStatement.java:1369)
> at
> org.apache.phoenix.query.ConnectionQueryServicesImpl$13.call(ConnectionQueryServicesImpl.java:2486)
> at
> org.apache.phoenix.query.ConnectionQueryServicesImpl$13.call(ConnectionQueryServicesImpl.java:2282)
> at
> org.apache.phoenix.util.PhoenixContextExecutor.call(PhoenixContextExecutor.java:78)
> at
> org.apache.phoenix.query.ConnectionQueryServicesImpl.init(ConnectionQueryServicesImpl.java:2282)
> at
> org.apache.phoenix.jdbc.PhoenixDriver.getConnectionQueryServices(PhoenixDriver.java:231)
> at
> org.apache.phoenix.jdbc.PhoenixEmbeddedDriver.createConnection(PhoenixEmbeddedDriver.java:144)
> at org.apache.phoenix.jdbc.PhoenixDriver.connect(PhoenixDriver.java:202)
> at sqlline.DatabaseConnection.connect(DatabaseConnection.java:157)
> at sqlline.DatabaseConnection.getConnection(DatabaseConnection.java:203)
> {noformat}
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)