[
https://issues.apache.org/jira/browse/HDDS-7701?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Kohei Sugihara updated HDDS-7701:
---------------------------------
Description:
A key is unavailable after time spent even if we don't delete the keys. We dug
SCM logs and then found that SCM removed a container that consists the key. I
think this is not intentional behavior.
I will attach our logs about ReplicationManager for two cases. In each case,
the key is 6GB and uploaded via multi-part upload using aws-cli. We experienced
the first data loss in Case #1 and overwrote the key to fix it. However, the
key was made unavailable again a month later in Case #2.
h2. Case #1
Key was reported a NO_REPLCIA_FOUND error. Container #41524 was deleted. We had
not use ContainerBalancer at that time.
h3. S3G
{quote}2022-11-16 18:29:40,727 [qtp1869039062-812583] WARN
org.eclipse.jetty.server.HttpChannel: /path/to/file.zip
javax.servlet.ServletException: javax.servlet.ServletException:
java.lang.IllegalArgumentException: NO_REPLICA_FOUND
at
org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:162)
at
org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:127)
at org.eclipse.jetty.server.Server.handle(Server.java:516)
at
org.eclipse.jetty.server.HttpChannel.lambda$handle$1(HttpChannel.java:388)
at org.eclipse.jetty.server.HttpChannel.dispatch(HttpChannel.java:633)
at org.eclipse.jetty.server.HttpChannel.handle(HttpChannel.java:380)
at
org.eclipse.jetty.server.HttpConnection.onFillable(HttpConnection.java:277)
at
org.eclipse.jetty.io.AbstractConnection$ReadCallback.succeeded(AbstractConnection.java:311)
at org.eclipse.jetty.io.FillInterest.fillable(FillInterest.java:105)
at
org.eclipse.jetty.io.ssl.SslConnection$DecryptedEndPoint.onFillable(SslConnection.java:555)
at
org.eclipse.jetty.io.ssl.SslConnection.onFillable(SslConnection.java:410)
at
org.eclipse.jetty.io.ssl.SslConnection$2.succeeded(SslConnection.java:164)
at org.eclipse.jetty.io.FillInterest.fillable(FillInterest.java:105)
at org.eclipse.jetty.io.ChannelEndPoint$1.run(ChannelEndPoint.java:104)
at
org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.runTask(EatWhatYouKill.java:338)
at
org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.doProduce(EatWhatYouKill.java:315)
at
org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.tryProduce(EatWhatYouKill.java:173)
at
org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.run(EatWhatYouKill.java:131)
at
org.eclipse.jetty.util.thread.ReservedThreadExecutor$ReservedThread.run(ReservedThreadExecutor.java:386)
at
org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:883)
at
org.eclipse.jetty.util.thread.QueuedThreadPool$Runner.run(QueuedThreadPool.java:1034)
at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: javax.servlet.ServletException: java.lang.IllegalArgumentException:
NO_REPLICA_FOUND
at
org.glassfish.jersey.servlet.WebComponent.serviceImpl(WebComponent.java:410)
at
org.glassfish.jersey.servlet.WebComponent.service(WebComponent.java:346)
at
org.glassfish.jersey.servlet.ServletContainer.service(ServletContainer.java:366)
at
org.glassfish.jersey.servlet.ServletContainer.service(ServletContainer.java:319)
at
org.glassfish.jersey.servlet.ServletContainer.service(ServletContainer.java:205)
at
org.eclipse.jetty.servlet.ServletHolder$NotAsync.service(ServletHolder.java:1459)
at
org.eclipse.jetty.servlet.ServletHolder.handle(ServletHolder.java:799)
at
org.eclipse.jetty.servlet.ServletHandler$ChainEnd.doFilter(ServletHandler.java:1626)
at
org.apache.hadoop.ozone.s3.RootPageDisplayFilter.doFilter(RootPageDisplayFilter.java:53)
at
org.eclipse.jetty.servlet.FilterHolder.doFilter(FilterHolder.java:193)
at
org.eclipse.jetty.servlet.ServletHandler$Chain.doFilter(ServletHandler.java:1601)
at
org.apache.hadoop.ozone.s3.EmptyContentTypeFilter.doFilter(EmptyContentTypeFilter.java:76)
at
org.eclipse.jetty.servlet.FilterHolder.doFilter(FilterHolder.java:201)
at
org.eclipse.jetty.servlet.ServletHandler$Chain.doFilter(ServletHandler.java:1601)
at
org.apache.hadoop.hdds.server.http.HttpServer2$QuotingInputFilter.doFilter(HttpServer2.java:1681)
at
org.eclipse.jetty.servlet.FilterHolder.doFilter(FilterHolder.java:193)
at
org.eclipse.jetty.servlet.ServletHandler$Chain.doFilter(ServletHandler.java:1601)
at
org.apache.hadoop.hdds.server.http.NoCacheFilter.doFilter(NoCacheFilter.java:48)
at
org.eclipse.jetty.servlet.FilterHolder.doFilter(FilterHolder.java:193)
at
org.eclipse.jetty.servlet.ServletHandler$Chain.doFilter(ServletHandler.java:1601)
at
org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:548)
at
org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:143)
at
org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:602)
at
org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:127)
at
org.eclipse.jetty.server.handler.ScopedHandler.nextHandle(ScopedHandler.java:235)
at
org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:1624)
at
org.eclipse.jetty.server.handler.ScopedHandler.nextHandle(ScopedHandler.java:233)
at
org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1434)
at
org.eclipse.jetty.server.handler.ScopedHandler.nextScope(ScopedHandler.java:188)
at
org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:501)
at
org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:1594)
at
org.eclipse.jetty.server.handler.ScopedHandler.nextScope(ScopedHandler.java:186)
at
org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1349)
at
org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:141)
at
org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:146)
... 21 more
Caused by: java.lang.IllegalArgumentException: NO_REPLICA_FOUND
at
com.google.common.base.Preconditions.checkArgument(Preconditions.java:145)
at
org.apache.hadoop.hdds.scm.XceiverClientManager.acquireClient(XceiverClientManager.java:164)
at
org.apache.hadoop.hdds.scm.XceiverClientManager.acquireClientForReadData(XceiverClientManager.java:157)
at
org.apache.hadoop.hdds.scm.storage.BlockInputStream.acquireClient(BlockInputStream.java:256)
at
org.apache.hadoop.hdds.scm.storage.BlockInputStream.getChunkInfos(BlockInputStream.java:223)
at
org.apache.hadoop.hdds.scm.storage.BlockInputStream.initialize(BlockInputStream.java:146)
at
org.apache.hadoop.hdds.scm.storage.BlockInputStream.readWithStrategy(BlockInputStream.java:283)
at
org.apache.hadoop.hdds.scm.storage.ExtendedInputStream.read(ExtendedInputStream.java:54)
at
org.apache.hadoop.hdds.scm.storage.ByteArrayReader.readFromBlock(ByteArrayReader.java:57)
at
org.apache.hadoop.ozone.client.io.KeyInputStream.readWithStrategy(KeyInputStream.java:273)
at
org.apache.hadoop.ozone.client.io.KeyInputStream.read(KeyInputStream.java:240)
at
org.apache.hadoop.ozone.client.io.OzoneInputStream.read(OzoneInputStream.java:56)
at org.apache.commons.io.IOUtils.copyLarge(IOUtils.java:1384)
at
org.apache.hadoop.ozone.s3.endpoint.ObjectEndpoint.lambda$get$1(ObjectEndpoint.java:344)
at
org.glassfish.jersey.message.internal.StreamingOutputProvider.writeTo(StreamingOutputProvider.java:55)
at
org.glassfish.jersey.message.internal.StreamingOutputProvider.writeTo(StreamingOutputProvider.java:37)
at
org.glassfish.jersey.message.internal.WriterInterceptorExecutor$TerminalWriterInterceptor.invokeWriteTo(WriterInterceptorExecutor.java:242)
at
org.glassfish.jersey.message.internal.WriterInterceptorExecutor$TerminalWriterInterceptor.aroundWriteTo(WriterInterceptorExecutor.java:227)
at
org.glassfish.jersey.message.internal.WriterInterceptorExecutor.proceed(WriterInterceptorExecutor.java:139)
at
org.glassfish.jersey.server.internal.JsonWithPaddingInterceptor.aroundWriteTo(JsonWithPaddingInterceptor.java:85)
at
org.glassfish.jersey.message.internal.WriterInterceptorExecutor.proceed(WriterInterceptorExecutor.java:139)
at
org.glassfish.jersey.server.internal.MappableExceptionWrapperInterceptor.aroundWriteTo(MappableExceptionWrapperInterceptor.java:61)
at
org.glassfish.jersey.message.internal.WriterInterceptorExecutor.proceed(WriterInterceptorExecutor.java:139)
at
org.glassfish.jersey.message.internal.MessageBodyFactory.writeTo(MessageBodyFactory.java:1116)
at
org.glassfish.jersey.server.ServerRuntime$Responder.writeResponse(ServerRuntime.java:635)
at
org.glassfish.jersey.server.ServerRuntime$Responder.processResponse(ServerRuntime.java:373)
at
org.glassfish.jersey.server.ServerRuntime$Responder.process(ServerRuntime.java:363)
at
org.glassfish.jersey.server.ServerRuntime$1.run(ServerRuntime.java:258)
at org.glassfish.jersey.internal.Errors$1.call(Errors.java:248)
at org.glassfish.jersey.internal.Errors$1.call(Errors.java:244)
at org.glassfish.jersey.internal.Errors.process(Errors.java:292)
at org.glassfish.jersey.internal.Errors.process(Errors.java:274)
at org.glassfish.jersey.internal.Errors.process(Errors.java:244)
at
org.glassfish.jersey.process.internal.RequestScope.runInScope(RequestScope.java:265)
at
org.glassfish.jersey.server.ServerRuntime.process(ServerRuntime.java:234)
at
org.glassfish.jersey.server.ApplicationHandler.handle(ApplicationHandler.java:680)
at
org.glassfish.jersey.servlet.WebComponent.serviceImpl(WebComponent.java:394)
... 55 more
{quote}
h3. ReplicationManager @ SCM
{quote}2022-11-15 16:30:16,680
[EventQueue-CloseContainerForCloseContainerEventHandler] INFO
org.apache.hadoop.hdds.scm.container.CloseContainerEventHandler: Close
container Event triggered for container : #41524
2022-11-15 16:30:37,073
[EventQueue-CloseContainerForCloseContainerEventHandler] INFO
org.apache.hadoop.hdds.scm.container.CloseContainerEventHandler: Close
container Event triggered for container : #41524
2022-11-15 16:30:37,279
[EventQueue-CloseContainerForCloseContainerEventHandler] INFO
org.apache.hadoop.hdds.scm.container.CloseContainerEventHandler: Close
container Event triggered for container : #41524
2022-11-15 16:30:41,806
[EventQueue-CloseContainerForCloseContainerEventHandler] INFO
org.apache.hadoop.hdds.scm.container.CloseContainerEventHandler: Close
container Event triggered for container : #41524
2022-11-15 16:30:41,815 [FixedThreadPoolWithAffinityExecutor-8-0] INFO
org.apache.hadoop.hdds.scm.container.IncrementalContainerReportHandler: Moving
container #41524 to CLOSED state, datanode DN35-UUID\{<snip>} reported CLOSED
replica.
2022-11-16 09:18:55,963 [ReplicationMonitor] INFO
org.apache.hadoop.hdds.scm.container.replication.LegacyReplicationManager:
Sending delete container command for container #41524 to datanode
DN21-UUID\{<snip>}
2022-11-16 09:18:55,963 [ReplicationMonitor] INFO
org.apache.hadoop.hdds.scm.container.replication.LegacyReplicationManager:
Sending delete container command for container #41524 to datanode
DN35-UUID\{<snip>}
2022-11-16 09:18:55,963 [ReplicationMonitor] INFO
org.apache.hadoop.hdds.scm.container.replication.LegacyReplicationManager:
Sending delete container command for container #41524 to datanode
DN18-UUID\{<snip>}
{quote}
h3. sh key info & admin container info
SCM knows containers that construct the key, but actually, ReplicationManager
removed one of the containers.
{quote}% ozone sh key info /path/to/file.zip > key-info.log
% jq '.ozoneKeyLocations[].containerID' < key-info.log | sort | uniq >
containers.list
% cat containers.list | xargs echo
41196 41198 41200 41201 41204 41205 41206 41207 41208 41209 41210 41211 41212
41213 41216 41217 41218 41219 41220 41222 41224 41225 41226 41228 41229 41230
41231 41232 41233 41234 41235 41236 41237 41238 41239 41241 41242 41243 41244
41245 41246 41247 41248 41249 41250 41251 41252 41253 41254 41255 41256 41257
41258 41259 41260 41261 41263 41264 41265 41266 41267 41268 41269 41270 41272
41273 41275 41276 41277 41278 41279 41280 41281 41282 41283 41284 41285 41287
41288 41289 41290 41291 41292 41295 41296 41297 41298 41299 41300 41301 41303
41304 41305 41306 41307 41308 41309 41311 41312 41313 41314 41315 41316 41317
41318 41321 41322 41323 41324 41325 41326 41327 41328 41329 41330 41331 41332
41333 41334 41335 41336 41337 41338 41339 41340 41341 41342 41343 41344 41345
41346 41347 41348 41349 41350 41351 41352 41353 41355 41356 41357 41359 41360
41361 41362 41363 41364 41365 41366 41367 41368 41369 41370 41371 41372 41373
41374 41375 41376 41377 41378 41379 41380 41381 41382 41383 41384 41385 41386
41387 41388 41389 41390 41391 41392 41393 41394 41395 41396 41397 41398 41399
41400 41401 41402 41403 41404 41405 41406 41407 41408 41409 41410 41411 41412
41413 41414 41415 41416 41417 41418 41419 41420 41421 41422 41423 41424 41425
41426 41427 41428 41429 41431 41432 41433 41434 41435 41436 41437 41438 41439
41440 41442 41443 41446 41448 41449 41450 41451 41452 41453 41454 41455 41456
41457 41458 41459 41460 41461 41463 41464 41465 41466 41467 41469 41470 41471
41473 41474 41475 41476 41478 41479 41480 41481 41483 41484 41486 41487 41488
41489 41490 41491 41492 41493 41494 41495 41496 41497 41498 41499 41500 41501
41504 41505 41506 41508 41509 41510 41511 41512 41513 41514 41515 41516 41518
41519 41521 41522 41523 {color:#de350b}*41524*{color} 41525 41526 41527 41528
41529 41530 41531 41532 41533 41534 41535 41536 41537 41538 41539 41540 41541
41543 41545 41546 41547 41548 41549 41550 41551 41552 41553 41554 41555 41556
41557 41558 41559 41560 41561 41562 41564 41565 41567 41568 41569 41570 41571
41572 41573 41574 41577 41578 41579 41580 41581 41582 41583 41584 41585 41586
41587 41588 41589 41590 41591 41592 41593 41594 41595 41596 41597 41598 41599
41600 41601 41602 41604 41605 41606 41607 41608 41609 41610 41611
% ozone admin container info 41524
Container id: 41524
Pipeline id: 7a478e5f-d948-4520-9147-323299d4d10a
Container State: *DELETED*
Datanodes: []
Replicas: []
{quote}
h2. Case #2
After re-uploading the file to the same path and a month later, Container
#55905 was deleted. ContainerBalancer is running but I don't make sure that
this is relevant. Ozone CLI and S3G logs are the same like the above.
{quote}2022-12-21 17:21:45,797 [ContainerBalancerTask] INFO
org.apache.hadoop.hdds.scm.container.balancer.ContainerBalancerTask:
ContainerBalancer matched source datanode DN19-UUID with target datanode
DN14-UUID for container move.
2022-12-21 17:21:45,797 [ContainerBalancerTask] INFO
org.apache.hadoop.hdds.scm.container.balancer.ContainerBalancerTask:
ContainerBalancer is trying to move container #55905 with size 247113087B from
source datanode DN19-UUID to target datanode DN14-UUID
2022-12-21 17:21:45,797 [ContainerBalancerTask] INFO
org.apache.hadoop.hdds.scm.container.replication.LegacyReplicationManager:
Sending replicate container command for container #55905 to datanode
DN14-UUID\{<snip>}]
2022-12-21 17:21:45,797 [ContainerBalancerTask] INFO
org.apache.hadoop.hdds.scm.container.replication.LegacyReplicationManager:
receive a move request about container #55905 , from DN19-UUID to DN14-UUID
2022-12-21 17:53:31,409 [ReplicationMonitor] WARN
org.apache.hadoop.hdds.scm.container.balancer.ContainerBalancerTask: Container
move for container #55905 from source DN19-UUID to target DN14-UUID failed:
REPLICATION_FAIL_TIME_OUT
...
2022-12-21 18:23:34,070 [ReplicationMonitor] INFO
org.apache.hadoop.hdds.scm.container.replication.LegacyReplicationManager:
Container #55905 is over replicated. Expected replica count is 3, but found 4.
2022-12-21 18:23:34,070 [ReplicationMonitor] INFO
org.apache.hadoop.hdds.scm.container.replication.LegacyReplicationManager:
Sending delete container command for container #55905 to datanode
DN9-UUID\{<snip>}
...
2022-12-21 18:38:35,024 [ReplicationMonitor] INFO
org.apache.hadoop.hdds.scm.container.replication.LegacyReplicationManager:
Sending delete container command for container #55905 to datanode
DN8-UUID\{<snip>}
2022-12-21 18:38:35,025 [ReplicationMonitor] INFO
org.apache.hadoop.hdds.scm.container.replication.LegacyReplicationManager:
Sending delete container command for container #55905 to datanode
DN14-UUID\{<snip>}
2022-12-21 18:38:35,025 [ReplicationMonitor] INFO
org.apache.hadoop.hdds.scm.container.replication.LegacyReplicationManager:
Sending delete container command for container #55905 to datanode
DN19-UUID\{<snip>}
{quote}
was:
A key is unavailable after time spent even if we don't delete the keys. We dug
SCM logs and then found that SCM removed a container that consists the key. I
think this is not intentional behavior.
I will attach our logs about ReplicationManager for two cases. In each case,
the key is 6GB and uploaded via multi-part upload using aws-cli. We experienced
the first data loss in Case #1 and overwrote the key to fix it. However, the
key was made unavailable again a month later in Case #2.
h2. Case #1
Key was reported a NO_REPLCIA_FOUND error. Container #41524 was deleted. We had
not use ContainerBalancer at that time.
h3. S3G
{quote}2022-11-16 18:29:40,727 [qtp1869039062-812583] WARN
org.eclipse.jetty.server.HttpChannel: /path/to/file.zip
javax.servlet.ServletException: javax.servlet.ServletException:
java.lang.IllegalArgumentException: NO_REPLICA_FOUND
at
org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:162)
at
org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:127)
at org.eclipse.jetty.server.Server.handle(Server.java:516)
at
org.eclipse.jetty.server.HttpChannel.lambda$handle$1(HttpChannel.java:388)
at org.eclipse.jetty.server.HttpChannel.dispatch(HttpChannel.java:633)
at org.eclipse.jetty.server.HttpChannel.handle(HttpChannel.java:380)
at
org.eclipse.jetty.server.HttpConnection.onFillable(HttpConnection.java:277)
at
org.eclipse.jetty.io.AbstractConnection$ReadCallback.succeeded(AbstractConnection.java:311)
at org.eclipse.jetty.io.FillInterest.fillable(FillInterest.java:105)
at
org.eclipse.jetty.io.ssl.SslConnection$DecryptedEndPoint.onFillable(SslConnection.java:555)
at
org.eclipse.jetty.io.ssl.SslConnection.onFillable(SslConnection.java:410)
at
org.eclipse.jetty.io.ssl.SslConnection$2.succeeded(SslConnection.java:164)
at org.eclipse.jetty.io.FillInterest.fillable(FillInterest.java:105)
at org.eclipse.jetty.io.ChannelEndPoint$1.run(ChannelEndPoint.java:104)
at
org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.runTask(EatWhatYouKill.java:338)
at
org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.doProduce(EatWhatYouKill.java:315)
at
org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.tryProduce(EatWhatYouKill.java:173)
at
org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.run(EatWhatYouKill.java:131)
at
org.eclipse.jetty.util.thread.ReservedThreadExecutor$ReservedThread.run(ReservedThreadExecutor.java:386)
at
org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:883)
at
org.eclipse.jetty.util.thread.QueuedThreadPool$Runner.run(QueuedThreadPool.java:1034)
at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: javax.servlet.ServletException: java.lang.IllegalArgumentException:
NO_REPLICA_FOUND
at
org.glassfish.jersey.servlet.WebComponent.serviceImpl(WebComponent.java:410)
at
org.glassfish.jersey.servlet.WebComponent.service(WebComponent.java:346)
at
org.glassfish.jersey.servlet.ServletContainer.service(ServletContainer.java:366)
at
org.glassfish.jersey.servlet.ServletContainer.service(ServletContainer.java:319)
at
org.glassfish.jersey.servlet.ServletContainer.service(ServletContainer.java:205)
at
org.eclipse.jetty.servlet.ServletHolder$NotAsync.service(ServletHolder.java:1459)
at
org.eclipse.jetty.servlet.ServletHolder.handle(ServletHolder.java:799)
at
org.eclipse.jetty.servlet.ServletHandler$ChainEnd.doFilter(ServletHandler.java:1626)
at
org.apache.hadoop.ozone.s3.RootPageDisplayFilter.doFilter(RootPageDisplayFilter.java:53)
at
org.eclipse.jetty.servlet.FilterHolder.doFilter(FilterHolder.java:193)
at
org.eclipse.jetty.servlet.ServletHandler$Chain.doFilter(ServletHandler.java:1601)
at
org.apache.hadoop.ozone.s3.EmptyContentTypeFilter.doFilter(EmptyContentTypeFilter.java:76)
at
org.eclipse.jetty.servlet.FilterHolder.doFilter(FilterHolder.java:201)
at
org.eclipse.jetty.servlet.ServletHandler$Chain.doFilter(ServletHandler.java:1601)
at
org.apache.hadoop.hdds.server.http.HttpServer2$QuotingInputFilter.doFilter(HttpServer2.java:1681)
at
org.eclipse.jetty.servlet.FilterHolder.doFilter(FilterHolder.java:193)
at
org.eclipse.jetty.servlet.ServletHandler$Chain.doFilter(ServletHandler.java:1601)
at
org.apache.hadoop.hdds.server.http.NoCacheFilter.doFilter(NoCacheFilter.java:48)
at
org.eclipse.jetty.servlet.FilterHolder.doFilter(FilterHolder.java:193)
at
org.eclipse.jetty.servlet.ServletHandler$Chain.doFilter(ServletHandler.java:1601)
at
org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:548)
at
org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:143)
at
org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:602)
at
org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:127)
at
org.eclipse.jetty.server.handler.ScopedHandler.nextHandle(ScopedHandler.java:235)
at
org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:1624)
at
org.eclipse.jetty.server.handler.ScopedHandler.nextHandle(ScopedHandler.java:233)
at
org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1434)
at
org.eclipse.jetty.server.handler.ScopedHandler.nextScope(ScopedHandler.java:188)
at
org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:501)
at
org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:1594)
at
org.eclipse.jetty.server.handler.ScopedHandler.nextScope(ScopedHandler.java:186)
at
org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1349)
at
org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:141)
at
org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:146)
... 21 more
Caused by: java.lang.IllegalArgumentException: NO_REPLICA_FOUND
at
com.google.common.base.Preconditions.checkArgument(Preconditions.java:145)
at
org.apache.hadoop.hdds.scm.XceiverClientManager.acquireClient(XceiverClientManager.java:164)
at
org.apache.hadoop.hdds.scm.XceiverClientManager.acquireClientForReadData(XceiverClientManager.java:157)
at
org.apache.hadoop.hdds.scm.storage.BlockInputStream.acquireClient(BlockInputStream.java:256)
at
org.apache.hadoop.hdds.scm.storage.BlockInputStream.getChunkInfos(BlockInputStream.java:223)
at
org.apache.hadoop.hdds.scm.storage.BlockInputStream.initialize(BlockInputStream.java:146)
at
org.apache.hadoop.hdds.scm.storage.BlockInputStream.readWithStrategy(BlockInputStream.java:283)
at
org.apache.hadoop.hdds.scm.storage.ExtendedInputStream.read(ExtendedInputStream.java:54)
at
org.apache.hadoop.hdds.scm.storage.ByteArrayReader.readFromBlock(ByteArrayReader.java:57)
at
org.apache.hadoop.ozone.client.io.KeyInputStream.readWithStrategy(KeyInputStream.java:273)
at
org.apache.hadoop.ozone.client.io.KeyInputStream.read(KeyInputStream.java:240)
at
org.apache.hadoop.ozone.client.io.OzoneInputStream.read(OzoneInputStream.java:56)
at org.apache.commons.io.IOUtils.copyLarge(IOUtils.java:1384)
at
org.apache.hadoop.ozone.s3.endpoint.ObjectEndpoint.lambda$get$1(ObjectEndpoint.java:344)
at
org.glassfish.jersey.message.internal.StreamingOutputProvider.writeTo(StreamingOutputProvider.java:55)
at
org.glassfish.jersey.message.internal.StreamingOutputProvider.writeTo(StreamingOutputProvider.java:37)
at
org.glassfish.jersey.message.internal.WriterInterceptorExecutor$TerminalWriterInterceptor.invokeWriteTo(WriterInterceptorExecutor.java:242)
at
org.glassfish.jersey.message.internal.WriterInterceptorExecutor$TerminalWriterInterceptor.aroundWriteTo(WriterInterceptorExecutor.java:227)
at
org.glassfish.jersey.message.internal.WriterInterceptorExecutor.proceed(WriterInterceptorExecutor.java:139)
at
org.glassfish.jersey.server.internal.JsonWithPaddingInterceptor.aroundWriteTo(JsonWithPaddingInterceptor.java:85)
at
org.glassfish.jersey.message.internal.WriterInterceptorExecutor.proceed(WriterInterceptorExecutor.java:139)
at
org.glassfish.jersey.server.internal.MappableExceptionWrapperInterceptor.aroundWriteTo(MappableExceptionWrapperInterceptor.java:61)
at
org.glassfish.jersey.message.internal.WriterInterceptorExecutor.proceed(WriterInterceptorExecutor.java:139)
at
org.glassfish.jersey.message.internal.MessageBodyFactory.writeTo(MessageBodyFactory.java:1116)
at
org.glassfish.jersey.server.ServerRuntime$Responder.writeResponse(ServerRuntime.java:635)
at
org.glassfish.jersey.server.ServerRuntime$Responder.processResponse(ServerRuntime.java:373)
at
org.glassfish.jersey.server.ServerRuntime$Responder.process(ServerRuntime.java:363)
at
org.glassfish.jersey.server.ServerRuntime$1.run(ServerRuntime.java:258)
at org.glassfish.jersey.internal.Errors$1.call(Errors.java:248)
at org.glassfish.jersey.internal.Errors$1.call(Errors.java:244)
at org.glassfish.jersey.internal.Errors.process(Errors.java:292)
at org.glassfish.jersey.internal.Errors.process(Errors.java:274)
at org.glassfish.jersey.internal.Errors.process(Errors.java:244)
at
org.glassfish.jersey.process.internal.RequestScope.runInScope(RequestScope.java:265)
at
org.glassfish.jersey.server.ServerRuntime.process(ServerRuntime.java:234)
at
org.glassfish.jersey.server.ApplicationHandler.handle(ApplicationHandler.java:680)
at
org.glassfish.jersey.servlet.WebComponent.serviceImpl(WebComponent.java:394)
... 55 more
{quote}
h3. ReplicationManager @ S3G
{quote}2022-11-15 16:30:16,680
[EventQueue-CloseContainerForCloseContainerEventHandler] INFO
org.apache.hadoop.hdds.scm.container.CloseContainerEventHandler: Close
container Event triggered for container : #41524
2022-11-15 16:30:37,073
[EventQueue-CloseContainerForCloseContainerEventHandler] INFO
org.apache.hadoop.hdds.scm.container.CloseContainerEventHandler: Close
container Event triggered for container : #41524
2022-11-15 16:30:37,279
[EventQueue-CloseContainerForCloseContainerEventHandler] INFO
org.apache.hadoop.hdds.scm.container.CloseContainerEventHandler: Close
container Event triggered for container : #41524
2022-11-15 16:30:41,806
[EventQueue-CloseContainerForCloseContainerEventHandler] INFO
org.apache.hadoop.hdds.scm.container.CloseContainerEventHandler: Close
container Event triggered for container : #41524
2022-11-15 16:30:41,815 [FixedThreadPoolWithAffinityExecutor-8-0] INFO
org.apache.hadoop.hdds.scm.container.IncrementalContainerReportHandler: Moving
container #41524 to CLOSED state, datanode DN35-UUID\{<snip>} reported CLOSED
replica.
2022-11-16 09:18:55,963 [ReplicationMonitor] INFO
org.apache.hadoop.hdds.scm.container.replication.LegacyReplicationManager:
Sending delete container command for container #41524 to datanode
DN21-UUID\{<snip>}
2022-11-16 09:18:55,963 [ReplicationMonitor] INFO
org.apache.hadoop.hdds.scm.container.replication.LegacyReplicationManager:
Sending delete container command for container #41524 to datanode
DN35-UUID\{<snip>}
2022-11-16 09:18:55,963 [ReplicationMonitor] INFO
org.apache.hadoop.hdds.scm.container.replication.LegacyReplicationManager:
Sending delete container command for container #41524 to datanode
DN18-UUID\{<snip>}
{quote}
h3. sh key info & admin container info
SCM knows containers that construct the key, but actually, ReplicationManager
removed one of the containers.
{quote}% ozone sh key info /path/to/file.zip > key-info.log
% jq '.ozoneKeyLocations[].containerID' < key-info.log | sort | uniq >
containers.list
% cat containers.list | xargs echo
41196 41198 41200 41201 41204 41205 41206 41207 41208 41209 41210 41211 41212
41213 41216 41217 41218 41219 41220 41222 41224 41225 41226 41228 41229 41230
41231 41232 41233 41234 41235 41236 41237 41238 41239 41241 41242 41243 41244
41245 41246 41247 41248 41249 41250 41251 41252 41253 41254 41255 41256 41257
41258 41259 41260 41261 41263 41264 41265 41266 41267 41268 41269 41270 41272
41273 41275 41276 41277 41278 41279 41280 41281 41282 41283 41284 41285 41287
41288 41289 41290 41291 41292 41295 41296 41297 41298 41299 41300 41301 41303
41304 41305 41306 41307 41308 41309 41311 41312 41313 41314 41315 41316 41317
41318 41321 41322 41323 41324 41325 41326 41327 41328 41329 41330 41331 41332
41333 41334 41335 41336 41337 41338 41339 41340 41341 41342 41343 41344 41345
41346 41347 41348 41349 41350 41351 41352 41353 41355 41356 41357 41359 41360
41361 41362 41363 41364 41365 41366 41367 41368 41369 41370 41371 41372 41373
41374 41375 41376 41377 41378 41379 41380 41381 41382 41383 41384 41385 41386
41387 41388 41389 41390 41391 41392 41393 41394 41395 41396 41397 41398 41399
41400 41401 41402 41403 41404 41405 41406 41407 41408 41409 41410 41411 41412
41413 41414 41415 41416 41417 41418 41419 41420 41421 41422 41423 41424 41425
41426 41427 41428 41429 41431 41432 41433 41434 41435 41436 41437 41438 41439
41440 41442 41443 41446 41448 41449 41450 41451 41452 41453 41454 41455 41456
41457 41458 41459 41460 41461 41463 41464 41465 41466 41467 41469 41470 41471
41473 41474 41475 41476 41478 41479 41480 41481 41483 41484 41486 41487 41488
41489 41490 41491 41492 41493 41494 41495 41496 41497 41498 41499 41500 41501
41504 41505 41506 41508 41509 41510 41511 41512 41513 41514 41515 41516 41518
41519 41521 41522 41523 {color:#de350b}*41524*{color} 41525 41526 41527 41528
41529 41530 41531 41532 41533 41534 41535 41536 41537 41538 41539 41540 41541
41543 41545 41546 41547 41548 41549 41550 41551 41552 41553 41554 41555 41556
41557 41558 41559 41560 41561 41562 41564 41565 41567 41568 41569 41570 41571
41572 41573 41574 41577 41578 41579 41580 41581 41582 41583 41584 41585 41586
41587 41588 41589 41590 41591 41592 41593 41594 41595 41596 41597 41598 41599
41600 41601 41602 41604 41605 41606 41607 41608 41609 41610 41611
% ozone admin container info 41524
Container id: 41524
Pipeline id: 7a478e5f-d948-4520-9147-323299d4d10a
Container State: *DELETED*
Datanodes: []
Replicas: []
{quote}
h2. Case #2
After re-uploading the file to the same path and a month later, Container
#55905 was deleted. ContainerBalancer is running but I don't make sure that
this is relevant. Ozone CLI and S3G logs are the same like the above.
{quote}2022-12-21 17:21:45,797 [ContainerBalancerTask] INFO
org.apache.hadoop.hdds.scm.container.balancer.ContainerBalancerTask:
ContainerBalancer matched source datanode DN19-UUID with target datanode
DN14-UUID for container move.
2022-12-21 17:21:45,797 [ContainerBalancerTask] INFO
org.apache.hadoop.hdds.scm.container.balancer.ContainerBalancerTask:
ContainerBalancer is trying to move container #55905 with size 247113087B from
source datanode DN19-UUID to target datanode DN14-UUID
2022-12-21 17:21:45,797 [ContainerBalancerTask] INFO
org.apache.hadoop.hdds.scm.container.replication.LegacyReplicationManager:
Sending replicate container command for container #55905 to datanode
DN14-UUID\{<snip>}]
2022-12-21 17:21:45,797 [ContainerBalancerTask] INFO
org.apache.hadoop.hdds.scm.container.replication.LegacyReplicationManager:
receive a move request about container #55905 , from DN19-UUID to DN14-UUID
2022-12-21 17:53:31,409 [ReplicationMonitor] WARN
org.apache.hadoop.hdds.scm.container.balancer.ContainerBalancerTask: Container
move for container #55905 from source DN19-UUID to target DN14-UUID failed:
REPLICATION_FAIL_TIME_OUT
...
2022-12-21 18:23:34,070 [ReplicationMonitor] INFO
org.apache.hadoop.hdds.scm.container.replication.LegacyReplicationManager:
Container #55905 is over replicated. Expected replica count is 3, but found 4.
2022-12-21 18:23:34,070 [ReplicationMonitor] INFO
org.apache.hadoop.hdds.scm.container.replication.LegacyReplicationManager:
Sending delete container command for container #55905 to datanode
DN9-UUID\{<snip>}
...
2022-12-21 18:38:35,024 [ReplicationMonitor] INFO
org.apache.hadoop.hdds.scm.container.replication.LegacyReplicationManager:
Sending delete container command for container #55905 to datanode
DN8-UUID\{<snip>}
2022-12-21 18:38:35,025 [ReplicationMonitor] INFO
org.apache.hadoop.hdds.scm.container.replication.LegacyReplicationManager:
Sending delete container command for container #55905 to datanode
DN14-UUID\{<snip>}
2022-12-21 18:38:35,025 [ReplicationMonitor] INFO
org.apache.hadoop.hdds.scm.container.replication.LegacyReplicationManager:
Sending delete container command for container #55905 to datanode
DN19-UUID\{<snip>}
{quote}
> Data loss by deleting containers having non-deleted keys
> --------------------------------------------------------
>
> Key: HDDS-7701
> URL: https://issues.apache.org/jira/browse/HDDS-7701
> Project: Apache Ozone
> Issue Type: Bug
> Components: DN, OM, SCM
> Affects Versions: 1.3.0
> Environment: Version: Our forked version based ozone-1.3.0 branch.
> commit hash is
> [https://github.com/apache/ozone/commit/9c61a8aa497ab96c014ad3bb7b1ee4f731ebfaf8]
> FSO-enabled bucket
> Reporter: Kohei Sugihara
> Priority: Critical
>
> A key is unavailable after time spent even if we don't delete the keys. We
> dug SCM logs and then found that SCM removed a container that consists the
> key. I think this is not intentional behavior.
> I will attach our logs about ReplicationManager for two cases. In each case,
> the key is 6GB and uploaded via multi-part upload using aws-cli. We
> experienced the first data loss in Case #1 and overwrote the key to fix it.
> However, the key was made unavailable again a month later in Case #2.
> h2. Case #1
> Key was reported a NO_REPLCIA_FOUND error. Container #41524 was deleted. We
> had not use ContainerBalancer at that time.
> h3. S3G
> {quote}2022-11-16 18:29:40,727 [qtp1869039062-812583] WARN
> org.eclipse.jetty.server.HttpChannel: /path/to/file.zip
> javax.servlet.ServletException: javax.servlet.ServletException:
> java.lang.IllegalArgumentException: NO_REPLICA_FOUND
> at
> org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:162)
> at
> org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:127)
> at org.eclipse.jetty.server.Server.handle(Server.java:516)
> at
> org.eclipse.jetty.server.HttpChannel.lambda$handle$1(HttpChannel.java:388)
> at org.eclipse.jetty.server.HttpChannel.dispatch(HttpChannel.java:633)
> at org.eclipse.jetty.server.HttpChannel.handle(HttpChannel.java:380)
> at
> org.eclipse.jetty.server.HttpConnection.onFillable(HttpConnection.java:277)
> at
> org.eclipse.jetty.io.AbstractConnection$ReadCallback.succeeded(AbstractConnection.java:311)
> at org.eclipse.jetty.io.FillInterest.fillable(FillInterest.java:105)
> at
> org.eclipse.jetty.io.ssl.SslConnection$DecryptedEndPoint.onFillable(SslConnection.java:555)
> at
> org.eclipse.jetty.io.ssl.SslConnection.onFillable(SslConnection.java:410)
> at
> org.eclipse.jetty.io.ssl.SslConnection$2.succeeded(SslConnection.java:164)
> at org.eclipse.jetty.io.FillInterest.fillable(FillInterest.java:105)
> at
> org.eclipse.jetty.io.ChannelEndPoint$1.run(ChannelEndPoint.java:104)
> at
> org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.runTask(EatWhatYouKill.java:338)
> at
> org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.doProduce(EatWhatYouKill.java:315)
> at
> org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.tryProduce(EatWhatYouKill.java:173)
> at
> org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.run(EatWhatYouKill.java:131)
> at
> org.eclipse.jetty.util.thread.ReservedThreadExecutor$ReservedThread.run(ReservedThreadExecutor.java:386)
> at
> org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:883)
> at
> org.eclipse.jetty.util.thread.QueuedThreadPool$Runner.run(QueuedThreadPool.java:1034)
> at java.base/java.lang.Thread.run(Thread.java:829)
> Caused by: javax.servlet.ServletException:
> java.lang.IllegalArgumentException: NO_REPLICA_FOUND
> at
> org.glassfish.jersey.servlet.WebComponent.serviceImpl(WebComponent.java:410)
> at
> org.glassfish.jersey.servlet.WebComponent.service(WebComponent.java:346)
> at
> org.glassfish.jersey.servlet.ServletContainer.service(ServletContainer.java:366)
> at
> org.glassfish.jersey.servlet.ServletContainer.service(ServletContainer.java:319)
> at
> org.glassfish.jersey.servlet.ServletContainer.service(ServletContainer.java:205)
> at
> org.eclipse.jetty.servlet.ServletHolder$NotAsync.service(ServletHolder.java:1459)
> at
> org.eclipse.jetty.servlet.ServletHolder.handle(ServletHolder.java:799)
> at
> org.eclipse.jetty.servlet.ServletHandler$ChainEnd.doFilter(ServletHandler.java:1626)
> at
> org.apache.hadoop.ozone.s3.RootPageDisplayFilter.doFilter(RootPageDisplayFilter.java:53)
> at
> org.eclipse.jetty.servlet.FilterHolder.doFilter(FilterHolder.java:193)
> at
> org.eclipse.jetty.servlet.ServletHandler$Chain.doFilter(ServletHandler.java:1601)
> at
> org.apache.hadoop.ozone.s3.EmptyContentTypeFilter.doFilter(EmptyContentTypeFilter.java:76)
> at
> org.eclipse.jetty.servlet.FilterHolder.doFilter(FilterHolder.java:201)
> at
> org.eclipse.jetty.servlet.ServletHandler$Chain.doFilter(ServletHandler.java:1601)
> at
> org.apache.hadoop.hdds.server.http.HttpServer2$QuotingInputFilter.doFilter(HttpServer2.java:1681)
> at
> org.eclipse.jetty.servlet.FilterHolder.doFilter(FilterHolder.java:193)
> at
> org.eclipse.jetty.servlet.ServletHandler$Chain.doFilter(ServletHandler.java:1601)
> at
> org.apache.hadoop.hdds.server.http.NoCacheFilter.doFilter(NoCacheFilter.java:48)
> at
> org.eclipse.jetty.servlet.FilterHolder.doFilter(FilterHolder.java:193)
> at
> org.eclipse.jetty.servlet.ServletHandler$Chain.doFilter(ServletHandler.java:1601)
> at
> org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:548)
> at
> org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:143)
> at
> org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:602)
> at
> org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:127)
> at
> org.eclipse.jetty.server.handler.ScopedHandler.nextHandle(ScopedHandler.java:235)
> at
> org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:1624)
> at
> org.eclipse.jetty.server.handler.ScopedHandler.nextHandle(ScopedHandler.java:233)
> at
> org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1434)
> at
> org.eclipse.jetty.server.handler.ScopedHandler.nextScope(ScopedHandler.java:188)
> at
> org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:501)
> at
> org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:1594)
> at
> org.eclipse.jetty.server.handler.ScopedHandler.nextScope(ScopedHandler.java:186)
> at
> org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1349)
> at
> org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:141)
> at
> org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:146)
> ... 21 more
> Caused by: java.lang.IllegalArgumentException: NO_REPLICA_FOUND
> at
> com.google.common.base.Preconditions.checkArgument(Preconditions.java:145)
> at
> org.apache.hadoop.hdds.scm.XceiverClientManager.acquireClient(XceiverClientManager.java:164)
> at
> org.apache.hadoop.hdds.scm.XceiverClientManager.acquireClientForReadData(XceiverClientManager.java:157)
> at
> org.apache.hadoop.hdds.scm.storage.BlockInputStream.acquireClient(BlockInputStream.java:256)
> at
> org.apache.hadoop.hdds.scm.storage.BlockInputStream.getChunkInfos(BlockInputStream.java:223)
> at
> org.apache.hadoop.hdds.scm.storage.BlockInputStream.initialize(BlockInputStream.java:146)
> at
> org.apache.hadoop.hdds.scm.storage.BlockInputStream.readWithStrategy(BlockInputStream.java:283)
> at
> org.apache.hadoop.hdds.scm.storage.ExtendedInputStream.read(ExtendedInputStream.java:54)
> at
> org.apache.hadoop.hdds.scm.storage.ByteArrayReader.readFromBlock(ByteArrayReader.java:57)
> at
> org.apache.hadoop.ozone.client.io.KeyInputStream.readWithStrategy(KeyInputStream.java:273)
> at
> org.apache.hadoop.ozone.client.io.KeyInputStream.read(KeyInputStream.java:240)
> at
> org.apache.hadoop.ozone.client.io.OzoneInputStream.read(OzoneInputStream.java:56)
> at org.apache.commons.io.IOUtils.copyLarge(IOUtils.java:1384)
> at
> org.apache.hadoop.ozone.s3.endpoint.ObjectEndpoint.lambda$get$1(ObjectEndpoint.java:344)
> at
> org.glassfish.jersey.message.internal.StreamingOutputProvider.writeTo(StreamingOutputProvider.java:55)
> at
> org.glassfish.jersey.message.internal.StreamingOutputProvider.writeTo(StreamingOutputProvider.java:37)
> at
> org.glassfish.jersey.message.internal.WriterInterceptorExecutor$TerminalWriterInterceptor.invokeWriteTo(WriterInterceptorExecutor.java:242)
> at
> org.glassfish.jersey.message.internal.WriterInterceptorExecutor$TerminalWriterInterceptor.aroundWriteTo(WriterInterceptorExecutor.java:227)
> at
> org.glassfish.jersey.message.internal.WriterInterceptorExecutor.proceed(WriterInterceptorExecutor.java:139)
> at
> org.glassfish.jersey.server.internal.JsonWithPaddingInterceptor.aroundWriteTo(JsonWithPaddingInterceptor.java:85)
> at
> org.glassfish.jersey.message.internal.WriterInterceptorExecutor.proceed(WriterInterceptorExecutor.java:139)
> at
> org.glassfish.jersey.server.internal.MappableExceptionWrapperInterceptor.aroundWriteTo(MappableExceptionWrapperInterceptor.java:61)
> at
> org.glassfish.jersey.message.internal.WriterInterceptorExecutor.proceed(WriterInterceptorExecutor.java:139)
> at
> org.glassfish.jersey.message.internal.MessageBodyFactory.writeTo(MessageBodyFactory.java:1116)
> at
> org.glassfish.jersey.server.ServerRuntime$Responder.writeResponse(ServerRuntime.java:635)
> at
> org.glassfish.jersey.server.ServerRuntime$Responder.processResponse(ServerRuntime.java:373)
> at
> org.glassfish.jersey.server.ServerRuntime$Responder.process(ServerRuntime.java:363)
> at
> org.glassfish.jersey.server.ServerRuntime$1.run(ServerRuntime.java:258)
> at org.glassfish.jersey.internal.Errors$1.call(Errors.java:248)
> at org.glassfish.jersey.internal.Errors$1.call(Errors.java:244)
> at org.glassfish.jersey.internal.Errors.process(Errors.java:292)
> at org.glassfish.jersey.internal.Errors.process(Errors.java:274)
> at org.glassfish.jersey.internal.Errors.process(Errors.java:244)
> at
> org.glassfish.jersey.process.internal.RequestScope.runInScope(RequestScope.java:265)
> at
> org.glassfish.jersey.server.ServerRuntime.process(ServerRuntime.java:234)
> at
> org.glassfish.jersey.server.ApplicationHandler.handle(ApplicationHandler.java:680)
> at
> org.glassfish.jersey.servlet.WebComponent.serviceImpl(WebComponent.java:394)
> ... 55 more
> {quote}
> h3. ReplicationManager @ SCM
> {quote}2022-11-15 16:30:16,680
> [EventQueue-CloseContainerForCloseContainerEventHandler] INFO
> org.apache.hadoop.hdds.scm.container.CloseContainerEventHandler: Close
> container Event triggered for container : #41524
> 2022-11-15 16:30:37,073
> [EventQueue-CloseContainerForCloseContainerEventHandler] INFO
> org.apache.hadoop.hdds.scm.container.CloseContainerEventHandler: Close
> container Event triggered for container : #41524
> 2022-11-15 16:30:37,279
> [EventQueue-CloseContainerForCloseContainerEventHandler] INFO
> org.apache.hadoop.hdds.scm.container.CloseContainerEventHandler: Close
> container Event triggered for container : #41524
> 2022-11-15 16:30:41,806
> [EventQueue-CloseContainerForCloseContainerEventHandler] INFO
> org.apache.hadoop.hdds.scm.container.CloseContainerEventHandler: Close
> container Event triggered for container : #41524
> 2022-11-15 16:30:41,815 [FixedThreadPoolWithAffinityExecutor-8-0] INFO
> org.apache.hadoop.hdds.scm.container.IncrementalContainerReportHandler:
> Moving container #41524 to CLOSED state, datanode DN35-UUID\{<snip>} reported
> CLOSED replica.
> 2022-11-16 09:18:55,963 [ReplicationMonitor] INFO
> org.apache.hadoop.hdds.scm.container.replication.LegacyReplicationManager:
> Sending delete container command for container #41524 to datanode
> DN21-UUID\{<snip>}
> 2022-11-16 09:18:55,963 [ReplicationMonitor] INFO
> org.apache.hadoop.hdds.scm.container.replication.LegacyReplicationManager:
> Sending delete container command for container #41524 to datanode
> DN35-UUID\{<snip>}
> 2022-11-16 09:18:55,963 [ReplicationMonitor] INFO
> org.apache.hadoop.hdds.scm.container.replication.LegacyReplicationManager:
> Sending delete container command for container #41524 to datanode
> DN18-UUID\{<snip>}
> {quote}
> h3. sh key info & admin container info
> SCM knows containers that construct the key, but actually, ReplicationManager
> removed one of the containers.
> {quote}% ozone sh key info /path/to/file.zip > key-info.log
> % jq '.ozoneKeyLocations[].containerID' < key-info.log | sort | uniq >
> containers.list
> % cat containers.list | xargs echo
> 41196 41198 41200 41201 41204 41205 41206 41207 41208 41209 41210 41211 41212
> 41213 41216 41217 41218 41219 41220 41222 41224 41225 41226 41228 41229 41230
> 41231 41232 41233 41234 41235 41236 41237 41238 41239 41241 41242 41243 41244
> 41245 41246 41247 41248 41249 41250 41251 41252 41253 41254 41255 41256 41257
> 41258 41259 41260 41261 41263 41264 41265 41266 41267 41268 41269 41270 41272
> 41273 41275 41276 41277 41278 41279 41280 41281 41282 41283 41284 41285 41287
> 41288 41289 41290 41291 41292 41295 41296 41297 41298 41299 41300 41301 41303
> 41304 41305 41306 41307 41308 41309 41311 41312 41313 41314 41315 41316 41317
> 41318 41321 41322 41323 41324 41325 41326 41327 41328 41329 41330 41331 41332
> 41333 41334 41335 41336 41337 41338 41339 41340 41341 41342 41343 41344 41345
> 41346 41347 41348 41349 41350 41351 41352 41353 41355 41356 41357 41359 41360
> 41361 41362 41363 41364 41365 41366 41367 41368 41369 41370 41371 41372 41373
> 41374 41375 41376 41377 41378 41379 41380 41381 41382 41383 41384 41385 41386
> 41387 41388 41389 41390 41391 41392 41393 41394 41395 41396 41397 41398 41399
> 41400 41401 41402 41403 41404 41405 41406 41407 41408 41409 41410 41411 41412
> 41413 41414 41415 41416 41417 41418 41419 41420 41421 41422 41423 41424 41425
> 41426 41427 41428 41429 41431 41432 41433 41434 41435 41436 41437 41438 41439
> 41440 41442 41443 41446 41448 41449 41450 41451 41452 41453 41454 41455 41456
> 41457 41458 41459 41460 41461 41463 41464 41465 41466 41467 41469 41470 41471
> 41473 41474 41475 41476 41478 41479 41480 41481 41483 41484 41486 41487 41488
> 41489 41490 41491 41492 41493 41494 41495 41496 41497 41498 41499 41500 41501
> 41504 41505 41506 41508 41509 41510 41511 41512 41513 41514 41515 41516 41518
> 41519 41521 41522 41523 {color:#de350b}*41524*{color} 41525 41526 41527 41528
> 41529 41530 41531 41532 41533 41534 41535 41536 41537 41538 41539 41540 41541
> 41543 41545 41546 41547 41548 41549 41550 41551 41552 41553 41554 41555 41556
> 41557 41558 41559 41560 41561 41562 41564 41565 41567 41568 41569 41570 41571
> 41572 41573 41574 41577 41578 41579 41580 41581 41582 41583 41584 41585 41586
> 41587 41588 41589 41590 41591 41592 41593 41594 41595 41596 41597 41598 41599
> 41600 41601 41602 41604 41605 41606 41607 41608 41609 41610 41611
> % ozone admin container info 41524
> Container id: 41524
> Pipeline id: 7a478e5f-d948-4520-9147-323299d4d10a
> Container State: *DELETED*
> Datanodes: []
> Replicas: []
> {quote}
> h2. Case #2
> After re-uploading the file to the same path and a month later, Container
> #55905 was deleted. ContainerBalancer is running but I don't make sure that
> this is relevant. Ozone CLI and S3G logs are the same like the above.
> {quote}2022-12-21 17:21:45,797 [ContainerBalancerTask] INFO
> org.apache.hadoop.hdds.scm.container.balancer.ContainerBalancerTask:
> ContainerBalancer matched source datanode DN19-UUID with target datanode
> DN14-UUID for container move.
> 2022-12-21 17:21:45,797 [ContainerBalancerTask] INFO
> org.apache.hadoop.hdds.scm.container.balancer.ContainerBalancerTask:
> ContainerBalancer is trying to move container #55905 with size 247113087B
> from source datanode DN19-UUID to target datanode DN14-UUID
> 2022-12-21 17:21:45,797 [ContainerBalancerTask] INFO
> org.apache.hadoop.hdds.scm.container.replication.LegacyReplicationManager:
> Sending replicate container command for container #55905 to datanode
> DN14-UUID\{<snip>}]
> 2022-12-21 17:21:45,797 [ContainerBalancerTask] INFO
> org.apache.hadoop.hdds.scm.container.replication.LegacyReplicationManager:
> receive a move request about container #55905 , from DN19-UUID to DN14-UUID
> 2022-12-21 17:53:31,409 [ReplicationMonitor] WARN
> org.apache.hadoop.hdds.scm.container.balancer.ContainerBalancerTask:
> Container move for container #55905 from source DN19-UUID to target DN14-UUID
> failed: REPLICATION_FAIL_TIME_OUT
> ...
> 2022-12-21 18:23:34,070 [ReplicationMonitor] INFO
> org.apache.hadoop.hdds.scm.container.replication.LegacyReplicationManager:
> Container #55905 is over replicated. Expected replica count is 3, but found 4.
> 2022-12-21 18:23:34,070 [ReplicationMonitor] INFO
> org.apache.hadoop.hdds.scm.container.replication.LegacyReplicationManager:
> Sending delete container command for container #55905 to datanode
> DN9-UUID\{<snip>}
> ...
> 2022-12-21 18:38:35,024 [ReplicationMonitor] INFO
> org.apache.hadoop.hdds.scm.container.replication.LegacyReplicationManager:
> Sending delete container command for container #55905 to datanode
> DN8-UUID\{<snip>}
> 2022-12-21 18:38:35,025 [ReplicationMonitor] INFO
> org.apache.hadoop.hdds.scm.container.replication.LegacyReplicationManager:
> Sending delete container command for container #55905 to datanode
> DN14-UUID\{<snip>}
> 2022-12-21 18:38:35,025 [ReplicationMonitor] INFO
> org.apache.hadoop.hdds.scm.container.replication.LegacyReplicationManager:
> Sending delete container command for container #55905 to datanode
> DN19-UUID\{<snip>}
> {quote}
>
--
This message was sent by Atlassian Jira
(v8.20.10#820010)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]