[ https://issues.apache.org/jira/browse/HDDS-10137?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Ivan Andika updated HDDS-10137: ------------------------------- Description: To test 1.4.0-rc1, I rebuild a new Ozone cluster to do some testing. After I initialized and started SCM and OM, I started Recon. However, Recon was not able to download the OM DB Snapshot At first there is a NPE encountered when downloading OM DB Snapshot Recon's log {code:java} 2024-01-17 10:56:29,324 [Recon-SyncOM-0] ERROR org.apache.hadoop.ozone.recon.spi.impl.OzoneManagerServiceProviderImpl: Unable to obtain Ozone Manager DB Snapshot. java.io.IOException: Server returned HTTP response code: 500 for URL: http://<redacted>/dbCheckpoint at sun.net.www.protocol.http.HttpURLConnection.getInputStream0(HttpURLConnection.java:1900) at sun.net.www.protocol.http.HttpURLConnection.getInputStream(HttpURLConnection.java:1498) at org.apache.hadoop.ozone.recon.spi.impl.OzoneManagerServiceProviderImpl.lambda$getOzoneManagerDBSnapshot$1(OzoneManagerServiceProviderImpl.java:358) at java.security.AccessController.doPrivileged(Native Method) at javax.security.auth.Subject.doAs(Subject.java:422) at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1899) at org.apache.hadoop.security.SecurityUtil.doAsUser(SecurityUtil.java:551) at org.apache.hadoop.security.SecurityUtil.doAsLoginUser(SecurityUtil.java:531) at org.apache.hadoop.ozone.recon.spi.impl.OzoneManagerServiceProviderImpl.getOzoneManagerDBSnapshot(OzoneManagerServiceProviderImpl.java:355) at org.apache.hadoop.ozone.recon.spi.impl.OzoneManagerServiceProviderImpl.updateReconOmDBWithNewSnapshot(OzoneManagerServiceProviderImpl.java:387) at org.apache.hadoop.ozone.recon.spi.impl.OzoneManagerServiceProviderImpl.syncDataFromOM(OzoneManagerServiceProviderImpl.java:539) at org.apache.hadoop.ozone.recon.spi.impl.OzoneManagerServiceProviderImpl.lambda$startSyncDataFromOM$0(OzoneManagerServiceProviderImpl.java:267) at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) at java.util.concurrent.FutureTask.runAndReset(FutureTask.java:308) at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$301(ScheduledThreadPoolExecutor.java:180) at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:294) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) at java.lang.Thread.run(Thread.java:748) 2024-01-17 10:56:29,325 [Recon-SyncOM-0] ERROR org.apache.hadoop.ozone.recon.spi.impl.OzoneManagerServiceProviderImpl: Null snapshot location got from OM.{code} OM leader's log (This seems to be because SstFilteringService might be null) {code:java} 2024-01-17 10:56:29,321 [qtp3981395-71] ERROR org.apache.hadoop.hdds.utils.DBCheckpointServlet: Unable to process metadata snapshot request. java.lang.NullPointerException at org.apache.hadoop.ozone.om.OMDBCheckpointServlet$Lock.lock(OMDBCheckpointServlet.java:667) at org.apache.hadoop.hdds.utils.DBCheckpointServlet.generateSnapshotCheckpoint(DBCheckpointServlet.java:197) at org.apache.hadoop.hdds.utils.DBCheckpointServlet.doGet(DBCheckpointServlet.java:303) at javax.servlet.http.HttpServlet.service(HttpServlet.java:687) at javax.servlet.http.HttpServlet.service(HttpServlet.java:790) at org.eclipse.jetty.servlet.ServletHolder.handle(ServletHolder.java:799) at org.eclipse.jetty.servlet.ServletHandler$ChainEnd.doFilter(ServletHandler.java:1656) at org.apache.hadoop.http.lib.StaticUserWebFilter$StaticUserFilter.doFilter(StaticUserWebFilter.java:110) at org.eclipse.jetty.servlet.FilterHolder.doFilter(FilterHolder.java:193) at org.eclipse.jetty.servlet.ServletHandler$Chain.doFilter(ServletHandler.java:1626) at org.apache.hadoop.hdds.server.http.HttpServer2$QuotingInputFilter.doFilter(HttpServer2.java:1665) at org.eclipse.jetty.servlet.FilterHolder.doFilter(FilterHolder.java:193) at org.eclipse.jetty.servlet.ServletHandler$Chain.doFilter(ServletHandler.java:1626) at org.apache.hadoop.hdds.server.http.NoCacheFilter.doFilter(NoCacheFilter.java:48) at org.eclipse.jetty.servlet.FilterHolder.doFilter(FilterHolder.java:193) at org.eclipse.jetty.servlet.ServletHandler$Chain.doFilter(ServletHandler.java:1626) at org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:552) at org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:143) at org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:600) at org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:127) at org.eclipse.jetty.server.handler.ScopedHandler.nextHandle(ScopedHandler.java:235) at org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:1624) at org.eclipse.jetty.server.handler.ScopedHandler.nextHandle(ScopedHandler.java:233) at org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1440) at org.eclipse.jetty.server.handler.ScopedHandler.nextScope(ScopedHandler.java:188) at org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:505) at org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:1594) at org.eclipse.jetty.server.handler.ScopedHandler.nextScope(ScopedHandler.java:186) at org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1355) at org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:141) at org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:146) at org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:127) at org.eclipse.jetty.server.Server.handle(Server.java:516) at org.eclipse.jetty.server.HttpChannel.lambda$handle$1(HttpChannel.java:487) at org.eclipse.jetty.server.HttpChannel.dispatch(HttpChannel.java:732) at org.eclipse.jetty.server.HttpChannel.handle(HttpChannel.java:479) at org.eclipse.jetty.server.HttpConnection.onFillable(HttpConnection.java:277) at org.eclipse.jetty.io.AbstractConnection$ReadCallback.succeeded(AbstractConnection.java:311) at org.eclipse.jetty.io.FillInterest.fillable(FillInterest.java:105) at org.eclipse.jetty.io.ChannelEndPoint$1.run(ChannelEndPoint.java:104) at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.runTask(EatWhatYouKill.java:338) at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.doProduce(EatWhatYouKill.java:315) at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.tryProduce(EatWhatYouKill.java:173) at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.produce(EatWhatYouKill.java:137) at org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:883) at org.eclipse.jetty.util.thread.QueuedThreadPool$Runner.run(QueuedThreadPool.java:1034) at java.lang.Thread.run(Thread.java:748) {code} Afterwards, Recon encountered "Read timed out" exception every time it tries to download the OM DB Snapshot Recon side {code:java} 2024-01-17 11:04:04,419 [Recon-SyncOM-0] ERROR org.apache.hadoop.ozone.recon.spi.impl.OzoneManagerServiceProviderImpl: Unable to obtain Ozone Manager DB Snapshot. java.net.SocketTimeoutException: Read timed out at java.net.SocketInputStream.socketRead0(Native Method) at java.net.SocketInputStream.socketRead(SocketInputStream.java:116) at java.net.SocketInputStream.read(SocketInputStream.java:171) at java.net.SocketInputStream.read(SocketInputStream.java:141) at java.io.BufferedInputStream.fill(BufferedInputStream.java:246) at java.io.BufferedInputStream.read1(BufferedInputStream.java:286) at java.io.BufferedInputStream.read(BufferedInputStream.java:345) at sun.net.www.http.HttpClient.parseHTTPHeader(HttpClient.java:735) at sun.net.www.http.HttpClient.parseHTTP(HttpClient.java:678) at sun.net.www.protocol.http.HttpURLConnection.getInputStream0(HttpURLConnection.java:1593) at sun.net.www.protocol.http.HttpURLConnection.getInputStream(HttpURLConnection.java:1498) at org.apache.hadoop.ozone.recon.spi.impl.OzoneManagerServiceProviderImpl.lambda$getOzoneManagerDBSnapshot$1(OzoneManagerServiceProviderImpl.java:358) at java.security.AccessController.doPrivileged(Native Method) at javax.security.auth.Subject.doAs(Subject.java:422) at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1899) at org.apache.hadoop.security.SecurityUtil.doAsUser(SecurityUtil.java:551) at org.apache.hadoop.security.SecurityUtil.doAsLoginUser(SecurityUtil.java:531) at org.apache.hadoop.ozone.recon.spi.impl.OzoneManagerServiceProviderImpl.getOzoneManagerDBSnapshot(OzoneManagerServiceProviderImpl.java:355) at org.apache.hadoop.ozone.recon.spi.impl.OzoneManagerServiceProviderImpl.updateReconOmDBWithNewSnapshot(OzoneManagerServiceProviderImpl.java:387) at org.apache.hadoop.ozone.recon.spi.impl.OzoneManagerServiceProviderImpl.syncDataFromOM(OzoneManagerServiceProviderImpl.java:539) at org.apache.hadoop.ozone.recon.spi.impl.OzoneManagerServiceProviderImpl.lambda$startSyncDataFromOM$0(OzoneManagerServiceProviderImpl.java:267) at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) at java.util.concurrent.FutureTask.runAndReset(FutureTask.java:308) at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$301(ScheduledThreadPoolExecutor.java:180) at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:294) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) at java.lang.Thread.run(Thread.java:748) {code} On OM side {code:java} 2024-01-17 10:57:29,333 [qtp3981395-72] INFO org.apache.hadoop.hdds.utils.DBCheckpointServlet: Received GET request to obtain DB checkpoint snapshot 2024-01-17 10:58:36,098 [qtp3981395-73] INFO org.apache.hadoop.hdds.utils.DBCheckpointServlet: Received GET request to obtain DB checkpoint snapshot 2024-01-17 10:59:39,357 [qtp3981395-74] INFO org.apache.hadoop.hdds.utils.DBCheckpointServlet: Received GET request to obtain DB checkpoint snapshot 2024-01-17 11:00:44,365 [qtp3981395-75] INFO org.apache.hadoop.hdds.utils.DBCheckpointServlet: Received GET request to obtain DB checkpoint snapshot 2024-01-17 11:01:49,393 [qtp3981395-77] INFO org.apache.hadoop.hdds.utils.DBCheckpointServlet: Received GET request to obtain DB checkpoint snapshot 2024-01-17 11:02:54,405 [qtp3981395-78] INFO org.apache.hadoop.hdds.utils.DBCheckpointServlet: Received GET request to obtain DB checkpoint snapshot 2024-01-17 11:03:59,416 [qtp3981395-79] INFO org.apache.hadoop.hdds.utils.DBCheckpointServlet: Received GET request to obtain DB checkpoint snapshot 2024-01-17 11:05:04,428 [qtp3981395-80] INFO org.apache.hadoop.hdds.utils.DBCheckpointServlet: Received GET request to obtain DB checkpoint snapshot 2024-01-17 11:06:09,439 [qtp3981395-68] INFO org.apache.hadoop.hdds.utils.DBCheckpointServlet: Received GET request to obtain DB checkpoint snapshot 2024-01-17 11:07:14,450 [qtp3981395-245] INFO org.apache.hadoop.hdds.utils.DBCheckpointServlet: Received GET request to obtain DB checkpoint snapshot 2024-01-17 11:08:19,462 [qtp3981395-246] INFO org.apache.hadoop.hdds.utils.DBCheckpointServlet: Received GET request to obtain DB checkpoint snapshot 2024-01-17 11:09:24,468 [qtp3981395-225] INFO org.apache.hadoop.hdds.utils.DBCheckpointServlet: Received GET request to obtain DB checkpoint snapshot 2024-01-17 11:10:29,482 [qtp3981395-227] INFO org.apache.hadoop.hdds.utils.DBCheckpointServlet: Received GET request to obtain DB checkpoint snapshot 2024-01-17 11:11:34,495 [qtp3981395-232] INFO org.apache.hadoop.hdds.utils.DBCheckpointServlet: Received GET request to obtain DB checkpoint snapshot 2024-01-17 11:12:39,508 [qtp3981395-233] INFO org.apache.hadoop.hdds.utils.DBCheckpointServlet: Received GET request to obtain DB checkpoint snapshot{code} I will put this as blocker for now. Please let me know if I miss anything. cc: [~Symious] [~Sammi] was: To test 1.4.0-rc1, I rebuild a new Ozone cluster to do some testing. After I initialized and started SCM and OM, I started Recon. However, Recon was not able to download the OM DB Snapshot At first there is a NPE encountered when downloading OM DB Snapshot Recon's log {code:java} 2024-01-17 10:56:29,324 [Recon-SyncOM-0] ERROR org.apache.hadoop.ozone.recon.spi.impl.OzoneManagerServiceProviderImpl: Unable to obtain Ozone Manager DB Snapshot. java.io.IOException: Server returned HTTP response code: 500 for URL: http://<redacted>/dbCheckpoint at sun.net.www.protocol.http.HttpURLConnection.getInputStream0(HttpURLConnection.java:1900) at sun.net.www.protocol.http.HttpURLConnection.getInputStream(HttpURLConnection.java:1498) at org.apache.hadoop.ozone.recon.spi.impl.OzoneManagerServiceProviderImpl.lambda$getOzoneManagerDBSnapshot$1(OzoneManagerServiceProviderImpl.java:358) at java.security.AccessController.doPrivileged(Native Method) at javax.security.auth.Subject.doAs(Subject.java:422) at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1899) at org.apache.hadoop.security.SecurityUtil.doAsUser(SecurityUtil.java:551) at org.apache.hadoop.security.SecurityUtil.doAsLoginUser(SecurityUtil.java:531) at org.apache.hadoop.ozone.recon.spi.impl.OzoneManagerServiceProviderImpl.getOzoneManagerDBSnapshot(OzoneManagerServiceProviderImpl.java:355) at org.apache.hadoop.ozone.recon.spi.impl.OzoneManagerServiceProviderImpl.updateReconOmDBWithNewSnapshot(OzoneManagerServiceProviderImpl.java:387) at org.apache.hadoop.ozone.recon.spi.impl.OzoneManagerServiceProviderImpl.syncDataFromOM(OzoneManagerServiceProviderImpl.java:539) at org.apache.hadoop.ozone.recon.spi.impl.OzoneManagerServiceProviderImpl.lambda$startSyncDataFromOM$0(OzoneManagerServiceProviderImpl.java:267) at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) at java.util.concurrent.FutureTask.runAndReset(FutureTask.java:308) at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$301(ScheduledThreadPoolExecutor.java:180) at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:294) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) at java.lang.Thread.run(Thread.java:748) 2024-01-17 10:56:29,325 [Recon-SyncOM-0] ERROR org.apache.hadoop.ozone.recon.spi.impl.OzoneManagerServiceProviderImpl: Null snapshot location got from OM.{code} OM leader's log {code:java} 2024-01-17 10:56:29,321 [qtp3981395-71] ERROR org.apache.hadoop.hdds.utils.DBCheckpointServlet: Unable to process metadata snapshot request. java.lang.NullPointerException at org.apache.hadoop.ozone.om.OMDBCheckpointServlet$Lock.lock(OMDBCheckpointServlet.java:667) at org.apache.hadoop.hdds.utils.DBCheckpointServlet.generateSnapshotCheckpoint(DBCheckpointServlet.java:197) at org.apache.hadoop.hdds.utils.DBCheckpointServlet.doGet(DBCheckpointServlet.java:303) at javax.servlet.http.HttpServlet.service(HttpServlet.java:687) at javax.servlet.http.HttpServlet.service(HttpServlet.java:790) at org.eclipse.jetty.servlet.ServletHolder.handle(ServletHolder.java:799) at org.eclipse.jetty.servlet.ServletHandler$ChainEnd.doFilter(ServletHandler.java:1656) at org.apache.hadoop.http.lib.StaticUserWebFilter$StaticUserFilter.doFilter(StaticUserWebFilter.java:110) at org.eclipse.jetty.servlet.FilterHolder.doFilter(FilterHolder.java:193) at org.eclipse.jetty.servlet.ServletHandler$Chain.doFilter(ServletHandler.java:1626) at org.apache.hadoop.hdds.server.http.HttpServer2$QuotingInputFilter.doFilter(HttpServer2.java:1665) at org.eclipse.jetty.servlet.FilterHolder.doFilter(FilterHolder.java:193) at org.eclipse.jetty.servlet.ServletHandler$Chain.doFilter(ServletHandler.java:1626) at org.apache.hadoop.hdds.server.http.NoCacheFilter.doFilter(NoCacheFilter.java:48) at org.eclipse.jetty.servlet.FilterHolder.doFilter(FilterHolder.java:193) at org.eclipse.jetty.servlet.ServletHandler$Chain.doFilter(ServletHandler.java:1626) at org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:552) at org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:143) at org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:600) at org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:127) at org.eclipse.jetty.server.handler.ScopedHandler.nextHandle(ScopedHandler.java:235) at org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:1624) at org.eclipse.jetty.server.handler.ScopedHandler.nextHandle(ScopedHandler.java:233) at org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1440) at org.eclipse.jetty.server.handler.ScopedHandler.nextScope(ScopedHandler.java:188) at org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:505) at org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:1594) at org.eclipse.jetty.server.handler.ScopedHandler.nextScope(ScopedHandler.java:186) at org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1355) at org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:141) at org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:146) at org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:127) at org.eclipse.jetty.server.Server.handle(Server.java:516) at org.eclipse.jetty.server.HttpChannel.lambda$handle$1(HttpChannel.java:487) at org.eclipse.jetty.server.HttpChannel.dispatch(HttpChannel.java:732) at org.eclipse.jetty.server.HttpChannel.handle(HttpChannel.java:479) at org.eclipse.jetty.server.HttpConnection.onFillable(HttpConnection.java:277) at org.eclipse.jetty.io.AbstractConnection$ReadCallback.succeeded(AbstractConnection.java:311) at org.eclipse.jetty.io.FillInterest.fillable(FillInterest.java:105) at org.eclipse.jetty.io.ChannelEndPoint$1.run(ChannelEndPoint.java:104) at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.runTask(EatWhatYouKill.java:338) at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.doProduce(EatWhatYouKill.java:315) at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.tryProduce(EatWhatYouKill.java:173) at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.produce(EatWhatYouKill.java:137) at org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:883) at org.eclipse.jetty.util.thread.QueuedThreadPool$Runner.run(QueuedThreadPool.java:1034) at java.lang.Thread.run(Thread.java:748) {code} Afterwards, Recon encountered "Read timed out" exception every time it tries to download the OM DB Snapshot Recon side {code:java} 2024-01-17 11:04:04,419 [Recon-SyncOM-0] ERROR org.apache.hadoop.ozone.recon.spi.impl.OzoneManagerServiceProviderImpl: Unable to obtain Ozone Manager DB Snapshot. java.net.SocketTimeoutException: Read timed out at java.net.SocketInputStream.socketRead0(Native Method) at java.net.SocketInputStream.socketRead(SocketInputStream.java:116) at java.net.SocketInputStream.read(SocketInputStream.java:171) at java.net.SocketInputStream.read(SocketInputStream.java:141) at java.io.BufferedInputStream.fill(BufferedInputStream.java:246) at java.io.BufferedInputStream.read1(BufferedInputStream.java:286) at java.io.BufferedInputStream.read(BufferedInputStream.java:345) at sun.net.www.http.HttpClient.parseHTTPHeader(HttpClient.java:735) at sun.net.www.http.HttpClient.parseHTTP(HttpClient.java:678) at sun.net.www.protocol.http.HttpURLConnection.getInputStream0(HttpURLConnection.java:1593) at sun.net.www.protocol.http.HttpURLConnection.getInputStream(HttpURLConnection.java:1498) at org.apache.hadoop.ozone.recon.spi.impl.OzoneManagerServiceProviderImpl.lambda$getOzoneManagerDBSnapshot$1(OzoneManagerServiceProviderImpl.java:358) at java.security.AccessController.doPrivileged(Native Method) at javax.security.auth.Subject.doAs(Subject.java:422) at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1899) at org.apache.hadoop.security.SecurityUtil.doAsUser(SecurityUtil.java:551) at org.apache.hadoop.security.SecurityUtil.doAsLoginUser(SecurityUtil.java:531) at org.apache.hadoop.ozone.recon.spi.impl.OzoneManagerServiceProviderImpl.getOzoneManagerDBSnapshot(OzoneManagerServiceProviderImpl.java:355) at org.apache.hadoop.ozone.recon.spi.impl.OzoneManagerServiceProviderImpl.updateReconOmDBWithNewSnapshot(OzoneManagerServiceProviderImpl.java:387) at org.apache.hadoop.ozone.recon.spi.impl.OzoneManagerServiceProviderImpl.syncDataFromOM(OzoneManagerServiceProviderImpl.java:539) at org.apache.hadoop.ozone.recon.spi.impl.OzoneManagerServiceProviderImpl.lambda$startSyncDataFromOM$0(OzoneManagerServiceProviderImpl.java:267) at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) at java.util.concurrent.FutureTask.runAndReset(FutureTask.java:308) at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$301(ScheduledThreadPoolExecutor.java:180) at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:294) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) at java.lang.Thread.run(Thread.java:748) {code} On OM side {code:java} 2024-01-17 10:57:29,333 [qtp3981395-72] INFO org.apache.hadoop.hdds.utils.DBCheckpointServlet: Received GET request to obtain DB checkpoint snapshot 2024-01-17 10:58:36,098 [qtp3981395-73] INFO org.apache.hadoop.hdds.utils.DBCheckpointServlet: Received GET request to obtain DB checkpoint snapshot 2024-01-17 10:59:39,357 [qtp3981395-74] INFO org.apache.hadoop.hdds.utils.DBCheckpointServlet: Received GET request to obtain DB checkpoint snapshot 2024-01-17 11:00:44,365 [qtp3981395-75] INFO org.apache.hadoop.hdds.utils.DBCheckpointServlet: Received GET request to obtain DB checkpoint snapshot 2024-01-17 11:01:49,393 [qtp3981395-77] INFO org.apache.hadoop.hdds.utils.DBCheckpointServlet: Received GET request to obtain DB checkpoint snapshot 2024-01-17 11:02:54,405 [qtp3981395-78] INFO org.apache.hadoop.hdds.utils.DBCheckpointServlet: Received GET request to obtain DB checkpoint snapshot 2024-01-17 11:03:59,416 [qtp3981395-79] INFO org.apache.hadoop.hdds.utils.DBCheckpointServlet: Received GET request to obtain DB checkpoint snapshot 2024-01-17 11:05:04,428 [qtp3981395-80] INFO org.apache.hadoop.hdds.utils.DBCheckpointServlet: Received GET request to obtain DB checkpoint snapshot 2024-01-17 11:06:09,439 [qtp3981395-68] INFO org.apache.hadoop.hdds.utils.DBCheckpointServlet: Received GET request to obtain DB checkpoint snapshot 2024-01-17 11:07:14,450 [qtp3981395-245] INFO org.apache.hadoop.hdds.utils.DBCheckpointServlet: Received GET request to obtain DB checkpoint snapshot 2024-01-17 11:08:19,462 [qtp3981395-246] INFO org.apache.hadoop.hdds.utils.DBCheckpointServlet: Received GET request to obtain DB checkpoint snapshot 2024-01-17 11:09:24,468 [qtp3981395-225] INFO org.apache.hadoop.hdds.utils.DBCheckpointServlet: Received GET request to obtain DB checkpoint snapshot 2024-01-17 11:10:29,482 [qtp3981395-227] INFO org.apache.hadoop.hdds.utils.DBCheckpointServlet: Received GET request to obtain DB checkpoint snapshot 2024-01-17 11:11:34,495 [qtp3981395-232] INFO org.apache.hadoop.hdds.utils.DBCheckpointServlet: Received GET request to obtain DB checkpoint snapshot 2024-01-17 11:12:39,508 [qtp3981395-233] INFO org.apache.hadoop.hdds.utils.DBCheckpointServlet: Received GET request to obtain DB checkpoint snapshot{code} I will put this as blocker for now. Please let me know if I miss anything. cc: [~Symious] [~Sammi] > OM DB snapshot unable to be downloaded > -------------------------------------- > > Key: HDDS-10137 > URL: https://issues.apache.org/jira/browse/HDDS-10137 > Project: Apache Ozone > Issue Type: Bug > Components: OM, Ozone Recon > Affects Versions: 1.4.0 > Reporter: Ivan Andika > Priority: Blocker > > To test 1.4.0-rc1, I rebuild a new Ozone cluster to do some testing. > After I initialized and started SCM and OM, I started Recon. However, Recon > was not able to download the OM DB Snapshot > At first there is a NPE encountered when downloading OM DB Snapshot > Recon's log > {code:java} > 2024-01-17 10:56:29,324 [Recon-SyncOM-0] ERROR > org.apache.hadoop.ozone.recon.spi.impl.OzoneManagerServiceProviderImpl: > Unable to obtain Ozone Manager DB Snapshot. > java.io.IOException: Server returned HTTP response code: 500 for URL: > http://<redacted>/dbCheckpoint > at > sun.net.www.protocol.http.HttpURLConnection.getInputStream0(HttpURLConnection.java:1900) > at > sun.net.www.protocol.http.HttpURLConnection.getInputStream(HttpURLConnection.java:1498) > at > org.apache.hadoop.ozone.recon.spi.impl.OzoneManagerServiceProviderImpl.lambda$getOzoneManagerDBSnapshot$1(OzoneManagerServiceProviderImpl.java:358) > at java.security.AccessController.doPrivileged(Native Method) > at javax.security.auth.Subject.doAs(Subject.java:422) > at > org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1899) > at > org.apache.hadoop.security.SecurityUtil.doAsUser(SecurityUtil.java:551) > at > org.apache.hadoop.security.SecurityUtil.doAsLoginUser(SecurityUtil.java:531) > at > org.apache.hadoop.ozone.recon.spi.impl.OzoneManagerServiceProviderImpl.getOzoneManagerDBSnapshot(OzoneManagerServiceProviderImpl.java:355) > at > org.apache.hadoop.ozone.recon.spi.impl.OzoneManagerServiceProviderImpl.updateReconOmDBWithNewSnapshot(OzoneManagerServiceProviderImpl.java:387) > at > org.apache.hadoop.ozone.recon.spi.impl.OzoneManagerServiceProviderImpl.syncDataFromOM(OzoneManagerServiceProviderImpl.java:539) > at > org.apache.hadoop.ozone.recon.spi.impl.OzoneManagerServiceProviderImpl.lambda$startSyncDataFromOM$0(OzoneManagerServiceProviderImpl.java:267) > at > java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) > at java.util.concurrent.FutureTask.runAndReset(FutureTask.java:308) > at > java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$301(ScheduledThreadPoolExecutor.java:180) > at > java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:294) > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) > at java.lang.Thread.run(Thread.java:748) > 2024-01-17 10:56:29,325 [Recon-SyncOM-0] ERROR > org.apache.hadoop.ozone.recon.spi.impl.OzoneManagerServiceProviderImpl: Null > snapshot location got from OM.{code} > OM leader's log (This seems to be because SstFilteringService might be null) > {code:java} > 2024-01-17 10:56:29,321 [qtp3981395-71] ERROR > org.apache.hadoop.hdds.utils.DBCheckpointServlet: Unable to process metadata > snapshot request. > java.lang.NullPointerException > at > org.apache.hadoop.ozone.om.OMDBCheckpointServlet$Lock.lock(OMDBCheckpointServlet.java:667) > at > org.apache.hadoop.hdds.utils.DBCheckpointServlet.generateSnapshotCheckpoint(DBCheckpointServlet.java:197) > at > org.apache.hadoop.hdds.utils.DBCheckpointServlet.doGet(DBCheckpointServlet.java:303) > at javax.servlet.http.HttpServlet.service(HttpServlet.java:687) > at javax.servlet.http.HttpServlet.service(HttpServlet.java:790) > at org.eclipse.jetty.servlet.ServletHolder.handle(ServletHolder.java:799) > at > org.eclipse.jetty.servlet.ServletHandler$ChainEnd.doFilter(ServletHandler.java:1656) > at > org.apache.hadoop.http.lib.StaticUserWebFilter$StaticUserFilter.doFilter(StaticUserWebFilter.java:110) > at org.eclipse.jetty.servlet.FilterHolder.doFilter(FilterHolder.java:193) > at > org.eclipse.jetty.servlet.ServletHandler$Chain.doFilter(ServletHandler.java:1626) > at > org.apache.hadoop.hdds.server.http.HttpServer2$QuotingInputFilter.doFilter(HttpServer2.java:1665) > at org.eclipse.jetty.servlet.FilterHolder.doFilter(FilterHolder.java:193) > at > org.eclipse.jetty.servlet.ServletHandler$Chain.doFilter(ServletHandler.java:1626) > at > org.apache.hadoop.hdds.server.http.NoCacheFilter.doFilter(NoCacheFilter.java:48) > at org.eclipse.jetty.servlet.FilterHolder.doFilter(FilterHolder.java:193) > at > org.eclipse.jetty.servlet.ServletHandler$Chain.doFilter(ServletHandler.java:1626) > at > org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:552) > at > org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:143) > at > org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:600) > at > org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:127) > at > org.eclipse.jetty.server.handler.ScopedHandler.nextHandle(ScopedHandler.java:235) > at > org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:1624) > at > org.eclipse.jetty.server.handler.ScopedHandler.nextHandle(ScopedHandler.java:233) > at > org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1440) > at > org.eclipse.jetty.server.handler.ScopedHandler.nextScope(ScopedHandler.java:188) > at > org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:505) > at > org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:1594) > at > org.eclipse.jetty.server.handler.ScopedHandler.nextScope(ScopedHandler.java:186) > at > org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1355) > at > org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:141) > at > org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:146) > at > org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:127) > at org.eclipse.jetty.server.Server.handle(Server.java:516) > at > org.eclipse.jetty.server.HttpChannel.lambda$handle$1(HttpChannel.java:487) > at org.eclipse.jetty.server.HttpChannel.dispatch(HttpChannel.java:732) > at org.eclipse.jetty.server.HttpChannel.handle(HttpChannel.java:479) > at > org.eclipse.jetty.server.HttpConnection.onFillable(HttpConnection.java:277) > at > org.eclipse.jetty.io.AbstractConnection$ReadCallback.succeeded(AbstractConnection.java:311) > at org.eclipse.jetty.io.FillInterest.fillable(FillInterest.java:105) > at org.eclipse.jetty.io.ChannelEndPoint$1.run(ChannelEndPoint.java:104) > at > org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.runTask(EatWhatYouKill.java:338) > at > org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.doProduce(EatWhatYouKill.java:315) > at > org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.tryProduce(EatWhatYouKill.java:173) > at > org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.produce(EatWhatYouKill.java:137) > at > org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:883) > at > org.eclipse.jetty.util.thread.QueuedThreadPool$Runner.run(QueuedThreadPool.java:1034) > at java.lang.Thread.run(Thread.java:748) {code} > Afterwards, Recon encountered "Read timed out" exception every time it tries > to download the OM DB Snapshot > Recon side > {code:java} > 2024-01-17 11:04:04,419 [Recon-SyncOM-0] ERROR > org.apache.hadoop.ozone.recon.spi.impl.OzoneManagerServiceProviderImpl: > Unable to obtain Ozone Manager DB Snapshot. > java.net.SocketTimeoutException: Read timed out > at java.net.SocketInputStream.socketRead0(Native Method) > at java.net.SocketInputStream.socketRead(SocketInputStream.java:116) > at java.net.SocketInputStream.read(SocketInputStream.java:171) > at java.net.SocketInputStream.read(SocketInputStream.java:141) > at java.io.BufferedInputStream.fill(BufferedInputStream.java:246) > at java.io.BufferedInputStream.read1(BufferedInputStream.java:286) > at java.io.BufferedInputStream.read(BufferedInputStream.java:345) > at sun.net.www.http.HttpClient.parseHTTPHeader(HttpClient.java:735) > at sun.net.www.http.HttpClient.parseHTTP(HttpClient.java:678) > at > sun.net.www.protocol.http.HttpURLConnection.getInputStream0(HttpURLConnection.java:1593) > at > sun.net.www.protocol.http.HttpURLConnection.getInputStream(HttpURLConnection.java:1498) > at > org.apache.hadoop.ozone.recon.spi.impl.OzoneManagerServiceProviderImpl.lambda$getOzoneManagerDBSnapshot$1(OzoneManagerServiceProviderImpl.java:358) > at java.security.AccessController.doPrivileged(Native Method) > at javax.security.auth.Subject.doAs(Subject.java:422) > at > org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1899) > at > org.apache.hadoop.security.SecurityUtil.doAsUser(SecurityUtil.java:551) > at > org.apache.hadoop.security.SecurityUtil.doAsLoginUser(SecurityUtil.java:531) > at > org.apache.hadoop.ozone.recon.spi.impl.OzoneManagerServiceProviderImpl.getOzoneManagerDBSnapshot(OzoneManagerServiceProviderImpl.java:355) > at > org.apache.hadoop.ozone.recon.spi.impl.OzoneManagerServiceProviderImpl.updateReconOmDBWithNewSnapshot(OzoneManagerServiceProviderImpl.java:387) > at > org.apache.hadoop.ozone.recon.spi.impl.OzoneManagerServiceProviderImpl.syncDataFromOM(OzoneManagerServiceProviderImpl.java:539) > at > org.apache.hadoop.ozone.recon.spi.impl.OzoneManagerServiceProviderImpl.lambda$startSyncDataFromOM$0(OzoneManagerServiceProviderImpl.java:267) > at > java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) > at java.util.concurrent.FutureTask.runAndReset(FutureTask.java:308) > at > java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$301(ScheduledThreadPoolExecutor.java:180) > at > java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:294) > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) > at java.lang.Thread.run(Thread.java:748) {code} > On OM side > {code:java} > 2024-01-17 10:57:29,333 [qtp3981395-72] INFO > org.apache.hadoop.hdds.utils.DBCheckpointServlet: Received GET request to > obtain DB checkpoint snapshot > 2024-01-17 10:58:36,098 [qtp3981395-73] INFO > org.apache.hadoop.hdds.utils.DBCheckpointServlet: Received GET request to > obtain DB checkpoint snapshot > 2024-01-17 10:59:39,357 [qtp3981395-74] INFO > org.apache.hadoop.hdds.utils.DBCheckpointServlet: Received GET request to > obtain DB checkpoint snapshot > 2024-01-17 11:00:44,365 [qtp3981395-75] INFO > org.apache.hadoop.hdds.utils.DBCheckpointServlet: Received GET request to > obtain DB checkpoint snapshot > 2024-01-17 11:01:49,393 [qtp3981395-77] INFO > org.apache.hadoop.hdds.utils.DBCheckpointServlet: Received GET request to > obtain DB checkpoint snapshot > 2024-01-17 11:02:54,405 [qtp3981395-78] INFO > org.apache.hadoop.hdds.utils.DBCheckpointServlet: Received GET request to > obtain DB checkpoint snapshot > 2024-01-17 11:03:59,416 [qtp3981395-79] INFO > org.apache.hadoop.hdds.utils.DBCheckpointServlet: Received GET request to > obtain DB checkpoint snapshot > 2024-01-17 11:05:04,428 [qtp3981395-80] INFO > org.apache.hadoop.hdds.utils.DBCheckpointServlet: Received GET request to > obtain DB checkpoint snapshot > 2024-01-17 11:06:09,439 [qtp3981395-68] INFO > org.apache.hadoop.hdds.utils.DBCheckpointServlet: Received GET request to > obtain DB checkpoint snapshot > 2024-01-17 11:07:14,450 [qtp3981395-245] INFO > org.apache.hadoop.hdds.utils.DBCheckpointServlet: Received GET request to > obtain DB checkpoint snapshot > 2024-01-17 11:08:19,462 [qtp3981395-246] INFO > org.apache.hadoop.hdds.utils.DBCheckpointServlet: Received GET request to > obtain DB checkpoint snapshot > 2024-01-17 11:09:24,468 [qtp3981395-225] INFO > org.apache.hadoop.hdds.utils.DBCheckpointServlet: Received GET request to > obtain DB checkpoint snapshot > 2024-01-17 11:10:29,482 [qtp3981395-227] INFO > org.apache.hadoop.hdds.utils.DBCheckpointServlet: Received GET request to > obtain DB checkpoint snapshot > 2024-01-17 11:11:34,495 [qtp3981395-232] INFO > org.apache.hadoop.hdds.utils.DBCheckpointServlet: Received GET request to > obtain DB checkpoint snapshot > 2024-01-17 11:12:39,508 [qtp3981395-233] INFO > org.apache.hadoop.hdds.utils.DBCheckpointServlet: Received GET request to > obtain DB checkpoint snapshot{code} > I will put this as blocker for now. Please let me know if I miss anything. > cc: [~Symious] [~Sammi] -- This message was sent by Atlassian Jira (v8.20.10#820010) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@ozone.apache.org For additional commands, e-mail: issues-h...@ozone.apache.org