Oliver Bates created SOLR-10259: ----------------------------------- Summary: admin/cores?action=STATUS returns 500 when a single core has init failures Key: SOLR-10259 URL: https://issues.apache.org/jira/browse/SOLR-10259 Project: Solr Issue Type: Bug Security Level: Public (Default Security Level. Issues are Public) Affects Versions: 5.3 Reporter: Oliver Bates Priority: Trivial
When I have a healthy core on a node and I call solr/admin/cores?action=STATUS, I get the following: ######### Healthy response ############ <response> <lst name="responseHeader"> <int name="status">0</int> <int name="QTime">1607</int> </lst> <lst name="initFailures"/> <lst name="status"> <lst name="whoisbanana_shard1_replica1"> <str name="name">whoisbanana_shard1_replica1</str> <str name="instanceDir"> /tmp/search_integration_test/solr1/whoisbanana_shard1_replica1/ </str> <str name="dataDir"> /tmp/search_integration_test/solr1/whoisbanana_shard1_replica1/data/ </str> <str name="config">solrconfig.xml</str> <str name="schema">schema.xml</str> <date name="startTime">2017-03-08T15:59:50.18Z</date> <long name="uptime">380431</long> <str name="lastPublished">active</str> <int name="configVersion">0</int> <lst name="index"> <int name="numDocs">0</int> <int name="maxDoc">0</int> <int name="deletedDocs">0</int> <long name="indexHeapUsageBytes">0</long> <long name="version">2</long> <int name="segmentCount">0</int> <bool name="current">true</bool> <bool name="hasDeletions">false</bool> <str name="directory"> org.apache.lucene.store.NRTCachingDirectory:NRTCachingDirectory(MMapDirectory@/tmp/search_integration_test/solr1/whoisbanana_shard1_replica1/data/index lockFactory=org.apache.lucene.store.NativeFSLockFactory@762404a0; maxCacheMB=48.0 maxMergeSizeMB=4.0) </str> <lst name="userData"/> <long name="sizeInBytes">71</long> <str name="size">71 bytes</str> </lst> </lst> </lst> </response> ### If I then corrupt the index file and reload, e.g. 
like this: echo "cheese" >> /tmp/search_integration_test/solr1/whoisbanana_shard1_replica1/data/index/segments_1 And then I call the same endpoint (solr/admin/cores?action=STATUS), I get a 500 back: ############ Corrupted response ############ <response> <lst name="responseHeader"> <int name="status">500</int> <int name="QTime">1508</int> </lst> <lst name="error"> <str name="msg">Error handling 'status' action</str> <str name="trace"> org.apache.solr.common.SolrException: Error handling 'status' action at org.apache.solr.handler.admin.CoreAdminHandler.handleStatusAction(CoreAdminHandler.java:755) at org.apache.solr.handler.admin.CoreAdminHandler.handleRequestInternal(CoreAdminHandler.java:231) at org.apache.solr.handler.admin.CoreAdminHandler.handleRequestBody(CoreAdminHandler.java:196) at org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:146) at org.apache.solr.servlet.HttpSolrCall.handleAdminRequest(HttpSolrCall.java:676) at org.apache.solr.servlet.HttpSolrCall.call(HttpSolrCall.java:443) at org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:210) at org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:179) at org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1652) at com.apple.cie.search.auth.TrustFilter.doFilter(TrustFilter.java:44) at org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1652) at com.apple.cie.search.id.IdFilter.doFilter(IdFilter.java:38) at org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1652) at org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:585) at org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:143) at org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:577) at org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:223) at 
org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1127) at org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:515) at org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:185) at org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1061) at org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:141) at org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:215) at org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:110) at org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:97) at org.eclipse.jetty.server.Server.handle(Server.java:499) at org.eclipse.jetty.server.HttpChannel.handle(HttpChannel.java:310) at org.eclipse.jetty.server.HttpConnection.onFillable(HttpConnection.java:257) at org.eclipse.jetty.io.AbstractConnection$2.run(AbstractConnection.java:540) at org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:635) at org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:555) at java.lang.Thread.run(Thread.java:745) Caused by: org.apache.lucene.index.CorruptIndexException: misplaced codec footer (file extended?): remaining=23, expected=16 (resource=BufferedChecksumIndexInput(MMapIndexInput(path="/tmp/search_integration_test/solr1/whoisbanana_shard1_replica1/data/index/segments_1"))) at org.apache.lucene.codecs.CodecUtil.validateFooter(CodecUtil.java:411) at org.apache.lucene.codecs.CodecUtil.checkFooter(CodecUtil.java:331) at org.apache.lucene.index.SegmentInfos.readCommit(SegmentInfos.java:442) at org.apache.lucene.index.SegmentInfos$1.doBody(SegmentInfos.java:493) at org.apache.lucene.index.SegmentInfos$1.doBody(SegmentInfos.java:490) at org.apache.lucene.index.SegmentInfos$FindSegmentsFile.run(SegmentInfos.java:731) at org.apache.lucene.index.SegmentInfos$FindSegmentsFile.run(SegmentInfos.java:683) at 
org.apache.lucene.index.SegmentInfos.readLatestCommit(SegmentInfos.java:490) at org.apache.lucene.index.StandardDirectoryReader.isCurrent(StandardDirectoryReader.java:344) at org.apache.lucene.index.FilterDirectoryReader.isCurrent(FilterDirectoryReader.java:124) at org.apache.lucene.index.FilterDirectoryReader.isCurrent(FilterDirectoryReader.java:124) at org.apache.solr.handler.admin.LukeRequestHandler.getIndexInfo(LukeRequestHandler.java:585) at org.apache.solr.handler.admin.CoreAdminHandler.getCoreStatus(CoreAdminHandler.java:1202) at org.apache.solr.handler.admin.CoreAdminHandler.handleStatusAction(CoreAdminHandler.java:743) ... 31 more </str> <int name="code">500</int> </lst> </response> ### It seems to me like what we really want is to still return a 200, but to list the init failures under the 'initFailures' key of the response (as seen in 'healthy response' above). This way, if a node is hosting 10 cores and 1 is corrupted, I can still query the STATUS endpoint to get information about the non-corrupted cores, AND I can more easily determine what the problem with my corrupted core is because I can see the stack trace. This allows automated tooling, for instance, to go in there and delete and re-add a replica until the day arrives that REQUESTRECOVERY and/or leader-initiated-recovery both work when the index is corrupted (see https://issues.apache.org/jira/browse/SOLR-9836). 
I am not sure which solution the world would like best, so I am proposing two patches: ############## Patch 1 ############### diff --git a/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java b/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java index 69d3a88..c84d4c1 100644 --- a/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java +++ b/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java @@ -740,7 +740,13 @@ public class CoreAdminHandler extends RequestHandlerBase { try { if (cname == null) { for (String name : coreContainer.getAllCoreNames()) { - status.add(name, getCoreStatus(coreContainer, name, isIndexInfoNeeded)); + if (!failures.containsKey(name)) { + status.add(name, getCoreStatus(coreContainer, name, isIndexInfoNeeded)); + } else { + NamedList<Object> failure = new NamedList<>(); + failure.add("initFailure", failures.get(name)); + status.add(name, failure); + } } rsp.add("initFailures", failures); } else { ############## Patch 2 ################### diff --git a/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java b/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java index 69d3a88..6e9b94c 100644 --- a/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java +++ b/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java @@ -19,6 +19,8 @@ package org.apache.solr.handler.admin; import java.io.File; import java.io.IOException; +import java.io.PrintWriter; +import java.io.StringWriter; import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Arrays; @@ -740,7 +742,17 @@ public class CoreAdminHandler extends RequestHandlerBase { try { if (cname == null) { for (String name : coreContainer.getAllCoreNames()) { - status.add(name, getCoreStatus(coreContainer, name, isIndexInfoNeeded)); + try { + status.add(name, getCoreStatus(coreContainer, name, isIndexInfoNeeded)); + } catch (SolrException ex) { + 
NamedList<Object> error = new NamedList<>(); + error.add("msg", ex.getMessage()); + StringWriter sw = new StringWriter(); + ex.printStackTrace(new PrintWriter(sw)); + error.add("trace", sw.toString()); + status.add(name, error); + } } rsp.add("initFailures", failures); } else { -- This message was sent by Atlassian JIRA (v6.3.15#6346) --------------------------------------------------------------------- To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org