[ 
https://issues.apache.org/jira/browse/SOLR-10259?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Oliver Bates updated SOLR-10259:
--------------------------------
    Description: 
When I have a healthy core on a node and I call solr/admin/cores?action=STATUS, 
I get the following healthy response:

<response>
  <lst name="responseHeader">
    <int name="status">0</int>
    <int name="QTime">1607</int>
  </lst>
  <lst name="initFailures"/>
  <lst name="status">
    <lst name="whoisbanana_shard1_replica1">
      <str name="name">whoisbanana_shard1_replica1</str>
      <str name="instanceDir">
        /tmp/search_integration_test/solr1/whoisbanana_shard1_replica1/
      </str>
      <str name="dataDir">
        /tmp/search_integration_test/solr1/whoisbanana_shard1_replica1/data/
      </str>
      <str name="config">solrconfig.xml</str>
      <str name="schema">schema.xml</str>
      <date name="startTime">2017-03-08T15:59:50.18Z</date>
      <long name="uptime">380431</long>
      <str name="lastPublished">active</str>
      <int name="configVersion">0</int>
      <lst name="index">
      <int name="numDocs">0</int>
      <int name="maxDoc">0</int>
      <int name="deletedDocs">0</int>
      <long name="indexHeapUsageBytes">0</long>
      <long name="version">2</long>
      <int name="segmentCount">0</int>
      <bool name="current">true</bool>
      <bool name="hasDeletions">false</bool>
      <str name="directory">
org.apache.lucene.store.NRTCachingDirectory:NRTCachingDirectory(MMapDirectory@/tmp/search_integration_test/solr1/whoisbanana_shard1_replica1/data/index
 lockFactory=org.apache.lucene.store.NativeFSLockFactory@762404a0; 
maxCacheMB=48.0 maxMergeSizeMB=4.0)
      </str>
      <lst name="userData"/>
      <long name="sizeInBytes">71</long>
      <str name="size">71 bytes</str>
    </lst>
  </lst>
</lst>
</response>

If I then corrupt the index file and reload, e.g. like this:
echo "cheese" >> 
/tmp/search_integration_test/solr1/whoisbanana_shard1_replica1/data/index/segments_1

And then I call the same endpoint (solr/admin/cores?action=STATUS), I get a 500 
back:

<response>
  <lst name="responseHeader">
    <int name="status">500</int>
    <int name="QTime">1508</int>
  </lst>
  <lst name="error">
    <str name="msg">Error handling 'status' action</str>
    <str name="trace">
org.apache.solr.common.SolrException: Error handling 'status' action at 
org.apache.solr.handler.admin.CoreAdminHandler.handleStatusAction(CoreAdminHandler.java:755)
 at 
org.apache.solr.handler.admin.CoreAdminHandler.handleRequestInternal(CoreAdminHandler.java:231)
 at 
org.apache.solr.handler.admin.CoreAdminHandler.handleRequestBody(CoreAdminHandler.java:196)
 at 
org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:146)
 at 
org.apache.solr.servlet.HttpSolrCall.handleAdminRequest(HttpSolrCall.java:676) 
at org.apache.solr.servlet.HttpSolrCall.call(HttpSolrCall.java:443) at 
org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:210)
 at 
org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:179)
 at 
org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1652)
 at com.apple.cie.search.auth.TrustFilter.doFilter(TrustFilter.java:44) at 
org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1652)
 at com.apple.cie.search.id.IdFilter.doFilter(IdFilter.java:38) at 
org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1652)
 at org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:585) 
at 
org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:143) 
at org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:577) 
at 
org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:223)
 at 
org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1127)
 at org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:515) 
at 
org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:185)
 at 
org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1061)
 at 
org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:141) 
at 
org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:215)
 at 
org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:110)
 at 
org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:97) 
at org.eclipse.jetty.server.Server.handle(Server.java:499) at 
org.eclipse.jetty.server.HttpChannel.handle(HttpChannel.java:310) at 
org.eclipse.jetty.server.HttpConnection.onFillable(HttpConnection.java:257) at 
org.eclipse.jetty.io.AbstractConnection$2.run(AbstractConnection.java:540) at 
org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:635)
 at 
org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:555) 
at java.lang.Thread.run(Thread.java:745) Caused by: 
org.apache.lucene.index.CorruptIndexException: misplaced codec footer (file 
extended?): remaining=23, expected=16 
(resource=BufferedChecksumIndexInput(MMapIndexInput(path="/tmp/search_integration_test/solr1/whoisbanana_shard1_replica1/data/index/segments_1")))
 at org.apache.lucene.codecs.CodecUtil.validateFooter(CodecUtil.java:411) at 
org.apache.lucene.codecs.CodecUtil.checkFooter(CodecUtil.java:331) at 
org.apache.lucene.index.SegmentInfos.readCommit(SegmentInfos.java:442) at 
org.apache.lucene.index.SegmentInfos$1.doBody(SegmentInfos.java:493) at 
org.apache.lucene.index.SegmentInfos$1.doBody(SegmentInfos.java:490) at 
org.apache.lucene.index.SegmentInfos$FindSegmentsFile.run(SegmentInfos.java:731)
 at 
org.apache.lucene.index.SegmentInfos$FindSegmentsFile.run(SegmentInfos.java:683)
 at 
org.apache.lucene.index.SegmentInfos.readLatestCommit(SegmentInfos.java:490) at 
org.apache.lucene.index.StandardDirectoryReader.isCurrent(StandardDirectoryReader.java:344)
 at 
org.apache.lucene.index.FilterDirectoryReader.isCurrent(FilterDirectoryReader.java:124)
 at 
org.apache.lucene.index.FilterDirectoryReader.isCurrent(FilterDirectoryReader.java:124)
 at 
org.apache.solr.handler.admin.LukeRequestHandler.getIndexInfo(LukeRequestHandler.java:585)
 at 
org.apache.solr.handler.admin.CoreAdminHandler.getCoreStatus(CoreAdminHandler.java:1202)
 at 
org.apache.solr.handler.admin.CoreAdminHandler.handleStatusAction(CoreAdminHandler.java:743)
 ... 31 more
    </str>
    <int name="code">500</int>
  </lst>
</response>

It seems to me like what we really want is to still return a 200, but to list 
the init failures under the 'initFailures' key of the response (as seen in 
'healthy response' above). This way, if a node is hosting 10 cores and 1 is 
corrupted, I can still query the STATUS endpoint to get information about 
the non-corrupted cores, AND I can more easily determine what the problem with 
my corrupted core is because I can see the stack trace. This allows automated 
tooling, for instance, to go in there and delete and re-add a replica until the 
day arrives that REQUESTRECOVERY and/or leader-initiated-recovery both work 
when the index is corrupted (see 
https://issues.apache.org/jira/browse/SOLR-9836).

I am not sure which solution the world would like best, so I am proposing two 
patches:

-- Patch1 --

diff --git 
a/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java 
b/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java
index 69d3a88..c84d4c1 100644
--- a/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java
+++ b/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java
@@ -740,7 +740,13 @@ public class CoreAdminHandler extends RequestHandlerBase {
     try {
       if (cname == null) {
         for (String name : coreContainer.getAllCoreNames()) {
-          status.add(name, getCoreStatus(coreContainer, name, 
isIndexInfoNeeded));
+          if (!failures.containsKey(name)) {
+            status.add(name, getCoreStatus(coreContainer, name, 
isIndexInfoNeeded));
+          } else {
+            NamedList<Object> failure = new NamedList<>();
+            failure.add("initFailure", failures.get(name));
+            status.add(name, failure);
+          }
         }
         rsp.add("initFailures", failures);
       } else {


-- Patch 2 --

diff --git 
a/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java 
b/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java
index 69d3a88..6e9b94c 100644
--- a/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java
+++ b/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java
@@ -19,6 +19,8 @@ package org.apache.solr.handler.admin;

 import java.io.File;
 import java.io.IOException;
+import java.io.PrintWriter;
+import java.io.StringWriter;
 import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.Arrays;
@@ -740,7 +742,17 @@ public class CoreAdminHandler extends RequestHandlerBase {
     try {
       if (cname == null) {
         for (String name : coreContainer.getAllCoreNames()) {
-          status.add(name, getCoreStatus(coreContainer, name, 
isIndexInfoNeeded));
+          try {
+            status.add(name, getCoreStatus(coreContainer, name, 
isIndexInfoNeeded));
+          } catch (SolrException ex) {
+            NamedList<Object> error = new NamedList<>();
+            error.add("msg", ex.getMessage());
+            StringWriter sw = new StringWriter();
+            ex.printStackTrace(new PrintWriter(sw));
+            error.add("trace", sw.toString());
+            status.add(name, error);
+          }
         }
         rsp.add("initFailures", failures);
       } else {


  was:
When I have a healthy core on a node and I call solr/admin/cores?action=STATUS, 
I get the following:

######### Healthy response ############
<response>
  <lst name="responseHeader">
    <int name="status">0</int>
    <int name="QTime">1607</int>
  </lst>
  <lst name="initFailures"/>
  <lst name="status">
    <lst name="whoisbanana_shard1_replica1">
      <str name="name">whoisbanana_shard1_replica1</str>
      <str name="instanceDir">
        /tmp/search_integration_test/solr1/whoisbanana_shard1_replica1/
      </str>
      <str name="dataDir">
        /tmp/search_integration_test/solr1/whoisbanana_shard1_replica1/data/
      </str>
      <str name="config">solrconfig.xml</str>
      <str name="schema">schema.xml</str>
      <date name="startTime">2017-03-08T15:59:50.18Z</date>
      <long name="uptime">380431</long>
      <str name="lastPublished">active</str>
      <int name="configVersion">0</int>
      <lst name="index">
      <int name="numDocs">0</int>
      <int name="maxDoc">0</int>
      <int name="deletedDocs">0</int>
      <long name="indexHeapUsageBytes">0</long>
      <long name="version">2</long>
      <int name="segmentCount">0</int>
      <bool name="current">true</bool>
      <bool name="hasDeletions">false</bool>
      <str name="directory">
        
org.apache.lucene.store.NRTCachingDirectory:NRTCachingDirectory(MMapDirectory@/tmp/search_integration_test/solr1/whoisbanana_shard1_replica1/data/index
 lockFactory=org.apache.lucene.store.NativeFSLockFactory@762404a0; 
maxCacheMB=48.0 maxMergeSizeMB=4.0)
      </str>
      <lst name="userData"/>
      <long name="sizeInBytes">71</long>
      <str name="size">71 bytes</str>
    </lst>
  </lst>
</lst>
</response>
###

If I then corrupt the index file and reload, e.g. like this:
echo "cheese" >> 
/tmp/search_integration_test/solr1/whoisbanana_shard1_replica1/data/index/segments_1

And then I call the same endpoint (solr/admin/cores?action=STATUS), I get a 500 
back:

############ Corrupted response ############
<response>
  <lst name="responseHeader">
    <int name="status">500</int>
    <int name="QTime">1508</int>
  </lst>
  <lst name="error">
    <str name="msg">Error handling 'status' action</str>
    <str name="trace">
org.apache.solr.common.SolrException: Error handling 'status' action at 
org.apache.solr.handler.admin.CoreAdminHandler.handleStatusAction(CoreAdminHandler.java:755)
 at 
org.apache.solr.handler.admin.CoreAdminHandler.handleRequestInternal(CoreAdminHandler.java:231)
 at 
org.apache.solr.handler.admin.CoreAdminHandler.handleRequestBody(CoreAdminHandler.java:196)
 at 
org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:146)
 at 
org.apache.solr.servlet.HttpSolrCall.handleAdminRequest(HttpSolrCall.java:676) 
at org.apache.solr.servlet.HttpSolrCall.call(HttpSolrCall.java:443) at 
org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:210)
 at 
org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:179)
 at 
org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1652)
 at com.apple.cie.search.auth.TrustFilter.doFilter(TrustFilter.java:44) at 
org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1652)
 at com.apple.cie.search.id.IdFilter.doFilter(IdFilter.java:38) at 
org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1652)
 at org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:585) 
at 
org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:143) 
at org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:577) 
at 
org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:223)
 at 
org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1127)
 at org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:515) 
at 
org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:185)
 at 
org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1061)
 at 
org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:141) 
at 
org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:215)
 at 
org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:110)
 at 
org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:97) 
at org.eclipse.jetty.server.Server.handle(Server.java:499) at 
org.eclipse.jetty.server.HttpChannel.handle(HttpChannel.java:310) at 
org.eclipse.jetty.server.HttpConnection.onFillable(HttpConnection.java:257) at 
org.eclipse.jetty.io.AbstractConnection$2.run(AbstractConnection.java:540) at 
org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:635)
 at 
org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:555) 
at java.lang.Thread.run(Thread.java:745) Caused by: 
org.apache.lucene.index.CorruptIndexException: misplaced codec footer (file 
extended?): remaining=23, expected=16 
(resource=BufferedChecksumIndexInput(MMapIndexInput(path="/tmp/search_integration_test/solr1/whoisbanana_shard1_replica1/data/index/segments_1")))
 at org.apache.lucene.codecs.CodecUtil.validateFooter(CodecUtil.java:411) at 
org.apache.lucene.codecs.CodecUtil.checkFooter(CodecUtil.java:331) at 
org.apache.lucene.index.SegmentInfos.readCommit(SegmentInfos.java:442) at 
org.apache.lucene.index.SegmentInfos$1.doBody(SegmentInfos.java:493) at 
org.apache.lucene.index.SegmentInfos$1.doBody(SegmentInfos.java:490) at 
org.apache.lucene.index.SegmentInfos$FindSegmentsFile.run(SegmentInfos.java:731)
 at 
org.apache.lucene.index.SegmentInfos$FindSegmentsFile.run(SegmentInfos.java:683)
 at 
org.apache.lucene.index.SegmentInfos.readLatestCommit(SegmentInfos.java:490) at 
org.apache.lucene.index.StandardDirectoryReader.isCurrent(StandardDirectoryReader.java:344)
 at 
org.apache.lucene.index.FilterDirectoryReader.isCurrent(FilterDirectoryReader.java:124)
 at 
org.apache.lucene.index.FilterDirectoryReader.isCurrent(FilterDirectoryReader.java:124)
 at 
org.apache.solr.handler.admin.LukeRequestHandler.getIndexInfo(LukeRequestHandler.java:585)
 at 
org.apache.solr.handler.admin.CoreAdminHandler.getCoreStatus(CoreAdminHandler.java:1202)
 at 
org.apache.solr.handler.admin.CoreAdminHandler.handleStatusAction(CoreAdminHandler.java:743)
 ... 31 more
    </str>
    <int name="code">500</int>
  </lst>
</response>
###

It seems to me like what we really want is to still return a 200, but to list 
the init failures under the 'initFailures' key of the  response (as seen in 
'healthy response' above). This way, if a node is hosting 10 cores and 1 is 
corrupted, I can still query the STATUS endpoint to do get information about 
the non-corrupted cores, AND I can more easily determine what the problem with 
my corrupted core is because I can see the stack trace. This allows automated 
tooling, for instance, to go in there and delete and re-add a replica until the 
day arrives that REQUESTRECOVERY and/or leader-initiated-recovery both work 
when the index is corrupted (see 
https://issues.apache.org/jira/browse/SOLR-9836).

I am not sure which solution the world would like best, so I am proposing two 
patches:

############## Patch 1 ###############

diff --git 
a/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java 
b/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java
index 69d3a88..c84d4c1 100644
--- a/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java
+++ b/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java
@@ -740,7 +740,13 @@ public class CoreAdminHandler extends RequestHandlerBase {
     try {
       if (cname == null) {
         for (String name : coreContainer.getAllCoreNames()) {
-          status.add(name, getCoreStatus(coreContainer, name, 
isIndexInfoNeeded));
+          if (!failures.containsKey(name)) {
+            status.add(name, getCoreStatus(coreContainer, name, 
isIndexInfoNeeded));
+          } else {
+            NamedList<Object> failure = new NamedList<>();
+            failure.add("initFailure", failures.get(name));
+            status.add(name, failure);
+          }
         }
         rsp.add("initFailures", failures);
       } else {


############## Patch 2 ###################

diff --git 
a/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java 
b/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java
index 69d3a88..6e9b94c 100644
--- a/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java
+++ b/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java
@@ -19,6 +19,8 @@ package org.apache.solr.handler.admin;

 import java.io.File;
 import java.io.IOException;
+import java.io.PrintWriter;
+import java.io.StringWriter;
 import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.Arrays;
@@ -740,7 +742,17 @@ public class CoreAdminHandler extends RequestHandlerBase {
     try {
       if (cname == null) {
         for (String name : coreContainer.getAllCoreNames()) {
-          status.add(name, getCoreStatus(coreContainer, name, 
isIndexInfoNeeded));
+          try {
+            status.add(name, getCoreStatus(coreContainer, name, 
isIndexInfoNeeded));
+          } catch (SolrException ex) {
+            NamedList<Object> error = new NamedList<>();
+            error.add("msg", ex.getMessage());
+            StringWriter sw = new StringWriter();
+            ex.printStackTrace(new PrintWriter(sw));
+            error.add("trace", sw.toString());
+            status.add(name, error);
+          }
         }
         rsp.add("initFailures", failures);
       } else {



> admin/cores?action=STATUS returns 500 when a single core has init failures
> --------------------------------------------------------------------------
>
>                 Key: SOLR-10259
>                 URL: https://issues.apache.org/jira/browse/SOLR-10259
>             Project: Solr
>          Issue Type: Bug
>      Security Level: Public(Default Security Level. Issues are Public) 
>    Affects Versions: 5.3
>            Reporter: Oliver Bates
>            Priority: Trivial
>
> When I have a healthy core on a node and I call 
> solr/admin/cores?action=STATUS, I get the following healthy response:
> <response>
>   <lst name="responseHeader">
>     <int name="status">0</int>
>     <int name="QTime">1607</int>
>   </lst>
>   <lst name="initFailures"/>
>   <lst name="status">
>     <lst name="whoisbanana_shard1_replica1">
>       <str name="name">whoisbanana_shard1_replica1</str>
>       <str name="instanceDir">
>         /tmp/search_integration_test/solr1/whoisbanana_shard1_replica1/
>       </str>
>       <str name="dataDir">
>         /tmp/search_integration_test/solr1/whoisbanana_shard1_replica1/data/
>       </str>
>       <str name="config">solrconfig.xml</str>
>       <str name="schema">schema.xml</str>
>       <date name="startTime">2017-03-08T15:59:50.18Z</date>
>       <long name="uptime">380431</long>
>       <str name="lastPublished">active</str>
>       <int name="configVersion">0</int>
>       <lst name="index">
>       <int name="numDocs">0</int>
>       <int name="maxDoc">0</int>
>       <int name="deletedDocs">0</int>
>       <long name="indexHeapUsageBytes">0</long>
>       <long name="version">2</long>
>       <int name="segmentCount">0</int>
>       <bool name="current">true</bool>
>       <bool name="hasDeletions">false</bool>
>       <str name="directory">
> org.apache.lucene.store.NRTCachingDirectory:NRTCachingDirectory(MMapDirectory@/tmp/search_integration_test/solr1/whoisbanana_shard1_replica1/data/index
>  lockFactory=org.apache.lucene.store.NativeFSLockFactory@762404a0; 
> maxCacheMB=48.0 maxMergeSizeMB=4.0)
>       </str>
>       <lst name="userData"/>
>       <long name="sizeInBytes">71</long>
>       <str name="size">71 bytes</str>
>     </lst>
>   </lst>
> </lst>
> </response>
> If I then corrupt the index file and reload, e.g. like this:
> echo "cheese" >> 
> /tmp/search_integration_test/solr1/whoisbanana_shard1_replica1/data/index/segments_1
> And then I call the same endpoint (solr/admin/cores?action=STATUS), I get a 
> 500 back:
> <response>
>   <lst name="responseHeader">
>     <int name="status">500</int>
>     <int name="QTime">1508</int>
>   </lst>
>   <lst name="error">
>     <str name="msg">Error handling 'status' action</str>
>     <str name="trace">
> org.apache.solr.common.SolrException: Error handling 'status' action at 
> org.apache.solr.handler.admin.CoreAdminHandler.handleStatusAction(CoreAdminHandler.java:755)
>  at 
> org.apache.solr.handler.admin.CoreAdminHandler.handleRequestInternal(CoreAdminHandler.java:231)
>  at 
> org.apache.solr.handler.admin.CoreAdminHandler.handleRequestBody(CoreAdminHandler.java:196)
>  at 
> org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:146)
>  at 
> org.apache.solr.servlet.HttpSolrCall.handleAdminRequest(HttpSolrCall.java:676)
>  at org.apache.solr.servlet.HttpSolrCall.call(HttpSolrCall.java:443) at 
> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:210)
>  at 
> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:179)
>  at 
> org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1652)
>  at com.apple.cie.search.auth.TrustFilter.doFilter(TrustFilter.java:44) at 
> org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1652)
>  at com.apple.cie.search.id.IdFilter.doFilter(IdFilter.java:38) at 
> org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1652)
>  at 
> org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:585) at 
> org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:143) 
> at 
> org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:577) 
> at 
> org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:223)
>  at 
> org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1127)
>  at org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:515) 
> at 
> org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:185)
>  at 
> org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1061)
>  at 
> org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:141) 
> at 
> org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:215)
>  at 
> org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:110)
>  at 
> org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:97)
>  at org.eclipse.jetty.server.Server.handle(Server.java:499) at 
> org.eclipse.jetty.server.HttpChannel.handle(HttpChannel.java:310) at 
> org.eclipse.jetty.server.HttpConnection.onFillable(HttpConnection.java:257) 
> at org.eclipse.jetty.io.AbstractConnection$2.run(AbstractConnection.java:540) 
> at 
> org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:635)
>  at 
> org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:555)
>  at java.lang.Thread.run(Thread.java:745) Caused by: 
> org.apache.lucene.index.CorruptIndexException: misplaced codec footer (file 
> extended?): remaining=23, expected=16 
> (resource=BufferedChecksumIndexInput(MMapIndexInput(path="/tmp/search_integration_test/solr1/whoisbanana_shard1_replica1/data/index/segments_1")))
>  at org.apache.lucene.codecs.CodecUtil.validateFooter(CodecUtil.java:411) at 
> org.apache.lucene.codecs.CodecUtil.checkFooter(CodecUtil.java:331) at 
> org.apache.lucene.index.SegmentInfos.readCommit(SegmentInfos.java:442) at 
> org.apache.lucene.index.SegmentInfos$1.doBody(SegmentInfos.java:493) at 
> org.apache.lucene.index.SegmentInfos$1.doBody(SegmentInfos.java:490) at 
> org.apache.lucene.index.SegmentInfos$FindSegmentsFile.run(SegmentInfos.java:731)
>  at 
> org.apache.lucene.index.SegmentInfos$FindSegmentsFile.run(SegmentInfos.java:683)
>  at 
> org.apache.lucene.index.SegmentInfos.readLatestCommit(SegmentInfos.java:490) 
> at 
> org.apache.lucene.index.StandardDirectoryReader.isCurrent(StandardDirectoryReader.java:344)
>  at 
> org.apache.lucene.index.FilterDirectoryReader.isCurrent(FilterDirectoryReader.java:124)
>  at 
> org.apache.lucene.index.FilterDirectoryReader.isCurrent(FilterDirectoryReader.java:124)
>  at 
> org.apache.solr.handler.admin.LukeRequestHandler.getIndexInfo(LukeRequestHandler.java:585)
>  at 
> org.apache.solr.handler.admin.CoreAdminHandler.getCoreStatus(CoreAdminHandler.java:1202)
>  at 
> org.apache.solr.handler.admin.CoreAdminHandler.handleStatusAction(CoreAdminHandler.java:743)
>  ... 31 more
>     </str>
>     <int name="code">500</int>
>   </lst>
> </response>
> It seems to me like what we really want is to still return a 200, but to list 
> the init failures under the 'initFailures' key of the response (as seen in 
> 'healthy response' above). This way, if a node is hosting 10 cores and 1 is 
> corrupted, I can still query the STATUS endpoint to get information about 
> the non-corrupted cores, AND I can more easily determine what the problem 
> with my corrupted core is because I can see the stack trace. This allows 
> automated tooling, for instance, to go in there and delete and re-add a 
> replica until the day arrives that REQUESTRECOVERY and/or 
> leader-initiated-recovery both work when the index is corrupted (see 
> https://issues.apache.org/jira/browse/SOLR-9836).
> I am not sure which solution the world would like best, so I am proposing two 
> patches:
> -- Patch1 --
> diff --git 
> a/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java 
> b/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java
> index 69d3a88..c84d4c1 100644
> --- a/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java
> +++ b/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java
> @@ -740,7 +740,13 @@ public class CoreAdminHandler extends RequestHandlerBase 
> {
>      try {
>        if (cname == null) {
>          for (String name : coreContainer.getAllCoreNames()) {
> -          status.add(name, getCoreStatus(coreContainer, name, 
> isIndexInfoNeeded));
> +          if (!failures.containsKey(name)) {
> +            status.add(name, getCoreStatus(coreContainer, name, 
> isIndexInfoNeeded));
> +          } else {
> +            NamedList<Object> failure = new NamedList<>();
> +            failure.add("initFailure", failures.get(name));
> +            status.add(name, failure);
> +          }
>          }
>          rsp.add("initFailures", failures);
>        } else {
> -- Patch 2 --
> diff --git 
> a/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java 
> b/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java
> index 69d3a88..6e9b94c 100644
> --- a/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java
> +++ b/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java
> @@ -19,6 +19,8 @@ package org.apache.solr.handler.admin;
>  import java.io.File;
>  import java.io.IOException;
> +import java.io.PrintWriter;
> +import java.io.StringWriter;
>  import java.nio.charset.StandardCharsets;
>  import java.util.ArrayList;
>  import java.util.Arrays;
> @@ -740,7 +742,17 @@ public class CoreAdminHandler extends RequestHandlerBase 
> {
>      try {
>        if (cname == null) {
>          for (String name : coreContainer.getAllCoreNames()) {
> -          status.add(name, getCoreStatus(coreContainer, name, 
> isIndexInfoNeeded));
> +          try {
> +            status.add(name, getCoreStatus(coreContainer, name, 
> isIndexInfoNeeded));
> +          } catch (SolrException ex) {
> +            NamedList<Object> error = new NamedList<>();
> +            error.add("msg", ex.getMessage());
> +            StringWriter sw = new StringWriter();
> +            ex.printStackTrace(new PrintWriter(sw));
> +            error.add("trace", sw.toString());
> +            status.add(name, error);
> +          }
>          }
>          rsp.add("initFailures", failures);
>        } else {



--
This message was sent by Atlassian JIRA
(v6.3.15#6346)

---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org
For additional commands, e-mail: dev-h...@lucene.apache.org

Reply via email to