This is an automated email from the ASF dual-hosted git repository.

broustant pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/solr.git


The following commit(s) were added to refs/heads/main by this push:
     new a931e8a356c SOLR-17805: Exception in TransactionLog constructor deletes the file and does not block subsequent updates. (#3412)
a931e8a356c is described below

commit a931e8a356c2378e49386ac3993516b3637a3c75
Author: Bruno Roustant <[email protected]>
AuthorDate: Wed Jul 2 10:23:04 2025 +0200

    SOLR-17805: Exception in TransactionLog constructor deletes the file and does not block subsequent updates. (#3412)
---
 solr/CHANGES.txt                                   |  2 +
 .../java/org/apache/solr/core/CoreContainer.java   | 15 +-------
 .../apache/solr/handler/RequestHandlerBase.java    | 33 +++++++++++++---
 .../solr/jersey/CatchAllExceptionMapper.java       |  2 +-
 .../apache/solr/update/DirectUpdateHandler2.java   | 11 +++++-
 .../org/apache/solr/update/TransactionLog.java     |  4 ++
 .../src/java/org/apache/solr/update/UpdateLog.java | 44 ++++++++++++++++++++--
 .../solr/handler/RequestHandlerBaseTest.java       |  6 +--
 .../test/org/apache/solr/update/UpdateLogTest.java | 39 +++++++++++++++++++
 9 files changed, 129 insertions(+), 27 deletions(-)

diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index cebce8dc85e..2dd391be63a 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -335,6 +335,8 @@ Bug Fixes
 
 * SOLR-17800: Security Manager should handle symlink on /tmp (Kevin Risden)
 
+* SOLR-17805: Exception in TransactionLog constructor deletes the file and does not block subsequent updates. (Bruno Roustant)
+
 Dependency Upgrades
 ---------------------
 * SOLR-17471: Upgrade Lucene to 9.12.1. (Pierre Salagnac, Christine Poerschke)
diff --git a/solr/core/src/java/org/apache/solr/core/CoreContainer.java b/solr/core/src/java/org/apache/solr/core/CoreContainer.java
index c7b19368e56..db5c97cc921 100644
--- a/solr/core/src/java/org/apache/solr/core/CoreContainer.java
+++ b/solr/core/src/java/org/apache/solr/core/CoreContainer.java
@@ -2561,6 +2561,8 @@ public class CoreContainer {
   }
 
   /**
+   * Checks whether a tragic exception was thrown during update (including update log).
+   *
    * @param solrCore the core against which we check if there has been a tragic exception
    * @return whether this Solr core has tragic exception
    * @see org.apache.lucene.index.IndexWriter#getTragicException()
@@ -2573,19 +2575,6 @@ public class CoreContainer {
       // failed to open an indexWriter
       tragicException = e;
     }
-
-    if (tragicException != null && isZooKeeperAware()) {
-      getZkController().giveupLeadership(solrCore.getCoreDescriptor());
-
-      try {
-        // If the error was something like a full file system disconnect, this probably won't help
-        // But if it is a transient disk failure then it's worth a try
-        solrCore.getSolrCoreState().newIndexWriter(solrCore, false); // should we rollback?
-      } catch (IOException e) {
-        log.warn("Could not roll index writer after tragedy");
-      }
-    }
-
     return tragicException != null;
   }
 
diff --git a/solr/core/src/java/org/apache/solr/handler/RequestHandlerBase.java b/solr/core/src/java/org/apache/solr/handler/RequestHandlerBase.java
index 43559d170cf..0f62f31aaab 100644
--- a/solr/core/src/java/org/apache/solr/handler/RequestHandlerBase.java
+++ b/solr/core/src/java/org/apache/solr/handler/RequestHandlerBase.java
@@ -22,6 +22,7 @@ import static org.apache.solr.response.SolrQueryResponse.haveCompleteResults;
 import com.codahale.metrics.Counter;
 import com.codahale.metrics.Meter;
 import com.codahale.metrics.Timer;
+import java.io.IOException;
 import java.lang.invoke.MethodHandles;
 import java.util.Collection;
 import java.util.Collections;
@@ -34,9 +35,11 @@ import org.apache.solr.common.params.ShardParams;
 import org.apache.solr.common.params.SolrParams;
 import org.apache.solr.common.util.NamedList;
 import org.apache.solr.common.util.SuppressForbidden;
+import org.apache.solr.core.CoreContainer;
 import org.apache.solr.core.MetricsConfig;
 import org.apache.solr.core.PluginBag;
 import org.apache.solr.core.PluginInfo;
+import org.apache.solr.core.SolrCore;
 import org.apache.solr.core.SolrInfoBean;
 import org.apache.solr.metrics.SolrDelegateRegistryMetricsContext;
 import org.apache.solr.metrics.SolrMetricManager;
@@ -245,7 +248,7 @@ public abstract class RequestHandlerBase
     } catch (QueryLimitsExceededException e) {
       rsp.setPartialResults(req);
     } catch (Exception e) {
-      Exception normalized = normalizeReceivedException(req, e);
+      Exception normalized = processReceivedException(req, e);
       processErrorMetricsOnException(normalized, metrics);
       rsp.setException(normalized);
     } finally {
@@ -303,10 +306,30 @@ public abstract class RequestHandlerBase
     }
   }
 
-  public static Exception normalizeReceivedException(SolrQueryRequest req, Exception e) {
-    if (req.getCore() != null) {
-      assert req.getCoreContainer() != null;
-      if (req.getCoreContainer().checkTragicException(req.getCore())) {
+  /**
+   * Processes and normalizes any exceptions that are received from the request handler. This method
+   * is called before any error metrics are recorded.
+   *
+   * <p>If a tragic exception occurred in the index writer, this method also gives up leadership of
+   * the shard, and replaces the index writer with a new one to attempt to get out of a transient
+   * failure (e.g. disk failure).
+   */
+  public static Exception processReceivedException(SolrQueryRequest req, Exception e) {
+    SolrCore core = req.getCore();
+    if (core != null) {
+      CoreContainer coreContainer = req.getCoreContainer();
+      assert coreContainer != null;
+      if (coreContainer.checkTragicException(core)) {
+        if (coreContainer.isZooKeeperAware()) {
+          coreContainer.getZkController().giveupLeadership(core.getCoreDescriptor());
+          try {
+            // If the error was something like a full file system disconnect, this probably won't
+            // help, but if it is a transient disk failure then it's worth a try.
+            core.getSolrCoreState().newIndexWriter(core, false); // should we rollback?
+          } catch (IOException ioe) {
+            log.warn("Could not roll index writer after tragedy");
+          }
+        }
         return SolrException.wrapLuceneTragicExceptionIfNecessary(e);
       }
     }
diff --git a/solr/core/src/java/org/apache/solr/jersey/CatchAllExceptionMapper.java b/solr/core/src/java/org/apache/solr/jersey/CatchAllExceptionMapper.java
index 3760bfc4590..e80c922513e 100644
--- a/solr/core/src/java/org/apache/solr/jersey/CatchAllExceptionMapper.java
+++ b/solr/core/src/java/org/apache/solr/jersey/CatchAllExceptionMapper.java
@@ -88,7 +88,7 @@ public class CatchAllExceptionMapper implements ExceptionMapper<Exception> {
       ContainerRequestContext containerRequestContext) {
     // First, handle any exception-related metrics
     final Exception normalizedException =
-        RequestHandlerBase.normalizeReceivedException(solrQueryRequest, exception);
+        RequestHandlerBase.processReceivedException(solrQueryRequest, exception);
     final RequestHandlerBase.HandlerMetrics metrics =
         (RequestHandlerBase.HandlerMetrics) containerRequestContext.getProperty(HANDLER_METRICS);
     if (metrics != null) {
diff --git a/solr/core/src/java/org/apache/solr/update/DirectUpdateHandler2.java b/solr/core/src/java/org/apache/solr/update/DirectUpdateHandler2.java
index 4c55eaab9b5..60dbb9c4f28 100644
--- a/solr/core/src/java/org/apache/solr/update/DirectUpdateHandler2.java
+++ b/solr/core/src/java/org/apache/solr/update/DirectUpdateHandler2.java
@@ -336,11 +336,18 @@ public class DirectUpdateHandler2 extends UpdateHandler
               + errorDetails;
       throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, errorMsg, iae);
     } catch (RuntimeException t) {
+      SolrException.ErrorCode errorCode =
+          core.getCoreContainer().checkTragicException(core)
+              ? SolrException.ErrorCode.SERVER_ERROR
+              : SolrException.ErrorCode.BAD_REQUEST;
       String errorMsg =
           "Exception writing document id "
               + cmd.getPrintableId()
-              + " to the index; possible analysis error.";
-      throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, errorMsg, t);
+              + " to the index"
+              + (errorCode == SolrException.ErrorCode.SERVER_ERROR
+                  ? "."
+                  : "; possible analysis error.");
+      throw new SolrException(errorCode, errorMsg, t);
     }
   }
 
diff --git a/solr/core/src/java/org/apache/solr/update/TransactionLog.java b/solr/core/src/java/org/apache/solr/update/TransactionLog.java
index 07b1d0b94fa..2169eaa695c 100644
--- a/solr/core/src/java/org/apache/solr/update/TransactionLog.java
+++ b/solr/core/src/java/org/apache/solr/update/TransactionLog.java
@@ -37,6 +37,7 @@ import java.util.Map;
 import java.util.TreeMap;
 import java.util.concurrent.atomic.AtomicInteger;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IOUtils;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.SolrException.ErrorCode;
 import org.apache.solr.common.SolrInputDocument;
@@ -247,6 +248,9 @@ public class TransactionLog implements Closeable {
         } catch (Exception e) {
           log.error("Error closing tlog file (after error opening)", e);
         }
+        if (!openExisting) {
+          IOUtils.deleteFilesIgnoringExceptions(tlog);
+        }
       }
     }
     assert ObjectReleaseTracker.track(this);
diff --git a/solr/core/src/java/org/apache/solr/update/UpdateLog.java b/solr/core/src/java/org/apache/solr/update/UpdateLog.java
index c26eb7f77d7..cd7e1bbae3f 100644
--- a/solr/core/src/java/org/apache/solr/update/UpdateLog.java
+++ b/solr/core/src/java/org/apache/solr/update/UpdateLog.java
@@ -746,7 +746,10 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer {
   }
 
   public long getLastLogId() {
-    if (id != -1) return id;
+    return id == -1 ? scanLastLogId(tlogFiles) : id;
+  }
+
+  static long scanLastLogId(String[] tlogFiles) {
     if (tlogFiles.length == 0) return -1;
     String last = tlogFiles[tlogFiles.length - 1];
     return Long.parseLong(last.substring(TLOG_NAME.length() + 1));
@@ -1592,10 +1595,45 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer {
     }
   }
 
+  /**
+   * Ensures a transaction log is ready. It is either the current one, or a new one. This method
+   * must be called with the synchronization monitor on this {@link UpdateLog}.
+   */
   protected void ensureLog() {
     if (tlog == null) {
-      String newLogName = String.format(Locale.ROOT, LOG_FILENAME_PATTERN, TLOG_NAME, id);
-      tlog = newTransactionLog(tlogDir.resolve(newLogName), globalStrings, false);
+      Path newLogPath;
+      int numAttempts = 0;
+      while (true) {
+        String newLogName = String.format(Locale.ROOT, LOG_FILENAME_PATTERN, TLOG_NAME, id);
+        newLogPath = tlogDir.resolve(newLogName);
+        // We expect that the log file does not exist since id is designed to give the index of the
+        // next transaction log to create. But in very rare cases, the log files listed in the
+        // init() method may be stale here (file system delay?), and id may point to an existing
+        // file.
+        if (!Files.exists(newLogPath)) {
+          break;
+        }
+        // If the "new" log file already exists, refresh the log list and recompute id.
+        // Ideally we would want to include the missing log in the old logs tracking (see init()),
+        // but the init() method is not designed to be executed twice. Given that in the rare cases
+        // we have seen this missing log, the log was always empty (file size 0), then we simply
+        // refresh log list and update the id.
+        try {
+          log.error(
+              "New transaction log already exists {} size={}, skipping it",
+              newLogPath,
+              Files.size(newLogPath));
+        } catch (IOException e) {
+          log.error("New transaction log already exists {} size unknown, skipping it", newLogPath);
+        }
+        if (++numAttempts >= 2) {
+          throw new SolrException(
+              ErrorCode.SERVER_ERROR, "Cannot recover from already existing logs");
+        }
+        tlogFiles = getLogList(tlogDir);
+        id = scanLastLogId(tlogFiles) + 1; // add 1 since we create a new log
+      }
+      tlog = newTransactionLog(newLogPath, globalStrings, false);
     }
   }
 
diff --git a/solr/core/src/test/org/apache/solr/handler/RequestHandlerBaseTest.java b/solr/core/src/test/org/apache/solr/handler/RequestHandlerBaseTest.java
index c3f2f9bd865..ba8fff85468 100644
--- a/solr/core/src/test/org/apache/solr/handler/RequestHandlerBaseTest.java
+++ b/solr/core/src/test/org/apache/solr/handler/RequestHandlerBaseTest.java
@@ -105,7 +105,7 @@ public class RequestHandlerBaseTest extends SolrTestCaseJ4 {
         };
     final Exception e = new SyntaxError("Some syntax error");
 
-    final Exception normalized = RequestHandlerBase.normalizeReceivedException(solrQueryRequest, e);
+    final Exception normalized = RequestHandlerBase.processReceivedException(solrQueryRequest, e);
 
     assertEquals(SolrException.class, normalized.getClass());
     final SolrException normalizedSolrException = (SolrException) normalized;
@@ -123,7 +123,7 @@ public class RequestHandlerBaseTest extends SolrTestCaseJ4 {
         };
     final Exception e = new RuntimeException("Some generic, non-SolrException");
 
-    final Exception normalized = RequestHandlerBase.normalizeReceivedException(solrQueryRequest, e);
+    final Exception normalized = RequestHandlerBase.processReceivedException(solrQueryRequest, e);
 
     assertEquals(normalized, e);
   }
@@ -140,7 +140,7 @@ public class RequestHandlerBaseTest extends SolrTestCaseJ4 {
         };
     final Exception e = new RuntimeException("Some generic, non-SolrException");
 
-    final Exception normalized = RequestHandlerBase.normalizeReceivedException(solrQueryRequest, e);
+    final Exception normalized = RequestHandlerBase.processReceivedException(solrQueryRequest, e);
 
     assertEquals(SolrException.class, normalized.getClass());
     final SolrException normalizedSolrException = (SolrException) normalized;
diff --git a/solr/core/src/test/org/apache/solr/update/UpdateLogTest.java b/solr/core/src/test/org/apache/solr/update/UpdateLogTest.java
index 6a35c3df32c..c951ad9141c 100644
--- a/solr/core/src/test/org/apache/solr/update/UpdateLogTest.java
+++ b/solr/core/src/test/org/apache/solr/update/UpdateLogTest.java
@@ -19,7 +19,12 @@ package org.apache.solr.update;
 import static org.apache.solr.common.params.CommonParams.VERSION_FIELD;
 import static org.hamcrest.core.StringContains.containsString;
 
+import java.io.IOException;
+import java.nio.channels.FileChannel;
+import java.nio.file.Path;
+import java.nio.file.StandardOpenOption;
 import java.util.List;
+import java.util.Locale;
 import org.apache.lucene.document.NumericDocValuesField;
 import org.apache.lucene.util.BytesRef;
 import org.apache.solr.SolrTestCaseJ4;
@@ -218,6 +223,23 @@ public class UpdateLogTest extends SolrTestCaseJ4 {
     }
   }
 
+  @Test
+  public void testLogCleanupWhenNewTransactionLogConstructionFails() throws Exception {
+    // Simulate the appearance of a new log file after UpdateLog.init().
+    Path tlogDir = Path.of(ulog.getTlogDir());
+    long lastLogId = scanLastLogId(tlogDir);
+    String newLogName =
+        String.format(
+            Locale.ROOT, UpdateLog.LOG_FILENAME_PATTERN, UpdateLog.TLOG_NAME, lastLogId + 1);
+    Path newLogPath = tlogDir.resolve(newLogName);
+    createLogFile(newLogPath);
+
+    // Add a doc and expect no "New transaction log already exists" error when creating the new
+    // transaction log.
+    ulogAdd(
+        ulog, null, sdoc("id", "1", "title_s", "title1", "val1_i_dvo", "1", "_version_", "100"));
+  }
+
   /** Simulate a commit on a given updateLog */
   private static void ulogCommit(UpdateLog ulog) {
     try (SolrQueryRequest req = req()) {
@@ -289,4 +311,21 @@ public class UpdateLogTest extends SolrTestCaseJ4 {
     cmd.setVersion(Long.parseLong(cmd.solrDoc.getFieldValue(VERSION_FIELD).toString()));
     return cmd;
   }
+
+  private long scanLastLogId(Path tlogDir) throws IOException {
+    try (UpdateLog uLog = new UpdateLog()) {
+      String[] tlogFiles = uLog.getLogList(tlogDir);
+      return UpdateLog.scanLastLogId(tlogFiles);
+    }
+  }
+
+  private void createLogFile(Path logPath) throws IOException {
+    FileChannel fc =
+        FileChannel.open(
+            logPath,
+            StandardOpenOption.READ,
+            StandardOpenOption.WRITE,
+            StandardOpenOption.CREATE_NEW);
+    fc.close();
+  }
 }
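
In essence, the TransactionLog change follows this cleanup rule: when construction of a brand-new log fails, the partially created file is removed so that later attempts do not collide with a zero-length leftover, while a pre-existing log is never deleted. The following is only a minimal, self-contained Java sketch of that pattern; the LogFileOpener class and its header-writing step are hypothetical, not Solr's actual TransactionLog code.

import java.io.IOException;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;

// Illustrative sketch only: delete a failed *new* log file, never an existing one.
final class LogFileOpener {

  static FileChannel open(Path path, boolean openExisting) throws IOException {
    FileChannel channel = null;
    try {
      channel =
          openExisting
              ? FileChannel.open(path, StandardOpenOption.WRITE)
              : FileChannel.open(path, StandardOpenOption.WRITE, StandardOpenOption.CREATE_NEW);
      // ... write and validate a header here; any failure falls through to the catch block ...
      return channel;
    } catch (IOException | RuntimeException e) {
      if (channel != null) {
        try {
          channel.close();
        } catch (IOException closeFailure) {
          // closing is best-effort; keep the original exception
        }
      }
      if (!openExisting) {
        // The file was created by this call, so remove the partial file; leaving a
        // zero-length log behind would trip up the next attempt to create it.
        Files.deleteIfExists(path);
      }
      throw e;
    }
  }
}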
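Similarly, the retry that ensureLog() now performs can be summarized as: if the computed "next" log name unexpectedly already exists on disk, rescan the directory once, recompute the id from the highest log actually present, and treat only a second collision as fatal. The sketch below is a simplified standalone illustration under assumed names (NextLogPicker, the "tlog" prefix, and the file-name pattern are hypothetical stand-ins, not Solr's real constants or API).

import java.io.IOException;
import java.nio.file.DirectoryStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Locale;

// Illustrative sketch only: choose the next tlog path, rescanning once on collision.
final class NextLogPicker {

  private static final String PREFIX = "tlog";

  static Path nextLogPath(Path logDir, long id) throws IOException {
    int attempts = 0;
    while (true) {
      Path candidate = logDir.resolve(String.format(Locale.ROOT, "%s.%019d", PREFIX, id));
      if (!Files.exists(candidate)) {
        return candidate; // normal case: the id points at a file that does not exist yet
      }
      if (++attempts >= 2) {
        throw new IllegalStateException("Cannot recover from already existing logs");
      }
      // Rare case (stale directory listing): rescan and recompute the id from the
      // highest log actually present, then try exactly one more time.
      id = lastLogId(logDir) + 1;
    }
  }

  private static long lastLogId(Path logDir) throws IOException {
    long last = -1;
    try (DirectoryStream<Path> stream = Files.newDirectoryStream(logDir, PREFIX + ".*")) {
      for (Path p : stream) {
        String name = p.getFileName().toString();
        last = Math.max(last, Long.parseLong(name.substring(PREFIX.length() + 1)));
      }
    }
    return last;
  }
}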
