This is an automated email from the ASF dual-hosted git repository. dsmiley pushed a commit to branch feature/SOLR-17458-rebased in repository https://gitbox.apache.org/repos/asf/solr.git
commit e6c70a9100a80c18ba4e2e60fc089cf1997ee7a3 Author: Kevin Liang <[email protected]> AuthorDate: Mon Oct 13 21:32:16 2025 -0400 SOLR-17799: Revamp index merge & flush metrics. Use OTEL. Replace existing segment merge metrics with consistent counters in OTEL format * Enabled segment merge metrics by default (configurable threshold for major merges behavior is unchanged) * Removed the index merge running gauge metrics --- .../org/apache/solr/update/SolrIndexWriter.java | 308 +++++++++------------ .../apache/solr/update/SolrIndexMetricsTest.java | 197 +++++-------- solr/server/contexts/solr-jetty-context.xml | 2 +- .../deployment-guide/pages/metrics-reporting.adoc | 50 +--- 4 files changed, 216 insertions(+), 341 deletions(-) diff --git a/solr/core/src/java/org/apache/solr/update/SolrIndexWriter.java b/solr/core/src/java/org/apache/solr/update/SolrIndexWriter.java index 75bd8ddc311..5b3f2b5ccd5 100644 --- a/solr/core/src/java/org/apache/solr/update/SolrIndexWriter.java +++ b/solr/core/src/java/org/apache/solr/update/SolrIndexWriter.java @@ -17,17 +17,16 @@ package org.apache.solr.update; import static org.apache.solr.metrics.SolrMetricProducer.CATEGORY_ATTR; -import static org.apache.solr.metrics.SolrMetricProducer.TYPE_ATTR; import io.opentelemetry.api.common.AttributeKey; -import io.opentelemetry.api.metrics.ObservableLongGauge; +import io.opentelemetry.api.common.Attributes; +import io.opentelemetry.api.metrics.LongCounter; import java.io.IOException; import java.lang.invoke.MethodHandles; import java.util.Collections; import java.util.HashMap; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; import org.apache.lucene.codecs.Codec; import org.apache.lucene.index.IndexDeletionPolicy; @@ -45,7 +44,6 @@ import org.apache.solr.core.SolrCore; import org.apache.solr.core.SolrInfoBean; import org.apache.solr.metrics.SolrMetricsContext; import org.apache.solr.metrics.otel.OtelUnit; -import org.apache.solr.metrics.otel.instruments.AttributedLongCounter; import org.apache.solr.metrics.otel.instruments.AttributedLongTimer; import org.apache.solr.schema.IndexSchema; import org.slf4j.Logger; @@ -69,7 +67,14 @@ public class SolrIndexWriter extends IndexWriter { public static final String COMMIT_COMMAND_VERSION = "commitCommandVer"; + // TODO: we should eventually explore moving to a histogram distribution style of classifying + // merges instead of just setting an (arbitrary) document count threshold for major/minor (see + // discussion on SOLR-17799). This has its own considerations as well, given that the most + // commonly used tiered merge policy results in merges to get exponentially larger. public static final AttributeKey<String> MERGE_TYPE_ATTR = AttributeKey.stringKey("merge_type"); + public static final AttributeKey<String> MERGE_STATE_ATTR = AttributeKey.stringKey("merge_state"); + public static final AttributeKey<String> MERGE_OP_ATTR = AttributeKey.stringKey("merge_op"); + public static final AttributeKey<String> RESULT_ATTR = AttributeKey.stringKey("result"); private final Object CLOSE_LOCK = new Object(); @@ -80,23 +85,17 @@ public class SolrIndexWriter extends IndexWriter { // metrics private long majorMergeDocs = 512 * 1024; - private AttributedLongTimer majorMerge; - private AttributedLongTimer minorMerge; - private AttributedLongCounter majorMergedDocs; - private AttributedLongCounter majorDeletedDocs; - private AttributedLongCounter mergeErrors; - private AttributedLongCounter flushes; // original counter is package-private in IndexWriter - private boolean mergeTotals = false; - private boolean mergeDetails = false; - private final AtomicInteger runningMajorMerges = new AtomicInteger(); - private final AtomicInteger runningMinorMerges = new AtomicInteger(); - private final AtomicInteger runningMajorMergesSegments = new AtomicInteger(); - private final AtomicInteger runningMinorMergesSegments = new AtomicInteger(); - private final AtomicLong runningMajorMergesDocs = new AtomicLong(); - private final AtomicLong runningMinorMergesDocs = new AtomicLong(); - private ObservableLongGauge mergeStats; - - private final SolrMetricsContext solrMetricsContext; + private LongCounter mergesCounter; + private LongCounter mergeDocsCounter; + private LongCounter mergeSegmentsCounter; + private LongCounter flushesCounter; + + private AttributedLongTimer majorMergeTimer; + private AttributedLongTimer minorMergeTimer; + + private SolrMetricsContext solrMetricsContext; + private Attributes baseAttributes; + // merge diagnostics. private final Map<String, Long> runningMerges = new ConcurrentHashMap<>(); @@ -134,8 +133,6 @@ public class SolrIndexWriter extends IndexWriter { numOpens.incrementAndGet(); log.debug("Opened Writer {}", name); // no metrics - mergeTotals = false; - mergeDetails = false; solrMetricsContext = null; } @@ -173,118 +170,8 @@ public class SolrIndexWriter extends IndexWriter { log.warn("Invalid 'majorMergeDocs' argument, using default 512k", e); } } - Boolean Totals = config.metricsInfo.initArgs.getBooleanArg("merge"); - Boolean Details = config.metricsInfo.initArgs.getBooleanArg("mergeDetails"); - if (Details != null) { - mergeDetails = Details; - } else { - mergeDetails = false; - } - if (Totals != null) { - mergeTotals = Totals; - } else { - mergeTotals = false; - } - var baseAttributes = - core.getCoreAttributes().toBuilder() - .put(CATEGORY_ATTR, SolrInfoBean.Category.INDEX.toString()) - .build(); - if (mergeDetails) { - mergeTotals = true; // override - majorMergedDocs = - new AttributedLongCounter( - solrMetricsContext.longCounter( - "solr_indexwriter_major_merged_docs", - "Number of documents merged while merging segments above the majorMergeDocs threshold (" - + majorMergeDocs - + ")"), - baseAttributes); - majorDeletedDocs = - new AttributedLongCounter( - solrMetricsContext.longCounter( - "solr_indexwriter_major_deleted_docs", - "Number of deleted documents that were expunged while merging segments above the majorMergeDocs threshold (" - + majorMergeDocs - + ")"), - baseAttributes); - } - if (mergeTotals) { - minorMerge = - new AttributedLongTimer( - solrMetricsContext.longHistogram( - "solr_indexwriter_merge", - "Time spent merging segments below or equal to the majorMergeDocs threshold (" - + majorMergeDocs - + ")", - OtelUnit.MILLISECONDS), - baseAttributes.toBuilder().put(MERGE_TYPE_ATTR, "minor").build()); - majorMerge = - new AttributedLongTimer( - solrMetricsContext.longHistogram( - "solr_indexwriter_merge", - "Time spent merging segments above the majorMergeDocs threshold (" - + majorMergeDocs - + ")", - OtelUnit.MILLISECONDS), - baseAttributes.toBuilder().put(MERGE_TYPE_ATTR, "major").build()); - mergeErrors = - new AttributedLongCounter( - solrMetricsContext.longCounter( - "solr_indexwriter_merge_errors", "Number of merge errors"), - baseAttributes); - String tag = core.getMetricTag(); - mergeStats = - solrMetricsContext.observableLongGauge( - "solr_indexwriter_merge_stats", - "Metrics around currently running segment merges; major := above the majorMergeDocs threshold (" - + majorMergeDocs - + "), minor := below or equal to the threshold", - (observableLongMeasurement -> { - observableLongMeasurement.record( - runningMajorMerges.get(), - baseAttributes.toBuilder() - .put(TYPE_ATTR, "running") - .put(MERGE_TYPE_ATTR, "major") - .build()); - observableLongMeasurement.record( - runningMajorMergesDocs.get(), - baseAttributes.toBuilder() - .put(TYPE_ATTR, "running_docs") - .put(MERGE_TYPE_ATTR, "major") - .build()); - observableLongMeasurement.record( - runningMajorMergesSegments.get(), - baseAttributes.toBuilder() - .put(TYPE_ATTR, "running_segments") - .put(MERGE_TYPE_ATTR, "major") - .build()); - observableLongMeasurement.record( - runningMinorMerges.get(), - baseAttributes.toBuilder() - .put(TYPE_ATTR, "running") - .put(MERGE_TYPE_ATTR, "minor") - .build()); - observableLongMeasurement.record( - runningMinorMergesDocs.get(), - baseAttributes.toBuilder() - .put(TYPE_ATTR, "running_docs") - .put(MERGE_TYPE_ATTR, "minor") - .build()); - observableLongMeasurement.record( - runningMinorMergesSegments.get(), - baseAttributes.toBuilder() - .put(TYPE_ATTR, "running_segments") - .put(MERGE_TYPE_ATTR, "minor") - .build()); - })); - flushes = - new AttributedLongCounter( - solrMetricsContext.longCounter( - "solr_indexwriter_flush", - "Number of times added/deleted documents have been flushed to the Directory"), - baseAttributes); - } } + initMetrics(core); } @SuppressForbidden( @@ -309,60 +196,36 @@ public class SolrIndexWriter extends IndexWriter { this.directoryFactory = factory; } + // for testing + public void setMajorMergeDocs(long majorMergeDocs) { + this.majorMergeDocs = majorMergeDocs; + } + // we override this method to collect metrics for merges. @Override protected void merge(MergePolicy.OneMerge merge) throws IOException { String segString = merge.segString(); long totalNumDocs = merge.totalNumDocs(); runningMerges.put(segString, totalNumDocs); - if (!mergeTotals) { - try { - super.merge(merge); - } finally { - runningMerges.remove(segString); - } - return; - } long deletedDocs = 0; for (SegmentCommitInfo info : merge.segments) { totalNumDocs -= info.getDelCount(); deletedDocs += info.getDelCount(); } - boolean major = totalNumDocs > majorMergeDocs; int segmentsCount = merge.segments.size(); - AttributedLongTimer.MetricTimer context; - if (major) { - runningMajorMerges.incrementAndGet(); - runningMajorMergesDocs.addAndGet(totalNumDocs); - runningMajorMergesSegments.addAndGet(segmentsCount); - if (mergeDetails) { - majorMergedDocs.add(totalNumDocs); - majorDeletedDocs.add(deletedDocs); - } - context = majorMerge.start(); - } else { - runningMinorMerges.incrementAndGet(); - runningMinorMergesDocs.addAndGet(totalNumDocs); - runningMinorMergesSegments.addAndGet(segmentsCount); - context = minorMerge.start(); - } + AttributedLongTimer.MetricTimer timer = + updateMergeMetrics(totalNumDocs, deletedDocs, segmentsCount, false, false, null); try { super.merge(merge); + updateMergeMetrics(totalNumDocs, deletedDocs, segmentsCount, true, false, timer); } catch (Throwable t) { - mergeErrors.inc(); + if (timer != null) { + timer.stop(); + } + updateMergeMetrics(totalNumDocs, deletedDocs, segmentsCount, true, true, timer); throw t; } finally { runningMerges.remove(segString); - context.stop(); - if (major) { - runningMajorMerges.decrementAndGet(); - runningMajorMergesDocs.addAndGet(-totalNumDocs); - runningMajorMergesSegments.addAndGet(-segmentsCount); - } else { - runningMinorMerges.decrementAndGet(); - runningMinorMergesDocs.addAndGet(-totalNumDocs); - runningMinorMergesSegments.addAndGet(-segmentsCount); - } } } @@ -372,12 +235,111 @@ public class SolrIndexWriter extends IndexWriter { @Override protected void doAfterFlush() throws IOException { - if (flushes != null) { // this is null when writer is used only for snapshot cleanup - flushes.inc(); // or if mergeTotals == false + if (flushesCounter != null) { // this is null when writer is used only for snapshot cleanup + flushesCounter.add(1L, baseAttributes); // or if mergeTotals == false } super.doAfterFlush(); } + private void initMetrics(final SolrCore core) { + if (solrMetricsContext == null) { + solrMetricsContext = core.getSolrMetricsContext().getChildContext(this); + } + + var baseAttributesBuilder = + Attributes.builder().put(CATEGORY_ATTR, SolrInfoBean.Category.INDEX.toString()); + baseAttributes = baseAttributesBuilder.build(); + + mergesCounter = + solrMetricsContext.longCounter( + "solr_indexwriter_merges", + "Number of total merge operations, " + + " where \"major\" merges involve more than " + + majorMergeDocs + + " documents, otherwise merge classified as minor."); + mergeDocsCounter = + solrMetricsContext.longCounter( + "solr_indexwriter_merge_docs", + "Number of documents involved in merge, " + + " where \"major\" merges involve more than " + + majorMergeDocs + + " documents, otherwise merge classified as minor."); + mergeSegmentsCounter = + solrMetricsContext.longCounter( + "solr_indexwriter_merge_segments", + "Number of segments involved in merge, " + + " where \"major\" merges involve more than " + + majorMergeDocs + + " documents, otherwise merge classified as minor."); + flushesCounter = + solrMetricsContext.longCounter( + "solr_indexwriter_flushes", "Number of flush to disk operations triggered"); + + var mergesTimerBase = + solrMetricsContext.longHistogram( + "solr_indexwriter_merge_time", + "Time spent merging segments, " + + " where \"major\" merges involve more than " + + majorMergeDocs + + " documents, otherwise merge classified as minor.", + OtelUnit.MILLISECONDS); + majorMergeTimer = + new AttributedLongTimer( + mergesTimerBase, baseAttributes.toBuilder().put(MERGE_TYPE_ATTR, "major").build()); + minorMergeTimer = + new AttributedLongTimer( + mergesTimerBase, baseAttributes.toBuilder().put(MERGE_TYPE_ATTR, "minor").build()); + } + + /** + * Updates relevant metrics related to segment merging + * + * @param numDocs number of documents in merge op + * @param numDeletedDocs number of deleted docs in merge op + * @param numSegments number of segments in merge op + * @param mergeCompleted true if being called for a successful post-merge, else false to signify a + * merge is about to start + * @param mergeFailed true if merge entered an unrecoverable error state, else false + * @param metricTimer an existing timer context for actively running merge + * @return timer context for current merge operation + */ + private AttributedLongTimer.MetricTimer updateMergeMetrics( + long numDocs, + long numDeletedDocs, + long numSegments, + boolean mergeCompleted, + boolean mergeFailed, + AttributedLongTimer.MetricTimer metricTimer) { + if (solrMetricsContext == null) { + return null; + } + boolean isMajorMerge = numDocs > majorMergeDocs; + var attributes = baseAttributes.toBuilder(); + attributes.put(MERGE_TYPE_ATTR, isMajorMerge ? "major" : "minor"); + Attributes mergeAttr; + if (mergeCompleted) { // merge operation terminating + if (metricTimer != null) { + metricTimer.stop(); + } + attributes.put(MERGE_STATE_ATTR, "completed"); + attributes.put(RESULT_ATTR, mergeFailed ? "error" : "success"); + + } else { // merge operation starting + metricTimer = isMajorMerge ? majorMergeTimer.start() : minorMergeTimer.start(); + attributes.put(MERGE_STATE_ATTR, "started"); + } + mergeAttr = attributes.build(); + mergesCounter.add(1L, mergeAttr); + mergeSegmentsCounter.add(numSegments, mergeAttr); + + mergeDocsCounter.add( + numDocs, mergeAttr.toBuilder().put(MERGE_OP_ATTR, "merge").build()); // docs merged + mergeDocsCounter.add( + numDeletedDocs, mergeAttr.toBuilder().put(MERGE_OP_ATTR, "delete").build()); + + return metricTimer; + } + // use DocumentBuilder now... // private final void addField(Document doc, String name, String val) { // SchemaField ftype = schema.getField(name); @@ -462,7 +424,7 @@ public class SolrIndexWriter extends IndexWriter { if (directoryFactory != null) { directoryFactory.release(directory); } - IOUtils.closeQuietly(mergeStats); + if (solrMetricsContext != null) { solrMetricsContext.unregister(); } diff --git a/solr/core/src/test/org/apache/solr/update/SolrIndexMetricsTest.java b/solr/core/src/test/org/apache/solr/update/SolrIndexMetricsTest.java index 2a40735ef40..f7d0470c88d 100644 --- a/solr/core/src/test/org/apache/solr/update/SolrIndexMetricsTest.java +++ b/solr/core/src/test/org/apache/solr/update/SolrIndexMetricsTest.java @@ -17,7 +17,10 @@ package org.apache.solr.update; import static org.apache.solr.metrics.SolrMetricProducer.CATEGORY_ATTR; +import static org.apache.solr.update.SolrIndexWriter.MERGE_OP_ATTR; +import static org.apache.solr.update.SolrIndexWriter.MERGE_STATE_ATTR; import static org.apache.solr.update.SolrIndexWriter.MERGE_TYPE_ATTR; +import static org.apache.solr.update.SolrIndexWriter.RESULT_ATTR; import io.prometheus.metrics.model.snapshots.MetricSnapshots; import org.apache.solr.SolrTestCaseJ4; @@ -41,7 +44,7 @@ public class SolrIndexMetricsTest extends SolrTestCaseJ4 { SolrQueryRequest req = lrf.makeRequest(); UpdateHandler uh = req.getCore().getUpdateHandler(); AddUpdateCommand add = new AddUpdateCommand(req); - for (int i = 0; i < 1000; i++) { + for (int i = 0; i < 800; i++) { add.clear(); add.solrDoc = new SolrInputDocument(); add.solrDoc.addField("id", "" + i); @@ -53,63 +56,8 @@ public class SolrIndexMetricsTest extends SolrTestCaseJ4 { h.reload(); } - @Test - public void testIndexMetricsNoDetails() throws Exception { - System.setProperty("solr.tests.metrics.merge", "true"); - System.setProperty("solr.tests.metrics.mergeDetails", "false"); - initCore("solrconfig-indexmetrics.xml", "schema.xml"); - - addDocs(); - - try (SolrCore core = h.getCoreContainer().getCore("collection1")) { - // check basic index meters - var minorMergeTimer = - SolrMetricTestUtils.getHistogramDatapoint( - core, - "solr_indexwriter_merge_milliseconds", - SolrMetricTestUtils.newStandaloneLabelsBuilder(core) - .label(CATEGORY_ATTR.toString(), SolrInfoBean.Category.INDEX.toString()) - .label(MERGE_TYPE_ATTR.toString(), "minor") - .build()); - assertTrue("minorMerge: " + minorMergeTimer.getCount(), minorMergeTimer.getCount() >= 3); - var majorMergeTimer = - SolrMetricTestUtils.getHistogramDatapoint( - core, - "solr_indexwriter_merge_milliseconds", - SolrMetricTestUtils.newStandaloneLabelsBuilder(core) - .label(CATEGORY_ATTR.toString(), SolrInfoBean.Category.INDEX.toString()) - .label(MERGE_TYPE_ATTR.toString(), "major") - .build()); - // major merge timer should have a value of 0, and because 0 values are not reported, no - // datapoint is available - assertNull("majorMergeTimer", majorMergeTimer); - - // check detailed meters - var majorMergeDocs = - SolrMetricTestUtils.getCounterDatapoint( - core, - "solr_indexwriter_major_merged_docs", - SolrMetricTestUtils.newStandaloneLabelsBuilder(core) - .label(CATEGORY_ATTR.toString(), SolrInfoBean.Category.INDEX.toString()) - .build()); - // major merge docs should be null because mergeDetails is false - assertNull("majorMergeDocs", majorMergeDocs); - - var flushCounter = - SolrMetricTestUtils.getCounterDatapoint( - core, - "solr_indexwriter_flush", - SolrMetricTestUtils.newStandaloneLabelsBuilder(core) - .label(CATEGORY_ATTR.toString(), SolrInfoBean.Category.INDEX.toString()) - .build()); - assertTrue("flush: " + flushCounter.getValue(), flushCounter.getValue() > 10); - } - } - @Test public void testIndexNoMetrics() throws Exception { - System.setProperty("solr.tests.metrics.merge", "false"); - System.setProperty("solr.tests.metrics.mergeDetails", "false"); initCore("solrconfig-indexmetrics.xml", "schema.xml"); addDocs(); try (SolrCore core = h.getCoreContainer().getCore("collection1")) { @@ -133,9 +81,8 @@ public class SolrIndexMetricsTest extends SolrTestCaseJ4 { } @Test - public void testIndexMetricsWithDetails() throws Exception { - System.setProperty("solr.tests.metrics.merge", "false"); // test mergeDetails override too - System.setProperty("solr.tests.metrics.mergeDetails", "true"); + public void testIndexMetricsMajorAndMinorMerges() throws Exception { + System.setProperty("solr.tests.metrics.majorMergeDocs", "450"); initCore("solrconfig-indexmetrics.xml", "schema.xml"); addDocs(); @@ -144,116 +91,118 @@ public class SolrIndexMetricsTest extends SolrTestCaseJ4 { var prometheusMetricReader = SolrMetricTestUtils.getPrometheusMetricReader(core); assertNotNull(prometheusMetricReader); MetricSnapshots otelMetrics = prometheusMetricReader.collect(); - assertTrue("Metrics count: " + otelMetrics.size(), otelMetrics.size() >= 19); + assertTrue("Metrics count: " + otelMetrics.size(), otelMetrics.size() >= 18); + + // addDocs() adds 800 documents and then sends a commit. maxBufferedDocs==100, + // segmentsPerTier==3, + // maxMergeAtOnce==3 and majorMergeDocs==450. Thus, new documents form segments with 100 + // docs, merges are + // called for when there are 3 segments at the lowest tier, and the merges are as follows: + // 1. 100 + 100 + 100 ==> new 300 doc segment, below the 450 threshold ==> minor merge + // 2. 100 + 100 + 100 ==> new 300 doc segment, below the 450 threshold ==> minor merge + // 3. 300 + 100 + 100 ==> new 500 doc segment, above the 450 threshold ==> major merge // check basic index meters var minorMergeTimer = SolrMetricTestUtils.getHistogramDatapoint( core, - "solr_indexwriter_merge_milliseconds", + "solr_indexwriter_merge_time_milliseconds", SolrMetricTestUtils.newStandaloneLabelsBuilder(core) .label(CATEGORY_ATTR.toString(), SolrInfoBean.Category.INDEX.toString()) .label(MERGE_TYPE_ATTR.toString(), "minor") .build()); - assertTrue("minorMergeTimer: " + minorMergeTimer.getCount(), minorMergeTimer.getCount() >= 3); + assertEquals( + "minorMergeTimer instances should be at least 2, got: " + minorMergeTimer.getCount(), + 2, + minorMergeTimer.getCount()); var majorMergeTimer = SolrMetricTestUtils.getHistogramDatapoint( core, - "solr_indexwriter_merge_milliseconds", + "solr_indexwriter_merge_time_milliseconds", SolrMetricTestUtils.newStandaloneLabelsBuilder(core) .label(CATEGORY_ATTR.toString(), SolrInfoBean.Category.INDEX.toString()) .label(MERGE_TYPE_ATTR.toString(), "major") .build()); - // major merge timer should have a value of 0, and because 0 values are not reported, no - // datapoint is available - assertNull("majorMergeTimer", majorMergeTimer); + assertEquals( + "majorMergeTimer instances should be at least 1, got: " + majorMergeTimer.getCount(), + 1, + majorMergeTimer.getCount()); - // check detailed meters - var majorMergeDocs = + var minorMergeDocs = SolrMetricTestUtils.getCounterDatapoint( core, - "solr_indexwriter_major_merged_docs", + "solr_indexwriter_merge_docs", SolrMetricTestUtils.newStandaloneLabelsBuilder(core) .label(CATEGORY_ATTR.toString(), SolrInfoBean.Category.INDEX.toString()) + .label(MERGE_TYPE_ATTR.toString(), "minor") + .label(MERGE_OP_ATTR.toString(), "merge") + .label(MERGE_STATE_ATTR.toString(), "completed") + .label(RESULT_ATTR.toString(), "success") .build()); - // major merge docs should have a value of 0, and because 0 values are not reported, no - // datapoint is available - assertNull("majorMergeDocs", majorMergeDocs); - - var flushCounter = + assertEquals( + "minorMergeDocs should be 600, got: " + minorMergeDocs.getValue(), + 600, + (long) minorMergeDocs.getValue()); + var majorMergeDocs = SolrMetricTestUtils.getCounterDatapoint( core, - "solr_indexwriter_flush", + "solr_indexwriter_merge_docs", SolrMetricTestUtils.newStandaloneLabelsBuilder(core) .label(CATEGORY_ATTR.toString(), SolrInfoBean.Category.INDEX.toString()) + .label(MERGE_TYPE_ATTR.toString(), "major") + .label(MERGE_OP_ATTR.toString(), "merge") + .label(MERGE_STATE_ATTR.toString(), "completed") + .label(RESULT_ATTR.toString(), "success") .build()); - assertTrue("flush: " + flushCounter.getValue(), flushCounter.getValue() > 10); - } - } - - public void testIndexMetricsMajorAndMinorMergesWithDetails() throws Exception { - System.setProperty("solr.tests.metrics.merge", "false"); // test mergeDetails override too - System.setProperty("solr.tests.metrics.mergeDetails", "true"); - System.setProperty("solr.tests.metrics.majorMergeDocs", "450"); - initCore("solrconfig-indexmetrics.xml", "schema.xml"); - - addDocs(); - - try (SolrCore core = h.getCoreContainer().getCore("collection1")) { - var prometheusMetricReader = SolrMetricTestUtils.getPrometheusMetricReader(core); - assertNotNull(prometheusMetricReader); - MetricSnapshots otelMetrics = prometheusMetricReader.collect(); - assertTrue("Metrics count: " + otelMetrics.size(), otelMetrics.size() >= 18); + assertEquals( + "majorMergeDocs should be 500, got: " + majorMergeDocs.getValue(), + 500, + (long) majorMergeDocs.getValue()); - // addDocs() adds 1000 documents and then sends a commit. maxBufferedDocs==100, - // segmentsPerTier==3, - // maxMergeAtOnce==3 and majorMergeDocs==450. Thus, new documents form segments with 100 - // docs, merges are - // called for when there are 3 segments at the lowest tier, and the merges are as follows: - // 1. 100 + 100 + 100 ==> new 300 doc segment, below the 450 threshold ==> minor merge - // 2. 100 + 100 + 100 ==> new 300 doc segment, below the 450 threshold ==> minor merge - // 3. 300 + 100 + 100 ==> new 500 doc segment, above the 450 threshold ==> major merge - // 4. 300 + 100 + 100 ==> new 500 doc segment, above the 450 threshold ==> major merge - - // check basic index meters - var minorMergeTimer = - SolrMetricTestUtils.getHistogramDatapoint( + // segments metrics + var minorSegmentsMergeMetric = + SolrMetricTestUtils.getCounterDatapoint( core, - "solr_indexwriter_merge_milliseconds", + "solr_indexwriter_merge_segments", SolrMetricTestUtils.newStandaloneLabelsBuilder(core) .label(CATEGORY_ATTR.toString(), SolrInfoBean.Category.INDEX.toString()) .label(MERGE_TYPE_ATTR.toString(), "minor") + .label(MERGE_STATE_ATTR.toString(), "completed") + .label(RESULT_ATTR.toString(), "success") .build()); - assertTrue("minorMergeTimer: " + minorMergeTimer.getCount(), minorMergeTimer.getCount() == 2); - var majorMergeTimer = - SolrMetricTestUtils.getHistogramDatapoint( - core, - "solr_indexwriter_merge_milliseconds", - SolrMetricTestUtils.newStandaloneLabelsBuilder(core) - .label(CATEGORY_ATTR.toString(), SolrInfoBean.Category.INDEX.toString()) - .label(MERGE_TYPE_ATTR.toString(), "major") - .build()); - assertTrue("majorMergeTimer: " + majorMergeTimer.getCount(), majorMergeTimer.getCount() == 2); - - // check detailed meters - var majorMergeDocs = + assertNotNull("minor segment merges metric should exist", minorSegmentsMergeMetric); + assertEquals( + "number of minor segments merged should be 6, got: " + + minorSegmentsMergeMetric.getValue(), + 6, + (long) minorSegmentsMergeMetric.getValue()); + var majorSegmentsMergeMetric = SolrMetricTestUtils.getCounterDatapoint( core, - "solr_indexwriter_major_merged_docs", + "solr_indexwriter_merge_segments", SolrMetricTestUtils.newStandaloneLabelsBuilder(core) .label(CATEGORY_ATTR.toString(), SolrInfoBean.Category.INDEX.toString()) + .label(MERGE_TYPE_ATTR.toString(), "major") + .label(MERGE_STATE_ATTR.toString(), "completed") + .label(RESULT_ATTR.toString(), "success") .build()); - // majorMergeDocs is the total number of docs merged during major merge operations - assertTrue("majorMergeDocs: " + majorMergeDocs.getValue(), majorMergeDocs.getValue() == 1000); + assertNotNull("major segment merges metric should exist", majorSegmentsMergeMetric); + assertEquals( + "number of major segments merged should be 3, got: " + + majorSegmentsMergeMetric.getValue(), + 3, + (long) majorSegmentsMergeMetric.getValue()); var flushCounter = SolrMetricTestUtils.getCounterDatapoint( core, - "solr_indexwriter_flush", + "solr_indexwriter_flushes", SolrMetricTestUtils.newStandaloneLabelsBuilder(core) .label(CATEGORY_ATTR.toString(), SolrInfoBean.Category.INDEX.toString()) .build()); - assertTrue("flush: " + flushCounter.getValue(), flushCounter.getValue() >= 10); + assertTrue( + "should be at greater than 10 flushes: " + flushCounter.getValue(), + flushCounter.getValue() >= 10); } } } diff --git a/solr/server/contexts/solr-jetty-context.xml b/solr/server/contexts/solr-jetty-context.xml index a2656f87284..bd281afaacf 100644 --- a/solr/server/contexts/solr-jetty-context.xml +++ b/solr/server/contexts/solr-jetty-context.xml @@ -3,7 +3,7 @@ <Configure class="org.eclipse.jetty.ee10.webapp.WebAppContext"> <Set name="contextPath"><Property name="hostContext" default="/solr"/></Set> <Set name="war"><Property name="jetty.base"/>/solr-webapp/webapp</Set> - <Set name="defaultsDescriptor"><Property name="jetty.base"/>/etc/webdefault.xml</Set> +<!-- <Set name="defaultsDescriptor"><Property name="jetty.base"/>/etc/webdefault.xml</Set>--> <Set name="extractWAR">false</Set> <!-- diff --git a/solr/solr-ref-guide/modules/deployment-guide/pages/metrics-reporting.adoc b/solr/solr-ref-guide/modules/deployment-guide/pages/metrics-reporting.adoc index ef02fcd5042..0f8b5c37534 100644 --- a/solr/solr-ref-guide/modules/deployment-guide/pages/metrics-reporting.adoc +++ b/solr/solr-ref-guide/modules/deployment-guide/pages/metrics-reporting.adoc @@ -87,7 +87,7 @@ When making requests with the <<Metrics API>>, you can specify `&group=core` to * all common RequestHandlers report: request timers / counters, timeouts, errors. Handlers that support process distributed shard requests also report `shardRequests` sub-counters for each type of distributed request. -* <<Index Merge Metrics,index-level events>>: meters for minor / major merges, number of merged docs, number of deleted docs, gauges for currently running merges and their size. +* <<Index Merge Metrics,index-level events>>: meters for minor / major merges, number of merged docs, number of deleted docs, number of flushes * shard replication and transaction log replay on replicas, * open / available / pending connections for shard handler and update handler. @@ -312,7 +312,7 @@ complex objects: ---- === Caching Threads Metrics === -The threads metrics in the JVM group can be expensive to compute, as it requires traversing all threads. +The threads metrics in the JVM group can be expensive to compute, as it requires traversing all threads. This can be avoided for every call to the metrics API (group=jvm) by setting a high caching expiration interval (in seconds). For example, to cache the thread metrics for 5 seconds: @@ -655,11 +655,13 @@ Metrics can be aggregated across cores using Shard and Cluster reporters. === Index Merge Metrics -These metrics are collected in respective registries for each core (e.g., `solr.core.collection1....`), under the `INDEX` category. +These metrics are collected under the `INDEX` category and track flush operations (documents being written to disk) and merge operations (segments on disk being merged). -Metrics collection is controlled by boolean parameters in the `<metrics>` section of `solrconfig.xml`: +For merge metrics, metrics are tracked with the distinction of "minor" and "major" merges (as merges with fewer documents will be typically more frequent). +This is indicated by the `merge_type` label for the metric. The threshold for when a merge becomes large enough to be considered major is configurable, but +defaults to 524k documents. -Basic metrics: +Metrics collection for index merges can be configured in the `<metrics>` section of `solrconfig.xml` as shown below: [source,xml] ---- @@ -668,7 +670,6 @@ Basic metrics: <indexConfig> <metrics> <long name="majorMergeDocs">524288</long> - <bool name="merge">true</bool> </metrics> ... </indexConfig> @@ -676,43 +677,6 @@ Basic metrics: </config> ---- -Detailed metrics: - -[source,xml] ----- -<config> - ... - <indexConfig> - <metrics> - <long name="majorMergeDocs">524288</long> - <bool name="mergeDetails">true</bool> - </metrics> - ... - </indexConfig> -... -</config> ----- - -The following metrics are collected: - -* `INDEX.merge.major` - timer for merge operations that include at least "majorMergeDocs" (default value for this parameter is 512k documents). -* `INDEX.merge.minor` - timer for merge operations that include less than "majorMergeDocs". -* `INDEX.merge.errors` - counter for merge errors. -* `INDEX.flush` - meter for index flush operations. - -Additionally, the following gauges are reported, which help to monitor the momentary state of index merge operations: - -* `INDEX.merge.major.running` - number of running major merge operations (depending on the implementation of `MergeScheduler` that is used there can be several concurrently running merge operations). -* `INDEX.merge.minor.running` - as above, for minor merge operations. -* `INDEX.merge.major.running.docs` - total number of documents in the segments being currently merged in major merge operations. -* `INDEX.merge.minor.running.docs` - as above, for minor merge operations. -* `INDEX.merge.major.running.segments` - number of segments being currently merged in major merge operations. -* `INDEX.merge.minor.running.segments` - as above, for minor merge operations. - -If the boolean flag `mergeDetails` is true then the following additional metrics are collected: - -* `INDEX.merge.major.docs` - meter for the number of documents merged in major merge operations -* `INDEX.merge.major.deletedDocs` - meter for the number of deleted documents expunged in major merge operations == Metrics API
