shubhamsrkdev commented on code in PR #15620:
URL: https://github.com/apache/lucene/pull/15620#discussion_r2795852540


##########
lucene/core/src/java/org/apache/lucene/index/TemporalMergePolicy.java:
##########
@@ -0,0 +1,980 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.index;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeMap;
+import java.util.concurrent.TimeUnit;
+import java.util.logging.Logger;
+import org.apache.lucene.codecs.PointsFormat;
+import org.apache.lucene.codecs.PointsReader;
+import org.apache.lucene.document.LongPoint;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IOContext;
+
+/**
+ * A merge policy that groups segments by time windows and merges segments within the same window.
+ * This policy is designed for time-series data where documents contain a timestamp field indexed
+ * as a {@link org.apache.lucene.document.LongPoint}.
+ *
+ * <p>This policy organizes segments into time buckets based on the maximum 
timestamp in each
+ * segment. Recent data goes into small time windows (e.g., 1 hour), while 
older data is grouped
+ * into exponentially larger windows (e.g., 4 hours, 16 hours, etc.). Segments 
within the same time
+ * window are merged together when they meet the configured thresholds, but 
segments from different
+ * time windows are never merged together, preserving temporal locality.
+ *
+ * <p><b>When to use this policy:</b>
+ *
+ * <ul>
+ *   <li>Time-series data where queries typically filter by time ranges
+ *   <li>Data with a timestamp field that can be used for bucketing
+ *   <li>Workloads where older data is queried less frequently than recent data
+ *   <li>Use cases where you want to avoid mixing old and new data in the same 
segment
+ * </ul>
+ *
+ * <p><b>Configuration:</b>
+ *
+ * <pre class="prettyprint">
+ * TemporalMergePolicy policy = new TemporalMergePolicy()
+ *     .setTemporalField("timestamp")           // Required: name of the 
timestamp field
+ *     .setBaseTimeSeconds(3600)                // Base window size: 1 hour
+ *     .setMinThreshold(4)                      // Merge when 4+ segments in a 
window
+ *     .setMaxThreshold(8)                      // Merge at most 8 segments at 
once
+ *     .setCompactionRatio(1.2)                 // Size ratio threshold for 
merging
+ *     .setUseExponentialBuckets(true);         // Use exponentially growing 
windows
+ *
+ * IndexWriterConfig config = new IndexWriterConfig(analyzer);
+ * config.setMergePolicy(policy);
+ * </pre>
+ *
+ * <p><b>Time bucketing:</b> When {@link #setUseExponentialBuckets} is true 
(default), window sizes
+ * grow exponentially: {@code baseTime}, {@code baseTime * minThreshold}, 
{@code baseTime *
+ * minThreshold^2}, etc. This ensures that recent data is in small, 
frequently-merged windows while
+ * older data is in larger, less-frequently-merged windows. When false, all 
windows have the same
+ * size ({@code baseTime}).
+ *
+ * <p><b>Compaction ratio:</b> The {@link #setCompactionRatio} parameter 
controls when merges are
+ * triggered. A merge is considered when the total document count across 
candidate segments exceeds
+ * {@code largestSegment * compactionRatio}. Lower values (e.g., 1.2) trigger 
merges more
+ * aggressively, while higher values (e.g., 2.0) allow more segments to 
accumulate before merging.
+ * Set to 1.0 for most aggressive merging.
+ *
+ * <p><b>NOTE:</b> This policy requires a timestamp field indexed as a {@link
+ * org.apache.lucene.document.LongPoint}. The timestamp can be in seconds, 
milliseconds, or
+ * microseconds (auto-detected based on value magnitude).
+ *
+ * <p><b>NOTE:</b> Segments from different time windows are never merged 
together, even during
+ * {@link IndexWriter#forceMerge(int)}. If you call {@code forceMerge(1)} but 
have segments in
+ * multiple time windows, you will end up with one segment per time window.
+ *
+ * <p><b>NOTE:</b> Very old segments (older than {@link #setMaxAgeSeconds}) 
are not merged to avoid
+ * unnecessary I/O on cold data.
+ *
+ * @lucene.experimental
+ */
+public class TemporalMergePolicy extends MergePolicy {
+
+  private static final Logger log = 
Logger.getLogger(TemporalMergePolicy.class.getName());
+
+  // Configuration parameters
+  private String temporalField = "";
+  private long baseTimeSeconds = 3600; // 1 hour default
+  private int minThreshold = 4; // Minimum segments to trigger merge within a 
window
+  private boolean useExponentialBuckets = true; // Use exponential window 
sizing
+  private long maxWindowSizeSeconds = TimeUnit.DAYS.toSeconds(365); // Maximum 
window size (1 year)
+  private long maxAgeSeconds =
+      Long.MAX_VALUE; // Don't compact segments older than this (default: no 
limit)
+  private int maxThreshold = 8; // Maximum segments to merge from a window at 
once
+  private double compactionRatio =
+      1.2d; // Ratio between the total docs and biggest segment to trigger 
merge
+  private double forceMergeDeletesPctAllowed = 10.0d;
+  private Map<SegmentCommitInfo, SegmentDateRange> segmentDateRangeOverrides;
+
+  /** Sole constructor, setting all settings to their defaults. */
+  public TemporalMergePolicy() {}
+
+  /**
+   * Sets the name of the timestamp field used for temporal bucketing. This 
field must be indexed as
+   * a {@link LongPoint} and contain timestamp values in seconds, 
milliseconds, or microseconds
+   * (auto-detected based on value magnitude).
+   *
+   * <p><b>This parameter is required</b> and must be set before the policy 
can schedule any merges.
+   * The merge policy will extract the minimum and maximum timestamps from 
each segment to determine
+   * which time window the segment belongs to.
+   *
+   * <p>Default is empty (no temporal field configured, policy is inactive).
+   */
+  public TemporalMergePolicy setTemporalField(String temporalField) {
+    if (temporalField == null || temporalField.isBlank()) {
+      throw new IllegalArgumentException("temporalField cannot be blank");
+    }
+    this.temporalField = temporalField;
+    return this;
+  }
+
+  /**
+   * Returns the current temporal field name.
+   *
+   * @see #setTemporalField
+   */
+  public String getTemporalField() {
+    return temporalField;
+  }
+
+  /**
+   * Sets the base time window size in seconds. This determines the size of 
the smallest (most
+   * recent) time buckets.
+   *
+   * <p>When {@link #setUseExponentialBuckets} is enabled (default), window 
sizes grow
+   * exponentially: {@code baseTime}, {@code baseTime * minThreshold}, {@code 
baseTime *
+   * minThreshold^2}, etc. When disabled, all windows have the same size equal 
to {@code baseTime}.
+   *
+   * <p>Smaller values create finer-grained time windows, which can improve 
query performance for
+   * time-range queries but may result in more segments. Larger values reduce 
the number of time
+   * windows but may mix data from a wider time range in the same segment.
+   *
+   * <p>Default is 3600 seconds (1 hour).
+   */
+  public TemporalMergePolicy setBaseTimeSeconds(long baseTimeSeconds) {
+    if (baseTimeSeconds <= 0) {
+      throw new IllegalArgumentException("baseTimeSeconds must be positive");
+    }
+    this.baseTimeSeconds = baseTimeSeconds;
+    return this;
+  }
+
+  /**
+   * Returns the current base time window size in seconds.
+   *
+   * @see #setBaseTimeSeconds
+   */
+  public long getBaseTimeSeconds() {
+    return baseTimeSeconds;
+  }
+
+  /**
+   * Sets the minimum number of segments required in a time window to trigger 
a merge. Higher values
+   * reduce merge frequency and I/O but allow more segments to accumulate. 
Lower values keep segment
+   * counts lower but increase write amplification.
+   *
+   * <p>This threshold is also used as the growth factor for exponential 
bucketing when {@link
+   * #setUseExponentialBuckets} is enabled. For example, with {@code 
minThreshold=4}, window sizes
+   * will be: {@code baseTime}, {@code baseTime * 4}, {@code baseTime * 16}, 
etc.
+   *
+   * <p>Must be at least 2 and cannot exceed {@link #setMaxThreshold}. Default 
is 4.
+   */
+  public TemporalMergePolicy setMinThreshold(int minThreshold) {
+    if (minThreshold < 2) {
+      throw new IllegalArgumentException("minThreshold must be at least 2");
+    }
+    if (minThreshold > maxThreshold) {
+      throw new IllegalArgumentException(
+          "minThreshold cannot exceed maxThreshold (" + maxThreshold + ")");
+    }
+    this.minThreshold = minThreshold;
+    return this;
+  }
+
+  /**
+   * Returns the current minimum threshold for merging.
+   *
+   * @see #setMinThreshold
+   */
+  public int getMinThreshold() {
+    return minThreshold;
+  }
+
+  /**
+   * Sets whether to use exponentially growing time windows. When enabled 
(default), older data is
+   * grouped into progressively larger time buckets: {@code baseTime}, {@code 
baseTime *
+   * minThreshold}, {@code baseTime * minThreshold^2}, etc.
+   *
+   * <p>When disabled, all time windows have a fixed size equal to {@code 
baseTime}, which can be
+   * useful for workloads with uniform query patterns across all time ranges.
+   *
+   * <p>Exponential bucketing is recommended for typical time-series use cases 
where recent data is
+   * accessed more frequently than older data.
+   *
+   * <p>Default is true.
+   */
+  public TemporalMergePolicy setUseExponentialBuckets(boolean 
useExponentialBuckets) {
+    this.useExponentialBuckets = useExponentialBuckets;
+    return this;
+  }
+
+  /**
+   * Returns whether exponential bucketing is enabled.
+   *
+   * @see #setUseExponentialBuckets
+   */
+  public boolean getUseExponentialBuckets() {
+    return useExponentialBuckets;
+  }
+
+  /**
+   * Sets the maximum number of segments to merge at once within a time 
window. Larger values allow
+   * more aggressive merging (reducing segment count faster) but increase the 
cost of individual
+   * merge operations.
+   *
+   * <p>Must be at least equal to {@link #setMinThreshold}. When a time window 
accumulates more
+   * segments than this threshold, the policy will schedule multiple smaller 
merges rather than one
+   * large merge.
+   *
+   * <p>Default is 8.
+   */
+  public TemporalMergePolicy setMaxThreshold(int maxThreshold) {
+    if (maxThreshold < minThreshold) {
+      throw new IllegalArgumentException(
+          "maxThreshold must be >= minThreshold (" + minThreshold + ")");
+    }
+    this.maxThreshold = maxThreshold;
+    return this;
+  }
+
+  /**
+   * Returns the current maximum threshold for merging.
+   *
+   * @see #setMaxThreshold
+   */
+  public int getMaxThreshold() {
+    return maxThreshold;
+  }
+
+  /**
+   * Sets the compaction ratio that controls when merges are triggered based 
on segment size
+   * distribution. A merge is considered when the total document count of 
candidate segments exceeds
+   * {@code largestSegment * compactionRatio}.
+   *
+   * <p>Lower values (e.g., 1.2) trigger merges more aggressively, even when 
segment sizes are
+   * relatively balanced. Higher values (e.g., 2.0 or higher) wait for more 
size imbalance before
+   * merging, allowing more segments to accumulate but reducing write 
amplification.
+   *
+   * <p>Setting this to exactly 1.0 enables the most aggressive merging mode, 
where merges occur
+   * whenever the minimum threshold is met, regardless of segment size 
distribution.
+   *
+   * <p>This parameter works together with {@link #setMinThreshold}: a time 
window must have both
+   * (1) at least {@code minThreshold} segments, and (2) satisfy the 
compaction ratio, before a
+   * merge is triggered.
+   *
+   * <p>Default is 1.2.
+   */
+  public TemporalMergePolicy setCompactionRatio(double compactionRatio) {

Review Comment:
   I guess it's fine — since it's an advanced knob, I don't mind. If it weren't, a preset (an enum of, say, 3 levels?) would have been better; not a concern here, though.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to