This is an automated email from the ASF dual-hosted git repository.
jiaguo pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git
The following commit(s) were added to refs/heads/master by this push:
new d1827c5a17 Support madvise for MmapMemory (#13721)
d1827c5a17 is described below
commit d1827c5a17be77d539affe54cf4310e9fe33844c
Author: Dino Occhialini <[email protected]>
AuthorDate: Thu Aug 15 13:58:50 2024 -0700
Support madvise for MmapMemory (#13721)
* Support madvise for MmapMemory
* Remove default madvise and keep current behavior
* Add instance-level config for default madvise for mmap buffers
* Fix checkstyle
* Log errors from posix_madvise() calls
* Warn when JNR cannot load libC for madvise
* Fix checkstyle for fallthrough of switch
---
.../segment/spi/memory/unsafe/MmapMemory.java | 66 ++++++++++++++++++
.../spi/memory/unsafe/MmapMemoryConfig.java | 81 ++++++++++++++++++++++
.../server/starter/helix/BaseServerStarter.java | 7 ++
.../apache/pinot/spi/utils/CommonConstants.java | 1 +
4 files changed, 155 insertions(+)
diff --git
a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/memory/unsafe/MmapMemory.java
b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/memory/unsafe/MmapMemory.java
index ce91c255fa..a3e0661f6c 100644
---
a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/memory/unsafe/MmapMemory.java
+++
b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/memory/unsafe/MmapMemory.java
@@ -28,6 +28,8 @@ import java.lang.reflect.Method;
import java.nio.channels.FileChannel;
import java.util.List;
import java.util.function.BiConsumer;
+import jnr.ffi.LibraryLoader;
+import jnr.ffi.types.size_t;
import net.openhft.chronicle.core.Jvm;
import net.openhft.chronicle.core.OS;
import net.openhft.posix.MSyncFlag;
@@ -124,6 +126,16 @@ public class MmapMemory implements Memory {
});
private final long _address;
private final UnmapFun _unmapFun;
+ private static final LibC LIB_C;
+ static {
+ LibC libC = null;
+ try {
+ libC = LibraryLoader.create(LibC.class).failImmediately().load("c");
+ } catch (Throwable ignored) {
+ LOGGER.warn("Could not load JNR C Library, madvise will not be used
for mmap memory.");
+ }
+ LIB_C = libC;
+ }
public MapSection(long address, UnmapFun unmapFun) {
_address = address;
@@ -137,6 +149,46 @@ public class MmapMemory implements Memory {
public UnmapFun getUnmapFun() {
return _unmapFun;
}
+
+ /**
+ * Call posix_madvise (if available) at the address aligned start of
MapSection for size bytes
+ *
+ * Internally posix_madvise operates on pages, so unaligned size would
affect all remaining bytes on the page.
+ * _address is expected to be page aligned.
+ * In the future, it may be helpful to expose this to upstream consumers
as a "hint()" abstraction in cases where
+ * advice other than MADV_RANDOM perform better (particularly sequential
reads on slow filesystems).
+ *
+ * Errors during advise are logged as warnings but otherwise ignored, since this is considered a nice-to-have step.
+ *
+ * @param size Size of the region to set advice for.
+ * @param advice Specific advice to apply (see the LibC interface for
options)
+ */
+ protected void madvise(long size, int advice) {
+ if (LIB_C != null) {
+ int errno = LIB_C.posix_madvise(_address, size, advice);
+ switch (errno) {
+ case 0:
+ // 0 indicates a successful call
+ break;
+ case 22:
+ LOGGER.warn("posix_madvise failed with EINVAL, either addr is not
aligned or advice was invalid");
+ break;
+ case 12:
+ LOGGER.warn("posix_madvise failed with ENOMEM, indicating a bad
address or size");
+ break;
+ default:
+ LOGGER.warn("posix_madvise returned an unknown error code: {}",
errno);
+ break;
+ }
+ }
+ }
+
+ protected void madvise(long size) {
+ int defaultAdvice = MmapMemoryConfig.getDefaultAdvice();
+ if (defaultAdvice >= 0) {
+ madvise(size, defaultAdvice);
+ }
+ }
}
/**
@@ -218,6 +270,7 @@ public class MmapMemory implements Memory {
long mapSize = size + pagePosition;
MapSection map0Section = map0(fc, readOnly, mapPosition, mapSize);
+ map0Section.madvise(mapSize);
return new MapSection(map0Section.getAddress() + pagePosition,
map0Section.getUnmapFun());
} catch (InvocationTargetException | IllegalAccessException e) {
throw new RuntimeException("Cannot map file " + file + " from address
" + offset + " with size " + size, e);
@@ -369,4 +422,17 @@ public class MmapMemory implements Memory {
private interface Finder<C> {
C tryFind() throws NoSuchMethodException, ClassNotFoundException;
}
+
+ // CHECKSTYLE:OFF
+ protected interface LibC {
+ public static final int POSIX_MADV_NORMAL = 0; /* No further special
treatment */
+ public static final int POSIX_MADV_RANDOM = 1; /* Expect random page
references */
+ public static final int POSIX_MADV_SEQUENTIAL = 2; /* Expect sequential
page references */
+ public static final int POSIX_MADV_WILLNEED = 3; /* Will need these pages
*/
+ public static final int POSIX_MADV_DONTNEED = 4; /* Don't need these pages
*/
+
+ @SuppressWarnings({"UnusedReturnValue"})
+ int posix_madvise(@size_t long address, @size_t long size, int advice);
+ }
+ // CHECKSTYLE:ON
}
diff --git
a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/memory/unsafe/MmapMemoryConfig.java
b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/memory/unsafe/MmapMemoryConfig.java
new file mode 100644
index 0000000000..fc6b5b6b48
--- /dev/null
+++
b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/memory/unsafe/MmapMemoryConfig.java
@@ -0,0 +1,81 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.segment.spi.memory.unsafe;
+
+import com.google.common.base.Preconditions;
+import org.apache.commons.lang3.EnumUtils;
+
+/**
+ * Simple singleton config for managing advanced mmap configurations
+ */
+public class MmapMemoryConfig {
+ private MmapMemoryConfig() { }
+ private static final MmapMemoryConfig INSTANCE = new MmapMemoryConfig();
+
+ enum Advice {
+ NORMAL(MmapMemory.LibC.POSIX_MADV_NORMAL),
+ RANDOM(MmapMemory.LibC.POSIX_MADV_RANDOM),
+ SEQUENTIAL(MmapMemory.LibC.POSIX_MADV_SEQUENTIAL),
+ WILL_NEED(MmapMemory.LibC.POSIX_MADV_WILLNEED),
+ DONT_NEED(MmapMemory.LibC.POSIX_MADV_DONTNEED);
+
+ private final int _advice;
+
+ Advice(int advice) {
+ _advice = advice;
+ }
+
+ /**
+ * Get posix-compatible advice integer
+ */
+ public int getAdvice() {
+ return _advice;
+ }
+ }
+
+ /**
+ * Advice to use by default after calling mmap on a region of a file.
+ * Notably this is expected to be an integer corresponding to advice
+ * supported by posix_madvise()
+ */
+ private int _defaultAdvice = -1;
+
+ public static int getDefaultAdvice() {
+ return INSTANCE._defaultAdvice;
+ }
+
+ public static void setDefaultAdvice(int advice) {
+ Preconditions.checkArgument(
+ advice >= 0 && advice <= 4,
+ "Default advice for mmap buffers must be posix_madvise compatible (0-4): %s",
+ advice // rendered via %s: Guava's checkArgument templates only substitute %s, never %d
+ );
+ INSTANCE._defaultAdvice = advice;
+ }
+
+ public static void setDefaultAdvice(String adviceString) {
+ Preconditions.checkArgument(
+ EnumUtils.isValidEnum(Advice.class, adviceString),
+ "Default advice for mmap buffers must match a posix_madvise compatible
option: %s",
+ adviceString
+ );
+
+ setDefaultAdvice(Advice.valueOf(adviceString).getAdvice());
+ }
+}
diff --git
a/pinot-server/src/main/java/org/apache/pinot/server/starter/helix/BaseServerStarter.java
b/pinot-server/src/main/java/org/apache/pinot/server/starter/helix/BaseServerStarter.java
index 54e118634f..dc4100eebc 100644
---
a/pinot-server/src/main/java/org/apache/pinot/server/starter/helix/BaseServerStarter.java
+++
b/pinot-server/src/main/java/org/apache/pinot/server/starter/helix/BaseServerStarter.java
@@ -75,6 +75,7 @@ import org.apache.pinot.core.util.ListenerConfigUtil;
import
org.apache.pinot.segment.local.realtime.impl.invertedindex.RealtimeLuceneIndexRefreshManager;
import
org.apache.pinot.segment.local.realtime.impl.invertedindex.RealtimeLuceneTextIndexSearcherPool;
import org.apache.pinot.segment.spi.memory.PinotDataBuffer;
+import org.apache.pinot.segment.spi.memory.unsafe.MmapMemoryConfig;
import org.apache.pinot.server.access.AccessControlFactory;
import org.apache.pinot.server.api.AdminApiApplication;
import org.apache.pinot.server.conf.ServerConf;
@@ -202,6 +203,12 @@ public abstract class BaseServerStarter implements
ServiceStartable {
// Initialize Pinot Environment Provider
_pinotEnvironmentProvider = initializePinotEnvironmentProvider();
+ // Set instance-level mmap advice defaults
+ String defaultMmapAdvice =
_serverConf.getProperty(Server.CONFIG_OF_MMAP_DEFAULT_ADVICE);
+ if (defaultMmapAdvice != null) {
+ MmapMemoryConfig.setDefaultAdvice(defaultMmapAdvice);
+ }
+
// Initialize the data buffer factory
PinotDataBuffer.loadDefaultFactory(serverConf);
diff --git
a/pinot-spi/src/main/java/org/apache/pinot/spi/utils/CommonConstants.java
b/pinot-spi/src/main/java/org/apache/pinot/spi/utils/CommonConstants.java
index 6769ae1894..2bfc61e7f4 100644
--- a/pinot-spi/src/main/java/org/apache/pinot/spi/utils/CommonConstants.java
+++ b/pinot-spi/src/main/java/org/apache/pinot/spi/utils/CommonConstants.java
@@ -662,6 +662,7 @@ public class CommonConstants {
public static final double DEFAULT_SERVER_CONSUMPTION_RATE_LIMIT = 0.0;
public static final String DEFAULT_READ_MODE = "mmap";
+ public static final String CONFIG_OF_MMAP_DEFAULT_ADVICE =
"pinot.server.mmap.advice.default";
// Whether to reload consuming segment on scheme update
public static final boolean DEFAULT_RELOAD_CONSUMING_SEGMENT = true;
public static final String DEFAULT_INSTANCE_BASE_DIR =
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]