This is an automated email from the ASF dual-hosted git repository.
ibessonov pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/ignite-3.git
The following commit(s) were added to refs/heads/main by this push:
new c7c620b3c3 IGNITE-19700 Added several optimizations for meta-storage reads. (#2218)
c7c620b3c3 is described below
commit c7c620b3c3c80698de2ce0a6855b6495ca351b4b
Author: Ivan Bessonov <[email protected]>
AuthorDate: Tue Jun 20 13:51:05 2023 +0300
IGNITE-19700 Added several optimizations for meta-storage reads. (#2218)
---
.../server/persistence/RocksDbKeyValueStorage.java | 36 ++++++++++++++++------
.../server/persistence/RocksStorageUtils.java | 18 ++++++-----
2 files changed, 36 insertions(+), 18 deletions(-)
diff --git a/modules/metastorage/src/main/java/org/apache/ignite/internal/metastorage/server/persistence/RocksDbKeyValueStorage.java b/modules/metastorage/src/main/java/org/apache/ignite/internal/metastorage/server/persistence/RocksDbKeyValueStorage.java
index a03de949bc..15ba8667af 100644
--- a/modules/metastorage/src/main/java/org/apache/ignite/internal/metastorage/server/persistence/RocksDbKeyValueStorage.java
+++ b/modules/metastorage/src/main/java/org/apache/ignite/internal/metastorage/server/persistence/RocksDbKeyValueStorage.java
@@ -39,6 +39,7 @@ import static org.apache.ignite.lang.ErrorGroups.MetaStorage.COMPACTION_ERR;
import static org.apache.ignite.lang.ErrorGroups.MetaStorage.OP_EXECUTION_ERR;
import static org.apache.ignite.lang.ErrorGroups.MetaStorage.RESTORING_STORAGE_ERR;
import static org.apache.ignite.lang.ErrorGroups.MetaStorage.STARTING_STORAGE_ERR;
+import static org.rocksdb.util.SizeUnit.MB;
import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
@@ -86,10 +87,13 @@ import org.apache.ignite.internal.util.Cursor;
import org.apache.ignite.internal.util.IgniteUtils;
import org.jetbrains.annotations.Nullable;
import org.jetbrains.annotations.TestOnly;
+import org.rocksdb.BlockBasedTableConfig;
+import org.rocksdb.BloomFilter;
import org.rocksdb.ColumnFamilyDescriptor;
import org.rocksdb.ColumnFamilyHandle;
import org.rocksdb.ColumnFamilyOptions;
import org.rocksdb.DBOptions;
+import org.rocksdb.LRUCache;
import org.rocksdb.Options;
import org.rocksdb.ReadOptions;
import org.rocksdb.RocksDB;
@@ -239,20 +243,31 @@ public class RocksDbKeyValueStorage implements KeyValueStorage {
}
private static List<ColumnFamilyDescriptor> cfDescriptors() {
- Options dataOptions = new Options().setCreateIfMissing(true)
+ Options baseOptions = new Options()
+ .setCreateIfMissing(true)
+ // Lowering the desired number of levels will, on average, lead to less lookups in files, making reads faster.
+ .setNumLevels(4)
+ // Protect ourselves from slower flushes during the peak write load.
+ .setMaxWriteBufferNumber(4)
+ .setTableFormatConfig(new BlockBasedTableConfig()
+ // Speed-up key lookup in levels by adding a bloom filter and always caching it for level 0.
+ // This improves the access time to keys from lower levels. 12 is chosen to fit into a 4kb memory chunk.
+ // This proved to be big enough to positively affect the performance.
+ .setPinL0FilterAndIndexBlocksInCache(true)
+ .setFilterPolicy(new BloomFilter(12))
+ // Often helps to avoid reading data from the storage device, making reads faster.
+ .setBlockCache(new LRUCache(64 * MB))
+ );
+
+ ColumnFamilyOptions dataFamilyOptions = new ColumnFamilyOptions(baseOptions)
// The prefix is the revision of an entry, so prefix length is the size of a long
.useFixedLengthPrefixExtractor(Long.BYTES);
- ColumnFamilyOptions dataFamilyOptions = new ColumnFamilyOptions(dataOptions);
+ ColumnFamilyOptions indexFamilyOptions = new ColumnFamilyOptions(baseOptions);
- Options indexOptions = new Options().setCreateIfMissing(true);
- ColumnFamilyOptions indexFamilyOptions = new ColumnFamilyOptions(indexOptions);
+ ColumnFamilyOptions tsToRevFamilyOptions = new ColumnFamilyOptions(baseOptions);
- Options tsToRevOptions = new Options().setCreateIfMissing(true);
- ColumnFamilyOptions tsToRevFamilyOptions = new ColumnFamilyOptions(tsToRevOptions);
-
- Options revToTsOptions = new Options().setCreateIfMissing(true);
- ColumnFamilyOptions revToTsFamilyOptions = new ColumnFamilyOptions(revToTsOptions);
+ ColumnFamilyOptions revToTsFamilyOptions = new ColumnFamilyOptions(baseOptions);
return List.of(
new ColumnFamilyDescriptor(DATA.nameAsBytes(), dataFamilyOptions),
@@ -427,7 +442,8 @@ public class RocksDbKeyValueStorage implements KeyValueStorage {
* @throws RocksDBException If failed.
*/
private void fillAndWriteBatch(WriteBatch batch, long newRev, long newCntr, @Nullable HybridTimestamp ts) throws RocksDBException {
- try (WriteOptions opts = new WriteOptions()) {
+ // Meta-storage recovery is based on the snapshot & external log. WAL is never used for recovery, and can be safely disabled.
+ try (WriteOptions opts = new WriteOptions().setDisableWAL(true)) {
byte[] revisionBytes = longToBytes(newRev);
data.put(batch, UPDATE_COUNTER_KEY, longToBytes(newCntr));
diff --git a/modules/metastorage/src/main/java/org/apache/ignite/internal/metastorage/server/persistence/RocksStorageUtils.java b/modules/metastorage/src/main/java/org/apache/ignite/internal/metastorage/server/persistence/RocksStorageUtils.java
index 1c4ed24fbe..1ce90f6d38 100644
--- a/modules/metastorage/src/main/java/org/apache/ignite/internal/metastorage/server/persistence/RocksStorageUtils.java
+++ b/modules/metastorage/src/main/java/org/apache/ignite/internal/metastorage/server/persistence/RocksStorageUtils.java
@@ -23,7 +23,6 @@ import java.lang.invoke.MethodHandles;
import java.lang.invoke.VarHandle;
import java.nio.ByteOrder;
import java.util.Arrays;
-import java.util.stream.IntStream;
import org.apache.ignite.internal.metastorage.server.Value;
import org.jetbrains.annotations.Nullable;
@@ -185,9 +184,15 @@ class RocksStorageUtils {
// Value must be divisible by a size of a long, because it's a list of longs
assert (bytes.length % Long.BYTES) == 0;
- return IntStream.range(0, bytes.length / Long.BYTES)
- .mapToLong(i -> (long) LONG_ARRAY_HANDLE.get(bytes, i * Long.BYTES))
- .toArray();
+ int size = bytes.length / Long.BYTES;
+
+ long[] res = new long[size];
+
+ for (int i = 0; i < size; i++) {
+ res[i] = (long) LONG_ARRAY_HANDLE.get(bytes, i * Long.BYTES);
+ }
+
+ return res;
}
/**
@@ -202,11 +207,8 @@ class RocksStorageUtils {
return longToBytes(value);
}
- // Allocate a one long bigger array.
- var result = new byte[bytes.length + Long.BYTES];
-
// Copy the current value
- System.arraycopy(bytes, 0, result, 0, bytes.length);
+ var result = Arrays.copyOf(bytes, bytes.length + Long.BYTES);
LONG_ARRAY_HANDLE.set(result, bytes.length, value);