[
https://issues.apache.org/jira/browse/CASSANDRA-19661?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=18055654#comment-18055654
]
Dmitry Konstantinov commented on CASSANDRA-19661:
-------------------------------------------------
For UCS it should be the following combination (I forgot to mention sstable_growth):
{code:java}
'min_sstable_size' : '0MiB', 'base_shard_count': '1', 'sstable_growth' :
'1'{code}
A message like this should be printed in the log before a flush for the table:
{code:java}
Shard count 1 for density {some number} in fixed shards mode {code}
====
Regarding
{quote}I've consistently observed a pattern where nodes get into a state where
pending mutations grow at a steady rate.
{quote}
In the shared logs I see a combination of three consecutive errors:
{code:java}
ERROR [MemtablePostFlush:1] 2026-01-29 11:38:04,159
JVMStabilityInspector.java:70 - Exception in thread
Thread[MemtablePostFlush:1,5,MemtablePostFlush]
java.lang.IllegalStateException: null
at
com.google.common.base.Preconditions.checkState(Preconditions.java:496)
at
org.apache.cassandra.index.sai.disk.v1.vector.VectorPostings.computeRowIds(VectorPostings.java:76)
at
org.apache.cassandra.index.sai.disk.v1.vector.OnHeapGraph.writeData(OnHeapGraph.java:315)
at
org.apache.cassandra.index.sai.memory.VectorMemoryIndex.writeDirect(VectorMemoryIndex.java:272)
at
org.apache.cassandra.index.sai.memory.MemtableIndex.writeDirect(MemtableIndex.java:113)
at
org.apache.cassandra.index.sai.disk.v1.MemtableIndexWriter.flushVectorIndex(MemtableIndexWriter.java:212)
at
org.apache.cassandra.index.sai.disk.v1.MemtableIndexWriter.complete(MemtableIndexWriter.java:143)
at
org.apache.cassandra.index.sai.disk.StorageAttachedIndexWriter.complete(StorageAttachedIndexWriter.java:212)
at java.base/java.util.ArrayList.forEach(ArrayList.java:1511)
at
java.base/java.util.Collections$UnmodifiableCollection.forEach(Collections.java:1092)
at
org.apache.cassandra.io.sstable.format.SSTableWriter.commit(SSTableWriter.java:295)
at
org.apache.cassandra.db.compaction.unified.ShardedMultiWriter.commit(ShardedMultiWriter.java:219)
at
org.apache.cassandra.db.ColumnFamilyStore$Flush.flushMemtable(ColumnFamilyStore.java:1354)
at
org.apache.cassandra.db.ColumnFamilyStore$Flush.run(ColumnFamilyStore.java:1253)
at
org.apache.cassandra.concurrent.ExecutionFailure$1.run(ExecutionFailure.java:133)
at
java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
at
java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
at
io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30)
at java.base/java.lang.Thread.run(Thread.java:833)
Suppressed: java.lang.IllegalStateException: null
... 19 common frames omitted
Suppressed: java.lang.IllegalStateException: null
... 19 common frames omitted
{code}
then
{code:java}
ERROR [MemtablePostFlush:1] 2026-01-29 11:38:04,159
JVMStabilityInspector.java:70 - Exception in thread
Thread[MemtablePostFlush:1,5,MemtablePostFlush]
java.lang.NullPointerException: Cannot invoke
"java.lang.Boolean.booleanValue()" because "res" is null
at
org.apache.cassandra.utils.memory.MemtableCleanerThread$Clean.apply(MemtableCleanerThread.java:97)
at
org.apache.cassandra.utils.concurrent.ListenerList$CallbackBiConsumerListener.run(ListenerList.java:244)
at
org.apache.cassandra.concurrent.ImmediateExecutor.execute(ImmediateExecutor.java:140)
at
org.apache.cassandra.utils.concurrent.ListenerList.safeExecute(ListenerList.java:166)
at
org.apache.cassandra.utils.concurrent.ListenerList.notifyListener(ListenerList.java:157)
at
org.apache.cassandra.utils.concurrent.ListenerList$CallbackBiConsumerListener.notifySelf(ListenerList.java:250)
at
org.apache.cassandra.utils.concurrent.ListenerList.lambda$notifyExclusive$0(ListenerList.java:124)
at
org.apache.cassandra.utils.concurrent.IntrusiveStack.forEach(IntrusiveStack.java:195)
at
org.apache.cassandra.utils.concurrent.ListenerList.notifyExclusive(ListenerList.java:124)
at
org.apache.cassandra.utils.concurrent.ListenerList.notify(ListenerList.java:96)
at
org.apache.cassandra.utils.concurrent.AsyncFuture.trySet(AsyncFuture.java:104)
at
org.apache.cassandra.utils.concurrent.AbstractFuture.tryFailure(AbstractFuture.java:148)
at
org.apache.cassandra.utils.concurrent.AsyncPromise.tryFailure(AsyncPromise.java:139)
at
org.apache.cassandra.db.memtable.AbstractAllocatorMemtable.lambda$flushLargestMemtable$0(AbstractAllocatorMemtable.java:306)
at
org.apache.cassandra.concurrent.ImmediateExecutor.execute(ImmediateExecutor.java:140)
at
org.apache.cassandra.utils.concurrent.ListenerList.safeExecute(ListenerList.java:166)
at
org.apache.cassandra.utils.concurrent.ListenerList.notifyListener(ListenerList.java:157)
at
org.apache.cassandra.utils.concurrent.ListenerList$RunnableWithExecutor.notifySelf(ListenerList.java:345)
at
org.apache.cassandra.utils.concurrent.ListenerList.lambda$notifyExclusive$0(ListenerList.java:124)
at
org.apache.cassandra.utils.concurrent.IntrusiveStack.forEach(IntrusiveStack.java:195)
at
org.apache.cassandra.utils.concurrent.ListenerList.notifyExclusive(ListenerList.java:124)
at
org.apache.cassandra.utils.concurrent.ListenerList.notify(ListenerList.java:96)
at
org.apache.cassandra.utils.concurrent.AsyncFuture.trySet(AsyncFuture.java:104)
at
org.apache.cassandra.utils.concurrent.AbstractFuture.tryFailure(AbstractFuture.java:148)
at
org.apache.cassandra.concurrent.FutureTask.tryFailure(FutureTask.java:87)
at org.apache.cassandra.concurrent.FutureTask.run(FutureTask.java:75)
at
java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
at
java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
at
io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30)
at java.base/java.lang.Thread.run(Thread.java:833)
{code}
and finally there are also messages about a resource leak:
{code:java}
ERROR [Reference-Reaper] 2026-01-29 11:41:57,667 Ref.java:243 - LEAK DETECTED:
a reference (class
org.apache.cassandra.utils.concurrent.WrappedSharedCloseable$Tidy@344029653:[[OffHeapBitSet]])
to class
org.apache.cassandra.utils.concurrent.WrappedSharedCloseable$Tidy@344029653:[[OffHeapBitSet]]
was not released before the reference was garbage collected
ERROR [Reference-Reaper] 2026-01-29 11:41:57,669 Ref.java:243 - LEAK DETECTED:
a reference (class
org.apache.cassandra.io.util.MmappedRegions$Tidier@434613886:/app/cassandra/data/vector_bench/vectors_cbp-3c1d51e0fbcc11f08c7baf9115bb8d33/oa-3gxh_0w67_0rv342lhihvcwtdh2y-big-Index.db)
to class
org.apache.cassandra.io.util.MmappedRegions$Tidier@434613886:/app/cassandra/data/vector_bench/vectors_cbp-3c1d51e0fbcc11f08c7baf9115bb8d33/oa-3gxh_0w67_0rv342lhihvcwtdh2y-big-Index.db
was not released before the reference was garbage collected
{code}
Do you observe the pending-mutations growth issue only in this combination, or
are there other patterns too?
Also, a thread dump would help here to see what the mutation threads are
waiting for.
> Cannot restart Cassandra 5 after creating a vector table and index
> ------------------------------------------------------------------
>
> Key: CASSANDRA-19661
> URL: https://issues.apache.org/jira/browse/CASSANDRA-19661
> Project: Apache Cassandra
> Issue Type: Bug
> Components: Feature/SAI, Feature/Vector Search, Local/Startup and
> Shutdown
> Reporter: Sergio Rua
> Priority: Normal
> Fix For: 5.0.x, 6.x
>
> Attachments: 5.0.2_fail_memtableflush_vector_full.txt, logs.tar.gz,
> upload_content.py
>
>
> I'm using llama-index and llama3 to train a model. I'm using a very simple
> code that reads some *.txt files from local and uploads them to Cassandra and
> then creates the index:
>
> {code:java}
> # Create the index from documents
> index = VectorStoreIndex.from_documents(
> documents,
> service_context=vector_store.service_context,
> storage_context=storage_context,
> show_progress=True,
> ) {code}
> This works well and I'm able to use a Chat app to get responses from the
> Cassandra data. However, right after, I cannot restart Cassandra. It'll break
> with the following error:
>
> {code:java}
> INFO [PerDiskMemtableFlushWriter_0:7] 2024-05-23 08:23:20,102
> Flushing.java:179 - Completed flushing
> /data/cassandra/data/gpt/docs_20240523-10c8eaa018d811ef8dadf75182f3e2b4/da-6-bti-Data.db
> (124.236MiB) for commitlog position
> CommitLogPosition(segmentId=1716452305636, position=15336)
> [...]
> WARN [MemtableFlushWriter:1] 2024-05-23 08:28:29,575
> MemtableIndexWriter.java:92 - [gpt.docs.idx_vector_docs] Aborting index
> memtable flush for
> /data/cassandra/data/gpt/docs-aea77a80184b11ef8dadf75182f3e2b4/da-3-bti...{code}
> {code:java}
> java.lang.IllegalStateException: null
> at
> com.google.common.base.Preconditions.checkState(Preconditions.java:496)
> at
> org.apache.cassandra.index.sai.disk.v1.vector.VectorPostings.computeRowIds(VectorPostings.java:76)
> at
> org.apache.cassandra.index.sai.disk.v1.vector.OnHeapGraph.writeData(OnHeapGraph.java:313)
> at
> org.apache.cassandra.index.sai.memory.VectorMemoryIndex.writeDirect(VectorMemoryIndex.java:272)
> at
> org.apache.cassandra.index.sai.memory.MemtableIndex.writeDirect(MemtableIndex.java:110)
> at
> org.apache.cassandra.index.sai.disk.v1.MemtableIndexWriter.flushVectorIndex(MemtableIndexWriter.java:192)
> at
> org.apache.cassandra.index.sai.disk.v1.MemtableIndexWriter.complete(MemtableIndexWriter.java:117)
> at
> org.apache.cassandra.index.sai.disk.StorageAttachedIndexWriter.complete(StorageAttachedIndexWriter.java:185)
> at java.base/java.util.ArrayList.forEach(ArrayList.java:1541)
> at
> java.base/java.util.Collections$UnmodifiableCollection.forEach(Collections.java:1085)
> at
> org.apache.cassandra.io.sstable.format.SSTableWriter.commit(SSTableWriter.java:289)
> at
> org.apache.cassandra.db.compaction.unified.ShardedMultiWriter.commit(ShardedMultiWriter.java:219)
> at
> org.apache.cassandra.db.ColumnFamilyStore$Flush.flushMemtable(ColumnFamilyStore.java:1323)
> at
> org.apache.cassandra.db.ColumnFamilyStore$Flush.run(ColumnFamilyStore.java:1222)
> at
> org.apache.cassandra.concurrent.ExecutionFailure$1.run(ExecutionFailure.java:133)
> at
> java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
> at
> java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
> at
> io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30)
> at java.base/java.lang.Thread.run(Thread.java:829) {code}
> The table created by the script is as follows:
>
> {noformat}
> CREATE TABLE gpt.docs (
> partition_id text,
> row_id text,
> attributes_blob text,
> body_blob text,
> vector vector<float, 1024>,
> metadata_s map<text, text>,
> PRIMARY KEY (partition_id, row_id)
> ) WITH CLUSTERING ORDER BY (row_id ASC)
> AND additional_write_policy = '99p'
> AND allow_auto_snapshot = true
> AND bloom_filter_fp_chance = 0.01
> AND caching = {'keys': 'ALL', 'rows_per_partition': 'NONE'}
> AND cdc = false
> AND comment = ''
> AND compaction = {'class':
> 'org.apache.cassandra.db.compaction.UnifiedCompactionStrategy',
> 'scaling_parameters': 'T4', 'target_sstable_size': '1GiB'}
> AND compression = {'chunk_length_in_kb': '16', 'class':
> 'org.apache.cassandra.io.compress.LZ4Compressor'}
> AND memtable = 'default'
> AND crc_check_chance = 1.0
> AND default_time_to_live = 0
> AND extensions = {}
> AND gc_grace_seconds = 864000
> AND incremental_backups = true
> AND max_index_interval = 2048
> AND memtable_flush_period_in_ms = 0
> AND min_index_interval = 128
> AND read_repair = 'BLOCKING'
> AND speculative_retry = '99p';
> CREATE CUSTOM INDEX eidx_metadata_s_docs ON gpt.docs (entries(metadata_s))
> USING 'org.apache.cassandra.index.sai.StorageAttachedIndex';
> CREATE CUSTOM INDEX idx_vector_docs ON gpt.docs (vector) USING
> 'org.apache.cassandra.index.sai.StorageAttachedIndex';{noformat}
> Thank you
>
--
This message was sent by Atlassian Jira
(v8.20.10#820010)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]