[
https://issues.apache.org/jira/browse/CASSANDRA-8696?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14295881#comment-14295881
]
Jeff Liu edited comment on CASSANDRA-8696 at 1/29/15 6:19 PM:
--------------------------------------------------------------
The replica nodes are not busy at all. Actually I have stopped all client
connection while trying to run a nodetool repair.
nodetool tpstats also shows that there isn't much activities going on.
{noformat}
Pool Name Active Pending Completed Blocked All
time blocked
CounterMutationStage 0 0 0 0
0
ReadStage 0 0 35441 0
0
RequestResponseStage 0 0 26940702 0
0
MutationStage 1 0 42298652 0
0
ReadRepairStage 0 0 101 0
0
GossipStage 0 0 77379 0
0
CacheCleanupExecutor 0 0 0 0
0
AntiEntropyStage 0 0 333 0
0
MigrationStage 0 0 1664 0
0
ValidationExecutor 0 0 51 0
0
CommitLogArchiver 0 0 0 0
0
MiscStage 0 0 0 0
0
MemtableFlushWriter 1 1 853 0
0
MemtableReclaimMemory 0 0 853 0
0
PendingRangeCalculator 0 0 12 0
0
MemtablePostFlush 0 0 1365 0
0
CompactionExecutor 2 9 244 0
0
InternalResponseStage 0 0 1679 0
0
HintedHandoff 0 0 13 0
0
Message type Dropped
RANGE_SLICE 0
READ_REPAIR 0
PAGED_RANGE 0
BINARY 0
READ 6067
MUTATION 8
_TRACE 0
REQUEST_RESPONSE 0
COUNTER_MUTATION 0
{noformat}
tail -30 /var/log/cassandra/system.log where you can see the log suddenly
stopped.
{noformat}
at
org.apache.cassandra.repair.RepairSession.failedSnapshot(RepairSession.java:344)
~[apache-cassandra-2.1.2.jar:2.1.2]
at
org.apache.cassandra.repair.RepairJob$2.onFailure(RepairJob.java:128)
~[apache-cassandra-2.1.2.jar:2.1.2]
at com.google.common.util.concurrent.Futures$4.run(Futures.java:1172)
~[guava-16.0.jar:na]
at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
[na:1.7.0_45]
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
[na:1.7.0_45]
at java.lang.Thread.run(Thread.java:744) [na:1.7.0_45]
ERROR [AntiEntropySessions:38] 2015-01-28 19:47:39,746 CassandraDaemon.java:153
- Exception in thread Thread[AntiEntropySessions:38,5,RMI Runtime]
java.lang.RuntimeException: java.io.IOException: Failed during snapshot
creation.
at com.google.common.base.Throwables.propagate(Throwables.java:160)
~[guava-16.0.jar:na]
at
org.apache.cassandra.utils.WrappedRunnable.run(WrappedRunnable.java:32)
~[apache-cassandra-2.1.2.jar:2.1.2]
at
java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:471)
~[na:1.7.0_45]
at java.util.concurrent.FutureTask.run(FutureTask.java:262)
~[na:1.7.0_45]
at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
~[na:1.7.0_45]
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
[na:1.7.0_45]
at java.lang.Thread.run(Thread.java:744) [na:1.7.0_45]
Caused by: java.io.IOException: Failed during snapshot creation.
at
org.apache.cassandra.repair.RepairSession.failedSnapshot(RepairSession.java:344)
~[apache-cassandra-2.1.2.jar:2.1.2]
at
org.apache.cassandra.repair.RepairJob$2.onFailure(RepairJob.java:128)
~[apache-cassandra-2.1.2.jar:2.1.2]
at com.google.common.util.concurrent.Futures$4.run(Futures.java:1172)
~[guava-16.0.jar:na]
... 3 common frames omitted
WARN [HintedHandoff:2] 2015-01-28 19:47:39,749 SliceQueryFilter.java:236 -
Read 0 live and 4525 tombstoned cells in system.hints (see
tombstone_warn_threshold). 128 columns was requested, slices=[-],
delInfo={deletedAt=-9223372036854775808, localDeletion=2147483647}
INFO [HintedHandoff:2] 2015-01-28 19:47:39,753 HintedHandOffManager.java:389 -
Finished hinted handoff of 0 rows to endpoint /10.98.194.68
INFO [main] 2015-01-29 18:01:48,559 CassandraDaemon.java:89 - Hostname:
cass-chisel05.tgr01.iad02.testd.nestlabs.com
INFO [main] 2015-01-29 18:01:48,666 YamlConfigurationLoader.java:92 - Loading
settings from file:/etc/cassandra/cassandra.yaml
INFO [main] 2015-01-29 18:01:48,870 YamlConfigurationLoader.java:135 - Node
configuration:[authenticator=AllowAllAuthenticator;
authorizer=AllowAllAuthorizer; auto_bootstrap=false; auto_snapshot=true;
broadcast_rpc_address=10.97.9.110; cas_contention_timeout_in_ms=1000;
client_encryption_options=<REDACTED>; cluster_name=chisel_cassandra;
column_index_size_in_kb=64; commitlog_directory=//raid0/cassandra/commitlog/;
commitlog_segment_size_in_mb=32; commitlog_sync=periodic;
commitlog_sync_period_in_ms=10000; compaction_throughput_mb_per_sec=16;
concurrent_reads=32; concurrent_writes=32; cross_node_timeout=false;
data_file_directories=[//raid0/cassandra/data/]; disk_failure_policy=stop;
dynamic_snitch_badness_threshold=0.1;
dynamic_snitch_reset_interval_in_ms=600000;
dynamic_snitch_update_interval_in_ms=100;
endpoint_snitch=GossipingPropertyFileSnitch; hinted_handoff_enabled=true;
hinted_handoff_throttle_in_kb=1024; incremental_backups=false;
initial_token=null; key_cache_save_period=14400; key_cache_size_in_mb=null;
listen_address=10.97.9.110; max_hint_window_in_ms=10800000;
max_hints_delivery_threads=2; native_transport_port=9042; num_tokens=256;
partitioner=org.apache.cassandra.dht.RandomPartitioner;
permissions_validity_in_ms=2000; range_request_timeout_in_ms=10000;
read_request_timeout_in_ms=5000;
request_scheduler=org.apache.cassandra.scheduler.NoScheduler;
request_timeout_in_ms=10000; row_cache_save_period=0; row_cache_size_in_mb=0;
rpc_address=0.0.0.0; rpc_keepalive=true; rpc_port=9160; rpc_server_type=sync;
saved_caches_directory=//raid0/cassandra/saved_caches/;
seed_provider=[{class_name=org.apache.cassandra.locator.SimpleSeedProvider,
parameters=[{seeds=cass-chisel01.tgr01.iad02.testd.nestlabs.com,cass-chisel02.tgr01.iad02.testd.nestlabs.com}]}];
server_encryption_options=<REDACTED>; snapshot_before_compaction=false;
ssl_storage_port=7001; start_native_transport=true; start_rpc=true;
storage_port=7000; thrift_framed_transport_size_in_mb=15;
tombstone_failure_threshold=100000; tombstone_warn_threshold=1000;
trickle_fsync=false; trickle_fsync_interval_in_kb=10240;
truncate_request_timeout_in_ms=60000; write_request_timeout_in_ms=2000]
INFO [main] 2015-01-29 18:01:49,314 DatabaseDescriptor.java:203 -
DiskAccessMode 'auto' determined to be mmap, indexAccessMode is mmap
INFO [main] 2015-01-29 18:01:49,396 DatabaseDescriptor.java:291 - Global
memtable on-heap threshold is enabled at 438MB
INFO [main] 2015-01-29 18:01:49,397 DatabaseDescriptor.java:295 - Global
memtable off-heap threshold is enabled at 438MB
INFO [main] 2015-01-29 18:01:49,831 GossipingPropertyFileSnitch.java:71 -
Loaded cassandra-topology.properties for compatibility
INFO [main] 2015-0
{noformat}
was (Author: jeffl):
The replica nodes are not busy at all. Actually I have stopped all client
connection while trying to run a nodetool repair.
nodetool tpstats also shows that there isn't much activities going on.
{noformat}
Pool Name Active Pending Completed Blocked All
time blocked
CounterMutationStage 0 0 0 0
0
ReadStage 0 0 35441 0
0
RequestResponseStage 0 0 26940702 0
0
MutationStage 1 0 42298652 0
0
ReadRepairStage 0 0 101 0
0
GossipStage 0 0 77379 0
0
CacheCleanupExecutor 0 0 0 0
0
AntiEntropyStage 0 0 333 0
0
MigrationStage 0 0 1664 0
0
ValidationExecutor 0 0 51 0
0
CommitLogArchiver 0 0 0 0
0
MiscStage 0 0 0 0
0
MemtableFlushWriter 1 1 853 0
0
MemtableReclaimMemory 0 0 853 0
0
PendingRangeCalculator 0 0 12 0
0
MemtablePostFlush 0 0 1365 0
0
CompactionExecutor 2 9 244 0
0
InternalResponseStage 0 0 1679 0
0
HintedHandoff 0 0 13 0
0
Message type Dropped
RANGE_SLICE 0
READ_REPAIR 0
PAGED_RANGE 0
BINARY 0
READ 6067
MUTATION 8
_TRACE 0
REQUEST_RESPONSE 0
COUNTER_MUTATION 0
{noformat}
> nodetool repair on cassandra 2.1.2 keyspaces return
> java.lang.RuntimeException: Could not create snapshot
> ---------------------------------------------------------------------------------------------------------
>
> Key: CASSANDRA-8696
> URL: https://issues.apache.org/jira/browse/CASSANDRA-8696
> Project: Cassandra
> Issue Type: Bug
> Reporter: Jeff Liu
>
> When trying to run nodetool repair -pr on cassandra node ( 2.1.2), cassandra
> throw java exceptions: cannot create snapshot.
> the error log from system.log:
> {noformat}
> INFO [STREAM-IN-/10.97.9.110] 2015-01-28 02:07:28,815
> StreamResultFuture.java:166 - [Stream #692c1450-a692-11e4-9973-070e938df227
> ID#0] Prepare completed. Receiving 2 files(221187 bytes), sending 5
> files(632105 bytes)
> INFO [STREAM-IN-/10.97.9.110] 2015-01-28 02:07:29,046
> StreamResultFuture.java:180 - [Stream #692c1450-a692-11e4-9973-070e938df227]
> Session with /10.97.9.110 is complete
> INFO [STREAM-IN-/10.97.9.110] 2015-01-28 02:07:29,046
> StreamResultFuture.java:212 - [Stream #692c1450-a692-11e4-9973-070e938df227]
> All sessions completed
> INFO [STREAM-IN-/10.97.9.110] 2015-01-28 02:07:29,047
> StreamingRepairTask.java:96 - [repair #685e3d00-a692-11e4-9973-070e938df227]
> streaming task succeed, returning response to /10.98.194.68
> INFO [RepairJobTask:1] 2015-01-28 02:07:29,065 StreamResultFuture.java:86 -
> [Stream #692c6270-a692-11e4-9973-070e938df227] Executing streaming plan for
> Repair
> INFO [StreamConnectionEstablisher:4] 2015-01-28 02:07:29,065
> StreamSession.java:213 - [Stream #692c6270-a692-11e4-9973-070e938df227]
> Starting streaming to /10.66.187.201
> INFO [StreamConnectionEstablisher:4] 2015-01-28 02:07:29,070
> StreamCoordinator.java:209 - [Stream #692c6270-a692-11e4-9973-070e938df227,
> ID#0] Beginning stream session with /10.66.187.201
> INFO [STREAM-IN-/10.66.187.201] 2015-01-28 02:07:29,465
> StreamResultFuture.java:166 - [Stream #692c6270-a692-11e4-9973-070e938df227
> ID#0] Prepare completed. Receiving 5 files(627994 bytes), sending 5
> files(632105 bytes)
> INFO [StreamReceiveTask:22] 2015-01-28 02:07:31,971
> StreamResultFuture.java:180 - [Stream #692c6270-a692-11e4-9973-070e938df227]
> Session with /10.66.187.201 is complete
> INFO [StreamReceiveTask:22] 2015-01-28 02:07:31,972
> StreamResultFuture.java:212 - [Stream #692c6270-a692-11e4-9973-070e938df227]
> All sessions completed
> INFO [StreamReceiveTask:22] 2015-01-28 02:07:31,972
> StreamingRepairTask.java:96 - [repair #685e3d00-a692-11e4-9973-070e938df227]
> streaming task succeed, returning response to /10.98.194.68
> ERROR [RepairJobTask:1] 2015-01-28 02:07:39,444 RepairJob.java:127 - Error
> occurred during snapshot phase
> java.lang.RuntimeException: Could not create snapshot at /10.97.9.110
> at
> org.apache.cassandra.repair.SnapshotTask$SnapshotCallback.onFailure(SnapshotTask.java:77)
> ~[apache-cassandra-2.1.2.jar:2.1.2]
> at
> org.apache.cassandra.net.MessagingService$5$1.run(MessagingService.java:347)
> ~[apache-cassandra-2.1.2.jar:2.1.2]
> at
> java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:471)
> ~[na:1.7.0_45]
> at java.util.concurrent.FutureTask.run(FutureTask.java:262)
> ~[na:1.7.0_45]
> at
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
> [na:1.7.0_45]
> at
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
> [na:1.7.0_45]
> at java.lang.Thread.run(Thread.java:744) [na:1.7.0_45]
> INFO [AntiEntropySessions:6] 2015-01-28 02:07:39,445 RepairSession.java:260
> - [repair #6f85e740-a692-11e4-9973-070e938df227] new session: will sync
> /10.98.194.68, /10.66.187.201, /10.226.218.135 on range
> (12817179804668051873746972069086
> 2638799,128635403083592540777731520865977436165] for events.[bigint0text,
> bigint0boolean, bigint0int, dataset_catalog, column_categories,
> bigint0double, bigint0bigint]
> ERROR [AntiEntropySessions:5] 2015-01-28 02:07:39,445 RepairSession.java:303
> - [repair #685e3d00-a692-11e4-9973-070e938df227] session completed with the
> following error
> java.io.IOException: Failed during snapshot creation.
> at
> org.apache.cassandra.repair.RepairSession.failedSnapshot(RepairSession.java:344)
> ~[apache-cassandra-2.1.2.jar:2.1.2]
> at
> org.apache.cassandra.repair.RepairJob$2.onFailure(RepairJob.java:128)
> ~[apache-cassandra-2.1.2.jar:2.1.2]
> at com.google.common.util.concurrent.Futures$4.run(Futures.java:1172)
> ~[guava-16.0.jar:na]
> at
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
> [na:1.7.0_45]
> at
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
> [na:1.7.0_45]
> at java.lang.Thread.run(Thread.java:744) [na:1.7.0_45]
> ERROR [AntiEntropySessions:5] 2015-01-28 02:07:39,446
> CassandraDaemon.java:153 - Exception in thread
> Thread[AntiEntropySessions:5,5,RMI Runtime]
> java.lang.RuntimeException: java.io.IOException: Failed during snapshot
> creation.
> at com.google.common.base.Throwables.propagate(Throwables.java:160)
> ~[guava-16.0.jar:na]
> at
> org.apache.cassandra.utils.WrappedRunnable.run(WrappedRunnable.java:32)
> ~[apache-cassandra-2.1.2.jar:2.1.2]
> at
> java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:471)
> ~[na:1.7.0_45]
> at java.util.concurrent.FutureTask.run(FutureTask.java:262)
> ~[na:1.7.0_45]
> at
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
> ~[na:1.7.0_45]
> at
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
> [na:1.7.0_45]
> at java.lang.Thread.run(Thread.java:744) [na:1.7.0_45]
> Caused by: java.io.IOException: Failed during snapshot creation.
> at
> org.apache.cassandra.repair.RepairSession.failedSnapshot(RepairSession.java:344)
> ~[apache-cassandra-2.1.2.jar:2.1.2]
> at
> org.apache.cassandra.repair.RepairJob$2.onFailure(RepairJob.java:128)
> ~[apache-cassandra-2.1.2.jar:2.1.2]
> at com.google.common.util.concurrent.Futures$4.run(Futures.java:1172)
> ~[guava-16.0.jar:na]
> ... 3 common frames omitted
> {noformat}
> The only change we did recently was to change tablespace replication factor
> from 2 to 3 before seeing those errors. Also same time we start seeing
> timeout errors from application.
> the timeout error is something like:
> {noformat}
> core.exceptions.ReadTimeoutException: Cassandra timeout during read query at
> consistency ONE (1 responses were required but only 0 replica responded)
> at
> com.datastax.driver.core.exceptions.ReadTimeoutException.copy(ReadTimeoutException.java:69)
> ~[com.datastax.cassandra.cassandra-driver-core-2.1.3.jar:na]
> at
> com.datastax.driver.core.Responses$Error.asException(Responses.java:100)
> ~[com.datastax.cassandra.cassandra-driver-core-2.1.3.jar:na]
> at
> com.datastax.driver.core.DefaultResultSetFuture.onSet(DefaultResultSetFuture.java:110)
> ~[com.datastax.cassandra.cassandra-driver-core-2.1.3.jar:na]
> at
> com.datastax.driver.core.RequestHandler.setFinalResult(RequestHandler.java:249)
> ~[com.datastax.cassandra.cassandra-driver-core-2.1.3.jar:na]
> at com.datastax.driver.core.RequestHandler.onSet(RequestHandler.java:433)
> ~[com.datastax.cassandra.cassandra-driver-core-2.1.3.jar:na]
> at
> com.datastax.driver.core.Connection$Dispatcher.messageReceived(Connection.java:668)
> ~[com.datastax.cassandra.cassandra-driver-core-2.1.3.jar:na]
> at
> org.jboss.netty.channel.SimpleChannelUpstreamHandler.handleUpstream(SimpleChannelUpstreamHandler.java:70)
> ~[io.netty.netty-3.9.0.Final.jar:na]
> at
> org.jboss.netty.channel.DefaultChannelPipeline.sendUpstream(DefaultChannelPipeline.java:564)
> ~[io.netty.netty-3.9.0.Final.jar:na]
> at
> org.jboss.netty.channel.DefaultChannelPipeline$DefaultChannelHandlerContext.sendUpstream(DefaultChannelPipeline.java:791)
> ~[io.netty.netty-3.9.0.Final.jar:na]
> at
> org.jboss.netty.channel.Channels.fireMessageReceived(Channels.java:296)
> ~[io.netty.netty-3.9.0.Final.jar:na]
> at
> org.jboss.netty.handler.codec.oneone.OneToOneDecoder.handleUpstream(OneToOneDecoder.java:70)
> ~[io.netty.netty-3.9.0.Final.jar:na]
> at
> org.jboss.netty.channel.DefaultChannelPipeline.sendUpstream(DefaultChannelPipeline.java:564)
> ~[io.netty.netty-3.9.0.Final.jar:na]
> at
> org.jboss.netty.channel.DefaultChannelPipeline$DefaultChannelHandlerContext.sendUpstream(DefaultChannelPipeline.java:791)
> ~[io.netty.netty-3.9.0.Final.jar:na]
> at
> org.jboss.netty.channel.Channels.fireMessageReceived(Channels.java:296)
> ~[io.netty.netty-3.9.0.Final.jar:na]
> at
> org.jboss.netty.handler.codec.frame.FrameDecoder.unfoldAndFireMessageReceived(FrameDecoder.java:462)
> ~[io.netty.netty-3.9.0.Final.jar:na]
> at
> org.jboss.netty.handler.codec.frame.FrameDecoder.callDecode(FrameDecoder.java:443)
> ~[io.netty.netty-3.9.0.Final.jar:na]
> at
> org.jboss.netty.handler.codec.frame.FrameDecoder.messageReceived(FrameDecoder.java:303)
> ~[io.netty.netty-3.9.0.Final.jar:na]
> at
> org.jboss.netty.channel.SimpleChannelUpstreamHandler.handleUpstream(SimpleChannelUpstreamHandler.java:70)
> ~[io.netty.netty-3.9.0.Final.jar:na]
> at
> org.jboss.netty.channel.DefaultChannelPipeline.sendUpstream(DefaultChannelPipeline.java:564)
> ~[io.netty.netty-3.9.0.Final.jar:na]
> at
> org.jboss.netty.channel.DefaultChannelPipeline.sendUpstream(DefaultChannelPipeline.java:559)
> ~[io.netty.netty-3.9.0.Final.jar:na]
> at
> org.jboss.netty.channel.Channels.fireMessageReceived(Channels.java:268)
> ~[io.netty.netty-3.9.0.Final.jar:na]
> at
> org.jboss.netty.channel.Channels.fireMessageReceived(Channels.java:255)
> ~[io.netty.netty-3.9.0.Final.jar:na]
> at org.jboss.netty.channel.socket.nio.NioWorker.read(NioWorker.java:88)
> ~[io.netty.netty-3.9.0.Final.jar:na]
> at
> org.jboss.netty.channel.socket.nio.AbstractNioWorker.process(AbstractNioWorker.java:108)
> ~[io.netty.netty-3.9.0.Final.jar:na]
> at
> org.jboss.netty.channel.socket.nio.AbstractNioSelector.run(AbstractNioSelector.java:318)
> ~[io.netty.netty-3.9.0.Final.jar:na]
> at
> org.jboss.netty.channel.socket.nio.AbstractNioWorker.run(AbstractNioWorker.java:89)
> ~[io.netty.netty-3.9.0.Final.jar:na]
> at org.jboss.netty.channel.socket.nio.NioWorker.run(NioWorker.java:178)
> ~[io.netty.netty-3.9.0.Final.jar:na]
> at
> org.jboss.netty.util.ThreadRenamingRunnable.run(ThreadRenamingRunnable.java:108)
> ~[io.netty.netty-3.9.0.Final.jar:na]
> at
> org.jboss.netty.util.internal.DeadLockProofWorker$1.run(DeadLockProofWorker.java:42)
> ~[io.netty.netty-3.9.0.Final.jar:na]
> at
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
> ~[na:1.7.0_55]
> at
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
> ~[na:1.7.0_55]
> at java.lang.Thread.run(Thread.java:745) ~[na:1.7.0_55]
> Caused by: com.datastax.driver.core.exceptions.ReadTimeoutException:
> Cassandra timeout during read query at consistency ONE (1 responses were
> required but only 0 replica responded)
> at com.datastax.driver.core.Responses$Error$1.decode(Responses.java:61)
> ~[com.datastax.cassandra.cassandra-driver-core-2.1.3.jar:na]
> at com.datastax.driver.core.Responses$Error$1.decode(Responses.java:38)
> ~[com.datastax.cassandra.cassandra-driver-core-2.1.3.jar:na]
> at
> com.datastax.driver.core.Message$ProtocolDecoder.decode(Message.java:168)
> ~[com.datastax.cassandra.cassandra-driver-core-2.1.3.jar:na]
> at
> org.jboss.netty.handler.codec.oneone.OneToOneDecoder.handleUpstream(OneToOneDecoder.java:66)
> ~[io.netty.netty-3.9.0.Final.jar:na]
> ... 21 common frames omitted
> {noformat}
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)