GWicke has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/189444

Change subject: Update for Cassandra 2.1.2
......................................................................

Update for Cassandra 2.1.2

Updated based on the new Debian default configs. This puppet patch is not
tested yet, but the corresponding configs in production are.

Bug: T88956
Change-Id: If85c01c1d897ce9158dbbe0fb1c1dad7d4282bb6
---
M manifests/defaults.pp
M manifests/init.pp
M templates/cassandra-env.sh.erb
M templates/cassandra.yaml.erb
4 files changed, 156 insertions(+), 57 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/puppet/cassandra 
refs/changes/44/189444/1

diff --git a/manifests/defaults.pp b/manifests/defaults.pp
index a707ce1..56a0b40 100644
--- a/manifests/defaults.pp
+++ b/manifests/defaults.pp
@@ -7,10 +7,12 @@
     $data_file_directories            = 
pick($::cassandra_data_file_directories,            ['/var/lib/cassandra/data'])
     $commitlog_directory              = pick($::cassandra_commitlog_directory, 
             '/var/lib/cassandra/commitlog')
     $disk_failure_policy              = pick($::cassandra_disk_failure_policy, 
             'stop')
+    $row_cache_size_in_mb             = 
pick($::cassandra_row_cache_size_in_mb,             200)
     $memory_allocator                 = pick($::cassandra_memory_allocator,    
             'JEMallocAllocator')
     $saved_caches_directory           = 
pick($::cassandra_saved_caches_directory,           
'/var/lib/cassandra/saved_caches')
     $concurrent_reads                 = pick($::cassandra_concurrent_reads,    
             32)
-    $concurrent_writes                = pick($::cassandra_concurrent_writes,   
             $::processorcount * 8)
+    $concurrent_writes                = pick($::cassandra_concurrent_writes,   
             32)
+    $concurrent_counter_writes        = 
pick($::cassandra_concurrent_counter_writes,        32)
     $storage_port                     = pick($::cassandra_storage_port,        
             7000)
     $listen_address                   = pick($::cassandra_listen_address,      
             $::ipaddress)
 
diff --git a/manifests/init.pp b/manifests/init.pp
index 79c2c0b..6e4d327 100644
--- a/manifests/init.pp
+++ b/manifests/init.pp
@@ -56,8 +56,12 @@
 #
 # [*disk_failure_policy*]
 #   Policy for data disk failure.  Should be one of:
-#   stop_paranoid, stop, best_effort, or ignore.
+#   stop_paranoid, die, stop, best_effort, or ignore.
 #   Default: stop
+#
+# [*row_cache_size_in_mb*]
+#   Enable a small cache by default.
+#   Default: 200
 #
 # [*memory_allocator*]
 #   The off-heap memory allocator.
@@ -68,14 +72,16 @@
 #   Default: /var/lib/cassandra/saved_caches
 #
 # [*concurrent_reads*]
-#   Number of allowed concurrent reads.  Should be set to
-#   (16 * number_of_drives) in your data_file_directories.
+#   Number of allowed concurrent reads.
 #   Default: 32
 #
 # [*concurrent_writes*]
-#   Number of allowed concurrent writes.  Should be set to
-#   about (8 * number_of_cores).  This is also the default.
-#   Default: $::processorcount * 8
+#   Number of allowed concurrent writes. Impacts peak memory usage.
+#   Default: 32
+#
+# [*concurrent_counter_writes*]
+#   Number of allowed concurrent counter writes.
+#   Default: 32
 #
 # [*storage_port*]
 #   TCP port, for commands and data.
@@ -133,11 +139,11 @@
 #   Default: all
 #
 # [*max_heap_size*]
-#   Value for -Xms and -Xmx to pass to the JVM.
+#   Value for -Xms and -Xmx to pass to the JVM. Example: '8g'
 #   Default: undef
 #
 # [*heap_newsize*]
-#   Value for -Xmn to pass to the JVM.
+#   Value for -Xmn to pass to the JVM. Example: '1200m'
 #   Default: undef
 #
 # [*additional_jvm_opts*]
diff --git a/templates/cassandra-env.sh.erb b/templates/cassandra-env.sh.erb
index 5fe213a..4582546 100644
--- a/templates/cassandra-env.sh.erb
+++ b/templates/cassandra-env.sh.erb
@@ -95,12 +95,17 @@
 
 java_ver_output=`"${JAVA:-java}" -version 2>&1`
 
-jvmver=`echo "$java_ver_output" | grep 'java version' | awk -F'"' 'NR==1 
{print $2}'`
+jvmver=`echo "$java_ver_output" | grep '[openjdk|java] version' | awk -F'"' 
'NR==1 {print $2}'`
 JVM_VERSION=${jvmver%_*}
 JVM_PATCH_VERSION=${jvmver#*_}
 
 if [ "$JVM_VERSION" \< "1.7" ] ; then
-    echo "Cassandra 2.0 and later require Java 7 or later."
+    echo "Cassandra 2.0 and later require Java 7u25 or later."
+    exit 1;
+fi
+
+if [ "$JVM_VERSION" \< "1.8" ] && [ "$JVM_PATCH_VERSION" -lt "25" ] ; then
+    echo "Cassandra 2.0 and later require Java 7u25 or later."
     exit 1;
 fi
 
@@ -173,7 +178,7 @@
 JVM_OPTS="$JVM_OPTS -ea"
 
 # add the jamm javaagent
-JVM_OPTS="$JVM_OPTS -javaagent:$CASSANDRA_HOME/lib/jamm-0.2.5.jar"
+JVM_OPTS="$JVM_OPTS -javaagent:$CASSANDRA_HOME/lib/jamm-0.2.8.jar"
 
 # some JVMs will fill up their heap when accessed via JMX, see CASSANDRA-6541
 JVM_OPTS="$JVM_OPTS -XX:+CMSClassUnloadingEnabled"
@@ -217,12 +222,10 @@
 JVM_OPTS="$JVM_OPTS -XX:CMSInitiatingOccupancyFraction=75"
 JVM_OPTS="$JVM_OPTS -XX:+UseCMSInitiatingOccupancyOnly"
 JVM_OPTS="$JVM_OPTS -XX:+UseTLAB"
+JVM_OPTS="$JVM_OPTS -XX:CompileCommandFile=$CASSANDRA_CONF/hotspot_compiler"
+JVM_OPTS="$JVM_OPTS -XX:CMSWaitDuration=10000"
 
 # note: bash evals '1.7.x' as > '1.7' so this is really a >= 1.7 jvm check
-if { [ "$JVM_VERSION" \> "1.7" ] && [ "$JVM_VERSION" \< "1.8.0" ] && [ 
"$JVM_PATCH_VERSION" -ge "60" ]; } || [ "$JVM_VERSION" \> "1.8" ] ; then
-    JVM_OPTS="$JVM_OPTS -XX:+CMSParallelInitialMarkEnabled 
-XX:+CMSEdenChunksRecordAlways"
-fi
-
 if [ "$JVM_ARCH" = "64-Bit" ] ; then
     JVM_OPTS="$JVM_OPTS -XX:+UseCondCardMark"
 fi
@@ -243,13 +246,16 @@
 # JVM_OPTS="$JVM_OPTS -XX:NumberOfGCLogFiles=10"
 # JVM_OPTS="$JVM_OPTS -XX:GCLogFileSize=10M"
 
-# Configure the following for JEMallocAllocator and if jemalloc is not 
available in the system
-# library path (Example: /usr/local/lib/). Usually "make install" will do the 
right thing.
+# Configure the following for JEMallocAllocator and if jemalloc is not 
available in the system 
+# library path (Example: /usr/local/lib/). Usually "make install" will do the 
right thing. 
 # export LD_LIBRARY_PATH=<JEMALLOC_HOME>/lib/
 # JVM_OPTS="$JVM_OPTS -Djava.library.path=<JEMALLOC_HOME>/lib/"
 
 # uncomment to have Cassandra JVM listen for remote debuggers/profilers on 
port 1414
 # JVM_OPTS="$JVM_OPTS -Xdebug -Xnoagent 
-Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=1414"
+
+# uncomment to have Cassandra JVM log internal method compilation (developers 
only)
+# JVM_OPTS="$JVM_OPTS -XX:+UnlockDiagnosticVMOptions -XX:+LogCompilation"
 
 # Prefer binding to IPv4 network interfaces (when net.ipv6.bindv6only=1). See
 # http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=6342561 (short version:
@@ -265,11 +271,22 @@
 # 
https://blogs.oracle.com/jmxetc/entry/troubleshooting_connection_problems_in_jconsole
 # for more on configuring JMX through firewalls, etc. (Short version:
 # get it working with no firewall first.)
+
+# To use mx4j, an HTML interface for JMX, add mx4j-tools.jar to the lib/
+# directory.
+# See http://wiki.apache.org/cassandra/Operations#Monitoring_with_MX4J
+# By default mx4j listens on 0.0.0.0:8081. Uncomment the following lines
+# to control its listen address and port.
+#MX4J_ADDRESS="-Dmx4jaddress=127.0.0.1"
+#MX4J_PORT="-Dmx4jport=8081"
+
 JVM_OPTS="$JVM_OPTS -Dcom.sun.management.jmxremote.port=$JMX_PORT"
 JVM_OPTS="$JVM_OPTS -Dcom.sun.management.jmxremote.rmi.port=$JMX_PORT"
 JVM_OPTS="$JVM_OPTS -Dcom.sun.management.jmxremote.ssl=false"
 JVM_OPTS="$JVM_OPTS -Dcom.sun.management.jmxremote.authenticate=false"
 #JVM_OPTS="$JVM_OPTS 
-Dcom.sun.management.jmxremote.password.file=/etc/cassandra/jmxremote.password"
+JVM_OPTS="$JVM_OPTS $MX4J_ADDRESS"
+JVM_OPTS="$JVM_OPTS $MX4J_PORT"
 JVM_OPTS="$JVM_OPTS $JVM_EXTRA_OPTS"
 
 <% if not @additional_jvm_opts.empty? -%>
diff --git a/templates/cassandra.yaml.erb b/templates/cassandra.yaml.erb
index 24dec81..9ac2f7c 100644
--- a/templates/cassandra.yaml.erb
+++ b/templates/cassandra.yaml.erb
@@ -26,11 +26,16 @@
 # multiple tokens per node, see http://wiki.apache.org/cassandra/Operations
 num_tokens: <%= @num_tokens %>
 
+# initial_token allows you to specify tokens manually.  While you can use it 
with
+# vnodes (num_tokens > 1, above) -- in which case you should provide a 
+# comma-separated list -- it's primarily used when adding nodes to legacy 
clusters 
+# that do not have vnodes enabled.
+# initial_token:
 
+# See http://wiki.apache.org/cassandra/HintedHandoff
 # May either be "true" or "false" to enable globally, or contain a list
 # of data centers to enable per-datacenter.
 # hinted_handoff_enabled: DC1,DC2
-# See http://wiki.apache.org/cassandra/HintedHandoff
 hinted_handoff_enabled: true
 # this defines the maximum amount of time a dead host will have hints
 # generated.  After it has been dead this long, new hints for it will not be
@@ -104,6 +109,8 @@
 commitlog_directory: <%= @commitlog_directory %>
 
 # policy for data disk failures:
+# die: shut down gossip and Thrift and kill the JVM for any fs errors or
+#      single-sstable errors, so the node can be replaced.
 # stop_paranoid: shut down gossip and Thrift even for single-sstable errors.
 # stop: shut down gossip and Thrift, leaving the node effectively dead, but
 #       can still be inspected via JMX.
@@ -114,6 +121,7 @@
 disk_failure_policy: <%= @disk_failure_policy %>
 
 # policy for commit disk failures:
+# die: shut down gossip and Thrift and kill the JVM, so the node can be 
replaced.
 # stop: shut down gossip and Thrift, leaving the node effectively dead, but
 #       can still be inspected via JMX.
 # stop_commit: shutdown the commit log, letting writes collect but
@@ -154,7 +162,7 @@
 # NOTE: if you reduce the size, you may not get your hottest keys loaded on 
startup.
 #
 # Default value is 0, to disable row caching.
-row_cache_size_in_mb: 0
+row_cache_size_in_mb: <%= @row_cache_size_in_mb %>
 
 # Duration in seconds after which Cassandra should
 # save the row cache. Caches are saved to saved_caches_directory as specified
@@ -170,6 +178,32 @@
 # Number of keys from the row cache to save
 # Disabled by default, meaning all keys are going to be saved
 # row_cache_keys_to_save: 100
+
+# Maximum size of the counter cache in memory.
+#
+# Counter cache helps to reduce counter locks' contention for hot counter 
cells.
+# In case of RF = 1 a counter cache hit will cause Cassandra to skip the read 
before
+# write entirely. With RF > 1 a counter cache hit will still help to reduce 
the duration
+# of the lock hold, helping with hot counter cell updates, but will not allow 
skipping
+# the read entirely. Only the local (clock, count) tuple of a counter cell is 
kept
+# in memory, not the whole counter, so it's relatively cheap.
+#
+# NOTE: if you reduce the size, you may not get your hottest keys loaded on 
startup.
+#
+# Default value is empty to make it "auto" (min(2.5% of Heap (in MB), 50MB)). 
Set to 0 to disable counter cache.
+# NOTE: if you perform counter deletes and rely on low gcgs, you should 
disable the counter cache.
+counter_cache_size_in_mb:
+
+# Duration in seconds after which Cassandra should
+# save the counter cache (keys only). Caches are saved to 
saved_caches_directory as
+# specified in this configuration file.
+#
+# Default is 7200 or 2 hours.
+counter_cache_save_period: 7200
+
+# Number of keys from the counter cache to save
+# Disabled by default, meaning all keys are going to be saved
+# counter_cache_keys_to_save: 100
 
 # The off-heap memory allocator.  Affects storage engine metadata as
 # well as caches.  Experiments show that JEMAlloc saves some memory
@@ -234,22 +268,43 @@
 # bottleneck will be reads that need to fetch data from
 # disk. "concurrent_reads" should be set to (16 * number_of_drives) in
 # order to allow the operations to enqueue low enough in the stack
-# that the OS and drives can reorder them.
+# that the OS and drives can reorder them. Same applies to
+# "concurrent_counter_writes", since counter writes read the current
+# values before incrementing and writing them back.
 #
 # On the other hand, since writes are almost never IO bound, the ideal
 # number of "concurrent_writes" is dependent on the number of cores in
 # your system; (8 * number_of_cores) is a good rule of thumb.
 concurrent_reads: <%= @concurrent_reads %>
 concurrent_writes: <%= @concurrent_writes %>
+concurrent_counter_writes: <%= @concurrent_counter_writes %>
 
 # Total memory to use for sstable-reading buffers.  Defaults to
 # the smaller of 1/4 of heap or 512MB.
 # file_cache_size_in_mb: 512
 
-# Total memory to use for memtables.  Cassandra will flush the largest
-# memtable when this much memory is used.
-# If omitted, Cassandra will set it to 1/4 of the heap.
-# memtable_total_space_in_mb: 2048
+# Total permitted memory to use for memtables. Cassandra will stop 
+# accepting writes when the limit is exceeded until a flush completes,
+# and will trigger a flush based on memtable_cleanup_threshold
+# If omitted, Cassandra will set both to 1/4 the size of the heap.
+# memtable_heap_space_in_mb: 2048
+# memtable_offheap_space_in_mb: 2048
+
+# Ratio of occupied non-flushing memtable size to total permitted size
+# that will trigger a flush of the largest memtable.  Larger mct will
+# mean larger flushes and hence less compaction, but also less concurrent
+# flush activity which can make it difficult to keep your disks fed
+# under heavy write load.
+#
+# memtable_cleanup_threshold defaults to 1 / (memtable_flush_writers + 1)
+# memtable_cleanup_threshold: 0.11
+
+# Specify the way Cassandra allocates and manages memtable memory.
+# Options are:
+#   heap_buffers:    on heap nio buffers
+#   offheap_buffers: off heap (direct) nio buffers
+#   offheap_objects: native memory, eliminating nio buffer heap overhead
+memtable_allocation_type: offheap_objects
 
 # Total space to use for commitlogs.  Since commitlog segments are
 # mmapped, and hence use up address space, the default size is 32
@@ -263,15 +318,28 @@
 
 # This sets the amount of memtable flush writer threads.  These will
 # be blocked by disk io, and each one will hold a memtable in memory
-# while blocked. If you have a large heap and many data directories,
-# you can increase this value for better flush performance.
-# By default this will be set to the amount of data directories defined.
-#memtable_flush_writers: 1
+# while blocked. 
+#
+# memtable_flush_writers defaults to the smaller of (number of disks,
+# number of cores), with a minimum of 2 and a maximum of 8.
+# 
+# If your data directories are backed by SSD, you should increase this
+# to the number of cores.
+#memtable_flush_writers: 8
 
-# the number of full memtables to allow pending flush, that is,
-# waiting for a writer thread.  At a minimum, this should be set to
-# the maximum number of secondary indexes created on a single CF.
-memtable_flush_queue_size: 4
+# A fixed memory pool size in MB for SSTable index summaries. If left
+# empty, this will default to 5% of the heap size. If the memory usage of
+# all index summaries exceeds this limit, SSTables with low read rates will
+# shrink their index summaries in order to meet this limit.  However, this
+# is a best-effort process. In extreme conditions Cassandra may need to use
+# more than this amount of memory.
+index_summary_capacity_in_mb:
+
+# How frequently index summaries should be resampled.  This is done
+# periodically to redistribute memory from the fixed-size pool to sstables
+# proportional to their recent read rates.  Setting to -1 will disable this
+# process, leaving existing index summaries at their current sampling level.
+index_summary_resize_interval_in_minutes: 60
 
 # Whether to, when doing sequential writing, fsync() at intervals in
 # order to force the operating system to flush the dirty
@@ -288,9 +356,11 @@
 # encryption_options
 ssl_storage_port: 7001
 
-# Address to bind to and tell other Cassandra nodes to connect to. You
-# _must_ change this if you want multiple nodes to be able to
-# communicate!
+# Address or interface to bind to and tell other Cassandra nodes to connect to.
+# You _must_ change this if you want multiple nodes to be able to communicate!
+#
+# Set listen_address OR listen_interface, not both. Interfaces must correspond
+# to a single address, IP aliasing is not supported.
 #
 # Leaving it blank leaves it up to InetAddress.getLocalHost(). This
 # will always do the Right Thing _if_ the node is properly configured
@@ -327,8 +397,11 @@
 # Whether to start the thrift rpc server.
 start_rpc: <%= @start_rpc %>
 
-# The address to bind the Thrift RPC service and native transport
-# server -- clients connect here.
+# The address or interface to bind the Thrift RPC service and native transport
+# server to.
+#
+# Set rpc_address OR rpc_interface, not both. Interfaces must correspond
+# to a single address, IP aliasing is not supported.
 #
 # Leaving this blank has the same effect it does for ListenAddress,
 # (i.e. it will be based on the configured hostname of the node).
@@ -339,6 +412,12 @@
 rpc_address: <%= @rpc_address %>
 # port for Thrift to listen for clients on
 rpc_port: <%= @rpc_port %>
+
+# RPC address to broadcast to drivers and other Cassandra nodes. This cannot
+# be set to 0.0.0.0. If left blank, this will be set to the value of
+# rpc_address. If rpc_address is set to 0.0.0.0, broadcast_rpc_address must
+# be set.
+# broadcast_rpc_address: 1.2.3.4
 
 # enable or disable keepalive on rpc/native connections
 rpc_keepalive: true
@@ -353,7 +432,8 @@
 # hsha  -> Stands for "half synchronous, half asynchronous." All thrift 
clients are handled
 #          asynchronously using a small number of threads that does not vary 
with the amount
 #          of thrift clients (and thus scales well to many clients). The rpc 
requests are still
-#          synchronous (one thread per active request).
+#          synchronous (one thread per active request). If hsha is selected 
then it is essential
+#          that rpc_max_threads is changed from the default value of unlimited.
 #
 # The default is sync because on Windows hsha is about 30% slower.  On Linux,
 # sync/hsha performance is about the same, with hsha of course using less 
memory.
@@ -440,11 +520,6 @@
 # Caution should be taken on increasing the size of this threshold as it can 
lead to node instability.
 batch_size_warn_threshold_in_kb: 5
 
-# Size limit for rows being compacted in memory.  Larger rows will spill
-# over to disk and use a slower two-pass compaction process.  A message
-# will be logged specifying the row key.
-in_memory_compaction_limit_in_mb: 64
-
 # Number of simultaneous compactions to allow, NOT including
 # validation "compactions" for anti-entropy repair.  Simultaneous
 # compactions can help preserve read performance in a mixed read/write
@@ -454,16 +529,12 @@
 # slowly or too fast, you should look at
 # compaction_throughput_mb_per_sec first.
 #
-# concurrent_compactors defaults to the number of cores.
-# Uncomment to make compaction mono-threaded, the pre-0.8 default.
+# concurrent_compactors defaults to the smaller of (number of disks,
+# number of cores), with a minimum of 2 and a maximum of 8.
+# 
+# If your data directories are backed by SSD, you should increase this
+# to the number of cores.
 #concurrent_compactors: 1
-
-# Multi-threaded compaction. When enabled, each compaction will use
-# up to one thread per core, plus one thread per sstable being merged.
-# This is usually only useful for SSD-based hardware: otherwise,
-# your concern is usually to get compaction to do LESS i/o (see:
-# compaction_throughput_mb_per_sec), not more.
-multithreaded_compaction: false
 
 # Throttles compaction to the given total throughput across the entire
 # system. The faster you insert data, the faster you need to compact in
@@ -473,10 +544,11 @@
 # of compaction, including validation compaction.
 compaction_throughput_mb_per_sec: <%= @compaction_throughput_mb_per_sec %>
 
-# Track cached row keys during compaction, and re-cache their new
-# positions in the compacted sstable.  Disable if you use really large
-# key caches.
-compaction_preheat_key_cache: true
+# When compacting, the replacement sstable(s) can be opened before they
+# are completely written, and used in place of the prior sstables for
+# any range that has been written. This helps to smoothly transfer reads 
+# between the sstables, reducing page cache churn and keeping hot rows hot
+sstable_preemptive_open_interval_in_mb: 50
 
 # Throttles all outbound streaming file transfers on this node to the
 # given total throughput in Mbps. This is necessary because Cassandra does
@@ -497,6 +569,8 @@
 range_request_timeout_in_ms: 10000
 # How long the coordinator should wait for writes to complete
 write_request_timeout_in_ms: 2000
+# How long the coordinator should wait for counter writes to complete
+counter_write_request_timeout_in_ms: 5000
 # How long a coordinator should continue to retry a CAS operation
 # that contends with other proposals for the same row
 cas_contention_timeout_in_ms: 1000

-- 
To view, visit https://gerrit.wikimedia.org/r/189444
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: If85c01c1d897ce9158dbbe0fb1c1dad7d4282bb6
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet/cassandra
Gerrit-Branch: master
Gerrit-Owner: GWicke <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to