Updated Branches: refs/heads/cassandra-1.1 46fc843bb -> 178c934aa refs/heads/trunk 4f5d072a2 -> 907a6ddf1
merge from 1.1 Project: http://git-wip-us.apache.org/repos/asf/cassandra/repo Commit: http://git-wip-us.apache.org/repos/asf/cassandra/commit/907a6ddf Tree: http://git-wip-us.apache.org/repos/asf/cassandra/tree/907a6ddf Diff: http://git-wip-us.apache.org/repos/asf/cassandra/diff/907a6ddf Branch: refs/heads/trunk Commit: 907a6ddf1dc1089f5cd5056e29abaa005f666b91 Parents: 4f5d072 178c934 Author: Jonathan Ellis <[email protected]> Authored: Thu Oct 11 08:56:14 2012 -0500 Committer: Jonathan Ellis <[email protected]> Committed: Thu Oct 11 08:57:49 2012 -0500 ---------------------------------------------------------------------- CHANGES.txt | 1 + NEWS.txt | 5 +++++ .../apache/cassandra/io/sstable/Descriptor.java | 7 +++++-- .../cassandra/io/sstable/SSTableMetadata.java | 6 ++++++ 4 files changed, 17 insertions(+), 2 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/cassandra/blob/907a6ddf/CHANGES.txt ---------------------------------------------------------------------- diff --cc CHANGES.txt index 49c9d9d,ac3a157..ba146e5 --- a/CHANGES.txt +++ b/CHANGES.txt @@@ -1,103 -1,5 +1,104 @@@ +1.2-beta2 + * Pluggable Thrift transport factories for CLI and cqlsh (CASSANDRA-4609, 4610) + * cassandra-cli: allow Double value type to be inserted to a column (CASSANDRA-4661) + * Add ability to use custom TServerFactory implementations (CASSANDRA-4608) + * optimize batchlog flushing to skip successful batches (CASSANDRA-4667) + * include metadata for system keyspace itself in schema tables (CASSANDRA-4416) + * add check to PropertyFileSnitch to verify presence of location for + local node (CASSANDRA-4728) + * add PBSPredictor consistency modeler (CASSANDRA-4261) + * remove vestiges of Thrift unframed mode (CASSANDRA-4729) + * optimize single-row PK lookups (CASSANDRA-4710) + * adjust blockFor calculation to account for pending ranges due to node + movement (CASSANDRA-833) + * Change CQL version to 3.0.0 and stop 
accepting 3.0.0-beta1 (CASSANDRA-4649) + * (CQL3) Make prepared statement global instead of per connection + (CASSANDRA-4449) + * Fix scrubbing of CQL3 created tables (CASSANDRA-4685) + * (CQL3) Fix validation when using counter and regular columns in the same + table (CASSANDRA-4706) + * Fix bug starting Cassandra with simple authentication (CASSANDRA-4648) + * Add support for batchlog in CQL3 (CASSANDRA-4545, 4738) + * Add support for multiple column family outputs in CFOF (CASSANDRA-4208) + * Support repairing only the local DC nodes (CASSANDRA-4747) + * Use rpc_address for binary protocol and change default port (CASSANRA-4751) + * Fix use of collections in prepared statements (CASSANDRA-4739) + * Store more information into peers table (CASSANDRA-4351) + * Configurable bucket size for size tiered compaction (CASSANDRA-4704) + * Run leveled compaction in parallel (CASSANDRA-4310) + + +1.2-beta1 + * add atomic_batch_mutate (CASSANDRA-4542, -4635) + * increase default max_hint_window_in_ms to 3h (CASSANDRA-4632) + * include message initiation time to replicas so they can more + accurately drop timed-out requests (CASSANDRA-2858) + * fix clientutil.jar dependencies (CASSANDRA-4566) + * optimize WriteResponse (CASSANDRA-4548) + * new metrics (CASSANDRA-4009) + * redesign KEYS indexes to avoid read-before-write (CASSANDRA-2897) + * debug tracing (CASSANDRA-1123) + * parallelize row cache loading (CASSANDRA-4282) + * Make compaction, flush JBOD-aware (CASSANDRA-4292) + * run local range scans on the read stage (CASSANDRA-3687) + * clean up ioexceptions (CASSANDRA-2116) + * add disk_failure_policy (CASSANDRA-2118) + * Introduce new json format with row level deletion (CASSANDRA-4054) + * remove redundant "name" column from schema_keyspaces (CASSANDRA-4433) + * improve "nodetool ring" handling of multi-dc clusters (CASSANDRA-3047) + * update NTS calculateNaturalEndpoints to be O(N log N) (CASSANDRA-3881) + * add UseCondCardMark XX jvm settings on jdk 1.7 
(CASSANDRA-4366) + * split up rpc timeout by operation type (CASSANDRA-2819) + * rewrite key cache save/load to use only sequential i/o (CASSANDRA-3762) + * update MS protocol with a version handshake + broadcast address id + (CASSANDRA-4311) + * multithreaded hint replay (CASSANDRA-4189) + * add inter-node message compression (CASSANDRA-3127) + * remove COPP (CASSANDRA-2479) + * Track tombstone expiration and compact when tombstone content is + higher than a configurable threshold, default 20% (CASSANDRA-3442, 4234) + * update MurmurHash to version 3 (CASSANDRA-2975) + * (CLI) track elapsed time for `delete' operation (CASSANDRA-4060) + * (CLI) jline version is bumped to 1.0 to properly support + 'delete' key function (CASSANDRA-4132) + * Save IndexSummary into new SSTable 'Summary' component (CASSANDRA-2392, 4289) + * Add support for range tombstones (CASSANDRA-3708) + * Improve MessagingService efficiency (CASSANDRA-3617) + * Avoid ID conflicts from concurrent schema changes (CASSANDRA-3794) + * Set thrift HSHA server thread limit to unlimited by default (CASSANDRA-4277) + * Avoids double serialization of CF id in RowMutation messages + (CASSANDRA-4293) + * stream compressed sstables directly with java nio (CASSANDRA-4297) + * Support multiple ranges in SliceQueryFilter (CASSANDRA-3885) + * Add column metadata to system column families (CASSANDRA-4018) + * (cql3) Always use composite types by default (CASSANDRA-4329) + * (cql3) Add support for set, map and list (CASSANDRA-3647) + * Validate date type correctly (CASSANDRA-4441) + * (cql3) Allow definitions with only a PK (CASSANDRA-4361) + * (cql3) Add support for row key composites (CASSANDRA-4179) + * improve DynamicEndpointSnitch by using reservoir sampling (CASSANDRA-4038) + * (cql3) Add support for 2ndary indexes (CASSANDRA-3680) + * (cql3) fix defining more than one PK to be invalid (CASSANDRA-4477) + * remove schema agreement checking from all external APIs (Thrift, CQL and CQL3) (CASSANDRA-4487) + * add 
Murmur3Partitioner and make it default for new installations (CASSANDRA-3772, 4621) + * (cql3) update pseudo-map syntax to use map syntax (CASSANDRA-4497) + * Finer grained exceptions hierarchy and provides error code with exceptions (CASSANDRA-3979) + * Adds events push to binary protocol (CASSANDRA-4480) + * Rewrite nodetool help (CASSANDRA-2293) + * Make CQL3 the default for CQL (CASSANDRA-4640) + * update stress tool to be able to use CQL3 (CASSANDRA-4406) + * Accept all thrift update on CQL3 cf but don't expose their metadata (CASSANDRA-4377) + * Replace Throttle with Guava's RateLimiter for HintedHandOff (CASSANDRA-4541) + * fix counter add/get using CQL2 and CQL3 in stress tool (CASSANDRA-4633) + * Add sstable count per level to cfstats (CASSANDRA-4537) + * (cql3) Add ALTER KEYSPACE statement (CASSANDRA-4611) + * (cql3) Allow defining default consistency levels (CASSANDRA-4448) + * (cql3) Fix queries using LIMIT missing results (CASSANDRA-4579) + * fix cross-version gossip messaging (CASSANDRA-4576) + + 1.1.6 + * fix commitlog replay for nanotime-infected sstables (CASSANDRA-4782) * preflight check ttl for maximum of 20 years (CASSANDRA-4771) * Fix HH to compact with correct gcBefore, which avoids wiping out undelivered hints (CASSANDRA-4772) http://git-wip-us.apache.org/repos/asf/cassandra/blob/907a6ddf/NEWS.txt ---------------------------------------------------------------------- diff --cc NEWS.txt index c654e19,667a055..f1649ad --- a/NEWS.txt +++ b/NEWS.txt @@@ -14,69 -14,10 +14,74 @@@ by version X, but the inverse is not ne Upgrading --------- - - If you are using counters, you should drain existing Cassandra nodes - prior to the upgrade to prevent overcount during commitlog replay - (see CASSANDRA-4782). For non-counter uses, drain is not required + - 1.2 is NOT network-compatible with versions older than 1.0. That + means if you want to do a rolling, zero-downtime upgrade, you'll need + to upgrade first to 1.0.x or 1.1.x, and then to 1.2. 
1.2 retains + the ability to read data files from Cassandra versions at least + back to 0.6, so a non-rolling upgrade remains possible with just + one step. ++ - If you are using counters and upgrading from a version prior to ++ 1.1.6, you should drain existing Cassandra nodes prior to the ++ upgrade to prevent overcount during commitlog replay (see ++ CASSANDRA-4782). For non-counter uses, drain is not required + but is a good practice to minimize restart time. + - Server clock synchronization is more important in 1.2; replicas + will use a coordinator-provided timestamp to determine when a + request has timed out and is thus not worth proceeding with. + Using a service like NTP is strongly recommended. + - The hints schema was changed from 1.1 to 1.2. Cassandra automatically + snapshots and then truncates the hints column family as part of + starting up 1.2 for the first time. Additionally, upgraded nodes + will not store new hints destined for older (pre-1.2) nodes. It is + therefore recommended that you perform a cluster upgrade when all + nodes are up. + - The `nodetool removetoken` command (and corresponding JMX operation) + have been renamed to `nodetool removenode`. This function is + incompatible with the earlier `nodetool removetoken`, and attempts to + remove nodes in this way with a mixed 1.1 (or lower) / 1.2 cluster, + is not supported. + - The somewhat ill-conceived CollatingOrderPreservingPartitioner + has been removed. Use Murmur3Partitioner (recommended) or + ByteOrderedPartitioner instead. + - Global option hinted_handoff_throttle_delay_in_ms has been removed. + hinted_handoff_throttle_in_kb has been added instead. + - The default bloom filter fp chance has been increased to 1%. + This will save about 30% of the memory used by the old default. + Existing columnfamilies will retain their old setting. 
+ - The default partitioner (for new clusters; the partitioner cannot be + changed in existing clusters) was changed from RandomPartitioner to + Murmur3Partitioner which provides faster hashing as well as improved + performance with secondary indexes. + - The default version of CQL (and cqlsh) is now CQL3. CQL2 is still + available but you will have to use the thrift set_cql_version method + (that is already supported in 1.1) to use CQL2. For cqlsh, you will need + to use 'cqlsh -2'. + - CQL3 is now considered final in this release. Compared to the beta + version that is part of 1.1, this final version has a few additions + (collections), but also some (incompatible) changes in the syntax for the + options of the create/alter keyspace/table statements. Typically, the + syntax to create a keyspace is now: + CREATE KEYSPACE ks WITH replication = { 'class' : 'SimpleStrategy', + 'replication_factor' : 2 }; + Please refer to the CQL3 documentation for details. + +Features +-------- + - Cassandra can now handle concurrent CREATE TABLE schema changes + as well as other updates + - rpc_timeout has been split up to allow finer-grained control + on timeouts for different operation types + - num_tokens can now be specified in cassandra.yaml. This defines the + number of tokens assigned to the host on the ring (default: 1). + Also specifying initial_token will override any num_tokens setting. + - disk_failure_policy allows blacklisting failed disks in JBOD + configuration instead of erroring out indefinitely + - event tracing can be configured per-connection ("trace_next_query") + or globally/probabilistically ("nodetool settraceprobability") + - Atomic batches are now supported server side, where Cassandra will + guarantee that (at the price of pre-writing the batch to another node + first), all mutations in the batch will be applied, even if the + coordinator fails mid-batch. 
1.1.5 http://git-wip-us.apache.org/repos/asf/cassandra/blob/907a6ddf/src/java/org/apache/cassandra/io/sstable/Descriptor.java ---------------------------------------------------------------------- diff --cc src/java/org/apache/cassandra/io/sstable/Descriptor.java index b173bac,f4663b7..a3bce13 --- a/src/java/org/apache/cassandra/io/sstable/Descriptor.java +++ b/src/java/org/apache/cassandra/io/sstable/Descriptor.java @@@ -44,121 -47,20 +44,124 @@@ public class Descripto // we always incremented the major version. In particular, versions g and h are // forwards-compatible with version f, so if the above convention had been followed, // we would have labeled them fb and fc. - public static final String LEGACY_VERSION = "a"; // "pre-history" - // b (0.7.0): added version to sstable filenames - // c (0.7.0): bloom filter component computes hashes over raw key bytes instead of strings - // d (0.7.0): row size in data component becomes a long instead of int - // e (0.7.0): stores undecorated keys in data and index components - // f (0.7.0): switched bloom filter implementations in data component - // g (0.8): tracks flushed-at context in metadata component - // h (1.0): tracks max client timestamp in metadata component - // hb (1.0.3): records compression ration in metadata component - // hc (1.0.4): records partitioner in metadata component - // hd (1.0.10): includes row tombstones in maxtimestamp - // he (1.1.3): includes ancestors generation in metadata component - // hf (1.1.6): marker that replay position corresponds to 1.1.5+ millis-based id (see CASSANDRA-4782) - public static final String CURRENT_VERSION = "hf"; + public static class Version + { + // This needs to be at the begining for initialization sake + private static final String current_version = "ia"; + + public static final Version LEGACY = new Version("a"); // "pre-history" + // b (0.7.0): added version to sstable filenames + // c (0.7.0): bloom filter component computes hashes over raw key bytes 
instead of strings + // d (0.7.0): row size in data component becomes a long instead of int + // e (0.7.0): stores undecorated keys in data and index components + // f (0.7.0): switched bloom filter implementations in data component + // g (0.8): tracks flushed-at context in metadata component + // h (1.0): tracks max client timestamp in metadata component + // hb (1.0.3): records compression ration in metadata component + // hc (1.0.4): records partitioner in metadata component + // hd (1.0.10): includes row tombstones in maxtimestamp + // he (1.1.3): includes ancestors generation in metadata component ++ // hf (1.1.6): marker that replay position corresponds to 1.1.5+ millis-based id (see CASSANDRA-4782) + // ia (1.2.0): column indexes are promoted to the index file + // records estimated histogram of deletion times in tombstones + // bloom filter (keys and columns) upgraded to Murmur3 + + public static final Version CURRENT = new Version(current_version); + + private final String version; + + public final boolean hasStringsInBloomFilter; + public final boolean hasIntRowSize; + public final boolean hasEncodedKeys; + public final boolean isLatestVersion; + public final boolean metadataIncludesReplayPosition; ++ public final boolean metadataIncludesModernReplayPosition; + public final boolean tracksMaxTimestamp; + public final boolean hasCompressionRatio; + public final boolean hasPartitioner; + public final boolean tracksTombstones; + public final boolean hasPromotedIndexes; + public final FilterFactory.Type filterType; + public final boolean hasAncestors; + + public Version(String version) + { + this.version = version; + hasStringsInBloomFilter = version.compareTo("c") < 0; + hasIntRowSize = version.compareTo("d") < 0; + hasEncodedKeys = version.compareTo("e") < 0; + metadataIncludesReplayPosition = version.compareTo("g") >= 0; - tracksMaxTimestamp = version.compareTo("hd") >= 0; + hasCompressionRatio = version.compareTo("hb") >= 0; + hasPartitioner = 
version.compareTo("hc") >= 0; ++ tracksMaxTimestamp = version.compareTo("hd") >= 0; ++ hasAncestors = version.compareTo("he") >= 0; ++ metadataIncludesModernReplayPosition = version.compareTo("hf") >= 0; + tracksTombstones = version.compareTo("ia") >= 0; + hasPromotedIndexes = version.compareTo("ia") >= 0; + isLatestVersion = version.compareTo(current_version) == 0; + if (version.compareTo("f") < 0) + filterType = FilterFactory.Type.SHA; + else if (version.compareTo("ia") < 0) + filterType = FilterFactory.Type.MURMUR2; + else + filterType = FilterFactory.Type.MURMUR3; - hasAncestors = version.compareTo("he") >= 0; + } + + /** + * @param ver SSTable version + * @return True if the given version string matches the format. + * @see #version + */ + static boolean validate(String ver) + { + return ver != null && ver.matches("[a-z]+"); + } + + public boolean isCompatible() + { + return version.charAt(0) <= CURRENT.version.charAt(0); + } + + public boolean isStreamCompatible() + { + // we could add compatibility for earlier versions with the new single-pass streaming + // (see SSTableWriter.appendFromStream) but versions earlier than 0.7.1 don't have the + // MessagingService version awareness anyway so there's no point. + return isCompatible() && version.charAt(0) >= 'i'; + } + + /** + * Versions [h..hc] contained a timestamp value that was computed incorrectly, ignoring row tombstones. + * containsTimestamp returns true if there is a timestamp value in the metadata file; to know if it + * actually contains a *correct* timestamp, see tracksMaxTimestamp. 
+ */ + public boolean containsTimestamp() + { + return version.compareTo("h") >= 0; + } + + @Override + public String toString() + { + return version; + } + + @Override + public boolean equals(Object o) + { + if (o == this) + return true; + if (!(o instanceof Version)) + return false; + return version.equals(((Version)o).version); + } + + @Override + public int hashCode() + { + return version.hashCode(); + } + } public final File directory; /** version has the following format: <code>[a-z]+</code> */ http://git-wip-us.apache.org/repos/asf/cassandra/blob/907a6ddf/src/java/org/apache/cassandra/io/sstable/SSTableMetadata.java ---------------------------------------------------------------------- diff --cc src/java/org/apache/cassandra/io/sstable/SSTableMetadata.java index 40a898f,302fb50..fafccab --- a/src/java/org/apache/cassandra/io/sstable/SSTableMetadata.java +++ b/src/java/org/apache/cassandra/io/sstable/SSTableMetadata.java @@@ -259,17 -214,23 +259,23 @@@ public class SSTableMetadat { EstimatedHistogram rowSizes = EstimatedHistogram.serializer.deserialize(dis); EstimatedHistogram columnCounts = EstimatedHistogram.serializer.deserialize(dis); - ReplayPosition replayPosition = desc.metadataIncludesReplayPosition + ReplayPosition replayPosition = desc.version.metadataIncludesReplayPosition ? ReplayPosition.serializer.deserialize(dis) : ReplayPosition.NONE; - if (!desc.metadataIncludesModernReplayPosition) ++ if (!desc.version.metadataIncludesModernReplayPosition) + { + // replay position may be "from the future" thanks to older versions generating them with nanotime. + // make sure we don't omit replaying something that we should. see CASSANDRA-4782 + replayPosition = ReplayPosition.NONE; + } - long maxTimestamp = desc.containsTimestamp() ? dis.readLong() : Long.MIN_VALUE; - if (!desc.tracksMaxTimestamp) // see javadoc to Descriptor.containsTimestamp + long maxTimestamp = desc.version.containsTimestamp() ? 
dis.readLong() : Long.MIN_VALUE; + if (!desc.version.tracksMaxTimestamp) // see javadoc to Descriptor.containsTimestamp maxTimestamp = Long.MIN_VALUE; - double compressionRatio = desc.hasCompressionRatio + double compressionRatio = desc.version.hasCompressionRatio ? dis.readDouble() : Double.MIN_VALUE; - String partitioner = desc.hasPartitioner ? dis.readUTF() : null; - int nbAncestors = desc.hasAncestors ? dis.readInt() : 0; + String partitioner = desc.version.hasPartitioner ? dis.readUTF() : null; + int nbAncestors = desc.version.hasAncestors ? dis.readInt() : 0; Set<Integer> ancestors = new HashSet<Integer>(nbAncestors); for (int i = 0; i < nbAncestors; i++) ancestors.add(dis.readInt());
