[
https://issues.apache.org/jira/browse/CASSANDRA-8285?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14223321#comment-14223321
]
Pierre Laporte edited comment on CASSANDRA-8285 at 11/24/14 7:09 PM:
---------------------------------------------------------------------
I just reproduced the issue on my machine against Cassandra 2.1.2.
*Howto*
Create a 3-node C* cluster
{code}ccm create -n 3 -v 2.1.2 -b -s -i 127.0.0. cassandra-2.1{code}
Insert/delete a lot of rows inside a single table. I was actually trying to
reproduce the TombstoneOverwhelmingException but got an OOME instead.
{code}
public class CassandraTest implements AutoCloseable {
public static final String KEYSPACE = "TombstonesOverwhelming";
private Cluster cluster;
protected Session session;
public CassandraTest() {
this(new RoundRobinPolicy());
}
public CassandraTest(LoadBalancingPolicy loadBalancingPolicy) {
System.out.println("Creating builder...");
cluster =
Cluster.builder().addContactPoint("127.0.0.1").withLoadBalancingPolicy(loadBalancingPolicy).build();
for (Host host : cluster.getMetadata().getAllHosts()) {
System.out.println("Found host " + host.getAddress() + " in DC " +
host.getDatacenter());
}
session = cluster.connect();
}
private void executeQuietly(String query) {
try {
execute(query);
} catch (Exception e) {
e.printStackTrace();
}
}
private ResultSet execute(String query) {
return session.execute(query);
}
private ResultSet execute(Statement statement) {
return session.execute(statement);
}
@Override
public void close() throws IOException {
cluster.close();
}
public static void main(String... args) throws Exception {
try (CassandraTest test = new CassandraTest()) {
test.executeQuietly("DROP KEYSPACE IF EXISTS " + KEYSPACE);
test.execute("CREATE KEYSPACE " + KEYSPACE + " " +
"WITH REPLICATION = { 'class' : 'SimpleStrategy',
'replication_factor' : 3 }");
test.execute("USE " + KEYSPACE);
test.execute("CREATE TABLE useful (run int, iteration int, copy
int, PRIMARY KEY (run, iteration, copy))");
System.out.println("Press ENTER to start the test");
System.in.read();
for (int run = 0; run < 1_000_000; run++) {
System.out.printf("Starting run % 7d... ", run);
System.out.print("Inserting...");
for (int iteration = 0; iteration < 1_000_000; iteration++) {
Batch batch = QueryBuilder.batch();
batch.setConsistencyLevel(ConsistencyLevel.QUORUM);
for (int copy = 0; copy < 100; copy++) {
batch.add(QueryBuilder.insertInto("useful")
.value("run", run).value("iteration",
iteration).value("copy", copy));
}
test.execute(batch);
}
System.out.println("Deleting...");
for (int iteration = 0; iteration < 1_000_000; iteration++) {
Batch batch = QueryBuilder.batch();
batch.setConsistencyLevel(ConsistencyLevel.QUORUM);
for (int copy = 0; copy < 100; copy++) {
batch.add(QueryBuilder.delete().from("useful")
.where(eq("run", run)).and(eq("iteration",
iteration)).and(eq("copy", copy)));
}
test.execute(batch);
}
}
} catch (Exception e) {
e.printStackTrace();
}
}
}
{code}
It took ~50 minutes before two instances OOME'd. Please find attached the gc
log (gc-1416849312.log.gz) and the system log (system.log.gz). If needed, I
can upload a heap dump too.
Hope that helps
was (Author: pingtimeout):
I just reproduced the issue on my machine against Cassandra 2.1.2.
*Howto*
Create a 3-node C* cluster
{code}ccm create -n 3 -v 2.1.2 -b -s -i 127.0.0. cassandra-2.1{code}
Insert/delete a lot of rows inside a single table. I was actually trying to
reproduce the TombstoneOverwhelmingException but got an OOME instead.
{code}
public class CassandraTest implements AutoCloseable {
public static final String KEYSPACE = "TombstonesOverwhelming";
private Cluster cluster;
protected Session session;
public CassandraTest() {
this(new RoundRobinPolicy());
}
public CassandraTest(LoadBalancingPolicy loadBalancingPolicy) {
System.out.println("Creating builder...");
cluster =
Cluster.builder().addContactPoint("127.0.0.1").withLoadBalancingPolicy(loadBalancingPolicy).build();
for (Host host : cluster.getMetadata().getAllHosts()) {
System.out.println("Found host " + host.getAddress() + " in DC " +
host.getDatacenter());
}
session = cluster.connect();
}
private void executeQuietly(String query) {
try {
execute(query);
} catch (Exception e) {
e.printStackTrace();
}
}
private ResultSet execute(String query) {
return session.execute(query);
}
private ResultSet execute(Statement statement) {
return session.execute(statement);
}
@Override
public void close() throws IOException {
cluster.close();
}
public static void main(String... args) throws Exception {
try (CassandraTest test = new CassandraTest()) {
test.executeQuietly("DROP KEYSPACE IF EXISTS " + KEYSPACE);
test.execute("CREATE KEYSPACE " + KEYSPACE + " " +
"WITH REPLICATION = { 'class' : 'SimpleStrategy',
'replication_factor' : 3 }");
test.execute("USE " + KEYSPACE);
test.execute("CREATE TABLE useful (run int, iteration int, copy
int, PRIMARY KEY (run, iteration, copy))");
System.out.println("Press ENTER to start the test");
System.in.read();
for (int run = 0; run < 1_000_000; run++) {
System.out.printf("Starting run % 7d... ", run);
System.out.print("Inserting...");
for (int iteration = 0; iteration < 1_000_000; iteration++) {
Batch batch = QueryBuilder.batch();
batch.setConsistencyLevel(ConsistencyLevel.QUORUM);
for (int copy = 0; copy < 100; copy++) {
batch.add(QueryBuilder.insertInto("useful")
.value("run", run).value("iteration",
iteration).value("copy", copy));
}
test.execute(batch);
}
System.out.println("Deleting...");
for (int iteration = 0; iteration < 1_000_000; iteration++) {
Batch batch = QueryBuilder.batch();
batch.setConsistencyLevel(ConsistencyLevel.QUORUM);
for (int copy = 0; copy < 100; copy++) {
batch.add(QueryBuilder.delete().from("useful")
.where(eq("run", run)).and(eq("iteration",
iteration)).and(eq("copy", copy)));
}
test.execute(batch);
}
}
} catch (Exception e) {
e.printStackTrace();
}
}
}
{code}
It took ~50 minutes before two instances OOME'd. Please find attached the gc
log and the system log. If needed, I can upload a heap dump too.
Hope that helps
> OOME in Cassandra 2.0.11
> ------------------------
>
> Key: CASSANDRA-8285
> URL: https://issues.apache.org/jira/browse/CASSANDRA-8285
> Project: Cassandra
> Issue Type: Bug
> Environment: Cassandra 2.0.11 + java-driver 2.0.8-SNAPSHOT
> Cassandra 2.0.11 + ruby-driver 1.0-beta
> Reporter: Pierre Laporte
> Assignee: Aleksey Yeschenko
> Attachments: OOME_node_system.log, gc-1416849312.log.gz, gc.log.gz,
> heap-usage-after-gc-zoom.png, heap-usage-after-gc.png, system.log.gz
>
>
> We ran drivers 3-days endurance tests against Cassandra 2.0.11 and C* crashed
> with an OOME. This happened both with ruby-driver 1.0-beta and java-driver
> 2.0.8-snapshot.
> Attached are :
> | OOME_node_system.log | The system.log of one Cassandra node that crashed |
> | gc.log.gz | The GC log on the same node |
> | heap-usage-after-gc.png | The heap occupancy evolution after every GC cycle |
> | heap-usage-after-gc-zoom.png | A focus on when things start to go wrong |
> Workload :
> Our test executes 5 CQL statements (select, insert, select, delete, select)
> for a given unique id, during 3 days, using multiple threads. There is no
> change in the workload during the test.
> Symptoms :
> In the attached log, it seems something starts in Cassandra between
> 2014-11-06 10:29:22 and 2014-11-06 10:45:32. This causes an allocation that
> fills the heap. We eventually get stuck in a Full GC storm and get an OOME
> in the logs.
> I have run the java-driver tests against Cassandra 1.2.19 and 2.1.1. The
> error does not occur. It seems specific to 2.0.11.
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)