beobal commented on code in PR #3574: URL: https://github.com/apache/cassandra/pull/3574#discussion_r1778529392
########## src/java/org/apache/cassandra/tcm/sequences/DropAccordTable.java: ########## @@ -0,0 +1,364 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.tcm.sequences; + +import java.io.IOException; +import java.time.Duration; +import java.util.Objects; +import java.util.concurrent.ExecutionException; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.cassandra.cql3.ColumnIdentifier; +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.schema.Keyspaces; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.accord.AccordService; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.ClusterMetadataService; +import org.apache.cassandra.tcm.Epoch; +import org.apache.cassandra.tcm.MultiStepOperation; +import org.apache.cassandra.tcm.Transformation; +import org.apache.cassandra.tcm.serialization.AsymmetricMetadataSerializer; +import org.apache.cassandra.tcm.serialization.MetadataSerializer; +import org.apache.cassandra.tcm.serialization.Version; +import org.apache.cassandra.tcm.transformations.FinishDropAccordTable; +import org.apache.cassandra.utils.JVMStabilityInspector; + +import static java.lang.String.format; +import static org.apache.cassandra.tcm.Transformation.Kind.FINISH_DROP_ACCORD_TABLE; +import static org.apache.cassandra.tcm.sequences.SequenceState.continuable; +import static org.apache.cassandra.tcm.sequences.SequenceState.error; +import static org.apache.cassandra.tcm.sequences.SequenceState.halted; +import static org.apache.cassandra.utils.Clock.Global.nanoTime; + +/** + * A slightly atypical implementation as it consists of only a single step. To perform the drop of an + * Accord table, we first commit a PrepareDropAccordTable transformation. Upon enactement, that + * marks the table as pending drop, which blocks any new transactions from being started. It also + * instantiates an instance of this operation and adds it to the set of in progress operations. + * + * The intention is to introduce a barrier which blocks until the Accord service acknowledges that + * it was learned of the epoch in which the table was marked for deletion and that all prior transactions + * are completed. Once this is complete, we can proceed to actually drop the table. The transformation + * which performs that schema modification also removes this MSO from ClusterMetadata's in-flight set. + * This obviates the need to 'advance' this MSO in the way that other implementations with more steps do. + * + */ +public class DropAccordTable extends MultiStepOperation<Epoch> +{ + private static final Logger logger = LoggerFactory.getLogger(DropAccordTable.class); + + public static final Serializer serializer = new Serializer(); + + public final TableReference table; + + /** + * Used by factory method for external callers and by the serializer. + * We don't need to include the serialized FinishDropAccordTable step in the serialization + * of the MSO itself because they have no parameters other than the table reference and so + * we can just construct a new one when we execute it + */ + private DropAccordTable(TableReference table, Epoch latestModification) + { + super(0, latestModification); + this.table = table; + } + + public static DropAccordTable newSequence(TableReference table, Epoch preparedAt) Review Comment: Nit: in existing MSO impls the static factory method comes before the private constructor. Obviously it doesn't really matter but it feels odd to do it differently in one place. ########## src/java/org/apache/cassandra/tcm/sequences/DropAccordTable.java: ########## @@ -0,0 +1,364 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.tcm.sequences; + +import java.io.IOException; +import java.time.Duration; +import java.util.Objects; +import java.util.concurrent.ExecutionException; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.cassandra.cql3.ColumnIdentifier; +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.schema.Keyspaces; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.accord.AccordService; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.ClusterMetadataService; +import org.apache.cassandra.tcm.Epoch; +import org.apache.cassandra.tcm.MultiStepOperation; +import org.apache.cassandra.tcm.Transformation; +import org.apache.cassandra.tcm.serialization.AsymmetricMetadataSerializer; +import org.apache.cassandra.tcm.serialization.MetadataSerializer; +import org.apache.cassandra.tcm.serialization.Version; +import org.apache.cassandra.tcm.transformations.FinishDropAccordTable; +import org.apache.cassandra.utils.JVMStabilityInspector; + +import static java.lang.String.format; +import static org.apache.cassandra.tcm.Transformation.Kind.FINISH_DROP_ACCORD_TABLE; +import static org.apache.cassandra.tcm.sequences.SequenceState.continuable; +import static org.apache.cassandra.tcm.sequences.SequenceState.error; +import static org.apache.cassandra.tcm.sequences.SequenceState.halted; +import static org.apache.cassandra.utils.Clock.Global.nanoTime; + +/** + * A slightly atypical implementation as it consists of only a single step. To perform the drop of an + * Accord table, we first commit a PrepareDropAccordTable transformation. Upon enactement, that + * marks the table as pending drop, which blocks any new transactions from being started. It also + * instantiates an instance of this operation and adds it to the set of in progress operations. + * + * The intention is to introduce a barrier which blocks until the Accord service acknowledges that + * it was learned of the epoch in which the table was marked for deletion and that all prior transactions + * are completed. Once this is complete, we can proceed to actually drop the table. The transformation + * which performs that schema modification also removes this MSO from ClusterMetadata's in-flight set. + * This obviates the need to 'advance' this MSO in the way that other implementations with more steps do. + * + */ +public class DropAccordTable extends MultiStepOperation<Epoch> +{ + private static final Logger logger = LoggerFactory.getLogger(DropAccordTable.class); + + public static final Serializer serializer = new Serializer(); + + public final TableReference table; + + /** + * Used by factory method for external callers and by the serializer. + * We don't need to include the serialized FinishDropAccordTable step in the serialization + * of the MSO itself because they have no parameters other than the table reference and so + * we can just construct a new one when we execute it + */ + private DropAccordTable(TableReference table, Epoch latestModification) + { + super(0, latestModification); + this.table = table; + } + + public static DropAccordTable newSequence(TableReference table, Epoch preparedAt) + { + return new DropAccordTable(table, preparedAt); + } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + DropAccordTable that = (DropAccordTable) o; + return latestModification.equals(that.latestModification) + && table.equals(that.table); + } + + @Override + public int hashCode() + { + return Objects.hash(latestModification, table); + } + + @Override + public Kind kind() + { + return Kind.DROP_ACCORD_TABLE; + } + + @Override + protected SequenceKey sequenceKey() + { + return table; + } + + @Override + public MetadataSerializer<? extends SequenceKey> keySerializer() + { + return TableReference.serializer; + } + + @Override + public Transformation.Kind nextStep() + { + return FINISH_DROP_ACCORD_TABLE; + } + + @Override + public SequenceState executeNext() + { + try + { + SequenceState failure = awaitSafeFromAccord(); Review Comment: I think we ought to include some backoff & retry around this or in `awaitSafeFromAccord` itself. The comment there says that "this is more than likely to timeout", which will result in us dropping into the catch block, returning `continuable` and the IPS machinery calling `executeNext` again. This will happen in a tight loop and so if it is actually quite likely that the await times out, we should take care of retrying here to control the rate and to cap the maximum number of attempts. If we reach that cap we can return `halted()` which will require operator intervention to proceed (analogous to if streaming fails during a bootstrap MSO, it needs to be resumed by an operator). ########## src/java/org/apache/cassandra/service/accord/AccordService.java: ########## @@ -1073,4 +1087,227 @@ public CompactionInfo getCompactionInfo() })); return new CompactionInfo(redundantBefores, ranges, durableBefore.get()); } + + @Override + public void awaitForTableDrop(TableId id) Review Comment: this method name is a bit odd - should be `awaitTableDrop` or `waitForTableDrop` ########## src/java/org/apache/cassandra/tcm/transformations/FinishDropAccordTable.java: ########## @@ -0,0 +1,144 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.tcm.transformations; + +import java.io.IOException; +import java.util.Objects; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.schema.DistributedSchema; +import org.apache.cassandra.schema.KeyspaceMetadata; +import org.apache.cassandra.schema.Keyspaces; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.Transformation; +import org.apache.cassandra.tcm.sequences.DropAccordTable.TableReference; +import org.apache.cassandra.tcm.sequences.LockedRanges; +import org.apache.cassandra.tcm.serialization.AsymmetricMetadataSerializer; +import org.apache.cassandra.tcm.serialization.Version; + +import static org.apache.cassandra.tcm.Transformation.Kind.FINISH_DROP_ACCORD_TABLE; + +/** + * Review Comment: nit: can you pull the general part of the comment out from `execute` up to here (sorry, I left it in this state) ########## src/java/org/apache/cassandra/tcm/transformations/PrepareDropAccordTable.java: ########## @@ -0,0 +1,117 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.tcm.transformations; + +import java.io.IOException; +import java.util.Objects; + +import org.apache.cassandra.exceptions.ExceptionCode; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.schema.DistributedSchema; +import org.apache.cassandra.schema.KeyspaceMetadata; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.Transformation; +import org.apache.cassandra.tcm.sequences.DropAccordTable; +import org.apache.cassandra.tcm.sequences.DropAccordTable.TableReference; +import org.apache.cassandra.tcm.sequences.LockedRanges; +import org.apache.cassandra.tcm.serialization.AsymmetricMetadataSerializer; +import org.apache.cassandra.tcm.serialization.Version; + +public class PrepareDropAccordTable implements Transformation +{ + public static final Serializer serializer = new Serializer(); + + public final TableReference tableRef; + + public PrepareDropAccordTable(TableReference tableRef) + { + this.tableRef = tableRef; + } + + @Override + public Kind kind() + { + return Kind.PREPARE_DROP_ACCORD_TABLE; + } + + @Override + public Result execute(ClusterMetadata prev) + { + KeyspaceMetadata ks = prev.schema.getKeyspaces().getNullable(tableRef.keyspace); + if (ks == null) + return new Rejected(ExceptionCode.INVALID, "Unknown keyspace " + tableRef.keyspace); + TableMetadata metadata = ks.tables.getNullable(tableRef.id); + if (metadata == null) + return new Rejected(ExceptionCode.INVALID, "Table " + tableRef + " is not known"); Review Comment: is there a reason for using `tableRef` in this response, but the `metadata` in the others? ########## src/java/org/apache/cassandra/schema/TableParams.java: ########## @@ -313,6 +319,7 @@ public String toString() .add(Option.FAST_PATH.toString(), fastPath) .add(Option.TRANSACTIONAL_MODE.toString(), transactionalMode) .add(Option.TRANSACTIONAL_MIGRATION_FROM.toString(), transactionalMigrationFrom) + .add(PENDING_DROP.name(), pendingDrop) Review Comment: other elements use `toString` which lower cases. Also, why are the new accord elements prefixed (except this one)? ########## src/java/org/apache/cassandra/tcm/sequences/DropAccordTable.java: ########## @@ -0,0 +1,364 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.tcm.sequences; + +import java.io.IOException; +import java.time.Duration; +import java.util.Objects; +import java.util.concurrent.ExecutionException; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.cassandra.cql3.ColumnIdentifier; +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.schema.Keyspaces; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.accord.AccordService; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.ClusterMetadataService; +import org.apache.cassandra.tcm.Epoch; +import org.apache.cassandra.tcm.MultiStepOperation; +import org.apache.cassandra.tcm.Transformation; +import org.apache.cassandra.tcm.serialization.AsymmetricMetadataSerializer; +import org.apache.cassandra.tcm.serialization.MetadataSerializer; +import org.apache.cassandra.tcm.serialization.Version; +import org.apache.cassandra.tcm.transformations.FinishDropAccordTable; +import org.apache.cassandra.utils.JVMStabilityInspector; + +import static java.lang.String.format; +import static org.apache.cassandra.tcm.Transformation.Kind.FINISH_DROP_ACCORD_TABLE; +import static org.apache.cassandra.tcm.sequences.SequenceState.continuable; +import static org.apache.cassandra.tcm.sequences.SequenceState.error; +import static org.apache.cassandra.tcm.sequences.SequenceState.halted; +import static org.apache.cassandra.utils.Clock.Global.nanoTime; + +/** + * A slightly atypical implementation as it consists of only a single step. To perform the drop of an + * Accord table, we first commit a PrepareDropAccordTable transformation. Upon enactement, that + * marks the table as pending drop, which blocks any new transactions from being started. It also + * instantiates an instance of this operation and adds it to the set of in progress operations. + * + * The intention is to introduce a barrier which blocks until the Accord service acknowledges that + * it was learned of the epoch in which the table was marked for deletion and that all prior transactions + * are completed. Once this is complete, we can proceed to actually drop the table. The transformation + * which performs that schema modification also removes this MSO from ClusterMetadata's in-flight set. + * This obviates the need to 'advance' this MSO in the way that other implementations with more steps do. + * + */ +public class DropAccordTable extends MultiStepOperation<Epoch> +{ + private static final Logger logger = LoggerFactory.getLogger(DropAccordTable.class); + + public static final Serializer serializer = new Serializer(); + + public final TableReference table; + + /** + * Used by factory method for external callers and by the serializer. + * We don't need to include the serialized FinishDropAccordTable step in the serialization + * of the MSO itself because they have no parameters other than the table reference and so + * we can just construct a new one when we execute it + */ + private DropAccordTable(TableReference table, Epoch latestModification) + { + super(0, latestModification); + this.table = table; + } + + public static DropAccordTable newSequence(TableReference table, Epoch preparedAt) + { + return new DropAccordTable(table, preparedAt); + } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + DropAccordTable that = (DropAccordTable) o; + return latestModification.equals(that.latestModification) + && table.equals(that.table); + } + + @Override + public int hashCode() + { + return Objects.hash(latestModification, table); + } + + @Override + public Kind kind() + { + return Kind.DROP_ACCORD_TABLE; + } + + @Override + protected SequenceKey sequenceKey() + { + return table; + } + + @Override + public MetadataSerializer<? extends SequenceKey> keySerializer() + { + return TableReference.serializer; + } + + @Override + public Transformation.Kind nextStep() + { + return FINISH_DROP_ACCORD_TABLE; + } + + @Override + public SequenceState executeNext() + { + try + { + SequenceState failure = awaitSafeFromAccord(); + if (failure != null) return failure; + } + catch (Throwable t) + { + JVMStabilityInspector.inspectThrowable(t); + logger.warn("Exception while waiting for Accord service to notify all table txns are complete", t); + // this is actually continuable as we can simply retry + return continuable(); + } + try + { + // Now we're satisfied that all Accord txns have finished for the table, + // go ahead and actually drop it + ClusterMetadataService.instance().commit(new FinishDropAccordTable(table)); + return continuable(); + } + catch (Throwable t) + { + JVMStabilityInspector.inspectThrowable(t); + logger.warn("Exception committing finish_drop_accord_table. " + + "Accord service has acknowledged the operation but table remains present in schema", t); + return halted(); + } + } + + private SequenceState awaitSafeFromAccord() throws ExecutionException, InterruptedException + { + // make sure that Accord sees the current epoch, which must necessarily follow the + // one which marked the table as pending drop + ClusterMetadata metadata = ClusterMetadata.current(); + // just for the sake of paranoia, assert that the table is actually marked as being dropped + if (!verifyTableMarked(metadata.schema.getKeyspaces())) + return error(new IllegalStateException(String.format("Table %s is in an invalid state to be dropped", table.toCQLString()))); + + long startNanos = nanoTime(); + AccordService.instance().epochReady(metadata.epoch).get(); + long epochEndNanos = nanoTime(); + + //TODO (correctness, operability): this is more than likely to timeout, so best to "await" an existing txn_id rather than creating a new one... + AccordService.instance().awaitForTableDrop(table.id); + long awaitEndNanos = nanoTime(); + logger.info("Wait for Accord to see the drop table was success. " + + "Took {} to wait for Accord to learn about the change, then {} to process everything", + Duration.ofNanos(epochEndNanos - startNanos), Duration.ofNanos(awaitEndNanos - epochEndNanos)); + return null; + } + + private boolean verifyTableMarked(Keyspaces keyspaces) + { + TableMetadata tm = keyspaces.getTableOrViewNullable(table.id); + if (tm == null) + { + logger.warn("Unable to drop accord table {}, table not found", table); + return false; + } + + if (!tm.params.pendingDrop) + { + logger.warn("Unexpected state, table {} was not marked pending drop", table); + return false; + } + + return true; + } + + @Override + public Transformation.Result applyTo(ClusterMetadata metadata) + { + // note: that this will apply the finish drop transformation to the supplied metadata. It's + // not used to actually execute the MSO, but to determine what the metadata state will/would + // be if it were executed. + return new FinishDropAccordTable(table).execute(metadata); + } + + @Override + public DropAccordTable advance(Epoch epoch) + { + // note: this isn't really used by this MSO impl as it consists of a single step so there's nothing + // to advance. An action of the single step is to remove the MSO from the set of in progress sequences + return new DropAccordTable(this.table, epoch); + } + + @Override + public ProgressBarrier barrier() + { + return ProgressBarrier.immediate(); + } + + public static class TableReference implements SequenceKey, Comparable<TableReference> + { + public static final Serializer serializer = new Serializer(); + + public final String keyspace, name; + public final TableId id; + + public TableReference(String keyspace, String name, TableId id) + { + this.keyspace = normalize(keyspace); Review Comment: notwithstanding the comment above, why do we need `normalize`? ########## src/java/org/apache/cassandra/tcm/sequences/DropAccordTable.java: ########## @@ -0,0 +1,364 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.tcm.sequences; + +import java.io.IOException; +import java.time.Duration; +import java.util.Objects; +import java.util.concurrent.ExecutionException; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.cassandra.cql3.ColumnIdentifier; +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.schema.Keyspaces; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.accord.AccordService; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.ClusterMetadataService; +import org.apache.cassandra.tcm.Epoch; +import org.apache.cassandra.tcm.MultiStepOperation; +import org.apache.cassandra.tcm.Transformation; +import org.apache.cassandra.tcm.serialization.AsymmetricMetadataSerializer; +import org.apache.cassandra.tcm.serialization.MetadataSerializer; +import org.apache.cassandra.tcm.serialization.Version; +import org.apache.cassandra.tcm.transformations.FinishDropAccordTable; +import org.apache.cassandra.utils.JVMStabilityInspector; + +import static java.lang.String.format; +import static org.apache.cassandra.tcm.Transformation.Kind.FINISH_DROP_ACCORD_TABLE; +import static org.apache.cassandra.tcm.sequences.SequenceState.continuable; +import static org.apache.cassandra.tcm.sequences.SequenceState.error; +import static org.apache.cassandra.tcm.sequences.SequenceState.halted; +import static org.apache.cassandra.utils.Clock.Global.nanoTime; + +/** + * A slightly atypical implementation as it consists of only a single step. To perform the drop of an + * Accord table, we first commit a PrepareDropAccordTable transformation. Upon enactement, that + * marks the table as pending drop, which blocks any new transactions from being started. It also + * instantiates an instance of this operation and adds it to the set of in progress operations. + * + * The intention is to introduce a barrier which blocks until the Accord service acknowledges that + * it was learned of the epoch in which the table was marked for deletion and that all prior transactions + * are completed. Once this is complete, we can proceed to actually drop the table. The transformation + * which performs that schema modification also removes this MSO from ClusterMetadata's in-flight set. + * This obviates the need to 'advance' this MSO in the way that other implementations with more steps do. + * + */ +public class DropAccordTable extends MultiStepOperation<Epoch> +{ + private static final Logger logger = LoggerFactory.getLogger(DropAccordTable.class); + + public static final Serializer serializer = new Serializer(); + + public final TableReference table; + + /** + * Used by factory method for external callers and by the serializer. + * We don't need to include the serialized FinishDropAccordTable step in the serialization + * of the MSO itself because they have no parameters other than the table reference and so + * we can just construct a new one when we execute it + */ + private DropAccordTable(TableReference table, Epoch latestModification) + { + super(0, latestModification); + this.table = table; + } + + public static DropAccordTable newSequence(TableReference table, Epoch preparedAt) + { + return new DropAccordTable(table, preparedAt); + } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + DropAccordTable that = (DropAccordTable) o; + return latestModification.equals(that.latestModification) + && table.equals(that.table); + } + + @Override + public int hashCode() + { + return Objects.hash(latestModification, table); + } + + @Override + public Kind kind() + { + return Kind.DROP_ACCORD_TABLE; + } + + @Override + protected SequenceKey sequenceKey() + { + return table; + } + + @Override + public MetadataSerializer<? extends SequenceKey> keySerializer() + { + return TableReference.serializer; + } + + @Override + public Transformation.Kind nextStep() + { + return FINISH_DROP_ACCORD_TABLE; + } + + @Override + public SequenceState executeNext() + { + try + { + SequenceState failure = awaitSafeFromAccord(); + if (failure != null) return failure; + } + catch (Throwable t) + { + JVMStabilityInspector.inspectThrowable(t); + logger.warn("Exception while waiting for Accord service to notify all table txns are complete", t); + // this is actually continuable as we can simply retry + return continuable(); + } + try + { + // Now we're satisfied that all Accord txns have finished for the table, + // go ahead and actually drop it + ClusterMetadataService.instance().commit(new FinishDropAccordTable(table)); + return continuable(); + } + catch (Throwable t) + { + JVMStabilityInspector.inspectThrowable(t); + logger.warn("Exception committing finish_drop_accord_table. " + + "Accord service has acknowledged the operation but table remains present in schema", t); + return halted(); + } + } + + private SequenceState awaitSafeFromAccord() throws ExecutionException, InterruptedException + { + // make sure that Accord sees the current epoch, which must necessarily follow the + // one which marked the table as pending drop + ClusterMetadata metadata = ClusterMetadata.current(); + // just for the sake of paranoia, assert that the table is actually marked as being dropped + if (!verifyTableMarked(metadata.schema.getKeyspaces())) + return error(new IllegalStateException(String.format("Table %s is in an invalid state to be dropped", table.toCQLString()))); + + long startNanos = nanoTime(); + AccordService.instance().epochReady(metadata.epoch).get(); + long epochEndNanos = nanoTime(); + + //TODO (correctness, operability): this is more than likely to timeout, so best to "await" an existing txn_id rather than creating a new one... + AccordService.instance().awaitForTableDrop(table.id); + long awaitEndNanos = nanoTime(); + logger.info("Wait for Accord to see the drop table was success. " + + "Took {} to wait for Accord to learn about the change, then {} to process everything", + Duration.ofNanos(epochEndNanos - startNanos), Duration.ofNanos(awaitEndNanos - epochEndNanos)); + return null; + } + + private boolean verifyTableMarked(Keyspaces keyspaces) + { + TableMetadata tm = keyspaces.getTableOrViewNullable(table.id); + if (tm == null) + { + logger.warn("Unable to drop accord table {}, table not found", table); + return false; + } + + if (!tm.params.pendingDrop) + { + logger.warn("Unexpected state, table {} was not marked pending drop", table); + return false; + } + + return true; + } + + @Override + public Transformation.Result applyTo(ClusterMetadata metadata) + { + // note: that this will apply the finish drop transformation to the supplied metadata. It's + // not used to actually execute the MSO, but to determine what the metadata state will/would + // be if it were executed. + return new FinishDropAccordTable(table).execute(metadata); + } + + @Override + public DropAccordTable advance(Epoch epoch) + { + // note: this isn't really used by this MSO impl as it consists of a single step so there's nothing + // to advance. An action of the single step is to remove the MSO from the set of in progress sequences + return new DropAccordTable(this.table, epoch); + } + + @Override + public ProgressBarrier barrier() + { + return ProgressBarrier.immediate(); + } + + public static class TableReference implements SequenceKey, Comparable<TableReference> + { + public static final Serializer serializer = new Serializer(); + + public final String keyspace, name; Review Comment: `keyspace` and `name` aren't needed. The id is the unique identifier for the table and anywhere we need access to the `KeyspaceMetadata` itself we can get it by dereferencing the `TableMetadata` and using the keyspace name to get it (we only do this for validation/logging/testing). ########## src/java/org/apache/cassandra/cql3/statements/schema/DropKeyspaceStatement.java: ########## @@ -45,8 +50,30 @@ public Keyspaces apply(ClusterMetadata metadata) Guardrails.dropKeyspaceEnabled.ensureEnabled(state); Keyspaces schema = metadata.schema.getKeyspaces(); - if (schema.containsKeyspace(keyspaceName)) + KeyspaceMetadata keyspace = schema.getNullable(keyspaceName); + if (keyspace != null) + { + // check that no accord tables in the keyspace are currently in the process of being dropped + List<TableMetadata> pendingDrop = keyspace.tables.stream() + .filter(t -> t.params.pendingDrop) + .collect(Collectors.toList()); + if (!pendingDrop.isEmpty()) + throw ire("Cannot drop keyspace '%s' as it contains accord tables which are currently being dropped. " + + "Please wait for those operations to complete before dropping the keyspace. (%s)", + keyspaceName, pendingDrop.stream() + .map(Object::toString) + .collect(Collectors.joining(","))); + + List<TableMetadata> accordTables = keyspace.tables.stream() + .filter(TableMetadata::isAccordEnabled) + .collect(Collectors.toList()); + if (!accordTables.isEmpty()) Review Comment: If you're adding this condition, is the first one still necessary? ########## src/java/org/apache/cassandra/tools/NodeTool.java: ########## @@ -262,7 +262,8 @@ public int execute(String... args) .withCommand(CMSAdmin.InitializeCMS.class) .withCommand(CMSAdmin.ReconfigureCMS.class) .withCommand(CMSAdmin.Snapshot.class) - .withCommand(CMSAdmin.Unregister.class); + .withCommand(CMSAdmin.Unregister.class) + .withCommand(CMSAdmin.ResumeDropAccordTable.class); Review Comment: I couldn't decide whether this belonged as a subcommand of `nodetool cms` or `nodetool accord`, wdyt? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]

