beobal commented on code in PR #3574:
URL: https://github.com/apache/cassandra/pull/3574#discussion_r1778529392


##########
src/java/org/apache/cassandra/tcm/sequences/DropAccordTable.java:
##########
@@ -0,0 +1,364 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.tcm.sequences;
+
+import java.io.IOException;
+import java.time.Duration;
+import java.util.Objects;
+import java.util.concurrent.ExecutionException;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.cql3.ColumnIdentifier;
+import org.apache.cassandra.db.TypeSizes;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.schema.Keyspaces;
+import org.apache.cassandra.schema.TableId;
+import org.apache.cassandra.schema.TableMetadata;
+import org.apache.cassandra.service.accord.AccordService;
+import org.apache.cassandra.tcm.ClusterMetadata;
+import org.apache.cassandra.tcm.ClusterMetadataService;
+import org.apache.cassandra.tcm.Epoch;
+import org.apache.cassandra.tcm.MultiStepOperation;
+import org.apache.cassandra.tcm.Transformation;
+import org.apache.cassandra.tcm.serialization.AsymmetricMetadataSerializer;
+import org.apache.cassandra.tcm.serialization.MetadataSerializer;
+import org.apache.cassandra.tcm.serialization.Version;
+import org.apache.cassandra.tcm.transformations.FinishDropAccordTable;
+import org.apache.cassandra.utils.JVMStabilityInspector;
+
+import static java.lang.String.format;
+import static 
org.apache.cassandra.tcm.Transformation.Kind.FINISH_DROP_ACCORD_TABLE;
+import static org.apache.cassandra.tcm.sequences.SequenceState.continuable;
+import static org.apache.cassandra.tcm.sequences.SequenceState.error;
+import static org.apache.cassandra.tcm.sequences.SequenceState.halted;
+import static org.apache.cassandra.utils.Clock.Global.nanoTime;
+
+/**
+ * A slightly atypical implementation as it consists of only a single step. To 
perform the drop of an
+ * Accord table, we first commit a PrepareDropAccordTable transformation. Upon 
enactement, that
+ * marks the table as pending drop, which blocks any new transactions from 
being started. It also
+ * instantiates an instance of this operation and adds it to the set of in 
progress operations.
+ *
+ * The intention is to introduce a barrier which blocks until the Accord 
service acknowledges that
+ * it was learned of the epoch in which the table was marked for deletion and 
that all prior transactions
+ * are completed. Once this is complete, we can proceed to actually drop the 
table. The transformation
+ * which performs that schema modification also removes this MSO from 
ClusterMetadata's in-flight set.
+ * This obviates the need to 'advance' this MSO in the way that other 
implementations with more steps do.
+ *
+ */
+public class DropAccordTable extends MultiStepOperation<Epoch>
+{
+    private static final Logger logger = 
LoggerFactory.getLogger(DropAccordTable.class);
+
+    public static final Serializer serializer = new Serializer();
+
+    public final TableReference table;
+
+    /**
+     * Used by factory method for external callers and by the serializer.
+     * We don't need to include the serialized FinishDropAccordTable step in 
the serialization
+     * of the MSO itself because they have no parameters other than the table 
reference and so
+     * we can just construct a new one when we execute it
+     */
+    private DropAccordTable(TableReference table, Epoch latestModification)
+    {
+        super(0, latestModification);
+        this.table = table;
+    }
+
+    public static DropAccordTable newSequence(TableReference table, Epoch 
preparedAt)

Review Comment:
   Nit: in existing MSO impls the static factory method comes before the 
private constructor.  Obviously it doesn't really matter but it feels odd to do 
it differently in one place. 



##########
src/java/org/apache/cassandra/tcm/sequences/DropAccordTable.java:
##########
@@ -0,0 +1,364 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.tcm.sequences;
+
+import java.io.IOException;
+import java.time.Duration;
+import java.util.Objects;
+import java.util.concurrent.ExecutionException;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.cql3.ColumnIdentifier;
+import org.apache.cassandra.db.TypeSizes;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.schema.Keyspaces;
+import org.apache.cassandra.schema.TableId;
+import org.apache.cassandra.schema.TableMetadata;
+import org.apache.cassandra.service.accord.AccordService;
+import org.apache.cassandra.tcm.ClusterMetadata;
+import org.apache.cassandra.tcm.ClusterMetadataService;
+import org.apache.cassandra.tcm.Epoch;
+import org.apache.cassandra.tcm.MultiStepOperation;
+import org.apache.cassandra.tcm.Transformation;
+import org.apache.cassandra.tcm.serialization.AsymmetricMetadataSerializer;
+import org.apache.cassandra.tcm.serialization.MetadataSerializer;
+import org.apache.cassandra.tcm.serialization.Version;
+import org.apache.cassandra.tcm.transformations.FinishDropAccordTable;
+import org.apache.cassandra.utils.JVMStabilityInspector;
+
+import static java.lang.String.format;
+import static 
org.apache.cassandra.tcm.Transformation.Kind.FINISH_DROP_ACCORD_TABLE;
+import static org.apache.cassandra.tcm.sequences.SequenceState.continuable;
+import static org.apache.cassandra.tcm.sequences.SequenceState.error;
+import static org.apache.cassandra.tcm.sequences.SequenceState.halted;
+import static org.apache.cassandra.utils.Clock.Global.nanoTime;
+
+/**
+ * A slightly atypical implementation as it consists of only a single step. To 
perform the drop of an
+ * Accord table, we first commit a PrepareDropAccordTable transformation. Upon 
enactement, that
+ * marks the table as pending drop, which blocks any new transactions from 
being started. It also
+ * instantiates an instance of this operation and adds it to the set of in 
progress operations.
+ *
+ * The intention is to introduce a barrier which blocks until the Accord 
service acknowledges that
+ * it was learned of the epoch in which the table was marked for deletion and 
that all prior transactions
+ * are completed. Once this is complete, we can proceed to actually drop the 
table. The transformation
+ * which performs that schema modification also removes this MSO from 
ClusterMetadata's in-flight set.
+ * This obviates the need to 'advance' this MSO in the way that other 
implementations with more steps do.
+ *
+ */
+public class DropAccordTable extends MultiStepOperation<Epoch>
+{
+    private static final Logger logger = 
LoggerFactory.getLogger(DropAccordTable.class);
+
+    public static final Serializer serializer = new Serializer();
+
+    public final TableReference table;
+
+    /**
+     * Used by factory method for external callers and by the serializer.
+     * We don't need to include the serialized FinishDropAccordTable step in 
the serialization
+     * of the MSO itself because they have no parameters other than the table 
reference and so
+     * we can just construct a new one when we execute it
+     */
+    private DropAccordTable(TableReference table, Epoch latestModification)
+    {
+        super(0, latestModification);
+        this.table = table;
+    }
+
+    public static DropAccordTable newSequence(TableReference table, Epoch 
preparedAt)
+    {
+        return new DropAccordTable(table, preparedAt);
+    }
+
+    @Override
+    public boolean equals(Object o)
+    {
+        if (this == o) return true;
+        if (o == null || getClass() != o.getClass()) return false;
+        DropAccordTable that = (DropAccordTable) o;
+        return latestModification.equals(that.latestModification)
+               && table.equals(that.table);
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return Objects.hash(latestModification, table);
+    }
+
+    @Override
+    public Kind kind()
+    {
+        return Kind.DROP_ACCORD_TABLE;
+    }
+
+    @Override
+    protected SequenceKey sequenceKey()
+    {
+        return table;
+    }
+
+    @Override
+    public MetadataSerializer<? extends SequenceKey> keySerializer()
+    {
+        return TableReference.serializer;
+    }
+
+    @Override
+    public Transformation.Kind nextStep()
+    {
+        return FINISH_DROP_ACCORD_TABLE;
+    }
+
+    @Override
+    public SequenceState executeNext()
+    {
+        try
+        {
+            SequenceState failure = awaitSafeFromAccord();

Review Comment:
   I think we ought to include some backoff & retry around this or in 
`awaitSafeFromAccord` itself. The comment there says that "this is more than 
likely to timeout", which will result in us dropping into the catch block, 
returning `continuable` and the IPS machinery calling `executeNext` again. 
   
   This will happen in a tight loop, so if the await is actually quite likely 
to time out, we should take care of retrying here, both to control the rate and 
to cap the maximum number of attempts. If we reach that cap we can return 
`halted()`, which will require operator intervention to proceed (analogous to a 
streaming failure during a bootstrap MSO, which needs to be resumed by an operator). 
 



##########
src/java/org/apache/cassandra/service/accord/AccordService.java:
##########
@@ -1073,4 +1087,227 @@ public CompactionInfo getCompactionInfo()
         }));
         return new CompactionInfo(redundantBefores, ranges, 
durableBefore.get());
     }
+
+    @Override
+    public void awaitForTableDrop(TableId id)

Review Comment:
   This method name is a bit odd - it should be `awaitTableDrop` or 
`waitForTableDrop`.



##########
src/java/org/apache/cassandra/tcm/transformations/FinishDropAccordTable.java:
##########
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.tcm.transformations;
+
+import java.io.IOException;
+import java.util.Objects;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.schema.DistributedSchema;
+import org.apache.cassandra.schema.KeyspaceMetadata;
+import org.apache.cassandra.schema.Keyspaces;
+import org.apache.cassandra.schema.TableMetadata;
+import org.apache.cassandra.tcm.ClusterMetadata;
+import org.apache.cassandra.tcm.Transformation;
+import org.apache.cassandra.tcm.sequences.DropAccordTable.TableReference;
+import org.apache.cassandra.tcm.sequences.LockedRanges;
+import org.apache.cassandra.tcm.serialization.AsymmetricMetadataSerializer;
+import org.apache.cassandra.tcm.serialization.Version;
+
+import static 
org.apache.cassandra.tcm.Transformation.Kind.FINISH_DROP_ACCORD_TABLE;
+
+/**
+ *

Review Comment:
   nit: can you pull the general part of the comment out from `execute` up to 
here (sorry, I left it in this state) 



##########
src/java/org/apache/cassandra/tcm/transformations/PrepareDropAccordTable.java:
##########
@@ -0,0 +1,117 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.tcm.transformations;
+
+import java.io.IOException;
+import java.util.Objects;
+
+import org.apache.cassandra.exceptions.ExceptionCode;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.schema.DistributedSchema;
+import org.apache.cassandra.schema.KeyspaceMetadata;
+import org.apache.cassandra.schema.TableMetadata;
+import org.apache.cassandra.tcm.ClusterMetadata;
+import org.apache.cassandra.tcm.Transformation;
+import org.apache.cassandra.tcm.sequences.DropAccordTable;
+import org.apache.cassandra.tcm.sequences.DropAccordTable.TableReference;
+import org.apache.cassandra.tcm.sequences.LockedRanges;
+import org.apache.cassandra.tcm.serialization.AsymmetricMetadataSerializer;
+import org.apache.cassandra.tcm.serialization.Version;
+
+public class PrepareDropAccordTable implements Transformation
+{
+    public static final Serializer serializer = new Serializer();
+
+    public final TableReference tableRef;
+
+    public PrepareDropAccordTable(TableReference tableRef)
+    {
+        this.tableRef = tableRef;
+    }
+
+    @Override
+    public Kind kind()
+    {
+        return Kind.PREPARE_DROP_ACCORD_TABLE;
+    }
+
+    @Override
+    public Result execute(ClusterMetadata prev)
+    {
+        KeyspaceMetadata ks = 
prev.schema.getKeyspaces().getNullable(tableRef.keyspace);
+        if (ks == null)
+            return new Rejected(ExceptionCode.INVALID, "Unknown keyspace " + 
tableRef.keyspace);
+        TableMetadata metadata = ks.tables.getNullable(tableRef.id);
+        if (metadata == null)
+            return new Rejected(ExceptionCode.INVALID, "Table " + tableRef + " 
is not known");

Review Comment:
   is there a reason for using `tableRef` in this response, but the `metadata` 
in the others?



##########
src/java/org/apache/cassandra/schema/TableParams.java:
##########
@@ -313,6 +319,7 @@ public String toString()
                           .add(Option.FAST_PATH.toString(), fastPath)
                           .add(Option.TRANSACTIONAL_MODE.toString(), 
transactionalMode)
                           .add(Option.TRANSACTIONAL_MIGRATION_FROM.toString(), 
transactionalMigrationFrom)
+                          .add(PENDING_DROP.name(), pendingDrop)

Review Comment:
   Other elements use `toString`, which lower-cases the name, whereas this one 
uses `name()`. Also, why are the new accord elements prefixed (except this one)?



##########
src/java/org/apache/cassandra/tcm/sequences/DropAccordTable.java:
##########
@@ -0,0 +1,364 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.tcm.sequences;
+
+import java.io.IOException;
+import java.time.Duration;
+import java.util.Objects;
+import java.util.concurrent.ExecutionException;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.cql3.ColumnIdentifier;
+import org.apache.cassandra.db.TypeSizes;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.schema.Keyspaces;
+import org.apache.cassandra.schema.TableId;
+import org.apache.cassandra.schema.TableMetadata;
+import org.apache.cassandra.service.accord.AccordService;
+import org.apache.cassandra.tcm.ClusterMetadata;
+import org.apache.cassandra.tcm.ClusterMetadataService;
+import org.apache.cassandra.tcm.Epoch;
+import org.apache.cassandra.tcm.MultiStepOperation;
+import org.apache.cassandra.tcm.Transformation;
+import org.apache.cassandra.tcm.serialization.AsymmetricMetadataSerializer;
+import org.apache.cassandra.tcm.serialization.MetadataSerializer;
+import org.apache.cassandra.tcm.serialization.Version;
+import org.apache.cassandra.tcm.transformations.FinishDropAccordTable;
+import org.apache.cassandra.utils.JVMStabilityInspector;
+
+import static java.lang.String.format;
+import static 
org.apache.cassandra.tcm.Transformation.Kind.FINISH_DROP_ACCORD_TABLE;
+import static org.apache.cassandra.tcm.sequences.SequenceState.continuable;
+import static org.apache.cassandra.tcm.sequences.SequenceState.error;
+import static org.apache.cassandra.tcm.sequences.SequenceState.halted;
+import static org.apache.cassandra.utils.Clock.Global.nanoTime;
+
+/**
+ * A slightly atypical implementation as it consists of only a single step. To 
perform the drop of an
+ * Accord table, we first commit a PrepareDropAccordTable transformation. Upon 
enactement, that
+ * marks the table as pending drop, which blocks any new transactions from 
being started. It also
+ * instantiates an instance of this operation and adds it to the set of in 
progress operations.
+ *
+ * The intention is to introduce a barrier which blocks until the Accord 
service acknowledges that
+ * it was learned of the epoch in which the table was marked for deletion and 
that all prior transactions
+ * are completed. Once this is complete, we can proceed to actually drop the 
table. The transformation
+ * which performs that schema modification also removes this MSO from 
ClusterMetadata's in-flight set.
+ * This obviates the need to 'advance' this MSO in the way that other 
implementations with more steps do.
+ *
+ */
+public class DropAccordTable extends MultiStepOperation<Epoch>
+{
+    private static final Logger logger = 
LoggerFactory.getLogger(DropAccordTable.class);
+
+    public static final Serializer serializer = new Serializer();
+
+    public final TableReference table;
+
+    /**
+     * Used by factory method for external callers and by the serializer.
+     * We don't need to include the serialized FinishDropAccordTable step in 
the serialization
+     * of the MSO itself because they have no parameters other than the table 
reference and so
+     * we can just construct a new one when we execute it
+     */
+    private DropAccordTable(TableReference table, Epoch latestModification)
+    {
+        super(0, latestModification);
+        this.table = table;
+    }
+
+    public static DropAccordTable newSequence(TableReference table, Epoch 
preparedAt)
+    {
+        return new DropAccordTable(table, preparedAt);
+    }
+
+    @Override
+    public boolean equals(Object o)
+    {
+        if (this == o) return true;
+        if (o == null || getClass() != o.getClass()) return false;
+        DropAccordTable that = (DropAccordTable) o;
+        return latestModification.equals(that.latestModification)
+               && table.equals(that.table);
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return Objects.hash(latestModification, table);
+    }
+
+    @Override
+    public Kind kind()
+    {
+        return Kind.DROP_ACCORD_TABLE;
+    }
+
+    @Override
+    protected SequenceKey sequenceKey()
+    {
+        return table;
+    }
+
+    @Override
+    public MetadataSerializer<? extends SequenceKey> keySerializer()
+    {
+        return TableReference.serializer;
+    }
+
+    @Override
+    public Transformation.Kind nextStep()
+    {
+        return FINISH_DROP_ACCORD_TABLE;
+    }
+
+    @Override
+    public SequenceState executeNext()
+    {
+        try
+        {
+            SequenceState failure = awaitSafeFromAccord();
+            if (failure != null) return failure;
+        }
+        catch (Throwable t)
+        {
+            JVMStabilityInspector.inspectThrowable(t);
+            logger.warn("Exception while waiting for Accord service to notify 
all table txns are complete", t);
+            // this is actually continuable as we can simply retry
+            return continuable();
+        }
+        try
+        {
+            // Now we're satisfied that all Accord txns have finished for the 
table,
+            // go ahead and actually drop it
+            ClusterMetadataService.instance().commit(new 
FinishDropAccordTable(table));
+            return continuable();
+        }
+        catch (Throwable t)
+        {
+            JVMStabilityInspector.inspectThrowable(t);
+            logger.warn("Exception committing finish_drop_accord_table. " +
+                        "Accord service has acknowledged the operation but 
table remains present in schema", t);
+            return halted();
+        }
+    }
+
+    private SequenceState awaitSafeFromAccord() throws ExecutionException, 
InterruptedException
+    {
+        // make sure that Accord sees the current epoch, which must 
necessarily follow the
+        // one which marked the table as pending drop
+        ClusterMetadata metadata = ClusterMetadata.current();
+        // just for the sake of paranoia, assert that the table is actually 
marked as being dropped
+        if (!verifyTableMarked(metadata.schema.getKeyspaces()))
+            return error(new IllegalStateException(String.format("Table %s is 
in an invalid state to be dropped", table.toCQLString())));
+
+        long startNanos = nanoTime();
+        AccordService.instance().epochReady(metadata.epoch).get();
+        long epochEndNanos = nanoTime();
+
+        //TODO (correctness, operability): this is more than likely to 
timeout, so best to "await" an existing txn_id rather than creating a new one...
+        AccordService.instance().awaitForTableDrop(table.id);
+        long awaitEndNanos = nanoTime();
+        logger.info("Wait for Accord to see the drop table was success.  " +
+                    "Took {} to wait for Accord to learn about the change, 
then {} to process everything",
+                    Duration.ofNanos(epochEndNanos - startNanos), 
Duration.ofNanos(awaitEndNanos - epochEndNanos));
+        return null;
+    }
+
+    private boolean verifyTableMarked(Keyspaces keyspaces)
+    {
+        TableMetadata tm = keyspaces.getTableOrViewNullable(table.id);
+        if (tm == null)
+        {
+            logger.warn("Unable to drop accord table {}, table not found", 
table);
+            return false;
+        }
+
+        if (!tm.params.pendingDrop)
+        {
+            logger.warn("Unexpected state, table {} was not marked pending 
drop", table);
+            return false;
+        }
+
+        return true;
+    }
+
+    @Override
+    public Transformation.Result applyTo(ClusterMetadata metadata)
+    {
+        // note: that this will apply the finish drop transformation to the 
supplied metadata. It's
+        // not used to actually execute the MSO, but to determine what the 
metadata state will/would
+        // be if it were executed.
+        return new FinishDropAccordTable(table).execute(metadata);
+    }
+
+    @Override
+    public DropAccordTable advance(Epoch epoch)
+    {
+        // note: this isn't really used by this MSO impl as it consists of a 
single step so there's nothing
+        // to advance. An action of the single step is to remove the MSO from 
the set of in progress sequences
+        return new DropAccordTable(this.table, epoch);
+    }
+
+    @Override
+    public ProgressBarrier barrier()
+    {
+        return ProgressBarrier.immediate();
+    }
+
+    public static class TableReference implements SequenceKey, 
Comparable<TableReference>
+    {
+        public static final Serializer serializer = new Serializer();
+
+        public final String keyspace, name;
+        public final TableId id;
+
+        public TableReference(String keyspace, String name, TableId id)
+        {
+            this.keyspace = normalize(keyspace);

Review Comment:
   notwithstanding the comment above, why do we need `normalize`?



##########
src/java/org/apache/cassandra/tcm/sequences/DropAccordTable.java:
##########
@@ -0,0 +1,364 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.tcm.sequences;
+
+import java.io.IOException;
+import java.time.Duration;
+import java.util.Objects;
+import java.util.concurrent.ExecutionException;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.cql3.ColumnIdentifier;
+import org.apache.cassandra.db.TypeSizes;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.schema.Keyspaces;
+import org.apache.cassandra.schema.TableId;
+import org.apache.cassandra.schema.TableMetadata;
+import org.apache.cassandra.service.accord.AccordService;
+import org.apache.cassandra.tcm.ClusterMetadata;
+import org.apache.cassandra.tcm.ClusterMetadataService;
+import org.apache.cassandra.tcm.Epoch;
+import org.apache.cassandra.tcm.MultiStepOperation;
+import org.apache.cassandra.tcm.Transformation;
+import org.apache.cassandra.tcm.serialization.AsymmetricMetadataSerializer;
+import org.apache.cassandra.tcm.serialization.MetadataSerializer;
+import org.apache.cassandra.tcm.serialization.Version;
+import org.apache.cassandra.tcm.transformations.FinishDropAccordTable;
+import org.apache.cassandra.utils.JVMStabilityInspector;
+
+import static java.lang.String.format;
+import static 
org.apache.cassandra.tcm.Transformation.Kind.FINISH_DROP_ACCORD_TABLE;
+import static org.apache.cassandra.tcm.sequences.SequenceState.continuable;
+import static org.apache.cassandra.tcm.sequences.SequenceState.error;
+import static org.apache.cassandra.tcm.sequences.SequenceState.halted;
+import static org.apache.cassandra.utils.Clock.Global.nanoTime;
+
+/**
+ * A slightly atypical implementation as it consists of only a single step. To 
perform the drop of an
+ * Accord table, we first commit a PrepareDropAccordTable transformation. Upon 
enactement, that
+ * marks the table as pending drop, which blocks any new transactions from 
being started. It also
+ * instantiates an instance of this operation and adds it to the set of in 
progress operations.
+ *
+ * The intention is to introduce a barrier which blocks until the Accord 
service acknowledges that
+ * it was learned of the epoch in which the table was marked for deletion and 
that all prior transactions
+ * are completed. Once this is complete, we can proceed to actually drop the 
table. The transformation
+ * which performs that schema modification also removes this MSO from 
ClusterMetadata's in-flight set.
+ * This obviates the need to 'advance' this MSO in the way that other 
implementations with more steps do.
+ *
+ */
+public class DropAccordTable extends MultiStepOperation<Epoch>
+{
+    private static final Logger logger = 
LoggerFactory.getLogger(DropAccordTable.class);
+
+    public static final Serializer serializer = new Serializer();
+
+    public final TableReference table;
+
+    /**
+     * Used by factory method for external callers and by the serializer.
+     * We don't need to include the serialized FinishDropAccordTable step in 
the serialization
+     * of the MSO itself because they have no parameters other than the table 
reference and so
+     * we can just construct a new one when we execute it
+     */
+    private DropAccordTable(TableReference table, Epoch latestModification)
+    {
+        super(0, latestModification);
+        this.table = table;
+    }
+
+    public static DropAccordTable newSequence(TableReference table, Epoch 
preparedAt)
+    {
+        return new DropAccordTable(table, preparedAt);
+    }
+
+    @Override
+    public boolean equals(Object o)
+    {
+        if (this == o) return true;
+        if (o == null || getClass() != o.getClass()) return false;
+        DropAccordTable that = (DropAccordTable) o;
+        return latestModification.equals(that.latestModification)
+               && table.equals(that.table);
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return Objects.hash(latestModification, table);
+    }
+
+    @Override
+    public Kind kind()
+    {
+        return Kind.DROP_ACCORD_TABLE;
+    }
+
+    @Override
+    protected SequenceKey sequenceKey()
+    {
+        return table;
+    }
+
+    @Override
+    public MetadataSerializer<? extends SequenceKey> keySerializer()
+    {
+        return TableReference.serializer;
+    }
+
+    @Override
+    public Transformation.Kind nextStep()
+    {
+        return FINISH_DROP_ACCORD_TABLE;
+    }
+
+    @Override
+    public SequenceState executeNext()
+    {
+        try
+        {
+            SequenceState failure = awaitSafeFromAccord();
+            if (failure != null) return failure;
+        }
+        catch (Throwable t)
+        {
+            JVMStabilityInspector.inspectThrowable(t);
+            logger.warn("Exception while waiting for Accord service to notify 
all table txns are complete", t);
+            // this is actually continuable as we can simply retry
+            return continuable();
+        }
+        try
+        {
+            // Now we're satisfied that all Accord txns have finished for the 
table,
+            // go ahead and actually drop it
+            ClusterMetadataService.instance().commit(new 
FinishDropAccordTable(table));
+            return continuable();
+        }
+        catch (Throwable t)
+        {
+            JVMStabilityInspector.inspectThrowable(t);
+            logger.warn("Exception committing finish_drop_accord_table. " +
+                        "Accord service has acknowledged the operation but 
table remains present in schema", t);
+            return halted();
+        }
+    }
+
+    private SequenceState awaitSafeFromAccord() throws ExecutionException, 
InterruptedException
+    {
+        // make sure that Accord sees the current epoch, which must 
necessarily follow the
+        // one which marked the table as pending drop
+        ClusterMetadata metadata = ClusterMetadata.current();
+        // just for the sake of paranoia, assert that the table is actually 
marked as being dropped
+        if (!verifyTableMarked(metadata.schema.getKeyspaces()))
+            return error(new IllegalStateException(String.format("Table %s is 
in an invalid state to be dropped", table.toCQLString())));
+
+        long startNanos = nanoTime();
+        AccordService.instance().epochReady(metadata.epoch).get();
+        long epochEndNanos = nanoTime();
+
+        //TODO (correctness, operability): this is more than likely to 
timeout, so best to "await" an existing txn_id rather than creating a new one...
+        AccordService.instance().awaitForTableDrop(table.id);
+        long awaitEndNanos = nanoTime();
+        logger.info("Wait for Accord to see the drop table was success.  " +
+                    "Took {} to wait for Accord to learn about the change, 
then {} to process everything",
+                    Duration.ofNanos(epochEndNanos - startNanos), 
Duration.ofNanos(awaitEndNanos - epochEndNanos));
+        return null;
+    }
+
+    private boolean verifyTableMarked(Keyspaces keyspaces)
+    {
+        TableMetadata tm = keyspaces.getTableOrViewNullable(table.id);
+        if (tm == null)
+        {
+            logger.warn("Unable to drop accord table {}, table not found", 
table);
+            return false;
+        }
+
+        if (!tm.params.pendingDrop)
+        {
+            logger.warn("Unexpected state, table {} was not marked pending 
drop", table);
+            return false;
+        }
+
+        return true;
+    }
+
+    @Override
+    public Transformation.Result applyTo(ClusterMetadata metadata)
+    {
+        // note: that this will apply the finish drop transformation to the 
supplied metadata. It's
+        // not used to actually execute the MSO, but to determine what the 
metadata state will/would
+        // be if it were executed.
+        return new FinishDropAccordTable(table).execute(metadata);
+    }
+
+    @Override
+    public DropAccordTable advance(Epoch epoch)
+    {
+        // note: this isn't really used by this MSO impl as it consists of a 
single step so there's nothing
+        // to advance. An action of the single step is to remove the MSO from 
the set of in progress sequences
+        return new DropAccordTable(this.table, epoch);
+    }
+
+    @Override
+    public ProgressBarrier barrier()
+    {
+        return ProgressBarrier.immediate();
+    }
+
+    public static class TableReference implements SequenceKey, 
Comparable<TableReference>
+    {
+        public static final Serializer serializer = new Serializer();
+
+        public final String keyspace, name;

Review Comment:
   `keyspace` and `name` aren't needed. The id is the unique identifier for the 
table, and anywhere we need access to the `KeyspaceMetadata` itself we can 
look up the `TableMetadata` by id and use its keyspace name to fetch it 
(we only do this for validation/logging/testing).



##########
src/java/org/apache/cassandra/cql3/statements/schema/DropKeyspaceStatement.java:
##########
@@ -45,8 +50,30 @@ public Keyspaces apply(ClusterMetadata metadata)
         Guardrails.dropKeyspaceEnabled.ensureEnabled(state);
 
         Keyspaces schema = metadata.schema.getKeyspaces();
-        if (schema.containsKeyspace(keyspaceName))
+        KeyspaceMetadata keyspace = schema.getNullable(keyspaceName);
+        if (keyspace != null)
+        {
+            // check that no accord tables in the keyspace are currently in 
the process of being dropped
+            List<TableMetadata> pendingDrop = keyspace.tables.stream()
+                                                             .filter(t -> 
t.params.pendingDrop)
+                                                             
.collect(Collectors.toList());
+            if (!pendingDrop.isEmpty())
+                throw ire("Cannot drop keyspace '%s' as it contains accord 
tables which are currently being dropped. " +
+                          "Please wait for those operations to complete before 
dropping the keyspace. (%s)",
+                          keyspaceName, pendingDrop.stream()
+                                                   .map(Object::toString)
+                                                   
.collect(Collectors.joining(",")));
+
+            List<TableMetadata> accordTables = keyspace.tables.stream()
+                                               
.filter(TableMetadata::isAccordEnabled)
+                                               .collect(Collectors.toList());
+            if (!accordTables.isEmpty())

Review Comment:
   If you're adding this condition, is the first one still necessary? 



##########
src/java/org/apache/cassandra/tools/NodeTool.java:
##########
@@ -262,7 +262,8 @@ public int execute(String... args)
                .withCommand(CMSAdmin.InitializeCMS.class)
                .withCommand(CMSAdmin.ReconfigureCMS.class)
                .withCommand(CMSAdmin.Snapshot.class)
-               .withCommand(CMSAdmin.Unregister.class);
+               .withCommand(CMSAdmin.Unregister.class)
+               .withCommand(CMSAdmin.ResumeDropAccordTable.class);

Review Comment:
   I couldn't decide whether this belonged as a subcommand of `nodetool cms` 
or `nodetool accord`. What do you think?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]


Reply via email to