[
https://issues.apache.org/jira/browse/CASSANDRA-19144?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17818010#comment-17818010
]
Ekaterina Dimitrova edited comment on CASSANDRA-19144 at 2/16/24 3:51 PM:
--------------------------------------------------------------------------
I rebased my dtest repo and reinstalled the latest ccm changes from
CASSANDRA-18753
The command I use to run the tests locally:
{code:java}
pytest --use-vnodes --log-cli-level="DEBUG"
replace_address_test.py::TestReplaceAddress::test_replace_with_insufficient_replicas
--cassandra-dir=...{code}
Locally with current 5.0 branch with paxos V2 and no other config changes the
test fails for me:
{code:java}
replace_address_test.py:636:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
_ _ _
self = <ccmlib.node.Node object at 0x10527e550>, exprs = 'Unable to find
sufficient sources for streaming range', from_mark = None, timeout = 600,
process = None, verbose = False, filename = 'system.log',
error_on_pid_terminated = False
def watch_log_for(self, exprs, from_mark=None, timeout=600,
process=None, verbose=False, filename='system.log',
error_on_pid_terminated=False):
"""
Watch the log until one or more (regular) expressions are found or
timeouts (a
TimeoutError is then raised). On successful completion, a list of pair
(line matched,
match object) is returned.
Will raise NodeError if error_on_pit_terminated is True and C* pid is
not running.
"""
start = time.time()
tofind = [exprs] if isinstance(exprs, string_types) else exprs
tofind = [re.compile(e) for e in tofind]
matchings = []
reads = ""
if len(tofind) == 0:
return None
log_file = os.path.join(self.log_directory(), filename)
output_read = False
while not os.path.exists(log_file):
time.sleep(.5)
TimeoutError.raise_if_passed(start=start, timeout=timeout,
node=self.name,
msg="Timed out waiting for {} to be
created.".format(log_file))
if process and not output_read:
process.poll()
if process.returncode is not None:
self.print_process_output(self.name, process, verbose)
output_read = True
if process.returncode != 0:
raise RuntimeError() # Shouldn't reuse RuntimeError
but I'm lazy
with open(log_file) as f:
if from_mark:
f.seek(from_mark)
while True:
# First, if we have a process to check, then check it.
# Skip on Windows - stdout/stderr is cassandra.bat
if not common.is_win() and not output_read:
if process:
process.poll()
if process.returncode is not None:
self.print_process_output(self.name, process,
verbose)
output_read = True
if process.returncode != 0:
raise RuntimeError() # Shouldn't reuse
RuntimeError but I'm lazy
line = f.readline()
if line:
reads = reads + line
for e in tofind:
m = e.search(line)
if m:
matchings.append((line, m))
tofind.remove(e)
if len(tofind) == 0:
return matchings[0] if isinstance(exprs,
string_types) else matchings
else:
# wait for the situation to clarify, either stop or just a
pause in log production
> time.sleep(1)
E Failed: Timeout >900.0s
../dtest/lib/python3.8/site-packages/ccmlib/node.py:603: Failed
{code}
{code:java}
==============================================================================================================
short test summary info
==============================================================================================================
FAILED
replace_address_test.py::TestReplaceAddress::test_replace_with_insufficient_replicas
- Failed: Timeout >900.0s
ERROR
replace_address_test.py::TestReplaceAddress::test_replace_with_insufficient_replicas
- Failed: Unexpected error found in node logs (see stdout for full details).
Errors: [[node1] 'ERROR [Messaging-EventLoop-3-6] 2024-02-16 10:08:25,888
NoSpamLogger.java:110 -
/127.0.0.1:7000->/127.0.0.3:7000-URGENT_MESSAGES-[no-channel] faile...
{code}
This is how it fails with v2:
{code:java}
____________________________________________________________________________________________
TestReplaceAddress.test_replace_with_insufficient_replicas
_____________________________________________________________________________________________
self = <replace_address_test.TestReplaceAddress object at 0x102a927f0>
def test_replace_with_insufficient_replicas(self):
"""
Test that replace fails when there are insufficient replicas
@jira_ticket CASSANDRA-11848
"""
self.fixture_dtest_setup.ignore_log_patterns =
list(self.fixture_dtest_setup.ignore_log_patterns) + [
r'Unable to find sufficient sources for streaming range']
self._setup(n=3)
self._insert_data(rf=2)
self._stop_node_to_replace()
# stop other replica. For CEP-21 purposes, we need to ensure this is not
# the cluster's only CMS member
logger.debug("Stopping other replica")
self.cluster.nodelist()[1].stop(wait_other_notice=True)
options = {}
# CEP-21: availability is intentionally degraded, so we lower the
required number of
# acks required for the replacement node to progress to a point where
it attempts to
# perform streaming
if self.cluster.version() >= LooseVersion('5.1'):
options = {'progress_barrier_min_consistency_level': 'ONE',
'progress_barrier_default_consistency_level': 'ONE'}
self._do_replace(wait_for_binary_proto=False,
wait_other_notice=False,
opts=options)
# replace should fail due to insufficient replicas
self.replacement_node.watch_log_for("Unable to find sufficient sources
for streaming range")
> assert_not_running(self.replacement_node)
replace_address_test.py:637:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
_ _ _
node = <ccmlib.node.Node object at 0x102bb6640>
def assert_not_running(node):
"""
Assert that a given node is not running
@param node The node to check status
"""
attempts = 0
while node.is_running() and attempts < 10:
sleep(1)
attempts = attempts + 1
> assert not node.is_running()
E AssertionError
tools/assertions.py:282: AssertionError
{code}
{code:java}
==============================================================================================================
short test summary info
==============================================================================================================
FAILED
replace_address_test.py::TestReplaceAddress::test_replace_with_insufficient_replicas
- AssertionError
ERROR
replace_address_test.py::TestReplaceAddress::test_replace_with_insufficient_replicas
- Failed: Unexpected error found in node logs (see stdout for full details).
Errors: [[node1] 'ERROR [Messaging-EventLoop-3-6] 2024-02-16 10:39:00,019
NoSpamLogger.java:110 -
/127.0.0.1:7000->/127.0.0.3:7000-URGENT_MESSAGES-[no-channel] faile...
{code}
We can see also the test failed consistently run with JDK11, 17 in CI, so I
think this ticket can be moved out of triage and worked on.
Whether it is only paxos v2 triggering the reported issue or a combination of
it with other config changes made in cassandra_latest.yaml should be figured
out by whoever assigns the ticket to work on it.
was (Author: e.dimitrova):
I rebased my dtest repo and reinstalled the latest ccm changes from
CASSANDRA-18753
The command I use to run the tests locally:
{code:java}
pytest --use-vnodes --log-cli-level="DEBUG"
replace_address_test.py::TestReplaceAddress::test_replace_with_insufficient_replicas
--cassandra-dir=...{code}
Locally with current 5.0 branch with paxos V2 and no other config changes the
test fails for me:
{code:java}
replace_address_test.py:636:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
_ _ _
self = <ccmlib.node.Node object at 0x10527e550>, exprs = 'Unable to find
sufficient sources for streaming range', from_mark = None, timeout = 600,
process = None, verbose = False, filename = 'system.log',
error_on_pid_terminated = False
def watch_log_for(self, exprs, from_mark=None, timeout=600,
process=None, verbose=False, filename='system.log',
error_on_pid_terminated=False):
"""
Watch the log until one or more (regular) expressions are found or
timeouts (a
TimeoutError is then raised). On successful completion, a list of pair
(line matched,
match object) is returned.
Will raise NodeError if error_on_pit_terminated is True and C* pid is
not running.
"""
start = time.time()
tofind = [exprs] if isinstance(exprs, string_types) else exprs
tofind = [re.compile(e) for e in tofind]
matchings = []
reads = ""
if len(tofind) == 0:
return None
log_file = os.path.join(self.log_directory(), filename)
output_read = False
while not os.path.exists(log_file):
time.sleep(.5)
TimeoutError.raise_if_passed(start=start, timeout=timeout,
node=self.name,
msg="Timed out waiting for {} to be
created.".format(log_file))
if process and not output_read:
process.poll()
if process.returncode is not None:
self.print_process_output(self.name, process, verbose)
output_read = True
if process.returncode != 0:
raise RuntimeError() # Shouldn't reuse RuntimeError
but I'm lazy
with open(log_file) as f:
if from_mark:
f.seek(from_mark)
while True:
# First, if we have a process to check, then check it.
# Skip on Windows - stdout/stderr is cassandra.bat
if not common.is_win() and not output_read:
if process:
process.poll()
if process.returncode is not None:
self.print_process_output(self.name, process,
verbose)
output_read = True
if process.returncode != 0:
raise RuntimeError() # Shouldn't reuse
RuntimeError but I'm lazy
line = f.readline()
if line:
reads = reads + line
for e in tofind:
m = e.search(line)
if m:
matchings.append((line, m))
tofind.remove(e)
if len(tofind) == 0:
return matchings[0] if isinstance(exprs,
string_types) else matchings
else:
# wait for the situation to clarify, either stop or just a
pause in log production
> time.sleep(1)
E Failed: Timeout >900.0s
../dtest/lib/python3.8/site-packages/ccmlib/node.py:603: Failed
{code}
{code:java}
==============================================================================================================
short test summary info
==============================================================================================================
FAILED
replace_address_test.py::TestReplaceAddress::test_replace_with_insufficient_replicas
- Failed: Timeout >900.0s
ERROR
replace_address_test.py::TestReplaceAddress::test_replace_with_insufficient_replicas
- Failed: Unexpected error found in node logs (see stdout for full details).
Errors: [[node1] 'ERROR [Messaging-EventLoop-3-6] 2024-02-16 10:08:25,888
NoSpamLogger.java:110 -
/127.0.0.1:7000->/127.0.0.3:7000-URGENT_MESSAGES-[no-channel] faile...
{code}
This is how it fails with v2:
{code:java}
____________________________________________________________________________________________
TestReplaceAddress.test_replace_with_insufficient_replicas
_____________________________________________________________________________________________
self = <replace_address_test.TestReplaceAddress object at 0x102a927f0>
def test_replace_with_insufficient_replicas(self):
"""
Test that replace fails when there are insufficient replicas
@jira_ticket CASSANDRA-11848
"""
self.fixture_dtest_setup.ignore_log_patterns =
list(self.fixture_dtest_setup.ignore_log_patterns) + [
r'Unable to find sufficient sources for streaming range']
self._setup(n=3)
self._insert_data(rf=2)
self._stop_node_to_replace()
# stop other replica. For CEP-21 purposes, we need to ensure this is not
# the cluster's only CMS member
logger.debug("Stopping other replica")
self.cluster.nodelist()[1].stop(wait_other_notice=True)
options = {}
# CEP-21: availability is intentionally degraded, so we lower the
required number of
# acks required for the replacement node to progress to a point where
it attempts to
# perform streaming
if self.cluster.version() >= LooseVersion('5.1'):
options = {'progress_barrier_min_consistency_level': 'ONE',
'progress_barrier_default_consistency_level': 'ONE'}
self._do_replace(wait_for_binary_proto=False,
wait_other_notice=False,
opts=options)
# replace should fail due to insufficient replicas
self.replacement_node.watch_log_for("Unable to find sufficient sources
for streaming range")
> assert_not_running(self.replacement_node)
replace_address_test.py:637:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
_ _ _
node = <ccmlib.node.Node object at 0x102bb6640>
def assert_not_running(node):
"""
Assert that a given node is not running
@param node The node to check status
"""
attempts = 0
while node.is_running() and attempts < 10:
sleep(1)
attempts = attempts + 1
> assert not node.is_running()
E AssertionError
tools/assertions.py:282: AssertionError
{code}
{code:java}
==============================================================================================================
short test summary info
==============================================================================================================
FAILED
replace_address_test.py::TestReplaceAddress::test_replace_with_insufficient_replicas
- AssertionError
ERROR
replace_address_test.py::TestReplaceAddress::test_replace_with_insufficient_replicas
- Failed: Unexpected error found in node logs (see stdout for full details).
Errors: [[node1] 'ERROR [Messaging-EventLoop-3-6] 2024-02-16 10:39:00,019
NoSpamLogger.java:110 -
/127.0.0.1:7000->/127.0.0.3:7000-URGENT_MESSAGES-[no-channel] faile...
{code}
We can see also the test failed consistently run with JDK11, 17 in CI, so I
think this ticket can be moved out of triage and worked on.
Whether it is only paxos v2 triggering the reported issue or a combination of
it with other config changes made in cassandra_latest.yaml should be figured
out by whoever assigns the ticket to work on it.
> Python dtest replace_address_test.TestReplaceAddress is failing with Paxos V2
> -----------------------------------------------------------------------------
>
> Key: CASSANDRA-19144
> URL: https://issues.apache.org/jira/browse/CASSANDRA-19144
> Project: Cassandra
> Issue Type: Bug
> Components: Consistency/Bootstrap and Decommission,
> Feature/Lightweight Transactions
> Reporter: Branimir Lambov
> Priority: Normal
> Fix For: 5.0-rc
>
>
> Paxos repair is causing an unexpected failure:
> {code}
> test_replace_with_insufficient_replicas
> replace_address_test.TestReplaceAddress
> failed on teardown with "Failed: Unexpected error found in node logs (see
> stdout for full details). Errors: [[replacement] 'ERROR [main] 2023-11-29
> 10:23:08,752 CassandraDaemon.java:878 - Exception encountered during
> startup\njava.lang.UnsupportedOperationException: null\n\tat
> org.apache.cassandra.locator.AbstractReplicaCollection$ReplicaMap$AbstractImmutableSet.removeAll(AbstractReplicaCollection.java:298)\n\tat
>
> org.apache.cassandra.service.ActiveRepairService.repairPaxosForTopologyChange(ActiveRepairService.java:1102)\n\tat
>
> org.apache.cassandra.service.StorageService.startRepairPaxosForTopologyChange(StorageService.java:4829)\n\tat
>
> org.apache.cassandra.service.StorageService.tryRepairPaxosForTopologyChange(StorageService.java:4760)\n\tat
>
> org.apache.cassandra.service.StorageService.repairPaxosForTopologyChange(StorageService.java:4793)\n\tat
>
> org.apache.cassandra.service.StorageService.bootstrap(StorageService.java:2120)\n\tat
>
> org.apache.cassandra.service.StorageService.joinTokenRing(StorageService.java:1240)\n\tat
>
> org.apache.cassandra.service.StorageService.joinTokenRing(StorageService.java:1200)\n\tat
>
> org.apache.cassandra.service.StorageService.initServer(StorageService.java:979)\n\tat
>
> org.apache.cassandra.service.StorageService.initServer(StorageService.java:896)\n\tat
>
> org.apache.cassandra.service.CassandraDaemon.setup(CassandraDaemon.java:377)\n\tat
>
> org.apache.cassandra.service.CassandraDaemon.activate(CassandraDaemon.java:721)\n\tat
>
> org.apache.cassandra.service.CassandraDaemon.main(CassandraDaemon.java:856)']"
> Unexpected error found in node logs (see stdout for full details). Errors:
> [[replacement] 'ERROR [main] 2023-11-29 10:23:08,752 CassandraDaemon.java:878
> - Exception encountered during
> startup\njava.lang.UnsupportedOperationException: null\n\tat
> org.apache.cassandra.locator.AbstractReplicaCollection$ReplicaMap$AbstractImmutableSet.removeAll(AbstractReplicaCollection.java:298)\n\tat
>
> org.apache.cassandra.service.ActiveRepairService.repairPaxosForTopologyChange(ActiveRepairService.java:1102)\n\tat
>
> org.apache.cassandra.service.StorageService.startRepairPaxosForTopologyChange(StorageService.java:4829)\n\tat
>
> org.apache.cassandra.service.StorageService.tryRepairPaxosForTopologyChange(StorageService.java:4760)\n\tat
>
> org.apache.cassandra.service.StorageService.repairPaxosForTopologyChange(StorageService.java:4793)\n\tat
>
> org.apache.cassandra.service.StorageService.bootstrap(StorageService.java:2120)\n\tat
>
> org.apache.cassandra.service.StorageService.joinTokenRing(StorageService.java:1240)\n\tat
>
> org.apache.cassandra.service.StorageService.joinTokenRing(StorageService.java:1200)\n\tat
>
> org.apache.cassandra.service.StorageService.initServer(StorageService.java:979)\n\tat
>
> org.apache.cassandra.service.StorageService.initServer(StorageService.java:896)\n\tat
>
> org.apache.cassandra.service.CassandraDaemon.setup(CassandraDaemon.java:377)\n\tat
>
> org.apache.cassandra.service.CassandraDaemon.activate(CassandraDaemon.java:721)\n\tat
> org.apache.cassandra.service.CassandraDaemon.main(CassandraDaemon.java:856)']
> {code}
--
This message was sent by Atlassian Jira
(v8.20.10#820010)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]