Skip to content

Commit e494d21

Browse files
committed
Fix CCM not cleaning up all nodes if one fails to start
Patch by Ariel Weisberg; Reviewed by Mick Semb Wever for CASSANDRA-20673
1 parent 39b8222 commit e494d21

File tree

2 files changed

+30
-2
lines changed

2 files changed

+30
-2
lines changed

ccmlib/cluster.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -574,6 +574,7 @@ def start(self, no_wait=False, verbose=False, wait_for_binary_proto=True,
574574
if not node._wait_for_running(p, timeout_s=7):
575575
raise NodeError("Node {} should be running before waiting for <started listening> log message, "
576576
"but C* process is terminated.".format(node.name))
577+
for node, p, mark in started:
577578
try:
578579
timeout=kwargs.get('timeout', DEFAULT_CLUSTER_WAIT_TIMEOUT_IN_SECS)
579580
timeout=int(os.environ.get('CCM_CLUSTER_START_TIMEOUT_OVERRIDE', timeout))

ccmlib/node.py

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -789,6 +789,9 @@ def get_launch_bin(self):
789789
def add_custom_launch_arguments(self, args):
790790
pass
791791

792+
def __log_dir(self):
    """
    Build the JVM system-property argument that sets ``cassandra.logdir``
    to the ``logs`` directory located under this node's path.
    """
    logs_path = os.path.join(self.get_path(), 'logs')
    return '-Dcassandra.logdir=%s' % logs_path
794+
792795
def start(self,
793796
join_ring=True,
794797
no_wait=False,
@@ -876,7 +879,7 @@ def start(self,
876879

877880
args = args + ['-p', pidfile, '-Dcassandra.join_ring=%s' % str(join_ring)]
878881

879-
args.append('-Dcassandra.logdir=%s' % os.path.join(self.get_path(), 'logs'))
882+
args.append(self.__log_dir())
880883
if replace_token is not None:
881884
args.append('-Dcassandra.replace_token=%s' % str(replace_token))
882885
if replace_address is not None:
@@ -982,6 +985,26 @@ def _wait_for_running(self, process, timeout_s):
982985
self._update_pid(process)
983986
return self.is_running()
984987

988+
def __unix_kill_process_matching(self, pattern, sig=signal.SIGTERM):
    """
    Send a signal to every process whose command line matches a pattern.

    Each process's arguments are joined with single spaces and tested
    with ``re.search``; every matching process is sent ``sig`` through
    ``os.kill``. This is best effort: processes that cannot be inspected
    are skipped, and a kill that fails because the process is gone or
    not ours is only logged.

    :param pattern: regular expression searched for in the command line.
    :param sig: signal to deliver; defaults to ``signal.SIGTERM``.
    """
    matcher = re.compile(pattern)
    for proc in psutil.process_iter(['pid', 'cmdline']):
        try:
            pid = proc.info['pid']
            # The command line may be empty or None; treat that as "".
            cmdline = " ".join(proc.info['cmdline'] or ())
        except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
            continue

        if not matcher.search(cmdline):
            continue

        # Only matched processes are logged, and only at debug level:
        # dumping every command line on the host at INFO is noisy and
        # exposes the arguments of unrelated processes.
        logger.debug("Sending signal %s to pid %s: %s", sig, pid, cmdline)
        try:
            os.kill(int(pid), sig)
        except ProcessLookupError:
            logger.info(f"Process {pid} not found")
        except PermissionError:
            logger.info(f"Did not have permissions to kill {pid}")
1004+
1005+
def __unix_kill(self, sig):
    """
    Send ``sig`` to any Cassandra daemon process started for this node.

    A process is selected when its command line contains this node's
    ``-Dcassandra.logdir=...`` argument followed, anywhere later, by the
    ``org.apache.cassandra.service.CassandraDaemon`` class name.

    :param sig: signal to deliver to the matching processes.
    """
    # Both parts are literal text, so escape them: the log directory is a
    # filesystem path that may contain regex metacharacters, and the dots
    # in the class name must not match arbitrary characters. The pattern
    # is applied with re.search, so no leading/trailing ".*" is needed.
    log_dir_arg = re.escape(self.__log_dir())
    daemon_class = re.escape("org.apache.cassandra.service.CassandraDaemon")
    self.__unix_kill_process_matching("{}.*{}".format(log_dir_arg, daemon_class), sig)
1007+
9851008
def stop(self, wait=True, wait_other_notice=False, signal_event=signal.SIGTERM, **kwargs):
9861009
"""
9871010
Stop the node.
@@ -995,6 +1018,7 @@ def stop(self, wait=True, wait_other_notice=False, signal_event=signal.SIGTERM,
9951018
+ gently: Let Cassandra clean up and shut down properly; unless
9961019
false perform a 'kill -9' which shuts down faster.
9971020
"""
1021+
gently = 'gently' in kwargs and kwargs['gently'] is True
9981022
if self.is_running():
9991023
if wait_other_notice:
10001024
marks = [(node, node.mark_log()) for node in list(self.cluster.nodes.values()) if node.is_live() and node is not self]
@@ -1021,7 +1045,7 @@ def stop(self, wait=True, wait_other_notice=False, signal_event=signal.SIGTERM,
10211045
common.warning("Failed to terminate node: {0} with pid: {1}".format(self.name, self.pid))
10221046
else:
10231047
# Determine if the signal event should be updated to keep API compatibility
1024-
if 'gently' in kwargs and kwargs['gently'] is False:
1048+
if gently is False:
10251049
signal_event = signal.SIGKILL
10261050

10271051
os.kill(self.pid, signal_event)
@@ -1046,6 +1070,9 @@ def stop(self, wait=True, wait_other_notice=False, signal_event=signal.SIGTERM,
10461070
else:
10471071
return True
10481072
else:
1073+
# Make sure it is actually stopped even if the PID wasn't found for some reason
1074+
if not common.is_win():
1075+
self.__unix_kill(signal if gently else signal.SIGKILL)
10491076
return False
10501077

10511078
def wait_for_compactions(self, timeout=120):

0 commit comments

Comments
 (0)