summaryrefslogtreecommitdiffstats
path: root/qa/workunits/cephadm/test_cephadm_timeout.py
diff options
context:
space:
mode:
Diffstat (limited to 'qa/workunits/cephadm/test_cephadm_timeout.py')
-rwxr-xr-xqa/workunits/cephadm/test_cephadm_timeout.py179
1 files changed, 179 insertions, 0 deletions
diff --git a/qa/workunits/cephadm/test_cephadm_timeout.py b/qa/workunits/cephadm/test_cephadm_timeout.py
new file mode 100755
index 000000000..67b43a2df
--- /dev/null
+++ b/qa/workunits/cephadm/test_cephadm_timeout.py
@@ -0,0 +1,179 @@
+#!/usr/bin/python3 -s
+
+import time
+import os
+import fcntl
+import subprocess
+import uuid
+import sys
+
+from typing import Optional, Any
+
+LOCK_DIR = '/run/cephadm'
+DATA_DIR = '/var/lib/ceph'
+
+class _Acquire_ReturnProxy(object):
+ def __init__(self, lock: 'FileLock') -> None:
+ self.lock = lock
+ return None
+
+ def __enter__(self) -> 'FileLock':
+ return self.lock
+
+ def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
+ self.lock.release()
+ return None
+
class FileLock(object):
    """Exclusive inter-process lock backed by ``fcntl.flock`` on a lock file.

    The lock file is ``<LOCK_DIR>/<name>.lock``. A single instance may be
    acquired several times; a counter tracks the nesting depth and the
    underlying flock is only dropped when the counter returns to zero (or
    when ``release(force=True)`` is used).
    """

    def __init__(self, name: str, timeout: int = -1) -> None:
        """Prepare a lock named *name*; nothing is locked yet.

        Args:
            name: base name of the lock file (``.lock`` is appended).
            timeout: default number of seconds ``acquire()`` waits before
                giving up; a negative value means wait forever.

        Raises:
            OSError: if the lock directory cannot be created.
        """
        # Set up all instance state before touching the filesystem, so that
        # __del__ sees a consistent object even if directory creation fails.
        self._lock_file_fd: Optional[int] = None
        self._lock_counter = 0
        self.timeout = timeout
        self._lock_file = os.path.join(LOCK_DIR, name + '.lock')

        # Create-and-tolerate-existing instead of exists()-then-mkdir(): the
        # latter races with another process creating the directory.
        try:
            os.mkdir(LOCK_DIR, 0o700)
        except FileExistsError:
            pass

    @property
    def is_locked(self) -> bool:
        """True while this instance holds an open, flock'ed descriptor."""
        return self._lock_file_fd is not None

    def acquire(self, timeout: Optional[int] = None, poll_intervall: float = 0.05) -> '_Acquire_ReturnProxy':
        """Acquire the lock, polling until it is obtained or time runs out.

        Args:
            timeout: seconds to wait; ``None`` uses the instance default and
                a negative value waits indefinitely.
            poll_intervall: seconds to sleep between attempts.

        Returns:
            A proxy usable as a context manager that releases the lock on
            exit.

        Raises:
            Exception: carrying the lock file path, when the lock could not
                be obtained within *timeout* seconds.
        """
        # Use the default timeout, if no timeout is provided.
        if timeout is None:
            timeout = self.timeout

        # Increment the number right at the beginning.
        # We can still undo it, if something fails.
        self._lock_counter += 1

        # A monotonic clock keeps the timeout immune to wall-clock jumps.
        start_time = time.monotonic()
        try:
            while True:
                if not self.is_locked:
                    self._acquire()

                if self.is_locked:
                    break
                if timeout >= 0 and time.monotonic() - start_time > timeout:
                    raise Exception(self._lock_file)
                time.sleep(poll_intervall)
        except BaseException:
            # Something went wrong (including Ctrl-C while waiting), so undo
            # the increment; otherwise a later release() would never reach
            # zero and the flock would stay held.
            self._lock_counter = max(0, self._lock_counter - 1)
            raise
        return _Acquire_ReturnProxy(lock=self)

    def release(self, force: bool = False) -> None:
        """Undo one ``acquire()``; drop the flock when the count hits zero.

        Args:
            force: release the underlying lock regardless of how many
                nested acquisitions are outstanding.
        """
        if self.is_locked:
            self._lock_counter -= 1

            if self._lock_counter == 0 or force:
                self._release()
                self._lock_counter = 0

    def __enter__(self) -> 'FileLock':
        """Acquire with the default timeout and return the lock itself."""
        self.acquire()
        return self

    def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
        """Release one level of acquisition; exceptions are not suppressed."""
        self.release()

    def __del__(self) -> None:
        """Drop the lock when the object is collected.

        Guarded with getattr because __del__ also runs for instances whose
        __init__ raised before the descriptor attribute was assigned.
        """
        if getattr(self, '_lock_file_fd', None) is not None:
            self.release(force=True)

    def _acquire(self) -> None:
        """Make one non-blocking attempt to flock the lock file.

        On success the open descriptor is stored in ``_lock_file_fd``; if
        flock fails the descriptor is closed and the instance stays
        unlocked, leaving the retry decision to the caller.
        """
        open_mode = os.O_RDWR | os.O_CREAT | os.O_TRUNC
        fd = os.open(self._lock_file, open_mode)

        try:
            fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
        except (IOError, OSError):
            os.close(fd)
        else:
            self._lock_file_fd = fd

    def _release(self) -> None:
        """Unlock and close the descriptor held by this instance."""
        fd = self._lock_file_fd
        # Clear the attribute first so is_locked turns False even if the
        # calls below raise.
        self._lock_file_fd = None
        try:
            fcntl.flock(fd, fcntl.LOCK_UN)  # type: ignore
        finally:
            # Always close, so a failing unlock cannot leak the descriptor.
            os.close(fd)  # type: ignore
+
+def _is_fsid(s):
+ try:
+ uuid.UUID(s)
+ except ValueError:
+ return False
+ return True
+
def find_fsid():
    """Return the name of the first fsid-named directory under DATA_DIR.

    An entry qualifies when its name parses as a UUID (per ``_is_fsid``) and
    it is a directory.

    Returns:
        The matching directory name (not the full path).

    Raises:
        Exception: if DATA_DIR does not exist, or holds no fsid directory.
    """
    if not os.path.exists(DATA_DIR):
        raise Exception(f'{DATA_DIR} does not exist. Aborting...')

    # assume the first thing we find that is an fsid
    # is what we want. Not expecting multiple clusters
    # to have been installed here.
    # os.listdir() order is arbitrary, so sort to make "first" deterministic.
    for d in sorted(os.listdir(DATA_DIR)):
        if _is_fsid(d) and os.path.isdir(os.path.join(DATA_DIR, d)):
            return d
    raise Exception(f'No fsid dir found in {DATA_DIR}. Aborting...')
+
def main():
    """Check that a held cephadm lock yields a command-timeout health warning.

    Steps:
      1. Locate the cluster fsid via ``find_fsid()``.
      2. Set ``mgr/cephadm/default_cephadm_command_timeout`` to 120.
      3. Take the ``FileLock`` named after the fsid.
      4. Trigger ``ceph orch device ls --refresh`` and sleep 150 seconds.
      5. Inspect ``ceph health detail`` for the expected warning text.

    Returns:
        0 on success.

    Raises:
        subprocess.CalledProcessError: if any cephadm invocation fails.
        Exception: if the expected health warning text is not present.
    """
    print('Looking for cluster fsid...')
    fsid = find_fsid()
    print(f'Found fsid {fsid}')

    print('Setting cephadm command timeout to 120...')
    subprocess.run(['cephadm', 'shell', '--', 'ceph', 'config', 'set',
                    'mgr', 'mgr/cephadm/default_cephadm_command_timeout', '120'],
                   check=True)

    print('Taking hold of cephadm lock for 300 seconds...')
    # NOTE: 300 is the maximum time to wait while *acquiring* the lock; once
    # obtained it is held until the release in the finally block below.
    lock = FileLock(fsid, 300)
    lock.acquire()
    try:
        print('Triggering cephadm device refresh...')
        subprocess.run(['cephadm', 'shell', '--', 'ceph', 'orch', 'device', 'ls', '--refresh'],
                       check=True)

        print('Sleeping 150 seconds to allow for timeout to occur...')
        time.sleep(150)

        print('Checking ceph health detail...')
        # "capture_output" only exists from python 3.7 on, but explicit
        # PIPEs with universal_newlines work on 3.6 too. This avoids
        # writing to fixed, predictable paths in /tmp while running as root.
        res = subprocess.run(['cephadm', 'shell', '--', 'ceph', 'health', 'detail'],
                             check=True,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             universal_newlines=True)
        res_stdout = res.stdout
        res_stderr = res.stderr
        print(f'"cephadm shell -- ceph health detail" stdout:\n{res_stdout}')
        print(f'"cephadm shell -- ceph health detail" stderr:\n{res_stderr}')

        print('Checking for correct health warning in health detail...')
        if 'CEPHADM_REFRESH_FAILED' not in res_stdout:
            raise Exception('No health warning caused by timeout was raised')
        if 'Command "cephadm ceph-volume -- inventory" timed out' not in res_stdout:
            raise Exception('Health warnings did not contain message about time out')
    finally:
        # Drop the lock explicitly on every exit path rather than relying on
        # interpreter teardown.
        lock.release(force=True)

    print('Health warnings found succesfully. Exiting.')
    return 0
+
+
# Script entry point: the test needs root, so re-exec under sudo when run as
# an unprivileged user, then run main() and use its result as exit status.
if __name__ == '__main__':
    if os.getuid() != 0:
        print('Trying to run myself with sudo...')
        # exec replaces this process without flushing Python's buffers, so
        # flush now or the message above can be lost.
        sys.stdout.flush()
        # The first list element is argv[0] of the new process (its own
        # name), so 'sudo' has to be listed explicitly; otherwise sudo would
        # treat the script path as the command to run. '-s' mirrors the
        # interpreter flag in the shebang line.
        os.execvp('sudo', ['sudo', sys.executable, '-s'] + list(sys.argv))
    sys.exit(main())