Adding upstream version 14.2.21.upstream/14.2.21 upstream

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-27 18:24:20 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-27 18:24:20 +0000
commit: 483eb2f56657e8e7f419ab1a4fab8dce9ade8609 (patch)
tree: e5d88d25d870d5dedacb6bbdbe2a966086a0a5cf /src/pybind/mgr/volumes
parent: Initial commit. (diff)
download: ceph-483eb2f56657e8e7f419ab1a4fab8dce9ade8609.tar.xz
ceph-483eb2f56657e8e7f419ab1a4fab8dce9ade8609.zip
31 files changed, 5750 insertions, 0 deletions
diff --git a/src/pybind/mgr/volumes/__init__.py b/src/pybind/mgr/volumes/__init__.py
new file mode 100644
index 00000000..4c5b97ce
--- /dev/null
+++ b/src/pybind/mgr/volumes/__init__.py
@@ -0,0 +1,2 @@
+
+from .module import Module
diff --git a/src/pybind/mgr/volumes/fs/__init__.py b/src/pybind/mgr/volumes/fs/__init__.py
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/src/pybind/mgr/volumes/fs/__init__.py
diff --git a/src/pybind/mgr/volumes/fs/async_cloner.py b/src/pybind/mgr/volumes/fs/async_cloner.py
new file mode 100644
index 00000000..0c5155e7
--- /dev/null
+++ b/src/pybind/mgr/volumes/fs/async_cloner.py
@@ -0,0 +1,345 @@
+import os
+import stat
+import time
+import errno
+import logging
+from contextlib import contextmanager
+
+import cephfs
+
+from .async_job import AsyncJobs
+from .exception import IndexException, MetadataMgrException, OpSmException, VolumeException
+from .fs_util import copy_file
+from .operations.versions.op_sm import SubvolumeOpSm
+from .operations.versions.subvolume_attrs import SubvolumeTypes, SubvolumeStates, SubvolumeActions
+from .operations.resolver import resolve
+from .operations.volume import open_volume, open_volume_lockless
+from .operations.group import open_group
+from .operations.subvolume import open_subvol
+from .operations.clone_index import open_clone_index
+from .operations.template import SubvolumeOpType
+
+log = logging.getLogger(__name__)
+
+# helper for fetching a clone entry for a given volume
+def get_next_clone_entry(volume_client, volname, running_jobs):
+    log.debug("fetching clone entry for volume '{0}'".format(volname))
+
+    try:
+        with open_volume_lockless(volume_client, volname) as fs_handle:
+            try:
+                with open_clone_index(fs_handle, volume_client.volspec) as clone_index:
+                    job = clone_index.get_oldest_clone_entry(running_jobs)
+                    return 0, job
+            except IndexException as ve:
+                if ve.errno == -errno.ENOENT:
+                    return 0, None
+                raise ve
+    except VolumeException as ve:
+        log.error("error fetching clone entry for volume '{0}' ({1})".format(volname, ve))
+        return ve.errno, None
+
+@contextmanager
+def open_at_volume(volume_client, volname, groupname, subvolname, op_type):
+    with open_volume(volume_client, volname) as fs_handle:
+        with open_group(fs_handle, volume_client.volspec, groupname) as group:
+            with open_subvol(volume_client.mgr, fs_handle, volume_client.volspec, group, subvolname, op_type) as subvolume:
+                yield subvolume
+
+@contextmanager
+def open_at_group(volume_client, fs_handle, groupname, subvolname, op_type):
+    with open_group(fs_handle, volume_client.volspec, groupname) as group:
+        with open_subvol(volume_client.mgr, fs_handle, volume_client.volspec, group, subvolname, op_type) as subvolume:
+            yield subvolume
+
+@contextmanager
+def open_at_group_unique(volume_client, fs_handle, s_groupname, s_subvolname, c_subvolume, c_groupname, c_subvolname, op_type):
+    # if a snapshot of a retained subvolume is being cloned to recreate the same subvolume, return
+    # the clone subvolume as the source subvolume
+    if s_groupname == c_groupname and s_subvolname == c_subvolname:
+        yield c_subvolume
+    else:
+        with open_at_group(volume_client, fs_handle, s_groupname, s_subvolname, op_type) as s_subvolume:
+            yield s_subvolume
+
+
+@contextmanager
+def open_clone_subvolume_pair(volume_client, fs_handle, volname, groupname, subvolname):
+    with open_at_group(volume_client, fs_handle, groupname, subvolname, SubvolumeOpType.CLONE_INTERNAL) as clone_subvolume:
+        s_volname, s_groupname, s_subvolname, s_snapname = get_clone_source(clone_subvolume)
+        if groupname == s_groupname and subvolname == s_subvolname:
+            # use the same subvolume to avoid metadata overwrites
+            yield (clone_subvolume, clone_subvolume, s_snapname)
+        else:
+            with open_at_group(volume_client, fs_handle, s_groupname, s_subvolname, SubvolumeOpType.CLONE_SOURCE) as source_subvolume:
+                yield (clone_subvolume, source_subvolume, s_snapname)
+
+def get_clone_state(volume_client, volname, groupname, subvolname):
+    with open_at_volume(volume_client, volname, groupname, subvolname, SubvolumeOpType.CLONE_INTERNAL) as subvolume:
+        return subvolume.state
+
+def set_clone_state(volume_client, volname, groupname, subvolname, state):
+    with open_at_volume(volume_client, volname, groupname, subvolname, SubvolumeOpType.CLONE_INTERNAL) as subvolume:
+        subvolume.state = (state, True)
+
+def get_clone_source(clone_subvolume):
+    source = clone_subvolume._get_clone_source()
+    return (source['volume'], source.get('group', None), source['subvolume'], source['snapshot'])
+
+def get_next_state_on_error(errnum):
+    if errnum == -errno.EINTR:
+        next_state = SubvolumeOpSm.transition(SubvolumeTypes.TYPE_CLONE,
+                                              SubvolumeStates.STATE_INPROGRESS,
+                                              SubvolumeActions.ACTION_CANCELLED)
+    else:
+        # jump to failed state, on all other errors
+        next_state = SubvolumeOpSm.transition(SubvolumeTypes.TYPE_CLONE,
+                                              SubvolumeStates.STATE_INPROGRESS,
+                                              SubvolumeActions.ACTION_FAILED)
+    return next_state
+
+def handle_clone_pending(volume_client, volname, index, groupname, subvolname, should_cancel):
+    try:
+        if should_cancel():
+            next_state = SubvolumeOpSm.transition(SubvolumeTypes.TYPE_CLONE,
+                                                  SubvolumeStates.STATE_PENDING,
+                                                  SubvolumeActions.ACTION_CANCELLED)
+        else:
+            next_state = SubvolumeOpSm.transition(SubvolumeTypes.TYPE_CLONE,
+                                                  SubvolumeStates.STATE_PENDING,
+                                                  SubvolumeActions.ACTION_SUCCESS)
+    except OpSmException as oe:
+        raise VolumeException(oe.errno, oe.error_str)
+    return (next_state, False)
+
+def sync_attrs(fs_handle, target_path, source_statx):
+    try:
+        fs_handle.lchown(target_path, int(source_statx["uid"]), int(source_statx["gid"]))
+        fs_handle.lutimes(target_path, (time.mktime(source_statx["atime"].timetuple()),
+                                        time.mktime(source_statx["mtime"].timetuple())))
+    except cephfs.Error as e:
+        log.warn("error synchronizing attrs for {0} ({1})".format(target_path, e))
+        raise e
+
+def bulk_copy(fs_handle, source_path, dst_path, should_cancel):
+    """
+    bulk copy data from source to destination -- only directories, symlinks
+    and regular files are synced.
+    """
+    log.info("copying data from {0} to {1}".format(source_path, dst_path))
+    def cptree(src_root_path, dst_root_path):
+        log.debug("cptree: {0} -> {1}".format(src_root_path, dst_root_path))
+        try:
+            with fs_handle.opendir(src_root_path) as dir_handle:
+                d = fs_handle.readdir(dir_handle)
+                while d and not should_cancel():
+                    if d.d_name not in (b".", b".."):
+                        log.debug("d={0}".format(d))
+                        d_full_src = os.path.join(src_root_path, d.d_name)
+                        d_full_dst = os.path.join(dst_root_path, d.d_name)
+                        stx = fs_handle.statx(d_full_src, cephfs.CEPH_STATX_MODE  |
+                                                          cephfs.CEPH_STATX_UID   |
+                                                          cephfs.CEPH_STATX_GID   |
+                                                          cephfs.CEPH_STATX_ATIME |
+                                                          cephfs.CEPH_STATX_MTIME |
+                                                          cephfs.CEPH_STATX_SIZE,
+                                                          cephfs.AT_SYMLINK_NOFOLLOW)
+                        handled = True
+                        mo = stx["mode"] & ~stat.S_IFMT(stx["mode"])
+                        if stat.S_ISDIR(stx["mode"]):
+                            log.debug("cptree: (DIR) {0}".format(d_full_src))
+                            try:
+                                fs_handle.mkdir(d_full_dst, mo)
+                            except cephfs.Error as e:
+                                if not e.args[0] == errno.EEXIST:
+                                    raise
+                            cptree(d_full_src, d_full_dst)
+                        elif stat.S_ISLNK(stx["mode"]):
+                            log.debug("cptree: (SYMLINK) {0}".format(d_full_src))
+                            target = fs_handle.readlink(d_full_src, 4096)
+                            try:
+                                fs_handle.symlink(target[:stx["size"]], d_full_dst)
+                            except cephfs.Error as e:
+                                if not e.args[0] == errno.EEXIST:
+                                    raise
+                        elif stat.S_ISREG(stx["mode"]):
+                            log.debug("cptree: (REG) {0}".format(d_full_src))
+                            copy_file(fs_handle, d_full_src, d_full_dst, mo, cancel_check=should_cancel)
+                        else:
+                            handled = False
+                            log.warn("cptree: (IGNORE) {0}".format(d_full_src))
+                        if handled:
+                            sync_attrs(fs_handle, d_full_dst, stx)
+                    d = fs_handle.readdir(dir_handle)
+                stx_root = fs_handle.statx(src_root_path, cephfs.CEPH_STATX_ATIME |
+                                                          cephfs.CEPH_STATX_MTIME,
+                                                          cephfs.AT_SYMLINK_NOFOLLOW)
+                fs_handle.lutimes(dst_root_path, (time.mktime(stx_root["atime"].timetuple()),
+                                                  time.mktime(stx_root["mtime"].timetuple())))
+        except cephfs.Error as e:
+            if not e.args[0] == errno.ENOENT:
+                raise VolumeException(-e.args[0], e.args[1])
+    cptree(source_path, dst_path)
+    if should_cancel():
+        raise VolumeException(-errno.EINTR, "clone operation interrupted")
+
+def do_clone(volume_client, volname, groupname, subvolname, should_cancel):
+    with open_volume_lockless(volume_client, volname) as fs_handle:
+        with open_clone_subvolume_pair(volume_client, fs_handle, volname, groupname, subvolname) as clone_volumes:
+            src_path = clone_volumes[1].snapshot_data_path(clone_volumes[2])
+            dst_path = clone_volumes[0].path
+            bulk_copy(fs_handle, src_path, dst_path, should_cancel)
+
+def handle_clone_in_progress(volume_client, volname, index, groupname, subvolname, should_cancel):
+    try:
+        do_clone(volume_client, volname, groupname, subvolname, should_cancel)
+        next_state = SubvolumeOpSm.transition(SubvolumeTypes.TYPE_CLONE,
+                                              SubvolumeStates.STATE_INPROGRESS,
+                                              SubvolumeActions.ACTION_SUCCESS)
+    except VolumeException as ve:
+        next_state = get_next_state_on_error(ve.errno)
+    except OpSmException as oe:
+        raise VolumeException(oe.errno, oe.error_str)
+    return (next_state, False)
+
+def handle_clone_failed(volume_client, volname, index, groupname, subvolname, should_cancel):
+    try:
+        with open_volume(volume_client, volname) as fs_handle:
+            # detach source but leave the clone section intact for later inspection
+            with open_clone_subvolume_pair(volume_client, fs_handle, volname, groupname, subvolname) as clone_volumes:
+                clone_volumes[1].detach_snapshot(clone_volumes[2], index)
+    except (MetadataMgrException, VolumeException) as e:
+        log.error("failed to detach clone from snapshot: {0}".format(e))
+    return (None, True)
+
+def handle_clone_complete(volume_client, volname, index, groupname, subvolname, should_cancel):
+    try:
+        with open_volume(volume_client, volname) as fs_handle:
+            with open_clone_subvolume_pair(volume_client, fs_handle, volname, groupname, subvolname) as clone_volumes:
+                clone_volumes[1].detach_snapshot(clone_volumes[2], index)
+                clone_volumes[0].remove_clone_source(flush=True)
+    except (MetadataMgrException, VolumeException) as e:
+        log.error("failed to detach clone from snapshot: {0}".format(e))
+    return (None, True)
+
+def start_clone_sm(volume_client, volname, index, groupname, subvolname, state_table, should_cancel):
+    finished = False
+    current_state = None
+    try:
+        current_state = get_clone_state(volume_client, volname, groupname, subvolname)
+        log.debug("cloning ({0}, {1}, {2}) -- starting state \"{3}\"".format(volname, groupname, subvolname, current_state))
+        while not finished:
+            handler = state_table.get(current_state, None)
+            if not handler:
+                raise VolumeException(-errno.EINVAL, "invalid clone state: \"{0}\"".format(current_state))
+            (next_state, finished) = handler(volume_client, volname, index, groupname, subvolname, should_cancel)
+            if next_state:
+                log.debug("({0}, {1}, {2}) transition state [\"{3}\" => \"{4}\"]".format(volname, groupname, subvolname,\
+                                                                                         current_state, next_state))
+                set_clone_state(volume_client, volname, groupname, subvolname, next_state)
+                current_state = next_state
+    except VolumeException as ve:
+        log.error("clone failed for ({0}, {1}, {2}) (current_state: {3}, reason: {4})".format(volname, groupname,\
+                                                                                             subvolname, current_state, ve))
+
+def clone(volume_client, volname, index, clone_path, state_table, should_cancel):
+    log.info("cloning to subvolume path: {0}".format(clone_path))
+    resolved = resolve(volume_client.volspec, clone_path)
+
+    groupname  = resolved[0]
+    subvolname = resolved[1]
+    log.debug("resolved to [group: {0}, subvolume: {1}]".format(groupname, subvolname))
+
+    try:
+        log.info("starting clone: ({0}, {1}, {2})".format(volname, groupname, subvolname))
+        start_clone_sm(volume_client, volname, index, groupname, subvolname, state_table, should_cancel)
+        log.info("finished clone: ({0}, {1}, {2})".format(volname, groupname, subvolname))
+    except VolumeException as ve:
+        log.error("clone failed for ({0}, {1}, {2}), reason: {3}".format(volname, groupname, subvolname, ve))
+
+class Cloner(AsyncJobs):
+    """
+    Asynchronous cloner: pool of threads to copy data from a snapshot to a subvolume.
+    this relies on a simple state machine (which mimics states from SubvolumeOpSm class) as
+    the driver. file types supported are directories, symbolic links and regular files.
+    """
+    def __init__(self, volume_client, tp_size):
+        self.vc = volume_client
+        self.state_table = {
+            SubvolumeStates.STATE_PENDING      : handle_clone_pending,
+            SubvolumeStates.STATE_INPROGRESS   : handle_clone_in_progress,
+            SubvolumeStates.STATE_COMPLETE     : handle_clone_complete,
+            SubvolumeStates.STATE_FAILED       : handle_clone_failed,
+            SubvolumeStates.STATE_CANCELED     : handle_clone_failed,
+        }
+        super(Cloner, self).__init__(volume_client, "cloner", tp_size)
+
+    def reconfigure_max_concurrent_clones(self, tp_size):
+        super(Cloner, self).reconfigure_max_concurrent_clones("cloner", tp_size)
+
+    def is_clone_cancelable(self, clone_state):
+        return not (SubvolumeOpSm.is_complete_state(clone_state) or SubvolumeOpSm.is_failed_state(clone_state))
+
+    def get_clone_tracking_index(self, fs_handle, clone_subvolume):
+        with open_clone_index(fs_handle, self.vc.volspec) as index:
+            return index.find_clone_entry_index(clone_subvolume.base_path)
+
+    def _cancel_pending_clone(self, fs_handle, clone_subvolume, clone_subvolname, clone_groupname, status, track_idx):
+        clone_state = SubvolumeStates.from_value(status['state'])
+        assert self.is_clone_cancelable(clone_state)
+
+        s_groupname = status['source'].get('group', None)
+        s_subvolname = status['source']['subvolume']
+        s_snapname = status['source']['snapshot']
+
+        with open_at_group_unique(self.vc, fs_handle, s_groupname, s_subvolname, clone_subvolume, clone_groupname,
+                                  clone_subvolname, SubvolumeOpType.CLONE_SOURCE) as s_subvolume:
+            next_state = SubvolumeOpSm.transition(SubvolumeTypes.TYPE_CLONE,
+                                                  clone_state,
+                                                  SubvolumeActions.ACTION_CANCELLED)
+            clone_subvolume.state = (next_state, True)
+            s_subvolume.detach_snapshot(s_snapname, track_idx.decode('utf-8'))
+
+    def cancel_job(self, volname, job):
+        """
+        override base class `cancel_job`. interpret @job as (clone, group) tuple.
+        """
+        clonename = job[0]
+        groupname = job[1]
+        track_idx = None
+
+        try:
+            with open_volume(self.vc, volname) as fs_handle:
+                with open_group(fs_handle, self.vc.volspec, groupname) as group:
+                    with open_subvol(self.vc.mgr, fs_handle, self.vc.volspec, group, clonename, SubvolumeOpType.CLONE_CANCEL) as clone_subvolume:
+                        status = clone_subvolume.status
+                        clone_state = SubvolumeStates.from_value(status['state'])
+                        if not self.is_clone_cancelable(clone_state):
+                            raise VolumeException(-errno.EINVAL, "cannot cancel -- clone finished (check clone status)")
+                        track_idx = self.get_clone_tracking_index(fs_handle, clone_subvolume)
+                        if not track_idx:
+                            log.warn("cannot lookup clone tracking index for {0}".format(clone_subvolume.base_path))
+                            raise VolumeException(-errno.EINVAL, "error canceling clone")
+                        if SubvolumeOpSm.is_init_state(SubvolumeTypes.TYPE_CLONE, clone_state):
+                            # clone has not started yet -- cancel right away.
+                            self._cancel_pending_clone(fs_handle, clone_subvolume, clonename, groupname, status, track_idx)
+                            return
+            # cancelling an on-going clone would persist "canceled" state in subvolume metadata.
+            # to persist the new state, async cloner accesses the volume in exclusive mode.
+            # accessing the volume in exclusive mode here would lead to deadlock.
+            assert track_idx is not None
+            with self.lock:
+                with open_volume_lockless(self.vc, volname) as fs_handle:
+                    with open_group(fs_handle, self.vc.volspec, groupname) as group:
+                        with open_subvol(self.vc.mgr, fs_handle, self.vc.volspec, group, clonename, SubvolumeOpType.CLONE_CANCEL) as clone_subvolume:
+                            if not self._cancel_job(volname, (track_idx, clone_subvolume.base_path)):
+                                raise VolumeException(-errno.EINVAL, "cannot cancel -- clone finished (check clone status)")
+        except (IndexException, MetadataMgrException) as e:
+            log.error("error cancelling clone {0}: ({1})".format(job, e))
+            raise VolumeException(-errno.EINVAL, "error canceling clone")
+
+    def get_next_job(self, volname, running_jobs):
+        return get_next_clone_entry(self.vc, volname, running_jobs)
+
+    def execute_job(self, volname, job, should_cancel):
+        clone(self.vc, volname, job[0].decode('utf-8'), job[1].decode('utf-8'), self.state_table, should_cancel)
diff --git a/src/pybind/mgr/volumes/fs/async_job.py b/src/pybind/mgr/volumes/fs/async_job.py
new file mode 100644
index 00000000..954e89c4
--- /dev/null
+++ b/src/pybind/mgr/volumes/fs/async_job.py
@@ -0,0 +1,279 @@
+import sys
+import time
+import logging
+import threading
+import traceback
+from collections import deque
+
+from .exception import NotImplementedException
+
+log = logging.getLogger(__name__)
+
+class JobThread(threading.Thread):
+    # this is "not" configurable and there is no need for it to be
+    # configurable. if a thread encounters an exception, we retry
+    # until it hits this many consecutive exceptions.
+    MAX_RETRIES_ON_EXCEPTION = 10
+
+    def __init__(self, async_job, volume_client, name):
+        self.vc = volume_client
+        self.async_job = async_job
+        # event object to cancel jobs
+        self.cancel_event = threading.Event()
+        threading.Thread.__init__(self, name=name)
+
+    def run(self):
+        retries = 0
+        thread_id = threading.currentThread()
+        assert isinstance(thread_id, JobThread)
+        thread_name = thread_id.getName()
+        log.debug("thread [{0}] starting".format(thread_name))
+
+        while retries < JobThread.MAX_RETRIES_ON_EXCEPTION:
+            vol_job = None
+            try:
+                # fetch next job to execute
+                with self.async_job.lock:
+                    while True:
+                        if self.should_reconfigure_num_threads():
+                            log.info("thread [{0}] terminating due to reconfigure".format(thread_name))
+                            self.async_job.threads.remove(self)
+                            return
+                        vol_job = self.async_job.get_job()
+                        if vol_job:
+                            break
+                        self.async_job.cv.wait()
+                    self.async_job.register_async_job(vol_job[0], vol_job[1], thread_id)
+
+                # execute the job (outside lock)
+                self.async_job.execute_job(vol_job[0], vol_job[1], should_cancel=lambda: thread_id.should_cancel())
+                retries = 0
+            except NotImplementedException:
+                raise
+            except Exception:
+                # unless the jobs fetching and execution routines are not implemented
+                # retry till we hit cap limit.
+                retries += 1
+                log.warning("thread [{0}] encountered fatal error: (attempt#" \
+                            " {1}/{2})".format(thread_name, retries, JobThread.MAX_RETRIES_ON_EXCEPTION))
+                exc_type, exc_value, exc_traceback = sys.exc_info()
+                log.warning("traceback: {0}".format("".join(
+                    traceback.format_exception(exc_type, exc_value, exc_traceback))))
+            finally:
+                # when done, unregister the job
+                if vol_job:
+                    with self.async_job.lock:
+                        self.async_job.unregister_async_job(vol_job[0], vol_job[1], thread_id)
+                time.sleep(1)
+        log.error("thread [{0}] reached exception limit, bailing out...".format(thread_name))
+        self.vc.cluster_log("thread {0} bailing out due to exception".format(thread_name))
+        with self.async_job.lock:
+            self.async_job.threads.remove(self)
+
+    def should_reconfigure_num_threads(self):
+        # reconfigure of max_concurrent_clones
+        return len(self.async_job.threads) > self.async_job.nr_concurrent_jobs
+
+    def cancel_job(self):
+        self.cancel_event.set()
+
+    def should_cancel(self):
+        return self.cancel_event.is_set()
+
+    def reset_cancel(self):
+        self.cancel_event.clear()
+
+class AsyncJobs(object):
+    """
+    Class providing asynchronous execution of jobs via worker threads.
+    `jobs` are grouped by `volume`, so a `volume` can have N number of
+    `jobs` executing concurrently (capped by number of concurrent jobs).
+
+    Usability is simple: subclass this and implement the following:
+      - get_next_job(volname, running_jobs)
+      - execute_job(volname, job, should_cancel)
+
+    ... and do not forget to invoke base class constructor.
+
+    Job cancelation is for a volume as a whole, i.e., all executing jobs
+    for a volume are canceled. Cancelation is poll based -- jobs need to
+    periodically check if cancelation is requested, after which the job
+    should return as soon as possible. Cancelation check is provided
+    via `should_cancel()` lambda passed to `execute_job()`.
+    """
+
+    def __init__(self, volume_client, name_pfx, nr_concurrent_jobs):
+        self.vc = volume_client
+        # queue of volumes for starting async jobs
+        self.q = deque()
+        # volume => job tracking
+        self.jobs = {}
+        # lock, cv for kickstarting jobs
+        self.lock = threading.Lock()
+        self.cv = threading.Condition(self.lock)
+        # cv for job cancelation
+        self.waiting = False
+        self.cancel_cv = threading.Condition(self.lock)
+        self.nr_concurrent_jobs = nr_concurrent_jobs
+
+        self.threads = []
+        for i in range(nr_concurrent_jobs):
+            self.threads.append(JobThread(self, volume_client, name="{0}.{1}".format(name_pfx, i)))
+            self.threads[-1].start()
+
+    def reconfigure_max_concurrent_clones(self, name_pfx, nr_concurrent_jobs):
+        """
+        reconfigure number of cloner threads
+        """
+        with self.lock:
+            self.nr_concurrent_jobs = nr_concurrent_jobs
+            # Decrease in concurrency. Notify threads which are waiting for a job to terminate.
+            if len(self.threads) > nr_concurrent_jobs:
+                self.cv.notifyAll()
+            # Increase in concurrency
+            if len(self.threads) < nr_concurrent_jobs:
+                for i in range(len(self.threads), nr_concurrent_jobs):
+                    self.threads.append(JobThread(self, self.vc, name="{0}.{1}.{2}".format(name_pfx, time.time(), i)))
+                    self.threads[-1].start()
+
+    def get_job(self):
+        log.debug("processing {0} volume entries".format(len(self.q)))
+        nr_vols = len(self.q)
+        to_remove = []
+        next_job = None
+        while nr_vols > 0:
+            volname = self.q[0]
+            # do this now so that the other thread pick up jobs for other volumes
+            self.q.rotate(1)
+            running_jobs = [j[0] for j in self.jobs[volname]]
+            (ret, job) = self.get_next_job(volname, running_jobs)
+            if job:
+                next_job = (volname, job)
+                break
+            # this is an optimization when for a given volume there are no more
+            # jobs and no jobs are in progress. in such cases we remove the volume
+            # from the tracking list so as to:
+            #
+            # a. not query the filesystem for jobs over and over again
+            # b. keep the filesystem connection idle so that it can be freed
+            #    from the connection pool
+            #
+            # if at all there are jobs for a volume, the volume gets added again
+            # to the tracking list and the jobs get kickstarted.
+            # note that, we do not iterate the volume list fully if there is a
+            # jobs to process (that will take place eventually).
+            if ret == 0 and not job and not running_jobs:
+                to_remove.append(volname)
+            nr_vols -= 1
+        for vol in to_remove:
+            log.debug("auto removing volume '{0}' from tracked volumes".format(vol))
+            self.q.remove(vol)
+            self.jobs.pop(vol)
+        return next_job
+
+    def register_async_job(self, volname, job, thread_id):
+        log.debug("registering async job {0}.{1} with thread {2}".format(volname, job, thread_id))
+        self.jobs[volname].append((job, thread_id))
+
+    def unregister_async_job(self, volname, job, thread_id):
+        log.debug("unregistering async job {0}.{1} from thread {2}".format(volname, job, thread_id))
+        self.jobs[volname].remove((job, thread_id))
+
+        cancelled = thread_id.should_cancel()
+        thread_id.reset_cancel()
+
+        # wake up cancellation waiters if needed
+        if cancelled:
+            logging.info("waking up cancellation waiters")
+            self.cancel_cv.notifyAll()
+
+    def queue_job(self, volname):
+        """
+        queue a volume for asynchronous job execution.
+        """
+        log.info("queuing job for volume '{0}'".format(volname))
+        with self.lock:
+            if not volname in self.q:
+                self.q.append(volname)
+                self.jobs[volname] = []
+            self.cv.notifyAll()
+
+    def _cancel_jobs(self, volname):
+        """
+        cancel all jobs for the volume. do nothing is the no jobs are
+        executing for the given volume. this would wait until all jobs
+        get interrupted and finish execution.
+        """
+        log.info("cancelling jobs for volume '{0}'".format(volname))
+        try:
+            if not volname in self.q and not volname in self.jobs:
+                return
+            self.q.remove(volname)
+            # cancel in-progress operation and wait until complete
+            for j in self.jobs[volname]:
+                j[1].cancel_job()
+            # wait for cancellation to complete
+            while self.jobs[volname]:
+                log.debug("waiting for {0} in-progress jobs for volume '{1}' to " \
+                          "cancel".format(len(self.jobs[volname]), volname))
+                self.cancel_cv.wait()
+            self.jobs.pop(volname)
+        except (KeyError, ValueError):
+            pass
+
+    def _cancel_job(self, volname, job):
+        """
+        cancel a executing job for a given volume. return True if canceled, False
+        otherwise (volume/job not found).
+        """
+        canceled = False
+        log.info("canceling job {0} for volume {1}".format(job, volname))
+        try:
+            if not volname in self.q and not volname in self.jobs and not job in self.jobs[volname]:
+                return canceled
+            for j in self.jobs[volname]:
+                if j[0] == job:
+                    j[1].cancel_job()
+                    # be safe against _cancel_jobs() running concurrently
+                    while j in self.jobs.get(volname, []):
+                        self.cancel_cv.wait()
+                    canceled = True
+                    break
+        except (KeyError, ValueError):
+            pass
+        return canceled
+
+    def cancel_job(self, volname, job):
+        with self.lock:
+            return self._cancel_job(volname, job)
+
+    def cancel_jobs(self, volname):
+        """
+        cancel all executing jobs for a given volume.
+        """
+        with self.lock:
+            self._cancel_jobs(volname)
+
+    def cancel_all_jobs(self):
+        """
+        call all executing jobs for all volumes.
+        """
+        with self.lock:
+            for volname in list(self.q):
+                self._cancel_jobs(volname)
+
+    def get_next_job(self, volname, running_jobs):
+        """
+        get the next job for asynchronous execution as (retcode, job) tuple. if no
+        jobs are available return (0, None) else return (0, job). on error return
+        (-ret, None). called under `self.lock`.
+        """
+        raise NotImplementedException()
+
+    def execute_job(self, volname, job, should_cancel):
+        """
+        execute a job for a volume. the job can block on I/O operations, sleep for long
+        hours and do all kinds of synchronous work. called outside `self.lock`.
+        """
+        raise NotImplementedException()
+
diff --git a/src/pybind/mgr/volumes/fs/exception.py b/src/pybind/mgr/volumes/fs/exception.py
new file mode 100644
index 00000000..4f903b99
--- /dev/null
+++ b/src/pybind/mgr/volumes/fs/exception.py
@@ -0,0 +1,63 @@
+class VolumeException(Exception):
+    def __init__(self, error_code, error_message):
+        self.errno = error_code
+        self.error_str = error_message
+
+    def to_tuple(self):
+        return self.errno, "", self.error_str
+
+    def __str__(self):
+        return "{0} ({1})".format(self.errno, self.error_str)
+
+class MetadataMgrException(Exception):
+    def __init__(self, error_code, error_message):
+        self.errno = error_code
+        self.error_str = error_message
+
+    def __str__(self):
+        return "{0} ({1})".format(self.errno, self.error_str)
+
+class IndexException(Exception):
+    def __init__(self, error_code, error_message):
+        self.errno = error_code
+        self.error_str = error_message
+
+    def __str__(self):
+        return "{0} ({1})".format(self.errno, self.error_str)
+
+class OpSmException(Exception):
+    def __init__(self, error_code, error_message):
+        self.errno = error_code
+        self.error_str = error_message
+
+    def __str__(self):
+        return "{0} ({1})".format(self.errno, self.error_str)
+
+class NotImplementedException(Exception):
+    pass
+
+class ClusterTimeout(Exception):
+    """
+    Exception indicating that we timed out trying to talk to the Ceph cluster,
+    either to the mons, or to any individual daemon that the mons indicate ought
+    to be up but isn't responding to us.
+    """
+    pass
+
+class ClusterError(Exception):
+    """
+    Exception indicating that the cluster returned an error to a command that
+    we thought should be successful based on our last knowledge of the cluster
+    state.
+    """
+    def __init__(self, action, result_code, result_str):
+        self._action = action
+        self._result_code = result_code
+        self._result_str = result_str
+
+    def __str__(self):
+        return "Error {0} (\"{1}\") while {2}".format(
+            self._result_code, self._result_str, self._action)
+
+class EvictionError(Exception):
+    pass
diff --git a/src/pybind/mgr/volumes/fs/fs_util.py b/src/pybind/mgr/volumes/fs/fs_util.py
new file mode 100644
index 00000000..6fe02f58
--- /dev/null
+++ b/src/pybind/mgr/volumes/fs/fs_util.py
@@ -0,0 +1,161 @@
+import os
+import errno
+import logging
+
+import cephfs
+import orchestrator
+
+from .exception import VolumeException
+
+log = logging.getLogger(__name__)
+
+def create_pool(mgr, pool_name, pg_num):
+    # create the given pool
+    command = {'prefix': 'osd pool create', 'pool': pool_name, 'pg_num': pg_num}
+    return mgr.mon_command(command)
+
+def remove_pool(mgr, pool_name):
+    command = {'prefix': 'osd pool rm', 'pool': pool_name, 'pool2': pool_name,
+               'yes_i_really_really_mean_it': True}
+    return mgr.mon_command(command)
+
+def create_filesystem(mgr, fs_name, metadata_pool, data_pool):
+    command = {'prefix': 'fs new', 'fs_name': fs_name, 'metadata': metadata_pool,
+               'data': data_pool}
+    return mgr.mon_command(command)
+
+def remove_filesystem(mgr, fs_name):
+    command = {'prefix': 'fs fail', 'fs_name': fs_name}
+    r, outb, outs = mgr.mon_command(command)
+    if r != 0:
+        return r, outb, outs
+
+    command = {'prefix': 'fs rm', 'fs_name': fs_name, 'yes_i_really_mean_it': True}
+    return mgr.mon_command(command)
+
+def create_mds(mgr, fs_name):
+    spec = orchestrator.StatelessServiceSpec()
+    spec.name = fs_name
+    try:
+        completion = mgr.add_stateless_service("mds", spec)
+        mgr._orchestrator_wait([completion])
+        orchestrator.raise_if_exception(completion)
+    except (ImportError, orchestrator.OrchestratorError):
+        return 0, "", "Volume created successfully (no MDS daemons created)"
+    except Exception as e:
+        # Don't let detailed orchestrator exceptions (python backtraces)
+        # bubble out to the user
+        log.exception("Failed to create MDS daemons")
+        return -errno.EINVAL, "", str(e)
+    return 0, "", ""
+
+def volume_exists(mgr, fs_name):
+    fs_map = mgr.get('fs_map')
+    for fs in fs_map['filesystems']:
+        if fs['mdsmap']['fs_name'] == fs_name:
+            return True
+    return False
+
+def listdir(fs, dirpath):
+    """
+    Get the directory names (only dirs) for a given path
+    """
+    dirs = []
+    try:
+        with fs.opendir(dirpath) as dir_handle:
+            d = fs.readdir(dir_handle)
+            while d:
+                if (d.d_name not in (b".", b"..")) and d.is_dir():
+                    dirs.append(d.d_name)
+                d = fs.readdir(dir_handle)
+    except cephfs.Error as e:
+        raise VolumeException(-e.args[0], e.args[1])
+    return dirs
+
+def is_inherited_snap(snapname):
+    """
+    Returns True if the snapname is inherited else False
+    """
+    return snapname.startswith("_")
+
+def listsnaps(fs, volspec, snapdirpath, filter_inherited_snaps=False):
+    """
+    Get the snap names from a given snap directory path
+    """
+    if os.path.basename(snapdirpath) != volspec.snapshot_prefix.encode('utf-8'):
+        raise VolumeException(-errno.EINVAL, "Not a snap directory: {0}".format(snapdirpath))
+    snaps = []
+    try:
+        with fs.opendir(snapdirpath) as dir_handle:
+            d = fs.readdir(dir_handle)
+            while d:
+                if (d.d_name not in (b".", b"..")) and d.is_dir():
+                    d_name = d.d_name.decode('utf-8')
+                    if not is_inherited_snap(d_name):
+                        snaps.append(d.d_name)
+                    elif is_inherited_snap(d_name) and not filter_inherited_snaps:
+                        snaps.append(d.d_name)
+                d = fs.readdir(dir_handle)
+    except cephfs.Error as e:
+        raise VolumeException(-e.args[0], e.args[1])
+    return snaps
+
+def list_one_entry_at_a_time(fs, dirpath):
+    """
+    Get a directory entry (one entry a time)
+    """
+    try:
+        with fs.opendir(dirpath) as dir_handle:
+            d = fs.readdir(dir_handle)
+            while d:
+                if d.d_name not in (b".", b".."):
+                    yield d
+                d = fs.readdir(dir_handle)
+    except cephfs.Error as e:
+        raise VolumeException(-e.args[0], e.args[1])
+
+def copy_file(fs, src, dst, mode, cancel_check=None):
+    """
+    Copy a regular file from @src to @dst. @dst is overwritten if it exists.
+    """
+    src_fd = dst_fd = None
+    try:
+        src_fd = fs.open(src, os.O_RDONLY);
+        dst_fd = fs.open(dst, os.O_CREAT | os.O_TRUNC | os.O_WRONLY, mode)
+    except cephfs.Error as e:
+        if src_fd is not None:
+            fs.close(src_fd)
+        if dst_fd is not None:
+            fs.close(dst_fd)
+        raise VolumeException(-e.args[0], e.args[1])
+
+    IO_SIZE = 8 * 1024 * 1024
+    try:
+        while True:
+            if cancel_check and cancel_check():
+                raise VolumeException(-errno.EINTR, "copy operation interrupted")
+            data = fs.read(src_fd, -1, IO_SIZE)
+            if not len(data):
+                break
+            written = 0
+            while written < len(data):
+                written += fs.write(dst_fd, data[written:], -1)
+        fs.fsync(dst_fd, 0)
+    except cephfs.Error as e:
+        raise VolumeException(-e.args[0], e.args[1])
+    finally:
+        fs.close(src_fd)
+        fs.close(dst_fd)
+
+def get_ancestor_xattr(fs, path, attr):
+    """
+    Helper for reading layout information: if this xattr is missing
+    on the requested path, keep checking parents until we find it.
+    """
+    try:
+        return fs.getxattr(path, attr).decode('utf-8')
+    except cephfs.NoData as e:
+        if path == "/":
+            raise VolumeException(-e.args[0], e.args[1])
+        else:
+            return get_ancestor_xattr(fs, os.path.split(path)[0], attr)
diff --git a/src/pybind/mgr/volumes/fs/operations/__init__.py b/src/pybind/mgr/volumes/fs/operations/__init__.py
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/src/pybind/mgr/volumes/fs/operations/__init__.py
diff --git a/src/pybind/mgr/volumes/fs/operations/access.py b/src/pybind/mgr/volumes/fs/operations/access.py
new file mode 100644
index 00000000..44430f59
--- /dev/null
+++ b/src/pybind/mgr/volumes/fs/operations/access.py
@@ -0,0 +1,142 @@
+import errno
+import json
+try:
+    from typing import List
+except ImportError:
+    pass # For typing only
+
+def prepare_updated_caps_list(existing_caps, mds_cap_str, osd_cap_str, authorize=True):
+    caps_list = [] # type: List[str]
+    for k, v in existing_caps['caps'].items():
+        if k == 'mds' or k == 'osd':
+            continue
+        elif k == 'mon':
+            if not authorize and v == 'allow r':
+                continue
+        caps_list.extend((k,v))
+
+    if mds_cap_str:
+        caps_list.extend(('mds', mds_cap_str))
+    if osd_cap_str:
+        caps_list.extend(('osd', osd_cap_str))
+
+    if authorize and 'mon' not in caps_list:
+        caps_list.extend(('mon', 'allow r'))
+
+    return caps_list
+
+
+def allow_access(mgr, client_entity, want_mds_cap, want_osd_cap,
+                 unwanted_mds_cap, unwanted_osd_cap, existing_caps):
+    if existing_caps is None:
+        ret, out, err = mgr.mon_command({
+            "prefix": "auth get-or-create",
+            "entity": client_entity,
+            "caps": ['mds',  want_mds_cap, 'osd', want_osd_cap, 'mon', 'allow r'],
+            "format": "json"})
+    else:
+        cap = existing_caps[0]
+
+        def cap_update(
+                orig_mds_caps, orig_osd_caps, want_mds_cap,
+                want_osd_cap, unwanted_mds_cap, unwanted_osd_cap):
+
+            if not orig_mds_caps:
+                return want_mds_cap, want_osd_cap
+
+            mds_cap_tokens = [x.strip() for x in orig_mds_caps.split(",")]
+            osd_cap_tokens = [x.strip() for x in orig_osd_caps.split(",")]
+
+            if want_mds_cap in mds_cap_tokens:
+                return orig_mds_caps, orig_osd_caps
+
+            if unwanted_mds_cap in mds_cap_tokens:
+                mds_cap_tokens.remove(unwanted_mds_cap)
+                osd_cap_tokens.remove(unwanted_osd_cap)
+
+            mds_cap_tokens.append(want_mds_cap)
+            osd_cap_tokens.append(want_osd_cap)
+
+            return ",".join(mds_cap_tokens), ",".join(osd_cap_tokens)
+
+        orig_mds_caps = cap['caps'].get('mds', "")
+        orig_osd_caps = cap['caps'].get('osd', "")
+
+        mds_cap_str, osd_cap_str = cap_update(
+            orig_mds_caps, orig_osd_caps, want_mds_cap, want_osd_cap,
+            unwanted_mds_cap, unwanted_osd_cap)
+
+        caps_list = prepare_updated_caps_list(cap, mds_cap_str, osd_cap_str)
+        mgr.mon_command(
+            {
+                "prefix": "auth caps",
+                'entity': client_entity,
+                'caps': caps_list
+            })
+        ret, out, err = mgr.mon_command(
+            {
+                'prefix': 'auth get',
+                'entity': client_entity,
+                'format': 'json'
+            })
+
+    # Result expected like this:
+    # [
+    #     {
+    #         "entity": "client.foobar",
+    #         "key": "AQBY0\/pViX\/wBBAAUpPs9swy7rey1qPhzmDVGQ==",
+    #         "caps": {
+    #             "mds": "allow *",
+    #             "mon": "allow *"
+    #         }
+    #     }
+    # ]
+
+    caps = json.loads(out)
+    assert len(caps) == 1
+    assert caps[0]['entity'] == client_entity
+    return caps[0]['key']
+
+def deny_access(mgr, client_entity, want_mds_caps, want_osd_caps):
+    ret, out, err = mgr.mon_command({
+        "prefix": "auth get",
+        "entity": client_entity,
+        "format": "json",
+    })
+
+    if ret == -errno.ENOENT:
+        # Already gone, great.
+        return
+
+    def cap_remove(orig_mds_caps, orig_osd_caps, want_mds_caps, want_osd_caps):
+        mds_cap_tokens = [x.strip() for x in orig_mds_caps.split(",")]
+        osd_cap_tokens = [x.strip() for x in orig_osd_caps.split(",")]
+
+        for want_mds_cap, want_osd_cap in zip(want_mds_caps, want_osd_caps):
+            if want_mds_cap in mds_cap_tokens:
+                mds_cap_tokens.remove(want_mds_cap)
+                osd_cap_tokens.remove(want_osd_cap)
+                break
+
+        return ",".join(mds_cap_tokens), ",".join(osd_cap_tokens)
+
+    cap = json.loads(out)[0]
+    orig_mds_caps = cap['caps'].get('mds', "")
+    orig_osd_caps = cap['caps'].get('osd', "")
+    mds_cap_str, osd_cap_str = cap_remove(orig_mds_caps, orig_osd_caps,
+                                          want_mds_caps, want_osd_caps)
+
+    caps_list = prepare_updated_caps_list(cap, mds_cap_str, osd_cap_str, authorize=False)
+    if not caps_list:
+        mgr.mon_command(
+            {
+                'prefix': 'auth rm',
+                'entity': client_entity
+            })
+    else:
+        mgr.mon_command(
+            {
+                "prefix": "auth caps",
+                'entity': client_entity,
+                'caps': caps_list
+            })
diff --git a/src/pybind/mgr/volumes/fs/operations/clone_index.py b/src/pybind/mgr/volumes/fs/operations/clone_index.py
new file mode 100644
index 00000000..a2b31f85
--- /dev/null
+++ b/src/pybind/mgr/volumes/fs/operations/clone_index.py
@@ -0,0 +1,98 @@
+import os
+import uuid
+import stat
+import errno
+import logging
+from contextlib import contextmanager
+
+import cephfs
+
+from .index import Index
+from ..exception import IndexException, VolumeException
+from ..fs_util import list_one_entry_at_a_time
+
+log = logging.getLogger(__name__)
+
+class CloneIndex(Index):
+    SUB_GROUP_NAME = "clone"
+    PATH_MAX = 4096
+
+    @property
+    def path(self):
+        return os.path.join(super(CloneIndex, self).path, CloneIndex.SUB_GROUP_NAME.encode('utf-8'))
+
+    def _track(self, sink_path):
+        tracking_id = str(uuid.uuid4())
+        source_path = os.path.join(self.path, tracking_id.encode('utf-8'))
+        log.info("tracking-id {0} for path {1}".format(tracking_id, sink_path))
+
+        self.fs.symlink(sink_path, source_path)
+        return tracking_id
+
+    def track(self, sink_path):
+        try:
+            return self._track(sink_path)
+        except (VolumeException, cephfs.Error) as e:
+            if isinstance(e, cephfs.Error):
+                e = IndexException(-e.args[0], e.args[1])
+            elif isinstance(e, VolumeException):
+                e = IndexException(e.errno, e.error_str)
+            raise e
+
+    def untrack(self, tracking_id):
+        log.info("untracking {0}".format(tracking_id))
+        source_path = os.path.join(self.path, tracking_id.encode('utf-8'))
+        try:
+            self.fs.unlink(source_path)
+        except cephfs.Error as e:
+            raise IndexException(-e.args[0], e.args[1])
+
+    def get_oldest_clone_entry(self, exclude=[]):
+        min_ctime_entry = None
+        exclude_tracking_ids = [v[0] for v in exclude]
+        log.debug("excluded tracking ids: {0}".format(exclude_tracking_ids))
+        for entry in list_one_entry_at_a_time(self.fs, self.path):
+            dname = entry.d_name
+            dpath = os.path.join(self.path, dname)
+            st = self.fs.lstat(dpath)
+            if dname not in exclude_tracking_ids and stat.S_ISLNK(st.st_mode):
+                if min_ctime_entry is None or st.st_ctime < min_ctime_entry[1].st_ctime:
+                    min_ctime_entry = (dname, st)
+        if min_ctime_entry:
+            try:
+                linklen = min_ctime_entry[1].st_size
+                sink_path = self.fs.readlink(os.path.join(self.path, min_ctime_entry[0]), CloneIndex.PATH_MAX)
+                return (min_ctime_entry[0], sink_path[:linklen])
+            except cephfs.Error as e:
+                raise IndexException(-e.args[0], e.args[1])
+        return None
+
+    def find_clone_entry_index(self, sink_path):
+        try:
+            for entry in list_one_entry_at_a_time(self.fs, self.path):
+                dname = entry.d_name
+                dpath = os.path.join(self.path, dname)
+                st = self.fs.lstat(dpath)
+                if stat.S_ISLNK(st.st_mode):
+                    target_path = self.fs.readlink(dpath, CloneIndex.PATH_MAX)
+                    if sink_path == target_path[:st.st_size]:
+                        return dname
+            return None
+        except cephfs.Error as e:
+            raise IndexException(-e.args[0], e.args[1])
+
+def create_clone_index(fs, vol_spec):
+    clone_index = CloneIndex(fs, vol_spec)
+    try:
+        fs.mkdirs(clone_index.path, 0o700)
+    except cephfs.Error as e:
+        raise IndexException(-e.args[0], e.args[1])
+
+@contextmanager
+def open_clone_index(fs, vol_spec):
+    clone_index = CloneIndex(fs, vol_spec)
+    try:
+        fs.stat(clone_index.path)
+    except cephfs.Error as e:
+        raise IndexException(-e.args[0], e.args[1])
+    yield clone_index
diff --git a/src/pybind/mgr/volumes/fs/operations/group.py b/src/pybind/mgr/volumes/fs/operations/group.py
new file mode 100644
index 00000000..ae334563
--- /dev/null
+++ b/src/pybind/mgr/volumes/fs/operations/group.py
@@ -0,0 +1,186 @@
+import os
+import errno
+import logging
+from contextlib import contextmanager
+
+import cephfs
+
+from .snapshot_util import mksnap, rmsnap
+from .template import GroupTemplate
+from ..fs_util import listdir, listsnaps, get_ancestor_xattr
+from ..exception import VolumeException
+
+log = logging.getLogger(__name__)
+
+class Group(GroupTemplate):
+    # Reserved subvolume group name which we use in paths for subvolumes
+    # that are not assigned to a group (i.e. created with group=None)
+    NO_GROUP_NAME = "_nogroup"
+
+    def __init__(self, fs, vol_spec, groupname):
+        assert groupname != Group.NO_GROUP_NAME
+        self.fs = fs
+        self.user_id = None
+        self.group_id = None
+        self.vol_spec = vol_spec
+        self.groupname = groupname if groupname else Group.NO_GROUP_NAME
+
+    @property
+    def path(self):
+        return os.path.join(self.vol_spec.base_dir.encode('utf-8'), self.groupname.encode('utf-8'))
+
+    @property
+    def group_name(self):
+        return self.groupname
+
+    @property
+    def uid(self):
+        return self.user_id
+
+    @uid.setter
+    def uid(self, val):
+        self.user_id = val
+
+    @property
+    def gid(self):
+        return self.group_id
+
+    @gid.setter
+    def gid(self, val):
+        self.group_id = val
+
+    def is_default_group(self):
+        return self.groupname == Group.NO_GROUP_NAME
+
+    def list_subvolumes(self):
+        try:
+            return listdir(self.fs, self.path)
+        except VolumeException as ve:
+            # listing a default group when it's not yet created
+            if ve.errno == -errno.ENOENT and self.is_default_group():
+                return []
+            raise
+
+    def create_snapshot(self, snapname):
+        snappath = os.path.join(self.path,
+                                self.vol_spec.snapshot_dir_prefix.encode('utf-8'),
+                                snapname.encode('utf-8'))
+        mksnap(self.fs, snappath)
+
+    def remove_snapshot(self, snapname):
+        snappath = os.path.join(self.path,
+                                self.vol_spec.snapshot_dir_prefix.encode('utf-8'),
+                                snapname.encode('utf-8'))
+        rmsnap(self.fs, snappath)
+
+    def list_snapshots(self):
+        try:
+            dirpath = os.path.join(self.path,
+                                   self.vol_spec.snapshot_dir_prefix.encode('utf-8'))
+            return listsnaps(self.fs, self.vol_spec, dirpath, filter_inherited_snaps=True)
+        except VolumeException as ve:
+            if ve.errno == -errno.ENOENT:
+                return []
+            raise
+
+def create_group(fs, vol_spec, groupname, pool, mode, uid, gid):
+    """
+    create a subvolume group.
+
+    :param fs: ceph filesystem handle
+    :param vol_spec: volume specification
+    :param groupname: subvolume group name
+    :param pool: the RADOS pool where the data objects of the subvolumes will be stored
+    :param mode: the user permissions
+    :param uid: the user identifier
+    :param gid: the group identifier
+    :return: None
+    """
+    group = Group(fs, vol_spec, groupname)
+    path = group.path
+    fs.mkdirs(path, mode)
+    try:
+        if not pool:
+            pool = get_ancestor_xattr(fs, path, "ceph.dir.layout.pool")
+        try:
+            fs.setxattr(path, 'ceph.dir.layout.pool', pool.encode('utf-8'), 0)
+        except cephfs.InvalidValue:
+            raise VolumeException(-errno.EINVAL,
+                                  "Invalid pool layout '{0}'. It must be a valid data pool".format(pool))
+        if uid is None:
+            uid = 0
+        else:
+            try:
+                uid = int(uid)
+                if uid < 0:
+                    raise ValueError
+            except ValueError:
+                raise VolumeException(-errno.EINVAL, "invalid UID")
+        if gid is None:
+            gid = 0
+        else:
+            try:
+                gid = int(gid)
+                if gid < 0:
+                    raise ValueError
+            except ValueError:
+                raise VolumeException(-errno.EINVAL, "invalid GID")
+        fs.chown(path, uid, gid)
+    except (cephfs.Error, VolumeException) as e:
+        try:
+            # cleanup group path on best effort basis
+            log.debug("cleaning up subvolume group path: {0}".format(path))
+            fs.rmdir(path)
+        except cephfs.Error as ce:
+            log.debug("failed to clean up subvolume group {0} with path: {1} ({2})".format(groupname, path, ce))
+        if isinstance(e, cephfs.Error):
+            e = VolumeException(-e.args[0], e.args[1])
+        raise e
+
+def remove_group(fs, vol_spec, groupname):
+    """
+    remove a subvolume group.
+
+    :param fs: ceph filesystem handle
+    :param vol_spec: volume specification
+    :param groupname: subvolume group name
+    :return: None
+    """
+    group = Group(fs, vol_spec, groupname)
+    try:
+        fs.rmdir(group.path)
+    except cephfs.Error as e:
+        if e.args[0] == errno.ENOENT:
+            raise VolumeException(-errno.ENOENT, "subvolume group '{0}' does not exist".format(groupname))
+        raise VolumeException(-e.args[0], e.args[1])
+
+@contextmanager
+def open_group(fs, vol_spec, groupname):
+    """
+    open a subvolume group. This API is to be used as a context manager.
+
+    :param fs: ceph filesystem handle
+    :param vol_spec: volume specification
+    :param groupname: subvolume group name
+    :return: yields a group object (subclass of GroupTemplate)
+    """
+    group = Group(fs, vol_spec, groupname)
+    try:
+        st = fs.stat(group.path)
+        group.uid = int(st.st_uid)
+        group.gid = int(st.st_gid)
+    except cephfs.Error as e:
+        if e.args[0] == errno.ENOENT:
+            if not group.is_default_group():
+                raise VolumeException(-errno.ENOENT, "subvolume group '{0}' does not exist".format(groupname))
+        else:
+            raise VolumeException(-e.args[0], e.args[1])
+    yield group
+
+@contextmanager
+def open_group_unique(fs, vol_spec, groupname, c_group, c_groupname):
+    if groupname == c_groupname:
+        yield c_group
+    else:
+        with open_group(fs, vol_spec, groupname) as group:
+            yield group
diff --git a/src/pybind/mgr/volumes/fs/operations/index.py b/src/pybind/mgr/volumes/fs/operations/index.py
new file mode 100644
index 00000000..0e4296d7
--- /dev/null
+++ b/src/pybind/mgr/volumes/fs/operations/index.py
@@ -0,0 +1,23 @@
+import errno
+import os
+
+from ..exception import VolumeException
+from .template import GroupTemplate
+
+class Index(GroupTemplate):
+    GROUP_NAME = "_index"
+
+    def __init__(self, fs, vol_spec):
+        self.fs = fs
+        self.vol_spec = vol_spec
+        self.groupname = Index.GROUP_NAME
+
+    @property
+    def path(self):
+        return os.path.join(self.vol_spec.base_dir.encode('utf-8'), self.groupname.encode('utf-8'))
+
+    def track(self, *args):
+        raise VolumeException(-errno.EINVAL, "operation not supported.")
+
+    def untrack(self, tracking_id):
+        raise VolumeException(-errno.EINVAL, "operation not supported.")
diff --git a/src/pybind/mgr/volumes/fs/operations/lock.py b/src/pybind/mgr/volumes/fs/operations/lock.py
new file mode 100644
index 00000000..ab5f1d04
--- /dev/null
+++ b/src/pybind/mgr/volumes/fs/operations/lock.py
@@ -0,0 +1,42 @@
+from contextlib import contextmanager
+import logging
+from threading import Lock
+
+log = logging.getLogger(__name__)
+
+# singleton design pattern taken from http://www.aleax.it/5ep.html
+
+class GlobalLock(object):
+    """
+    Global lock to serialize operations in mgr/volumes. This lock
+    is currently held when accessing (opening) a volume to perform
+    group/subvolume operations. Since this is a big lock, it's rather
+    inefficient -- but right now it's ok since mgr/volumes does not
+    expect concurrent operations via its APIs.
+
+    As and when features get added (such as clone, where mgr/volumes
+    would maintain subvolume states in the filesystem), there might
+    be a need to allow concurrent operations. In that case it would
+    be nice to implement an efficient path based locking mechanism.
+
+    See: https://people.eecs.berkeley.edu/~kubitron/courses/cs262a-F14/projects/reports/project6_report.pdf
+    """
+    _shared_state = {
+        'lock' : Lock(),
+        'init' : False
+    }
+
+    def __init__(self):
+        with self._shared_state['lock']:
+            if not self._shared_state['init']:
+                self._shared_state['init'] = True
+        # share this state among all instances
+        self.__dict__ = self._shared_state
+
+    @contextmanager
+    def lock_op(self):
+        log.debug("entering global lock")
+        with self._shared_state['lock']:
+            log.debug("acquired global lock")
+            yield
+        log.debug("exited global lock")
diff --git a/src/pybind/mgr/volumes/fs/operations/rankevicter.py b/src/pybind/mgr/volumes/fs/operations/rankevicter.py
new file mode 100644
index 00000000..5b945c38
--- /dev/null
+++ b/src/pybind/mgr/volumes/fs/operations/rankevicter.py
@@ -0,0 +1,114 @@
+import errno
+import json
+import logging
+import threading
+import time
+
+from .volume import get_mds_map
+from ..exception import ClusterTimeout, ClusterError
+
+log = logging.getLogger(__name__)
+
+class RankEvicter(threading.Thread):
+    """
+    Thread for evicting client(s) from a particular MDS daemon instance.
+
+    This is more complex than simply sending a command, because we have to
+    handle cases where MDS daemons might not be fully up yet, and/or might
+    be transiently unresponsive to commands.
+    """
+    class GidGone(Exception):
+        pass
+
+    POLL_PERIOD = 5
+
+    def __init__(self, mgr, fs, client_spec, volname, rank, gid, mds_map, ready_timeout):
+        """
+        :param client_spec: list of strings, used as filter arguments to "session evict"
+                            pass ["id=123"] to evict a single client with session id 123.
+        """
+        self.volname = volname
+        self.rank = rank
+        self.gid = gid
+        self._mds_map = mds_map
+        self._client_spec = client_spec
+        self._fs = fs
+        self._ready_timeout = ready_timeout
+        self._ready_waited = 0
+        self.mgr = mgr
+
+        self.success = False
+        self.exception = None
+
+        super(RankEvicter, self).__init__()
+
+    def _ready_to_evict(self):
+        if self._mds_map['up'].get("mds_{0}".format(self.rank), None) != self.gid:
+            log.info("Evicting {0} from {1}/{2}: rank no longer associated with gid, done.".format(
+                self._client_spec, self.rank, self.gid
+            ))
+            raise RankEvicter.GidGone()
+
+        info = self._mds_map['info']["gid_{0}".format(self.gid)]
+        log.debug("_ready_to_evict: state={0}".format(info['state']))
+        return info['state'] in ["up:active", "up:clientreplay"]
+
+    def _wait_for_ready(self):
+        """
+        Wait for that MDS rank to reach an active or clientreplay state, and
+        not be laggy.
+        """
+        while not self._ready_to_evict():
+            if self._ready_waited > self._ready_timeout:
+                raise ClusterTimeout()
+
+            time.sleep(self.POLL_PERIOD)
+            self._ready_waited += self.POLL_PERIOD
+            self._mds_map = get_mds_map(self.mgr, self.volname)
+
+    def _evict(self):
+        """
+        Run the eviction procedure.  Return true on success, false on errors.
+        """
+
+        # Wait til the MDS is believed by the mon to be available for commands
+        try:
+            self._wait_for_ready()
+        except self.GidGone:
+            return True
+
+        # Then send it an evict
+        ret = -errno.ETIMEDOUT
+        while ret == -errno.ETIMEDOUT:
+            log.debug("mds_command: {0}, {1}".format(
+                "%s" % self.gid, ["session", "evict"] + self._client_spec
+            ))
+            ret, outb, outs = self._fs.mds_command(
+                "%s" % self.gid,
+                json.dumps({
+                                "prefix": "session evict",
+                                "filters": self._client_spec
+                }), "")
+            log.debug("mds_command: complete {0} {1}".format(ret, outs))
+
+            # If we get a clean response, great, it's gone from that rank.
+            if ret == 0:
+                return True
+            elif ret == -errno.ETIMEDOUT:
+                # Oh no, the MDS went laggy (that's how libcephfs knows to emit this error)
+                self._mds_map = get_mds_map(self.mgr, self.volname)
+                try:
+                    self._wait_for_ready()
+                except self.GidGone:
+                    return True
+            else:
+                raise ClusterError("Sending evict to mds.{0}".format(self.gid), ret, outs)
+
+    def run(self):
+        try:
+            self._evict()
+        except Exception as e:
+            self.success = False
+            self.exception = e
+        else:
+            self.success = True
diff --git a/src/pybind/mgr/volumes/fs/operations/resolver.py b/src/pybind/mgr/volumes/fs/operations/resolver.py
new file mode 100644
index 00000000..a9543654
--- /dev/null
+++ b/src/pybind/mgr/volumes/fs/operations/resolver.py
@@ -0,0 +1,26 @@
+import os
+
+from .group import Group
+
+def splitall(path):
+    if path == "/":
+        return ["/"]
+    s = os.path.split(path)
+    return splitall(s[0]) + [s[1]]
+
+def resolve(vol_spec, path):
+    parts = splitall(path)
+    if len(parts) != 4 or os.path.join(parts[0], parts[1]) != vol_spec.subvolume_prefix:
+        return None
+    groupname = None if parts[2] == Group.NO_GROUP_NAME else parts[2]
+    subvolname = parts[3]
+    return (groupname, subvolname)
+
+def resolve_trash(vol_spec, path):
+    parts = splitall(path)
+    if len(parts) != 6 or os.path.join(parts[0], parts[1]) != vol_spec.subvolume_prefix or \
+       parts[4] != '.trash':
+        return None
+    groupname = None if parts[2] == Group.NO_GROUP_NAME else parts[2]
+    subvolname = parts[3]
+    return (groupname, subvolname)
diff --git a/src/pybind/mgr/volumes/fs/operations/snapshot_util.py b/src/pybind/mgr/volumes/fs/operations/snapshot_util.py
new file mode 100644
index 00000000..2223c58e
--- /dev/null
+++ b/src/pybind/mgr/volumes/fs/operations/snapshot_util.py
@@ -0,0 +1,30 @@
+import os
+import errno
+
+import cephfs
+
+from ..exception import VolumeException
+
+def mksnap(fs, snappath):
+    """
+    Create a snapshot, or do nothing if it already exists.
+    """
+    try:
+        # snap create does not accept mode -- use default
+        fs.mkdir(snappath, 0o755)
+    except cephfs.ObjectExists:
+        return
+    except cephfs.Error as e:
+        raise VolumeException(-e.args[0], e.args[1])
+
+def rmsnap(fs, snappath):
+    """
+    Remove a snapshot
+    """
+    try:
+        fs.stat(snappath)
+        fs.rmdir(snappath)
+    except cephfs.ObjectNotFound:
+        raise VolumeException(-errno.ENOENT, "snapshot '{0}' does not exist".format(os.path.basename(snappath)))
+    except cephfs.Error as e:
+        raise VolumeException(-e.args[0], e.args[1])
diff --git a/src/pybind/mgr/volumes/fs/operations/subvolume.py b/src/pybind/mgr/volumes/fs/operations/subvolume.py
new file mode 100644
index 00000000..dc36477b
--- /dev/null
+++ b/src/pybind/mgr/volumes/fs/operations/subvolume.py
@@ -0,0 +1,74 @@
+import os
+import errno
+from contextlib import contextmanager
+
+from ..exception import VolumeException
+from .template import SubvolumeOpType
+
+from .versions import loaded_subvolumes
+
+def create_subvol(mgr, fs, vol_spec, group, subvolname, size, isolate_nspace, pool, mode, uid, gid):
+    """
+    create a subvolume (create a subvolume with the max known version).
+
+    :param fs: ceph filesystem handle
+    :param vol_spec: volume specification
+    :param group: group object for the subvolume
+    :param size: In bytes, or None for no size limit
+    :param isolate_nspace: If true, use separate RADOS namespace for this subvolume
+    :param pool: the RADOS pool where the data objects of the subvolumes will be stored
+    :param mode: the user permissions
+    :param uid: the user identifier
+    :param gid: the group identifier
+    :return: None
+    """
+    subvolume = loaded_subvolumes.get_subvolume_object_max(mgr, fs, vol_spec, group, subvolname)
+    subvolume.create(size, isolate_nspace, pool, mode, uid, gid)
+
+def create_clone(mgr, fs, vol_spec, group, subvolname, pool, source_volume, source_subvolume, snapname):
+    """
+    create a cloned subvolume.
+
+    :param fs: ceph filesystem handle
+    :param vol_spec: volume specification
+    :param group: group object for the clone
+    :param subvolname: clone subvolume nam
+    :param pool: the RADOS pool where the data objects of the cloned subvolume will be stored
+    :param source_volume: source subvolumes volume name
+    :param source_subvolume: source (parent) subvolume object
+    :param snapname: source subvolume snapshot
+    :return None
+    """
+    subvolume = loaded_subvolumes.get_subvolume_object_max(mgr, fs, vol_spec, group, subvolname)
+    subvolume.create_clone(pool, source_volume, source_subvolume, snapname)
+
+def remove_subvol(mgr, fs, vol_spec, group, subvolname, force=False, retainsnaps=False):
+    """
+    remove a subvolume.
+
+    :param fs: ceph filesystem handle
+    :param vol_spec: volume specification
+    :param group: group object for the subvolume
+    :param subvolname: subvolume name
+    :param force: force remove subvolumes
+    :return: None
+    """
+    op_type = SubvolumeOpType.REMOVE if not force else SubvolumeOpType.REMOVE_FORCE
+    with open_subvol(mgr, fs, vol_spec, group, subvolname, op_type) as subvolume:
+        subvolume.remove(retainsnaps)
+
+@contextmanager
+def open_subvol(mgr, fs, vol_spec, group, subvolname, op_type):
+    """
+    open a subvolume. This API is to be used as a context manager.
+
+    :param fs: ceph filesystem handle
+    :param vol_spec: volume specification
+    :param group: group object for the subvolume
+    :param subvolname: subvolume name
+    :param op_type: operation type for which subvolume is being opened
+    :return: yields a subvolume object (subclass of SubvolumeTemplate)
+    """
+    subvolume = loaded_subvolumes.get_subvolume_object(mgr, fs, vol_spec, group, subvolname)
+    subvolume.open(op_type)
+    yield subvolume
diff --git a/src/pybind/mgr/volumes/fs/operations/template.py b/src/pybind/mgr/volumes/fs/operations/template.py
new file mode 100644
index 00000000..d35ad0de
--- /dev/null
+++ b/src/pybind/mgr/volumes/fs/operations/template.py
@@ -0,0 +1,173 @@
+import errno
+
+from enum import Enum, unique
+
+from ..exception import VolumeException
+
+class GroupTemplate(object):
+    def list_subvolumes(self):
+        raise VolumeException(-errno.ENOTSUP, "operation not supported.")
+
+    def create_snapshot(self, snapname):
+        """
+        create a subvolume group snapshot.
+
+        :param: group snapshot name
+        :return: None
+        """
+        raise VolumeException(-errno.ENOTSUP, "operation not supported.")
+
+    def remove_snapshot(self, snapname):
+        """
+        remove a subvolume group snapshot.
+
+        :param: group snapshot name
+        :return: None
+        """
+        raise VolumeException(-errno.ENOTSUP, "operation not supported.")
+
+    def list_snapshots(self):
+        """
+        list all subvolume group snapshots.
+
+        :param: None
+        :return: None
+        """
+        raise VolumeException(-errno.ENOTSUP, "operation not supported.")
+
+@unique
+class SubvolumeOpType(Enum):
+    CREATE          = 'create'
+    REMOVE          = 'rm'
+    REMOVE_FORCE    = 'rm-force'
+    PIN             = 'pin'
+    LIST            = 'ls'
+    GETPATH         = 'getpath'
+    INFO            = 'info'
+    RESIZE          = 'resize'
+    SNAP_CREATE     = 'snap-create'
+    SNAP_REMOVE     = 'snap-rm'
+    SNAP_LIST       = 'snap-ls'
+    SNAP_INFO       = 'snap-info'
+    SNAP_PROTECT    = 'snap-protect'
+    SNAP_UNPROTECT  = 'snap-unprotect'
+    CLONE_SOURCE    = 'clone-source'
+    CLONE_CREATE    = 'clone-create'
+    CLONE_STATUS    = 'clone-status'
+    CLONE_CANCEL    = 'clone-cancel'
+    CLONE_INTERNAL  = 'clone_internal'
+    ALLOW_ACCESS    = 'allow-access'
+    DENY_ACCESS     = 'deny-access'
+    AUTH_LIST       = 'auth-list'
+    EVICT           = 'evict'
+
+class SubvolumeTemplate(object):
+    VERSION = None # type: int
+
+    @staticmethod
+    def version():
+        return SubvolumeTemplate.VERSION
+
+    def open(self, op_type):
+        raise VolumeException(-errno.ENOTSUP, "operation not supported.")
+
+    def status(self):
+        raise VolumeException(-errno.ENOTSUP, "operation not supported.")
+
+    def create(self, size, isolate_nspace, pool, mode, uid, gid):
+        """
+        set up metadata, pools and auth for a subvolume.
+
+        This function is idempotent.  It is safe to call this again
+        for an already-created subvolume, even if it is in use.
+
+        :param size: In bytes, or None for no size limit
+        :param isolate_nspace: If true, use separate RADOS namespace for this subvolume
+        :param pool: the RADOS pool where the data objects of the subvolumes will be stored
+        :param mode: the user permissions
+        :param uid: the user identifier
+        :param gid: the group identifier
+        :return: None
+        """
+        raise VolumeException(-errno.ENOTSUP, "operation not supported.")
+
+    def create_clone(self, pool, source_volname, source_subvolume, snapname):
+        """
+        prepare a subvolume to be cloned.
+
+        :param pool: the RADOS pool where the data objects of the cloned subvolume will be stored
+        :param source_volname: source volume of snapshot
+        :param source_subvolume: source subvolume of snapshot
+        :param snapname: snapshot name to be cloned from
+        :return: None
+        """
+        raise VolumeException(-errno.ENOTSUP, "operation not supported.")
+
+    def remove(self):
+        """
+        make a subvolume inaccessible to guests.
+
+        This function is idempotent.  It is safe to call this again
+
+        :param: None
+        :return: None
+        """
+        raise VolumeException(-errno.ENOTSUP, "operation not supported.")
+
+    def resize(self, newsize, nshrink):
+        """
+        resize a subvolume
+
+        :param newsize: new size In bytes (or inf/infinite)
+        :return: new quota size and used bytes as a tuple
+        """
+        raise VolumeException(-errno.ENOTSUP, "operation not supported.")
+
+    def create_snapshot(self, snapname):
+        """
+        snapshot a subvolume.
+
+        :param: subvolume snapshot name
+        :return: None
+        """
+        raise VolumeException(-errno.ENOTSUP, "operation not supported.")
+
+    def remove_snapshot(self, snapname):
+        """
+        remove a subvolume snapshot.
+
+        :param: subvolume snapshot name
+        :return: None
+        """
+        raise VolumeException(-errno.ENOTSUP, "operation not supported.")
+
+    def list_snapshots(self):
+        """
+        list all subvolume snapshots.
+
+        :param: None
+        :return: None
+        """
+        raise VolumeException(-errno.ENOTSUP, "operation not supported.")
+
+    def attach_snapshot(self, snapname, tgt_subvolume):
+        """
+        attach a snapshot to a target cloned subvolume. the target subvolume
+        should be an empty subvolume (type "clone") in "pending" state.
+
+        :param: snapname: snapshot to attach to a clone
+        :param: tgt_subvolume: target clone subvolume
+        :return: None
+        """
+        raise VolumeException(-errno.ENOTSUP, "operation not supported.")
+
+    def detach_snapshot(self, snapname, tgt_subvolume):
+        """
+        detach a snapshot from a target cloned subvolume. the target subvolume
+        should either be in "failed" or "completed" state.
+
+        :param: snapname: snapshot to detach from a clone
+        :param: tgt_subvolume: target clone subvolume
+        :return: None
+        """
+        raise VolumeException(-errno.ENOTSUP, "operation not supported.")
diff --git a/src/pybind/mgr/volumes/fs/operations/trash.py b/src/pybind/mgr/volumes/fs/operations/trash.py
new file mode 100644
index 00000000..66f1d71c
--- /dev/null
+++ b/src/pybind/mgr/volumes/fs/operations/trash.py
@@ -0,0 +1,145 @@
+import os
+import uuid
+import logging
+from contextlib import contextmanager
+
+import cephfs
+
+from .template import GroupTemplate
+from ..fs_util import listdir
+from ..exception import VolumeException
+
+log = logging.getLogger(__name__)
+
+class Trash(GroupTemplate):
+    GROUP_NAME = "_deleting"
+
+    def __init__(self, fs, vol_spec):
+        self.fs = fs
+        self.vol_spec = vol_spec
+        self.groupname = Trash.GROUP_NAME
+
+    @property
+    def path(self):
+        return os.path.join(self.vol_spec.base_dir.encode('utf-8'), self.groupname.encode('utf-8'))
+
+    @property
+    def unique_trash_path(self):
+        """
+        return a unique trash directory entry path
+        """
+        return os.path.join(self.path, str(uuid.uuid4()).encode('utf-8'))
+
+    def _get_single_dir_entry(self, exclude_list=[]):
+        exclude_list.extend((b".", b".."))
+        try:
+            with self.fs.opendir(self.path) as d:
+                entry = self.fs.readdir(d)
+                while entry:
+                    if entry.d_name not in exclude_list:
+                        return entry.d_name
+                    entry = self.fs.readdir(d)
+            return None
+        except cephfs.Error as e:
+            raise VolumeException(-e.args[0], e.args[1])
+
+    def get_trash_entry(self, exclude_list):
+        """
+        get a trash entry excluding entries provided.
+
+        :praram exclude_list: entries to exclude
+        :return: trash entry
+        """
+        return self._get_single_dir_entry(exclude_list)
+
+    def purge(self, trashpath, should_cancel):
+        """
+        purge a trash entry.
+
+        :praram trash_entry: the trash entry to purge
+        :praram should_cancel: callback to check if the purge should be aborted
+        :return: None
+        """
+        def rmtree(root_path):
+            log.debug("rmtree {0}".format(root_path))
+            try:
+                with self.fs.opendir(root_path) as dir_handle:
+                    d = self.fs.readdir(dir_handle)
+                    while d and not should_cancel():
+                        if d.d_name not in (b".", b".."):
+                            d_full = os.path.join(root_path, d.d_name)
+                            if d.is_dir():
+                                rmtree(d_full)
+                            else:
+                                self.fs.unlink(d_full)
+                        d = self.fs.readdir(dir_handle)
+            except cephfs.ObjectNotFound:
+                return
+            except cephfs.Error as e:
+                raise VolumeException(-e.args[0], e.args[1])
+            # remove the directory only if we were not asked to cancel
+            # (else we would fail to remove this anyway)
+            if not should_cancel():
+                self.fs.rmdir(root_path)
+
+        # catch any unlink errors
+        try:
+            rmtree(trashpath)
+        except cephfs.Error as e:
+            raise VolumeException(-e.args[0], e.args[1])
+
+    def dump(self, path):
+        """
+        move an filesystem entity to trash can.
+
+        :praram path: the filesystem path to be moved
+        :return: None
+        """
+        try:
+            self.fs.rename(path, self.unique_trash_path)
+        except cephfs.Error as e:
+            raise VolumeException(-e.args[0], e.args[1])
+
+    def link(self, path, bname):
+        pth = os.path.join(self.path, bname)
+        try:
+            self.fs.symlink(path, pth)
+        except cephfs.Error as e:
+            raise VolumeException(-e.args[0], e.args[1])
+
+    def delink(self, bname):
+        pth = os.path.join(self.path, bname)
+        try:
+            self.fs.unlink(pth)
+        except cephfs.Error as e:
+            raise VolumeException(-e.args[0], e.args[1])
+
+def create_trashcan(fs, vol_spec):
+    """
+    create a trash can.
+
+    :param fs: ceph filesystem handle
+    :param vol_spec: volume specification
+    :return: None
+    """
+    trashcan = Trash(fs, vol_spec)
+    try:
+        fs.mkdirs(trashcan.path, 0o700)
+    except cephfs.Error as e:
+        raise VolumeException(-e.args[0], e.args[1])
+
+@contextmanager
+def open_trashcan(fs, vol_spec):
+    """
+    open a trash can. This API is to be used as a context manager.
+
+    :param fs: ceph filesystem handle
+    :param vol_spec: volume specification
+    :return: yields a trash can object (subclass of GroupTemplate)
+    """
+    trashcan = Trash(fs, vol_spec)
+    try:
+        fs.stat(trashcan.path)
+    except cephfs.Error as e:
+        raise VolumeException(-e.args[0], e.args[1])
+    yield trashcan
diff --git a/src/pybind/mgr/volumes/fs/operations/versions/__init__.py b/src/pybind/mgr/volumes/fs/operations/versions/__init__.py
new file mode 100644
index 00000000..3dcdd7c1
--- /dev/null
+++ b/src/pybind/mgr/volumes/fs/operations/versions/__init__.py
@@ -0,0 +1,109 @@
+import errno
+import logging
+import importlib
+
+import cephfs
+
+from .subvolume_base import SubvolumeBase
+from .subvolume_attrs import SubvolumeTypes
+from .subvolume_v1 import SubvolumeV1
+from .subvolume_v2 import SubvolumeV2
+from .metadata_manager import MetadataManager
+from .op_sm import SubvolumeOpSm
+from ..template import SubvolumeOpType
+from ...exception import MetadataMgrException, OpSmException, VolumeException
+
+log = logging.getLogger(__name__)
+
+class SubvolumeLoader(object):
+    INVALID_VERSION = -1
+
+    SUPPORTED_MODULES = ['subvolume_v1.SubvolumeV1', 'subvolume_v2.SubvolumeV2']
+
+    def __init__(self):
+        self.max_version = SubvolumeLoader.INVALID_VERSION
+        self.versions = {}
+
+    def _load_module(self, mod_cls):
+        mod_name, cls_name = mod_cls.split('.')
+        mod = importlib.import_module('.versions.{0}'.format(mod_name), package='volumes.fs.operations')
+        return getattr(mod, cls_name)
+
+    def _load_supported_versions(self):
+        for mod_cls in SubvolumeLoader.SUPPORTED_MODULES:
+            cls = self._load_module(mod_cls)
+            log.info("loaded v{0} subvolume".format(cls.version()))
+            if self.max_version is not None or cls.version() > self.max_version:
+                self.max_version = cls.version()
+                self.versions[cls.version()] = cls
+        if self.max_version == SubvolumeLoader.INVALID_VERSION:
+            raise VolumeException(-errno.EINVAL, "no subvolume version available")
+        log.info("max subvolume version is v{0}".format(self.max_version))
+
+    def _get_subvolume_version(self, version):
+        try:
+            return self.versions[version]
+        except KeyError:
+            raise VolumeException(-errno.EINVAL, "subvolume class v{0} does not exist".format(version))
+
+    def get_subvolume_object_max(self, mgr, fs, vol_spec, group, subvolname):
+        return self._get_subvolume_version(self.max_version)(mgr, fs, vol_spec, group, subvolname)
+
+    def upgrade_to_v2_subvolume(self, subvolume):
+        # legacy mode subvolumes cannot be upgraded to v2
+        if subvolume.legacy_mode:
+            return
+
+        version = int(subvolume.metadata_mgr.get_global_option('version'))
+        if version >= SubvolumeV2.version():
+            return
+
+        v1_subvolume = self._get_subvolume_version(version)(subvolume.mgr, subvolume.fs, subvolume.vol_spec, subvolume.group, subvolume.subvolname)
+        try:
+            v1_subvolume.open(SubvolumeOpType.SNAP_LIST)
+        except VolumeException as ve:
+            # if volume is not ready for snapshot listing, do not upgrade at present
+            if ve.errno == -errno.EAGAIN:
+                return
+            raise
+
+        # v1 subvolumes with snapshots cannot be upgraded to v2
+        if v1_subvolume.list_snapshots():
+            return
+
+        subvolume.metadata_mgr.update_global_section(MetadataManager.GLOBAL_META_KEY_VERSION, SubvolumeV2.version())
+        subvolume.metadata_mgr.flush()
+
+    def upgrade_legacy_subvolume(self, fs, subvolume):
+        assert subvolume.legacy_mode
+        try:
+            fs.mkdirs(subvolume.legacy_dir, 0o700)
+        except cephfs.Error as e:
+            raise VolumeException(-e.args[0], "error accessing subvolume")
+        subvolume_type = SubvolumeTypes.TYPE_NORMAL
+        try:
+            initial_state = SubvolumeOpSm.get_init_state(subvolume_type)
+        except OpSmException as oe:
+            raise VolumeException(-errno.EINVAL, "subvolume creation failed: internal error")
+        qpath = subvolume.base_path.decode('utf-8')
+        # legacy is only upgradable to v1
+        subvolume.init_config(SubvolumeV1.version(), subvolume_type, qpath, initial_state)
+
+    def get_subvolume_object(self, mgr, fs, vol_spec, group, subvolname, upgrade=True):
+        subvolume = SubvolumeBase(mgr, fs, vol_spec, group, subvolname)
+        try:
+            subvolume.discover()
+            self.upgrade_to_v2_subvolume(subvolume)
+            version = int(subvolume.metadata_mgr.get_global_option('version'))
+            return self._get_subvolume_version(version)(mgr, fs, vol_spec, group, subvolname, legacy=subvolume.legacy_mode)
+        except MetadataMgrException as me:
+            if me.errno == -errno.ENOENT and upgrade:
+                self.upgrade_legacy_subvolume(fs, subvolume)
+                return self.get_subvolume_object(mgr, fs, vol_spec, group, subvolname, upgrade=False)
+            else:
+                # log the actual error and generalize error string returned to user
+                log.error("error accessing subvolume metadata for '{0}' ({1})".format(subvolname, me))
+                raise VolumeException(-errno.EINVAL, "error accessing subvolume metadata")
+
+loaded_subvolumes = SubvolumeLoader()
+loaded_subvolumes._load_supported_versions()
diff --git a/src/pybind/mgr/volumes/fs/operations/versions/auth_metadata.py b/src/pybind/mgr/volumes/fs/operations/versions/auth_metadata.py
new file mode 100644
index 00000000..259dcd0e
--- /dev/null
+++ b/src/pybind/mgr/volumes/fs/operations/versions/auth_metadata.py
@@ -0,0 +1,208 @@
+from contextlib import contextmanager
+import os
+import fcntl
+import json
+import logging
+import struct
+import uuid
+
+import cephfs
+
+from ..group import Group
+
+log = logging.getLogger(__name__)
+
+class AuthMetadataError(Exception):
+    pass
+
+class AuthMetadataManager(object):
+
+    # Current version
+    version = 6
+
+    # Filename extensions for meta files.
+    META_FILE_EXT = ".meta"
+    DEFAULT_VOL_PREFIX = "/volumes"
+
+    def __init__(self, fs):
+        self.fs = fs
+        self._id = struct.unpack(">Q", uuid.uuid1().bytes[0:8])[0]
+        self.volume_prefix = self.DEFAULT_VOL_PREFIX
+
+    def _to_bytes(self, param):
+        '''
+        Helper method that returns byte representation of the given parameter.
+        '''
+        if isinstance(param, str):
+            return param.encode('utf-8')
+        elif param is None:
+            return param
+        else:
+            return str(param).encode('utf-8')
+
+    def _subvolume_metadata_path(self, group_name, subvol_name):
+        return os.path.join(self.volume_prefix, "_{0}:{1}{2}".format(
+            group_name if group_name != Group.NO_GROUP_NAME else "",
+            subvol_name,
+            self.META_FILE_EXT))
+
+    def _check_compat_version(self, compat_version):
+        if self.version < compat_version:
+            msg = ("The current version of AuthMetadataManager, version {0} "
+                   "does not support the required feature. Need version {1} "
+                   "or greater".format(self.version, compat_version)
+                  )
+            log.error(msg)
+            raise AuthMetadataError(msg)
+
+    def _metadata_get(self, path):
+        """
+        Return a deserialized JSON object, or None
+        """
+        fd = self.fs.open(path, "r")
+        # TODO iterate instead of assuming file < 4MB
+        read_bytes = self.fs.read(fd, 0, 4096 * 1024)
+        self.fs.close(fd)
+        if read_bytes:
+            return json.loads(read_bytes.decode())
+        else:
+            return None
+
+    def _metadata_set(self, path, data):
+        serialized = json.dumps(data)
+        fd = self.fs.open(path, "w")
+        try:
+            self.fs.write(fd, self._to_bytes(serialized), 0)
+            self.fs.fsync(fd, 0)
+        finally:
+            self.fs.close(fd)
+
+    def _lock(self, path):
+        @contextmanager
+        def fn():
+            while(1):
+                fd = self.fs.open(path, os.O_CREAT, 0o755)
+                self.fs.flock(fd, fcntl.LOCK_EX, self._id)
+
+                # The locked file will be cleaned up sometime. It could be
+                # unlinked by consumer e.g., an another manila-share service
+                # instance, before lock was applied on it. Perform checks to
+                # ensure that this does not happen.
+                try:
+                    statbuf = self.fs.stat(path)
+                except cephfs.ObjectNotFound:
+                    self.fs.close(fd)
+                    continue
+
+                fstatbuf = self.fs.fstat(fd)
+                if statbuf.st_ino == fstatbuf.st_ino:
+                    break
+
+            try:
+                yield
+            finally:
+                self.fs.flock(fd, fcntl.LOCK_UN, self._id)
+                self.fs.close(fd)
+
+        return fn()
+
+    def _auth_metadata_path(self, auth_id):
+        return os.path.join(self.volume_prefix, "${0}{1}".format(
+            auth_id, self.META_FILE_EXT))
+
+    def auth_lock(self, auth_id):
+        return self._lock(self._auth_metadata_path(auth_id))
+
+    def auth_metadata_get(self, auth_id):
+        """
+        Call me with the metadata locked!
+
+        Check whether a auth metadata structure can be decoded by the current
+        version of AuthMetadataManager.
+
+        Return auth metadata that the current version of AuthMetadataManager
+        can decode.
+        """
+        auth_metadata = self._metadata_get(self._auth_metadata_path(auth_id))
+
+        if auth_metadata:
+            self._check_compat_version(auth_metadata['compat_version'])
+
+        return auth_metadata
+
+    def auth_metadata_set(self, auth_id, data):
+        """
+        Call me with the metadata locked!
+
+        Fsync the auth metadata.
+
+        Add two version attributes to the auth metadata,
+        'compat_version', the minimum AuthMetadataManager version that can
+        decode the metadata, and 'version', the AuthMetadataManager version
+        that encoded the metadata.
+        """
+        data['compat_version'] = 6
+        data['version'] = self.version
+        return self._metadata_set(self._auth_metadata_path(auth_id), data)
+
+    def create_subvolume_metadata_file(self, group_name, subvol_name):
+        """
+        Create a subvolume metadata file, if it does not already exist, to store
+        data about auth ids having access to the subvolume
+        """
+        fd = self.fs.open(self._subvolume_metadata_path(group_name, subvol_name),
+                          os.O_CREAT, 0o755)
+        self.fs.close(fd)
+
+    def delete_subvolume_metadata_file(self, group_name, subvol_name):
+        vol_meta_path = self._subvolume_metadata_path(group_name, subvol_name)
+        try:
+            self.fs.unlink(vol_meta_path)
+        except cephfs.ObjectNotFound:
+            pass
+
+    def subvol_metadata_lock(self, group_name, subvol_name):
+        """
+        Return a ContextManager which locks the authorization metadata for
+        a particular subvolume, and persists a flag to the metadata indicating
+        that it is currently locked, so that we can detect dirty situations
+        during recovery.
+
+        This lock isn't just to make access to the metadata safe: it's also
+        designed to be used over the two-step process of checking the
+        metadata and then responding to an authorization request, to
+        ensure that at the point we respond the metadata hasn't changed
+        in the background.  It's key to how we avoid security holes
+        resulting from races during that problem ,
+        """
+        return self._lock(self._subvolume_metadata_path(group_name, subvol_name))
+
+    def subvol_metadata_get(self, group_name, subvol_name):
+        """
+        Call me with the metadata locked!
+
+        Check whether a subvolume metadata structure can be decoded by the current
+        version of AuthMetadataManager.
+
+        Return a subvolume_metadata structure that the current version of
+        AuthMetadataManager can decode.
+        """
+        subvolume_metadata = self._metadata_get(self._subvolume_metadata_path(group_name, subvol_name))
+
+        if subvolume_metadata:
+            self._check_compat_version(subvolume_metadata['compat_version'])
+
+        return subvolume_metadata
+
+    def subvol_metadata_set(self, group_name, subvol_name, data):
+        """
+        Call me with the metadata locked!
+
+        Add two version attributes to the subvolume metadata,
+        'compat_version', the minimum AuthMetadataManager version that can
+        decode the metadata and 'version', the AuthMetadataManager version
+        that encoded the metadata.
+        """
+        data['compat_version'] = 1
+        data['version'] = self.version
+        return self._metadata_set(self._subvolume_metadata_path(group_name, subvol_name), data)
diff --git a/src/pybind/mgr/volumes/fs/operations/versions/metadata_manager.py b/src/pybind/mgr/volumes/fs/operations/versions/metadata_manager.py
new file mode 100644
index 00000000..1b6c4327
--- /dev/null
+++ b/src/pybind/mgr/volumes/fs/operations/versions/metadata_manager.py
@@ -0,0 +1,144 @@
+import os
+import errno
+import logging
+import sys
+
+if sys.version_info >= (3, 2):
+    import configparser
+else:
+    import ConfigParser as configparser
+
+try:
+    from StringIO import StringIO
+except ImportError:
+    from io import StringIO
+
+import cephfs
+
+from ...exception import MetadataMgrException
+
+log = logging.getLogger(__name__)
+
+class MetadataManager(object):
+    GLOBAL_SECTION = "GLOBAL"
+    GLOBAL_META_KEY_VERSION = "version"
+    GLOBAL_META_KEY_TYPE    = "type"
+    GLOBAL_META_KEY_PATH    = "path"
+    GLOBAL_META_KEY_STATE   = "state"
+
+    MAX_IO_BYTES = 8 * 1024
+
+    def __init__(self, fs, config_path, mode):
+        self.fs = fs
+        self.mode = mode
+        self.config_path = config_path
+        if sys.version_info >= (3, 2):
+            self.config = configparser.ConfigParser()
+        else:
+            self.config = configparser.SafeConfigParser()
+
+    def refresh(self):
+        fd = None
+        conf_data = StringIO()
+        try:
+            log.debug("opening config {0}".format(self.config_path))
+            fd = self.fs.open(self.config_path, os.O_RDONLY)
+            while True:
+                data = self.fs.read(fd, -1, MetadataManager.MAX_IO_BYTES)
+                if not len(data):
+                    break
+                conf_data.write(data.decode('utf-8'))
+            conf_data.seek(0)
+            self.config.readfp(conf_data)
+        except cephfs.ObjectNotFound:
+            raise MetadataMgrException(-errno.ENOENT, "metadata config '{0}' not found".format(self.config_path))
+        except cephfs.Error as e:
+            raise MetadataMgrException(-e.args[0], e.args[1])
+        finally:
+            if fd is not None:
+                self.fs.close(fd)
+
+    def flush(self):
+        # cull empty sections
+        for section in list(self.config.sections()):
+            if len(self.config.items(section)) == 0:
+                self.config.remove_section(section)
+
+        conf_data = StringIO()
+        self.config.write(conf_data)
+        conf_data.seek(0)
+
+        fd = None
+        try:
+            fd = self.fs.open(self.config_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, self.mode)
+            wrote = 0
+            while True:
+                data = conf_data.read()
+                if not len(data):
+                    break
+                wrote += self.fs.write(fd, data.encode('utf-8'), -1)
+            self.fs.fsync(fd, 0)
+            log.info("wrote {0} bytes to config {1}".format(wrote, self.config_path))
+        except cephfs.Error as e:
+            raise MetadataMgrException(-e.args[0], e.args[1])
+        finally:
+            if fd is not None:
+                self.fs.close(fd)
+
+    def init(self, version, typ, path, state):
+        # you may init just once before refresh (helps to overwrite conf)
+        if self.config.has_section(MetadataManager.GLOBAL_SECTION):
+            raise MetadataMgrException(-errno.EINVAL, "init called on an existing config")
+
+        self.add_section(MetadataManager.GLOBAL_SECTION)
+        self.update_section_multi(
+            MetadataManager.GLOBAL_SECTION, {MetadataManager.GLOBAL_META_KEY_VERSION : str(version),
+                                             MetadataManager.GLOBAL_META_KEY_TYPE    : str(typ),
+                                             MetadataManager.GLOBAL_META_KEY_PATH    : str(path),
+                                             MetadataManager.GLOBAL_META_KEY_STATE   : str(state)
+            })
+
+    def add_section(self, section):
+        try:
+            self.config.add_section(section)
+        except configparser.DuplicateSectionError:
+            return
+        except:
+            raise MetadataMgrException(-errno.EINVAL, "error adding section to config")
+
+    def remove_option(self, section, key):
+        if not self.config.has_section(section):
+            raise MetadataMgrException(-errno.ENOENT, "section '{0}' does not exist".format(section))
+        self.config.remove_option(section, key)
+
+    def remove_section(self, section):
+        self.config.remove_section(section)
+
+    def update_section(self, section, key, value):
+        if not self.config.has_section(section):
+            raise MetadataMgrException(-errno.ENOENT, "section '{0}' does not exist".format(section))
+        self.config.set(section, key, str(value))
+
+    def update_section_multi(self, section, dct):
+        if not self.config.has_section(section):
+            raise MetadataMgrException(-errno.ENOENT, "section '{0}' does not exist".format(section))
+        for key,value in dct.items():
+            self.config.set(section, key, str(value))
+
+    def update_global_section(self, key, value):
+        self.update_section(MetadataManager.GLOBAL_SECTION, key, str(value))
+
+    def get_option(self, section, key):
+        if not self.config.has_section(section):
+            raise MetadataMgrException(-errno.ENOENT, "section '{0}' does not exist".format(section))
+        if not self.config.has_option(section, key):
+            raise MetadataMgrException(-errno.ENOENT, "no config '{0}' in section '{1}'".format(key, section))
+        return self.config.get(section, key)
+
+    def get_global_option(self, key):
+        return self.get_option(MetadataManager.GLOBAL_SECTION, key)
+
+    def section_has_item(self, section, item):
+        if not self.config.has_section(section):
+            raise MetadataMgrException(-errno.ENOENT, "section '{0}' does not exist".format(section))
+        return item in [v[1] for v in self.config.items(section)]
diff --git a/src/pybind/mgr/volumes/fs/operations/versions/op_sm.py b/src/pybind/mgr/volumes/fs/operations/versions/op_sm.py
new file mode 100644
index 00000000..c2b5f582
--- /dev/null
+++ b/src/pybind/mgr/volumes/fs/operations/versions/op_sm.py
@@ -0,0 +1,114 @@
+import errno
+
+from enum import Enum, unique
+
+from ...exception import OpSmException
+from .subvolume_attrs import SubvolumeTypes, SubvolumeStates, SubvolumeActions
+
+class TransitionKey(object):
+    def __init__(self, subvol_type, state, action_type):
+        self.transition_key = [subvol_type, state, action_type]
+
+    def __hash__(self):
+        return hash(tuple(self.transition_key))
+
+    def __eq__(self, other):
+        return self.transition_key == other.transition_key
+
+    def __neq__(self, other):
+        return not(self == other)
+
+class SubvolumeOpSm(object):
+    transition_table = {}
+
+    @staticmethod
+    def is_complete_state(state):
+        if not isinstance(state, SubvolumeStates):
+            raise OpSmException(-errno.EINVAL, "unknown state '{0}'".format(state))
+        return state == SubvolumeStates.STATE_COMPLETE
+
+    @staticmethod
+    def is_failed_state(state):
+        if not isinstance(state, SubvolumeStates):
+            raise OpSmException(-errno.EINVAL, "unknown state '{0}'".format(state))
+        return state == SubvolumeStates.STATE_FAILED or state == SubvolumeStates.STATE_CANCELED
+
+    @staticmethod
+    def is_init_state(stm_type, state):
+        if not isinstance(state, SubvolumeStates):
+            raise OpSmException(-errno.EINVAL, "unknown state '{0}'".format(state))
+        return state == SubvolumeOpSm.get_init_state(stm_type)
+
+    @staticmethod
+    def get_init_state(stm_type):
+        if not isinstance(stm_type, SubvolumeTypes):
+            raise OpSmException(-errno.EINVAL, "unknown state machine '{0}'".format(stm_type))
+        init_state =  SubvolumeOpSm.transition_table[TransitionKey(stm_type,
+                                                     SubvolumeStates.STATE_INIT,
+                                                     SubvolumeActions.ACTION_NONE)]
+        if not init_state:
+            raise OpSmException(-errno.ENOENT, "initial state for state machine '{0}' not found".format(stm_type))
+        return init_state
+
+    @staticmethod
+    def transition(stm_type, current_state, action):
+        if not isinstance(stm_type, SubvolumeTypes):
+            raise OpSmException(-errno.EINVAL, "unknown state machine '{0}'".format(stm_type))
+        if not isinstance(current_state, SubvolumeStates):
+            raise OpSmException(-errno.EINVAL, "unknown state '{0}'".format(current_state))
+        if not isinstance(action, SubvolumeActions):
+            raise OpSmException(-errno.EINVAL, "unknown action '{0}'".format(action))
+
+        transition = SubvolumeOpSm.transition_table[TransitionKey(stm_type, current_state, action)]
+        if not transition:
+            raise OpSmException(-errno.EINVAL, "invalid action '{0}' on current state {1} for state machine '{2}'".format(action, current_state, stm_type))
+
+        return transition
+
+SubvolumeOpSm.transition_table = {
+    # state transitions for state machine type TYPE_NORMAL
+    TransitionKey(SubvolumeTypes.TYPE_NORMAL,
+                  SubvolumeStates.STATE_INIT,
+                  SubvolumeActions.ACTION_NONE) : SubvolumeStates.STATE_COMPLETE,
+
+    TransitionKey(SubvolumeTypes.TYPE_NORMAL,
+                  SubvolumeStates.STATE_COMPLETE,
+                  SubvolumeActions.ACTION_RETAINED) : SubvolumeStates.STATE_RETAINED,
+
+    # state transitions for state machine type TYPE_CLONE
+    TransitionKey(SubvolumeTypes.TYPE_CLONE,
+                  SubvolumeStates.STATE_INIT,
+                  SubvolumeActions.ACTION_NONE) : SubvolumeStates.STATE_PENDING,
+
+    TransitionKey(SubvolumeTypes.TYPE_CLONE,
+                  SubvolumeStates.STATE_PENDING,
+                  SubvolumeActions.ACTION_SUCCESS) : SubvolumeStates.STATE_INPROGRESS,
+
+    TransitionKey(SubvolumeTypes.TYPE_CLONE,
+                  SubvolumeStates.STATE_PENDING,
+                  SubvolumeActions.ACTION_CANCELLED) : SubvolumeStates.STATE_CANCELED,
+
+    TransitionKey(SubvolumeTypes.TYPE_CLONE,
+                  SubvolumeStates.STATE_INPROGRESS,
+                  SubvolumeActions.ACTION_SUCCESS) : SubvolumeStates.STATE_COMPLETE,
+
+    TransitionKey(SubvolumeTypes.TYPE_CLONE,
+                  SubvolumeStates.STATE_INPROGRESS,
+                  SubvolumeActions.ACTION_CANCELLED) : SubvolumeStates.STATE_CANCELED,
+
+    TransitionKey(SubvolumeTypes.TYPE_CLONE,
+                  SubvolumeStates.STATE_INPROGRESS,
+                  SubvolumeActions.ACTION_FAILED) : SubvolumeStates.STATE_FAILED,
+
+    TransitionKey(SubvolumeTypes.TYPE_CLONE,
+                  SubvolumeStates.STATE_COMPLETE,
+                  SubvolumeActions.ACTION_RETAINED) : SubvolumeStates.STATE_RETAINED,
+
+    TransitionKey(SubvolumeTypes.TYPE_CLONE,
+                  SubvolumeStates.STATE_CANCELED,
+                  SubvolumeActions.ACTION_RETAINED) : SubvolumeStates.STATE_RETAINED,
+
+    TransitionKey(SubvolumeTypes.TYPE_CLONE,
+                  SubvolumeStates.STATE_FAILED,
+                  SubvolumeActions.ACTION_RETAINED) : SubvolumeStates.STATE_RETAINED,
+}
diff --git a/src/pybind/mgr/volumes/fs/operations/versions/subvolume_attrs.py b/src/pybind/mgr/volumes/fs/operations/versions/subvolume_attrs.py
new file mode 100644
index 00000000..ec7138cb
--- /dev/null
+++ b/src/pybind/mgr/volumes/fs/operations/versions/subvolume_attrs.py
@@ -0,0 +1,61 @@
+import errno
+from enum import Enum, unique
+
+from ...exception import VolumeException
+
+@unique
+class SubvolumeTypes(Enum):
+    TYPE_NORMAL  = "subvolume"
+    TYPE_CLONE   = "clone"
+
+    @staticmethod
+    def from_value(value):
+        if value == "subvolume":
+            return SubvolumeTypes.TYPE_NORMAL
+        if value == "clone":
+            return SubvolumeTypes.TYPE_CLONE
+
+        raise VolumeException(-errno.EINVAL, "invalid subvolume type '{0}'".format(value))
+
+@unique
+class SubvolumeStates(Enum):
+    STATE_INIT          = 'init'
+    STATE_PENDING       = 'pending'
+    STATE_INPROGRESS    = 'in-progress'
+    STATE_FAILED        = 'failed'
+    STATE_COMPLETE      = 'complete'
+    STATE_CANCELED      = 'canceled'
+    STATE_RETAINED      = 'snapshot-retained'
+
+    @staticmethod
+    def from_value(value):
+        if value == "init":
+            return SubvolumeStates.STATE_INIT
+        if value == "pending":
+            return SubvolumeStates.STATE_PENDING
+        if value == "in-progress":
+            return SubvolumeStates.STATE_INPROGRESS
+        if value == "failed":
+            return SubvolumeStates.STATE_FAILED
+        if value == "complete":
+            return SubvolumeStates.STATE_COMPLETE
+        if value == "canceled":
+            return SubvolumeStates.STATE_CANCELED
+        if value == "snapshot-retained":
+            return SubvolumeStates.STATE_RETAINED
+
+        raise VolumeException(-errno.EINVAL, "invalid state '{0}'".format(value))
+
+@unique
+class SubvolumeActions(Enum):
+    ACTION_NONE         = 0
+    ACTION_SUCCESS      = 1
+    ACTION_FAILED       = 2
+    ACTION_CANCELLED    = 3
+    ACTION_RETAINED     = 4
+
+@unique
+class SubvolumeFeatures(Enum):
+    FEATURE_SNAPSHOT_CLONE          = "snapshot-clone"
+    FEATURE_SNAPSHOT_RETENTION      = "snapshot-retention"
+    FEATURE_SNAPSHOT_AUTOPROTECT    = "snapshot-autoprotect"
diff --git a/src/pybind/mgr/volumes/fs/operations/versions/subvolume_base.py b/src/pybind/mgr/volumes/fs/operations/versions/subvolume_base.py
new file mode 100644
index 00000000..f193dabd
--- /dev/null
+++ b/src/pybind/mgr/volumes/fs/operations/versions/subvolume_base.py
@@ -0,0 +1,331 @@
+import os
+import stat
+import uuid
+import errno
+import logging
+from hashlib import md5
+
+import cephfs
+
+from .subvolume_attrs import SubvolumeTypes, SubvolumeStates
+from .metadata_manager import MetadataManager
+from ..trash import create_trashcan, open_trashcan
+from ...fs_util import get_ancestor_xattr
+from ...exception import MetadataMgrException, VolumeException
+from .op_sm import SubvolumeOpSm
+from .auth_metadata import AuthMetadataManager
+
+log = logging.getLogger(__name__)
+
+class SubvolumeBase(object):
+    LEGACY_CONF_DIR = "_legacy"
+
+    def __init__(self, mgr, fs, vol_spec, group, subvolname, legacy=False):
+        self.mgr = mgr
+        self.fs = fs
+        self.auth_mdata_mgr = AuthMetadataManager(fs)
+        self.cmode = None
+        self.user_id = None
+        self.group_id = None
+        self.vol_spec = vol_spec
+        self.group = group
+        self.subvolname = subvolname
+        self.legacy_mode = legacy
+        self.load_config()
+
+    @property
+    def uid(self):
+        return self.user_id
+
+    @uid.setter
+    def uid(self, val):
+        self.user_id = val
+
+    @property
+    def gid(self):
+        return self.group_id
+
+    @gid.setter
+    def gid(self, val):
+        self.group_id = val
+
+    @property
+    def mode(self):
+        return self.cmode
+
+    @mode.setter
+    def mode(self, val):
+        self.cmode = val
+
+    @property
+    def base_path(self):
+        return os.path.join(self.group.path, self.subvolname.encode('utf-8'))
+
+    @property
+    def config_path(self):
+        return os.path.join(self.base_path, b".meta")
+
+    @property
+    def legacy_dir(self):
+        return os.path.join(self.vol_spec.base_dir.encode('utf-8'), SubvolumeBase.LEGACY_CONF_DIR.encode('utf-8'))
+
+    @property
+    def legacy_config_path(self):
+        m = md5()
+        m.update(self.base_path)
+        meta_config = "{0}.meta".format(m.hexdigest())
+        return os.path.join(self.legacy_dir, meta_config.encode('utf-8'))
+
+    @property
+    def namespace(self):
+        return "{0}{1}".format(self.vol_spec.fs_namespace, self.subvolname)
+
+    @property
+    def group_name(self):
+        return self.group.group_name
+
+    @property
+    def subvol_name(self):
+        return self.subvolname
+
+    @property
+    def legacy_mode(self):
+        return self.legacy
+
+    @legacy_mode.setter
+    def legacy_mode(self, mode):
+        self.legacy = mode
+
+    @property
+    def path(self):
+        """ Path to subvolume data directory """
+        raise NotImplementedError
+
+    @property
+    def features(self):
+        """ List of features supported by the subvolume, containing items from SubvolumeFeatures """
+        raise NotImplementedError
+
+    @property
+    def state(self):
+        """ Subvolume state, one of SubvolumeStates """
+        raise NotImplementedError
+
+    @property
+    def subvol_type(self):
+        return SubvolumeTypes.from_value(self.metadata_mgr.get_global_option(MetadataManager.GLOBAL_META_KEY_TYPE))
+
+    @property
+    def purgeable(self):
+        """ Boolean declaring if subvolume can be purged """
+        raise NotImplementedError
+
+    def load_config(self):
+        if self.legacy_mode:
+            self.metadata_mgr = MetadataManager(self.fs, self.legacy_config_path, 0o640)
+        else:
+            self.metadata_mgr = MetadataManager(self.fs, self.config_path, 0o640)
+
+    def get_attrs(self, pathname):
+        # get subvolume attributes
+        attrs = {}
+        stx = self.fs.statx(pathname,
+                            cephfs.CEPH_STATX_UID | cephfs.CEPH_STATX_GID | cephfs.CEPH_STATX_MODE,
+                            cephfs.AT_SYMLINK_NOFOLLOW)
+
+        attrs["uid"] = int(stx["uid"])
+        attrs["gid"] = int(stx["gid"])
+        attrs["mode"] = int(int(stx["mode"]) & ~stat.S_IFMT(stx["mode"]))
+
+        try:
+            attrs["data_pool"] = self.fs.getxattr(pathname, 'ceph.dir.layout.pool').decode('utf-8')
+        except cephfs.NoData:
+            attrs["data_pool"] = None
+
+        try:
+            attrs["pool_namespace"] = self.fs.getxattr(pathname, 'ceph.dir.layout.pool_namespace').decode('utf-8')
+        except cephfs.NoData:
+            attrs["pool_namespace"] = None
+
+        try:
+            attrs["quota"] = int(self.fs.getxattr(pathname, 'ceph.quota.max_bytes').decode('utf-8'))
+        except cephfs.NoData:
+            attrs["quota"] = None
+
+        return attrs
+
+    def set_attrs(self, path, attrs):
+        # set subvolume attributes
+        # set size
+        quota = attrs.get("quota")
+        if quota is not None:
+            try:
+                self.fs.setxattr(path, 'ceph.quota.max_bytes', str(quota).encode('utf-8'), 0)
+            except cephfs.InvalidValue as e:
+                raise VolumeException(-errno.EINVAL, "invalid size specified: '{0}'".format(quota))
+            except cephfs.Error as e:
+                raise VolumeException(-e.args[0], e.args[1])
+
+        # set pool layout
+        data_pool = attrs.get("data_pool")
+        if data_pool is not None:
+            try:
+                self.fs.setxattr(path, 'ceph.dir.layout.pool', data_pool.encode('utf-8'), 0)
+            except cephfs.InvalidValue:
+                raise VolumeException(-errno.EINVAL,
+                                      "invalid pool layout '{0}' -- need a valid data pool".format(data_pool))
+            except cephfs.Error as e:
+                raise VolumeException(-e.args[0], e.args[1])
+
+        # isolate namespace
+        xattr_key = xattr_val = None
+        pool_namespace = attrs.get("pool_namespace")
+        if pool_namespace is not None:
+            # enforce security isolation, use separate namespace for this subvolume
+            xattr_key = 'ceph.dir.layout.pool_namespace'
+            xattr_val = pool_namespace
+        elif not data_pool:
+            # If subvolume's namespace layout is not set, then the subvolume's pool
+            # layout remains unset and will undesirably change with ancestor's
+            # pool layout changes.
+            xattr_key = 'ceph.dir.layout.pool'
+            xattr_val = None
+            try:
+                self.fs.getxattr(path, 'ceph.dir.layout.pool').decode('utf-8')
+            except cephfs.NoData as e:
+                xattr_val = get_ancestor_xattr(self.fs, os.path.split(path)[0], "ceph.dir.layout.pool")
+        if xattr_key and xattr_val:
+            try:
+                self.fs.setxattr(path, xattr_key, xattr_val.encode('utf-8'), 0)
+            except cephfs.Error as e:
+                raise VolumeException(-e.args[0], e.args[1])
+
+        # set uid/gid
+        uid = attrs.get("uid")
+        if uid is None:
+            uid = self.group.uid
+        else:
+            try:
+                uid = int(uid)
+                if uid < 0:
+                    raise ValueError
+            except ValueError:
+                raise VolumeException(-errno.EINVAL, "invalid UID")
+
+        gid = attrs.get("gid")
+        if gid is None:
+            gid = self.group.gid
+        else:
+            try:
+                gid = int(gid)
+                if gid < 0:
+                    raise ValueError
+            except ValueError:
+                raise VolumeException(-errno.EINVAL, "invalid GID")
+
+        if uid is not None and gid is not None:
+            self.fs.chown(path, uid, gid)
+
+    def _resize(self, path, newsize, noshrink):
+        try:
+            newsize = int(newsize)
+            if newsize <= 0:
+                raise VolumeException(-errno.EINVAL, "Invalid subvolume size")
+        except ValueError:
+            newsize = newsize.lower()
+            if not (newsize == "inf" or newsize == "infinite"):
+                raise VolumeException(-errno.EINVAL, "invalid size option '{0}'".format(newsize))
+            newsize = 0
+            noshrink = False
+
+        try:
+            maxbytes = int(self.fs.getxattr(path, 'ceph.quota.max_bytes').decode('utf-8'))
+        except cephfs.NoData:
+            maxbytes = 0
+        except cephfs.Error as e:
+            raise VolumeException(-e.args[0], e.args[1])
+
+        subvolstat = self.fs.stat(path)
+        if newsize > 0 and newsize < subvolstat.st_size:
+            if noshrink:
+                raise VolumeException(-errno.EINVAL, "Can't resize the subvolume. The new size '{0}' would be lesser than the current "
+                                      "used size '{1}'".format(newsize, subvolstat.st_size))
+
+        if not newsize == maxbytes:
+            try:
+                self.fs.setxattr(path, 'ceph.quota.max_bytes', str(newsize).encode('utf-8'), 0)
+            except cephfs.Error as e:
+                raise VolumeException(-e.args[0], "Cannot set new size for the subvolume. '{0}'".format(e.args[1]))
+        return newsize, subvolstat.st_size
+
+    def init_config(self, version, subvolume_type, subvolume_path, subvolume_state):
+        self.metadata_mgr.init(version, subvolume_type.value, subvolume_path, subvolume_state.value)
+        self.metadata_mgr.flush()
+
+    def discover(self):
+        log.debug("discovering subvolume '{0}' [mode: {1}]".format(self.subvolname, "legacy" if self.legacy_mode else "new"))
+        try:
+            self.fs.stat(self.base_path)
+            self.metadata_mgr.refresh()
+            log.debug("loaded subvolume '{0}'".format(self.subvolname))
+        except MetadataMgrException as me:
+            if me.errno == -errno.ENOENT and not self.legacy_mode:
+                self.legacy_mode = True
+                self.load_config()
+                self.discover()
+            else:
+                raise
+        except cephfs.Error as e:
+            if e.args[0] == errno.ENOENT:
+                raise VolumeException(-errno.ENOENT, "subvolume '{0}' does not exist".format(self.subvolname))
+            raise VolumeException(-e.args[0], "error accessing subvolume '{0}'".format(self.subvolname))
+
+    def _trash_dir(self, path):
+        create_trashcan(self.fs, self.vol_spec)
+        with open_trashcan(self.fs, self.vol_spec) as trashcan:
+            trashcan.dump(path)
+            log.info("subvolume path '{0}' moved to trashcan".format(path))
+
+    def _link_dir(self, path, bname):
+        create_trashcan(self.fs, self.vol_spec)
+        with open_trashcan(self.fs, self.vol_spec) as trashcan:
+            trashcan.link(path, bname)
+            log.info("subvolume path '{0}' linked in trashcan bname {1}".format(path, bname))
+
+    def trash_base_dir(self):
+        if self.legacy_mode:
+            self.fs.unlink(self.legacy_config_path)
+        self._trash_dir(self.base_path)
+
+    def create_base_dir(self, mode):
+        try:
+            self.fs.mkdirs(self.base_path, mode)
+        except cephfs.Error as e:
+            raise VolumeException(-e.args[0], e.args[1])
+
+    def info (self):
+        subvolpath = self.metadata_mgr.get_global_option(MetadataManager.GLOBAL_META_KEY_PATH)
+        etype = self.subvol_type
+        st = self.fs.statx(subvolpath, cephfs.CEPH_STATX_BTIME | cephfs.CEPH_STATX_SIZE |
+                                       cephfs.CEPH_STATX_UID | cephfs.CEPH_STATX_GID |
+                                       cephfs.CEPH_STATX_MODE | cephfs.CEPH_STATX_ATIME |
+                                       cephfs.CEPH_STATX_MTIME | cephfs.CEPH_STATX_CTIME,
+                                       cephfs.AT_SYMLINK_NOFOLLOW)
+        usedbytes = st["size"]
+        try:
+            nsize = int(self.fs.getxattr(subvolpath, 'ceph.quota.max_bytes').decode('utf-8'))
+        except cephfs.NoData:
+            nsize = 0
+
+        try:
+            data_pool = self.fs.getxattr(subvolpath, 'ceph.dir.layout.pool').decode('utf-8')
+            pool_namespace = self.fs.getxattr(subvolpath, 'ceph.dir.layout.pool_namespace').decode('utf-8')
+        except cephfs.Error as e:
+            raise VolumeException(-e.args[0], e.args[1])
+
+        return {'path': subvolpath, 'type': etype.value, 'uid': int(st["uid"]), 'gid': int(st["gid"]),
+            'atime': str(st["atime"]), 'mtime': str(st["mtime"]), 'ctime': str(st["ctime"]),
+            'mode': int(st["mode"]), 'data_pool': data_pool, 'created_at': str(st["btime"]),
+            'bytes_quota': "infinite" if nsize == 0 else nsize, 'bytes_used': int(usedbytes),
+            'bytes_pcent': "undefined" if nsize == 0 else '{0:.2f}'.format((float(usedbytes) / nsize) * 100.0),
+            'pool_namespace': pool_namespace, 'features': self.features, 'state': self.state.value}
diff --git a/src/pybind/mgr/volumes/fs/operations/versions/subvolume_v1.py b/src/pybind/mgr/volumes/fs/operations/versions/subvolume_v1.py
new file mode 100644
index 00000000..b4cca736
--- /dev/null
+++ b/src/pybind/mgr/volumes/fs/operations/versions/subvolume_v1.py
@@ -0,0 +1,780 @@
+import os
+import sys
+import stat
+import uuid
+import errno
+import logging
+import json
+from datetime import datetime
+try:
+    from typing import List, Dict
+except ImportError:
+    pass # For typing only
+
+import cephfs
+
+from .metadata_manager import MetadataManager
+from .subvolume_attrs import SubvolumeTypes, SubvolumeStates, SubvolumeFeatures
+from .op_sm import SubvolumeOpSm
+from .subvolume_base import SubvolumeBase
+from ..template import SubvolumeTemplate
+from ..snapshot_util import mksnap, rmsnap
+from ..access import allow_access, deny_access
+from ...exception import IndexException, OpSmException, VolumeException, MetadataMgrException, EvictionError
+from ...fs_util import listsnaps, is_inherited_snap
+from ..template import SubvolumeOpType
+from ..group import Group
+from ..rankevicter import RankEvicter
+from ..volume import get_mds_map
+
+from ..clone_index import open_clone_index, create_clone_index
+
+log = logging.getLogger(__name__)
+
+class SubvolumeV1(SubvolumeBase, SubvolumeTemplate):
+    """
+    Version 1 subvolumes creates a subvolume with path as follows,
+        volumes/<group-name>/<subvolume-name>/<uuid>/
+
+    - The directory under which user data resides is <uuid>
+    - Snapshots of the subvolume are taken within the <uuid> directory
+    - A meta file is maintained under the <subvolume-name> directory as a metadata store, typically storing,
+        - global information about the subvolume (version, path, type, state)
+        - snapshots attached to an ongoing clone operation
+        - clone snapshot source if subvolume is a clone of a snapshot
+    - It retains backward compatability with legacy subvolumes by creating the meta file for legacy subvolumes under
+    /volumes/_legacy/ (see legacy_config_path), thus allowing cloning of older legacy volumes that lack the <uuid>
+    component in the path.
+    """
+    VERSION = 1
+
+    @staticmethod
+    def version():
+        return SubvolumeV1.VERSION
+
+    @property
+    def path(self):
+        try:
+            # no need to stat the path -- open() does that
+            return self.metadata_mgr.get_global_option(MetadataManager.GLOBAL_META_KEY_PATH).encode('utf-8')
+        except MetadataMgrException as me:
+            raise VolumeException(-errno.EINVAL, "error fetching subvolume metadata")
+
+    @property
+    def features(self):
+        return [SubvolumeFeatures.FEATURE_SNAPSHOT_CLONE.value, SubvolumeFeatures.FEATURE_SNAPSHOT_AUTOPROTECT.value]
+
+    def mark_subvolume(self):
+        # set subvolume attr, on subvolume root, marking it as a CephFS subvolume
+        # subvolume root is where snapshots would be taken, and hence is the <uuid> dir for v1 subvolumes
+        try:
+            # MDS treats this as a noop for already marked subvolume
+            self.fs.setxattr(self.path, 'ceph.dir.subvolume', b'1', 0)
+        except cephfs.InvalidValue as e:
+            raise VolumeException(-errno.EINVAL, "invalid value specified for ceph.dir.subvolume")
+        except cephfs.Error as e:
+            raise VolumeException(-e.args[0], e.args[1])
+
+    def snapshot_base_path(self):
+        """ Base path for all snapshots """
+        return os.path.join(self.path, self.vol_spec.snapshot_dir_prefix.encode('utf-8'))
+
+    def snapshot_path(self, snapname):
+        """ Path to a specific snapshot named 'snapname' """
+        return os.path.join(self.snapshot_base_path(), snapname.encode('utf-8'))
+
+    def snapshot_data_path(self, snapname):
+        """ Path to user data directory within a subvolume snapshot named 'snapname' """
+        return self.snapshot_path(snapname)
+
+    def create(self, size, isolate_nspace, pool, mode, uid, gid):
+        subvolume_type = SubvolumeTypes.TYPE_NORMAL
+        try:
+            initial_state = SubvolumeOpSm.get_init_state(subvolume_type)
+        except OpSmException as oe:
+            raise VolumeException(-errno.EINVAL, "subvolume creation failed: internal error")
+
+        subvol_path = os.path.join(self.base_path, str(uuid.uuid4()).encode('utf-8'))
+        try:
+            # create directory and set attributes
+            self.fs.mkdirs(subvol_path, mode)
+            self.mark_subvolume()
+            attrs = {
+                'uid': uid,
+                'gid': gid,
+                'data_pool': pool,
+                'pool_namespace': self.namespace if isolate_nspace else None,
+                'quota': size
+            }
+            self.set_attrs(subvol_path, attrs)
+
+            # persist subvolume metadata
+            qpath = subvol_path.decode('utf-8')
+            self.init_config(SubvolumeV1.VERSION, subvolume_type, qpath, initial_state)
+        except (VolumeException, MetadataMgrException, cephfs.Error) as e:
+            try:
+                log.info("cleaning up subvolume with path: {0}".format(self.subvolname))
+                self.remove()
+            except VolumeException as ve:
+                log.info("failed to cleanup subvolume '{0}' ({1})".format(self.subvolname, ve))
+
+            if isinstance(e, MetadataMgrException):
+                log.error("metadata manager exception: {0}".format(e))
+                e = VolumeException(-errno.EINVAL, "exception in subvolume metadata")
+            elif isinstance(e, cephfs.Error):
+                e = VolumeException(-e.args[0], e.args[1])
+            raise e
+
+    def add_clone_source(self, volname, subvolume, snapname, flush=False):
+        self.metadata_mgr.add_section("source")
+        self.metadata_mgr.update_section("source", "volume", volname)
+        if not subvolume.group.is_default_group():
+            self.metadata_mgr.update_section("source", "group", subvolume.group_name)
+        self.metadata_mgr.update_section("source", "subvolume", subvolume.subvol_name)
+        self.metadata_mgr.update_section("source", "snapshot", snapname)
+        if flush:
+            self.metadata_mgr.flush()
+
+    def remove_clone_source(self, flush=False):
+        self.metadata_mgr.remove_section("source")
+        if flush:
+            self.metadata_mgr.flush()
+
+    def create_clone(self, pool, source_volname, source_subvolume, snapname):
+        subvolume_type = SubvolumeTypes.TYPE_CLONE
+        try:
+            initial_state = SubvolumeOpSm.get_init_state(subvolume_type)
+        except OpSmException as oe:
+            raise VolumeException(-errno.EINVAL, "clone failed: internal error")
+
+        subvol_path = os.path.join(self.base_path, str(uuid.uuid4()).encode('utf-8'))
+        try:
+            # source snapshot attrs are used to create clone subvolume.
+            # attributes of subvolume's content though, are synced during the cloning process.
+            attrs = source_subvolume.get_attrs(source_subvolume.snapshot_data_path(snapname))
+
+            # override snapshot pool setting, if one is provided for the clone
+            if pool is not None:
+                attrs["data_pool"] = pool
+                attrs["pool_namespace"] = None
+
+            # create directory and set attributes
+            self.fs.mkdirs(subvol_path, attrs.get("mode"))
+            self.mark_subvolume()
+            self.set_attrs(subvol_path, attrs)
+
+            # persist subvolume metadata and clone source
+            qpath = subvol_path.decode('utf-8')
+            self.metadata_mgr.init(SubvolumeV1.VERSION, subvolume_type.value, qpath, initial_state.value)
+            self.add_clone_source(source_volname, source_subvolume, snapname)
+            self.metadata_mgr.flush()
+        except (VolumeException, MetadataMgrException, cephfs.Error) as e:
+            try:
+                log.info("cleaning up subvolume with path: {0}".format(self.subvolname))
+                self.remove()
+            except VolumeException as ve:
+                log.info("failed to cleanup subvolume '{0}' ({1})".format(self.subvolname, ve))
+
+            if isinstance(e, MetadataMgrException):
+                log.error("metadata manager exception: {0}".format(e))
+                e = VolumeException(-errno.EINVAL, "exception in subvolume metadata")
+            elif isinstance(e, cephfs.Error):
+                e = VolumeException(-e.args[0], e.args[1])
+            raise e
+
+    def allowed_ops_by_type(self, vol_type):
+        if vol_type == SubvolumeTypes.TYPE_CLONE:
+            return {op_type for op_type in SubvolumeOpType}
+
+        if vol_type == SubvolumeTypes.TYPE_NORMAL:
+            return {op_type for op_type in SubvolumeOpType} - {SubvolumeOpType.CLONE_STATUS,
+                                                               SubvolumeOpType.CLONE_CANCEL,
+                                                               SubvolumeOpType.CLONE_INTERNAL}
+
+        return {}
+
+    def allowed_ops_by_state(self, vol_state):
+        if vol_state == SubvolumeStates.STATE_COMPLETE:
+            return {op_type for op_type in SubvolumeOpType}
+
+        return {SubvolumeOpType.REMOVE_FORCE,
+                SubvolumeOpType.CLONE_CREATE,
+                SubvolumeOpType.CLONE_STATUS,
+                SubvolumeOpType.CLONE_CANCEL,
+                SubvolumeOpType.CLONE_INTERNAL}
+
+    def open(self, op_type):
+        if not isinstance(op_type, SubvolumeOpType):
+            raise VolumeException(-errno.ENOTSUP, "operation {0} not supported on subvolume '{1}'".format(
+                                  op_type.value, self.subvolname))
+        try:
+            self.metadata_mgr.refresh()
+
+            etype = self.subvol_type
+            if op_type not in self.allowed_ops_by_type(etype):
+                raise VolumeException(-errno.ENOTSUP, "operation '{0}' is not allowed on subvolume '{1}' of type {2}".format(
+                                      op_type.value, self.subvolname, etype.value))
+
+            estate = self.state
+            if op_type not in self.allowed_ops_by_state(estate):
+                raise VolumeException(-errno.EAGAIN, "subvolume '{0}' is not ready for operation {1}".format(
+                                      self.subvolname, op_type.value))
+
+            subvol_path = self.path
+            log.debug("refreshed metadata, checking subvolume path '{0}'".format(subvol_path))
+            st = self.fs.stat(subvol_path)
+            # unconditionally mark as subvolume, to handle pre-existing subvolumes without the mark
+            self.mark_subvolume()
+
+            self.uid = int(st.st_uid)
+            self.gid = int(st.st_gid)
+            self.mode = int(st.st_mode & ~stat.S_IFMT(st.st_mode))
+        except MetadataMgrException as me:
+            if me.errno == -errno.ENOENT:
+                raise VolumeException(-errno.ENOENT, "subvolume '{0}' does not exist".format(self.subvolname))
+            raise VolumeException(me.args[0], me.args[1])
+        except cephfs.ObjectNotFound:
+            log.debug("missing subvolume path '{0}' for subvolume '{1}'".format(subvol_path, self.subvolname))
+            raise VolumeException(-errno.ENOENT, "mount path missing for subvolume '{0}'".format(self.subvolname))
+        except cephfs.Error as e:
+            raise VolumeException(-e.args[0], e.args[1])
+
+    def _recover_auth_meta(self, auth_id, auth_meta):
+        """
+        Call me after locking the auth meta file.
+        """
+        remove_subvolumes = []
+
+        for subvol, subvol_data in auth_meta['subvolumes'].items():
+            if not subvol_data['dirty']:
+                continue
+
+            (group_name, subvol_name) = subvol.split('/')
+            group_name = group_name if group_name != 'None' else Group.NO_GROUP_NAME
+            access_level = subvol_data['access_level']
+
+            with self.auth_mdata_mgr.subvol_metadata_lock(group_name, subvol_name):
+                subvol_meta = self.auth_mdata_mgr.subvol_metadata_get(group_name, subvol_name)
+
+                # No SVMeta update indicates that there was no auth update
+                # in Ceph either. So it's safe to remove corresponding
+                # partial update in AMeta.
+                if not subvol_meta or auth_id not in subvol_meta['auths']:
+                    remove_subvolumes.append(subvol)
+                    continue
+
+                want_auth = {
+                    'access_level': access_level,
+                    'dirty': False,
+                }
+                # SVMeta update looks clean. Ceph auth update must have been
+                # clean. Update the dirty flag and continue
+                if subvol_meta['auths'][auth_id] == want_auth:
+                    auth_meta['subvolumes'][subvol]['dirty'] = False
+                    self.auth_mdata_mgr.auth_metadata_set(auth_id, auth_meta)
+                    continue
+
+                client_entity = "client.{0}".format(auth_id)
+                ret, out, err = self.mgr.mon_command(
+                    {
+                        'prefix': 'auth get',
+                        'entity': client_entity,
+                        'format': 'json'
+                    })
+                if ret == 0:
+                    existing_caps = json.loads(out)
+                elif ret == -errno.ENOENT:
+                    existing_caps = None
+                else:
+                    log.error(err)
+                    raise VolumeException(ret, err)
+
+                self._authorize_subvolume(auth_id, access_level, existing_caps)
+
+            # Recovered from partial auth updates for the auth ID's access
+            # to a subvolume.
+            auth_meta['subvolumes'][subvol]['dirty'] = False
+            self.auth_mdata_mgr.auth_metadata_set(auth_id, auth_meta)
+
+        for subvol in remove_subvolumes:
+            del auth_meta['subvolumes'][subvol]
+
+        if not auth_meta['subvolumes']:
+            # Clean up auth meta file
+            self.fs.unlink(self.auth_mdata_mgr._auth_metadata_path(auth_id))
+            return
+
+        # Recovered from all partial auth updates for the auth ID.
+        auth_meta['dirty'] = False
+        self.auth_mdata_mgr.auth_metadata_set(auth_id, auth_meta)
+
+    def authorize(self, auth_id, access_level, tenant_id=None, allow_existing_id=False):
+        """
+        Get-or-create a Ceph auth identity for `auth_id` and grant them access
+        to
+        :param auth_id:
+        :param access_level:
+        :param tenant_id: Optionally provide a stringizable object to
+                          restrict any created cephx IDs to other callers
+                          passing the same tenant ID.
+        :allow_existing_id: Optionally authorize existing auth-ids not
+                          created by ceph_volume_client.
+        :return:
+        """
+
+        with self.auth_mdata_mgr.auth_lock(auth_id):
+            client_entity = "client.{0}".format(auth_id)
+            ret, out, err = self.mgr.mon_command(
+                {
+                    'prefix': 'auth get',
+                    'entity': client_entity,
+                    'format': 'json'
+                })
+
+            if ret == 0:
+                existing_caps = json.loads(out)
+            elif ret == -errno.ENOENT:
+                existing_caps = None
+            else:
+                log.error(err)
+                raise VolumeException(ret, err)
+
+            # Existing meta, or None, to be updated
+            auth_meta = self.auth_mdata_mgr.auth_metadata_get(auth_id)
+
+            # subvolume data to be inserted
+            group_name = self.group.groupname if self.group.groupname != Group.NO_GROUP_NAME else None
+            group_subvol_id = "{0}/{1}".format(group_name, self.subvolname)
+            subvolume = {
+                group_subvol_id : {
+                    # The access level at which the auth_id is authorized to
+                    # access the volume.
+                    'access_level': access_level,
+                    'dirty': True,
+                }
+            }
+
+            if auth_meta is None:
+                if not allow_existing_id and existing_caps is not None:
+                    msg = "auth ID: {0} exists and not created by mgr plugin. Not allowed to modify".format(auth_id)
+                    log.error(msg)
+                    raise VolumeException(-errno.EPERM, msg)
+
+                # non-existent auth IDs
+                sys.stderr.write("Creating meta for ID {0} with tenant {1}\n".format(
+                    auth_id, tenant_id
+                ))
+                log.debug("Authorize: no existing meta")
+                auth_meta = {
+                    'dirty': True,
+                    'tenant_id': str(tenant_id) if tenant_id else None,
+                    'subvolumes': subvolume
+                }
+            else:
+                # Update 'volumes' key (old style auth metadata file) to 'subvolumes' key
+                if 'volumes' in auth_meta:
+                    auth_meta['subvolumes'] = auth_meta.pop('volumes')
+
+                # Disallow tenants to share auth IDs
+                if str(auth_meta['tenant_id']) != str(tenant_id):
+                    msg = "auth ID: {0} is already in use".format(auth_id)
+                    log.error(msg)
+                    raise VolumeException(-errno.EPERM, msg)
+
+                if auth_meta['dirty']:
+                    self._recover_auth_meta(auth_id, auth_meta)
+
+                log.debug("Authorize: existing tenant {tenant}".format(
+                    tenant=auth_meta['tenant_id']
+                ))
+                auth_meta['dirty'] = True
+                auth_meta['subvolumes'].update(subvolume)
+
+            self.auth_mdata_mgr.auth_metadata_set(auth_id, auth_meta)
+
+            with self.auth_mdata_mgr.subvol_metadata_lock(self.group.groupname, self.subvolname):
+                key = self._authorize_subvolume(auth_id, access_level, existing_caps)
+
+            auth_meta['dirty'] = False
+            auth_meta['subvolumes'][group_subvol_id]['dirty'] = False
+            self.auth_mdata_mgr.auth_metadata_set(auth_id, auth_meta)
+
+            if tenant_id:
+                return key
+            else:
+                # Caller wasn't multi-tenant aware: be safe and don't give
+                # them a key
+                return ""
+
+    def _authorize_subvolume(self, auth_id, access_level, existing_caps):
+        subvol_meta = self.auth_mdata_mgr.subvol_metadata_get(self.group.groupname, self.subvolname)
+
+        auth = {
+            auth_id: {
+                'access_level': access_level,
+                'dirty': True,
+            }
+        }
+
+        if subvol_meta is None:
+            subvol_meta = {
+                'auths': auth
+            }
+        else:
+            subvol_meta['auths'].update(auth)
+            self.auth_mdata_mgr.subvol_metadata_set(self.group.groupname, self.subvolname, subvol_meta)
+
+        key = self._authorize(auth_id, access_level, existing_caps)
+
+        subvol_meta['auths'][auth_id]['dirty'] = False
+        self.auth_mdata_mgr.subvol_metadata_set(self.group.groupname, self.subvolname, subvol_meta)
+
+        return key
+
+    def _authorize(self, auth_id, access_level, existing_caps):
+        subvol_path = self.path
+        log.debug("Authorizing Ceph id '{0}' for path '{1}'".format(auth_id, subvol_path))
+
+        # First I need to work out what the data pool is for this share:
+        # read the layout
+        try:
+            pool = self.fs.getxattr(subvol_path, 'ceph.dir.layout.pool').decode('utf-8')
+        except cephfs.Error as e:
+            raise VolumeException(-e.args[0], e.args[1])
+
+        try:
+            namespace = self.fs.getxattr(subvol_path, 'ceph.dir.layout.pool_namespace').decode('utf-8')
+        except cephfs.NoData:
+            namespace = None
+
+        # Now construct auth capabilities that give the guest just enough
+        # permissions to access the share
+        client_entity = "client.{0}".format(auth_id)
+        want_mds_cap = "allow {0} path={1}".format(access_level, subvol_path.decode('utf-8'))
+        want_osd_cap = "allow {0} pool={1}{2}".format(
+                access_level, pool, " namespace={0}".format(namespace) if namespace else "")
+
+        # Construct auth caps that if present might conflict with the desired
+        # auth caps.
+        unwanted_access_level = 'r' if access_level is 'rw' else 'rw'
+        unwanted_mds_cap = 'allow {0} path={1}'.format(unwanted_access_level, subvol_path.decode('utf-8'))
+        unwanted_osd_cap = "allow {0} pool={1}{2}".format(
+                unwanted_access_level, pool, " namespace={0}".format(namespace) if namespace else "")
+
+        return allow_access(self.mgr, client_entity, want_mds_cap, want_osd_cap,
+                            unwanted_mds_cap, unwanted_osd_cap, existing_caps)
+
+    def deauthorize(self, auth_id):
+        with self.auth_mdata_mgr.auth_lock(auth_id):
+            # Existing meta, or None, to be updated
+            auth_meta = self.auth_mdata_mgr.auth_metadata_get(auth_id)
+
+            if auth_meta is None:
+                msg = "auth ID: {0} doesn't exist".format(auth_id)
+                log.error(msg)
+                raise VolumeException(-errno.ENOENT, msg)
+
+            # Update 'volumes' key (old style auth metadata file) to 'subvolumes' key
+            if 'volumes' in auth_meta:
+                auth_meta['subvolumes'] = auth_meta.pop('volumes')
+
+            group_name = self.group.groupname if self.group.groupname != Group.NO_GROUP_NAME else None
+            group_subvol_id = "{0}/{1}".format(group_name, self.subvolname)
+            if (auth_meta is None) or (not auth_meta['subvolumes']):
+                log.warning("deauthorized called for already-removed auth"
+                         "ID '{auth_id}' for subvolume '{subvolume}'".format(
+                    auth_id=auth_id, subvolume=self.subvolname
+                ))
+                # Clean up the auth meta file of an auth ID
+                self.fs.unlink(self.auth_mdata_mgr._auth_metadata_path(auth_id))
+                return
+
+            if group_subvol_id not in auth_meta['subvolumes']:
+                log.warning("deauthorized called for already-removed auth"
+                         "ID '{auth_id}' for subvolume '{subvolume}'".format(
+                    auth_id=auth_id, subvolume=self.subvolname
+                ))
+                return
+
+            if auth_meta['dirty']:
+                self._recover_auth_meta(auth_id, auth_meta)
+
+            auth_meta['dirty'] = True
+            auth_meta['subvolumes'][group_subvol_id]['dirty'] = True
+            self.auth_mdata_mgr.auth_metadata_set(auth_id, auth_meta)
+
+            self._deauthorize_subvolume(auth_id)
+
+            # Filter out the volume we're deauthorizing
+            del auth_meta['subvolumes'][group_subvol_id]
+
+            # Clean up auth meta file
+            if not auth_meta['subvolumes']:
+                self.fs.unlink(self.auth_mdata_mgr._auth_metadata_path(auth_id))
+                return
+
+            auth_meta['dirty'] = False
+            self.auth_mdata_mgr.auth_metadata_set(auth_id, auth_meta)
+
+    def _deauthorize_subvolume(self, auth_id):
+        with self.auth_mdata_mgr.subvol_metadata_lock(self.group.groupname, self.subvolname):
+            subvol_meta = self.auth_mdata_mgr.subvol_metadata_get(self.group.groupname, self.subvolname)
+
+            if (subvol_meta is None) or (auth_id not in subvol_meta['auths']):
+                log.warning("deauthorized called for already-removed auth"
+                         "ID '{auth_id}' for subvolume '{subvolume}'".format(
+                    auth_id=auth_id, subvolume=self.subvolname
+                ))
+                return
+
+            subvol_meta['auths'][auth_id]['dirty'] = True
+            self.auth_mdata_mgr.subvol_metadata_set(self.group.groupname, self.subvolname, subvol_meta)
+
+            self._deauthorize(auth_id)
+
+            # Remove the auth_id from the metadata *after* removing it
+            # from ceph, so that if we crashed here, we would actually
+            # recreate the auth ID during recovery (i.e. end up with
+            # a consistent state).
+
+            # Filter out the auth we're removing
+            del subvol_meta['auths'][auth_id]
+            self.auth_mdata_mgr.subvol_metadata_set(self.group.groupname, self.subvolname, subvol_meta)
+
+    def _deauthorize(self, auth_id):
+        """
+        The volume must still exist.
+        """
+        client_entity = "client.{0}".format(auth_id)
+        subvol_path = self.path
+        try:
+            pool_name = self.fs.getxattr(subvol_path, 'ceph.dir.layout.pool').decode('utf-8')
+        except cephfs.Error as e:
+            raise VolumeException(-e.args[0], e.args[1])
+
+        try:
+            namespace = self.fs.getxattr(subvol_path, 'ceph.dir.layout.pool_namespace').decode('utf-8')
+        except cephfs.NoData:
+            namespace = None
+
+        # The auth_id might have read-only or read-write mount access for the
+        # subvolume path.
+        access_levels = ('r', 'rw')
+        want_mds_caps = ['allow {0} path={1}'.format(access_level, subvol_path.decode('utf-8'))
+                         for access_level in access_levels]
+        want_osd_caps = ['allow {0} pool={1}{2}'.format(
+                          access_level, pool_name, " namespace={0}".format(namespace) if namespace else "")
+                         for access_level in access_levels]
+        deny_access(self.mgr, client_entity, want_mds_caps, want_osd_caps)
+
+    def authorized_list(self):
+        """
+        Expose a list of auth IDs that have access to a subvolume.
+
+        return: a list of (auth_id, access_level) tuples, where
+                the access_level can be 'r' , or 'rw'.
+                None if no auth ID is given access to the subvolume.
+        """
+        with self.auth_mdata_mgr.subvol_metadata_lock(self.group.groupname, self.subvolname):
+            meta = self.auth_mdata_mgr.subvol_metadata_get(self.group.groupname, self.subvolname)
+            auths = [] # type: List[Dict[str,str]]
+            if not meta or not meta['auths']:
+                return auths
+
+            for auth, auth_data in meta['auths'].items():
+                # Skip partial auth updates.
+                if not auth_data['dirty']:
+                    auths.append({auth: auth_data['access_level']})
+
+            return auths
+
+    def evict(self, volname, auth_id, timeout=30):
+        """
+        Evict all clients based on the authorization ID and the subvolume path mounted.
+        Assumes that the authorization key has been revoked prior to calling this function.
+
+        This operation can throw an exception if the mon cluster is unresponsive, or
+        any individual MDS daemon is unresponsive for longer than the timeout passed in.
+        """
+
+        client_spec = ["auth_name={0}".format(auth_id), ]
+        client_spec.append("client_metadata.root={0}".
+                           format(self.path.decode('utf-8')))
+
+        log.info("evict clients with {0}".format(', '.join(client_spec)))
+
+        mds_map = get_mds_map(self.mgr, volname)
+        if not mds_map:
+            raise VolumeException(-errno.ENOENT, "mdsmap for volume {0} not found".format(volname))
+
+        up = {}
+        for name, gid in mds_map['up'].items():
+            # Quirk of the MDSMap JSON dump: keys in the up dict are like "mds_0"
+            assert name.startswith("mds_")
+            up[int(name[4:])] = gid
+
+        # For all MDS ranks held by a daemon
+        # Do the parallelism in python instead of using "tell mds.*", because
+        # the latter doesn't give us per-mds output
+        threads = []
+        for rank, gid in up.items():
+            thread = RankEvicter(self.mgr, self.fs, client_spec, volname, rank, gid, mds_map, timeout)
+            thread.start()
+            threads.append(thread)
+
+        for t in threads:
+            t.join()
+
+        log.info("evict: joined all")
+
+        for t in threads:
+            if not t.success:
+                msg = ("Failed to evict client with {0} from mds {1}/{2}: {3}".
+                       format(', '.join(client_spec), t.rank, t.gid, t.exception)
+                      )
+                log.error(msg)
+                raise EvictionError(msg)
+
+    def _get_clone_source(self):
+        try:
+            clone_source = {
+                'volume'   : self.metadata_mgr.get_option("source", "volume"),
+                'subvolume': self.metadata_mgr.get_option("source", "subvolume"),
+                'snapshot' : self.metadata_mgr.get_option("source", "snapshot"),
+            }
+
+            try:
+                clone_source["group"] = self.metadata_mgr.get_option("source", "group")
+            except MetadataMgrException as me:
+                if me.errno == -errno.ENOENT:
+                    pass
+                else:
+                    raise
+        except MetadataMgrException as me:
+            raise VolumeException(-errno.EINVAL, "error fetching subvolume metadata")
+        return clone_source
+
+    @property
+    def status(self):
+        state = SubvolumeStates.from_value(self.metadata_mgr.get_global_option(MetadataManager.GLOBAL_META_KEY_STATE))
+        subvolume_type = self.subvol_type
+        subvolume_status = {
+            'state' : state.value
+        }
+        if not SubvolumeOpSm.is_complete_state(state) and subvolume_type == SubvolumeTypes.TYPE_CLONE:
+            subvolume_status["source"] = self._get_clone_source()
+        return subvolume_status
+
+    @property
+    def state(self):
+        return SubvolumeStates.from_value(self.metadata_mgr.get_global_option(MetadataManager.GLOBAL_META_KEY_STATE))
+
+    @state.setter
+    def state(self, val):
+        state = val[0].value
+        flush = val[1]
+        self.metadata_mgr.update_global_section(MetadataManager.GLOBAL_META_KEY_STATE, state)
+        if flush:
+            self.metadata_mgr.flush()
+
+    def remove(self, retainsnaps=False):
+        if retainsnaps:
+            raise VolumeException(-errno.EINVAL, "subvolume '{0}' does not support snapshot retention on delete".format(self.subvolname))
+        if self.list_snapshots():
+            raise VolumeException(-errno.ENOTEMPTY, "subvolume '{0}' has snapshots".format(self.subvolname))
+        self.trash_base_dir()
+
+    def resize(self, newsize, noshrink):
+        subvol_path = self.path
+        return self._resize(subvol_path, newsize, noshrink)
+
+    def create_snapshot(self, snapname):
+        try:
+            group_snapshot_path = os.path.join(self.group.path,
+                                               self.vol_spec.snapshot_dir_prefix.encode('utf-8'),
+                                               snapname.encode('utf-8'))
+            self.fs.stat(group_snapshot_path)
+        except cephfs.Error as e:
+            if e.args[0] == errno.ENOENT:
+                snappath = self.snapshot_path(snapname)
+                mksnap(self.fs, snappath)
+            else:
+                raise VolumeException(-e.args[0], e.args[1])
+        else:
+            raise VolumeException(-errno.EINVAL, "subvolumegroup and subvolume snapshot name can't be same")
+
+    def has_pending_clones(self, snapname):
+        try:
+            return self.metadata_mgr.section_has_item('clone snaps', snapname)
+        except MetadataMgrException as me:
+            if me.errno == -errno.ENOENT:
+                return False
+            raise
+
+    def remove_snapshot(self, snapname):
+        if self.has_pending_clones(snapname):
+            raise VolumeException(-errno.EAGAIN, "snapshot '{0}' has pending clones".format(snapname))
+        snappath = self.snapshot_path(snapname)
+        rmsnap(self.fs, snappath)
+
+    def snapshot_info(self, snapname):
+        if is_inherited_snap(snapname):
+            raise VolumeException(-errno.EINVAL,
+                                  "snapshot name '{0}' is invalid".format(snapname))
+        snappath = self.snapshot_data_path(snapname)
+        snap_info = {}
+        try:
+            snap_attrs = {'created_at':'ceph.snap.btime', 'size':'ceph.dir.rbytes',
+                          'data_pool':'ceph.dir.layout.pool'}
+            for key, val in snap_attrs.items():
+                snap_info[key] = self.fs.getxattr(snappath, val)
+            return {'size': int(snap_info['size']),
+                    'created_at': str(datetime.fromtimestamp(float(snap_info['created_at']))),
+                    'data_pool': snap_info['data_pool'].decode('utf-8'),
+                    'has_pending_clones': "yes" if self.has_pending_clones(snapname) else "no"}
+        except cephfs.Error as e:
+            if e.errno == errno.ENOENT:
+                raise VolumeException(-errno.ENOENT,
+                                      "snapshot '{0}' does not exist".format(snapname))
+            raise VolumeException(-e.args[0], e.args[1])
+
+    def list_snapshots(self):
+        try:
+            dirpath = self.snapshot_base_path()
+            return listsnaps(self.fs, self.vol_spec, dirpath, filter_inherited_snaps=True)
+        except VolumeException as ve:
+            if ve.errno == -errno.ENOENT:
+                return []
+            raise
+
+    def _add_snap_clone(self, track_id, snapname):
+        self.metadata_mgr.add_section("clone snaps")
+        self.metadata_mgr.update_section("clone snaps", track_id, snapname)
+        self.metadata_mgr.flush()
+
+    def _remove_snap_clone(self, track_id):
+        self.metadata_mgr.remove_option("clone snaps", track_id)
+        self.metadata_mgr.flush()
+
+    def attach_snapshot(self, snapname, tgt_subvolume):
+        if not snapname.encode('utf-8') in self.list_snapshots():
+            raise VolumeException(-errno.ENOENT, "snapshot '{0}' does not exist".format(snapname))
+        try:
+            create_clone_index(self.fs, self.vol_spec)
+            with open_clone_index(self.fs, self.vol_spec) as index:
+                track_idx = index.track(tgt_subvolume.base_path)
+                self._add_snap_clone(track_idx, snapname)
+        except (IndexException, MetadataMgrException) as e:
+            log.warn("error creating clone index: {0}".format(e))
+            raise VolumeException(-errno.EINVAL, "error cloning subvolume")
+
+    def detach_snapshot(self, snapname, track_id):
+        if not snapname.encode('utf-8') in self.list_snapshots():
+            raise VolumeException(-errno.ENOENT, "snapshot '{0}' does not exist".format(snapname))
+        try:
+            with open_clone_index(self.fs, self.vol_spec) as index:
+                index.untrack(track_id)
+                self._remove_snap_clone(track_id)
+        except (IndexException, MetadataMgrException) as e:
+            log.warn("error delining snapshot from clone: {0}".format(e))
+            raise VolumeException(-errno.EINVAL, "error delinking snapshot from clone")
diff --git a/src/pybind/mgr/volumes/fs/operations/versions/subvolume_v2.py b/src/pybind/mgr/volumes/fs/operations/versions/subvolume_v2.py
new file mode 100644
index 00000000..1dd6f3fe
--- /dev/null
+++ b/src/pybind/mgr/volumes/fs/operations/versions/subvolume_v2.py
@@ -0,0 +1,370 @@
+import os
+import stat
+import uuid
+import errno
+import logging
+
+import cephfs
+
+from .metadata_manager import MetadataManager
+from .subvolume_attrs import SubvolumeTypes, SubvolumeStates, SubvolumeFeatures
+from .op_sm import SubvolumeOpSm
+from .subvolume_v1 import SubvolumeV1
+from ..template import SubvolumeTemplate
+from ...exception import OpSmException, VolumeException, MetadataMgrException
+from ...fs_util import listdir
+from ..template import SubvolumeOpType
+
+log = logging.getLogger(__name__)
+
+class SubvolumeV2(SubvolumeV1):
+    """
+    Version 2 subvolumes creates a subvolume with path as follows,
+        volumes/<group-name>/<subvolume-name>/<uuid>/
+
+    The distinguishing feature of V2 subvolume as compared to V1 subvolumes is its ability to retain snapshots
+    of a subvolume on removal. This is done by creating snapshots under the <subvolume-name> directory,
+    rather than under the <uuid> directory, as is the case of V1 subvolumes.
+
+    - The directory under which user data resides is <uuid>
+    - Snapshots of the subvolume are taken within the <subvolume-name> directory
+    - A meta file is maintained under the <subvolume-name> directory as a metadata store, storing information similar
+    to V1 subvolumes
+    - On a request to remove subvolume but retain its snapshots, only the <uuid> directory is moved to trash, retaining
+    the rest of the subvolume and its meta file.
+        - The <uuid> directory, when present, is the current incarnation of the subvolume, which may have snapshots of
+        older incarnations of the same subvolume.
+    - V1 subvolumes that currently do not have any snapshots are upgraded to V2 subvolumes automatically, to support the
+    snapshot retention feature
+    """
+    VERSION = 2
+
+    @staticmethod
+    def version():
+        return SubvolumeV2.VERSION
+
+    @property
+    def features(self):
+        return [SubvolumeFeatures.FEATURE_SNAPSHOT_CLONE.value,
+                SubvolumeFeatures.FEATURE_SNAPSHOT_AUTOPROTECT.value,
+                SubvolumeFeatures.FEATURE_SNAPSHOT_RETENTION.value]
+
+    @property
+    def retained(self):
+        try:
+            self.metadata_mgr.refresh()
+            if self.state == SubvolumeStates.STATE_RETAINED:
+                return True
+            return False
+        except MetadataMgrException as me:
+            if me.errno != -errno.ENOENT:
+                raise VolumeException(me.errno, "internal error while processing subvolume '{0}'".format(self.subvolname))
+        return False
+
+    @property
+    def purgeable(self):
+        if not self.retained or self.list_snapshots() or self.has_pending_purges:
+            return False
+        return True
+
+    @property
+    def has_pending_purges(self):
+        try:
+            return not listdir(self.fs, self.trash_dir) == []
+        except VolumeException as ve:
+            if ve.errno == -errno.ENOENT:
+                return False
+            raise
+
+    @property
+    def trash_dir(self):
+        return os.path.join(self.base_path, b".trash")
+
+    def create_trashcan(self):
+        """per subvolume trash directory"""
+        try:
+            self.fs.stat(self.trash_dir)
+        except cephfs.Error as e:
+            if e.args[0] == errno.ENOENT:
+                try:
+                    self.fs.mkdir(self.trash_dir, 0o700)
+                except cephfs.Error as ce:
+                    raise VolumeException(-ce.args[0], ce.args[1])
+            else:
+                raise VolumeException(-e.args[0], e.args[1])
+
+    def mark_subvolume(self):
+        # set subvolume attr, on subvolume root, marking it as a CephFS subvolume
+        # subvolume root is where snapshots would be taken, and hence is the base_path for v2 subvolumes
+        try:
+            # MDS treats this as a noop for already marked subvolume
+            self.fs.setxattr(self.base_path, 'ceph.dir.subvolume', b'1', 0)
+        except cephfs.InvalidValue as e:
+            raise VolumeException(-errno.EINVAL, "invalid value specified for ceph.dir.subvolume")
+        except cephfs.Error as e:
+            raise VolumeException(-e.args[0], e.args[1])
+
+    @staticmethod
+    def is_valid_uuid(uuid_str):
+        try:
+            uuid.UUID(uuid_str)
+            return True
+        except ValueError:
+            return False
+
+    def snapshot_base_path(self):
+        return os.path.join(self.base_path, self.vol_spec.snapshot_dir_prefix.encode('utf-8'))
+
+    def snapshot_data_path(self, snapname):
+        snap_base_path = self.snapshot_path(snapname)
+        uuid_str = None
+        try:
+            with self.fs.opendir(snap_base_path) as dir_handle:
+                d = self.fs.readdir(dir_handle)
+                while d:
+                    if d.d_name not in (b".", b".."):
+                        d_full_path = os.path.join(snap_base_path, d.d_name)
+                        stx = self.fs.statx(d_full_path, cephfs.CEPH_STATX_MODE, cephfs.AT_SYMLINK_NOFOLLOW)
+                        if stat.S_ISDIR(stx.get('mode')):
+                            if self.is_valid_uuid(d.d_name.decode('utf-8')):
+                                uuid_str = d.d_name
+                    d = self.fs.readdir(dir_handle)
+        except cephfs.Error as e:
+            if e.errno == errno.ENOENT:
+                raise VolumeException(-errno.ENOENT, "snapshot '{0}' does not exist".format(snapname))
+            raise VolumeException(-e.args[0], e.args[1])
+
+        if not uuid_str:
+            raise VolumeException(-errno.ENOENT, "snapshot '{0}' does not exist".format(snapname))
+
+        return os.path.join(snap_base_path, uuid_str)
+
+    def _remove_on_failure(self, subvol_path, retained):
+        if retained:
+            log.info("cleaning up subvolume incarnation with path: {0}".format(subvol_path))
+            try:
+                self.fs.rmdir(subvol_path)
+            except cephfs.Error as e:
+                raise VolumeException(-e.args[0], e.args[1])
+        else:
+            log.info("cleaning up subvolume with path: {0}".format(self.subvolname))
+            self.remove()
+
+    def _set_incarnation_metadata(self, subvolume_type, qpath, initial_state):
+        self.metadata_mgr.update_global_section(MetadataManager.GLOBAL_META_KEY_TYPE, subvolume_type.value)
+        self.metadata_mgr.update_global_section(MetadataManager.GLOBAL_META_KEY_PATH, qpath)
+        self.metadata_mgr.update_global_section(MetadataManager.GLOBAL_META_KEY_STATE, initial_state.value)
+
+    def create(self, size, isolate_nspace, pool, mode, uid, gid):
+        subvolume_type = SubvolumeTypes.TYPE_NORMAL
+        try:
+            initial_state = SubvolumeOpSm.get_init_state(subvolume_type)
+        except OpSmException as oe:
+            raise VolumeException(-errno.EINVAL, "subvolume creation failed: internal error")
+
+        retained = self.retained
+        if retained and self.has_pending_purges:
+            raise VolumeException(-errno.EAGAIN, "asynchronous purge of subvolume in progress")
+        subvol_path = os.path.join(self.base_path, str(uuid.uuid4()).encode('utf-8'))
+        try:
+            self.fs.mkdirs(subvol_path, mode)
+            self.mark_subvolume()
+            attrs = {
+                'uid': uid,
+                'gid': gid,
+                'data_pool': pool,
+                'pool_namespace': self.namespace if isolate_nspace else None,
+                'quota': size
+            }
+            self.set_attrs(subvol_path, attrs)
+
+            # persist subvolume metadata
+            qpath = subvol_path.decode('utf-8')
+            if retained:
+                self._set_incarnation_metadata(subvolume_type, qpath, initial_state)
+                self.metadata_mgr.flush()
+            else:
+                self.init_config(SubvolumeV2.VERSION, subvolume_type, qpath, initial_state)
+
+            # Create the subvolume metadata file which manages auth-ids if it doesn't exist
+            self.auth_mdata_mgr.create_subvolume_metadata_file(self.group.groupname, self.subvolname)
+        except (VolumeException, MetadataMgrException, cephfs.Error) as e:
+            try:
+                self._remove_on_failure(subvol_path, retained)
+            except VolumeException as ve:
+                log.info("failed to cleanup subvolume '{0}' ({1})".format(self.subvolname, ve))
+
+            if isinstance(e, MetadataMgrException):
+                log.error("metadata manager exception: {0}".format(e))
+                e = VolumeException(-errno.EINVAL, "exception in subvolume metadata")
+            elif isinstance(e, cephfs.Error):
+                e = VolumeException(-e.args[0], e.args[1])
+            raise e
+
+    def create_clone(self, pool, source_volname, source_subvolume, snapname):
+        subvolume_type = SubvolumeTypes.TYPE_CLONE
+        try:
+            initial_state = SubvolumeOpSm.get_init_state(subvolume_type)
+        except OpSmException as oe:
+            raise VolumeException(-errno.EINVAL, "clone failed: internal error")
+
+        retained = self.retained
+        if retained and self.has_pending_purges:
+            raise VolumeException(-errno.EAGAIN, "asynchronous purge of subvolume in progress")
+        subvol_path = os.path.join(self.base_path, str(uuid.uuid4()).encode('utf-8'))
+        try:
+            # source snapshot attrs are used to create clone subvolume
+            # attributes of subvolume's content though, are synced during the cloning process.
+            attrs = source_subvolume.get_attrs(source_subvolume.snapshot_data_path(snapname))
+
+            # override snapshot pool setting, if one is provided for the clone
+            if pool is not None:
+                attrs["data_pool"] = pool
+                attrs["pool_namespace"] = None
+
+            # create directory and set attributes
+            self.fs.mkdirs(subvol_path, attrs.get("mode"))
+            self.mark_subvolume()
+            self.set_attrs(subvol_path, attrs)
+
+            # persist subvolume metadata and clone source
+            qpath = subvol_path.decode('utf-8')
+            if retained:
+                self._set_incarnation_metadata(subvolume_type, qpath, initial_state)
+            else:
+                self.metadata_mgr.init(SubvolumeV2.VERSION, subvolume_type.value, qpath, initial_state.value)
+            self.add_clone_source(source_volname, source_subvolume, snapname)
+            self.metadata_mgr.flush()
+        except (VolumeException, MetadataMgrException, cephfs.Error) as e:
+            try:
+                self._remove_on_failure(subvol_path, retained)
+            except VolumeException as ve:
+                log.info("failed to cleanup subvolume '{0}' ({1})".format(self.subvolname, ve))
+
+            if isinstance(e, MetadataMgrException):
+                log.error("metadata manager exception: {0}".format(e))
+                e = VolumeException(-errno.EINVAL, "exception in subvolume metadata")
+            elif isinstance(e, cephfs.Error):
+                e = VolumeException(-e.args[0], e.args[1])
+            raise e
+
+    def allowed_ops_by_type(self, vol_type):
+        if vol_type == SubvolumeTypes.TYPE_CLONE:
+            return {op_type for op_type in SubvolumeOpType}
+
+        if vol_type == SubvolumeTypes.TYPE_NORMAL:
+            return {op_type for op_type in SubvolumeOpType} - {SubvolumeOpType.CLONE_STATUS,
+                                                               SubvolumeOpType.CLONE_CANCEL,
+                                                               SubvolumeOpType.CLONE_INTERNAL}
+
+        return {}
+
+    def allowed_ops_by_state(self, vol_state):
+        if vol_state == SubvolumeStates.STATE_COMPLETE:
+            return {op_type for op_type in SubvolumeOpType}
+
+        if vol_state == SubvolumeStates.STATE_RETAINED:
+            return {
+                SubvolumeOpType.REMOVE,
+                SubvolumeOpType.REMOVE_FORCE,
+                SubvolumeOpType.LIST,
+                SubvolumeOpType.INFO,
+                SubvolumeOpType.SNAP_REMOVE,
+                SubvolumeOpType.SNAP_LIST,
+                SubvolumeOpType.SNAP_INFO,
+                SubvolumeOpType.SNAP_PROTECT,
+                SubvolumeOpType.SNAP_UNPROTECT,
+                SubvolumeOpType.CLONE_SOURCE
+            }
+
+        return {SubvolumeOpType.REMOVE_FORCE,
+                SubvolumeOpType.CLONE_CREATE,
+                SubvolumeOpType.CLONE_STATUS,
+                SubvolumeOpType.CLONE_CANCEL,
+                SubvolumeOpType.CLONE_INTERNAL,
+                SubvolumeOpType.CLONE_SOURCE}
+
+    def open(self, op_type):
+        if not isinstance(op_type, SubvolumeOpType):
+            raise VolumeException(-errno.ENOTSUP, "operation {0} not supported on subvolume '{1}'".format(
+                                  op_type.value, self.subvolname))
+        try:
+            self.metadata_mgr.refresh()
+            # unconditionally mark as subvolume, to handle pre-existing subvolumes without the mark
+            self.mark_subvolume()
+
+            etype = self.subvol_type
+            if op_type not in self.allowed_ops_by_type(etype):
+                raise VolumeException(-errno.ENOTSUP, "operation '{0}' is not allowed on subvolume '{1}' of type {2}".format(
+                                      op_type.value, self.subvolname, etype.value))
+
+            estate = self.state
+            if op_type not in self.allowed_ops_by_state(estate) and estate == SubvolumeStates.STATE_RETAINED:
+                raise VolumeException(-errno.ENOENT, "subvolume '{0}' is removed and has only snapshots retained".format(
+                                      self.subvolname))
+
+            if op_type not in self.allowed_ops_by_state(estate) and estate != SubvolumeStates.STATE_RETAINED:
+                raise VolumeException(-errno.EAGAIN, "subvolume '{0}' is not ready for operation {1}".format(
+                                      self.subvolname, op_type.value))
+
+            if estate != SubvolumeStates.STATE_RETAINED:
+                subvol_path = self.path
+                log.debug("refreshed metadata, checking subvolume path '{0}'".format(subvol_path))
+                st = self.fs.stat(subvol_path)
+
+                self.uid = int(st.st_uid)
+                self.gid = int(st.st_gid)
+                self.mode = int(st.st_mode & ~stat.S_IFMT(st.st_mode))
+        except MetadataMgrException as me:
+            if me.errno == -errno.ENOENT:
+                raise VolumeException(-errno.ENOENT, "subvolume '{0}' does not exist".format(self.subvolname))
+            raise VolumeException(me.args[0], me.args[1])
+        except cephfs.ObjectNotFound:
+            log.debug("missing subvolume path '{0}' for subvolume '{1}'".format(subvol_path, self.subvolname))
+            raise VolumeException(-errno.ENOENT, "mount path missing for subvolume '{0}'".format(self.subvolname))
+        except cephfs.Error as e:
+            raise VolumeException(-e.args[0], e.args[1])
+
+    def trash_incarnation_dir(self):
+        """rename subvolume (uuid component) to trash"""
+        self.create_trashcan()
+        try:
+            bname = os.path.basename(self.path)
+            tpath = os.path.join(self.trash_dir, bname)
+            log.debug("trash: {0} -> {1}".format(self.path, tpath))
+            self.fs.rename(self.path, tpath)
+            self._link_dir(tpath, bname)
+        except cephfs.Error as e:
+            raise VolumeException(-e.args[0], e.args[1])
+
+    def remove(self, retainsnaps=False):
+        if self.list_snapshots():
+            if not retainsnaps:
+                raise VolumeException(-errno.ENOTEMPTY, "subvolume '{0}' has snapshots".format(self.subvolname))
+        else:
+            if not self.has_pending_purges:
+                self.trash_base_dir()
+                # Delete the volume meta file, if it's not already deleted
+                self.auth_mdata_mgr.delete_subvolume_metadata_file(self.group.groupname, self.subvolname)
+                return
+        if self.state != SubvolumeStates.STATE_RETAINED:
+            self.trash_incarnation_dir()
+            self.metadata_mgr.update_global_section(MetadataManager.GLOBAL_META_KEY_PATH, "")
+            self.metadata_mgr.update_global_section(MetadataManager.GLOBAL_META_KEY_STATE, SubvolumeStates.STATE_RETAINED.value)
+            self.metadata_mgr.flush()
+            # Delete the volume meta file, if it's not already deleted
+            self.auth_mdata_mgr.delete_subvolume_metadata_file(self.group.groupname, self.subvolname)
+
+    def info(self):
+        if self.state != SubvolumeStates.STATE_RETAINED:
+            return super(SubvolumeV2, self).info()
+
+        return {'type': self.subvol_type.value, 'features': self.features, 'state': SubvolumeStates.STATE_RETAINED.value}
+
+    def remove_snapshot(self, snapname):
+        super(SubvolumeV2, self).remove_snapshot(snapname)
+        if self.purgeable:
+            self.trash_base_dir()
+            # tickle the volume purge job to purge this entry, using ESTALE
+            raise VolumeException(-errno.ESTALE, "subvolume '{0}' has been removed as the last retained snapshot is removed".format(self.subvolname))
+        # if not purgeable, subvol is not retained, or has snapshots, or already has purge jobs that will garbage collect this subvol
diff --git a/src/pybind/mgr/volumes/fs/operations/volume.py b/src/pybind/mgr/volumes/fs/operations/volume.py
new file mode 100644
index 00000000..410e5c44
--- /dev/null
+++ b/src/pybind/mgr/volumes/fs/operations/volume.py
@@ -0,0 +1,348 @@
+import time
+import errno
+import logging
+import sys
+
+try:
+    from typing import List
+except ImportError:
+    pass  # For typing only
+
+from contextlib import contextmanager
+from threading import Lock, Condition
+
+if sys.version_info >= (3, 3):
+    from threading import Timer
+else:
+    from threading import _Timer as Timer
+
+import cephfs
+import orchestrator
+
+from .lock import GlobalLock
+from ..exception import VolumeException
+from ..fs_util import create_pool, remove_pool, create_filesystem, \
+    remove_filesystem, create_mds, volume_exists
+
+log = logging.getLogger(__name__)
+
+class ConnectionPool(object):
+    class Connection(object):
+        def __init__(self, mgr, fs_name):
+            self.fs = None
+            self.mgr = mgr
+            self.fs_name = fs_name
+            self.ops_in_progress = 0
+            self.last_used = time.time()
+            self.fs_id = self.get_fs_id()
+
+        def get_fs_id(self):
+            fs_map = self.mgr.get('fs_map')
+            for fs in fs_map['filesystems']:
+                if fs['mdsmap']['fs_name'] == self.fs_name:
+                    return fs['id']
+            raise VolumeException(
+                -errno.ENOENT, "Volume '{0}' not found".format(self.fs_name))
+
+        def get_fs_handle(self):
+            self.last_used = time.time()
+            self.ops_in_progress += 1
+            return self.fs
+
+        def put_fs_handle(self, notify):
+            assert self.ops_in_progress > 0
+            self.ops_in_progress -= 1
+            if self.ops_in_progress == 0:
+                notify()
+
+        def del_fs_handle(self, waiter):
+            if waiter:
+                while self.ops_in_progress != 0:
+                    waiter()
+            if self.is_connection_valid():
+                self.disconnect()
+            else:
+                self.abort()
+
+        def is_connection_valid(self):
+            fs_id = None
+            try:
+                fs_id = self.get_fs_id()
+            except:
+                # the filesystem does not exist now -- connection is not valid.
+                pass
+            log.debug("self.fs_id={0}, fs_id={1}".format(self.fs_id, fs_id))
+            return self.fs_id == fs_id
+
+        def is_connection_idle(self, timeout):
+            return (self.ops_in_progress == 0 and ((time.time() - self.last_used) >= timeout))
+
+        def connect(self):
+            assert self.ops_in_progress == 0
+            log.debug("Connecting to cephfs '{0}'".format(self.fs_name))
+            self.fs = cephfs.LibCephFS(rados_inst=self.mgr.rados)
+            log.debug("Setting user ID and group ID of CephFS mount as root...")
+            self.fs.conf_set("client_mount_uid", "0")
+            self.fs.conf_set("client_mount_gid", "0")
+            log.debug("CephFS initializing...")
+            self.fs.init()
+            log.debug("CephFS mounting...")
+            self.fs.mount(filesystem_name=self.fs_name.encode('utf-8'))
+            log.debug("Connection to cephfs '{0}' complete".format(self.fs_name))
+
+        def disconnect(self):
+            try:
+                assert self.fs
+                assert self.ops_in_progress == 0
+                log.info("disconnecting from cephfs '{0}'".format(self.fs_name))
+                self.fs.shutdown()
+                self.fs = None
+            except Exception as e:
+                log.debug("disconnect: ({0})".format(e))
+                raise
+
+        def abort(self):
+            assert self.fs
+            assert self.ops_in_progress == 0
+            log.info("aborting connection from cephfs '{0}'".format(self.fs_name))
+            self.fs.abort_conn()
+            log.info("abort done from cephfs '{0}'".format(self.fs_name))
+            self.fs = None
+
+    class RTimer(Timer):
+        """
+        recurring timer variant of Timer
+        """
+        def run(self):
+            try:
+                while not self.finished.is_set():
+                    self.finished.wait(self.interval)
+                    self.function(*self.args, **self.kwargs)
+                self.finished.set()
+            except Exception as e:
+                log.error("ConnectionPool.RTimer: %s", e)
+                raise
+
+    # TODO: make this configurable
+    TIMER_TASK_RUN_INTERVAL = 30.0  # seconds
+    CONNECTION_IDLE_INTERVAL = 60.0 # seconds
+
+    def __init__(self, mgr):
+        self.mgr = mgr
+        self.connections = {}
+        self.lock = Lock()
+        self.cond = Condition(self.lock)
+        self.timer_task = ConnectionPool.RTimer(ConnectionPool.TIMER_TASK_RUN_INTERVAL,
+                                                self.cleanup_connections)
+        self.timer_task.start()
+
+    def cleanup_connections(self):
+        with self.lock:
+            log.info("scanning for idle connections..")
+            idle_fs = [fs_name for fs_name,conn in self.connections.items()
+                       if conn.is_connection_idle(ConnectionPool.CONNECTION_IDLE_INTERVAL)]
+            for fs_name in idle_fs:
+                log.info("cleaning up connection for '{}'".format(fs_name))
+                self._del_fs_handle(fs_name)
+
+    def get_fs_handle(self, fs_name):
+        with self.lock:
+            conn = None
+            try:
+                conn = self.connections.get(fs_name, None)
+                if conn:
+                    if conn.is_connection_valid():
+                        return conn.get_fs_handle()
+                    else:
+                        # filesystem id changed beneath us (or the filesystem does not exist).
+                        # this is possible if the filesystem got removed (and recreated with
+                        # same name) via "ceph fs rm/new" mon command.
+                        log.warning("filesystem id changed for volume '{0}', reconnecting...".format(fs_name))
+                        self._del_fs_handle(fs_name)
+                conn = ConnectionPool.Connection(self.mgr, fs_name)
+                conn.connect()
+            except cephfs.Error as e:
+                # try to provide a better error string if possible
+                if e.args[0] == errno.ENOENT:
+                    raise VolumeException(
+                        -errno.ENOENT, "Volume '{0}' not found".format(fs_name))
+                raise VolumeException(-e.args[0], e.args[1])
+            self.connections[fs_name] = conn
+            return conn.get_fs_handle()
+
+    def put_fs_handle(self, fs_name):
+        with self.lock:
+            conn = self.connections.get(fs_name, None)
+            if conn:
+                conn.put_fs_handle(notify=lambda: self.cond.notifyAll())
+
+    def _del_fs_handle(self, fs_name, wait=False):
+        conn = self.connections.pop(fs_name, None)
+        if conn:
+            conn.del_fs_handle(waiter=None if not wait else lambda: self.cond.wait())
+
+    def del_fs_handle(self, fs_name, wait=False):
+        with self.lock:
+            self._del_fs_handle(fs_name, wait)
+
+    def del_all_handles(self):
+        with self.lock:
+            for fs_name in list(self.connections.keys()):
+                log.info("waiting for pending ops for '{}'".format(fs_name))
+                self._del_fs_handle(fs_name, wait=True)
+                log.info("pending ops completed for '{}'".format(fs_name))
+            # no new connections should have been initialized since its
+            # guarded on shutdown.
+            assert len(self.connections) == 0
+
+def gen_pool_names(volname):
+    """
+    return metadata and data pool name (from a filesystem/volume name) as a tuple
+    """
+    return "cephfs.{}.meta".format(volname), "cephfs.{}.data".format(volname)
+
+def get_mds_map(mgr, volname):
+    """
+    return mdsmap for a volname
+    """
+    mds_map = None
+    fs_map = mgr.get("fs_map")
+    for f in fs_map['filesystems']:
+        if volname == f['mdsmap']['fs_name']:
+            return f['mdsmap']
+    return mds_map
+
+def get_pool_names(mgr, volname):
+    """
+    return metadata and data pools (list) names of volume as a tuple
+    """
+    fs_map = mgr.get("fs_map")
+    metadata_pool_id = None
+    data_pool_ids = [] # type: List[int]
+    for f in fs_map['filesystems']:
+        if volname == f['mdsmap']['fs_name']:
+            metadata_pool_id = f['mdsmap']['metadata_pool']
+            data_pool_ids = f['mdsmap']['data_pools']
+            break
+    if metadata_pool_id is None:
+        return None, None
+
+    osdmap = mgr.get("osd_map")
+    pools = dict([(p['pool'], p['pool_name']) for p in osdmap['pools']])
+    metadata_pool = pools[metadata_pool_id]
+    data_pools = [pools[id] for id in data_pool_ids]
+    return metadata_pool, data_pools
+
+def create_volume(mgr, volname):
+    """
+    create volume  (pool, filesystem and mds)
+    """
+    metadata_pool, data_pool = gen_pool_names(volname)
+    # create pools
+    r, outs, outb = create_pool(mgr, metadata_pool, 16)
+    if r != 0:
+        return r, outb, outs
+    r, outb, outs = create_pool(mgr, data_pool, 8)
+    if r != 0:
+        #cleanup
+        remove_pool(mgr, metadata_pool)
+        return r, outb, outs
+    # create filesystem
+    r, outb, outs = create_filesystem(mgr, volname, metadata_pool, data_pool)
+    if r != 0:
+        log.error("Filesystem creation error: {0} {1} {2}".format(r, outb, outs))
+        #cleanup
+        remove_pool(mgr, data_pool)
+        remove_pool(mgr, metadata_pool)
+        return r, outb, outs
+    # create mds
+    return create_mds(mgr, volname)
+
+def delete_volume(mgr, volname, metadata_pool, data_pools):
+    """
+    delete the given module (tear down mds, remove filesystem, remove pools)
+    """
+    # Tear down MDS daemons
+    try:
+        completion = mgr.remove_stateless_service("mds", volname)
+        mgr._orchestrator_wait([completion])
+        orchestrator.raise_if_exception(completion)
+    except (ImportError, orchestrator.OrchestratorError):
+        log.warning("OrchestratorError, not tearing down MDS daemons")
+    except Exception as e:
+        # Don't let detailed orchestrator exceptions (python backtraces)
+        # bubble out to the user
+        log.exception("Failed to tear down MDS daemons")
+        return -errno.EINVAL, "", str(e)
+
+    # In case orchestrator didn't tear down MDS daemons cleanly, or
+    # there was no orchestrator, we force the daemons down.
+    if volume_exists(mgr, volname):
+        r, outb, outs = remove_filesystem(mgr, volname)
+        if r != 0:
+            return r, outb, outs
+    else:
+        err = "Filesystem not found for volume '{0}'".format(volname)
+        log.warning(err)
+        return -errno.ENOENT, "", err
+    r, outb, outs = remove_pool(mgr, metadata_pool)
+    if r != 0:
+        return r, outb, outs
+
+    for data_pool in data_pools:
+        r, outb, outs = remove_pool(mgr, data_pool)
+        if r != 0:
+            return r, outb, outs
+    result_str = "metadata pool: {0} data pool: {1} removed".format(metadata_pool, str(data_pools))
+    return r, result_str, ""
+
+def list_volumes(mgr):
+    """
+    list all filesystem volumes.
+
+    :param: None
+    :return: None
+    """
+    result = []
+    fs_map = mgr.get("fs_map")
+    for f in fs_map['filesystems']:
+        result.append({'name': f['mdsmap']['fs_name']})
+    return result
+
+@contextmanager
+def open_volume(vc, volname):
+    """
+    open a volume for exclusive access. This API is to be used as a context manager.
+
+    :param vc: volume client instance
+    :param volname: volume name
+    :return: yields a volume handle (ceph filesystem handle)
+    """
+    if vc.is_stopping():
+        raise VolumeException(-errno.ESHUTDOWN, "shutdown in progress")
+
+    g_lock = GlobalLock()
+    fs_handle = vc.connection_pool.get_fs_handle(volname)
+    try:
+        with g_lock.lock_op():
+            yield fs_handle
+    finally:
+        vc.connection_pool.put_fs_handle(volname)
+
+@contextmanager
+def open_volume_lockless(vc, volname):
+    """
+    open a volume with shared access. This API is to be used as a context manager.
+
+    :param vc: volume client instance
+    :param volname: volume name
+    :return: yields a volume handle (ceph filesystem handle)
+    """
+    if vc.is_stopping():
+        raise VolumeException(-errno.ESHUTDOWN, "shutdown in progress")
+
+    fs_handle = vc.connection_pool.get_fs_handle(volname)
+    try:
+        yield fs_handle
+    finally:
+        vc.connection_pool.put_fs_handle(volname)
diff --git a/src/pybind/mgr/volumes/fs/purge_queue.py b/src/pybind/mgr/volumes/fs/purge_queue.py
new file mode 100644
index 00000000..7c902572
--- /dev/null
+++ b/src/pybind/mgr/volumes/fs/purge_queue.py
@@ -0,0 +1,109 @@
+import errno
+import logging
+import os
+import stat
+
+import cephfs
+
+from .async_job import AsyncJobs
+from .exception import VolumeException
+from .operations.resolver import resolve_trash
+from .operations.template import SubvolumeOpType
+from .operations.group import open_group
+from .operations.subvolume import open_subvol
+from .operations.volume import open_volume, open_volume_lockless
+from .operations.trash import open_trashcan
+
+log = logging.getLogger(__name__)
+
+# helper for fetching a trash entry for a given volume
+def get_trash_entry_for_volume(volume_client, volname, running_jobs):
+    log.debug("fetching trash entry for volume '{0}'".format(volname))
+
+    try:
+        with open_volume_lockless(volume_client, volname) as fs_handle:
+            try:
+                with open_trashcan(fs_handle, volume_client.volspec) as trashcan:
+                    path = trashcan.get_trash_entry(running_jobs)
+                    return 0, path
+            except VolumeException as ve:
+                if ve.errno == -errno.ENOENT:
+                    return 0, None
+                raise ve
+    except VolumeException as ve:
+        log.error("error fetching trash entry for volume '{0}' ({1})".format(volname, ve))
+        return ve.errno, None
+
+def subvolume_purge(volume_client, volname, trashcan, subvolume_trash_entry, should_cancel):
+    groupname, subvolname = resolve_trash(volume_client.volspec, subvolume_trash_entry.decode('utf-8'))
+    log.debug("subvolume resolved to {0}/{1}".format(groupname, subvolname))
+
+    try:
+        with open_volume(volume_client, volname) as fs_handle:
+            with open_group(fs_handle, volume_client.volspec, groupname) as group:
+                with open_subvol(volume_client.mgr, fs_handle, volume_client.volspec, group, subvolname, SubvolumeOpType.REMOVE) as subvolume:
+                    log.debug("subvolume.path={0}, purgeable={1}".format(subvolume.path, subvolume.purgeable))
+                    if not subvolume.purgeable:
+                        return
+                    # this is fine under the global lock -- there are just a handful
+                    # of entries in the subvolume to purge. moreover, the purge needs
+                    # to be guarded since a create request might sneak in.
+                    trashcan.purge(subvolume.base_path, should_cancel)
+    except VolumeException as ve:
+        if not ve.errno == -errno.ENOENT:
+            raise
+
+# helper for starting a purge operation on a trash entry
+def purge_trash_entry_for_volume(volume_client, volname, purge_entry, should_cancel):
+    log.debug("purging trash entry '{0}' for volume '{1}'".format(purge_entry, volname))
+
+    ret = 0
+    try:
+        with open_volume_lockless(volume_client, volname) as fs_handle:
+            with open_trashcan(fs_handle, volume_client.volspec) as trashcan:
+                try:
+                    pth = os.path.join(trashcan.path, purge_entry)
+                    stx = fs_handle.statx(pth, cephfs.CEPH_STATX_MODE | cephfs.CEPH_STATX_SIZE,
+                                          cephfs.AT_SYMLINK_NOFOLLOW)
+                    if stat.S_ISLNK(stx['mode']):
+                        tgt = fs_handle.readlink(pth, 4096)
+                        tgt = tgt[:stx['size']]
+                        log.debug("purging entry pointing to subvolume trash: {0}".format(tgt))
+                        delink = True
+                        try:
+                            trashcan.purge(tgt, should_cancel)
+                        except VolumeException as ve:
+                            if not ve.errno == -errno.ENOENT:
+                                delink = False
+                                return ve.errno
+                        finally:
+                            if delink:
+                                subvolume_purge(volume_client, volname, trashcan, tgt, should_cancel)
+                                log.debug("purging trash link: {0}".format(purge_entry))
+                                trashcan.delink(purge_entry)
+                    else:
+                        log.debug("purging entry pointing to trash: {0}".format(pth))
+                        trashcan.purge(pth, should_cancel)
+                except cephfs.Error as e:
+                    log.warn("failed to remove trash entry: {0}".format(e))
+    except VolumeException as ve:
+        ret = ve.errno
+    return ret
+
+class ThreadPoolPurgeQueueMixin(AsyncJobs):
+    """
+    Purge queue mixin class maintaining a pool of threads for purging trash entries.
+    Subvolumes are chosen from volumes in a round robin fashion. If some of the purge
+    entries (belonging to a set of volumes) have huge directory tree's (such as, lots
+    of small files in a directory w/ deep directory trees), this model may lead to
+    _all_ threads purging entries for one volume (starving other volumes).
+    """
+    def __init__(self, volume_client, tp_size):
+        self.vc = volume_client
+        super(ThreadPoolPurgeQueueMixin, self).__init__(volume_client, "puregejob", tp_size)
+
+    def get_next_job(self, volname, running_jobs):
+        return get_trash_entry_for_volume(self.vc, volname, running_jobs)
+
+    def execute_job(self, volname, job, should_cancel):
+        purge_trash_entry_for_volume(self.vc, volname, job, should_cancel)
diff --git a/src/pybind/mgr/volumes/fs/vol_spec.py b/src/pybind/mgr/volumes/fs/vol_spec.py
new file mode 100644
index 00000000..e18ab069
--- /dev/null
+++ b/src/pybind/mgr/volumes/fs/vol_spec.py
@@ -0,0 +1,37 @@
+import os
+
+class VolSpec(object):
+    """
+    specification of a "volume" -- base directory and various prefixes.
+    """
+
+    # where shall we (by default) create subvolumes
+    DEFAULT_SUBVOL_PREFIX = "/volumes"
+    # and the default namespace
+    DEFAULT_NS_PREFIX = "fsvolumens_"
+
+    def __init__(self, snapshot_prefix, subvolume_prefix=None, pool_ns_prefix=None):
+        self.snapshot_prefix = snapshot_prefix
+        self.subvolume_prefix = subvolume_prefix if subvolume_prefix else VolSpec.DEFAULT_SUBVOL_PREFIX
+        self.pool_ns_prefix = pool_ns_prefix if pool_ns_prefix else VolSpec.DEFAULT_NS_PREFIX
+
+    @property
+    def snapshot_dir_prefix(self):
+        """
+        Return the snapshot directory prefix
+        """
+        return self.snapshot_prefix
+
+    @property
+    def base_dir(self):
+        """
+        Return the top level directory under which subvolumes/groups are created
+        """
+        return self.subvolume_prefix
+
+    @property
+    def fs_namespace(self):
+        """
+        return a filesystem namespace by stashing pool namespace prefix and subvolume-id
+        """
+        return self.pool_ns_prefix
diff --git a/src/pybind/mgr/volumes/fs/volume.py b/src/pybind/mgr/volumes/fs/volume.py
new file mode 100644
index 00000000..ab287539
--- /dev/null
+++ b/src/pybind/mgr/volumes/fs/volume.py
@@ -0,0 +1,660 @@
+import json
+import errno
+import logging
+from threading import Event
+
+import cephfs
+
+from .fs_util import listdir
+
+from .operations.volume import ConnectionPool, open_volume, create_volume, \
+    delete_volume, list_volumes, get_pool_names
+from .operations.group import open_group, create_group, remove_group, open_group_unique
+from .operations.subvolume import open_subvol, create_subvol, remove_subvol, \
+    create_clone
+
+from .vol_spec import VolSpec
+from .exception import VolumeException, ClusterError, ClusterTimeout, EvictionError
+from .async_cloner import Cloner
+from .purge_queue import ThreadPoolPurgeQueueMixin
+from .operations.template import SubvolumeOpType
+
+log = logging.getLogger(__name__)
+
+ALLOWED_ACCESS_LEVELS = ('r', 'rw')
+
+
+def octal_str_to_decimal_int(mode):
+    try:
+        return int(mode, 8)
+    except ValueError:
+        raise VolumeException(-errno.EINVAL, "Invalid mode '{0}'".format(mode))
+
+def name_to_json(names):
+    """
+    convert the list of names to json
+    """
+    namedict = []
+    for i in range(len(names)):
+        namedict.append({'name': names[i].decode('utf-8')})
+    return json.dumps(namedict, indent=4, sort_keys=True)
+
+class VolumeClient(object):
+    def __init__(self, mgr):
+        self.mgr = mgr
+        self.stopping = Event()
+        # volume specification
+        self.volspec = VolSpec(mgr.rados.conf_get('client_snapdir'))
+        self.connection_pool = ConnectionPool(self.mgr)
+        self.cloner = Cloner(self, self.mgr.max_concurrent_clones)
+        self.purge_queue = ThreadPoolPurgeQueueMixin(self, 4)
+        # on startup, queue purge job for available volumes to kickstart
+        # purge for leftover subvolume entries in trash. note that, if the
+        # trash directory does not exist or if there are no purge entries
+        # available for a volume, the volume is removed from the purge
+        # job list.
+        fs_map = self.mgr.get('fs_map')
+        for fs in fs_map['filesystems']:
+            self.cloner.queue_job(fs['mdsmap']['fs_name'])
+            self.purge_queue.queue_job(fs['mdsmap']['fs_name'])
+
+    def is_stopping(self):
+        return self.stopping.is_set()
+
+    def shutdown(self):
+        log.info("shutting down")
+        # first, note that we're shutting down
+        self.stopping.set()
+        # second, ask purge threads to quit
+        self.purge_queue.cancel_all_jobs()
+        # third, delete all libcephfs handles from connection pool
+        self.connection_pool.del_all_handles()
+
+    def cluster_log(self, msg, lvl=None):
+        """
+        log to cluster log with default log level as WARN.
+        """
+        if not lvl:
+            lvl = self.mgr.CLUSTER_LOG_PRIO_WARN
+        self.mgr.cluster_log("cluster", lvl, msg)
+
+    def volume_exception_to_retval(self, ve):
+        """
+        return a tuple representation from a volume exception
+        """
+        return ve.to_tuple()
+
+    ### volume operations -- create, rm, ls
+
+    def create_fs_volume(self, volname):
+        if self.is_stopping():
+            return -errno.ESHUTDOWN, "", "shutdown in progress"
+        return create_volume(self.mgr, volname)
+
+    def delete_fs_volume(self, volname, confirm):
+        if self.is_stopping():
+            return -errno.ESHUTDOWN, "", "shutdown in progress"
+
+        if confirm != "--yes-i-really-mean-it":
+            return -errno.EPERM, "", "WARNING: this will *PERMANENTLY DESTROY* all data " \
+                "stored in the filesystem '{0}'. If you are *ABSOLUTELY CERTAIN* " \
+                "that is what you want, re-issue the command followed by " \
+                "--yes-i-really-mean-it.".format(volname)
+
+        ret, out, err = self.mgr.mon_command({
+            'prefix': 'config get',
+            'key': 'mon_allow_pool_delete',
+            'who': 'mon.*',
+            'format': 'json',
+        })
+        if ret != 0:
+            return ret, out, err
+        mon_allow_pool_delete = json.loads(out)
+        if not mon_allow_pool_delete:
+            return -errno.EPERM, "", "pool deletion is disabled; you must first " \
+                "set the mon_allow_pool_delete config option to true before volumes " \
+                "can be deleted"
+
+        metadata_pool, data_pools = get_pool_names(self.mgr, volname)
+        if not metadata_pool:
+            return -errno.ENOENT, "", "volume {0} doesn't exist".format(volname)
+        self.purge_queue.cancel_jobs(volname)
+        self.connection_pool.del_fs_handle(volname, wait=True)
+        return delete_volume(self.mgr, volname, metadata_pool, data_pools)
+
+    def list_fs_volumes(self):
+        if self.stopping.is_set():
+            return -errno.ESHUTDOWN, "", "shutdown in progress"
+        volumes = list_volumes(self.mgr)
+        return 0, json.dumps(volumes, indent=4, sort_keys=True), ""
+
+    ### subvolume operations
+
+    def _create_subvolume(self, fs_handle, volname, group, subvolname, **kwargs):
+        size       = kwargs['size']
+        pool       = kwargs['pool_layout']
+        uid        = kwargs['uid']
+        gid        = kwargs['gid']
+        mode       = kwargs['mode']
+        isolate_nspace = kwargs['namespace_isolated']
+
+        oct_mode = octal_str_to_decimal_int(mode)
+        try:
+            create_subvol(
+                self.mgr, fs_handle, self.volspec, group, subvolname, size, isolate_nspace, pool, oct_mode, uid, gid)
+        except VolumeException as ve:
+            # kick the purge threads for async removal -- note that this
+            # assumes that the subvolume is moved to trashcan for cleanup on error.
+            self.purge_queue.queue_job(volname)
+            raise ve
+
+    def create_subvolume(self, **kwargs):
+        ret        = 0, "", ""
+        volname    = kwargs['vol_name']
+        subvolname = kwargs['sub_name']
+        groupname  = kwargs['group_name']
+        size       = kwargs['size']
+        pool       = kwargs['pool_layout']
+        uid        = kwargs['uid']
+        gid        = kwargs['gid']
+        isolate_nspace = kwargs['namespace_isolated']
+
+        try:
+            with open_volume(self, volname) as fs_handle:
+                with open_group(fs_handle, self.volspec, groupname) as group:
+                    try:
+                        with open_subvol(self.mgr, fs_handle, self.volspec, group, subvolname, SubvolumeOpType.CREATE) as subvolume:
+                            # idempotent creation -- valid. Attributes set is supported.
+                            attrs = {
+                                'uid': uid if uid else subvolume.uid,
+                                'gid': gid if gid else subvolume.gid,
+                                'data_pool': pool,
+                                'pool_namespace': subvolume.namespace if isolate_nspace else None,
+                                'quota': size
+                            }
+                            subvolume.set_attrs(subvolume.path, attrs)
+                    except VolumeException as ve:
+                        if ve.errno == -errno.ENOENT:
+                            self._create_subvolume(fs_handle, volname, group, subvolname, **kwargs)
+                        else:
+                            raise
+        except VolumeException as ve:
+            # volume/group does not exist or subvolume creation failed
+            ret = self.volume_exception_to_retval(ve)
+        return ret
+
+    def remove_subvolume(self, **kwargs):
+        ret         = 0, "", ""
+        volname     = kwargs['vol_name']
+        subvolname  = kwargs['sub_name']
+        groupname   = kwargs['group_name']
+        force       = kwargs['force']
+        retainsnaps = kwargs['retain_snapshots']
+
+        try:
+            with open_volume(self, volname) as fs_handle:
+                with open_group(fs_handle, self.volspec, groupname) as group:
+                    remove_subvol(self.mgr, fs_handle, self.volspec, group, subvolname, force, retainsnaps)
+                    # kick the purge threads for async removal -- note that this
+                    # assumes that the subvolume is moved to trash can.
+                    # TODO: make purge queue as singleton so that trash can kicks
+                    # the purge threads on dump.
+                    self.purge_queue.queue_job(volname)
+        except VolumeException as ve:
+            if ve.errno == -errno.EAGAIN:
+                ve = VolumeException(ve.errno, ve.error_str + " (use --force to override)")
+                ret = self.volume_exception_to_retval(ve)
+            elif not (ve.errno == -errno.ENOENT and force):
+                ret = self.volume_exception_to_retval(ve)
+        return ret
+
+    def authorize_subvolume(self, **kwargs):
+        ret = 0, "", ""
+        volname     = kwargs['vol_name']
+        subvolname  = kwargs['sub_name']
+        authid      = kwargs['auth_id']
+        groupname   = kwargs['group_name']
+        accesslevel = kwargs['access_level']
+        tenant_id   = kwargs['tenant_id']
+        allow_existing_id = kwargs['allow_existing_id']
+
+        try:
+            with open_volume(self, volname) as fs_handle:
+                with open_group(fs_handle, self.volspec, groupname) as group:
+                    with open_subvol(self.mgr, fs_handle, self.volspec, group, subvolname, SubvolumeOpType.ALLOW_ACCESS) as subvolume:
+                        key = subvolume.authorize(authid, accesslevel, tenant_id, allow_existing_id)
+                        ret = 0, key, ""
+        except VolumeException as ve:
+            ret = self.volume_exception_to_retval(ve)
+        return ret
+
+    def deauthorize_subvolume(self, **kwargs):
+        ret = 0, "", ""
+        volname     = kwargs['vol_name']
+        subvolname  = kwargs['sub_name']
+        authid      = kwargs['auth_id']
+        groupname   = kwargs['group_name']
+
+        try:
+            with open_volume(self, volname) as fs_handle:
+                with open_group(fs_handle, self.volspec, groupname) as group:
+                    with open_subvol(self.mgr, fs_handle, self.volspec, group, subvolname, SubvolumeOpType.DENY_ACCESS) as subvolume:
+                        subvolume.deauthorize(authid)
+        except VolumeException as ve:
+            ret = self.volume_exception_to_retval(ve)
+        return ret
+
+    def authorized_list(self, **kwargs):
+        ret = 0, "", ""
+        volname     = kwargs['vol_name']
+        subvolname  = kwargs['sub_name']
+        groupname   = kwargs['group_name']
+
+        try:
+            with open_volume(self, volname) as fs_handle:
+                with open_group(fs_handle, self.volspec, groupname) as group:
+                    with open_subvol(self.mgr, fs_handle, self.volspec, group, subvolname, SubvolumeOpType.AUTH_LIST) as subvolume:
+                        auths = subvolume.authorized_list()
+                        ret = 0, json.dumps(auths, indent=4, sort_keys=True), ""
+        except VolumeException as ve:
+            ret = self.volume_exception_to_retval(ve)
+        return ret
+
+    def evict(self, **kwargs):
+        ret = 0, "", ""
+        volname     = kwargs['vol_name']
+        subvolname  = kwargs['sub_name']
+        authid      = kwargs['auth_id']
+        groupname   = kwargs['group_name']
+
+        try:
+            with open_volume(self, volname) as fs_handle:
+                with open_group(fs_handle, self.volspec, groupname) as group:
+                    with open_subvol(self.mgr, fs_handle, self.volspec, group, subvolname, SubvolumeOpType.EVICT) as subvolume:
+                        key = subvolume.evict(volname, authid)
+                        ret = 0, "", ""
+        except (VolumeException, ClusterTimeout, ClusterError, EvictionError) as e:
+            if isinstance(e, VolumeException):
+                ret = self.volume_exception_to_retval(e)
+            elif isinstance(e, ClusterTimeout):
+                ret = -errno.ETIMEDOUT , "", "Timedout trying to talk to ceph cluster"
+            elif isinstance(e, ClusterError):
+                ret = e._result_code , "", e._result_str
+            elif isinstance(e, EvictionError):
+                ret = -errno.EINVAL, "", str(e)
+        return ret
+
+    def resize_subvolume(self, **kwargs):
+        ret        = 0, "", ""
+        volname    = kwargs['vol_name']
+        subvolname = kwargs['sub_name']
+        newsize    = kwargs['new_size']
+        noshrink   = kwargs['no_shrink']
+        groupname  = kwargs['group_name']
+
+        try:
+            with open_volume(self, volname) as fs_handle:
+                with open_group(fs_handle, self.volspec, groupname) as group:
+                    with open_subvol(self.mgr, fs_handle, self.volspec, group, subvolname, SubvolumeOpType.RESIZE) as subvolume:
+                        nsize, usedbytes = subvolume.resize(newsize, noshrink)
+                        ret = 0, json.dumps(
+                            [{'bytes_used': usedbytes},{'bytes_quota': nsize},
+                             {'bytes_pcent': "undefined" if nsize == 0 else '{0:.2f}'.format((float(usedbytes) / nsize) * 100.0)}],
+                            indent=4, sort_keys=True), ""
+        except VolumeException as ve:
+            ret = self.volume_exception_to_retval(ve)
+        return ret
+
+    def subvolume_getpath(self, **kwargs):
+        ret        = None
+        volname    = kwargs['vol_name']
+        subvolname = kwargs['sub_name']
+        groupname  = kwargs['group_name']
+
+        try:
+            with open_volume(self, volname) as fs_handle:
+                with open_group(fs_handle, self.volspec, groupname) as group:
+                    with open_subvol(self.mgr, fs_handle, self.volspec, group, subvolname, SubvolumeOpType.GETPATH) as subvolume:
+                        subvolpath = subvolume.path
+                        ret = 0, subvolpath.decode("utf-8"), ""
+        except VolumeException as ve:
+            ret = self.volume_exception_to_retval(ve)
+        return ret
+
+    def subvolume_info(self, **kwargs):
+        ret        = None
+        volname    = kwargs['vol_name']
+        subvolname = kwargs['sub_name']
+        groupname  = kwargs['group_name']
+
+        try:
+            with open_volume(self, volname) as fs_handle:
+                with open_group(fs_handle, self.volspec, groupname) as group:
+                    with open_subvol(self.mgr, fs_handle, self.volspec, group, subvolname, SubvolumeOpType.INFO) as subvolume:
+                        mon_addr_lst = []
+                        mon_map_mons = self.mgr.get('mon_map')['mons']
+                        for mon in mon_map_mons:
+                            ip_port = mon['addr'].split("/")[0]
+                            mon_addr_lst.append(ip_port)
+
+                        subvol_info_dict = subvolume.info()
+                        subvol_info_dict["mon_addrs"] = mon_addr_lst
+                        ret = 0, json.dumps(subvol_info_dict, indent=4, sort_keys=True), ""
+        except VolumeException as ve:
+            ret = self.volume_exception_to_retval(ve)
+        return ret
+
+    def list_subvolumes(self, **kwargs):
+        ret        = 0, "", ""
+        volname    = kwargs['vol_name']
+        groupname  = kwargs['group_name']
+
+        try:
+            with open_volume(self, volname) as fs_handle:
+                with open_group(fs_handle, self.volspec, groupname) as group:
+                    subvolumes = group.list_subvolumes()
+                    ret = 0, name_to_json(subvolumes), ""
+        except VolumeException as ve:
+            ret = self.volume_exception_to_retval(ve)
+        return ret
+
+    ### subvolume snapshot
+
+    def create_subvolume_snapshot(self, **kwargs):
+        ret        = 0, "", ""
+        volname    = kwargs['vol_name']
+        subvolname = kwargs['sub_name']
+        snapname   = kwargs['snap_name']
+        groupname  = kwargs['group_name']
+
+        try:
+            with open_volume(self, volname) as fs_handle:
+                with open_group(fs_handle, self.volspec, groupname) as group:
+                    with open_subvol(self.mgr, fs_handle, self.volspec, group, subvolname, SubvolumeOpType.SNAP_CREATE) as subvolume:
+                        subvolume.create_snapshot(snapname)
+        except VolumeException as ve:
+            ret = self.volume_exception_to_retval(ve)
+        return ret
+
+    def remove_subvolume_snapshot(self, **kwargs):
+        ret        = 0, "", ""
+        volname    = kwargs['vol_name']
+        subvolname = kwargs['sub_name']
+        snapname   = kwargs['snap_name']
+        groupname  = kwargs['group_name']
+        force      = kwargs['force']
+
+        try:
+            with open_volume(self, volname) as fs_handle:
+                with open_group(fs_handle, self.volspec, groupname) as group:
+                    with open_subvol(self.mgr, fs_handle, self.volspec, group, subvolname, SubvolumeOpType.SNAP_REMOVE) as subvolume:
+                        subvolume.remove_snapshot(snapname)
+        except VolumeException as ve:
+            # ESTALE serves as an error to state that subvolume is currently stale due to internal removal and,
+            # we should tickle the purge jobs to purge the same
+            if ve.errno == -errno.ESTALE:
+                self.purge_queue.queue_job(volname)
+            elif not (ve.errno == -errno.ENOENT and force):
+                ret = self.volume_exception_to_retval(ve)
+        return ret
+
+    def subvolume_snapshot_info(self, **kwargs):
+        ret        = 0, "", ""
+        volname    = kwargs['vol_name']
+        subvolname = kwargs['sub_name']
+        snapname   = kwargs['snap_name']
+        groupname  = kwargs['group_name']
+
+        try:
+            with open_volume(self, volname) as fs_handle:
+                with open_group(fs_handle, self.volspec, groupname) as group:
+                    with open_subvol(self.mgr, fs_handle, self.volspec, group, subvolname, SubvolumeOpType.SNAP_INFO) as subvolume:
+                        snap_info_dict = subvolume.snapshot_info(snapname)
+                        ret = 0, json.dumps(snap_info_dict, indent=4, sort_keys=True), ""
+        except VolumeException as ve:
+                ret = self.volume_exception_to_retval(ve)
+        return ret
+
+    def list_subvolume_snapshots(self, **kwargs):
+        ret        = 0, "", ""
+        volname    = kwargs['vol_name']
+        subvolname = kwargs['sub_name']
+        groupname  = kwargs['group_name']
+
+        try:
+            with open_volume(self, volname) as fs_handle:
+                with open_group(fs_handle, self.volspec, groupname) as group:
+                    with open_subvol(self.mgr, fs_handle, self.volspec, group, subvolname, SubvolumeOpType.SNAP_LIST) as subvolume:
+                        snapshots = subvolume.list_snapshots()
+                        ret = 0, name_to_json(snapshots), ""
+        except VolumeException as ve:
+            ret = self.volume_exception_to_retval(ve)
+        return ret
+
+    def protect_subvolume_snapshot(self, **kwargs):
+        ret        = 0, "", "Deprecation warning: 'snapshot protect' call is deprecated and will be removed in a future release"
+        volname    = kwargs['vol_name']
+        subvolname = kwargs['sub_name']
+        groupname  = kwargs['group_name']
+
+        try:
+            with open_volume(self, volname) as fs_handle:
+                with open_group(fs_handle, self.volspec, groupname) as group:
+                    with open_subvol(self.mgr, fs_handle, self.volspec, group, subvolname, SubvolumeOpType.SNAP_PROTECT) as subvolume:
+                        log.warning("snapshot protect call is deprecated and will be removed in a future release")
+        except VolumeException as ve:
+            ret = self.volume_exception_to_retval(ve)
+        return ret
+
+    def unprotect_subvolume_snapshot(self, **kwargs):
+        ret        = 0, "", "Deprecation warning: 'snapshot unprotect' call is deprecated and will be removed in a future release"
+        volname    = kwargs['vol_name']
+        subvolname = kwargs['sub_name']
+        groupname  = kwargs['group_name']
+
+        try:
+            with open_volume(self, volname) as fs_handle:
+                with open_group(fs_handle, self.volspec, groupname) as group:
+                    with open_subvol(self.mgr, fs_handle, self.volspec, group, subvolname, SubvolumeOpType.SNAP_UNPROTECT) as subvolume:
+                        log.warning("snapshot unprotect call is deprecated and will be removed in a future release")
+        except VolumeException as ve:
+            ret = self.volume_exception_to_retval(ve)
+        return ret
+
+    def _prepare_clone_subvolume(self, fs_handle, volname, s_subvolume, s_snapname, t_group, t_subvolname, **kwargs):
+        t_pool              = kwargs['pool_layout']
+        s_subvolname        = kwargs['sub_name']
+        s_groupname         = kwargs['group_name']
+        t_groupname         = kwargs['target_group_name']
+
+        create_clone(self.mgr, fs_handle, self.volspec, t_group, t_subvolname, t_pool, volname, s_subvolume, s_snapname)
+        with open_subvol(self.mgr, fs_handle, self.volspec, t_group, t_subvolname, SubvolumeOpType.CLONE_INTERNAL) as t_subvolume:
+            try:
+                if t_groupname == s_groupname and t_subvolname == s_subvolname:
+                    t_subvolume.attach_snapshot(s_snapname, t_subvolume)
+                else:
+                    s_subvolume.attach_snapshot(s_snapname, t_subvolume)
+                self.cloner.queue_job(volname)
+            except VolumeException as ve:
+                try:
+                    t_subvolume.remove()
+                    self.purge_queue.queue_job(volname)
+                except Exception as e:
+                    log.warning("failed to cleanup clone subvolume '{0}' ({1})".format(t_subvolname, e))
+                raise ve
+
+    def _clone_subvolume_snapshot(self, fs_handle, volname, s_group, s_subvolume, **kwargs):
+        s_snapname          = kwargs['snap_name']
+        target_subvolname   = kwargs['target_sub_name']
+        target_groupname    = kwargs['target_group_name']
+        s_groupname         = kwargs['group_name']
+
+        if not s_snapname.encode('utf-8') in s_subvolume.list_snapshots():
+            raise VolumeException(-errno.ENOENT, "snapshot '{0}' does not exist".format(s_snapname))
+
+        with open_group_unique(fs_handle, self.volspec, target_groupname, s_group, s_groupname) as target_group:
+            try:
+                with open_subvol(self.mgr, fs_handle, self.volspec, target_group, target_subvolname, SubvolumeOpType.CLONE_CREATE):
+                    raise VolumeException(-errno.EEXIST, "subvolume '{0}' exists".format(target_subvolname))
+            except VolumeException as ve:
+                if ve.errno == -errno.ENOENT:
+                    self._prepare_clone_subvolume(fs_handle, volname, s_subvolume, s_snapname,
+                                                  target_group, target_subvolname, **kwargs)
+                else:
+                    raise
+
+    def clone_subvolume_snapshot(self, **kwargs):
+        ret        = 0, "", ""
+        volname    = kwargs['vol_name']
+        s_subvolname = kwargs['sub_name']
+        s_groupname  = kwargs['group_name']
+
+        try:
+            with open_volume(self, volname) as fs_handle:
+                with open_group(fs_handle, self.volspec, s_groupname) as s_group:
+                    with open_subvol(self.mgr, fs_handle, self.volspec, s_group, s_subvolname, SubvolumeOpType.CLONE_SOURCE) as s_subvolume:
+                        self._clone_subvolume_snapshot(fs_handle, volname, s_group, s_subvolume, **kwargs)
+        except VolumeException as ve:
+            ret = self.volume_exception_to_retval(ve)
+        return ret
+
+    def clone_status(self, **kwargs):
+        ret       = 0, "", ""
+        volname   = kwargs['vol_name']
+        clonename = kwargs['clone_name']
+        groupname = kwargs['group_name']
+
+        try:
+            with open_volume(self, volname) as fs_handle:
+                with open_group(fs_handle, self.volspec, groupname) as group:
+                    with open_subvol(self.mgr, fs_handle, self.volspec, group, clonename, SubvolumeOpType.CLONE_STATUS) as subvolume:
+                        ret = 0, json.dumps({'status' : subvolume.status}, indent=2), ""
+        except VolumeException as ve:
+            ret = self.volume_exception_to_retval(ve)
+        return ret
+
+    def clone_cancel(self, **kwargs):
+        ret       = 0, "", ""
+        volname   = kwargs['vol_name']
+        clonename = kwargs['clone_name']
+        groupname = kwargs['group_name']
+
+        try:
+            self.cloner.cancel_job(volname, (clonename, groupname))
+        except VolumeException as ve:
+            ret = self.volume_exception_to_retval(ve)
+        return ret
+
+    ### group operations
+
+    def create_subvolume_group(self, **kwargs):
+        ret       = 0, "", ""
+        volname    = kwargs['vol_name']
+        groupname = kwargs['group_name']
+        pool      = kwargs['pool_layout']
+        uid       = kwargs['uid']
+        gid       = kwargs['gid']
+        mode      = kwargs['mode']
+
+        try:
+            with open_volume(self, volname) as fs_handle:
+                try:
+                    with open_group(fs_handle, self.volspec, groupname):
+                        # idempotent creation -- valid.
+                        pass
+                except VolumeException as ve:
+                    if ve.errno == -errno.ENOENT:
+                        oct_mode = octal_str_to_decimal_int(mode)
+                        create_group(fs_handle, self.volspec, groupname, pool, oct_mode, uid, gid)
+                    else:
+                        raise
+        except VolumeException as ve:
+            # volume does not exist or subvolume group creation failed
+            ret = self.volume_exception_to_retval(ve)
+        return ret
+
+    def remove_subvolume_group(self, **kwargs):
+        ret       = 0, "", ""
+        volname    = kwargs['vol_name']
+        groupname = kwargs['group_name']
+        force     = kwargs['force']
+
+        try:
+            with open_volume(self, volname) as fs_handle:
+                remove_group(fs_handle, self.volspec, groupname)
+        except VolumeException as ve:
+            if not (ve.errno == -errno.ENOENT and force):
+                ret = self.volume_exception_to_retval(ve)
+        return ret
+
+    def getpath_subvolume_group(self, **kwargs):
+        volname    = kwargs['vol_name']
+        groupname  = kwargs['group_name']
+
+        try:
+            with open_volume(self, volname) as fs_handle:
+                with open_group(fs_handle, self.volspec, groupname) as group:
+                    return 0, group.path.decode('utf-8'), ""
+        except VolumeException as ve:
+            return self.volume_exception_to_retval(ve)
+
+    def list_subvolume_groups(self, **kwargs):
+        volname = kwargs['vol_name']
+        ret     = 0, '[]', ""
+        try:
+            with open_volume(self, volname) as fs_handle:
+                groups = listdir(fs_handle, self.volspec.base_dir)
+                ret = 0, name_to_json(groups), ""
+        except VolumeException as ve:
+            if not ve.errno == -errno.ENOENT:
+                ret = self.volume_exception_to_retval(ve)
+        return ret
+
+    ### group snapshot
+
+    def create_subvolume_group_snapshot(self, **kwargs):
+        ret       = -errno.ENOSYS, "", "subvolume group snapshots are not supported"
+        volname   = kwargs['vol_name']
+        groupname = kwargs['group_name']
+        # snapname  = kwargs['snap_name']
+
+        try:
+            with open_volume(self, volname) as fs_handle:
+                with open_group(fs_handle, self.volspec, groupname) as group:
+                    # as subvolumes are marked with the vxattr ceph.dir.subvolume deny snapshots
+                    # at the subvolume group (see: https://tracker.ceph.com/issues/46074)
+                    # group.create_snapshot(snapname)
+                    pass
+        except VolumeException as ve:
+            ret = self.volume_exception_to_retval(ve)
+        return ret
+
+    def remove_subvolume_group_snapshot(self, **kwargs):
+        ret       = 0, "", ""
+        volname   = kwargs['vol_name']
+        groupname = kwargs['group_name']
+        snapname  = kwargs['snap_name']
+        force     = kwargs['force']
+
+        try:
+            with open_volume(self, volname) as fs_handle:
+                with open_group(fs_handle, self.volspec, groupname) as group:
+                    group.remove_snapshot(snapname)
+        except VolumeException as ve:
+            if not (ve.errno == -errno.ENOENT and force):
+                ret = self.volume_exception_to_retval(ve)
+        return ret
+
+    def list_subvolume_group_snapshots(self, **kwargs):
+        ret       = 0, "", ""
+        volname   = kwargs['vol_name']
+        groupname = kwargs['group_name']
+
+        try:
+            with open_volume(self, volname) as fs_handle:
+                with open_group(fs_handle, self.volspec, groupname) as group:
+                    snapshots = group.list_snapshots()
+                    ret = 0, name_to_json(snapshots), ""
+        except VolumeException as ve:
+            ret = self.volume_exception_to_retval(ve)
+        return ret
diff --git a/src/pybind/mgr/volumes/module.py b/src/pybind/mgr/volumes/module.py
new file mode 100644
index 00000000..3b9ed4a9
--- /dev/null
+++ b/src/pybind/mgr/volumes/module.py
@@ -0,0 +1,576 @@
+import errno
+import json
+import logging
+import traceback
+import threading
+
+from mgr_module import MgrModule
+import orchestrator
+
+from .fs.volume import VolumeClient
+
+log = logging.getLogger(__name__)
+
+class VolumesInfoWrapper():
+    def __init__(self, f, context):
+        self.f = f
+        self.context = context
+    def __enter__(self):
+        log.info("Starting {}".format(self.context))
+    def __exit__(self, exc_type, exc_value, tb):
+        if exc_type is not None:
+            log.error("Failed {}:\n{}".format(self.context, "".join(traceback.format_exception(exc_type, exc_value, tb))))
+        else:
+            log.info("Finishing {}".format(self.context))
+
+def mgr_cmd_wrap(f):
+    def wrap(self, inbuf, cmd):
+        astr = []
+        for k in cmd:
+            astr.append("{}:{}".format(k, cmd[k]))
+        context = "{}({}) < \"{}\"".format(f.__name__, ", ".join(astr), inbuf)
+        with VolumesInfoWrapper(f, context):
+            return f(self, inbuf, cmd)
+    return wrap
+
+class Module(orchestrator.OrchestratorClientMixin, MgrModule):
+    COMMANDS = [
+        {
+            'cmd': 'fs volume ls',
+            'desc': "List volumes",
+            'perm': 'r'
+        },
+        {
+            'cmd': 'fs volume create '
+                   'name=name,type=CephString ',
+            'desc': "Create a CephFS volume",
+            'perm': 'rw'
+        },
+        {
+            'cmd': 'fs volume rm '
+                   'name=vol_name,type=CephString '
+                   'name=yes-i-really-mean-it,type=CephString,req=false ',
+            'desc': "Delete a FS volume by passing --yes-i-really-mean-it flag",
+            'perm': 'rw'
+        },
+        {
+            'cmd': 'fs subvolumegroup ls '
+            'name=vol_name,type=CephString ',
+            'desc': "List subvolumegroups",
+            'perm': 'r'
+        },
+        {
+            'cmd': 'fs subvolumegroup create '
+                   'name=vol_name,type=CephString '
+                   'name=group_name,type=CephString '
+                   'name=pool_layout,type=CephString,req=false '
+                   'name=uid,type=CephInt,req=false '
+                   'name=gid,type=CephInt,req=false '
+                   'name=mode,type=CephString,req=false ',
+            'desc': "Create a CephFS subvolume group in a volume, and optionally, "
+                    "with a specific data pool layout, and a specific numeric mode",
+            'perm': 'rw'
+        },
+        {
+            'cmd': 'fs subvolumegroup rm '
+                   'name=vol_name,type=CephString '
+                   'name=group_name,type=CephString '
+                   'name=force,type=CephBool,req=false ',
+            'desc': "Delete a CephFS subvolume group in a volume",
+            'perm': 'rw'
+        },
+        {
+            'cmd': 'fs subvolume ls '
+                   'name=vol_name,type=CephString '
+                   'name=group_name,type=CephString,req=false ',
+            'desc': "List subvolumes",
+            'perm': 'r'
+        },
+        {
+            'cmd': 'fs subvolume create '
+                   'name=vol_name,type=CephString '
+                   'name=sub_name,type=CephString '
+                   'name=size,type=CephInt,req=false '
+                   'name=group_name,type=CephString,req=false '
+                   'name=pool_layout,type=CephString,req=false '
+                   'name=uid,type=CephInt,req=false '
+                   'name=gid,type=CephInt,req=false '
+                   'name=mode,type=CephString,req=false '
+                   'name=namespace_isolated,type=CephBool,req=false ',
+            'desc': "Create a CephFS subvolume in a volume, and optionally, "
+                    "with a specific size (in bytes), a specific data pool layout, "
+                    "a specific mode, in a specific subvolume group and in separate "
+                    "RADOS namespace",
+            'perm': 'rw'
+        },
+        {
+            'cmd': 'fs subvolume rm '
+                   'name=vol_name,type=CephString '
+                   'name=sub_name,type=CephString '
+                   'name=group_name,type=CephString,req=false '
+                   'name=force,type=CephBool,req=false '
+                   'name=retain_snapshots,type=CephBool,req=false ',
+            'desc': "Delete a CephFS subvolume in a volume, and optionally, "
+                    "in a specific subvolume group, force deleting a cancelled or failed "
+                    "clone, and retaining existing subvolume snapshots",
+            'perm': 'rw'
+        },
+        {
+            'cmd': 'fs subvolume authorize '
+                   'name=vol_name,type=CephString '
+                   'name=sub_name,type=CephString '
+                   'name=auth_id,type=CephString '
+                   'name=group_name,type=CephString,req=false '
+                   'name=access_level,type=CephString,req=false '
+                   'name=tenant_id,type=CephString,req=false '
+                   'name=allow_existing_id,type=CephBool,req=false ',
+            'desc': "Allow a cephx auth ID access to a subvolume",
+            'perm': 'rw'
+        },
+        {
+            'cmd': 'fs subvolume deauthorize '
+                   'name=vol_name,type=CephString '
+                   'name=sub_name,type=CephString '
+                   'name=auth_id,type=CephString '
+                   'name=group_name,type=CephString,req=false ',
+            'desc': "Deny a cephx auth ID access to a subvolume",
+            'perm': 'rw'
+        },
+        {
+            'cmd': 'fs subvolume authorized_list '
+                   'name=vol_name,type=CephString '
+                   'name=sub_name,type=CephString '
+                   'name=group_name,type=CephString,req=false ',
+            'desc': "List auth IDs that have access to a subvolume",
+            'perm': 'r'
+        },
+        {
+            'cmd': 'fs subvolume evict '
+                   'name=vol_name,type=CephString '
+                   'name=sub_name,type=CephString '
+                   'name=auth_id,type=CephString '
+                   'name=group_name,type=CephString,req=false ',
+            'desc': "Evict clients based on auth IDs and subvolume mounted",
+            'perm': 'rw'
+        },
+        {
+            'cmd': 'fs subvolumegroup getpath '
+                   'name=vol_name,type=CephString '
+                   'name=group_name,type=CephString ',
+            'desc': "Get the mountpath of a CephFS subvolume group in a volume",
+            'perm': 'r'
+        },
+        {
+            'cmd': 'fs subvolume getpath '
+                   'name=vol_name,type=CephString '
+                   'name=sub_name,type=CephString '
+                   'name=group_name,type=CephString,req=false ',
+            'desc': "Get the mountpath of a CephFS subvolume in a volume, "
+                    "and optionally, in a specific subvolume group",
+            'perm': 'rw'
+        },
+        {
+            'cmd': 'fs subvolume info '
+                   'name=vol_name,type=CephString '
+                   'name=sub_name,type=CephString '
+                   'name=group_name,type=CephString,req=false ',
+            'desc': "Get the metadata of a CephFS subvolume in a volume, "
+                    "and optionally, in a specific subvolume group",
+            'perm': 'r'
+        },
+        {
+            'cmd': 'fs subvolumegroup snapshot ls '
+                   'name=vol_name,type=CephString '
+                   'name=group_name,type=CephString ',
+            'desc': "List subvolumegroup snapshots",
+            'perm': 'r'
+        },
+        {
+            'cmd': 'fs subvolumegroup snapshot create '
+                   'name=vol_name,type=CephString '
+                   'name=group_name,type=CephString '
+                   'name=snap_name,type=CephString ',
+            'desc': "Create a snapshot of a CephFS subvolume group in a volume",
+            'perm': 'rw'
+        },
+        {
+            'cmd': 'fs subvolumegroup snapshot rm '
+                   'name=vol_name,type=CephString '
+                   'name=group_name,type=CephString '
+                   'name=snap_name,type=CephString '
+                   'name=force,type=CephBool,req=false ',
+                   'desc': "Delete a snapshot of a CephFS subvolume group in a volume",
+            'perm': 'rw'
+        },
+        {
+            'cmd': 'fs subvolume snapshot ls '
+                   'name=vol_name,type=CephString '
+                   'name=sub_name,type=CephString '
+                   'name=group_name,type=CephString,req=false ',
+            'desc': "List subvolume snapshots",
+            'perm': 'r'
+        },
+        {
+            'cmd': 'fs subvolume snapshot create '
+                   'name=vol_name,type=CephString '
+                   'name=sub_name,type=CephString '
+                   'name=snap_name,type=CephString '
+                   'name=group_name,type=CephString,req=false ',
+            'desc': "Create a snapshot of a CephFS subvolume in a volume, "
+                    "and optionally, in a specific subvolume group",
+            'perm': 'rw'
+        },
+        {
+            'cmd': 'fs subvolume snapshot info '
+                   'name=vol_name,type=CephString '
+                   'name=sub_name,type=CephString '
+                   'name=snap_name,type=CephString '
+                   'name=group_name,type=CephString,req=false ',
+            'desc': "Get the metadata of a CephFS subvolume snapshot "
+                    "and optionally, in a specific subvolume group",
+            'perm': 'r'
+        },
+        {
+            'cmd': 'fs subvolume snapshot rm '
+                   'name=vol_name,type=CephString '
+                   'name=sub_name,type=CephString '
+                   'name=snap_name,type=CephString '
+                   'name=group_name,type=CephString,req=false '
+                   'name=force,type=CephBool,req=false ',
+            'desc': "Delete a snapshot of a CephFS subvolume in a volume, "
+                    "and optionally, in a specific subvolume group",
+            'perm': 'rw'
+        },
+        {
+            'cmd': 'fs subvolume resize '
+                   'name=vol_name,type=CephString '
+                   'name=sub_name,type=CephString '
+                   'name=new_size,type=CephString,req=true '
+                   'name=group_name,type=CephString,req=false '
+                   'name=no_shrink,type=CephBool,req=false ',
+            'desc': "Resize a CephFS subvolume",
+            'perm': 'rw'
+        },
+        {
+            'cmd': 'fs subvolume snapshot protect '
+                   'name=vol_name,type=CephString '
+                   'name=sub_name,type=CephString '
+                   'name=snap_name,type=CephString '
+                   'name=group_name,type=CephString,req=false ',
+            'desc': "(deprecated) Protect snapshot of a CephFS subvolume in a volume, "
+                    "and optionally, in a specific subvolume group",
+            'perm': 'rw'
+        },
+        {
+            'cmd': 'fs subvolume snapshot unprotect '
+                   'name=vol_name,type=CephString '
+                   'name=sub_name,type=CephString '
+                   'name=snap_name,type=CephString '
+                   'name=group_name,type=CephString,req=false ',
+            'desc': "(deprecated) Unprotect a snapshot of a CephFS subvolume in a volume, "
+                    "and optionally, in a specific subvolume group",
+            'perm': 'rw'
+        },
+        {
+            'cmd': 'fs subvolume snapshot clone '
+                   'name=vol_name,type=CephString '
+                   'name=sub_name,type=CephString '
+                   'name=snap_name,type=CephString '
+                   'name=target_sub_name,type=CephString '
+                   'name=pool_layout,type=CephString,req=false '
+                   'name=group_name,type=CephString,req=false '
+                   'name=target_group_name,type=CephString,req=false ',
+            'desc': "Clone a snapshot to target subvolume",
+            'perm': 'rw'
+        },
+        {
+            'cmd': 'fs clone status '
+                   'name=vol_name,type=CephString '
+                   'name=clone_name,type=CephString '
+                   'name=group_name,type=CephString,req=false ',
+            'desc': "Get status on a cloned subvolume.",
+            'perm': 'r'
+        },
+        {
+            'cmd': 'fs clone cancel '
+                   'name=vol_name,type=CephString '
+                   'name=clone_name,type=CephString '
+                   'name=group_name,type=CephString,req=false ',
+            'desc': "Cancel an pending or ongoing clone operation.",
+            'perm': 'r'
+        },
+
+        # volume ls [recursive]
+        # subvolume ls <volume>
+        # volume authorize/deauthorize
+        # subvolume authorize/deauthorize
+
+        # volume describe (free space, etc)
+        # volume auth list (vc.get_authorized_ids)
+
+        # snapshots?
+
+        # FIXME: we're doing CephFSVolumeClient.recover on every
+        # path where we instantiate and connect a client.  Perhaps
+        # keep clients alive longer, or just pass a "don't recover"
+        # flag in if it's the >1st time we connected a particular
+        # volume in the lifetime of this module instance.
+    ]
+
+    MODULE_OPTIONS = [
+        {
+            'name': 'max_concurrent_clones',
+            'type': 'int',
+            'default': 4,
+            'desc': 'Number of asynchronous cloner threads',
+        }
+    ]
+
+    def __init__(self, *args, **kwargs):
+        self.inited = False
+        # for mypy
+        self.max_concurrent_clones = None
+        self.lock = threading.Lock()
+        super(Module, self).__init__(*args, **kwargs)
+        # Initialize config option members
+        self.config_notify()
+        with self.lock:
+            self.vc = VolumeClient(self)
+            self.inited = True
+
+    def __del__(self):
+        self.vc.shutdown()
+
+    def shutdown(self):
+        self.vc.shutdown()
+
+    def config_notify(self):
+        """
+        This method is called whenever one of our config options is changed.
+        """
+        with self.lock:
+            for opt in self.MODULE_OPTIONS:
+                setattr(self,
+                        opt['name'],  # type: ignore
+                        self.get_module_option(opt['name']))  # type: ignore
+                self.log.debug(' mgr option %s = %s',
+                               opt['name'], getattr(self, opt['name']))  # type: ignore
+                if self.inited:
+                    if opt['name'] == "max_concurrent_clones":
+                        self.vc.cloner.reconfigure_max_concurrent_clones(self.max_concurrent_clones)
+
+    def handle_command(self, inbuf, cmd):
+        handler_name = "_cmd_" + cmd['prefix'].replace(" ", "_")
+        try:
+            handler = getattr(self, handler_name)
+        except AttributeError:
+            return -errno.EINVAL, "", "Unknown command"
+
+        return handler(inbuf, cmd)
+
+    @mgr_cmd_wrap
+    def _cmd_fs_volume_create(self, inbuf, cmd):
+        vol_id = cmd['name']
+        return self.vc.create_fs_volume(vol_id)
+
+    @mgr_cmd_wrap
+    def _cmd_fs_volume_rm(self, inbuf, cmd):
+        vol_name = cmd['vol_name']
+        confirm = cmd.get('yes-i-really-mean-it', None)
+        return self.vc.delete_fs_volume(vol_name, confirm)
+
+    @mgr_cmd_wrap
+    def _cmd_fs_volume_ls(self, inbuf, cmd):
+        return self.vc.list_fs_volumes()
+
+    @mgr_cmd_wrap
+    def _cmd_fs_subvolumegroup_create(self, inbuf, cmd):
+        """
+        :return: a 3-tuple of return code(int), empty string(str), error message (str)
+        """
+        return self.vc.create_subvolume_group(
+            vol_name=cmd['vol_name'], group_name=cmd['group_name'],
+            pool_layout=cmd.get('pool_layout', None), mode=cmd.get('mode', '755'),
+            uid=cmd.get('uid', None), gid=cmd.get('gid', None))
+
+    @mgr_cmd_wrap
+    def _cmd_fs_subvolumegroup_rm(self, inbuf, cmd):
+        """
+        :return: a 3-tuple of return code(int), empty string(str), error message (str)
+        """
+        return self.vc.remove_subvolume_group(vol_name=cmd['vol_name'],
+                                              group_name=cmd['group_name'],
+                                              force=cmd.get('force', False))
+
+    @mgr_cmd_wrap
+    def _cmd_fs_subvolumegroup_ls(self, inbuf, cmd):
+        return self.vc.list_subvolume_groups(vol_name=cmd['vol_name'])
+
+    @mgr_cmd_wrap
+    def _cmd_fs_subvolume_create(self, inbuf, cmd):
+        """
+        :return: a 3-tuple of return code(int), empty string(str), error message (str)
+        """
+        return self.vc.create_subvolume(vol_name=cmd['vol_name'],
+                                        sub_name=cmd['sub_name'],
+                                        group_name=cmd.get('group_name', None),
+                                        size=cmd.get('size', None),
+                                        pool_layout=cmd.get('pool_layout', None),
+                                        uid=cmd.get('uid', None),
+                                        gid=cmd.get('gid', None),
+                                        mode=cmd.get('mode', '755'),
+                                        namespace_isolated=cmd.get('namespace_isolated', False))
+
+    @mgr_cmd_wrap
+    def _cmd_fs_subvolume_rm(self, inbuf, cmd):
+        """
+        :return: a 3-tuple of return code(int), empty string(str), error message (str)
+        """
+        return self.vc.remove_subvolume(vol_name=cmd['vol_name'],
+                                        sub_name=cmd['sub_name'],
+                                        group_name=cmd.get('group_name', None),
+                                        force=cmd.get('force', False),
+                                        retain_snapshots=cmd.get('retain_snapshots', False))
+
+    @mgr_cmd_wrap
+    def _cmd_fs_subvolume_authorize(self, inbuf, cmd):
+        """
+        :return: a 3-tuple of return code(int), secret key(str), error message (str)
+        """
+        return self.vc.authorize_subvolume(vol_name=cmd['vol_name'],
+                                           sub_name=cmd['sub_name'],
+                                           auth_id=cmd['auth_id'],
+                                           group_name=cmd.get('group_name', None),
+                                           access_level=cmd.get('access_level', 'rw'),
+                                           tenant_id=cmd.get('tenant_id', None),
+                                           allow_existing_id=cmd.get('allow_existing_id', False))
+
+    @mgr_cmd_wrap
+    def _cmd_fs_subvolume_deauthorize(self, inbuf, cmd):
+        """
+        :return: a 3-tuple of return code(int), empty string(str), error message (str)
+        """
+        return self.vc.deauthorize_subvolume(vol_name=cmd['vol_name'],
+                                             sub_name=cmd['sub_name'],
+                                             auth_id=cmd['auth_id'],
+                                             group_name=cmd.get('group_name', None))
+
+    @mgr_cmd_wrap
+    def _cmd_fs_subvolume_authorized_list(self, inbuf, cmd):
+        """
+        :return: a 3-tuple of return code(int), list of authids(json), error message (str)
+        """
+        return self.vc.authorized_list(vol_name=cmd['vol_name'],
+                                       sub_name=cmd['sub_name'],
+                                       group_name=cmd.get('group_name', None))
+
+    @mgr_cmd_wrap
+    def _cmd_fs_subvolume_evict(self, inbuf, cmd):
+        """
+        :return: a 3-tuple of return code(int), empyt string(str), error message (str)
+        """
+        return self.vc.evict(vol_name=cmd['vol_name'],
+                             sub_name=cmd['sub_name'],
+                             auth_id=cmd['auth_id'],
+                             group_name=cmd.get('group_name', None))
+
+    @mgr_cmd_wrap
+    def _cmd_fs_subvolume_ls(self, inbuf, cmd):
+        return self.vc.list_subvolumes(vol_name=cmd['vol_name'],
+                                       group_name=cmd.get('group_name', None))
+
+    @mgr_cmd_wrap
+    def _cmd_fs_subvolumegroup_getpath(self, inbuf, cmd):
+        return self.vc.getpath_subvolume_group(
+            vol_name=cmd['vol_name'], group_name=cmd['group_name'])
+
+    @mgr_cmd_wrap
+    def _cmd_fs_subvolume_getpath(self, inbuf, cmd):
+        return self.vc.subvolume_getpath(vol_name=cmd['vol_name'],
+                                         sub_name=cmd['sub_name'],
+                                         group_name=cmd.get('group_name', None))
+
+    @mgr_cmd_wrap
+    def _cmd_fs_subvolume_info(self, inbuf, cmd):
+        return self.vc.subvolume_info(vol_name=cmd['vol_name'],
+                                      sub_name=cmd['sub_name'],
+                                      group_name=cmd.get('group_name', None))
+
+    @mgr_cmd_wrap
+    def _cmd_fs_subvolumegroup_snapshot_create(self, inbuf, cmd):
+        return self.vc.create_subvolume_group_snapshot(vol_name=cmd['vol_name'],
+                                                       group_name=cmd['group_name'],
+                                                       snap_name=cmd['snap_name'])
+
+    @mgr_cmd_wrap
+    def _cmd_fs_subvolumegroup_snapshot_rm(self, inbuf, cmd):
+        return self.vc.remove_subvolume_group_snapshot(vol_name=cmd['vol_name'],
+                                                       group_name=cmd['group_name'],
+                                                       snap_name=cmd['snap_name'],
+                                                       force=cmd.get('force', False))
+
+    @mgr_cmd_wrap
+    def _cmd_fs_subvolumegroup_snapshot_ls(self, inbuf, cmd):
+        return self.vc.list_subvolume_group_snapshots(vol_name=cmd['vol_name'],
+                                                      group_name=cmd['group_name'])
+
+    @mgr_cmd_wrap
+    def _cmd_fs_subvolume_snapshot_create(self, inbuf, cmd):
+        return self.vc.create_subvolume_snapshot(vol_name=cmd['vol_name'],
+                                                 sub_name=cmd['sub_name'],
+                                                 snap_name=cmd['snap_name'],
+                                                 group_name=cmd.get('group_name', None))
+
+    @mgr_cmd_wrap
+    def _cmd_fs_subvolume_snapshot_rm(self, inbuf, cmd):
+        return self.vc.remove_subvolume_snapshot(vol_name=cmd['vol_name'],
+                                                 sub_name=cmd['sub_name'],
+                                                 snap_name=cmd['snap_name'],
+                                                 group_name=cmd.get('group_name', None),
+                                                 force=cmd.get('force', False))
+
+    @mgr_cmd_wrap
+    def _cmd_fs_subvolume_snapshot_info(self, inbuf, cmd):
+        return self.vc.subvolume_snapshot_info(vol_name=cmd['vol_name'],
+                                               sub_name=cmd['sub_name'],
+                                               snap_name=cmd['snap_name'],
+                                               group_name=cmd.get('group_name', None))
+
+    @mgr_cmd_wrap
+    def _cmd_fs_subvolume_snapshot_ls(self, inbuf, cmd):
+        return self.vc.list_subvolume_snapshots(vol_name=cmd['vol_name'],
+                                                sub_name=cmd['sub_name'],
+                                                group_name=cmd.get('group_name', None))
+
+    @mgr_cmd_wrap
+    def _cmd_fs_subvolume_resize(self, inbuf, cmd):
+        return self.vc.resize_subvolume(vol_name=cmd['vol_name'], sub_name=cmd['sub_name'],
+                                        new_size=cmd['new_size'], group_name=cmd.get('group_name', None),
+                                        no_shrink=cmd.get('no_shrink', False))
+
+    @mgr_cmd_wrap
+    def _cmd_fs_subvolume_snapshot_protect(self, inbuf, cmd):
+        return self.vc.protect_subvolume_snapshot(vol_name=cmd['vol_name'], sub_name=cmd['sub_name'],
+                                                  snap_name=cmd['snap_name'], group_name=cmd.get('group_name', None))
+
+    @mgr_cmd_wrap
+    def _cmd_fs_subvolume_snapshot_unprotect(self, inbuf, cmd):
+        return self.vc.unprotect_subvolume_snapshot(vol_name=cmd['vol_name'], sub_name=cmd['sub_name'],
+                                                    snap_name=cmd['snap_name'], group_name=cmd.get('group_name', None))
+
+    @mgr_cmd_wrap
+    def _cmd_fs_subvolume_snapshot_clone(self, inbuf, cmd):
+        return self.vc.clone_subvolume_snapshot(
+            vol_name=cmd['vol_name'], sub_name=cmd['sub_name'], snap_name=cmd['snap_name'],
+            group_name=cmd.get('group_name', None), pool_layout=cmd.get('pool_layout', None),
+            target_sub_name=cmd['target_sub_name'], target_group_name=cmd.get('target_group_name', None))
+
+    @mgr_cmd_wrap
+    def _cmd_fs_clone_status(self, inbuf, cmd):
+        return self.vc.clone_status(
+            vol_name=cmd['vol_name'], clone_name=cmd['clone_name'],  group_name=cmd.get('group_name', None))
+
+    @mgr_cmd_wrap
+    def _cmd_fs_clone_cancel(self, inbuf, cmd):
+        return self.vc.clone_cancel(
+            vol_name=cmd['vol_name'], clone_name=cmd['clone_name'],  group_name=cmd.get('group_name', None))
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-27 18:24:20 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-27 18:24:20 +0000
commit	483eb2f56657e8e7f419ab1a4fab8dce9ade8609 (patch)
tree	e5d88d25d870d5dedacb6bbdbe2a966086a0a5cf /src/pybind/mgr/volumes
parent	Initial commit. (diff)
download	ceph-483eb2f56657e8e7f419ab1a4fab8dce9ade8609.tar.xz ceph-483eb2f56657e8e7f419ab1a4fab8dce9ade8609.zip