diff options
Diffstat (limited to 'src/ceph-volume/ceph_volume/devices')
24 files changed, 4419 insertions, 0 deletions
def get_osd_device_path(osd_lvs, device_type, dmcrypt_secret=None):
    """
    Resolve the path of the ``db``, ``wal`` or ``block`` device of an OSD.

    The block LV of the OSD carries a ``ceph.<device_type>_uuid`` tag. That
    uuid is first matched against the LVs in ``osd_lvs`` and, failing that,
    looked up as a partition uuid on the system.

    :param osd_lvs: the LVs that belong to one OSD
    :param device_type: one of ``db``, ``wal`` or ``block``
    :param dmcrypt_secret: key handed to ``luks_open`` when the block LV is
                           tagged as encrypted
    :returns: a device path, or ``None`` when the OSD has no such device
              (these devices are optional)
    :raises RuntimeError: when a uuid is tagged but no LV or partition with
                          that uuid can be found
    """
    osd_block_lv = next(
        (lv for lv in osd_lvs if lv.tags.get('ceph.type') == 'block'), None)
    if osd_block_lv is None:
        # Without a block LV there is no uuid tag to follow. Previously the
        # code fell through and crashed on the unbound ``is_encrypted`` /
        # ``device_uuid`` names; report "no device" instead.
        return None

    is_encrypted = osd_block_lv.tags.get('ceph.encrypted', '0') == '1'
    logger.debug('Found block device (%s) with encryption: %s', osd_block_lv.name, is_encrypted)
    device_uuid = osd_block_lv.tags.get('ceph.%s_uuid' % device_type)
    if not device_uuid:
        return None

    device_lv = next(
        (lv for lv in osd_lvs if lv.tags.get('ceph.type') == device_type), None)
    if device_lv:
        if is_encrypted:
            encryption_utils.luks_open(dmcrypt_secret, device_lv.lv_path, device_uuid)
            return '/dev/mapper/%s' % device_uuid
        return device_lv.lv_path

    # this could be a regular device, so query it with blkid
    physical_device = disk.get_device_from_partuuid(device_uuid)
    if physical_device:
        if is_encrypted:
            encryption_utils.luks_open(dmcrypt_secret, physical_device, device_uuid)
            return '/dev/mapper/%s' % device_uuid
        return physical_device

    raise RuntimeError('could not find %s with uuid %s' % (device_type, device_uuid))
with + # symlinks that exist in the osd dir + for link_name in ['block', 'block.db', 'block.wal']: + link_path = os.path.join(osd_path, link_name) + if os.path.exists(link_path): + os.unlink(os.path.join(osd_path, link_name)) + # encryption is handled here, before priming the OSD dir + if is_encrypted: + osd_lv_path = '/dev/mapper/%s' % osd_block_lv.lv_uuid + lockbox_secret = osd_block_lv.tags['ceph.cephx_lockbox_secret'] + encryption_utils.write_lockbox_keyring(osd_id, osd_fsid, lockbox_secret) + dmcrypt_secret = encryption_utils.get_dmcrypt_key(osd_id, osd_fsid) + encryption_utils.luks_open(dmcrypt_secret, osd_block_lv.lv_path, osd_block_lv.lv_uuid) + else: + osd_lv_path = osd_block_lv.lv_path + + db_device_path = get_osd_device_path(osd_lvs, 'db', dmcrypt_secret=dmcrypt_secret) + wal_device_path = get_osd_device_path(osd_lvs, 'wal', dmcrypt_secret=dmcrypt_secret) + + # Once symlinks are removed, the osd dir can be 'primed again. chown first, + # regardless of what currently exists so that ``prime-osd-dir`` can succeed + # even if permissions are somehow messed up + system.chown(osd_path) + prime_command = [ + 'ceph-bluestore-tool', '--cluster=%s' % conf.cluster, + 'prime-osd-dir', '--dev', osd_lv_path, + '--path', osd_path, '--no-mon-config'] + + process.run(prime_command) + # always re-do the symlink regardless if it exists, so that the block, + # block.wal, and block.db devices that may have changed can be mapped + # correctly every time + process.run(['ln', '-snf', osd_lv_path, os.path.join(osd_path, 'block')]) + system.chown(os.path.join(osd_path, 'block')) + system.chown(osd_path) + if db_device_path: + destination = os.path.join(osd_path, 'block.db') + process.run(['ln', '-snf', db_device_path, destination]) + system.chown(db_device_path) + system.chown(destination) + if wal_device_path: + destination = os.path.join(osd_path, 'block.wal') + process.run(['ln', '-snf', wal_device_path, destination]) + system.chown(wal_device_path) + system.chown(destination) 
+ + if no_systemd is False: + # enable the ceph-volume unit for this OSD + systemctl.enable_volume(osd_id, osd_fsid, 'lvm') + + # enable the OSD + systemctl.enable_osd(osd_id) + + # start the OSD + systemctl.start_osd(osd_id) + terminal.success("ceph-volume lvm activate successful for osd ID: %s" % osd_id) + + +class Activate(object): + + help = 'Discover and mount the LVM device associated with an OSD ID and start the Ceph OSD' + + def __init__(self, argv): + self.argv = argv + + @decorators.needs_root + def activate_all(self, args): + listed_osds = direct_report() + osds = {} + for osd_id, devices in listed_osds.items(): + # the metadata for all devices in each OSD will contain + # the FSID which is required for activation + for device in devices: + fsid = device.get('tags', {}).get('ceph.osd_fsid') + if fsid: + osds[fsid] = osd_id + break + if not osds: + terminal.warning('Was unable to find any OSDs to activate') + terminal.warning('Verify OSDs are present with "ceph-volume lvm list"') + return + for osd_fsid, osd_id in osds.items(): + if not args.no_systemd and systemctl.osd_is_active(osd_id): + terminal.warning( + 'OSD ID %s FSID %s process is active. 
@decorators.needs_root
def activate(self, args, osd_id=None, osd_fsid=None):
    """
    Look up the LVs of one OSD by their tags and activate it as bluestore.

    :param args: The parsed arguments coming from the CLI
    :param osd_id: When activating all, this gets populated with an
                   existing OSD ID
    :param osd_fsid: When activating all, this gets populated with an
                     existing OSD FSID
    :raises RuntimeError: when the FSID is missing, or when no LV carries
                          the requested tags
    """
    osd_id = osd_id if osd_id else args.osd_id
    osd_fsid = osd_fsid if osd_fsid else args.osd_fsid

    if osd_id and osd_fsid:
        tags = {'ceph.osd_id': osd_id, 'ceph.osd_fsid': osd_fsid}
    elif not osd_id and osd_fsid:
        tags = {'ceph.osd_fsid': osd_fsid}
    elif osd_id and not osd_fsid:
        raise RuntimeError('could not activate osd.{}, please provide the '
                           'osd_fsid too'.format(osd_id))
    else:
        raise RuntimeError('Please provide both osd_id and osd_fsid')
    lvs = api.get_lvs(tags=tags)
    if not lvs:
        raise RuntimeError('could not find osd.%s with osd_fsid %s' %
                           (osd_id, osd_fsid))

    # ``no_tmpfs`` is absent from namespaces built by other sub-commands, so
    # it is read defensively once and honoured on every path below (the
    # auto-detect path used to drop it).
    no_tmpfs = getattr(args, 'no_tmpfs', False)

    # This argument is only available when passed in directly or via
    # systemd, not when ``create`` is being used
    # placeholder when a new objectstore support will be added
    if getattr(args, 'auto_detect_objectstore', False):
        logger.info('auto detecting objectstore')
        return activate_bluestore(lvs, args.no_systemd, no_tmpfs)

    # explicit 'objectstore' flags take precedence; otherwise fall back to
    # the presence of the ``ceph.block_device`` tag on any of the LVs
    if getattr(args, 'bluestore', False) or any('ceph.block_device' in lv.tags for lv in lvs):
        activate_bluestore(lvs, args.no_systemd, no_tmpfs)
all needed tags and metadata exist. + + When migrating OSDs, or a multiple-osd activation is needed, the + ``--all`` flag can be used instead of the individual ID and FSID: + + ceph-volume lvm activate --all + + """) + parser = argparse.ArgumentParser( + prog='ceph-volume lvm activate', + formatter_class=argparse.RawDescriptionHelpFormatter, + description=sub_command_help, + ) + + parser.add_argument( + 'osd_id', + metavar='ID', + nargs='?', + help='The ID of the OSD, usually an integer, like 0' + ) + parser.add_argument( + 'osd_fsid', + metavar='FSID', + nargs='?', + help='The FSID of the OSD, similar to a SHA1' + ) + parser.add_argument( + '--auto-detect-objectstore', + action='store_true', + help='Autodetect the objectstore by inspecting the OSD', + ) + parser.add_argument( + '--bluestore', + action='store_true', + help='force bluestore objectstore activation', + ) + parser.add_argument( + '--all', + dest='activate_all', + action='store_true', + help='Activate all OSDs found in the system', + ) + parser.add_argument( + '--no-systemd', + dest='no_systemd', + action='store_true', + help='Skip creating and enabling systemd units and starting OSD services', + ) + parser.add_argument( + '--no-tmpfs', + action='store_true', + help='Do not use a tmpfs mount for OSD data dir' + ) + if len(self.argv) == 0: + print(sub_command_help) + return + args = parser.parse_args(self.argv) + if args.activate_all: + self.activate_all(args) + else: + self.activate(args) diff --git a/src/ceph-volume/ceph_volume/devices/lvm/batch.py b/src/ceph-volume/ceph_volume/devices/lvm/batch.py new file mode 100644 index 000000000..69a3f672b --- /dev/null +++ b/src/ceph-volume/ceph_volume/devices/lvm/batch.py @@ -0,0 +1,631 @@ +import argparse +from collections import namedtuple +import json +import logging +from textwrap import dedent +from ceph_volume import terminal, decorators +from ceph_volume.util import disk, prompt_bool, arg_validators, templates +from ceph_volume.util import prepare +from 
def ensure_disjoint_device_lists(data, db=None, wal=None):
    """
    Check that the data, db and wal device lists share no device.

    :param data: iterable of data devices
    :param db: iterable of db devices, ``None`` is treated as empty
    :param wal: iterable of wal devices, ``None`` is treated as empty
    :raises Exception: when any device appears in more than one list
    """
    # ``None`` defaults replace the former mutable ``[]`` defaults; each list
    # is turned into a set exactly once.
    data_set = set(data or ())
    db_set = set(db or ())
    wal_set = set(wal or ())
    if not (data_set.isdisjoint(db_set)
            and data_set.isdisjoint(wal_set)
            and db_set.isdisjoint(wal_set)):
        raise Exception('Device lists are not disjoint')
args.osd_ids.pop() + osd = Batch.OSD("{}/{}".format(lv.vg_name, lv.lv_name), + 100.0, + disk.Size(b=int(lv.lvs[0].lv_size)), + 1, + osd_id, + 'dmcrypt' if args.dmcrypt else None) + ret.append(osd) + return ret + + +def get_physical_fast_allocs(devices, type_, fast_slots_per_device, new_osds, args): + requested_slots = getattr(args, '{}_slots'.format(type_)) + if not requested_slots or requested_slots < fast_slots_per_device: + if requested_slots: + mlogger.info('{}_slots argument is too small, ignoring'.format(type_)) + requested_slots = fast_slots_per_device + + requested_size = getattr(args, '{}_size'.format(type_), 0) + if not requested_size or requested_size == 0: + # no size argument was specified, check ceph.conf + get_size_fct = getattr(prepare, 'get_{}_size'.format(type_)) + requested_size = get_size_fct(lv_format=False) + + ret = [] + vg_device_map = group_devices_by_vg(devices) + for vg_name, vg_devices in vg_device_map.items(): + for dev in vg_devices: + if not dev.available_lvm: + continue + # any LV present is considered a taken slot + occupied_slots = len(dev.lvs) + # prior to v15.2.8, db/wal deployments were grouping multiple fast devices into single VGs - we need to + # multiply requested_slots (per device) by the number of devices in the VG in order to ensure that + # abs_size is calculated correctly from vg_size + if vg_name == 'unused_devices': + slots_for_vg = requested_slots + else: + if len(vg_devices) > 1: + slots_for_vg = len(args.devices) + else: + slots_for_vg = len(vg_devices) * requested_slots + dev_size = dev.vg_size[0] + # this only looks at the first vg on device, unsure if there is a better + # way + abs_size = disk.Size(b=int(dev_size / slots_for_vg)) + free_size = dev.vg_free[0] + relative_size = int(abs_size) / dev_size + if requested_size: + if requested_size <= abs_size: + abs_size = requested_size + relative_size = int(abs_size) / dev_size + else: + mlogger.error( + '{} was requested for {}, but only {} can be 
def group_devices_by_vg(devices):
    """
    Group devices by the name of the first volume group found on them.

    Devices without any volume group are collected under the
    ``'unused_devices'`` key, which is always present (and always the first
    key) even when it stays empty.

    :param devices: iterable of objects exposing a ``vgs`` sequence whose
                    items have a ``name`` attribute
    :returns: dict mapping a VG name (or ``'unused_devices'``) to the list of
              devices in that group, in input order
    """
    result = {'unused_devices': []}
    for dev in devices:
        # only the first VG on the device is considered
        vg_name = dev.vgs[0].name if len(dev.vgs) > 0 else 'unused_devices'
        result.setdefault(vg_name, []).append(dev)
    return result
+ """) + + def __init__(self, argv): + parser = argparse.ArgumentParser( + prog='ceph-volume lvm batch', + formatter_class=argparse.RawDescriptionHelpFormatter, + description=self._help, + ) + + parser.add_argument( + 'devices', + metavar='DEVICES', + nargs='*', + type=arg_validators.ValidBatchDataDevice(), + default=[], + help='Devices to provision OSDs', + ) + parser.add_argument( + '--db-devices', + nargs='*', + type=arg_validators.ValidBatchDevice(), + default=[], + help='Devices to provision OSDs db volumes', + ) + parser.add_argument( + '--wal-devices', + nargs='*', + type=arg_validators.ValidBatchDevice(), + default=[], + help='Devices to provision OSDs wal volumes', + ) + parser.add_argument( + '--auto', + action='store_true', + help=('deploy multi-device OSDs if rotational and non-rotational drives ' + 'are passed in DEVICES'), + default=True + ) + parser.add_argument( + '--no-auto', + action='store_false', + dest='auto', + help=('deploy standalone OSDs if rotational and non-rotational drives ' + 'are passed in DEVICES'), + ) + parser.add_argument( + '--bluestore', + action='store_true', + help='bluestore objectstore (default)', + ) + parser.add_argument( + '--report', + action='store_true', + help='Only report on OSD that would be created and exit', + ) + parser.add_argument( + '--yes', + action='store_true', + help='Avoid prompting for confirmation when provisioning', + ) + parser.add_argument( + '--format', + help='output format, defaults to "pretty"', + default='pretty', + choices=['json', 'json-pretty', 'pretty'], + ) + parser.add_argument( + '--dmcrypt', + action='store_true', + help='Enable device encryption via dm-crypt', + ) + parser.add_argument( + '--crush-device-class', + dest='crush_device_class', + help='Crush device class to assign this OSD to', + default="" + ) + parser.add_argument( + '--no-systemd', + dest='no_systemd', + action='store_true', + help='Skip creating and enabling systemd units and starting OSD services', + ) + 
parser.add_argument( + '--osds-per-device', + type=int, + default=1, + help='Provision more than 1 (the default) OSD per device', + ) + parser.add_argument( + '--data-slots', + type=int, + help=('Provision more than 1 (the default) OSD slot per device' + ' if more slots then osds-per-device are specified, slots' + 'will stay unoccupied'), + ) + parser.add_argument( + '--data-allocate-fraction', + type=arg_validators.ValidFraction(), + help='Fraction to allocate from data device (0,1.0]', + default=1.0 + ) + parser.add_argument( + '--block-db-size', + type=disk.Size.parse, + help='Set (or override) the "bluestore_block_db_size" value, in bytes' + ) + parser.add_argument( + '--block-db-slots', + type=int, + help='Provision slots on DB device, can remain unoccupied' + ) + parser.add_argument( + '--block-wal-size', + type=disk.Size.parse, + help='Set (or override) the "bluestore_block_wal_size" value, in bytes' + ) + parser.add_argument( + '--block-wal-slots', + type=int, + help='Provision slots on WAL device, can remain unoccupied' + ) + parser.add_argument( + '--prepare', + action='store_true', + help='Only prepare all OSDs, do not activate', + ) + parser.add_argument( + '--osd-ids', + nargs='*', + default=[], + help='Reuse existing OSD ids', + type=arg_validators.valid_osd_id + ) + self.args = parser.parse_args(argv) + self.parser = parser + for dev_list in ['', 'db_', 'wal_']: + setattr(self, '{}usable'.format(dev_list), []) + + def report(self, plan): + report = self._create_report(plan) + print(report) + + def _create_report(self, plan): + if self.args.format == 'pretty': + report = '' + report += templates.total_osds.format(total_osds=len(plan)) + + report += templates.osd_component_titles + for osd in plan: + report += templates.osd_header + report += osd.report() + return report + else: + json_report = [] + for osd in plan: + json_report.append(osd.report_json()) + if self.args.format == 'json': + return json.dumps(json_report) + elif self.args.format == 
'json-pretty': + return json.dumps(json_report, indent=4, + sort_keys=True) + + def _check_slot_args(self): + ''' + checking if -slots args are consistent with other arguments + ''' + if self.args.data_slots and self.args.osds_per_device: + if self.args.data_slots < self.args.osds_per_device: + raise ValueError('data_slots is smaller then osds_per_device') + + def _sort_rotational_disks(self): + ''' + Helper for legacy auto behaviour. + Sorts drives into rotating and non-rotating, the latter being used for + db. + ''' + mlogger.warning('DEPRECATION NOTICE') + mlogger.warning('You are using the legacy automatic disk sorting behavior') + mlogger.warning('The Pacific release will change the default to --no-auto') + rotating = [] + ssd = [] + for d in self.args.devices: + rotating.append(d) if d.rotational else ssd.append(d) + if ssd and not rotating: + # no need for additional sorting, we'll only deploy standalone on ssds + return + self.args.devices = rotating + self.args.db_devices = ssd + + @decorators.needs_root + def main(self): + if not self.args.devices: + return self.parser.print_help() + + # Default to bluestore here since defaulting it in add_argument may + # cause both to be True + if not self.args.bluestore: + self.args.bluestore = True + + if (self.args.auto and not self.args.db_devices and not + self.args.wal_devices): + self._sort_rotational_disks() + + self._check_slot_args() + + ensure_disjoint_device_lists(self.args.devices, + self.args.db_devices, + self.args.wal_devices) + + plan = self.get_plan(self.args) + + if self.args.report: + self.report(plan) + return 0 + + if not self.args.yes: + self.report(plan) + terminal.info('The above OSDs would be created if the operation continues') + if not prompt_bool('do you want to proceed? 
def get_plan(self, args):
    """
    Build the list of OSDs to deploy from the parsed arguments.

    :param args: parsed CLI namespace; ``bluestore``, ``devices``,
                 ``db_devices`` and ``wal_devices`` are read
    :returns: whatever ``get_deployment_layout`` returns for bluestore, or an
              empty plan when ``args.bluestore`` is not set
    """
    if not args.bluestore:
        # bluestore is the only objectstore handled here; without it nothing
        # can be planned. ``plan`` used to be left unbound on this path.
        return []
    return self.get_deployment_layout(args, args.devices, args.db_devices,
                                      args.wal_devices)
+ ''' + plan = [] + phys_devs, lvm_devs = separate_devices_from_lvs(devices) + mlogger.debug(('passed data devices: {} physical,' + ' {} LVM').format(len(phys_devs), len(lvm_devs))) + + plan.extend(get_physical_osds(phys_devs, args)) + + plan.extend(get_lvm_osds(lvm_devs, args)) + + num_osds = len(plan) + if num_osds == 0: + mlogger.info('All data devices are unavailable') + return plan + requested_osds = args.osds_per_device * len(phys_devs) + len(lvm_devs) + + if args.bluestore: + fast_type = 'block_db' + fast_allocations = self.fast_allocations(fast_devices, + requested_osds, + num_osds, + fast_type) + if fast_devices and not fast_allocations: + mlogger.info('{} fast devices were passed, but none are available'.format(len(fast_devices))) + return [] + if fast_devices and not len(fast_allocations) == num_osds: + mlogger.error('{} fast allocations != {} num_osds'.format( + len(fast_allocations), num_osds)) + exit(1) + + very_fast_allocations = self.fast_allocations(very_fast_devices, + requested_osds, + num_osds, + 'block_wal') + if very_fast_devices and not very_fast_allocations: + mlogger.info('{} very fast devices were passed, but none are available'.format(len(very_fast_devices))) + return [] + if very_fast_devices and not len(very_fast_allocations) == num_osds: + mlogger.error('{} very fast allocations != {} num_osds'.format( + len(very_fast_allocations), num_osds)) + exit(1) + + for osd in plan: + if fast_devices: + osd.add_fast_device(*fast_allocations.pop(), + type_=fast_type) + if very_fast_devices and args.bluestore: + osd.add_very_fast_device(*very_fast_allocations.pop()) + return plan + + def fast_allocations(self, devices, requested_osds, new_osds, type_): + ret = [] + if not devices: + return ret + phys_devs, lvm_devs = separate_devices_from_lvs(devices) + mlogger.debug(('passed {} devices: {} physical,' + ' {} LVM').format(type_, len(phys_devs), len(lvm_devs))) + + ret.extend(get_lvm_fast_allocs(lvm_devs)) + + # fill up uneven distributions across 
fast devices: 5 osds and 2 fast + # devices? create 3 slots on each device rather then deploying + # heterogeneous osds + slot_divider = max(1, len(phys_devs)) + if (requested_osds - len(lvm_devs)) % slot_divider: + fast_slots_per_device = int((requested_osds - len(lvm_devs)) / slot_divider) + 1 + else: + fast_slots_per_device = int((requested_osds - len(lvm_devs)) / slot_divider) + + + ret.extend(get_physical_fast_allocs(phys_devs, + type_, + fast_slots_per_device, + new_osds, + self.args)) + return ret + + class OSD(object): + ''' + This class simply stores info about to-be-deployed OSDs and provides an + easy way to retrieve the necessary create arguments. + ''' + VolSpec = namedtuple('VolSpec', + ['path', + 'rel_size', + 'abs_size', + 'slots', + 'type_']) + + def __init__(self, + data_path, + rel_size, + abs_size, + slots, + id_, + encryption, + symlink=None): + self.id_ = id_ + self.data = self.VolSpec(path=data_path, + rel_size=rel_size, + abs_size=abs_size, + slots=slots, + type_='data') + self.fast = None + self.very_fast = None + self.encryption = encryption + self.symlink = symlink + + def add_fast_device(self, path, rel_size, abs_size, slots, type_): + self.fast = self.VolSpec(path=path, + rel_size=rel_size, + abs_size=abs_size, + slots=slots, + type_=type_) + + def add_very_fast_device(self, path, rel_size, abs_size, slots): + self.very_fast = self.VolSpec(path=path, + rel_size=rel_size, + abs_size=abs_size, + slots=slots, + type_='block_wal') + + def _get_osd_plan(self): + plan = { + 'data': self.data.path, + 'data_size': self.data.abs_size, + 'encryption': self.encryption, + } + if self.fast: + type_ = self.fast.type_.replace('.', '_') + plan.update( + { + type_: self.fast.path, + '{}_size'.format(type_): self.fast.abs_size, + }) + if self.very_fast: + plan.update( + { + 'block_wal': self.very_fast.path, + 'block_wal_size': self.very_fast.abs_size, + }) + if self.id_: + plan.update({'osd_id': self.id_}) + return plan + + def get_args(self, defaults): 
def rollback_osd(args, osd_id=None):
    """
    When the process of creating or preparing fails, the OSD needs to be
    removed so that the ID can be reused. This prevents from leaving the ID
    around as "used" on the monitor, which can cause confusion if expecting
    sequential OSD IDs.

    The usage of `purge-new` with the bootstrap-osd keyring allows this to be
    done without requiring the admin keyring (otherwise needed for destroy and
    purge commands). After the purge, the devices of the OSD are zapped.

    :param args: parsed CLI namespace; not read here, kept for the callers'
                 existing call signature
    :param osd_id: ID of the OSD to roll back; nothing is done when it is
                   empty or ``None``
    """
    if not osd_id:
        # it means that it wasn't generated, so there is nothing to rollback here
        return

    # once here, this is an error condition that needs to be rolled back
    terminal.error('Was unable to complete a new OSD, will rollback changes')
    osd_name = 'osd.%s'
    bootstrap_keyring = '/var/lib/ceph/bootstrap-osd/%s.keyring' % conf.cluster
    cmd = [
        'ceph',
        '--cluster', conf.cluster,
        '--name', 'client.bootstrap-osd',
        '--keyring', bootstrap_keyring,
        'osd', 'purge-new', osd_name % osd_id,
        '--yes-i-really-mean-it',
    ]

    process.run(cmd)
    # argv entries handed to an argument parser must be strings; coerce in
    # case the ID arrives as an int
    Zap(['--destroy', '--osd-id', str(osd_id)]).main()
The new OSD gets one' + 'of those slots or 1/nth of the available capacity'), + 'type': int, + 'default': 1, + }, + '--osd-id': { + 'help': 'Reuse an existing OSD id', + 'default': None, + 'type': arg_validators.valid_osd_id, + }, + '--osd-fsid': { + 'help': 'Reuse an existing OSD fsid', + 'default': None, + }, + '--cluster-fsid': { + 'help': 'Specify the cluster fsid, useful when no ceph.conf is available', + 'default': None, + }, + '--crush-device-class': { + 'dest': 'crush_device_class', + 'help': 'Crush device class to assign this OSD to', + 'default': "", + }, + '--dmcrypt': { + 'action': 'store_true', + 'help': 'Enable device encryption via dm-crypt', + }, + '--no-systemd': { + 'dest': 'no_systemd', + 'action': 'store_true', + 'help': 'Skip creating and enabling systemd units and starting OSD services when activating', + }, +} + +bluestore_args = { + '--bluestore': { + 'action': 'store_true', + 'help': 'Use the bluestore objectstore', + }, + '--block.db': { + 'dest': 'block_db', + 'help': 'Path to bluestore block.db logical volume or device', + 'type': arg_validators.ValidDevice(as_string=True), + }, + '--block.db-size': { + 'dest': 'block_db_size', + 'help': 'Size of block.db LV in case device was passed in --block.db', + 'default': '0', + 'type': disk.Size.parse + }, + '--block.db-slots': { + 'dest': 'block_db_slots', + 'help': ('Intended number of slots on db device. The new OSD gets one' + 'of those slots or 1/nth of the available capacity'), + 'type': int, + 'default': 1, + }, + '--block.wal': { + 'dest': 'block_wal', + 'help': 'Path to bluestore block.wal logical volume or device', + 'type': arg_validators.ValidDevice(as_string=True), + }, + '--block.wal-size': { + 'dest': 'block_wal_size', + 'help': 'Size of block.wal LV in case device was passed in --block.wal', + 'default': '0', + 'type': disk.Size.parse + }, + '--block.wal-slots': { + 'dest': 'block_wal_slots', + 'help': ('Intended number of slots on wal device. 
def get_default_args():
    """
    Collect the default value of every option in ``common_args`` and
    ``bluestore_args``.

    Option flags are turned into attribute-style names: surrounding dashes
    are stripped and inner ``-`` and ``.`` become ``_``. Options that declare
    no ``default`` map to ``None``.

    :returns: dict of ``{name: default}``
    """
    def to_attribute_name(flag):
        return flag.strip('-').replace('-', '_').replace('.', '_')

    merged = {}
    for option_table in (common_args, bluestore_args):
        for flag, spec in option_table.items():
            merged[to_attribute_name(flag)] = spec.get('default', None)
    return merged
args.osd_fsid = system.generate_uuid() + prepare_step = Prepare([]) + prepare_step.safe_prepare(args) + osd_id = prepare_step.osd_id + try: + # we try this for activate only when 'creating' an OSD, because a rollback should not + # happen when doing normal activation. For example when starting an OSD, systemd will call + # activate, which would never need to be rolled back. + Activate([]).activate(args) + except Exception: + logger.exception('lvm activate was unable to complete, while creating the OSD') + logger.info('will rollback OSD ID creation') + rollback_osd(args, osd_id) + raise + terminal.success("ceph-volume lvm create successful for: %s" % args.data) + + def main(self): + sub_command_help = dedent(""" + Create an OSD by assigning an ID and FSID, registering them with the + cluster with an ID and FSID, formatting and mounting the volume, adding + all the metadata to the logical volumes using LVM tags, and starting + the OSD daemon. This is a convenience command that combines the prepare + and activate steps. + + Encryption is supported via dmcrypt and the --dmcrypt flag. + + Existing logical volume (lv): + + ceph-volume lvm create --data {vg/lv} + + Existing block device (a logical volume will be created): + + ceph-volume lvm create --data /path/to/device + + Optionally, can consume db and wal block devices, partitions or logical + volumes. 
class Deactivate(object):
    """
    CLI handler for ``ceph-volume lvm deactivate``.

    Parses an OSD id and/or OSD uuid from ``argv`` and hands them to
    ``deactivate_osd``.
    """

    help = 'Deactivate OSDs'

    def deactivate(self, args=None):
        """
        Deactivate the OSD described by ``args`` (or by ``self.args`` when
        ``args`` is not given).

        ``deactivate_osd`` signals "no matching LV" with ``StopIteration``;
        that is reported through the logger and the process exits with
        status 1.
        """
        if args:
            self.args = args
        try:
            deactivate_osd(self.args.osd_id, self.args.osd_uuid)
        except StopIteration:
            # Name whichever identifier the user actually supplied; the id is
            # None when only the uuid was passed.
            identifier = self.args.osd_id or self.args.osd_uuid
            logger.error(
                'No data or block LV found for OSD {}'.format(identifier))
            sys.exit(1)

    def __init__(self, argv):
        self.argv = argv

    def main(self):
        """
        Parse ``self.argv`` and run the deactivation.

        Prints the sub-command help and returns when no arguments were
        given. Raises ``ValueError`` when neither an OSD id nor an OSD uuid
        can be determined from the arguments.
        """
        sub_command_help = dedent("""
        Deactivate unmounts and OSDs tmpfs and closes any crypt devices.

            ceph-volume lvm deactivate {ID} {FSID}

        """)
        parser = argparse.ArgumentParser(
            prog='ceph-volume lvm deactivate',
            formatter_class=argparse.RawDescriptionHelpFormatter,
            description=sub_command_help,
        )

        parser.add_argument(
            'osd_id',
            nargs='?',
            help='The ID of the OSD'
        )
        parser.add_argument(
            'osd_uuid',
            nargs='?',
            help='The UUID of the OSD, similar to a SHA1, takes precedence over osd_id'
        )
        # parser.add_argument(
        #     '--all',
        #     action='store_true',
        #     help='Deactivate all OSD volumes found in the system',
        # )
        if len(self.argv) == 0:
            print(sub_command_help)
            return
        args = parser.parse_args(self.argv)
        # Both positionals are optional for argparse, so make sure at least
        # one of them identifies an OSD before going any further.
        if not args.osd_id and not args.osd_uuid:
            raise ValueError(('Can not identify OSD, pass either all or '
                              'osd_id or osd_uuid'))
        self.deactivate(args)
def pretty_report(report):
    """
    Print a human readable rendering of ``report``.

    ``report`` maps an OSD id to a list of device dictionaries. Each device
    contributes a header built from its ``type`` and ``path``, one line per
    entry in its ``tags`` (sorted), and a final ``devices`` line when the
    device lists any backing devices. The whole text is printed in a single
    call.
    """
    chunks = []
    emit = chunks.append
    for osd_id, devices in sorted(report.items()):
        emit(osd_list_header_template.format(osd_id=" osd.%s " % osd_id))
        for device in devices:
            bracketed_type = '[%s]' % device['type']
            emit(osd_device_header_template.format(
                type=bracketed_type,
                path=device['path'],
            ))
            device_tags = device.get('tags', {})
            for tag_name, value in sorted(device_tags.items()):
                emit(device_metadata_item_template.format(
                    tag_name=readable_tag(tag_name),
                    value=value,
                ))
            if device.get('devices'):
                emit(device_metadata_item_template.format(
                    tag_name='devices',
                    value=','.join(device['devices']),
                ))

    print(''.join(chunks))
def single_report(self, arg):
    """
    Generate a report for a single device. This can be either a logical
    volume in the form of vg/lv, a device with an absolute path like
    /dev/sda1 or /dev/sda, or a list of devices under same OSD ID.

    A name without a ``/`` is looked up as a plain logical volume name
    instead of failing with an ``IndexError``.

    Return value '{}' denotes failure.
    """
    if not isinstance(arg, int) and not arg:
        # nothing to look up (empty string); avoid indexing into it below
        return {}

    if isinstance(arg, int) or arg.isdigit():
        lv = api.get_lvs_from_osd_id(arg)
    elif arg.startswith('/'):
        lv = api.get_lvs_from_path(arg)
    else:
        # ``vg/lv``: the lookup is done on the lv name, which is the second
        # path component. Fall back to the whole argument when no volume
        # group prefix was given.
        name_parts = arg.split('/')
        lv_name = name_parts[1] if len(name_parts) > 1 else name_parts[0]
        lv = [api.get_single_lv(filters={'lv_name': lv_name})]

    report = self.create_report(lv)

    if not report:
        # check if device is a non-lvm journals or wal/db
        for dev_type in ['journal', 'wal', 'db']:
            lvs = api.get_lvs(tags={
                'ceph.{}_device'.format(dev_type): arg})
            if lvs:
                # just taking the first lv here should work
                lv = lvs[0]
                phys_dev = self.create_report_non_lv_device(lv)
                osd_id = lv.tags.get('ceph.osd_id')
                if osd_id:
                    report[osd_id] = [phys_dev]

    return report
+ + Full listing of all system devices associated with a cluster:: + + ceph-volume lvm list + + List devices under same OSD ID:: + + ceph-volume lvm list <OSD-ID> + + List a particular device, reporting all metadata about it:: + + ceph-volume lvm list /dev/sda1 + + List a logical volume, along with all its metadata (vg is a volume + group, and lv the logical volume name):: + + ceph-volume lvm list {vg/lv} + """) + parser = argparse.ArgumentParser( + prog='ceph-volume lvm list', + formatter_class=argparse.RawDescriptionHelpFormatter, + description=sub_command_help, + ) + + parser.add_argument( + 'device', + metavar='DEVICE', + nargs='?', + help='Path to an lv (as vg/lv) or to a device like /dev/sda1' + ) + + parser.add_argument( + '--format', + help='output format, defaults to "pretty"', + default='pretty', + choices=['json', 'pretty'], + ) + + args = parser.parse_args(self.argv) + self.list(args) diff --git a/src/ceph-volume/ceph_volume/devices/lvm/main.py b/src/ceph-volume/ceph_volume/devices/lvm/main.py new file mode 100644 index 000000000..39947454d --- /dev/null +++ b/src/ceph-volume/ceph_volume/devices/lvm/main.py @@ -0,0 +1,54 @@ +import argparse +from textwrap import dedent +from ceph_volume import terminal +from . import activate +from . import deactivate +from . import prepare +from . import create +from . import trigger +from . import listing +from . import zap +from . import batch +from . 
class LVM(object):
    """
    Dispatcher for the ``ceph-volume lvm`` sub-commands listed in ``mapper``.
    """

    help = 'Use LVM and LVM-based technologies to deploy OSDs'

    _help = dedent("""
    Use LVM and LVM-based technologies to deploy OSDs

    {sub_help}
    """)

    # sub-command name -> class implementing it
    mapper = dict([
        ('activate', activate.Activate),
        ('deactivate', deactivate.Deactivate),
        ('batch', batch.Batch),
        ('prepare', prepare.Prepare),
        ('create', create.Create),
        ('trigger', trigger.Trigger),
        ('list', listing.List),
        ('zap', zap.Zap),
        ('migrate', migrate.Migrate),
        ('new-wal', migrate.NewWAL),
        ('new-db', migrate.NewDB),
    ])

    def __init__(self, argv):
        self.argv = argv

    def print_help(self, sub_help):
        """Return the help text with ``sub_help`` filled in."""
        return self._help.format(sub_help=sub_help)

    def main(self):
        """
        Hand ``argv`` to the sub-command dispatcher, then parse it with a
        parser whose description lists the sub-commands. The help is printed
        when at most one argument was given.
        """
        terminal.dispatch(self.mapper, self.argv)
        description = self.print_help(terminal.subhelp(self.mapper))
        parser = argparse.ArgumentParser(
            prog='ceph-volume lvm',
            formatter_class=argparse.RawDescriptionHelpFormatter,
            description=description,
        )
        parser.parse_args(self.argv)
        if len(self.argv) > 1:
            return None
        return parser.print_help()
def find_associated_devices(osd_id, osd_fsid):
    """
    Look up the LVs tagged with the given ``osd_id`` and ``osd_fsid`` and
    return the devices backing that OSD as a list of ``(Device, type)``
    tuples, as resolved by ``ensure_associated_lvs``. Entries without a
    path are dropped.

    Logs an error and raises ``SystemExit`` when no LV matches the tags.
    """
    lv_tags = {'ceph.osd_id': osd_id, 'ceph.osd_fsid': osd_fsid}

    matching_lvs = api.get_lvs(tags=lv_tags)
    if not matching_lvs:
        message = 'Unable to find any LV for source OSD: id:{} fsid:{}'.format(
            osd_id, osd_fsid)
        mlogger.error(message)
        raise SystemExit('Unexpected error, terminating')

    associated = []
    # the set removes (path, type) pairs that were reported more than once
    for path, dev_type in set(ensure_associated_lvs(matching_lvs, lv_tags)):
        if not path:
            continue
        associated.append((Device(path), dev_type))
    return associated
class VolumeTagTracker(object):
    """
    Keeps the LVM tags of an OSD's volumes consistent while a target LV is
    attached to it, and remembers the tags seen at construction time so
    that ``undo`` can put them back.

    ``devices`` is an iterable of ``(device, type)`` pairs where type is one
    of ``block``, ``db`` or ``wal``; ``target_lv`` is the LV being attached.
    A data (``block``) device that is an LV is mandatory, otherwise
    ``SystemExit`` is raised.
    """

    def __init__(self, devices, target_lv):
        self.target_lv = target_lv
        self.data_device = self.db_device = self.wal_device = None
        for device, dev_type in devices:
            if dev_type == 'block':
                self.data_device = device
            elif dev_type == 'db':
                self.db_device = device
            elif dev_type == 'wal':
                self.wal_device = device
        if not self.data_device:
            mlogger.error('Data device not found')
            raise SystemExit(
                "Unexpected error, terminating")
        if not self.data_device.is_lv:
            mlogger.error('Data device isn\'t LVM')
            raise SystemExit(
                "Unexpected error, terminating")

        self.old_target_tags = self.target_lv.tags.copy()
        self.old_data_tags = self._snapshot_tags(self.data_device)
        self.old_db_tags = self._snapshot_tags(self.db_device)
        self.old_wal_tags = self._snapshot_tags(self.wal_device)

    @staticmethod
    def _snapshot_tags(device):
        """Return a copy of the device's LV tags, or None if it is not an LV."""
        if device and device.is_lv:
            return device.lv_api.tags.copy()
        return None

    @staticmethod
    def _restore_tags(device, old_tags):
        """
        Put ``old_tags`` back on ``device``; clear its tags when there were
        none. Devices that are absent or not LVs carry no LVM tags and are
        left alone.
        """
        if not device or not device.is_lv:
            return
        if old_tags:
            device.lv_api.set_tags(old_tags)
        else:
            device.lv_api.clear_tags()

    def _target_reference_tags(self, dev_type):
        """Tags that point a volume at the target LV for ``dev_type``."""
        return {
            "ceph.{}_uuid".format(dev_type): self.target_lv.lv_uuid,
            "ceph.{}_device".format(dev_type): self.target_lv.lv_path,
        }

    def update_tags_when_lv_create(self, create_type):
        """
        Record the target LV as the new ``create_type`` (``db`` or ``wal``)
        volume: the data LV and the other auxiliary LV (if any) get the
        uuid/device tags, and the target LV receives the data LV's tags with
        ``ceph.type`` set to ``create_type``.
        """
        if not self.data_device.is_lv:
            mlogger.warning(
                'Data device is not LVM, wouldn\'t update LVM tags')
        else:
            self.data_device.lv_api.set_tags(
                self._target_reference_tags(create_type))

            tags = self.data_device.lv_api.tags.copy()
            tags["ceph.type"] = create_type
            self.target_lv.set_tags(tags)

        # the "other" auxiliary volume also has to learn about the new one
        if create_type == "db" and self.wal_device:
            aux_dev, aux_type = self.wal_device, "wal"
        elif create_type == "wal" and self.db_device:
            aux_dev, aux_type = self.db_device, "db"
        else:
            return
        if not aux_dev.is_lv:
            # report the auxiliary device here, not the type being created
            mlogger.warning(
                '{} device is not LVM, wouldn\'t update LVM tags'.format(
                    aux_type.upper()))
        else:
            aux_dev.lv_api.set_tags(self._target_reference_tags(create_type))

    def remove_lvs(self, source_devices, target_type):
        """
        Drop the tags of every source device other than the block device
        and the ``target_type`` device, and remove the references to them
        from the volumes that remain.
        """
        remaining_devices = [self.data_device, self.db_device, self.wal_device]

        outdated_tags = []
        for device, dev_type in source_devices:
            if dev_type == "block" or dev_type == target_type:
                continue
            remaining_devices.remove(device)
            if device.is_lv:
                outdated_tags.append("ceph.{}_uuid".format(dev_type))
                outdated_tags.append("ceph.{}_device".format(dev_type))
                device.lv_api.clear_tags()
        if outdated_tags:
            for remaining in remaining_devices:
                if remaining and remaining.is_lv:
                    remaining.lv_api.clear_tags(outdated_tags)

    def replace_lvs(self, source_devices, target_type):
        """
        Drop the tags of every non-block source device, point the remaining
        volumes at the target LV as the new ``target_type`` volume, and tag
        the target LV accordingly.
        """
        remaining_devices = [self.data_device]
        if self.db_device:
            remaining_devices.append(self.db_device)
        if self.wal_device:
            remaining_devices.append(self.wal_device)

        outdated_tags = []
        for device, dev_type in source_devices:
            if dev_type == "block":
                continue
            remaining_devices.remove(device)
            if device.is_lv:
                outdated_tags.append("ceph.{}_uuid".format(dev_type))
                outdated_tags.append("ceph.{}_device".format(dev_type))
                device.lv_api.clear_tags()

        new_tags = self._target_reference_tags(target_type)

        for remaining in remaining_devices:
            if remaining and remaining.is_lv:
                if outdated_tags:
                    remaining.lv_api.clear_tags(outdated_tags)
                remaining.lv_api.set_tags(new_tags)

        if self.data_device.is_lv:
            tags = self.data_device.lv_api.tags.copy()
        else:
            mlogger.warning(
                'Data device is not LVM, wouldn\'t properly update target LVM tags')
            # start from an empty set of tags so the assignments below do
            # not hit an unbound name
            tags = {}

        tags["ceph.type"] = target_type
        tags.update(new_tags)
        self.target_lv.set_tags(tags)

    def undo(self):
        """
        Restore the tags remembered at construction time on the data, db,
        wal and target volumes. Volumes that had no tags get theirs
        cleared.
        """
        mlogger.info(
            'Undoing lv tag set')
        self._restore_tags(self.data_device, self.old_data_tags)
        self._restore_tags(self.db_device, self.old_db_tags)
        self._restore_tags(self.wal_device, self.old_wal_tags)
        if self.old_target_tags:
            self.target_lv.set_tags(self.old_target_tags)
        else:
            self.target_lv.clear_tags()
def get_source_devices(self, devices, target_type=""):
    """
    Select the devices to migrate data from.

    ``devices`` is an iterable of ``(device, type)`` pairs. A pair is kept
    unless its type equals ``target_type`` or its type (``block``, ``db``,
    ``wal``) was not requested through ``--from`` (where the block device
    is called ``data``). Pairs of any other type are kept as they are.

    Returns a list of ``[device, type]`` lists. Logs an error and raises
    ``SystemExit`` when nothing is left to migrate from.
    """
    # device type -> the name it goes by in the --from option
    from_names = {'block': 'data', 'db': 'db', 'wal': 'wal'}

    sources = []
    for device, dev_type in devices:
        if dev_type == target_type:
            continue
        from_name = from_names.get(dev_type)
        if from_name is not None and from_name not in self.args.from_:
            continue
        sources.append([device, dev_type])
    if not sources:
        mlogger.error('Source device list is empty')
        raise SystemExit(
            'Unable to migrate to : {}'.format(self.args.target))
    return sources
def close_encrypted(self, source_devices):
    """
    Close the dmcrypt mapping of every ``db``/``wal`` source device.

    ``source_devices`` is an iterable of ``(device, type)`` pairs; entries
    of any other type are ignored. The mapping name is the LV uuid, so a
    device that is not a logical volume is skipped with a warning.
    """
    # close source device(-s) if they're encrypted and have been removed
    for device, dev_type in source_devices:
        if dev_type not in ('db', 'wal'):
            continue
        if not device.is_lv:
            # NOTE(review): other code in this module only touches lv_api
            # after checking is_lv, so presumably it is unusable here;
            # confirm how the mapping of an encrypted partition is named.
            logger.warning(
                "{} device is not LVM, not closing its dmcrypt volume".format(
                    dev_type.upper()))
            continue
        mapping = device.lv_api.lv_uuid
        logger.info("closing dmcrypt volume {}".format(mapping))
        encryption_utils.dmcrypt_close(
            mapping=mapping, skip_path_check=True)
process.call([ + 'ceph-bluestore-tool', + '--path', + osd_path, + '--dev-target', + target_path, + '--command', + 'bluefs-bdev-migrate'] + + source_args) + if exit_code != 0: + mlogger.error( + 'Failed to migrate device, error code:{}'.format(exit_code)) + raise SystemExit( + 'Failed to migrate to : {}'.format(self.args.target)) + + system.chown(os.path.join(osd_path, "block.{}".format( + target_type))) + if tag_tracker.data_device.lv_api.encrypted: + self.close_encrypted(source_devices) + terminal.success('Migration successful.') + + except: + tag_tracker.undo() + raise + + return + + @decorators.needs_root + def migrate_to_existing(self, osd_id, osd_fsid, devices, target_lv): + target_type = target_lv.tags["ceph.type"] + if target_type == "wal": + mlogger.error("Migrate to WAL is not supported") + raise SystemExit( + "Unable to migrate to : {}".format(self.args.target)) + target_filename = self.get_filename_by_type(target_type) + if (target_filename == ""): + mlogger.error( + "Target Logical Volume doesn't have proper volume type " + "(ceph.type LVM tag): {}".format(target_type)) + raise SystemExit( + "Unable to migrate to : {}".format(self.args.target)) + + osd_path = get_osd_path(osd_id, osd_fsid) + source_devices = self.get_source_devices(devices, target_type) + target_path = os.path.join(osd_path, target_filename) + tag_tracker = VolumeTagTracker(devices, target_lv) + + try: + # ceph-bluestore-tool removes source volume(s) other than + # block and target ones after successful migration + tag_tracker.remove_lvs(source_devices, target_type) + source_args = self.get_source_args(osd_path, source_devices) + mlogger.info("Migrate to existing, Source: {} Target: {}".format( + source_args, target_path)) + stdout, stderr, exit_code = process.call([ + 'ceph-bluestore-tool', + '--path', + osd_path, + '--dev-target', + target_path, + '--command', + 'bluefs-bdev-migrate'] + + source_args) + if exit_code != 0: + mlogger.error( + 'Failed to migrate device, error 
def make_parser(self, prog, sub_command_help):
    """
    Build the argument parser for the migrate sub-command.

    ``prog`` is the program name shown in usage messages and
    ``sub_command_help`` the (pre-formatted) description. The parser is
    returned without parsing anything.
    """
    parser = argparse.ArgumentParser(
        prog=prog,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=sub_command_help,
    )

    parser.add_argument(
        '--osd-id',
        required=True,
        help='Specify an OSD ID to detect associated devices for migration',
        type=valid_osd_id
    )

    parser.add_argument(
        '--osd-fsid',
        required=True,
        help='Specify an OSD FSID to detect associated devices for migration',
    )
    parser.add_argument(
        '--target',
        required=True,
        help='Specify target Logical Volume (LV) to migrate data to',
    )
    parser.add_argument(
        '--from',
        nargs='*',
        dest='from_',
        required=True,
        choices=['data', 'db', 'wal'],
        help='Source device type(s) to copy BlueFS data from',
    )
    parser.add_argument(
        '--no-systemd',
        dest='no_systemd',
        action='store_true',
        help='Skip checking OSD systemd unit',
    )
    return parser
def make_parser(self, prog, sub_command_help):
    """
    Build the argument parser shared by the new-volume sub-commands.

    ``prog`` is the program name shown in usage messages and
    ``sub_command_help`` the (pre-formatted) description. The parser is
    returned without parsing anything.
    """
    parser = argparse.ArgumentParser(
        prog=prog,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=sub_command_help,
    )

    parser.add_argument(
        '--osd-id',
        required=True,
        help='Specify an OSD ID to attach new volume to',
        type=valid_osd_id,
    )

    parser.add_argument(
        '--osd-fsid',
        required=True,
        help='Specify an OSD FSID to attach new volume to',
    )
    parser.add_argument(
        '--target',
        required=True,
        help='Specify target Logical Volume (LV) to attach',
    )
    parser.add_argument(
        '--no-systemd',
        dest='no_systemd',
        action='store_true',
        help='Skip checking OSD systemd unit',
    )
    return parser
Logical Volume'.format( + self.args.target)) + raise SystemExit( + 'Unable to attach new volume : {}'.format(self.args.target)) + if target_lv.used_by_ceph: + mlogger.error( + 'Target Logical Volume is already used by ceph: {}'.format( + self.args.target)) + raise SystemExit( + 'Unable to attach new volume : {}'.format(self.args.target)) + else: + devices = find_associated_devices(self.args.osd_id, + self.args.osd_fsid) + self.make_new_volume( + self.args.osd_id, + self.args.osd_fsid, + devices, + target_lv) + +class NewWAL(NewVolume): + + help = 'Allocate new WAL volume for OSD at specified Logical Volume' + + def __init__(self, argv): + super(NewWAL, self).__init__("wal", argv) + + def main(self): + sub_command_help = dedent(""" + Attaches the given logical volume to the given OSD as a WAL volume. + Logical volume format is vg/lv. Fails if OSD has already got attached DB. + + Example: + + Attach vgname/lvname as a WAL volume to OSD 1 + + ceph-volume lvm new-wal --osd-id 1 --osd-fsid 55BD4219-16A7-4037-BC20-0F158EFCC83D --target vgname/new_wal + """) + parser = self.make_parser('ceph-volume lvm new-wal', sub_command_help) + + if len(self.argv) == 0: + print(sub_command_help) + return + + self.args = parser.parse_args(self.argv) + + self.new_volume() + +class NewDB(NewVolume): + + help = 'Allocate new DB volume for OSD at specified Logical Volume' + + def __init__(self, argv): + super(NewDB, self).__init__("db", argv) + + def main(self): + sub_command_help = dedent(""" + Attaches the given logical volume to the given OSD as a DB volume. + Logical volume format is vg/lv. Fails if OSD has already got attached DB. 
+ + Example: + + Attach vgname/lvname as a DB volume to OSD 1 + + ceph-volume lvm new-db --osd-id 1 --osd-fsid 55BD4219-16A7-4037-BC20-0F158EFCC83D --target vgname/new_db + """) + + parser = self.make_parser('ceph-volume lvm new-db', sub_command_help) + if len(self.argv) == 0: + print(sub_command_help) + return + self.args = parser.parse_args(self.argv) + + self.new_volume() diff --git a/src/ceph-volume/ceph_volume/devices/lvm/prepare.py b/src/ceph-volume/ceph_volume/devices/lvm/prepare.py new file mode 100644 index 000000000..85c8a1467 --- /dev/null +++ b/src/ceph-volume/ceph_volume/devices/lvm/prepare.py @@ -0,0 +1,327 @@ +from __future__ import print_function +import json +import logging +from textwrap import dedent +from ceph_volume.util import prepare as prepare_utils +from ceph_volume.util import encryption as encryption_utils +from ceph_volume.util import system, disk +from ceph_volume.util.arg_validators import exclude_group_options +from ceph_volume import conf, decorators, terminal +from ceph_volume.api import lvm as api +from .common import prepare_parser, rollback_osd + + +logger = logging.getLogger(__name__) + + +def prepare_dmcrypt(key, device, device_type, tags): + """ + Helper for devices that are encrypted. The operations needed for + block, db, wal devices are all the same + """ + if not device: + return '' + tag_name = 'ceph.%s_uuid' % device_type + uuid = tags[tag_name] + return encryption_utils.prepare_dmcrypt(key, device, uuid) + +def prepare_bluestore(block, wal, db, secrets, tags, osd_id, fsid): + """ + :param block: The name of the logical volume for the bluestore data + :param wal: a regular/plain disk or logical volume, to be used for block.wal + :param db: a regular/plain disk or logical volume, to be used for block.db + :param secrets: A dict with the secrets needed to create the osd (e.g. 
cephx) + :param osd_id: The OSD id + :param fsid: The OSD fsid, also known as the OSD UUID + """ + cephx_secret = secrets.get('cephx_secret', prepare_utils.create_key()) + # encryption-only operations + if secrets.get('dmcrypt_key'): + # If encrypted, there is no need to create the lockbox keyring file because + # bluestore re-creates the files and does not have support for other files + # like the custom lockbox one. This will need to be done on activation. + # format and open ('decrypt' devices) and re-assign the block, wal and db + # variables so that the rest of the process can use the mapper paths + key = secrets['dmcrypt_key'] + block = prepare_dmcrypt(key, block, 'block', tags) + wal = prepare_dmcrypt(key, wal, 'wal', tags) + db = prepare_dmcrypt(key, db, 'db', tags) + + # create the directory + prepare_utils.create_osd_path(osd_id, tmpfs=True) + # symlink the block + prepare_utils.link_block(block, osd_id) + # get the latest monmap + prepare_utils.get_monmap(osd_id) + # write the OSD keyring if it doesn't exist already + prepare_utils.write_keyring(osd_id, cephx_secret) + # prepare the osd filesystem + prepare_utils.osd_mkfs_bluestore( + osd_id, fsid, + keyring=cephx_secret, + wal=wal, + db=db + ) + + +class Prepare(object): + + help = 'Format an LVM device and associate it with an OSD' + + def __init__(self, argv): + self.argv = argv + self.osd_id = None + + def get_ptuuid(self, argument): + uuid = disk.get_partuuid(argument) + if not uuid: + terminal.error('blkid could not detect a PARTUUID for device: %s' % argument) + raise RuntimeError('unable to use device') + return uuid + + def setup_device(self, device_type, device_name, tags, size, slots): + """ + Check if ``device_name`` is an lv, if so, set the tags, making sure to + update the tags with the lv_uuid and lv_path which the incoming tags + will not have. 
+ + If the device is not a logical volume, then retrieve the partition UUID + by querying ``blkid`` + """ + if device_name is None: + return '', '', tags + tags['ceph.type'] = device_type + tags['ceph.vdo'] = api.is_vdo(device_name) + + try: + vg_name, lv_name = device_name.split('/') + lv = api.get_single_lv(filters={'lv_name': lv_name, + 'vg_name': vg_name}) + except ValueError: + lv = None + + if lv: + lv_uuid = lv.lv_uuid + path = lv.lv_path + tags['ceph.%s_uuid' % device_type] = lv_uuid + tags['ceph.%s_device' % device_type] = path + lv.set_tags(tags) + elif disk.is_device(device_name): + # We got a disk, create an lv + lv_type = "osd-{}".format(device_type) + name_uuid = system.generate_uuid() + kwargs = { + 'device': device_name, + 'tags': tags, + 'slots': slots + } + #TODO use get_block_db_size and co here to get configured size in + #conf file + if size != 0: + kwargs['size'] = size + lv = api.create_lv( + lv_type, + name_uuid, + **kwargs) + path = lv.lv_path + tags['ceph.{}_device'.format(device_type)] = path + tags['ceph.{}_uuid'.format(device_type)] = lv.lv_uuid + lv_uuid = lv.lv_uuid + lv.set_tags(tags) + else: + # otherwise assume this is a regular disk partition + name_uuid = self.get_ptuuid(device_name) + path = device_name + tags['ceph.%s_uuid' % device_type] = name_uuid + tags['ceph.%s_device' % device_type] = path + lv_uuid = name_uuid + return path, lv_uuid, tags + + def prepare_data_device(self, device_type, osd_uuid): + """ + Check if ``arg`` is a device or partition to create an LV out of it + with a distinct volume group name, assigning LV tags on it and + ultimately, returning the logical volume object. Failing to detect + a device or partition will result in error. 
+ + :param arg: The value of ``--data`` when parsing args + :param device_type: Usually ``block`` + :param osd_uuid: The OSD uuid + """ + device = self.args.data + if disk.is_partition(device) or disk.is_device(device): + # we must create a vg, and then a single lv + lv_name_prefix = "osd-{}".format(device_type) + kwargs = {'device': device, + 'tags': {'ceph.type': device_type}, + 'slots': self.args.data_slots, + } + logger.debug('data device size: {}'.format(self.args.data_size)) + if self.args.data_size != 0: + kwargs['size'] = self.args.data_size + return api.create_lv( + lv_name_prefix, + osd_uuid, + **kwargs) + else: + error = [ + 'Cannot use device ({}).'.format(device), + 'A vg/lv path or an existing device is needed'] + raise RuntimeError(' '.join(error)) + + raise RuntimeError('no data logical volume found with: {}'.format(device)) + + def safe_prepare(self, args=None): + """ + An intermediate step between `main()` and `prepare()` so that we can + capture the `self.osd_id` in case we need to rollback + + :param args: Injected args, usually from `lvm create` which compounds + both `prepare` and `create` + """ + if args is not None: + self.args = args + + try: + vgname, lvname = self.args.data.split('/') + lv = api.get_single_lv(filters={'lv_name': lvname, + 'vg_name': vgname}) + except ValueError: + lv = None + + if api.is_ceph_device(lv): + logger.info("device {} is already used".format(self.args.data)) + raise RuntimeError("skipping {}, it is already prepared".format(self.args.data)) + try: + self.prepare() + except Exception: + logger.exception('lvm prepare was unable to complete') + logger.info('will rollback OSD ID creation') + rollback_osd(self.args, self.osd_id) + raise + terminal.success("ceph-volume lvm prepare successful for: %s" % self.args.data) + + def get_cluster_fsid(self): + """ + Allows using --cluster-fsid as an argument, but can fallback to reading + from ceph.conf if that is unset (the default behavior). 
+ """ + if self.args.cluster_fsid: + return self.args.cluster_fsid + else: + return conf.ceph.get('global', 'fsid') + + @decorators.needs_root + def prepare(self): + # FIXME we don't allow re-using a keyring, we always generate one for the + # OSD, this needs to be fixed. This could either be a file (!) or a string + # (!!) or some flags that we would need to compound into a dict so that we + # can convert to JSON (!!!) + secrets = {'cephx_secret': prepare_utils.create_key()} + cephx_lockbox_secret = '' + encrypted = 1 if self.args.dmcrypt else 0 + cephx_lockbox_secret = '' if not encrypted else prepare_utils.create_key() + + if encrypted: + secrets['dmcrypt_key'] = encryption_utils.create_dmcrypt_key() + secrets['cephx_lockbox_secret'] = cephx_lockbox_secret + + cluster_fsid = self.get_cluster_fsid() + + osd_fsid = self.args.osd_fsid or system.generate_uuid() + crush_device_class = self.args.crush_device_class + if crush_device_class: + secrets['crush_device_class'] = crush_device_class + # reuse a given ID if it exists, otherwise create a new ID + self.osd_id = prepare_utils.create_id(osd_fsid, json.dumps(secrets), osd_id=self.args.osd_id) + tags = { + 'ceph.osd_fsid': osd_fsid, + 'ceph.osd_id': self.osd_id, + 'ceph.cluster_fsid': cluster_fsid, + 'ceph.cluster_name': conf.cluster, + 'ceph.crush_device_class': crush_device_class, + 'ceph.osdspec_affinity': prepare_utils.get_osdspec_affinity() + } + if self.args.bluestore: + try: + vg_name, lv_name = self.args.data.split('/') + block_lv = api.get_single_lv(filters={'lv_name': lv_name, + 'vg_name': vg_name}) + except ValueError: + block_lv = None + + if not block_lv: + block_lv = self.prepare_data_device('block', osd_fsid) + + tags['ceph.block_device'] = block_lv.lv_path + tags['ceph.block_uuid'] = block_lv.lv_uuid + tags['ceph.cephx_lockbox_secret'] = cephx_lockbox_secret + tags['ceph.encrypted'] = encrypted + tags['ceph.vdo'] = api.is_vdo(block_lv.lv_path) + + wal_device, wal_uuid, tags = self.setup_device( + 
'wal', + self.args.block_wal, + tags, + self.args.block_wal_size, + self.args.block_wal_slots) + db_device, db_uuid, tags = self.setup_device( + 'db', + self.args.block_db, + tags, + self.args.block_db_size, + self.args.block_db_slots) + + tags['ceph.type'] = 'block' + block_lv.set_tags(tags) + + prepare_bluestore( + block_lv.lv_path, + wal_device, + db_device, + secrets, + tags, + self.osd_id, + osd_fsid, + ) + + def main(self): + sub_command_help = dedent(""" + Prepare an OSD by assigning an ID and FSID, registering them with the + cluster with an ID and FSID, formatting and mounting the volume, and + finally by adding all the metadata to the logical volumes using LVM + tags, so that it can later be discovered. + + Once the OSD is ready, an ad-hoc systemd unit will be enabled so that + it can later get activated and the OSD daemon can get started. + + Encryption is supported via dmcrypt and the --dmcrypt flag. + + Existing logical volume (lv): + + ceph-volume lvm prepare --data {vg/lv} + + Existing block device (a logical volume will be created): + + ceph-volume lvm prepare --data /path/to/device + + Optionally, can consume db and wal devices, partitions or logical + volumes. 
A device will get a logical volume, partitions and existing + logical volumes will be used as is: + + ceph-volume lvm prepare --data {vg/lv} --block.wal {partition} --block.db {/path/to/device} + """) + parser = prepare_parser( + prog='ceph-volume lvm prepare', + description=sub_command_help, + ) + if len(self.argv) == 0: + print(sub_command_help) + return + exclude_group_options(parser, argv=self.argv, groups=['bluestore']) + self.args = parser.parse_args(self.argv) + # Default to bluestore here since defaulting it in add_argument may + # cause both to be True + if not self.args.bluestore: + self.args.bluestore = True + self.safe_prepare() diff --git a/src/ceph-volume/ceph_volume/devices/lvm/trigger.py b/src/ceph-volume/ceph_volume/devices/lvm/trigger.py new file mode 100644 index 000000000..dc57011df --- /dev/null +++ b/src/ceph-volume/ceph_volume/devices/lvm/trigger.py @@ -0,0 +1,70 @@ +from __future__ import print_function +import argparse +from textwrap import dedent +from ceph_volume.exceptions import SuffixParsingError +from ceph_volume import decorators +from .activate import Activate + + +def parse_osd_id(string): + osd_id = string.split('-', 1)[0] + if not osd_id: + raise SuffixParsingError('OSD id', string) + if osd_id.isdigit(): + return osd_id + raise SuffixParsingError('OSD id', string) + + +def parse_osd_uuid(string): + osd_id = '%s-' % parse_osd_id(string) + # remove the id first + osd_uuid = string.split(osd_id, 1)[-1] + if not osd_uuid: + raise SuffixParsingError('OSD uuid', string) + return osd_uuid + + +class Trigger(object): + + help = 'systemd helper to activate an OSD' + + def __init__(self, argv): + self.argv = argv + + @decorators.needs_root + def main(self): + sub_command_help = dedent(""" + ** DO NOT USE DIRECTLY ** + This tool is meant to help the systemd unit that knows about OSDs. 
+ + Proxy OSD activation to ``ceph-volume lvm activate`` by parsing the + input from systemd, detecting the UUID and ID associated with an OSD:: + + ceph-volume lvm trigger {SYSTEMD-DATA} + + The systemd "data" is expected to be in the format of:: + + {OSD ID}-{OSD UUID} + + The lvs associated with the OSD need to have been prepared previously, + so that all needed tags and metadata exist. + """) + parser = argparse.ArgumentParser( + prog='ceph-volume lvm trigger', + formatter_class=argparse.RawDescriptionHelpFormatter, + description=sub_command_help, + ) + + parser.add_argument( + 'systemd_data', + metavar='SYSTEMD_DATA', + nargs='?', + help='Data from a systemd unit containing ID and UUID of the OSD, like asdf-lkjh-0' + ) + if len(self.argv) == 0: + print(sub_command_help) + return + args = parser.parse_args(self.argv) + osd_id = parse_osd_id(args.systemd_data) + osd_uuid = parse_osd_uuid(args.systemd_data) + Activate(['--auto-detect-objectstore', osd_id, osd_uuid]).main() diff --git a/src/ceph-volume/ceph_volume/devices/lvm/zap.py b/src/ceph-volume/ceph_volume/devices/lvm/zap.py new file mode 100644 index 000000000..d4d78ad01 --- /dev/null +++ b/src/ceph-volume/ceph_volume/devices/lvm/zap.py @@ -0,0 +1,405 @@ +import argparse +import os +import logging +import time + +from textwrap import dedent + +from ceph_volume import decorators, terminal, process +from ceph_volume.api import lvm as api +from ceph_volume.util import system, encryption, disk, arg_validators, str_to_int, merge_dict +from ceph_volume.util.device import Device +from ceph_volume.systemd import systemctl + +logger = logging.getLogger(__name__) +mlogger = terminal.MultiLogger(__name__) + + +def wipefs(path): + """ + Removes the filesystem from an lv or partition. 
+ + Environment variables supported:: + + * ``CEPH_VOLUME_WIPEFS_TRIES``: Defaults to 8 + * ``CEPH_VOLUME_WIPEFS_INTERVAL``: Defaults to 5 + + """ + tries = str_to_int( + os.environ.get('CEPH_VOLUME_WIPEFS_TRIES', 8) + ) + interval = str_to_int( + os.environ.get('CEPH_VOLUME_WIPEFS_INTERVAL', 5) + ) + + for trying in range(tries): + stdout, stderr, exit_code = process.call([ + 'wipefs', + '--all', + path + ]) + if exit_code != 0: + # this could narrow the retry by poking in the stderr of the output + # to verify that 'probing initialization failed' appears, but + # better to be broad in this retry to prevent missing on + # a different message that needs to be retried as well + terminal.warning( + 'failed to wipefs device, will try again to workaround probable race condition' + ) + time.sleep(interval) + else: + return + raise RuntimeError("could not complete wipefs on device: %s" % path) + + +def zap_data(path): + """ + Clears all data from the given path. Path should be + an absolute path to an lv or partition. + + 10M of data is written to the path to make sure that + there is no trace left of any previous Filesystem. + """ + process.run([ + 'dd', + 'if=/dev/zero', + 'of={path}'.format(path=path), + 'bs=1M', + 'count=10', + 'conv=fsync' + ]) + + +def find_associated_devices(osd_id=None, osd_fsid=None): + """ + From an ``osd_id`` and/or an ``osd_fsid``, filter out all the LVs in the + system that match those tag values, further detect if any partitions are + part of the OSD, and then return the set of LVs and partitions (if any). 
+ """ + lv_tags = {} + if osd_id: + lv_tags['ceph.osd_id'] = osd_id + if osd_fsid: + lv_tags['ceph.osd_fsid'] = osd_fsid + + lvs = api.get_lvs(tags=lv_tags) + if not lvs: + raise RuntimeError('Unable to find any LV for zapping OSD: ' + '%s' % osd_id or osd_fsid) + + devices_to_zap = ensure_associated_lvs(lvs, lv_tags) + return [Device(path) for path in set(devices_to_zap) if path] + + +def ensure_associated_lvs(lvs, lv_tags={}): + """ + Go through each LV and ensure if backing devices (journal, wal, block) + are LVs or partitions, so that they can be accurately reported. + """ + # look for many LVs for each backing type, because it is possible to + # receive a filtering for osd.1, and have multiple failed deployments + # leaving many journals with osd.1 - usually, only a single LV will be + # returned + + db_lvs = api.get_lvs(tags=merge_dict(lv_tags, {'ceph.type': 'db'})) + wal_lvs = api.get_lvs(tags=merge_dict(lv_tags, {'ceph.type': 'wal'})) + backing_devices = [(db_lvs, 'db'), + (wal_lvs, 'wal')] + + verified_devices = [] + + for lv in lvs: + # go through each lv and append it, otherwise query `blkid` to find + # a physical device. 
Do this for each type (db, wal) regardless + # if they have been processed in the previous LV, so that bad devices + # with the same ID can be caught + for ceph_lvs, _type in backing_devices: + if ceph_lvs: + verified_devices.extend([l.lv_path for l in ceph_lvs]) + continue + + # must be a disk partition, by querying blkid by the uuid we are + # ensuring that the device path is always correct + try: + device_uuid = lv.tags['ceph.%s_uuid' % _type] + except KeyError: + # Bluestore will not have ceph.journal_uuid, and Filestore + # will not have ceph.db_uuid + continue + + osd_device = disk.get_device_from_partuuid(device_uuid) + if not osd_device: + # if the osd_device is not found by the partuuid, then it is + # not possible to ensure this device exists anymore, so skip it + continue + verified_devices.append(osd_device) + + verified_devices.append(lv.lv_path) + + # reduce the list from all the duplicates that were added + return list(set(verified_devices)) + + +class Zap(object): + + help = 'Removes all data and filesystems from a logical volume or partition.' 
+ + def __init__(self, argv): + self.argv = argv + + def unmount_lv(self, lv): + if lv.tags.get('ceph.cluster_name') and lv.tags.get('ceph.osd_id'): + lv_path = "/var/lib/ceph/osd/{}-{}".format(lv.tags['ceph.cluster_name'], lv.tags['ceph.osd_id']) + else: + lv_path = lv.lv_path + dmcrypt_uuid = lv.lv_uuid + dmcrypt = lv.encrypted + if system.path_is_mounted(lv_path): + mlogger.info("Unmounting %s", lv_path) + system.unmount(lv_path) + if dmcrypt and dmcrypt_uuid: + self.dmcrypt_close(dmcrypt_uuid) + + def zap_lv(self, device): + """ + Device examples: vg-name/lv-name, /dev/vg-name/lv-name + Requirements: Must be a logical volume (LV) + """ + lv = api.get_single_lv(filters={'lv_name': device.lv_name, 'vg_name': + device.vg_name}) + self.unmount_lv(lv) + + wipefs(device.path) + zap_data(device.path) + + if self.args.destroy: + lvs = api.get_lvs(filters={'vg_name': device.vg_name}) + if lvs == []: + mlogger.info('No LVs left, exiting', device.vg_name) + return + elif len(lvs) <= 1: + mlogger.info('Only 1 LV left in VG, will proceed to destroy ' + 'volume group %s', device.vg_name) + pvs = api.get_pvs(filters={'lv_uuid': lv.lv_uuid}) + api.remove_vg(device.vg_name) + for pv in pvs: + api.remove_pv(pv.pv_name) + else: + mlogger.info('More than 1 LV left in VG, will proceed to ' + 'destroy LV only') + mlogger.info('Removing LV because --destroy was given: %s', + device.path) + api.remove_lv(device.path) + elif lv: + # just remove all lvm metadata, leaving the LV around + lv.clear_tags() + + def zap_partition(self, device): + """ + Device example: /dev/sda1 + Requirements: Must be a partition + """ + if device.is_encrypted: + # find the holder + holders = [ + '/dev/%s' % holder for holder in device.sys_api.get('holders', []) + ] + for mapper_uuid in os.listdir('/dev/mapper'): + mapper_path = os.path.join('/dev/mapper', mapper_uuid) + if os.path.realpath(mapper_path) in holders: + self.dmcrypt_close(mapper_uuid) + + if system.device_is_mounted(device.path): + 
mlogger.info("Unmounting %s", device.path) + system.unmount(device.path) + + wipefs(device.path) + zap_data(device.path) + + if self.args.destroy: + mlogger.info("Destroying partition since --destroy was used: %s" % device.path) + disk.remove_partition(device) + + def zap_lvm_member(self, device): + """ + An LVM member may have more than one LV and or VG, for example if it is + a raw device with multiple partitions each belonging to a different LV + + Device example: /dev/sda + Requirements: An LV or VG present in the device, making it an LVM member + """ + for lv in device.lvs: + if lv.lv_name: + mlogger.info('Zapping lvm member {}. lv_path is {}'.format(device.path, lv.lv_path)) + self.zap_lv(Device(lv.lv_path)) + else: + vg = api.get_single_vg(filters={'vg_name': lv.vg_name}) + if vg: + mlogger.info('Found empty VG {}, removing'.format(vg.vg_name)) + api.remove_vg(vg.vg_name) + + + + def zap_raw_device(self, device): + """ + Any whole (raw) device passed in as input will be processed here, + checking for LVM membership and partitions (if any). 
+ + Device example: /dev/sda + Requirements: None + """ + if not self.args.destroy: + # the use of dd on a raw device causes the partition table to be + # destroyed + mlogger.warning( + '--destroy was not specified, but zapping a whole device will remove the partition table' + ) + + # look for partitions and zap those + for part_name in device.sys_api.get('partitions', {}).keys(): + self.zap_partition(Device('/dev/%s' % part_name)) + + wipefs(device.path) + zap_data(device.path) + + @decorators.needs_root + def zap(self, devices=None): + devices = devices or self.args.devices + + for device in devices: + mlogger.info("Zapping: %s", device.path) + if device.is_mapper and not device.is_mpath: + terminal.error("Refusing to zap the mapper device: {}".format(device)) + raise SystemExit(1) + if device.is_lvm_member: + self.zap_lvm_member(device) + if device.is_lv: + self.zap_lv(device) + if device.is_partition: + self.zap_partition(device) + if device.is_device: + self.zap_raw_device(device) + + if self.args.devices: + terminal.success( + "Zapping successful for: %s" % ", ".join([str(d) for d in self.args.devices]) + ) + else: + identifier = self.args.osd_id or self.args.osd_fsid + terminal.success( + "Zapping successful for OSD: %s" % identifier + ) + + @decorators.needs_root + def zap_osd(self): + if self.args.osd_id and not self.args.no_systemd: + osd_is_running = systemctl.osd_is_active(self.args.osd_id) + if osd_is_running: + mlogger.error("OSD ID %s is running, stop it with:" % self.args.osd_id) + mlogger.error("systemctl stop ceph-osd@%s" % self.args.osd_id) + raise SystemExit("Unable to zap devices associated with OSD ID: %s" % self.args.osd_id) + devices = find_associated_devices(self.args.osd_id, self.args.osd_fsid) + self.zap(devices) + + def dmcrypt_close(self, dmcrypt_uuid): + mlogger.info("Closing encrypted volume %s", dmcrypt_uuid) + encryption.dmcrypt_close(mapping=dmcrypt_uuid, skip_path_check=True) + + def main(self): + sub_command_help = dedent(""" + 
Zaps the given logical volume(s), raw device(s) or partition(s) for reuse by ceph-volume. + If given a path to a logical volume it must be in the format of vg/lv. Any + filesystems present on the given device, vg/lv, or partition will be removed and + all data will be purged. + + If the logical volume, raw device or partition is being used for any ceph related + mount points they will be unmounted. + + However, the lv or partition will be kept intact. + + Example calls for supported scenarios: + + Zapping a logical volume: + + ceph-volume lvm zap {vg name/lv name} + + Zapping a partition: + + ceph-volume lvm zap /dev/sdc1 + + Zapping many raw devices: + + ceph-volume lvm zap /dev/sda /dev/sdb /dev/sdc + + Zapping devices associated with an OSD ID: + + ceph-volume lvm zap --osd-id 1 + + Optionally include the OSD FSID + + ceph-volume lvm zap --osd-id 1 --osd-fsid 55BD4219-16A7-4037-BC20-0F158EFCC83D + + If the --destroy flag is given and you are zapping a raw device or partition + then all vgs and lvs that exist on that raw device or partition will be destroyed. + + This is especially useful if a raw device or partition was used by ceph-volume lvm create + or ceph-volume lvm prepare commands previously and now you want to reuse that device. + + For example: + + ceph-volume lvm zap /dev/sda --destroy + + If the --destroy flag is given and you are zapping an lv then the lv is + removed, together with its volume group when no other lv is left in it. 
+ + """) + parser = argparse.ArgumentParser( + prog='ceph-volume lvm zap', + formatter_class=argparse.RawDescriptionHelpFormatter, + description=sub_command_help, + ) + + parser.add_argument( + 'devices', + metavar='DEVICES', + nargs='*', + type=arg_validators.ValidZapDevice(gpt_ok=True), + default=[], + help='Path to one or many lv (as vg/lv), partition (as /dev/sda1) or device (as /dev/sda)' + ) + + parser.add_argument( + '--destroy', + action='store_true', + default=False, + help='Destroy all volume groups and logical volumes if you are zapping a raw device or partition', + ) + + parser.add_argument( + '--osd-id', + type=arg_validators.valid_osd_id, + help='Specify an OSD ID to detect associated devices for zapping', + ) + + parser.add_argument( + '--osd-fsid', + help='Specify an OSD FSID to detect associated devices for zapping', + ) + + parser.add_argument( + '--no-systemd', + dest='no_systemd', + action='store_true', + help='Skip systemd unit checks', + ) + + if len(self.argv) == 0: + print(sub_command_help) + return + + self.args = parser.parse_args(self.argv) + + if self.args.osd_id or self.args.osd_fsid: + self.zap_osd() + else: + self.zap() diff --git a/src/ceph-volume/ceph_volume/devices/raw/__init__.py b/src/ceph-volume/ceph_volume/devices/raw/__init__.py new file mode 100644 index 000000000..dd0a6534c --- /dev/null +++ b/src/ceph-volume/ceph_volume/devices/raw/__init__.py @@ -0,0 +1 @@ +from .main import Raw # noqa diff --git a/src/ceph-volume/ceph_volume/devices/raw/activate.py b/src/ceph-volume/ceph_volume/devices/raw/activate.py new file mode 100644 index 000000000..17be57dfe --- /dev/null +++ b/src/ceph-volume/ceph_volume/devices/raw/activate.py @@ -0,0 +1,166 @@ +from __future__ import print_function +import argparse +import logging +import os +from textwrap import dedent +from ceph_volume import process, conf, decorators, terminal +from ceph_volume.util import system +from ceph_volume.util import prepare as prepare_utils +from .list import 
direct_report + + +logger = logging.getLogger(__name__) + +def activate_bluestore(meta, tmpfs, systemd): + # find the osd + osd_id = meta['osd_id'] + osd_uuid = meta['osd_uuid'] + + # mount on tmpfs the osd directory + osd_path = '/var/lib/ceph/osd/%s-%s' % (conf.cluster, osd_id) + if not system.path_is_mounted(osd_path): + # mkdir -p and mount as tmpfs + prepare_utils.create_osd_path(osd_id, tmpfs=tmpfs) + + # XXX This needs to be removed once ceph-bluestore-tool can deal with + # symlinks that exist in the osd dir + for link_name in ['block', 'block.db', 'block.wal']: + link_path = os.path.join(osd_path, link_name) + if os.path.exists(link_path): + os.unlink(os.path.join(osd_path, link_name)) + + # Once symlinks are removed, the osd dir can be 'primed again. chown first, + # regardless of what currently exists so that ``prime-osd-dir`` can succeed + # even if permissions are somehow messed up + system.chown(osd_path) + prime_command = [ + 'ceph-bluestore-tool', + 'prime-osd-dir', + '--path', osd_path, + '--no-mon-config', + '--dev', meta['device'], + ] + process.run(prime_command) + + # always re-do the symlink regardless if it exists, so that the block, + # block.wal, and block.db devices that may have changed can be mapped + # correctly every time + prepare_utils.link_block(meta['device'], osd_id) + + if 'device_db' in meta: + prepare_utils.link_db(meta['device_db'], osd_id, osd_uuid) + + if 'device_wal' in meta: + prepare_utils.link_wal(meta['device_wal'], osd_id, osd_uuid) + + system.chown(osd_path) + terminal.success("ceph-volume raw activate successful for osd ID: %s" % osd_id) + + +class Activate(object): + + help = 'Discover and prepare a data directory for a (BlueStore) OSD on a raw device' + + def __init__(self, argv): + self.argv = argv + self.args = None + + @decorators.needs_root + def activate(self, devs, start_osd_id, start_osd_uuid, + tmpfs, systemd): + """ + :param args: The parsed arguments coming from the CLI + """ + assert devs or start_osd_id 
or start_osd_uuid + found = direct_report(devs) + + activated_any = False + for osd_uuid, meta in found.items(): + osd_id = meta['osd_id'] + if start_osd_id is not None and str(osd_id) != str(start_osd_id): + continue + if start_osd_uuid is not None and osd_uuid != start_osd_uuid: + continue + logger.info('Activating osd.%s uuid %s cluster %s' % ( + osd_id, osd_uuid, meta['ceph_fsid'])) + activate_bluestore(meta, + tmpfs=tmpfs, + systemd=systemd) + activated_any = True + + if not activated_any: + raise RuntimeError('did not find any matching OSD to activate') + + def main(self): + sub_command_help = dedent(""" + Activate (BlueStore) OSD on raw block device(s) based on the + device label (normally the first block of the device). + + ceph-volume raw activate [/dev/sdb2 ...] + + or + + ceph-volume raw activate --osd-id NUM --osd-uuid UUID + + The device(s) associated with the OSD need to have been prepared + previously, so that all needed tags and metadata exist. + """) + parser = argparse.ArgumentParser( + prog='ceph-volume raw activate', + formatter_class=argparse.RawDescriptionHelpFormatter, + description=sub_command_help, + ) + parser.add_argument( + '--device', + help='The device for the OSD to start' + ) + parser.add_argument( + '--osd-id', + help='OSD ID to activate' + ) + parser.add_argument( + '--osd-uuid', + help='OSD UUID to activate' + ) + parser.add_argument( + '--no-systemd', + dest='no_systemd', + action='store_true', + help='Skip creating and enabling systemd units and starting OSD services' + ) + parser.add_argument( + '--block.db', + dest='block_db', + help='Path to bluestore block.db block device' + ) + parser.add_argument( + '--block.wal', + dest='block_wal', + help='Path to bluestore block.wal block device' + ) + parser.add_argument( + '--no-tmpfs', + action='store_true', + help='Do not use a tmpfs mount for OSD data dir' + ) + + if not self.argv: + print(sub_command_help) + return + args = parser.parse_args(self.argv) + self.args = args + if not 
args.no_systemd: + terminal.error('systemd support not yet implemented') + raise SystemExit(1) + + devs = [args.device] + if args.block_wal: + devs.append(args.block_wal) + if args.block_db: + devs.append(args.block_db) + + self.activate(devs=devs, + start_osd_id=args.osd_id, + start_osd_uuid=args.osd_uuid, + tmpfs=not args.no_tmpfs, + systemd=not self.args.no_systemd) diff --git a/src/ceph-volume/ceph_volume/devices/raw/common.py b/src/ceph-volume/ceph_volume/devices/raw/common.py new file mode 100644 index 000000000..89ee285be --- /dev/null +++ b/src/ceph-volume/ceph_volume/devices/raw/common.py @@ -0,0 +1,58 @@ +import argparse +from ceph_volume.util import arg_validators + +def create_parser(prog, description): + """ + Both prepare and create share the same parser, those are defined here to + avoid duplication + """ + parser = argparse.ArgumentParser( + prog=prog, + formatter_class=argparse.RawDescriptionHelpFormatter, + description=description, + ) + parser.add_argument( + '--data', + required=True, + type=arg_validators.ValidRawDevice(as_string=True), + help='a raw device to use for the OSD', + ) + parser.add_argument( + '--bluestore', + action='store_true', + help='Use BlueStore backend') + parser.add_argument( + '--crush-device-class', + dest='crush_device_class', + help='Crush device class to assign this OSD to', + default="" + ) + parser.add_argument( + '--no-tmpfs', + action='store_true', + help='Do not use a tmpfs mount for OSD data dir' + ) + parser.add_argument( + '--block.db', + dest='block_db', + help='Path to bluestore block.db block device', + type=arg_validators.ValidRawDevice(as_string=True) + ) + parser.add_argument( + '--block.wal', + dest='block_wal', + help='Path to bluestore block.wal block device', + type=arg_validators.ValidRawDevice(as_string=True) + ) + parser.add_argument( + '--dmcrypt', + action='store_true', + help='Enable device encryption via dm-crypt', + ) + parser.add_argument( + '--osd-id', + help='Reuse an existing OSD id', + 
default=None, + type=arg_validators.valid_osd_id, + ) + return parser diff --git a/src/ceph-volume/ceph_volume/devices/raw/list.py b/src/ceph-volume/ceph_volume/devices/raw/list.py new file mode 100644 index 000000000..794bb18c1 --- /dev/null +++ b/src/ceph-volume/ceph_volume/devices/raw/list.py @@ -0,0 +1,174 @@ +from __future__ import print_function +import argparse +import json +import logging +from textwrap import dedent +from ceph_volume import decorators, process +from ceph_volume.util import disk +from typing import Any, Dict, List + +logger = logging.getLogger(__name__) + + +def direct_report(devices): + """ + Other non-cli consumers of listing information will want to consume the + report without the need to parse arguments or other flags. This helper + bypasses the need to deal with the class interface which is meant for cli + handling. + """ + _list = List([]) + return _list.generate(devices) + +def _get_bluestore_info(dev): + out, err, rc = process.call([ + 'ceph-bluestore-tool', 'show-label', + '--dev', dev], verbose_on_failure=False) + if rc: + # ceph-bluestore-tool returns an error (below) if device is not bluestore OSD + # > unable to read label for <device>: (2) No such file or directory + # but it's possible the error could be for a different reason (like if the disk fails) + logger.debug('assuming device {} is not BlueStore; ceph-bluestore-tool failed to get info from device: {}\n{}'.format(dev, out, err)) + return None + oj = json.loads(''.join(out)) + if dev not in oj: + # should be impossible, so warn + logger.warning('skipping device {} because it is not reported in ceph-bluestore-tool output: {}'.format(dev, out)) + return None + try: + r = { + 'osd_uuid': oj[dev]['osd_uuid'], + } + if oj[dev]['description'] == 'main': + whoami = oj[dev]['whoami'] + r.update({ + 'type': 'bluestore', + 'osd_id': int(whoami), + 'ceph_fsid': oj[dev]['ceph_fsid'], + 'device': dev, + }) + elif oj[dev]['description'] == 'bluefs db': + r['device_db'] = dev + elif 
class List(object):
    """
    CLI handler for ``ceph-volume raw list``: reports the BlueStore OSDs
    found on raw devices, keyed by OSD uuid.
    """

    help = 'list BlueStore OSDs on raw devices'

    def __init__(self, argv):
        self.argv = argv

    def is_atari_partitions(self, _lsblk: Dict[str, Any]) -> bool:
        """
        Tell whether an lsblk entry should be ignored as a phantom Atari
        partition.

        :param _lsblk: one lsblk record; ``NAME`` is required, ``PKNAME``
                       (the parent device) is optional
        :returns: ``True`` when the entry has a parent that carries a
                  BlueStore label, or when checking the parent fails with
                  ``OSError``; ``False`` otherwise
        """
        dev = _lsblk['NAME']
        parent = _lsblk.get('PKNAME')
        if not parent:
            # no parent: this is not a child partition
            return False
        try:
            if disk.has_bluestore_label(parent):
                # one formatted message; a tuple here would be logged as its repr
                logger.warning(
                    'ignoring child device %s whose parent %s is a BlueStore OSD. '
                    'device is likely a phantom Atari partition. device info: %s',
                    dev, parent, _lsblk)
                return True
        except OSError as e:
            logger.error(
                'ignoring child device %s to avoid reporting invalid BlueStore data '
                'from phantom Atari partitions. '
                'failed to determine if parent device %s is BlueStore. err: %s',
                dev, parent, e)
            return True
        return False

    def exclude_atari_partitions(self, _lsblk_all: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Return the lsblk records from ``_lsblk_all`` that are not flagged
        by ``is_atari_partitions``, preserving their order.
        """
        return [_lsblk for _lsblk in _lsblk_all if not self.is_atari_partitions(_lsblk)]

    def generate(self, devs=None):
        """
        Build the report of BlueStore OSDs.

        :param devs: optional list of device paths; when empty, ``None``
                     or made only of falsy entries, every block device
                     reported by lsblk is inspected
        :returns: a dict mapping each OSD uuid to the merged label
                  information of all devices carrying that uuid
        """
        logger.debug('Listing block devices via lsblk...')
        if not devs or not any(devs):
            # If no devs are given initially, we want to list ALL devices including children and
            # parents. Parent disks with child partitions may be the appropriate device to return if
            # the parent disk has a bluestore header, but children may be the most appropriate
            # devices to return if the parent disk does not have a bluestore header.
            info_devices = disk.lsblk_all(abspath=True)
            devs = [device['NAME'] for device in info_devices if device.get('NAME')]
        else:
            # skip placeholder entries (e.g. None) mixed in with real paths
            info_devices = [disk.lsblk(dev, abspath=True) for dev in devs if dev]

        # Linux kernels built with CONFIG_ATARI_PARTITION enabled can falsely interpret
        # bluestore's on-disk format as an Atari partition table. These false Atari partitions
        # can be interpreted as real OSDs if a bluestore OSD was previously created on the false
        # partition. See https://tracker.ceph.com/issues/52060 for more info. If a device has a
        # parent, it is a child. If the parent is a valid bluestore OSD, the child will only
        # exist if it is a phantom Atari partition, and the child should be ignored. If the
        # parent isn't bluestore, then the child could be a valid bluestore OSD. If we fail to
        # determine whether a parent is bluestore, we should err on the side of not reporting
        # the child so as not to give a false positive.
        info_devices = self.exclude_atari_partitions(info_devices)

        result = {}
        logger.debug('inspecting devices: {}'.format(devs))
        for info_device in info_devices:
            name = info_device['NAME']
            bs_info = _get_bluestore_info(name)
            if bs_info is None:
                # None is also returned in the rare event that there is an issue reading info from
                # a BlueStore disk, so be sure to log our assumption that it isn't bluestore
                logger.info('device {} does not have BlueStore information'.format(name))
                continue
            # block, db and wal devices of one OSD share the uuid: merge them
            result.setdefault(bs_info['osd_uuid'], {}).update(bs_info)

        return result

    @decorators.needs_root
    def list(self, args):
        """
        Print the report for ``args.device`` in the requested format.

        :raises SystemExit: for the non-json format when nothing was found
        :raises RuntimeError: for the non-json format otherwise, which is
                              not implemented
        """
        report = self.generate(args.device)
        if args.format == 'json':
            print(json.dumps(report, indent=4, sort_keys=True))
        else:
            if not report:
                raise SystemExit('No valid Ceph devices found')
            raise RuntimeError('not implemented yet')

    def main(self):
        """
        Parse the ``ceph-volume raw list`` command line and print the report.
        """
        sub_command_help = dedent("""
        List OSDs on raw devices with raw device labels (usually the first
        block of the device).

        Full listing of all identifiable (currently, BlueStore) OSDs
        on raw devices:

            ceph-volume raw list

        List a particular device, reporting all metadata about it::

            ceph-volume raw list /dev/sda1

        """)
        parser = argparse.ArgumentParser(
            prog='ceph-volume raw list',
            formatter_class=argparse.RawDescriptionHelpFormatter,
            description=sub_command_help,
        )

        parser.add_argument(
            'device',
            metavar='DEVICE',
            nargs='*',
            help='Path to a device like /dev/sda1'
        )

        parser.add_argument(
            '--format',
            # the help text must agree with ``default`` below
            help='output format, defaults to "json"',
            default='json',
            choices=['json', 'pretty'],
        )

        args = parser.parse_args(self.argv)
        self.list(args)
+ + {sub_help} + """) + + mapper = { + 'list': list.List, + 'prepare': prepare.Prepare, + 'activate': activate.Activate, + } + + def __init__(self, argv): + self.argv = argv + + def print_help(self, sub_help): + return self._help.format(sub_help=sub_help) + + def main(self): + terminal.dispatch(self.mapper, self.argv) + parser = argparse.ArgumentParser( + prog='ceph-volume raw', + formatter_class=argparse.RawDescriptionHelpFormatter, + description=self.print_help(terminal.subhelp(self.mapper)), + ) + parser.parse_args(self.argv) + if len(self.argv) <= 1: + return parser.print_help() diff --git a/src/ceph-volume/ceph_volume/devices/raw/prepare.py b/src/ceph-volume/ceph_volume/devices/raw/prepare.py new file mode 100644 index 000000000..b3201a89d --- /dev/null +++ b/src/ceph-volume/ceph_volume/devices/raw/prepare.py @@ -0,0 +1,160 @@ +from __future__ import print_function +import json +import logging +import os +from textwrap import dedent +from ceph_volume.util import prepare as prepare_utils +from ceph_volume.util import encryption as encryption_utils +from ceph_volume.util import disk +from ceph_volume.util import system +from ceph_volume import decorators, terminal +from ceph_volume.devices.lvm.common import rollback_osd +from .common import create_parser + +logger = logging.getLogger(__name__) + +def prepare_dmcrypt(key, device, device_type, fsid): + """ + Helper for devices that are encrypted. 
class Prepare(object):
    """
    CLI handler for ``ceph-volume raw prepare``: creates the OSD id and
    prepares a BlueStore OSD on a raw device.
    """

    help = 'Format a raw device and associate it with a (BlueStore) OSD'

    def __init__(self, argv):
        self.argv = argv
        # filled in by ``prepare()`` so that a failure can be rolled back
        self.osd_id = None

    def safe_prepare(self, args=None):
        """
        An intermediate step between `main()` and `prepare()` so that we can
        capture the `self.osd_id` in case we need to rollback

        :param args: Injected args, usually from `raw create` which compounds
                     both `prepare` and `create`. When omitted, the
                     previously stored ``self.args`` is used.
        :raises Exception: whatever ``prepare()`` raised, after the OSD id
                           rollback was attempted
        """
        if args is not None:
            self.args = args
        try:
            self.prepare()
        except Exception:
            logger.exception('raw prepare was unable to complete')
            logger.info('will rollback OSD ID creation')
            rollback_osd(self.args, self.osd_id)
            raise
        # read from self.args: the ``args`` parameter may legitimately be None
        dmcrypt_log = 'dmcrypt' if self.args.dmcrypt else 'clear'
        terminal.success("ceph-volume raw {} prepare successful for: {}".format(dmcrypt_log, self.args.data))

    @decorators.needs_root
    def prepare(self):
        """
        Create (or reuse) the OSD id and prepare the BlueStore OSD described
        by ``self.args``. Stores the id in ``self.osd_id``.
        """
        secrets = {'cephx_secret': prepare_utils.create_key()}
        encrypted = 1 if self.args.dmcrypt else 0
        cephx_lockbox_secret = '' if not encrypted else prepare_utils.create_key()

        if encrypted:
            secrets['dmcrypt_key'] = os.getenv('CEPH_VOLUME_DMCRYPT_SECRET')
            secrets['cephx_lockbox_secret'] = cephx_lockbox_secret  # dummy value to make `ceph osd new` not complaining

        osd_fsid = system.generate_uuid()
        crush_device_class = self.args.crush_device_class
        if crush_device_class:
            secrets['crush_device_class'] = crush_device_class
        tmpfs = not self.args.no_tmpfs
        # optional devices are passed on as empty strings when not requested
        wal = self.args.block_wal or ""
        db = self.args.block_db or ""

        # reuse a given ID if it exists, otherwise create a new ID
        self.osd_id = prepare_utils.create_id(
            osd_fsid,
            json.dumps(secrets),
            osd_id=self.args.osd_id)

        prepare_bluestore(
            self.args.data,
            wal,
            db,
            secrets,
            self.osd_id,
            osd_fsid,
            tmpfs,
        )

    def main(self):
        """
        Parse the ``ceph-volume raw prepare`` command line, validate it and
        run ``safe_prepare``.

        :raises SystemExit: with status 1 when ``--bluestore`` is missing, or
                            when ``--dmcrypt`` is used without the
                            ``CEPH_VOLUME_DMCRYPT_SECRET`` environment variable
        """
        sub_command_help = dedent("""
        Prepare an OSD by assigning an ID and FSID, registering them with the
        cluster with an ID and FSID, formatting the volume.

        Once the OSD is ready, an ad-hoc systemd unit will be enabled so that
        it can later get activated and the OSD daemon can get started.

            ceph-volume raw prepare --bluestore --data {device}

        DB and WAL devices are supported.

            ceph-volume raw prepare --bluestore --data {device} --block.db {device} --block.wal {device}

        """)
        parser = create_parser(
            prog='ceph-volume raw prepare',
            description=sub_command_help,
        )
        if not self.argv:
            print(sub_command_help)
            return
        self.args = parser.parse_args(self.argv)
        if not self.args.bluestore:
            terminal.error('must specify --bluestore (currently the only supported backend)')
            raise SystemExit(1)
        if self.args.dmcrypt and not os.getenv('CEPH_VOLUME_DMCRYPT_SECRET'):
            terminal.error('encryption was requested (--dmcrypt) but environment variable '
                           'CEPH_VOLUME_DMCRYPT_SECRET is not set, you must set '
                           'this variable to provide a dmcrypt secret.')
            raise SystemExit(1)

        self.safe_prepare(self.args)
False + + def validate_devices(self, json_config): + """ + ``json_config`` is the loaded dictionary coming from the JSON file. It is usually mixed with + other non-device items, but for sakes of comparison it doesn't really matter. This method is + just making sure that the keys needed exist + """ + devices = json_config.keys() + try: + objectstore = json_config['type'] + except KeyError: + logger.warning( + '"type" key not found, assuming "bluestore" since journal key is not present' + ) + objectstore = 'bluestore' + + # Go through all the device combinations that are absolutely required, + # raise an error describing what was expected and what was found + # otherwise. + if objectstore == 'bluestore': + # This is a bit tricky, with newer bluestore we don't need data, older implementations + # do (e.g. with ceph-disk). ceph-volume just uses a tmpfs that doesn't require data. + if {'block', 'data'}.issubset(set(devices)): + return True + else: + bluestore_devices = ['block.db', 'block.wal', 'block', 'data'] + found = [i for i in devices if i in bluestore_devices] + mlogger.error("Required devices (block and data) not present for bluestore") + mlogger.error('bluestore devices found: %s', found) + raise RuntimeError('Unable to activate bluestore OSD due to missing devices') + + def get_device(self, uuid): + """ + If a device is encrypted, it will decrypt/open and return the mapper + path, if it isn't encrypted it will just return the device found that + is mapped to the uuid. This will make it easier for the caller to + avoid if/else to check if devices need decrypting + + :param uuid: The partition uuid of the device (PARTUUID) + """ + device = disk.get_device_from_partuuid(uuid) + + # If device is not found, it is fine to return an empty string from the + # helper that finds `device`. 
If it finds anything and it is not + # encrypted, just return what was found + if not self.is_encrypted or not device: + return device + + if self.encryption_type == 'luks': + encryption_utils.luks_open(self.dmcrypt_secret, device, uuid) + else: + encryption_utils.plain_open(self.dmcrypt_secret, device, uuid) + + return '/dev/mapper/%s' % uuid + + def enable_systemd_units(self, osd_id, osd_fsid): + """ + * disables the ceph-disk systemd units to prevent them from running when + a UDEV event matches Ceph rules + * creates the ``simple`` systemd units to handle the activation and + startup of the OSD with ``osd_id`` and ``osd_fsid`` + * enables the OSD systemd unit and finally starts the OSD. + """ + if not self.from_trigger and not self.skip_systemd: + # means it was scanned and now activated directly, so ensure that + # ceph-disk units are disabled, and that the `simple` systemd unit + # is created and enabled + + # enable the ceph-volume unit for this OSD + systemctl.enable_volume(osd_id, osd_fsid, 'simple') + + # disable any/all ceph-disk units + systemctl.mask_ceph_disk() + terminal.warning( + ('All ceph-disk systemd units have been disabled to ' + 'prevent OSDs getting triggered by UDEV events') + ) + else: + terminal.info('Skipping enabling of `simple` systemd unit') + terminal.info('Skipping masking of ceph-disk systemd units') + + if not self.skip_systemd: + # enable the OSD + systemctl.enable_osd(osd_id) + + # start the OSD + systemctl.start_osd(osd_id) + else: + terminal.info( + 'Skipping enabling and starting OSD simple systemd unit because --no-systemd was used' + ) + + @decorators.needs_root + def activate(self, args): + with open(args.json_config, 'r') as fp: + osd_metadata = json.load(fp) + + # Make sure that required devices are configured + self.validate_devices(osd_metadata) + + osd_id = osd_metadata.get('whoami', args.osd_id) + osd_fsid = osd_metadata.get('fsid', args.osd_fsid) + data_uuid = osd_metadata.get('data', {}).get('uuid') + conf.cluster 
= osd_metadata.get('cluster_name', 'ceph') + if not data_uuid: + raise RuntimeError( + 'Unable to activate OSD %s - no "uuid" key found for data' % args.osd_id + ) + + # Encryption detection, and capturing of the keys to decrypt + self.is_encrypted = osd_metadata.get('encrypted', False) + self.encryption_type = osd_metadata.get('encryption_type') + if self.is_encrypted: + lockbox_secret = osd_metadata.get('lockbox.keyring') + # write the keyring always so that we can unlock + encryption_utils.write_lockbox_keyring(osd_id, osd_fsid, lockbox_secret) + # Store the secret around so that the decrypt method can reuse + raw_dmcrypt_secret = encryption_utils.get_dmcrypt_key(osd_id, osd_fsid) + # Note how both these calls need b64decode. For some reason, the + # way ceph-disk creates these keys, it stores them in the monitor + # *undecoded*, requiring this decode call again. The lvm side of + # encryption doesn't need it, so we are assuming here that anything + # that `simple` scans, will come from ceph-disk and will need this + # extra decode call here + self.dmcrypt_secret = base64.b64decode(raw_dmcrypt_secret) + + cluster_name = osd_metadata.get('cluster_name', 'ceph') + osd_dir = '/var/lib/ceph/osd/%s-%s' % (cluster_name, osd_id) + + # XXX there is no support for LVM here + data_device = self.get_device(data_uuid) + + if not data_device: + raise RuntimeError("osd fsid {} doesn't exist, this file will " + "be skipped, consider cleaning legacy " + "json file {}".format(osd_metadata['fsid'], args.json_config)) + + block_device = self.get_device(osd_metadata.get('block', {}).get('uuid')) + block_db_device = self.get_device(osd_metadata.get('block.db', {}).get('uuid')) + block_wal_device = self.get_device(osd_metadata.get('block.wal', {}).get('uuid')) + + if not system.device_is_mounted(data_device, destination=osd_dir): + process.run(['mount', '-v', data_device, osd_dir]) + + device_map = { + 'block': block_device, + 'block.db': block_db_device, + 'block.wal': 
def main(self):
    """
    Entry point for ``ceph-volume simple activate``.

    Activates either every scanned OSD (``--all``), the OSD described by
    an explicit JSON file (``--file``), or the one identified by the ID
    and FSID positionals. Prints the sub-command help and returns when no
    arguments were given.

    :raises RuntimeError: when neither ``--file`` nor ``--all`` is used and
                          ID or FSID is missing, or when the expected JSON
                          config file does not exist
    """
    sub_command_help = dedent("""
    Activate OSDs by mounting devices previously configured to their
    appropriate destination::

        ceph-volume simple activate {ID} {FSID}

    Or using a JSON file directly::

        ceph-volume simple activate --file /etc/ceph/osd/{ID}-{FSID}.json

    The OSD must have been "scanned" previously (see ``ceph-volume simple
    scan``), so that all needed OSD device information and metadata exist.

    A previously scanned OSD would exist like::

        /etc/ceph/osd/{ID}-{FSID}.json


    Environment variables supported:

    CEPH_VOLUME_SIMPLE_JSON_DIR: Directory location for scanned OSD JSON configs
    """)
    parser = argparse.ArgumentParser(
        prog='ceph-volume simple activate',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=sub_command_help,
    )
    parser.add_argument(
        'osd_id',
        metavar='ID',
        nargs='?',
        help='The ID of the OSD, usually an integer, like 0'
    )
    parser.add_argument(
        'osd_fsid',
        metavar='FSID',
        nargs='?',
        help='The FSID of the OSD, similar to a SHA1'
    )
    parser.add_argument(
        '--all',
        help='Activate all OSDs with a OSD JSON config',
        action='store_true',
        default=False,
    )
    parser.add_argument(
        '--file',
        help='The path to a JSON file, from a scanned OSD'
    )
    parser.add_argument(
        '--no-systemd',
        dest='skip_systemd',
        action='store_true',
        help='Skip creating and enabling systemd units and starting OSD services',
    )
    if len(self.argv) == 0:
        print(sub_command_help)
        return
    args = parser.parse_args(self.argv)
    if not args.file and not args.all:
        # both positionals are needed to build the JSON file name, so reject
        # the call as soon as either one is missing
        if not args.osd_id or not args.osd_fsid:
            terminal.error('ID and FSID are required to find the right OSD to activate')
            terminal.error('from a scanned OSD location in /etc/ceph/osd/')
            raise RuntimeError('Unable to activate without both ID and FSID')
    # don't allow a CLI flag to specify the JSON dir, because that might
    # implicitly indicate that it would be possible to activate a json file
    # at a non-default location which would not work at boot time if the
    # custom location is not exposed through an ENV var
    self.skip_systemd = args.skip_systemd
    json_dir = os.environ.get('CEPH_VOLUME_SIMPLE_JSON_DIR', '/etc/ceph/osd/')
    if args.all:
        if args.file or args.osd_id:
            mlogger.warn('--all was passed, ignoring --file and ID/FSID arguments')
        json_configs = glob.glob('{}/*.json'.format(json_dir))
        for json_config in json_configs:
            mlogger.info('activating OSD specified in {}'.format(json_config))
            args.json_config = json_config
            try:
                self.activate(args)
            except RuntimeError as e:
                # best effort: report the failure and carry on with the next
                # file. ``str(e)`` is used because Python 3 exceptions have
                # no ``message`` attribute.
                terminal.warning(str(e))
    else:
        if args.file:
            json_config = args.file
        else:
            json_config = os.path.join(json_dir, '%s-%s.json' % (args.osd_id, args.osd_fsid))
        if not os.path.exists(json_config):
            raise RuntimeError('Expected JSON config path not found: %s' % json_config)
        args.json_config = json_config
        self.activate(args)
import trigger + + +class Simple(object): + + help = 'Manage already deployed OSDs with ceph-volume' + + _help = dedent(""" + Take over a deployed OSD, persisting its metadata in /etc/ceph/osd/ so that it can be managed + with ceph-volume directly. Avoids UDEV and ceph-disk handling. + + {sub_help} + """) + + mapper = { + 'scan': scan.Scan, + 'activate': activate.Activate, + 'trigger': trigger.Trigger, + } + + def __init__(self, argv): + self.argv = argv + + def print_help(self, sub_help): + return self._help.format(sub_help=sub_help) + + def main(self): + terminal.dispatch(self.mapper, self.argv) + parser = argparse.ArgumentParser( + prog='ceph-volume simple', + formatter_class=argparse.RawDescriptionHelpFormatter, + description=self.print_help(terminal.subhelp(self.mapper)), + ) + parser.parse_args(self.argv) + if len(self.argv) <= 1: + return parser.print_help() diff --git a/src/ceph-volume/ceph_volume/devices/simple/scan.py b/src/ceph-volume/ceph_volume/devices/simple/scan.py new file mode 100644 index 000000000..ff7040beb --- /dev/null +++ b/src/ceph-volume/ceph_volume/devices/simple/scan.py @@ -0,0 +1,385 @@ +from __future__ import print_function +import argparse +import base64 +import json +import logging +import os +from textwrap import dedent +from ceph_volume import decorators, terminal, conf +from ceph_volume.api import lvm +from ceph_volume.systemd import systemctl +from ceph_volume.util import arg_validators, system, disk, encryption +from ceph_volume.util.device import Device + + +logger = logging.getLogger(__name__) + + +def parse_keyring(file_contents): + """ + Extract the actual key from a string. Usually from a keyring file, where + the keyring will be in a client section. 
In the case of a lockbox, it is + something like:: + + [client.osd-lockbox.8d7a8ab2-5db0-4f83-a785-2809aba403d5]\n\tkey = AQDtoGha/GYJExAA7HNl7Ukhqr7AKlCpLJk6UA==\n + + From the above case, it would return:: + + AQDtoGha/GYJExAA7HNl7Ukhqr7AKlCpLJk6UA== + """ + # remove newlines that might be trailing + keyring = file_contents.strip('\n') + + # Now split on spaces + keyring = keyring.split(' ')[-1] + + # Split on newlines + keyring = keyring.split('\n')[-1] + + return keyring.strip() + + +class Scan(object): + + help = 'Capture metadata from all running ceph-disk OSDs, OSD data partition or directory' + + def __init__(self, argv): + self.argv = argv + self._etc_path = '/etc/ceph/osd/' + + @property + def etc_path(self): + if os.path.isdir(self._etc_path): + return self._etc_path + + if not os.path.exists(self._etc_path): + os.mkdir(self._etc_path) + return self._etc_path + + error = "OSD Configuration path (%s) needs to be a directory" % self._etc_path + raise RuntimeError(error) + + def get_contents(self, path): + with open(path, 'r') as fp: + contents = fp.readlines() + if len(contents) > 1: + return ''.join(contents) + return ''.join(contents).strip().strip('\n') + + def scan_device(self, path): + device_metadata = {'path': None, 'uuid': None} + if not path: + return device_metadata + if self.is_encrypted: + encryption_metadata = encryption.legacy_encrypted(path) + device_metadata['path'] = encryption_metadata['device'] + device_metadata['uuid'] = disk.get_partuuid(encryption_metadata['device']) + return device_metadata + # cannot read the symlink if this is tmpfs + if os.path.islink(path): + device = os.readlink(path) + else: + device = path + lvm_device = lvm.get_single_lv(filters={'lv_path': device}) + if lvm_device: + device_uuid = lvm_device.lv_uuid + else: + device_uuid = disk.get_partuuid(device) + + device_metadata['uuid'] = device_uuid + device_metadata['path'] = device + + return device_metadata + + def scan_directory(self, path): + osd_metadata = 
{'cluster_name': conf.cluster} + directory_files = os.listdir(path) + if 'keyring' not in directory_files: + raise RuntimeError( + 'OSD files not found, required "keyring" file is not present at: %s' % path + ) + for file_ in os.listdir(path): + file_path = os.path.join(path, file_) + file_json_key = file_ + if file_.endswith('_dmcrypt'): + file_json_key = file_.rstrip('_dmcrypt') + logger.info( + 'reading file {}, stripping _dmcrypt suffix'.format(file_) + ) + if os.path.islink(file_path): + if os.path.exists(file_path): + osd_metadata[file_json_key] = self.scan_device(file_path) + else: + msg = 'broken symlink found %s -> %s' % (file_path, os.path.realpath(file_path)) + terminal.warning(msg) + logger.warning(msg) + + if os.path.isdir(file_path): + continue + + # the check for binary needs to go before the file, to avoid + # capturing data from binary files but still be able to capture + # contents from actual files later + try: + if system.is_binary(file_path): + logger.info('skipping binary file: %s' % file_path) + continue + except IOError: + logger.exception('skipping due to IOError on file: %s' % file_path) + continue + if os.path.isfile(file_path): + content = self.get_contents(file_path) + if 'keyring' in file_path: + content = parse_keyring(content) + try: + osd_metadata[file_json_key] = int(content) + except ValueError: + osd_metadata[file_json_key] = content + + # we must scan the paths again because this might be a temporary mount + path_mounts = system.Mounts(paths=True) + device = path_mounts.get_mounts().get(path) + + # it is possible to have more than one device, pick the first one, and + # warn that it is possible that more than one device is 'data' + if not device: + terminal.error('Unable to detect device mounted for path: %s' % path) + raise RuntimeError('Cannot activate OSD') + osd_metadata['data'] = self.scan_device(device[0] if len(device) else None) + + return osd_metadata + + def scan_encrypted(self, directory=None): + device = 
self.encryption_metadata['device'] + lockbox = self.encryption_metadata['lockbox'] + encryption_type = self.encryption_metadata['type'] + osd_metadata = {} + # Get the PARTUUID of the device to make sure have the right one and + # that maps to the data device + device_uuid = disk.get_partuuid(device) + dm_path = '/dev/mapper/%s' % device_uuid + # check if this partition is already mapped + device_status = encryption.status(device_uuid) + + # capture all the information from the lockbox first, reusing the + # directory scan method + if self.device_mounts.get(lockbox): + lockbox_path = self.device_mounts.get(lockbox)[0] + lockbox_metadata = self.scan_directory(lockbox_path) + # ceph-disk stores the fsid as osd-uuid in the lockbox, thanks ceph-disk + dmcrypt_secret = encryption.get_dmcrypt_key( + None, # There is no ID stored in the lockbox + lockbox_metadata['osd-uuid'], + os.path.join(lockbox_path, 'keyring') + ) + else: + with system.tmp_mount(lockbox) as lockbox_path: + lockbox_metadata = self.scan_directory(lockbox_path) + # ceph-disk stores the fsid as osd-uuid in the lockbox, thanks ceph-disk + dmcrypt_secret = encryption.get_dmcrypt_key( + None, # There is no ID stored in the lockbox + lockbox_metadata['osd-uuid'], + os.path.join(lockbox_path, 'keyring') + ) + + if not device_status: + # Note how both these calls need b64decode. For some reason, the + # way ceph-disk creates these keys, it stores them in the monitor + # *undecoded*, requiring this decode call again. 
@decorators.needs_root
def scan(self, args):
    """
    Scan a single OSD location and emit its metadata as JSON.

    ``args.osd_path`` may be an OSD directory or a device:

    * a directory is scanned in place
    * a mounted device is scanned through its first recorded mount point
    * an unmounted device is mounted temporarily for the scan

    When ``self.is_encrypted`` is set the work is delegated to
    ``scan_encrypted`` instead of ``scan_directory``.

    The metadata is printed when ``args.stdout`` is set, otherwise it is
    written to ``<self.etc_path>/<osd id>-<osd fsid>.json``.

    Raises ``RuntimeError`` when an encrypted, unmounted device has no
    lockbox recorded, or when the JSON file already exists and neither
    ``--force`` nor ``--stdout`` was requested.
    """
    osd_path = None
    logger.info('detecting if argument is a device or a directory: %s', args.osd_path)
    if os.path.isdir(args.osd_path):
        logger.info('will scan directly, path is a directory')
        osd_path = args.osd_path
    else:
        # assume this is a device, check if it is mounted and use that path
        logger.info('path is not a directory, will check if mounted')
        if system.device_is_mounted(args.osd_path):
            logger.info('argument is a device, which is mounted')
            mounted_osd_paths = self.device_mounts.get(args.osd_path)
            # ``get`` yields None when the device is not a key of the
            # captured mounts; a truthiness test covers both None and an
            # empty list, where ``len()`` would blow up on None
            osd_path = mounted_osd_paths[0] if mounted_osd_paths else None

    # argument is not a directory, and it is not a device that is mounted
    # somewhere so temporarily mount it to poke inside, otherwise, scan
    # directly
    if not osd_path:
        # check if we have an encrypted device first, so that we can poke at
        # the lockbox instead
        if self.is_encrypted:
            if not self.encryption_metadata.get('lockbox'):
                raise RuntimeError(
                    'Lockbox partition was not found for device: %s' % args.osd_path
                )
            osd_metadata = self.scan_encrypted()
        else:
            logger.info('device is not mounted, will mount it temporarily to scan')
            with system.tmp_mount(args.osd_path) as osd_path:
                osd_metadata = self.scan_directory(osd_path)
    else:
        if self.is_encrypted:
            logger.info('will scan encrypted OSD directory at path: %s', osd_path)
            osd_metadata = self.scan_encrypted(osd_path)
        else:
            logger.info('will scan OSD directory at path: %s', osd_path)
            osd_metadata = self.scan_directory(osd_path)

    osd_id = osd_metadata['whoami']
    osd_fsid = osd_metadata['fsid']
    filename = '%s-%s.json' % (osd_id, osd_fsid)
    json_path = os.path.join(self.etc_path, filename)

    # an existing file only matters when we are about to write to it
    if os.path.exists(json_path) and not args.stdout and not args.force:
        raise RuntimeError(
            '--force was not used and OSD metadata file exists: %s' % json_path
        )

    if args.stdout:
        print(json.dumps(osd_metadata, indent=4, sort_keys=True, ensure_ascii=False))
    else:
        with open(json_path, 'w') as fp:
            json.dump(osd_metadata, fp, indent=4, sort_keys=True, ensure_ascii=False)
            fp.write(os.linesep)
        terminal.success(
            'OSD %s got scanned and metadata persisted to file: %s' % (
                osd_id,
                json_path
            )
        )
        terminal.success(
            'To take over management of this scanned OSD, and disable ceph-disk and udev, run:'
        )
        terminal.success(' ceph-volume simple activate %s %s' % (osd_id, osd_fsid))

    if not osd_metadata.get('data'):
        msg = 'Unable to determine device mounted on %s' % args.osd_path
        # log once and mirror to the terminal once
        logger.warning(msg)
        terminal.warning(msg)
        terminal.warning('OSD will not be able to start without this information:')
        terminal.warning(' "data": "/path/to/device",')
running OSDs, an OSD directory (or data device) for files and configurations + that will allow to take over the management of the OSD. + + Scanned OSDs will get their configurations stored in + /etc/ceph/osd/<id>-<fsid>.json + + For an OSD ID of 0 with fsid of ``a9d50838-e823-43d6-b01f-2f8d0a77afc2`` + that could mean a scan command that looks like:: + + ceph-volume simple scan /var/lib/ceph/osd/ceph-0 + + Which would store the metadata in a JSON file at:: + + /etc/ceph/osd/0-a9d50838-e823-43d6-b01f-2f8d0a77afc2.json + + To scan all running OSDs: + + ceph-volume simple scan + + To a scan a specific running OSD: + + ceph-volume simple scan /var/lib/ceph/osd/{cluster}-{osd id} + + And to scan a device (mounted or unmounted) that has OSD data in it, for example /dev/sda1 + + ceph-volume simple scan /dev/sda1 + + Scanning a device or directory that belongs to an OSD not created by ceph-disk will be ingored. + """) + parser = argparse.ArgumentParser( + prog='ceph-volume simple scan', + formatter_class=argparse.RawDescriptionHelpFormatter, + description=sub_command_help, + ) + + parser.add_argument( + '-f', '--force', + action='store_true', + help='If OSD has already been scanned, the JSON file will be overwritten' + ) + + parser.add_argument( + '--stdout', + action='store_true', + help='Do not save to a file, output metadata to stdout' + ) + + parser.add_argument( + 'osd_path', + metavar='OSD_PATH', + type=arg_validators.OSDPath(), + nargs='?', + default=None, + help='Path to an existing OSD directory or OSD data partition' + ) + + args = parser.parse_args(self.argv) + paths = [] + if args.osd_path: + paths.append(args.osd_path) + else: + osd_ids = systemctl.get_running_osd_ids() + for osd_id in osd_ids: + paths.append("/var/lib/ceph/osd/{}-{}".format( + conf.cluster, + osd_id, + )) + + # Capture some environment status, so that it can be reused all over + self.device_mounts = system.Mounts(devices=True).get_mounts() + self.path_mounts = 
def parse_osd_id(string):
    """
    Return the OSD id: the text before the first ``-`` in ``string``.

    Raises ``SuffixParsingError`` when that leading part is empty or is not
    made up of digits only.
    """
    osd_id = string.partition('-')[0]
    # ``''.isdigit()`` is False, so an empty id is rejected by the same test
    if not osd_id.isdigit():
        raise SuffixParsingError('OSD id', string)
    return osd_id


def parse_osd_uuid(string):
    """
    Return the OSD uuid: everything after the first ``-`` in ``string``.

    The id portion is validated first, so a malformed id raises
    ``SuffixParsingError`` from ``parse_osd_id``. The same error is raised
    when nothing follows the id, including input with no ``-`` at all.
    """
    # only called for validation of the leading id
    parse_osd_id(string)
    # partition leaves the tail empty when there is no dash, so an id-only
    # input is reported as a missing uuid instead of being echoed back
    osd_uuid = string.partition('-')[2]
    if not osd_uuid:
        raise SuffixParsingError('OSD uuid', string)
    return osd_uuid
knows about OSDs. + + Proxy OSD activation to ``ceph-volume simple activate`` by parsing the + input from systemd, detecting the UUID and ID associated with an OSD:: + + ceph-volume simple trigger {SYSTEMD-DATA} + + The systemd "data" is expected to be in the format of:: + + {OSD ID}-{OSD UUID} + + The devices associated with the OSD need to have been scanned previously, + so that all needed metadata can be used for starting the OSD process. + """) + parser = argparse.ArgumentParser( + prog='ceph-volume simple trigger', + formatter_class=argparse.RawDescriptionHelpFormatter, + description=sub_command_help, + ) + + parser.add_argument( + 'systemd_data', + metavar='SYSTEMD_DATA', + nargs='?', + help='Data from a systemd unit containing ID and UUID of the OSD, like 0-asdf-lkjh' + ) + if len(self.argv) == 0: + print(sub_command_help) + return + args = parser.parse_args(self.argv) + osd_id = parse_osd_id(args.systemd_data) + osd_uuid = parse_osd_uuid(args.systemd_data) + Activate([osd_id, osd_uuid], from_trigger=True).main() |