diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
commit | e6918187568dbd01842d8d1d2c808ce16a894239 (patch) | |
tree | 64f88b554b444a49f656b6c656111a145cbbaa28 /src/libcephsqlite.cc | |
parent | Initial commit. (diff) | |
download | ceph-e6918187568dbd01842d8d1d2c808ce16a894239.tar.xz ceph-e6918187568dbd01842d8d1d2c808ce16a894239.zip |
Adding upstream version 18.2.2.upstream/18.2.2
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/libcephsqlite.cc')
-rw-r--r-- | src/libcephsqlite.cc | 933 |
1 files changed, 933 insertions, 0 deletions
diff --git a/src/libcephsqlite.cc b/src/libcephsqlite.cc new file mode 100644 index 000000000..b4fb96841 --- /dev/null +++ b/src/libcephsqlite.cc @@ -0,0 +1,933 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2021 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or modify it under the + * terms of the GNU Lesser General Public License version 2.1, as published by + * the Free Software Foundation. See file COPYING. + * + */ + +#include <boost/smart_ptr/intrusive_ptr.hpp> +#include <fmt/format.h> + +#include <fcntl.h> +#include <stdio.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <unistd.h> + +#include <cstdlib> +#include <iomanip> +#include <iostream> +#include <regex> +#include <sstream> +#include <string_view> + +#include <limits.h> +#include <string.h> + +#include <sqlite3ext.h> +SQLITE_EXTENSION_INIT1 + +#include "include/ceph_assert.h" +#include "include/rados/librados.hpp" + +#include "common/Clock.h" +#include "common/Formatter.h" +#include "common/ceph_argparse.h" +#include "common/ceph_mutex.h" +#include "common/common_init.h" +#include "common/config.h" +#include "common/debug.h" +#include "common/errno.h" +#include "common/perf_counters.h" +#include "common/version.h" + +#include "include/libcephsqlite.h" +#include "SimpleRADOSStriper.h" + +#define dout_subsys ceph_subsys_cephsqlite +#undef dout_prefix +#define dout_prefix *_dout << "cephsqlite: " << __func__ << ": " +#define d(cct,cluster,lvl) ldout((cct), (lvl)) << "(client." << cluster->get_instance_id() << ") " +#define dv(lvl) d(cct,cluster,(lvl)) +#define df(lvl) d(f->io.cct,f->io.cluster,(lvl)) << f->loc << " " + +enum { + P_FIRST = 0xf0000, + P_OP_OPEN, + P_OP_DELETE, + P_OP_ACCESS, + P_OP_FULLPATHNAME, + P_OP_CURRENTTIME, + P_OPF_CLOSE, + P_OPF_READ, + P_OPF_WRITE, + P_OPF_TRUNCATE, + P_OPF_SYNC, + P_OPF_FILESIZE, + P_OPF_LOCK, + P_OPF_UNLOCK, + P_OPF_CHECKRESERVEDLOCK, + P_OPF_FILECONTROL, + P_OPF_SECTORSIZE, + P_OPF_DEVICECHARACTERISTICS, + P_LAST, +}; + +using cctptr = boost::intrusive_ptr<CephContext>; +using rsptr = std::shared_ptr<librados::Rados>; + +struct cephsqlite_appdata { + ~cephsqlite_appdata() { + { + std::scoped_lock lock(cluster_mutex); + _disconnect(); + } + if (logger) { + cct->get_perfcounters_collection()->remove(logger.get()); + } + if (striper_logger) { + cct->get_perfcounters_collection()->remove(striper_logger.get()); + } + } + int setup_perf() { + ceph_assert(cct); + PerfCountersBuilder plb(cct.get(), "libcephsqlite_vfs", P_FIRST, P_LAST); + plb.add_time_avg(P_OP_OPEN, "op_open", "Time average of Open operations"); + plb.add_time_avg(P_OP_DELETE, "op_delete", "Time average of Delete operations"); + plb.add_time_avg(P_OP_ACCESS, "op_access", "Time average of Access operations"); + plb.add_time_avg(P_OP_FULLPATHNAME, "op_fullpathname", "Time average of FullPathname operations"); + plb.add_time_avg(P_OP_CURRENTTIME, "op_currenttime", "Time average of Currenttime operations"); + plb.add_time_avg(P_OPF_CLOSE, "opf_close", "Time average of Close file operations"); + plb.add_time_avg(P_OPF_READ, "opf_read", "Time average of Read file operations"); + plb.add_time_avg(P_OPF_WRITE, "opf_write", "Time average of Write file operations"); + plb.add_time_avg(P_OPF_TRUNCATE, "opf_truncate", "Time average of Truncate file operations"); + plb.add_time_avg(P_OPF_SYNC, "opf_sync", "Time average of Sync file operations"); + plb.add_time_avg(P_OPF_FILESIZE, "opf_filesize", "Time average of FileSize file operations"); + plb.add_time_avg(P_OPF_LOCK, "opf_lock", "Time average of Lock file operations"); + plb.add_time_avg(P_OPF_UNLOCK, "opf_unlock", "Time average of Unlock file operations"); + plb.add_time_avg(P_OPF_CHECKRESERVEDLOCK, "opf_checkreservedlock", "Time average of CheckReservedLock file operations"); + plb.add_time_avg(P_OPF_FILECONTROL, "opf_filecontrol", "Time average of FileControl file operations"); + plb.add_time_avg(P_OPF_SECTORSIZE, "opf_sectorsize", "Time average of SectorSize file operations"); + plb.add_time_avg(P_OPF_DEVICECHARACTERISTICS, "opf_devicecharacteristics", "Time average of DeviceCharacteristics file operations"); + logger.reset(plb.create_perf_counters()); + if (int rc = SimpleRADOSStriper::config_logger(cct.get(), "libcephsqlite_striper", &striper_logger); rc < 0) { + return rc; + } + cct->get_perfcounters_collection()->add(logger.get()); + cct->get_perfcounters_collection()->add(striper_logger.get()); + return 0; + } + + std::pair<cctptr, rsptr> get_cluster() { + std::scoped_lock lock(cluster_mutex); + if (!cct) { + if (int rc = _open(nullptr); rc < 0) { + ceph_abort("could not open connection to ceph"); + } + } + return {cct, cluster}; + } + int connect() { + std::scoped_lock lock(cluster_mutex); + return _connect(); + } + int reconnect() { + std::scoped_lock lock(cluster_mutex); + _disconnect(); + return _connect(); + } + int maybe_reconnect(rsptr _cluster) { + std::scoped_lock lock(cluster_mutex); + if (!cluster || cluster == _cluster) { + ldout(cct, 10) << "reconnecting to RADOS" << dendl; + _disconnect(); + return _connect(); + } else { + ldout(cct, 10) << "already reconnected" << dendl; + return 0; + } + } + int open(CephContext* _cct) { + std::scoped_lock lock(cluster_mutex); + return _open(_cct); + } + + std::unique_ptr<PerfCounters> logger; + std::shared_ptr<PerfCounters> striper_logger; + +private: + int _open(CephContext* _cct) { + if (!_cct) { + std::vector<const char*> env_args; + env_to_vec(env_args, "CEPH_ARGS"); + std::string cluster, conf_file_list; // unused + CephInitParameters iparams = ceph_argparse_early_args(env_args, CEPH_ENTITY_TYPE_CLIENT, &cluster, &conf_file_list); + cct = cctptr(common_preinit(iparams, CODE_ENVIRONMENT_LIBRARY, 0), false); + cct->_conf.parse_config_files(nullptr, &std::cerr, 0); + cct->_conf.parse_env(cct->get_module_type()); // environment variables override + cct->_conf.apply_changes(nullptr); + common_init_finish(cct.get()); + } else { + cct = cctptr(_cct); + } + + if (int rc = setup_perf(); rc < 0) { + return rc; + } + + if (int rc = _connect(); rc < 0) { + return rc; + } + + return 0; + } + void _disconnect() { + if (cluster) { + cluster.reset(); + } + } + int _connect() { + ceph_assert(cct); + auto _cluster = rsptr(new librados::Rados()); + ldout(cct, 5) << "initializing RADOS handle as " << cct->_conf->name << dendl; + if (int rc = _cluster->init_with_context(cct.get()); rc < 0) { + lderr(cct) << "cannot initialize RADOS: " << cpp_strerror(rc) << dendl; + return rc; + } + if (int rc = _cluster->connect(); rc < 0) { + lderr(cct) << "cannot connect: " << cpp_strerror(rc) << dendl; + return rc; + } + auto s = _cluster->get_addrs(); + ldout(cct, 5) << "completed connection to RADOS with address " << s << dendl; + cluster = std::move(_cluster); + return 0; + } + + ceph::mutex cluster_mutex = ceph::make_mutex("libcephsqlite");; + cctptr cct; + rsptr cluster; +}; + +struct cephsqlite_fileloc { + std::string pool; + std::string radosns; + std::string name; +}; + +struct cephsqlite_fileio { + cctptr cct; + rsptr cluster; // anchor for ioctx + librados::IoCtx ioctx; + std::unique_ptr<SimpleRADOSStriper> rs; +}; + +std::ostream& operator<<(std::ostream &out, const cephsqlite_fileloc& fileloc) { + return out + << "[" + << fileloc.pool + << ":" + << fileloc.radosns + << "/" + << fileloc.name + << "]" + ; +} + +struct cephsqlite_file { + sqlite3_file base; + struct sqlite3_vfs* vfs = nullptr; + int flags = 0; + // There are 5 lock states: https://sqlite.org/c3ref/c_lock_exclusive.html + int lock = 0; + struct cephsqlite_fileloc loc{}; + struct cephsqlite_fileio io{}; +}; + + +#define getdata(vfs) (*((cephsqlite_appdata*)((vfs)->pAppData))) + +static int Lock(sqlite3_file *file, int ilock) +{ + auto f = (cephsqlite_file*)file; + auto start = ceph::coarse_mono_clock::now(); + df(5) << std::hex << ilock << dendl; + + auto& lock = f->lock; + ceph_assert(!f->io.rs->is_locked() || lock > SQLITE_LOCK_NONE); + ceph_assert(lock <= ilock); + if (!f->io.rs->is_locked() && ilock > SQLITE_LOCK_NONE) { + if (int rc = f->io.rs->lock(0); rc < 0) { + df(5) << "failed: " << rc << dendl; + if (rc == -EBLOCKLISTED) { + getdata(f->vfs).maybe_reconnect(f->io.cluster); + } + return SQLITE_IOERR; + } + } + + lock = ilock; + auto end = ceph::coarse_mono_clock::now(); + getdata(f->vfs).logger->tinc(P_OPF_LOCK, end-start); + return SQLITE_OK; +} + +static int Unlock(sqlite3_file *file, int ilock) +{ + auto f = (cephsqlite_file*)file; + auto start = ceph::coarse_mono_clock::now(); + df(5) << std::hex << ilock << dendl; + + auto& lock = f->lock; + ceph_assert(lock == SQLITE_LOCK_NONE || (lock > SQLITE_LOCK_NONE && f->io.rs->is_locked())); + ceph_assert(lock >= ilock); + if (ilock <= SQLITE_LOCK_NONE && SQLITE_LOCK_NONE < lock) { + if (int rc = f->io.rs->unlock(); rc < 0) { + df(5) << "failed: " << rc << dendl; + if (rc == -EBLOCKLISTED) { + getdata(f->vfs).maybe_reconnect(f->io.cluster); + } + return SQLITE_IOERR; + } + } + + lock = ilock; + auto end = ceph::coarse_mono_clock::now(); + getdata(f->vfs).logger->tinc(P_OPF_UNLOCK, end-start); + return SQLITE_OK; +} + +static int CheckReservedLock(sqlite3_file *file, int *result) +{ + auto f = (cephsqlite_file*)file; + auto start = ceph::coarse_mono_clock::now(); + df(5) << dendl; + *result = 0; + + auto& lock = f->lock; + if (lock > SQLITE_LOCK_SHARED) { + *result = 1; + } + + df(10); + f->io.rs->print_lockers(*_dout); + *_dout << dendl; + + auto end = ceph::coarse_mono_clock::now(); + getdata(f->vfs).logger->tinc(P_OPF_CHECKRESERVEDLOCK, end-start); + return SQLITE_OK; +} + +static int Close(sqlite3_file *file) +{ + auto f = (cephsqlite_file*)file; + auto start = ceph::coarse_mono_clock::now(); + df(5) << dendl; + f->~cephsqlite_file(); + auto end = ceph::coarse_mono_clock::now(); + getdata(f->vfs).logger->tinc(P_OPF_CLOSE, end-start); + return SQLITE_OK; +} + +static int Read(sqlite3_file *file, void *buf, int len, sqlite_int64 off) +{ + auto f = (cephsqlite_file*)file; + auto start = ceph::coarse_mono_clock::now(); + df(5) << buf << " " << off << "~" << len << dendl; + + if (int rc = f->io.rs->read(buf, len, off); rc < 0) { + df(5) << "read failed: " << cpp_strerror(rc) << dendl; + if (rc == -EBLOCKLISTED) { + getdata(f->vfs).maybe_reconnect(f->io.cluster); + } + return SQLITE_IOERR_READ; + } else { + df(5) << "= " << rc << dendl; + auto end = ceph::coarse_mono_clock::now(); + getdata(f->vfs).logger->tinc(P_OPF_READ, end-start); + if (rc < len) { + memset((unsigned char*)buf+rc, 0, len-rc); + return SQLITE_IOERR_SHORT_READ; + } else { + return SQLITE_OK; + } + } +} + +static int Write(sqlite3_file *file, const void *buf, int len, sqlite_int64 off) +{ + auto f = (cephsqlite_file*)file; + auto start = ceph::coarse_mono_clock::now(); + df(5) << off << "~" << len << dendl; + + if (int rc = f->io.rs->write(buf, len, off); rc < 0) { + df(5) << "write failed: " << cpp_strerror(rc) << dendl; + if (rc == -EBLOCKLISTED) { + getdata(f->vfs).maybe_reconnect(f->io.cluster); + } + return SQLITE_IOERR_WRITE; + } else { + df(5) << "= " << rc << dendl; + auto end = ceph::coarse_mono_clock::now(); + getdata(f->vfs).logger->tinc(P_OPF_WRITE, end-start); + return SQLITE_OK; + } + +} + +static int Truncate(sqlite3_file *file, sqlite_int64 size) +{ + auto f = (cephsqlite_file*)file; + auto start = ceph::coarse_mono_clock::now(); + df(5) << size << dendl; + + if (int rc = f->io.rs->truncate(size); rc < 0) { + df(5) << "truncate failed: " << cpp_strerror(rc) << dendl; + if (rc == -EBLOCKLISTED) { + getdata(f->vfs).maybe_reconnect(f->io.cluster); + } + return SQLITE_IOERR; + } + + auto end = ceph::coarse_mono_clock::now(); + getdata(f->vfs).logger->tinc(P_OPF_TRUNCATE, end-start); + return SQLITE_OK; +} + +static int Sync(sqlite3_file *file, int flags) +{ + auto f = (cephsqlite_file*)file; + auto start = ceph::coarse_mono_clock::now(); + df(5) << flags << dendl; + + if (int rc = f->io.rs->flush(); rc < 0) { + df(5) << "failed: " << cpp_strerror(rc) << dendl; + if (rc == -EBLOCKLISTED) { + getdata(f->vfs).maybe_reconnect(f->io.cluster); + } + return SQLITE_IOERR; + } + + df(5) << " = 0" << dendl; + + auto end = ceph::coarse_mono_clock::now(); + getdata(f->vfs).logger->tinc(P_OPF_SYNC, end-start); + return SQLITE_OK; +} + + +static int FileSize(sqlite3_file *file, sqlite_int64 *osize) +{ + auto f = (cephsqlite_file*)file; + auto start = ceph::coarse_mono_clock::now(); + df(5) << dendl; + + uint64_t size = 0; + if (int rc = f->io.rs->stat(&size); rc < 0) { + df(5) << "stat failed: " << cpp_strerror(rc) << dendl; + if (rc == -EBLOCKLISTED) { + getdata(f->vfs).maybe_reconnect(f->io.cluster); + } + return SQLITE_NOTFOUND; + } + + *osize = (sqlite_int64)size; + + df(5) << "= " << size << dendl; + + auto end = ceph::coarse_mono_clock::now(); + getdata(f->vfs).logger->tinc(P_OPF_FILESIZE, end-start); + return SQLITE_OK; +} + + +static bool parsepath(std::string_view path, struct cephsqlite_fileloc* fileloc) +{ + static const std::regex re1{"^/*(\\*[[:digit:]]+):([[:alnum:]\\-_.]*)/([[:alnum:]\\-._]+)$"}; + static const std::regex re2{"^/*([[:alnum:]\\-_.]+):([[:alnum:]\\-_.]*)/([[:alnum:]\\-._]+)$"}; + + std::cmatch cm; + if (!std::regex_match(path.data(), cm, re1)) { + if (!std::regex_match(path.data(), cm, re2)) { + return false; + } + } + fileloc->pool = cm[1]; + fileloc->radosns = cm[2]; + fileloc->name = cm[3]; + + return true; +} + +static int makestriper(sqlite3_vfs* vfs, cctptr cct, rsptr cluster, const cephsqlite_fileloc& loc, cephsqlite_fileio* io) +{ + bool gotmap = false; + + d(cct,cluster,10) << loc << dendl; + +enoent_retry: + if (loc.pool[0] == '*') { + std::string err; + int64_t id = strict_strtoll(loc.pool.c_str()+1, 10, &err); + ceph_assert(err.empty()); + if (int rc = cluster->ioctx_create2(id, io->ioctx); rc < 0) { + if (rc == -ENOENT && !gotmap) { + cluster->wait_for_latest_osdmap(); + gotmap = true; + goto enoent_retry; + } + d(cct,cluster,1) << "cannot create ioctx: " << cpp_strerror(rc) << dendl; + return rc; + } + } else { + if (int rc = cluster->ioctx_create(loc.pool.c_str(), io->ioctx); rc < 0) { + if (rc == -ENOENT && !gotmap) { + cluster->wait_for_latest_osdmap(); + gotmap = true; + goto enoent_retry; + } + d(cct,cluster,1) << "cannot create ioctx: " << cpp_strerror(rc) << dendl; + return rc; + } + } + + if (!loc.radosns.empty()) + io->ioctx.set_namespace(loc.radosns); + + io->rs = std::make_unique<SimpleRADOSStriper>(io->ioctx, loc.name); + io->rs->set_logger(getdata(vfs).striper_logger); + io->rs->set_lock_timeout(cct->_conf.get_val<std::chrono::milliseconds>("cephsqlite_lock_renewal_timeout")); + io->rs->set_lock_interval(cct->_conf.get_val<std::chrono::milliseconds>("cephsqlite_lock_renewal_interval")); + io->rs->set_blocklist_the_dead(cct->_conf.get_val<bool>("cephsqlite_blocklist_dead_locker")); + io->cluster = std::move(cluster); + io->cct = cct; + + return 0; +} + +static int SectorSize(sqlite3_file* sf) +{ + static const int size = 65536; + auto start = ceph::coarse_mono_clock::now(); + auto f = (cephsqlite_file*)sf; + df(5) << " = " << size << dendl; + auto end = ceph::coarse_mono_clock::now(); + getdata(f->vfs).logger->tinc(P_OPF_SECTORSIZE, end-start); + return size; +} + +static int FileControl(sqlite3_file* sf, int op, void *arg) +{ + auto f = (cephsqlite_file*)sf; + auto start = ceph::coarse_mono_clock::now(); + df(5) << op << ", " << arg << dendl; + auto end = ceph::coarse_mono_clock::now(); + getdata(f->vfs).logger->tinc(P_OPF_FILECONTROL, end-start); + return SQLITE_NOTFOUND; +} + +static int DeviceCharacteristics(sqlite3_file* sf) +{ + auto f = (cephsqlite_file*)sf; + auto start = ceph::coarse_mono_clock::now(); + df(5) << dendl; + static const int c = 0 + |SQLITE_IOCAP_ATOMIC + |SQLITE_IOCAP_POWERSAFE_OVERWRITE + |SQLITE_IOCAP_UNDELETABLE_WHEN_OPEN + |SQLITE_IOCAP_SAFE_APPEND + ; + auto end = ceph::coarse_mono_clock::now(); + getdata(f->vfs).logger->tinc(P_OPF_DEVICECHARACTERISTICS, end-start); + return c; +} + +static int Open(sqlite3_vfs *vfs, const char *name, sqlite3_file *file, + int flags, int *oflags) +{ + static const sqlite3_io_methods io = { + 1, /* iVersion */ + Close, /* xClose */ + Read, /* xRead */ + Write, /* xWrite */ + Truncate, /* xTruncate */ + Sync, /* xSync */ + FileSize, /* xFileSize */ + Lock, /* xLock */ + Unlock, /* xUnlock */ + CheckReservedLock, /* xCheckReservedLock */ + FileControl, /* xFileControl */ + SectorSize, /* xSectorSize */ + DeviceCharacteristics /* xDeviceCharacteristics */ + }; + + auto start = ceph::coarse_mono_clock::now(); + bool gotmap = false; + auto [cct, cluster] = getdata(vfs).get_cluster(); + + /* we are not going to create temporary files */ + if (name == NULL) { + dv(-1) << " cannot open temporary database" << dendl; + return SQLITE_CANTOPEN; + } + auto path = std::string_view(name); + if (path == ":memory:") { + dv(-1) << " cannot open temporary database" << dendl; + return SQLITE_IOERR; + } + + dv(5) << path << " flags=" << std::hex << flags << dendl; + + auto f = new (file)cephsqlite_file(); + f->vfs = vfs; + if (!parsepath(path, &f->loc)) { + ceph_assert(0); /* xFullPathname validates! */ + } + f->flags = flags; + +enoent_retry: + if (int rc = makestriper(vfs, cct, cluster, f->loc, &f->io); rc < 0) { + f->~cephsqlite_file(); + dv(-1) << "cannot open striper" << dendl; + return SQLITE_IOERR; + } + + if (flags & SQLITE_OPEN_CREATE) { + dv(10) << "OPEN_CREATE" << dendl; + if (int rc = f->io.rs->create(); rc < 0 && rc != -EEXIST) { + if (rc == -ENOENT && !gotmap) { + /* we may have an out of date OSDMap which cancels the op in the + * Objecter. Try to get a new one and retry. This is mostly noticable + * in testing when pools are getting created/deleted left and right. + */ + dv(5) << "retrying create after getting latest OSDMap" << dendl; + cluster->wait_for_latest_osdmap(); + gotmap = true; + goto enoent_retry; + } + dv(5) << "file cannot be created: " << cpp_strerror(rc) << dendl; + return SQLITE_IOERR; + } + } + + if (int rc = f->io.rs->open(); rc < 0) { + if (rc == -ENOENT && !gotmap) { + /* See comment above for create case. */ + dv(5) << "retrying open after getting latest OSDMap" << dendl; + cluster->wait_for_latest_osdmap(); + gotmap = true; + goto enoent_retry; + } + dv(10) << "cannot open striper: " << cpp_strerror(rc) << dendl; + return rc; + } + + if (oflags) { + *oflags = flags; + } + f->base.pMethods = &io; + auto end = ceph::coarse_mono_clock::now(); + getdata(vfs).logger->tinc(P_OP_OPEN, end-start); + return SQLITE_OK; +} + +/* +** Delete the file identified by argument path. If the dsync parameter +** is non-zero, then ensure the file-system modification to delete the +** file has been synced to disk before returning. +*/ +static int Delete(sqlite3_vfs* vfs, const char* path, int dsync) +{ + auto start = ceph::coarse_mono_clock::now(); + auto [cct, cluster] = getdata(vfs).get_cluster(); + dv(5) << "'" << path << "', " << dsync << dendl; + + cephsqlite_fileloc fileloc; + if (!parsepath(path, &fileloc)) { + dv(5) << "path does not parse!" << dendl; + return SQLITE_NOTFOUND; + } + + cephsqlite_fileio io; + if (int rc = makestriper(vfs, cct, cluster, fileloc, &io); rc < 0) { + dv(-1) << "cannot open striper" << dendl; + return SQLITE_IOERR; + } + + if (int rc = io.rs->lock(0); rc < 0) { + return SQLITE_IOERR; + } + + if (int rc = io.rs->remove(); rc < 0) { + dv(5) << "= " << rc << dendl; + return SQLITE_IOERR_DELETE; + } + + /* No need to unlock */ + dv(5) << "= 0" << dendl; + auto end = ceph::coarse_mono_clock::now(); + getdata(vfs).logger->tinc(P_OP_DELETE, end-start); + + return SQLITE_OK; +} + +/* +** Query the file-system to see if the named file exists, is readable or +** is both readable and writable. +*/ +static int Access(sqlite3_vfs* vfs, const char* path, int flags, int* result) +{ + auto start = ceph::coarse_mono_clock::now(); + auto [cct, cluster] = getdata(vfs).get_cluster(); + dv(5) << path << " " << std::hex << flags << dendl; + + cephsqlite_fileloc fileloc; + if (!parsepath(path, &fileloc)) { + dv(5) << "path does not parse!" << dendl; + return SQLITE_NOTFOUND; + } + + cephsqlite_fileio io; + if (int rc = makestriper(vfs, cct, cluster, fileloc, &io); rc < 0) { + dv(-1) << "cannot open striper" << dendl; + return SQLITE_IOERR; + } + + if (int rc = io.rs->open(); rc < 0) { + if (rc == -ENOENT) { + *result = 0; + return SQLITE_OK; + } else { + dv(10) << "cannot open striper: " << cpp_strerror(rc) << dendl; + *result = 0; + return SQLITE_IOERR; + } + } + + uint64_t size = 0; + if (int rc = io.rs->stat(&size); rc < 0) { + dv(5) << "= " << rc << " (" << cpp_strerror(rc) << ")" << dendl; + *result = 0; + } else { + dv(5) << "= 0" << dendl; + *result = 1; + } + + auto end = ceph::coarse_mono_clock::now(); + getdata(vfs).logger->tinc(P_OP_ACCESS, end-start); + return SQLITE_OK; +} + +/* This method is only called once for each database. It provides a chance to + * reformat the path into a canonical format. + */ +static int FullPathname(sqlite3_vfs* vfs, const char* ipath, int opathlen, char* opath) +{ + auto start = ceph::coarse_mono_clock::now(); + auto path = std::string_view(ipath); + auto [cct, cluster] = getdata(vfs).get_cluster(); + dv(5) << "1: " << path << dendl; + + cephsqlite_fileloc fileloc; + if (!parsepath(path, &fileloc)) { + dv(5) << "path does not parse!" << dendl; + return SQLITE_NOTFOUND; + } + dv(5) << " parsed " << fileloc << dendl; + + auto p = fmt::format("{}:{}/{}", fileloc.pool, fileloc.radosns, fileloc.name); + if (p.size() >= (size_t)opathlen) { + dv(5) << "path too long!" << dendl; + return SQLITE_CANTOPEN; + } + strcpy(opath, p.c_str()); + dv(5) << " output " << p << dendl; + + auto end = ceph::coarse_mono_clock::now(); + getdata(vfs).logger->tinc(P_OP_FULLPATHNAME, end-start); + return SQLITE_OK; +} + +static int CurrentTime(sqlite3_vfs* vfs, sqlite3_int64* time) +{ + auto start = ceph::coarse_mono_clock::now(); + auto [cct, cluster] = getdata(vfs).get_cluster(); + dv(5) << time << dendl; + + auto t = ceph_clock_now(); + *time = t.to_msec() + 2440587.5*86400000; /* julian days since 1970 converted to ms */ + + auto end = ceph::coarse_mono_clock::now(); + getdata(vfs).logger->tinc(P_OP_CURRENTTIME, end-start); + return SQLITE_OK; +} + +LIBCEPHSQLITE_API int cephsqlite_setcct(CephContext* _cct, char** ident) +{ + ldout(_cct, 1) << "cct: " << _cct << dendl; + + if (sqlite3_api == nullptr) { + lderr(_cct) << "API violation: must have sqlite3 init libcephsqlite" << dendl; + return -EINVAL; + } + + auto vfs = sqlite3_vfs_find("ceph"); + if (!vfs) { + lderr(_cct) << "API violation: must have sqlite3 init libcephsqlite" << dendl; + return -EINVAL; + } + + auto& appd = getdata(vfs); + if (int rc = appd.open(_cct); rc < 0) { + return rc; + } + + auto [cct, cluster] = appd.get_cluster(); + + auto s = cluster->get_addrs(); + if (ident) { + *ident = strdup(s.c_str()); + } + + ldout(cct, 1) << "complete" << dendl; + + return 0; +} + +static void f_perf(sqlite3_context* ctx, int argc, sqlite3_value** argv) +{ + auto vfs = (sqlite3_vfs*)sqlite3_user_data(ctx); + auto [cct, cluster] = getdata(vfs).get_cluster(); + dv(10) << dendl; + auto&& appd = getdata(vfs); + JSONFormatter f(false); + f.open_object_section("ceph_perf"); + appd.logger->dump_formatted(&f, false, false); + appd.striper_logger->dump_formatted(&f, false, false); + f.close_section(); + { + CachedStackStringStream css; + f.flush(*css); + auto sv = css->strv(); + dv(20) << " = " << sv << dendl; + sqlite3_result_text(ctx, sv.data(), sv.size(), SQLITE_TRANSIENT); + } +} + +static void f_status(sqlite3_context* ctx, int argc, sqlite3_value** argv) +{ + auto vfs = (sqlite3_vfs*)sqlite3_user_data(ctx); + auto [cct, cluster] = getdata(vfs).get_cluster(); + dv(10) << dendl; + JSONFormatter f(false); + f.open_object_section("ceph_status"); + f.dump_int("id", cluster->get_instance_id()); + f.dump_string("addr", cluster->get_addrs()); + f.close_section(); + { + CachedStackStringStream css; + f.flush(*css); + auto sv = css->strv(); + dv(20) << " = " << sv << dendl; + sqlite3_result_text(ctx, sv.data(), sv.size(), SQLITE_TRANSIENT); + } +} + +static int autoreg(sqlite3* db, char** err, const struct sqlite3_api_routines* thunk) +{ + auto vfs = sqlite3_vfs_find("ceph"); + if (!vfs) { + ceph_abort("ceph vfs not found"); + } + + if (int rc = sqlite3_create_function(db, "ceph_perf", 0, SQLITE_UTF8, vfs, f_perf, nullptr, nullptr); rc) { + return rc; + } + + if (int rc = sqlite3_create_function(db, "ceph_status", 0, SQLITE_UTF8, vfs, f_status, nullptr, nullptr); rc) { + return rc; + } + + return SQLITE_OK; +} + +/* You may wonder why we have an atexit handler? After all, atexit/exit creates + * a mess for multithreaded programs. Well, sqlite3 does not have an API for + * orderly removal of extensions. And, in fact, any API we might make + * unofficially (such as "sqlite3_cephsqlite_fini") would potentially race with + * other threads interacting with sqlite3 + the "ceph" VFS. There is a method + * for removing a VFS but it's not called by sqlite3 in any error scenario and + * there is no mechanism within sqlite3 to tell a VFS to unregister itself. + * + * This all would be mostly okay if /bin/sqlite3 did not call exit(3), but it + * does. (This occurs only for the sqlite3 binary, not when used as a library.) + * exit(3) calls destructors on all static-duration structures for the program. + * This breaks any outstanding threads created by the librados handle in all + * sorts of fantastic ways from C++ exceptions to memory faults. In general, + * Ceph libraries are not tolerant of exit(3) (_exit(3) is okay!). Applications + * must clean up after themselves or _exit(3). + * + * So, we have an atexit handler for libcephsqlite. This simply shuts down the + * RADOS handle. We can be assured that this occurs before any ceph library + * static-duration structures are destructed due to ordering guarantees by + * exit(3). Generally, we only see this called when the VFS is used by + * /bin/sqlite3 and only during sqlite3 error scenarios (like I/O errors + * arrising from blocklisting). + */ + +static void cephsqlite_atexit() +{ + if (auto vfs = sqlite3_vfs_find("ceph"); vfs) { + if (vfs->pAppData) { + auto&& appd = getdata(vfs); + delete &appd; + vfs->pAppData = nullptr; + } + } +} + +LIBCEPHSQLITE_API int sqlite3_cephsqlite_init(sqlite3* db, char** err, const sqlite3_api_routines* api) +{ + SQLITE_EXTENSION_INIT2(api); + + auto vfs = sqlite3_vfs_find("ceph"); + if (!vfs) { + vfs = (sqlite3_vfs*) calloc(1, sizeof(sqlite3_vfs)); + auto appd = new cephsqlite_appdata; + vfs->iVersion = 2; + vfs->szOsFile = sizeof(struct cephsqlite_file); + vfs->mxPathname = 4096; + vfs->zName = "ceph"; + vfs->pAppData = appd; + vfs->xOpen = Open; + vfs->xDelete = Delete; + vfs->xAccess = Access; + vfs->xFullPathname = FullPathname; + vfs->xCurrentTimeInt64 = CurrentTime; + if (int rc = sqlite3_vfs_register(vfs, 0); rc) { + delete appd; + free(vfs); + return rc; + } + } + + if (int rc = std::atexit(cephsqlite_atexit); rc) { + return SQLITE_INTERNAL; + } + + if (int rc = sqlite3_auto_extension((void(*)(void))autoreg); rc) { + return rc; + } + if (int rc = autoreg(db, err, api); rc) { + return rc; + } + + return SQLITE_OK_LOAD_PERMANENTLY; +} |