diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-27 10:05:51 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-27 10:05:51 +0000 |
commit | 5d1646d90e1f2cceb9f0828f4b28318cd0ec7744 (patch) | |
tree | a94efe259b9009378be6d90eb30d2b019d95c194 /fs/ceph | |
parent | Initial commit. (diff) | |
download | linux-5d1646d90e1f2cceb9f0828f4b28318cd0ec7744.tar.xz linux-5d1646d90e1f2cceb9f0828f4b28318cd0ec7744.zip |
Adding upstream version 5.10.209.upstream/5.10.209
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'fs/ceph')
-rw-r--r-- | fs/ceph/Kconfig | 50 | ||||
-rw-r--r-- | fs/ceph/Makefile | 14 | ||||
-rw-r--r-- | fs/ceph/acl.c | 257 | ||||
-rw-r--r-- | fs/ceph/addr.c | 2147 | ||||
-rw-r--r-- | fs/ceph/cache.c | 352 | ||||
-rw-r--r-- | fs/ceph/cache.h | 176 | ||||
-rw-r--r-- | fs/ceph/caps.c | 4637 | ||||
-rw-r--r-- | fs/ceph/ceph_frag.c | 23 | ||||
-rw-r--r-- | fs/ceph/debugfs.c | 410 | ||||
-rw-r--r-- | fs/ceph/dir.c | 1984 | ||||
-rw-r--r-- | fs/ceph/export.c | 571 | ||||
-rw-r--r-- | fs/ceph/file.c | 2530 | ||||
-rw-r--r-- | fs/ceph/inode.c | 2423 | ||||
-rw-r--r-- | fs/ceph/io.c | 163 | ||||
-rw-r--r-- | fs/ceph/io.h | 12 | ||||
-rw-r--r-- | fs/ceph/ioctl.c | 295 | ||||
-rw-r--r-- | fs/ceph/ioctl.h | 101 | ||||
-rw-r--r-- | fs/ceph/locks.c | 493 | ||||
-rw-r--r-- | fs/ceph/mds_client.c | 5275 | ||||
-rw-r--r-- | fs/ceph/mds_client.h | 581 | ||||
-rw-r--r-- | fs/ceph/mdsmap.c | 419 | ||||
-rw-r--r-- | fs/ceph/metric.c | 310 | ||||
-rw-r--r-- | fs/ceph/metric.h | 160 | ||||
-rw-r--r-- | fs/ceph/quota.c | 574 | ||||
-rw-r--r-- | fs/ceph/snap.c | 1205 | ||||
-rw-r--r-- | fs/ceph/strings.c | 129 | ||||
-rw-r--r-- | fs/ceph/super.c | 1347 | ||||
-rw-r--r-- | fs/ceph/super.h | 1262 | ||||
-rw-r--r-- | fs/ceph/util.c | 100 | ||||
-rw-r--r-- | fs/ceph/xattr.c | 1306 |
30 files changed, 29306 insertions, 0 deletions
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig new file mode 100644 index 000000000..471e40156 --- /dev/null +++ b/fs/ceph/Kconfig @@ -0,0 +1,50 @@ +# SPDX-License-Identifier: GPL-2.0-only +config CEPH_FS + tristate "Ceph distributed file system" + depends on INET + select CEPH_LIB + select LIBCRC32C + select CRYPTO_AES + select CRYPTO + default n + help + Choose Y or M here to include support for mounting the + experimental Ceph distributed file system. Ceph is an extremely + scalable file system designed to provide high performance, + reliable access to petabytes of storage. + + More information at https://ceph.io/. + + If unsure, say N. + +if CEPH_FS +config CEPH_FSCACHE + bool "Enable Ceph client caching support" + depends on CEPH_FS=m && FSCACHE || CEPH_FS=y && FSCACHE=y + help + Choose Y here to enable persistent, read-only local + caching support for Ceph clients using FS-Cache + +endif + +config CEPH_FS_POSIX_ACL + bool "Ceph POSIX Access Control Lists" + depends on CEPH_FS + select FS_POSIX_ACL + help + POSIX Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + If you don't know what Access Control Lists are, say N + +config CEPH_FS_SECURITY_LABEL + bool "CephFS Security Labels" + depends on CEPH_FS && SECURITY + help + Security labels support alternative access control models + implemented by security modules like SELinux. This option + enables an extended attribute handler for file security + labels in the Ceph filesystem. + + If you are not using a security module that requires using + extended attributes for file security labels, say N. diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile new file mode 100644 index 000000000..50c635dc7 --- /dev/null +++ b/fs/ceph/Makefile @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for CEPH filesystem. +# + +obj-$(CONFIG_CEPH_FS) += ceph.o + +ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ + export.o caps.o snap.o xattr.o quota.o io.o \ + mds_client.o mdsmap.o strings.o ceph_frag.o \ + debugfs.o util.o metric.o + +ceph-$(CONFIG_CEPH_FSCACHE) += cache.o +ceph-$(CONFIG_CEPH_FS_POSIX_ACL) += acl.o diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c new file mode 100644 index 000000000..e0465741c --- /dev/null +++ b/fs/ceph/acl.c @@ -0,0 +1,257 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * linux/fs/ceph/acl.c + * + * Copyright (C) 2013 Guangliang Zhao, <lucienchao@gmail.com> + */ + +#include <linux/ceph/ceph_debug.h> +#include <linux/fs.h> +#include <linux/string.h> +#include <linux/xattr.h> +#include <linux/posix_acl_xattr.h> +#include <linux/posix_acl.h> +#include <linux/sched.h> +#include <linux/slab.h> + +#include "super.h" + +static inline void ceph_set_cached_acl(struct inode *inode, + int type, struct posix_acl *acl) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + spin_lock(&ci->i_ceph_lock); + if (__ceph_caps_issued_mask_metric(ci, CEPH_CAP_XATTR_SHARED, 0)) + set_cached_acl(inode, type, acl); + else + forget_cached_acl(inode, type); + spin_unlock(&ci->i_ceph_lock); +} + +struct posix_acl *ceph_get_acl(struct inode *inode, int type) +{ + int size; + unsigned int retry_cnt = 0; + const char *name; + char *value = NULL; + struct posix_acl *acl; + + switch (type) { + case ACL_TYPE_ACCESS: + name = XATTR_NAME_POSIX_ACL_ACCESS; + break; + case ACL_TYPE_DEFAULT: + name = XATTR_NAME_POSIX_ACL_DEFAULT; + break; + default: + BUG(); + } + +retry: + size = __ceph_getxattr(inode, name, "", 0); + if (size > 0) { + value = kzalloc(size, GFP_NOFS); + if (!value) + return ERR_PTR(-ENOMEM); + size = __ceph_getxattr(inode, name, value, size); + } + + if (size == -ERANGE && retry_cnt < 10) { + retry_cnt++; + kfree(value); + value = NULL; + goto retry; + } + + if (size > 0) { + acl = posix_acl_from_xattr(&init_user_ns, value, size); + } else if (size == -ENODATA || size == 0) { + acl = NULL; + } else { + pr_err_ratelimited("get acl %llx.%llx failed, err=%d\n", + ceph_vinop(inode), size); + acl = ERR_PTR(-EIO); + } + + kfree(value); + + if (!IS_ERR(acl)) + ceph_set_cached_acl(inode, type, acl); + + return acl; +} + +int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type) +{ + int ret = 0, size = 0; + const char *name = NULL; + char *value = NULL; + struct iattr newattrs; + struct timespec64 old_ctime = inode->i_ctime; + umode_t new_mode = inode->i_mode, old_mode = inode->i_mode; + + if (ceph_snap(inode) != CEPH_NOSNAP) { + ret = -EROFS; + goto out; + } + + switch (type) { + case ACL_TYPE_ACCESS: + name = XATTR_NAME_POSIX_ACL_ACCESS; + if (acl) { + ret = posix_acl_update_mode(inode, &new_mode, &acl); + if (ret) + goto out; + } + break; + case ACL_TYPE_DEFAULT: + if (!S_ISDIR(inode->i_mode)) { + ret = acl ? -EINVAL : 0; + goto out; + } + name = XATTR_NAME_POSIX_ACL_DEFAULT; + break; + default: + ret = -EINVAL; + goto out; + } + + if (acl) { + size = posix_acl_xattr_size(acl->a_count); + value = kmalloc(size, GFP_NOFS); + if (!value) { + ret = -ENOMEM; + goto out; + } + + ret = posix_acl_to_xattr(&init_user_ns, acl, value, size); + if (ret < 0) + goto out_free; + } + + if (new_mode != old_mode) { + newattrs.ia_ctime = current_time(inode); + newattrs.ia_mode = new_mode; + newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; + ret = __ceph_setattr(inode, &newattrs); + if (ret) + goto out_free; + } + + ret = __ceph_setxattr(inode, name, value, size, 0); + if (ret) { + if (new_mode != old_mode) { + newattrs.ia_ctime = old_ctime; + newattrs.ia_mode = old_mode; + newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; + __ceph_setattr(inode, &newattrs); + } + goto out_free; + } + + ceph_set_cached_acl(inode, type, acl); + +out_free: + kfree(value); +out: + return ret; +} + +int ceph_pre_init_acls(struct inode *dir, umode_t *mode, + struct ceph_acl_sec_ctx *as_ctx) +{ + struct posix_acl *acl, *default_acl; + size_t val_size1 = 0, val_size2 = 0; + struct ceph_pagelist *pagelist = NULL; + void *tmp_buf = NULL; + int err; + + err = posix_acl_create(dir, mode, &default_acl, &acl); + if (err) + return err; + + if (acl) { + err = posix_acl_equiv_mode(acl, mode); + if (err < 0) + goto out_err; + if (err == 0) { + posix_acl_release(acl); + acl = NULL; + } + } + + if (!default_acl && !acl) + return 0; + + if (acl) + val_size1 = posix_acl_xattr_size(acl->a_count); + if (default_acl) + val_size2 = posix_acl_xattr_size(default_acl->a_count); + + err = -ENOMEM; + tmp_buf = kmalloc(max(val_size1, val_size2), GFP_KERNEL); + if (!tmp_buf) + goto out_err; + pagelist = ceph_pagelist_alloc(GFP_KERNEL); + if (!pagelist) + goto out_err; + + err = ceph_pagelist_reserve(pagelist, PAGE_SIZE); + if (err) + goto out_err; + + ceph_pagelist_encode_32(pagelist, acl && default_acl ? 2 : 1); + + if (acl) { + size_t len = strlen(XATTR_NAME_POSIX_ACL_ACCESS); + err = ceph_pagelist_reserve(pagelist, len + val_size1 + 8); + if (err) + goto out_err; + ceph_pagelist_encode_string(pagelist, XATTR_NAME_POSIX_ACL_ACCESS, + len); + err = posix_acl_to_xattr(&init_user_ns, acl, + tmp_buf, val_size1); + if (err < 0) + goto out_err; + ceph_pagelist_encode_32(pagelist, val_size1); + ceph_pagelist_append(pagelist, tmp_buf, val_size1); + } + if (default_acl) { + size_t len = strlen(XATTR_NAME_POSIX_ACL_DEFAULT); + err = ceph_pagelist_reserve(pagelist, len + val_size2 + 8); + if (err) + goto out_err; + ceph_pagelist_encode_string(pagelist, + XATTR_NAME_POSIX_ACL_DEFAULT, len); + err = posix_acl_to_xattr(&init_user_ns, default_acl, + tmp_buf, val_size2); + if (err < 0) + goto out_err; + ceph_pagelist_encode_32(pagelist, val_size2); + ceph_pagelist_append(pagelist, tmp_buf, val_size2); + } + + kfree(tmp_buf); + + as_ctx->acl = acl; + as_ctx->default_acl = default_acl; + as_ctx->pagelist = pagelist; + return 0; + +out_err: + posix_acl_release(acl); + posix_acl_release(default_acl); + kfree(tmp_buf); + if (pagelist) + ceph_pagelist_release(pagelist); + return err; +} + +void ceph_init_inode_acls(struct inode *inode, struct ceph_acl_sec_ctx *as_ctx) +{ + if (!inode) + return; + ceph_set_cached_acl(inode, ACL_TYPE_ACCESS, as_ctx->acl); + ceph_set_cached_acl(inode, ACL_TYPE_DEFAULT, as_ctx->default_acl); +} diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c new file mode 100644 index 000000000..3465ff95c --- /dev/null +++ b/fs/ceph/addr.c @@ -0,0 +1,2147 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ceph/ceph_debug.h> + +#include <linux/backing-dev.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/writeback.h> /* generic_writepages */ +#include <linux/slab.h> +#include <linux/pagevec.h> +#include <linux/task_io_accounting_ops.h> +#include <linux/signal.h> +#include <linux/iversion.h> +#include <linux/ktime.h> + +#include "super.h" +#include "mds_client.h" +#include "cache.h" +#include "metric.h" +#include <linux/ceph/osd_client.h> +#include <linux/ceph/striper.h> + +/* + * Ceph address space ops. + * + * There are a few funny things going on here. + * + * The page->private field is used to reference a struct + * ceph_snap_context for _every_ dirty page. This indicates which + * snapshot the page was logically dirtied in, and thus which snap + * context needs to be associated with the osd write during writeback. + * + * Similarly, struct ceph_inode_info maintains a set of counters to + * count dirty pages on the inode. In the absence of snapshots, + * i_wrbuffer_ref == i_wrbuffer_ref_head == the dirty page count. + * + * When a snapshot is taken (that is, when the client receives + * notification that a snapshot was taken), each inode with caps and + * with dirty pages (dirty pages implies there is a cap) gets a new + * ceph_cap_snap in the i_cap_snaps list (which is sorted in ascending + * order, new snaps go to the tail). The i_wrbuffer_ref_head count is + * moved to capsnap->dirty. (Unless a sync write is currently in + * progress. In that case, the capsnap is said to be "pending", new + * writes cannot start, and the capsnap isn't "finalized" until the + * write completes (or fails) and a final size/mtime for the inode for + * that snap can be settled upon.) i_wrbuffer_ref_head is reset to 0. + * + * On writeback, we must submit writes to the osd IN SNAP ORDER. So, + * we look for the first capsnap in i_cap_snaps and write out pages in + * that snap context _only_. Then we move on to the next capsnap, + * eventually reaching the "live" or "head" context (i.e., pages that + * are not yet snapped) and are writing the most recently dirtied + * pages. + * + * Invalidate and so forth must take care to ensure the dirty page + * accounting is preserved. + */ + +#define CONGESTION_ON_THRESH(congestion_kb) (congestion_kb >> (PAGE_SHIFT-10)) +#define CONGESTION_OFF_THRESH(congestion_kb) \ + (CONGESTION_ON_THRESH(congestion_kb) - \ + (CONGESTION_ON_THRESH(congestion_kb) >> 2)) + +static inline struct ceph_snap_context *page_snap_context(struct page *page) +{ + if (PagePrivate(page)) + return (void *)page->private; + return NULL; +} + +/* + * Dirty a page. Optimistically adjust accounting, on the assumption + * that we won't race with invalidate. If we do, readjust. + */ +static int ceph_set_page_dirty(struct page *page) +{ + struct address_space *mapping = page->mapping; + struct inode *inode; + struct ceph_inode_info *ci; + struct ceph_snap_context *snapc; + + if (PageDirty(page)) { + dout("%p set_page_dirty %p idx %lu -- already dirty\n", + mapping->host, page, page->index); + BUG_ON(!PagePrivate(page)); + return 0; + } + + inode = mapping->host; + ci = ceph_inode(inode); + + /* dirty the head */ + spin_lock(&ci->i_ceph_lock); + BUG_ON(ci->i_wr_ref == 0); // caller should hold Fw reference + if (__ceph_have_pending_cap_snap(ci)) { + struct ceph_cap_snap *capsnap = + list_last_entry(&ci->i_cap_snaps, + struct ceph_cap_snap, + ci_item); + snapc = ceph_get_snap_context(capsnap->context); + capsnap->dirty_pages++; + } else { + BUG_ON(!ci->i_head_snapc); + snapc = ceph_get_snap_context(ci->i_head_snapc); + ++ci->i_wrbuffer_ref_head; + } + if (ci->i_wrbuffer_ref == 0) + ihold(inode); + ++ci->i_wrbuffer_ref; + dout("%p set_page_dirty %p idx %lu head %d/%d -> %d/%d " + "snapc %p seq %lld (%d snaps)\n", + mapping->host, page, page->index, + ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1, + ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head, + snapc, snapc->seq, snapc->num_snaps); + spin_unlock(&ci->i_ceph_lock); + + /* + * Reference snap context in page->private. Also set + * PagePrivate so that we get invalidatepage callback. + */ + BUG_ON(PagePrivate(page)); + page->private = (unsigned long)snapc; + SetPagePrivate(page); + + return __set_page_dirty_nobuffers(page); +} + +/* + * If we are truncating the full page (i.e. offset == 0), adjust the + * dirty page counters appropriately. Only called if there is private + * data on the page. + */ +static void ceph_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) +{ + struct inode *inode; + struct ceph_inode_info *ci; + struct ceph_snap_context *snapc = page_snap_context(page); + + inode = page->mapping->host; + ci = ceph_inode(inode); + + if (offset != 0 || length != PAGE_SIZE) { + dout("%p invalidatepage %p idx %lu partial dirty page %u~%u\n", + inode, page, page->index, offset, length); + return; + } + + ceph_invalidate_fscache_page(inode, page); + + WARN_ON(!PageLocked(page)); + if (!PagePrivate(page)) + return; + + dout("%p invalidatepage %p idx %lu full dirty page\n", + inode, page, page->index); + + ceph_put_wrbuffer_cap_refs(ci, 1, snapc); + ceph_put_snap_context(snapc); + page->private = 0; + ClearPagePrivate(page); +} + +static int ceph_releasepage(struct page *page, gfp_t g) +{ + dout("%p releasepage %p idx %lu (%sdirty)\n", page->mapping->host, + page, page->index, PageDirty(page) ? "" : "not "); + + /* Can we release the page from the cache? */ + if (!ceph_release_fscache_page(page, g)) + return 0; + + return !PagePrivate(page); +} + +/* read a single page, without unlocking it. */ +static int ceph_do_readpage(struct file *filp, struct page *page) +{ + struct inode *inode = file_inode(filp); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_osd_client *osdc = &fsc->client->osdc; + struct ceph_osd_request *req; + struct ceph_vino vino = ceph_vino(inode); + int err = 0; + u64 off = page_offset(page); + u64 len = PAGE_SIZE; + + if (off >= i_size_read(inode)) { + zero_user_segment(page, 0, PAGE_SIZE); + SetPageUptodate(page); + return 0; + } + + if (ci->i_inline_version != CEPH_INLINE_NONE) { + /* + * Uptodate inline data should have been added + * into page cache while getting Fcr caps. + */ + if (off == 0) + return -EINVAL; + zero_user_segment(page, 0, PAGE_SIZE); + SetPageUptodate(page); + return 0; + } + + err = ceph_readpage_from_fscache(inode, page); + if (err == 0) + return -EINPROGRESS; + + dout("readpage ino %llx.%llx file %p off %llu len %llu page %p index %lu\n", + vino.ino, vino.snap, filp, off, len, page, page->index); + req = ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len, 0, 1, + CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, NULL, + ci->i_truncate_seq, ci->i_truncate_size, + false); + if (IS_ERR(req)) + return PTR_ERR(req); + + osd_req_op_extent_osd_data_pages(req, 0, &page, len, 0, false, false); + + err = ceph_osdc_start_request(osdc, req, false); + if (!err) + err = ceph_osdc_wait_request(osdc, req); + + ceph_update_read_latency(&fsc->mdsc->metric, req->r_start_latency, + req->r_end_latency, err); + + ceph_osdc_put_request(req); + dout("readpage result %d\n", err); + + if (err == -ENOENT) + err = 0; + if (err < 0) { + ceph_fscache_readpage_cancel(inode, page); + if (err == -EBLOCKLISTED) + fsc->blocklisted = true; + goto out; + } + if (err < PAGE_SIZE) + /* zero fill remainder of page */ + zero_user_segment(page, err, PAGE_SIZE); + else + flush_dcache_page(page); + + SetPageUptodate(page); + ceph_readpage_to_fscache(inode, page); + +out: + return err < 0 ? err : 0; +} + +static int ceph_readpage(struct file *filp, struct page *page) +{ + int r = ceph_do_readpage(filp, page); + if (r != -EINPROGRESS) + unlock_page(page); + else + r = 0; + return r; +} + +/* + * Finish an async read(ahead) op. + */ +static void finish_read(struct ceph_osd_request *req) +{ + struct inode *inode = req->r_inode; + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_osd_data *osd_data; + int rc = req->r_result <= 0 ? req->r_result : 0; + int bytes = req->r_result >= 0 ? req->r_result : 0; + int num_pages; + int i; + + dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes); + if (rc == -EBLOCKLISTED) + ceph_inode_to_client(inode)->blocklisted = true; + + /* unlock all pages, zeroing any data we didn't read */ + osd_data = osd_req_op_extent_osd_data(req, 0); + BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES); + num_pages = calc_pages_for((u64)osd_data->alignment, + (u64)osd_data->length); + for (i = 0; i < num_pages; i++) { + struct page *page = osd_data->pages[i]; + + if (rc < 0 && rc != -ENOENT) { + ceph_fscache_readpage_cancel(inode, page); + goto unlock; + } + if (bytes < (int)PAGE_SIZE) { + /* zero (remainder of) page */ + int s = bytes < 0 ? 0 : bytes; + zero_user_segment(page, s, PAGE_SIZE); + } + dout("finish_read %p uptodate %p idx %lu\n", inode, page, + page->index); + flush_dcache_page(page); + SetPageUptodate(page); + ceph_readpage_to_fscache(inode, page); +unlock: + unlock_page(page); + put_page(page); + bytes -= PAGE_SIZE; + } + + ceph_update_read_latency(&fsc->mdsc->metric, req->r_start_latency, + req->r_end_latency, rc); + + kfree(osd_data->pages); +} + +/* + * start an async read(ahead) operation. return nr_pages we submitted + * a read for on success, or negative error code. + */ +static int start_read(struct inode *inode, struct ceph_rw_context *rw_ctx, + struct list_head *page_list, int max) +{ + struct ceph_osd_client *osdc = + &ceph_inode_to_client(inode)->client->osdc; + struct ceph_inode_info *ci = ceph_inode(inode); + struct page *page = lru_to_page(page_list); + struct ceph_vino vino; + struct ceph_osd_request *req; + u64 off; + u64 len; + int i; + struct page **pages; + pgoff_t next_index; + int nr_pages = 0; + int got = 0; + int ret = 0; + + if (!rw_ctx) { + /* caller of readpages does not hold buffer and read caps + * (fadvise, madvise and readahead cases) */ + int want = CEPH_CAP_FILE_CACHE; + ret = ceph_try_get_caps(inode, CEPH_CAP_FILE_RD, want, + true, &got); + if (ret < 0) { + dout("start_read %p, error getting cap\n", inode); + } else if (!(got & want)) { + dout("start_read %p, no cache cap\n", inode); + ret = 0; + } + if (ret <= 0) { + if (got) + ceph_put_cap_refs(ci, got); + while (!list_empty(page_list)) { + page = lru_to_page(page_list); + list_del(&page->lru); + put_page(page); + } + return ret; + } + } + + off = (u64) page_offset(page); + + /* count pages */ + next_index = page->index; + list_for_each_entry_reverse(page, page_list, lru) { + if (page->index != next_index) + break; + nr_pages++; + next_index++; + if (max && nr_pages == max) + break; + } + len = nr_pages << PAGE_SHIFT; + dout("start_read %p nr_pages %d is %lld~%lld\n", inode, nr_pages, + off, len); + vino = ceph_vino(inode); + req = ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len, + 0, 1, CEPH_OSD_OP_READ, + CEPH_OSD_FLAG_READ, NULL, + ci->i_truncate_seq, ci->i_truncate_size, + false); + if (IS_ERR(req)) { + ret = PTR_ERR(req); + goto out; + } + + /* build page vector */ + nr_pages = calc_pages_for(0, len); + pages = kmalloc_array(nr_pages, sizeof(*pages), GFP_KERNEL); + if (!pages) { + ret = -ENOMEM; + goto out_put; + } + for (i = 0; i < nr_pages; ++i) { + page = list_entry(page_list->prev, struct page, lru); + BUG_ON(PageLocked(page)); + list_del(&page->lru); + + dout("start_read %p adding %p idx %lu\n", inode, page, + page->index); + if (add_to_page_cache_lru(page, &inode->i_data, page->index, + GFP_KERNEL)) { + ceph_fscache_uncache_page(inode, page); + put_page(page); + dout("start_read %p add_to_page_cache failed %p\n", + inode, page); + nr_pages = i; + if (nr_pages > 0) { + len = nr_pages << PAGE_SHIFT; + osd_req_op_extent_update(req, 0, len); + break; + } + goto out_pages; + } + pages[i] = page; + } + osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, false); + req->r_callback = finish_read; + req->r_inode = inode; + + dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len); + ret = ceph_osdc_start_request(osdc, req, false); + if (ret < 0) + goto out_pages; + ceph_osdc_put_request(req); + + /* After adding locked pages to page cache, the inode holds cache cap. + * So we can drop our cap refs. */ + if (got) + ceph_put_cap_refs(ci, got); + + return nr_pages; + +out_pages: + for (i = 0; i < nr_pages; ++i) { + ceph_fscache_readpage_cancel(inode, pages[i]); + unlock_page(pages[i]); + } + ceph_put_page_vector(pages, nr_pages, false); +out_put: + ceph_osdc_put_request(req); +out: + if (got) + ceph_put_cap_refs(ci, got); + return ret; +} + + +/* + * Read multiple pages. Leave pages we don't read + unlock in page_list; + * the caller (VM) cleans them up. + */ +static int ceph_readpages(struct file *file, struct address_space *mapping, + struct list_head *page_list, unsigned nr_pages) +{ + struct inode *inode = file_inode(file); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_file_info *fi = file->private_data; + struct ceph_rw_context *rw_ctx; + int rc = 0; + int max = 0; + + if (ceph_inode(inode)->i_inline_version != CEPH_INLINE_NONE) + return -EINVAL; + + rc = ceph_readpages_from_fscache(mapping->host, mapping, page_list, + &nr_pages); + + if (rc == 0) + goto out; + + rw_ctx = ceph_find_rw_context(fi); + max = fsc->mount_options->rsize >> PAGE_SHIFT; + dout("readpages %p file %p ctx %p nr_pages %d max %d\n", + inode, file, rw_ctx, nr_pages, max); + while (!list_empty(page_list)) { + rc = start_read(inode, rw_ctx, page_list, max); + if (rc < 0) + goto out; + } +out: + ceph_fscache_readpages_cancel(inode, page_list); + + dout("readpages %p file %p ret %d\n", inode, file, rc); + return rc; +} + +struct ceph_writeback_ctl +{ + loff_t i_size; + u64 truncate_size; + u32 truncate_seq; + bool size_stable; + bool head_snapc; +}; + +/* + * Get ref for the oldest snapc for an inode with dirty data... that is, the + * only snap context we are allowed to write back. + */ +static struct ceph_snap_context * +get_oldest_context(struct inode *inode, struct ceph_writeback_ctl *ctl, + struct ceph_snap_context *page_snapc) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_snap_context *snapc = NULL; + struct ceph_cap_snap *capsnap = NULL; + + spin_lock(&ci->i_ceph_lock); + list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { + dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap, + capsnap->context, capsnap->dirty_pages); + if (!capsnap->dirty_pages) + continue; + + /* get i_size, truncate_{seq,size} for page_snapc? */ + if (snapc && capsnap->context != page_snapc) + continue; + + if (ctl) { + if (capsnap->writing) { + ctl->i_size = i_size_read(inode); + ctl->size_stable = false; + } else { + ctl->i_size = capsnap->size; + ctl->size_stable = true; + } + ctl->truncate_size = capsnap->truncate_size; + ctl->truncate_seq = capsnap->truncate_seq; + ctl->head_snapc = false; + } + + if (snapc) + break; + + snapc = ceph_get_snap_context(capsnap->context); + if (!page_snapc || + page_snapc == snapc || + page_snapc->seq > snapc->seq) + break; + } + if (!snapc && ci->i_wrbuffer_ref_head) { + snapc = ceph_get_snap_context(ci->i_head_snapc); + dout(" head snapc %p has %d dirty pages\n", + snapc, ci->i_wrbuffer_ref_head); + if (ctl) { + ctl->i_size = i_size_read(inode); + ctl->truncate_size = ci->i_truncate_size; + ctl->truncate_seq = ci->i_truncate_seq; + ctl->size_stable = false; + ctl->head_snapc = true; + } + } + spin_unlock(&ci->i_ceph_lock); + return snapc; +} + +static u64 get_writepages_data_length(struct inode *inode, + struct page *page, u64 start) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_snap_context *snapc = page_snap_context(page); + struct ceph_cap_snap *capsnap = NULL; + u64 end = i_size_read(inode); + + if (snapc != ci->i_head_snapc) { + bool found = false; + spin_lock(&ci->i_ceph_lock); + list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { + if (capsnap->context == snapc) { + if (!capsnap->writing) + end = capsnap->size; + found = true; + break; + } + } + spin_unlock(&ci->i_ceph_lock); + WARN_ON(!found); + } + if (end > page_offset(page) + PAGE_SIZE) + end = page_offset(page) + PAGE_SIZE; + return end > start ? end - start : 0; +} + +/* + * Write a single page, but leave the page locked. + * + * If we get a write error, mark the mapping for error, but still adjust the + * dirty page accounting (i.e., page is no longer dirty). + */ +static int writepage_nounlock(struct page *page, struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_snap_context *snapc, *oldest; + loff_t page_off = page_offset(page); + int err; + loff_t len = PAGE_SIZE; + struct ceph_writeback_ctl ceph_wbc; + struct ceph_osd_client *osdc = &fsc->client->osdc; + struct ceph_osd_request *req; + + dout("writepage %p idx %lu\n", page, page->index); + + /* verify this is a writeable snap context */ + snapc = page_snap_context(page); + if (!snapc) { + dout("writepage %p page %p not dirty?\n", inode, page); + return 0; + } + oldest = get_oldest_context(inode, &ceph_wbc, snapc); + if (snapc->seq > oldest->seq) { + dout("writepage %p page %p snapc %p not writeable - noop\n", + inode, page, snapc); + /* we should only noop if called by kswapd */ + WARN_ON(!(current->flags & PF_MEMALLOC)); + ceph_put_snap_context(oldest); + redirty_page_for_writepage(wbc, page); + return 0; + } + ceph_put_snap_context(oldest); + + /* is this a partial page at end of file? */ + if (page_off >= ceph_wbc.i_size) { + dout("%p page eof %llu\n", page, ceph_wbc.i_size); + page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE); + return 0; + } + + if (ceph_wbc.i_size < page_off + len) + len = ceph_wbc.i_size - page_off; + + dout("writepage %p page %p index %lu on %llu~%llu snapc %p seq %lld\n", + inode, page, page->index, page_off, len, snapc, snapc->seq); + + if (atomic_long_inc_return(&fsc->writeback_count) > + CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb)) + set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); + + set_page_writeback(page); + req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode), page_off, &len, 0, 1, + CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, snapc, + ceph_wbc.truncate_seq, ceph_wbc.truncate_size, + true); + if (IS_ERR(req)) { + redirty_page_for_writepage(wbc, page); + end_page_writeback(page); + return PTR_ERR(req); + } + + /* it may be a short write due to an object boundary */ + WARN_ON_ONCE(len > PAGE_SIZE); + osd_req_op_extent_osd_data_pages(req, 0, &page, len, 0, false, false); + dout("writepage %llu~%llu (%llu bytes)\n", page_off, len, len); + + req->r_mtime = inode->i_mtime; + err = ceph_osdc_start_request(osdc, req, true); + if (!err) + err = ceph_osdc_wait_request(osdc, req); + + ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency, + req->r_end_latency, err); + + ceph_osdc_put_request(req); + if (err == 0) + err = len; + + if (err < 0) { + struct writeback_control tmp_wbc; + if (!wbc) + wbc = &tmp_wbc; + if (err == -ERESTARTSYS) { + /* killed by SIGKILL */ + dout("writepage interrupted page %p\n", page); + redirty_page_for_writepage(wbc, page); + end_page_writeback(page); + return err; + } + if (err == -EBLOCKLISTED) + fsc->blocklisted = true; + dout("writepage setting page/mapping error %d %p\n", + err, page); + mapping_set_error(&inode->i_data, err); + wbc->pages_skipped++; + } else { + dout("writepage cleaned page %p\n", page); + err = 0; /* vfs expects us to return 0 */ + } + page->private = 0; + ClearPagePrivate(page); + end_page_writeback(page); + ceph_put_wrbuffer_cap_refs(ci, 1, snapc); + ceph_put_snap_context(snapc); /* page's reference */ + + if (atomic_long_dec_return(&fsc->writeback_count) < + CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb)) + clear_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); + + return err; +} + +static int ceph_writepage(struct page *page, struct writeback_control *wbc) +{ + int err; + struct inode *inode = page->mapping->host; + BUG_ON(!inode); + ihold(inode); + err = writepage_nounlock(page, wbc); + if (err == -ERESTARTSYS) { + /* direct memory reclaimer was killed by SIGKILL. return 0 + * to prevent caller from setting mapping/page error */ + err = 0; + } + unlock_page(page); + iput(inode); + return err; +} + +/* + * async writeback completion handler. + * + * If we get an error, set the mapping error bit, but not the individual + * page error bits. + */ +static void writepages_finish(struct ceph_osd_request *req) +{ + struct inode *inode = req->r_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_osd_data *osd_data; + struct page *page; + int num_pages, total_pages = 0; + int i, j; + int rc = req->r_result; + struct ceph_snap_context *snapc = req->r_snapc; + struct address_space *mapping = inode->i_mapping; + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + bool remove_page; + + dout("writepages_finish %p rc %d\n", inode, rc); + if (rc < 0) { + mapping_set_error(mapping, rc); + ceph_set_error_write(ci); + if (rc == -EBLOCKLISTED) + fsc->blocklisted = true; + } else { + ceph_clear_error_write(ci); + } + + ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency, + req->r_end_latency, rc); + + /* + * We lost the cache cap, need to truncate the page before + * it is unlocked, otherwise we'd truncate it later in the + * page truncation thread, possibly losing some data that + * raced its way in + */ + remove_page = !(ceph_caps_issued(ci) & + (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)); + + /* clean all pages */ + for (i = 0; i < req->r_num_ops; i++) { + if (req->r_ops[i].op != CEPH_OSD_OP_WRITE) + break; + + osd_data = osd_req_op_extent_osd_data(req, i); + BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES); + num_pages = calc_pages_for((u64)osd_data->alignment, + (u64)osd_data->length); + total_pages += num_pages; + for (j = 0; j < num_pages; j++) { + page = osd_data->pages[j]; + BUG_ON(!page); + WARN_ON(!PageUptodate(page)); + + if (atomic_long_dec_return(&fsc->writeback_count) < + CONGESTION_OFF_THRESH( + fsc->mount_options->congestion_kb)) + clear_bdi_congested(inode_to_bdi(inode), + BLK_RW_ASYNC); + + ceph_put_snap_context(page_snap_context(page)); + page->private = 0; + ClearPagePrivate(page); + dout("unlocking %p\n", page); + end_page_writeback(page); + + if (remove_page) + generic_error_remove_page(inode->i_mapping, + page); + + unlock_page(page); + } + dout("writepages_finish %p wrote %llu bytes cleaned %d pages\n", + inode, osd_data->length, rc >= 0 ? num_pages : 0); + + release_pages(osd_data->pages, num_pages); + } + + ceph_put_wrbuffer_cap_refs(ci, total_pages, snapc); + + osd_data = osd_req_op_extent_osd_data(req, 0); + if (osd_data->pages_from_pool) + mempool_free(osd_data->pages, ceph_wb_pagevec_pool); + else + kfree(osd_data->pages); + ceph_osdc_put_request(req); +} + +/* + * initiate async writeback + */ +static int ceph_writepages_start(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct inode *inode = mapping->host; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_vino vino = ceph_vino(inode); + pgoff_t index, start_index, end = -1; + struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc; + struct pagevec pvec; + int rc = 0; + unsigned int wsize = i_blocksize(inode); + struct ceph_osd_request *req = NULL; + struct ceph_writeback_ctl ceph_wbc; + bool should_loop, range_whole = false; + bool done = false; + + dout("writepages_start %p (mode=%s)\n", inode, + wbc->sync_mode == WB_SYNC_NONE ? "NONE" : + (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); + + if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { + if (ci->i_wrbuffer_ref > 0) { + pr_warn_ratelimited( + "writepage_start %p %lld forced umount\n", + inode, ceph_ino(inode)); + } + mapping_set_error(mapping, -EIO); + return -EIO; /* we're in a forced umount, don't write! */ + } + if (fsc->mount_options->wsize < wsize) + wsize = fsc->mount_options->wsize; + + pagevec_init(&pvec); + + start_index = wbc->range_cyclic ? mapping->writeback_index : 0; + index = start_index; + +retry: + /* find oldest snap context with dirty data */ + snapc = get_oldest_context(inode, &ceph_wbc, NULL); + if (!snapc) { + /* hmm, why does writepages get called when there + is no dirty data? */ + dout(" no snap context with dirty data?\n"); + goto out; + } + dout(" oldest snapc is %p seq %lld (%d snaps)\n", + snapc, snapc->seq, snapc->num_snaps); + + should_loop = false; + if (ceph_wbc.head_snapc && snapc != last_snapc) { + /* where to start/end? */ + if (wbc->range_cyclic) { + index = start_index; + end = -1; + if (index > 0) + should_loop = true; + dout(" cyclic, start at %lu\n", index); + } else { + index = wbc->range_start >> PAGE_SHIFT; + end = wbc->range_end >> PAGE_SHIFT; + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = true; + dout(" not cyclic, %lu to %lu\n", index, end); + } + } else if (!ceph_wbc.head_snapc) { + /* Do not respect wbc->range_{start,end}. Dirty pages + * in that range can be associated with newer snapc. + * They are not writeable until we write all dirty pages + * associated with 'snapc' get written */ + if (index > 0) + should_loop = true; + dout(" non-head snapc, range whole\n"); + } + + ceph_put_snap_context(last_snapc); + last_snapc = snapc; + + while (!done && index <= end) { + int num_ops = 0, op_idx; + unsigned i, pvec_pages, max_pages, locked_pages = 0; + struct page **pages = NULL, **data_pages; + struct page *page; + pgoff_t strip_unit_end = 0; + u64 offset = 0, len = 0; + bool from_pool = false; + + max_pages = wsize >> PAGE_SHIFT; + +get_more_pages: + pvec_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, + end, PAGECACHE_TAG_DIRTY); + dout("pagevec_lookup_range_tag got %d\n", pvec_pages); + if (!pvec_pages && !locked_pages) + break; + for (i = 0; i < pvec_pages && locked_pages < max_pages; i++) { + page = pvec.pages[i]; + dout("? %p idx %lu\n", page, page->index); + if (locked_pages == 0) + lock_page(page); /* first page */ + else if (!trylock_page(page)) + break; + + /* only dirty pages, or our accounting breaks */ + if (unlikely(!PageDirty(page)) || + unlikely(page->mapping != mapping)) { + dout("!dirty or !mapping %p\n", page); + unlock_page(page); + continue; + } + /* only if matching snap context */ + pgsnapc = page_snap_context(page); + if (pgsnapc != snapc) { + dout("page snapc %p %lld != oldest %p %lld\n", + pgsnapc, pgsnapc->seq, snapc, snapc->seq); + if (!should_loop && + !ceph_wbc.head_snapc && + wbc->sync_mode != WB_SYNC_NONE) + should_loop = true; + unlock_page(page); + continue; + } + if (page_offset(page) >= ceph_wbc.i_size) { + dout("%p page eof %llu\n", + page, ceph_wbc.i_size); + if ((ceph_wbc.size_stable || + page_offset(page) >= i_size_read(inode)) && + clear_page_dirty_for_io(page)) + mapping->a_ops->invalidatepage(page, + 0, PAGE_SIZE); + unlock_page(page); + continue; + } + if (strip_unit_end && (page->index > strip_unit_end)) { + dout("end of strip unit %p\n", page); + unlock_page(page); + break; + } + if (PageWriteback(page)) { + if (wbc->sync_mode == WB_SYNC_NONE) { + dout("%p under writeback\n", page); + unlock_page(page); + continue; + } + dout("waiting on writeback %p\n", page); + wait_on_page_writeback(page); + } + + if (!clear_page_dirty_for_io(page)) { + dout("%p !clear_page_dirty_for_io\n", page); + unlock_page(page); + continue; + } + + /* + * We have something to write. If this is + * the first locked page this time through, + * calculate max possinle write size and + * allocate a page array + */ + if (locked_pages == 0) { + u64 objnum; + u64 objoff; + u32 xlen; + + /* prepare async write request */ + offset = (u64)page_offset(page); + ceph_calc_file_object_mapping(&ci->i_layout, + offset, wsize, + &objnum, &objoff, + &xlen); + len = xlen; + + num_ops = 1; + strip_unit_end = page->index + + ((len - 1) >> PAGE_SHIFT); + + BUG_ON(pages); + max_pages = calc_pages_for(0, (u64)len); + pages = kmalloc_array(max_pages, + sizeof(*pages), + GFP_NOFS); + if (!pages) { + from_pool = true; + pages = mempool_alloc(ceph_wb_pagevec_pool, GFP_NOFS); + BUG_ON(!pages); + } + + len = 0; + } else if (page->index != + (offset + len) >> PAGE_SHIFT) { + if (num_ops >= (from_pool ? CEPH_OSD_SLAB_OPS : + CEPH_OSD_MAX_OPS)) { + redirty_page_for_writepage(wbc, page); + unlock_page(page); + break; + } + + num_ops++; + offset = (u64)page_offset(page); + len = 0; + } + + /* note position of first page in pvec */ + dout("%p will write page %p idx %lu\n", + inode, page, page->index); + + if (atomic_long_inc_return(&fsc->writeback_count) > + CONGESTION_ON_THRESH( + fsc->mount_options->congestion_kb)) { + set_bdi_congested(inode_to_bdi(inode), + BLK_RW_ASYNC); + } + + + pages[locked_pages++] = page; + pvec.pages[i] = NULL; + + len += PAGE_SIZE; + } + + /* did we get anything? */ + if (!locked_pages) + goto release_pvec_pages; + if (i) { + unsigned j, n = 0; + /* shift unused page to beginning of pvec */ + for (j = 0; j < pvec_pages; j++) { + if (!pvec.pages[j]) + continue; + if (n < j) + pvec.pages[n] = pvec.pages[j]; + n++; + } + pvec.nr = n; + + if (pvec_pages && i == pvec_pages && + locked_pages < max_pages) { + dout("reached end pvec, trying for more\n"); + pagevec_release(&pvec); + goto get_more_pages; + } + } + +new_request: + offset = page_offset(pages[0]); + len = wsize; + + req = ceph_osdc_new_request(&fsc->client->osdc, + &ci->i_layout, vino, + offset, &len, 0, num_ops, + CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, + snapc, ceph_wbc.truncate_seq, + ceph_wbc.truncate_size, false); + if (IS_ERR(req)) { + req = ceph_osdc_new_request(&fsc->client->osdc, + &ci->i_layout, vino, + offset, &len, 0, + min(num_ops, + CEPH_OSD_SLAB_OPS), + CEPH_OSD_OP_WRITE, + CEPH_OSD_FLAG_WRITE, + snapc, ceph_wbc.truncate_seq, + ceph_wbc.truncate_size, true); + BUG_ON(IS_ERR(req)); + } + BUG_ON(len < page_offset(pages[locked_pages - 1]) + + PAGE_SIZE - offset); + + req->r_callback = writepages_finish; + req->r_inode = inode; + + /* Format the osd request message and submit the write */ + len = 0; + data_pages = pages; + op_idx = 0; + for (i = 0; i < locked_pages; i++) { + u64 cur_offset = page_offset(pages[i]); + if (offset + len != cur_offset) { + if (op_idx + 1 == req->r_num_ops) + break; + osd_req_op_extent_dup_last(req, op_idx, + cur_offset - offset); + dout("writepages got pages at %llu~%llu\n", + offset, len); + osd_req_op_extent_osd_data_pages(req, op_idx, + data_pages, len, 0, + from_pool, false); + osd_req_op_extent_update(req, op_idx, len); + + len = 0; + offset = cur_offset; + data_pages = pages + i; + op_idx++; + } + + set_page_writeback(pages[i]); + len += PAGE_SIZE; + } + + if (ceph_wbc.size_stable) { + len = min(len, ceph_wbc.i_size - offset); + } else if (i == locked_pages) { + /* writepages_finish() clears writeback pages + * according to the data length, so make sure + * data length covers all locked pages */ + u64 min_len = len + 1 - PAGE_SIZE; + len = get_writepages_data_length(inode, pages[i - 1], + offset); + len = max(len, min_len); + } + dout("writepages got pages at %llu~%llu\n", offset, len); + + osd_req_op_extent_osd_data_pages(req, op_idx, data_pages, len, + 0, from_pool, false); + osd_req_op_extent_update(req, op_idx, len); + + BUG_ON(op_idx + 1 != req->r_num_ops); + + from_pool = false; + if (i < locked_pages) { + BUG_ON(num_ops <= req->r_num_ops); + num_ops -= req->r_num_ops; + locked_pages -= i; + + /* allocate new pages array for next request */ + data_pages = pages; + pages = kmalloc_array(locked_pages, sizeof(*pages), + GFP_NOFS); + if (!pages) { + from_pool = true; + pages = mempool_alloc(ceph_wb_pagevec_pool, GFP_NOFS); + BUG_ON(!pages); + } + memcpy(pages, data_pages + i, + locked_pages * sizeof(*pages)); + memset(data_pages + i, 0, + locked_pages * sizeof(*pages)); + } else { + BUG_ON(num_ops != req->r_num_ops); + index = pages[i - 1]->index + 1; + /* request message now owns the pages array */ + pages = NULL; + } + + req->r_mtime = inode->i_mtime; + rc = ceph_osdc_start_request(&fsc->client->osdc, req, true); + BUG_ON(rc); + req = NULL; + + wbc->nr_to_write -= i; + if (pages) + goto new_request; + + /* + * We stop writing back only if we are not doing + * integrity sync. In case of integrity sync we have to + * keep going until we have written all the pages + * we tagged for writeback prior to entering this loop. + */ + if (wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE) + done = true; + +release_pvec_pages: + dout("pagevec_release on %d pages (%p)\n", (int)pvec.nr, + pvec.nr ? pvec.pages[0] : NULL); + pagevec_release(&pvec); + } + + if (should_loop && !done) { + /* more to do; loop back to beginning of file */ + dout("writepages looping back to beginning of file\n"); + end = start_index - 1; /* OK even when start_index == 0 */ + + /* to write dirty pages associated with next snapc, + * we need to wait until current writes complete */ + if (wbc->sync_mode != WB_SYNC_NONE && + start_index == 0 && /* all dirty pages were checked */ + !ceph_wbc.head_snapc) { + struct page *page; + unsigned i, nr; + index = 0; + while ((index <= end) && + (nr = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_WRITEBACK))) { + for (i = 0; i < nr; i++) { + page = pvec.pages[i]; + if (page_snap_context(page) != snapc) + continue; + wait_on_page_writeback(page); + } + pagevec_release(&pvec); + cond_resched(); + } + } + + start_index = 0; + index = 0; + goto retry; + } + + if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) + mapping->writeback_index = index; + +out: + ceph_osdc_put_request(req); + ceph_put_snap_context(last_snapc); + dout("writepages dend - startone, rc = %d\n", rc); + return rc; +} + + + +/* + * See if a given @snapc is either writeable, or already written. + */ +static int context_is_writeable_or_written(struct inode *inode, + struct ceph_snap_context *snapc) +{ + struct ceph_snap_context *oldest = get_oldest_context(inode, NULL, NULL); + int ret = !oldest || snapc->seq <= oldest->seq; + + ceph_put_snap_context(oldest); + return ret; +} + +/** + * ceph_find_incompatible - find an incompatible context and return it + * @page: page being dirtied + * + * We are only allowed to write into/dirty a page if the page is + * clean, or already dirty within the same snap context. Returns a + * conflicting context if there is one, NULL if there isn't, or a + * negative error code on other errors. + * + * Must be called with page lock held. + */ +static struct ceph_snap_context * +ceph_find_incompatible(struct page *page) +{ + struct inode *inode = page->mapping->host; + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_inode_info *ci = ceph_inode(inode); + + if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { + dout(" page %p forced umount\n", page); + return ERR_PTR(-EIO); + } + + for (;;) { + struct ceph_snap_context *snapc, *oldest; + + wait_on_page_writeback(page); + + snapc = page_snap_context(page); + if (!snapc || snapc == ci->i_head_snapc) + break; + + /* + * this page is already dirty in another (older) snap + * context! is it writeable now? + */ + oldest = get_oldest_context(inode, NULL, NULL); + if (snapc->seq > oldest->seq) { + /* not writeable -- return it for the caller to deal with */ + ceph_put_snap_context(oldest); + dout(" page %p snapc %p not current or oldest\n", page, snapc); + return ceph_get_snap_context(snapc); + } + ceph_put_snap_context(oldest); + + /* yay, writeable, do it now (without dropping page lock) */ + dout(" page %p snapc %p not current, but oldest\n", page, snapc); + if (clear_page_dirty_for_io(page)) { + int r = writepage_nounlock(page, NULL); + if (r < 0) + return ERR_PTR(r); + } + } + return NULL; +} + +/** + * prep_noread_page - prep a page for writing without reading first + * @page: page being prepared + * @pos: starting position for the write + * @len: length of write + * + * In some cases, write_begin doesn't need to read at all: + * - full page write + * - file is currently zero-length + * - write that lies in a page that is completely beyond EOF + * - write that covers the the page from start to EOF or beyond it + * + * If any of these criteria are met, then zero out the unwritten parts + * of the page and return true. Otherwise, return false. + */ +static bool skip_page_read(struct page *page, loff_t pos, size_t len) +{ + struct inode *inode = page->mapping->host; + loff_t i_size = i_size_read(inode); + size_t offset = offset_in_page(pos); + + /* Full page write */ + if (offset == 0 && len >= PAGE_SIZE) + return true; + + /* pos beyond last page in the file */ + if (pos - offset >= i_size) + goto zero_out; + + /* write that covers the whole page from start to EOF or beyond it */ + if (offset == 0 && (pos + len) >= i_size) + goto zero_out; + + return false; +zero_out: + zero_user_segments(page, 0, offset, offset + len, PAGE_SIZE); + return true; +} + +/* + * We are only allowed to write into/dirty the page if the page is + * clean, or already dirty within the same snap context. + */ +static int ceph_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + struct inode *inode = file_inode(file); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_snap_context *snapc; + struct page *page = NULL; + pgoff_t index = pos >> PAGE_SHIFT; + int r = 0; + + dout("write_begin file %p inode %p page %p %d~%d\n", file, inode, page, (int)pos, (int)len); + + for (;;) { + page = grab_cache_page_write_begin(mapping, index, 0); + if (!page) { + r = -ENOMEM; + break; + } + + snapc = ceph_find_incompatible(page); + if (snapc) { + if (IS_ERR(snapc)) { + r = PTR_ERR(snapc); + break; + } + unlock_page(page); + put_page(page); + page = NULL; + ceph_queue_writeback(inode); + r = wait_event_killable(ci->i_cap_wq, + context_is_writeable_or_written(inode, snapc)); + ceph_put_snap_context(snapc); + if (r != 0) + break; + continue; + } + + if (PageUptodate(page)) { + dout(" page %p already uptodate\n", page); + break; + } + + /* No need to read in some cases */ + if (skip_page_read(page, pos, len)) + break; + + /* + * We need to read it. If we get back -EINPROGRESS, then the page was + * handed off to fscache and it will be unlocked when the read completes. + * Refind the page in that case so we can reacquire the page lock. Otherwise + * we got a hard error or the read was completed synchronously. + */ + r = ceph_do_readpage(file, page); + if (r != -EINPROGRESS) + break; + } + + if (r < 0) { + if (page) { + unlock_page(page); + put_page(page); + } + } else { + *pagep = page; + } + return r; +} + +/* + * we don't do anything in here that simple_write_end doesn't do + * except adjust dirty page accounting + */ +static int ceph_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = file_inode(file); + bool check_cap = false; + + dout("write_end file %p inode %p page %p %d~%d (%d)\n", file, + inode, page, (int)pos, (int)copied, (int)len); + + /* zero the stale part of the page if we did a short copy */ + if (!PageUptodate(page)) { + if (copied < len) { + copied = 0; + goto out; + } + SetPageUptodate(page); + } + + /* did file size increase? */ + if (pos+copied > i_size_read(inode)) + check_cap = ceph_inode_set_size(inode, pos+copied); + + set_page_dirty(page); + +out: + unlock_page(page); + put_page(page); + + if (check_cap) + ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL); + + return copied; +} + +/* + * we set .direct_IO to indicate direct io is supported, but since we + * intercept O_DIRECT reads and writes early, this function should + * never get called. + */ +static ssize_t ceph_direct_io(struct kiocb *iocb, struct iov_iter *iter) +{ + WARN_ON(1); + return -EINVAL; +} + +const struct address_space_operations ceph_aops = { + .readpage = ceph_readpage, + .readpages = ceph_readpages, + .writepage = ceph_writepage, + .writepages = ceph_writepages_start, + .write_begin = ceph_write_begin, + .write_end = ceph_write_end, + .set_page_dirty = ceph_set_page_dirty, + .invalidatepage = ceph_invalidatepage, + .releasepage = ceph_releasepage, + .direct_IO = ceph_direct_io, +}; + +static void ceph_block_sigs(sigset_t *oldset) +{ + sigset_t mask; + siginitsetinv(&mask, sigmask(SIGKILL)); + sigprocmask(SIG_BLOCK, &mask, oldset); +} + +static void ceph_restore_sigs(sigset_t *oldset) +{ + sigprocmask(SIG_SETMASK, oldset, NULL); +} + +/* + * vm ops + */ +static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + struct inode *inode = file_inode(vma->vm_file); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_file_info *fi = vma->vm_file->private_data; + struct page *pinned_page = NULL; + loff_t off = (loff_t)vmf->pgoff << PAGE_SHIFT; + int want, got, err; + sigset_t oldset; + vm_fault_t ret = VM_FAULT_SIGBUS; + + ceph_block_sigs(&oldset); + + dout("filemap_fault %p %llx.%llx %llu~%zd trying to get caps\n", + inode, ceph_vinop(inode), off, (size_t)PAGE_SIZE); + if (fi->fmode & CEPH_FILE_MODE_LAZY) + want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; + else + want = CEPH_CAP_FILE_CACHE; + + got = 0; + err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_RD, want, -1, + &got, &pinned_page); + if (err < 0) + goto out_restore; + + dout("filemap_fault %p %llu~%zd got cap refs on %s\n", + inode, off, (size_t)PAGE_SIZE, ceph_cap_string(got)); + + if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) || + ci->i_inline_version == CEPH_INLINE_NONE) { + CEPH_DEFINE_RW_CONTEXT(rw_ctx, got); + ceph_add_rw_context(fi, &rw_ctx); + ret = filemap_fault(vmf); + ceph_del_rw_context(fi, &rw_ctx); + dout("filemap_fault %p %llu~%zd drop cap refs %s ret %x\n", + inode, off, (size_t)PAGE_SIZE, + ceph_cap_string(got), ret); + } else + err = -EAGAIN; + + if (pinned_page) + put_page(pinned_page); + ceph_put_cap_refs(ci, got); + + if (err != -EAGAIN) + goto out_restore; + + /* read inline data */ + if (off >= PAGE_SIZE) { + /* does not support inline data > PAGE_SIZE */ + ret = VM_FAULT_SIGBUS; + } else { + struct address_space *mapping = inode->i_mapping; + struct page *page = find_or_create_page(mapping, 0, + mapping_gfp_constraint(mapping, + ~__GFP_FS)); + if (!page) { + ret = VM_FAULT_OOM; + goto out_inline; + } + err = __ceph_do_getattr(inode, page, + CEPH_STAT_CAP_INLINE_DATA, true); + if (err < 0 || off >= i_size_read(inode)) { + unlock_page(page); + put_page(page); + ret = vmf_error(err); + goto out_inline; + } + if (err < PAGE_SIZE) + zero_user_segment(page, err, PAGE_SIZE); + else + flush_dcache_page(page); + SetPageUptodate(page); + vmf->page = page; + ret = VM_FAULT_MAJOR | VM_FAULT_LOCKED; +out_inline: + dout("filemap_fault %p %llu~%zd read inline data ret %x\n", + inode, off, (size_t)PAGE_SIZE, ret); + } +out_restore: + ceph_restore_sigs(&oldset); + if (err < 0) + ret = vmf_error(err); + + return ret; +} + +/* + * Reuse write_begin here for simplicity. + */ +static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + struct inode *inode = file_inode(vma->vm_file); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_file_info *fi = vma->vm_file->private_data; + struct ceph_cap_flush *prealloc_cf; + struct page *page = vmf->page; + loff_t off = page_offset(page); + loff_t size = i_size_read(inode); + size_t len; + int want, got, err; + sigset_t oldset; + vm_fault_t ret = VM_FAULT_SIGBUS; + + prealloc_cf = ceph_alloc_cap_flush(); + if (!prealloc_cf) + return VM_FAULT_OOM; + + sb_start_pagefault(inode->i_sb); + ceph_block_sigs(&oldset); + + if (ci->i_inline_version != CEPH_INLINE_NONE) { + struct page *locked_page = NULL; + if (off == 0) { + lock_page(page); + locked_page = page; + } + err = ceph_uninline_data(vma->vm_file, locked_page); + if (locked_page) + unlock_page(locked_page); + if (err < 0) + goto out_free; + } + + if (off + PAGE_SIZE <= size) + len = PAGE_SIZE; + else + len = size & ~PAGE_MASK; + + dout("page_mkwrite %p %llx.%llx %llu~%zd getting caps i_size %llu\n", + inode, ceph_vinop(inode), off, len, size); + if (fi->fmode & CEPH_FILE_MODE_LAZY) + want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; + else + want = CEPH_CAP_FILE_BUFFER; + + got = 0; + err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_WR, want, off + len, + &got, NULL); + if (err < 0) + goto out_free; + + dout("page_mkwrite %p %llu~%zd got cap refs on %s\n", + inode, off, len, ceph_cap_string(got)); + + /* Update time before taking page lock */ + file_update_time(vma->vm_file); + inode_inc_iversion_raw(inode); + + do { + struct ceph_snap_context *snapc; + + lock_page(page); + + if (page_mkwrite_check_truncate(page, inode) < 0) { + unlock_page(page); + ret = VM_FAULT_NOPAGE; + break; + } + + snapc = ceph_find_incompatible(page); + if (!snapc) { + /* success. we'll keep the page locked. */ + set_page_dirty(page); + ret = VM_FAULT_LOCKED; + break; + } + + unlock_page(page); + + if (IS_ERR(snapc)) { + ret = VM_FAULT_SIGBUS; + break; + } + + ceph_queue_writeback(inode); + err = wait_event_killable(ci->i_cap_wq, + context_is_writeable_or_written(inode, snapc)); + ceph_put_snap_context(snapc); + } while (err == 0); + + if (ret == VM_FAULT_LOCKED || + ci->i_inline_version != CEPH_INLINE_NONE) { + int dirty; + spin_lock(&ci->i_ceph_lock); + ci->i_inline_version = CEPH_INLINE_NONE; + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, + &prealloc_cf); + spin_unlock(&ci->i_ceph_lock); + if (dirty) + __mark_inode_dirty(inode, dirty); + } + + dout("page_mkwrite %p %llu~%zd dropping cap refs on %s ret %x\n", + inode, off, len, ceph_cap_string(got), ret); + ceph_put_cap_refs(ci, got); +out_free: + ceph_restore_sigs(&oldset); + sb_end_pagefault(inode->i_sb); + ceph_free_cap_flush(prealloc_cf); + if (err < 0) + ret = vmf_error(err); + return ret; +} + +void ceph_fill_inline_data(struct inode *inode, struct page *locked_page, + char *data, size_t len) +{ + struct address_space *mapping = inode->i_mapping; + struct page *page; + + if (locked_page) { + page = locked_page; + } else { + if (i_size_read(inode) == 0) + return; + page = find_or_create_page(mapping, 0, + mapping_gfp_constraint(mapping, + ~__GFP_FS)); + if (!page) + return; + if (PageUptodate(page)) { + unlock_page(page); + put_page(page); + return; + } + } + + dout("fill_inline_data %p %llx.%llx len %zu locked_page %p\n", + inode, ceph_vinop(inode), len, locked_page); + + if (len > 0) { + void *kaddr = kmap_atomic(page); + memcpy(kaddr, data, len); + kunmap_atomic(kaddr); + } + + if (page != locked_page) { + if (len < PAGE_SIZE) + zero_user_segment(page, len, PAGE_SIZE); + else + flush_dcache_page(page); + + SetPageUptodate(page); + unlock_page(page); + put_page(page); + } +} + +int ceph_uninline_data(struct file *filp, struct page *locked_page) +{ + struct inode *inode = file_inode(filp); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_osd_request *req; + struct page *page = NULL; + u64 len, inline_version; + int err = 0; + bool from_pagecache = false; + + spin_lock(&ci->i_ceph_lock); + inline_version = ci->i_inline_version; + spin_unlock(&ci->i_ceph_lock); + + dout("uninline_data %p %llx.%llx inline_version %llu\n", + inode, ceph_vinop(inode), inline_version); + + if (inline_version == 1 || /* initial version, no data */ + inline_version == CEPH_INLINE_NONE) + goto out; + + if (locked_page) { + page = locked_page; + WARN_ON(!PageUptodate(page)); + } else if (ceph_caps_issued(ci) & + (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) { + page = find_get_page(inode->i_mapping, 0); + if (page) { + if (PageUptodate(page)) { + from_pagecache = true; + lock_page(page); + } else { + put_page(page); + page = NULL; + } + } + } + + if (page) { + len = i_size_read(inode); + if (len > PAGE_SIZE) + len = PAGE_SIZE; + } else { + page = __page_cache_alloc(GFP_NOFS); + if (!page) { + err = -ENOMEM; + goto out; + } + err = __ceph_do_getattr(inode, page, + CEPH_STAT_CAP_INLINE_DATA, true); + if (err < 0) { + /* no inline data */ + if (err == -ENODATA) + err = 0; + goto out; + } + len = err; + } + + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, + ceph_vino(inode), 0, &len, 0, 1, + CEPH_OSD_OP_CREATE, CEPH_OSD_FLAG_WRITE, + NULL, 0, 0, false); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto out; + } + + req->r_mtime = inode->i_mtime; + err = ceph_osdc_start_request(&fsc->client->osdc, req, false); + if (!err) + err = ceph_osdc_wait_request(&fsc->client->osdc, req); + ceph_osdc_put_request(req); + if (err < 0) + goto out; + + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, + ceph_vino(inode), 0, &len, 1, 3, + CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, + NULL, ci->i_truncate_seq, + ci->i_truncate_size, false); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto out; + } + + osd_req_op_extent_osd_data_pages(req, 1, &page, len, 0, false, false); + + { + __le64 xattr_buf = cpu_to_le64(inline_version); + err = osd_req_op_xattr_init(req, 0, CEPH_OSD_OP_CMPXATTR, + "inline_version", &xattr_buf, + sizeof(xattr_buf), + CEPH_OSD_CMPXATTR_OP_GT, + CEPH_OSD_CMPXATTR_MODE_U64); + if (err) + goto out_put; + } + + { + char xattr_buf[32]; + int xattr_len = snprintf(xattr_buf, sizeof(xattr_buf), + "%llu", inline_version); + err = osd_req_op_xattr_init(req, 2, CEPH_OSD_OP_SETXATTR, + "inline_version", + xattr_buf, xattr_len, 0, 0); + if (err) + goto out_put; + } + + req->r_mtime = inode->i_mtime; + err = ceph_osdc_start_request(&fsc->client->osdc, req, false); + if (!err) + err = ceph_osdc_wait_request(&fsc->client->osdc, req); + + ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency, + req->r_end_latency, err); + +out_put: + ceph_osdc_put_request(req); + if (err == -ECANCELED) + err = 0; +out: + if (page && page != locked_page) { + if (from_pagecache) { + unlock_page(page); + put_page(page); + } else + __free_pages(page, 0); + } + + dout("uninline_data %p %llx.%llx inline_version %llu = %d\n", + inode, ceph_vinop(inode), inline_version, err); + return err; +} + +static const struct vm_operations_struct ceph_vmops = { + .fault = ceph_filemap_fault, + .page_mkwrite = ceph_page_mkwrite, +}; + +int ceph_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct address_space *mapping = file->f_mapping; + + if (!mapping->a_ops->readpage) + return -ENOEXEC; + file_accessed(file); + vma->vm_ops = &ceph_vmops; + return 0; +} + +enum { + POOL_READ = 1, + POOL_WRITE = 2, +}; + +static int __ceph_pool_perm_get(struct ceph_inode_info *ci, + s64 pool, struct ceph_string *pool_ns) +{ + struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode); + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_osd_request *rd_req = NULL, *wr_req = NULL; + struct rb_node **p, *parent; + struct ceph_pool_perm *perm; + struct page **pages; + size_t pool_ns_len; + int err = 0, err2 = 0, have = 0; + + down_read(&mdsc->pool_perm_rwsem); + p = &mdsc->pool_perm_tree.rb_node; + while (*p) { + perm = rb_entry(*p, struct ceph_pool_perm, node); + if (pool < perm->pool) + p = &(*p)->rb_left; + else if (pool > perm->pool) + p = &(*p)->rb_right; + else { + int ret = ceph_compare_string(pool_ns, + perm->pool_ns, + perm->pool_ns_len); + if (ret < 0) + p = &(*p)->rb_left; + else if (ret > 0) + p = &(*p)->rb_right; + else { + have = perm->perm; + break; + } + } + } + up_read(&mdsc->pool_perm_rwsem); + if (*p) + goto out; + + if (pool_ns) + dout("__ceph_pool_perm_get pool %lld ns %.*s no perm cached\n", + pool, (int)pool_ns->len, pool_ns->str); + else + dout("__ceph_pool_perm_get pool %lld no perm cached\n", pool); + + down_write(&mdsc->pool_perm_rwsem); + p = &mdsc->pool_perm_tree.rb_node; + parent = NULL; + while (*p) { + parent = *p; + perm = rb_entry(parent, struct ceph_pool_perm, node); + if (pool < perm->pool) + p = &(*p)->rb_left; + else if (pool > perm->pool) + p = &(*p)->rb_right; + else { + int ret = ceph_compare_string(pool_ns, + perm->pool_ns, + perm->pool_ns_len); + if (ret < 0) + p = &(*p)->rb_left; + else if (ret > 0) + p = &(*p)->rb_right; + else { + have = perm->perm; + break; + } + } + } + if (*p) { + up_write(&mdsc->pool_perm_rwsem); + goto out; + } + + rd_req = ceph_osdc_alloc_request(&fsc->client->osdc, NULL, + 1, false, GFP_NOFS); + if (!rd_req) { + err = -ENOMEM; + goto out_unlock; + } + + rd_req->r_flags = CEPH_OSD_FLAG_READ; + osd_req_op_init(rd_req, 0, CEPH_OSD_OP_STAT, 0); + rd_req->r_base_oloc.pool = pool; + if (pool_ns) + rd_req->r_base_oloc.pool_ns = ceph_get_string(pool_ns); + ceph_oid_printf(&rd_req->r_base_oid, "%llx.00000000", ci->i_vino.ino); + + err = ceph_osdc_alloc_messages(rd_req, GFP_NOFS); + if (err) + goto out_unlock; + + wr_req = ceph_osdc_alloc_request(&fsc->client->osdc, NULL, + 1, false, GFP_NOFS); + if (!wr_req) { + err = -ENOMEM; + goto out_unlock; + } + + wr_req->r_flags = CEPH_OSD_FLAG_WRITE; + osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL); + ceph_oloc_copy(&wr_req->r_base_oloc, &rd_req->r_base_oloc); + ceph_oid_copy(&wr_req->r_base_oid, &rd_req->r_base_oid); + + err = ceph_osdc_alloc_messages(wr_req, GFP_NOFS); + if (err) + goto out_unlock; + + /* one page should be large enough for STAT data */ + pages = ceph_alloc_page_vector(1, GFP_KERNEL); + if (IS_ERR(pages)) { + err = PTR_ERR(pages); + goto out_unlock; + } + + osd_req_op_raw_data_in_pages(rd_req, 0, pages, PAGE_SIZE, + 0, false, true); + err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false); + + wr_req->r_mtime = ci->vfs_inode.i_mtime; + err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false); + + if (!err) + err = ceph_osdc_wait_request(&fsc->client->osdc, rd_req); + if (!err2) + err2 = ceph_osdc_wait_request(&fsc->client->osdc, wr_req); + + if (err >= 0 || err == -ENOENT) + have |= POOL_READ; + else if (err != -EPERM) { + if (err == -EBLOCKLISTED) + fsc->blocklisted = true; + goto out_unlock; + } + + if (err2 == 0 || err2 == -EEXIST) + have |= POOL_WRITE; + else if (err2 != -EPERM) { + if (err2 == -EBLOCKLISTED) + fsc->blocklisted = true; + err = err2; + goto out_unlock; + } + + pool_ns_len = pool_ns ? pool_ns->len : 0; + perm = kmalloc(sizeof(*perm) + pool_ns_len + 1, GFP_NOFS); + if (!perm) { + err = -ENOMEM; + goto out_unlock; + } + + perm->pool = pool; + perm->perm = have; + perm->pool_ns_len = pool_ns_len; + if (pool_ns_len > 0) + memcpy(perm->pool_ns, pool_ns->str, pool_ns_len); + perm->pool_ns[pool_ns_len] = 0; + + rb_link_node(&perm->node, parent, p); + rb_insert_color(&perm->node, &mdsc->pool_perm_tree); + err = 0; +out_unlock: + up_write(&mdsc->pool_perm_rwsem); + + ceph_osdc_put_request(rd_req); + ceph_osdc_put_request(wr_req); +out: + if (!err) + err = have; + if (pool_ns) + dout("__ceph_pool_perm_get pool %lld ns %.*s result = %d\n", + pool, (int)pool_ns->len, pool_ns->str, err); + else + dout("__ceph_pool_perm_get pool %lld result = %d\n", pool, err); + return err; +} + +int ceph_pool_perm_check(struct inode *inode, int need) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_string *pool_ns; + s64 pool; + int ret, flags; + + if (ci->i_vino.snap != CEPH_NOSNAP) { + /* + * Pool permission check needs to write to the first object. + * But for snapshot, head of the first object may have alread + * been deleted. Skip check to avoid creating orphan object. + */ + return 0; + } + + if (ceph_test_mount_opt(ceph_inode_to_client(inode), + NOPOOLPERM)) + return 0; + + spin_lock(&ci->i_ceph_lock); + flags = ci->i_ceph_flags; + pool = ci->i_layout.pool_id; + spin_unlock(&ci->i_ceph_lock); +check: + if (flags & CEPH_I_POOL_PERM) { + if ((need & CEPH_CAP_FILE_RD) && !(flags & CEPH_I_POOL_RD)) { + dout("ceph_pool_perm_check pool %lld no read perm\n", + pool); + return -EPERM; + } + if ((need & CEPH_CAP_FILE_WR) && !(flags & CEPH_I_POOL_WR)) { + dout("ceph_pool_perm_check pool %lld no write perm\n", + pool); + return -EPERM; + } + return 0; + } + + pool_ns = ceph_try_get_string(ci->i_layout.pool_ns); + ret = __ceph_pool_perm_get(ci, pool, pool_ns); + ceph_put_string(pool_ns); + if (ret < 0) + return ret; + + flags = CEPH_I_POOL_PERM; + if (ret & POOL_READ) + flags |= CEPH_I_POOL_RD; + if (ret & POOL_WRITE) + flags |= CEPH_I_POOL_WR; + + spin_lock(&ci->i_ceph_lock); + if (pool == ci->i_layout.pool_id && + pool_ns == rcu_dereference_raw(ci->i_layout.pool_ns)) { + ci->i_ceph_flags |= flags; + } else { + pool = ci->i_layout.pool_id; + flags = ci->i_ceph_flags; + } + spin_unlock(&ci->i_ceph_lock); + goto check; +} + +void ceph_pool_perm_destroy(struct ceph_mds_client *mdsc) +{ + struct ceph_pool_perm *perm; + struct rb_node *n; + + while (!RB_EMPTY_ROOT(&mdsc->pool_perm_tree)) { + n = rb_first(&mdsc->pool_perm_tree); + perm = rb_entry(n, struct ceph_pool_perm, node); + rb_erase(n, &mdsc->pool_perm_tree); + kfree(perm); + } +} diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c new file mode 100644 index 000000000..2f5cb6bc7 --- /dev/null +++ b/fs/ceph/cache.c @@ -0,0 +1,352 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Ceph cache definitions. + * + * Copyright (C) 2013 by Adfin Solutions, Inc. All Rights Reserved. + * Written by Milosz Tanski (milosz@adfin.com) + */ + +#include <linux/ceph/ceph_debug.h> + +#include <linux/fs_context.h> +#include "super.h" +#include "cache.h" + +struct ceph_aux_inode { + u64 version; + u64 mtime_sec; + u64 mtime_nsec; +}; + +struct fscache_netfs ceph_cache_netfs = { + .name = "ceph", + .version = 0, +}; + +static DEFINE_MUTEX(ceph_fscache_lock); +static LIST_HEAD(ceph_fscache_list); + +struct ceph_fscache_entry { + struct list_head list; + struct fscache_cookie *fscache; + size_t uniq_len; + /* The following members must be last */ + struct ceph_fsid fsid; + char uniquifier[]; +}; + +static const struct fscache_cookie_def ceph_fscache_fsid_object_def = { + .name = "CEPH.fsid", + .type = FSCACHE_COOKIE_TYPE_INDEX, +}; + +int __init ceph_fscache_register(void) +{ + return fscache_register_netfs(&ceph_cache_netfs); +} + +void ceph_fscache_unregister(void) +{ + fscache_unregister_netfs(&ceph_cache_netfs); +} + +int ceph_fscache_register_fs(struct ceph_fs_client* fsc, struct fs_context *fc) +{ + const struct ceph_fsid *fsid = &fsc->client->fsid; + const char *fscache_uniq = fsc->mount_options->fscache_uniq; + size_t uniq_len = fscache_uniq ? strlen(fscache_uniq) : 0; + struct ceph_fscache_entry *ent; + int err = 0; + + mutex_lock(&ceph_fscache_lock); + list_for_each_entry(ent, &ceph_fscache_list, list) { + if (memcmp(&ent->fsid, fsid, sizeof(*fsid))) + continue; + if (ent->uniq_len != uniq_len) + continue; + if (uniq_len && memcmp(ent->uniquifier, fscache_uniq, uniq_len)) + continue; + + errorfc(fc, "fscache cookie already registered for fsid %pU, use fsc=<uniquifier> option", + fsid); + err = -EBUSY; + goto out_unlock; + } + + ent = kzalloc(sizeof(*ent) + uniq_len, GFP_KERNEL); + if (!ent) { + err = -ENOMEM; + goto out_unlock; + } + + memcpy(&ent->fsid, fsid, sizeof(*fsid)); + if (uniq_len > 0) { + memcpy(&ent->uniquifier, fscache_uniq, uniq_len); + ent->uniq_len = uniq_len; + } + + fsc->fscache = fscache_acquire_cookie(ceph_cache_netfs.primary_index, + &ceph_fscache_fsid_object_def, + &ent->fsid, sizeof(ent->fsid) + uniq_len, + NULL, 0, + fsc, 0, true); + + if (fsc->fscache) { + ent->fscache = fsc->fscache; + list_add_tail(&ent->list, &ceph_fscache_list); + } else { + kfree(ent); + errorfc(fc, "unable to register fscache cookie for fsid %pU", + fsid); + /* all other fs ignore this error */ + } +out_unlock: + mutex_unlock(&ceph_fscache_lock); + return err; +} + +static enum fscache_checkaux ceph_fscache_inode_check_aux( + void *cookie_netfs_data, const void *data, uint16_t dlen, + loff_t object_size) +{ + struct ceph_aux_inode aux; + struct ceph_inode_info* ci = cookie_netfs_data; + struct inode* inode = &ci->vfs_inode; + + if (dlen != sizeof(aux) || + i_size_read(inode) != object_size) + return FSCACHE_CHECKAUX_OBSOLETE; + + memset(&aux, 0, sizeof(aux)); + aux.version = ci->i_version; + aux.mtime_sec = inode->i_mtime.tv_sec; + aux.mtime_nsec = inode->i_mtime.tv_nsec; + + if (memcmp(data, &aux, sizeof(aux)) != 0) + return FSCACHE_CHECKAUX_OBSOLETE; + + dout("ceph inode 0x%p cached okay\n", ci); + return FSCACHE_CHECKAUX_OKAY; +} + +static const struct fscache_cookie_def ceph_fscache_inode_object_def = { + .name = "CEPH.inode", + .type = FSCACHE_COOKIE_TYPE_DATAFILE, + .check_aux = ceph_fscache_inode_check_aux, +}; + +void ceph_fscache_register_inode_cookie(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_aux_inode aux; + + /* No caching for filesystem */ + if (!fsc->fscache) + return; + + /* Only cache for regular files that are read only */ + if (!S_ISREG(inode->i_mode)) + return; + + inode_lock_nested(inode, I_MUTEX_CHILD); + if (!ci->fscache) { + memset(&aux, 0, sizeof(aux)); + aux.version = ci->i_version; + aux.mtime_sec = inode->i_mtime.tv_sec; + aux.mtime_nsec = inode->i_mtime.tv_nsec; + ci->fscache = fscache_acquire_cookie(fsc->fscache, + &ceph_fscache_inode_object_def, + &ci->i_vino, sizeof(ci->i_vino), + &aux, sizeof(aux), + ci, i_size_read(inode), false); + } + inode_unlock(inode); +} + +void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci) +{ + struct fscache_cookie* cookie; + + if ((cookie = ci->fscache) == NULL) + return; + + ci->fscache = NULL; + + fscache_uncache_all_inode_pages(cookie, &ci->vfs_inode); + fscache_relinquish_cookie(cookie, &ci->i_vino, false); +} + +static bool ceph_fscache_can_enable(void *data) +{ + struct inode *inode = data; + return !inode_is_open_for_write(inode); +} + +void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + if (!fscache_cookie_valid(ci->fscache)) + return; + + if (inode_is_open_for_write(inode)) { + dout("fscache_file_set_cookie %p %p disabling cache\n", + inode, filp); + fscache_disable_cookie(ci->fscache, &ci->i_vino, false); + fscache_uncache_all_inode_pages(ci->fscache, inode); + } else { + fscache_enable_cookie(ci->fscache, &ci->i_vino, i_size_read(inode), + ceph_fscache_can_enable, inode); + if (fscache_cookie_enabled(ci->fscache)) { + dout("fscache_file_set_cookie %p %p enabling cache\n", + inode, filp); + } + } +} + +static void ceph_readpage_from_fscache_complete(struct page *page, void *data, int error) +{ + if (!error) + SetPageUptodate(page); + + unlock_page(page); +} + +static inline bool cache_valid(struct ceph_inode_info *ci) +{ + return ci->i_fscache_gen == ci->i_rdcache_gen; +} + + +/* Atempt to read from the fscache, + * + * This function is called from the readpage_nounlock context. DO NOT attempt to + * unlock the page here (or in the callback). + */ +int ceph_readpage_from_fscache(struct inode *inode, struct page *page) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int ret; + + if (!cache_valid(ci)) + return -ENOBUFS; + + ret = fscache_read_or_alloc_page(ci->fscache, page, + ceph_readpage_from_fscache_complete, NULL, + GFP_KERNEL); + + switch (ret) { + case 0: /* Page found */ + dout("page read submitted\n"); + return 0; + case -ENOBUFS: /* Pages were not found, and can't be */ + case -ENODATA: /* Pages were not found */ + dout("page/inode not in cache\n"); + return ret; + default: + dout("%s: unknown error ret = %i\n", __func__, ret); + return ret; + } +} + +int ceph_readpages_from_fscache(struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int ret; + + if (!cache_valid(ci)) + return -ENOBUFS; + + ret = fscache_read_or_alloc_pages(ci->fscache, mapping, pages, nr_pages, + ceph_readpage_from_fscache_complete, + NULL, mapping_gfp_mask(mapping)); + + switch (ret) { + case 0: /* All pages found */ + dout("all-page read submitted\n"); + return 0; + case -ENOBUFS: /* Some pages were not found, and can't be */ + case -ENODATA: /* some pages were not found */ + dout("page/inode not in cache\n"); + return ret; + default: + dout("%s: unknown error ret = %i\n", __func__, ret); + return ret; + } +} + +void ceph_readpage_to_fscache(struct inode *inode, struct page *page) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int ret; + + if (!PageFsCache(page)) + return; + + if (!cache_valid(ci)) + return; + + ret = fscache_write_page(ci->fscache, page, i_size_read(inode), + GFP_KERNEL); + if (ret) + fscache_uncache_page(ci->fscache, page); +} + +void ceph_invalidate_fscache_page(struct inode* inode, struct page *page) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + if (!PageFsCache(page)) + return; + + fscache_wait_on_page_write(ci->fscache, page); + fscache_uncache_page(ci->fscache, page); +} + +void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc) +{ + if (fscache_cookie_valid(fsc->fscache)) { + struct ceph_fscache_entry *ent; + bool found = false; + + mutex_lock(&ceph_fscache_lock); + list_for_each_entry(ent, &ceph_fscache_list, list) { + if (ent->fscache == fsc->fscache) { + list_del(&ent->list); + kfree(ent); + found = true; + break; + } + } + WARN_ON_ONCE(!found); + mutex_unlock(&ceph_fscache_lock); + + __fscache_relinquish_cookie(fsc->fscache, NULL, false); + } + fsc->fscache = NULL; +} + +/* + * caller should hold CEPH_CAP_FILE_{RD,CACHE} + */ +void ceph_fscache_revalidate_cookie(struct ceph_inode_info *ci) +{ + if (cache_valid(ci)) + return; + + /* resue i_truncate_mutex. There should be no pending + * truncate while the caller holds CEPH_CAP_FILE_RD */ + mutex_lock(&ci->i_truncate_mutex); + if (!cache_valid(ci)) { + if (fscache_check_consistency(ci->fscache, &ci->i_vino)) + fscache_invalidate(ci->fscache); + spin_lock(&ci->i_ceph_lock); + ci->i_fscache_gen = ci->i_rdcache_gen; + spin_unlock(&ci->i_ceph_lock); + } + mutex_unlock(&ci->i_truncate_mutex); +} diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h new file mode 100644 index 000000000..89dbdd1eb --- /dev/null +++ b/fs/ceph/cache.h @@ -0,0 +1,176 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Ceph cache definitions. + * + * Copyright (C) 2013 by Adfin Solutions, Inc. All Rights Reserved. + * Written by Milosz Tanski (milosz@adfin.com) + */ + +#ifndef _CEPH_CACHE_H +#define _CEPH_CACHE_H + +#ifdef CONFIG_CEPH_FSCACHE + +extern struct fscache_netfs ceph_cache_netfs; + +int ceph_fscache_register(void); +void ceph_fscache_unregister(void); + +int ceph_fscache_register_fs(struct ceph_fs_client* fsc, struct fs_context *fc); +void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc); + +void ceph_fscache_register_inode_cookie(struct inode *inode); +void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci); +void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp); +void ceph_fscache_revalidate_cookie(struct ceph_inode_info *ci); + +int ceph_readpage_from_fscache(struct inode *inode, struct page *page); +int ceph_readpages_from_fscache(struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages); +void ceph_readpage_to_fscache(struct inode *inode, struct page *page); +void ceph_invalidate_fscache_page(struct inode* inode, struct page *page); + +static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci) +{ + ci->fscache = NULL; + ci->i_fscache_gen = 0; +} + +static inline void ceph_fscache_invalidate(struct inode *inode) +{ + fscache_invalidate(ceph_inode(inode)->fscache); +} + +static inline void ceph_fscache_uncache_page(struct inode *inode, + struct page *page) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + return fscache_uncache_page(ci->fscache, page); +} + +static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp) +{ + struct inode* inode = page->mapping->host; + struct ceph_inode_info *ci = ceph_inode(inode); + return fscache_maybe_release_page(ci->fscache, page, gfp); +} + +static inline void ceph_fscache_readpage_cancel(struct inode *inode, + struct page *page) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + if (fscache_cookie_valid(ci->fscache) && PageFsCache(page)) + __fscache_uncache_page(ci->fscache, page); +} + +static inline void ceph_fscache_readpages_cancel(struct inode *inode, + struct list_head *pages) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + return fscache_readpages_cancel(ci->fscache, pages); +} + +static inline void ceph_disable_fscache_readpage(struct ceph_inode_info *ci) +{ + ci->i_fscache_gen = ci->i_rdcache_gen - 1; +} + +#else + +static inline int ceph_fscache_register(void) +{ + return 0; +} + +static inline void ceph_fscache_unregister(void) +{ +} + +static inline int ceph_fscache_register_fs(struct ceph_fs_client* fsc, + struct fs_context *fc) +{ + return 0; +} + +static inline void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc) +{ +} + +static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci) +{ +} + +static inline void ceph_fscache_register_inode_cookie(struct inode *inode) +{ +} + +static inline void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci) +{ +} + +static inline void ceph_fscache_file_set_cookie(struct inode *inode, + struct file *filp) +{ +} + +static inline void ceph_fscache_revalidate_cookie(struct ceph_inode_info *ci) +{ +} + +static inline void ceph_fscache_uncache_page(struct inode *inode, + struct page *pages) +{ +} + +static inline int ceph_readpage_from_fscache(struct inode* inode, + struct page *page) +{ + return -ENOBUFS; +} + +static inline int ceph_readpages_from_fscache(struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages) +{ + return -ENOBUFS; +} + +static inline void ceph_readpage_to_fscache(struct inode *inode, + struct page *page) +{ +} + +static inline void ceph_fscache_invalidate(struct inode *inode) +{ +} + +static inline void ceph_invalidate_fscache_page(struct inode *inode, + struct page *page) +{ +} + +static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp) +{ + return 1; +} + +static inline void ceph_fscache_readpage_cancel(struct inode *inode, + struct page *page) +{ +} + +static inline void ceph_fscache_readpages_cancel(struct inode *inode, + struct list_head *pages) +{ +} + +static inline void ceph_disable_fscache_readpage(struct ceph_inode_info *ci) +{ +} + +#endif + +#endif diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c new file mode 100644 index 000000000..432dc2a16 --- /dev/null +++ b/fs/ceph/caps.c @@ -0,0 +1,4637 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ceph/ceph_debug.h> + +#include <linux/fs.h> +#include <linux/kernel.h> +#include <linux/sched/signal.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/wait.h> +#include <linux/writeback.h> +#include <linux/iversion.h> + +#include "super.h" +#include "mds_client.h" +#include "cache.h" +#include <linux/ceph/decode.h> +#include <linux/ceph/messenger.h> + +/* + * Capability management + * + * The Ceph metadata servers control client access to inode metadata + * and file data by issuing capabilities, granting clients permission + * to read and/or write both inode field and file data to OSDs + * (storage nodes). Each capability consists of a set of bits + * indicating which operations are allowed. + * + * If the client holds a *_SHARED cap, the client has a coherent value + * that can be safely read from the cached inode. + * + * In the case of a *_EXCL (exclusive) or FILE_WR capabilities, the + * client is allowed to change inode attributes (e.g., file size, + * mtime), note its dirty state in the ceph_cap, and asynchronously + * flush that metadata change to the MDS. + * + * In the event of a conflicting operation (perhaps by another + * client), the MDS will revoke the conflicting client capabilities. + * + * In order for a client to cache an inode, it must hold a capability + * with at least one MDS server. When inodes are released, release + * notifications are batched and periodically sent en masse to the MDS + * cluster to release server state. + */ + +static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc); +static void __kick_flushing_caps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + struct ceph_inode_info *ci, + u64 oldest_flush_tid); + +/* + * Generate readable cap strings for debugging output. + */ +#define MAX_CAP_STR 20 +static char cap_str[MAX_CAP_STR][40]; +static DEFINE_SPINLOCK(cap_str_lock); +static int last_cap_str; + +static char *gcap_string(char *s, int c) +{ + if (c & CEPH_CAP_GSHARED) + *s++ = 's'; + if (c & CEPH_CAP_GEXCL) + *s++ = 'x'; + if (c & CEPH_CAP_GCACHE) + *s++ = 'c'; + if (c & CEPH_CAP_GRD) + *s++ = 'r'; + if (c & CEPH_CAP_GWR) + *s++ = 'w'; + if (c & CEPH_CAP_GBUFFER) + *s++ = 'b'; + if (c & CEPH_CAP_GWREXTEND) + *s++ = 'a'; + if (c & CEPH_CAP_GLAZYIO) + *s++ = 'l'; + return s; +} + +const char *ceph_cap_string(int caps) +{ + int i; + char *s; + int c; + + spin_lock(&cap_str_lock); + i = last_cap_str++; + if (last_cap_str == MAX_CAP_STR) + last_cap_str = 0; + spin_unlock(&cap_str_lock); + + s = cap_str[i]; + + if (caps & CEPH_CAP_PIN) + *s++ = 'p'; + + c = (caps >> CEPH_CAP_SAUTH) & 3; + if (c) { + *s++ = 'A'; + s = gcap_string(s, c); + } + + c = (caps >> CEPH_CAP_SLINK) & 3; + if (c) { + *s++ = 'L'; + s = gcap_string(s, c); + } + + c = (caps >> CEPH_CAP_SXATTR) & 3; + if (c) { + *s++ = 'X'; + s = gcap_string(s, c); + } + + c = caps >> CEPH_CAP_SFILE; + if (c) { + *s++ = 'F'; + s = gcap_string(s, c); + } + + if (s == cap_str[i]) + *s++ = '-'; + *s = 0; + return cap_str[i]; +} + +void ceph_caps_init(struct ceph_mds_client *mdsc) +{ + INIT_LIST_HEAD(&mdsc->caps_list); + spin_lock_init(&mdsc->caps_list_lock); +} + +void ceph_caps_finalize(struct ceph_mds_client *mdsc) +{ + struct ceph_cap *cap; + + spin_lock(&mdsc->caps_list_lock); + while (!list_empty(&mdsc->caps_list)) { + cap = list_first_entry(&mdsc->caps_list, + struct ceph_cap, caps_item); + list_del(&cap->caps_item); + kmem_cache_free(ceph_cap_cachep, cap); + } + mdsc->caps_total_count = 0; + mdsc->caps_avail_count = 0; + mdsc->caps_use_count = 0; + mdsc->caps_reserve_count = 0; + mdsc->caps_min_count = 0; + spin_unlock(&mdsc->caps_list_lock); +} + +void ceph_adjust_caps_max_min(struct ceph_mds_client *mdsc, + struct ceph_mount_options *fsopt) +{ + spin_lock(&mdsc->caps_list_lock); + mdsc->caps_min_count = fsopt->max_readdir; + if (mdsc->caps_min_count < 1024) + mdsc->caps_min_count = 1024; + mdsc->caps_use_max = fsopt->caps_max; + if (mdsc->caps_use_max > 0 && + mdsc->caps_use_max < mdsc->caps_min_count) + mdsc->caps_use_max = mdsc->caps_min_count; + spin_unlock(&mdsc->caps_list_lock); +} + +static void __ceph_unreserve_caps(struct ceph_mds_client *mdsc, int nr_caps) +{ + struct ceph_cap *cap; + int i; + + if (nr_caps) { + BUG_ON(mdsc->caps_reserve_count < nr_caps); + mdsc->caps_reserve_count -= nr_caps; + if (mdsc->caps_avail_count >= + mdsc->caps_reserve_count + mdsc->caps_min_count) { + mdsc->caps_total_count -= nr_caps; + for (i = 0; i < nr_caps; i++) { + cap = list_first_entry(&mdsc->caps_list, + struct ceph_cap, caps_item); + list_del(&cap->caps_item); + kmem_cache_free(ceph_cap_cachep, cap); + } + } else { + mdsc->caps_avail_count += nr_caps; + } + + dout("%s: caps %d = %d used + %d resv + %d avail\n", + __func__, + mdsc->caps_total_count, mdsc->caps_use_count, + mdsc->caps_reserve_count, mdsc->caps_avail_count); + BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + + mdsc->caps_reserve_count + + mdsc->caps_avail_count); + } +} + +/* + * Called under mdsc->mutex. + */ +int ceph_reserve_caps(struct ceph_mds_client *mdsc, + struct ceph_cap_reservation *ctx, int need) +{ + int i, j; + struct ceph_cap *cap; + int have; + int alloc = 0; + int max_caps; + int err = 0; + bool trimmed = false; + struct ceph_mds_session *s; + LIST_HEAD(newcaps); + + dout("reserve caps ctx=%p need=%d\n", ctx, need); + + /* first reserve any caps that are already allocated */ + spin_lock(&mdsc->caps_list_lock); + if (mdsc->caps_avail_count >= need) + have = need; + else + have = mdsc->caps_avail_count; + mdsc->caps_avail_count -= have; + mdsc->caps_reserve_count += have; + BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + + mdsc->caps_reserve_count + + mdsc->caps_avail_count); + spin_unlock(&mdsc->caps_list_lock); + + for (i = have; i < need; ) { + cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); + if (cap) { + list_add(&cap->caps_item, &newcaps); + alloc++; + i++; + continue; + } + + if (!trimmed) { + for (j = 0; j < mdsc->max_sessions; j++) { + s = __ceph_lookup_mds_session(mdsc, j); + if (!s) + continue; + mutex_unlock(&mdsc->mutex); + + mutex_lock(&s->s_mutex); + max_caps = s->s_nr_caps - (need - i); + ceph_trim_caps(mdsc, s, max_caps); + mutex_unlock(&s->s_mutex); + + ceph_put_mds_session(s); + mutex_lock(&mdsc->mutex); + } + trimmed = true; + + spin_lock(&mdsc->caps_list_lock); + if (mdsc->caps_avail_count) { + int more_have; + if (mdsc->caps_avail_count >= need - i) + more_have = need - i; + else + more_have = mdsc->caps_avail_count; + + i += more_have; + have += more_have; + mdsc->caps_avail_count -= more_have; + mdsc->caps_reserve_count += more_have; + + } + spin_unlock(&mdsc->caps_list_lock); + + continue; + } + + pr_warn("reserve caps ctx=%p ENOMEM need=%d got=%d\n", + ctx, need, have + alloc); + err = -ENOMEM; + break; + } + + if (!err) { + BUG_ON(have + alloc != need); + ctx->count = need; + ctx->used = 0; + } + + spin_lock(&mdsc->caps_list_lock); + mdsc->caps_total_count += alloc; + mdsc->caps_reserve_count += alloc; + list_splice(&newcaps, &mdsc->caps_list); + + BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + + mdsc->caps_reserve_count + + mdsc->caps_avail_count); + + if (err) + __ceph_unreserve_caps(mdsc, have + alloc); + + spin_unlock(&mdsc->caps_list_lock); + + dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n", + ctx, mdsc->caps_total_count, mdsc->caps_use_count, + mdsc->caps_reserve_count, mdsc->caps_avail_count); + return err; +} + +void ceph_unreserve_caps(struct ceph_mds_client *mdsc, + struct ceph_cap_reservation *ctx) +{ + bool reclaim = false; + if (!ctx->count) + return; + + dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count); + spin_lock(&mdsc->caps_list_lock); + __ceph_unreserve_caps(mdsc, ctx->count); + ctx->count = 0; + + if (mdsc->caps_use_max > 0 && + mdsc->caps_use_count > mdsc->caps_use_max) + reclaim = true; + spin_unlock(&mdsc->caps_list_lock); + + if (reclaim) + ceph_reclaim_caps_nr(mdsc, ctx->used); +} + +struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc, + struct ceph_cap_reservation *ctx) +{ + struct ceph_cap *cap = NULL; + + /* temporary, until we do something about cap import/export */ + if (!ctx) { + cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); + if (cap) { + spin_lock(&mdsc->caps_list_lock); + mdsc->caps_use_count++; + mdsc->caps_total_count++; + spin_unlock(&mdsc->caps_list_lock); + } else { + spin_lock(&mdsc->caps_list_lock); + if (mdsc->caps_avail_count) { + BUG_ON(list_empty(&mdsc->caps_list)); + + mdsc->caps_avail_count--; + mdsc->caps_use_count++; + cap = list_first_entry(&mdsc->caps_list, + struct ceph_cap, caps_item); + list_del(&cap->caps_item); + + BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + + mdsc->caps_reserve_count + mdsc->caps_avail_count); + } + spin_unlock(&mdsc->caps_list_lock); + } + + return cap; + } + + spin_lock(&mdsc->caps_list_lock); + dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n", + ctx, ctx->count, mdsc->caps_total_count, mdsc->caps_use_count, + mdsc->caps_reserve_count, mdsc->caps_avail_count); + BUG_ON(!ctx->count); + BUG_ON(ctx->count > mdsc->caps_reserve_count); + BUG_ON(list_empty(&mdsc->caps_list)); + + ctx->count--; + ctx->used++; + mdsc->caps_reserve_count--; + mdsc->caps_use_count++; + + cap = list_first_entry(&mdsc->caps_list, struct ceph_cap, caps_item); + list_del(&cap->caps_item); + + BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + + mdsc->caps_reserve_count + mdsc->caps_avail_count); + spin_unlock(&mdsc->caps_list_lock); + return cap; +} + +void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap) +{ + spin_lock(&mdsc->caps_list_lock); + dout("put_cap %p %d = %d used + %d resv + %d avail\n", + cap, mdsc->caps_total_count, mdsc->caps_use_count, + mdsc->caps_reserve_count, mdsc->caps_avail_count); + mdsc->caps_use_count--; + /* + * Keep some preallocated caps around (ceph_min_count), to + * avoid lots of free/alloc churn. + */ + if (mdsc->caps_avail_count >= mdsc->caps_reserve_count + + mdsc->caps_min_count) { + mdsc->caps_total_count--; + kmem_cache_free(ceph_cap_cachep, cap); + } else { + mdsc->caps_avail_count++; + list_add(&cap->caps_item, &mdsc->caps_list); + } + + BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + + mdsc->caps_reserve_count + mdsc->caps_avail_count); + spin_unlock(&mdsc->caps_list_lock); +} + +void ceph_reservation_status(struct ceph_fs_client *fsc, + int *total, int *avail, int *used, int *reserved, + int *min) +{ + struct ceph_mds_client *mdsc = fsc->mdsc; + + spin_lock(&mdsc->caps_list_lock); + + if (total) + *total = mdsc->caps_total_count; + if (avail) + *avail = mdsc->caps_avail_count; + if (used) + *used = mdsc->caps_use_count; + if (reserved) + *reserved = mdsc->caps_reserve_count; + if (min) + *min = mdsc->caps_min_count; + + spin_unlock(&mdsc->caps_list_lock); +} + +/* + * Find ceph_cap for given mds, if any. + * + * Called with i_ceph_lock held. + */ +static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds) +{ + struct ceph_cap *cap; + struct rb_node *n = ci->i_caps.rb_node; + + while (n) { + cap = rb_entry(n, struct ceph_cap, ci_node); + if (mds < cap->mds) + n = n->rb_left; + else if (mds > cap->mds) + n = n->rb_right; + else + return cap; + } + return NULL; +} + +struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds) +{ + struct ceph_cap *cap; + + spin_lock(&ci->i_ceph_lock); + cap = __get_cap_for_mds(ci, mds); + spin_unlock(&ci->i_ceph_lock); + return cap; +} + +/* + * Called under i_ceph_lock. + */ +static void __insert_cap_node(struct ceph_inode_info *ci, + struct ceph_cap *new) +{ + struct rb_node **p = &ci->i_caps.rb_node; + struct rb_node *parent = NULL; + struct ceph_cap *cap = NULL; + + while (*p) { + parent = *p; + cap = rb_entry(parent, struct ceph_cap, ci_node); + if (new->mds < cap->mds) + p = &(*p)->rb_left; + else if (new->mds > cap->mds) + p = &(*p)->rb_right; + else + BUG(); + } + + rb_link_node(&new->ci_node, parent, p); + rb_insert_color(&new->ci_node, &ci->i_caps); +} + +/* + * (re)set cap hold timeouts, which control the delayed release + * of unused caps back to the MDS. Should be called on cap use. + */ +static void __cap_set_timeouts(struct ceph_mds_client *mdsc, + struct ceph_inode_info *ci) +{ + struct ceph_mount_options *opt = mdsc->fsc->mount_options; + ci->i_hold_caps_max = round_jiffies(jiffies + + opt->caps_wanted_delay_max * HZ); + dout("__cap_set_timeouts %p %lu\n", &ci->vfs_inode, + ci->i_hold_caps_max - jiffies); +} + +/* + * (Re)queue cap at the end of the delayed cap release list. + * + * If I_FLUSH is set, leave the inode at the front of the list. + * + * Caller holds i_ceph_lock + * -> we take mdsc->cap_delay_lock + */ +static void __cap_delay_requeue(struct ceph_mds_client *mdsc, + struct ceph_inode_info *ci) +{ + dout("__cap_delay_requeue %p flags 0x%lx at %lu\n", &ci->vfs_inode, + ci->i_ceph_flags, ci->i_hold_caps_max); + if (!mdsc->stopping) { + spin_lock(&mdsc->cap_delay_lock); + if (!list_empty(&ci->i_cap_delay_list)) { + if (ci->i_ceph_flags & CEPH_I_FLUSH) + goto no_change; + list_del_init(&ci->i_cap_delay_list); + } + __cap_set_timeouts(mdsc, ci); + list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list); +no_change: + spin_unlock(&mdsc->cap_delay_lock); + } +} + +/* + * Queue an inode for immediate writeback. Mark inode with I_FLUSH, + * indicating we should send a cap message to flush dirty metadata + * asap, and move to the front of the delayed cap list. + */ +static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc, + struct ceph_inode_info *ci) +{ + dout("__cap_delay_requeue_front %p\n", &ci->vfs_inode); + spin_lock(&mdsc->cap_delay_lock); + ci->i_ceph_flags |= CEPH_I_FLUSH; + if (!list_empty(&ci->i_cap_delay_list)) + list_del_init(&ci->i_cap_delay_list); + list_add(&ci->i_cap_delay_list, &mdsc->cap_delay_list); + spin_unlock(&mdsc->cap_delay_lock); +} + +/* + * Cancel delayed work on cap. + * + * Caller must hold i_ceph_lock. + */ +static void __cap_delay_cancel(struct ceph_mds_client *mdsc, + struct ceph_inode_info *ci) +{ + dout("__cap_delay_cancel %p\n", &ci->vfs_inode); + if (list_empty(&ci->i_cap_delay_list)) + return; + spin_lock(&mdsc->cap_delay_lock); + list_del_init(&ci->i_cap_delay_list); + spin_unlock(&mdsc->cap_delay_lock); +} + +/* Common issue checks for add_cap, handle_cap_grant. */ +static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, + unsigned issued) +{ + unsigned had = __ceph_caps_issued(ci, NULL); + + lockdep_assert_held(&ci->i_ceph_lock); + + /* + * Each time we receive FILE_CACHE anew, we increment + * i_rdcache_gen. + */ + if (S_ISREG(ci->vfs_inode.i_mode) && + (issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && + (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) { + ci->i_rdcache_gen++; + } + + /* + * If FILE_SHARED is newly issued, mark dir not complete. We don't + * know what happened to this directory while we didn't have the cap. + * If FILE_SHARED is being revoked, also mark dir not complete. It + * stops on-going cached readdir. + */ + if ((issued & CEPH_CAP_FILE_SHARED) != (had & CEPH_CAP_FILE_SHARED)) { + if (issued & CEPH_CAP_FILE_SHARED) + atomic_inc(&ci->i_shared_gen); + if (S_ISDIR(ci->vfs_inode.i_mode)) { + dout(" marking %p NOT complete\n", &ci->vfs_inode); + __ceph_dir_clear_complete(ci); + } + } + + /* Wipe saved layout if we're losing DIR_CREATE caps */ + if (S_ISDIR(ci->vfs_inode.i_mode) && (had & CEPH_CAP_DIR_CREATE) && + !(issued & CEPH_CAP_DIR_CREATE)) { + ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns)); + memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout)); + } +} + +/** + * change_auth_cap_ses - move inode to appropriate lists when auth caps change + * @ci: inode to be moved + * @session: new auth caps session + */ +static void change_auth_cap_ses(struct ceph_inode_info *ci, + struct ceph_mds_session *session) +{ + lockdep_assert_held(&ci->i_ceph_lock); + + if (list_empty(&ci->i_dirty_item) && list_empty(&ci->i_flushing_item)) + return; + + spin_lock(&session->s_mdsc->cap_dirty_lock); + if (!list_empty(&ci->i_dirty_item)) + list_move(&ci->i_dirty_item, &session->s_cap_dirty); + if (!list_empty(&ci->i_flushing_item)) + list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing); + spin_unlock(&session->s_mdsc->cap_dirty_lock); +} + +/* + * Add a capability under the given MDS session. + * + * Caller should hold session snap_rwsem (read) and ci->i_ceph_lock + * + * @fmode is the open file mode, if we are opening a file, otherwise + * it is < 0. (This is so we can atomically add the cap and add an + * open file reference to it.) + */ +void ceph_add_cap(struct inode *inode, + struct ceph_mds_session *session, u64 cap_id, + unsigned issued, unsigned wanted, + unsigned seq, unsigned mseq, u64 realmino, int flags, + struct ceph_cap **new_cap) +{ + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_cap *cap; + int mds = session->s_mds; + int actual_wanted; + u32 gen; + + lockdep_assert_held(&ci->i_ceph_lock); + + dout("add_cap %p mds%d cap %llx %s seq %d\n", inode, + session->s_mds, cap_id, ceph_cap_string(issued), seq); + + spin_lock(&session->s_gen_ttl_lock); + gen = session->s_cap_gen; + spin_unlock(&session->s_gen_ttl_lock); + + cap = __get_cap_for_mds(ci, mds); + if (!cap) { + cap = *new_cap; + *new_cap = NULL; + + cap->issued = 0; + cap->implemented = 0; + cap->mds = mds; + cap->mds_wanted = 0; + cap->mseq = 0; + + cap->ci = ci; + __insert_cap_node(ci, cap); + + /* add to session cap list */ + cap->session = session; + spin_lock(&session->s_cap_lock); + list_add_tail(&cap->session_caps, &session->s_caps); + session->s_nr_caps++; + atomic64_inc(&mdsc->metric.total_caps); + spin_unlock(&session->s_cap_lock); + } else { + spin_lock(&session->s_cap_lock); + list_move_tail(&cap->session_caps, &session->s_caps); + spin_unlock(&session->s_cap_lock); + + if (cap->cap_gen < gen) + cap->issued = cap->implemented = CEPH_CAP_PIN; + + /* + * auth mds of the inode changed. we received the cap export + * message, but still haven't received the cap import message. + * handle_cap_export() updated the new auth MDS' cap. + * + * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing + * a message that was send before the cap import message. So + * don't remove caps. + */ + if (ceph_seq_cmp(seq, cap->seq) <= 0) { + WARN_ON(cap != ci->i_auth_cap); + WARN_ON(cap->cap_id != cap_id); + seq = cap->seq; + mseq = cap->mseq; + issued |= cap->issued; + flags |= CEPH_CAP_FLAG_AUTH; + } + } + + if (!ci->i_snap_realm || + ((flags & CEPH_CAP_FLAG_AUTH) && + realmino != (u64)-1 && ci->i_snap_realm->ino != realmino)) { + /* + * add this inode to the appropriate snap realm + */ + struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc, + realmino); + if (realm) { + struct ceph_snap_realm *oldrealm = ci->i_snap_realm; + if (oldrealm) { + spin_lock(&oldrealm->inodes_with_caps_lock); + list_del_init(&ci->i_snap_realm_item); + spin_unlock(&oldrealm->inodes_with_caps_lock); + } + + spin_lock(&realm->inodes_with_caps_lock); + list_add(&ci->i_snap_realm_item, + &realm->inodes_with_caps); + ci->i_snap_realm = realm; + if (realm->ino == ci->i_vino.ino) + realm->inode = inode; + spin_unlock(&realm->inodes_with_caps_lock); + + if (oldrealm) + ceph_put_snap_realm(mdsc, oldrealm); + } else { + pr_err("ceph_add_cap: couldn't find snap realm %llx\n", + realmino); + WARN_ON(!realm); + } + } + + __check_cap_issue(ci, cap, issued); + + /* + * If we are issued caps we don't want, or the mds' wanted + * value appears to be off, queue a check so we'll release + * later and/or update the mds wanted value. + */ + actual_wanted = __ceph_caps_wanted(ci); + if ((wanted & ~actual_wanted) || + (issued & ~actual_wanted & CEPH_CAP_ANY_WR)) { + dout(" issued %s, mds wanted %s, actual %s, queueing\n", + ceph_cap_string(issued), ceph_cap_string(wanted), + ceph_cap_string(actual_wanted)); + __cap_delay_requeue(mdsc, ci); + } + + if (flags & CEPH_CAP_FLAG_AUTH) { + if (!ci->i_auth_cap || + ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) { + if (ci->i_auth_cap && + ci->i_auth_cap->session != cap->session) + change_auth_cap_ses(ci, cap->session); + ci->i_auth_cap = cap; + cap->mds_wanted = wanted; + } + } else { + WARN_ON(ci->i_auth_cap == cap); + } + + dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n", + inode, ceph_vinop(inode), cap, ceph_cap_string(issued), + ceph_cap_string(issued|cap->issued), seq, mds); + cap->cap_id = cap_id; + cap->issued = issued; + cap->implemented |= issued; + if (ceph_seq_cmp(mseq, cap->mseq) > 0) + cap->mds_wanted = wanted; + else + cap->mds_wanted |= wanted; + cap->seq = seq; + cap->issue_seq = seq; + cap->mseq = mseq; + cap->cap_gen = gen; +} + +/* + * Return true if cap has not timed out and belongs to the current + * generation of the MDS session (i.e. has not gone 'stale' due to + * us losing touch with the mds). + */ +static int __cap_is_valid(struct ceph_cap *cap) +{ + unsigned long ttl; + u32 gen; + + spin_lock(&cap->session->s_gen_ttl_lock); + gen = cap->session->s_cap_gen; + ttl = cap->session->s_cap_ttl; + spin_unlock(&cap->session->s_gen_ttl_lock); + + if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) { + dout("__cap_is_valid %p cap %p issued %s " + "but STALE (gen %u vs %u)\n", &cap->ci->vfs_inode, + cap, ceph_cap_string(cap->issued), cap->cap_gen, gen); + return 0; + } + + return 1; +} + +/* + * Return set of valid cap bits issued to us. Note that caps time + * out, and may be invalidated in bulk if the client session times out + * and session->s_cap_gen is bumped. + */ +int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented) +{ + int have = ci->i_snap_caps; + struct ceph_cap *cap; + struct rb_node *p; + + if (implemented) + *implemented = 0; + for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { + cap = rb_entry(p, struct ceph_cap, ci_node); + if (!__cap_is_valid(cap)) + continue; + dout("__ceph_caps_issued %p cap %p issued %s\n", + &ci->vfs_inode, cap, ceph_cap_string(cap->issued)); + have |= cap->issued; + if (implemented) + *implemented |= cap->implemented; + } + /* + * exclude caps issued by non-auth MDS, but are been revoking + * by the auth MDS. The non-auth MDS should be revoking/exporting + * these caps, but the message is delayed. + */ + if (ci->i_auth_cap) { + cap = ci->i_auth_cap; + have &= ~cap->implemented | cap->issued; + } + return have; +} + +/* + * Get cap bits issued by caps other than @ocap + */ +int __ceph_caps_issued_other(struct ceph_inode_info *ci, struct ceph_cap *ocap) +{ + int have = ci->i_snap_caps; + struct ceph_cap *cap; + struct rb_node *p; + + for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { + cap = rb_entry(p, struct ceph_cap, ci_node); + if (cap == ocap) + continue; + if (!__cap_is_valid(cap)) + continue; + have |= cap->issued; + } + return have; +} + +/* + * Move a cap to the end of the LRU (oldest caps at list head, newest + * at list tail). + */ +static void __touch_cap(struct ceph_cap *cap) +{ + struct ceph_mds_session *s = cap->session; + + spin_lock(&s->s_cap_lock); + if (!s->s_cap_iterator) { + dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap, + s->s_mds); + list_move_tail(&cap->session_caps, &s->s_caps); + } else { + dout("__touch_cap %p cap %p mds%d NOP, iterating over caps\n", + &cap->ci->vfs_inode, cap, s->s_mds); + } + spin_unlock(&s->s_cap_lock); +} + +/* + * Check if we hold the given mask. If so, move the cap(s) to the + * front of their respective LRUs. (This is the preferred way for + * callers to check for caps they want.) + */ +int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) +{ + struct ceph_cap *cap; + struct rb_node *p; + int have = ci->i_snap_caps; + + if ((have & mask) == mask) { + dout("__ceph_caps_issued_mask ino 0x%llx snap issued %s" + " (mask %s)\n", ceph_ino(&ci->vfs_inode), + ceph_cap_string(have), + ceph_cap_string(mask)); + return 1; + } + + for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { + cap = rb_entry(p, struct ceph_cap, ci_node); + if (!__cap_is_valid(cap)) + continue; + if ((cap->issued & mask) == mask) { + dout("__ceph_caps_issued_mask ino 0x%llx cap %p issued %s" + " (mask %s)\n", ceph_ino(&ci->vfs_inode), cap, + ceph_cap_string(cap->issued), + ceph_cap_string(mask)); + if (touch) + __touch_cap(cap); + return 1; + } + + /* does a combination of caps satisfy mask? */ + have |= cap->issued; + if ((have & mask) == mask) { + dout("__ceph_caps_issued_mask ino 0x%llx combo issued %s" + " (mask %s)\n", ceph_ino(&ci->vfs_inode), + ceph_cap_string(cap->issued), + ceph_cap_string(mask)); + if (touch) { + struct rb_node *q; + + /* touch this + preceding caps */ + __touch_cap(cap); + for (q = rb_first(&ci->i_caps); q != p; + q = rb_next(q)) { + cap = rb_entry(q, struct ceph_cap, + ci_node); + if (!__cap_is_valid(cap)) + continue; + if (cap->issued & mask) + __touch_cap(cap); + } + } + return 1; + } + } + + return 0; +} + +int __ceph_caps_issued_mask_metric(struct ceph_inode_info *ci, int mask, + int touch) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); + int r; + + r = __ceph_caps_issued_mask(ci, mask, touch); + if (r) + ceph_update_cap_hit(&fsc->mdsc->metric); + else + ceph_update_cap_mis(&fsc->mdsc->metric); + return r; +} + +/* + * Return true if mask caps are currently being revoked by an MDS. + */ +int __ceph_caps_revoking_other(struct ceph_inode_info *ci, + struct ceph_cap *ocap, int mask) +{ + struct ceph_cap *cap; + struct rb_node *p; + + for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { + cap = rb_entry(p, struct ceph_cap, ci_node); + if (cap != ocap && + (cap->implemented & ~cap->issued & mask)) + return 1; + } + return 0; +} + +int ceph_caps_revoking(struct ceph_inode_info *ci, int mask) +{ + struct inode *inode = &ci->vfs_inode; + int ret; + + spin_lock(&ci->i_ceph_lock); + ret = __ceph_caps_revoking_other(ci, NULL, mask); + spin_unlock(&ci->i_ceph_lock); + dout("ceph_caps_revoking %p %s = %d\n", inode, + ceph_cap_string(mask), ret); + return ret; +} + +int __ceph_caps_used(struct ceph_inode_info *ci) +{ + int used = 0; + if (ci->i_pin_ref) + used |= CEPH_CAP_PIN; + if (ci->i_rd_ref) + used |= CEPH_CAP_FILE_RD; + if (ci->i_rdcache_ref || + (S_ISREG(ci->vfs_inode.i_mode) && + ci->vfs_inode.i_data.nrpages)) + used |= CEPH_CAP_FILE_CACHE; + if (ci->i_wr_ref) + used |= CEPH_CAP_FILE_WR; + if (ci->i_wb_ref || ci->i_wrbuffer_ref) + used |= CEPH_CAP_FILE_BUFFER; + if (ci->i_fx_ref) + used |= CEPH_CAP_FILE_EXCL; + return used; +} + +#define FMODE_WAIT_BIAS 1000 + +/* + * wanted, by virtue of open file modes + */ +int __ceph_caps_file_wanted(struct ceph_inode_info *ci) +{ + const int PIN_SHIFT = ffs(CEPH_FILE_MODE_PIN); + const int RD_SHIFT = ffs(CEPH_FILE_MODE_RD); + const int WR_SHIFT = ffs(CEPH_FILE_MODE_WR); + const int LAZY_SHIFT = ffs(CEPH_FILE_MODE_LAZY); + struct ceph_mount_options *opt = + ceph_inode_to_client(&ci->vfs_inode)->mount_options; + unsigned long used_cutoff = jiffies - opt->caps_wanted_delay_max * HZ; + unsigned long idle_cutoff = jiffies - opt->caps_wanted_delay_min * HZ; + + if (S_ISDIR(ci->vfs_inode.i_mode)) { + int want = 0; + + /* use used_cutoff here, to keep dir's wanted caps longer */ + if (ci->i_nr_by_mode[RD_SHIFT] > 0 || + time_after(ci->i_last_rd, used_cutoff)) + want |= CEPH_CAP_ANY_SHARED; + + if (ci->i_nr_by_mode[WR_SHIFT] > 0 || + time_after(ci->i_last_wr, used_cutoff)) { + want |= CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL; + if (opt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS) + want |= CEPH_CAP_ANY_DIR_OPS; + } + + if (want || ci->i_nr_by_mode[PIN_SHIFT] > 0) + want |= CEPH_CAP_PIN; + + return want; + } else { + int bits = 0; + + if (ci->i_nr_by_mode[RD_SHIFT] > 0) { + if (ci->i_nr_by_mode[RD_SHIFT] >= FMODE_WAIT_BIAS || + time_after(ci->i_last_rd, used_cutoff)) + bits |= 1 << RD_SHIFT; + } else if (time_after(ci->i_last_rd, idle_cutoff)) { + bits |= 1 << RD_SHIFT; + } + + if (ci->i_nr_by_mode[WR_SHIFT] > 0) { + if (ci->i_nr_by_mode[WR_SHIFT] >= FMODE_WAIT_BIAS || + time_after(ci->i_last_wr, used_cutoff)) + bits |= 1 << WR_SHIFT; + } else if (time_after(ci->i_last_wr, idle_cutoff)) { + bits |= 1 << WR_SHIFT; + } + + /* check lazyio only when read/write is wanted */ + if ((bits & (CEPH_FILE_MODE_RDWR << 1)) && + ci->i_nr_by_mode[LAZY_SHIFT] > 0) + bits |= 1 << LAZY_SHIFT; + + return bits ? ceph_caps_for_mode(bits >> 1) : 0; + } +} + +/* + * wanted, by virtue of open file modes AND cap refs (buffered/cached data) + */ +int __ceph_caps_wanted(struct ceph_inode_info *ci) +{ + int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci); + if (S_ISDIR(ci->vfs_inode.i_mode)) { + /* we want EXCL if holding caps of dir ops */ + if (w & CEPH_CAP_ANY_DIR_OPS) + w |= CEPH_CAP_FILE_EXCL; + } else { + /* we want EXCL if dirty data */ + if (w & CEPH_CAP_FILE_BUFFER) + w |= CEPH_CAP_FILE_EXCL; + } + return w; +} + +/* + * Return caps we have registered with the MDS(s) as 'wanted'. + */ +int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check) +{ + struct ceph_cap *cap; + struct rb_node *p; + int mds_wanted = 0; + + for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { + cap = rb_entry(p, struct ceph_cap, ci_node); + if (check && !__cap_is_valid(cap)) + continue; + if (cap == ci->i_auth_cap) + mds_wanted |= cap->mds_wanted; + else + mds_wanted |= (cap->mds_wanted & ~CEPH_CAP_ANY_FILE_WR); + } + return mds_wanted; +} + +int ceph_is_any_caps(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int ret; + + spin_lock(&ci->i_ceph_lock); + ret = __ceph_is_any_real_caps(ci); + spin_unlock(&ci->i_ceph_lock); + + return ret; +} + +static void drop_inode_snap_realm(struct ceph_inode_info *ci) +{ + struct ceph_snap_realm *realm = ci->i_snap_realm; + spin_lock(&realm->inodes_with_caps_lock); + list_del_init(&ci->i_snap_realm_item); + ci->i_snap_realm_counter++; + ci->i_snap_realm = NULL; + if (realm->ino == ci->i_vino.ino) + realm->inode = NULL; + spin_unlock(&realm->inodes_with_caps_lock); + ceph_put_snap_realm(ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc, + realm); +} + +/* + * Remove a cap. Take steps to deal with a racing iterate_session_caps. + * + * caller should hold i_ceph_lock. + * caller will not hold session s_mutex if called from destroy_inode. + */ +void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) +{ + struct ceph_mds_session *session = cap->session; + struct ceph_inode_info *ci = cap->ci; + struct ceph_mds_client *mdsc; + int removed = 0; + + /* 'ci' being NULL means the remove have already occurred */ + if (!ci) { + dout("%s: cap inode is NULL\n", __func__); + return; + } + + dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode); + + mdsc = ceph_inode_to_client(&ci->vfs_inode)->mdsc; + + /* remove from inode's cap rbtree, and clear auth cap */ + rb_erase(&cap->ci_node, &ci->i_caps); + if (ci->i_auth_cap == cap) { + WARN_ON_ONCE(!list_empty(&ci->i_dirty_item)); + ci->i_auth_cap = NULL; + } + + /* remove from session list */ + spin_lock(&session->s_cap_lock); + if (session->s_cap_iterator == cap) { + /* not yet, we are iterating over this very cap */ + dout("__ceph_remove_cap delaying %p removal from session %p\n", + cap, cap->session); + } else { + list_del_init(&cap->session_caps); + session->s_nr_caps--; + atomic64_dec(&mdsc->metric.total_caps); + cap->session = NULL; + removed = 1; + } + /* protect backpointer with s_cap_lock: see iterate_session_caps */ + cap->ci = NULL; + + /* + * s_cap_reconnect is protected by s_cap_lock. no one changes + * s_cap_gen while session is in the reconnect state. + */ + if (queue_release && + (!session->s_cap_reconnect || cap->cap_gen == session->s_cap_gen)) { + cap->queue_release = 1; + if (removed) { + __ceph_queue_cap_release(session, cap); + removed = 0; + } + } else { + cap->queue_release = 0; + } + cap->cap_ino = ci->i_vino.ino; + + spin_unlock(&session->s_cap_lock); + + if (removed) + ceph_put_cap(mdsc, cap); + + if (!__ceph_is_any_real_caps(ci)) { + /* when reconnect denied, we remove session caps forcibly, + * i_wr_ref can be non-zero. If there are ongoing write, + * keep i_snap_realm. + */ + if (ci->i_wr_ref == 0 && ci->i_snap_realm) + drop_inode_snap_realm(ci); + + __cap_delay_cancel(mdsc, ci); + } +} + +struct cap_msg_args { + struct ceph_mds_session *session; + u64 ino, cid, follows; + u64 flush_tid, oldest_flush_tid, size, max_size; + u64 xattr_version; + u64 change_attr; + struct ceph_buffer *xattr_buf; + struct ceph_buffer *old_xattr_buf; + struct timespec64 atime, mtime, ctime, btime; + int op, caps, wanted, dirty; + u32 seq, issue_seq, mseq, time_warp_seq; + u32 flags; + kuid_t uid; + kgid_t gid; + umode_t mode; + bool inline_data; + bool wake; +}; + +/* + * cap struct size + flock buffer size + inline version + inline data size + + * osd_epoch_barrier + oldest_flush_tid + */ +#define CAP_MSG_SIZE (sizeof(struct ceph_mds_caps) + \ + 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4) + +/* Marshal up the cap msg to the MDS */ +static void encode_cap_msg(struct ceph_msg *msg, struct cap_msg_args *arg) +{ + struct ceph_mds_caps *fc; + void *p; + struct ceph_osd_client *osdc = &arg->session->s_mdsc->fsc->client->osdc; + + dout("%s %s %llx %llx caps %s wanted %s dirty %s seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu xattr_ver %llu xattr_len %d\n", + __func__, ceph_cap_op_name(arg->op), arg->cid, arg->ino, + ceph_cap_string(arg->caps), ceph_cap_string(arg->wanted), + ceph_cap_string(arg->dirty), arg->seq, arg->issue_seq, + arg->flush_tid, arg->oldest_flush_tid, arg->mseq, arg->follows, + arg->size, arg->max_size, arg->xattr_version, + arg->xattr_buf ? (int)arg->xattr_buf->vec.iov_len : 0); + + msg->hdr.version = cpu_to_le16(10); + msg->hdr.tid = cpu_to_le64(arg->flush_tid); + + fc = msg->front.iov_base; + memset(fc, 0, sizeof(*fc)); + + fc->cap_id = cpu_to_le64(arg->cid); + fc->op = cpu_to_le32(arg->op); + fc->seq = cpu_to_le32(arg->seq); + fc->issue_seq = cpu_to_le32(arg->issue_seq); + fc->migrate_seq = cpu_to_le32(arg->mseq); + fc->caps = cpu_to_le32(arg->caps); + fc->wanted = cpu_to_le32(arg->wanted); + fc->dirty = cpu_to_le32(arg->dirty); + fc->ino = cpu_to_le64(arg->ino); + fc->snap_follows = cpu_to_le64(arg->follows); + + fc->size = cpu_to_le64(arg->size); + fc->max_size = cpu_to_le64(arg->max_size); + ceph_encode_timespec64(&fc->mtime, &arg->mtime); + ceph_encode_timespec64(&fc->atime, &arg->atime); + ceph_encode_timespec64(&fc->ctime, &arg->ctime); + fc->time_warp_seq = cpu_to_le32(arg->time_warp_seq); + + fc->uid = cpu_to_le32(from_kuid(&init_user_ns, arg->uid)); + fc->gid = cpu_to_le32(from_kgid(&init_user_ns, arg->gid)); + fc->mode = cpu_to_le32(arg->mode); + + fc->xattr_version = cpu_to_le64(arg->xattr_version); + if (arg->xattr_buf) { + msg->middle = ceph_buffer_get(arg->xattr_buf); + fc->xattr_len = cpu_to_le32(arg->xattr_buf->vec.iov_len); + msg->hdr.middle_len = cpu_to_le32(arg->xattr_buf->vec.iov_len); + } + + p = fc + 1; + /* flock buffer size (version 2) */ + ceph_encode_32(&p, 0); + /* inline version (version 4) */ + ceph_encode_64(&p, arg->inline_data ? 0 : CEPH_INLINE_NONE); + /* inline data size */ + ceph_encode_32(&p, 0); + /* + * osd_epoch_barrier (version 5) + * The epoch_barrier is protected osdc->lock, so READ_ONCE here in + * case it was recently changed + */ + ceph_encode_32(&p, READ_ONCE(osdc->epoch_barrier)); + /* oldest_flush_tid (version 6) */ + ceph_encode_64(&p, arg->oldest_flush_tid); + + /* + * caller_uid/caller_gid (version 7) + * + * Currently, we don't properly track which caller dirtied the caps + * last, and force a flush of them when there is a conflict. For now, + * just set this to 0:0, to emulate how the MDS has worked up to now. + */ + ceph_encode_32(&p, 0); + ceph_encode_32(&p, 0); + + /* pool namespace (version 8) (mds always ignores this) */ + ceph_encode_32(&p, 0); + + /* btime and change_attr (version 9) */ + ceph_encode_timespec64(p, &arg->btime); + p += sizeof(struct ceph_timespec); + ceph_encode_64(&p, arg->change_attr); + + /* Advisory flags (version 10) */ + ceph_encode_32(&p, arg->flags); +} + +/* + * Queue cap releases when an inode is dropped from our cache. + */ +void __ceph_remove_caps(struct ceph_inode_info *ci) +{ + struct rb_node *p; + + /* lock i_ceph_lock, because ceph_d_revalidate(..., LOOKUP_RCU) + * may call __ceph_caps_issued_mask() on a freeing inode. */ + spin_lock(&ci->i_ceph_lock); + p = rb_first(&ci->i_caps); + while (p) { + struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node); + p = rb_next(p); + __ceph_remove_cap(cap, true); + } + spin_unlock(&ci->i_ceph_lock); +} + +/* + * Prepare to send a cap message to an MDS. Update the cap state, and populate + * the arg struct with the parameters that will need to be sent. This should + * be done under the i_ceph_lock to guard against changes to cap state. + * + * Make note of max_size reported/requested from mds, revoked caps + * that have now been implemented. + */ +static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap, + int op, int flags, int used, int want, int retain, + int flushing, u64 flush_tid, u64 oldest_flush_tid) +{ + struct ceph_inode_info *ci = cap->ci; + struct inode *inode = &ci->vfs_inode; + int held, revoking; + + lockdep_assert_held(&ci->i_ceph_lock); + + held = cap->issued | cap->implemented; + revoking = cap->implemented & ~cap->issued; + retain &= ~revoking; + + dout("%s %p cap %p session %p %s -> %s (revoking %s)\n", + __func__, inode, cap, cap->session, + ceph_cap_string(held), ceph_cap_string(held & retain), + ceph_cap_string(revoking)); + BUG_ON((retain & CEPH_CAP_PIN) == 0); + + ci->i_ceph_flags &= ~CEPH_I_FLUSH; + + cap->issued &= retain; /* drop bits we don't want */ + /* + * Wake up any waiters on wanted -> needed transition. This is due to + * the weird transition from buffered to sync IO... we need to flush + * dirty pages _before_ allowing sync writes to avoid reordering. + */ + arg->wake = cap->implemented & ~cap->issued; + cap->implemented &= cap->issued | used; + cap->mds_wanted = want; + + arg->session = cap->session; + arg->ino = ceph_vino(inode).ino; + arg->cid = cap->cap_id; + arg->follows = flushing ? ci->i_head_snapc->seq : 0; + arg->flush_tid = flush_tid; + arg->oldest_flush_tid = oldest_flush_tid; + + arg->size = inode->i_size; + ci->i_reported_size = arg->size; + arg->max_size = ci->i_wanted_max_size; + if (cap == ci->i_auth_cap) { + if (want & CEPH_CAP_ANY_FILE_WR) + ci->i_requested_max_size = arg->max_size; + else + ci->i_requested_max_size = 0; + } + + if (flushing & CEPH_CAP_XATTR_EXCL) { + arg->old_xattr_buf = __ceph_build_xattrs_blob(ci); + arg->xattr_version = ci->i_xattrs.version; + arg->xattr_buf = ci->i_xattrs.blob; + } else { + arg->xattr_buf = NULL; + arg->old_xattr_buf = NULL; + } + + arg->mtime = inode->i_mtime; + arg->atime = inode->i_atime; + arg->ctime = inode->i_ctime; + arg->btime = ci->i_btime; + arg->change_attr = inode_peek_iversion_raw(inode); + + arg->op = op; + arg->caps = cap->implemented; + arg->wanted = want; + arg->dirty = flushing; + + arg->seq = cap->seq; + arg->issue_seq = cap->issue_seq; + arg->mseq = cap->mseq; + arg->time_warp_seq = ci->i_time_warp_seq; + + arg->uid = inode->i_uid; + arg->gid = inode->i_gid; + arg->mode = inode->i_mode; + + arg->inline_data = ci->i_inline_version != CEPH_INLINE_NONE; + if (!(flags & CEPH_CLIENT_CAPS_PENDING_CAPSNAP) && + !list_empty(&ci->i_cap_snaps)) { + struct ceph_cap_snap *capsnap; + list_for_each_entry_reverse(capsnap, &ci->i_cap_snaps, ci_item) { + if (capsnap->cap_flush.tid) + break; + if (capsnap->need_flush) { + flags |= CEPH_CLIENT_CAPS_PENDING_CAPSNAP; + break; + } + } + } + arg->flags = flags; +} + +/* + * Send a cap msg on the given inode. + * + * Caller should hold snap_rwsem (read), s_mutex. + */ +static void __send_cap(struct cap_msg_args *arg, struct ceph_inode_info *ci) +{ + struct ceph_msg *msg; + struct inode *inode = &ci->vfs_inode; + + msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, CAP_MSG_SIZE, GFP_NOFS, false); + if (!msg) { + pr_err("error allocating cap msg: ino (%llx.%llx) flushing %s tid %llu, requeuing cap.\n", + ceph_vinop(inode), ceph_cap_string(arg->dirty), + arg->flush_tid); + spin_lock(&ci->i_ceph_lock); + __cap_delay_requeue(arg->session->s_mdsc, ci); + spin_unlock(&ci->i_ceph_lock); + return; + } + + encode_cap_msg(msg, arg); + ceph_con_send(&arg->session->s_con, msg); + ceph_buffer_put(arg->old_xattr_buf); + if (arg->wake) + wake_up_all(&ci->i_cap_wq); +} + +static inline int __send_flush_snap(struct inode *inode, + struct ceph_mds_session *session, + struct ceph_cap_snap *capsnap, + u32 mseq, u64 oldest_flush_tid) +{ + struct cap_msg_args arg; + struct ceph_msg *msg; + + msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, CAP_MSG_SIZE, GFP_NOFS, false); + if (!msg) + return -ENOMEM; + + arg.session = session; + arg.ino = ceph_vino(inode).ino; + arg.cid = 0; + arg.follows = capsnap->follows; + arg.flush_tid = capsnap->cap_flush.tid; + arg.oldest_flush_tid = oldest_flush_tid; + + arg.size = capsnap->size; + arg.max_size = 0; + arg.xattr_version = capsnap->xattr_version; + arg.xattr_buf = capsnap->xattr_blob; + arg.old_xattr_buf = NULL; + + arg.atime = capsnap->atime; + arg.mtime = capsnap->mtime; + arg.ctime = capsnap->ctime; + arg.btime = capsnap->btime; + arg.change_attr = capsnap->change_attr; + + arg.op = CEPH_CAP_OP_FLUSHSNAP; + arg.caps = capsnap->issued; + arg.wanted = 0; + arg.dirty = capsnap->dirty; + + arg.seq = 0; + arg.issue_seq = 0; + arg.mseq = mseq; + arg.time_warp_seq = capsnap->time_warp_seq; + + arg.uid = capsnap->uid; + arg.gid = capsnap->gid; + arg.mode = capsnap->mode; + + arg.inline_data = capsnap->inline_data; + arg.flags = 0; + arg.wake = false; + + encode_cap_msg(msg, &arg); + ceph_con_send(&arg.session->s_con, msg); + return 0; +} + +/* + * When a snapshot is taken, clients accumulate dirty metadata on + * inodes with capabilities in ceph_cap_snaps to describe the file + * state at the time the snapshot was taken. This must be flushed + * asynchronously back to the MDS once sync writes complete and dirty + * data is written out. + * + * Called under i_ceph_lock. Takes s_mutex as needed. + */ +static void __ceph_flush_snaps(struct ceph_inode_info *ci, + struct ceph_mds_session *session) + __releases(ci->i_ceph_lock) + __acquires(ci->i_ceph_lock) +{ + struct inode *inode = &ci->vfs_inode; + struct ceph_mds_client *mdsc = session->s_mdsc; + struct ceph_cap_snap *capsnap; + u64 oldest_flush_tid = 0; + u64 first_tid = 1, last_tid = 0; + + dout("__flush_snaps %p session %p\n", inode, session); + + list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { + /* + * we need to wait for sync writes to complete and for dirty + * pages to be written out. + */ + if (capsnap->dirty_pages || capsnap->writing) + break; + + /* should be removed by ceph_try_drop_cap_snap() */ + BUG_ON(!capsnap->need_flush); + + /* only flush each capsnap once */ + if (capsnap->cap_flush.tid > 0) { + dout(" already flushed %p, skipping\n", capsnap); + continue; + } + + spin_lock(&mdsc->cap_dirty_lock); + capsnap->cap_flush.tid = ++mdsc->last_cap_flush_tid; + list_add_tail(&capsnap->cap_flush.g_list, + &mdsc->cap_flush_list); + if (oldest_flush_tid == 0) + oldest_flush_tid = __get_oldest_flush_tid(mdsc); + if (list_empty(&ci->i_flushing_item)) { + list_add_tail(&ci->i_flushing_item, + &session->s_cap_flushing); + } + spin_unlock(&mdsc->cap_dirty_lock); + + list_add_tail(&capsnap->cap_flush.i_list, + &ci->i_cap_flush_list); + + if (first_tid == 1) + first_tid = capsnap->cap_flush.tid; + last_tid = capsnap->cap_flush.tid; + } + + ci->i_ceph_flags &= ~CEPH_I_FLUSH_SNAPS; + + while (first_tid <= last_tid) { + struct ceph_cap *cap = ci->i_auth_cap; + struct ceph_cap_flush *cf; + int ret; + + if (!(cap && cap->session == session)) { + dout("__flush_snaps %p auth cap %p not mds%d, " + "stop\n", inode, cap, session->s_mds); + break; + } + + ret = -ENOENT; + list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) { + if (cf->tid >= first_tid) { + ret = 0; + break; + } + } + if (ret < 0) + break; + + first_tid = cf->tid + 1; + + capsnap = container_of(cf, struct ceph_cap_snap, cap_flush); + refcount_inc(&capsnap->nref); + spin_unlock(&ci->i_ceph_lock); + + dout("__flush_snaps %p capsnap %p tid %llu %s\n", + inode, capsnap, cf->tid, ceph_cap_string(capsnap->dirty)); + + ret = __send_flush_snap(inode, session, capsnap, cap->mseq, + oldest_flush_tid); + if (ret < 0) { + pr_err("__flush_snaps: error sending cap flushsnap, " + "ino (%llx.%llx) tid %llu follows %llu\n", + ceph_vinop(inode), cf->tid, capsnap->follows); + } + + ceph_put_cap_snap(capsnap); + spin_lock(&ci->i_ceph_lock); + } +} + +void ceph_flush_snaps(struct ceph_inode_info *ci, + struct ceph_mds_session **psession) +{ + struct inode *inode = &ci->vfs_inode; + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; + struct ceph_mds_session *session = NULL; + bool need_put = false; + int mds; + + dout("ceph_flush_snaps %p\n", inode); + if (psession) + session = *psession; +retry: + spin_lock(&ci->i_ceph_lock); + if (!(ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)) { + dout(" no capsnap needs flush, doing nothing\n"); + goto out; + } + if (!ci->i_auth_cap) { + dout(" no auth cap (migrating?), doing nothing\n"); + goto out; + } + + mds = ci->i_auth_cap->session->s_mds; + if (session && session->s_mds != mds) { + dout(" oops, wrong session %p mutex\n", session); + mutex_unlock(&session->s_mutex); + ceph_put_mds_session(session); + session = NULL; + } + if (!session) { + spin_unlock(&ci->i_ceph_lock); + mutex_lock(&mdsc->mutex); + session = __ceph_lookup_mds_session(mdsc, mds); + mutex_unlock(&mdsc->mutex); + if (session) { + dout(" inverting session/ino locks on %p\n", session); + mutex_lock(&session->s_mutex); + } + goto retry; + } + + // make sure flushsnap messages are sent in proper order. + if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) + __kick_flushing_caps(mdsc, session, ci, 0); + + __ceph_flush_snaps(ci, session); +out: + spin_unlock(&ci->i_ceph_lock); + + if (psession) { + *psession = session; + } else if (session) { + mutex_unlock(&session->s_mutex); + ceph_put_mds_session(session); + } + /* we flushed them all; remove this inode from the queue */ + spin_lock(&mdsc->snap_flush_lock); + if (!list_empty(&ci->i_snap_flush_item)) + need_put = true; + list_del_init(&ci->i_snap_flush_item); + spin_unlock(&mdsc->snap_flush_lock); + + if (need_put) + iput(inode); +} + +/* + * Mark caps dirty. If inode is newly dirty, return the dirty flags. + * Caller is then responsible for calling __mark_inode_dirty with the + * returned flags value. + */ +int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask, + struct ceph_cap_flush **pcf) +{ + struct ceph_mds_client *mdsc = + ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; + struct inode *inode = &ci->vfs_inode; + int was = ci->i_dirty_caps; + int dirty = 0; + + lockdep_assert_held(&ci->i_ceph_lock); + + if (!ci->i_auth_cap) { + pr_warn("__mark_dirty_caps %p %llx mask %s, " + "but no auth cap (session was closed?)\n", + inode, ceph_ino(inode), ceph_cap_string(mask)); + return 0; + } + + dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode, + ceph_cap_string(mask), ceph_cap_string(was), + ceph_cap_string(was | mask)); + ci->i_dirty_caps |= mask; + if (was == 0) { + struct ceph_mds_session *session = ci->i_auth_cap->session; + + WARN_ON_ONCE(ci->i_prealloc_cap_flush); + swap(ci->i_prealloc_cap_flush, *pcf); + + if (!ci->i_head_snapc) { + WARN_ON_ONCE(!rwsem_is_locked(&mdsc->snap_rwsem)); + ci->i_head_snapc = ceph_get_snap_context( + ci->i_snap_realm->cached_context); + } + dout(" inode %p now dirty snapc %p auth cap %p\n", + &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap); + BUG_ON(!list_empty(&ci->i_dirty_item)); + spin_lock(&mdsc->cap_dirty_lock); + list_add(&ci->i_dirty_item, &session->s_cap_dirty); + spin_unlock(&mdsc->cap_dirty_lock); + if (ci->i_flushing_caps == 0) { + ihold(inode); + dirty |= I_DIRTY_SYNC; + } + } else { + WARN_ON_ONCE(!ci->i_prealloc_cap_flush); + } + BUG_ON(list_empty(&ci->i_dirty_item)); + if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) && + (mask & CEPH_CAP_FILE_BUFFER)) + dirty |= I_DIRTY_DATASYNC; + __cap_delay_requeue(mdsc, ci); + return dirty; +} + +struct ceph_cap_flush *ceph_alloc_cap_flush(void) +{ + struct ceph_cap_flush *cf; + + cf = kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL); + if (!cf) + return NULL; + + cf->is_capsnap = false; + return cf; +} + +void ceph_free_cap_flush(struct ceph_cap_flush *cf) +{ + if (cf) + kmem_cache_free(ceph_cap_flush_cachep, cf); +} + +static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc) +{ + if (!list_empty(&mdsc->cap_flush_list)) { + struct ceph_cap_flush *cf = + list_first_entry(&mdsc->cap_flush_list, + struct ceph_cap_flush, g_list); + return cf->tid; + } + return 0; +} + +/* + * Remove cap_flush from the mdsc's or inode's flushing cap list. + * Return true if caller needs to wake up flush waiters. + */ +static bool __detach_cap_flush_from_mdsc(struct ceph_mds_client *mdsc, + struct ceph_cap_flush *cf) +{ + struct ceph_cap_flush *prev; + bool wake = cf->wake; + + if (wake && cf->g_list.prev != &mdsc->cap_flush_list) { + prev = list_prev_entry(cf, g_list); + prev->wake = true; + wake = false; + } + list_del_init(&cf->g_list); + return wake; +} + +static bool __detach_cap_flush_from_ci(struct ceph_inode_info *ci, + struct ceph_cap_flush *cf) +{ + struct ceph_cap_flush *prev; + bool wake = cf->wake; + + if (wake && cf->i_list.prev != &ci->i_cap_flush_list) { + prev = list_prev_entry(cf, i_list); + prev->wake = true; + wake = false; + } + list_del_init(&cf->i_list); + return wake; +} + +/* + * Add dirty inode to the flushing list. Assigned a seq number so we + * can wait for caps to flush without starving. + * + * Called under i_ceph_lock. Returns the flush tid. + */ +static u64 __mark_caps_flushing(struct inode *inode, + struct ceph_mds_session *session, bool wake, + u64 *oldest_flush_tid) +{ + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_cap_flush *cf = NULL; + int flushing; + + lockdep_assert_held(&ci->i_ceph_lock); + BUG_ON(ci->i_dirty_caps == 0); + BUG_ON(list_empty(&ci->i_dirty_item)); + BUG_ON(!ci->i_prealloc_cap_flush); + + flushing = ci->i_dirty_caps; + dout("__mark_caps_flushing flushing %s, flushing_caps %s -> %s\n", + ceph_cap_string(flushing), + ceph_cap_string(ci->i_flushing_caps), + ceph_cap_string(ci->i_flushing_caps | flushing)); + ci->i_flushing_caps |= flushing; + ci->i_dirty_caps = 0; + dout(" inode %p now !dirty\n", inode); + + swap(cf, ci->i_prealloc_cap_flush); + cf->caps = flushing; + cf->wake = wake; + + spin_lock(&mdsc->cap_dirty_lock); + list_del_init(&ci->i_dirty_item); + + cf->tid = ++mdsc->last_cap_flush_tid; + list_add_tail(&cf->g_list, &mdsc->cap_flush_list); + *oldest_flush_tid = __get_oldest_flush_tid(mdsc); + + if (list_empty(&ci->i_flushing_item)) { + list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing); + mdsc->num_cap_flushing++; + } + spin_unlock(&mdsc->cap_dirty_lock); + + list_add_tail(&cf->i_list, &ci->i_cap_flush_list); + + return cf->tid; +} + +/* + * try to invalidate mapping pages without blocking. + */ +static int try_nonblocking_invalidate(struct inode *inode) + __releases(ci->i_ceph_lock) + __acquires(ci->i_ceph_lock) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + u32 invalidating_gen = ci->i_rdcache_gen; + + spin_unlock(&ci->i_ceph_lock); + ceph_fscache_invalidate(inode); + invalidate_mapping_pages(&inode->i_data, 0, -1); + spin_lock(&ci->i_ceph_lock); + + if (inode->i_data.nrpages == 0 && + invalidating_gen == ci->i_rdcache_gen) { + /* success. */ + dout("try_nonblocking_invalidate %p success\n", inode); + /* save any racing async invalidate some trouble */ + ci->i_rdcache_revoking = ci->i_rdcache_gen - 1; + return 0; + } + dout("try_nonblocking_invalidate %p failed\n", inode); + return -1; +} + +bool __ceph_should_report_size(struct ceph_inode_info *ci) +{ + loff_t size = ci->vfs_inode.i_size; + /* mds will adjust max size according to the reported size */ + if (ci->i_flushing_caps & CEPH_CAP_FILE_WR) + return false; + if (size >= ci->i_max_size) + return true; + /* half of previous max_size increment has been used */ + if (ci->i_max_size > ci->i_reported_size && + (size << 1) >= ci->i_max_size + ci->i_reported_size) + return true; + return false; +} + +/* + * Swiss army knife function to examine currently used and wanted + * versus held caps. Release, flush, ack revoked caps to mds as + * appropriate. + * + * CHECK_CAPS_AUTHONLY - we should only check the auth cap + * CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without + * further delay. + */ +void ceph_check_caps(struct ceph_inode_info *ci, int flags, + struct ceph_mds_session *session) +{ + struct inode *inode = &ci->vfs_inode; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); + struct ceph_cap *cap; + u64 flush_tid, oldest_flush_tid; + int file_wanted, used, cap_used; + int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */ + int issued, implemented, want, retain, revoking, flushing = 0; + int mds = -1; /* keep track of how far we've gone through i_caps list + to avoid an infinite loop on retry */ + struct rb_node *p; + bool queue_invalidate = false; + bool tried_invalidate = false; + + spin_lock(&ci->i_ceph_lock); + if (ci->i_ceph_flags & CEPH_I_FLUSH) + flags |= CHECK_CAPS_FLUSH; + + goto retry_locked; +retry: + spin_lock(&ci->i_ceph_lock); +retry_locked: + /* Caps wanted by virtue of active open files. */ + file_wanted = __ceph_caps_file_wanted(ci); + + /* Caps which have active references against them */ + used = __ceph_caps_used(ci); + + /* + * "issued" represents the current caps that the MDS wants us to have. + * "implemented" is the set that we have been granted, and includes the + * ones that have not yet been returned to the MDS (the "revoking" set, + * usually because they have outstanding references). + */ + issued = __ceph_caps_issued(ci, &implemented); + revoking = implemented & ~issued; + + want = file_wanted; + + /* The ones we currently want to retain (may be adjusted below) */ + retain = file_wanted | used | CEPH_CAP_PIN; + if (!mdsc->stopping && inode->i_nlink > 0) { + if (file_wanted) { + retain |= CEPH_CAP_ANY; /* be greedy */ + } else if (S_ISDIR(inode->i_mode) && + (issued & CEPH_CAP_FILE_SHARED) && + __ceph_dir_is_complete(ci)) { + /* + * If a directory is complete, we want to keep + * the exclusive cap. So that MDS does not end up + * revoking the shared cap on every create/unlink + * operation. + */ + if (IS_RDONLY(inode)) { + want = CEPH_CAP_ANY_SHARED; + } else { + want |= CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL; + } + retain |= want; + } else { + + retain |= CEPH_CAP_ANY_SHARED; + /* + * keep RD only if we didn't have the file open RW, + * because then the mds would revoke it anyway to + * journal max_size=0. + */ + if (ci->i_max_size == 0) + retain |= CEPH_CAP_ANY_RD; + } + } + + dout("check_caps %p file_want %s used %s dirty %s flushing %s" + " issued %s revoking %s retain %s %s%s\n", inode, + ceph_cap_string(file_wanted), + ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps), + ceph_cap_string(ci->i_flushing_caps), + ceph_cap_string(issued), ceph_cap_string(revoking), + ceph_cap_string(retain), + (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "", + (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : ""); + + /* + * If we no longer need to hold onto old our caps, and we may + * have cached pages, but don't want them, then try to invalidate. + * If we fail, it's because pages are locked.... try again later. + */ + if ((!(flags & CHECK_CAPS_NOINVAL) || mdsc->stopping) && + S_ISREG(inode->i_mode) && + !(ci->i_wb_ref || ci->i_wrbuffer_ref) && /* no dirty pages... */ + inode->i_data.nrpages && /* have cached pages */ + (revoking & (CEPH_CAP_FILE_CACHE| + CEPH_CAP_FILE_LAZYIO)) && /* or revoking cache */ + !tried_invalidate) { + dout("check_caps trying to invalidate on %p\n", inode); + if (try_nonblocking_invalidate(inode) < 0) { + dout("check_caps queuing invalidate\n"); + queue_invalidate = true; + ci->i_rdcache_revoking = ci->i_rdcache_gen; + } + tried_invalidate = true; + goto retry_locked; + } + + for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { + int mflags = 0; + struct cap_msg_args arg; + + cap = rb_entry(p, struct ceph_cap, ci_node); + + /* avoid looping forever */ + if (mds >= cap->mds || + ((flags & CHECK_CAPS_AUTHONLY) && cap != ci->i_auth_cap)) + continue; + + /* NOTE: no side-effects allowed, until we take s_mutex */ + + /* + * If we have an auth cap, we don't need to consider any + * overlapping caps as used. + */ + cap_used = used; + if (ci->i_auth_cap && cap != ci->i_auth_cap) + cap_used &= ~ci->i_auth_cap->issued; + + revoking = cap->implemented & ~cap->issued; + dout(" mds%d cap %p used %s issued %s implemented %s revoking %s\n", + cap->mds, cap, ceph_cap_string(cap_used), + ceph_cap_string(cap->issued), + ceph_cap_string(cap->implemented), + ceph_cap_string(revoking)); + + if (cap == ci->i_auth_cap && + (cap->issued & CEPH_CAP_FILE_WR)) { + /* request larger max_size from MDS? */ + if (ci->i_wanted_max_size > ci->i_max_size && + ci->i_wanted_max_size > ci->i_requested_max_size) { + dout("requesting new max_size\n"); + goto ack; + } + + /* approaching file_max? */ + if (__ceph_should_report_size(ci)) { + dout("i_size approaching max_size\n"); + goto ack; + } + } + /* flush anything dirty? */ + if (cap == ci->i_auth_cap) { + if ((flags & CHECK_CAPS_FLUSH) && ci->i_dirty_caps) { + dout("flushing dirty caps\n"); + goto ack; + } + if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS) { + dout("flushing snap caps\n"); + goto ack; + } + } + + /* completed revocation? going down and there are no caps? */ + if (revoking && (revoking & cap_used) == 0) { + dout("completed revocation of %s\n", + ceph_cap_string(cap->implemented & ~cap->issued)); + goto ack; + } + + /* want more caps from mds? */ + if (want & ~cap->mds_wanted) { + if (want & ~(cap->mds_wanted | cap->issued)) + goto ack; + if (!__cap_is_valid(cap)) + goto ack; + } + + /* things we might delay */ + if ((cap->issued & ~retain) == 0) + continue; /* nope, all good */ + +ack: + if (session && session != cap->session) { + dout("oops, wrong session %p mutex\n", session); + mutex_unlock(&session->s_mutex); + session = NULL; + } + if (!session) { + session = cap->session; + if (mutex_trylock(&session->s_mutex) == 0) { + dout("inverting session/ino locks on %p\n", + session); + session = ceph_get_mds_session(session); + spin_unlock(&ci->i_ceph_lock); + if (took_snap_rwsem) { + up_read(&mdsc->snap_rwsem); + took_snap_rwsem = 0; + } + if (session) { + mutex_lock(&session->s_mutex); + ceph_put_mds_session(session); + } else { + /* + * Because we take the reference while + * holding the i_ceph_lock, it should + * never be NULL. Throw a warning if it + * ever is. + */ + WARN_ON_ONCE(true); + } + goto retry; + } + } + + /* kick flushing and flush snaps before sending normal + * cap message */ + if (cap == ci->i_auth_cap && + (ci->i_ceph_flags & + (CEPH_I_KICK_FLUSH | CEPH_I_FLUSH_SNAPS))) { + if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) + __kick_flushing_caps(mdsc, session, ci, 0); + if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS) + __ceph_flush_snaps(ci, session); + + goto retry_locked; + } + + /* take snap_rwsem after session mutex */ + if (!took_snap_rwsem) { + if (down_read_trylock(&mdsc->snap_rwsem) == 0) { + dout("inverting snap/in locks on %p\n", + inode); + spin_unlock(&ci->i_ceph_lock); + down_read(&mdsc->snap_rwsem); + took_snap_rwsem = 1; + goto retry; + } + took_snap_rwsem = 1; + } + + if (cap == ci->i_auth_cap && ci->i_dirty_caps) { + flushing = ci->i_dirty_caps; + flush_tid = __mark_caps_flushing(inode, session, false, + &oldest_flush_tid); + if (flags & CHECK_CAPS_FLUSH && + list_empty(&session->s_cap_dirty)) + mflags |= CEPH_CLIENT_CAPS_SYNC; + } else { + flushing = 0; + flush_tid = 0; + spin_lock(&mdsc->cap_dirty_lock); + oldest_flush_tid = __get_oldest_flush_tid(mdsc); + spin_unlock(&mdsc->cap_dirty_lock); + } + + mds = cap->mds; /* remember mds, so we don't repeat */ + + __prep_cap(&arg, cap, CEPH_CAP_OP_UPDATE, mflags, cap_used, + want, retain, flushing, flush_tid, oldest_flush_tid); + spin_unlock(&ci->i_ceph_lock); + + __send_cap(&arg, ci); + + goto retry; /* retake i_ceph_lock and restart our cap scan. */ + } + + /* periodically re-calculate caps wanted by open files */ + if (__ceph_is_any_real_caps(ci) && + list_empty(&ci->i_cap_delay_list) && + (file_wanted & ~CEPH_CAP_PIN) && + !(used & (CEPH_CAP_FILE_RD | CEPH_CAP_ANY_FILE_WR))) { + __cap_delay_requeue(mdsc, ci); + } + + spin_unlock(&ci->i_ceph_lock); + + if (queue_invalidate) + ceph_queue_invalidate(inode); + + if (session) + mutex_unlock(&session->s_mutex); + if (took_snap_rwsem) + up_read(&mdsc->snap_rwsem); +} + +/* + * Try to flush dirty caps back to the auth mds. + */ +static int try_flush_caps(struct inode *inode, u64 *ptid) +{ + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mds_session *session = NULL; + int flushing = 0; + u64 flush_tid = 0, oldest_flush_tid = 0; + +retry: + spin_lock(&ci->i_ceph_lock); +retry_locked: + if (ci->i_dirty_caps && ci->i_auth_cap) { + struct ceph_cap *cap = ci->i_auth_cap; + struct cap_msg_args arg; + + if (session != cap->session) { + spin_unlock(&ci->i_ceph_lock); + if (session) + mutex_unlock(&session->s_mutex); + session = cap->session; + mutex_lock(&session->s_mutex); + goto retry; + } + if (cap->session->s_state < CEPH_MDS_SESSION_OPEN) { + spin_unlock(&ci->i_ceph_lock); + goto out; + } + + if (ci->i_ceph_flags & + (CEPH_I_KICK_FLUSH | CEPH_I_FLUSH_SNAPS)) { + if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) + __kick_flushing_caps(mdsc, session, ci, 0); + if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS) + __ceph_flush_snaps(ci, session); + goto retry_locked; + } + + flushing = ci->i_dirty_caps; + flush_tid = __mark_caps_flushing(inode, session, true, + &oldest_flush_tid); + + __prep_cap(&arg, cap, CEPH_CAP_OP_FLUSH, CEPH_CLIENT_CAPS_SYNC, + __ceph_caps_used(ci), __ceph_caps_wanted(ci), + (cap->issued | cap->implemented), + flushing, flush_tid, oldest_flush_tid); + spin_unlock(&ci->i_ceph_lock); + + __send_cap(&arg, ci); + } else { + if (!list_empty(&ci->i_cap_flush_list)) { + struct ceph_cap_flush *cf = + list_last_entry(&ci->i_cap_flush_list, + struct ceph_cap_flush, i_list); + cf->wake = true; + flush_tid = cf->tid; + } + flushing = ci->i_flushing_caps; + spin_unlock(&ci->i_ceph_lock); + } +out: + if (session) + mutex_unlock(&session->s_mutex); + + *ptid = flush_tid; + return flushing; +} + +/* + * Return true if we've flushed caps through the given flush_tid. + */ +static int caps_are_flushed(struct inode *inode, u64 flush_tid) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int ret = 1; + + spin_lock(&ci->i_ceph_lock); + if (!list_empty(&ci->i_cap_flush_list)) { + struct ceph_cap_flush * cf = + list_first_entry(&ci->i_cap_flush_list, + struct ceph_cap_flush, i_list); + if (cf->tid <= flush_tid) + ret = 0; + } + spin_unlock(&ci->i_ceph_lock); + return ret; +} + +/* + * wait for any unsafe requests to complete. + */ +static int unsafe_request_wait(struct inode *inode) +{ + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mds_request *req1 = NULL, *req2 = NULL; + int ret, err = 0; + + spin_lock(&ci->i_unsafe_lock); + if (S_ISDIR(inode->i_mode) && !list_empty(&ci->i_unsafe_dirops)) { + req1 = list_last_entry(&ci->i_unsafe_dirops, + struct ceph_mds_request, + r_unsafe_dir_item); + ceph_mdsc_get_request(req1); + } + if (!list_empty(&ci->i_unsafe_iops)) { + req2 = list_last_entry(&ci->i_unsafe_iops, + struct ceph_mds_request, + r_unsafe_target_item); + ceph_mdsc_get_request(req2); + } + spin_unlock(&ci->i_unsafe_lock); + + /* + * Trigger to flush the journal logs in all the relevant MDSes + * manually, or in the worst case we must wait at most 5 seconds + * to wait the journal logs to be flushed by the MDSes periodically. + */ + if (req1 || req2) { + struct ceph_mds_request *req; + struct ceph_mds_session **sessions; + struct ceph_mds_session *s; + unsigned int max_sessions; + int i; + + mutex_lock(&mdsc->mutex); + max_sessions = mdsc->max_sessions; + + sessions = kcalloc(max_sessions, sizeof(s), GFP_KERNEL); + if (!sessions) { + mutex_unlock(&mdsc->mutex); + err = -ENOMEM; + goto out; + } + + spin_lock(&ci->i_unsafe_lock); + if (req1) { + list_for_each_entry(req, &ci->i_unsafe_dirops, + r_unsafe_dir_item) { + s = req->r_session; + if (!s) + continue; + if (!sessions[s->s_mds]) { + s = ceph_get_mds_session(s); + sessions[s->s_mds] = s; + } + } + } + if (req2) { + list_for_each_entry(req, &ci->i_unsafe_iops, + r_unsafe_target_item) { + s = req->r_session; + if (!s) + continue; + if (!sessions[s->s_mds]) { + s = ceph_get_mds_session(s); + sessions[s->s_mds] = s; + } + } + } + spin_unlock(&ci->i_unsafe_lock); + + /* the auth MDS */ + spin_lock(&ci->i_ceph_lock); + if (ci->i_auth_cap) { + s = ci->i_auth_cap->session; + if (!sessions[s->s_mds]) + sessions[s->s_mds] = ceph_get_mds_session(s); + } + spin_unlock(&ci->i_ceph_lock); + mutex_unlock(&mdsc->mutex); + + /* send flush mdlog request to MDSes */ + for (i = 0; i < max_sessions; i++) { + s = sessions[i]; + if (s) { + send_flush_mdlog(s); + ceph_put_mds_session(s); + } + } + kfree(sessions); + } + + dout("unsafe_request_wait %p wait on tid %llu %llu\n", + inode, req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL); + if (req1) { + ret = !wait_for_completion_timeout(&req1->r_safe_completion, + ceph_timeout_jiffies(req1->r_timeout)); + if (ret) + err = -EIO; + } + if (req2) { + ret = !wait_for_completion_timeout(&req2->r_safe_completion, + ceph_timeout_jiffies(req2->r_timeout)); + if (ret) + err = -EIO; + } + +out: + if (req1) + ceph_mdsc_put_request(req1); + if (req2) + ceph_mdsc_put_request(req2); + return err; +} + +int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) +{ + struct inode *inode = file->f_mapping->host; + struct ceph_inode_info *ci = ceph_inode(inode); + u64 flush_tid; + int ret, err; + int dirty; + + dout("fsync %p%s\n", inode, datasync ? " datasync" : ""); + + ret = file_write_and_wait_range(file, start, end); + if (datasync) + goto out; + + ret = ceph_wait_on_async_create(inode); + if (ret) + goto out; + + dirty = try_flush_caps(inode, &flush_tid); + dout("fsync dirty caps are %s\n", ceph_cap_string(dirty)); + + err = unsafe_request_wait(inode); + + /* + * only wait on non-file metadata writeback (the mds + * can recover size and mtime, so we don't need to + * wait for that) + */ + if (!err && (dirty & ~CEPH_CAP_ANY_FILE_WR)) { + err = wait_event_interruptible(ci->i_cap_wq, + caps_are_flushed(inode, flush_tid)); + } + + if (err < 0) + ret = err; + + err = file_check_and_advance_wb_err(file); + if (err < 0) + ret = err; +out: + dout("fsync %p%s result=%d\n", inode, datasync ? " datasync" : "", ret); + return ret; +} + +/* + * Flush any dirty caps back to the mds. If we aren't asked to wait, + * queue inode for flush but don't do so immediately, because we can + * get by with fewer MDS messages if we wait for data writeback to + * complete first. + */ +int ceph_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + u64 flush_tid; + int err = 0; + int dirty; + int wait = (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync); + + dout("write_inode %p wait=%d\n", inode, wait); + if (wait) { + dirty = try_flush_caps(inode, &flush_tid); + if (dirty) + err = wait_event_interruptible(ci->i_cap_wq, + caps_are_flushed(inode, flush_tid)); + } else { + struct ceph_mds_client *mdsc = + ceph_sb_to_client(inode->i_sb)->mdsc; + + spin_lock(&ci->i_ceph_lock); + if (__ceph_caps_dirty(ci)) + __cap_delay_requeue_front(mdsc, ci); + spin_unlock(&ci->i_ceph_lock); + } + return err; +} + +static void __kick_flushing_caps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + struct ceph_inode_info *ci, + u64 oldest_flush_tid) + __releases(ci->i_ceph_lock) + __acquires(ci->i_ceph_lock) +{ + struct inode *inode = &ci->vfs_inode; + struct ceph_cap *cap; + struct ceph_cap_flush *cf; + int ret; + u64 first_tid = 0; + u64 last_snap_flush = 0; + + ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH; + + list_for_each_entry_reverse(cf, &ci->i_cap_flush_list, i_list) { + if (cf->is_capsnap) { + last_snap_flush = cf->tid; + break; + } + } + + list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) { + if (cf->tid < first_tid) + continue; + + cap = ci->i_auth_cap; + if (!(cap && cap->session == session)) { + pr_err("%p auth cap %p not mds%d ???\n", + inode, cap, session->s_mds); + break; + } + + first_tid = cf->tid + 1; + + if (!cf->is_capsnap) { + struct cap_msg_args arg; + + dout("kick_flushing_caps %p cap %p tid %llu %s\n", + inode, cap, cf->tid, ceph_cap_string(cf->caps)); + __prep_cap(&arg, cap, CEPH_CAP_OP_FLUSH, + (cf->tid < last_snap_flush ? + CEPH_CLIENT_CAPS_PENDING_CAPSNAP : 0), + __ceph_caps_used(ci), + __ceph_caps_wanted(ci), + (cap->issued | cap->implemented), + cf->caps, cf->tid, oldest_flush_tid); + spin_unlock(&ci->i_ceph_lock); + __send_cap(&arg, ci); + } else { + struct ceph_cap_snap *capsnap = + container_of(cf, struct ceph_cap_snap, + cap_flush); + dout("kick_flushing_caps %p capsnap %p tid %llu %s\n", + inode, capsnap, cf->tid, + ceph_cap_string(capsnap->dirty)); + + refcount_inc(&capsnap->nref); + spin_unlock(&ci->i_ceph_lock); + + ret = __send_flush_snap(inode, session, capsnap, cap->mseq, + oldest_flush_tid); + if (ret < 0) { + pr_err("kick_flushing_caps: error sending " + "cap flushsnap, ino (%llx.%llx) " + "tid %llu follows %llu\n", + ceph_vinop(inode), cf->tid, + capsnap->follows); + } + + ceph_put_cap_snap(capsnap); + } + + spin_lock(&ci->i_ceph_lock); + } +} + +void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_inode_info *ci; + struct ceph_cap *cap; + u64 oldest_flush_tid; + + dout("early_kick_flushing_caps mds%d\n", session->s_mds); + + spin_lock(&mdsc->cap_dirty_lock); + oldest_flush_tid = __get_oldest_flush_tid(mdsc); + spin_unlock(&mdsc->cap_dirty_lock); + + list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) { + spin_lock(&ci->i_ceph_lock); + cap = ci->i_auth_cap; + if (!(cap && cap->session == session)) { + pr_err("%p auth cap %p not mds%d ???\n", + &ci->vfs_inode, cap, session->s_mds); + spin_unlock(&ci->i_ceph_lock); + continue; + } + + + /* + * if flushing caps were revoked, we re-send the cap flush + * in client reconnect stage. This guarantees MDS * processes + * the cap flush message before issuing the flushing caps to + * other client. + */ + if ((cap->issued & ci->i_flushing_caps) != + ci->i_flushing_caps) { + /* encode_caps_cb() also will reset these sequence + * numbers. make sure sequence numbers in cap flush + * message match later reconnect message */ + cap->seq = 0; + cap->issue_seq = 0; + cap->mseq = 0; + __kick_flushing_caps(mdsc, session, ci, + oldest_flush_tid); + } else { + ci->i_ceph_flags |= CEPH_I_KICK_FLUSH; + } + + spin_unlock(&ci->i_ceph_lock); + } +} + +void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_inode_info *ci; + struct ceph_cap *cap; + u64 oldest_flush_tid; + + lockdep_assert_held(&session->s_mutex); + + dout("kick_flushing_caps mds%d\n", session->s_mds); + + spin_lock(&mdsc->cap_dirty_lock); + oldest_flush_tid = __get_oldest_flush_tid(mdsc); + spin_unlock(&mdsc->cap_dirty_lock); + + list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) { + spin_lock(&ci->i_ceph_lock); + cap = ci->i_auth_cap; + if (!(cap && cap->session == session)) { + pr_err("%p auth cap %p not mds%d ???\n", + &ci->vfs_inode, cap, session->s_mds); + spin_unlock(&ci->i_ceph_lock); + continue; + } + if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) { + __kick_flushing_caps(mdsc, session, ci, + oldest_flush_tid); + } + spin_unlock(&ci->i_ceph_lock); + } +} + +void ceph_kick_flushing_inode_caps(struct ceph_mds_session *session, + struct ceph_inode_info *ci) +{ + struct ceph_mds_client *mdsc = session->s_mdsc; + struct ceph_cap *cap = ci->i_auth_cap; + + lockdep_assert_held(&ci->i_ceph_lock); + + dout("%s %p flushing %s\n", __func__, &ci->vfs_inode, + ceph_cap_string(ci->i_flushing_caps)); + + if (!list_empty(&ci->i_cap_flush_list)) { + u64 oldest_flush_tid; + spin_lock(&mdsc->cap_dirty_lock); + list_move_tail(&ci->i_flushing_item, + &cap->session->s_cap_flushing); + oldest_flush_tid = __get_oldest_flush_tid(mdsc); + spin_unlock(&mdsc->cap_dirty_lock); + + __kick_flushing_caps(mdsc, session, ci, oldest_flush_tid); + } +} + + +/* + * Take references to capabilities we hold, so that we don't release + * them to the MDS prematurely. + */ +void ceph_take_cap_refs(struct ceph_inode_info *ci, int got, + bool snap_rwsem_locked) +{ + lockdep_assert_held(&ci->i_ceph_lock); + + if (got & CEPH_CAP_PIN) + ci->i_pin_ref++; + if (got & CEPH_CAP_FILE_RD) + ci->i_rd_ref++; + if (got & CEPH_CAP_FILE_CACHE) + ci->i_rdcache_ref++; + if (got & CEPH_CAP_FILE_EXCL) + ci->i_fx_ref++; + if (got & CEPH_CAP_FILE_WR) { + if (ci->i_wr_ref == 0 && !ci->i_head_snapc) { + BUG_ON(!snap_rwsem_locked); + ci->i_head_snapc = ceph_get_snap_context( + ci->i_snap_realm->cached_context); + } + ci->i_wr_ref++; + } + if (got & CEPH_CAP_FILE_BUFFER) { + if (ci->i_wb_ref == 0) + ihold(&ci->vfs_inode); + ci->i_wb_ref++; + dout("%s %p wb %d -> %d (?)\n", __func__, + &ci->vfs_inode, ci->i_wb_ref-1, ci->i_wb_ref); + } +} + +/* + * Try to grab cap references. Specify those refs we @want, and the + * minimal set we @need. Also include the larger offset we are writing + * to (when applicable), and check against max_size here as well. + * Note that caller is responsible for ensuring max_size increases are + * requested from the MDS. + * + * Returns 0 if caps were not able to be acquired (yet), 1 if succeed, + * or a negative error code. There are 3 speical error codes: + * -EAGAIN: need to sleep but non-blocking is specified + * -EFBIG: ask caller to call check_max_size() and try again. + * -ESTALE: ask caller to call ceph_renew_caps() and try again. + */ +enum { + /* first 8 bits are reserved for CEPH_FILE_MODE_FOO */ + NON_BLOCKING = (1 << 8), + CHECK_FILELOCK = (1 << 9), +}; + +static int try_get_cap_refs(struct inode *inode, int need, int want, + loff_t endoff, int flags, int *got) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; + int ret = 0; + int have, implemented; + bool snap_rwsem_locked = false; + + dout("get_cap_refs %p need %s want %s\n", inode, + ceph_cap_string(need), ceph_cap_string(want)); + +again: + spin_lock(&ci->i_ceph_lock); + + if ((flags & CHECK_FILELOCK) && + (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK)) { + dout("try_get_cap_refs %p error filelock\n", inode); + ret = -EIO; + goto out_unlock; + } + + /* finish pending truncate */ + while (ci->i_truncate_pending) { + spin_unlock(&ci->i_ceph_lock); + if (snap_rwsem_locked) { + up_read(&mdsc->snap_rwsem); + snap_rwsem_locked = false; + } + __ceph_do_pending_vmtruncate(inode); + spin_lock(&ci->i_ceph_lock); + } + + have = __ceph_caps_issued(ci, &implemented); + + if (have & need & CEPH_CAP_FILE_WR) { + if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) { + dout("get_cap_refs %p endoff %llu > maxsize %llu\n", + inode, endoff, ci->i_max_size); + if (endoff > ci->i_requested_max_size) + ret = ci->i_auth_cap ? -EFBIG : -ESTALE; + goto out_unlock; + } + /* + * If a sync write is in progress, we must wait, so that we + * can get a final snapshot value for size+mtime. + */ + if (__ceph_have_pending_cap_snap(ci)) { + dout("get_cap_refs %p cap_snap_pending\n", inode); + goto out_unlock; + } + } + + if ((have & need) == need) { + /* + * Look at (implemented & ~have & not) so that we keep waiting + * on transition from wanted -> needed caps. This is needed + * for WRBUFFER|WR -> WR to avoid a new WR sync write from + * going before a prior buffered writeback happens. + */ + int not = want & ~(have & need); + int revoking = implemented & ~have; + dout("get_cap_refs %p have %s but not %s (revoking %s)\n", + inode, ceph_cap_string(have), ceph_cap_string(not), + ceph_cap_string(revoking)); + if ((revoking & not) == 0) { + if (!snap_rwsem_locked && + !ci->i_head_snapc && + (need & CEPH_CAP_FILE_WR)) { + if (!down_read_trylock(&mdsc->snap_rwsem)) { + /* + * we can not call down_read() when + * task isn't in TASK_RUNNING state + */ + if (flags & NON_BLOCKING) { + ret = -EAGAIN; + goto out_unlock; + } + + spin_unlock(&ci->i_ceph_lock); + down_read(&mdsc->snap_rwsem); + snap_rwsem_locked = true; + goto again; + } + snap_rwsem_locked = true; + } + if ((have & want) == want) + *got = need | want; + else + *got = need; + if (S_ISREG(inode->i_mode) && + (need & CEPH_CAP_FILE_RD) && + !(*got & CEPH_CAP_FILE_CACHE)) + ceph_disable_fscache_readpage(ci); + ceph_take_cap_refs(ci, *got, true); + ret = 1; + } + } else { + int session_readonly = false; + int mds_wanted; + if (ci->i_auth_cap && + (need & (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_EXCL))) { + struct ceph_mds_session *s = ci->i_auth_cap->session; + spin_lock(&s->s_cap_lock); + session_readonly = s->s_readonly; + spin_unlock(&s->s_cap_lock); + } + if (session_readonly) { + dout("get_cap_refs %p need %s but mds%d readonly\n", + inode, ceph_cap_string(need), ci->i_auth_cap->mds); + ret = -EROFS; + goto out_unlock; + } + + if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { + dout("get_cap_refs %p forced umount\n", inode); + ret = -EIO; + goto out_unlock; + } + mds_wanted = __ceph_caps_mds_wanted(ci, false); + if (need & ~mds_wanted) { + dout("get_cap_refs %p need %s > mds_wanted %s\n", + inode, ceph_cap_string(need), + ceph_cap_string(mds_wanted)); + ret = -ESTALE; + goto out_unlock; + } + + dout("get_cap_refs %p have %s need %s\n", inode, + ceph_cap_string(have), ceph_cap_string(need)); + } +out_unlock: + + __ceph_touch_fmode(ci, mdsc, flags); + + spin_unlock(&ci->i_ceph_lock); + if (snap_rwsem_locked) + up_read(&mdsc->snap_rwsem); + + if (!ret) + ceph_update_cap_mis(&mdsc->metric); + else if (ret == 1) + ceph_update_cap_hit(&mdsc->metric); + + dout("get_cap_refs %p ret %d got %s\n", inode, + ret, ceph_cap_string(*got)); + return ret; +} + +/* + * Check the offset we are writing up to against our current + * max_size. If necessary, tell the MDS we want to write to + * a larger offset. + */ +static void check_max_size(struct inode *inode, loff_t endoff) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int check = 0; + + /* do we need to explicitly request a larger max_size? */ + spin_lock(&ci->i_ceph_lock); + if (endoff >= ci->i_max_size && endoff > ci->i_wanted_max_size) { + dout("write %p at large endoff %llu, req max_size\n", + inode, endoff); + ci->i_wanted_max_size = endoff; + } + /* duplicate ceph_check_caps()'s logic */ + if (ci->i_auth_cap && + (ci->i_auth_cap->issued & CEPH_CAP_FILE_WR) && + ci->i_wanted_max_size > ci->i_max_size && + ci->i_wanted_max_size > ci->i_requested_max_size) + check = 1; + spin_unlock(&ci->i_ceph_lock); + if (check) + ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); +} + +static inline int get_used_fmode(int caps) +{ + int fmode = 0; + if (caps & CEPH_CAP_FILE_RD) + fmode |= CEPH_FILE_MODE_RD; + if (caps & CEPH_CAP_FILE_WR) + fmode |= CEPH_FILE_MODE_WR; + return fmode; +} + +int ceph_try_get_caps(struct inode *inode, int need, int want, + bool nonblock, int *got) +{ + int ret, flags; + + BUG_ON(need & ~CEPH_CAP_FILE_RD); + BUG_ON(want & ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO | + CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL | + CEPH_CAP_ANY_DIR_OPS)); + if (need) { + ret = ceph_pool_perm_check(inode, need); + if (ret < 0) + return ret; + } + + flags = get_used_fmode(need | want); + if (nonblock) + flags |= NON_BLOCKING; + + ret = try_get_cap_refs(inode, need, want, 0, flags, got); + /* three special error codes */ + if (ret == -EAGAIN || ret == -EFBIG || ret == -ESTALE) + ret = 0; + return ret; +} + +/* + * Wait for caps, and take cap references. If we can't get a WR cap + * due to a small max_size, make sure we check_max_size (and possibly + * ask the mds) so we don't get hung up indefinitely. + */ +int ceph_get_caps(struct file *filp, int need, int want, + loff_t endoff, int *got, struct page **pinned_page) +{ + struct ceph_file_info *fi = filp->private_data; + struct inode *inode = file_inode(filp); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + int ret, _got, flags; + + ret = ceph_pool_perm_check(inode, need); + if (ret < 0) + return ret; + + if ((fi->fmode & CEPH_FILE_MODE_WR) && + fi->filp_gen != READ_ONCE(fsc->filp_gen)) + return -EBADF; + + flags = get_used_fmode(need | want); + + while (true) { + flags &= CEPH_FILE_MODE_MASK; + if (vfs_inode_has_locks(inode)) + flags |= CHECK_FILELOCK; + _got = 0; + ret = try_get_cap_refs(inode, need, want, endoff, + flags, &_got); + WARN_ON_ONCE(ret == -EAGAIN); + if (!ret) { + struct ceph_mds_client *mdsc = fsc->mdsc; + struct cap_wait cw; + DEFINE_WAIT_FUNC(wait, woken_wake_function); + + cw.ino = ceph_ino(inode); + cw.tgid = current->tgid; + cw.need = need; + cw.want = want; + + spin_lock(&mdsc->caps_list_lock); + list_add(&cw.list, &mdsc->cap_wait_list); + spin_unlock(&mdsc->caps_list_lock); + + /* make sure used fmode not timeout */ + ceph_get_fmode(ci, flags, FMODE_WAIT_BIAS); + add_wait_queue(&ci->i_cap_wq, &wait); + + flags |= NON_BLOCKING; + while (!(ret = try_get_cap_refs(inode, need, want, + endoff, flags, &_got))) { + if (signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); + } + + remove_wait_queue(&ci->i_cap_wq, &wait); + ceph_put_fmode(ci, flags, FMODE_WAIT_BIAS); + + spin_lock(&mdsc->caps_list_lock); + list_del(&cw.list); + spin_unlock(&mdsc->caps_list_lock); + + if (ret == -EAGAIN) + continue; + } + + if ((fi->fmode & CEPH_FILE_MODE_WR) && + fi->filp_gen != READ_ONCE(fsc->filp_gen)) { + if (ret >= 0 && _got) + ceph_put_cap_refs(ci, _got); + return -EBADF; + } + + if (ret < 0) { + if (ret == -EFBIG || ret == -ESTALE) { + int ret2 = ceph_wait_on_async_create(inode); + if (ret2 < 0) + return ret2; + } + if (ret == -EFBIG) { + check_max_size(inode, endoff); + continue; + } + if (ret == -ESTALE) { + /* session was killed, try renew caps */ + ret = ceph_renew_caps(inode, flags); + if (ret == 0) + continue; + } + return ret; + } + + if (S_ISREG(ci->vfs_inode.i_mode) && + ci->i_inline_version != CEPH_INLINE_NONE && + (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && + i_size_read(inode) > 0) { + struct page *page = + find_get_page(inode->i_mapping, 0); + if (page) { + if (PageUptodate(page)) { + *pinned_page = page; + break; + } + put_page(page); + } + /* + * drop cap refs first because getattr while + * holding * caps refs can cause deadlock. + */ + ceph_put_cap_refs(ci, _got); + _got = 0; + + /* + * getattr request will bring inline data into + * page cache + */ + ret = __ceph_do_getattr(inode, NULL, + CEPH_STAT_CAP_INLINE_DATA, + true); + if (ret < 0) + return ret; + continue; + } + break; + } + + if (S_ISREG(ci->vfs_inode.i_mode) && + (_got & CEPH_CAP_FILE_RD) && (_got & CEPH_CAP_FILE_CACHE)) + ceph_fscache_revalidate_cookie(ci); + + *got = _got; + return 0; +} + +/* + * Take cap refs. Caller must already know we hold at least one ref + * on the caps in question or we don't know this is safe. + */ +void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps) +{ + spin_lock(&ci->i_ceph_lock); + ceph_take_cap_refs(ci, caps, false); + spin_unlock(&ci->i_ceph_lock); +} + + +/* + * drop cap_snap that is not associated with any snapshot. + * we don't need to send FLUSHSNAP message for it. + */ +static int ceph_try_drop_cap_snap(struct ceph_inode_info *ci, + struct ceph_cap_snap *capsnap) +{ + if (!capsnap->need_flush && + !capsnap->writing && !capsnap->dirty_pages) { + dout("dropping cap_snap %p follows %llu\n", + capsnap, capsnap->follows); + BUG_ON(capsnap->cap_flush.tid > 0); + ceph_put_snap_context(capsnap->context); + if (!list_is_last(&capsnap->ci_item, &ci->i_cap_snaps)) + ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS; + + list_del(&capsnap->ci_item); + ceph_put_cap_snap(capsnap); + return 1; + } + return 0; +} + +/* + * Release cap refs. + * + * If we released the last ref on any given cap, call ceph_check_caps + * to release (or schedule a release). + * + * If we are releasing a WR cap (from a sync write), finalize any affected + * cap_snap, and wake up any waiters. + */ +static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had, + bool skip_checking_caps) +{ + struct inode *inode = &ci->vfs_inode; + int last = 0, put = 0, flushsnaps = 0, wake = 0; + + spin_lock(&ci->i_ceph_lock); + if (had & CEPH_CAP_PIN) + --ci->i_pin_ref; + if (had & CEPH_CAP_FILE_RD) + if (--ci->i_rd_ref == 0) + last++; + if (had & CEPH_CAP_FILE_CACHE) + if (--ci->i_rdcache_ref == 0) + last++; + if (had & CEPH_CAP_FILE_EXCL) + if (--ci->i_fx_ref == 0) + last++; + if (had & CEPH_CAP_FILE_BUFFER) { + if (--ci->i_wb_ref == 0) { + last++; + put++; + } + dout("put_cap_refs %p wb %d -> %d (?)\n", + inode, ci->i_wb_ref+1, ci->i_wb_ref); + } + if (had & CEPH_CAP_FILE_WR) + if (--ci->i_wr_ref == 0) { + last++; + if (__ceph_have_pending_cap_snap(ci)) { + struct ceph_cap_snap *capsnap = + list_last_entry(&ci->i_cap_snaps, + struct ceph_cap_snap, + ci_item); + capsnap->writing = 0; + if (ceph_try_drop_cap_snap(ci, capsnap)) + put++; + else if (__ceph_finish_cap_snap(ci, capsnap)) + flushsnaps = 1; + wake = 1; + } + if (ci->i_wrbuffer_ref_head == 0 && + ci->i_dirty_caps == 0 && + ci->i_flushing_caps == 0) { + BUG_ON(!ci->i_head_snapc); + ceph_put_snap_context(ci->i_head_snapc); + ci->i_head_snapc = NULL; + } + /* see comment in __ceph_remove_cap() */ + if (!__ceph_is_any_real_caps(ci) && ci->i_snap_realm) + drop_inode_snap_realm(ci); + } + spin_unlock(&ci->i_ceph_lock); + + dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had), + last ? " last" : "", put ? " put" : ""); + + if (!skip_checking_caps) { + if (last) + ceph_check_caps(ci, 0, NULL); + else if (flushsnaps) + ceph_flush_snaps(ci, NULL); + } + if (wake) + wake_up_all(&ci->i_cap_wq); + while (put-- > 0) + iput(inode); +} + +void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) +{ + __ceph_put_cap_refs(ci, had, false); +} + +void ceph_put_cap_refs_no_check_caps(struct ceph_inode_info *ci, int had) +{ + __ceph_put_cap_refs(ci, had, true); +} + +/* + * Release @nr WRBUFFER refs on dirty pages for the given @snapc snap + * context. Adjust per-snap dirty page accounting as appropriate. + * Once all dirty data for a cap_snap is flushed, flush snapped file + * metadata back to the MDS. If we dropped the last ref, call + * ceph_check_caps. + */ +void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, + struct ceph_snap_context *snapc) +{ + struct inode *inode = &ci->vfs_inode; + struct ceph_cap_snap *capsnap = NULL; + int put = 0; + bool last = false; + bool found = false; + bool flush_snaps = false; + bool complete_capsnap = false; + + spin_lock(&ci->i_ceph_lock); + ci->i_wrbuffer_ref -= nr; + if (ci->i_wrbuffer_ref == 0) { + last = true; + put++; + } + + if (ci->i_head_snapc == snapc) { + ci->i_wrbuffer_ref_head -= nr; + if (ci->i_wrbuffer_ref_head == 0 && + ci->i_wr_ref == 0 && + ci->i_dirty_caps == 0 && + ci->i_flushing_caps == 0) { + BUG_ON(!ci->i_head_snapc); + ceph_put_snap_context(ci->i_head_snapc); + ci->i_head_snapc = NULL; + } + dout("put_wrbuffer_cap_refs on %p head %d/%d -> %d/%d %s\n", + inode, + ci->i_wrbuffer_ref+nr, ci->i_wrbuffer_ref_head+nr, + ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head, + last ? " LAST" : ""); + } else { + list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { + if (capsnap->context == snapc) { + found = true; + break; + } + } + + if (!found) { + /* + * The capsnap should already be removed when removing + * auth cap in the case of a forced unmount. + */ + WARN_ON_ONCE(ci->i_auth_cap); + goto unlock; + } + + capsnap->dirty_pages -= nr; + if (capsnap->dirty_pages == 0) { + complete_capsnap = true; + if (!capsnap->writing) { + if (ceph_try_drop_cap_snap(ci, capsnap)) { + put++; + } else { + ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS; + flush_snaps = true; + } + } + } + dout("put_wrbuffer_cap_refs on %p cap_snap %p " + " snap %lld %d/%d -> %d/%d %s%s\n", + inode, capsnap, capsnap->context->seq, + ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr, + ci->i_wrbuffer_ref, capsnap->dirty_pages, + last ? " (wrbuffer last)" : "", + complete_capsnap ? " (complete capsnap)" : ""); + } + +unlock: + spin_unlock(&ci->i_ceph_lock); + + if (last) { + ceph_check_caps(ci, 0, NULL); + } else if (flush_snaps) { + ceph_flush_snaps(ci, NULL); + } + if (complete_capsnap) + wake_up_all(&ci->i_cap_wq); + while (put-- > 0) { + /* avoid calling iput_final() in osd dispatch threads */ + ceph_async_iput(inode); + } +} + +/* + * Invalidate unlinked inode's aliases, so we can drop the inode ASAP. + */ +static void invalidate_aliases(struct inode *inode) +{ + struct dentry *dn, *prev = NULL; + + dout("invalidate_aliases inode %p\n", inode); + d_prune_aliases(inode); + /* + * For non-directory inode, d_find_alias() only returns + * hashed dentry. After calling d_invalidate(), the + * dentry becomes unhashed. + * + * For directory inode, d_find_alias() can return + * unhashed dentry. But directory inode should have + * one alias at most. + */ + while ((dn = d_find_alias(inode))) { + if (dn == prev) { + dput(dn); + break; + } + d_invalidate(dn); + if (prev) + dput(prev); + prev = dn; + } + if (prev) + dput(prev); +} + +struct cap_extra_info { + struct ceph_string *pool_ns; + /* inline data */ + u64 inline_version; + void *inline_data; + u32 inline_len; + /* dirstat */ + bool dirstat_valid; + u64 nfiles; + u64 nsubdirs; + u64 change_attr; + /* currently issued */ + int issued; + struct timespec64 btime; +}; + +/* + * Handle a cap GRANT message from the MDS. (Note that a GRANT may + * actually be a revocation if it specifies a smaller cap set.) + * + * caller holds s_mutex and i_ceph_lock, we drop both. + */ +static void handle_cap_grant(struct inode *inode, + struct ceph_mds_session *session, + struct ceph_cap *cap, + struct ceph_mds_caps *grant, + struct ceph_buffer *xattr_buf, + struct cap_extra_info *extra_info) + __releases(ci->i_ceph_lock) + __releases(session->s_mdsc->snap_rwsem) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int seq = le32_to_cpu(grant->seq); + int newcaps = le32_to_cpu(grant->caps); + int used, wanted, dirty; + u64 size = le64_to_cpu(grant->size); + u64 max_size = le64_to_cpu(grant->max_size); + unsigned char check_caps = 0; + bool was_stale = cap->cap_gen < session->s_cap_gen; + bool wake = false; + bool writeback = false; + bool queue_trunc = false; + bool queue_invalidate = false; + bool deleted_inode = false; + bool fill_inline = false; + + dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n", + inode, cap, session->s_mds, seq, ceph_cap_string(newcaps)); + dout(" size %llu max_size %llu, i_size %llu\n", size, max_size, + inode->i_size); + + + /* + * If CACHE is being revoked, and we have no dirty buffers, + * try to invalidate (once). (If there are dirty buffers, we + * will invalidate _after_ writeback.) + */ + if (S_ISREG(inode->i_mode) && /* don't invalidate readdir cache */ + ((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) && + (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 && + !(ci->i_wrbuffer_ref || ci->i_wb_ref)) { + if (try_nonblocking_invalidate(inode)) { + /* there were locked pages.. invalidate later + in a separate thread. */ + if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { + queue_invalidate = true; + ci->i_rdcache_revoking = ci->i_rdcache_gen; + } + } + } + + if (was_stale) + cap->issued = cap->implemented = CEPH_CAP_PIN; + + /* + * auth mds of the inode changed. we received the cap export message, + * but still haven't received the cap import message. handle_cap_export + * updated the new auth MDS' cap. + * + * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing a message + * that was sent before the cap import message. So don't remove caps. + */ + if (ceph_seq_cmp(seq, cap->seq) <= 0) { + WARN_ON(cap != ci->i_auth_cap); + WARN_ON(cap->cap_id != le64_to_cpu(grant->cap_id)); + seq = cap->seq; + newcaps |= cap->issued; + } + + /* side effects now are allowed */ + cap->cap_gen = session->s_cap_gen; + cap->seq = seq; + + __check_cap_issue(ci, cap, newcaps); + + inode_set_max_iversion_raw(inode, extra_info->change_attr); + + if ((newcaps & CEPH_CAP_AUTH_SHARED) && + (extra_info->issued & CEPH_CAP_AUTH_EXCL) == 0) { + inode->i_mode = le32_to_cpu(grant->mode); + inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(grant->uid)); + inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(grant->gid)); + ci->i_btime = extra_info->btime; + dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode, + from_kuid(&init_user_ns, inode->i_uid), + from_kgid(&init_user_ns, inode->i_gid)); + } + + if ((newcaps & CEPH_CAP_LINK_SHARED) && + (extra_info->issued & CEPH_CAP_LINK_EXCL) == 0) { + set_nlink(inode, le32_to_cpu(grant->nlink)); + if (inode->i_nlink == 0 && + (newcaps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL))) + deleted_inode = true; + } + + if ((extra_info->issued & CEPH_CAP_XATTR_EXCL) == 0 && + grant->xattr_len) { + int len = le32_to_cpu(grant->xattr_len); + u64 version = le64_to_cpu(grant->xattr_version); + + if (version > ci->i_xattrs.version) { + dout(" got new xattrs v%llu on %p len %d\n", + version, inode, len); + if (ci->i_xattrs.blob) + ceph_buffer_put(ci->i_xattrs.blob); + ci->i_xattrs.blob = ceph_buffer_get(xattr_buf); + ci->i_xattrs.version = version; + ceph_forget_all_cached_acls(inode); + ceph_security_invalidate_secctx(inode); + } + } + + if (newcaps & CEPH_CAP_ANY_RD) { + struct timespec64 mtime, atime, ctime; + /* ctime/mtime/atime? */ + ceph_decode_timespec64(&mtime, &grant->mtime); + ceph_decode_timespec64(&atime, &grant->atime); + ceph_decode_timespec64(&ctime, &grant->ctime); + ceph_fill_file_time(inode, extra_info->issued, + le32_to_cpu(grant->time_warp_seq), + &ctime, &mtime, &atime); + } + + if ((newcaps & CEPH_CAP_FILE_SHARED) && extra_info->dirstat_valid) { + ci->i_files = extra_info->nfiles; + ci->i_subdirs = extra_info->nsubdirs; + } + + if (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) { + /* file layout may have changed */ + s64 old_pool = ci->i_layout.pool_id; + struct ceph_string *old_ns; + + ceph_file_layout_from_legacy(&ci->i_layout, &grant->layout); + old_ns = rcu_dereference_protected(ci->i_layout.pool_ns, + lockdep_is_held(&ci->i_ceph_lock)); + rcu_assign_pointer(ci->i_layout.pool_ns, extra_info->pool_ns); + + if (ci->i_layout.pool_id != old_pool || + extra_info->pool_ns != old_ns) + ci->i_ceph_flags &= ~CEPH_I_POOL_PERM; + + extra_info->pool_ns = old_ns; + + /* size/truncate_seq? */ + queue_trunc = ceph_fill_file_size(inode, extra_info->issued, + le32_to_cpu(grant->truncate_seq), + le64_to_cpu(grant->truncate_size), + size); + } + + if (ci->i_auth_cap == cap && (newcaps & CEPH_CAP_ANY_FILE_WR)) { + if (max_size != ci->i_max_size) { + dout("max_size %lld -> %llu\n", + ci->i_max_size, max_size); + ci->i_max_size = max_size; + if (max_size >= ci->i_wanted_max_size) { + ci->i_wanted_max_size = 0; /* reset */ + ci->i_requested_max_size = 0; + } + wake = true; + } + } + + /* check cap bits */ + wanted = __ceph_caps_wanted(ci); + used = __ceph_caps_used(ci); + dirty = __ceph_caps_dirty(ci); + dout(" my wanted = %s, used = %s, dirty %s\n", + ceph_cap_string(wanted), + ceph_cap_string(used), + ceph_cap_string(dirty)); + + if ((was_stale || le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) && + (wanted & ~(cap->mds_wanted | newcaps))) { + /* + * If mds is importing cap, prior cap messages that update + * 'wanted' may get dropped by mds (migrate seq mismatch). + * + * We don't send cap message to update 'wanted' if what we + * want are already issued. If mds revokes caps, cap message + * that releases caps also tells mds what we want. But if + * caps got revoked by mds forcedly (session stale). We may + * haven't told mds what we want. + */ + check_caps = 1; + } + + /* revocation, grant, or no-op? */ + if (cap->issued & ~newcaps) { + int revoking = cap->issued & ~newcaps; + + dout("revocation: %s -> %s (revoking %s)\n", + ceph_cap_string(cap->issued), + ceph_cap_string(newcaps), + ceph_cap_string(revoking)); + if (S_ISREG(inode->i_mode) && + (revoking & used & CEPH_CAP_FILE_BUFFER)) + writeback = true; /* initiate writeback; will delay ack */ + else if (queue_invalidate && + revoking == CEPH_CAP_FILE_CACHE && + (newcaps & CEPH_CAP_FILE_LAZYIO) == 0) + ; /* do nothing yet, invalidation will be queued */ + else if (cap == ci->i_auth_cap) + check_caps = 1; /* check auth cap only */ + else + check_caps = 2; /* check all caps */ + cap->issued = newcaps; + cap->implemented |= newcaps; + } else if (cap->issued == newcaps) { + dout("caps unchanged: %s -> %s\n", + ceph_cap_string(cap->issued), ceph_cap_string(newcaps)); + } else { + dout("grant: %s -> %s\n", ceph_cap_string(cap->issued), + ceph_cap_string(newcaps)); + /* non-auth MDS is revoking the newly grant caps ? */ + if (cap == ci->i_auth_cap && + __ceph_caps_revoking_other(ci, cap, newcaps)) + check_caps = 2; + + cap->issued = newcaps; + cap->implemented |= newcaps; /* add bits only, to + * avoid stepping on a + * pending revocation */ + wake = true; + } + BUG_ON(cap->issued & ~cap->implemented); + + /* don't let check_caps skip sending a response to MDS for revoke msgs */ + if (le32_to_cpu(grant->op) == CEPH_CAP_OP_REVOKE) { + cap->mds_wanted = 0; + if (cap == ci->i_auth_cap) + check_caps = 1; /* check auth cap only */ + else + check_caps = 2; /* check all caps */ + } + + if (extra_info->inline_version > 0 && + extra_info->inline_version >= ci->i_inline_version) { + ci->i_inline_version = extra_info->inline_version; + if (ci->i_inline_version != CEPH_INLINE_NONE && + (newcaps & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO))) + fill_inline = true; + } + + if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) { + if (ci->i_auth_cap == cap) { + if (newcaps & ~extra_info->issued) + wake = true; + + if (ci->i_requested_max_size > max_size || + !(le32_to_cpu(grant->wanted) & CEPH_CAP_ANY_FILE_WR)) { + /* re-request max_size if necessary */ + ci->i_requested_max_size = 0; + wake = true; + } + + ceph_kick_flushing_inode_caps(session, ci); + } + up_read(&session->s_mdsc->snap_rwsem); + } + spin_unlock(&ci->i_ceph_lock); + + if (fill_inline) + ceph_fill_inline_data(inode, NULL, extra_info->inline_data, + extra_info->inline_len); + + if (queue_trunc) + ceph_queue_vmtruncate(inode); + + if (writeback) + /* + * queue inode for writeback: we can't actually call + * filemap_write_and_wait, etc. from message handler + * context. + */ + ceph_queue_writeback(inode); + if (queue_invalidate) + ceph_queue_invalidate(inode); + if (deleted_inode) + invalidate_aliases(inode); + if (wake) + wake_up_all(&ci->i_cap_wq); + + if (check_caps == 1) + ceph_check_caps(ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_NOINVAL, + session); + else if (check_caps == 2) + ceph_check_caps(ci, CHECK_CAPS_NOINVAL, session); + else + mutex_unlock(&session->s_mutex); +} + +/* + * Handle FLUSH_ACK from MDS, indicating that metadata we sent to the + * MDS has been safely committed. + */ +static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, + struct ceph_mds_caps *m, + struct ceph_mds_session *session, + struct ceph_cap *cap) + __releases(ci->i_ceph_lock) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_cap_flush *cf, *tmp_cf; + LIST_HEAD(to_remove); + unsigned seq = le32_to_cpu(m->seq); + int dirty = le32_to_cpu(m->dirty); + int cleaned = 0; + bool drop = false; + bool wake_ci = false; + bool wake_mdsc = false; + + list_for_each_entry_safe(cf, tmp_cf, &ci->i_cap_flush_list, i_list) { + /* Is this the one that was flushed? */ + if (cf->tid == flush_tid) + cleaned = cf->caps; + + /* Is this a capsnap? */ + if (cf->is_capsnap) + continue; + + if (cf->tid <= flush_tid) { + /* + * An earlier or current tid. The FLUSH_ACK should + * represent a superset of this flush's caps. + */ + wake_ci |= __detach_cap_flush_from_ci(ci, cf); + list_add_tail(&cf->i_list, &to_remove); + } else { + /* + * This is a later one. Any caps in it are still dirty + * so don't count them as cleaned. + */ + cleaned &= ~cf->caps; + if (!cleaned) + break; + } + } + + dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s," + " flushing %s -> %s\n", + inode, session->s_mds, seq, ceph_cap_string(dirty), + ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps), + ceph_cap_string(ci->i_flushing_caps & ~cleaned)); + + if (list_empty(&to_remove) && !cleaned) + goto out; + + ci->i_flushing_caps &= ~cleaned; + + spin_lock(&mdsc->cap_dirty_lock); + + list_for_each_entry(cf, &to_remove, i_list) + wake_mdsc |= __detach_cap_flush_from_mdsc(mdsc, cf); + + if (ci->i_flushing_caps == 0) { + if (list_empty(&ci->i_cap_flush_list)) { + list_del_init(&ci->i_flushing_item); + if (!list_empty(&session->s_cap_flushing)) { + dout(" mds%d still flushing cap on %p\n", + session->s_mds, + &list_first_entry(&session->s_cap_flushing, + struct ceph_inode_info, + i_flushing_item)->vfs_inode); + } + } + mdsc->num_cap_flushing--; + dout(" inode %p now !flushing\n", inode); + + if (ci->i_dirty_caps == 0) { + dout(" inode %p now clean\n", inode); + BUG_ON(!list_empty(&ci->i_dirty_item)); + drop = true; + if (ci->i_wr_ref == 0 && + ci->i_wrbuffer_ref_head == 0) { + BUG_ON(!ci->i_head_snapc); + ceph_put_snap_context(ci->i_head_snapc); + ci->i_head_snapc = NULL; + } + } else { + BUG_ON(list_empty(&ci->i_dirty_item)); + } + } + spin_unlock(&mdsc->cap_dirty_lock); + +out: + spin_unlock(&ci->i_ceph_lock); + + while (!list_empty(&to_remove)) { + cf = list_first_entry(&to_remove, + struct ceph_cap_flush, i_list); + list_del_init(&cf->i_list); + if (!cf->is_capsnap) + ceph_free_cap_flush(cf); + } + + if (wake_ci) + wake_up_all(&ci->i_cap_wq); + if (wake_mdsc) + wake_up_all(&mdsc->cap_flushing_wq); + if (drop) + iput(inode); +} + +void __ceph_remove_capsnap(struct inode *inode, struct ceph_cap_snap *capsnap, + bool *wake_ci, bool *wake_mdsc) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + bool ret; + + lockdep_assert_held(&ci->i_ceph_lock); + + dout("removing capsnap %p, inode %p ci %p\n", capsnap, inode, ci); + + list_del_init(&capsnap->ci_item); + ret = __detach_cap_flush_from_ci(ci, &capsnap->cap_flush); + if (wake_ci) + *wake_ci = ret; + + spin_lock(&mdsc->cap_dirty_lock); + if (list_empty(&ci->i_cap_flush_list)) + list_del_init(&ci->i_flushing_item); + + ret = __detach_cap_flush_from_mdsc(mdsc, &capsnap->cap_flush); + if (wake_mdsc) + *wake_mdsc = ret; + spin_unlock(&mdsc->cap_dirty_lock); +} + +void ceph_remove_capsnap(struct inode *inode, struct ceph_cap_snap *capsnap, + bool *wake_ci, bool *wake_mdsc) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + lockdep_assert_held(&ci->i_ceph_lock); + + WARN_ON_ONCE(capsnap->dirty_pages || capsnap->writing); + __ceph_remove_capsnap(inode, capsnap, wake_ci, wake_mdsc); +} + +/* + * Handle FLUSHSNAP_ACK. MDS has flushed snap data to disk and we can + * throw away our cap_snap. + * + * Caller hold s_mutex. + */ +static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, + struct ceph_mds_caps *m, + struct ceph_mds_session *session) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + u64 follows = le64_to_cpu(m->snap_follows); + struct ceph_cap_snap *capsnap; + bool flushed = false; + bool wake_ci = false; + bool wake_mdsc = false; + + dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n", + inode, ci, session->s_mds, follows); + + spin_lock(&ci->i_ceph_lock); + list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { + if (capsnap->follows == follows) { + if (capsnap->cap_flush.tid != flush_tid) { + dout(" cap_snap %p follows %lld tid %lld !=" + " %lld\n", capsnap, follows, + flush_tid, capsnap->cap_flush.tid); + break; + } + flushed = true; + break; + } else { + dout(" skipping cap_snap %p follows %lld\n", + capsnap, capsnap->follows); + } + } + if (flushed) + ceph_remove_capsnap(inode, capsnap, &wake_ci, &wake_mdsc); + spin_unlock(&ci->i_ceph_lock); + + if (flushed) { + ceph_put_snap_context(capsnap->context); + ceph_put_cap_snap(capsnap); + if (wake_ci) + wake_up_all(&ci->i_cap_wq); + if (wake_mdsc) + wake_up_all(&mdsc->cap_flushing_wq); + iput(inode); + } +} + +/* + * Handle TRUNC from MDS, indicating file truncation. + * + * caller hold s_mutex. + */ +static bool handle_cap_trunc(struct inode *inode, + struct ceph_mds_caps *trunc, + struct ceph_mds_session *session) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int mds = session->s_mds; + int seq = le32_to_cpu(trunc->seq); + u32 truncate_seq = le32_to_cpu(trunc->truncate_seq); + u64 truncate_size = le64_to_cpu(trunc->truncate_size); + u64 size = le64_to_cpu(trunc->size); + int implemented = 0; + int dirty = __ceph_caps_dirty(ci); + int issued = __ceph_caps_issued(ceph_inode(inode), &implemented); + bool queue_trunc = false; + + lockdep_assert_held(&ci->i_ceph_lock); + + issued |= implemented | dirty; + + dout("handle_cap_trunc inode %p mds%d seq %d to %lld seq %d\n", + inode, mds, seq, truncate_size, truncate_seq); + queue_trunc = ceph_fill_file_size(inode, issued, + truncate_seq, truncate_size, size); + return queue_trunc; +} + +/* + * Handle EXPORT from MDS. Cap is being migrated _from_ this mds to a + * different one. If we are the most recent migration we've seen (as + * indicated by mseq), make note of the migrating cap bits for the + * duration (until we see the corresponding IMPORT). + * + * caller holds s_mutex + */ +static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, + struct ceph_mds_cap_peer *ph, + struct ceph_mds_session *session) +{ + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; + struct ceph_mds_session *tsession = NULL; + struct ceph_cap *cap, *tcap, *new_cap = NULL; + struct ceph_inode_info *ci = ceph_inode(inode); + u64 t_cap_id; + unsigned mseq = le32_to_cpu(ex->migrate_seq); + unsigned t_seq, t_mseq; + int target, issued; + int mds = session->s_mds; + + if (ph) { + t_cap_id = le64_to_cpu(ph->cap_id); + t_seq = le32_to_cpu(ph->seq); + t_mseq = le32_to_cpu(ph->mseq); + target = le32_to_cpu(ph->mds); + } else { + t_cap_id = t_seq = t_mseq = 0; + target = -1; + } + + dout("handle_cap_export inode %p ci %p mds%d mseq %d target %d\n", + inode, ci, mds, mseq, target); +retry: + spin_lock(&ci->i_ceph_lock); + cap = __get_cap_for_mds(ci, mds); + if (!cap || cap->cap_id != le64_to_cpu(ex->cap_id)) + goto out_unlock; + + if (target < 0) { + __ceph_remove_cap(cap, false); + goto out_unlock; + } + + /* + * now we know we haven't received the cap import message yet + * because the exported cap still exist. + */ + + issued = cap->issued; + if (issued != cap->implemented) + pr_err_ratelimited("handle_cap_export: issued != implemented: " + "ino (%llx.%llx) mds%d seq %d mseq %d " + "issued %s implemented %s\n", + ceph_vinop(inode), mds, cap->seq, cap->mseq, + ceph_cap_string(issued), + ceph_cap_string(cap->implemented)); + + + tcap = __get_cap_for_mds(ci, target); + if (tcap) { + /* already have caps from the target */ + if (tcap->cap_id == t_cap_id && + ceph_seq_cmp(tcap->seq, t_seq) < 0) { + dout(" updating import cap %p mds%d\n", tcap, target); + tcap->cap_id = t_cap_id; + tcap->seq = t_seq - 1; + tcap->issue_seq = t_seq - 1; + tcap->issued |= issued; + tcap->implemented |= issued; + if (cap == ci->i_auth_cap) { + ci->i_auth_cap = tcap; + change_auth_cap_ses(ci, tcap->session); + } + } + __ceph_remove_cap(cap, false); + goto out_unlock; + } else if (tsession) { + /* add placeholder for the export tagert */ + int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0; + tcap = new_cap; + ceph_add_cap(inode, tsession, t_cap_id, issued, 0, + t_seq - 1, t_mseq, (u64)-1, flag, &new_cap); + + if (!list_empty(&ci->i_cap_flush_list) && + ci->i_auth_cap == tcap) { + spin_lock(&mdsc->cap_dirty_lock); + list_move_tail(&ci->i_flushing_item, + &tcap->session->s_cap_flushing); + spin_unlock(&mdsc->cap_dirty_lock); + } + + __ceph_remove_cap(cap, false); + goto out_unlock; + } + + spin_unlock(&ci->i_ceph_lock); + mutex_unlock(&session->s_mutex); + + /* open target session */ + tsession = ceph_mdsc_open_export_target_session(mdsc, target); + if (!IS_ERR(tsession)) { + if (mds > target) { + mutex_lock(&session->s_mutex); + mutex_lock_nested(&tsession->s_mutex, + SINGLE_DEPTH_NESTING); + } else { + mutex_lock(&tsession->s_mutex); + mutex_lock_nested(&session->s_mutex, + SINGLE_DEPTH_NESTING); + } + new_cap = ceph_get_cap(mdsc, NULL); + } else { + WARN_ON(1); + tsession = NULL; + target = -1; + mutex_lock(&session->s_mutex); + } + goto retry; + +out_unlock: + spin_unlock(&ci->i_ceph_lock); + mutex_unlock(&session->s_mutex); + if (tsession) { + mutex_unlock(&tsession->s_mutex); + ceph_put_mds_session(tsession); + } + if (new_cap) + ceph_put_cap(mdsc, new_cap); +} + +/* + * Handle cap IMPORT. + * + * caller holds s_mutex. acquires i_ceph_lock + */ +static void handle_cap_import(struct ceph_mds_client *mdsc, + struct inode *inode, struct ceph_mds_caps *im, + struct ceph_mds_cap_peer *ph, + struct ceph_mds_session *session, + struct ceph_cap **target_cap, int *old_issued) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_cap *cap, *ocap, *new_cap = NULL; + int mds = session->s_mds; + int issued; + unsigned caps = le32_to_cpu(im->caps); + unsigned wanted = le32_to_cpu(im->wanted); + unsigned seq = le32_to_cpu(im->seq); + unsigned mseq = le32_to_cpu(im->migrate_seq); + u64 realmino = le64_to_cpu(im->realm); + u64 cap_id = le64_to_cpu(im->cap_id); + u64 p_cap_id; + int peer; + + if (ph) { + p_cap_id = le64_to_cpu(ph->cap_id); + peer = le32_to_cpu(ph->mds); + } else { + p_cap_id = 0; + peer = -1; + } + + dout("handle_cap_import inode %p ci %p mds%d mseq %d peer %d\n", + inode, ci, mds, mseq, peer); +retry: + cap = __get_cap_for_mds(ci, mds); + if (!cap) { + if (!new_cap) { + spin_unlock(&ci->i_ceph_lock); + new_cap = ceph_get_cap(mdsc, NULL); + spin_lock(&ci->i_ceph_lock); + goto retry; + } + cap = new_cap; + } else { + if (new_cap) { + ceph_put_cap(mdsc, new_cap); + new_cap = NULL; + } + } + + __ceph_caps_issued(ci, &issued); + issued |= __ceph_caps_dirty(ci); + + ceph_add_cap(inode, session, cap_id, caps, wanted, seq, mseq, + realmino, CEPH_CAP_FLAG_AUTH, &new_cap); + + ocap = peer >= 0 ? __get_cap_for_mds(ci, peer) : NULL; + if (ocap && ocap->cap_id == p_cap_id) { + dout(" remove export cap %p mds%d flags %d\n", + ocap, peer, ph->flags); + if ((ph->flags & CEPH_CAP_FLAG_AUTH) && + (ocap->seq != le32_to_cpu(ph->seq) || + ocap->mseq != le32_to_cpu(ph->mseq))) { + pr_err_ratelimited("handle_cap_import: " + "mismatched seq/mseq: ino (%llx.%llx) " + "mds%d seq %d mseq %d importer mds%d " + "has peer seq %d mseq %d\n", + ceph_vinop(inode), peer, ocap->seq, + ocap->mseq, mds, le32_to_cpu(ph->seq), + le32_to_cpu(ph->mseq)); + } + __ceph_remove_cap(ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE)); + } + + *old_issued = issued; + *target_cap = cap; +} + +/* + * Handle a caps message from the MDS. + * + * Identify the appropriate session, inode, and call the right handler + * based on the cap op. + */ +void ceph_handle_caps(struct ceph_mds_session *session, + struct ceph_msg *msg) +{ + struct ceph_mds_client *mdsc = session->s_mdsc; + struct inode *inode; + struct ceph_inode_info *ci; + struct ceph_cap *cap; + struct ceph_mds_caps *h; + struct ceph_mds_cap_peer *peer = NULL; + struct ceph_snap_realm *realm = NULL; + int op; + int msg_version = le16_to_cpu(msg->hdr.version); + u32 seq, mseq; + struct ceph_vino vino; + void *snaptrace; + size_t snaptrace_len; + void *p, *end; + struct cap_extra_info extra_info = {}; + bool queue_trunc; + + dout("handle_caps from mds%d\n", session->s_mds); + + /* decode */ + end = msg->front.iov_base + msg->front.iov_len; + if (msg->front.iov_len < sizeof(*h)) + goto bad; + h = msg->front.iov_base; + op = le32_to_cpu(h->op); + vino.ino = le64_to_cpu(h->ino); + vino.snap = CEPH_NOSNAP; + seq = le32_to_cpu(h->seq); + mseq = le32_to_cpu(h->migrate_seq); + + snaptrace = h + 1; + snaptrace_len = le32_to_cpu(h->snap_trace_len); + p = snaptrace + snaptrace_len; + + if (msg_version >= 2) { + u32 flock_len; + ceph_decode_32_safe(&p, end, flock_len, bad); + if (p + flock_len > end) + goto bad; + p += flock_len; + } + + if (msg_version >= 3) { + if (op == CEPH_CAP_OP_IMPORT) { + if (p + sizeof(*peer) > end) + goto bad; + peer = p; + p += sizeof(*peer); + } else if (op == CEPH_CAP_OP_EXPORT) { + /* recorded in unused fields */ + peer = (void *)&h->size; + } + } + + if (msg_version >= 4) { + ceph_decode_64_safe(&p, end, extra_info.inline_version, bad); + ceph_decode_32_safe(&p, end, extra_info.inline_len, bad); + if (p + extra_info.inline_len > end) + goto bad; + extra_info.inline_data = p; + p += extra_info.inline_len; + } + + if (msg_version >= 5) { + struct ceph_osd_client *osdc = &mdsc->fsc->client->osdc; + u32 epoch_barrier; + + ceph_decode_32_safe(&p, end, epoch_barrier, bad); + ceph_osdc_update_epoch_barrier(osdc, epoch_barrier); + } + + if (msg_version >= 8) { + u64 flush_tid; + u32 caller_uid, caller_gid; + u32 pool_ns_len; + + /* version >= 6 */ + ceph_decode_64_safe(&p, end, flush_tid, bad); + /* version >= 7 */ + ceph_decode_32_safe(&p, end, caller_uid, bad); + ceph_decode_32_safe(&p, end, caller_gid, bad); + /* version >= 8 */ + ceph_decode_32_safe(&p, end, pool_ns_len, bad); + if (pool_ns_len > 0) { + ceph_decode_need(&p, end, pool_ns_len, bad); + extra_info.pool_ns = + ceph_find_or_create_string(p, pool_ns_len); + p += pool_ns_len; + } + } + + if (msg_version >= 9) { + struct ceph_timespec *btime; + + if (p + sizeof(*btime) > end) + goto bad; + btime = p; + ceph_decode_timespec64(&extra_info.btime, btime); + p += sizeof(*btime); + ceph_decode_64_safe(&p, end, extra_info.change_attr, bad); + } + + if (msg_version >= 11) { + u32 flags; + /* version >= 10 */ + ceph_decode_32_safe(&p, end, flags, bad); + /* version >= 11 */ + extra_info.dirstat_valid = true; + ceph_decode_64_safe(&p, end, extra_info.nfiles, bad); + ceph_decode_64_safe(&p, end, extra_info.nsubdirs, bad); + } + + /* lookup ino */ + inode = ceph_find_inode(mdsc->fsc->sb, vino); + ci = ceph_inode(inode); + dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino, + vino.snap, inode); + + mutex_lock(&session->s_mutex); + inc_session_sequence(session); + dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq, + (unsigned)seq); + + if (!inode) { + dout(" i don't have ino %llx\n", vino.ino); + + if (op == CEPH_CAP_OP_IMPORT) { + cap = ceph_get_cap(mdsc, NULL); + cap->cap_ino = vino.ino; + cap->queue_release = 1; + cap->cap_id = le64_to_cpu(h->cap_id); + cap->mseq = mseq; + cap->seq = seq; + cap->issue_seq = seq; + spin_lock(&session->s_cap_lock); + __ceph_queue_cap_release(session, cap); + spin_unlock(&session->s_cap_lock); + } + goto flush_cap_releases; + } + + /* these will work even if we don't have a cap yet */ + switch (op) { + case CEPH_CAP_OP_FLUSHSNAP_ACK: + handle_cap_flushsnap_ack(inode, le64_to_cpu(msg->hdr.tid), + h, session); + goto done; + + case CEPH_CAP_OP_EXPORT: + handle_cap_export(inode, h, peer, session); + goto done_unlocked; + + case CEPH_CAP_OP_IMPORT: + realm = NULL; + if (snaptrace_len) { + down_write(&mdsc->snap_rwsem); + ceph_update_snap_trace(mdsc, snaptrace, + snaptrace + snaptrace_len, + false, &realm); + downgrade_write(&mdsc->snap_rwsem); + } else { + down_read(&mdsc->snap_rwsem); + } + spin_lock(&ci->i_ceph_lock); + handle_cap_import(mdsc, inode, h, peer, session, + &cap, &extra_info.issued); + handle_cap_grant(inode, session, cap, + h, msg->middle, &extra_info); + if (realm) + ceph_put_snap_realm(mdsc, realm); + goto done_unlocked; + } + + /* the rest require a cap */ + spin_lock(&ci->i_ceph_lock); + cap = __get_cap_for_mds(ceph_inode(inode), session->s_mds); + if (!cap) { + dout(" no cap on %p ino %llx.%llx from mds%d\n", + inode, ceph_ino(inode), ceph_snap(inode), + session->s_mds); + spin_unlock(&ci->i_ceph_lock); + goto flush_cap_releases; + } + + /* note that each of these drops i_ceph_lock for us */ + switch (op) { + case CEPH_CAP_OP_REVOKE: + case CEPH_CAP_OP_GRANT: + __ceph_caps_issued(ci, &extra_info.issued); + extra_info.issued |= __ceph_caps_dirty(ci); + handle_cap_grant(inode, session, cap, + h, msg->middle, &extra_info); + goto done_unlocked; + + case CEPH_CAP_OP_FLUSH_ACK: + handle_cap_flush_ack(inode, le64_to_cpu(msg->hdr.tid), + h, session, cap); + break; + + case CEPH_CAP_OP_TRUNC: + queue_trunc = handle_cap_trunc(inode, h, session); + spin_unlock(&ci->i_ceph_lock); + if (queue_trunc) + ceph_queue_vmtruncate(inode); + break; + + default: + spin_unlock(&ci->i_ceph_lock); + pr_err("ceph_handle_caps: unknown cap op %d %s\n", op, + ceph_cap_op_name(op)); + } + +done: + mutex_unlock(&session->s_mutex); +done_unlocked: + ceph_put_string(extra_info.pool_ns); + /* avoid calling iput_final() in mds dispatch threads */ + ceph_async_iput(inode); + return; + +flush_cap_releases: + /* + * send any cap release message to try to move things + * along for the mds (who clearly thinks we still have this + * cap). + */ + ceph_flush_cap_releases(mdsc, session); + goto done; + +bad: + pr_err("ceph_handle_caps: corrupt message\n"); + ceph_msg_dump(msg); + return; +} + +/* + * Delayed work handler to process end of delayed cap release LRU list. + * + * If new caps are added to the list while processing it, these won't get + * processed in this run. In this case, the ci->i_hold_caps_max will be + * returned so that the work can be scheduled accordingly. + */ +unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc) +{ + struct inode *inode; + struct ceph_inode_info *ci; + struct ceph_mount_options *opt = mdsc->fsc->mount_options; + unsigned long delay_max = opt->caps_wanted_delay_max * HZ; + unsigned long loop_start = jiffies; + unsigned long delay = 0; + + dout("check_delayed_caps\n"); + spin_lock(&mdsc->cap_delay_lock); + while (!list_empty(&mdsc->cap_delay_list)) { + ci = list_first_entry(&mdsc->cap_delay_list, + struct ceph_inode_info, + i_cap_delay_list); + if (time_before(loop_start, ci->i_hold_caps_max - delay_max)) { + dout("%s caps added recently. Exiting loop", __func__); + delay = ci->i_hold_caps_max; + break; + } + if ((ci->i_ceph_flags & CEPH_I_FLUSH) == 0 && + time_before(jiffies, ci->i_hold_caps_max)) + break; + list_del_init(&ci->i_cap_delay_list); + + inode = igrab(&ci->vfs_inode); + if (inode) { + spin_unlock(&mdsc->cap_delay_lock); + dout("check_delayed_caps on %p\n", inode); + ceph_check_caps(ci, 0, NULL); + /* avoid calling iput_final() in tick thread */ + ceph_async_iput(inode); + spin_lock(&mdsc->cap_delay_lock); + } + } + spin_unlock(&mdsc->cap_delay_lock); + + return delay; +} + +/* + * Flush all dirty caps to the mds + */ +static void flush_dirty_session_caps(struct ceph_mds_session *s) +{ + struct ceph_mds_client *mdsc = s->s_mdsc; + struct ceph_inode_info *ci; + struct inode *inode; + + dout("flush_dirty_caps\n"); + spin_lock(&mdsc->cap_dirty_lock); + while (!list_empty(&s->s_cap_dirty)) { + ci = list_first_entry(&s->s_cap_dirty, struct ceph_inode_info, + i_dirty_item); + inode = &ci->vfs_inode; + ihold(inode); + dout("flush_dirty_caps %p\n", inode); + spin_unlock(&mdsc->cap_dirty_lock); + ceph_check_caps(ci, CHECK_CAPS_FLUSH, NULL); + iput(inode); + spin_lock(&mdsc->cap_dirty_lock); + } + spin_unlock(&mdsc->cap_dirty_lock); + dout("flush_dirty_caps done\n"); +} + +void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc) +{ + ceph_mdsc_iterate_sessions(mdsc, flush_dirty_session_caps, true); +} + +void __ceph_touch_fmode(struct ceph_inode_info *ci, + struct ceph_mds_client *mdsc, int fmode) +{ + unsigned long now = jiffies; + if (fmode & CEPH_FILE_MODE_RD) + ci->i_last_rd = now; + if (fmode & CEPH_FILE_MODE_WR) + ci->i_last_wr = now; + /* queue periodic check */ + if (fmode && + __ceph_is_any_real_caps(ci) && + list_empty(&ci->i_cap_delay_list)) + __cap_delay_requeue(mdsc, ci); +} + +void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count) +{ + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->vfs_inode.i_sb); + int bits = (fmode << 1) | 1; + bool already_opened = false; + int i; + + if (count == 1) + atomic64_inc(&mdsc->metric.opened_files); + + spin_lock(&ci->i_ceph_lock); + for (i = 0; i < CEPH_FILE_MODE_BITS; i++) { + /* + * If any of the mode ref is larger than 0, + * that means it has been already opened by + * others. Just skip checking the PIN ref. + */ + if (i && ci->i_nr_by_mode[i]) + already_opened = true; + + if (bits & (1 << i)) + ci->i_nr_by_mode[i] += count; + } + + if (!already_opened) + percpu_counter_inc(&mdsc->metric.opened_inodes); + spin_unlock(&ci->i_ceph_lock); +} + +/* + * Drop open file reference. If we were the last open file, + * we may need to release capabilities to the MDS (or schedule + * their delayed release). + */ +void ceph_put_fmode(struct ceph_inode_info *ci, int fmode, int count) +{ + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->vfs_inode.i_sb); + int bits = (fmode << 1) | 1; + bool is_closed = true; + int i; + + if (count == 1) + atomic64_dec(&mdsc->metric.opened_files); + + spin_lock(&ci->i_ceph_lock); + for (i = 0; i < CEPH_FILE_MODE_BITS; i++) { + if (bits & (1 << i)) { + BUG_ON(ci->i_nr_by_mode[i] < count); + ci->i_nr_by_mode[i] -= count; + } + + /* + * If any of the mode ref is not 0 after + * decreased, that means it is still opened + * by others. Just skip checking the PIN ref. + */ + if (i && ci->i_nr_by_mode[i]) + is_closed = false; + } + + if (is_closed) + percpu_counter_dec(&mdsc->metric.opened_inodes); + spin_unlock(&ci->i_ceph_lock); +} + +/* + * For a soon-to-be unlinked file, drop the LINK caps. If it + * looks like the link count will hit 0, drop any other caps (other + * than PIN) we don't specifically want (due to the file still being + * open). + */ +int ceph_drop_caps_for_unlink(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL; + + spin_lock(&ci->i_ceph_lock); + if (inode->i_nlink == 1) { + drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN); + + if (__ceph_caps_dirty(ci)) { + struct ceph_mds_client *mdsc = + ceph_inode_to_client(inode)->mdsc; + __cap_delay_requeue_front(mdsc, ci); + } + } + spin_unlock(&ci->i_ceph_lock); + return drop; +} + +/* + * Helpers for embedding cap and dentry lease releases into mds + * requests. + * + * @force is used by dentry_release (below) to force inclusion of a + * record for the directory inode, even when there aren't any caps to + * drop. + */ +int ceph_encode_inode_release(void **p, struct inode *inode, + int mds, int drop, int unless, int force) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_cap *cap; + struct ceph_mds_request_release *rel = *p; + int used, dirty; + int ret = 0; + + spin_lock(&ci->i_ceph_lock); + used = __ceph_caps_used(ci); + dirty = __ceph_caps_dirty(ci); + + dout("encode_inode_release %p mds%d used|dirty %s drop %s unless %s\n", + inode, mds, ceph_cap_string(used|dirty), ceph_cap_string(drop), + ceph_cap_string(unless)); + + /* only drop unused, clean caps */ + drop &= ~(used | dirty); + + cap = __get_cap_for_mds(ci, mds); + if (cap && __cap_is_valid(cap)) { + unless &= cap->issued; + if (unless) { + if (unless & CEPH_CAP_AUTH_EXCL) + drop &= ~CEPH_CAP_AUTH_SHARED; + if (unless & CEPH_CAP_LINK_EXCL) + drop &= ~CEPH_CAP_LINK_SHARED; + if (unless & CEPH_CAP_XATTR_EXCL) + drop &= ~CEPH_CAP_XATTR_SHARED; + if (unless & CEPH_CAP_FILE_EXCL) + drop &= ~CEPH_CAP_FILE_SHARED; + } + + if (force || (cap->issued & drop)) { + if (cap->issued & drop) { + int wanted = __ceph_caps_wanted(ci); + dout("encode_inode_release %p cap %p " + "%s -> %s, wanted %s -> %s\n", inode, cap, + ceph_cap_string(cap->issued), + ceph_cap_string(cap->issued & ~drop), + ceph_cap_string(cap->mds_wanted), + ceph_cap_string(wanted)); + + cap->issued &= ~drop; + cap->implemented &= ~drop; + cap->mds_wanted = wanted; + if (cap == ci->i_auth_cap && + !(wanted & CEPH_CAP_ANY_FILE_WR)) + ci->i_requested_max_size = 0; + } else { + dout("encode_inode_release %p cap %p %s" + " (force)\n", inode, cap, + ceph_cap_string(cap->issued)); + } + + rel->ino = cpu_to_le64(ceph_ino(inode)); + rel->cap_id = cpu_to_le64(cap->cap_id); + rel->seq = cpu_to_le32(cap->seq); + rel->issue_seq = cpu_to_le32(cap->issue_seq); + rel->mseq = cpu_to_le32(cap->mseq); + rel->caps = cpu_to_le32(cap->implemented); + rel->wanted = cpu_to_le32(cap->mds_wanted); + rel->dname_len = 0; + rel->dname_seq = 0; + *p += sizeof(*rel); + ret = 1; + } else { + dout("encode_inode_release %p cap %p %s (noop)\n", + inode, cap, ceph_cap_string(cap->issued)); + } + } + spin_unlock(&ci->i_ceph_lock); + return ret; +} + +int ceph_encode_dentry_release(void **p, struct dentry *dentry, + struct inode *dir, + int mds, int drop, int unless) +{ + struct dentry *parent = NULL; + struct ceph_mds_request_release *rel = *p; + struct ceph_dentry_info *di = ceph_dentry(dentry); + int force = 0; + int ret; + + /* + * force an record for the directory caps if we have a dentry lease. + * this is racy (can't take i_ceph_lock and d_lock together), but it + * doesn't have to be perfect; the mds will revoke anything we don't + * release. + */ + spin_lock(&dentry->d_lock); + if (di->lease_session && di->lease_session->s_mds == mds) + force = 1; + if (!dir) { + parent = dget(dentry->d_parent); + dir = d_inode(parent); + } + spin_unlock(&dentry->d_lock); + + ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force); + dput(parent); + + spin_lock(&dentry->d_lock); + if (ret && di->lease_session && di->lease_session->s_mds == mds) { + dout("encode_dentry_release %p mds%d seq %d\n", + dentry, mds, (int)di->lease_seq); + rel->dname_len = cpu_to_le32(dentry->d_name.len); + memcpy(*p, dentry->d_name.name, dentry->d_name.len); + *p += dentry->d_name.len; + rel->dname_seq = cpu_to_le32(di->lease_seq); + __ceph_mdsc_drop_dentry_lease(dentry); + } + spin_unlock(&dentry->d_lock); + return ret; +} diff --git a/fs/ceph/ceph_frag.c b/fs/ceph/ceph_frag.c new file mode 100644 index 000000000..6f67d5b88 --- /dev/null +++ b/fs/ceph/ceph_frag.c @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Ceph 'frag' type + */ +#include <linux/module.h> +#include <linux/ceph/types.h> + +int ceph_frag_compare(__u32 a, __u32 b) +{ + unsigned va = ceph_frag_value(a); + unsigned vb = ceph_frag_value(b); + if (va < vb) + return -1; + if (va > vb) + return 1; + va = ceph_frag_bits(a); + vb = ceph_frag_bits(b); + if (va < vb) + return -1; + if (va > vb) + return 1; + return 0; +} diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c new file mode 100644 index 000000000..7a8fbe3e4 --- /dev/null +++ b/fs/ceph/debugfs.c @@ -0,0 +1,410 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ceph/ceph_debug.h> + +#include <linux/device.h> +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/ctype.h> +#include <linux/debugfs.h> +#include <linux/seq_file.h> +#include <linux/math64.h> +#include <linux/ktime.h> + +#include <linux/ceph/libceph.h> +#include <linux/ceph/mon_client.h> +#include <linux/ceph/auth.h> +#include <linux/ceph/debugfs.h> + +#include "super.h" + +#ifdef CONFIG_DEBUG_FS + +#include "mds_client.h" +#include "metric.h" + +static int mdsmap_show(struct seq_file *s, void *p) +{ + int i; + struct ceph_fs_client *fsc = s->private; + struct ceph_mdsmap *mdsmap; + + if (!fsc->mdsc || !fsc->mdsc->mdsmap) + return 0; + mdsmap = fsc->mdsc->mdsmap; + seq_printf(s, "epoch %d\n", mdsmap->m_epoch); + seq_printf(s, "root %d\n", mdsmap->m_root); + seq_printf(s, "max_mds %d\n", mdsmap->m_max_mds); + seq_printf(s, "session_timeout %d\n", mdsmap->m_session_timeout); + seq_printf(s, "session_autoclose %d\n", mdsmap->m_session_autoclose); + for (i = 0; i < mdsmap->possible_max_rank; i++) { + struct ceph_entity_addr *addr = &mdsmap->m_info[i].addr; + int state = mdsmap->m_info[i].state; + seq_printf(s, "\tmds%d\t%s\t(%s)\n", i, + ceph_pr_addr(addr), + ceph_mds_state_name(state)); + } + return 0; +} + +/* + * mdsc debugfs + */ +static int mdsc_show(struct seq_file *s, void *p) +{ + struct ceph_fs_client *fsc = s->private; + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_request *req; + struct rb_node *rp; + int pathlen = 0; + u64 pathbase; + char *path; + + mutex_lock(&mdsc->mutex); + for (rp = rb_first(&mdsc->request_tree); rp; rp = rb_next(rp)) { + req = rb_entry(rp, struct ceph_mds_request, r_node); + + if (req->r_request && req->r_session) + seq_printf(s, "%lld\tmds%d\t", req->r_tid, + req->r_session->s_mds); + else if (!req->r_request) + seq_printf(s, "%lld\t(no request)\t", req->r_tid); + else + seq_printf(s, "%lld\t(no session)\t", req->r_tid); + + seq_printf(s, "%s", ceph_mds_op_name(req->r_op)); + + if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) + seq_puts(s, "\t(unsafe)"); + else + seq_puts(s, "\t"); + + if (req->r_inode) { + seq_printf(s, " #%llx", ceph_ino(req->r_inode)); + } else if (req->r_dentry) { + path = ceph_mdsc_build_path(req->r_dentry, &pathlen, + &pathbase, 0); + if (IS_ERR(path)) + path = NULL; + spin_lock(&req->r_dentry->d_lock); + seq_printf(s, " #%llx/%pd (%s)", + ceph_ino(d_inode(req->r_dentry->d_parent)), + req->r_dentry, + path ? path : ""); + spin_unlock(&req->r_dentry->d_lock); + ceph_mdsc_free_path(path, pathlen); + } else if (req->r_path1) { + seq_printf(s, " #%llx/%s", req->r_ino1.ino, + req->r_path1); + } else { + seq_printf(s, " #%llx", req->r_ino1.ino); + } + + if (req->r_old_dentry) { + path = ceph_mdsc_build_path(req->r_old_dentry, &pathlen, + &pathbase, 0); + if (IS_ERR(path)) + path = NULL; + spin_lock(&req->r_old_dentry->d_lock); + seq_printf(s, " #%llx/%pd (%s)", + req->r_old_dentry_dir ? + ceph_ino(req->r_old_dentry_dir) : 0, + req->r_old_dentry, + path ? path : ""); + spin_unlock(&req->r_old_dentry->d_lock); + ceph_mdsc_free_path(path, pathlen); + } else if (req->r_path2 && req->r_op != CEPH_MDS_OP_SYMLINK) { + if (req->r_ino2.ino) + seq_printf(s, " #%llx/%s", req->r_ino2.ino, + req->r_path2); + else + seq_printf(s, " %s", req->r_path2); + } + + seq_puts(s, "\n"); + } + mutex_unlock(&mdsc->mutex); + + return 0; +} + +#define CEPH_METRIC_SHOW(name, total, avg, min, max, sq) { \ + s64 _total, _avg, _min, _max, _sq, _st; \ + _avg = ktime_to_us(avg); \ + _min = ktime_to_us(min == KTIME_MAX ? 0 : min); \ + _max = ktime_to_us(max); \ + _total = total - 1; \ + _sq = _total > 0 ? DIV64_U64_ROUND_CLOSEST(sq, _total) : 0; \ + _st = int_sqrt64(_sq); \ + _st = ktime_to_us(_st); \ + seq_printf(s, "%-14s%-12lld%-16lld%-16lld%-16lld%lld\n", \ + name, total, _avg, _min, _max, _st); \ +} + +static int metric_show(struct seq_file *s, void *p) +{ + struct ceph_fs_client *fsc = s->private; + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_client_metric *m = &mdsc->metric; + int nr_caps = 0; + s64 total, sum, avg, min, max, sq; + + sum = percpu_counter_sum(&m->total_inodes); + seq_printf(s, "item total\n"); + seq_printf(s, "------------------------------------------\n"); + seq_printf(s, "%-35s%lld / %lld\n", "opened files / total inodes", + atomic64_read(&m->opened_files), sum); + seq_printf(s, "%-35s%lld / %lld\n", "pinned i_caps / total inodes", + atomic64_read(&m->total_caps), sum); + seq_printf(s, "%-35s%lld / %lld\n", "opened inodes / total inodes", + percpu_counter_sum(&m->opened_inodes), sum); + + seq_printf(s, "\n"); + seq_printf(s, "item total avg_lat(us) min_lat(us) max_lat(us) stdev(us)\n"); + seq_printf(s, "-----------------------------------------------------------------------------------\n"); + + spin_lock(&m->read_latency_lock); + total = m->total_reads; + sum = m->read_latency_sum; + avg = total > 0 ? DIV64_U64_ROUND_CLOSEST(sum, total) : 0; + min = m->read_latency_min; + max = m->read_latency_max; + sq = m->read_latency_sq_sum; + spin_unlock(&m->read_latency_lock); + CEPH_METRIC_SHOW("read", total, avg, min, max, sq); + + spin_lock(&m->write_latency_lock); + total = m->total_writes; + sum = m->write_latency_sum; + avg = total > 0 ? DIV64_U64_ROUND_CLOSEST(sum, total) : 0; + min = m->write_latency_min; + max = m->write_latency_max; + sq = m->write_latency_sq_sum; + spin_unlock(&m->write_latency_lock); + CEPH_METRIC_SHOW("write", total, avg, min, max, sq); + + spin_lock(&m->metadata_latency_lock); + total = m->total_metadatas; + sum = m->metadata_latency_sum; + avg = total > 0 ? DIV64_U64_ROUND_CLOSEST(sum, total) : 0; + min = m->metadata_latency_min; + max = m->metadata_latency_max; + sq = m->metadata_latency_sq_sum; + spin_unlock(&m->metadata_latency_lock); + CEPH_METRIC_SHOW("metadata", total, avg, min, max, sq); + + seq_printf(s, "\n"); + seq_printf(s, "item total miss hit\n"); + seq_printf(s, "-------------------------------------------------\n"); + + seq_printf(s, "%-14s%-16lld%-16lld%lld\n", "d_lease", + atomic64_read(&m->total_dentries), + percpu_counter_sum(&m->d_lease_mis), + percpu_counter_sum(&m->d_lease_hit)); + + nr_caps = atomic64_read(&m->total_caps); + seq_printf(s, "%-14s%-16d%-16lld%lld\n", "caps", nr_caps, + percpu_counter_sum(&m->i_caps_mis), + percpu_counter_sum(&m->i_caps_hit)); + + return 0; +} + +static int caps_show_cb(struct inode *inode, struct ceph_cap *cap, void *p) +{ + struct seq_file *s = p; + + seq_printf(s, "0x%-17llx%-3d%-17s%-17s\n", ceph_ino(inode), + cap->session->s_mds, + ceph_cap_string(cap->issued), + ceph_cap_string(cap->implemented)); + return 0; +} + +static int caps_show(struct seq_file *s, void *p) +{ + struct ceph_fs_client *fsc = s->private; + struct ceph_mds_client *mdsc = fsc->mdsc; + int total, avail, used, reserved, min, i; + struct cap_wait *cw; + + ceph_reservation_status(fsc, &total, &avail, &used, &reserved, &min); + seq_printf(s, "total\t\t%d\n" + "avail\t\t%d\n" + "used\t\t%d\n" + "reserved\t%d\n" + "min\t\t%d\n\n", + total, avail, used, reserved, min); + seq_printf(s, "ino mds issued implemented\n"); + seq_printf(s, "--------------------------------------------------\n"); + + mutex_lock(&mdsc->mutex); + for (i = 0; i < mdsc->max_sessions; i++) { + struct ceph_mds_session *session; + + session = __ceph_lookup_mds_session(mdsc, i); + if (!session) + continue; + mutex_unlock(&mdsc->mutex); + mutex_lock(&session->s_mutex); + ceph_iterate_session_caps(session, caps_show_cb, s); + mutex_unlock(&session->s_mutex); + ceph_put_mds_session(session); + mutex_lock(&mdsc->mutex); + } + mutex_unlock(&mdsc->mutex); + + seq_printf(s, "\n\nWaiters:\n--------\n"); + seq_printf(s, "tgid ino need want\n"); + seq_printf(s, "-----------------------------------------------------\n"); + + spin_lock(&mdsc->caps_list_lock); + list_for_each_entry(cw, &mdsc->cap_wait_list, list) { + seq_printf(s, "%-13d0x%-17llx%-17s%-17s\n", cw->tgid, cw->ino, + ceph_cap_string(cw->need), + ceph_cap_string(cw->want)); + } + spin_unlock(&mdsc->caps_list_lock); + + return 0; +} + +static int mds_sessions_show(struct seq_file *s, void *ptr) +{ + struct ceph_fs_client *fsc = s->private; + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_auth_client *ac = fsc->client->monc.auth; + struct ceph_options *opt = fsc->client->options; + int mds; + + mutex_lock(&mdsc->mutex); + + /* The 'num' portion of an 'entity name' */ + seq_printf(s, "global_id %llu\n", ac->global_id); + + /* The -o name mount argument */ + seq_printf(s, "name \"%s\"\n", opt->name ? opt->name : ""); + + /* The list of MDS session rank+state */ + for (mds = 0; mds < mdsc->max_sessions; mds++) { + struct ceph_mds_session *session = + __ceph_lookup_mds_session(mdsc, mds); + if (!session) { + continue; + } + mutex_unlock(&mdsc->mutex); + seq_printf(s, "mds.%d %s\n", + session->s_mds, + ceph_session_state_name(session->s_state)); + + ceph_put_mds_session(session); + mutex_lock(&mdsc->mutex); + } + mutex_unlock(&mdsc->mutex); + + return 0; +} + +DEFINE_SHOW_ATTRIBUTE(mdsmap); +DEFINE_SHOW_ATTRIBUTE(mdsc); +DEFINE_SHOW_ATTRIBUTE(caps); +DEFINE_SHOW_ATTRIBUTE(mds_sessions); +DEFINE_SHOW_ATTRIBUTE(metric); + + +/* + * debugfs + */ +static int congestion_kb_set(void *data, u64 val) +{ + struct ceph_fs_client *fsc = (struct ceph_fs_client *)data; + + fsc->mount_options->congestion_kb = (int)val; + return 0; +} + +static int congestion_kb_get(void *data, u64 *val) +{ + struct ceph_fs_client *fsc = (struct ceph_fs_client *)data; + + *val = (u64)fsc->mount_options->congestion_kb; + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get, + congestion_kb_set, "%llu\n"); + + +void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc) +{ + dout("ceph_fs_debugfs_cleanup\n"); + debugfs_remove(fsc->debugfs_bdi); + debugfs_remove(fsc->debugfs_congestion_kb); + debugfs_remove(fsc->debugfs_mdsmap); + debugfs_remove(fsc->debugfs_mds_sessions); + debugfs_remove(fsc->debugfs_caps); + debugfs_remove(fsc->debugfs_metric); + debugfs_remove(fsc->debugfs_mdsc); +} + +void ceph_fs_debugfs_init(struct ceph_fs_client *fsc) +{ + char name[100]; + + dout("ceph_fs_debugfs_init\n"); + fsc->debugfs_congestion_kb = + debugfs_create_file("writeback_congestion_kb", + 0600, + fsc->client->debugfs_dir, + fsc, + &congestion_kb_fops); + + snprintf(name, sizeof(name), "../../bdi/%s", + bdi_dev_name(fsc->sb->s_bdi)); + fsc->debugfs_bdi = + debugfs_create_symlink("bdi", + fsc->client->debugfs_dir, + name); + + fsc->debugfs_mdsmap = debugfs_create_file("mdsmap", + 0400, + fsc->client->debugfs_dir, + fsc, + &mdsmap_fops); + + fsc->debugfs_mds_sessions = debugfs_create_file("mds_sessions", + 0400, + fsc->client->debugfs_dir, + fsc, + &mds_sessions_fops); + + fsc->debugfs_mdsc = debugfs_create_file("mdsc", + 0400, + fsc->client->debugfs_dir, + fsc, + &mdsc_fops); + + fsc->debugfs_metric = debugfs_create_file("metrics", + 0400, + fsc->client->debugfs_dir, + fsc, + &metric_fops); + + fsc->debugfs_caps = debugfs_create_file("caps", + 0400, + fsc->client->debugfs_dir, + fsc, + &caps_fops); +} + + +#else /* CONFIG_DEBUG_FS */ + +void ceph_fs_debugfs_init(struct ceph_fs_client *fsc) +{ +} + +void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc) +{ +} + +#endif /* CONFIG_DEBUG_FS */ diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c new file mode 100644 index 000000000..1fddb9cd3 --- /dev/null +++ b/fs/ceph/dir.c @@ -0,0 +1,1984 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ceph/ceph_debug.h> + +#include <linux/spinlock.h> +#include <linux/namei.h> +#include <linux/slab.h> +#include <linux/sched.h> +#include <linux/xattr.h> + +#include "super.h" +#include "mds_client.h" + +/* + * Directory operations: readdir, lookup, create, link, unlink, + * rename, etc. + */ + +/* + * Ceph MDS operations are specified in terms of a base ino and + * relative path. Thus, the client can specify an operation on a + * specific inode (e.g., a getattr due to fstat(2)), or as a path + * relative to, say, the root directory. + * + * Normally, we limit ourselves to strict inode ops (no path component) + * or dentry operations (a single path component relative to an ino). The + * exception to this is open_root_dentry(), which will open the mount + * point by name. + */ + +const struct dentry_operations ceph_dentry_ops; + +static bool __dentry_lease_is_valid(struct ceph_dentry_info *di); +static int __dir_lease_try_check(const struct dentry *dentry); + +/* + * Initialize ceph dentry state. + */ +static int ceph_d_init(struct dentry *dentry) +{ + struct ceph_dentry_info *di; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dentry->d_sb); + + di = kmem_cache_zalloc(ceph_dentry_cachep, GFP_KERNEL); + if (!di) + return -ENOMEM; /* oh well */ + + di->dentry = dentry; + di->lease_session = NULL; + di->time = jiffies; + dentry->d_fsdata = di; + INIT_LIST_HEAD(&di->lease_list); + + atomic64_inc(&mdsc->metric.total_dentries); + + return 0; +} + +/* + * for f_pos for readdir: + * - hash order: + * (0xff << 52) | ((24 bits hash) << 28) | + * (the nth entry has hash collision); + * - frag+name order; + * ((frag value) << 28) | (the nth entry in frag); + */ +#define OFFSET_BITS 28 +#define OFFSET_MASK ((1 << OFFSET_BITS) - 1) +#define HASH_ORDER (0xffull << (OFFSET_BITS + 24)) +loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order) +{ + loff_t fpos = ((loff_t)high << 28) | (loff_t)off; + if (hash_order) + fpos |= HASH_ORDER; + return fpos; +} + +static bool is_hash_order(loff_t p) +{ + return (p & HASH_ORDER) == HASH_ORDER; +} + +static unsigned fpos_frag(loff_t p) +{ + return p >> OFFSET_BITS; +} + +static unsigned fpos_hash(loff_t p) +{ + return ceph_frag_value(fpos_frag(p)); +} + +static unsigned fpos_off(loff_t p) +{ + return p & OFFSET_MASK; +} + +static int fpos_cmp(loff_t l, loff_t r) +{ + int v = ceph_frag_compare(fpos_frag(l), fpos_frag(r)); + if (v) + return v; + return (int)(fpos_off(l) - fpos_off(r)); +} + +/* + * make note of the last dentry we read, so we can + * continue at the same lexicographical point, + * regardless of what dir changes take place on the + * server. + */ +static int note_last_dentry(struct ceph_dir_file_info *dfi, const char *name, + int len, unsigned next_offset) +{ + char *buf = kmalloc(len+1, GFP_KERNEL); + if (!buf) + return -ENOMEM; + kfree(dfi->last_name); + dfi->last_name = buf; + memcpy(dfi->last_name, name, len); + dfi->last_name[len] = 0; + dfi->next_offset = next_offset; + dout("note_last_dentry '%s'\n", dfi->last_name); + return 0; +} + + +static struct dentry * +__dcache_find_get_entry(struct dentry *parent, u64 idx, + struct ceph_readdir_cache_control *cache_ctl) +{ + struct inode *dir = d_inode(parent); + struct dentry *dentry; + unsigned idx_mask = (PAGE_SIZE / sizeof(struct dentry *)) - 1; + loff_t ptr_pos = idx * sizeof(struct dentry *); + pgoff_t ptr_pgoff = ptr_pos >> PAGE_SHIFT; + + if (ptr_pos >= i_size_read(dir)) + return NULL; + + if (!cache_ctl->page || ptr_pgoff != page_index(cache_ctl->page)) { + ceph_readdir_cache_release(cache_ctl); + cache_ctl->page = find_lock_page(&dir->i_data, ptr_pgoff); + if (!cache_ctl->page) { + dout(" page %lu not found\n", ptr_pgoff); + return ERR_PTR(-EAGAIN); + } + /* reading/filling the cache are serialized by + i_mutex, no need to use page lock */ + unlock_page(cache_ctl->page); + cache_ctl->dentries = kmap(cache_ctl->page); + } + + cache_ctl->index = idx & idx_mask; + + rcu_read_lock(); + spin_lock(&parent->d_lock); + /* check i_size again here, because empty directory can be + * marked as complete while not holding the i_mutex. */ + if (ceph_dir_is_complete_ordered(dir) && ptr_pos < i_size_read(dir)) + dentry = cache_ctl->dentries[cache_ctl->index]; + else + dentry = NULL; + spin_unlock(&parent->d_lock); + if (dentry && !lockref_get_not_dead(&dentry->d_lockref)) + dentry = NULL; + rcu_read_unlock(); + return dentry ? : ERR_PTR(-EAGAIN); +} + +/* + * When possible, we try to satisfy a readdir by peeking at the + * dcache. We make this work by carefully ordering dentries on + * d_child when we initially get results back from the MDS, and + * falling back to a "normal" sync readdir if any dentries in the dir + * are dropped. + * + * Complete dir indicates that we have all dentries in the dir. It is + * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by + * the MDS if/when the directory is modified). + */ +static int __dcache_readdir(struct file *file, struct dir_context *ctx, + int shared_gen) +{ + struct ceph_dir_file_info *dfi = file->private_data; + struct dentry *parent = file->f_path.dentry; + struct inode *dir = d_inode(parent); + struct dentry *dentry, *last = NULL; + struct ceph_dentry_info *di; + struct ceph_readdir_cache_control cache_ctl = {}; + u64 idx = 0; + int err = 0; + + dout("__dcache_readdir %p v%u at %llx\n", dir, (unsigned)shared_gen, ctx->pos); + + /* search start position */ + if (ctx->pos > 2) { + u64 count = div_u64(i_size_read(dir), sizeof(struct dentry *)); + while (count > 0) { + u64 step = count >> 1; + dentry = __dcache_find_get_entry(parent, idx + step, + &cache_ctl); + if (!dentry) { + /* use linar search */ + idx = 0; + break; + } + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + goto out; + } + di = ceph_dentry(dentry); + spin_lock(&dentry->d_lock); + if (fpos_cmp(di->offset, ctx->pos) < 0) { + idx += step + 1; + count -= step + 1; + } else { + count = step; + } + spin_unlock(&dentry->d_lock); + dput(dentry); + } + + dout("__dcache_readdir %p cache idx %llu\n", dir, idx); + } + + + for (;;) { + bool emit_dentry = false; + dentry = __dcache_find_get_entry(parent, idx++, &cache_ctl); + if (!dentry) { + dfi->file_info.flags |= CEPH_F_ATEND; + err = 0; + break; + } + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + goto out; + } + + spin_lock(&dentry->d_lock); + di = ceph_dentry(dentry); + if (d_unhashed(dentry) || + d_really_is_negative(dentry) || + di->lease_shared_gen != shared_gen) { + spin_unlock(&dentry->d_lock); + dput(dentry); + err = -EAGAIN; + goto out; + } + if (fpos_cmp(ctx->pos, di->offset) <= 0) { + __ceph_dentry_dir_lease_touch(di); + emit_dentry = true; + } + spin_unlock(&dentry->d_lock); + + if (emit_dentry) { + dout(" %llx dentry %p %pd %p\n", di->offset, + dentry, dentry, d_inode(dentry)); + ctx->pos = di->offset; + if (!dir_emit(ctx, dentry->d_name.name, + dentry->d_name.len, ceph_present_inode(d_inode(dentry)), + d_inode(dentry)->i_mode >> 12)) { + dput(dentry); + err = 0; + break; + } + ctx->pos++; + + if (last) + dput(last); + last = dentry; + } else { + dput(dentry); + } + } +out: + ceph_readdir_cache_release(&cache_ctl); + if (last) { + int ret; + di = ceph_dentry(last); + ret = note_last_dentry(dfi, last->d_name.name, last->d_name.len, + fpos_off(di->offset) + 1); + if (ret < 0) + err = ret; + dput(last); + /* last_name no longer match cache index */ + if (dfi->readdir_cache_idx >= 0) { + dfi->readdir_cache_idx = -1; + dfi->dir_release_count = 0; + } + } + return err; +} + +static bool need_send_readdir(struct ceph_dir_file_info *dfi, loff_t pos) +{ + if (!dfi->last_readdir) + return true; + if (is_hash_order(pos)) + return !ceph_frag_contains_value(dfi->frag, fpos_hash(pos)); + else + return dfi->frag != fpos_frag(pos); +} + +static int ceph_readdir(struct file *file, struct dir_context *ctx) +{ + struct ceph_dir_file_info *dfi = file->private_data; + struct inode *inode = file_inode(file); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_mds_client *mdsc = fsc->mdsc; + int i; + int err; + unsigned frag = -1; + struct ceph_mds_reply_info_parsed *rinfo; + + dout("readdir %p file %p pos %llx\n", inode, file, ctx->pos); + if (dfi->file_info.flags & CEPH_F_ATEND) + return 0; + + /* always start with . and .. */ + if (ctx->pos == 0) { + dout("readdir off 0 -> '.'\n"); + if (!dir_emit(ctx, ".", 1, ceph_present_inode(inode), + inode->i_mode >> 12)) + return 0; + ctx->pos = 1; + } + if (ctx->pos == 1) { + u64 ino; + struct dentry *dentry = file->f_path.dentry; + + spin_lock(&dentry->d_lock); + ino = ceph_present_inode(dentry->d_parent->d_inode); + spin_unlock(&dentry->d_lock); + + dout("readdir off 1 -> '..'\n"); + if (!dir_emit(ctx, "..", 2, ino, inode->i_mode >> 12)) + return 0; + ctx->pos = 2; + } + + spin_lock(&ci->i_ceph_lock); + /* request Fx cap. if have Fx, we don't need to release Fs cap + * for later create/unlink. */ + __ceph_touch_fmode(ci, mdsc, CEPH_FILE_MODE_WR); + /* can we use the dcache? */ + if (ceph_test_mount_opt(fsc, DCACHE) && + !ceph_test_mount_opt(fsc, NOASYNCREADDIR) && + ceph_snap(inode) != CEPH_SNAPDIR && + __ceph_dir_is_complete_ordered(ci) && + __ceph_caps_issued_mask_metric(ci, CEPH_CAP_FILE_SHARED, 1)) { + int shared_gen = atomic_read(&ci->i_shared_gen); + + spin_unlock(&ci->i_ceph_lock); + err = __dcache_readdir(file, ctx, shared_gen); + if (err != -EAGAIN) + return err; + } else { + spin_unlock(&ci->i_ceph_lock); + } + + /* proceed with a normal readdir */ +more: + /* do we have the correct frag content buffered? */ + if (need_send_readdir(dfi, ctx->pos)) { + struct ceph_mds_request *req; + int op = ceph_snap(inode) == CEPH_SNAPDIR ? + CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR; + + /* discard old result, if any */ + if (dfi->last_readdir) { + ceph_mdsc_put_request(dfi->last_readdir); + dfi->last_readdir = NULL; + } + + if (is_hash_order(ctx->pos)) { + /* fragtree isn't always accurate. choose frag + * based on previous reply when possible. */ + if (frag == (unsigned)-1) + frag = ceph_choose_frag(ci, fpos_hash(ctx->pos), + NULL, NULL); + } else { + frag = fpos_frag(ctx->pos); + } + + dout("readdir fetching %llx.%llx frag %x offset '%s'\n", + ceph_vinop(inode), frag, dfi->last_name); + req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); + if (IS_ERR(req)) + return PTR_ERR(req); + err = ceph_alloc_readdir_reply_buffer(req, inode); + if (err) { + ceph_mdsc_put_request(req); + return err; + } + /* hints to request -> mds selection code */ + req->r_direct_mode = USE_AUTH_MDS; + if (op == CEPH_MDS_OP_READDIR) { + req->r_direct_hash = ceph_frag_value(frag); + __set_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags); + req->r_inode_drop = CEPH_CAP_FILE_EXCL; + } + if (dfi->last_name) { + req->r_path2 = kstrdup(dfi->last_name, GFP_KERNEL); + if (!req->r_path2) { + ceph_mdsc_put_request(req); + return -ENOMEM; + } + } else if (is_hash_order(ctx->pos)) { + req->r_args.readdir.offset_hash = + cpu_to_le32(fpos_hash(ctx->pos)); + } + + req->r_dir_release_cnt = dfi->dir_release_count; + req->r_dir_ordered_cnt = dfi->dir_ordered_count; + req->r_readdir_cache_idx = dfi->readdir_cache_idx; + req->r_readdir_offset = dfi->next_offset; + req->r_args.readdir.frag = cpu_to_le32(frag); + req->r_args.readdir.flags = + cpu_to_le16(CEPH_READDIR_REPLY_BITFLAGS); + + req->r_inode = inode; + ihold(inode); + req->r_dentry = dget(file->f_path.dentry); + err = ceph_mdsc_do_request(mdsc, NULL, req); + if (err < 0) { + ceph_mdsc_put_request(req); + return err; + } + dout("readdir got and parsed readdir result=%d on " + "frag %x, end=%d, complete=%d, hash_order=%d\n", + err, frag, + (int)req->r_reply_info.dir_end, + (int)req->r_reply_info.dir_complete, + (int)req->r_reply_info.hash_order); + + rinfo = &req->r_reply_info; + if (le32_to_cpu(rinfo->dir_dir->frag) != frag) { + frag = le32_to_cpu(rinfo->dir_dir->frag); + if (!rinfo->hash_order) { + dfi->next_offset = req->r_readdir_offset; + /* adjust ctx->pos to beginning of frag */ + ctx->pos = ceph_make_fpos(frag, + dfi->next_offset, + false); + } + } + + dfi->frag = frag; + dfi->last_readdir = req; + + if (test_bit(CEPH_MDS_R_DID_PREPOPULATE, &req->r_req_flags)) { + dfi->readdir_cache_idx = req->r_readdir_cache_idx; + if (dfi->readdir_cache_idx < 0) { + /* preclude from marking dir ordered */ + dfi->dir_ordered_count = 0; + } else if (ceph_frag_is_leftmost(frag) && + dfi->next_offset == 2) { + /* note dir version at start of readdir so + * we can tell if any dentries get dropped */ + dfi->dir_release_count = req->r_dir_release_cnt; + dfi->dir_ordered_count = req->r_dir_ordered_cnt; + } + } else { + dout("readdir !did_prepopulate\n"); + /* disable readdir cache */ + dfi->readdir_cache_idx = -1; + /* preclude from marking dir complete */ + dfi->dir_release_count = 0; + } + + /* note next offset and last dentry name */ + if (rinfo->dir_nr > 0) { + struct ceph_mds_reply_dir_entry *rde = + rinfo->dir_entries + (rinfo->dir_nr-1); + unsigned next_offset = req->r_reply_info.dir_end ? + 2 : (fpos_off(rde->offset) + 1); + err = note_last_dentry(dfi, rde->name, rde->name_len, + next_offset); + if (err) { + ceph_mdsc_put_request(dfi->last_readdir); + dfi->last_readdir = NULL; + return err; + } + } else if (req->r_reply_info.dir_end) { + dfi->next_offset = 2; + /* keep last name */ + } + } + + rinfo = &dfi->last_readdir->r_reply_info; + dout("readdir frag %x num %d pos %llx chunk first %llx\n", + dfi->frag, rinfo->dir_nr, ctx->pos, + rinfo->dir_nr ? rinfo->dir_entries[0].offset : 0LL); + + i = 0; + /* search start position */ + if (rinfo->dir_nr > 0) { + int step, nr = rinfo->dir_nr; + while (nr > 0) { + step = nr >> 1; + if (rinfo->dir_entries[i + step].offset < ctx->pos) { + i += step + 1; + nr -= step + 1; + } else { + nr = step; + } + } + } + for (; i < rinfo->dir_nr; i++) { + struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i; + + BUG_ON(rde->offset < ctx->pos); + + ctx->pos = rde->offset; + dout("readdir (%d/%d) -> %llx '%.*s' %p\n", + i, rinfo->dir_nr, ctx->pos, + rde->name_len, rde->name, &rde->inode.in); + + BUG_ON(!rde->inode.in); + + if (!dir_emit(ctx, rde->name, rde->name_len, + ceph_present_ino(inode->i_sb, le64_to_cpu(rde->inode.in->ino)), + le32_to_cpu(rde->inode.in->mode) >> 12)) { + /* + * NOTE: Here no need to put the 'dfi->last_readdir', + * because when dir_emit stops us it's most likely + * doesn't have enough memory, etc. So for next readdir + * it will continue. + */ + dout("filldir stopping us...\n"); + return 0; + } + ctx->pos++; + } + + ceph_mdsc_put_request(dfi->last_readdir); + dfi->last_readdir = NULL; + + if (dfi->next_offset > 2) { + frag = dfi->frag; + goto more; + } + + /* more frags? */ + if (!ceph_frag_is_rightmost(dfi->frag)) { + frag = ceph_frag_next(dfi->frag); + if (is_hash_order(ctx->pos)) { + loff_t new_pos = ceph_make_fpos(ceph_frag_value(frag), + dfi->next_offset, true); + if (new_pos > ctx->pos) + ctx->pos = new_pos; + /* keep last_name */ + } else { + ctx->pos = ceph_make_fpos(frag, dfi->next_offset, + false); + kfree(dfi->last_name); + dfi->last_name = NULL; + } + dout("readdir next frag is %x\n", frag); + goto more; + } + dfi->file_info.flags |= CEPH_F_ATEND; + + /* + * if dir_release_count still matches the dir, no dentries + * were released during the whole readdir, and we should have + * the complete dir contents in our cache. + */ + if (atomic64_read(&ci->i_release_count) == + dfi->dir_release_count) { + spin_lock(&ci->i_ceph_lock); + if (dfi->dir_ordered_count == + atomic64_read(&ci->i_ordered_count)) { + dout(" marking %p complete and ordered\n", inode); + /* use i_size to track number of entries in + * readdir cache */ + BUG_ON(dfi->readdir_cache_idx < 0); + i_size_write(inode, dfi->readdir_cache_idx * + sizeof(struct dentry*)); + } else { + dout(" marking %p complete\n", inode); + } + __ceph_dir_set_complete(ci, dfi->dir_release_count, + dfi->dir_ordered_count); + spin_unlock(&ci->i_ceph_lock); + } + + dout("readdir %p file %p done.\n", inode, file); + return 0; +} + +static void reset_readdir(struct ceph_dir_file_info *dfi) +{ + if (dfi->last_readdir) { + ceph_mdsc_put_request(dfi->last_readdir); + dfi->last_readdir = NULL; + } + kfree(dfi->last_name); + dfi->last_name = NULL; + dfi->dir_release_count = 0; + dfi->readdir_cache_idx = -1; + dfi->next_offset = 2; /* compensate for . and .. */ + dfi->file_info.flags &= ~CEPH_F_ATEND; +} + +/* + * discard buffered readdir content on seekdir(0), or seek to new frag, + * or seek prior to current chunk + */ +static bool need_reset_readdir(struct ceph_dir_file_info *dfi, loff_t new_pos) +{ + struct ceph_mds_reply_info_parsed *rinfo; + loff_t chunk_offset; + if (new_pos == 0) + return true; + if (is_hash_order(new_pos)) { + /* no need to reset last_name for a forward seek when + * dentries are sotred in hash order */ + } else if (dfi->frag != fpos_frag(new_pos)) { + return true; + } + rinfo = dfi->last_readdir ? &dfi->last_readdir->r_reply_info : NULL; + if (!rinfo || !rinfo->dir_nr) + return true; + chunk_offset = rinfo->dir_entries[0].offset; + return new_pos < chunk_offset || + is_hash_order(new_pos) != is_hash_order(chunk_offset); +} + +static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) +{ + struct ceph_dir_file_info *dfi = file->private_data; + struct inode *inode = file->f_mapping->host; + loff_t retval; + + inode_lock(inode); + retval = -EINVAL; + switch (whence) { + case SEEK_CUR: + offset += file->f_pos; + case SEEK_SET: + break; + case SEEK_END: + retval = -EOPNOTSUPP; + default: + goto out; + } + + if (offset >= 0) { + if (need_reset_readdir(dfi, offset)) { + dout("dir_llseek dropping %p content\n", file); + reset_readdir(dfi); + } else if (is_hash_order(offset) && offset > file->f_pos) { + /* for hash offset, we don't know if a forward seek + * is within same frag */ + dfi->dir_release_count = 0; + dfi->readdir_cache_idx = -1; + } + + if (offset != file->f_pos) { + file->f_pos = offset; + file->f_version = 0; + dfi->file_info.flags &= ~CEPH_F_ATEND; + } + retval = offset; + } +out: + inode_unlock(inode); + return retval; +} + +/* + * Handle lookups for the hidden .snap directory. + */ +int ceph_handle_snapdir(struct ceph_mds_request *req, + struct dentry *dentry, int err) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); + struct inode *parent = d_inode(dentry->d_parent); /* we hold i_mutex */ + + /* .snap dir? */ + if (err == -ENOENT && + ceph_snap(parent) == CEPH_NOSNAP && + strcmp(dentry->d_name.name, + fsc->mount_options->snapdir_name) == 0) { + struct inode *inode = ceph_get_snapdir(parent); + dout("ENOENT on snapdir %p '%pd', linking to snapdir %p\n", + dentry, dentry, inode); + BUG_ON(!d_unhashed(dentry)); + d_add(dentry, inode); + err = 0; + } + return err; +} + +/* + * Figure out final result of a lookup/open request. + * + * Mainly, make sure we return the final req->r_dentry (if it already + * existed) in place of the original VFS-provided dentry when they + * differ. + * + * Gracefully handle the case where the MDS replies with -ENOENT and + * no trace (which it may do, at its discretion, e.g., if it doesn't + * care to issue a lease on the negative dentry). + */ +struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, + struct dentry *dentry, int err) +{ + if (err == -ENOENT) { + /* no trace? */ + err = 0; + if (!req->r_reply_info.head->is_dentry) { + dout("ENOENT and no trace, dentry %p inode %p\n", + dentry, d_inode(dentry)); + if (d_really_is_positive(dentry)) { + d_drop(dentry); + err = -ENOENT; + } else { + d_add(dentry, NULL); + } + } + } + if (err) + dentry = ERR_PTR(err); + else if (dentry != req->r_dentry) + dentry = dget(req->r_dentry); /* we got spliced */ + else + dentry = NULL; + return dentry; +} + +static bool is_root_ceph_dentry(struct inode *inode, struct dentry *dentry) +{ + return ceph_ino(inode) == CEPH_INO_ROOT && + strncmp(dentry->d_name.name, ".ceph", 5) == 0; +} + +/* + * Look up a single dir entry. If there is a lookup intent, inform + * the MDS so that it gets our 'caps wanted' value in a single op. + */ +static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); + struct ceph_mds_request *req; + int op; + int mask; + int err; + + dout("lookup %p dentry %p '%pd'\n", + dir, dentry, dentry); + + if (dentry->d_name.len > NAME_MAX) + return ERR_PTR(-ENAMETOOLONG); + + /* can we conclude ENOENT locally? */ + if (d_really_is_negative(dentry)) { + struct ceph_inode_info *ci = ceph_inode(dir); + struct ceph_dentry_info *di = ceph_dentry(dentry); + + spin_lock(&ci->i_ceph_lock); + dout(" dir %p flags are 0x%lx\n", dir, ci->i_ceph_flags); + if (strncmp(dentry->d_name.name, + fsc->mount_options->snapdir_name, + dentry->d_name.len) && + !is_root_ceph_dentry(dir, dentry) && + ceph_test_mount_opt(fsc, DCACHE) && + __ceph_dir_is_complete(ci) && + __ceph_caps_issued_mask_metric(ci, CEPH_CAP_FILE_SHARED, 1)) { + __ceph_touch_fmode(ci, mdsc, CEPH_FILE_MODE_RD); + spin_unlock(&ci->i_ceph_lock); + dout(" dir %p complete, -ENOENT\n", dir); + d_add(dentry, NULL); + di->lease_shared_gen = atomic_read(&ci->i_shared_gen); + return NULL; + } + spin_unlock(&ci->i_ceph_lock); + } + + op = ceph_snap(dir) == CEPH_SNAPDIR ? + CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP; + req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS); + if (IS_ERR(req)) + return ERR_CAST(req); + req->r_dentry = dget(dentry); + req->r_num_caps = 2; + + mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED; + if (ceph_security_xattr_wanted(dir)) + mask |= CEPH_CAP_XATTR_SHARED; + req->r_args.getattr.mask = cpu_to_le32(mask); + + req->r_parent = dir; + set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); + err = ceph_mdsc_do_request(mdsc, NULL, req); + err = ceph_handle_snapdir(req, dentry, err); + dentry = ceph_finish_lookup(req, dentry, err); + ceph_mdsc_put_request(req); /* will dput(dentry) */ + dout("lookup result=%p\n", dentry); + return dentry; +} + +/* + * If we do a create but get no trace back from the MDS, follow up with + * a lookup (the VFS expects us to link up the provided dentry). + */ +int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry) +{ + struct dentry *result = ceph_lookup(dir, dentry, 0); + + if (result && !IS_ERR(result)) { + /* + * We created the item, then did a lookup, and found + * it was already linked to another inode we already + * had in our cache (and thus got spliced). To not + * confuse VFS (especially when inode is a directory), + * we don't link our dentry to that inode, return an + * error instead. + * + * This event should be rare and it happens only when + * we talk to old MDS. Recent MDS does not send traceless + * reply for request that creates new inode. + */ + d_drop(result); + return -ESTALE; + } + return PTR_ERR(result); +} + +static int ceph_mknod(struct inode *dir, struct dentry *dentry, + umode_t mode, dev_t rdev) +{ + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); + struct ceph_mds_request *req; + struct ceph_acl_sec_ctx as_ctx = {}; + int err; + + if (ceph_snap(dir) != CEPH_NOSNAP) + return -EROFS; + + if (ceph_quota_is_max_files_exceeded(dir)) { + err = -EDQUOT; + goto out; + } + + err = ceph_pre_init_acls(dir, &mode, &as_ctx); + if (err < 0) + goto out; + err = ceph_security_init_secctx(dentry, mode, &as_ctx); + if (err < 0) + goto out; + + dout("mknod in dir %p dentry %p mode 0%ho rdev %d\n", + dir, dentry, mode, rdev); + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_MKNOD, USE_AUTH_MDS); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto out; + } + req->r_dentry = dget(dentry); + req->r_num_caps = 2; + req->r_parent = dir; + set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); + req->r_args.mknod.mode = cpu_to_le32(mode); + req->r_args.mknod.rdev = cpu_to_le32(rdev); + req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL; + req->r_dentry_unless = CEPH_CAP_FILE_EXCL; + if (as_ctx.pagelist) { + req->r_pagelist = as_ctx.pagelist; + as_ctx.pagelist = NULL; + } + err = ceph_mdsc_do_request(mdsc, dir, req); + if (!err && !req->r_reply_info.head->is_dentry) + err = ceph_handle_notrace_create(dir, dentry); + ceph_mdsc_put_request(req); +out: + if (!err) + ceph_init_inode_acls(d_inode(dentry), &as_ctx); + else + d_drop(dentry); + ceph_release_acl_sec_ctx(&as_ctx); + return err; +} + +static int ceph_create(struct inode *dir, struct dentry *dentry, umode_t mode, + bool excl) +{ + return ceph_mknod(dir, dentry, mode, 0); +} + +static int ceph_symlink(struct inode *dir, struct dentry *dentry, + const char *dest) +{ + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); + struct ceph_mds_request *req; + struct ceph_acl_sec_ctx as_ctx = {}; + int err; + + if (ceph_snap(dir) != CEPH_NOSNAP) + return -EROFS; + + if (ceph_quota_is_max_files_exceeded(dir)) { + err = -EDQUOT; + goto out; + } + + err = ceph_security_init_secctx(dentry, S_IFLNK | 0777, &as_ctx); + if (err < 0) + goto out; + + dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest); + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto out; + } + req->r_path2 = kstrdup(dest, GFP_KERNEL); + if (!req->r_path2) { + err = -ENOMEM; + ceph_mdsc_put_request(req); + goto out; + } + req->r_parent = dir; + set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); + req->r_dentry = dget(dentry); + req->r_num_caps = 2; + req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL; + req->r_dentry_unless = CEPH_CAP_FILE_EXCL; + if (as_ctx.pagelist) { + req->r_pagelist = as_ctx.pagelist; + as_ctx.pagelist = NULL; + } + err = ceph_mdsc_do_request(mdsc, dir, req); + if (!err && !req->r_reply_info.head->is_dentry) + err = ceph_handle_notrace_create(dir, dentry); + ceph_mdsc_put_request(req); +out: + if (err) + d_drop(dentry); + ceph_release_acl_sec_ctx(&as_ctx); + return err; +} + +static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); + struct ceph_mds_request *req; + struct ceph_acl_sec_ctx as_ctx = {}; + int err = -EROFS; + int op; + + if (ceph_snap(dir) == CEPH_SNAPDIR) { + /* mkdir .snap/foo is a MKSNAP */ + op = CEPH_MDS_OP_MKSNAP; + dout("mksnap dir %p snap '%pd' dn %p\n", dir, + dentry, dentry); + } else if (ceph_snap(dir) == CEPH_NOSNAP) { + dout("mkdir dir %p dn %p mode 0%ho\n", dir, dentry, mode); + op = CEPH_MDS_OP_MKDIR; + } else { + goto out; + } + + if (op == CEPH_MDS_OP_MKDIR && + ceph_quota_is_max_files_exceeded(dir)) { + err = -EDQUOT; + goto out; + } + + mode |= S_IFDIR; + err = ceph_pre_init_acls(dir, &mode, &as_ctx); + if (err < 0) + goto out; + err = ceph_security_init_secctx(dentry, mode, &as_ctx); + if (err < 0) + goto out; + + req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto out; + } + + req->r_dentry = dget(dentry); + req->r_num_caps = 2; + req->r_parent = dir; + set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); + req->r_args.mkdir.mode = cpu_to_le32(mode); + req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL; + req->r_dentry_unless = CEPH_CAP_FILE_EXCL; + if (as_ctx.pagelist) { + req->r_pagelist = as_ctx.pagelist; + as_ctx.pagelist = NULL; + } + err = ceph_mdsc_do_request(mdsc, dir, req); + if (!err && + !req->r_reply_info.head->is_target && + !req->r_reply_info.head->is_dentry) + err = ceph_handle_notrace_create(dir, dentry); + ceph_mdsc_put_request(req); +out: + if (!err) + ceph_init_inode_acls(d_inode(dentry), &as_ctx); + else + d_drop(dentry); + ceph_release_acl_sec_ctx(&as_ctx); + return err; +} + +static int ceph_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); + struct ceph_mds_request *req; + int err; + + if (ceph_snap(dir) != CEPH_NOSNAP) + return -EROFS; + + dout("link in dir %p old_dentry %p dentry %p\n", dir, + old_dentry, dentry); + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LINK, USE_AUTH_MDS); + if (IS_ERR(req)) { + d_drop(dentry); + return PTR_ERR(req); + } + req->r_dentry = dget(dentry); + req->r_num_caps = 2; + req->r_old_dentry = dget(old_dentry); + req->r_parent = dir; + set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); + req->r_dentry_drop = CEPH_CAP_FILE_SHARED; + req->r_dentry_unless = CEPH_CAP_FILE_EXCL; + /* release LINK_SHARED on source inode (mds will lock it) */ + req->r_old_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL; + err = ceph_mdsc_do_request(mdsc, dir, req); + if (err) { + d_drop(dentry); + } else if (!req->r_reply_info.head->is_dentry) { + ihold(d_inode(old_dentry)); + d_instantiate(dentry, d_inode(old_dentry)); + } + ceph_mdsc_put_request(req); + return err; +} + +static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req) +{ + int result = req->r_err ? req->r_err : + le32_to_cpu(req->r_reply_info.head->result); + + if (result == -EJUKEBOX) + goto out; + + /* If op failed, mark everyone involved for errors */ + if (result) { + int pathlen = 0; + u64 base = 0; + char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen, + &base, 0); + + /* mark error on parent + clear complete */ + mapping_set_error(req->r_parent->i_mapping, result); + ceph_dir_clear_complete(req->r_parent); + + /* drop the dentry -- we don't know its status */ + if (!d_unhashed(req->r_dentry)) + d_drop(req->r_dentry); + + /* mark inode itself for an error (since metadata is bogus) */ + mapping_set_error(req->r_old_inode->i_mapping, result); + + pr_warn("ceph: async unlink failure path=(%llx)%s result=%d!\n", + base, IS_ERR(path) ? "<<bad>>" : path, result); + ceph_mdsc_free_path(path, pathlen); + } +out: + iput(req->r_old_inode); + ceph_mdsc_release_dir_caps(req); +} + +static int get_caps_for_async_unlink(struct inode *dir, struct dentry *dentry) +{ + struct ceph_inode_info *ci = ceph_inode(dir); + struct ceph_dentry_info *di; + int got = 0, want = CEPH_CAP_FILE_EXCL | CEPH_CAP_DIR_UNLINK; + + spin_lock(&ci->i_ceph_lock); + if ((__ceph_caps_issued(ci, NULL) & want) == want) { + ceph_take_cap_refs(ci, want, false); + got = want; + } + spin_unlock(&ci->i_ceph_lock); + + /* If we didn't get anything, return 0 */ + if (!got) + return 0; + + spin_lock(&dentry->d_lock); + di = ceph_dentry(dentry); + /* + * - We are holding Fx, which implies Fs caps. + * - Only support async unlink for primary linkage + */ + if (atomic_read(&ci->i_shared_gen) != di->lease_shared_gen || + !(di->flags & CEPH_DENTRY_PRIMARY_LINK)) + want = 0; + spin_unlock(&dentry->d_lock); + + /* Do we still want what we've got? */ + if (want == got) + return got; + + ceph_put_cap_refs(ci, got); + return 0; +} + +/* + * rmdir and unlink are differ only by the metadata op code + */ +static int ceph_unlink(struct inode *dir, struct dentry *dentry) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; + struct inode *inode = d_inode(dentry); + struct ceph_mds_request *req; + bool try_async = ceph_test_mount_opt(fsc, ASYNC_DIROPS); + int err = -EROFS; + int op; + + if (ceph_snap(dir) == CEPH_SNAPDIR) { + /* rmdir .snap/foo is RMSNAP */ + dout("rmsnap dir %p '%pd' dn %p\n", dir, dentry, dentry); + op = CEPH_MDS_OP_RMSNAP; + } else if (ceph_snap(dir) == CEPH_NOSNAP) { + dout("unlink/rmdir dir %p dn %p inode %p\n", + dir, dentry, inode); + op = d_is_dir(dentry) ? + CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK; + } else + goto out; +retry: + req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto out; + } + req->r_dentry = dget(dentry); + req->r_num_caps = 2; + req->r_parent = dir; + req->r_dentry_drop = CEPH_CAP_FILE_SHARED; + req->r_dentry_unless = CEPH_CAP_FILE_EXCL; + req->r_inode_drop = ceph_drop_caps_for_unlink(inode); + + if (try_async && op == CEPH_MDS_OP_UNLINK && + (req->r_dir_caps = get_caps_for_async_unlink(dir, dentry))) { + dout("async unlink on %llu/%.*s caps=%s", ceph_ino(dir), + dentry->d_name.len, dentry->d_name.name, + ceph_cap_string(req->r_dir_caps)); + set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags); + req->r_callback = ceph_async_unlink_cb; + req->r_old_inode = d_inode(dentry); + ihold(req->r_old_inode); + err = ceph_mdsc_submit_request(mdsc, dir, req); + if (!err) { + /* + * We have enough caps, so we assume that the unlink + * will succeed. Fix up the target inode and dcache. + */ + drop_nlink(inode); + d_delete(dentry); + } else if (err == -EJUKEBOX) { + try_async = false; + ceph_mdsc_put_request(req); + goto retry; + } + } else { + set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); + err = ceph_mdsc_do_request(mdsc, dir, req); + if (!err && !req->r_reply_info.head->is_dentry) + d_delete(dentry); + } + + ceph_mdsc_put_request(req); +out: + return err; +} + +static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) +{ + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(old_dir->i_sb); + struct ceph_mds_request *req; + int op = CEPH_MDS_OP_RENAME; + int err; + + if (flags) + return -EINVAL; + + if (ceph_snap(old_dir) != ceph_snap(new_dir)) + return -EXDEV; + if (ceph_snap(old_dir) != CEPH_NOSNAP) { + if (old_dir == new_dir && ceph_snap(old_dir) == CEPH_SNAPDIR) + op = CEPH_MDS_OP_RENAMESNAP; + else + return -EROFS; + } else if (old_dir != new_dir) { + err = ceph_quota_check_rename(mdsc, d_inode(old_dentry), + new_dir); + if (err) + return err; + } + + dout("rename dir %p dentry %p to dir %p dentry %p\n", + old_dir, old_dentry, new_dir, new_dentry); + req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); + if (IS_ERR(req)) + return PTR_ERR(req); + ihold(old_dir); + req->r_dentry = dget(new_dentry); + req->r_num_caps = 2; + req->r_old_dentry = dget(old_dentry); + req->r_old_dentry_dir = old_dir; + req->r_parent = new_dir; + set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); + req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED; + req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL; + req->r_dentry_drop = CEPH_CAP_FILE_SHARED; + req->r_dentry_unless = CEPH_CAP_FILE_EXCL; + /* release LINK_RDCACHE on source inode (mds will lock it) */ + req->r_old_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL; + if (d_really_is_positive(new_dentry)) { + req->r_inode_drop = + ceph_drop_caps_for_unlink(d_inode(new_dentry)); + } + err = ceph_mdsc_do_request(mdsc, old_dir, req); + if (!err && !req->r_reply_info.head->is_dentry) { + /* + * Normally d_move() is done by fill_trace (called by + * do_request, above). If there is no trace, we need + * to do it here. + */ + d_move(old_dentry, new_dentry); + } + ceph_mdsc_put_request(req); + return err; +} + +/* + * Move dentry to tail of mdsc->dentry_leases list when lease is updated. + * Leases at front of the list will expire first. (Assume all leases have + * similar duration) + * + * Called under dentry->d_lock. + */ +void __ceph_dentry_lease_touch(struct ceph_dentry_info *di) +{ + struct dentry *dn = di->dentry; + struct ceph_mds_client *mdsc; + + dout("dentry_lease_touch %p %p '%pd'\n", di, dn, dn); + + di->flags |= CEPH_DENTRY_LEASE_LIST; + if (di->flags & CEPH_DENTRY_SHRINK_LIST) { + di->flags |= CEPH_DENTRY_REFERENCED; + return; + } + + mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; + spin_lock(&mdsc->dentry_list_lock); + list_move_tail(&di->lease_list, &mdsc->dentry_leases); + spin_unlock(&mdsc->dentry_list_lock); +} + +static void __dentry_dir_lease_touch(struct ceph_mds_client* mdsc, + struct ceph_dentry_info *di) +{ + di->flags &= ~(CEPH_DENTRY_LEASE_LIST | CEPH_DENTRY_REFERENCED); + di->lease_gen = 0; + di->time = jiffies; + list_move_tail(&di->lease_list, &mdsc->dentry_dir_leases); +} + +/* + * When dir lease is used, add dentry to tail of mdsc->dentry_dir_leases + * list if it's not in the list, otherwise set 'referenced' flag. + * + * Called under dentry->d_lock. + */ +void __ceph_dentry_dir_lease_touch(struct ceph_dentry_info *di) +{ + struct dentry *dn = di->dentry; + struct ceph_mds_client *mdsc; + + dout("dentry_dir_lease_touch %p %p '%pd' (offset 0x%llx)\n", + di, dn, dn, di->offset); + + if (!list_empty(&di->lease_list)) { + if (di->flags & CEPH_DENTRY_LEASE_LIST) { + /* don't remove dentry from dentry lease list + * if its lease is valid */ + if (__dentry_lease_is_valid(di)) + return; + } else { + di->flags |= CEPH_DENTRY_REFERENCED; + return; + } + } + + if (di->flags & CEPH_DENTRY_SHRINK_LIST) { + di->flags |= CEPH_DENTRY_REFERENCED; + di->flags &= ~CEPH_DENTRY_LEASE_LIST; + return; + } + + mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; + spin_lock(&mdsc->dentry_list_lock); + __dentry_dir_lease_touch(mdsc, di), + spin_unlock(&mdsc->dentry_list_lock); +} + +static void __dentry_lease_unlist(struct ceph_dentry_info *di) +{ + struct ceph_mds_client *mdsc; + if (di->flags & CEPH_DENTRY_SHRINK_LIST) + return; + if (list_empty(&di->lease_list)) + return; + + mdsc = ceph_sb_to_client(di->dentry->d_sb)->mdsc; + spin_lock(&mdsc->dentry_list_lock); + list_del_init(&di->lease_list); + spin_unlock(&mdsc->dentry_list_lock); +} + +enum { + KEEP = 0, + DELETE = 1, + TOUCH = 2, + STOP = 4, +}; + +struct ceph_lease_walk_control { + bool dir_lease; + bool expire_dir_lease; + unsigned long nr_to_scan; + unsigned long dir_lease_ttl; +}; + +static unsigned long +__dentry_leases_walk(struct ceph_mds_client *mdsc, + struct ceph_lease_walk_control *lwc, + int (*check)(struct dentry*, void*)) +{ + struct ceph_dentry_info *di, *tmp; + struct dentry *dentry, *last = NULL; + struct list_head* list; + LIST_HEAD(dispose); + unsigned long freed = 0; + int ret = 0; + + list = lwc->dir_lease ? &mdsc->dentry_dir_leases : &mdsc->dentry_leases; + spin_lock(&mdsc->dentry_list_lock); + list_for_each_entry_safe(di, tmp, list, lease_list) { + if (!lwc->nr_to_scan) + break; + --lwc->nr_to_scan; + + dentry = di->dentry; + if (last == dentry) + break; + + if (!spin_trylock(&dentry->d_lock)) + continue; + + if (__lockref_is_dead(&dentry->d_lockref)) { + list_del_init(&di->lease_list); + goto next; + } + + ret = check(dentry, lwc); + if (ret & TOUCH) { + /* move it into tail of dir lease list */ + __dentry_dir_lease_touch(mdsc, di); + if (!last) + last = dentry; + } + if (ret & DELETE) { + /* stale lease */ + di->flags &= ~CEPH_DENTRY_REFERENCED; + if (dentry->d_lockref.count > 0) { + /* update_dentry_lease() will re-add + * it to lease list, or + * ceph_d_delete() will return 1 when + * last reference is dropped */ + list_del_init(&di->lease_list); + } else { + di->flags |= CEPH_DENTRY_SHRINK_LIST; + list_move_tail(&di->lease_list, &dispose); + dget_dlock(dentry); + } + } +next: + spin_unlock(&dentry->d_lock); + if (ret & STOP) + break; + } + spin_unlock(&mdsc->dentry_list_lock); + + while (!list_empty(&dispose)) { + di = list_first_entry(&dispose, struct ceph_dentry_info, + lease_list); + dentry = di->dentry; + spin_lock(&dentry->d_lock); + + list_del_init(&di->lease_list); + di->flags &= ~CEPH_DENTRY_SHRINK_LIST; + if (di->flags & CEPH_DENTRY_REFERENCED) { + spin_lock(&mdsc->dentry_list_lock); + if (di->flags & CEPH_DENTRY_LEASE_LIST) { + list_add_tail(&di->lease_list, + &mdsc->dentry_leases); + } else { + __dentry_dir_lease_touch(mdsc, di); + } + spin_unlock(&mdsc->dentry_list_lock); + } else { + freed++; + } + + spin_unlock(&dentry->d_lock); + /* ceph_d_delete() does the trick */ + dput(dentry); + } + return freed; +} + +static int __dentry_lease_check(struct dentry *dentry, void *arg) +{ + struct ceph_dentry_info *di = ceph_dentry(dentry); + int ret; + + if (__dentry_lease_is_valid(di)) + return STOP; + ret = __dir_lease_try_check(dentry); + if (ret == -EBUSY) + return KEEP; + if (ret > 0) + return TOUCH; + return DELETE; +} + +static int __dir_lease_check(struct dentry *dentry, void *arg) +{ + struct ceph_lease_walk_control *lwc = arg; + struct ceph_dentry_info *di = ceph_dentry(dentry); + + int ret = __dir_lease_try_check(dentry); + if (ret == -EBUSY) + return KEEP; + if (ret > 0) { + if (time_before(jiffies, di->time + lwc->dir_lease_ttl)) + return STOP; + /* Move dentry to tail of dir lease list if we don't want + * to delete it. So dentries in the list are checked in a + * round robin manner */ + if (!lwc->expire_dir_lease) + return TOUCH; + if (dentry->d_lockref.count > 0 || + (di->flags & CEPH_DENTRY_REFERENCED)) + return TOUCH; + /* invalidate dir lease */ + di->lease_shared_gen = 0; + } + return DELETE; +} + +int ceph_trim_dentries(struct ceph_mds_client *mdsc) +{ + struct ceph_lease_walk_control lwc; + unsigned long count; + unsigned long freed; + + spin_lock(&mdsc->caps_list_lock); + if (mdsc->caps_use_max > 0 && + mdsc->caps_use_count > mdsc->caps_use_max) + count = mdsc->caps_use_count - mdsc->caps_use_max; + else + count = 0; + spin_unlock(&mdsc->caps_list_lock); + + lwc.dir_lease = false; + lwc.nr_to_scan = CEPH_CAPS_PER_RELEASE * 2; + freed = __dentry_leases_walk(mdsc, &lwc, __dentry_lease_check); + if (!lwc.nr_to_scan) /* more invalid leases */ + return -EAGAIN; + + if (lwc.nr_to_scan < CEPH_CAPS_PER_RELEASE) + lwc.nr_to_scan = CEPH_CAPS_PER_RELEASE; + + lwc.dir_lease = true; + lwc.expire_dir_lease = freed < count; + lwc.dir_lease_ttl = mdsc->fsc->mount_options->caps_wanted_delay_max * HZ; + freed +=__dentry_leases_walk(mdsc, &lwc, __dir_lease_check); + if (!lwc.nr_to_scan) /* more to check */ + return -EAGAIN; + + return freed > 0 ? 1 : 0; +} + +/* + * Ensure a dentry lease will no longer revalidate. + */ +void ceph_invalidate_dentry_lease(struct dentry *dentry) +{ + struct ceph_dentry_info *di = ceph_dentry(dentry); + spin_lock(&dentry->d_lock); + di->time = jiffies; + di->lease_shared_gen = 0; + di->flags &= ~CEPH_DENTRY_PRIMARY_LINK; + __dentry_lease_unlist(di); + spin_unlock(&dentry->d_lock); +} + +/* + * Check if dentry lease is valid. If not, delete the lease. Try to + * renew if the least is more than half up. + */ +static bool __dentry_lease_is_valid(struct ceph_dentry_info *di) +{ + struct ceph_mds_session *session; + + if (!di->lease_gen) + return false; + + session = di->lease_session; + if (session) { + u32 gen; + unsigned long ttl; + + spin_lock(&session->s_gen_ttl_lock); + gen = session->s_cap_gen; + ttl = session->s_cap_ttl; + spin_unlock(&session->s_gen_ttl_lock); + + if (di->lease_gen == gen && + time_before(jiffies, ttl) && + time_before(jiffies, di->time)) + return true; + } + di->lease_gen = 0; + return false; +} + +static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags) +{ + struct ceph_dentry_info *di; + struct ceph_mds_session *session = NULL; + u32 seq = 0; + int valid = 0; + + spin_lock(&dentry->d_lock); + di = ceph_dentry(dentry); + if (di && __dentry_lease_is_valid(di)) { + valid = 1; + + if (di->lease_renew_after && + time_after(jiffies, di->lease_renew_after)) { + /* + * We should renew. If we're in RCU walk mode + * though, we can't do that so just return + * -ECHILD. + */ + if (flags & LOOKUP_RCU) { + valid = -ECHILD; + } else { + session = ceph_get_mds_session(di->lease_session); + seq = di->lease_seq; + di->lease_renew_after = 0; + di->lease_renew_from = jiffies; + } + } + } + spin_unlock(&dentry->d_lock); + + if (session) { + ceph_mdsc_lease_send_msg(session, dentry, + CEPH_MDS_LEASE_RENEW, seq); + ceph_put_mds_session(session); + } + dout("dentry_lease_is_valid - dentry %p = %d\n", dentry, valid); + return valid; +} + +/* + * Called under dentry->d_lock. + */ +static int __dir_lease_try_check(const struct dentry *dentry) +{ + struct ceph_dentry_info *di = ceph_dentry(dentry); + struct inode *dir; + struct ceph_inode_info *ci; + int valid = 0; + + if (!di->lease_shared_gen) + return 0; + if (IS_ROOT(dentry)) + return 0; + + dir = d_inode(dentry->d_parent); + ci = ceph_inode(dir); + + if (spin_trylock(&ci->i_ceph_lock)) { + if (atomic_read(&ci->i_shared_gen) == di->lease_shared_gen && + __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 0)) + valid = 1; + spin_unlock(&ci->i_ceph_lock); + } else { + valid = -EBUSY; + } + + if (!valid) + di->lease_shared_gen = 0; + return valid; +} + +/* + * Check if directory-wide content lease/cap is valid. + */ +static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry, + struct ceph_mds_client *mdsc) +{ + struct ceph_inode_info *ci = ceph_inode(dir); + int valid; + int shared_gen; + + spin_lock(&ci->i_ceph_lock); + valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1); + if (valid) { + __ceph_touch_fmode(ci, mdsc, CEPH_FILE_MODE_RD); + shared_gen = atomic_read(&ci->i_shared_gen); + } + spin_unlock(&ci->i_ceph_lock); + if (valid) { + struct ceph_dentry_info *di; + spin_lock(&dentry->d_lock); + di = ceph_dentry(dentry); + if (dir == d_inode(dentry->d_parent) && + di && di->lease_shared_gen == shared_gen) + __ceph_dentry_dir_lease_touch(di); + else + valid = 0; + spin_unlock(&dentry->d_lock); + } + dout("dir_lease_is_valid dir %p v%u dentry %p = %d\n", + dir, (unsigned)atomic_read(&ci->i_shared_gen), dentry, valid); + return valid; +} + +/* + * Check if cached dentry can be trusted. + */ +static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) +{ + int valid = 0; + struct dentry *parent; + struct inode *dir, *inode; + struct ceph_mds_client *mdsc; + + if (flags & LOOKUP_RCU) { + parent = READ_ONCE(dentry->d_parent); + dir = d_inode_rcu(parent); + if (!dir) + return -ECHILD; + inode = d_inode_rcu(dentry); + } else { + parent = dget_parent(dentry); + dir = d_inode(parent); + inode = d_inode(dentry); + } + + dout("d_revalidate %p '%pd' inode %p offset 0x%llx\n", dentry, + dentry, inode, ceph_dentry(dentry)->offset); + + mdsc = ceph_sb_to_client(dir->i_sb)->mdsc; + + /* always trust cached snapped dentries, snapdir dentry */ + if (ceph_snap(dir) != CEPH_NOSNAP) { + dout("d_revalidate %p '%pd' inode %p is SNAPPED\n", dentry, + dentry, inode); + valid = 1; + } else if (inode && ceph_snap(inode) == CEPH_SNAPDIR) { + valid = 1; + } else { + valid = dentry_lease_is_valid(dentry, flags); + if (valid == -ECHILD) + return valid; + if (valid || dir_lease_is_valid(dir, dentry, mdsc)) { + if (inode) + valid = ceph_is_any_caps(inode); + else + valid = 1; + } + } + + if (!valid) { + struct ceph_mds_request *req; + int op, err; + u32 mask; + + if (flags & LOOKUP_RCU) + return -ECHILD; + + percpu_counter_inc(&mdsc->metric.d_lease_mis); + + op = ceph_snap(dir) == CEPH_SNAPDIR ? + CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP; + req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS); + if (!IS_ERR(req)) { + req->r_dentry = dget(dentry); + req->r_num_caps = 2; + req->r_parent = dir; + + mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED; + if (ceph_security_xattr_wanted(dir)) + mask |= CEPH_CAP_XATTR_SHARED; + req->r_args.getattr.mask = cpu_to_le32(mask); + + err = ceph_mdsc_do_request(mdsc, NULL, req); + switch (err) { + case 0: + if (d_really_is_positive(dentry) && + d_inode(dentry) == req->r_target_inode) + valid = 1; + break; + case -ENOENT: + if (d_really_is_negative(dentry)) + valid = 1; + fallthrough; + default: + break; + } + ceph_mdsc_put_request(req); + dout("d_revalidate %p lookup result=%d\n", + dentry, err); + } + } else { + percpu_counter_inc(&mdsc->metric.d_lease_hit); + } + + dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid"); + if (!valid) + ceph_dir_clear_complete(dir); + + if (!(flags & LOOKUP_RCU)) + dput(parent); + return valid; +} + +/* + * Delete unused dentry that doesn't have valid lease + * + * Called under dentry->d_lock. + */ +static int ceph_d_delete(const struct dentry *dentry) +{ + struct ceph_dentry_info *di; + + /* won't release caps */ + if (d_really_is_negative(dentry)) + return 0; + if (ceph_snap(d_inode(dentry)) != CEPH_NOSNAP) + return 0; + /* vaild lease? */ + di = ceph_dentry(dentry); + if (di) { + if (__dentry_lease_is_valid(di)) + return 0; + if (__dir_lease_try_check(dentry)) + return 0; + } + return 1; +} + +/* + * Release our ceph_dentry_info. + */ +static void ceph_d_release(struct dentry *dentry) +{ + struct ceph_dentry_info *di = ceph_dentry(dentry); + struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); + + dout("d_release %p\n", dentry); + + atomic64_dec(&fsc->mdsc->metric.total_dentries); + + spin_lock(&dentry->d_lock); + __dentry_lease_unlist(di); + dentry->d_fsdata = NULL; + spin_unlock(&dentry->d_lock); + + ceph_put_mds_session(di->lease_session); + kmem_cache_free(ceph_dentry_cachep, di); +} + +/* + * When the VFS prunes a dentry from the cache, we need to clear the + * complete flag on the parent directory. + * + * Called under dentry->d_lock. + */ +static void ceph_d_prune(struct dentry *dentry) +{ + struct ceph_inode_info *dir_ci; + struct ceph_dentry_info *di; + + dout("ceph_d_prune %pd %p\n", dentry, dentry); + + /* do we have a valid parent? */ + if (IS_ROOT(dentry)) + return; + + /* we hold d_lock, so d_parent is stable */ + dir_ci = ceph_inode(d_inode(dentry->d_parent)); + if (dir_ci->i_vino.snap == CEPH_SNAPDIR) + return; + + /* who calls d_delete() should also disable dcache readdir */ + if (d_really_is_negative(dentry)) + return; + + /* d_fsdata does not get cleared until d_release */ + if (!d_unhashed(dentry)) { + __ceph_dir_clear_complete(dir_ci); + return; + } + + /* Disable dcache readdir just in case that someone called d_drop() + * or d_invalidate(), but MDS didn't revoke CEPH_CAP_FILE_SHARED + * properly (dcache readdir is still enabled) */ + di = ceph_dentry(dentry); + if (di->offset > 0 && + di->lease_shared_gen == atomic_read(&dir_ci->i_shared_gen)) + __ceph_dir_clear_ordered(dir_ci); +} + +/* + * read() on a dir. This weird interface hack only works if mounted + * with '-o dirstat'. + */ +static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, + loff_t *ppos) +{ + struct ceph_dir_file_info *dfi = file->private_data; + struct inode *inode = file_inode(file); + struct ceph_inode_info *ci = ceph_inode(inode); + int left; + const int bufsize = 1024; + + if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT)) + return -EISDIR; + + if (!dfi->dir_info) { + dfi->dir_info = kmalloc(bufsize, GFP_KERNEL); + if (!dfi->dir_info) + return -ENOMEM; + dfi->dir_info_len = + snprintf(dfi->dir_info, bufsize, + "entries: %20lld\n" + " files: %20lld\n" + " subdirs: %20lld\n" + "rentries: %20lld\n" + " rfiles: %20lld\n" + " rsubdirs: %20lld\n" + "rbytes: %20lld\n" + "rctime: %10lld.%09ld\n", + ci->i_files + ci->i_subdirs, + ci->i_files, + ci->i_subdirs, + ci->i_rfiles + ci->i_rsubdirs, + ci->i_rfiles, + ci->i_rsubdirs, + ci->i_rbytes, + ci->i_rctime.tv_sec, + ci->i_rctime.tv_nsec); + } + + if (*ppos >= dfi->dir_info_len) + return 0; + size = min_t(unsigned, size, dfi->dir_info_len-*ppos); + left = copy_to_user(buf, dfi->dir_info + *ppos, size); + if (left == size) + return -EFAULT; + *ppos += (size - left); + return size - left; +} + + + +/* + * Return name hash for a given dentry. This is dependent on + * the parent directory's hash function. + */ +unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn) +{ + struct ceph_inode_info *dci = ceph_inode(dir); + unsigned hash; + + switch (dci->i_dir_layout.dl_dir_hash) { + case 0: /* for backward compat */ + case CEPH_STR_HASH_LINUX: + return dn->d_name.hash; + + default: + spin_lock(&dn->d_lock); + hash = ceph_str_hash(dci->i_dir_layout.dl_dir_hash, + dn->d_name.name, dn->d_name.len); + spin_unlock(&dn->d_lock); + return hash; + } +} + +const struct file_operations ceph_dir_fops = { + .read = ceph_read_dir, + .iterate = ceph_readdir, + .llseek = ceph_dir_llseek, + .open = ceph_open, + .release = ceph_release, + .unlocked_ioctl = ceph_ioctl, + .compat_ioctl = compat_ptr_ioctl, + .fsync = ceph_fsync, + .lock = ceph_lock, + .flock = ceph_flock, +}; + +const struct file_operations ceph_snapdir_fops = { + .iterate = ceph_readdir, + .llseek = ceph_dir_llseek, + .open = ceph_open, + .release = ceph_release, +}; + +const struct inode_operations ceph_dir_iops = { + .lookup = ceph_lookup, + .permission = ceph_permission, + .getattr = ceph_getattr, + .setattr = ceph_setattr, + .listxattr = ceph_listxattr, + .get_acl = ceph_get_acl, + .set_acl = ceph_set_acl, + .mknod = ceph_mknod, + .symlink = ceph_symlink, + .mkdir = ceph_mkdir, + .link = ceph_link, + .unlink = ceph_unlink, + .rmdir = ceph_unlink, + .rename = ceph_rename, + .create = ceph_create, + .atomic_open = ceph_atomic_open, +}; + +const struct inode_operations ceph_snapdir_iops = { + .lookup = ceph_lookup, + .permission = ceph_permission, + .getattr = ceph_getattr, + .mkdir = ceph_mkdir, + .rmdir = ceph_unlink, + .rename = ceph_rename, +}; + +const struct dentry_operations ceph_dentry_ops = { + .d_revalidate = ceph_d_revalidate, + .d_delete = ceph_d_delete, + .d_release = ceph_d_release, + .d_prune = ceph_d_prune, + .d_init = ceph_d_init, +}; diff --git a/fs/ceph/export.c b/fs/ceph/export.c new file mode 100644 index 000000000..042bb4a02 --- /dev/null +++ b/fs/ceph/export.c @@ -0,0 +1,571 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ceph/ceph_debug.h> + +#include <linux/exportfs.h> +#include <linux/slab.h> +#include <asm/unaligned.h> + +#include "super.h" +#include "mds_client.h" + +/* + * Basic fh + */ +struct ceph_nfs_fh { + u64 ino; +} __attribute__ ((packed)); + +/* + * Larger fh that includes parent ino. + */ +struct ceph_nfs_confh { + u64 ino, parent_ino; +} __attribute__ ((packed)); + +/* + * fh for snapped inode + */ +struct ceph_nfs_snapfh { + u64 ino; + u64 snapid; + u64 parent_ino; + u32 hash; +} __attribute__ ((packed)); + +static int ceph_encode_snapfh(struct inode *inode, u32 *rawfh, int *max_len, + struct inode *parent_inode) +{ + static const int snap_handle_length = + sizeof(struct ceph_nfs_snapfh) >> 2; + struct ceph_nfs_snapfh *sfh = (void *)rawfh; + u64 snapid = ceph_snap(inode); + int ret; + bool no_parent = true; + + if (*max_len < snap_handle_length) { + *max_len = snap_handle_length; + ret = FILEID_INVALID; + goto out; + } + + ret = -EINVAL; + if (snapid != CEPH_SNAPDIR) { + struct inode *dir; + struct dentry *dentry = d_find_alias(inode); + if (!dentry) + goto out; + + rcu_read_lock(); + dir = d_inode_rcu(dentry->d_parent); + if (ceph_snap(dir) != CEPH_SNAPDIR) { + sfh->parent_ino = ceph_ino(dir); + sfh->hash = ceph_dentry_hash(dir, dentry); + no_parent = false; + } + rcu_read_unlock(); + dput(dentry); + } + + if (no_parent) { + if (!S_ISDIR(inode->i_mode)) + goto out; + sfh->parent_ino = sfh->ino; + sfh->hash = 0; + } + sfh->ino = ceph_ino(inode); + sfh->snapid = snapid; + + *max_len = snap_handle_length; + ret = FILEID_BTRFS_WITH_PARENT; +out: + dout("encode_snapfh %llx.%llx ret=%d\n", ceph_vinop(inode), ret); + return ret; +} + +static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len, + struct inode *parent_inode) +{ + static const int handle_length = + sizeof(struct ceph_nfs_fh) >> 2; + static const int connected_handle_length = + sizeof(struct ceph_nfs_confh) >> 2; + int type; + + if (ceph_snap(inode) != CEPH_NOSNAP) + return ceph_encode_snapfh(inode, rawfh, max_len, parent_inode); + + if (parent_inode && (*max_len < connected_handle_length)) { + *max_len = connected_handle_length; + return FILEID_INVALID; + } else if (*max_len < handle_length) { + *max_len = handle_length; + return FILEID_INVALID; + } + + if (parent_inode) { + struct ceph_nfs_confh *cfh = (void *)rawfh; + dout("encode_fh %llx with parent %llx\n", + ceph_ino(inode), ceph_ino(parent_inode)); + cfh->ino = ceph_ino(inode); + cfh->parent_ino = ceph_ino(parent_inode); + *max_len = connected_handle_length; + type = FILEID_INO32_GEN_PARENT; + } else { + struct ceph_nfs_fh *fh = (void *)rawfh; + dout("encode_fh %llx\n", ceph_ino(inode)); + fh->ino = ceph_ino(inode); + *max_len = handle_length; + type = FILEID_INO32_GEN; + } + return type; +} + +static struct inode *__lookup_inode(struct super_block *sb, u64 ino) +{ + struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc; + struct inode *inode; + struct ceph_vino vino; + int err; + + vino.ino = ino; + vino.snap = CEPH_NOSNAP; + + if (ceph_vino_is_reserved(vino)) + return ERR_PTR(-ESTALE); + + inode = ceph_find_inode(sb, vino); + if (!inode) { + struct ceph_mds_request *req; + int mask; + + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPINO, + USE_ANY_MDS); + if (IS_ERR(req)) + return ERR_CAST(req); + + mask = CEPH_STAT_CAP_INODE; + if (ceph_security_xattr_wanted(d_inode(sb->s_root))) + mask |= CEPH_CAP_XATTR_SHARED; + req->r_args.lookupino.mask = cpu_to_le32(mask); + + req->r_ino1 = vino; + req->r_num_caps = 1; + err = ceph_mdsc_do_request(mdsc, NULL, req); + inode = req->r_target_inode; + if (inode) + ihold(inode); + ceph_mdsc_put_request(req); + if (!inode) + return err < 0 ? ERR_PTR(err) : ERR_PTR(-ESTALE); + } + return inode; +} + +struct inode *ceph_lookup_inode(struct super_block *sb, u64 ino) +{ + struct inode *inode = __lookup_inode(sb, ino); + if (IS_ERR(inode)) + return inode; + if (inode->i_nlink == 0) { + iput(inode); + return ERR_PTR(-ESTALE); + } + return inode; +} + +static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino) +{ + struct inode *inode = __lookup_inode(sb, ino); + int err; + + if (IS_ERR(inode)) + return ERR_CAST(inode); + /* We need LINK caps to reliably check i_nlink */ + err = ceph_do_getattr(inode, CEPH_CAP_LINK_SHARED, false); + if (err) { + iput(inode); + return ERR_PTR(err); + } + /* -ESTALE if inode as been unlinked and no file is open */ + if ((inode->i_nlink == 0) && (atomic_read(&inode->i_count) == 1)) { + iput(inode); + return ERR_PTR(-ESTALE); + } + return d_obtain_alias(inode); +} + +static struct dentry *__snapfh_to_dentry(struct super_block *sb, + struct ceph_nfs_snapfh *sfh, + bool want_parent) +{ + struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc; + struct ceph_mds_request *req; + struct inode *inode; + struct ceph_vino vino; + int mask; + int err; + bool unlinked = false; + + if (want_parent) { + vino.ino = sfh->parent_ino; + if (sfh->snapid == CEPH_SNAPDIR) + vino.snap = CEPH_NOSNAP; + else if (sfh->ino == sfh->parent_ino) + vino.snap = CEPH_SNAPDIR; + else + vino.snap = sfh->snapid; + } else { + vino.ino = sfh->ino; + vino.snap = sfh->snapid; + } + + if (ceph_vino_is_reserved(vino)) + return ERR_PTR(-ESTALE); + + inode = ceph_find_inode(sb, vino); + if (inode) + return d_obtain_alias(inode); + + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPINO, + USE_ANY_MDS); + if (IS_ERR(req)) + return ERR_CAST(req); + + mask = CEPH_STAT_CAP_INODE; + if (ceph_security_xattr_wanted(d_inode(sb->s_root))) + mask |= CEPH_CAP_XATTR_SHARED; + req->r_args.lookupino.mask = cpu_to_le32(mask); + if (vino.snap < CEPH_NOSNAP) { + req->r_args.lookupino.snapid = cpu_to_le64(vino.snap); + if (!want_parent && sfh->ino != sfh->parent_ino) { + req->r_args.lookupino.parent = + cpu_to_le64(sfh->parent_ino); + req->r_args.lookupino.hash = + cpu_to_le32(sfh->hash); + } + } + + req->r_ino1 = vino; + req->r_num_caps = 1; + err = ceph_mdsc_do_request(mdsc, NULL, req); + inode = req->r_target_inode; + if (inode) { + if (vino.snap == CEPH_SNAPDIR) { + if (inode->i_nlink == 0) + unlinked = true; + inode = ceph_get_snapdir(inode); + } else if (ceph_snap(inode) == vino.snap) { + ihold(inode); + } else { + /* mds does not support lookup snapped inode */ + err = -EOPNOTSUPP; + inode = NULL; + } + } + ceph_mdsc_put_request(req); + + if (want_parent) { + dout("snapfh_to_parent %llx.%llx\n err=%d\n", + vino.ino, vino.snap, err); + } else { + dout("snapfh_to_dentry %llx.%llx parent %llx hash %x err=%d", + vino.ino, vino.snap, sfh->parent_ino, sfh->hash, err); + } + if (!inode) + return ERR_PTR(-ESTALE); + /* see comments in ceph_get_parent() */ + return unlinked ? d_obtain_root(inode) : d_obtain_alias(inode); +} + +/* + * convert regular fh to dentry + */ +static struct dentry *ceph_fh_to_dentry(struct super_block *sb, + struct fid *fid, + int fh_len, int fh_type) +{ + struct ceph_nfs_fh *fh = (void *)fid->raw; + + if (fh_type == FILEID_BTRFS_WITH_PARENT) { + struct ceph_nfs_snapfh *sfh = (void *)fid->raw; + return __snapfh_to_dentry(sb, sfh, false); + } + + if (fh_type != FILEID_INO32_GEN && + fh_type != FILEID_INO32_GEN_PARENT) + return NULL; + if (fh_len < sizeof(*fh) / 4) + return NULL; + + dout("fh_to_dentry %llx\n", fh->ino); + return __fh_to_dentry(sb, fh->ino); +} + +static struct dentry *__get_parent(struct super_block *sb, + struct dentry *child, u64 ino) +{ + struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc; + struct ceph_mds_request *req; + struct inode *inode; + int mask; + int err; + + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPPARENT, + USE_ANY_MDS); + if (IS_ERR(req)) + return ERR_CAST(req); + + if (child) { + req->r_inode = d_inode(child); + ihold(d_inode(child)); + } else { + req->r_ino1 = (struct ceph_vino) { + .ino = ino, + .snap = CEPH_NOSNAP, + }; + } + + mask = CEPH_STAT_CAP_INODE; + if (ceph_security_xattr_wanted(d_inode(sb->s_root))) + mask |= CEPH_CAP_XATTR_SHARED; + req->r_args.getattr.mask = cpu_to_le32(mask); + + req->r_num_caps = 1; + err = ceph_mdsc_do_request(mdsc, NULL, req); + if (err) { + ceph_mdsc_put_request(req); + return ERR_PTR(err); + } + + inode = req->r_target_inode; + if (inode) + ihold(inode); + ceph_mdsc_put_request(req); + if (!inode) + return ERR_PTR(-ENOENT); + + return d_obtain_alias(inode); +} + +static struct dentry *ceph_get_parent(struct dentry *child) +{ + struct inode *inode = d_inode(child); + struct dentry *dn; + + if (ceph_snap(inode) != CEPH_NOSNAP) { + struct inode* dir; + bool unlinked = false; + /* do not support non-directory */ + if (!d_is_dir(child)) { + dn = ERR_PTR(-EINVAL); + goto out; + } + dir = __lookup_inode(inode->i_sb, ceph_ino(inode)); + if (IS_ERR(dir)) { + dn = ERR_CAST(dir); + goto out; + } + /* There can be multiple paths to access snapped inode. + * For simplicity, treat snapdir of head inode as parent */ + if (ceph_snap(inode) != CEPH_SNAPDIR) { + struct inode *snapdir = ceph_get_snapdir(dir); + if (dir->i_nlink == 0) + unlinked = true; + iput(dir); + if (IS_ERR(snapdir)) { + dn = ERR_CAST(snapdir); + goto out; + } + dir = snapdir; + } + /* If directory has already been deleted, futher get_parent + * will fail. Do not mark snapdir dentry as disconnected, + * this prevent exportfs from doing futher get_parent. */ + if (unlinked) + dn = d_obtain_root(dir); + else + dn = d_obtain_alias(dir); + } else { + dn = __get_parent(child->d_sb, child, 0); + } +out: + dout("get_parent %p ino %llx.%llx err=%ld\n", + child, ceph_vinop(inode), (long)PTR_ERR_OR_ZERO(dn)); + return dn; +} + +/* + * convert regular fh to parent + */ +static struct dentry *ceph_fh_to_parent(struct super_block *sb, + struct fid *fid, + int fh_len, int fh_type) +{ + struct ceph_nfs_confh *cfh = (void *)fid->raw; + struct dentry *dentry; + + if (fh_type == FILEID_BTRFS_WITH_PARENT) { + struct ceph_nfs_snapfh *sfh = (void *)fid->raw; + return __snapfh_to_dentry(sb, sfh, true); + } + + if (fh_type != FILEID_INO32_GEN_PARENT) + return NULL; + if (fh_len < sizeof(*cfh) / 4) + return NULL; + + dout("fh_to_parent %llx\n", cfh->parent_ino); + dentry = __get_parent(sb, NULL, cfh->ino); + if (unlikely(dentry == ERR_PTR(-ENOENT))) + dentry = __fh_to_dentry(sb, cfh->parent_ino); + return dentry; +} + +static int __get_snap_name(struct dentry *parent, char *name, + struct dentry *child) +{ + struct inode *inode = d_inode(child); + struct inode *dir = d_inode(parent); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_mds_request *req = NULL; + char *last_name = NULL; + unsigned next_offset = 2; + int err = -EINVAL; + + if (ceph_ino(inode) != ceph_ino(dir)) + goto out; + if (ceph_snap(inode) == CEPH_SNAPDIR) { + if (ceph_snap(dir) == CEPH_NOSNAP) { + strcpy(name, fsc->mount_options->snapdir_name); + err = 0; + } + goto out; + } + if (ceph_snap(dir) != CEPH_SNAPDIR) + goto out; + + while (1) { + struct ceph_mds_reply_info_parsed *rinfo; + struct ceph_mds_reply_dir_entry *rde; + int i; + + req = ceph_mdsc_create_request(fsc->mdsc, CEPH_MDS_OP_LSSNAP, + USE_AUTH_MDS); + if (IS_ERR(req)) { + err = PTR_ERR(req); + req = NULL; + goto out; + } + err = ceph_alloc_readdir_reply_buffer(req, inode); + if (err) + goto out; + + req->r_direct_mode = USE_AUTH_MDS; + req->r_readdir_offset = next_offset; + req->r_args.readdir.flags = + cpu_to_le16(CEPH_READDIR_REPLY_BITFLAGS); + if (last_name) { + req->r_path2 = last_name; + last_name = NULL; + } + + req->r_inode = dir; + ihold(dir); + req->r_dentry = dget(parent); + + inode_lock(dir); + err = ceph_mdsc_do_request(fsc->mdsc, NULL, req); + inode_unlock(dir); + + if (err < 0) + goto out; + + rinfo = &req->r_reply_info; + for (i = 0; i < rinfo->dir_nr; i++) { + rde = rinfo->dir_entries + i; + BUG_ON(!rde->inode.in); + if (ceph_snap(inode) == + le64_to_cpu(rde->inode.in->snapid)) { + memcpy(name, rde->name, rde->name_len); + name[rde->name_len] = '\0'; + err = 0; + goto out; + } + } + + if (rinfo->dir_end) + break; + + BUG_ON(rinfo->dir_nr <= 0); + rde = rinfo->dir_entries + (rinfo->dir_nr - 1); + next_offset += rinfo->dir_nr; + last_name = kstrndup(rde->name, rde->name_len, GFP_KERNEL); + if (!last_name) { + err = -ENOMEM; + goto out; + } + + ceph_mdsc_put_request(req); + req = NULL; + } + err = -ENOENT; +out: + if (req) + ceph_mdsc_put_request(req); + kfree(last_name); + dout("get_snap_name %p ino %llx.%llx err=%d\n", + child, ceph_vinop(inode), err); + return err; +} + +static int ceph_get_name(struct dentry *parent, char *name, + struct dentry *child) +{ + struct ceph_mds_client *mdsc; + struct ceph_mds_request *req; + struct inode *inode = d_inode(child); + int err; + + if (ceph_snap(inode) != CEPH_NOSNAP) + return __get_snap_name(parent, name, child); + + mdsc = ceph_inode_to_client(inode)->mdsc; + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPNAME, + USE_ANY_MDS); + if (IS_ERR(req)) + return PTR_ERR(req); + + inode_lock(d_inode(parent)); + + req->r_inode = inode; + ihold(inode); + req->r_ino2 = ceph_vino(d_inode(parent)); + req->r_parent = d_inode(parent); + set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); + req->r_num_caps = 2; + err = ceph_mdsc_do_request(mdsc, NULL, req); + + inode_unlock(d_inode(parent)); + + if (!err) { + struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; + memcpy(name, rinfo->dname, rinfo->dname_len); + name[rinfo->dname_len] = 0; + dout("get_name %p ino %llx.%llx name %s\n", + child, ceph_vinop(inode), name); + } else { + dout("get_name %p ino %llx.%llx err %d\n", + child, ceph_vinop(inode), err); + } + + ceph_mdsc_put_request(req); + return err; +} + +const struct export_operations ceph_export_ops = { + .encode_fh = ceph_encode_fh, + .fh_to_dentry = ceph_fh_to_dentry, + .fh_to_parent = ceph_fh_to_parent, + .get_parent = ceph_get_parent, + .get_name = ceph_get_name, +}; diff --git a/fs/ceph/file.c b/fs/ceph/file.c new file mode 100644 index 000000000..d4974c652 --- /dev/null +++ b/fs/ceph/file.c @@ -0,0 +1,2530 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ceph/ceph_debug.h> +#include <linux/ceph/striper.h> + +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/file.h> +#include <linux/mount.h> +#include <linux/namei.h> +#include <linux/writeback.h> +#include <linux/falloc.h> +#include <linux/iversion.h> +#include <linux/ktime.h> + +#include "super.h" +#include "mds_client.h" +#include "cache.h" +#include "io.h" +#include "metric.h" + +static __le32 ceph_flags_sys2wire(u32 flags) +{ + u32 wire_flags = 0; + + switch (flags & O_ACCMODE) { + case O_RDONLY: + wire_flags |= CEPH_O_RDONLY; + break; + case O_WRONLY: + wire_flags |= CEPH_O_WRONLY; + break; + case O_RDWR: + wire_flags |= CEPH_O_RDWR; + break; + } + + flags &= ~O_ACCMODE; + +#define ceph_sys2wire(a) if (flags & a) { wire_flags |= CEPH_##a; flags &= ~a; } + + ceph_sys2wire(O_CREAT); + ceph_sys2wire(O_EXCL); + ceph_sys2wire(O_TRUNC); + ceph_sys2wire(O_DIRECTORY); + ceph_sys2wire(O_NOFOLLOW); + +#undef ceph_sys2wire + + if (flags) + dout("unused open flags: %x\n", flags); + + return cpu_to_le32(wire_flags); +} + +/* + * Ceph file operations + * + * Implement basic open/close functionality, and implement + * read/write. + * + * We implement three modes of file I/O: + * - buffered uses the generic_file_aio_{read,write} helpers + * + * - synchronous is used when there is multi-client read/write + * sharing, avoids the page cache, and synchronously waits for an + * ack from the OSD. + * + * - direct io takes the variant of the sync path that references + * user pages directly. + * + * fsync() flushes and waits on dirty pages, but just queues metadata + * for writeback: since the MDS can recover size and mtime there is no + * need to wait for MDS acknowledgement. + */ + +/* + * How many pages to get in one call to iov_iter_get_pages(). This + * determines the size of the on-stack array used as a buffer. + */ +#define ITER_GET_BVECS_PAGES 64 + +static ssize_t __iter_get_bvecs(struct iov_iter *iter, size_t maxsize, + struct bio_vec *bvecs) +{ + size_t size = 0; + int bvec_idx = 0; + + if (maxsize > iov_iter_count(iter)) + maxsize = iov_iter_count(iter); + + while (size < maxsize) { + struct page *pages[ITER_GET_BVECS_PAGES]; + ssize_t bytes; + size_t start; + int idx = 0; + + bytes = iov_iter_get_pages(iter, pages, maxsize - size, + ITER_GET_BVECS_PAGES, &start); + if (bytes < 0) + return size ?: bytes; + + iov_iter_advance(iter, bytes); + size += bytes; + + for ( ; bytes; idx++, bvec_idx++) { + struct bio_vec bv = { + .bv_page = pages[idx], + .bv_len = min_t(int, bytes, PAGE_SIZE - start), + .bv_offset = start, + }; + + bvecs[bvec_idx] = bv; + bytes -= bv.bv_len; + start = 0; + } + } + + return size; +} + +/* + * iov_iter_get_pages() only considers one iov_iter segment, no matter + * what maxsize or maxpages are given. For ITER_BVEC that is a single + * page. + * + * Attempt to get up to @maxsize bytes worth of pages from @iter. + * Return the number of bytes in the created bio_vec array, or an error. + */ +static ssize_t iter_get_bvecs_alloc(struct iov_iter *iter, size_t maxsize, + struct bio_vec **bvecs, int *num_bvecs) +{ + struct bio_vec *bv; + size_t orig_count = iov_iter_count(iter); + ssize_t bytes; + int npages; + + iov_iter_truncate(iter, maxsize); + npages = iov_iter_npages(iter, INT_MAX); + iov_iter_reexpand(iter, orig_count); + + /* + * __iter_get_bvecs() may populate only part of the array -- zero it + * out. + */ + bv = kvmalloc_array(npages, sizeof(*bv), GFP_KERNEL | __GFP_ZERO); + if (!bv) + return -ENOMEM; + + bytes = __iter_get_bvecs(iter, maxsize, bv); + if (bytes < 0) { + /* + * No pages were pinned -- just free the array. + */ + kvfree(bv); + return bytes; + } + + *bvecs = bv; + *num_bvecs = npages; + return bytes; +} + +static void put_bvecs(struct bio_vec *bvecs, int num_bvecs, bool should_dirty) +{ + int i; + + for (i = 0; i < num_bvecs; i++) { + if (bvecs[i].bv_page) { + if (should_dirty) + set_page_dirty_lock(bvecs[i].bv_page); + put_page(bvecs[i].bv_page); + } + } + kvfree(bvecs); +} + +/* + * Prepare an open request. Preallocate ceph_cap to avoid an + * inopportune ENOMEM later. + */ +static struct ceph_mds_request * +prepare_open_request(struct super_block *sb, int flags, int create_mode) +{ + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb); + struct ceph_mds_request *req; + int want_auth = USE_ANY_MDS; + int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN; + + if (flags & (O_WRONLY|O_RDWR|O_CREAT|O_TRUNC)) + want_auth = USE_AUTH_MDS; + + req = ceph_mdsc_create_request(mdsc, op, want_auth); + if (IS_ERR(req)) + goto out; + req->r_fmode = ceph_flags_to_mode(flags); + req->r_args.open.flags = ceph_flags_sys2wire(flags); + req->r_args.open.mode = cpu_to_le32(create_mode); +out: + return req; +} + +static int ceph_init_file_info(struct inode *inode, struct file *file, + int fmode, bool isdir) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_file_info *fi; + + dout("%s %p %p 0%o (%s)\n", __func__, inode, file, + inode->i_mode, isdir ? "dir" : "regular"); + BUG_ON(inode->i_fop->release != ceph_release); + + if (isdir) { + struct ceph_dir_file_info *dfi = + kmem_cache_zalloc(ceph_dir_file_cachep, GFP_KERNEL); + if (!dfi) + return -ENOMEM; + + file->private_data = dfi; + fi = &dfi->file_info; + dfi->next_offset = 2; + dfi->readdir_cache_idx = -1; + } else { + fi = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL); + if (!fi) + return -ENOMEM; + + file->private_data = fi; + } + + ceph_get_fmode(ci, fmode, 1); + fi->fmode = fmode; + + spin_lock_init(&fi->rw_contexts_lock); + INIT_LIST_HEAD(&fi->rw_contexts); + fi->filp_gen = READ_ONCE(ceph_inode_to_client(inode)->filp_gen); + + return 0; +} + +/* + * initialize private struct file data. + * if we fail, clean up by dropping fmode reference on the ceph_inode + */ +static int ceph_init_file(struct inode *inode, struct file *file, int fmode) +{ + int ret = 0; + + switch (inode->i_mode & S_IFMT) { + case S_IFREG: + ceph_fscache_register_inode_cookie(inode); + ceph_fscache_file_set_cookie(inode, file); + fallthrough; + case S_IFDIR: + ret = ceph_init_file_info(inode, file, fmode, + S_ISDIR(inode->i_mode)); + break; + + case S_IFLNK: + dout("init_file %p %p 0%o (symlink)\n", inode, file, + inode->i_mode); + break; + + default: + dout("init_file %p %p 0%o (special)\n", inode, file, + inode->i_mode); + /* + * we need to drop the open ref now, since we don't + * have .release set to ceph_release. + */ + BUG_ON(inode->i_fop->release == ceph_release); + + /* call the proper open fop */ + ret = inode->i_fop->open(inode, file); + } + return ret; +} + +/* + * try renew caps after session gets killed. + */ +int ceph_renew_caps(struct inode *inode, int fmode) +{ + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mds_request *req; + int err, flags, wanted; + + spin_lock(&ci->i_ceph_lock); + __ceph_touch_fmode(ci, mdsc, fmode); + wanted = __ceph_caps_file_wanted(ci); + if (__ceph_is_any_real_caps(ci) && + (!(wanted & CEPH_CAP_ANY_WR) || ci->i_auth_cap)) { + int issued = __ceph_caps_issued(ci, NULL); + spin_unlock(&ci->i_ceph_lock); + dout("renew caps %p want %s issued %s updating mds_wanted\n", + inode, ceph_cap_string(wanted), ceph_cap_string(issued)); + ceph_check_caps(ci, 0, NULL); + return 0; + } + spin_unlock(&ci->i_ceph_lock); + + flags = 0; + if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR)) + flags = O_RDWR; + else if (wanted & CEPH_CAP_FILE_RD) + flags = O_RDONLY; + else if (wanted & CEPH_CAP_FILE_WR) + flags = O_WRONLY; +#ifdef O_LAZY + if (wanted & CEPH_CAP_FILE_LAZYIO) + flags |= O_LAZY; +#endif + + req = prepare_open_request(inode->i_sb, flags, 0); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto out; + } + + req->r_inode = inode; + ihold(inode); + req->r_num_caps = 1; + + err = ceph_mdsc_do_request(mdsc, NULL, req); + ceph_mdsc_put_request(req); +out: + dout("renew caps %p open result=%d\n", inode, err); + return err < 0 ? err : 0; +} + +/* + * If we already have the requisite capabilities, we can satisfy + * the open request locally (no need to request new caps from the + * MDS). We do, however, need to inform the MDS (asynchronously) + * if our wanted caps set expands. + */ +int ceph_open(struct inode *inode, struct file *file) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_request *req; + struct ceph_file_info *fi = file->private_data; + int err; + int flags, fmode, wanted; + + if (fi) { + dout("open file %p is already opened\n", file); + return 0; + } + + /* filter out O_CREAT|O_EXCL; vfs did that already. yuck. */ + flags = file->f_flags & ~(O_CREAT|O_EXCL); + if (S_ISDIR(inode->i_mode)) + flags = O_DIRECTORY; /* mds likes to know */ + + dout("open inode %p ino %llx.%llx file %p flags %d (%d)\n", inode, + ceph_vinop(inode), file, flags, file->f_flags); + fmode = ceph_flags_to_mode(flags); + wanted = ceph_caps_for_mode(fmode); + + /* snapped files are read-only */ + if (ceph_snap(inode) != CEPH_NOSNAP && (file->f_mode & FMODE_WRITE)) + return -EROFS; + + /* trivially open snapdir */ + if (ceph_snap(inode) == CEPH_SNAPDIR) { + return ceph_init_file(inode, file, fmode); + } + + /* + * No need to block if we have caps on the auth MDS (for + * write) or any MDS (for read). Update wanted set + * asynchronously. + */ + spin_lock(&ci->i_ceph_lock); + if (__ceph_is_any_real_caps(ci) && + (((fmode & CEPH_FILE_MODE_WR) == 0) || ci->i_auth_cap)) { + int mds_wanted = __ceph_caps_mds_wanted(ci, true); + int issued = __ceph_caps_issued(ci, NULL); + + dout("open %p fmode %d want %s issued %s using existing\n", + inode, fmode, ceph_cap_string(wanted), + ceph_cap_string(issued)); + __ceph_touch_fmode(ci, mdsc, fmode); + spin_unlock(&ci->i_ceph_lock); + + /* adjust wanted? */ + if ((issued & wanted) != wanted && + (mds_wanted & wanted) != wanted && + ceph_snap(inode) != CEPH_SNAPDIR) + ceph_check_caps(ci, 0, NULL); + + return ceph_init_file(inode, file, fmode); + } else if (ceph_snap(inode) != CEPH_NOSNAP && + (ci->i_snap_caps & wanted) == wanted) { + __ceph_touch_fmode(ci, mdsc, fmode); + spin_unlock(&ci->i_ceph_lock); + return ceph_init_file(inode, file, fmode); + } + + spin_unlock(&ci->i_ceph_lock); + + dout("open fmode %d wants %s\n", fmode, ceph_cap_string(wanted)); + req = prepare_open_request(inode->i_sb, flags, 0); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto out; + } + req->r_inode = inode; + ihold(inode); + + req->r_num_caps = 1; + err = ceph_mdsc_do_request(mdsc, NULL, req); + if (!err) + err = ceph_init_file(inode, file, req->r_fmode); + ceph_mdsc_put_request(req); + dout("open result=%d on %llx.%llx\n", err, ceph_vinop(inode)); +out: + return err; +} + +/* Clone the layout from a synchronous create, if the dir now has Dc caps */ +static void +cache_file_layout(struct inode *dst, struct inode *src) +{ + struct ceph_inode_info *cdst = ceph_inode(dst); + struct ceph_inode_info *csrc = ceph_inode(src); + + spin_lock(&cdst->i_ceph_lock); + if ((__ceph_caps_issued(cdst, NULL) & CEPH_CAP_DIR_CREATE) && + !ceph_file_layout_is_valid(&cdst->i_cached_layout)) { + memcpy(&cdst->i_cached_layout, &csrc->i_layout, + sizeof(cdst->i_cached_layout)); + rcu_assign_pointer(cdst->i_cached_layout.pool_ns, + ceph_try_get_string(csrc->i_layout.pool_ns)); + } + spin_unlock(&cdst->i_ceph_lock); +} + +/* + * Try to set up an async create. We need caps, a file layout, and inode number, + * and either a lease on the dentry or complete dir info. If any of those + * criteria are not satisfied, then return false and the caller can go + * synchronous. + */ +static int try_prep_async_create(struct inode *dir, struct dentry *dentry, + struct ceph_file_layout *lo, u64 *pino) +{ + struct ceph_inode_info *ci = ceph_inode(dir); + struct ceph_dentry_info *di = ceph_dentry(dentry); + int got = 0, want = CEPH_CAP_FILE_EXCL | CEPH_CAP_DIR_CREATE; + u64 ino; + + spin_lock(&ci->i_ceph_lock); + /* No auth cap means no chance for Dc caps */ + if (!ci->i_auth_cap) + goto no_async; + + /* Any delegated inos? */ + if (xa_empty(&ci->i_auth_cap->session->s_delegated_inos)) + goto no_async; + + if (!ceph_file_layout_is_valid(&ci->i_cached_layout)) + goto no_async; + + if ((__ceph_caps_issued(ci, NULL) & want) != want) + goto no_async; + + if (d_in_lookup(dentry)) { + if (!__ceph_dir_is_complete(ci)) + goto no_async; + spin_lock(&dentry->d_lock); + di->lease_shared_gen = atomic_read(&ci->i_shared_gen); + spin_unlock(&dentry->d_lock); + } else if (atomic_read(&ci->i_shared_gen) != + READ_ONCE(di->lease_shared_gen)) { + goto no_async; + } + + ino = ceph_get_deleg_ino(ci->i_auth_cap->session); + if (!ino) + goto no_async; + + *pino = ino; + ceph_take_cap_refs(ci, want, false); + memcpy(lo, &ci->i_cached_layout, sizeof(*lo)); + rcu_assign_pointer(lo->pool_ns, + ceph_try_get_string(ci->i_cached_layout.pool_ns)); + got = want; +no_async: + spin_unlock(&ci->i_ceph_lock); + return got; +} + +static void restore_deleg_ino(struct inode *dir, u64 ino) +{ + struct ceph_inode_info *ci = ceph_inode(dir); + struct ceph_mds_session *s = NULL; + + spin_lock(&ci->i_ceph_lock); + if (ci->i_auth_cap) + s = ceph_get_mds_session(ci->i_auth_cap->session); + spin_unlock(&ci->i_ceph_lock); + if (s) { + int err = ceph_restore_deleg_ino(s, ino); + if (err) + pr_warn("ceph: unable to restore delegated ino 0x%llx to session: %d\n", + ino, err); + ceph_put_mds_session(s); + } +} + +static void ceph_async_create_cb(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req) +{ + int result = req->r_err ? req->r_err : + le32_to_cpu(req->r_reply_info.head->result); + + if (result == -EJUKEBOX) + goto out; + + mapping_set_error(req->r_parent->i_mapping, result); + + if (result) { + struct dentry *dentry = req->r_dentry; + int pathlen = 0; + u64 base = 0; + char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen, + &base, 0); + + ceph_dir_clear_complete(req->r_parent); + if (!d_unhashed(dentry)) + d_drop(dentry); + + /* FIXME: start returning I/O errors on all accesses? */ + pr_warn("ceph: async create failure path=(%llx)%s result=%d!\n", + base, IS_ERR(path) ? "<<bad>>" : path, result); + ceph_mdsc_free_path(path, pathlen); + } + + if (req->r_target_inode) { + struct ceph_inode_info *ci = ceph_inode(req->r_target_inode); + u64 ino = ceph_vino(req->r_target_inode).ino; + + if (req->r_deleg_ino != ino) + pr_warn("%s: inode number mismatch! err=%d deleg_ino=0x%llx target=0x%llx\n", + __func__, req->r_err, req->r_deleg_ino, ino); + mapping_set_error(req->r_target_inode->i_mapping, result); + + spin_lock(&ci->i_ceph_lock); + if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) { + ci->i_ceph_flags &= ~CEPH_I_ASYNC_CREATE; + wake_up_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT); + } + ceph_kick_flushing_inode_caps(req->r_session, ci); + spin_unlock(&ci->i_ceph_lock); + } else { + pr_warn("%s: no req->r_target_inode for 0x%llx\n", __func__, + req->r_deleg_ino); + } +out: + ceph_mdsc_release_dir_caps(req); +} + +static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry, + struct file *file, umode_t mode, + struct ceph_mds_request *req, + struct ceph_acl_sec_ctx *as_ctx, + struct ceph_file_layout *lo) +{ + int ret; + char xattr_buf[4]; + struct ceph_mds_reply_inode in = { }; + struct ceph_mds_reply_info_in iinfo = { .in = &in }; + struct ceph_inode_info *ci = ceph_inode(dir); + struct inode *inode; + struct timespec64 now; + struct ceph_string *pool_ns; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); + struct ceph_vino vino = { .ino = req->r_deleg_ino, + .snap = CEPH_NOSNAP }; + + ktime_get_real_ts64(&now); + + inode = ceph_get_inode(dentry->d_sb, vino); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + iinfo.inline_version = CEPH_INLINE_NONE; + iinfo.change_attr = 1; + ceph_encode_timespec64(&iinfo.btime, &now); + + if (req->r_pagelist) { + iinfo.xattr_len = req->r_pagelist->length; + iinfo.xattr_data = req->r_pagelist->mapped_tail; + } else { + /* fake it */ + iinfo.xattr_len = ARRAY_SIZE(xattr_buf); + iinfo.xattr_data = xattr_buf; + memset(iinfo.xattr_data, 0, iinfo.xattr_len); + } + + in.ino = cpu_to_le64(vino.ino); + in.snapid = cpu_to_le64(CEPH_NOSNAP); + in.version = cpu_to_le64(1); // ??? + in.cap.caps = in.cap.wanted = cpu_to_le32(CEPH_CAP_ALL_FILE); + in.cap.cap_id = cpu_to_le64(1); + in.cap.realm = cpu_to_le64(ci->i_snap_realm->ino); + in.cap.flags = CEPH_CAP_FLAG_AUTH; + in.ctime = in.mtime = in.atime = iinfo.btime; + in.truncate_seq = cpu_to_le32(1); + in.truncate_size = cpu_to_le64(-1ULL); + in.xattr_version = cpu_to_le64(1); + in.uid = cpu_to_le32(from_kuid(&init_user_ns, current_fsuid())); + if (dir->i_mode & S_ISGID) { + in.gid = cpu_to_le32(from_kgid(&init_user_ns, dir->i_gid)); + + /* Directories always inherit the setgid bit. */ + if (S_ISDIR(mode)) + mode |= S_ISGID; + else if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP) && + !in_group_p(dir->i_gid) && + !capable_wrt_inode_uidgid(dir, CAP_FSETID)) + mode &= ~S_ISGID; + } else { + in.gid = cpu_to_le32(from_kgid(&init_user_ns, current_fsgid())); + } + in.mode = cpu_to_le32((u32)mode); + + in.nlink = cpu_to_le32(1); + in.max_size = cpu_to_le64(lo->stripe_unit); + + ceph_file_layout_to_legacy(lo, &in.layout); + /* lo is private, so pool_ns can't change */ + pool_ns = rcu_dereference_raw(lo->pool_ns); + if (pool_ns) { + iinfo.pool_ns_len = pool_ns->len; + iinfo.pool_ns_data = pool_ns->str; + } + + down_read(&mdsc->snap_rwsem); + ret = ceph_fill_inode(inode, NULL, &iinfo, NULL, req->r_session, + req->r_fmode, NULL); + up_read(&mdsc->snap_rwsem); + if (ret) { + dout("%s failed to fill inode: %d\n", __func__, ret); + ceph_dir_clear_complete(dir); + if (!d_unhashed(dentry)) + d_drop(dentry); + if (inode->i_state & I_NEW) + discard_new_inode(inode); + } else { + struct dentry *dn; + + dout("%s d_adding new inode 0x%llx to 0x%llx/%s\n", __func__, + vino.ino, ceph_ino(dir), dentry->d_name.name); + ceph_dir_clear_ordered(dir); + ceph_init_inode_acls(inode, as_ctx); + if (inode->i_state & I_NEW) { + /* + * If it's not I_NEW, then someone created this before + * we got here. Assume the server is aware of it at + * that point and don't worry about setting + * CEPH_I_ASYNC_CREATE. + */ + ceph_inode(inode)->i_ceph_flags = CEPH_I_ASYNC_CREATE; + unlock_new_inode(inode); + } + if (d_in_lookup(dentry) || d_really_is_negative(dentry)) { + if (!d_unhashed(dentry)) + d_drop(dentry); + dn = d_splice_alias(inode, dentry); + WARN_ON_ONCE(dn && dn != dentry); + } + file->f_mode |= FMODE_CREATED; + ret = finish_open(file, dentry, ceph_open); + } + return ret; +} + +/* + * Do a lookup + open with a single request. If we get a non-existent + * file or symlink, return 1 so the VFS can retry. + */ +int ceph_atomic_open(struct inode *dir, struct dentry *dentry, + struct file *file, unsigned flags, umode_t mode) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_request *req; + struct dentry *dn; + struct ceph_acl_sec_ctx as_ctx = {}; + bool try_async = ceph_test_mount_opt(fsc, ASYNC_DIROPS); + int mask; + int err; + + dout("atomic_open %p dentry %p '%pd' %s flags %d mode 0%o\n", + dir, dentry, dentry, + d_unhashed(dentry) ? "unhashed" : "hashed", flags, mode); + + if (dentry->d_name.len > NAME_MAX) + return -ENAMETOOLONG; + + /* + * Do not truncate the file, since atomic_open is called before the + * permission check. The caller will do the truncation afterward. + */ + flags &= ~O_TRUNC; + + if (flags & O_CREAT) { + if (ceph_quota_is_max_files_exceeded(dir)) + return -EDQUOT; + err = ceph_pre_init_acls(dir, &mode, &as_ctx); + if (err < 0) + return err; + err = ceph_security_init_secctx(dentry, mode, &as_ctx); + if (err < 0) + goto out_ctx; + /* Async create can't handle more than a page of xattrs */ + if (as_ctx.pagelist && + !list_is_singular(&as_ctx.pagelist->head)) + try_async = false; + } else if (!d_in_lookup(dentry)) { + /* If it's not being looked up, it's negative */ + return -ENOENT; + } +retry: + /* do the open */ + req = prepare_open_request(dir->i_sb, flags, mode); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto out_ctx; + } + req->r_dentry = dget(dentry); + req->r_num_caps = 2; + mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED; + if (ceph_security_xattr_wanted(dir)) + mask |= CEPH_CAP_XATTR_SHARED; + req->r_args.open.mask = cpu_to_le32(mask); + req->r_parent = dir; + + if (flags & O_CREAT) { + struct ceph_file_layout lo; + + req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL; + req->r_dentry_unless = CEPH_CAP_FILE_EXCL; + if (as_ctx.pagelist) { + req->r_pagelist = as_ctx.pagelist; + as_ctx.pagelist = NULL; + } + if (try_async && + (req->r_dir_caps = + try_prep_async_create(dir, dentry, &lo, + &req->r_deleg_ino))) { + set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags); + req->r_args.open.flags |= cpu_to_le32(CEPH_O_EXCL); + req->r_callback = ceph_async_create_cb; + err = ceph_mdsc_submit_request(mdsc, dir, req); + if (!err) { + err = ceph_finish_async_create(dir, dentry, + file, mode, req, + &as_ctx, &lo); + } else if (err == -EJUKEBOX) { + restore_deleg_ino(dir, req->r_deleg_ino); + ceph_mdsc_put_request(req); + try_async = false; + ceph_put_string(rcu_dereference_raw(lo.pool_ns)); + goto retry; + } + ceph_put_string(rcu_dereference_raw(lo.pool_ns)); + goto out_req; + } + } + + set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); + err = ceph_mdsc_do_request(mdsc, (flags & O_CREAT) ? dir : NULL, req); + err = ceph_handle_snapdir(req, dentry, err); + if (err) + goto out_req; + + if ((flags & O_CREAT) && !req->r_reply_info.head->is_dentry) + err = ceph_handle_notrace_create(dir, dentry); + + if (d_in_lookup(dentry)) { + dn = ceph_finish_lookup(req, dentry, err); + if (IS_ERR(dn)) + err = PTR_ERR(dn); + } else { + /* we were given a hashed negative dentry */ + dn = NULL; + } + if (err) + goto out_req; + if (dn || d_really_is_negative(dentry) || d_is_symlink(dentry)) { + /* make vfs retry on splice, ENOENT, or symlink */ + dout("atomic_open finish_no_open on dn %p\n", dn); + err = finish_no_open(file, dn); + } else { + dout("atomic_open finish_open on dn %p\n", dn); + if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) { + struct inode *newino = d_inode(dentry); + + cache_file_layout(dir, newino); + ceph_init_inode_acls(newino, &as_ctx); + file->f_mode |= FMODE_CREATED; + } + err = finish_open(file, dentry, ceph_open); + } +out_req: + ceph_mdsc_put_request(req); +out_ctx: + ceph_release_acl_sec_ctx(&as_ctx); + dout("atomic_open result=%d\n", err); + return err; +} + +int ceph_release(struct inode *inode, struct file *file) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + if (S_ISDIR(inode->i_mode)) { + struct ceph_dir_file_info *dfi = file->private_data; + dout("release inode %p dir file %p\n", inode, file); + WARN_ON(!list_empty(&dfi->file_info.rw_contexts)); + + ceph_put_fmode(ci, dfi->file_info.fmode, 1); + + if (dfi->last_readdir) + ceph_mdsc_put_request(dfi->last_readdir); + kfree(dfi->last_name); + kfree(dfi->dir_info); + kmem_cache_free(ceph_dir_file_cachep, dfi); + } else { + struct ceph_file_info *fi = file->private_data; + dout("release inode %p regular file %p\n", inode, file); + WARN_ON(!list_empty(&fi->rw_contexts)); + + ceph_put_fmode(ci, fi->fmode, 1); + + kmem_cache_free(ceph_file_cachep, fi); + } + + /* wake up anyone waiting for caps on this inode */ + wake_up_all(&ci->i_cap_wq); + return 0; +} + +enum { + HAVE_RETRIED = 1, + CHECK_EOF = 2, + READ_INLINE = 3, +}; + +/* + * Completely synchronous read and write methods. Direct from __user + * buffer to osd, or directly to user pages (if O_DIRECT). + * + * If the read spans object boundary, just do multiple reads. (That's not + * atomic, but good enough for now.) + * + * If we get a short result from the OSD, check against i_size; we need to + * only return a short read to the caller if we hit EOF. + */ +static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, + int *retry_op) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_osd_client *osdc = &fsc->client->osdc; + ssize_t ret; + u64 off = iocb->ki_pos; + u64 len = iov_iter_count(to); + + dout("sync_read on file %p %llu~%u %s\n", file, off, (unsigned)len, + (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); + + if (!len) + return 0; + /* + * flush any page cache pages in this range. this + * will make concurrent normal and sync io slow, + * but it will at least behave sensibly when they are + * in sequence. + */ + ret = filemap_write_and_wait_range(inode->i_mapping, + off, off + len - 1); + if (ret < 0) + return ret; + + ret = 0; + while ((len = iov_iter_count(to)) > 0) { + struct ceph_osd_request *req; + struct page **pages; + int num_pages; + size_t page_off; + u64 i_size; + bool more; + int idx; + size_t left; + + req = ceph_osdc_new_request(osdc, &ci->i_layout, + ci->i_vino, off, &len, 0, 1, + CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, + NULL, ci->i_truncate_seq, + ci->i_truncate_size, false); + if (IS_ERR(req)) { + ret = PTR_ERR(req); + break; + } + + more = len < iov_iter_count(to); + + num_pages = calc_pages_for(off, len); + page_off = off & ~PAGE_MASK; + pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); + if (IS_ERR(pages)) { + ceph_osdc_put_request(req); + ret = PTR_ERR(pages); + break; + } + + osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_off, + false, false); + ret = ceph_osdc_start_request(osdc, req, false); + if (!ret) + ret = ceph_osdc_wait_request(osdc, req); + + ceph_update_read_latency(&fsc->mdsc->metric, + req->r_start_latency, + req->r_end_latency, + ret); + + ceph_osdc_put_request(req); + + i_size = i_size_read(inode); + dout("sync_read %llu~%llu got %zd i_size %llu%s\n", + off, len, ret, i_size, (more ? " MORE" : "")); + + if (ret == -ENOENT) + ret = 0; + if (ret >= 0 && ret < len && (off + ret < i_size)) { + int zlen = min(len - ret, i_size - off - ret); + int zoff = page_off + ret; + dout("sync_read zero gap %llu~%llu\n", + off + ret, off + ret + zlen); + ceph_zero_page_vector_range(zoff, zlen, pages); + ret += zlen; + } + + idx = 0; + left = ret > 0 ? ret : 0; + while (left > 0) { + size_t len, copied; + page_off = off & ~PAGE_MASK; + len = min_t(size_t, left, PAGE_SIZE - page_off); + SetPageUptodate(pages[idx]); + copied = copy_page_to_iter(pages[idx++], + page_off, len, to); + off += copied; + left -= copied; + if (copied < len) { + ret = -EFAULT; + break; + } + } + ceph_release_page_vector(pages, num_pages); + + if (ret < 0) { + if (ret == -EBLOCKLISTED) + fsc->blocklisted = true; + break; + } + + if (off >= i_size || !more) + break; + } + + if (off > iocb->ki_pos) { + if (ret >= 0 && + iov_iter_count(to) > 0 && off >= i_size_read(inode)) + *retry_op = CHECK_EOF; + ret = off - iocb->ki_pos; + iocb->ki_pos = off; + } + + dout("sync_read result %zd retry_op %d\n", ret, *retry_op); + return ret; +} + +struct ceph_aio_request { + struct kiocb *iocb; + size_t total_len; + bool write; + bool should_dirty; + int error; + struct list_head osd_reqs; + unsigned num_reqs; + atomic_t pending_reqs; + struct timespec64 mtime; + struct ceph_cap_flush *prealloc_cf; +}; + +struct ceph_aio_work { + struct work_struct work; + struct ceph_osd_request *req; +}; + +static void ceph_aio_retry_work(struct work_struct *work); + +static void ceph_aio_complete(struct inode *inode, + struct ceph_aio_request *aio_req) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int ret; + + if (!atomic_dec_and_test(&aio_req->pending_reqs)) + return; + + if (aio_req->iocb->ki_flags & IOCB_DIRECT) + inode_dio_end(inode); + + ret = aio_req->error; + if (!ret) + ret = aio_req->total_len; + + dout("ceph_aio_complete %p rc %d\n", inode, ret); + + if (ret >= 0 && aio_req->write) { + int dirty; + + loff_t endoff = aio_req->iocb->ki_pos + aio_req->total_len; + if (endoff > i_size_read(inode)) { + if (ceph_inode_set_size(inode, endoff)) + ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); + } + + spin_lock(&ci->i_ceph_lock); + ci->i_inline_version = CEPH_INLINE_NONE; + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, + &aio_req->prealloc_cf); + spin_unlock(&ci->i_ceph_lock); + if (dirty) + __mark_inode_dirty(inode, dirty); + + } + + ceph_put_cap_refs(ci, (aio_req->write ? CEPH_CAP_FILE_WR : + CEPH_CAP_FILE_RD)); + + aio_req->iocb->ki_complete(aio_req->iocb, ret, 0); + + ceph_free_cap_flush(aio_req->prealloc_cf); + kfree(aio_req); +} + +static void ceph_aio_complete_req(struct ceph_osd_request *req) +{ + int rc = req->r_result; + struct inode *inode = req->r_inode; + struct ceph_aio_request *aio_req = req->r_priv; + struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0); + struct ceph_client_metric *metric = &ceph_sb_to_mdsc(inode->i_sb)->metric; + + BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_BVECS); + BUG_ON(!osd_data->num_bvecs); + + dout("ceph_aio_complete_req %p rc %d bytes %u\n", + inode, rc, osd_data->bvec_pos.iter.bi_size); + + /* r_start_latency == 0 means the request was not submitted */ + if (req->r_start_latency) { + if (aio_req->write) + ceph_update_write_latency(metric, req->r_start_latency, + req->r_end_latency, rc); + else + ceph_update_read_latency(metric, req->r_start_latency, + req->r_end_latency, rc); + } + + if (rc == -EOLDSNAPC) { + struct ceph_aio_work *aio_work; + BUG_ON(!aio_req->write); + + aio_work = kmalloc(sizeof(*aio_work), GFP_NOFS); + if (aio_work) { + INIT_WORK(&aio_work->work, ceph_aio_retry_work); + aio_work->req = req; + queue_work(ceph_inode_to_client(inode)->inode_wq, + &aio_work->work); + return; + } + rc = -ENOMEM; + } else if (!aio_req->write) { + if (rc == -ENOENT) + rc = 0; + if (rc >= 0 && osd_data->bvec_pos.iter.bi_size > rc) { + struct iov_iter i; + int zlen = osd_data->bvec_pos.iter.bi_size - rc; + + /* + * If read is satisfied by single OSD request, + * it can pass EOF. Otherwise read is within + * i_size. + */ + if (aio_req->num_reqs == 1) { + loff_t i_size = i_size_read(inode); + loff_t endoff = aio_req->iocb->ki_pos + rc; + if (endoff < i_size) + zlen = min_t(size_t, zlen, + i_size - endoff); + aio_req->total_len = rc + zlen; + } + + iov_iter_bvec(&i, READ, osd_data->bvec_pos.bvecs, + osd_data->num_bvecs, + osd_data->bvec_pos.iter.bi_size); + iov_iter_advance(&i, rc); + iov_iter_zero(zlen, &i); + } + } + + put_bvecs(osd_data->bvec_pos.bvecs, osd_data->num_bvecs, + aio_req->should_dirty); + ceph_osdc_put_request(req); + + if (rc < 0) + cmpxchg(&aio_req->error, 0, rc); + + ceph_aio_complete(inode, aio_req); + return; +} + +static void ceph_aio_retry_work(struct work_struct *work) +{ + struct ceph_aio_work *aio_work = + container_of(work, struct ceph_aio_work, work); + struct ceph_osd_request *orig_req = aio_work->req; + struct ceph_aio_request *aio_req = orig_req->r_priv; + struct inode *inode = orig_req->r_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_snap_context *snapc; + struct ceph_osd_request *req; + int ret; + + spin_lock(&ci->i_ceph_lock); + if (__ceph_have_pending_cap_snap(ci)) { + struct ceph_cap_snap *capsnap = + list_last_entry(&ci->i_cap_snaps, + struct ceph_cap_snap, + ci_item); + snapc = ceph_get_snap_context(capsnap->context); + } else { + BUG_ON(!ci->i_head_snapc); + snapc = ceph_get_snap_context(ci->i_head_snapc); + } + spin_unlock(&ci->i_ceph_lock); + + req = ceph_osdc_alloc_request(orig_req->r_osdc, snapc, 1, + false, GFP_NOFS); + if (!req) { + ret = -ENOMEM; + req = orig_req; + goto out; + } + + req->r_flags = /* CEPH_OSD_FLAG_ORDERSNAP | */ CEPH_OSD_FLAG_WRITE; + ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc); + ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid); + + req->r_ops[0] = orig_req->r_ops[0]; + + req->r_mtime = aio_req->mtime; + req->r_data_offset = req->r_ops[0].extent.offset; + + ret = ceph_osdc_alloc_messages(req, GFP_NOFS); + if (ret) { + ceph_osdc_put_request(req); + req = orig_req; + goto out; + } + + ceph_osdc_put_request(orig_req); + + req->r_callback = ceph_aio_complete_req; + req->r_inode = inode; + req->r_priv = aio_req; + + ret = ceph_osdc_start_request(req->r_osdc, req, false); +out: + if (ret < 0) { + req->r_result = ret; + ceph_aio_complete_req(req); + } + + ceph_put_snap_context(snapc); + kfree(aio_work); +} + +static ssize_t +ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, + struct ceph_snap_context *snapc, + struct ceph_cap_flush **pcf) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_client_metric *metric = &fsc->mdsc->metric; + struct ceph_vino vino; + struct ceph_osd_request *req; + struct bio_vec *bvecs; + struct ceph_aio_request *aio_req = NULL; + int num_pages = 0; + int flags; + int ret = 0; + struct timespec64 mtime = current_time(inode); + size_t count = iov_iter_count(iter); + loff_t pos = iocb->ki_pos; + bool write = iov_iter_rw(iter) == WRITE; + bool should_dirty = !write && iter_is_iovec(iter); + + if (write && ceph_snap(file_inode(file)) != CEPH_NOSNAP) + return -EROFS; + + dout("sync_direct_%s on file %p %lld~%u snapc %p seq %lld\n", + (write ? "write" : "read"), file, pos, (unsigned)count, + snapc, snapc ? snapc->seq : 0); + + if (write) { + int ret2 = invalidate_inode_pages2_range(inode->i_mapping, + pos >> PAGE_SHIFT, + (pos + count - 1) >> PAGE_SHIFT); + if (ret2 < 0) + dout("invalidate_inode_pages2_range returned %d\n", ret2); + + flags = /* CEPH_OSD_FLAG_ORDERSNAP | */ CEPH_OSD_FLAG_WRITE; + } else { + flags = CEPH_OSD_FLAG_READ; + } + + while (iov_iter_count(iter) > 0) { + u64 size = iov_iter_count(iter); + ssize_t len; + + if (write) + size = min_t(u64, size, fsc->mount_options->wsize); + else + size = min_t(u64, size, fsc->mount_options->rsize); + + vino = ceph_vino(inode); + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, + vino, pos, &size, 0, + 1, + write ? CEPH_OSD_OP_WRITE : + CEPH_OSD_OP_READ, + flags, snapc, + ci->i_truncate_seq, + ci->i_truncate_size, + false); + if (IS_ERR(req)) { + ret = PTR_ERR(req); + break; + } + + len = iter_get_bvecs_alloc(iter, size, &bvecs, &num_pages); + if (len < 0) { + ceph_osdc_put_request(req); + ret = len; + break; + } + if (len != size) + osd_req_op_extent_update(req, 0, len); + + /* + * To simplify error handling, allow AIO when IO within i_size + * or IO can be satisfied by single OSD request. + */ + if (pos == iocb->ki_pos && !is_sync_kiocb(iocb) && + (len == count || pos + count <= i_size_read(inode))) { + aio_req = kzalloc(sizeof(*aio_req), GFP_KERNEL); + if (aio_req) { + aio_req->iocb = iocb; + aio_req->write = write; + aio_req->should_dirty = should_dirty; + INIT_LIST_HEAD(&aio_req->osd_reqs); + if (write) { + aio_req->mtime = mtime; + swap(aio_req->prealloc_cf, *pcf); + } + } + /* ignore error */ + } + + if (write) { + /* + * throw out any page cache pages in this range. this + * may block. + */ + truncate_inode_pages_range(inode->i_mapping, pos, + PAGE_ALIGN(pos + len) - 1); + + req->r_mtime = mtime; + } + + osd_req_op_extent_osd_data_bvecs(req, 0, bvecs, num_pages, len); + + if (aio_req) { + aio_req->total_len += len; + aio_req->num_reqs++; + atomic_inc(&aio_req->pending_reqs); + + req->r_callback = ceph_aio_complete_req; + req->r_inode = inode; + req->r_priv = aio_req; + list_add_tail(&req->r_private_item, &aio_req->osd_reqs); + + pos += len; + continue; + } + + ret = ceph_osdc_start_request(req->r_osdc, req, false); + if (!ret) + ret = ceph_osdc_wait_request(&fsc->client->osdc, req); + + if (write) + ceph_update_write_latency(metric, req->r_start_latency, + req->r_end_latency, ret); + else + ceph_update_read_latency(metric, req->r_start_latency, + req->r_end_latency, ret); + + size = i_size_read(inode); + if (!write) { + if (ret == -ENOENT) + ret = 0; + if (ret >= 0 && ret < len && pos + ret < size) { + struct iov_iter i; + int zlen = min_t(size_t, len - ret, + size - pos - ret); + + iov_iter_bvec(&i, READ, bvecs, num_pages, len); + iov_iter_advance(&i, ret); + iov_iter_zero(zlen, &i); + ret += zlen; + } + if (ret >= 0) + len = ret; + } + + put_bvecs(bvecs, num_pages, should_dirty); + ceph_osdc_put_request(req); + if (ret < 0) + break; + + pos += len; + if (!write && pos >= size) + break; + + if (write && pos > size) { + if (ceph_inode_set_size(inode, pos)) + ceph_check_caps(ceph_inode(inode), + CHECK_CAPS_AUTHONLY, + NULL); + } + } + + if (aio_req) { + LIST_HEAD(osd_reqs); + + if (aio_req->num_reqs == 0) { + kfree(aio_req); + return ret; + } + + ceph_get_cap_refs(ci, write ? CEPH_CAP_FILE_WR : + CEPH_CAP_FILE_RD); + + list_splice(&aio_req->osd_reqs, &osd_reqs); + inode_dio_begin(inode); + while (!list_empty(&osd_reqs)) { + req = list_first_entry(&osd_reqs, + struct ceph_osd_request, + r_private_item); + list_del_init(&req->r_private_item); + if (ret >= 0) + ret = ceph_osdc_start_request(req->r_osdc, + req, false); + if (ret < 0) { + req->r_result = ret; + ceph_aio_complete_req(req); + } + } + return -EIOCBQUEUED; + } + + if (ret != -EOLDSNAPC && pos > iocb->ki_pos) { + ret = pos - iocb->ki_pos; + iocb->ki_pos = pos; + } + return ret; +} + +/* + * Synchronous write, straight from __user pointer or user pages. + * + * If write spans object boundary, just do multiple writes. (For a + * correct atomic write, we should e.g. take write locks on all + * objects, rollback on failure, etc.) + */ +static ssize_t +ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, + struct ceph_snap_context *snapc) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_vino vino; + struct ceph_osd_request *req; + struct page **pages; + u64 len; + int num_pages; + int written = 0; + int flags; + int ret; + bool check_caps = false; + struct timespec64 mtime = current_time(inode); + size_t count = iov_iter_count(from); + + if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) + return -EROFS; + + dout("sync_write on file %p %lld~%u snapc %p seq %lld\n", + file, pos, (unsigned)count, snapc, snapc->seq); + + ret = filemap_write_and_wait_range(inode->i_mapping, + pos, pos + count - 1); + if (ret < 0) + return ret; + + ret = invalidate_inode_pages2_range(inode->i_mapping, + pos >> PAGE_SHIFT, + (pos + count - 1) >> PAGE_SHIFT); + if (ret < 0) + dout("invalidate_inode_pages2_range returned %d\n", ret); + + flags = /* CEPH_OSD_FLAG_ORDERSNAP | */ CEPH_OSD_FLAG_WRITE; + + while ((len = iov_iter_count(from)) > 0) { + size_t left; + int n; + + vino = ceph_vino(inode); + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, + vino, pos, &len, 0, 1, + CEPH_OSD_OP_WRITE, flags, snapc, + ci->i_truncate_seq, + ci->i_truncate_size, + false); + if (IS_ERR(req)) { + ret = PTR_ERR(req); + break; + } + + /* + * write from beginning of first page, + * regardless of io alignment + */ + num_pages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; + + pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); + if (IS_ERR(pages)) { + ret = PTR_ERR(pages); + goto out; + } + + left = len; + for (n = 0; n < num_pages; n++) { + size_t plen = min_t(size_t, left, PAGE_SIZE); + ret = copy_page_from_iter(pages[n], 0, plen, from); + if (ret != plen) { + ret = -EFAULT; + break; + } + left -= ret; + } + + if (ret < 0) { + ceph_release_page_vector(pages, num_pages); + goto out; + } + + req->r_inode = inode; + + osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, + false, true); + + req->r_mtime = mtime; + ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); + if (!ret) + ret = ceph_osdc_wait_request(&fsc->client->osdc, req); + + ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency, + req->r_end_latency, ret); +out: + ceph_osdc_put_request(req); + if (ret != 0) { + ceph_set_error_write(ci); + break; + } + + ceph_clear_error_write(ci); + pos += len; + written += len; + if (pos > i_size_read(inode)) { + check_caps = ceph_inode_set_size(inode, pos); + if (check_caps) + ceph_check_caps(ceph_inode(inode), + CHECK_CAPS_AUTHONLY, + NULL); + } + + } + + if (ret != -EOLDSNAPC && written > 0) { + ret = written; + iocb->ki_pos = pos; + } + return ret; +} + +/* + * Wrap generic_file_aio_read with checks for cap bits on the inode. + * Atomically grab references, so that those bits are not released + * back to the MDS mid-read. + * + * Hmm, the sync read case isn't actually async... should it be? + */ +static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct file *filp = iocb->ki_filp; + struct ceph_file_info *fi = filp->private_data; + size_t len = iov_iter_count(to); + struct inode *inode = file_inode(filp); + struct ceph_inode_info *ci = ceph_inode(inode); + struct page *pinned_page = NULL; + bool direct_lock = iocb->ki_flags & IOCB_DIRECT; + ssize_t ret; + int want, got = 0; + int retry_op = 0, read = 0; + +again: + dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", + inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, inode); + + if (direct_lock) + ceph_start_io_direct(inode); + else + ceph_start_io_read(inode); + + if (fi->fmode & CEPH_FILE_MODE_LAZY) + want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; + else + want = CEPH_CAP_FILE_CACHE; + ret = ceph_get_caps(filp, CEPH_CAP_FILE_RD, want, -1, + &got, &pinned_page); + if (ret < 0) { + if (iocb->ki_flags & IOCB_DIRECT) + ceph_end_io_direct(inode); + else + ceph_end_io_read(inode); + return ret; + } + + if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 || + (iocb->ki_flags & IOCB_DIRECT) || + (fi->flags & CEPH_F_SYNC)) { + + dout("aio_sync_read %p %llx.%llx %llu~%u got cap refs on %s\n", + inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, + ceph_cap_string(got)); + + if (ci->i_inline_version == CEPH_INLINE_NONE) { + if (!retry_op && (iocb->ki_flags & IOCB_DIRECT)) { + ret = ceph_direct_read_write(iocb, to, + NULL, NULL); + if (ret >= 0 && ret < len) + retry_op = CHECK_EOF; + } else { + ret = ceph_sync_read(iocb, to, &retry_op); + } + } else { + retry_op = READ_INLINE; + } + } else { + CEPH_DEFINE_RW_CONTEXT(rw_ctx, got); + dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n", + inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, + ceph_cap_string(got)); + ceph_add_rw_context(fi, &rw_ctx); + ret = generic_file_read_iter(iocb, to); + ceph_del_rw_context(fi, &rw_ctx); + } + + dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", + inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret); + if (pinned_page) { + put_page(pinned_page); + pinned_page = NULL; + } + ceph_put_cap_refs(ci, got); + + if (direct_lock) + ceph_end_io_direct(inode); + else + ceph_end_io_read(inode); + + if (retry_op > HAVE_RETRIED && ret >= 0) { + int statret; + struct page *page = NULL; + loff_t i_size; + if (retry_op == READ_INLINE) { + page = __page_cache_alloc(GFP_KERNEL); + if (!page) + return -ENOMEM; + } + + statret = __ceph_do_getattr(inode, page, + CEPH_STAT_CAP_INLINE_DATA, !!page); + if (statret < 0) { + if (page) + __free_page(page); + if (statret == -ENODATA) { + BUG_ON(retry_op != READ_INLINE); + goto again; + } + return statret; + } + + i_size = i_size_read(inode); + if (retry_op == READ_INLINE) { + BUG_ON(ret > 0 || read > 0); + if (iocb->ki_pos < i_size && + iocb->ki_pos < PAGE_SIZE) { + loff_t end = min_t(loff_t, i_size, + iocb->ki_pos + len); + end = min_t(loff_t, end, PAGE_SIZE); + if (statret < end) + zero_user_segment(page, statret, end); + ret = copy_page_to_iter(page, + iocb->ki_pos & ~PAGE_MASK, + end - iocb->ki_pos, to); + iocb->ki_pos += ret; + read += ret; + } + if (iocb->ki_pos < i_size && read < len) { + size_t zlen = min_t(size_t, len - read, + i_size - iocb->ki_pos); + ret = iov_iter_zero(zlen, to); + iocb->ki_pos += ret; + read += ret; + } + __free_pages(page, 0); + return read; + } + + /* hit EOF or hole? */ + if (retry_op == CHECK_EOF && iocb->ki_pos < i_size && + ret < len) { + dout("sync_read hit hole, ppos %lld < size %lld" + ", reading more\n", iocb->ki_pos, i_size); + + read += ret; + len -= ret; + retry_op = HAVE_RETRIED; + goto again; + } + } + + if (ret >= 0) + ret += read; + + return ret; +} + +/* + * Take cap references to avoid releasing caps to MDS mid-write. + * + * If we are synchronous, and write with an old snap context, the OSD + * may return EOLDSNAPC. In that case, retry the write.. _after_ + * dropping our cap refs and allowing the pending snap to logically + * complete _before_ this write occurs. + * + * If we are near ENOSPC, write synchronously. + */ +static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct ceph_file_info *fi = file->private_data; + struct inode *inode = file_inode(file); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_osd_client *osdc = &fsc->client->osdc; + struct ceph_cap_flush *prealloc_cf; + ssize_t count, written = 0; + int err, want, got; + bool direct_lock = false; + u32 map_flags; + u64 pool_flags; + loff_t pos; + loff_t limit = max(i_size_read(inode), fsc->max_file_size); + + if (ceph_snap(inode) != CEPH_NOSNAP) + return -EROFS; + + prealloc_cf = ceph_alloc_cap_flush(); + if (!prealloc_cf) + return -ENOMEM; + + if ((iocb->ki_flags & (IOCB_DIRECT | IOCB_APPEND)) == IOCB_DIRECT) + direct_lock = true; + +retry_snap: + if (direct_lock) + ceph_start_io_direct(inode); + else + ceph_start_io_write(inode); + + /* We can write back this queue in page reclaim */ + current->backing_dev_info = inode_to_bdi(inode); + + if (iocb->ki_flags & IOCB_APPEND) { + err = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false); + if (err < 0) + goto out; + } + + err = generic_write_checks(iocb, from); + if (err <= 0) + goto out; + + pos = iocb->ki_pos; + if (unlikely(pos >= limit)) { + err = -EFBIG; + goto out; + } else { + iov_iter_truncate(from, limit - pos); + } + + count = iov_iter_count(from); + if (ceph_quota_is_max_bytes_exceeded(inode, pos + count)) { + err = -EDQUOT; + goto out; + } + + down_read(&osdc->lock); + map_flags = osdc->osdmap->flags; + pool_flags = ceph_pg_pool_flags(osdc->osdmap, ci->i_layout.pool_id); + up_read(&osdc->lock); + if ((map_flags & CEPH_OSDMAP_FULL) || + (pool_flags & CEPH_POOL_FLAG_FULL)) { + err = -ENOSPC; + goto out; + } + + err = file_remove_privs(file); + if (err) + goto out; + + if (ci->i_inline_version != CEPH_INLINE_NONE) { + err = ceph_uninline_data(file, NULL); + if (err < 0) + goto out; + } + + dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n", + inode, ceph_vinop(inode), pos, count, i_size_read(inode)); + if (fi->fmode & CEPH_FILE_MODE_LAZY) + want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; + else + want = CEPH_CAP_FILE_BUFFER; + got = 0; + err = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, pos + count, + &got, NULL); + if (err < 0) + goto out; + + err = file_update_time(file); + if (err) + goto out_caps; + + inode_inc_iversion_raw(inode); + + dout("aio_write %p %llx.%llx %llu~%zd got cap refs on %s\n", + inode, ceph_vinop(inode), pos, count, ceph_cap_string(got)); + + if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 || + (iocb->ki_flags & IOCB_DIRECT) || (fi->flags & CEPH_F_SYNC) || + (ci->i_ceph_flags & CEPH_I_ERROR_WRITE)) { + struct ceph_snap_context *snapc; + struct iov_iter data; + + spin_lock(&ci->i_ceph_lock); + if (__ceph_have_pending_cap_snap(ci)) { + struct ceph_cap_snap *capsnap = + list_last_entry(&ci->i_cap_snaps, + struct ceph_cap_snap, + ci_item); + snapc = ceph_get_snap_context(capsnap->context); + } else { + BUG_ON(!ci->i_head_snapc); + snapc = ceph_get_snap_context(ci->i_head_snapc); + } + spin_unlock(&ci->i_ceph_lock); + + /* we might need to revert back to that point */ + data = *from; + if (iocb->ki_flags & IOCB_DIRECT) + written = ceph_direct_read_write(iocb, &data, snapc, + &prealloc_cf); + else + written = ceph_sync_write(iocb, &data, pos, snapc); + if (direct_lock) + ceph_end_io_direct(inode); + else + ceph_end_io_write(inode); + if (written > 0) + iov_iter_advance(from, written); + ceph_put_snap_context(snapc); + } else { + /* + * No need to acquire the i_truncate_mutex. Because + * the MDS revokes Fwb caps before sending truncate + * message to us. We can't get Fwb cap while there + * are pending vmtruncate. So write and vmtruncate + * can not run at the same time + */ + written = generic_perform_write(file, from, pos); + if (likely(written >= 0)) + iocb->ki_pos = pos + written; + ceph_end_io_write(inode); + } + + if (written >= 0) { + int dirty; + + spin_lock(&ci->i_ceph_lock); + ci->i_inline_version = CEPH_INLINE_NONE; + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, + &prealloc_cf); + spin_unlock(&ci->i_ceph_lock); + if (dirty) + __mark_inode_dirty(inode, dirty); + if (ceph_quota_is_max_bytes_approaching(inode, iocb->ki_pos)) + ceph_check_caps(ci, 0, NULL); + } + + dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n", + inode, ceph_vinop(inode), pos, (unsigned)count, + ceph_cap_string(got)); + ceph_put_cap_refs(ci, got); + + if (written == -EOLDSNAPC) { + dout("aio_write %p %llx.%llx %llu~%u" "got EOLDSNAPC, retrying\n", + inode, ceph_vinop(inode), pos, (unsigned)count); + goto retry_snap; + } + + if (written >= 0) { + if ((map_flags & CEPH_OSDMAP_NEARFULL) || + (pool_flags & CEPH_POOL_FLAG_NEARFULL)) + iocb->ki_flags |= IOCB_DSYNC; + written = generic_write_sync(iocb, written); + } + + goto out_unlocked; +out_caps: + ceph_put_cap_refs(ci, got); +out: + if (direct_lock) + ceph_end_io_direct(inode); + else + ceph_end_io_write(inode); +out_unlocked: + ceph_free_cap_flush(prealloc_cf); + current->backing_dev_info = NULL; + return written ? written : err; +} + +/* + * llseek. be sure to verify file size on SEEK_END. + */ +static loff_t ceph_llseek(struct file *file, loff_t offset, int whence) +{ + struct inode *inode = file->f_mapping->host; + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + loff_t i_size; + loff_t ret; + + inode_lock(inode); + + if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) { + ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false); + if (ret < 0) + goto out; + } + + i_size = i_size_read(inode); + switch (whence) { + case SEEK_END: + offset += i_size; + break; + case SEEK_CUR: + /* + * Here we special-case the lseek(fd, 0, SEEK_CUR) + * position-querying operation. Avoid rewriting the "same" + * f_pos value back to the file because a concurrent read(), + * write() or lseek() might have altered it + */ + if (offset == 0) { + ret = file->f_pos; + goto out; + } + offset += file->f_pos; + break; + case SEEK_DATA: + if (offset < 0 || offset >= i_size) { + ret = -ENXIO; + goto out; + } + break; + case SEEK_HOLE: + if (offset < 0 || offset >= i_size) { + ret = -ENXIO; + goto out; + } + offset = i_size; + break; + } + + ret = vfs_setpos(file, offset, max(i_size, fsc->max_file_size)); + +out: + inode_unlock(inode); + return ret; +} + +static inline void ceph_zero_partial_page( + struct inode *inode, loff_t offset, unsigned size) +{ + struct page *page; + pgoff_t index = offset >> PAGE_SHIFT; + + page = find_lock_page(inode->i_mapping, index); + if (page) { + wait_on_page_writeback(page); + zero_user(page, offset & (PAGE_SIZE - 1), size); + unlock_page(page); + put_page(page); + } +} + +static void ceph_zero_pagecache_range(struct inode *inode, loff_t offset, + loff_t length) +{ + loff_t nearly = round_up(offset, PAGE_SIZE); + if (offset < nearly) { + loff_t size = nearly - offset; + if (length < size) + size = length; + ceph_zero_partial_page(inode, offset, size); + offset += size; + length -= size; + } + if (length >= PAGE_SIZE) { + loff_t size = round_down(length, PAGE_SIZE); + truncate_pagecache_range(inode, offset, offset + size - 1); + offset += size; + length -= size; + } + if (length) + ceph_zero_partial_page(inode, offset, length); +} + +static int ceph_zero_partial_object(struct inode *inode, + loff_t offset, loff_t *length) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_osd_request *req; + int ret = 0; + loff_t zero = 0; + int op; + + if (!length) { + op = offset ? CEPH_OSD_OP_DELETE : CEPH_OSD_OP_TRUNCATE; + length = &zero; + } else { + op = CEPH_OSD_OP_ZERO; + } + + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, + ceph_vino(inode), + offset, length, + 0, 1, op, + CEPH_OSD_FLAG_WRITE, + NULL, 0, 0, false); + if (IS_ERR(req)) { + ret = PTR_ERR(req); + goto out; + } + + req->r_mtime = inode->i_mtime; + ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); + if (!ret) { + ret = ceph_osdc_wait_request(&fsc->client->osdc, req); + if (ret == -ENOENT) + ret = 0; + } + ceph_osdc_put_request(req); + +out: + return ret; +} + +static int ceph_zero_objects(struct inode *inode, loff_t offset, loff_t length) +{ + int ret = 0; + struct ceph_inode_info *ci = ceph_inode(inode); + s32 stripe_unit = ci->i_layout.stripe_unit; + s32 stripe_count = ci->i_layout.stripe_count; + s32 object_size = ci->i_layout.object_size; + u64 object_set_size = object_size * stripe_count; + u64 nearly, t; + + /* round offset up to next period boundary */ + nearly = offset + object_set_size - 1; + t = nearly; + nearly -= do_div(t, object_set_size); + + while (length && offset < nearly) { + loff_t size = length; + ret = ceph_zero_partial_object(inode, offset, &size); + if (ret < 0) + return ret; + offset += size; + length -= size; + } + while (length >= object_set_size) { + int i; + loff_t pos = offset; + for (i = 0; i < stripe_count; ++i) { + ret = ceph_zero_partial_object(inode, pos, NULL); + if (ret < 0) + return ret; + pos += stripe_unit; + } + offset += object_set_size; + length -= object_set_size; + } + while (length) { + loff_t size = length; + ret = ceph_zero_partial_object(inode, offset, &size); + if (ret < 0) + return ret; + offset += size; + length -= size; + } + return ret; +} + +static long ceph_fallocate(struct file *file, int mode, + loff_t offset, loff_t length) +{ + struct ceph_file_info *fi = file->private_data; + struct inode *inode = file_inode(file); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_cap_flush *prealloc_cf; + int want, got = 0; + int dirty; + int ret = 0; + loff_t endoff = 0; + loff_t size; + + if (mode != (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + return -EOPNOTSUPP; + + if (!S_ISREG(inode->i_mode)) + return -EOPNOTSUPP; + + prealloc_cf = ceph_alloc_cap_flush(); + if (!prealloc_cf) + return -ENOMEM; + + inode_lock(inode); + + if (ceph_snap(inode) != CEPH_NOSNAP) { + ret = -EROFS; + goto unlock; + } + + if (ci->i_inline_version != CEPH_INLINE_NONE) { + ret = ceph_uninline_data(file, NULL); + if (ret < 0) + goto unlock; + } + + size = i_size_read(inode); + + /* Are we punching a hole beyond EOF? */ + if (offset >= size) + goto unlock; + if ((offset + length) > size) + length = size - offset; + + if (fi->fmode & CEPH_FILE_MODE_LAZY) + want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; + else + want = CEPH_CAP_FILE_BUFFER; + + ret = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, endoff, &got, NULL); + if (ret < 0) + goto unlock; + + ceph_zero_pagecache_range(inode, offset, length); + ret = ceph_zero_objects(inode, offset, length); + + if (!ret) { + spin_lock(&ci->i_ceph_lock); + ci->i_inline_version = CEPH_INLINE_NONE; + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, + &prealloc_cf); + spin_unlock(&ci->i_ceph_lock); + if (dirty) + __mark_inode_dirty(inode, dirty); + } + + ceph_put_cap_refs(ci, got); +unlock: + inode_unlock(inode); + ceph_free_cap_flush(prealloc_cf); + return ret; +} + +/* + * This function tries to get FILE_WR capabilities for dst_ci and FILE_RD for + * src_ci. Two attempts are made to obtain both caps, and an error is return if + * this fails; zero is returned on success. + */ +static int get_rd_wr_caps(struct file *src_filp, int *src_got, + struct file *dst_filp, + loff_t dst_endoff, int *dst_got) +{ + int ret = 0; + bool retrying = false; + +retry_caps: + ret = ceph_get_caps(dst_filp, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, + dst_endoff, dst_got, NULL); + if (ret < 0) + return ret; + + /* + * Since we're already holding the FILE_WR capability for the dst file, + * we would risk a deadlock by using ceph_get_caps. Thus, we'll do some + * retry dance instead to try to get both capabilities. + */ + ret = ceph_try_get_caps(file_inode(src_filp), + CEPH_CAP_FILE_RD, CEPH_CAP_FILE_SHARED, + false, src_got); + if (ret <= 0) { + /* Start by dropping dst_ci caps and getting src_ci caps */ + ceph_put_cap_refs(ceph_inode(file_inode(dst_filp)), *dst_got); + if (retrying) { + if (!ret) + /* ceph_try_get_caps masks EAGAIN */ + ret = -EAGAIN; + return ret; + } + ret = ceph_get_caps(src_filp, CEPH_CAP_FILE_RD, + CEPH_CAP_FILE_SHARED, -1, src_got, NULL); + if (ret < 0) + return ret; + /*... drop src_ci caps too, and retry */ + ceph_put_cap_refs(ceph_inode(file_inode(src_filp)), *src_got); + retrying = true; + goto retry_caps; + } + return ret; +} + +static void put_rd_wr_caps(struct ceph_inode_info *src_ci, int src_got, + struct ceph_inode_info *dst_ci, int dst_got) +{ + ceph_put_cap_refs(src_ci, src_got); + ceph_put_cap_refs(dst_ci, dst_got); +} + +/* + * This function does several size-related checks, returning an error if: + * - source file is smaller than off+len + * - destination file size is not OK (inode_newsize_ok()) + * - max bytes quotas is exceeded + */ +static int is_file_size_ok(struct inode *src_inode, struct inode *dst_inode, + loff_t src_off, loff_t dst_off, size_t len) +{ + loff_t size, endoff; + + size = i_size_read(src_inode); + /* + * Don't copy beyond source file EOF. Instead of simply setting length + * to (size - src_off), just drop to VFS default implementation, as the + * local i_size may be stale due to other clients writing to the source + * inode. + */ + if (src_off + len > size) { + dout("Copy beyond EOF (%llu + %zu > %llu)\n", + src_off, len, size); + return -EOPNOTSUPP; + } + size = i_size_read(dst_inode); + + endoff = dst_off + len; + if (inode_newsize_ok(dst_inode, endoff)) + return -EOPNOTSUPP; + + if (ceph_quota_is_max_bytes_exceeded(dst_inode, endoff)) + return -EDQUOT; + + return 0; +} + +static ssize_t ceph_do_objects_copy(struct ceph_inode_info *src_ci, u64 *src_off, + struct ceph_inode_info *dst_ci, u64 *dst_off, + struct ceph_fs_client *fsc, + size_t len, unsigned int flags) +{ + struct ceph_object_locator src_oloc, dst_oloc; + struct ceph_object_id src_oid, dst_oid; + size_t bytes = 0; + u64 src_objnum, src_objoff, dst_objnum, dst_objoff; + u32 src_objlen, dst_objlen; + u32 object_size = src_ci->i_layout.object_size; + int ret; + + src_oloc.pool = src_ci->i_layout.pool_id; + src_oloc.pool_ns = ceph_try_get_string(src_ci->i_layout.pool_ns); + dst_oloc.pool = dst_ci->i_layout.pool_id; + dst_oloc.pool_ns = ceph_try_get_string(dst_ci->i_layout.pool_ns); + + while (len >= object_size) { + ceph_calc_file_object_mapping(&src_ci->i_layout, *src_off, + object_size, &src_objnum, + &src_objoff, &src_objlen); + ceph_calc_file_object_mapping(&dst_ci->i_layout, *dst_off, + object_size, &dst_objnum, + &dst_objoff, &dst_objlen); + ceph_oid_init(&src_oid); + ceph_oid_printf(&src_oid, "%llx.%08llx", + src_ci->i_vino.ino, src_objnum); + ceph_oid_init(&dst_oid); + ceph_oid_printf(&dst_oid, "%llx.%08llx", + dst_ci->i_vino.ino, dst_objnum); + /* Do an object remote copy */ + ret = ceph_osdc_copy_from(&fsc->client->osdc, + src_ci->i_vino.snap, 0, + &src_oid, &src_oloc, + CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL | + CEPH_OSD_OP_FLAG_FADVISE_NOCACHE, + &dst_oid, &dst_oloc, + CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL | + CEPH_OSD_OP_FLAG_FADVISE_DONTNEED, + dst_ci->i_truncate_seq, + dst_ci->i_truncate_size, + CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ); + if (ret) { + if (ret == -EOPNOTSUPP) { + fsc->have_copy_from2 = false; + pr_notice("OSDs don't support copy-from2; disabling copy offload\n"); + } + dout("ceph_osdc_copy_from returned %d\n", ret); + if (!bytes) + bytes = ret; + goto out; + } + len -= object_size; + bytes += object_size; + *src_off += object_size; + *dst_off += object_size; + } + +out: + ceph_oloc_destroy(&src_oloc); + ceph_oloc_destroy(&dst_oloc); + return bytes; +} + +static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, + struct file *dst_file, loff_t dst_off, + size_t len, unsigned int flags) +{ + struct inode *src_inode = file_inode(src_file); + struct inode *dst_inode = file_inode(dst_file); + struct ceph_inode_info *src_ci = ceph_inode(src_inode); + struct ceph_inode_info *dst_ci = ceph_inode(dst_inode); + struct ceph_cap_flush *prealloc_cf; + struct ceph_fs_client *src_fsc = ceph_inode_to_client(src_inode); + loff_t size; + ssize_t ret = -EIO, bytes; + u64 src_objnum, dst_objnum, src_objoff, dst_objoff; + u32 src_objlen, dst_objlen; + int src_got = 0, dst_got = 0, err, dirty; + + if (src_inode->i_sb != dst_inode->i_sb) { + struct ceph_fs_client *dst_fsc = ceph_inode_to_client(dst_inode); + + if (ceph_fsid_compare(&src_fsc->client->fsid, + &dst_fsc->client->fsid)) { + dout("Copying files across clusters: src: %pU dst: %pU\n", + &src_fsc->client->fsid, &dst_fsc->client->fsid); + return -EXDEV; + } + } + if (ceph_snap(dst_inode) != CEPH_NOSNAP) + return -EROFS; + + /* + * Some of the checks below will return -EOPNOTSUPP, which will force a + * fallback to the default VFS copy_file_range implementation. This is + * desirable in several cases (for ex, the 'len' is smaller than the + * size of the objects, or in cases where that would be more + * efficient). + */ + + if (ceph_test_mount_opt(src_fsc, NOCOPYFROM)) + return -EOPNOTSUPP; + + if (!src_fsc->have_copy_from2) + return -EOPNOTSUPP; + + /* + * Striped file layouts require that we copy partial objects, but the + * OSD copy-from operation only supports full-object copies. Limit + * this to non-striped file layouts for now. + */ + if ((src_ci->i_layout.stripe_unit != dst_ci->i_layout.stripe_unit) || + (src_ci->i_layout.stripe_count != 1) || + (dst_ci->i_layout.stripe_count != 1) || + (src_ci->i_layout.object_size != dst_ci->i_layout.object_size)) { + dout("Invalid src/dst files layout\n"); + return -EOPNOTSUPP; + } + + if (len < src_ci->i_layout.object_size) + return -EOPNOTSUPP; /* no remote copy will be done */ + + prealloc_cf = ceph_alloc_cap_flush(); + if (!prealloc_cf) + return -ENOMEM; + + /* Start by sync'ing the source and destination files */ + ret = file_write_and_wait_range(src_file, src_off, (src_off + len)); + if (ret < 0) { + dout("failed to write src file (%zd)\n", ret); + goto out; + } + ret = file_write_and_wait_range(dst_file, dst_off, (dst_off + len)); + if (ret < 0) { + dout("failed to write dst file (%zd)\n", ret); + goto out; + } + + /* + * We need FILE_WR caps for dst_ci and FILE_RD for src_ci as other + * clients may have dirty data in their caches. And OSDs know nothing + * about caps, so they can't safely do the remote object copies. + */ + err = get_rd_wr_caps(src_file, &src_got, + dst_file, (dst_off + len), &dst_got); + if (err < 0) { + dout("get_rd_wr_caps returned %d\n", err); + ret = -EOPNOTSUPP; + goto out; + } + + ret = is_file_size_ok(src_inode, dst_inode, src_off, dst_off, len); + if (ret < 0) + goto out_caps; + + /* Drop dst file cached pages */ + ret = invalidate_inode_pages2_range(dst_inode->i_mapping, + dst_off >> PAGE_SHIFT, + (dst_off + len) >> PAGE_SHIFT); + if (ret < 0) { + dout("Failed to invalidate inode pages (%zd)\n", ret); + ret = 0; /* XXX */ + } + ceph_calc_file_object_mapping(&src_ci->i_layout, src_off, + src_ci->i_layout.object_size, + &src_objnum, &src_objoff, &src_objlen); + ceph_calc_file_object_mapping(&dst_ci->i_layout, dst_off, + dst_ci->i_layout.object_size, + &dst_objnum, &dst_objoff, &dst_objlen); + /* object-level offsets need to the same */ + if (src_objoff != dst_objoff) { + ret = -EOPNOTSUPP; + goto out_caps; + } + + /* + * Do a manual copy if the object offset isn't object aligned. + * 'src_objlen' contains the bytes left until the end of the object, + * starting at the src_off + */ + if (src_objoff) { + dout("Initial partial copy of %u bytes\n", src_objlen); + + /* + * we need to temporarily drop all caps as we'll be calling + * {read,write}_iter, which will get caps again. + */ + put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got); + ret = do_splice_direct(src_file, &src_off, dst_file, + &dst_off, src_objlen, flags); + /* Abort on short copies or on error */ + if (ret < (long)src_objlen) { + dout("Failed partial copy (%zd)\n", ret); + goto out; + } + len -= ret; + err = get_rd_wr_caps(src_file, &src_got, + dst_file, (dst_off + len), &dst_got); + if (err < 0) + goto out; + err = is_file_size_ok(src_inode, dst_inode, + src_off, dst_off, len); + if (err < 0) + goto out_caps; + } + + size = i_size_read(dst_inode); + bytes = ceph_do_objects_copy(src_ci, &src_off, dst_ci, &dst_off, + src_fsc, len, flags); + if (bytes <= 0) { + if (!ret) + ret = bytes; + goto out_caps; + } + dout("Copied %zu bytes out of %zu\n", bytes, len); + len -= bytes; + ret += bytes; + + file_update_time(dst_file); + inode_inc_iversion_raw(dst_inode); + + if (dst_off > size) { + /* Let the MDS know about dst file size change */ + if (ceph_inode_set_size(dst_inode, dst_off) || + ceph_quota_is_max_bytes_approaching(dst_inode, dst_off)) + ceph_check_caps(dst_ci, CHECK_CAPS_AUTHONLY, NULL); + } + /* Mark Fw dirty */ + spin_lock(&dst_ci->i_ceph_lock); + dst_ci->i_inline_version = CEPH_INLINE_NONE; + dirty = __ceph_mark_dirty_caps(dst_ci, CEPH_CAP_FILE_WR, &prealloc_cf); + spin_unlock(&dst_ci->i_ceph_lock); + if (dirty) + __mark_inode_dirty(dst_inode, dirty); + +out_caps: + put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got); + + /* + * Do the final manual copy if we still have some bytes left, unless + * there were errors in remote object copies (len >= object_size). + */ + if (len && (len < src_ci->i_layout.object_size)) { + dout("Final partial copy of %zu bytes\n", len); + bytes = do_splice_direct(src_file, &src_off, dst_file, + &dst_off, len, flags); + if (bytes > 0) + ret += bytes; + else + dout("Failed partial copy (%zd)\n", bytes); + } + +out: + ceph_free_cap_flush(prealloc_cf); + + return ret; +} + +static ssize_t ceph_copy_file_range(struct file *src_file, loff_t src_off, + struct file *dst_file, loff_t dst_off, + size_t len, unsigned int flags) +{ + ssize_t ret; + + ret = __ceph_copy_file_range(src_file, src_off, dst_file, dst_off, + len, flags); + + if (ret == -EOPNOTSUPP || ret == -EXDEV) + ret = generic_copy_file_range(src_file, src_off, dst_file, + dst_off, len, flags); + return ret; +} + +const struct file_operations ceph_file_fops = { + .open = ceph_open, + .release = ceph_release, + .llseek = ceph_llseek, + .read_iter = ceph_read_iter, + .write_iter = ceph_write_iter, + .mmap = ceph_mmap, + .fsync = ceph_fsync, + .lock = ceph_lock, + .setlease = simple_nosetlease, + .flock = ceph_flock, + .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, + .unlocked_ioctl = ceph_ioctl, + .compat_ioctl = compat_ptr_ioctl, + .fallocate = ceph_fallocate, + .copy_file_range = ceph_copy_file_range, +}; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c new file mode 100644 index 000000000..36e3342d3 --- /dev/null +++ b/fs/ceph/inode.c @@ -0,0 +1,2423 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ceph/ceph_debug.h> + +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/uaccess.h> +#include <linux/kernel.h> +#include <linux/writeback.h> +#include <linux/vmalloc.h> +#include <linux/xattr.h> +#include <linux/posix_acl.h> +#include <linux/random.h> +#include <linux/sort.h> +#include <linux/iversion.h> + +#include "super.h" +#include "mds_client.h" +#include "cache.h" +#include <linux/ceph/decode.h> + +/* + * Ceph inode operations + * + * Implement basic inode helpers (get, alloc) and inode ops (getattr, + * setattr, etc.), xattr helpers, and helpers for assimilating + * metadata returned by the MDS into our cache. + * + * Also define helpers for doing asynchronous writeback, invalidation, + * and truncation for the benefit of those who can't afford to block + * (typically because they are in the message handler path). + */ + +static const struct inode_operations ceph_symlink_iops; + +static void ceph_inode_work(struct work_struct *work); + +/* + * find or create an inode, given the ceph ino number + */ +static int ceph_set_ino_cb(struct inode *inode, void *data) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); + + ci->i_vino = *(struct ceph_vino *)data; + inode->i_ino = ceph_vino_to_ino_t(ci->i_vino); + inode_set_iversion_raw(inode, 0); + percpu_counter_inc(&mdsc->metric.total_inodes); + + return 0; +} + +struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino) +{ + struct inode *inode; + + if (ceph_vino_is_reserved(vino)) + return ERR_PTR(-EREMOTEIO); + + inode = iget5_locked(sb, (unsigned long)vino.ino, ceph_ino_compare, + ceph_set_ino_cb, &vino); + if (!inode) + return ERR_PTR(-ENOMEM); + + dout("get_inode on %llu=%llx.%llx got %p new %d\n", ceph_present_inode(inode), + ceph_vinop(inode), inode, !!(inode->i_state & I_NEW)); + return inode; +} + +/* + * get/constuct snapdir inode for a given directory + */ +struct inode *ceph_get_snapdir(struct inode *parent) +{ + struct ceph_vino vino = { + .ino = ceph_ino(parent), + .snap = CEPH_SNAPDIR, + }; + struct inode *inode = ceph_get_inode(parent->i_sb, vino); + struct ceph_inode_info *ci = ceph_inode(inode); + + BUG_ON(!S_ISDIR(parent->i_mode)); + if (IS_ERR(inode)) + return inode; + inode->i_mode = parent->i_mode; + inode->i_uid = parent->i_uid; + inode->i_gid = parent->i_gid; + inode->i_mtime = parent->i_mtime; + inode->i_ctime = parent->i_ctime; + inode->i_atime = parent->i_atime; + ci->i_rbytes = 0; + ci->i_btime = ceph_inode(parent)->i_btime; + + if (inode->i_state & I_NEW) { + inode->i_op = &ceph_snapdir_iops; + inode->i_fop = &ceph_snapdir_fops; + ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */ + unlock_new_inode(inode); + } + + return inode; +} + +const struct inode_operations ceph_file_iops = { + .permission = ceph_permission, + .setattr = ceph_setattr, + .getattr = ceph_getattr, + .listxattr = ceph_listxattr, + .get_acl = ceph_get_acl, + .set_acl = ceph_set_acl, +}; + + +/* + * We use a 'frag tree' to keep track of the MDS's directory fragments + * for a given inode (usually there is just a single fragment). We + * need to know when a child frag is delegated to a new MDS, or when + * it is flagged as replicated, so we can direct our requests + * accordingly. + */ + +/* + * find/create a frag in the tree + */ +static struct ceph_inode_frag *__get_or_create_frag(struct ceph_inode_info *ci, + u32 f) +{ + struct rb_node **p; + struct rb_node *parent = NULL; + struct ceph_inode_frag *frag; + int c; + + p = &ci->i_fragtree.rb_node; + while (*p) { + parent = *p; + frag = rb_entry(parent, struct ceph_inode_frag, node); + c = ceph_frag_compare(f, frag->frag); + if (c < 0) + p = &(*p)->rb_left; + else if (c > 0) + p = &(*p)->rb_right; + else + return frag; + } + + frag = kmalloc(sizeof(*frag), GFP_NOFS); + if (!frag) + return ERR_PTR(-ENOMEM); + + frag->frag = f; + frag->split_by = 0; + frag->mds = -1; + frag->ndist = 0; + + rb_link_node(&frag->node, parent, p); + rb_insert_color(&frag->node, &ci->i_fragtree); + + dout("get_or_create_frag added %llx.%llx frag %x\n", + ceph_vinop(&ci->vfs_inode), f); + return frag; +} + +/* + * find a specific frag @f + */ +struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, u32 f) +{ + struct rb_node *n = ci->i_fragtree.rb_node; + + while (n) { + struct ceph_inode_frag *frag = + rb_entry(n, struct ceph_inode_frag, node); + int c = ceph_frag_compare(f, frag->frag); + if (c < 0) + n = n->rb_left; + else if (c > 0) + n = n->rb_right; + else + return frag; + } + return NULL; +} + +/* + * Choose frag containing the given value @v. If @pfrag is + * specified, copy the frag delegation info to the caller if + * it is present. + */ +static u32 __ceph_choose_frag(struct ceph_inode_info *ci, u32 v, + struct ceph_inode_frag *pfrag, int *found) +{ + u32 t = ceph_frag_make(0, 0); + struct ceph_inode_frag *frag; + unsigned nway, i; + u32 n; + + if (found) + *found = 0; + + while (1) { + WARN_ON(!ceph_frag_contains_value(t, v)); + frag = __ceph_find_frag(ci, t); + if (!frag) + break; /* t is a leaf */ + if (frag->split_by == 0) { + if (pfrag) + memcpy(pfrag, frag, sizeof(*pfrag)); + if (found) + *found = 1; + break; + } + + /* choose child */ + nway = 1 << frag->split_by; + dout("choose_frag(%x) %x splits by %d (%d ways)\n", v, t, + frag->split_by, nway); + for (i = 0; i < nway; i++) { + n = ceph_frag_make_child(t, frag->split_by, i); + if (ceph_frag_contains_value(n, v)) { + t = n; + break; + } + } + BUG_ON(i == nway); + } + dout("choose_frag(%x) = %x\n", v, t); + + return t; +} + +u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v, + struct ceph_inode_frag *pfrag, int *found) +{ + u32 ret; + mutex_lock(&ci->i_fragtree_mutex); + ret = __ceph_choose_frag(ci, v, pfrag, found); + mutex_unlock(&ci->i_fragtree_mutex); + return ret; +} + +/* + * Process dirfrag (delegation) info from the mds. Include leaf + * fragment in tree ONLY if ndist > 0. Otherwise, only + * branches/splits are included in i_fragtree) + */ +static int ceph_fill_dirfrag(struct inode *inode, + struct ceph_mds_reply_dirfrag *dirinfo) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_inode_frag *frag; + u32 id = le32_to_cpu(dirinfo->frag); + int mds = le32_to_cpu(dirinfo->auth); + int ndist = le32_to_cpu(dirinfo->ndist); + int diri_auth = -1; + int i; + int err = 0; + + spin_lock(&ci->i_ceph_lock); + if (ci->i_auth_cap) + diri_auth = ci->i_auth_cap->mds; + spin_unlock(&ci->i_ceph_lock); + + if (mds == -1) /* CDIR_AUTH_PARENT */ + mds = diri_auth; + + mutex_lock(&ci->i_fragtree_mutex); + if (ndist == 0 && mds == diri_auth) { + /* no delegation info needed. */ + frag = __ceph_find_frag(ci, id); + if (!frag) + goto out; + if (frag->split_by == 0) { + /* tree leaf, remove */ + dout("fill_dirfrag removed %llx.%llx frag %x" + " (no ref)\n", ceph_vinop(inode), id); + rb_erase(&frag->node, &ci->i_fragtree); + kfree(frag); + } else { + /* tree branch, keep and clear */ + dout("fill_dirfrag cleared %llx.%llx frag %x" + " referral\n", ceph_vinop(inode), id); + frag->mds = -1; + frag->ndist = 0; + } + goto out; + } + + + /* find/add this frag to store mds delegation info */ + frag = __get_or_create_frag(ci, id); + if (IS_ERR(frag)) { + /* this is not the end of the world; we can continue + with bad/inaccurate delegation info */ + pr_err("fill_dirfrag ENOMEM on mds ref %llx.%llx fg %x\n", + ceph_vinop(inode), le32_to_cpu(dirinfo->frag)); + err = -ENOMEM; + goto out; + } + + frag->mds = mds; + frag->ndist = min_t(u32, ndist, CEPH_MAX_DIRFRAG_REP); + for (i = 0; i < frag->ndist; i++) + frag->dist[i] = le32_to_cpu(dirinfo->dist[i]); + dout("fill_dirfrag %llx.%llx frag %x ndist=%d\n", + ceph_vinop(inode), frag->frag, frag->ndist); + +out: + mutex_unlock(&ci->i_fragtree_mutex); + return err; +} + +static int frag_tree_split_cmp(const void *l, const void *r) +{ + struct ceph_frag_tree_split *ls = (struct ceph_frag_tree_split*)l; + struct ceph_frag_tree_split *rs = (struct ceph_frag_tree_split*)r; + return ceph_frag_compare(le32_to_cpu(ls->frag), + le32_to_cpu(rs->frag)); +} + +static bool is_frag_child(u32 f, struct ceph_inode_frag *frag) +{ + if (!frag) + return f == ceph_frag_make(0, 0); + if (ceph_frag_bits(f) != ceph_frag_bits(frag->frag) + frag->split_by) + return false; + return ceph_frag_contains_value(frag->frag, ceph_frag_value(f)); +} + +static int ceph_fill_fragtree(struct inode *inode, + struct ceph_frag_tree_head *fragtree, + struct ceph_mds_reply_dirfrag *dirinfo) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_inode_frag *frag, *prev_frag = NULL; + struct rb_node *rb_node; + unsigned i, split_by, nsplits; + u32 id; + bool update = false; + + mutex_lock(&ci->i_fragtree_mutex); + nsplits = le32_to_cpu(fragtree->nsplits); + if (nsplits != ci->i_fragtree_nsplits) { + update = true; + } else if (nsplits) { + i = prandom_u32() % nsplits; + id = le32_to_cpu(fragtree->splits[i].frag); + if (!__ceph_find_frag(ci, id)) + update = true; + } else if (!RB_EMPTY_ROOT(&ci->i_fragtree)) { + rb_node = rb_first(&ci->i_fragtree); + frag = rb_entry(rb_node, struct ceph_inode_frag, node); + if (frag->frag != ceph_frag_make(0, 0) || rb_next(rb_node)) + update = true; + } + if (!update && dirinfo) { + id = le32_to_cpu(dirinfo->frag); + if (id != __ceph_choose_frag(ci, id, NULL, NULL)) + update = true; + } + if (!update) + goto out_unlock; + + if (nsplits > 1) { + sort(fragtree->splits, nsplits, sizeof(fragtree->splits[0]), + frag_tree_split_cmp, NULL); + } + + dout("fill_fragtree %llx.%llx\n", ceph_vinop(inode)); + rb_node = rb_first(&ci->i_fragtree); + for (i = 0; i < nsplits; i++) { + id = le32_to_cpu(fragtree->splits[i].frag); + split_by = le32_to_cpu(fragtree->splits[i].by); + if (split_by == 0 || ceph_frag_bits(id) + split_by > 24) { + pr_err("fill_fragtree %llx.%llx invalid split %d/%u, " + "frag %x split by %d\n", ceph_vinop(inode), + i, nsplits, id, split_by); + continue; + } + frag = NULL; + while (rb_node) { + frag = rb_entry(rb_node, struct ceph_inode_frag, node); + if (ceph_frag_compare(frag->frag, id) >= 0) { + if (frag->frag != id) + frag = NULL; + else + rb_node = rb_next(rb_node); + break; + } + rb_node = rb_next(rb_node); + /* delete stale split/leaf node */ + if (frag->split_by > 0 || + !is_frag_child(frag->frag, prev_frag)) { + rb_erase(&frag->node, &ci->i_fragtree); + if (frag->split_by > 0) + ci->i_fragtree_nsplits--; + kfree(frag); + } + frag = NULL; + } + if (!frag) { + frag = __get_or_create_frag(ci, id); + if (IS_ERR(frag)) + continue; + } + if (frag->split_by == 0) + ci->i_fragtree_nsplits++; + frag->split_by = split_by; + dout(" frag %x split by %d\n", frag->frag, frag->split_by); + prev_frag = frag; + } + while (rb_node) { + frag = rb_entry(rb_node, struct ceph_inode_frag, node); + rb_node = rb_next(rb_node); + /* delete stale split/leaf node */ + if (frag->split_by > 0 || + !is_frag_child(frag->frag, prev_frag)) { + rb_erase(&frag->node, &ci->i_fragtree); + if (frag->split_by > 0) + ci->i_fragtree_nsplits--; + kfree(frag); + } + } +out_unlock: + mutex_unlock(&ci->i_fragtree_mutex); + return 0; +} + +/* + * initialize a newly allocated inode. + */ +struct inode *ceph_alloc_inode(struct super_block *sb) +{ + struct ceph_inode_info *ci; + int i; + + ci = kmem_cache_alloc(ceph_inode_cachep, GFP_NOFS); + if (!ci) + return NULL; + + dout("alloc_inode %p\n", &ci->vfs_inode); + + spin_lock_init(&ci->i_ceph_lock); + + ci->i_version = 0; + ci->i_inline_version = 0; + ci->i_time_warp_seq = 0; + ci->i_ceph_flags = 0; + atomic64_set(&ci->i_ordered_count, 1); + atomic64_set(&ci->i_release_count, 1); + atomic64_set(&ci->i_complete_seq[0], 0); + atomic64_set(&ci->i_complete_seq[1], 0); + ci->i_symlink = NULL; + + ci->i_max_bytes = 0; + ci->i_max_files = 0; + + memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout)); + memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout)); + RCU_INIT_POINTER(ci->i_layout.pool_ns, NULL); + + ci->i_fragtree = RB_ROOT; + mutex_init(&ci->i_fragtree_mutex); + + ci->i_xattrs.blob = NULL; + ci->i_xattrs.prealloc_blob = NULL; + ci->i_xattrs.dirty = false; + ci->i_xattrs.index = RB_ROOT; + ci->i_xattrs.count = 0; + ci->i_xattrs.names_size = 0; + ci->i_xattrs.vals_size = 0; + ci->i_xattrs.version = 0; + ci->i_xattrs.index_version = 0; + + ci->i_caps = RB_ROOT; + ci->i_auth_cap = NULL; + ci->i_dirty_caps = 0; + ci->i_flushing_caps = 0; + INIT_LIST_HEAD(&ci->i_dirty_item); + INIT_LIST_HEAD(&ci->i_flushing_item); + ci->i_prealloc_cap_flush = NULL; + INIT_LIST_HEAD(&ci->i_cap_flush_list); + init_waitqueue_head(&ci->i_cap_wq); + ci->i_hold_caps_max = 0; + INIT_LIST_HEAD(&ci->i_cap_delay_list); + INIT_LIST_HEAD(&ci->i_cap_snaps); + ci->i_head_snapc = NULL; + ci->i_snap_caps = 0; + + ci->i_last_rd = ci->i_last_wr = jiffies - 3600 * HZ; + for (i = 0; i < CEPH_FILE_MODE_BITS; i++) + ci->i_nr_by_mode[i] = 0; + + mutex_init(&ci->i_truncate_mutex); + ci->i_truncate_seq = 0; + ci->i_truncate_size = 0; + ci->i_truncate_pending = 0; + + ci->i_max_size = 0; + ci->i_reported_size = 0; + ci->i_wanted_max_size = 0; + ci->i_requested_max_size = 0; + + ci->i_pin_ref = 0; + ci->i_rd_ref = 0; + ci->i_rdcache_ref = 0; + ci->i_wr_ref = 0; + ci->i_wb_ref = 0; + ci->i_fx_ref = 0; + ci->i_wrbuffer_ref = 0; + ci->i_wrbuffer_ref_head = 0; + atomic_set(&ci->i_filelock_ref, 0); + atomic_set(&ci->i_shared_gen, 1); + ci->i_rdcache_gen = 0; + ci->i_rdcache_revoking = 0; + + INIT_LIST_HEAD(&ci->i_unsafe_dirops); + INIT_LIST_HEAD(&ci->i_unsafe_iops); + spin_lock_init(&ci->i_unsafe_lock); + + ci->i_snap_realm = NULL; + INIT_LIST_HEAD(&ci->i_snap_realm_item); + INIT_LIST_HEAD(&ci->i_snap_flush_item); + + INIT_WORK(&ci->i_work, ceph_inode_work); + ci->i_work_mask = 0; + memset(&ci->i_btime, '\0', sizeof(ci->i_btime)); + + ceph_fscache_inode_init(ci); + + return &ci->vfs_inode; +} + +void ceph_free_inode(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + kfree(ci->i_symlink); + kmem_cache_free(ceph_inode_cachep, ci); +} + +void ceph_evict_inode(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); + struct ceph_inode_frag *frag; + struct rb_node *n; + + dout("evict_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode)); + + percpu_counter_dec(&mdsc->metric.total_inodes); + + truncate_inode_pages_final(&inode->i_data); + clear_inode(inode); + + ceph_fscache_unregister_inode_cookie(ci); + + __ceph_remove_caps(ci); + + if (__ceph_has_any_quota(ci)) + ceph_adjust_quota_realms_count(inode, false); + + /* + * we may still have a snap_realm reference if there are stray + * caps in i_snap_caps. + */ + if (ci->i_snap_realm) { + if (ceph_snap(inode) == CEPH_NOSNAP) { + struct ceph_snap_realm *realm = ci->i_snap_realm; + dout(" dropping residual ref to snap realm %p\n", + realm); + spin_lock(&realm->inodes_with_caps_lock); + list_del_init(&ci->i_snap_realm_item); + ci->i_snap_realm = NULL; + if (realm->ino == ci->i_vino.ino) + realm->inode = NULL; + spin_unlock(&realm->inodes_with_caps_lock); + ceph_put_snap_realm(mdsc, realm); + } else { + ceph_put_snapid_map(mdsc, ci->i_snapid_map); + ci->i_snap_realm = NULL; + } + } + + while ((n = rb_first(&ci->i_fragtree)) != NULL) { + frag = rb_entry(n, struct ceph_inode_frag, node); + rb_erase(n, &ci->i_fragtree); + kfree(frag); + } + ci->i_fragtree_nsplits = 0; + + __ceph_destroy_xattrs(ci); + if (ci->i_xattrs.blob) + ceph_buffer_put(ci->i_xattrs.blob); + if (ci->i_xattrs.prealloc_blob) + ceph_buffer_put(ci->i_xattrs.prealloc_blob); + + ceph_put_string(rcu_dereference_raw(ci->i_layout.pool_ns)); + ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns)); +} + +static inline blkcnt_t calc_inode_blocks(u64 size) +{ + return (size + (1<<9) - 1) >> 9; +} + +/* + * Helpers to fill in size, ctime, mtime, and atime. We have to be + * careful because either the client or MDS may have more up to date + * info, depending on which capabilities are held, and whether + * time_warp_seq or truncate_seq have increased. (Ordinarily, mtime + * and size are monotonically increasing, except when utimes() or + * truncate() increments the corresponding _seq values.) + */ +int ceph_fill_file_size(struct inode *inode, int issued, + u32 truncate_seq, u64 truncate_size, u64 size) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int queue_trunc = 0; + + if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 || + (truncate_seq == ci->i_truncate_seq && size > inode->i_size)) { + dout("size %lld -> %llu\n", inode->i_size, size); + if (size > 0 && S_ISDIR(inode->i_mode)) { + pr_err("fill_file_size non-zero size for directory\n"); + size = 0; + } + i_size_write(inode, size); + inode->i_blocks = calc_inode_blocks(size); + ci->i_reported_size = size; + if (truncate_seq != ci->i_truncate_seq) { + dout("truncate_seq %u -> %u\n", + ci->i_truncate_seq, truncate_seq); + ci->i_truncate_seq = truncate_seq; + + /* the MDS should have revoked these caps */ + WARN_ON_ONCE(issued & (CEPH_CAP_FILE_RD | + CEPH_CAP_FILE_LAZYIO)); + /* + * If we hold relevant caps, or in the case where we're + * not the only client referencing this file and we + * don't hold those caps, then we need to check whether + * the file is either opened or mmaped + */ + if ((issued & (CEPH_CAP_FILE_CACHE| + CEPH_CAP_FILE_BUFFER)) || + mapping_mapped(inode->i_mapping) || + __ceph_is_file_opened(ci)) { + ci->i_truncate_pending++; + queue_trunc = 1; + } + } + } + if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) >= 0 && + ci->i_truncate_size != truncate_size) { + dout("truncate_size %lld -> %llu\n", ci->i_truncate_size, + truncate_size); + ci->i_truncate_size = truncate_size; + } + + if (queue_trunc) + ceph_fscache_invalidate(inode); + + return queue_trunc; +} + +void ceph_fill_file_time(struct inode *inode, int issued, + u64 time_warp_seq, struct timespec64 *ctime, + struct timespec64 *mtime, struct timespec64 *atime) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int warn = 0; + + if (issued & (CEPH_CAP_FILE_EXCL| + CEPH_CAP_FILE_WR| + CEPH_CAP_FILE_BUFFER| + CEPH_CAP_AUTH_EXCL| + CEPH_CAP_XATTR_EXCL)) { + if (ci->i_version == 0 || + timespec64_compare(ctime, &inode->i_ctime) > 0) { + dout("ctime %lld.%09ld -> %lld.%09ld inc w/ cap\n", + inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec, + ctime->tv_sec, ctime->tv_nsec); + inode->i_ctime = *ctime; + } + if (ci->i_version == 0 || + ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) { + /* the MDS did a utimes() */ + dout("mtime %lld.%09ld -> %lld.%09ld " + "tw %d -> %d\n", + inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec, + mtime->tv_sec, mtime->tv_nsec, + ci->i_time_warp_seq, (int)time_warp_seq); + + inode->i_mtime = *mtime; + inode->i_atime = *atime; + ci->i_time_warp_seq = time_warp_seq; + } else if (time_warp_seq == ci->i_time_warp_seq) { + /* nobody did utimes(); take the max */ + if (timespec64_compare(mtime, &inode->i_mtime) > 0) { + dout("mtime %lld.%09ld -> %lld.%09ld inc\n", + inode->i_mtime.tv_sec, + inode->i_mtime.tv_nsec, + mtime->tv_sec, mtime->tv_nsec); + inode->i_mtime = *mtime; + } + if (timespec64_compare(atime, &inode->i_atime) > 0) { + dout("atime %lld.%09ld -> %lld.%09ld inc\n", + inode->i_atime.tv_sec, + inode->i_atime.tv_nsec, + atime->tv_sec, atime->tv_nsec); + inode->i_atime = *atime; + } + } else if (issued & CEPH_CAP_FILE_EXCL) { + /* we did a utimes(); ignore mds values */ + } else { + warn = 1; + } + } else { + /* we have no write|excl caps; whatever the MDS says is true */ + if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) { + inode->i_ctime = *ctime; + inode->i_mtime = *mtime; + inode->i_atime = *atime; + ci->i_time_warp_seq = time_warp_seq; + } else { + warn = 1; + } + } + if (warn) /* time_warp_seq shouldn't go backwards */ + dout("%p mds time_warp_seq %llu < %u\n", + inode, time_warp_seq, ci->i_time_warp_seq); +} + +/* + * Populate an inode based on info from mds. May be called on new or + * existing inodes. + */ +int ceph_fill_inode(struct inode *inode, struct page *locked_page, + struct ceph_mds_reply_info_in *iinfo, + struct ceph_mds_reply_dirfrag *dirinfo, + struct ceph_mds_session *session, int cap_fmode, + struct ceph_cap_reservation *caps_reservation) +{ + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); + struct ceph_mds_reply_inode *info = iinfo->in; + struct ceph_inode_info *ci = ceph_inode(inode); + int issued, new_issued, info_caps; + struct timespec64 mtime, atime, ctime; + struct ceph_buffer *xattr_blob = NULL; + struct ceph_buffer *old_blob = NULL; + struct ceph_string *pool_ns = NULL; + struct ceph_cap *new_cap = NULL; + int err = 0; + bool wake = false; + bool queue_trunc = false; + bool new_version = false; + bool fill_inline = false; + + lockdep_assert_held(&mdsc->snap_rwsem); + + dout("%s %p ino %llx.%llx v %llu had %llu\n", __func__, + inode, ceph_vinop(inode), le64_to_cpu(info->version), + ci->i_version); + + info_caps = le32_to_cpu(info->cap.caps); + + /* prealloc new cap struct */ + if (info_caps && ceph_snap(inode) == CEPH_NOSNAP) { + new_cap = ceph_get_cap(mdsc, caps_reservation); + if (!new_cap) + return -ENOMEM; + } + + /* + * prealloc xattr data, if it looks like we'll need it. only + * if len > 4 (meaning there are actually xattrs; the first 4 + * bytes are the xattr count). + */ + if (iinfo->xattr_len > 4) { + xattr_blob = ceph_buffer_new(iinfo->xattr_len, GFP_NOFS); + if (!xattr_blob) + pr_err("%s ENOMEM xattr blob %d bytes\n", __func__, + iinfo->xattr_len); + } + + if (iinfo->pool_ns_len > 0) + pool_ns = ceph_find_or_create_string(iinfo->pool_ns_data, + iinfo->pool_ns_len); + + if (ceph_snap(inode) != CEPH_NOSNAP && !ci->i_snapid_map) + ci->i_snapid_map = ceph_get_snapid_map(mdsc, ceph_snap(inode)); + + spin_lock(&ci->i_ceph_lock); + + /* + * provided version will be odd if inode value is projected, + * even if stable. skip the update if we have newer stable + * info (ours>=theirs, e.g. due to racing mds replies), unless + * we are getting projected (unstable) info (in which case the + * version is odd, and we want ours>theirs). + * us them + * 2 2 skip + * 3 2 skip + * 3 3 update + */ + if (ci->i_version == 0 || + ((info->cap.flags & CEPH_CAP_FLAG_AUTH) && + le64_to_cpu(info->version) > (ci->i_version & ~1))) + new_version = true; + + /* Update change_attribute */ + inode_set_max_iversion_raw(inode, iinfo->change_attr); + + __ceph_caps_issued(ci, &issued); + issued |= __ceph_caps_dirty(ci); + new_issued = ~issued & info_caps; + + /* update inode */ + inode->i_rdev = le32_to_cpu(info->rdev); + /* directories have fl_stripe_unit set to zero */ + if (le32_to_cpu(info->layout.fl_stripe_unit)) + inode->i_blkbits = + fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; + else + inode->i_blkbits = CEPH_BLOCK_SHIFT; + + __ceph_update_quota(ci, iinfo->max_bytes, iinfo->max_files); + + if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) && + (issued & CEPH_CAP_AUTH_EXCL) == 0) { + inode->i_mode = le32_to_cpu(info->mode); + inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(info->uid)); + inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(info->gid)); + dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode, + from_kuid(&init_user_ns, inode->i_uid), + from_kgid(&init_user_ns, inode->i_gid)); + ceph_decode_timespec64(&ci->i_btime, &iinfo->btime); + ceph_decode_timespec64(&ci->i_snap_btime, &iinfo->snap_btime); + } + + if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) && + (issued & CEPH_CAP_LINK_EXCL) == 0) + set_nlink(inode, le32_to_cpu(info->nlink)); + + if (new_version || (new_issued & CEPH_CAP_ANY_RD)) { + /* be careful with mtime, atime, size */ + ceph_decode_timespec64(&atime, &info->atime); + ceph_decode_timespec64(&mtime, &info->mtime); + ceph_decode_timespec64(&ctime, &info->ctime); + ceph_fill_file_time(inode, issued, + le32_to_cpu(info->time_warp_seq), + &ctime, &mtime, &atime); + } + + if (new_version || (info_caps & CEPH_CAP_FILE_SHARED)) { + ci->i_files = le64_to_cpu(info->files); + ci->i_subdirs = le64_to_cpu(info->subdirs); + } + + if (new_version || + (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) { + s64 old_pool = ci->i_layout.pool_id; + struct ceph_string *old_ns; + + ceph_file_layout_from_legacy(&ci->i_layout, &info->layout); + old_ns = rcu_dereference_protected(ci->i_layout.pool_ns, + lockdep_is_held(&ci->i_ceph_lock)); + rcu_assign_pointer(ci->i_layout.pool_ns, pool_ns); + + if (ci->i_layout.pool_id != old_pool || pool_ns != old_ns) + ci->i_ceph_flags &= ~CEPH_I_POOL_PERM; + + pool_ns = old_ns; + + queue_trunc = ceph_fill_file_size(inode, issued, + le32_to_cpu(info->truncate_seq), + le64_to_cpu(info->truncate_size), + le64_to_cpu(info->size)); + /* only update max_size on auth cap */ + if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) && + ci->i_max_size != le64_to_cpu(info->max_size)) { + dout("max_size %lld -> %llu\n", ci->i_max_size, + le64_to_cpu(info->max_size)); + ci->i_max_size = le64_to_cpu(info->max_size); + } + } + + /* layout and rstat are not tracked by capability, update them if + * the inode info is from auth mds */ + if (new_version || (info->cap.flags & CEPH_CAP_FLAG_AUTH)) { + if (S_ISDIR(inode->i_mode)) { + ci->i_dir_layout = iinfo->dir_layout; + ci->i_rbytes = le64_to_cpu(info->rbytes); + ci->i_rfiles = le64_to_cpu(info->rfiles); + ci->i_rsubdirs = le64_to_cpu(info->rsubdirs); + ci->i_dir_pin = iinfo->dir_pin; + ceph_decode_timespec64(&ci->i_rctime, &info->rctime); + } + } + + /* xattrs */ + /* note that if i_xattrs.len <= 4, i_xattrs.data will still be NULL. */ + if ((ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) && + le64_to_cpu(info->xattr_version) > ci->i_xattrs.version) { + if (ci->i_xattrs.blob) + old_blob = ci->i_xattrs.blob; + ci->i_xattrs.blob = xattr_blob; + if (xattr_blob) + memcpy(ci->i_xattrs.blob->vec.iov_base, + iinfo->xattr_data, iinfo->xattr_len); + ci->i_xattrs.version = le64_to_cpu(info->xattr_version); + ceph_forget_all_cached_acls(inode); + ceph_security_invalidate_secctx(inode); + xattr_blob = NULL; + } + + /* finally update i_version */ + if (le64_to_cpu(info->version) > ci->i_version) + ci->i_version = le64_to_cpu(info->version); + + inode->i_mapping->a_ops = &ceph_aops; + + switch (inode->i_mode & S_IFMT) { + case S_IFIFO: + case S_IFBLK: + case S_IFCHR: + case S_IFSOCK: + inode->i_blkbits = PAGE_SHIFT; + init_special_inode(inode, inode->i_mode, inode->i_rdev); + inode->i_op = &ceph_file_iops; + break; + case S_IFREG: + inode->i_op = &ceph_file_iops; + inode->i_fop = &ceph_file_fops; + break; + case S_IFLNK: + inode->i_op = &ceph_symlink_iops; + if (!ci->i_symlink) { + u32 symlen = iinfo->symlink_len; + char *sym; + + spin_unlock(&ci->i_ceph_lock); + + if (symlen != i_size_read(inode)) { + pr_err("%s %llx.%llx BAD symlink " + "size %lld\n", __func__, + ceph_vinop(inode), + i_size_read(inode)); + i_size_write(inode, symlen); + inode->i_blocks = calc_inode_blocks(symlen); + } + + err = -ENOMEM; + sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS); + if (!sym) + goto out; + + spin_lock(&ci->i_ceph_lock); + if (!ci->i_symlink) + ci->i_symlink = sym; + else + kfree(sym); /* lost a race */ + } + inode->i_link = ci->i_symlink; + break; + case S_IFDIR: + inode->i_op = &ceph_dir_iops; + inode->i_fop = &ceph_dir_fops; + break; + default: + pr_err("%s %llx.%llx BAD mode 0%o\n", __func__, + ceph_vinop(inode), inode->i_mode); + } + + /* were we issued a capability? */ + if (info_caps) { + if (ceph_snap(inode) == CEPH_NOSNAP) { + ceph_add_cap(inode, session, + le64_to_cpu(info->cap.cap_id), + info_caps, + le32_to_cpu(info->cap.wanted), + le32_to_cpu(info->cap.seq), + le32_to_cpu(info->cap.mseq), + le64_to_cpu(info->cap.realm), + info->cap.flags, &new_cap); + + /* set dir completion flag? */ + if (S_ISDIR(inode->i_mode) && + ci->i_files == 0 && ci->i_subdirs == 0 && + (info_caps & CEPH_CAP_FILE_SHARED) && + (issued & CEPH_CAP_FILE_EXCL) == 0 && + !__ceph_dir_is_complete(ci)) { + dout(" marking %p complete (empty)\n", inode); + i_size_write(inode, 0); + __ceph_dir_set_complete(ci, + atomic64_read(&ci->i_release_count), + atomic64_read(&ci->i_ordered_count)); + } + + wake = true; + } else { + dout(" %p got snap_caps %s\n", inode, + ceph_cap_string(info_caps)); + ci->i_snap_caps |= info_caps; + } + } + + if (iinfo->inline_version > 0 && + iinfo->inline_version >= ci->i_inline_version) { + int cache_caps = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; + ci->i_inline_version = iinfo->inline_version; + if (ci->i_inline_version != CEPH_INLINE_NONE && + (locked_page || (info_caps & cache_caps))) + fill_inline = true; + } + + if (cap_fmode >= 0) { + if (!info_caps) + pr_warn("mds issued no caps on %llx.%llx\n", + ceph_vinop(inode)); + __ceph_touch_fmode(ci, mdsc, cap_fmode); + } + + spin_unlock(&ci->i_ceph_lock); + + if (fill_inline) + ceph_fill_inline_data(inode, locked_page, + iinfo->inline_data, iinfo->inline_len); + + if (wake) + wake_up_all(&ci->i_cap_wq); + + /* queue truncate if we saw i_size decrease */ + if (queue_trunc) + ceph_queue_vmtruncate(inode); + + /* populate frag tree */ + if (S_ISDIR(inode->i_mode)) + ceph_fill_fragtree(inode, &info->fragtree, dirinfo); + + /* update delegation info? */ + if (dirinfo) + ceph_fill_dirfrag(inode, dirinfo); + + err = 0; +out: + if (new_cap) + ceph_put_cap(mdsc, new_cap); + ceph_buffer_put(old_blob); + ceph_buffer_put(xattr_blob); + ceph_put_string(pool_ns); + return err; +} + +/* + * caller should hold session s_mutex and dentry->d_lock. + */ +static void __update_dentry_lease(struct inode *dir, struct dentry *dentry, + struct ceph_mds_reply_lease *lease, + struct ceph_mds_session *session, + unsigned long from_time, + struct ceph_mds_session **old_lease_session) +{ + struct ceph_dentry_info *di = ceph_dentry(dentry); + unsigned mask = le16_to_cpu(lease->mask); + long unsigned duration = le32_to_cpu(lease->duration_ms); + long unsigned ttl = from_time + (duration * HZ) / 1000; + long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000; + + dout("update_dentry_lease %p duration %lu ms ttl %lu\n", + dentry, duration, ttl); + + /* only track leases on regular dentries */ + if (ceph_snap(dir) != CEPH_NOSNAP) + return; + + if (mask & CEPH_LEASE_PRIMARY_LINK) + di->flags |= CEPH_DENTRY_PRIMARY_LINK; + else + di->flags &= ~CEPH_DENTRY_PRIMARY_LINK; + + di->lease_shared_gen = atomic_read(&ceph_inode(dir)->i_shared_gen); + if (!(mask & CEPH_LEASE_VALID)) { + __ceph_dentry_dir_lease_touch(di); + return; + } + + if (di->lease_gen == session->s_cap_gen && + time_before(ttl, di->time)) + return; /* we already have a newer lease. */ + + if (di->lease_session && di->lease_session != session) { + *old_lease_session = di->lease_session; + di->lease_session = NULL; + } + + if (!di->lease_session) + di->lease_session = ceph_get_mds_session(session); + di->lease_gen = session->s_cap_gen; + di->lease_seq = le32_to_cpu(lease->seq); + di->lease_renew_after = half_ttl; + di->lease_renew_from = 0; + di->time = ttl; + + __ceph_dentry_lease_touch(di); +} + +static inline void update_dentry_lease(struct inode *dir, struct dentry *dentry, + struct ceph_mds_reply_lease *lease, + struct ceph_mds_session *session, + unsigned long from_time) +{ + struct ceph_mds_session *old_lease_session = NULL; + spin_lock(&dentry->d_lock); + __update_dentry_lease(dir, dentry, lease, session, from_time, + &old_lease_session); + spin_unlock(&dentry->d_lock); + ceph_put_mds_session(old_lease_session); +} + +/* + * update dentry lease without having parent inode locked + */ +static void update_dentry_lease_careful(struct dentry *dentry, + struct ceph_mds_reply_lease *lease, + struct ceph_mds_session *session, + unsigned long from_time, + char *dname, u32 dname_len, + struct ceph_vino *pdvino, + struct ceph_vino *ptvino) + +{ + struct inode *dir; + struct ceph_mds_session *old_lease_session = NULL; + + spin_lock(&dentry->d_lock); + /* make sure dentry's name matches target */ + if (dentry->d_name.len != dname_len || + memcmp(dentry->d_name.name, dname, dname_len)) + goto out_unlock; + + dir = d_inode(dentry->d_parent); + /* make sure parent matches dvino */ + if (!ceph_ino_compare(dir, pdvino)) + goto out_unlock; + + /* make sure dentry's inode matches target. NULL ptvino means that + * we expect a negative dentry */ + if (ptvino) { + if (d_really_is_negative(dentry)) + goto out_unlock; + if (!ceph_ino_compare(d_inode(dentry), ptvino)) + goto out_unlock; + } else { + if (d_really_is_positive(dentry)) + goto out_unlock; + } + + __update_dentry_lease(dir, dentry, lease, session, + from_time, &old_lease_session); +out_unlock: + spin_unlock(&dentry->d_lock); + ceph_put_mds_session(old_lease_session); +} + +/* + * splice a dentry to an inode. + * caller must hold directory i_mutex for this to be safe. + */ +static int splice_dentry(struct dentry **pdn, struct inode *in) +{ + struct dentry *dn = *pdn; + struct dentry *realdn; + + BUG_ON(d_inode(dn)); + + if (S_ISDIR(in->i_mode)) { + /* If inode is directory, d_splice_alias() below will remove + * 'realdn' from its origin parent. We need to ensure that + * origin parent's readdir cache will not reference 'realdn' + */ + realdn = d_find_any_alias(in); + if (realdn) { + struct ceph_dentry_info *di = ceph_dentry(realdn); + spin_lock(&realdn->d_lock); + + realdn->d_op->d_prune(realdn); + + di->time = jiffies; + di->lease_shared_gen = 0; + di->offset = 0; + + spin_unlock(&realdn->d_lock); + dput(realdn); + } + } + + /* dn must be unhashed */ + if (!d_unhashed(dn)) + d_drop(dn); + realdn = d_splice_alias(in, dn); + if (IS_ERR(realdn)) { + pr_err("splice_dentry error %ld %p inode %p ino %llx.%llx\n", + PTR_ERR(realdn), dn, in, ceph_vinop(in)); + return PTR_ERR(realdn); + } + + if (realdn) { + dout("dn %p (%d) spliced with %p (%d) " + "inode %p ino %llx.%llx\n", + dn, d_count(dn), + realdn, d_count(realdn), + d_inode(realdn), ceph_vinop(d_inode(realdn))); + dput(dn); + *pdn = realdn; + } else { + BUG_ON(!ceph_dentry(dn)); + dout("dn %p attached to %p ino %llx.%llx\n", + dn, d_inode(dn), ceph_vinop(d_inode(dn))); + } + return 0; +} + +/* + * Incorporate results into the local cache. This is either just + * one inode, or a directory, dentry, and possibly linked-to inode (e.g., + * after a lookup). + * + * A reply may contain + * a directory inode along with a dentry. + * and/or a target inode + * + * Called with snap_rwsem (read). + */ +int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) +{ + struct ceph_mds_session *session = req->r_session; + struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; + struct inode *in = NULL; + struct ceph_vino tvino, dvino; + struct ceph_fs_client *fsc = ceph_sb_to_client(sb); + int err = 0; + + dout("fill_trace %p is_dentry %d is_target %d\n", req, + rinfo->head->is_dentry, rinfo->head->is_target); + + if (!rinfo->head->is_target && !rinfo->head->is_dentry) { + dout("fill_trace reply is empty!\n"); + if (rinfo->head->result == 0 && req->r_parent) + ceph_invalidate_dir_request(req); + return 0; + } + + if (rinfo->head->is_dentry) { + struct inode *dir = req->r_parent; + + if (dir) { + err = ceph_fill_inode(dir, NULL, &rinfo->diri, + rinfo->dirfrag, session, -1, + &req->r_caps_reservation); + if (err < 0) + goto done; + } else { + WARN_ON_ONCE(1); + } + + if (dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME && + test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) && + !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) { + struct qstr dname; + struct dentry *dn, *parent; + + BUG_ON(!rinfo->head->is_target); + BUG_ON(req->r_dentry); + + parent = d_find_any_alias(dir); + BUG_ON(!parent); + + dname.name = rinfo->dname; + dname.len = rinfo->dname_len; + dname.hash = full_name_hash(parent, dname.name, dname.len); + tvino.ino = le64_to_cpu(rinfo->targeti.in->ino); + tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid); +retry_lookup: + dn = d_lookup(parent, &dname); + dout("d_lookup on parent=%p name=%.*s got %p\n", + parent, dname.len, dname.name, dn); + + if (!dn) { + dn = d_alloc(parent, &dname); + dout("d_alloc %p '%.*s' = %p\n", parent, + dname.len, dname.name, dn); + if (!dn) { + dput(parent); + err = -ENOMEM; + goto done; + } + err = 0; + } else if (d_really_is_positive(dn) && + (ceph_ino(d_inode(dn)) != tvino.ino || + ceph_snap(d_inode(dn)) != tvino.snap)) { + dout(" dn %p points to wrong inode %p\n", + dn, d_inode(dn)); + ceph_dir_clear_ordered(dir); + d_delete(dn); + dput(dn); + goto retry_lookup; + } + + req->r_dentry = dn; + dput(parent); + } + } + + if (rinfo->head->is_target) { + tvino.ino = le64_to_cpu(rinfo->targeti.in->ino); + tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid); + + in = ceph_get_inode(sb, tvino); + if (IS_ERR(in)) { + err = PTR_ERR(in); + goto done; + } + + err = ceph_fill_inode(in, req->r_locked_page, &rinfo->targeti, + NULL, session, + (!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) && + !test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags) && + rinfo->head->result == 0) ? req->r_fmode : -1, + &req->r_caps_reservation); + if (err < 0) { + pr_err("ceph_fill_inode badness %p %llx.%llx\n", + in, ceph_vinop(in)); + if (in->i_state & I_NEW) + discard_new_inode(in); + else + iput(in); + goto done; + } + req->r_target_inode = in; + if (in->i_state & I_NEW) + unlock_new_inode(in); + } + + /* + * ignore null lease/binding on snapdir ENOENT, or else we + * will have trouble splicing in the virtual snapdir later + */ + if (rinfo->head->is_dentry && + !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) && + test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) && + (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name, + fsc->mount_options->snapdir_name, + req->r_dentry->d_name.len))) { + /* + * lookup link rename : null -> possibly existing inode + * mknod symlink mkdir : null -> new inode + * unlink : linked -> null + */ + struct inode *dir = req->r_parent; + struct dentry *dn = req->r_dentry; + bool have_dir_cap, have_lease; + + BUG_ON(!dn); + BUG_ON(!dir); + BUG_ON(d_inode(dn->d_parent) != dir); + + dvino.ino = le64_to_cpu(rinfo->diri.in->ino); + dvino.snap = le64_to_cpu(rinfo->diri.in->snapid); + + BUG_ON(ceph_ino(dir) != dvino.ino); + BUG_ON(ceph_snap(dir) != dvino.snap); + + /* do we have a lease on the whole dir? */ + have_dir_cap = + (le32_to_cpu(rinfo->diri.in->cap.caps) & + CEPH_CAP_FILE_SHARED); + + /* do we have a dn lease? */ + have_lease = have_dir_cap || + le32_to_cpu(rinfo->dlease->duration_ms); + if (!have_lease) + dout("fill_trace no dentry lease or dir cap\n"); + + /* rename? */ + if (req->r_old_dentry && req->r_op == CEPH_MDS_OP_RENAME) { + struct inode *olddir = req->r_old_dentry_dir; + BUG_ON(!olddir); + + dout(" src %p '%pd' dst %p '%pd'\n", + req->r_old_dentry, + req->r_old_dentry, + dn, dn); + dout("fill_trace doing d_move %p -> %p\n", + req->r_old_dentry, dn); + + /* d_move screws up sibling dentries' offsets */ + ceph_dir_clear_ordered(dir); + ceph_dir_clear_ordered(olddir); + + d_move(req->r_old_dentry, dn); + dout(" src %p '%pd' dst %p '%pd'\n", + req->r_old_dentry, + req->r_old_dentry, + dn, dn); + + /* ensure target dentry is invalidated, despite + rehashing bug in vfs_rename_dir */ + ceph_invalidate_dentry_lease(dn); + + dout("dn %p gets new offset %lld\n", req->r_old_dentry, + ceph_dentry(req->r_old_dentry)->offset); + + /* swap r_dentry and r_old_dentry in case that + * splice_dentry() gets called later. This is safe + * because no other place will use them */ + req->r_dentry = req->r_old_dentry; + req->r_old_dentry = dn; + dn = req->r_dentry; + } + + /* null dentry? */ + if (!rinfo->head->is_target) { + dout("fill_trace null dentry\n"); + if (d_really_is_positive(dn)) { + dout("d_delete %p\n", dn); + ceph_dir_clear_ordered(dir); + d_delete(dn); + } else if (have_lease) { + if (d_unhashed(dn)) + d_add(dn, NULL); + update_dentry_lease(dir, dn, + rinfo->dlease, session, + req->r_request_started); + } + goto done; + } + + /* attach proper inode */ + if (d_really_is_negative(dn)) { + ceph_dir_clear_ordered(dir); + ihold(in); + err = splice_dentry(&req->r_dentry, in); + if (err < 0) + goto done; + dn = req->r_dentry; /* may have spliced */ + } else if (d_really_is_positive(dn) && d_inode(dn) != in) { + dout(" %p links to %p %llx.%llx, not %llx.%llx\n", + dn, d_inode(dn), ceph_vinop(d_inode(dn)), + ceph_vinop(in)); + d_invalidate(dn); + have_lease = false; + } + + if (have_lease) { + update_dentry_lease(dir, dn, + rinfo->dlease, session, + req->r_request_started); + } + dout(" final dn %p\n", dn); + } else if ((req->r_op == CEPH_MDS_OP_LOOKUPSNAP || + req->r_op == CEPH_MDS_OP_MKSNAP) && + test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) && + !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) { + struct inode *dir = req->r_parent; + + /* fill out a snapdir LOOKUPSNAP dentry */ + BUG_ON(!dir); + BUG_ON(ceph_snap(dir) != CEPH_SNAPDIR); + BUG_ON(!req->r_dentry); + dout(" linking snapped dir %p to dn %p\n", in, req->r_dentry); + ceph_dir_clear_ordered(dir); + ihold(in); + err = splice_dentry(&req->r_dentry, in); + if (err < 0) + goto done; + } else if (rinfo->head->is_dentry && req->r_dentry) { + /* parent inode is not locked, be carefull */ + struct ceph_vino *ptvino = NULL; + dvino.ino = le64_to_cpu(rinfo->diri.in->ino); + dvino.snap = le64_to_cpu(rinfo->diri.in->snapid); + if (rinfo->head->is_target) { + tvino.ino = le64_to_cpu(rinfo->targeti.in->ino); + tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid); + ptvino = &tvino; + } + update_dentry_lease_careful(req->r_dentry, rinfo->dlease, + session, req->r_request_started, + rinfo->dname, rinfo->dname_len, + &dvino, ptvino); + } +done: + dout("fill_trace done err=%d\n", err); + return err; +} + +/* + * Prepopulate our cache with readdir results, leases, etc. + */ +static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req, + struct ceph_mds_session *session) +{ + struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; + int i, err = 0; + + for (i = 0; i < rinfo->dir_nr; i++) { + struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i; + struct ceph_vino vino; + struct inode *in; + int rc; + + vino.ino = le64_to_cpu(rde->inode.in->ino); + vino.snap = le64_to_cpu(rde->inode.in->snapid); + + in = ceph_get_inode(req->r_dentry->d_sb, vino); + if (IS_ERR(in)) { + err = PTR_ERR(in); + dout("new_inode badness got %d\n", err); + continue; + } + rc = ceph_fill_inode(in, NULL, &rde->inode, NULL, session, + -1, &req->r_caps_reservation); + if (rc < 0) { + pr_err("ceph_fill_inode badness on %p got %d\n", + in, rc); + err = rc; + if (in->i_state & I_NEW) { + ihold(in); + discard_new_inode(in); + } + } else if (in->i_state & I_NEW) { + unlock_new_inode(in); + } + + /* avoid calling iput_final() in mds dispatch threads */ + ceph_async_iput(in); + } + + return err; +} + +void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl) +{ + if (ctl->page) { + kunmap(ctl->page); + put_page(ctl->page); + ctl->page = NULL; + } +} + +static int fill_readdir_cache(struct inode *dir, struct dentry *dn, + struct ceph_readdir_cache_control *ctl, + struct ceph_mds_request *req) +{ + struct ceph_inode_info *ci = ceph_inode(dir); + unsigned nsize = PAGE_SIZE / sizeof(struct dentry*); + unsigned idx = ctl->index % nsize; + pgoff_t pgoff = ctl->index / nsize; + + if (!ctl->page || pgoff != page_index(ctl->page)) { + ceph_readdir_cache_release(ctl); + if (idx == 0) + ctl->page = grab_cache_page(&dir->i_data, pgoff); + else + ctl->page = find_lock_page(&dir->i_data, pgoff); + if (!ctl->page) { + ctl->index = -1; + return idx == 0 ? -ENOMEM : 0; + } + /* reading/filling the cache are serialized by + * i_mutex, no need to use page lock */ + unlock_page(ctl->page); + ctl->dentries = kmap(ctl->page); + if (idx == 0) + memset(ctl->dentries, 0, PAGE_SIZE); + } + + if (req->r_dir_release_cnt == atomic64_read(&ci->i_release_count) && + req->r_dir_ordered_cnt == atomic64_read(&ci->i_ordered_count)) { + dout("readdir cache dn %p idx %d\n", dn, ctl->index); + ctl->dentries[idx] = dn; + ctl->index++; + } else { + dout("disable readdir cache\n"); + ctl->index = -1; + } + return 0; +} + +int ceph_readdir_prepopulate(struct ceph_mds_request *req, + struct ceph_mds_session *session) +{ + struct dentry *parent = req->r_dentry; + struct ceph_inode_info *ci = ceph_inode(d_inode(parent)); + struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; + struct qstr dname; + struct dentry *dn; + struct inode *in; + int err = 0, skipped = 0, ret, i; + struct ceph_mds_request_head *rhead = req->r_request->front.iov_base; + u32 frag = le32_to_cpu(rhead->args.readdir.frag); + u32 last_hash = 0; + u32 fpos_offset; + struct ceph_readdir_cache_control cache_ctl = {}; + + if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) + return readdir_prepopulate_inodes_only(req, session); + + if (rinfo->hash_order) { + if (req->r_path2) { + last_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash, + req->r_path2, + strlen(req->r_path2)); + last_hash = ceph_frag_value(last_hash); + } else if (rinfo->offset_hash) { + /* mds understands offset_hash */ + WARN_ON_ONCE(req->r_readdir_offset != 2); + last_hash = le32_to_cpu(rhead->args.readdir.offset_hash); + } + } + + if (rinfo->dir_dir && + le32_to_cpu(rinfo->dir_dir->frag) != frag) { + dout("readdir_prepopulate got new frag %x -> %x\n", + frag, le32_to_cpu(rinfo->dir_dir->frag)); + frag = le32_to_cpu(rinfo->dir_dir->frag); + if (!rinfo->hash_order) + req->r_readdir_offset = 2; + } + + if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) { + dout("readdir_prepopulate %d items under SNAPDIR dn %p\n", + rinfo->dir_nr, parent); + } else { + dout("readdir_prepopulate %d items under dn %p\n", + rinfo->dir_nr, parent); + if (rinfo->dir_dir) + ceph_fill_dirfrag(d_inode(parent), rinfo->dir_dir); + + if (ceph_frag_is_leftmost(frag) && + req->r_readdir_offset == 2 && + !(rinfo->hash_order && last_hash)) { + /* note dir version at start of readdir so we can + * tell if any dentries get dropped */ + req->r_dir_release_cnt = + atomic64_read(&ci->i_release_count); + req->r_dir_ordered_cnt = + atomic64_read(&ci->i_ordered_count); + req->r_readdir_cache_idx = 0; + } + } + + cache_ctl.index = req->r_readdir_cache_idx; + fpos_offset = req->r_readdir_offset; + + /* FIXME: release caps/leases if error occurs */ + for (i = 0; i < rinfo->dir_nr; i++) { + struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i; + struct ceph_vino tvino; + + dname.name = rde->name; + dname.len = rde->name_len; + dname.hash = full_name_hash(parent, dname.name, dname.len); + + tvino.ino = le64_to_cpu(rde->inode.in->ino); + tvino.snap = le64_to_cpu(rde->inode.in->snapid); + + if (rinfo->hash_order) { + u32 hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash, + rde->name, rde->name_len); + hash = ceph_frag_value(hash); + if (hash != last_hash) + fpos_offset = 2; + last_hash = hash; + rde->offset = ceph_make_fpos(hash, fpos_offset++, true); + } else { + rde->offset = ceph_make_fpos(frag, fpos_offset++, false); + } + +retry_lookup: + dn = d_lookup(parent, &dname); + dout("d_lookup on parent=%p name=%.*s got %p\n", + parent, dname.len, dname.name, dn); + + if (!dn) { + dn = d_alloc(parent, &dname); + dout("d_alloc %p '%.*s' = %p\n", parent, + dname.len, dname.name, dn); + if (!dn) { + dout("d_alloc badness\n"); + err = -ENOMEM; + goto out; + } + } else if (d_really_is_positive(dn) && + (ceph_ino(d_inode(dn)) != tvino.ino || + ceph_snap(d_inode(dn)) != tvino.snap)) { + struct ceph_dentry_info *di = ceph_dentry(dn); + dout(" dn %p points to wrong inode %p\n", + dn, d_inode(dn)); + + spin_lock(&dn->d_lock); + if (di->offset > 0 && + di->lease_shared_gen == + atomic_read(&ci->i_shared_gen)) { + __ceph_dir_clear_ordered(ci); + di->offset = 0; + } + spin_unlock(&dn->d_lock); + + d_delete(dn); + dput(dn); + goto retry_lookup; + } + + /* inode */ + if (d_really_is_positive(dn)) { + in = d_inode(dn); + } else { + in = ceph_get_inode(parent->d_sb, tvino); + if (IS_ERR(in)) { + dout("new_inode badness\n"); + d_drop(dn); + dput(dn); + err = PTR_ERR(in); + goto out; + } + } + + ret = ceph_fill_inode(in, NULL, &rde->inode, NULL, session, + -1, &req->r_caps_reservation); + if (ret < 0) { + pr_err("ceph_fill_inode badness on %p\n", in); + if (d_really_is_negative(dn)) { + /* avoid calling iput_final() in mds + * dispatch threads */ + if (in->i_state & I_NEW) { + ihold(in); + discard_new_inode(in); + } + ceph_async_iput(in); + } + d_drop(dn); + err = ret; + goto next_item; + } + if (in->i_state & I_NEW) + unlock_new_inode(in); + + if (d_really_is_negative(dn)) { + if (ceph_security_xattr_deadlock(in)) { + dout(" skip splicing dn %p to inode %p" + " (security xattr deadlock)\n", dn, in); + ceph_async_iput(in); + skipped++; + goto next_item; + } + + err = splice_dentry(&dn, in); + if (err < 0) + goto next_item; + } + + ceph_dentry(dn)->offset = rde->offset; + + update_dentry_lease(d_inode(parent), dn, + rde->lease, req->r_session, + req->r_request_started); + + if (err == 0 && skipped == 0 && cache_ctl.index >= 0) { + ret = fill_readdir_cache(d_inode(parent), dn, + &cache_ctl, req); + if (ret < 0) + err = ret; + } +next_item: + dput(dn); + } +out: + if (err == 0 && skipped == 0) { + set_bit(CEPH_MDS_R_DID_PREPOPULATE, &req->r_req_flags); + req->r_readdir_cache_idx = cache_ctl.index; + } + ceph_readdir_cache_release(&cache_ctl); + dout("readdir_prepopulate done\n"); + return err; +} + +bool ceph_inode_set_size(struct inode *inode, loff_t size) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + bool ret; + + spin_lock(&ci->i_ceph_lock); + dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size); + i_size_write(inode, size); + inode->i_blocks = calc_inode_blocks(size); + + ret = __ceph_should_report_size(ci); + + spin_unlock(&ci->i_ceph_lock); + return ret; +} + +/* + * Put reference to inode, but avoid calling iput_final() in current thread. + * iput_final() may wait for reahahead pages. The wait can cause deadlock in + * some contexts. + */ +void ceph_async_iput(struct inode *inode) +{ + if (!inode) + return; + for (;;) { + if (atomic_add_unless(&inode->i_count, -1, 1)) + break; + if (queue_work(ceph_inode_to_client(inode)->inode_wq, + &ceph_inode(inode)->i_work)) + break; + /* queue work failed, i_count must be at least 2 */ + } +} + +/* + * Write back inode data in a worker thread. (This can't be done + * in the message handler context.) + */ +void ceph_queue_writeback(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + set_bit(CEPH_I_WORK_WRITEBACK, &ci->i_work_mask); + + ihold(inode); + if (queue_work(ceph_inode_to_client(inode)->inode_wq, + &ci->i_work)) { + dout("ceph_queue_writeback %p\n", inode); + } else { + dout("ceph_queue_writeback %p already queued, mask=%lx\n", + inode, ci->i_work_mask); + iput(inode); + } +} + +/* + * queue an async invalidation + */ +void ceph_queue_invalidate(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + set_bit(CEPH_I_WORK_INVALIDATE_PAGES, &ci->i_work_mask); + + ihold(inode); + if (queue_work(ceph_inode_to_client(inode)->inode_wq, + &ceph_inode(inode)->i_work)) { + dout("ceph_queue_invalidate %p\n", inode); + } else { + dout("ceph_queue_invalidate %p already queued, mask=%lx\n", + inode, ci->i_work_mask); + iput(inode); + } +} + +/* + * Queue an async vmtruncate. If we fail to queue work, we will handle + * the truncation the next time we call __ceph_do_pending_vmtruncate. + */ +void ceph_queue_vmtruncate(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + set_bit(CEPH_I_WORK_VMTRUNCATE, &ci->i_work_mask); + + ihold(inode); + if (queue_work(ceph_inode_to_client(inode)->inode_wq, + &ci->i_work)) { + dout("ceph_queue_vmtruncate %p\n", inode); + } else { + dout("ceph_queue_vmtruncate %p already queued, mask=%lx\n", + inode, ci->i_work_mask); + iput(inode); + } +} + +static void ceph_do_invalidate_pages(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + u32 orig_gen; + int check = 0; + + mutex_lock(&ci->i_truncate_mutex); + + if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { + pr_warn_ratelimited("invalidate_pages %p %lld forced umount\n", + inode, ceph_ino(inode)); + mapping_set_error(inode->i_mapping, -EIO); + truncate_pagecache(inode, 0); + mutex_unlock(&ci->i_truncate_mutex); + goto out; + } + + spin_lock(&ci->i_ceph_lock); + dout("invalidate_pages %p gen %d revoking %d\n", inode, + ci->i_rdcache_gen, ci->i_rdcache_revoking); + if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { + if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE)) + check = 1; + spin_unlock(&ci->i_ceph_lock); + mutex_unlock(&ci->i_truncate_mutex); + goto out; + } + orig_gen = ci->i_rdcache_gen; + spin_unlock(&ci->i_ceph_lock); + + ceph_fscache_invalidate(inode); + if (invalidate_inode_pages2(inode->i_mapping) < 0) { + pr_err("invalidate_pages %p fails\n", inode); + } + + spin_lock(&ci->i_ceph_lock); + if (orig_gen == ci->i_rdcache_gen && + orig_gen == ci->i_rdcache_revoking) { + dout("invalidate_pages %p gen %d successful\n", inode, + ci->i_rdcache_gen); + ci->i_rdcache_revoking--; + check = 1; + } else { + dout("invalidate_pages %p gen %d raced, now %d revoking %d\n", + inode, orig_gen, ci->i_rdcache_gen, + ci->i_rdcache_revoking); + if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE)) + check = 1; + } + spin_unlock(&ci->i_ceph_lock); + mutex_unlock(&ci->i_truncate_mutex); +out: + if (check) + ceph_check_caps(ci, 0, NULL); +} + +/* + * Make sure any pending truncation is applied before doing anything + * that may depend on it. + */ +void __ceph_do_pending_vmtruncate(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + u64 to; + int wrbuffer_refs, finish = 0; + + mutex_lock(&ci->i_truncate_mutex); +retry: + spin_lock(&ci->i_ceph_lock); + if (ci->i_truncate_pending == 0) { + dout("__do_pending_vmtruncate %p none pending\n", inode); + spin_unlock(&ci->i_ceph_lock); + mutex_unlock(&ci->i_truncate_mutex); + return; + } + + /* + * make sure any dirty snapped pages are flushed before we + * possibly truncate them.. so write AND block! + */ + if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) { + spin_unlock(&ci->i_ceph_lock); + dout("__do_pending_vmtruncate %p flushing snaps first\n", + inode); + filemap_write_and_wait_range(&inode->i_data, 0, + inode->i_sb->s_maxbytes); + goto retry; + } + + /* there should be no reader or writer */ + WARN_ON_ONCE(ci->i_rd_ref || ci->i_wr_ref); + + to = ci->i_truncate_size; + wrbuffer_refs = ci->i_wrbuffer_ref; + dout("__do_pending_vmtruncate %p (%d) to %lld\n", inode, + ci->i_truncate_pending, to); + spin_unlock(&ci->i_ceph_lock); + + truncate_pagecache(inode, to); + + spin_lock(&ci->i_ceph_lock); + if (to == ci->i_truncate_size) { + ci->i_truncate_pending = 0; + finish = 1; + } + spin_unlock(&ci->i_ceph_lock); + if (!finish) + goto retry; + + mutex_unlock(&ci->i_truncate_mutex); + + if (wrbuffer_refs == 0) + ceph_check_caps(ci, 0, NULL); + + wake_up_all(&ci->i_cap_wq); +} + +static void ceph_inode_work(struct work_struct *work) +{ + struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info, + i_work); + struct inode *inode = &ci->vfs_inode; + + if (test_and_clear_bit(CEPH_I_WORK_WRITEBACK, &ci->i_work_mask)) { + dout("writeback %p\n", inode); + filemap_fdatawrite(&inode->i_data); + } + if (test_and_clear_bit(CEPH_I_WORK_INVALIDATE_PAGES, &ci->i_work_mask)) + ceph_do_invalidate_pages(inode); + + if (test_and_clear_bit(CEPH_I_WORK_VMTRUNCATE, &ci->i_work_mask)) + __ceph_do_pending_vmtruncate(inode); + + iput(inode); +} + +/* + * symlinks + */ +static const struct inode_operations ceph_symlink_iops = { + .get_link = simple_get_link, + .setattr = ceph_setattr, + .getattr = ceph_getattr, + .listxattr = ceph_listxattr, +}; + +int __ceph_setattr(struct inode *inode, struct iattr *attr) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + unsigned int ia_valid = attr->ia_valid; + struct ceph_mds_request *req; + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_cap_flush *prealloc_cf; + int issued; + int release = 0, dirtied = 0; + int mask = 0; + int err = 0; + int inode_dirty_flags = 0; + bool lock_snap_rwsem = false; + + prealloc_cf = ceph_alloc_cap_flush(); + if (!prealloc_cf) + return -ENOMEM; + + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETATTR, + USE_AUTH_MDS); + if (IS_ERR(req)) { + ceph_free_cap_flush(prealloc_cf); + return PTR_ERR(req); + } + + spin_lock(&ci->i_ceph_lock); + issued = __ceph_caps_issued(ci, NULL); + + if (!ci->i_head_snapc && + (issued & (CEPH_CAP_ANY_EXCL | CEPH_CAP_FILE_WR))) { + lock_snap_rwsem = true; + if (!down_read_trylock(&mdsc->snap_rwsem)) { + spin_unlock(&ci->i_ceph_lock); + down_read(&mdsc->snap_rwsem); + spin_lock(&ci->i_ceph_lock); + issued = __ceph_caps_issued(ci, NULL); + } + } + + dout("setattr %p issued %s\n", inode, ceph_cap_string(issued)); + + if (ia_valid & ATTR_UID) { + dout("setattr %p uid %d -> %d\n", inode, + from_kuid(&init_user_ns, inode->i_uid), + from_kuid(&init_user_ns, attr->ia_uid)); + if (issued & CEPH_CAP_AUTH_EXCL) { + inode->i_uid = attr->ia_uid; + dirtied |= CEPH_CAP_AUTH_EXCL; + } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 || + !uid_eq(attr->ia_uid, inode->i_uid)) { + req->r_args.setattr.uid = cpu_to_le32( + from_kuid(&init_user_ns, attr->ia_uid)); + mask |= CEPH_SETATTR_UID; + release |= CEPH_CAP_AUTH_SHARED; + } + } + if (ia_valid & ATTR_GID) { + dout("setattr %p gid %d -> %d\n", inode, + from_kgid(&init_user_ns, inode->i_gid), + from_kgid(&init_user_ns, attr->ia_gid)); + if (issued & CEPH_CAP_AUTH_EXCL) { + inode->i_gid = attr->ia_gid; + dirtied |= CEPH_CAP_AUTH_EXCL; + } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 || + !gid_eq(attr->ia_gid, inode->i_gid)) { + req->r_args.setattr.gid = cpu_to_le32( + from_kgid(&init_user_ns, attr->ia_gid)); + mask |= CEPH_SETATTR_GID; + release |= CEPH_CAP_AUTH_SHARED; + } + } + if (ia_valid & ATTR_MODE) { + dout("setattr %p mode 0%o -> 0%o\n", inode, inode->i_mode, + attr->ia_mode); + if (issued & CEPH_CAP_AUTH_EXCL) { + inode->i_mode = attr->ia_mode; + dirtied |= CEPH_CAP_AUTH_EXCL; + } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 || + attr->ia_mode != inode->i_mode) { + inode->i_mode = attr->ia_mode; + req->r_args.setattr.mode = cpu_to_le32(attr->ia_mode); + mask |= CEPH_SETATTR_MODE; + release |= CEPH_CAP_AUTH_SHARED; + } + } + + if (ia_valid & ATTR_ATIME) { + dout("setattr %p atime %lld.%ld -> %lld.%ld\n", inode, + inode->i_atime.tv_sec, inode->i_atime.tv_nsec, + attr->ia_atime.tv_sec, attr->ia_atime.tv_nsec); + if (issued & CEPH_CAP_FILE_EXCL) { + ci->i_time_warp_seq++; + inode->i_atime = attr->ia_atime; + dirtied |= CEPH_CAP_FILE_EXCL; + } else if ((issued & CEPH_CAP_FILE_WR) && + timespec64_compare(&inode->i_atime, + &attr->ia_atime) < 0) { + inode->i_atime = attr->ia_atime; + dirtied |= CEPH_CAP_FILE_WR; + } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 || + !timespec64_equal(&inode->i_atime, &attr->ia_atime)) { + ceph_encode_timespec64(&req->r_args.setattr.atime, + &attr->ia_atime); + mask |= CEPH_SETATTR_ATIME; + release |= CEPH_CAP_FILE_SHARED | + CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; + } + } + if (ia_valid & ATTR_SIZE) { + dout("setattr %p size %lld -> %lld\n", inode, + inode->i_size, attr->ia_size); + if ((issued & CEPH_CAP_FILE_EXCL) && + attr->ia_size > inode->i_size) { + i_size_write(inode, attr->ia_size); + inode->i_blocks = calc_inode_blocks(attr->ia_size); + ci->i_reported_size = attr->ia_size; + dirtied |= CEPH_CAP_FILE_EXCL; + ia_valid |= ATTR_MTIME; + } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 || + attr->ia_size != inode->i_size) { + req->r_args.setattr.size = cpu_to_le64(attr->ia_size); + req->r_args.setattr.old_size = + cpu_to_le64(inode->i_size); + mask |= CEPH_SETATTR_SIZE; + release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL | + CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; + } + } + if (ia_valid & ATTR_MTIME) { + dout("setattr %p mtime %lld.%ld -> %lld.%ld\n", inode, + inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec, + attr->ia_mtime.tv_sec, attr->ia_mtime.tv_nsec); + if (issued & CEPH_CAP_FILE_EXCL) { + ci->i_time_warp_seq++; + inode->i_mtime = attr->ia_mtime; + dirtied |= CEPH_CAP_FILE_EXCL; + } else if ((issued & CEPH_CAP_FILE_WR) && + timespec64_compare(&inode->i_mtime, + &attr->ia_mtime) < 0) { + inode->i_mtime = attr->ia_mtime; + dirtied |= CEPH_CAP_FILE_WR; + } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 || + !timespec64_equal(&inode->i_mtime, &attr->ia_mtime)) { + ceph_encode_timespec64(&req->r_args.setattr.mtime, + &attr->ia_mtime); + mask |= CEPH_SETATTR_MTIME; + release |= CEPH_CAP_FILE_SHARED | + CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; + } + } + + /* these do nothing */ + if (ia_valid & ATTR_CTIME) { + bool only = (ia_valid & (ATTR_SIZE|ATTR_MTIME|ATTR_ATIME| + ATTR_MODE|ATTR_UID|ATTR_GID)) == 0; + dout("setattr %p ctime %lld.%ld -> %lld.%ld (%s)\n", inode, + inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec, + attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec, + only ? "ctime only" : "ignored"); + if (only) { + /* + * if kernel wants to dirty ctime but nothing else, + * we need to choose a cap to dirty under, or do + * a almost-no-op setattr + */ + if (issued & CEPH_CAP_AUTH_EXCL) + dirtied |= CEPH_CAP_AUTH_EXCL; + else if (issued & CEPH_CAP_FILE_EXCL) + dirtied |= CEPH_CAP_FILE_EXCL; + else if (issued & CEPH_CAP_XATTR_EXCL) + dirtied |= CEPH_CAP_XATTR_EXCL; + else + mask |= CEPH_SETATTR_CTIME; + } + } + if (ia_valid & ATTR_FILE) + dout("setattr %p ATTR_FILE ... hrm!\n", inode); + + if (dirtied) { + inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied, + &prealloc_cf); + inode->i_ctime = attr->ia_ctime; + } + + release &= issued; + spin_unlock(&ci->i_ceph_lock); + if (lock_snap_rwsem) + up_read(&mdsc->snap_rwsem); + + if (inode_dirty_flags) + __mark_inode_dirty(inode, inode_dirty_flags); + + + if (mask) { + req->r_inode = inode; + ihold(inode); + req->r_inode_drop = release; + req->r_args.setattr.mask = cpu_to_le32(mask); + req->r_num_caps = 1; + req->r_stamp = attr->ia_ctime; + err = ceph_mdsc_do_request(mdsc, NULL, req); + } + dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err, + ceph_cap_string(dirtied), mask); + + ceph_mdsc_put_request(req); + ceph_free_cap_flush(prealloc_cf); + + if (err >= 0 && (mask & CEPH_SETATTR_SIZE)) + __ceph_do_pending_vmtruncate(inode); + + return err; +} + +/* + * setattr + */ +int ceph_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = d_inode(dentry); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + int err; + + if (ceph_snap(inode) != CEPH_NOSNAP) + return -EROFS; + + err = setattr_prepare(dentry, attr); + if (err != 0) + return err; + + if ((attr->ia_valid & ATTR_SIZE) && + attr->ia_size > max(inode->i_size, fsc->max_file_size)) + return -EFBIG; + + if ((attr->ia_valid & ATTR_SIZE) && + ceph_quota_is_max_bytes_exceeded(inode, attr->ia_size)) + return -EDQUOT; + + err = __ceph_setattr(inode, attr); + + if (err >= 0 && (attr->ia_valid & ATTR_MODE)) + err = posix_acl_chmod(inode, attr->ia_mode); + + return err; +} + +/* + * Verify that we have a lease on the given mask. If not, + * do a getattr against an mds. + */ +int __ceph_do_getattr(struct inode *inode, struct page *locked_page, + int mask, bool force) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_request *req; + int mode; + int err; + + if (ceph_snap(inode) == CEPH_SNAPDIR) { + dout("do_getattr inode %p SNAPDIR\n", inode); + return 0; + } + + dout("do_getattr inode %p mask %s mode 0%o\n", + inode, ceph_cap_string(mask), inode->i_mode); + if (!force && ceph_caps_issued_mask_metric(ceph_inode(inode), mask, 1)) + return 0; + + mode = (mask & CEPH_STAT_RSTAT) ? USE_AUTH_MDS : USE_ANY_MDS; + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode); + if (IS_ERR(req)) + return PTR_ERR(req); + req->r_inode = inode; + ihold(inode); + req->r_num_caps = 1; + req->r_args.getattr.mask = cpu_to_le32(mask); + req->r_locked_page = locked_page; + err = ceph_mdsc_do_request(mdsc, NULL, req); + if (locked_page && err == 0) { + u64 inline_version = req->r_reply_info.targeti.inline_version; + if (inline_version == 0) { + /* the reply is supposed to contain inline data */ + err = -EINVAL; + } else if (inline_version == CEPH_INLINE_NONE) { + err = -ENODATA; + } else { + err = req->r_reply_info.targeti.inline_len; + } + } + ceph_mdsc_put_request(req); + dout("do_getattr result=%d\n", err); + return err; +} + + +/* + * Check inode permissions. We verify we have a valid value for + * the AUTH cap, then call the generic handler. + */ +int ceph_permission(struct inode *inode, int mask) +{ + int err; + + if (mask & MAY_NOT_BLOCK) + return -ECHILD; + + err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED, false); + + if (!err) + err = generic_permission(inode, mask); + return err; +} + +/* Craft a mask of needed caps given a set of requested statx attrs. */ +static int statx_to_caps(u32 want) +{ + int mask = 0; + + if (want & (STATX_MODE|STATX_UID|STATX_GID|STATX_CTIME|STATX_BTIME)) + mask |= CEPH_CAP_AUTH_SHARED; + + if (want & (STATX_NLINK|STATX_CTIME)) + mask |= CEPH_CAP_LINK_SHARED; + + if (want & (STATX_ATIME|STATX_MTIME|STATX_CTIME|STATX_SIZE| + STATX_BLOCKS)) + mask |= CEPH_CAP_FILE_SHARED; + + if (want & (STATX_CTIME)) + mask |= CEPH_CAP_XATTR_SHARED; + + return mask; +} + +/* + * Get all the attributes. If we have sufficient caps for the requested attrs, + * then we can avoid talking to the MDS at all. + */ +int ceph_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) +{ + struct inode *inode = d_inode(path->dentry); + struct ceph_inode_info *ci = ceph_inode(inode); + u32 valid_mask = STATX_BASIC_STATS; + int err = 0; + + /* Skip the getattr altogether if we're asked not to sync */ + if (!(flags & AT_STATX_DONT_SYNC)) { + err = ceph_do_getattr(inode, statx_to_caps(request_mask), + flags & AT_STATX_FORCE_SYNC); + if (err) + return err; + } + + generic_fillattr(inode, stat); + stat->ino = ceph_present_inode(inode); + + /* + * btime on newly-allocated inodes is 0, so if this is still set to + * that, then assume that it's not valid. + */ + if (ci->i_btime.tv_sec || ci->i_btime.tv_nsec) { + stat->btime = ci->i_btime; + valid_mask |= STATX_BTIME; + } + + if (ceph_snap(inode) == CEPH_NOSNAP) + stat->dev = inode->i_sb->s_dev; + else + stat->dev = ci->i_snapid_map ? ci->i_snapid_map->dev : 0; + + if (S_ISDIR(inode->i_mode)) { + if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), + RBYTES)) + stat->size = ci->i_rbytes; + else + stat->size = ci->i_files + ci->i_subdirs; + stat->blocks = 0; + stat->blksize = 65536; + /* + * Some applications rely on the number of st_nlink + * value on directories to be either 0 (if unlinked) + * or 2 + number of subdirectories. + */ + if (stat->nlink == 1) + /* '.' + '..' + subdirs */ + stat->nlink = 1 + 1 + ci->i_subdirs; + } + + stat->result_mask = request_mask & valid_mask; + return err; +} diff --git a/fs/ceph/io.c b/fs/ceph/io.c new file mode 100644 index 000000000..97602ea92 --- /dev/null +++ b/fs/ceph/io.c @@ -0,0 +1,163 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2016 Trond Myklebust + * Copyright (c) 2019 Jeff Layton + * + * I/O and data path helper functionality. + * + * Heavily borrowed from equivalent code in fs/nfs/io.c + */ + +#include <linux/ceph/ceph_debug.h> + +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/rwsem.h> +#include <linux/fs.h> + +#include "super.h" +#include "io.h" + +/* Call with exclusively locked inode->i_rwsem */ +static void ceph_block_o_direct(struct ceph_inode_info *ci, struct inode *inode) +{ + lockdep_assert_held_write(&inode->i_rwsem); + + if (READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT) { + spin_lock(&ci->i_ceph_lock); + ci->i_ceph_flags &= ~CEPH_I_ODIRECT; + spin_unlock(&ci->i_ceph_lock); + inode_dio_wait(inode); + } +} + +/** + * ceph_start_io_read - declare the file is being used for buffered reads + * @inode: file inode + * + * Declare that a buffered read operation is about to start, and ensure + * that we block all direct I/O. + * On exit, the function ensures that the CEPH_I_ODIRECT flag is unset, + * and holds a shared lock on inode->i_rwsem to ensure that the flag + * cannot be changed. + * In practice, this means that buffered read operations are allowed to + * execute in parallel, thanks to the shared lock, whereas direct I/O + * operations need to wait to grab an exclusive lock in order to set + * CEPH_I_ODIRECT. + * Note that buffered writes and truncates both take a write lock on + * inode->i_rwsem, meaning that those are serialised w.r.t. the reads. + */ +void +ceph_start_io_read(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + /* Be an optimist! */ + down_read(&inode->i_rwsem); + if (!(READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT)) + return; + up_read(&inode->i_rwsem); + /* Slow path.... */ + down_write(&inode->i_rwsem); + ceph_block_o_direct(ci, inode); + downgrade_write(&inode->i_rwsem); +} + +/** + * ceph_end_io_read - declare that the buffered read operation is done + * @inode: file inode + * + * Declare that a buffered read operation is done, and release the shared + * lock on inode->i_rwsem. + */ +void +ceph_end_io_read(struct inode *inode) +{ + up_read(&inode->i_rwsem); +} + +/** + * ceph_start_io_write - declare the file is being used for buffered writes + * @inode: file inode + * + * Declare that a buffered write operation is about to start, and ensure + * that we block all direct I/O. + */ +void +ceph_start_io_write(struct inode *inode) +{ + down_write(&inode->i_rwsem); + ceph_block_o_direct(ceph_inode(inode), inode); +} + +/** + * ceph_end_io_write - declare that the buffered write operation is done + * @inode: file inode + * + * Declare that a buffered write operation is done, and release the + * lock on inode->i_rwsem. + */ +void +ceph_end_io_write(struct inode *inode) +{ + up_write(&inode->i_rwsem); +} + +/* Call with exclusively locked inode->i_rwsem */ +static void ceph_block_buffered(struct ceph_inode_info *ci, struct inode *inode) +{ + lockdep_assert_held_write(&inode->i_rwsem); + + if (!(READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT)) { + spin_lock(&ci->i_ceph_lock); + ci->i_ceph_flags |= CEPH_I_ODIRECT; + spin_unlock(&ci->i_ceph_lock); + /* FIXME: unmap_mapping_range? */ + filemap_write_and_wait(inode->i_mapping); + } +} + +/** + * ceph_end_io_direct - declare the file is being used for direct i/o + * @inode: file inode + * + * Declare that a direct I/O operation is about to start, and ensure + * that we block all buffered I/O. + * On exit, the function ensures that the CEPH_I_ODIRECT flag is set, + * and holds a shared lock on inode->i_rwsem to ensure that the flag + * cannot be changed. + * In practice, this means that direct I/O operations are allowed to + * execute in parallel, thanks to the shared lock, whereas buffered I/O + * operations need to wait to grab an exclusive lock in order to clear + * CEPH_I_ODIRECT. + * Note that buffered writes and truncates both take a write lock on + * inode->i_rwsem, meaning that those are serialised w.r.t. O_DIRECT. + */ +void +ceph_start_io_direct(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + /* Be an optimist! */ + down_read(&inode->i_rwsem); + if (READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT) + return; + up_read(&inode->i_rwsem); + /* Slow path.... */ + down_write(&inode->i_rwsem); + ceph_block_buffered(ci, inode); + downgrade_write(&inode->i_rwsem); +} + +/** + * ceph_end_io_direct - declare that the direct i/o operation is done + * @inode: file inode + * + * Declare that a direct I/O operation is done, and release the shared + * lock on inode->i_rwsem. + */ +void +ceph_end_io_direct(struct inode *inode) +{ + up_read(&inode->i_rwsem); +} diff --git a/fs/ceph/io.h b/fs/ceph/io.h new file mode 100644 index 000000000..fa594cd77 --- /dev/null +++ b/fs/ceph/io.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _FS_CEPH_IO_H +#define _FS_CEPH_IO_H + +void ceph_start_io_read(struct inode *inode); +void ceph_end_io_read(struct inode *inode); +void ceph_start_io_write(struct inode *inode); +void ceph_end_io_write(struct inode *inode); +void ceph_start_io_direct(struct inode *inode); +void ceph_end_io_direct(struct inode *inode); + +#endif /* FS_CEPH_IO_H */ diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c new file mode 100644 index 000000000..6e061bf62 --- /dev/null +++ b/fs/ceph/ioctl.c @@ -0,0 +1,295 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ceph/ceph_debug.h> +#include <linux/in.h> + +#include "super.h" +#include "mds_client.h" +#include "ioctl.h" +#include <linux/ceph/striper.h> + +/* + * ioctls + */ + +/* + * get and set the file layout + */ +static long ceph_ioctl_get_layout(struct file *file, void __user *arg) +{ + struct ceph_inode_info *ci = ceph_inode(file_inode(file)); + struct ceph_ioctl_layout l; + int err; + + err = ceph_do_getattr(file_inode(file), CEPH_STAT_CAP_LAYOUT, false); + if (!err) { + l.stripe_unit = ci->i_layout.stripe_unit; + l.stripe_count = ci->i_layout.stripe_count; + l.object_size = ci->i_layout.object_size; + l.data_pool = ci->i_layout.pool_id; + l.preferred_osd = -1; + if (copy_to_user(arg, &l, sizeof(l))) + return -EFAULT; + } + + return err; +} + +static long __validate_layout(struct ceph_mds_client *mdsc, + struct ceph_ioctl_layout *l) +{ + int i, err; + + /* validate striping parameters */ + if ((l->object_size & ~PAGE_MASK) || + (l->stripe_unit & ~PAGE_MASK) || + ((unsigned)l->stripe_unit != 0 && + ((unsigned)l->object_size % (unsigned)l->stripe_unit))) + return -EINVAL; + + /* make sure it's a valid data pool */ + mutex_lock(&mdsc->mutex); + err = -EINVAL; + for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++) + if (mdsc->mdsmap->m_data_pg_pools[i] == l->data_pool) { + err = 0; + break; + } + mutex_unlock(&mdsc->mutex); + if (err) + return err; + + return 0; +} + +static long ceph_ioctl_set_layout(struct file *file, void __user *arg) +{ + struct inode *inode = file_inode(file); + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_mds_request *req; + struct ceph_ioctl_layout l; + struct ceph_inode_info *ci = ceph_inode(file_inode(file)); + struct ceph_ioctl_layout nl; + int err; + + if (copy_from_user(&l, arg, sizeof(l))) + return -EFAULT; + + /* validate changed params against current layout */ + err = ceph_do_getattr(file_inode(file), CEPH_STAT_CAP_LAYOUT, false); + if (err) + return err; + + memset(&nl, 0, sizeof(nl)); + if (l.stripe_count) + nl.stripe_count = l.stripe_count; + else + nl.stripe_count = ci->i_layout.stripe_count; + if (l.stripe_unit) + nl.stripe_unit = l.stripe_unit; + else + nl.stripe_unit = ci->i_layout.stripe_unit; + if (l.object_size) + nl.object_size = l.object_size; + else + nl.object_size = ci->i_layout.object_size; + if (l.data_pool) + nl.data_pool = l.data_pool; + else + nl.data_pool = ci->i_layout.pool_id; + + /* this is obsolete, and always -1 */ + nl.preferred_osd = -1; + + err = __validate_layout(mdsc, &nl); + if (err) + return err; + + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETLAYOUT, + USE_AUTH_MDS); + if (IS_ERR(req)) + return PTR_ERR(req); + req->r_inode = inode; + ihold(inode); + req->r_num_caps = 1; + + req->r_inode_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL; + + req->r_args.setlayout.layout.fl_stripe_unit = + cpu_to_le32(l.stripe_unit); + req->r_args.setlayout.layout.fl_stripe_count = + cpu_to_le32(l.stripe_count); + req->r_args.setlayout.layout.fl_object_size = + cpu_to_le32(l.object_size); + req->r_args.setlayout.layout.fl_pg_pool = cpu_to_le32(l.data_pool); + + err = ceph_mdsc_do_request(mdsc, NULL, req); + ceph_mdsc_put_request(req); + return err; +} + +/* + * Set a layout policy on a directory inode. All items in the tree + * rooted at this inode will inherit this layout on creation, + * (It doesn't apply retroactively ) + * unless a subdirectory has its own layout policy. + */ +static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg) +{ + struct inode *inode = file_inode(file); + struct ceph_mds_request *req; + struct ceph_ioctl_layout l; + int err; + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + + /* copy and validate */ + if (copy_from_user(&l, arg, sizeof(l))) + return -EFAULT; + + err = __validate_layout(mdsc, &l); + if (err) + return err; + + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETDIRLAYOUT, + USE_AUTH_MDS); + + if (IS_ERR(req)) + return PTR_ERR(req); + req->r_inode = inode; + ihold(inode); + req->r_num_caps = 1; + + req->r_args.setlayout.layout.fl_stripe_unit = + cpu_to_le32(l.stripe_unit); + req->r_args.setlayout.layout.fl_stripe_count = + cpu_to_le32(l.stripe_count); + req->r_args.setlayout.layout.fl_object_size = + cpu_to_le32(l.object_size); + req->r_args.setlayout.layout.fl_pg_pool = + cpu_to_le32(l.data_pool); + + err = ceph_mdsc_do_request(mdsc, inode, req); + ceph_mdsc_put_request(req); + return err; +} + +/* + * Return object name, size/offset information, and location (OSD + * number, network address) for a given file offset. + */ +static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) +{ + struct ceph_ioctl_dataloc dl; + struct inode *inode = file_inode(file); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_osd_client *osdc = + &ceph_sb_to_client(inode->i_sb)->client->osdc; + struct ceph_object_locator oloc; + CEPH_DEFINE_OID_ONSTACK(oid); + u32 xlen; + u64 tmp; + struct ceph_pg pgid; + int r; + + /* copy and validate */ + if (copy_from_user(&dl, arg, sizeof(dl))) + return -EFAULT; + + down_read(&osdc->lock); + ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, 1, + &dl.object_no, &dl.object_offset, &xlen); + dl.file_offset -= dl.object_offset; + dl.object_size = ci->i_layout.object_size; + dl.block_size = ci->i_layout.stripe_unit; + + /* block_offset = object_offset % block_size */ + tmp = dl.object_offset; + dl.block_offset = do_div(tmp, dl.block_size); + + snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx", + ceph_ino(inode), dl.object_no); + + oloc.pool = ci->i_layout.pool_id; + oloc.pool_ns = ceph_try_get_string(ci->i_layout.pool_ns); + ceph_oid_printf(&oid, "%s", dl.object_name); + + r = ceph_object_locator_to_pg(osdc->osdmap, &oid, &oloc, &pgid); + + ceph_oloc_destroy(&oloc); + if (r < 0) { + up_read(&osdc->lock); + return r; + } + + dl.osd = ceph_pg_to_acting_primary(osdc->osdmap, &pgid); + if (dl.osd >= 0) { + struct ceph_entity_addr *a = + ceph_osd_addr(osdc->osdmap, dl.osd); + if (a) + memcpy(&dl.osd_addr, &a->in_addr, sizeof(dl.osd_addr)); + } else { + memset(&dl.osd_addr, 0, sizeof(dl.osd_addr)); + } + up_read(&osdc->lock); + + /* send result back to user */ + if (copy_to_user(arg, &dl, sizeof(dl))) + return -EFAULT; + + return 0; +} + +static long ceph_ioctl_lazyio(struct file *file) +{ + struct ceph_file_info *fi = file->private_data; + struct inode *inode = file_inode(file); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; + + if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) { + spin_lock(&ci->i_ceph_lock); + fi->fmode |= CEPH_FILE_MODE_LAZY; + ci->i_nr_by_mode[ffs(CEPH_FILE_MODE_LAZY)]++; + __ceph_touch_fmode(ci, mdsc, fi->fmode); + spin_unlock(&ci->i_ceph_lock); + dout("ioctl_layzio: file %p marked lazy\n", file); + + ceph_check_caps(ci, 0, NULL); + } else { + dout("ioctl_layzio: file %p already lazy\n", file); + } + return 0; +} + +static long ceph_ioctl_syncio(struct file *file) +{ + struct ceph_file_info *fi = file->private_data; + + fi->flags |= CEPH_F_SYNC; + return 0; +} + +long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + dout("ioctl file %p cmd %u arg %lu\n", file, cmd, arg); + switch (cmd) { + case CEPH_IOC_GET_LAYOUT: + return ceph_ioctl_get_layout(file, (void __user *)arg); + + case CEPH_IOC_SET_LAYOUT: + return ceph_ioctl_set_layout(file, (void __user *)arg); + + case CEPH_IOC_SET_LAYOUT_POLICY: + return ceph_ioctl_set_layout_policy(file, (void __user *)arg); + + case CEPH_IOC_GET_DATALOC: + return ceph_ioctl_get_dataloc(file, (void __user *)arg); + + case CEPH_IOC_LAZYIO: + return ceph_ioctl_lazyio(file); + + case CEPH_IOC_SYNCIO: + return ceph_ioctl_syncio(file); + } + + return -ENOTTY; +} diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h new file mode 100644 index 000000000..51f7f1d39 --- /dev/null +++ b/fs/ceph/ioctl.h @@ -0,0 +1,101 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef FS_CEPH_IOCTL_H +#define FS_CEPH_IOCTL_H + +#include <linux/ioctl.h> +#include <linux/types.h> + +#define CEPH_IOCTL_MAGIC 0x97 + +/* + * CEPH_IOC_GET_LAYOUT - get file layout or dir layout policy + * CEPH_IOC_SET_LAYOUT - set file layout + * CEPH_IOC_SET_LAYOUT_POLICY - set dir layout policy + * + * The file layout specifies how file data is striped over objects in + * the distributed object store, which object pool they belong to (if + * it differs from the default), and an optional 'preferred osd' to + * store them on. + * + * Files get a new layout based on the policy set on the containing + * directory or one of its ancestors. The GET_LAYOUT ioctl will let + * you examine the layout for a file or the policy on a directory. + * + * SET_LAYOUT will let you set a layout on a newly created file. This + * only works immediately after the file is created and before any + * data is written to it. + * + * SET_LAYOUT_POLICY will let you set a layout policy (default layout) + * on a directory that will apply to any new files created in that + * directory (or any child directory that doesn't specify a layout of + * its own). + */ + +/* use u64 to align sanely on all archs */ +struct ceph_ioctl_layout { + __u64 stripe_unit, stripe_count, object_size; + __u64 data_pool; + + /* obsolete. new values ignored, always return -1 */ + __s64 preferred_osd; +}; + +#define CEPH_IOC_GET_LAYOUT _IOR(CEPH_IOCTL_MAGIC, 1, \ + struct ceph_ioctl_layout) +#define CEPH_IOC_SET_LAYOUT _IOW(CEPH_IOCTL_MAGIC, 2, \ + struct ceph_ioctl_layout) +#define CEPH_IOC_SET_LAYOUT_POLICY _IOW(CEPH_IOCTL_MAGIC, 5, \ + struct ceph_ioctl_layout) + +/* + * CEPH_IOC_GET_DATALOC - get location of file data in the cluster + * + * Extract identity, address of the OSD and object storing a given + * file offset. + */ +struct ceph_ioctl_dataloc { + __u64 file_offset; /* in+out: file offset */ + __u64 object_offset; /* out: offset in object */ + __u64 object_no; /* out: object # */ + __u64 object_size; /* out: object size */ + char object_name[64]; /* out: object name */ + __u64 block_offset; /* out: offset in block */ + __u64 block_size; /* out: block length */ + __s64 osd; /* out: osd # */ + struct sockaddr_storage osd_addr; /* out: osd address */ +}; + +#define CEPH_IOC_GET_DATALOC _IOWR(CEPH_IOCTL_MAGIC, 3, \ + struct ceph_ioctl_dataloc) + +/* + * CEPH_IOC_LAZYIO - relax consistency + * + * Normally Ceph switches to synchronous IO when multiple clients have + * the file open (and or more for write). Reads and writes bypass the + * page cache and go directly to the OSD. Setting this flag on a file + * descriptor will allow buffered IO for this file in cases where the + * application knows it won't interfere with other nodes (or doesn't + * care). + */ +#define CEPH_IOC_LAZYIO _IO(CEPH_IOCTL_MAGIC, 4) + +/* + * CEPH_IOC_SYNCIO - force synchronous IO + * + * This ioctl sets a file flag that forces the synchronous IO that + * bypasses the page cache, even if it is not necessary. This is + * essentially the opposite behavior of IOC_LAZYIO. This forces the + * same read/write path as a file opened by multiple clients when one + * or more of those clients is opened for write. + * + * Note that this type of sync IO takes a different path than a file + * opened with O_SYNC/D_SYNC (writes hit the page cache and are + * immediately flushed on page boundaries). It is very similar to + * O_DIRECT (writes bypass the page cache) excep that O_DIRECT writes + * are not copied (user page must remain stable) and O_DIRECT writes + * have alignment restrictions (on the buffer and file offset). + */ +#define CEPH_IOC_SYNCIO _IO(CEPH_IOCTL_MAGIC, 5) + +#endif diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c new file mode 100644 index 000000000..674d6ea89 --- /dev/null +++ b/fs/ceph/locks.c @@ -0,0 +1,493 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ceph/ceph_debug.h> + +#include <linux/file.h> +#include <linux/namei.h> +#include <linux/random.h> + +#include "super.h" +#include "mds_client.h" +#include <linux/ceph/pagelist.h> + +static u64 lock_secret; +static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req); + +static inline u64 secure_addr(void *addr) +{ + u64 v = lock_secret ^ (u64)(unsigned long)addr; + /* + * Set the most significant bit, so that MDS knows the 'owner' + * is sufficient to identify the owner of lock. (old code uses + * both 'owner' and 'pid') + */ + v |= (1ULL << 63); + return v; +} + +void __init ceph_flock_init(void) +{ + get_random_bytes(&lock_secret, sizeof(lock_secret)); +} + +static void ceph_fl_copy_lock(struct file_lock *dst, struct file_lock *src) +{ + struct inode *inode = file_inode(dst->fl_file); + atomic_inc(&ceph_inode(inode)->i_filelock_ref); +} + +static void ceph_fl_release_lock(struct file_lock *fl) +{ + struct inode *inode = file_inode(fl->fl_file); + struct ceph_inode_info *ci = ceph_inode(inode); + if (atomic_dec_and_test(&ci->i_filelock_ref)) { + /* clear error when all locks are released */ + spin_lock(&ci->i_ceph_lock); + ci->i_ceph_flags &= ~CEPH_I_ERROR_FILELOCK; + spin_unlock(&ci->i_ceph_lock); + } +} + +static const struct file_lock_operations ceph_fl_lock_ops = { + .fl_copy_lock = ceph_fl_copy_lock, + .fl_release_private = ceph_fl_release_lock, +}; + +/** + * Implement fcntl and flock locking functions. + */ +static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode, + int cmd, u8 wait, struct file_lock *fl) +{ + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); + struct ceph_mds_request *req; + int err; + u64 length = 0; + u64 owner; + + if (operation == CEPH_MDS_OP_SETFILELOCK) { + /* + * increasing i_filelock_ref closes race window between + * handling request reply and adding file_lock struct to + * inode. Otherwise, auth caps may get trimmed in the + * window. Caller function will decrease the counter. + */ + fl->fl_ops = &ceph_fl_lock_ops; + fl->fl_ops->fl_copy_lock(fl, NULL); + } + + if (operation != CEPH_MDS_OP_SETFILELOCK || cmd == CEPH_LOCK_UNLOCK) + wait = 0; + + req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS); + if (IS_ERR(req)) + return PTR_ERR(req); + req->r_inode = inode; + ihold(inode); + req->r_num_caps = 1; + + /* mds requires start and length rather than start and end */ + if (LLONG_MAX == fl->fl_end) + length = 0; + else + length = fl->fl_end - fl->fl_start + 1; + + owner = secure_addr(fl->fl_owner); + + dout("ceph_lock_message: rule: %d, op: %d, owner: %llx, pid: %llu, " + "start: %llu, length: %llu, wait: %d, type: %d\n", (int)lock_type, + (int)operation, owner, (u64)fl->fl_pid, fl->fl_start, length, + wait, fl->fl_type); + + req->r_args.filelock_change.rule = lock_type; + req->r_args.filelock_change.type = cmd; + req->r_args.filelock_change.owner = cpu_to_le64(owner); + req->r_args.filelock_change.pid = cpu_to_le64((u64)fl->fl_pid); + req->r_args.filelock_change.start = cpu_to_le64(fl->fl_start); + req->r_args.filelock_change.length = cpu_to_le64(length); + req->r_args.filelock_change.wait = wait; + + if (wait) + req->r_wait_for_completion = ceph_lock_wait_for_completion; + + err = ceph_mdsc_do_request(mdsc, inode, req); + if (!err && operation == CEPH_MDS_OP_GETFILELOCK) { + fl->fl_pid = -le64_to_cpu(req->r_reply_info.filelock_reply->pid); + if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type) + fl->fl_type = F_RDLCK; + else if (CEPH_LOCK_EXCL == req->r_reply_info.filelock_reply->type) + fl->fl_type = F_WRLCK; + else + fl->fl_type = F_UNLCK; + + fl->fl_start = le64_to_cpu(req->r_reply_info.filelock_reply->start); + length = le64_to_cpu(req->r_reply_info.filelock_reply->start) + + le64_to_cpu(req->r_reply_info.filelock_reply->length); + if (length >= 1) + fl->fl_end = length -1; + else + fl->fl_end = 0; + + } + ceph_mdsc_put_request(req); + dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, " + "length: %llu, wait: %d, type: %d, err code %d\n", (int)lock_type, + (int)operation, (u64)fl->fl_pid, fl->fl_start, + length, wait, fl->fl_type, err); + return err; +} + +static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req) +{ + struct ceph_mds_request *intr_req; + struct inode *inode = req->r_inode; + int err, lock_type; + + BUG_ON(req->r_op != CEPH_MDS_OP_SETFILELOCK); + if (req->r_args.filelock_change.rule == CEPH_LOCK_FCNTL) + lock_type = CEPH_LOCK_FCNTL_INTR; + else if (req->r_args.filelock_change.rule == CEPH_LOCK_FLOCK) + lock_type = CEPH_LOCK_FLOCK_INTR; + else + BUG_ON(1); + BUG_ON(req->r_args.filelock_change.type == CEPH_LOCK_UNLOCK); + + err = wait_for_completion_interruptible(&req->r_completion); + if (!err) + return 0; + + dout("ceph_lock_wait_for_completion: request %llu was interrupted\n", + req->r_tid); + + mutex_lock(&mdsc->mutex); + if (test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) { + err = 0; + } else { + /* + * ensure we aren't running concurrently with + * ceph_fill_trace or ceph_readdir_prepopulate, which + * rely on locks (dir mutex) held by our caller. + */ + mutex_lock(&req->r_fill_mutex); + req->r_err = err; + set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags); + mutex_unlock(&req->r_fill_mutex); + + if (!req->r_session) { + // haven't sent the request + err = 0; + } + } + mutex_unlock(&mdsc->mutex); + if (!err) + return 0; + + intr_req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETFILELOCK, + USE_AUTH_MDS); + if (IS_ERR(intr_req)) + return PTR_ERR(intr_req); + + intr_req->r_inode = inode; + ihold(inode); + intr_req->r_num_caps = 1; + + intr_req->r_args.filelock_change = req->r_args.filelock_change; + intr_req->r_args.filelock_change.rule = lock_type; + intr_req->r_args.filelock_change.type = CEPH_LOCK_UNLOCK; + + err = ceph_mdsc_do_request(mdsc, inode, intr_req); + ceph_mdsc_put_request(intr_req); + + if (err && err != -ERESTARTSYS) + return err; + + wait_for_completion_killable(&req->r_safe_completion); + return 0; +} + +static int try_unlock_file(struct file *file, struct file_lock *fl) +{ + int err; + unsigned int orig_flags = fl->fl_flags; + fl->fl_flags |= FL_EXISTS; + err = locks_lock_file_wait(file, fl); + fl->fl_flags = orig_flags; + if (err == -ENOENT) { + if (!(orig_flags & FL_EXISTS)) + err = 0; + return err; + } + return 1; +} + +/** + * Attempt to set an fcntl lock. + * For now, this just goes away to the server. Later it may be more awesome. + */ +int ceph_lock(struct file *file, int cmd, struct file_lock *fl) +{ + struct inode *inode = file_inode(file); + struct ceph_inode_info *ci = ceph_inode(inode); + int err = 0; + u16 op = CEPH_MDS_OP_SETFILELOCK; + u8 wait = 0; + u8 lock_cmd; + + if (!(fl->fl_flags & FL_POSIX)) + return -ENOLCK; + /* No mandatory locks */ + if (__mandatory_lock(file->f_mapping->host) && fl->fl_type != F_UNLCK) + return -ENOLCK; + + dout("ceph_lock, fl_owner: %p\n", fl->fl_owner); + + /* set wait bit as appropriate, then make command as Ceph expects it*/ + if (IS_GETLK(cmd)) + op = CEPH_MDS_OP_GETFILELOCK; + else if (IS_SETLKW(cmd)) + wait = 1; + + spin_lock(&ci->i_ceph_lock); + if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) { + err = -EIO; + } + spin_unlock(&ci->i_ceph_lock); + if (err < 0) { + if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK == fl->fl_type) + posix_lock_file(file, fl, NULL); + return err; + } + + if (F_RDLCK == fl->fl_type) + lock_cmd = CEPH_LOCK_SHARED; + else if (F_WRLCK == fl->fl_type) + lock_cmd = CEPH_LOCK_EXCL; + else + lock_cmd = CEPH_LOCK_UNLOCK; + + if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK == fl->fl_type) { + err = try_unlock_file(file, fl); + if (err <= 0) + return err; + } + + err = ceph_lock_message(CEPH_LOCK_FCNTL, op, inode, lock_cmd, wait, fl); + if (!err) { + if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK != fl->fl_type) { + dout("mds locked, locking locally\n"); + err = posix_lock_file(file, fl, NULL); + if (err) { + /* undo! This should only happen if + * the kernel detects local + * deadlock. */ + ceph_lock_message(CEPH_LOCK_FCNTL, op, inode, + CEPH_LOCK_UNLOCK, 0, fl); + dout("got %d on posix_lock_file, undid lock\n", + err); + } + } + } + return err; +} + +int ceph_flock(struct file *file, int cmd, struct file_lock *fl) +{ + struct inode *inode = file_inode(file); + struct ceph_inode_info *ci = ceph_inode(inode); + int err = 0; + u8 wait = 0; + u8 lock_cmd; + + if (!(fl->fl_flags & FL_FLOCK)) + return -ENOLCK; + /* No mandatory locks */ + if (fl->fl_type & LOCK_MAND) + return -EOPNOTSUPP; + + dout("ceph_flock, fl_file: %p\n", fl->fl_file); + + spin_lock(&ci->i_ceph_lock); + if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) { + err = -EIO; + } + spin_unlock(&ci->i_ceph_lock); + if (err < 0) { + if (F_UNLCK == fl->fl_type) + locks_lock_file_wait(file, fl); + return err; + } + + if (IS_SETLKW(cmd)) + wait = 1; + + if (F_RDLCK == fl->fl_type) + lock_cmd = CEPH_LOCK_SHARED; + else if (F_WRLCK == fl->fl_type) + lock_cmd = CEPH_LOCK_EXCL; + else + lock_cmd = CEPH_LOCK_UNLOCK; + + if (F_UNLCK == fl->fl_type) { + err = try_unlock_file(file, fl); + if (err <= 0) + return err; + } + + err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, + inode, lock_cmd, wait, fl); + if (!err && F_UNLCK != fl->fl_type) { + err = locks_lock_file_wait(file, fl); + if (err) { + ceph_lock_message(CEPH_LOCK_FLOCK, + CEPH_MDS_OP_SETFILELOCK, + inode, CEPH_LOCK_UNLOCK, 0, fl); + dout("got %d on locks_lock_file_wait, undid lock\n", err); + } + } + return err; +} + +/* + * Fills in the passed counter variables, so you can prepare pagelist metadata + * before calling ceph_encode_locks. + */ +void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count) +{ + struct file_lock *lock; + struct file_lock_context *ctx; + + *fcntl_count = 0; + *flock_count = 0; + + ctx = inode->i_flctx; + if (ctx) { + spin_lock(&ctx->flc_lock); + list_for_each_entry(lock, &ctx->flc_posix, fl_list) + ++(*fcntl_count); + list_for_each_entry(lock, &ctx->flc_flock, fl_list) + ++(*flock_count); + spin_unlock(&ctx->flc_lock); + } + dout("counted %d flock locks and %d fcntl locks\n", + *flock_count, *fcntl_count); +} + +/* + * Given a pointer to a lock, convert it to a ceph filelock + */ +static int lock_to_ceph_filelock(struct file_lock *lock, + struct ceph_filelock *cephlock) +{ + int err = 0; + cephlock->start = cpu_to_le64(lock->fl_start); + cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1); + cephlock->client = cpu_to_le64(0); + cephlock->pid = cpu_to_le64((u64)lock->fl_pid); + cephlock->owner = cpu_to_le64(secure_addr(lock->fl_owner)); + + switch (lock->fl_type) { + case F_RDLCK: + cephlock->type = CEPH_LOCK_SHARED; + break; + case F_WRLCK: + cephlock->type = CEPH_LOCK_EXCL; + break; + case F_UNLCK: + cephlock->type = CEPH_LOCK_UNLOCK; + break; + default: + dout("Have unknown lock type %d\n", lock->fl_type); + err = -EINVAL; + } + + return err; +} + +/** + * Encode the flock and fcntl locks for the given inode into the ceph_filelock + * array. Must be called with inode->i_lock already held. + * If we encounter more of a specific lock type than expected, return -ENOSPC. + */ +int ceph_encode_locks_to_buffer(struct inode *inode, + struct ceph_filelock *flocks, + int num_fcntl_locks, int num_flock_locks) +{ + struct file_lock *lock; + struct file_lock_context *ctx = inode->i_flctx; + int err = 0; + int seen_fcntl = 0; + int seen_flock = 0; + int l = 0; + + dout("encoding %d flock and %d fcntl locks\n", num_flock_locks, + num_fcntl_locks); + + if (!ctx) + return 0; + + spin_lock(&ctx->flc_lock); + list_for_each_entry(lock, &ctx->flc_posix, fl_list) { + ++seen_fcntl; + if (seen_fcntl > num_fcntl_locks) { + err = -ENOSPC; + goto fail; + } + err = lock_to_ceph_filelock(lock, &flocks[l]); + if (err) + goto fail; + ++l; + } + list_for_each_entry(lock, &ctx->flc_flock, fl_list) { + ++seen_flock; + if (seen_flock > num_flock_locks) { + err = -ENOSPC; + goto fail; + } + err = lock_to_ceph_filelock(lock, &flocks[l]); + if (err) + goto fail; + ++l; + } +fail: + spin_unlock(&ctx->flc_lock); + return err; +} + +/** + * Copy the encoded flock and fcntl locks into the pagelist. + * Format is: #fcntl locks, sequential fcntl locks, #flock locks, + * sequential flock locks. + * Returns zero on success. + */ +int ceph_locks_to_pagelist(struct ceph_filelock *flocks, + struct ceph_pagelist *pagelist, + int num_fcntl_locks, int num_flock_locks) +{ + int err = 0; + __le32 nlocks; + + nlocks = cpu_to_le32(num_fcntl_locks); + err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks)); + if (err) + goto out_fail; + + if (num_fcntl_locks > 0) { + err = ceph_pagelist_append(pagelist, flocks, + num_fcntl_locks * sizeof(*flocks)); + if (err) + goto out_fail; + } + + nlocks = cpu_to_le32(num_flock_locks); + err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks)); + if (err) + goto out_fail; + + if (num_flock_locks > 0) { + err = ceph_pagelist_append(pagelist, &flocks[num_fcntl_locks], + num_flock_locks * sizeof(*flocks)); + } +out_fail: + return err; +} diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c new file mode 100644 index 000000000..df1ecb8bf --- /dev/null +++ b/fs/ceph/mds_client.c @@ -0,0 +1,5275 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ceph/ceph_debug.h> + +#include <linux/fs.h> +#include <linux/wait.h> +#include <linux/slab.h> +#include <linux/gfp.h> +#include <linux/sched.h> +#include <linux/debugfs.h> +#include <linux/seq_file.h> +#include <linux/ratelimit.h> +#include <linux/bits.h> +#include <linux/ktime.h> + +#include "super.h" +#include "mds_client.h" + +#include <linux/ceph/ceph_features.h> +#include <linux/ceph/messenger.h> +#include <linux/ceph/decode.h> +#include <linux/ceph/pagelist.h> +#include <linux/ceph/auth.h> +#include <linux/ceph/debugfs.h> + +#define RECONNECT_MAX_SIZE (INT_MAX - PAGE_SIZE) + +/* + * A cluster of MDS (metadata server) daemons is responsible for + * managing the file system namespace (the directory hierarchy and + * inodes) and for coordinating shared access to storage. Metadata is + * partitioning hierarchically across a number of servers, and that + * partition varies over time as the cluster adjusts the distribution + * in order to balance load. + * + * The MDS client is primarily responsible to managing synchronous + * metadata requests for operations like open, unlink, and so forth. + * If there is a MDS failure, we find out about it when we (possibly + * request and) receive a new MDS map, and can resubmit affected + * requests. + * + * For the most part, though, we take advantage of a lossless + * communications channel to the MDS, and do not need to worry about + * timing out or resubmitting requests. + * + * We maintain a stateful "session" with each MDS we interact with. + * Within each session, we sent periodic heartbeat messages to ensure + * any capabilities or leases we have been issues remain valid. If + * the session times out and goes stale, our leases and capabilities + * are no longer valid. + */ + +struct ceph_reconnect_state { + struct ceph_mds_session *session; + int nr_caps, nr_realms; + struct ceph_pagelist *pagelist; + unsigned msg_version; + bool allow_multi; +}; + +static void __wake_requests(struct ceph_mds_client *mdsc, + struct list_head *head); +static void ceph_cap_release_work(struct work_struct *work); +static void ceph_cap_reclaim_work(struct work_struct *work); + +static const struct ceph_connection_operations mds_con_ops; + + +/* + * mds reply parsing + */ + +static int parse_reply_info_quota(void **p, void *end, + struct ceph_mds_reply_info_in *info) +{ + u8 struct_v, struct_compat; + u32 struct_len; + + ceph_decode_8_safe(p, end, struct_v, bad); + ceph_decode_8_safe(p, end, struct_compat, bad); + /* struct_v is expected to be >= 1. we only + * understand encoding with struct_compat == 1. */ + if (!struct_v || struct_compat != 1) + goto bad; + ceph_decode_32_safe(p, end, struct_len, bad); + ceph_decode_need(p, end, struct_len, bad); + end = *p + struct_len; + ceph_decode_64_safe(p, end, info->max_bytes, bad); + ceph_decode_64_safe(p, end, info->max_files, bad); + *p = end; + return 0; +bad: + return -EIO; +} + +/* + * parse individual inode info + */ +static int parse_reply_info_in(void **p, void *end, + struct ceph_mds_reply_info_in *info, + u64 features) +{ + int err = 0; + u8 struct_v = 0; + + if (features == (u64)-1) { + u32 struct_len; + u8 struct_compat; + ceph_decode_8_safe(p, end, struct_v, bad); + ceph_decode_8_safe(p, end, struct_compat, bad); + /* struct_v is expected to be >= 1. we only understand + * encoding with struct_compat == 1. */ + if (!struct_v || struct_compat != 1) + goto bad; + ceph_decode_32_safe(p, end, struct_len, bad); + ceph_decode_need(p, end, struct_len, bad); + end = *p + struct_len; + } + + ceph_decode_need(p, end, sizeof(struct ceph_mds_reply_inode), bad); + info->in = *p; + *p += sizeof(struct ceph_mds_reply_inode) + + sizeof(*info->in->fragtree.splits) * + le32_to_cpu(info->in->fragtree.nsplits); + + ceph_decode_32_safe(p, end, info->symlink_len, bad); + ceph_decode_need(p, end, info->symlink_len, bad); + info->symlink = *p; + *p += info->symlink_len; + + ceph_decode_copy_safe(p, end, &info->dir_layout, + sizeof(info->dir_layout), bad); + ceph_decode_32_safe(p, end, info->xattr_len, bad); + ceph_decode_need(p, end, info->xattr_len, bad); + info->xattr_data = *p; + *p += info->xattr_len; + + if (features == (u64)-1) { + /* inline data */ + ceph_decode_64_safe(p, end, info->inline_version, bad); + ceph_decode_32_safe(p, end, info->inline_len, bad); + ceph_decode_need(p, end, info->inline_len, bad); + info->inline_data = *p; + *p += info->inline_len; + /* quota */ + err = parse_reply_info_quota(p, end, info); + if (err < 0) + goto out_bad; + /* pool namespace */ + ceph_decode_32_safe(p, end, info->pool_ns_len, bad); + if (info->pool_ns_len > 0) { + ceph_decode_need(p, end, info->pool_ns_len, bad); + info->pool_ns_data = *p; + *p += info->pool_ns_len; + } + + /* btime */ + ceph_decode_need(p, end, sizeof(info->btime), bad); + ceph_decode_copy(p, &info->btime, sizeof(info->btime)); + + /* change attribute */ + ceph_decode_64_safe(p, end, info->change_attr, bad); + + /* dir pin */ + if (struct_v >= 2) { + ceph_decode_32_safe(p, end, info->dir_pin, bad); + } else { + info->dir_pin = -ENODATA; + } + + /* snapshot birth time, remains zero for v<=2 */ + if (struct_v >= 3) { + ceph_decode_need(p, end, sizeof(info->snap_btime), bad); + ceph_decode_copy(p, &info->snap_btime, + sizeof(info->snap_btime)); + } else { + memset(&info->snap_btime, 0, sizeof(info->snap_btime)); + } + + *p = end; + } else { + if (features & CEPH_FEATURE_MDS_INLINE_DATA) { + ceph_decode_64_safe(p, end, info->inline_version, bad); + ceph_decode_32_safe(p, end, info->inline_len, bad); + ceph_decode_need(p, end, info->inline_len, bad); + info->inline_data = *p; + *p += info->inline_len; + } else + info->inline_version = CEPH_INLINE_NONE; + + if (features & CEPH_FEATURE_MDS_QUOTA) { + err = parse_reply_info_quota(p, end, info); + if (err < 0) + goto out_bad; + } else { + info->max_bytes = 0; + info->max_files = 0; + } + + info->pool_ns_len = 0; + info->pool_ns_data = NULL; + if (features & CEPH_FEATURE_FS_FILE_LAYOUT_V2) { + ceph_decode_32_safe(p, end, info->pool_ns_len, bad); + if (info->pool_ns_len > 0) { + ceph_decode_need(p, end, info->pool_ns_len, bad); + info->pool_ns_data = *p; + *p += info->pool_ns_len; + } + } + + if (features & CEPH_FEATURE_FS_BTIME) { + ceph_decode_need(p, end, sizeof(info->btime), bad); + ceph_decode_copy(p, &info->btime, sizeof(info->btime)); + ceph_decode_64_safe(p, end, info->change_attr, bad); + } + + info->dir_pin = -ENODATA; + /* info->snap_btime remains zero */ + } + return 0; +bad: + err = -EIO; +out_bad: + return err; +} + +static int parse_reply_info_dir(void **p, void *end, + struct ceph_mds_reply_dirfrag **dirfrag, + u64 features) +{ + if (features == (u64)-1) { + u8 struct_v, struct_compat; + u32 struct_len; + ceph_decode_8_safe(p, end, struct_v, bad); + ceph_decode_8_safe(p, end, struct_compat, bad); + /* struct_v is expected to be >= 1. we only understand + * encoding whose struct_compat == 1. */ + if (!struct_v || struct_compat != 1) + goto bad; + ceph_decode_32_safe(p, end, struct_len, bad); + ceph_decode_need(p, end, struct_len, bad); + end = *p + struct_len; + } + + ceph_decode_need(p, end, sizeof(**dirfrag), bad); + *dirfrag = *p; + *p += sizeof(**dirfrag) + sizeof(u32) * le32_to_cpu((*dirfrag)->ndist); + if (unlikely(*p > end)) + goto bad; + if (features == (u64)-1) + *p = end; + return 0; +bad: + return -EIO; +} + +static int parse_reply_info_lease(void **p, void *end, + struct ceph_mds_reply_lease **lease, + u64 features) +{ + if (features == (u64)-1) { + u8 struct_v, struct_compat; + u32 struct_len; + ceph_decode_8_safe(p, end, struct_v, bad); + ceph_decode_8_safe(p, end, struct_compat, bad); + /* struct_v is expected to be >= 1. we only understand + * encoding whose struct_compat == 1. */ + if (!struct_v || struct_compat != 1) + goto bad; + ceph_decode_32_safe(p, end, struct_len, bad); + ceph_decode_need(p, end, struct_len, bad); + end = *p + struct_len; + } + + ceph_decode_need(p, end, sizeof(**lease), bad); + *lease = *p; + *p += sizeof(**lease); + if (features == (u64)-1) + *p = end; + return 0; +bad: + return -EIO; +} + +/* + * parse a normal reply, which may contain a (dir+)dentry and/or a + * target inode. + */ +static int parse_reply_info_trace(void **p, void *end, + struct ceph_mds_reply_info_parsed *info, + u64 features) +{ + int err; + + if (info->head->is_dentry) { + err = parse_reply_info_in(p, end, &info->diri, features); + if (err < 0) + goto out_bad; + + err = parse_reply_info_dir(p, end, &info->dirfrag, features); + if (err < 0) + goto out_bad; + + ceph_decode_32_safe(p, end, info->dname_len, bad); + ceph_decode_need(p, end, info->dname_len, bad); + info->dname = *p; + *p += info->dname_len; + + err = parse_reply_info_lease(p, end, &info->dlease, features); + if (err < 0) + goto out_bad; + } + + if (info->head->is_target) { + err = parse_reply_info_in(p, end, &info->targeti, features); + if (err < 0) + goto out_bad; + } + + if (unlikely(*p != end)) + goto bad; + return 0; + +bad: + err = -EIO; +out_bad: + pr_err("problem parsing mds trace %d\n", err); + return err; +} + +/* + * parse readdir results + */ +static int parse_reply_info_readdir(void **p, void *end, + struct ceph_mds_reply_info_parsed *info, + u64 features) +{ + u32 num, i = 0; + int err; + + err = parse_reply_info_dir(p, end, &info->dir_dir, features); + if (err < 0) + goto out_bad; + + ceph_decode_need(p, end, sizeof(num) + 2, bad); + num = ceph_decode_32(p); + { + u16 flags = ceph_decode_16(p); + info->dir_end = !!(flags & CEPH_READDIR_FRAG_END); + info->dir_complete = !!(flags & CEPH_READDIR_FRAG_COMPLETE); + info->hash_order = !!(flags & CEPH_READDIR_HASH_ORDER); + info->offset_hash = !!(flags & CEPH_READDIR_OFFSET_HASH); + } + if (num == 0) + goto done; + + BUG_ON(!info->dir_entries); + if ((unsigned long)(info->dir_entries + num) > + (unsigned long)info->dir_entries + info->dir_buf_size) { + pr_err("dir contents are larger than expected\n"); + WARN_ON(1); + goto bad; + } + + info->dir_nr = num; + while (num) { + struct ceph_mds_reply_dir_entry *rde = info->dir_entries + i; + /* dentry */ + ceph_decode_32_safe(p, end, rde->name_len, bad); + ceph_decode_need(p, end, rde->name_len, bad); + rde->name = *p; + *p += rde->name_len; + dout("parsed dir dname '%.*s'\n", rde->name_len, rde->name); + + /* dentry lease */ + err = parse_reply_info_lease(p, end, &rde->lease, features); + if (err) + goto out_bad; + /* inode */ + err = parse_reply_info_in(p, end, &rde->inode, features); + if (err < 0) + goto out_bad; + /* ceph_readdir_prepopulate() will update it */ + rde->offset = 0; + i++; + num--; + } + +done: + /* Skip over any unrecognized fields */ + *p = end; + return 0; + +bad: + err = -EIO; +out_bad: + pr_err("problem parsing dir contents %d\n", err); + return err; +} + +/* + * parse fcntl F_GETLK results + */ +static int parse_reply_info_filelock(void **p, void *end, + struct ceph_mds_reply_info_parsed *info, + u64 features) +{ + if (*p + sizeof(*info->filelock_reply) > end) + goto bad; + + info->filelock_reply = *p; + + /* Skip over any unrecognized fields */ + *p = end; + return 0; +bad: + return -EIO; +} + + +#if BITS_PER_LONG == 64 + +#define DELEGATED_INO_AVAILABLE xa_mk_value(1) + +static int ceph_parse_deleg_inos(void **p, void *end, + struct ceph_mds_session *s) +{ + u32 sets; + + ceph_decode_32_safe(p, end, sets, bad); + dout("got %u sets of delegated inodes\n", sets); + while (sets--) { + u64 start, len, ino; + + ceph_decode_64_safe(p, end, start, bad); + ceph_decode_64_safe(p, end, len, bad); + + /* Don't accept a delegation of system inodes */ + if (start < CEPH_INO_SYSTEM_BASE) { + pr_warn_ratelimited("ceph: ignoring reserved inode range delegation (start=0x%llx len=0x%llx)\n", + start, len); + continue; + } + while (len--) { + int err = xa_insert(&s->s_delegated_inos, ino = start++, + DELEGATED_INO_AVAILABLE, + GFP_KERNEL); + if (!err) { + dout("added delegated inode 0x%llx\n", + start - 1); + } else if (err == -EBUSY) { + pr_warn("ceph: MDS delegated inode 0x%llx more than once.\n", + start - 1); + } else { + return err; + } + } + } + return 0; +bad: + return -EIO; +} + +u64 ceph_get_deleg_ino(struct ceph_mds_session *s) +{ + unsigned long ino; + void *val; + + xa_for_each(&s->s_delegated_inos, ino, val) { + val = xa_erase(&s->s_delegated_inos, ino); + if (val == DELEGATED_INO_AVAILABLE) + return ino; + } + return 0; +} + +int ceph_restore_deleg_ino(struct ceph_mds_session *s, u64 ino) +{ + return xa_insert(&s->s_delegated_inos, ino, DELEGATED_INO_AVAILABLE, + GFP_KERNEL); +} +#else /* BITS_PER_LONG == 64 */ +/* + * FIXME: xarrays can't handle 64-bit indexes on a 32-bit arch. For now, just + * ignore delegated_inos on 32 bit arch. Maybe eventually add xarrays for top + * and bottom words? + */ +static int ceph_parse_deleg_inos(void **p, void *end, + struct ceph_mds_session *s) +{ + u32 sets; + + ceph_decode_32_safe(p, end, sets, bad); + if (sets) + ceph_decode_skip_n(p, end, sets * 2 * sizeof(__le64), bad); + return 0; +bad: + return -EIO; +} + +u64 ceph_get_deleg_ino(struct ceph_mds_session *s) +{ + return 0; +} + +int ceph_restore_deleg_ino(struct ceph_mds_session *s, u64 ino) +{ + return 0; +} +#endif /* BITS_PER_LONG == 64 */ + +/* + * parse create results + */ +static int parse_reply_info_create(void **p, void *end, + struct ceph_mds_reply_info_parsed *info, + u64 features, struct ceph_mds_session *s) +{ + int ret; + + if (features == (u64)-1 || + (features & CEPH_FEATURE_REPLY_CREATE_INODE)) { + if (*p == end) { + /* Malformed reply? */ + info->has_create_ino = false; + } else if (test_bit(CEPHFS_FEATURE_DELEG_INO, &s->s_features)) { + u8 struct_v, struct_compat; + u32 len; + + info->has_create_ino = true; + ceph_decode_8_safe(p, end, struct_v, bad); + ceph_decode_8_safe(p, end, struct_compat, bad); + ceph_decode_32_safe(p, end, len, bad); + ceph_decode_64_safe(p, end, info->ino, bad); + ret = ceph_parse_deleg_inos(p, end, s); + if (ret) + return ret; + } else { + /* legacy */ + ceph_decode_64_safe(p, end, info->ino, bad); + info->has_create_ino = true; + } + } else { + if (*p != end) + goto bad; + } + + /* Skip over any unrecognized fields */ + *p = end; + return 0; +bad: + return -EIO; +} + +/* + * parse extra results + */ +static int parse_reply_info_extra(void **p, void *end, + struct ceph_mds_reply_info_parsed *info, + u64 features, struct ceph_mds_session *s) +{ + u32 op = le32_to_cpu(info->head->op); + + if (op == CEPH_MDS_OP_GETFILELOCK) + return parse_reply_info_filelock(p, end, info, features); + else if (op == CEPH_MDS_OP_READDIR || op == CEPH_MDS_OP_LSSNAP) + return parse_reply_info_readdir(p, end, info, features); + else if (op == CEPH_MDS_OP_CREATE) + return parse_reply_info_create(p, end, info, features, s); + else + return -EIO; +} + +/* + * parse entire mds reply + */ +static int parse_reply_info(struct ceph_mds_session *s, struct ceph_msg *msg, + struct ceph_mds_reply_info_parsed *info, + u64 features) +{ + void *p, *end; + u32 len; + int err; + + info->head = msg->front.iov_base; + p = msg->front.iov_base + sizeof(struct ceph_mds_reply_head); + end = p + msg->front.iov_len - sizeof(struct ceph_mds_reply_head); + + /* trace */ + ceph_decode_32_safe(&p, end, len, bad); + if (len > 0) { + ceph_decode_need(&p, end, len, bad); + err = parse_reply_info_trace(&p, p+len, info, features); + if (err < 0) + goto out_bad; + } + + /* extra */ + ceph_decode_32_safe(&p, end, len, bad); + if (len > 0) { + ceph_decode_need(&p, end, len, bad); + err = parse_reply_info_extra(&p, p+len, info, features, s); + if (err < 0) + goto out_bad; + } + + /* snap blob */ + ceph_decode_32_safe(&p, end, len, bad); + info->snapblob_len = len; + info->snapblob = p; + p += len; + + if (p != end) + goto bad; + return 0; + +bad: + err = -EIO; +out_bad: + pr_err("mds parse_reply err %d\n", err); + return err; +} + +static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info) +{ + if (!info->dir_entries) + return; + free_pages((unsigned long)info->dir_entries, get_order(info->dir_buf_size)); +} + + +/* + * sessions + */ +const char *ceph_session_state_name(int s) +{ + switch (s) { + case CEPH_MDS_SESSION_NEW: return "new"; + case CEPH_MDS_SESSION_OPENING: return "opening"; + case CEPH_MDS_SESSION_OPEN: return "open"; + case CEPH_MDS_SESSION_HUNG: return "hung"; + case CEPH_MDS_SESSION_CLOSING: return "closing"; + case CEPH_MDS_SESSION_CLOSED: return "closed"; + case CEPH_MDS_SESSION_RESTARTING: return "restarting"; + case CEPH_MDS_SESSION_RECONNECTING: return "reconnecting"; + case CEPH_MDS_SESSION_REJECTED: return "rejected"; + default: return "???"; + } +} + +struct ceph_mds_session *ceph_get_mds_session(struct ceph_mds_session *s) +{ + if (refcount_inc_not_zero(&s->s_ref)) { + dout("mdsc get_session %p %d -> %d\n", s, + refcount_read(&s->s_ref)-1, refcount_read(&s->s_ref)); + return s; + } else { + dout("mdsc get_session %p 0 -- FAIL\n", s); + return NULL; + } +} + +void ceph_put_mds_session(struct ceph_mds_session *s) +{ + if (IS_ERR_OR_NULL(s)) + return; + + dout("mdsc put_session %p %d -> %d\n", s, + refcount_read(&s->s_ref), refcount_read(&s->s_ref)-1); + if (refcount_dec_and_test(&s->s_ref)) { + if (s->s_auth.authorizer) + ceph_auth_destroy_authorizer(s->s_auth.authorizer); + WARN_ON(mutex_is_locked(&s->s_mutex)); + xa_destroy(&s->s_delegated_inos); + kfree(s); + } +} + +/* + * called under mdsc->mutex + */ +struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc, + int mds) +{ + if (mds >= mdsc->max_sessions || !mdsc->sessions[mds]) + return NULL; + return ceph_get_mds_session(mdsc->sessions[mds]); +} + +static bool __have_session(struct ceph_mds_client *mdsc, int mds) +{ + if (mds >= mdsc->max_sessions || !mdsc->sessions[mds]) + return false; + else + return true; +} + +static int __verify_registered_session(struct ceph_mds_client *mdsc, + struct ceph_mds_session *s) +{ + if (s->s_mds >= mdsc->max_sessions || + mdsc->sessions[s->s_mds] != s) + return -ENOENT; + return 0; +} + +/* + * create+register a new session for given mds. + * called under mdsc->mutex. + */ +static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, + int mds) +{ + struct ceph_mds_session *s; + + if (mds >= mdsc->mdsmap->possible_max_rank) + return ERR_PTR(-EINVAL); + + s = kzalloc(sizeof(*s), GFP_NOFS); + if (!s) + return ERR_PTR(-ENOMEM); + + if (mds >= mdsc->max_sessions) { + int newmax = 1 << get_count_order(mds + 1); + struct ceph_mds_session **sa; + + dout("%s: realloc to %d\n", __func__, newmax); + sa = kcalloc(newmax, sizeof(void *), GFP_NOFS); + if (!sa) + goto fail_realloc; + if (mdsc->sessions) { + memcpy(sa, mdsc->sessions, + mdsc->max_sessions * sizeof(void *)); + kfree(mdsc->sessions); + } + mdsc->sessions = sa; + mdsc->max_sessions = newmax; + } + + dout("%s: mds%d\n", __func__, mds); + s->s_mdsc = mdsc; + s->s_mds = mds; + s->s_state = CEPH_MDS_SESSION_NEW; + s->s_ttl = 0; + s->s_seq = 0; + mutex_init(&s->s_mutex); + + ceph_con_init(&s->s_con, s, &mds_con_ops, &mdsc->fsc->client->msgr); + + spin_lock_init(&s->s_gen_ttl_lock); + s->s_cap_gen = 1; + s->s_cap_ttl = jiffies - 1; + + spin_lock_init(&s->s_cap_lock); + s->s_renew_requested = 0; + s->s_renew_seq = 0; + INIT_LIST_HEAD(&s->s_caps); + s->s_nr_caps = 0; + refcount_set(&s->s_ref, 1); + INIT_LIST_HEAD(&s->s_waiting); + INIT_LIST_HEAD(&s->s_unsafe); + xa_init(&s->s_delegated_inos); + s->s_num_cap_releases = 0; + s->s_cap_reconnect = 0; + s->s_cap_iterator = NULL; + INIT_LIST_HEAD(&s->s_cap_releases); + INIT_WORK(&s->s_cap_release_work, ceph_cap_release_work); + + INIT_LIST_HEAD(&s->s_cap_dirty); + INIT_LIST_HEAD(&s->s_cap_flushing); + + mdsc->sessions[mds] = s; + atomic_inc(&mdsc->num_sessions); + refcount_inc(&s->s_ref); /* one ref to sessions[], one to caller */ + + ceph_con_open(&s->s_con, CEPH_ENTITY_TYPE_MDS, mds, + ceph_mdsmap_get_addr(mdsc->mdsmap, mds)); + + return s; + +fail_realloc: + kfree(s); + return ERR_PTR(-ENOMEM); +} + +/* + * called under mdsc->mutex + */ +static void __unregister_session(struct ceph_mds_client *mdsc, + struct ceph_mds_session *s) +{ + dout("__unregister_session mds%d %p\n", s->s_mds, s); + BUG_ON(mdsc->sessions[s->s_mds] != s); + mdsc->sessions[s->s_mds] = NULL; + ceph_con_close(&s->s_con); + ceph_put_mds_session(s); + atomic_dec(&mdsc->num_sessions); +} + +/* + * drop session refs in request. + * + * should be last request ref, or hold mdsc->mutex + */ +static void put_request_session(struct ceph_mds_request *req) +{ + if (req->r_session) { + ceph_put_mds_session(req->r_session); + req->r_session = NULL; + } +} + +void ceph_mdsc_iterate_sessions(struct ceph_mds_client *mdsc, + void (*cb)(struct ceph_mds_session *), + bool check_state) +{ + int mds; + + mutex_lock(&mdsc->mutex); + for (mds = 0; mds < mdsc->max_sessions; ++mds) { + struct ceph_mds_session *s; + + s = __ceph_lookup_mds_session(mdsc, mds); + if (!s) + continue; + + if (check_state && !check_session_state(s)) { + ceph_put_mds_session(s); + continue; + } + + mutex_unlock(&mdsc->mutex); + cb(s); + ceph_put_mds_session(s); + mutex_lock(&mdsc->mutex); + } + mutex_unlock(&mdsc->mutex); +} + +void ceph_mdsc_release_request(struct kref *kref) +{ + struct ceph_mds_request *req = container_of(kref, + struct ceph_mds_request, + r_kref); + ceph_mdsc_release_dir_caps_no_check(req); + destroy_reply_info(&req->r_reply_info); + if (req->r_request) + ceph_msg_put(req->r_request); + if (req->r_reply) + ceph_msg_put(req->r_reply); + if (req->r_inode) { + ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN); + /* avoid calling iput_final() in mds dispatch threads */ + ceph_async_iput(req->r_inode); + } + if (req->r_parent) { + ceph_put_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN); + ceph_async_iput(req->r_parent); + } + ceph_async_iput(req->r_target_inode); + if (req->r_dentry) + dput(req->r_dentry); + if (req->r_old_dentry) + dput(req->r_old_dentry); + if (req->r_old_dentry_dir) { + /* + * track (and drop pins for) r_old_dentry_dir + * separately, since r_old_dentry's d_parent may have + * changed between the dir mutex being dropped and + * this request being freed. + */ + ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir), + CEPH_CAP_PIN); + ceph_async_iput(req->r_old_dentry_dir); + } + kfree(req->r_path1); + kfree(req->r_path2); + if (req->r_pagelist) + ceph_pagelist_release(req->r_pagelist); + put_request_session(req); + ceph_unreserve_caps(req->r_mdsc, &req->r_caps_reservation); + WARN_ON_ONCE(!list_empty(&req->r_wait)); + kmem_cache_free(ceph_mds_request_cachep, req); +} + +DEFINE_RB_FUNCS(request, struct ceph_mds_request, r_tid, r_node) + +/* + * lookup session, bump ref if found. + * + * called under mdsc->mutex. + */ +static struct ceph_mds_request * +lookup_get_request(struct ceph_mds_client *mdsc, u64 tid) +{ + struct ceph_mds_request *req; + + req = lookup_request(&mdsc->request_tree, tid); + if (req) + ceph_mdsc_get_request(req); + + return req; +} + +/* + * Register an in-flight request, and assign a tid. Link to directory + * are modifying (if any). + * + * Called under mdsc->mutex. + */ +static void __register_request(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req, + struct inode *dir) +{ + int ret = 0; + + req->r_tid = ++mdsc->last_tid; + if (req->r_num_caps) { + ret = ceph_reserve_caps(mdsc, &req->r_caps_reservation, + req->r_num_caps); + if (ret < 0) { + pr_err("__register_request %p " + "failed to reserve caps: %d\n", req, ret); + /* set req->r_err to fail early from __do_request */ + req->r_err = ret; + return; + } + } + dout("__register_request %p tid %lld\n", req, req->r_tid); + ceph_mdsc_get_request(req); + insert_request(&mdsc->request_tree, req); + + req->r_uid = current_fsuid(); + req->r_gid = current_fsgid(); + + if (mdsc->oldest_tid == 0 && req->r_op != CEPH_MDS_OP_SETFILELOCK) + mdsc->oldest_tid = req->r_tid; + + if (dir) { + struct ceph_inode_info *ci = ceph_inode(dir); + + ihold(dir); + req->r_unsafe_dir = dir; + spin_lock(&ci->i_unsafe_lock); + list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops); + spin_unlock(&ci->i_unsafe_lock); + } +} + +static void __unregister_request(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req) +{ + dout("__unregister_request %p tid %lld\n", req, req->r_tid); + + /* Never leave an unregistered request on an unsafe list! */ + list_del_init(&req->r_unsafe_item); + + if (req->r_tid == mdsc->oldest_tid) { + struct rb_node *p = rb_next(&req->r_node); + mdsc->oldest_tid = 0; + while (p) { + struct ceph_mds_request *next_req = + rb_entry(p, struct ceph_mds_request, r_node); + if (next_req->r_op != CEPH_MDS_OP_SETFILELOCK) { + mdsc->oldest_tid = next_req->r_tid; + break; + } + p = rb_next(p); + } + } + + erase_request(&mdsc->request_tree, req); + + if (req->r_unsafe_dir) { + struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir); + spin_lock(&ci->i_unsafe_lock); + list_del_init(&req->r_unsafe_dir_item); + spin_unlock(&ci->i_unsafe_lock); + } + if (req->r_target_inode && + test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) { + struct ceph_inode_info *ci = ceph_inode(req->r_target_inode); + spin_lock(&ci->i_unsafe_lock); + list_del_init(&req->r_unsafe_target_item); + spin_unlock(&ci->i_unsafe_lock); + } + + if (req->r_unsafe_dir) { + /* avoid calling iput_final() in mds dispatch threads */ + ceph_async_iput(req->r_unsafe_dir); + req->r_unsafe_dir = NULL; + } + + complete_all(&req->r_safe_completion); + + ceph_mdsc_put_request(req); +} + +/* + * Walk back up the dentry tree until we hit a dentry representing a + * non-snapshot inode. We do this using the rcu_read_lock (which must be held + * when calling this) to ensure that the objects won't disappear while we're + * working with them. Once we hit a candidate dentry, we attempt to take a + * reference to it, and return that as the result. + */ +static struct inode *get_nonsnap_parent(struct dentry *dentry) +{ + struct inode *inode = NULL; + + while (dentry && !IS_ROOT(dentry)) { + inode = d_inode_rcu(dentry); + if (!inode || ceph_snap(inode) == CEPH_NOSNAP) + break; + dentry = dentry->d_parent; + } + if (inode) + inode = igrab(inode); + return inode; +} + +/* + * Choose mds to send request to next. If there is a hint set in the + * request (e.g., due to a prior forward hint from the mds), use that. + * Otherwise, consult frag tree and/or caps to identify the + * appropriate mds. If all else fails, choose randomly. + * + * Called under mdsc->mutex. + */ +static int __choose_mds(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req, + bool *random) +{ + struct inode *inode; + struct ceph_inode_info *ci; + struct ceph_cap *cap; + int mode = req->r_direct_mode; + int mds = -1; + u32 hash = req->r_direct_hash; + bool is_hash = test_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags); + + if (random) + *random = false; + + /* + * is there a specific mds we should try? ignore hint if we have + * no session and the mds is not up (active or recovering). + */ + if (req->r_resend_mds >= 0 && + (__have_session(mdsc, req->r_resend_mds) || + ceph_mdsmap_get_state(mdsc->mdsmap, req->r_resend_mds) > 0)) { + dout("%s using resend_mds mds%d\n", __func__, + req->r_resend_mds); + return req->r_resend_mds; + } + + if (mode == USE_RANDOM_MDS) + goto random; + + inode = NULL; + if (req->r_inode) { + if (ceph_snap(req->r_inode) != CEPH_SNAPDIR) { + inode = req->r_inode; + ihold(inode); + } else { + /* req->r_dentry is non-null for LSSNAP request */ + rcu_read_lock(); + inode = get_nonsnap_parent(req->r_dentry); + rcu_read_unlock(); + dout("%s using snapdir's parent %p\n", __func__, inode); + } + } else if (req->r_dentry) { + /* ignore race with rename; old or new d_parent is okay */ + struct dentry *parent; + struct inode *dir; + + rcu_read_lock(); + parent = READ_ONCE(req->r_dentry->d_parent); + dir = req->r_parent ? : d_inode_rcu(parent); + + if (!dir || dir->i_sb != mdsc->fsc->sb) { + /* not this fs or parent went negative */ + inode = d_inode(req->r_dentry); + if (inode) + ihold(inode); + } else if (ceph_snap(dir) != CEPH_NOSNAP) { + /* direct snapped/virtual snapdir requests + * based on parent dir inode */ + inode = get_nonsnap_parent(parent); + dout("%s using nonsnap parent %p\n", __func__, inode); + } else { + /* dentry target */ + inode = d_inode(req->r_dentry); + if (!inode || mode == USE_AUTH_MDS) { + /* dir + name */ + inode = igrab(dir); + hash = ceph_dentry_hash(dir, req->r_dentry); + is_hash = true; + } else { + ihold(inode); + } + } + rcu_read_unlock(); + } + + dout("%s %p is_hash=%d (0x%x) mode %d\n", __func__, inode, (int)is_hash, + hash, mode); + if (!inode) + goto random; + ci = ceph_inode(inode); + + if (is_hash && S_ISDIR(inode->i_mode)) { + struct ceph_inode_frag frag; + int found; + + ceph_choose_frag(ci, hash, &frag, &found); + if (found) { + if (mode == USE_ANY_MDS && frag.ndist > 0) { + u8 r; + + /* choose a random replica */ + get_random_bytes(&r, 1); + r %= frag.ndist; + mds = frag.dist[r]; + dout("%s %p %llx.%llx frag %u mds%d (%d/%d)\n", + __func__, inode, ceph_vinop(inode), + frag.frag, mds, (int)r, frag.ndist); + if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >= + CEPH_MDS_STATE_ACTIVE && + !ceph_mdsmap_is_laggy(mdsc->mdsmap, mds)) + goto out; + } + + /* since this file/dir wasn't known to be + * replicated, then we want to look for the + * authoritative mds. */ + if (frag.mds >= 0) { + /* choose auth mds */ + mds = frag.mds; + dout("%s %p %llx.%llx frag %u mds%d (auth)\n", + __func__, inode, ceph_vinop(inode), + frag.frag, mds); + if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >= + CEPH_MDS_STATE_ACTIVE) { + if (!ceph_mdsmap_is_laggy(mdsc->mdsmap, + mds)) + goto out; + } + } + mode = USE_AUTH_MDS; + } + } + + spin_lock(&ci->i_ceph_lock); + cap = NULL; + if (mode == USE_AUTH_MDS) + cap = ci->i_auth_cap; + if (!cap && !RB_EMPTY_ROOT(&ci->i_caps)) + cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node); + if (!cap) { + spin_unlock(&ci->i_ceph_lock); + ceph_async_iput(inode); + goto random; + } + mds = cap->session->s_mds; + dout("%s %p %llx.%llx mds%d (%scap %p)\n", __func__, + inode, ceph_vinop(inode), mds, + cap == ci->i_auth_cap ? "auth " : "", cap); + spin_unlock(&ci->i_ceph_lock); +out: + /* avoid calling iput_final() while holding mdsc->mutex or + * in mds dispatch threads */ + ceph_async_iput(inode); + return mds; + +random: + if (random) + *random = true; + + mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap); + dout("%s chose random mds%d\n", __func__, mds); + return mds; +} + + +/* + * session messages + */ +struct ceph_msg *ceph_create_session_msg(u32 op, u64 seq) +{ + struct ceph_msg *msg; + struct ceph_mds_session_head *h; + + msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS, + false); + if (!msg) { + pr_err("ENOMEM creating session %s msg\n", + ceph_session_op_name(op)); + return NULL; + } + h = msg->front.iov_base; + h->op = cpu_to_le32(op); + h->seq = cpu_to_le64(seq); + + return msg; +} + +static const unsigned char feature_bits[] = CEPHFS_FEATURES_CLIENT_SUPPORTED; +#define FEATURE_BYTES(c) (DIV_ROUND_UP((size_t)feature_bits[c - 1] + 1, 64) * 8) +static int encode_supported_features(void **p, void *end) +{ + static const size_t count = ARRAY_SIZE(feature_bits); + + if (count > 0) { + size_t i; + size_t size = FEATURE_BYTES(count); + unsigned long bit; + + if (WARN_ON_ONCE(*p + 4 + size > end)) + return -ERANGE; + + ceph_encode_32(p, size); + memset(*p, 0, size); + for (i = 0; i < count; i++) { + bit = feature_bits[i]; + ((unsigned char *)(*p))[bit / 8] |= BIT(bit % 8); + } + *p += size; + } else { + if (WARN_ON_ONCE(*p + 4 > end)) + return -ERANGE; + + ceph_encode_32(p, 0); + } + + return 0; +} + +static const unsigned char metric_bits[] = CEPHFS_METRIC_SPEC_CLIENT_SUPPORTED; +#define METRIC_BYTES(cnt) (DIV_ROUND_UP((size_t)metric_bits[cnt - 1] + 1, 64) * 8) +static int encode_metric_spec(void **p, void *end) +{ + static const size_t count = ARRAY_SIZE(metric_bits); + + /* header */ + if (WARN_ON_ONCE(*p + 2 > end)) + return -ERANGE; + + ceph_encode_8(p, 1); /* version */ + ceph_encode_8(p, 1); /* compat */ + + if (count > 0) { + size_t i; + size_t size = METRIC_BYTES(count); + + if (WARN_ON_ONCE(*p + 4 + 4 + size > end)) + return -ERANGE; + + /* metric spec info length */ + ceph_encode_32(p, 4 + size); + + /* metric spec */ + ceph_encode_32(p, size); + memset(*p, 0, size); + for (i = 0; i < count; i++) + ((unsigned char *)(*p))[i / 8] |= BIT(metric_bits[i] % 8); + *p += size; + } else { + if (WARN_ON_ONCE(*p + 4 + 4 > end)) + return -ERANGE; + + /* metric spec info length */ + ceph_encode_32(p, 4); + /* metric spec */ + ceph_encode_32(p, 0); + } + + return 0; +} + +/* + * session message, specialization for CEPH_SESSION_REQUEST_OPEN + * to include additional client metadata fields. + */ +static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u64 seq) +{ + struct ceph_msg *msg; + struct ceph_mds_session_head *h; + int i = -1; + int extra_bytes = 0; + int metadata_key_count = 0; + struct ceph_options *opt = mdsc->fsc->client->options; + struct ceph_mount_options *fsopt = mdsc->fsc->mount_options; + size_t size, count; + void *p, *end; + int ret; + + const char* metadata[][2] = { + {"hostname", mdsc->nodename}, + {"kernel_version", init_utsname()->release}, + {"entity_id", opt->name ? : ""}, + {"root", fsopt->server_path ? : "/"}, + {NULL, NULL} + }; + + /* Calculate serialized length of metadata */ + extra_bytes = 4; /* map length */ + for (i = 0; metadata[i][0]; ++i) { + extra_bytes += 8 + strlen(metadata[i][0]) + + strlen(metadata[i][1]); + metadata_key_count++; + } + + /* supported feature */ + size = 0; + count = ARRAY_SIZE(feature_bits); + if (count > 0) + size = FEATURE_BYTES(count); + extra_bytes += 4 + size; + + /* metric spec */ + size = 0; + count = ARRAY_SIZE(metric_bits); + if (count > 0) + size = METRIC_BYTES(count); + extra_bytes += 2 + 4 + 4 + size; + + /* Allocate the message */ + msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h) + extra_bytes, + GFP_NOFS, false); + if (!msg) { + pr_err("ENOMEM creating session open msg\n"); + return ERR_PTR(-ENOMEM); + } + p = msg->front.iov_base; + end = p + msg->front.iov_len; + + h = p; + h->op = cpu_to_le32(CEPH_SESSION_REQUEST_OPEN); + h->seq = cpu_to_le64(seq); + + /* + * Serialize client metadata into waiting buffer space, using + * the format that userspace expects for map<string, string> + * + * ClientSession messages with metadata are v4 + */ + msg->hdr.version = cpu_to_le16(4); + msg->hdr.compat_version = cpu_to_le16(1); + + /* The write pointer, following the session_head structure */ + p += sizeof(*h); + + /* Number of entries in the map */ + ceph_encode_32(&p, metadata_key_count); + + /* Two length-prefixed strings for each entry in the map */ + for (i = 0; metadata[i][0]; ++i) { + size_t const key_len = strlen(metadata[i][0]); + size_t const val_len = strlen(metadata[i][1]); + + ceph_encode_32(&p, key_len); + memcpy(p, metadata[i][0], key_len); + p += key_len; + ceph_encode_32(&p, val_len); + memcpy(p, metadata[i][1], val_len); + p += val_len; + } + + ret = encode_supported_features(&p, end); + if (ret) { + pr_err("encode_supported_features failed!\n"); + ceph_msg_put(msg); + return ERR_PTR(ret); + } + + ret = encode_metric_spec(&p, end); + if (ret) { + pr_err("encode_metric_spec failed!\n"); + ceph_msg_put(msg); + return ERR_PTR(ret); + } + + msg->front.iov_len = p - msg->front.iov_base; + msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); + + return msg; +} + +/* + * send session open request. + * + * called under mdsc->mutex + */ +static int __open_session(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_msg *msg; + int mstate; + int mds = session->s_mds; + + /* wait for mds to go active? */ + mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds); + dout("open_session to mds%d (%s)\n", mds, + ceph_mds_state_name(mstate)); + session->s_state = CEPH_MDS_SESSION_OPENING; + session->s_renew_requested = jiffies; + + /* send connect message */ + msg = create_session_open_msg(mdsc, session->s_seq); + if (IS_ERR(msg)) + return PTR_ERR(msg); + ceph_con_send(&session->s_con, msg); + return 0; +} + +/* + * open sessions for any export targets for the given mds + * + * called under mdsc->mutex + */ +static struct ceph_mds_session * +__open_export_target_session(struct ceph_mds_client *mdsc, int target) +{ + struct ceph_mds_session *session; + int ret; + + session = __ceph_lookup_mds_session(mdsc, target); + if (!session) { + session = register_session(mdsc, target); + if (IS_ERR(session)) + return session; + } + if (session->s_state == CEPH_MDS_SESSION_NEW || + session->s_state == CEPH_MDS_SESSION_CLOSING) { + ret = __open_session(mdsc, session); + if (ret) + return ERR_PTR(ret); + } + + return session; +} + +struct ceph_mds_session * +ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target) +{ + struct ceph_mds_session *session; + + dout("open_export_target_session to mds%d\n", target); + + mutex_lock(&mdsc->mutex); + session = __open_export_target_session(mdsc, target); + mutex_unlock(&mdsc->mutex); + + return session; +} + +static void __open_export_target_sessions(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_mds_info *mi; + struct ceph_mds_session *ts; + int i, mds = session->s_mds; + + if (mds >= mdsc->mdsmap->possible_max_rank) + return; + + mi = &mdsc->mdsmap->m_info[mds]; + dout("open_export_target_sessions for mds%d (%d targets)\n", + session->s_mds, mi->num_export_targets); + + for (i = 0; i < mi->num_export_targets; i++) { + ts = __open_export_target_session(mdsc, mi->export_targets[i]); + ceph_put_mds_session(ts); + } +} + +void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + mutex_lock(&mdsc->mutex); + __open_export_target_sessions(mdsc, session); + mutex_unlock(&mdsc->mutex); +} + +/* + * session caps + */ + +static void detach_cap_releases(struct ceph_mds_session *session, + struct list_head *target) +{ + lockdep_assert_held(&session->s_cap_lock); + + list_splice_init(&session->s_cap_releases, target); + session->s_num_cap_releases = 0; + dout("dispose_cap_releases mds%d\n", session->s_mds); +} + +static void dispose_cap_releases(struct ceph_mds_client *mdsc, + struct list_head *dispose) +{ + while (!list_empty(dispose)) { + struct ceph_cap *cap; + /* zero out the in-progress message */ + cap = list_first_entry(dispose, struct ceph_cap, session_caps); + list_del(&cap->session_caps); + ceph_put_cap(mdsc, cap); + } +} + +static void cleanup_session_requests(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_mds_request *req; + struct rb_node *p; + + dout("cleanup_session_requests mds%d\n", session->s_mds); + mutex_lock(&mdsc->mutex); + while (!list_empty(&session->s_unsafe)) { + req = list_first_entry(&session->s_unsafe, + struct ceph_mds_request, r_unsafe_item); + pr_warn_ratelimited(" dropping unsafe request %llu\n", + req->r_tid); + if (req->r_target_inode) + mapping_set_error(req->r_target_inode->i_mapping, -EIO); + if (req->r_unsafe_dir) + mapping_set_error(req->r_unsafe_dir->i_mapping, -EIO); + __unregister_request(mdsc, req); + } + /* zero r_attempts, so kick_requests() will re-send requests */ + p = rb_first(&mdsc->request_tree); + while (p) { + req = rb_entry(p, struct ceph_mds_request, r_node); + p = rb_next(p); + if (req->r_session && + req->r_session->s_mds == session->s_mds) + req->r_attempts = 0; + } + mutex_unlock(&mdsc->mutex); +} + +/* + * Helper to safely iterate over all caps associated with a session, with + * special care taken to handle a racing __ceph_remove_cap(). + * + * Caller must hold session s_mutex. + */ +int ceph_iterate_session_caps(struct ceph_mds_session *session, + int (*cb)(struct inode *, struct ceph_cap *, + void *), void *arg) +{ + struct list_head *p; + struct ceph_cap *cap; + struct inode *inode, *last_inode = NULL; + struct ceph_cap *old_cap = NULL; + int ret; + + dout("iterate_session_caps %p mds%d\n", session, session->s_mds); + spin_lock(&session->s_cap_lock); + p = session->s_caps.next; + while (p != &session->s_caps) { + cap = list_entry(p, struct ceph_cap, session_caps); + inode = igrab(&cap->ci->vfs_inode); + if (!inode) { + p = p->next; + continue; + } + session->s_cap_iterator = cap; + spin_unlock(&session->s_cap_lock); + + if (last_inode) { + /* avoid calling iput_final() while holding + * s_mutex or in mds dispatch threads */ + ceph_async_iput(last_inode); + last_inode = NULL; + } + if (old_cap) { + ceph_put_cap(session->s_mdsc, old_cap); + old_cap = NULL; + } + + ret = cb(inode, cap, arg); + last_inode = inode; + + spin_lock(&session->s_cap_lock); + p = p->next; + if (!cap->ci) { + dout("iterate_session_caps finishing cap %p removal\n", + cap); + BUG_ON(cap->session != session); + cap->session = NULL; + list_del_init(&cap->session_caps); + session->s_nr_caps--; + atomic64_dec(&session->s_mdsc->metric.total_caps); + if (cap->queue_release) + __ceph_queue_cap_release(session, cap); + else + old_cap = cap; /* put_cap it w/o locks held */ + } + if (ret < 0) + goto out; + } + ret = 0; +out: + session->s_cap_iterator = NULL; + spin_unlock(&session->s_cap_lock); + + ceph_async_iput(last_inode); + if (old_cap) + ceph_put_cap(session->s_mdsc, old_cap); + + return ret; +} + +static int remove_capsnaps(struct ceph_mds_client *mdsc, struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_cap_snap *capsnap; + int capsnap_release = 0; + + lockdep_assert_held(&ci->i_ceph_lock); + + dout("removing capsnaps, ci is %p, inode is %p\n", ci, inode); + + while (!list_empty(&ci->i_cap_snaps)) { + capsnap = list_first_entry(&ci->i_cap_snaps, + struct ceph_cap_snap, ci_item); + __ceph_remove_capsnap(inode, capsnap, NULL, NULL); + ceph_put_snap_context(capsnap->context); + ceph_put_cap_snap(capsnap); + capsnap_release++; + } + wake_up_all(&ci->i_cap_wq); + wake_up_all(&mdsc->cap_flushing_wq); + return capsnap_release; +} + +static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, + void *arg) +{ + struct ceph_fs_client *fsc = (struct ceph_fs_client *)arg; + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_inode_info *ci = ceph_inode(inode); + LIST_HEAD(to_remove); + bool dirty_dropped = false; + bool invalidate = false; + int capsnap_release = 0; + + dout("removing cap %p, ci is %p, inode is %p\n", + cap, ci, &ci->vfs_inode); + spin_lock(&ci->i_ceph_lock); + __ceph_remove_cap(cap, false); + if (!ci->i_auth_cap) { + struct ceph_cap_flush *cf; + + if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { + if (inode->i_data.nrpages > 0) + invalidate = true; + if (ci->i_wrbuffer_ref > 0) + mapping_set_error(&inode->i_data, -EIO); + } + + while (!list_empty(&ci->i_cap_flush_list)) { + cf = list_first_entry(&ci->i_cap_flush_list, + struct ceph_cap_flush, i_list); + list_move(&cf->i_list, &to_remove); + } + + spin_lock(&mdsc->cap_dirty_lock); + + list_for_each_entry(cf, &to_remove, i_list) + list_del_init(&cf->g_list); + + if (!list_empty(&ci->i_dirty_item)) { + pr_warn_ratelimited( + " dropping dirty %s state for %p %lld\n", + ceph_cap_string(ci->i_dirty_caps), + inode, ceph_ino(inode)); + ci->i_dirty_caps = 0; + list_del_init(&ci->i_dirty_item); + dirty_dropped = true; + } + if (!list_empty(&ci->i_flushing_item)) { + pr_warn_ratelimited( + " dropping dirty+flushing %s state for %p %lld\n", + ceph_cap_string(ci->i_flushing_caps), + inode, ceph_ino(inode)); + ci->i_flushing_caps = 0; + list_del_init(&ci->i_flushing_item); + mdsc->num_cap_flushing--; + dirty_dropped = true; + } + spin_unlock(&mdsc->cap_dirty_lock); + + if (dirty_dropped) { + mapping_set_error(inode->i_mapping, -EIO); + + if (ci->i_wrbuffer_ref_head == 0 && + ci->i_wr_ref == 0 && + ci->i_dirty_caps == 0 && + ci->i_flushing_caps == 0) { + ceph_put_snap_context(ci->i_head_snapc); + ci->i_head_snapc = NULL; + } + } + + if (atomic_read(&ci->i_filelock_ref) > 0) { + /* make further file lock syscall return -EIO */ + ci->i_ceph_flags |= CEPH_I_ERROR_FILELOCK; + pr_warn_ratelimited(" dropping file locks for %p %lld\n", + inode, ceph_ino(inode)); + } + + if (!ci->i_dirty_caps && ci->i_prealloc_cap_flush) { + list_add(&ci->i_prealloc_cap_flush->i_list, &to_remove); + ci->i_prealloc_cap_flush = NULL; + } + + if (!list_empty(&ci->i_cap_snaps)) + capsnap_release = remove_capsnaps(mdsc, inode); + } + spin_unlock(&ci->i_ceph_lock); + while (!list_empty(&to_remove)) { + struct ceph_cap_flush *cf; + cf = list_first_entry(&to_remove, + struct ceph_cap_flush, i_list); + list_del_init(&cf->i_list); + if (!cf->is_capsnap) + ceph_free_cap_flush(cf); + } + + wake_up_all(&ci->i_cap_wq); + if (invalidate) + ceph_queue_invalidate(inode); + if (dirty_dropped) + iput(inode); + while (capsnap_release--) + iput(inode); + return 0; +} + +/* + * caller must hold session s_mutex + */ +static void remove_session_caps(struct ceph_mds_session *session) +{ + struct ceph_fs_client *fsc = session->s_mdsc->fsc; + struct super_block *sb = fsc->sb; + LIST_HEAD(dispose); + + dout("remove_session_caps on %p\n", session); + ceph_iterate_session_caps(session, remove_session_caps_cb, fsc); + + wake_up_all(&fsc->mdsc->cap_flushing_wq); + + spin_lock(&session->s_cap_lock); + if (session->s_nr_caps > 0) { + struct inode *inode; + struct ceph_cap *cap, *prev = NULL; + struct ceph_vino vino; + /* + * iterate_session_caps() skips inodes that are being + * deleted, we need to wait until deletions are complete. + * __wait_on_freeing_inode() is designed for the job, + * but it is not exported, so use lookup inode function + * to access it. + */ + while (!list_empty(&session->s_caps)) { + cap = list_entry(session->s_caps.next, + struct ceph_cap, session_caps); + if (cap == prev) + break; + prev = cap; + vino = cap->ci->i_vino; + spin_unlock(&session->s_cap_lock); + + inode = ceph_find_inode(sb, vino); + /* avoid calling iput_final() while holding s_mutex */ + ceph_async_iput(inode); + + spin_lock(&session->s_cap_lock); + } + } + + // drop cap expires and unlock s_cap_lock + detach_cap_releases(session, &dispose); + + BUG_ON(session->s_nr_caps > 0); + BUG_ON(!list_empty(&session->s_cap_flushing)); + spin_unlock(&session->s_cap_lock); + dispose_cap_releases(session->s_mdsc, &dispose); +} + +enum { + RECONNECT, + RENEWCAPS, + FORCE_RO, +}; + +/* + * wake up any threads waiting on this session's caps. if the cap is + * old (didn't get renewed on the client reconnect), remove it now. + * + * caller must hold s_mutex. + */ +static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap, + void *arg) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + unsigned long ev = (unsigned long)arg; + + if (ev == RECONNECT) { + spin_lock(&ci->i_ceph_lock); + ci->i_wanted_max_size = 0; + ci->i_requested_max_size = 0; + spin_unlock(&ci->i_ceph_lock); + } else if (ev == RENEWCAPS) { + if (cap->cap_gen < cap->session->s_cap_gen) { + /* mds did not re-issue stale cap */ + spin_lock(&ci->i_ceph_lock); + cap->issued = cap->implemented = CEPH_CAP_PIN; + spin_unlock(&ci->i_ceph_lock); + } + } else if (ev == FORCE_RO) { + } + wake_up_all(&ci->i_cap_wq); + return 0; +} + +static void wake_up_session_caps(struct ceph_mds_session *session, int ev) +{ + dout("wake_up_session_caps %p mds%d\n", session, session->s_mds); + ceph_iterate_session_caps(session, wake_up_session_cb, + (void *)(unsigned long)ev); +} + +/* + * Send periodic message to MDS renewing all currently held caps. The + * ack will reset the expiration for all caps from this session. + * + * caller holds s_mutex + */ +static int send_renew_caps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_msg *msg; + int state; + + if (time_after_eq(jiffies, session->s_cap_ttl) && + time_after_eq(session->s_cap_ttl, session->s_renew_requested)) + pr_info("mds%d caps stale\n", session->s_mds); + session->s_renew_requested = jiffies; + + /* do not try to renew caps until a recovering mds has reconnected + * with its clients. */ + state = ceph_mdsmap_get_state(mdsc->mdsmap, session->s_mds); + if (state < CEPH_MDS_STATE_RECONNECT) { + dout("send_renew_caps ignoring mds%d (%s)\n", + session->s_mds, ceph_mds_state_name(state)); + return 0; + } + + dout("send_renew_caps to mds%d (%s)\n", session->s_mds, + ceph_mds_state_name(state)); + msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS, + ++session->s_renew_seq); + if (!msg) + return -ENOMEM; + ceph_con_send(&session->s_con, msg); + return 0; +} + +static int send_flushmsg_ack(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, u64 seq) +{ + struct ceph_msg *msg; + + dout("send_flushmsg_ack to mds%d (%s)s seq %lld\n", + session->s_mds, ceph_session_state_name(session->s_state), seq); + msg = ceph_create_session_msg(CEPH_SESSION_FLUSHMSG_ACK, seq); + if (!msg) + return -ENOMEM; + ceph_con_send(&session->s_con, msg); + return 0; +} + + +/* + * Note new cap ttl, and any transition from stale -> not stale (fresh?). + * + * Called under session->s_mutex + */ +static void renewed_caps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, int is_renew) +{ + int was_stale; + int wake = 0; + + spin_lock(&session->s_cap_lock); + was_stale = is_renew && time_after_eq(jiffies, session->s_cap_ttl); + + session->s_cap_ttl = session->s_renew_requested + + mdsc->mdsmap->m_session_timeout*HZ; + + if (was_stale) { + if (time_before(jiffies, session->s_cap_ttl)) { + pr_info("mds%d caps renewed\n", session->s_mds); + wake = 1; + } else { + pr_info("mds%d caps still stale\n", session->s_mds); + } + } + dout("renewed_caps mds%d ttl now %lu, was %s, now %s\n", + session->s_mds, session->s_cap_ttl, was_stale ? "stale" : "fresh", + time_before(jiffies, session->s_cap_ttl) ? "stale" : "fresh"); + spin_unlock(&session->s_cap_lock); + + if (wake) + wake_up_session_caps(session, RENEWCAPS); +} + +/* + * send a session close request + */ +static int request_close_session(struct ceph_mds_session *session) +{ + struct ceph_msg *msg; + + dout("request_close_session mds%d state %s seq %lld\n", + session->s_mds, ceph_session_state_name(session->s_state), + session->s_seq); + msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_CLOSE, + session->s_seq); + if (!msg) + return -ENOMEM; + ceph_con_send(&session->s_con, msg); + return 1; +} + +/* + * Called with s_mutex held. + */ +static int __close_session(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + if (session->s_state >= CEPH_MDS_SESSION_CLOSING) + return 0; + session->s_state = CEPH_MDS_SESSION_CLOSING; + return request_close_session(session); +} + +static bool drop_negative_children(struct dentry *dentry) +{ + struct dentry *child; + bool all_negative = true; + + if (!d_is_dir(dentry)) + goto out; + + spin_lock(&dentry->d_lock); + list_for_each_entry(child, &dentry->d_subdirs, d_child) { + if (d_really_is_positive(child)) { + all_negative = false; + break; + } + } + spin_unlock(&dentry->d_lock); + + if (all_negative) + shrink_dcache_parent(dentry); +out: + return all_negative; +} + +/* + * Trim old(er) caps. + * + * Because we can't cache an inode without one or more caps, we do + * this indirectly: if a cap is unused, we prune its aliases, at which + * point the inode will hopefully get dropped to. + * + * Yes, this is a bit sloppy. Our only real goal here is to respond to + * memory pressure from the MDS, though, so it needn't be perfect. + */ +static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) +{ + int *remaining = arg; + struct ceph_inode_info *ci = ceph_inode(inode); + int used, wanted, oissued, mine; + + if (*remaining <= 0) + return -1; + + spin_lock(&ci->i_ceph_lock); + mine = cap->issued | cap->implemented; + used = __ceph_caps_used(ci); + wanted = __ceph_caps_file_wanted(ci); + oissued = __ceph_caps_issued_other(ci, cap); + + dout("trim_caps_cb %p cap %p mine %s oissued %s used %s wanted %s\n", + inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued), + ceph_cap_string(used), ceph_cap_string(wanted)); + if (cap == ci->i_auth_cap) { + if (ci->i_dirty_caps || ci->i_flushing_caps || + !list_empty(&ci->i_cap_snaps)) + goto out; + if ((used | wanted) & CEPH_CAP_ANY_WR) + goto out; + /* Note: it's possible that i_filelock_ref becomes non-zero + * after dropping auth caps. It doesn't hurt because reply + * of lock mds request will re-add auth caps. */ + if (atomic_read(&ci->i_filelock_ref) > 0) + goto out; + } + /* The inode has cached pages, but it's no longer used. + * we can safely drop it */ + if (S_ISREG(inode->i_mode) && + wanted == 0 && used == CEPH_CAP_FILE_CACHE && + !(oissued & CEPH_CAP_FILE_CACHE)) { + used = 0; + oissued = 0; + } + if ((used | wanted) & ~oissued & mine) + goto out; /* we need these caps */ + + if (oissued) { + /* we aren't the only cap.. just remove us */ + __ceph_remove_cap(cap, true); + (*remaining)--; + } else { + struct dentry *dentry; + /* try dropping referring dentries */ + spin_unlock(&ci->i_ceph_lock); + dentry = d_find_any_alias(inode); + if (dentry && drop_negative_children(dentry)) { + int count; + dput(dentry); + d_prune_aliases(inode); + count = atomic_read(&inode->i_count); + if (count == 1) + (*remaining)--; + dout("trim_caps_cb %p cap %p pruned, count now %d\n", + inode, cap, count); + } else { + dput(dentry); + } + return 0; + } + +out: + spin_unlock(&ci->i_ceph_lock); + return 0; +} + +/* + * Trim session cap count down to some max number. + */ +int ceph_trim_caps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + int max_caps) +{ + int trim_caps = session->s_nr_caps - max_caps; + + dout("trim_caps mds%d start: %d / %d, trim %d\n", + session->s_mds, session->s_nr_caps, max_caps, trim_caps); + if (trim_caps > 0) { + int remaining = trim_caps; + + ceph_iterate_session_caps(session, trim_caps_cb, &remaining); + dout("trim_caps mds%d done: %d / %d, trimmed %d\n", + session->s_mds, session->s_nr_caps, max_caps, + trim_caps - remaining); + } + + ceph_flush_cap_releases(mdsc, session); + return 0; +} + +static int check_caps_flush(struct ceph_mds_client *mdsc, + u64 want_flush_tid) +{ + int ret = 1; + + spin_lock(&mdsc->cap_dirty_lock); + if (!list_empty(&mdsc->cap_flush_list)) { + struct ceph_cap_flush *cf = + list_first_entry(&mdsc->cap_flush_list, + struct ceph_cap_flush, g_list); + if (cf->tid <= want_flush_tid) { + dout("check_caps_flush still flushing tid " + "%llu <= %llu\n", cf->tid, want_flush_tid); + ret = 0; + } + } + spin_unlock(&mdsc->cap_dirty_lock); + return ret; +} + +/* + * flush all dirty inode data to disk. + * + * returns true if we've flushed through want_flush_tid + */ +static void wait_caps_flush(struct ceph_mds_client *mdsc, + u64 want_flush_tid) +{ + dout("check_caps_flush want %llu\n", want_flush_tid); + + wait_event(mdsc->cap_flushing_wq, + check_caps_flush(mdsc, want_flush_tid)); + + dout("check_caps_flush ok, flushed thru %llu\n", want_flush_tid); +} + +/* + * called under s_mutex + */ +static void ceph_send_cap_releases(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_msg *msg = NULL; + struct ceph_mds_cap_release *head; + struct ceph_mds_cap_item *item; + struct ceph_osd_client *osdc = &mdsc->fsc->client->osdc; + struct ceph_cap *cap; + LIST_HEAD(tmp_list); + int num_cap_releases; + __le32 barrier, *cap_barrier; + + down_read(&osdc->lock); + barrier = cpu_to_le32(osdc->epoch_barrier); + up_read(&osdc->lock); + + spin_lock(&session->s_cap_lock); +again: + list_splice_init(&session->s_cap_releases, &tmp_list); + num_cap_releases = session->s_num_cap_releases; + session->s_num_cap_releases = 0; + spin_unlock(&session->s_cap_lock); + + while (!list_empty(&tmp_list)) { + if (!msg) { + msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, + PAGE_SIZE, GFP_NOFS, false); + if (!msg) + goto out_err; + head = msg->front.iov_base; + head->num = cpu_to_le32(0); + msg->front.iov_len = sizeof(*head); + + msg->hdr.version = cpu_to_le16(2); + msg->hdr.compat_version = cpu_to_le16(1); + } + + cap = list_first_entry(&tmp_list, struct ceph_cap, + session_caps); + list_del(&cap->session_caps); + num_cap_releases--; + + head = msg->front.iov_base; + put_unaligned_le32(get_unaligned_le32(&head->num) + 1, + &head->num); + item = msg->front.iov_base + msg->front.iov_len; + item->ino = cpu_to_le64(cap->cap_ino); + item->cap_id = cpu_to_le64(cap->cap_id); + item->migrate_seq = cpu_to_le32(cap->mseq); + item->seq = cpu_to_le32(cap->issue_seq); + msg->front.iov_len += sizeof(*item); + + ceph_put_cap(mdsc, cap); + + if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) { + // Append cap_barrier field + cap_barrier = msg->front.iov_base + msg->front.iov_len; + *cap_barrier = barrier; + msg->front.iov_len += sizeof(*cap_barrier); + + msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); + dout("send_cap_releases mds%d %p\n", session->s_mds, msg); + ceph_con_send(&session->s_con, msg); + msg = NULL; + } + } + + BUG_ON(num_cap_releases != 0); + + spin_lock(&session->s_cap_lock); + if (!list_empty(&session->s_cap_releases)) + goto again; + spin_unlock(&session->s_cap_lock); + + if (msg) { + // Append cap_barrier field + cap_barrier = msg->front.iov_base + msg->front.iov_len; + *cap_barrier = barrier; + msg->front.iov_len += sizeof(*cap_barrier); + + msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); + dout("send_cap_releases mds%d %p\n", session->s_mds, msg); + ceph_con_send(&session->s_con, msg); + } + return; +out_err: + pr_err("send_cap_releases mds%d, failed to allocate message\n", + session->s_mds); + spin_lock(&session->s_cap_lock); + list_splice(&tmp_list, &session->s_cap_releases); + session->s_num_cap_releases += num_cap_releases; + spin_unlock(&session->s_cap_lock); +} + +static void ceph_cap_release_work(struct work_struct *work) +{ + struct ceph_mds_session *session = + container_of(work, struct ceph_mds_session, s_cap_release_work); + + mutex_lock(&session->s_mutex); + if (session->s_state == CEPH_MDS_SESSION_OPEN || + session->s_state == CEPH_MDS_SESSION_HUNG) + ceph_send_cap_releases(session->s_mdsc, session); + mutex_unlock(&session->s_mutex); + ceph_put_mds_session(session); +} + +void ceph_flush_cap_releases(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + if (mdsc->stopping) + return; + + ceph_get_mds_session(session); + if (queue_work(mdsc->fsc->cap_wq, + &session->s_cap_release_work)) { + dout("cap release work queued\n"); + } else { + ceph_put_mds_session(session); + dout("failed to queue cap release work\n"); + } +} + +/* + * caller holds session->s_cap_lock + */ +void __ceph_queue_cap_release(struct ceph_mds_session *session, + struct ceph_cap *cap) +{ + list_add_tail(&cap->session_caps, &session->s_cap_releases); + session->s_num_cap_releases++; + + if (!(session->s_num_cap_releases % CEPH_CAPS_PER_RELEASE)) + ceph_flush_cap_releases(session->s_mdsc, session); +} + +static void ceph_cap_reclaim_work(struct work_struct *work) +{ + struct ceph_mds_client *mdsc = + container_of(work, struct ceph_mds_client, cap_reclaim_work); + int ret = ceph_trim_dentries(mdsc); + if (ret == -EAGAIN) + ceph_queue_cap_reclaim_work(mdsc); +} + +void ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc) +{ + if (mdsc->stopping) + return; + + if (queue_work(mdsc->fsc->cap_wq, &mdsc->cap_reclaim_work)) { + dout("caps reclaim work queued\n"); + } else { + dout("failed to queue caps release work\n"); + } +} + +void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr) +{ + int val; + if (!nr) + return; + val = atomic_add_return(nr, &mdsc->cap_reclaim_pending); + if ((val % CEPH_CAPS_PER_RELEASE) < nr) { + atomic_set(&mdsc->cap_reclaim_pending, 0); + ceph_queue_cap_reclaim_work(mdsc); + } +} + +/* + * requests + */ + +int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, + struct inode *dir) +{ + struct ceph_inode_info *ci = ceph_inode(dir); + struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; + struct ceph_mount_options *opt = req->r_mdsc->fsc->mount_options; + size_t size = sizeof(struct ceph_mds_reply_dir_entry); + unsigned int num_entries; + int order; + + spin_lock(&ci->i_ceph_lock); + num_entries = ci->i_files + ci->i_subdirs; + spin_unlock(&ci->i_ceph_lock); + num_entries = max(num_entries, 1U); + num_entries = min(num_entries, opt->max_readdir); + + order = get_order(size * num_entries); + while (order >= 0) { + rinfo->dir_entries = (void*)__get_free_pages(GFP_KERNEL | + __GFP_NOWARN, + order); + if (rinfo->dir_entries) + break; + order--; + } + if (!rinfo->dir_entries) + return -ENOMEM; + + num_entries = (PAGE_SIZE << order) / size; + num_entries = min(num_entries, opt->max_readdir); + + rinfo->dir_buf_size = PAGE_SIZE << order; + req->r_num_caps = num_entries + 1; + req->r_args.readdir.max_entries = cpu_to_le32(num_entries); + req->r_args.readdir.max_bytes = cpu_to_le32(opt->max_readdir_bytes); + return 0; +} + +/* + * Create an mds request. + */ +struct ceph_mds_request * +ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode) +{ + struct ceph_mds_request *req; + + req = kmem_cache_zalloc(ceph_mds_request_cachep, GFP_NOFS); + if (!req) + return ERR_PTR(-ENOMEM); + + mutex_init(&req->r_fill_mutex); + req->r_mdsc = mdsc; + req->r_started = jiffies; + req->r_start_latency = ktime_get(); + req->r_resend_mds = -1; + INIT_LIST_HEAD(&req->r_unsafe_dir_item); + INIT_LIST_HEAD(&req->r_unsafe_target_item); + req->r_fmode = -1; + kref_init(&req->r_kref); + RB_CLEAR_NODE(&req->r_node); + INIT_LIST_HEAD(&req->r_wait); + init_completion(&req->r_completion); + init_completion(&req->r_safe_completion); + INIT_LIST_HEAD(&req->r_unsafe_item); + + ktime_get_coarse_real_ts64(&req->r_stamp); + + req->r_op = op; + req->r_direct_mode = mode; + return req; +} + +/* + * return oldest (lowest) request, tid in request tree, 0 if none. + * + * called under mdsc->mutex. + */ +static struct ceph_mds_request *__get_oldest_req(struct ceph_mds_client *mdsc) +{ + if (RB_EMPTY_ROOT(&mdsc->request_tree)) + return NULL; + return rb_entry(rb_first(&mdsc->request_tree), + struct ceph_mds_request, r_node); +} + +static inline u64 __get_oldest_tid(struct ceph_mds_client *mdsc) +{ + return mdsc->oldest_tid; +} + +/* + * Build a dentry's path. Allocate on heap; caller must kfree. Based + * on build_path_from_dentry in fs/cifs/dir.c. + * + * If @stop_on_nosnap, generate path relative to the first non-snapped + * inode. + * + * Encode hidden .snap dirs as a double /, i.e. + * foo/.snap/bar -> foo//bar + */ +char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *pbase, + int stop_on_nosnap) +{ + struct dentry *temp; + char *path; + int pos; + unsigned seq; + u64 base; + + if (!dentry) + return ERR_PTR(-EINVAL); + + path = __getname(); + if (!path) + return ERR_PTR(-ENOMEM); +retry: + pos = PATH_MAX - 1; + path[pos] = '\0'; + + seq = read_seqbegin(&rename_lock); + rcu_read_lock(); + temp = dentry; + for (;;) { + struct inode *inode; + + spin_lock(&temp->d_lock); + inode = d_inode(temp); + if (inode && ceph_snap(inode) == CEPH_SNAPDIR) { + dout("build_path path+%d: %p SNAPDIR\n", + pos, temp); + } else if (stop_on_nosnap && inode && dentry != temp && + ceph_snap(inode) == CEPH_NOSNAP) { + spin_unlock(&temp->d_lock); + pos++; /* get rid of any prepended '/' */ + break; + } else { + pos -= temp->d_name.len; + if (pos < 0) { + spin_unlock(&temp->d_lock); + break; + } + memcpy(path + pos, temp->d_name.name, temp->d_name.len); + } + spin_unlock(&temp->d_lock); + temp = READ_ONCE(temp->d_parent); + + /* Are we at the root? */ + if (IS_ROOT(temp)) + break; + + /* Are we out of buffer? */ + if (--pos < 0) + break; + + path[pos] = '/'; + } + base = ceph_ino(d_inode(temp)); + rcu_read_unlock(); + + if (read_seqretry(&rename_lock, seq)) + goto retry; + + if (pos < 0) { + /* + * A rename didn't occur, but somehow we didn't end up where + * we thought we would. Throw a warning and try again. + */ + pr_warn("build_path did not end path lookup where " + "expected, pos is %d\n", pos); + goto retry; + } + + *pbase = base; + *plen = PATH_MAX - 1 - pos; + dout("build_path on %p %d built %llx '%.*s'\n", + dentry, d_count(dentry), base, *plen, path + pos); + return path + pos; +} + +static int build_dentry_path(struct dentry *dentry, struct inode *dir, + const char **ppath, int *ppathlen, u64 *pino, + bool *pfreepath, bool parent_locked) +{ + char *path; + + rcu_read_lock(); + if (!dir) + dir = d_inode_rcu(dentry->d_parent); + if (dir && parent_locked && ceph_snap(dir) == CEPH_NOSNAP) { + *pino = ceph_ino(dir); + rcu_read_unlock(); + *ppath = dentry->d_name.name; + *ppathlen = dentry->d_name.len; + return 0; + } + rcu_read_unlock(); + path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1); + if (IS_ERR(path)) + return PTR_ERR(path); + *ppath = path; + *pfreepath = true; + return 0; +} + +static int build_inode_path(struct inode *inode, + const char **ppath, int *ppathlen, u64 *pino, + bool *pfreepath) +{ + struct dentry *dentry; + char *path; + + if (ceph_snap(inode) == CEPH_NOSNAP) { + *pino = ceph_ino(inode); + *ppathlen = 0; + return 0; + } + dentry = d_find_alias(inode); + path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1); + dput(dentry); + if (IS_ERR(path)) + return PTR_ERR(path); + *ppath = path; + *pfreepath = true; + return 0; +} + +/* + * request arguments may be specified via an inode *, a dentry *, or + * an explicit ino+path. + */ +static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry, + struct inode *rdiri, const char *rpath, + u64 rino, const char **ppath, int *pathlen, + u64 *ino, bool *freepath, bool parent_locked) +{ + int r = 0; + + if (rinode) { + r = build_inode_path(rinode, ppath, pathlen, ino, freepath); + dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode), + ceph_snap(rinode)); + } else if (rdentry) { + r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino, + freepath, parent_locked); + dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen, + *ppath); + } else if (rpath || rino) { + *ino = rino; + *ppath = rpath; + *pathlen = rpath ? strlen(rpath) : 0; + dout(" path %.*s\n", *pathlen, rpath); + } + + return r; +} + +/* + * called under mdsc->mutex + */ +static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req, + int mds, bool drop_cap_releases) +{ + struct ceph_msg *msg; + struct ceph_mds_request_head *head; + const char *path1 = NULL; + const char *path2 = NULL; + u64 ino1 = 0, ino2 = 0; + int pathlen1 = 0, pathlen2 = 0; + bool freepath1 = false, freepath2 = false; + int len; + u16 releases; + void *p, *end; + int ret; + + ret = set_request_path_attr(req->r_inode, req->r_dentry, + req->r_parent, req->r_path1, req->r_ino1.ino, + &path1, &pathlen1, &ino1, &freepath1, + test_bit(CEPH_MDS_R_PARENT_LOCKED, + &req->r_req_flags)); + if (ret < 0) { + msg = ERR_PTR(ret); + goto out; + } + + /* If r_old_dentry is set, then assume that its parent is locked */ + ret = set_request_path_attr(NULL, req->r_old_dentry, + req->r_old_dentry_dir, + req->r_path2, req->r_ino2.ino, + &path2, &pathlen2, &ino2, &freepath2, true); + if (ret < 0) { + msg = ERR_PTR(ret); + goto out_free1; + } + + len = sizeof(*head) + + pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)) + + sizeof(struct ceph_timespec); + + /* calculate (max) length for cap releases */ + len += sizeof(struct ceph_mds_request_release) * + (!!req->r_inode_drop + !!req->r_dentry_drop + + !!req->r_old_inode_drop + !!req->r_old_dentry_drop); + if (req->r_dentry_drop) + len += pathlen1; + if (req->r_old_dentry_drop) + len += pathlen2; + + msg = ceph_msg_new2(CEPH_MSG_CLIENT_REQUEST, len, 1, GFP_NOFS, false); + if (!msg) { + msg = ERR_PTR(-ENOMEM); + goto out_free2; + } + + msg->hdr.version = cpu_to_le16(2); + msg->hdr.tid = cpu_to_le64(req->r_tid); + + head = msg->front.iov_base; + p = msg->front.iov_base + sizeof(*head); + end = msg->front.iov_base + msg->front.iov_len; + + head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch); + head->op = cpu_to_le32(req->r_op); + head->caller_uid = cpu_to_le32(from_kuid(&init_user_ns, req->r_uid)); + head->caller_gid = cpu_to_le32(from_kgid(&init_user_ns, req->r_gid)); + head->ino = cpu_to_le64(req->r_deleg_ino); + head->args = req->r_args; + + ceph_encode_filepath(&p, end, ino1, path1); + ceph_encode_filepath(&p, end, ino2, path2); + + /* make note of release offset, in case we need to replay */ + req->r_request_release_offset = p - msg->front.iov_base; + + /* cap releases */ + releases = 0; + if (req->r_inode_drop) + releases += ceph_encode_inode_release(&p, + req->r_inode ? req->r_inode : d_inode(req->r_dentry), + mds, req->r_inode_drop, req->r_inode_unless, + req->r_op == CEPH_MDS_OP_READDIR); + if (req->r_dentry_drop) + releases += ceph_encode_dentry_release(&p, req->r_dentry, + req->r_parent, mds, req->r_dentry_drop, + req->r_dentry_unless); + if (req->r_old_dentry_drop) + releases += ceph_encode_dentry_release(&p, req->r_old_dentry, + req->r_old_dentry_dir, mds, + req->r_old_dentry_drop, + req->r_old_dentry_unless); + if (req->r_old_inode_drop) + releases += ceph_encode_inode_release(&p, + d_inode(req->r_old_dentry), + mds, req->r_old_inode_drop, req->r_old_inode_unless, 0); + + if (drop_cap_releases) { + releases = 0; + p = msg->front.iov_base + req->r_request_release_offset; + } + + head->num_releases = cpu_to_le16(releases); + + /* time stamp */ + { + struct ceph_timespec ts; + ceph_encode_timespec64(&ts, &req->r_stamp); + ceph_encode_copy(&p, &ts, sizeof(ts)); + } + + if (WARN_ON_ONCE(p > end)) { + ceph_msg_put(msg); + msg = ERR_PTR(-ERANGE); + goto out_free2; + } + + msg->front.iov_len = p - msg->front.iov_base; + msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); + + if (req->r_pagelist) { + struct ceph_pagelist *pagelist = req->r_pagelist; + ceph_msg_data_add_pagelist(msg, pagelist); + msg->hdr.data_len = cpu_to_le32(pagelist->length); + } else { + msg->hdr.data_len = 0; + } + + msg->hdr.data_off = cpu_to_le16(0); + +out_free2: + if (freepath2) + ceph_mdsc_free_path((char *)path2, pathlen2); +out_free1: + if (freepath1) + ceph_mdsc_free_path((char *)path1, pathlen1); +out: + return msg; +} + +/* + * called under mdsc->mutex if error, under no mutex if + * success. + */ +static void complete_request(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req) +{ + req->r_end_latency = ktime_get(); + + if (req->r_callback) + req->r_callback(mdsc, req); + complete_all(&req->r_completion); +} + +/* + * called under mdsc->mutex + */ +static int __prepare_send_request(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req, + int mds, bool drop_cap_releases) +{ + struct ceph_mds_request_head *rhead; + struct ceph_msg *msg; + int flags = 0; + + req->r_attempts++; + if (req->r_inode) { + struct ceph_cap *cap = + ceph_get_cap_for_mds(ceph_inode(req->r_inode), mds); + + if (cap) + req->r_sent_on_mseq = cap->mseq; + else + req->r_sent_on_mseq = -1; + } + dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req, + req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts); + + if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) { + void *p; + /* + * Replay. Do not regenerate message (and rebuild + * paths, etc.); just use the original message. + * Rebuilding paths will break for renames because + * d_move mangles the src name. + */ + msg = req->r_request; + rhead = msg->front.iov_base; + + flags = le32_to_cpu(rhead->flags); + flags |= CEPH_MDS_FLAG_REPLAY; + rhead->flags = cpu_to_le32(flags); + + if (req->r_target_inode) + rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode)); + + rhead->num_retry = req->r_attempts - 1; + + /* remove cap/dentry releases from message */ + rhead->num_releases = 0; + + /* time stamp */ + p = msg->front.iov_base + req->r_request_release_offset; + { + struct ceph_timespec ts; + ceph_encode_timespec64(&ts, &req->r_stamp); + ceph_encode_copy(&p, &ts, sizeof(ts)); + } + + msg->front.iov_len = p - msg->front.iov_base; + msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); + return 0; + } + + if (req->r_request) { + ceph_msg_put(req->r_request); + req->r_request = NULL; + } + msg = create_request_message(mdsc, req, mds, drop_cap_releases); + if (IS_ERR(msg)) { + req->r_err = PTR_ERR(msg); + return PTR_ERR(msg); + } + req->r_request = msg; + + rhead = msg->front.iov_base; + rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc)); + if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) + flags |= CEPH_MDS_FLAG_REPLAY; + if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags)) + flags |= CEPH_MDS_FLAG_ASYNC; + if (req->r_parent) + flags |= CEPH_MDS_FLAG_WANT_DENTRY; + rhead->flags = cpu_to_le32(flags); + rhead->num_fwd = req->r_num_fwd; + rhead->num_retry = req->r_attempts - 1; + + dout(" r_parent = %p\n", req->r_parent); + return 0; +} + +/* + * called under mdsc->mutex + */ +static int __send_request(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + struct ceph_mds_request *req, + bool drop_cap_releases) +{ + int err; + + err = __prepare_send_request(mdsc, req, session->s_mds, + drop_cap_releases); + if (!err) { + ceph_msg_get(req->r_request); + ceph_con_send(&session->s_con, req->r_request); + } + + return err; +} + +/* + * send request, or put it on the appropriate wait list. + */ +static void __do_request(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req) +{ + struct ceph_mds_session *session = NULL; + int mds = -1; + int err = 0; + bool random; + + if (req->r_err || test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) { + if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) + __unregister_request(mdsc, req); + return; + } + + if (req->r_timeout && + time_after_eq(jiffies, req->r_started + req->r_timeout)) { + dout("do_request timed out\n"); + err = -ETIMEDOUT; + goto finish; + } + if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { + dout("do_request forced umount\n"); + err = -EIO; + goto finish; + } + if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_MOUNTING) { + if (mdsc->mdsmap_err) { + err = mdsc->mdsmap_err; + dout("do_request mdsmap err %d\n", err); + goto finish; + } + if (mdsc->mdsmap->m_epoch == 0) { + dout("do_request no mdsmap, waiting for map\n"); + list_add(&req->r_wait, &mdsc->waiting_for_map); + return; + } + if (!(mdsc->fsc->mount_options->flags & + CEPH_MOUNT_OPT_MOUNTWAIT) && + !ceph_mdsmap_is_cluster_available(mdsc->mdsmap)) { + err = -EHOSTUNREACH; + goto finish; + } + } + + put_request_session(req); + + mds = __choose_mds(mdsc, req, &random); + if (mds < 0 || + ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) { + if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags)) { + err = -EJUKEBOX; + goto finish; + } + dout("do_request no mds or not active, waiting for map\n"); + list_add(&req->r_wait, &mdsc->waiting_for_map); + return; + } + + /* get, open session */ + session = __ceph_lookup_mds_session(mdsc, mds); + if (!session) { + session = register_session(mdsc, mds); + if (IS_ERR(session)) { + err = PTR_ERR(session); + goto finish; + } + } + req->r_session = ceph_get_mds_session(session); + + dout("do_request mds%d session %p state %s\n", mds, session, + ceph_session_state_name(session->s_state)); + if (session->s_state != CEPH_MDS_SESSION_OPEN && + session->s_state != CEPH_MDS_SESSION_HUNG) { + if (session->s_state == CEPH_MDS_SESSION_REJECTED) { + err = -EACCES; + goto out_session; + } + /* + * We cannot queue async requests since the caps and delegated + * inodes are bound to the session. Just return -EJUKEBOX and + * let the caller retry a sync request in that case. + */ + if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags)) { + err = -EJUKEBOX; + goto out_session; + } + if (session->s_state == CEPH_MDS_SESSION_NEW || + session->s_state == CEPH_MDS_SESSION_CLOSING) { + err = __open_session(mdsc, session); + if (err) + goto out_session; + /* retry the same mds later */ + if (random) + req->r_resend_mds = mds; + } + list_add(&req->r_wait, &session->s_waiting); + goto out_session; + } + + /* send request */ + req->r_resend_mds = -1; /* forget any previous mds hint */ + + if (req->r_request_started == 0) /* note request start time */ + req->r_request_started = jiffies; + + err = __send_request(mdsc, session, req, false); + +out_session: + ceph_put_mds_session(session); +finish: + if (err) { + dout("__do_request early error %d\n", err); + req->r_err = err; + complete_request(mdsc, req); + __unregister_request(mdsc, req); + } + return; +} + +/* + * called under mdsc->mutex + */ +static void __wake_requests(struct ceph_mds_client *mdsc, + struct list_head *head) +{ + struct ceph_mds_request *req; + LIST_HEAD(tmp_list); + + list_splice_init(head, &tmp_list); + + while (!list_empty(&tmp_list)) { + req = list_entry(tmp_list.next, + struct ceph_mds_request, r_wait); + list_del_init(&req->r_wait); + dout(" wake request %p tid %llu\n", req, req->r_tid); + __do_request(mdsc, req); + } +} + +/* + * Wake up threads with requests pending for @mds, so that they can + * resubmit their requests to a possibly different mds. + */ +static void kick_requests(struct ceph_mds_client *mdsc, int mds) +{ + struct ceph_mds_request *req; + struct rb_node *p = rb_first(&mdsc->request_tree); + + dout("kick_requests mds%d\n", mds); + while (p) { + req = rb_entry(p, struct ceph_mds_request, r_node); + p = rb_next(p); + if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) + continue; + if (req->r_attempts > 0) + continue; /* only new requests */ + if (req->r_session && + req->r_session->s_mds == mds) { + dout(" kicking tid %llu\n", req->r_tid); + list_del_init(&req->r_wait); + __do_request(mdsc, req); + } + } +} + +int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir, + struct ceph_mds_request *req) +{ + int err = 0; + + /* take CAP_PIN refs for r_inode, r_parent, r_old_dentry */ + if (req->r_inode) + ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN); + if (req->r_parent) { + struct ceph_inode_info *ci = ceph_inode(req->r_parent); + int fmode = (req->r_op & CEPH_MDS_OP_WRITE) ? + CEPH_FILE_MODE_WR : CEPH_FILE_MODE_RD; + spin_lock(&ci->i_ceph_lock); + ceph_take_cap_refs(ci, CEPH_CAP_PIN, false); + __ceph_touch_fmode(ci, mdsc, fmode); + spin_unlock(&ci->i_ceph_lock); + ihold(req->r_parent); + } + if (req->r_old_dentry_dir) + ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir), + CEPH_CAP_PIN); + + if (req->r_inode) { + err = ceph_wait_on_async_create(req->r_inode); + if (err) { + dout("%s: wait for async create returned: %d\n", + __func__, err); + return err; + } + } + + if (!err && req->r_old_inode) { + err = ceph_wait_on_async_create(req->r_old_inode); + if (err) { + dout("%s: wait for async create returned: %d\n", + __func__, err); + return err; + } + } + + dout("submit_request on %p for inode %p\n", req, dir); + mutex_lock(&mdsc->mutex); + __register_request(mdsc, req, dir); + __do_request(mdsc, req); + err = req->r_err; + mutex_unlock(&mdsc->mutex); + return err; +} + +static int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req) +{ + int err; + + /* wait */ + dout("do_request waiting\n"); + if (!req->r_timeout && req->r_wait_for_completion) { + err = req->r_wait_for_completion(mdsc, req); + } else { + long timeleft = wait_for_completion_killable_timeout( + &req->r_completion, + ceph_timeout_jiffies(req->r_timeout)); + if (timeleft > 0) + err = 0; + else if (!timeleft) + err = -ETIMEDOUT; /* timed out */ + else + err = timeleft; /* killed */ + } + dout("do_request waited, got %d\n", err); + mutex_lock(&mdsc->mutex); + + /* only abort if we didn't race with a real reply */ + if (test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) { + err = le32_to_cpu(req->r_reply_info.head->result); + } else if (err < 0) { + dout("aborted request %lld with %d\n", req->r_tid, err); + + /* + * ensure we aren't running concurrently with + * ceph_fill_trace or ceph_readdir_prepopulate, which + * rely on locks (dir mutex) held by our caller. + */ + mutex_lock(&req->r_fill_mutex); + req->r_err = err; + set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags); + mutex_unlock(&req->r_fill_mutex); + + if (req->r_parent && + (req->r_op & CEPH_MDS_OP_WRITE)) + ceph_invalidate_dir_request(req); + } else { + err = req->r_err; + } + + mutex_unlock(&mdsc->mutex); + return err; +} + +/* + * Synchrously perform an mds request. Take care of all of the + * session setup, forwarding, retry details. + */ +int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, + struct inode *dir, + struct ceph_mds_request *req) +{ + int err; + + dout("do_request on %p\n", req); + + /* issue */ + err = ceph_mdsc_submit_request(mdsc, dir, req); + if (!err) + err = ceph_mdsc_wait_request(mdsc, req); + dout("do_request %p done, result %d\n", req, err); + return err; +} + +/* + * Invalidate dir's completeness, dentry lease state on an aborted MDS + * namespace request. + */ +void ceph_invalidate_dir_request(struct ceph_mds_request *req) +{ + struct inode *dir = req->r_parent; + struct inode *old_dir = req->r_old_dentry_dir; + + dout("invalidate_dir_request %p %p (complete, lease(s))\n", dir, old_dir); + + ceph_dir_clear_complete(dir); + if (old_dir) + ceph_dir_clear_complete(old_dir); + if (req->r_dentry) + ceph_invalidate_dentry_lease(req->r_dentry); + if (req->r_old_dentry) + ceph_invalidate_dentry_lease(req->r_old_dentry); +} + +/* + * Handle mds reply. + * + * We take the session mutex and parse and process the reply immediately. + * This preserves the logical ordering of replies, capabilities, etc., sent + * by the MDS as they are applied to our local cache. + */ +static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) +{ + struct ceph_mds_client *mdsc = session->s_mdsc; + struct ceph_mds_request *req; + struct ceph_mds_reply_head *head = msg->front.iov_base; + struct ceph_mds_reply_info_parsed *rinfo; /* parsed reply info */ + struct ceph_snap_realm *realm; + u64 tid; + int err, result; + int mds = session->s_mds; + + if (msg->front.iov_len < sizeof(*head)) { + pr_err("mdsc_handle_reply got corrupt (short) reply\n"); + ceph_msg_dump(msg); + return; + } + + /* get request, session */ + tid = le64_to_cpu(msg->hdr.tid); + mutex_lock(&mdsc->mutex); + req = lookup_get_request(mdsc, tid); + if (!req) { + dout("handle_reply on unknown tid %llu\n", tid); + mutex_unlock(&mdsc->mutex); + return; + } + dout("handle_reply %p\n", req); + + /* correct session? */ + if (req->r_session != session) { + pr_err("mdsc_handle_reply got %llu on session mds%d" + " not mds%d\n", tid, session->s_mds, + req->r_session ? req->r_session->s_mds : -1); + mutex_unlock(&mdsc->mutex); + goto out; + } + + /* dup? */ + if ((test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags) && !head->safe) || + (test_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags) && head->safe)) { + pr_warn("got a dup %s reply on %llu from mds%d\n", + head->safe ? "safe" : "unsafe", tid, mds); + mutex_unlock(&mdsc->mutex); + goto out; + } + if (test_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags)) { + pr_warn("got unsafe after safe on %llu from mds%d\n", + tid, mds); + mutex_unlock(&mdsc->mutex); + goto out; + } + + result = le32_to_cpu(head->result); + + /* + * Handle an ESTALE + * if we're not talking to the authority, send to them + * if the authority has changed while we weren't looking, + * send to new authority + * Otherwise we just have to return an ESTALE + */ + if (result == -ESTALE) { + dout("got ESTALE on request %llu\n", req->r_tid); + req->r_resend_mds = -1; + if (req->r_direct_mode != USE_AUTH_MDS) { + dout("not using auth, setting for that now\n"); + req->r_direct_mode = USE_AUTH_MDS; + __do_request(mdsc, req); + mutex_unlock(&mdsc->mutex); + goto out; + } else { + int mds = __choose_mds(mdsc, req, NULL); + if (mds >= 0 && mds != req->r_session->s_mds) { + dout("but auth changed, so resending\n"); + __do_request(mdsc, req); + mutex_unlock(&mdsc->mutex); + goto out; + } + } + dout("have to return ESTALE on request %llu\n", req->r_tid); + } + + + if (head->safe) { + set_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags); + __unregister_request(mdsc, req); + + /* last request during umount? */ + if (mdsc->stopping && !__get_oldest_req(mdsc)) + complete_all(&mdsc->safe_umount_waiters); + + if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) { + /* + * We already handled the unsafe response, now do the + * cleanup. No need to examine the response; the MDS + * doesn't include any result info in the safe + * response. And even if it did, there is nothing + * useful we could do with a revised return value. + */ + dout("got safe reply %llu, mds%d\n", tid, mds); + + mutex_unlock(&mdsc->mutex); + goto out; + } + } else { + set_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags); + list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe); + } + + dout("handle_reply tid %lld result %d\n", tid, result); + rinfo = &req->r_reply_info; + if (test_bit(CEPHFS_FEATURE_REPLY_ENCODING, &session->s_features)) + err = parse_reply_info(session, msg, rinfo, (u64)-1); + else + err = parse_reply_info(session, msg, rinfo, session->s_con.peer_features); + mutex_unlock(&mdsc->mutex); + + mutex_lock(&session->s_mutex); + if (err < 0) { + pr_err("mdsc_handle_reply got corrupt reply mds%d(tid:%lld)\n", mds, tid); + ceph_msg_dump(msg); + goto out_err; + } + + /* snap trace */ + realm = NULL; + if (rinfo->snapblob_len) { + down_write(&mdsc->snap_rwsem); + ceph_update_snap_trace(mdsc, rinfo->snapblob, + rinfo->snapblob + rinfo->snapblob_len, + le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP, + &realm); + downgrade_write(&mdsc->snap_rwsem); + } else { + down_read(&mdsc->snap_rwsem); + } + + /* insert trace into our cache */ + mutex_lock(&req->r_fill_mutex); + current->journal_info = req; + err = ceph_fill_trace(mdsc->fsc->sb, req); + if (err == 0) { + if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR || + req->r_op == CEPH_MDS_OP_LSSNAP)) + ceph_readdir_prepopulate(req, req->r_session); + } + current->journal_info = NULL; + mutex_unlock(&req->r_fill_mutex); + + up_read(&mdsc->snap_rwsem); + if (realm) + ceph_put_snap_realm(mdsc, realm); + + if (err == 0) { + if (req->r_target_inode && + test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) { + struct ceph_inode_info *ci = + ceph_inode(req->r_target_inode); + spin_lock(&ci->i_unsafe_lock); + list_add_tail(&req->r_unsafe_target_item, + &ci->i_unsafe_iops); + spin_unlock(&ci->i_unsafe_lock); + } + + ceph_unreserve_caps(mdsc, &req->r_caps_reservation); + } +out_err: + mutex_lock(&mdsc->mutex); + if (!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) { + if (err) { + req->r_err = err; + } else { + req->r_reply = ceph_msg_get(msg); + set_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags); + } + } else { + dout("reply arrived after request %lld was aborted\n", tid); + } + mutex_unlock(&mdsc->mutex); + + mutex_unlock(&session->s_mutex); + + /* kick calling process */ + complete_request(mdsc, req); + + ceph_update_metadata_latency(&mdsc->metric, req->r_start_latency, + req->r_end_latency, err); +out: + ceph_mdsc_put_request(req); + return; +} + + + +/* + * handle mds notification that our request has been forwarded. + */ +static void handle_forward(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + struct ceph_msg *msg) +{ + struct ceph_mds_request *req; + u64 tid = le64_to_cpu(msg->hdr.tid); + u32 next_mds; + u32 fwd_seq; + int err = -EINVAL; + void *p = msg->front.iov_base; + void *end = p + msg->front.iov_len; + + ceph_decode_need(&p, end, 2*sizeof(u32), bad); + next_mds = ceph_decode_32(&p); + fwd_seq = ceph_decode_32(&p); + + mutex_lock(&mdsc->mutex); + req = lookup_get_request(mdsc, tid); + if (!req) { + dout("forward tid %llu to mds%d - req dne\n", tid, next_mds); + goto out; /* dup reply? */ + } + + if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) { + dout("forward tid %llu aborted, unregistering\n", tid); + __unregister_request(mdsc, req); + } else if (fwd_seq <= req->r_num_fwd) { + dout("forward tid %llu to mds%d - old seq %d <= %d\n", + tid, next_mds, req->r_num_fwd, fwd_seq); + } else { + /* resend. forward race not possible; mds would drop */ + dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds); + BUG_ON(req->r_err); + BUG_ON(test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)); + req->r_attempts = 0; + req->r_num_fwd = fwd_seq; + req->r_resend_mds = next_mds; + put_request_session(req); + __do_request(mdsc, req); + } + ceph_mdsc_put_request(req); +out: + mutex_unlock(&mdsc->mutex); + return; + +bad: + pr_err("mdsc_handle_forward decode error err=%d\n", err); +} + +static int __decode_session_metadata(void **p, void *end, + bool *blocklisted) +{ + /* map<string,string> */ + u32 n; + bool err_str; + ceph_decode_32_safe(p, end, n, bad); + while (n-- > 0) { + u32 len; + ceph_decode_32_safe(p, end, len, bad); + ceph_decode_need(p, end, len, bad); + err_str = !strncmp(*p, "error_string", len); + *p += len; + ceph_decode_32_safe(p, end, len, bad); + ceph_decode_need(p, end, len, bad); + /* + * Match "blocklisted (blacklisted)" from newer MDSes, + * or "blacklisted" from older MDSes. + */ + if (err_str && strnstr(*p, "blacklisted", len)) + *blocklisted = true; + *p += len; + } + return 0; +bad: + return -1; +} + +/* + * handle a mds session control message + */ +static void handle_session(struct ceph_mds_session *session, + struct ceph_msg *msg) +{ + struct ceph_mds_client *mdsc = session->s_mdsc; + int mds = session->s_mds; + int msg_version = le16_to_cpu(msg->hdr.version); + void *p = msg->front.iov_base; + void *end = p + msg->front.iov_len; + struct ceph_mds_session_head *h; + u32 op; + u64 seq, features = 0; + int wake = 0; + bool blocklisted = false; + + /* decode */ + ceph_decode_need(&p, end, sizeof(*h), bad); + h = p; + p += sizeof(*h); + + op = le32_to_cpu(h->op); + seq = le64_to_cpu(h->seq); + + if (msg_version >= 3) { + u32 len; + /* version >= 2, metadata */ + if (__decode_session_metadata(&p, end, &blocklisted) < 0) + goto bad; + /* version >= 3, feature bits */ + ceph_decode_32_safe(&p, end, len, bad); + if (len) { + ceph_decode_64_safe(&p, end, features, bad); + p += len - sizeof(features); + } + } + + mutex_lock(&mdsc->mutex); + if (op == CEPH_SESSION_CLOSE) { + ceph_get_mds_session(session); + __unregister_session(mdsc, session); + } + /* FIXME: this ttl calculation is generous */ + session->s_ttl = jiffies + HZ*mdsc->mdsmap->m_session_autoclose; + mutex_unlock(&mdsc->mutex); + + mutex_lock(&session->s_mutex); + + dout("handle_session mds%d %s %p state %s seq %llu\n", + mds, ceph_session_op_name(op), session, + ceph_session_state_name(session->s_state), seq); + + if (session->s_state == CEPH_MDS_SESSION_HUNG) { + session->s_state = CEPH_MDS_SESSION_OPEN; + pr_info("mds%d came back\n", session->s_mds); + } + + switch (op) { + case CEPH_SESSION_OPEN: + if (session->s_state == CEPH_MDS_SESSION_RECONNECTING) + pr_info("mds%d reconnect success\n", session->s_mds); + session->s_state = CEPH_MDS_SESSION_OPEN; + session->s_features = features; + renewed_caps(mdsc, session, 0); + if (test_bit(CEPHFS_FEATURE_METRIC_COLLECT, &session->s_features)) + metric_schedule_delayed(&mdsc->metric); + wake = 1; + if (mdsc->stopping) + __close_session(mdsc, session); + break; + + case CEPH_SESSION_RENEWCAPS: + if (session->s_renew_seq == seq) + renewed_caps(mdsc, session, 1); + break; + + case CEPH_SESSION_CLOSE: + if (session->s_state == CEPH_MDS_SESSION_RECONNECTING) + pr_info("mds%d reconnect denied\n", session->s_mds); + session->s_state = CEPH_MDS_SESSION_CLOSED; + cleanup_session_requests(mdsc, session); + remove_session_caps(session); + wake = 2; /* for good measure */ + wake_up_all(&mdsc->session_close_wq); + break; + + case CEPH_SESSION_STALE: + pr_info("mds%d caps went stale, renewing\n", + session->s_mds); + spin_lock(&session->s_gen_ttl_lock); + session->s_cap_gen++; + session->s_cap_ttl = jiffies - 1; + spin_unlock(&session->s_gen_ttl_lock); + send_renew_caps(mdsc, session); + break; + + case CEPH_SESSION_RECALL_STATE: + ceph_trim_caps(mdsc, session, le32_to_cpu(h->max_caps)); + break; + + case CEPH_SESSION_FLUSHMSG: + /* flush cap releases */ + spin_lock(&session->s_cap_lock); + if (session->s_num_cap_releases) + ceph_flush_cap_releases(mdsc, session); + spin_unlock(&session->s_cap_lock); + + send_flushmsg_ack(mdsc, session, seq); + break; + + case CEPH_SESSION_FORCE_RO: + dout("force_session_readonly %p\n", session); + spin_lock(&session->s_cap_lock); + session->s_readonly = true; + spin_unlock(&session->s_cap_lock); + wake_up_session_caps(session, FORCE_RO); + break; + + case CEPH_SESSION_REJECT: + WARN_ON(session->s_state != CEPH_MDS_SESSION_OPENING); + pr_info("mds%d rejected session\n", session->s_mds); + session->s_state = CEPH_MDS_SESSION_REJECTED; + cleanup_session_requests(mdsc, session); + remove_session_caps(session); + if (blocklisted) + mdsc->fsc->blocklisted = true; + wake = 2; /* for good measure */ + break; + + default: + pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds); + WARN_ON(1); + } + + mutex_unlock(&session->s_mutex); + if (wake) { + mutex_lock(&mdsc->mutex); + __wake_requests(mdsc, &session->s_waiting); + if (wake == 2) + kick_requests(mdsc, mds); + mutex_unlock(&mdsc->mutex); + } + if (op == CEPH_SESSION_CLOSE) + ceph_put_mds_session(session); + return; + +bad: + pr_err("mdsc_handle_session corrupt message mds%d len %d\n", mds, + (int)msg->front.iov_len); + ceph_msg_dump(msg); + return; +} + +void ceph_mdsc_release_dir_caps(struct ceph_mds_request *req) +{ + int dcaps; + + dcaps = xchg(&req->r_dir_caps, 0); + if (dcaps) { + dout("releasing r_dir_caps=%s\n", ceph_cap_string(dcaps)); + ceph_put_cap_refs(ceph_inode(req->r_parent), dcaps); + } +} + +void ceph_mdsc_release_dir_caps_no_check(struct ceph_mds_request *req) +{ + int dcaps; + + dcaps = xchg(&req->r_dir_caps, 0); + if (dcaps) { + dout("releasing r_dir_caps=%s\n", ceph_cap_string(dcaps)); + ceph_put_cap_refs_no_check_caps(ceph_inode(req->r_parent), + dcaps); + } +} + +/* + * called under session->mutex. + */ +static void replay_unsafe_requests(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_mds_request *req, *nreq; + struct rb_node *p; + + dout("replay_unsafe_requests mds%d\n", session->s_mds); + + mutex_lock(&mdsc->mutex); + list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item) + __send_request(mdsc, session, req, true); + + /* + * also re-send old requests when MDS enters reconnect stage. So that MDS + * can process completed request in clientreplay stage. + */ + p = rb_first(&mdsc->request_tree); + while (p) { + req = rb_entry(p, struct ceph_mds_request, r_node); + p = rb_next(p); + if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) + continue; + if (req->r_attempts == 0) + continue; /* only old requests */ + if (!req->r_session) + continue; + if (req->r_session->s_mds != session->s_mds) + continue; + + ceph_mdsc_release_dir_caps_no_check(req); + + __send_request(mdsc, session, req, true); + } + mutex_unlock(&mdsc->mutex); +} + +static int send_reconnect_partial(struct ceph_reconnect_state *recon_state) +{ + struct ceph_msg *reply; + struct ceph_pagelist *_pagelist; + struct page *page; + __le32 *addr; + int err = -ENOMEM; + + if (!recon_state->allow_multi) + return -ENOSPC; + + /* can't handle message that contains both caps and realm */ + BUG_ON(!recon_state->nr_caps == !recon_state->nr_realms); + + /* pre-allocate new pagelist */ + _pagelist = ceph_pagelist_alloc(GFP_NOFS); + if (!_pagelist) + return -ENOMEM; + + reply = ceph_msg_new2(CEPH_MSG_CLIENT_RECONNECT, 0, 1, GFP_NOFS, false); + if (!reply) + goto fail_msg; + + /* placeholder for nr_caps */ + err = ceph_pagelist_encode_32(_pagelist, 0); + if (err < 0) + goto fail; + + if (recon_state->nr_caps) { + /* currently encoding caps */ + err = ceph_pagelist_encode_32(recon_state->pagelist, 0); + if (err) + goto fail; + } else { + /* placeholder for nr_realms (currently encoding relams) */ + err = ceph_pagelist_encode_32(_pagelist, 0); + if (err < 0) + goto fail; + } + + err = ceph_pagelist_encode_8(recon_state->pagelist, 1); + if (err) + goto fail; + + page = list_first_entry(&recon_state->pagelist->head, struct page, lru); + addr = kmap_atomic(page); + if (recon_state->nr_caps) { + /* currently encoding caps */ + *addr = cpu_to_le32(recon_state->nr_caps); + } else { + /* currently encoding relams */ + *(addr + 1) = cpu_to_le32(recon_state->nr_realms); + } + kunmap_atomic(addr); + + reply->hdr.version = cpu_to_le16(5); + reply->hdr.compat_version = cpu_to_le16(4); + + reply->hdr.data_len = cpu_to_le32(recon_state->pagelist->length); + ceph_msg_data_add_pagelist(reply, recon_state->pagelist); + + ceph_con_send(&recon_state->session->s_con, reply); + ceph_pagelist_release(recon_state->pagelist); + + recon_state->pagelist = _pagelist; + recon_state->nr_caps = 0; + recon_state->nr_realms = 0; + recon_state->msg_version = 5; + return 0; +fail: + ceph_msg_put(reply); +fail_msg: + ceph_pagelist_release(_pagelist); + return err; +} + +static struct dentry* d_find_primary(struct inode *inode) +{ + struct dentry *alias, *dn = NULL; + + if (hlist_empty(&inode->i_dentry)) + return NULL; + + spin_lock(&inode->i_lock); + if (hlist_empty(&inode->i_dentry)) + goto out_unlock; + + if (S_ISDIR(inode->i_mode)) { + alias = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias); + if (!IS_ROOT(alias)) + dn = dget(alias); + goto out_unlock; + } + + hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { + spin_lock(&alias->d_lock); + if (!d_unhashed(alias) && + (ceph_dentry(alias)->flags & CEPH_DENTRY_PRIMARY_LINK)) { + dn = dget_dlock(alias); + } + spin_unlock(&alias->d_lock); + if (dn) + break; + } +out_unlock: + spin_unlock(&inode->i_lock); + return dn; +} + +/* + * Encode information about a cap for a reconnect with the MDS. + */ +static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap, + void *arg) +{ + union { + struct ceph_mds_cap_reconnect v2; + struct ceph_mds_cap_reconnect_v1 v1; + } rec; + struct ceph_inode_info *ci = cap->ci; + struct ceph_reconnect_state *recon_state = arg; + struct ceph_pagelist *pagelist = recon_state->pagelist; + struct dentry *dentry; + char *path; + int pathlen = 0, err; + u64 pathbase; + u64 snap_follows; + + dout(" adding %p ino %llx.%llx cap %p %lld %s\n", + inode, ceph_vinop(inode), cap, cap->cap_id, + ceph_cap_string(cap->issued)); + + dentry = d_find_primary(inode); + if (dentry) { + /* set pathbase to parent dir when msg_version >= 2 */ + path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase, + recon_state->msg_version >= 2); + dput(dentry); + if (IS_ERR(path)) { + err = PTR_ERR(path); + goto out_err; + } + } else { + path = NULL; + pathbase = 0; + } + + spin_lock(&ci->i_ceph_lock); + cap->seq = 0; /* reset cap seq */ + cap->issue_seq = 0; /* and issue_seq */ + cap->mseq = 0; /* and migrate_seq */ + cap->cap_gen = cap->session->s_cap_gen; + + /* These are lost when the session goes away */ + if (S_ISDIR(inode->i_mode)) { + if (cap->issued & CEPH_CAP_DIR_CREATE) { + ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns)); + memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout)); + } + cap->issued &= ~CEPH_CAP_ANY_DIR_OPS; + } + + if (recon_state->msg_version >= 2) { + rec.v2.cap_id = cpu_to_le64(cap->cap_id); + rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); + rec.v2.issued = cpu_to_le32(cap->issued); + rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); + rec.v2.pathbase = cpu_to_le64(pathbase); + rec.v2.flock_len = (__force __le32) + ((ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) ? 0 : 1); + } else { + rec.v1.cap_id = cpu_to_le64(cap->cap_id); + rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); + rec.v1.issued = cpu_to_le32(cap->issued); + rec.v1.size = cpu_to_le64(inode->i_size); + ceph_encode_timespec64(&rec.v1.mtime, &inode->i_mtime); + ceph_encode_timespec64(&rec.v1.atime, &inode->i_atime); + rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); + rec.v1.pathbase = cpu_to_le64(pathbase); + } + + if (list_empty(&ci->i_cap_snaps)) { + snap_follows = ci->i_head_snapc ? ci->i_head_snapc->seq : 0; + } else { + struct ceph_cap_snap *capsnap = + list_first_entry(&ci->i_cap_snaps, + struct ceph_cap_snap, ci_item); + snap_follows = capsnap->follows; + } + spin_unlock(&ci->i_ceph_lock); + + if (recon_state->msg_version >= 2) { + int num_fcntl_locks, num_flock_locks; + struct ceph_filelock *flocks = NULL; + size_t struct_len, total_len = sizeof(u64); + u8 struct_v = 0; + +encode_again: + if (rec.v2.flock_len) { + ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks); + } else { + num_fcntl_locks = 0; + num_flock_locks = 0; + } + if (num_fcntl_locks + num_flock_locks > 0) { + flocks = kmalloc_array(num_fcntl_locks + num_flock_locks, + sizeof(struct ceph_filelock), + GFP_NOFS); + if (!flocks) { + err = -ENOMEM; + goto out_err; + } + err = ceph_encode_locks_to_buffer(inode, flocks, + num_fcntl_locks, + num_flock_locks); + if (err) { + kfree(flocks); + flocks = NULL; + if (err == -ENOSPC) + goto encode_again; + goto out_err; + } + } else { + kfree(flocks); + flocks = NULL; + } + + if (recon_state->msg_version >= 3) { + /* version, compat_version and struct_len */ + total_len += 2 * sizeof(u8) + sizeof(u32); + struct_v = 2; + } + /* + * number of encoded locks is stable, so copy to pagelist + */ + struct_len = 2 * sizeof(u32) + + (num_fcntl_locks + num_flock_locks) * + sizeof(struct ceph_filelock); + rec.v2.flock_len = cpu_to_le32(struct_len); + + struct_len += sizeof(u32) + pathlen + sizeof(rec.v2); + + if (struct_v >= 2) + struct_len += sizeof(u64); /* snap_follows */ + + total_len += struct_len; + + if (pagelist->length + total_len > RECONNECT_MAX_SIZE) { + err = send_reconnect_partial(recon_state); + if (err) + goto out_freeflocks; + pagelist = recon_state->pagelist; + } + + err = ceph_pagelist_reserve(pagelist, total_len); + if (err) + goto out_freeflocks; + + ceph_pagelist_encode_64(pagelist, ceph_ino(inode)); + if (recon_state->msg_version >= 3) { + ceph_pagelist_encode_8(pagelist, struct_v); + ceph_pagelist_encode_8(pagelist, 1); + ceph_pagelist_encode_32(pagelist, struct_len); + } + ceph_pagelist_encode_string(pagelist, path, pathlen); + ceph_pagelist_append(pagelist, &rec, sizeof(rec.v2)); + ceph_locks_to_pagelist(flocks, pagelist, + num_fcntl_locks, num_flock_locks); + if (struct_v >= 2) + ceph_pagelist_encode_64(pagelist, snap_follows); +out_freeflocks: + kfree(flocks); + } else { + err = ceph_pagelist_reserve(pagelist, + sizeof(u64) + sizeof(u32) + + pathlen + sizeof(rec.v1)); + if (err) + goto out_err; + + ceph_pagelist_encode_64(pagelist, ceph_ino(inode)); + ceph_pagelist_encode_string(pagelist, path, pathlen); + ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1)); + } + +out_err: + ceph_mdsc_free_path(path, pathlen); + if (!err) + recon_state->nr_caps++; + return err; +} + +static int encode_snap_realms(struct ceph_mds_client *mdsc, + struct ceph_reconnect_state *recon_state) +{ + struct rb_node *p; + struct ceph_pagelist *pagelist = recon_state->pagelist; + int err = 0; + + if (recon_state->msg_version >= 4) { + err = ceph_pagelist_encode_32(pagelist, mdsc->num_snap_realms); + if (err < 0) + goto fail; + } + + /* + * snaprealms. we provide mds with the ino, seq (version), and + * parent for all of our realms. If the mds has any newer info, + * it will tell us. + */ + for (p = rb_first(&mdsc->snap_realms); p; p = rb_next(p)) { + struct ceph_snap_realm *realm = + rb_entry(p, struct ceph_snap_realm, node); + struct ceph_mds_snaprealm_reconnect sr_rec; + + if (recon_state->msg_version >= 4) { + size_t need = sizeof(u8) * 2 + sizeof(u32) + + sizeof(sr_rec); + + if (pagelist->length + need > RECONNECT_MAX_SIZE) { + err = send_reconnect_partial(recon_state); + if (err) + goto fail; + pagelist = recon_state->pagelist; + } + + err = ceph_pagelist_reserve(pagelist, need); + if (err) + goto fail; + + ceph_pagelist_encode_8(pagelist, 1); + ceph_pagelist_encode_8(pagelist, 1); + ceph_pagelist_encode_32(pagelist, sizeof(sr_rec)); + } + + dout(" adding snap realm %llx seq %lld parent %llx\n", + realm->ino, realm->seq, realm->parent_ino); + sr_rec.ino = cpu_to_le64(realm->ino); + sr_rec.seq = cpu_to_le64(realm->seq); + sr_rec.parent = cpu_to_le64(realm->parent_ino); + + err = ceph_pagelist_append(pagelist, &sr_rec, sizeof(sr_rec)); + if (err) + goto fail; + + recon_state->nr_realms++; + } +fail: + return err; +} + + +/* + * If an MDS fails and recovers, clients need to reconnect in order to + * reestablish shared state. This includes all caps issued through + * this session _and_ the snap_realm hierarchy. Because it's not + * clear which snap realms the mds cares about, we send everything we + * know about.. that ensures we'll then get any new info the + * recovering MDS might have. + * + * This is a relatively heavyweight operation, but it's rare. + */ +static void send_mds_reconnect(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_msg *reply; + int mds = session->s_mds; + int err = -ENOMEM; + struct ceph_reconnect_state recon_state = { + .session = session, + }; + LIST_HEAD(dispose); + + pr_info("mds%d reconnect start\n", mds); + + recon_state.pagelist = ceph_pagelist_alloc(GFP_NOFS); + if (!recon_state.pagelist) + goto fail_nopagelist; + + reply = ceph_msg_new2(CEPH_MSG_CLIENT_RECONNECT, 0, 1, GFP_NOFS, false); + if (!reply) + goto fail_nomsg; + + xa_destroy(&session->s_delegated_inos); + + mutex_lock(&session->s_mutex); + session->s_state = CEPH_MDS_SESSION_RECONNECTING; + session->s_seq = 0; + + dout("session %p state %s\n", session, + ceph_session_state_name(session->s_state)); + + spin_lock(&session->s_gen_ttl_lock); + session->s_cap_gen++; + spin_unlock(&session->s_gen_ttl_lock); + + spin_lock(&session->s_cap_lock); + /* don't know if session is readonly */ + session->s_readonly = 0; + /* + * notify __ceph_remove_cap() that we are composing cap reconnect. + * If a cap get released before being added to the cap reconnect, + * __ceph_remove_cap() should skip queuing cap release. + */ + session->s_cap_reconnect = 1; + /* drop old cap expires; we're about to reestablish that state */ + detach_cap_releases(session, &dispose); + spin_unlock(&session->s_cap_lock); + dispose_cap_releases(mdsc, &dispose); + + /* trim unused caps to reduce MDS's cache rejoin time */ + if (mdsc->fsc->sb->s_root) + shrink_dcache_parent(mdsc->fsc->sb->s_root); + + ceph_con_close(&session->s_con); + ceph_con_open(&session->s_con, + CEPH_ENTITY_TYPE_MDS, mds, + ceph_mdsmap_get_addr(mdsc->mdsmap, mds)); + + /* replay unsafe requests */ + replay_unsafe_requests(mdsc, session); + + ceph_early_kick_flushing_caps(mdsc, session); + + down_read(&mdsc->snap_rwsem); + + /* placeholder for nr_caps */ + err = ceph_pagelist_encode_32(recon_state.pagelist, 0); + if (err) + goto fail; + + if (test_bit(CEPHFS_FEATURE_MULTI_RECONNECT, &session->s_features)) { + recon_state.msg_version = 3; + recon_state.allow_multi = true; + } else if (session->s_con.peer_features & CEPH_FEATURE_MDSENC) { + recon_state.msg_version = 3; + } else { + recon_state.msg_version = 2; + } + /* trsaverse this session's caps */ + err = ceph_iterate_session_caps(session, reconnect_caps_cb, &recon_state); + + spin_lock(&session->s_cap_lock); + session->s_cap_reconnect = 0; + spin_unlock(&session->s_cap_lock); + + if (err < 0) + goto fail; + + /* check if all realms can be encoded into current message */ + if (mdsc->num_snap_realms) { + size_t total_len = + recon_state.pagelist->length + + mdsc->num_snap_realms * + sizeof(struct ceph_mds_snaprealm_reconnect); + if (recon_state.msg_version >= 4) { + /* number of realms */ + total_len += sizeof(u32); + /* version, compat_version and struct_len */ + total_len += mdsc->num_snap_realms * + (2 * sizeof(u8) + sizeof(u32)); + } + if (total_len > RECONNECT_MAX_SIZE) { + if (!recon_state.allow_multi) { + err = -ENOSPC; + goto fail; + } + if (recon_state.nr_caps) { + err = send_reconnect_partial(&recon_state); + if (err) + goto fail; + } + recon_state.msg_version = 5; + } + } + + err = encode_snap_realms(mdsc, &recon_state); + if (err < 0) + goto fail; + + if (recon_state.msg_version >= 5) { + err = ceph_pagelist_encode_8(recon_state.pagelist, 0); + if (err < 0) + goto fail; + } + + if (recon_state.nr_caps || recon_state.nr_realms) { + struct page *page = + list_first_entry(&recon_state.pagelist->head, + struct page, lru); + __le32 *addr = kmap_atomic(page); + if (recon_state.nr_caps) { + WARN_ON(recon_state.nr_realms != mdsc->num_snap_realms); + *addr = cpu_to_le32(recon_state.nr_caps); + } else if (recon_state.msg_version >= 4) { + *(addr + 1) = cpu_to_le32(recon_state.nr_realms); + } + kunmap_atomic(addr); + } + + reply->hdr.version = cpu_to_le16(recon_state.msg_version); + if (recon_state.msg_version >= 4) + reply->hdr.compat_version = cpu_to_le16(4); + + reply->hdr.data_len = cpu_to_le32(recon_state.pagelist->length); + ceph_msg_data_add_pagelist(reply, recon_state.pagelist); + + ceph_con_send(&session->s_con, reply); + + mutex_unlock(&session->s_mutex); + + mutex_lock(&mdsc->mutex); + __wake_requests(mdsc, &session->s_waiting); + mutex_unlock(&mdsc->mutex); + + up_read(&mdsc->snap_rwsem); + ceph_pagelist_release(recon_state.pagelist); + return; + +fail: + ceph_msg_put(reply); + up_read(&mdsc->snap_rwsem); + mutex_unlock(&session->s_mutex); +fail_nomsg: + ceph_pagelist_release(recon_state.pagelist); +fail_nopagelist: + pr_err("error %d preparing reconnect for mds%d\n", err, mds); + return; +} + + +/* + * compare old and new mdsmaps, kicking requests + * and closing out old connections as necessary + * + * called under mdsc->mutex. + */ +static void check_new_map(struct ceph_mds_client *mdsc, + struct ceph_mdsmap *newmap, + struct ceph_mdsmap *oldmap) +{ + int i; + int oldstate, newstate; + struct ceph_mds_session *s; + + dout("check_new_map new %u old %u\n", + newmap->m_epoch, oldmap->m_epoch); + + for (i = 0; i < oldmap->possible_max_rank && i < mdsc->max_sessions; i++) { + if (!mdsc->sessions[i]) + continue; + s = mdsc->sessions[i]; + oldstate = ceph_mdsmap_get_state(oldmap, i); + newstate = ceph_mdsmap_get_state(newmap, i); + + dout("check_new_map mds%d state %s%s -> %s%s (session %s)\n", + i, ceph_mds_state_name(oldstate), + ceph_mdsmap_is_laggy(oldmap, i) ? " (laggy)" : "", + ceph_mds_state_name(newstate), + ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "", + ceph_session_state_name(s->s_state)); + + if (i >= newmap->possible_max_rank) { + /* force close session for stopped mds */ + ceph_get_mds_session(s); + __unregister_session(mdsc, s); + __wake_requests(mdsc, &s->s_waiting); + mutex_unlock(&mdsc->mutex); + + mutex_lock(&s->s_mutex); + cleanup_session_requests(mdsc, s); + remove_session_caps(s); + mutex_unlock(&s->s_mutex); + + ceph_put_mds_session(s); + + mutex_lock(&mdsc->mutex); + kick_requests(mdsc, i); + continue; + } + + if (memcmp(ceph_mdsmap_get_addr(oldmap, i), + ceph_mdsmap_get_addr(newmap, i), + sizeof(struct ceph_entity_addr))) { + /* just close it */ + mutex_unlock(&mdsc->mutex); + mutex_lock(&s->s_mutex); + mutex_lock(&mdsc->mutex); + ceph_con_close(&s->s_con); + mutex_unlock(&s->s_mutex); + s->s_state = CEPH_MDS_SESSION_RESTARTING; + } else if (oldstate == newstate) { + continue; /* nothing new with this mds */ + } + + /* + * send reconnect? + */ + if (s->s_state == CEPH_MDS_SESSION_RESTARTING && + newstate >= CEPH_MDS_STATE_RECONNECT) { + mutex_unlock(&mdsc->mutex); + send_mds_reconnect(mdsc, s); + mutex_lock(&mdsc->mutex); + } + + /* + * kick request on any mds that has gone active. + */ + if (oldstate < CEPH_MDS_STATE_ACTIVE && + newstate >= CEPH_MDS_STATE_ACTIVE) { + if (oldstate != CEPH_MDS_STATE_CREATING && + oldstate != CEPH_MDS_STATE_STARTING) + pr_info("mds%d recovery completed\n", s->s_mds); + kick_requests(mdsc, i); + mutex_unlock(&mdsc->mutex); + mutex_lock(&s->s_mutex); + mutex_lock(&mdsc->mutex); + ceph_kick_flushing_caps(mdsc, s); + mutex_unlock(&s->s_mutex); + wake_up_session_caps(s, RECONNECT); + } + } + + for (i = 0; i < newmap->possible_max_rank && i < mdsc->max_sessions; i++) { + s = mdsc->sessions[i]; + if (!s) + continue; + if (!ceph_mdsmap_is_laggy(newmap, i)) + continue; + if (s->s_state == CEPH_MDS_SESSION_OPEN || + s->s_state == CEPH_MDS_SESSION_HUNG || + s->s_state == CEPH_MDS_SESSION_CLOSING) { + dout(" connecting to export targets of laggy mds%d\n", + i); + __open_export_target_sessions(mdsc, s); + } + } +} + + + +/* + * leases + */ + +/* + * caller must hold session s_mutex, dentry->d_lock + */ +void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry) +{ + struct ceph_dentry_info *di = ceph_dentry(dentry); + + ceph_put_mds_session(di->lease_session); + di->lease_session = NULL; +} + +static void handle_lease(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + struct ceph_msg *msg) +{ + struct super_block *sb = mdsc->fsc->sb; + struct inode *inode; + struct dentry *parent, *dentry; + struct ceph_dentry_info *di; + int mds = session->s_mds; + struct ceph_mds_lease *h = msg->front.iov_base; + u32 seq; + struct ceph_vino vino; + struct qstr dname; + int release = 0; + + dout("handle_lease from mds%d\n", mds); + + /* decode */ + if (msg->front.iov_len < sizeof(*h) + sizeof(u32)) + goto bad; + vino.ino = le64_to_cpu(h->ino); + vino.snap = CEPH_NOSNAP; + seq = le32_to_cpu(h->seq); + dname.len = get_unaligned_le32(h + 1); + if (msg->front.iov_len < sizeof(*h) + sizeof(u32) + dname.len) + goto bad; + dname.name = (void *)(h + 1) + sizeof(u32); + + /* lookup inode */ + inode = ceph_find_inode(sb, vino); + dout("handle_lease %s, ino %llx %p %.*s\n", + ceph_lease_op_name(h->action), vino.ino, inode, + dname.len, dname.name); + + mutex_lock(&session->s_mutex); + inc_session_sequence(session); + + if (!inode) { + dout("handle_lease no inode %llx\n", vino.ino); + goto release; + } + + /* dentry */ + parent = d_find_alias(inode); + if (!parent) { + dout("no parent dentry on inode %p\n", inode); + WARN_ON(1); + goto release; /* hrm... */ + } + dname.hash = full_name_hash(parent, dname.name, dname.len); + dentry = d_lookup(parent, &dname); + dput(parent); + if (!dentry) + goto release; + + spin_lock(&dentry->d_lock); + di = ceph_dentry(dentry); + switch (h->action) { + case CEPH_MDS_LEASE_REVOKE: + if (di->lease_session == session) { + if (ceph_seq_cmp(di->lease_seq, seq) > 0) + h->seq = cpu_to_le32(di->lease_seq); + __ceph_mdsc_drop_dentry_lease(dentry); + } + release = 1; + break; + + case CEPH_MDS_LEASE_RENEW: + if (di->lease_session == session && + di->lease_gen == session->s_cap_gen && + di->lease_renew_from && + di->lease_renew_after == 0) { + unsigned long duration = + msecs_to_jiffies(le32_to_cpu(h->duration_ms)); + + di->lease_seq = seq; + di->time = di->lease_renew_from + duration; + di->lease_renew_after = di->lease_renew_from + + (duration >> 1); + di->lease_renew_from = 0; + } + break; + } + spin_unlock(&dentry->d_lock); + dput(dentry); + + if (!release) + goto out; + +release: + /* let's just reuse the same message */ + h->action = CEPH_MDS_LEASE_REVOKE_ACK; + ceph_msg_get(msg); + ceph_con_send(&session->s_con, msg); + +out: + mutex_unlock(&session->s_mutex); + /* avoid calling iput_final() in mds dispatch threads */ + ceph_async_iput(inode); + return; + +bad: + pr_err("corrupt lease message\n"); + ceph_msg_dump(msg); +} + +void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, + struct dentry *dentry, char action, + u32 seq) +{ + struct ceph_msg *msg; + struct ceph_mds_lease *lease; + struct inode *dir; + int len = sizeof(*lease) + sizeof(u32) + NAME_MAX; + + dout("lease_send_msg identry %p %s to mds%d\n", + dentry, ceph_lease_op_name(action), session->s_mds); + + msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS, false); + if (!msg) + return; + lease = msg->front.iov_base; + lease->action = action; + lease->seq = cpu_to_le32(seq); + + spin_lock(&dentry->d_lock); + dir = d_inode(dentry->d_parent); + lease->ino = cpu_to_le64(ceph_ino(dir)); + lease->first = lease->last = cpu_to_le64(ceph_snap(dir)); + + put_unaligned_le32(dentry->d_name.len, lease + 1); + memcpy((void *)(lease + 1) + 4, + dentry->d_name.name, dentry->d_name.len); + spin_unlock(&dentry->d_lock); + /* + * if this is a preemptive lease RELEASE, no need to + * flush request stream, since the actual request will + * soon follow. + */ + msg->more_to_follow = (action == CEPH_MDS_LEASE_RELEASE); + + ceph_con_send(&session->s_con, msg); +} + +/* + * lock unlock the session, to wait ongoing session activities + */ +static void lock_unlock_session(struct ceph_mds_session *s) +{ + mutex_lock(&s->s_mutex); + mutex_unlock(&s->s_mutex); +} + +static void maybe_recover_session(struct ceph_mds_client *mdsc) +{ + struct ceph_fs_client *fsc = mdsc->fsc; + + if (!ceph_test_mount_opt(fsc, CLEANRECOVER)) + return; + + if (READ_ONCE(fsc->mount_state) != CEPH_MOUNT_MOUNTED) + return; + + if (!READ_ONCE(fsc->blocklisted)) + return; + + if (fsc->last_auto_reconnect && + time_before(jiffies, fsc->last_auto_reconnect + HZ * 60 * 30)) + return; + + pr_info("auto reconnect after blocklisted\n"); + fsc->last_auto_reconnect = jiffies; + ceph_force_reconnect(fsc->sb); +} + +bool check_session_state(struct ceph_mds_session *s) +{ + switch (s->s_state) { + case CEPH_MDS_SESSION_OPEN: + if (s->s_ttl && time_after(jiffies, s->s_ttl)) { + s->s_state = CEPH_MDS_SESSION_HUNG; + pr_info("mds%d hung\n", s->s_mds); + } + break; + case CEPH_MDS_SESSION_CLOSING: + /* Should never reach this when we're unmounting */ + WARN_ON_ONCE(s->s_ttl); + fallthrough; + case CEPH_MDS_SESSION_NEW: + case CEPH_MDS_SESSION_RESTARTING: + case CEPH_MDS_SESSION_CLOSED: + case CEPH_MDS_SESSION_REJECTED: + return false; + } + + return true; +} + +/* + * If the sequence is incremented while we're waiting on a REQUEST_CLOSE reply, + * then we need to retransmit that request. + */ +void inc_session_sequence(struct ceph_mds_session *s) +{ + lockdep_assert_held(&s->s_mutex); + + s->s_seq++; + + if (s->s_state == CEPH_MDS_SESSION_CLOSING) { + int ret; + + dout("resending session close request for mds%d\n", s->s_mds); + ret = request_close_session(s); + if (ret < 0) + pr_err("unable to close session to mds%d: %d\n", + s->s_mds, ret); + } +} + +/* + * delayed work -- periodically trim expired leases, renew caps with mds. If + * the @delay parameter is set to 0 or if it's more than 5 secs, the default + * workqueue delay value of 5 secs will be used. + */ +static void schedule_delayed(struct ceph_mds_client *mdsc, unsigned long delay) +{ + unsigned long max_delay = HZ * 5; + + /* 5 secs default delay */ + if (!delay || (delay > max_delay)) + delay = max_delay; + schedule_delayed_work(&mdsc->delayed_work, + round_jiffies_relative(delay)); +} + +static void delayed_work(struct work_struct *work) +{ + struct ceph_mds_client *mdsc = + container_of(work, struct ceph_mds_client, delayed_work.work); + unsigned long delay; + int renew_interval; + int renew_caps; + int i; + + dout("mdsc delayed_work\n"); + + if (mdsc->stopping >= CEPH_MDSC_STOPPING_FLUSHED) + return; + + mutex_lock(&mdsc->mutex); + renew_interval = mdsc->mdsmap->m_session_timeout >> 2; + renew_caps = time_after_eq(jiffies, HZ*renew_interval + + mdsc->last_renew_caps); + if (renew_caps) + mdsc->last_renew_caps = jiffies; + + for (i = 0; i < mdsc->max_sessions; i++) { + struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i); + if (!s) + continue; + + if (!check_session_state(s)) { + ceph_put_mds_session(s); + continue; + } + mutex_unlock(&mdsc->mutex); + + mutex_lock(&s->s_mutex); + if (renew_caps) + send_renew_caps(mdsc, s); + else + ceph_con_keepalive(&s->s_con); + if (s->s_state == CEPH_MDS_SESSION_OPEN || + s->s_state == CEPH_MDS_SESSION_HUNG) + ceph_send_cap_releases(mdsc, s); + mutex_unlock(&s->s_mutex); + ceph_put_mds_session(s); + + mutex_lock(&mdsc->mutex); + } + mutex_unlock(&mdsc->mutex); + + delay = ceph_check_delayed_caps(mdsc); + + ceph_queue_cap_reclaim_work(mdsc); + + ceph_trim_snapid_map(mdsc); + + maybe_recover_session(mdsc); + + schedule_delayed(mdsc, delay); +} + +int ceph_mdsc_init(struct ceph_fs_client *fsc) + +{ + struct ceph_mds_client *mdsc; + int err; + + mdsc = kzalloc(sizeof(struct ceph_mds_client), GFP_NOFS); + if (!mdsc) + return -ENOMEM; + mdsc->fsc = fsc; + mutex_init(&mdsc->mutex); + mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS); + if (!mdsc->mdsmap) { + err = -ENOMEM; + goto err_mdsc; + } + + init_completion(&mdsc->safe_umount_waiters); + init_waitqueue_head(&mdsc->session_close_wq); + INIT_LIST_HEAD(&mdsc->waiting_for_map); + mdsc->sessions = NULL; + atomic_set(&mdsc->num_sessions, 0); + mdsc->max_sessions = 0; + mdsc->stopping = 0; + atomic64_set(&mdsc->quotarealms_count, 0); + mdsc->quotarealms_inodes = RB_ROOT; + mutex_init(&mdsc->quotarealms_inodes_mutex); + mdsc->last_snap_seq = 0; + init_rwsem(&mdsc->snap_rwsem); + mdsc->snap_realms = RB_ROOT; + INIT_LIST_HEAD(&mdsc->snap_empty); + mdsc->num_snap_realms = 0; + spin_lock_init(&mdsc->snap_empty_lock); + mdsc->last_tid = 0; + mdsc->oldest_tid = 0; + mdsc->request_tree = RB_ROOT; + INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work); + mdsc->last_renew_caps = jiffies; + INIT_LIST_HEAD(&mdsc->cap_delay_list); + INIT_LIST_HEAD(&mdsc->cap_wait_list); + spin_lock_init(&mdsc->cap_delay_lock); + INIT_LIST_HEAD(&mdsc->snap_flush_list); + spin_lock_init(&mdsc->snap_flush_lock); + mdsc->last_cap_flush_tid = 1; + INIT_LIST_HEAD(&mdsc->cap_flush_list); + INIT_LIST_HEAD(&mdsc->cap_dirty_migrating); + mdsc->num_cap_flushing = 0; + spin_lock_init(&mdsc->cap_dirty_lock); + init_waitqueue_head(&mdsc->cap_flushing_wq); + INIT_WORK(&mdsc->cap_reclaim_work, ceph_cap_reclaim_work); + atomic_set(&mdsc->cap_reclaim_pending, 0); + err = ceph_metric_init(&mdsc->metric); + if (err) + goto err_mdsmap; + + spin_lock_init(&mdsc->dentry_list_lock); + INIT_LIST_HEAD(&mdsc->dentry_leases); + INIT_LIST_HEAD(&mdsc->dentry_dir_leases); + + ceph_caps_init(mdsc); + ceph_adjust_caps_max_min(mdsc, fsc->mount_options); + + spin_lock_init(&mdsc->snapid_map_lock); + mdsc->snapid_map_tree = RB_ROOT; + INIT_LIST_HEAD(&mdsc->snapid_map_lru); + + init_rwsem(&mdsc->pool_perm_rwsem); + mdsc->pool_perm_tree = RB_ROOT; + + strscpy(mdsc->nodename, utsname()->nodename, + sizeof(mdsc->nodename)); + + fsc->mdsc = mdsc; + return 0; + +err_mdsmap: + kfree(mdsc->mdsmap); +err_mdsc: + kfree(mdsc); + return err; +} + +/* + * Wait for safe replies on open mds requests. If we time out, drop + * all requests from the tree to avoid dangling dentry refs. + */ +static void wait_requests(struct ceph_mds_client *mdsc) +{ + struct ceph_options *opts = mdsc->fsc->client->options; + struct ceph_mds_request *req; + + mutex_lock(&mdsc->mutex); + if (__get_oldest_req(mdsc)) { + mutex_unlock(&mdsc->mutex); + + dout("wait_requests waiting for requests\n"); + wait_for_completion_timeout(&mdsc->safe_umount_waiters, + ceph_timeout_jiffies(opts->mount_timeout)); + + /* tear down remaining requests */ + mutex_lock(&mdsc->mutex); + while ((req = __get_oldest_req(mdsc))) { + dout("wait_requests timed out on tid %llu\n", + req->r_tid); + list_del_init(&req->r_wait); + __unregister_request(mdsc, req); + } + } + mutex_unlock(&mdsc->mutex); + dout("wait_requests done\n"); +} + +void send_flush_mdlog(struct ceph_mds_session *s) +{ + struct ceph_msg *msg; + + /* + * Pre-luminous MDS crashes when it sees an unknown session request + */ + if (!CEPH_HAVE_FEATURE(s->s_con.peer_features, SERVER_LUMINOUS)) + return; + + mutex_lock(&s->s_mutex); + dout("request mdlog flush to mds%d (%s)s seq %lld\n", s->s_mds, + ceph_session_state_name(s->s_state), s->s_seq); + msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_FLUSH_MDLOG, + s->s_seq); + if (!msg) { + pr_err("failed to request mdlog flush to mds%d (%s) seq %lld\n", + s->s_mds, ceph_session_state_name(s->s_state), s->s_seq); + } else { + ceph_con_send(&s->s_con, msg); + } + mutex_unlock(&s->s_mutex); +} + +/* + * called before mount is ro, and before dentries are torn down. + * (hmm, does this still race with new lookups?) + */ +void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc) +{ + dout("pre_umount\n"); + mdsc->stopping = CEPH_MDSC_STOPPING_BEGIN; + + ceph_mdsc_iterate_sessions(mdsc, send_flush_mdlog, true); + ceph_mdsc_iterate_sessions(mdsc, lock_unlock_session, false); + ceph_flush_dirty_caps(mdsc); + wait_requests(mdsc); + + /* + * wait for reply handlers to drop their request refs and + * their inode/dcache refs + */ + ceph_msgr_flush(); + + ceph_cleanup_quotarealms_inodes(mdsc); +} + +/* + * wait for all write mds requests to flush. + */ +static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid) +{ + struct ceph_mds_request *req = NULL, *nextreq; + struct rb_node *n; + + mutex_lock(&mdsc->mutex); + dout("wait_unsafe_requests want %lld\n", want_tid); +restart: + req = __get_oldest_req(mdsc); + while (req && req->r_tid <= want_tid) { + /* find next request */ + n = rb_next(&req->r_node); + if (n) + nextreq = rb_entry(n, struct ceph_mds_request, r_node); + else + nextreq = NULL; + if (req->r_op != CEPH_MDS_OP_SETFILELOCK && + (req->r_op & CEPH_MDS_OP_WRITE)) { + /* write op */ + ceph_mdsc_get_request(req); + if (nextreq) + ceph_mdsc_get_request(nextreq); + mutex_unlock(&mdsc->mutex); + dout("wait_unsafe_requests wait on %llu (want %llu)\n", + req->r_tid, want_tid); + wait_for_completion(&req->r_safe_completion); + mutex_lock(&mdsc->mutex); + ceph_mdsc_put_request(req); + if (!nextreq) + break; /* next dne before, so we're done! */ + if (RB_EMPTY_NODE(&nextreq->r_node)) { + /* next request was removed from tree */ + ceph_mdsc_put_request(nextreq); + goto restart; + } + ceph_mdsc_put_request(nextreq); /* won't go away */ + } + req = nextreq; + } + mutex_unlock(&mdsc->mutex); + dout("wait_unsafe_requests done\n"); +} + +void ceph_mdsc_sync(struct ceph_mds_client *mdsc) +{ + u64 want_tid, want_flush; + + if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) + return; + + dout("sync\n"); + mutex_lock(&mdsc->mutex); + want_tid = mdsc->last_tid; + mutex_unlock(&mdsc->mutex); + + ceph_flush_dirty_caps(mdsc); + spin_lock(&mdsc->cap_dirty_lock); + want_flush = mdsc->last_cap_flush_tid; + if (!list_empty(&mdsc->cap_flush_list)) { + struct ceph_cap_flush *cf = + list_last_entry(&mdsc->cap_flush_list, + struct ceph_cap_flush, g_list); + cf->wake = true; + } + spin_unlock(&mdsc->cap_dirty_lock); + + dout("sync want tid %lld flush_seq %lld\n", + want_tid, want_flush); + + wait_unsafe_requests(mdsc, want_tid); + wait_caps_flush(mdsc, want_flush); +} + +/* + * true if all sessions are closed, or we force unmount + */ +static bool done_closing_sessions(struct ceph_mds_client *mdsc, int skipped) +{ + if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) + return true; + return atomic_read(&mdsc->num_sessions) <= skipped; +} + +/* + * called after sb is ro. + */ +void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) +{ + struct ceph_options *opts = mdsc->fsc->client->options; + struct ceph_mds_session *session; + int i; + int skipped = 0; + + dout("close_sessions\n"); + + /* close sessions */ + mutex_lock(&mdsc->mutex); + for (i = 0; i < mdsc->max_sessions; i++) { + session = __ceph_lookup_mds_session(mdsc, i); + if (!session) + continue; + mutex_unlock(&mdsc->mutex); + mutex_lock(&session->s_mutex); + if (__close_session(mdsc, session) <= 0) + skipped++; + mutex_unlock(&session->s_mutex); + ceph_put_mds_session(session); + mutex_lock(&mdsc->mutex); + } + mutex_unlock(&mdsc->mutex); + + dout("waiting for sessions to close\n"); + wait_event_timeout(mdsc->session_close_wq, + done_closing_sessions(mdsc, skipped), + ceph_timeout_jiffies(opts->mount_timeout)); + + /* tear down remaining sessions */ + mutex_lock(&mdsc->mutex); + for (i = 0; i < mdsc->max_sessions; i++) { + if (mdsc->sessions[i]) { + session = ceph_get_mds_session(mdsc->sessions[i]); + __unregister_session(mdsc, session); + mutex_unlock(&mdsc->mutex); + mutex_lock(&session->s_mutex); + remove_session_caps(session); + mutex_unlock(&session->s_mutex); + ceph_put_mds_session(session); + mutex_lock(&mdsc->mutex); + } + } + WARN_ON(!list_empty(&mdsc->cap_delay_list)); + mutex_unlock(&mdsc->mutex); + + ceph_cleanup_snapid_map(mdsc); + ceph_cleanup_empty_realms(mdsc); + + cancel_work_sync(&mdsc->cap_reclaim_work); + cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */ + + dout("stopped\n"); +} + +void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc) +{ + struct ceph_mds_session *session; + int mds; + + dout("force umount\n"); + + mutex_lock(&mdsc->mutex); + for (mds = 0; mds < mdsc->max_sessions; mds++) { + session = __ceph_lookup_mds_session(mdsc, mds); + if (!session) + continue; + + if (session->s_state == CEPH_MDS_SESSION_REJECTED) + __unregister_session(mdsc, session); + __wake_requests(mdsc, &session->s_waiting); + mutex_unlock(&mdsc->mutex); + + mutex_lock(&session->s_mutex); + __close_session(mdsc, session); + if (session->s_state == CEPH_MDS_SESSION_CLOSING) { + cleanup_session_requests(mdsc, session); + remove_session_caps(session); + } + mutex_unlock(&session->s_mutex); + ceph_put_mds_session(session); + + mutex_lock(&mdsc->mutex); + kick_requests(mdsc, mds); + } + __wake_requests(mdsc, &mdsc->waiting_for_map); + mutex_unlock(&mdsc->mutex); +} + +static void ceph_mdsc_stop(struct ceph_mds_client *mdsc) +{ + dout("stop\n"); + /* + * Make sure the delayed work stopped before releasing + * the resources. + * + * Because the cancel_delayed_work_sync() will only + * guarantee that the work finishes executing. But the + * delayed work will re-arm itself again after that. + */ + flush_delayed_work(&mdsc->delayed_work); + + if (mdsc->mdsmap) + ceph_mdsmap_destroy(mdsc->mdsmap); + kfree(mdsc->sessions); + ceph_caps_finalize(mdsc); + ceph_pool_perm_destroy(mdsc); +} + +void ceph_mdsc_destroy(struct ceph_fs_client *fsc) +{ + struct ceph_mds_client *mdsc = fsc->mdsc; + dout("mdsc_destroy %p\n", mdsc); + + if (!mdsc) + return; + + /* flush out any connection work with references to us */ + ceph_msgr_flush(); + + ceph_mdsc_stop(mdsc); + + ceph_metric_destroy(&mdsc->metric); + + fsc->mdsc = NULL; + kfree(mdsc); + dout("mdsc_destroy %p done\n", mdsc); +} + +void ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg) +{ + struct ceph_fs_client *fsc = mdsc->fsc; + const char *mds_namespace = fsc->mount_options->mds_namespace; + void *p = msg->front.iov_base; + void *end = p + msg->front.iov_len; + u32 epoch; + u32 map_len; + u32 num_fs; + u32 mount_fscid = (u32)-1; + u8 struct_v, struct_cv; + int err = -EINVAL; + + ceph_decode_need(&p, end, sizeof(u32), bad); + epoch = ceph_decode_32(&p); + + dout("handle_fsmap epoch %u\n", epoch); + + ceph_decode_need(&p, end, 2 + sizeof(u32), bad); + struct_v = ceph_decode_8(&p); + struct_cv = ceph_decode_8(&p); + map_len = ceph_decode_32(&p); + + ceph_decode_need(&p, end, sizeof(u32) * 3, bad); + p += sizeof(u32) * 2; /* skip epoch and legacy_client_fscid */ + + num_fs = ceph_decode_32(&p); + while (num_fs-- > 0) { + void *info_p, *info_end; + u32 info_len; + u8 info_v, info_cv; + u32 fscid, namelen; + + ceph_decode_need(&p, end, 2 + sizeof(u32), bad); + info_v = ceph_decode_8(&p); + info_cv = ceph_decode_8(&p); + info_len = ceph_decode_32(&p); + ceph_decode_need(&p, end, info_len, bad); + info_p = p; + info_end = p + info_len; + p = info_end; + + ceph_decode_need(&info_p, info_end, sizeof(u32) * 2, bad); + fscid = ceph_decode_32(&info_p); + namelen = ceph_decode_32(&info_p); + ceph_decode_need(&info_p, info_end, namelen, bad); + + if (mds_namespace && + strlen(mds_namespace) == namelen && + !strncmp(mds_namespace, (char *)info_p, namelen)) { + mount_fscid = fscid; + break; + } + } + + ceph_monc_got_map(&fsc->client->monc, CEPH_SUB_FSMAP, epoch); + if (mount_fscid != (u32)-1) { + fsc->client->monc.fs_cluster_id = mount_fscid; + ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP, + 0, true); + ceph_monc_renew_subs(&fsc->client->monc); + } else { + err = -ENOENT; + goto err_out; + } + return; + +bad: + pr_err("error decoding fsmap\n"); +err_out: + mutex_lock(&mdsc->mutex); + mdsc->mdsmap_err = err; + __wake_requests(mdsc, &mdsc->waiting_for_map); + mutex_unlock(&mdsc->mutex); +} + +/* + * handle mds map update. + */ +void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg) +{ + u32 epoch; + u32 maplen; + void *p = msg->front.iov_base; + void *end = p + msg->front.iov_len; + struct ceph_mdsmap *newmap, *oldmap; + struct ceph_fsid fsid; + int err = -EINVAL; + + ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad); + ceph_decode_copy(&p, &fsid, sizeof(fsid)); + if (ceph_check_fsid(mdsc->fsc->client, &fsid) < 0) + return; + epoch = ceph_decode_32(&p); + maplen = ceph_decode_32(&p); + dout("handle_map epoch %u len %d\n", epoch, (int)maplen); + + /* do we need it? */ + mutex_lock(&mdsc->mutex); + if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) { + dout("handle_map epoch %u <= our %u\n", + epoch, mdsc->mdsmap->m_epoch); + mutex_unlock(&mdsc->mutex); + return; + } + + newmap = ceph_mdsmap_decode(&p, end); + if (IS_ERR(newmap)) { + err = PTR_ERR(newmap); + goto bad_unlock; + } + + /* swap into place */ + if (mdsc->mdsmap) { + oldmap = mdsc->mdsmap; + mdsc->mdsmap = newmap; + check_new_map(mdsc, newmap, oldmap); + ceph_mdsmap_destroy(oldmap); + } else { + mdsc->mdsmap = newmap; /* first mds map */ + } + mdsc->fsc->max_file_size = min((loff_t)mdsc->mdsmap->m_max_file_size, + MAX_LFS_FILESIZE); + + __wake_requests(mdsc, &mdsc->waiting_for_map); + ceph_monc_got_map(&mdsc->fsc->client->monc, CEPH_SUB_MDSMAP, + mdsc->mdsmap->m_epoch); + + mutex_unlock(&mdsc->mutex); + schedule_delayed(mdsc, 0); + return; + +bad_unlock: + mutex_unlock(&mdsc->mutex); +bad: + pr_err("error decoding mdsmap %d\n", err); + return; +} + +static struct ceph_connection *con_get(struct ceph_connection *con) +{ + struct ceph_mds_session *s = con->private; + + if (ceph_get_mds_session(s)) + return con; + return NULL; +} + +static void con_put(struct ceph_connection *con) +{ + struct ceph_mds_session *s = con->private; + + ceph_put_mds_session(s); +} + +/* + * if the client is unresponsive for long enough, the mds will kill + * the session entirely. + */ +static void peer_reset(struct ceph_connection *con) +{ + struct ceph_mds_session *s = con->private; + struct ceph_mds_client *mdsc = s->s_mdsc; + + pr_warn("mds%d closed our session\n", s->s_mds); + send_mds_reconnect(mdsc, s); +} + +static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) +{ + struct ceph_mds_session *s = con->private; + struct ceph_mds_client *mdsc = s->s_mdsc; + int type = le16_to_cpu(msg->hdr.type); + + mutex_lock(&mdsc->mutex); + if (__verify_registered_session(mdsc, s) < 0) { + mutex_unlock(&mdsc->mutex); + goto out; + } + mutex_unlock(&mdsc->mutex); + + switch (type) { + case CEPH_MSG_MDS_MAP: + ceph_mdsc_handle_mdsmap(mdsc, msg); + break; + case CEPH_MSG_FS_MAP_USER: + ceph_mdsc_handle_fsmap(mdsc, msg); + break; + case CEPH_MSG_CLIENT_SESSION: + handle_session(s, msg); + break; + case CEPH_MSG_CLIENT_REPLY: + handle_reply(s, msg); + break; + case CEPH_MSG_CLIENT_REQUEST_FORWARD: + handle_forward(mdsc, s, msg); + break; + case CEPH_MSG_CLIENT_CAPS: + ceph_handle_caps(s, msg); + break; + case CEPH_MSG_CLIENT_SNAP: + ceph_handle_snap(mdsc, s, msg); + break; + case CEPH_MSG_CLIENT_LEASE: + handle_lease(mdsc, s, msg); + break; + case CEPH_MSG_CLIENT_QUOTA: + ceph_handle_quota(mdsc, s, msg); + break; + + default: + pr_err("received unknown message type %d %s\n", type, + ceph_msg_type_name(type)); + } +out: + ceph_msg_put(msg); +} + +/* + * authentication + */ + +/* + * Note: returned pointer is the address of a structure that's + * managed separately. Caller must *not* attempt to free it. + */ +static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con, + int *proto, int force_new) +{ + struct ceph_mds_session *s = con->private; + struct ceph_mds_client *mdsc = s->s_mdsc; + struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; + struct ceph_auth_handshake *auth = &s->s_auth; + + if (force_new && auth->authorizer) { + ceph_auth_destroy_authorizer(auth->authorizer); + auth->authorizer = NULL; + } + if (!auth->authorizer) { + int ret = ceph_auth_create_authorizer(ac, CEPH_ENTITY_TYPE_MDS, + auth); + if (ret) + return ERR_PTR(ret); + } else { + int ret = ceph_auth_update_authorizer(ac, CEPH_ENTITY_TYPE_MDS, + auth); + if (ret) + return ERR_PTR(ret); + } + *proto = ac->protocol; + + return auth; +} + +static int add_authorizer_challenge(struct ceph_connection *con, + void *challenge_buf, int challenge_buf_len) +{ + struct ceph_mds_session *s = con->private; + struct ceph_mds_client *mdsc = s->s_mdsc; + struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; + + return ceph_auth_add_authorizer_challenge(ac, s->s_auth.authorizer, + challenge_buf, challenge_buf_len); +} + +static int verify_authorizer_reply(struct ceph_connection *con) +{ + struct ceph_mds_session *s = con->private; + struct ceph_mds_client *mdsc = s->s_mdsc; + struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; + + return ceph_auth_verify_authorizer_reply(ac, s->s_auth.authorizer); +} + +static int invalidate_authorizer(struct ceph_connection *con) +{ + struct ceph_mds_session *s = con->private; + struct ceph_mds_client *mdsc = s->s_mdsc; + struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; + + ceph_auth_invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS); + + return ceph_monc_validate_auth(&mdsc->fsc->client->monc); +} + +static struct ceph_msg *mds_alloc_msg(struct ceph_connection *con, + struct ceph_msg_header *hdr, int *skip) +{ + struct ceph_msg *msg; + int type = (int) le16_to_cpu(hdr->type); + int front_len = (int) le32_to_cpu(hdr->front_len); + + if (con->in_msg) + return con->in_msg; + + *skip = 0; + msg = ceph_msg_new(type, front_len, GFP_NOFS, false); + if (!msg) { + pr_err("unable to allocate msg type %d len %d\n", + type, front_len); + return NULL; + } + + return msg; +} + +static int mds_sign_message(struct ceph_msg *msg) +{ + struct ceph_mds_session *s = msg->con->private; + struct ceph_auth_handshake *auth = &s->s_auth; + + return ceph_auth_sign_message(auth, msg); +} + +static int mds_check_message_signature(struct ceph_msg *msg) +{ + struct ceph_mds_session *s = msg->con->private; + struct ceph_auth_handshake *auth = &s->s_auth; + + return ceph_auth_check_message_signature(auth, msg); +} + +static const struct ceph_connection_operations mds_con_ops = { + .get = con_get, + .put = con_put, + .dispatch = dispatch, + .get_authorizer = get_authorizer, + .add_authorizer_challenge = add_authorizer_challenge, + .verify_authorizer_reply = verify_authorizer_reply, + .invalidate_authorizer = invalidate_authorizer, + .peer_reset = peer_reset, + .alloc_msg = mds_alloc_msg, + .sign_message = mds_sign_message, + .check_message_signature = mds_check_message_signature, +}; + +/* eof */ diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h new file mode 100644 index 000000000..1c958510f --- /dev/null +++ b/fs/ceph/mds_client.h @@ -0,0 +1,581 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _FS_CEPH_MDS_CLIENT_H +#define _FS_CEPH_MDS_CLIENT_H + +#include <linux/completion.h> +#include <linux/kref.h> +#include <linux/list.h> +#include <linux/mutex.h> +#include <linux/rbtree.h> +#include <linux/spinlock.h> +#include <linux/refcount.h> +#include <linux/utsname.h> +#include <linux/ktime.h> + +#include <linux/ceph/types.h> +#include <linux/ceph/messenger.h> +#include <linux/ceph/mdsmap.h> +#include <linux/ceph/auth.h> + +#include "metric.h" +#include "super.h" + +/* The first 8 bits are reserved for old ceph releases */ +enum ceph_feature_type { + CEPHFS_FEATURE_MIMIC = 8, + CEPHFS_FEATURE_REPLY_ENCODING, + CEPHFS_FEATURE_RECLAIM_CLIENT, + CEPHFS_FEATURE_LAZY_CAP_WANTED, + CEPHFS_FEATURE_MULTI_RECONNECT, + CEPHFS_FEATURE_DELEG_INO, + CEPHFS_FEATURE_METRIC_COLLECT, + + CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_METRIC_COLLECT, +}; + +#define CEPHFS_FEATURES_CLIENT_SUPPORTED { \ + 0, 1, 2, 3, 4, 5, 6, 7, \ + CEPHFS_FEATURE_MIMIC, \ + CEPHFS_FEATURE_REPLY_ENCODING, \ + CEPHFS_FEATURE_LAZY_CAP_WANTED, \ + CEPHFS_FEATURE_MULTI_RECONNECT, \ + CEPHFS_FEATURE_DELEG_INO, \ + CEPHFS_FEATURE_METRIC_COLLECT, \ +} +#define CEPHFS_FEATURES_CLIENT_REQUIRED {} + +/* + * Some lock dependencies: + * + * session->s_mutex + * mdsc->mutex + * + * mdsc->snap_rwsem + * + * ci->i_ceph_lock + * mdsc->snap_flush_lock + * mdsc->cap_delay_lock + * + */ + +struct ceph_fs_client; +struct ceph_cap; + +/* + * parsed info about a single inode. pointers are into the encoded + * on-wire structures within the mds reply message payload. + */ +struct ceph_mds_reply_info_in { + struct ceph_mds_reply_inode *in; + struct ceph_dir_layout dir_layout; + u32 symlink_len; + char *symlink; + u32 xattr_len; + char *xattr_data; + u64 inline_version; + u32 inline_len; + char *inline_data; + u32 pool_ns_len; + char *pool_ns_data; + u64 max_bytes; + u64 max_files; + s32 dir_pin; + struct ceph_timespec btime; + struct ceph_timespec snap_btime; + u64 change_attr; +}; + +struct ceph_mds_reply_dir_entry { + char *name; + u32 name_len; + struct ceph_mds_reply_lease *lease; + struct ceph_mds_reply_info_in inode; + loff_t offset; +}; + +/* + * parsed info about an mds reply, including information about + * either: 1) the target inode and/or its parent directory and dentry, + * and directory contents (for readdir results), or + * 2) the file range lock info (for fcntl F_GETLK results). + */ +struct ceph_mds_reply_info_parsed { + struct ceph_mds_reply_head *head; + + /* trace */ + struct ceph_mds_reply_info_in diri, targeti; + struct ceph_mds_reply_dirfrag *dirfrag; + char *dname; + u32 dname_len; + struct ceph_mds_reply_lease *dlease; + + /* extra */ + union { + /* for fcntl F_GETLK results */ + struct ceph_filelock *filelock_reply; + + /* for readdir results */ + struct { + struct ceph_mds_reply_dirfrag *dir_dir; + size_t dir_buf_size; + int dir_nr; + bool dir_end; + bool dir_complete; + bool hash_order; + bool offset_hash; + struct ceph_mds_reply_dir_entry *dir_entries; + }; + + /* for create results */ + struct { + bool has_create_ino; + u64 ino; + }; + }; + + /* encoded blob describing snapshot contexts for certain + operations (e.g., open) */ + void *snapblob; + int snapblob_len; +}; + + +/* + * cap releases are batched and sent to the MDS en masse. + * + * Account for per-message overhead of mds_cap_release header + * and __le32 for osd epoch barrier trailing field. + */ +#define CEPH_CAPS_PER_RELEASE ((PAGE_SIZE - sizeof(u32) - \ + sizeof(struct ceph_mds_cap_release)) / \ + sizeof(struct ceph_mds_cap_item)) + + +/* + * state associated with each MDS<->client session + */ +enum { + CEPH_MDS_SESSION_NEW = 1, + CEPH_MDS_SESSION_OPENING = 2, + CEPH_MDS_SESSION_OPEN = 3, + CEPH_MDS_SESSION_HUNG = 4, + CEPH_MDS_SESSION_RESTARTING = 5, + CEPH_MDS_SESSION_RECONNECTING = 6, + CEPH_MDS_SESSION_CLOSING = 7, + CEPH_MDS_SESSION_CLOSED = 8, + CEPH_MDS_SESSION_REJECTED = 9, +}; + +struct ceph_mds_session { + struct ceph_mds_client *s_mdsc; + int s_mds; + int s_state; + unsigned long s_ttl; /* time until mds kills us */ + unsigned long s_features; + u64 s_seq; /* incoming msg seq # */ + struct mutex s_mutex; /* serialize session messages */ + + struct ceph_connection s_con; + + struct ceph_auth_handshake s_auth; + + /* protected by s_gen_ttl_lock */ + spinlock_t s_gen_ttl_lock; + u32 s_cap_gen; /* inc each time we get mds stale msg */ + unsigned long s_cap_ttl; /* when session caps expire */ + + /* protected by s_cap_lock */ + spinlock_t s_cap_lock; + refcount_t s_ref; + struct list_head s_caps; /* all caps issued by this session */ + struct ceph_cap *s_cap_iterator; + int s_nr_caps; + int s_num_cap_releases; + int s_cap_reconnect; + int s_readonly; + struct list_head s_cap_releases; /* waiting cap_release messages */ + struct work_struct s_cap_release_work; + + /* See ceph_inode_info->i_dirty_item. */ + struct list_head s_cap_dirty; /* inodes w/ dirty caps */ + + /* See ceph_inode_info->i_flushing_item. */ + struct list_head s_cap_flushing; /* inodes w/ flushing caps */ + + unsigned long s_renew_requested; /* last time we sent a renew req */ + u64 s_renew_seq; + + struct list_head s_waiting; /* waiting requests */ + struct list_head s_unsafe; /* unsafe requests */ + struct xarray s_delegated_inos; +}; + +/* + * modes of choosing which MDS to send a request to + */ +enum { + USE_ANY_MDS, + USE_RANDOM_MDS, + USE_AUTH_MDS, /* prefer authoritative mds for this metadata item */ +}; + +struct ceph_mds_request; +struct ceph_mds_client; + +/* + * request completion callback + */ +typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc, + struct ceph_mds_request *req); +/* + * wait for request completion callback + */ +typedef int (*ceph_mds_request_wait_callback_t) (struct ceph_mds_client *mdsc, + struct ceph_mds_request *req); + +/* + * an in-flight mds request + */ +struct ceph_mds_request { + u64 r_tid; /* transaction id */ + struct rb_node r_node; + struct ceph_mds_client *r_mdsc; + + struct kref r_kref; + int r_op; /* mds op code */ + + /* operation on what? */ + struct inode *r_inode; /* arg1 */ + struct dentry *r_dentry; /* arg1 */ + struct dentry *r_old_dentry; /* arg2: rename from or link from */ + struct inode *r_old_dentry_dir; /* arg2: old dentry's parent dir */ + char *r_path1, *r_path2; + struct ceph_vino r_ino1, r_ino2; + + struct inode *r_parent; /* parent dir inode */ + struct inode *r_target_inode; /* resulting inode */ + +#define CEPH_MDS_R_DIRECT_IS_HASH (1) /* r_direct_hash is valid */ +#define CEPH_MDS_R_ABORTED (2) /* call was aborted */ +#define CEPH_MDS_R_GOT_UNSAFE (3) /* got an unsafe reply */ +#define CEPH_MDS_R_GOT_SAFE (4) /* got a safe reply */ +#define CEPH_MDS_R_GOT_RESULT (5) /* got a result */ +#define CEPH_MDS_R_DID_PREPOPULATE (6) /* prepopulated readdir */ +#define CEPH_MDS_R_PARENT_LOCKED (7) /* is r_parent->i_rwsem wlocked? */ +#define CEPH_MDS_R_ASYNC (8) /* async request */ + unsigned long r_req_flags; + + struct mutex r_fill_mutex; + + union ceph_mds_request_args r_args; + int r_fmode; /* file mode, if expecting cap */ + kuid_t r_uid; + kgid_t r_gid; + int r_request_release_offset; + struct timespec64 r_stamp; + + /* for choosing which mds to send this request to */ + int r_direct_mode; + u32 r_direct_hash; /* choose dir frag based on this dentry hash */ + + /* data payload is used for xattr ops */ + struct ceph_pagelist *r_pagelist; + + /* what caps shall we drop? */ + int r_inode_drop, r_inode_unless; + int r_dentry_drop, r_dentry_unless; + int r_old_dentry_drop, r_old_dentry_unless; + struct inode *r_old_inode; + int r_old_inode_drop, r_old_inode_unless; + + struct ceph_msg *r_request; /* original request */ + struct ceph_msg *r_reply; + struct ceph_mds_reply_info_parsed r_reply_info; + int r_err; + + + struct page *r_locked_page; + int r_dir_caps; + int r_num_caps; + u32 r_readdir_offset; + + unsigned long r_timeout; /* optional. jiffies, 0 is "wait forever" */ + unsigned long r_started; /* start time to measure timeout against */ + unsigned long r_start_latency; /* start time to measure latency */ + unsigned long r_end_latency; /* finish time to measure latency */ + unsigned long r_request_started; /* start time for mds request only, + used to measure lease durations */ + + /* link unsafe requests to parent directory, for fsync */ + struct inode *r_unsafe_dir; + struct list_head r_unsafe_dir_item; + + /* unsafe requests that modify the target inode */ + struct list_head r_unsafe_target_item; + + struct ceph_mds_session *r_session; + + int r_attempts; /* resend attempts */ + int r_num_fwd; /* number of forward attempts */ + int r_resend_mds; /* mds to resend to next, if any*/ + u32 r_sent_on_mseq; /* cap mseq request was sent at*/ + u64 r_deleg_ino; + + struct list_head r_wait; + struct completion r_completion; + struct completion r_safe_completion; + ceph_mds_request_callback_t r_callback; + ceph_mds_request_wait_callback_t r_wait_for_completion; + struct list_head r_unsafe_item; /* per-session unsafe list item */ + + long long r_dir_release_cnt; + long long r_dir_ordered_cnt; + int r_readdir_cache_idx; + + struct ceph_cap_reservation r_caps_reservation; +}; + +struct ceph_pool_perm { + struct rb_node node; + int perm; + s64 pool; + size_t pool_ns_len; + char pool_ns[]; +}; + +struct ceph_snapid_map { + struct rb_node node; + struct list_head lru; + atomic_t ref; + u64 snap; + dev_t dev; + unsigned long last_used; +}; + +/* + * node for list of quotarealm inodes that are not visible from the filesystem + * mountpoint, but required to handle, e.g. quotas. + */ +struct ceph_quotarealm_inode { + struct rb_node node; + u64 ino; + unsigned long timeout; /* last time a lookup failed for this inode */ + struct mutex mutex; + struct inode *inode; +}; + +struct cap_wait { + struct list_head list; + u64 ino; + pid_t tgid; + int need; + int want; +}; + +enum { + CEPH_MDSC_STOPPING_BEGIN = 1, + CEPH_MDSC_STOPPING_FLUSHED = 2, +}; + +/* + * mds client state + */ +struct ceph_mds_client { + struct ceph_fs_client *fsc; + struct mutex mutex; /* all nested structures */ + + struct ceph_mdsmap *mdsmap; + struct completion safe_umount_waiters; + wait_queue_head_t session_close_wq; + struct list_head waiting_for_map; + int mdsmap_err; + + struct ceph_mds_session **sessions; /* NULL for mds if no session */ + atomic_t num_sessions; + int max_sessions; /* len of sessions array */ + int stopping; /* true if shutting down */ + + atomic64_t quotarealms_count; /* # realms with quota */ + /* + * We keep a list of inodes we don't see in the mountpoint but that we + * need to track quota realms. + */ + struct rb_root quotarealms_inodes; + struct mutex quotarealms_inodes_mutex; + + /* + * snap_rwsem will cover cap linkage into snaprealms, and + * realm snap contexts. (later, we can do per-realm snap + * contexts locks..) the empty list contains realms with no + * references (implying they contain no inodes with caps) that + * should be destroyed. + */ + u64 last_snap_seq; + struct rw_semaphore snap_rwsem; + struct rb_root snap_realms; + struct list_head snap_empty; + int num_snap_realms; + spinlock_t snap_empty_lock; /* protect snap_empty */ + + u64 last_tid; /* most recent mds request */ + u64 oldest_tid; /* oldest incomplete mds request, + excluding setfilelock requests */ + struct rb_root request_tree; /* pending mds requests */ + struct delayed_work delayed_work; /* delayed work */ + unsigned long last_renew_caps; /* last time we renewed our caps */ + struct list_head cap_delay_list; /* caps with delayed release */ + spinlock_t cap_delay_lock; /* protects cap_delay_list */ + struct list_head snap_flush_list; /* cap_snaps ready to flush */ + spinlock_t snap_flush_lock; + + u64 last_cap_flush_tid; + struct list_head cap_flush_list; + struct list_head cap_dirty_migrating; /* ...that are migration... */ + int num_cap_flushing; /* # caps we are flushing */ + spinlock_t cap_dirty_lock; /* protects above items */ + wait_queue_head_t cap_flushing_wq; + + struct work_struct cap_reclaim_work; + atomic_t cap_reclaim_pending; + + /* + * Cap reservations + * + * Maintain a global pool of preallocated struct ceph_caps, referenced + * by struct ceph_caps_reservations. This ensures that we preallocate + * memory needed to successfully process an MDS response. (If an MDS + * sends us cap information and we fail to process it, we will have + * problems due to the client and MDS being out of sync.) + * + * Reservations are 'owned' by a ceph_cap_reservation context. + */ + spinlock_t caps_list_lock; + struct list_head caps_list; /* unused (reserved or + unreserved) */ + struct list_head cap_wait_list; + int caps_total_count; /* total caps allocated */ + int caps_use_count; /* in use */ + int caps_use_max; /* max used caps */ + int caps_reserve_count; /* unused, reserved */ + int caps_avail_count; /* unused, unreserved */ + int caps_min_count; /* keep at least this many + (unreserved) */ + spinlock_t dentry_list_lock; + struct list_head dentry_leases; /* fifo list */ + struct list_head dentry_dir_leases; /* lru list */ + + struct ceph_client_metric metric; + + spinlock_t snapid_map_lock; + struct rb_root snapid_map_tree; + struct list_head snapid_map_lru; + + struct rw_semaphore pool_perm_rwsem; + struct rb_root pool_perm_tree; + + char nodename[__NEW_UTS_LEN + 1]; +}; + +extern const char *ceph_mds_op_name(int op); + +extern bool check_session_state(struct ceph_mds_session *s); +void inc_session_sequence(struct ceph_mds_session *s); + +extern struct ceph_mds_session * +__ceph_lookup_mds_session(struct ceph_mds_client *, int mds); + +extern const char *ceph_session_state_name(int s); + +extern struct ceph_mds_session * +ceph_get_mds_session(struct ceph_mds_session *s); +extern void ceph_put_mds_session(struct ceph_mds_session *s); + +extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc, + struct ceph_msg *msg, int mds); + +extern int ceph_mdsc_init(struct ceph_fs_client *fsc); +extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc); +extern void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc); +extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc); + +extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc); + +extern void ceph_invalidate_dir_request(struct ceph_mds_request *req); +extern int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, + struct inode *dir); +extern struct ceph_mds_request * +ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode); +extern int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, + struct inode *dir, + struct ceph_mds_request *req); +extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, + struct inode *dir, + struct ceph_mds_request *req); +extern void ceph_mdsc_release_dir_caps(struct ceph_mds_request *req); +extern void ceph_mdsc_release_dir_caps_no_check(struct ceph_mds_request *req); +static inline void ceph_mdsc_get_request(struct ceph_mds_request *req) +{ + kref_get(&req->r_kref); +} +extern void ceph_mdsc_release_request(struct kref *kref); +static inline void ceph_mdsc_put_request(struct ceph_mds_request *req) +{ + kref_put(&req->r_kref, ceph_mdsc_release_request); +} + +extern void send_flush_mdlog(struct ceph_mds_session *s); +extern void ceph_mdsc_iterate_sessions(struct ceph_mds_client *mdsc, + void (*cb)(struct ceph_mds_session *), + bool check_state); +extern struct ceph_msg *ceph_create_session_msg(u32 op, u64 seq); +extern void __ceph_queue_cap_release(struct ceph_mds_session *session, + struct ceph_cap *cap); +extern void ceph_flush_cap_releases(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session); +extern void ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc); +extern void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr); +extern int ceph_iterate_session_caps(struct ceph_mds_session *session, + int (*cb)(struct inode *, + struct ceph_cap *, void *), + void *arg); +extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc); + +static inline void ceph_mdsc_free_path(char *path, int len) +{ + if (!IS_ERR_OR_NULL(path)) + __putname(path - (PATH_MAX - 1 - len)); +} + +extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base, + int stop_on_nosnap); + +extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry); +extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, + struct dentry *dentry, char action, + u32 seq); + +extern void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, + struct ceph_msg *msg); +extern void ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc, + struct ceph_msg *msg); + +extern struct ceph_mds_session * +ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target); +extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session); + +extern int ceph_trim_caps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + int max_caps); + +static inline int ceph_wait_on_async_create(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + return wait_on_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT, + TASK_INTERRUPTIBLE); +} + +extern u64 ceph_get_deleg_ino(struct ceph_mds_session *session); +extern int ceph_restore_deleg_ino(struct ceph_mds_session *session, u64 ino); +#endif diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c new file mode 100644 index 000000000..47f2903ba --- /dev/null +++ b/fs/ceph/mdsmap.c @@ -0,0 +1,419 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ceph/ceph_debug.h> + +#include <linux/bug.h> +#include <linux/err.h> +#include <linux/random.h> +#include <linux/slab.h> +#include <linux/types.h> + +#include <linux/ceph/mdsmap.h> +#include <linux/ceph/messenger.h> +#include <linux/ceph/decode.h> + +#include "super.h" + +#define CEPH_MDS_IS_READY(i, ignore_laggy) \ + (m->m_info[i].state > 0 && ignore_laggy ? true : !m->m_info[i].laggy) + +static int __mdsmap_get_random_mds(struct ceph_mdsmap *m, bool ignore_laggy) +{ + int n = 0; + int i, j; + + /* count */ + for (i = 0; i < m->possible_max_rank; i++) + if (CEPH_MDS_IS_READY(i, ignore_laggy)) + n++; + if (n == 0) + return -1; + + /* pick */ + n = prandom_u32() % n; + for (j = 0, i = 0; i < m->possible_max_rank; i++) { + if (CEPH_MDS_IS_READY(i, ignore_laggy)) + j++; + if (j > n) + break; + } + + return i; +} + +/* + * choose a random mds that is "up" (i.e. has a state > 0), or -1. + */ +int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m) +{ + int mds; + + mds = __mdsmap_get_random_mds(m, false); + if (mds == m->possible_max_rank || mds == -1) + mds = __mdsmap_get_random_mds(m, true); + + return mds == m->possible_max_rank ? -1 : mds; +} + +#define __decode_and_drop_type(p, end, type, bad) \ + do { \ + if (*p + sizeof(type) > end) \ + goto bad; \ + *p += sizeof(type); \ + } while (0) + +#define __decode_and_drop_set(p, end, type, bad) \ + do { \ + u32 n; \ + size_t need; \ + ceph_decode_32_safe(p, end, n, bad); \ + need = sizeof(type) * n; \ + ceph_decode_need(p, end, need, bad); \ + *p += need; \ + } while (0) + +#define __decode_and_drop_map(p, end, ktype, vtype, bad) \ + do { \ + u32 n; \ + size_t need; \ + ceph_decode_32_safe(p, end, n, bad); \ + need = (sizeof(ktype) + sizeof(vtype)) * n; \ + ceph_decode_need(p, end, need, bad); \ + *p += need; \ + } while (0) + + +static int __decode_and_drop_compat_set(void **p, void* end) +{ + int i; + /* compat, ro_compat, incompat*/ + for (i = 0; i < 3; i++) { + u32 n; + ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad); + /* mask */ + *p += sizeof(u64); + /* names (map<u64, string>) */ + n = ceph_decode_32(p); + while (n-- > 0) { + u32 len; + ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), + bad); + *p += sizeof(u64); + len = ceph_decode_32(p); + ceph_decode_need(p, end, len, bad); + *p += len; + } + } + return 0; +bad: + return -1; +} + +/* + * Decode an MDS map + * + * Ignore any fields we don't care about (there are quite a few of + * them). + */ +struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) +{ + struct ceph_mdsmap *m; + const void *start = *p; + int i, j, n; + int err; + u8 mdsmap_v; + u16 mdsmap_ev; + + m = kzalloc(sizeof(*m), GFP_NOFS); + if (!m) + return ERR_PTR(-ENOMEM); + + ceph_decode_need(p, end, 1 + 1, bad); + mdsmap_v = ceph_decode_8(p); + *p += sizeof(u8); /* mdsmap_cv */ + if (mdsmap_v >= 4) { + u32 mdsmap_len; + ceph_decode_32_safe(p, end, mdsmap_len, bad); + if (end < *p + mdsmap_len) + goto bad; + end = *p + mdsmap_len; + } + + ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad); + m->m_epoch = ceph_decode_32(p); + m->m_client_epoch = ceph_decode_32(p); + m->m_last_failure = ceph_decode_32(p); + m->m_root = ceph_decode_32(p); + m->m_session_timeout = ceph_decode_32(p); + m->m_session_autoclose = ceph_decode_32(p); + m->m_max_file_size = ceph_decode_64(p); + m->m_max_mds = ceph_decode_32(p); + + /* + * pick out the active nodes as the m_num_active_mds, the + * m_num_active_mds maybe larger than m_max_mds when decreasing + * the max_mds in cluster side, in other case it should less + * than or equal to m_max_mds. + */ + m->m_num_active_mds = n = ceph_decode_32(p); + + /* + * the possible max rank, it maybe larger than the m_num_active_mds, + * for example if the mds_max == 2 in the cluster, when the MDS(0) + * was laggy and being replaced by a new MDS, we will temporarily + * receive a new mds map with n_num_mds == 1 and the active MDS(1), + * and the mds rank >= m_num_active_mds. + */ + m->possible_max_rank = max(m->m_num_active_mds, m->m_max_mds); + + m->m_info = kcalloc(m->possible_max_rank, sizeof(*m->m_info), GFP_NOFS); + if (!m->m_info) + goto nomem; + + /* pick out active nodes from mds_info (state > 0) */ + for (i = 0; i < n; i++) { + u64 global_id; + u32 namelen; + s32 mds, inc, state; + u8 info_v; + void *info_end = NULL; + struct ceph_entity_addr addr; + u32 num_export_targets; + void *pexport_targets = NULL; + struct ceph_timespec laggy_since; + struct ceph_mds_info *info; + bool laggy; + + ceph_decode_need(p, end, sizeof(u64) + 1, bad); + global_id = ceph_decode_64(p); + info_v= ceph_decode_8(p); + if (info_v >= 4) { + u32 info_len; + ceph_decode_need(p, end, 1 + sizeof(u32), bad); + *p += sizeof(u8); /* info_cv */ + info_len = ceph_decode_32(p); + info_end = *p + info_len; + if (info_end > end) + goto bad; + } + + ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad); + *p += sizeof(u64); + namelen = ceph_decode_32(p); /* skip mds name */ + *p += namelen; + + ceph_decode_need(p, end, + 4*sizeof(u32) + sizeof(u64) + + sizeof(addr) + sizeof(struct ceph_timespec), + bad); + mds = ceph_decode_32(p); + inc = ceph_decode_32(p); + state = ceph_decode_32(p); + *p += sizeof(u64); /* state_seq */ + err = ceph_decode_entity_addr(p, end, &addr); + if (err) + goto corrupt; + ceph_decode_copy(p, &laggy_since, sizeof(laggy_since)); + laggy = laggy_since.tv_sec != 0 || laggy_since.tv_nsec != 0; + *p += sizeof(u32); + ceph_decode_32_safe(p, end, namelen, bad); + *p += namelen; + if (info_v >= 2) { + ceph_decode_32_safe(p, end, num_export_targets, bad); + pexport_targets = *p; + *p += num_export_targets * sizeof(u32); + } else { + num_export_targets = 0; + } + + if (info_end && *p != info_end) { + if (*p > info_end) + goto bad; + *p = info_end; + } + + dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s%s\n", + i+1, n, global_id, mds, inc, + ceph_pr_addr(&addr), + ceph_mds_state_name(state), + laggy ? "(laggy)" : ""); + + if (mds < 0 || mds >= m->possible_max_rank) { + pr_warn("mdsmap_decode got incorrect mds(%d)\n", mds); + continue; + } + + if (state <= 0) { + dout("mdsmap_decode got incorrect state(%s)\n", + ceph_mds_state_name(state)); + continue; + } + + info = &m->m_info[mds]; + info->global_id = global_id; + info->state = state; + info->addr = addr; + info->laggy = laggy; + info->num_export_targets = num_export_targets; + if (num_export_targets) { + info->export_targets = kcalloc(num_export_targets, + sizeof(u32), GFP_NOFS); + if (!info->export_targets) + goto nomem; + for (j = 0; j < num_export_targets; j++) + info->export_targets[j] = + ceph_decode_32(&pexport_targets); + } else { + info->export_targets = NULL; + } + } + + /* pg_pools */ + ceph_decode_32_safe(p, end, n, bad); + m->m_num_data_pg_pools = n; + m->m_data_pg_pools = kcalloc(n, sizeof(u64), GFP_NOFS); + if (!m->m_data_pg_pools) + goto nomem; + ceph_decode_need(p, end, sizeof(u64)*(n+1), bad); + for (i = 0; i < n; i++) + m->m_data_pg_pools[i] = ceph_decode_64(p); + m->m_cas_pg_pool = ceph_decode_64(p); + m->m_enabled = m->m_epoch > 1; + + mdsmap_ev = 1; + if (mdsmap_v >= 2) { + ceph_decode_16_safe(p, end, mdsmap_ev, bad_ext); + } + if (mdsmap_ev >= 3) { + if (__decode_and_drop_compat_set(p, end) < 0) + goto bad_ext; + } + /* metadata_pool */ + if (mdsmap_ev < 5) { + __decode_and_drop_type(p, end, u32, bad_ext); + } else { + __decode_and_drop_type(p, end, u64, bad_ext); + } + + /* created + modified + tableserver */ + __decode_and_drop_type(p, end, struct ceph_timespec, bad_ext); + __decode_and_drop_type(p, end, struct ceph_timespec, bad_ext); + __decode_and_drop_type(p, end, u32, bad_ext); + + /* in */ + { + int num_laggy = 0; + ceph_decode_32_safe(p, end, n, bad_ext); + ceph_decode_need(p, end, sizeof(u32) * n, bad_ext); + + for (i = 0; i < n; i++) { + s32 mds = ceph_decode_32(p); + if (mds >= 0 && mds < m->possible_max_rank) { + if (m->m_info[mds].laggy) + num_laggy++; + } + } + m->m_num_laggy = num_laggy; + + if (n > m->possible_max_rank) { + void *new_m_info = krealloc(m->m_info, + n * sizeof(*m->m_info), + GFP_NOFS | __GFP_ZERO); + if (!new_m_info) + goto nomem; + m->m_info = new_m_info; + } + m->possible_max_rank = n; + } + + /* inc */ + __decode_and_drop_map(p, end, u32, u32, bad_ext); + /* up */ + __decode_and_drop_map(p, end, u32, u64, bad_ext); + /* failed */ + __decode_and_drop_set(p, end, u32, bad_ext); + /* stopped */ + __decode_and_drop_set(p, end, u32, bad_ext); + + if (mdsmap_ev >= 4) { + /* last_failure_osd_epoch */ + __decode_and_drop_type(p, end, u32, bad_ext); + } + if (mdsmap_ev >= 6) { + /* ever_allowed_snaps */ + __decode_and_drop_type(p, end, u8, bad_ext); + /* explicitly_allowed_snaps */ + __decode_and_drop_type(p, end, u8, bad_ext); + } + if (mdsmap_ev >= 7) { + /* inline_data_enabled */ + __decode_and_drop_type(p, end, u8, bad_ext); + } + if (mdsmap_ev >= 8) { + u32 name_len; + /* enabled */ + ceph_decode_8_safe(p, end, m->m_enabled, bad_ext); + ceph_decode_32_safe(p, end, name_len, bad_ext); + ceph_decode_need(p, end, name_len, bad_ext); + *p += name_len; + } + /* damaged */ + if (mdsmap_ev >= 9) { + size_t need; + ceph_decode_32_safe(p, end, n, bad_ext); + need = sizeof(u32) * n; + ceph_decode_need(p, end, need, bad_ext); + *p += need; + m->m_damaged = n > 0; + } else { + m->m_damaged = false; + } +bad_ext: + dout("mdsmap_decode m_enabled: %d, m_damaged: %d, m_num_laggy: %d\n", + !!m->m_enabled, !!m->m_damaged, m->m_num_laggy); + *p = end; + dout("mdsmap_decode success epoch %u\n", m->m_epoch); + return m; +nomem: + err = -ENOMEM; + goto out_err; +corrupt: + pr_err("corrupt mdsmap\n"); + print_hex_dump(KERN_DEBUG, "mdsmap: ", + DUMP_PREFIX_OFFSET, 16, 1, + start, end - start, true); +out_err: + ceph_mdsmap_destroy(m); + return ERR_PTR(err); +bad: + err = -EINVAL; + goto corrupt; +} + +void ceph_mdsmap_destroy(struct ceph_mdsmap *m) +{ + int i; + + if (m->m_info) { + for (i = 0; i < m->possible_max_rank; i++) + kfree(m->m_info[i].export_targets); + kfree(m->m_info); + } + kfree(m->m_data_pg_pools); + kfree(m); +} + +bool ceph_mdsmap_is_cluster_available(struct ceph_mdsmap *m) +{ + int i, nr_active = 0; + if (!m->m_enabled) + return false; + if (m->m_damaged) + return false; + if (m->m_num_laggy == m->m_num_active_mds) + return false; + for (i = 0; i < m->possible_max_rank; i++) { + if (m->m_info[i].state == CEPH_MDS_STATE_ACTIVE) + nr_active++; + } + return nr_active > 0; +} diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c new file mode 100644 index 000000000..906e446ab --- /dev/null +++ b/fs/ceph/metric.c @@ -0,0 +1,310 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/ceph/ceph_debug.h> + +#include <linux/types.h> +#include <linux/percpu_counter.h> +#include <linux/math64.h> + +#include "metric.h" +#include "mds_client.h" + +static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, + struct ceph_mds_session *s) +{ + struct ceph_metric_head *head; + struct ceph_metric_cap *cap; + struct ceph_metric_read_latency *read; + struct ceph_metric_write_latency *write; + struct ceph_metric_metadata_latency *meta; + struct ceph_client_metric *m = &mdsc->metric; + u64 nr_caps = atomic64_read(&m->total_caps); + struct ceph_msg *msg; + struct timespec64 ts; + s64 sum; + s32 items = 0; + s32 len; + + len = sizeof(*head) + sizeof(*cap) + sizeof(*read) + sizeof(*write) + + sizeof(*meta); + + msg = ceph_msg_new(CEPH_MSG_CLIENT_METRICS, len, GFP_NOFS, true); + if (!msg) { + pr_err("send metrics to mds%d, failed to allocate message\n", + s->s_mds); + return false; + } + + head = msg->front.iov_base; + + /* encode the cap metric */ + cap = (struct ceph_metric_cap *)(head + 1); + cap->type = cpu_to_le32(CLIENT_METRIC_TYPE_CAP_INFO); + cap->ver = 1; + cap->compat = 1; + cap->data_len = cpu_to_le32(sizeof(*cap) - 10); + cap->hit = cpu_to_le64(percpu_counter_sum(&mdsc->metric.i_caps_hit)); + cap->mis = cpu_to_le64(percpu_counter_sum(&mdsc->metric.i_caps_mis)); + cap->total = cpu_to_le64(nr_caps); + items++; + + /* encode the read latency metric */ + read = (struct ceph_metric_read_latency *)(cap + 1); + read->type = cpu_to_le32(CLIENT_METRIC_TYPE_READ_LATENCY); + read->ver = 1; + read->compat = 1; + read->data_len = cpu_to_le32(sizeof(*read) - 10); + sum = m->read_latency_sum; + jiffies_to_timespec64(sum, &ts); + read->sec = cpu_to_le32(ts.tv_sec); + read->nsec = cpu_to_le32(ts.tv_nsec); + items++; + + /* encode the write latency metric */ + write = (struct ceph_metric_write_latency *)(read + 1); + write->type = cpu_to_le32(CLIENT_METRIC_TYPE_WRITE_LATENCY); + write->ver = 1; + write->compat = 1; + write->data_len = cpu_to_le32(sizeof(*write) - 10); + sum = m->write_latency_sum; + jiffies_to_timespec64(sum, &ts); + write->sec = cpu_to_le32(ts.tv_sec); + write->nsec = cpu_to_le32(ts.tv_nsec); + items++; + + /* encode the metadata latency metric */ + meta = (struct ceph_metric_metadata_latency *)(write + 1); + meta->type = cpu_to_le32(CLIENT_METRIC_TYPE_METADATA_LATENCY); + meta->ver = 1; + meta->compat = 1; + meta->data_len = cpu_to_le32(sizeof(*meta) - 10); + sum = m->metadata_latency_sum; + jiffies_to_timespec64(sum, &ts); + meta->sec = cpu_to_le32(ts.tv_sec); + meta->nsec = cpu_to_le32(ts.tv_nsec); + items++; + + put_unaligned_le32(items, &head->num); + msg->front.iov_len = len; + msg->hdr.version = cpu_to_le16(1); + msg->hdr.compat_version = cpu_to_le16(1); + msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); + dout("client%llu send metrics to mds%d\n", + ceph_client_gid(mdsc->fsc->client), s->s_mds); + ceph_con_send(&s->s_con, msg); + + return true; +} + + +static void metric_get_session(struct ceph_mds_client *mdsc) +{ + struct ceph_mds_session *s; + int i; + + mutex_lock(&mdsc->mutex); + for (i = 0; i < mdsc->max_sessions; i++) { + s = __ceph_lookup_mds_session(mdsc, i); + if (!s) + continue; + + /* + * Skip it if MDS doesn't support the metric collection, + * or the MDS will close the session's socket connection + * directly when it get this message. + */ + if (check_session_state(s) && + test_bit(CEPHFS_FEATURE_METRIC_COLLECT, &s->s_features)) { + mdsc->metric.session = s; + break; + } + + ceph_put_mds_session(s); + } + mutex_unlock(&mdsc->mutex); +} + +static void metric_delayed_work(struct work_struct *work) +{ + struct ceph_client_metric *m = + container_of(work, struct ceph_client_metric, delayed_work.work); + struct ceph_mds_client *mdsc = + container_of(m, struct ceph_mds_client, metric); + + if (mdsc->stopping || disable_send_metrics) + return; + + if (!m->session || !check_session_state(m->session)) { + if (m->session) { + ceph_put_mds_session(m->session); + m->session = NULL; + } + metric_get_session(mdsc); + } + if (m->session) { + ceph_mdsc_send_metrics(mdsc, m->session); + metric_schedule_delayed(m); + } +} + +int ceph_metric_init(struct ceph_client_metric *m) +{ + int ret; + + if (!m) + return -EINVAL; + + atomic64_set(&m->total_dentries, 0); + ret = percpu_counter_init(&m->d_lease_hit, 0, GFP_KERNEL); + if (ret) + return ret; + + ret = percpu_counter_init(&m->d_lease_mis, 0, GFP_KERNEL); + if (ret) + goto err_d_lease_mis; + + atomic64_set(&m->total_caps, 0); + ret = percpu_counter_init(&m->i_caps_hit, 0, GFP_KERNEL); + if (ret) + goto err_i_caps_hit; + + ret = percpu_counter_init(&m->i_caps_mis, 0, GFP_KERNEL); + if (ret) + goto err_i_caps_mis; + + spin_lock_init(&m->read_latency_lock); + m->read_latency_sq_sum = 0; + m->read_latency_min = KTIME_MAX; + m->read_latency_max = 0; + m->total_reads = 0; + m->read_latency_sum = 0; + + spin_lock_init(&m->write_latency_lock); + m->write_latency_sq_sum = 0; + m->write_latency_min = KTIME_MAX; + m->write_latency_max = 0; + m->total_writes = 0; + m->write_latency_sum = 0; + + spin_lock_init(&m->metadata_latency_lock); + m->metadata_latency_sq_sum = 0; + m->metadata_latency_min = KTIME_MAX; + m->metadata_latency_max = 0; + m->total_metadatas = 0; + m->metadata_latency_sum = 0; + + atomic64_set(&m->opened_files, 0); + ret = percpu_counter_init(&m->opened_inodes, 0, GFP_KERNEL); + if (ret) + goto err_opened_inodes; + ret = percpu_counter_init(&m->total_inodes, 0, GFP_KERNEL); + if (ret) + goto err_total_inodes; + + m->session = NULL; + INIT_DELAYED_WORK(&m->delayed_work, metric_delayed_work); + + return 0; + +err_total_inodes: + percpu_counter_destroy(&m->opened_inodes); +err_opened_inodes: + percpu_counter_destroy(&m->i_caps_mis); +err_i_caps_mis: + percpu_counter_destroy(&m->i_caps_hit); +err_i_caps_hit: + percpu_counter_destroy(&m->d_lease_mis); +err_d_lease_mis: + percpu_counter_destroy(&m->d_lease_hit); + + return ret; +} + +void ceph_metric_destroy(struct ceph_client_metric *m) +{ + if (!m) + return; + + cancel_delayed_work_sync(&m->delayed_work); + + percpu_counter_destroy(&m->total_inodes); + percpu_counter_destroy(&m->opened_inodes); + percpu_counter_destroy(&m->i_caps_mis); + percpu_counter_destroy(&m->i_caps_hit); + percpu_counter_destroy(&m->d_lease_mis); + percpu_counter_destroy(&m->d_lease_hit); + + ceph_put_mds_session(m->session); +} + +static inline void __update_latency(ktime_t *totalp, ktime_t *lsump, + ktime_t *min, ktime_t *max, + ktime_t *sq_sump, ktime_t lat) +{ + ktime_t total, avg, sq, lsum; + + total = ++(*totalp); + lsum = (*lsump += lat); + + if (unlikely(lat < *min)) + *min = lat; + if (unlikely(lat > *max)) + *max = lat; + + if (unlikely(total == 1)) + return; + + /* the sq is (lat - old_avg) * (lat - new_avg) */ + avg = DIV64_U64_ROUND_CLOSEST((lsum - lat), (total - 1)); + sq = lat - avg; + avg = DIV64_U64_ROUND_CLOSEST(lsum, total); + sq = sq * (lat - avg); + *sq_sump += sq; +} + +void ceph_update_read_latency(struct ceph_client_metric *m, + ktime_t r_start, ktime_t r_end, + int rc) +{ + ktime_t lat = ktime_sub(r_end, r_start); + + if (unlikely(rc < 0 && rc != -ENOENT && rc != -ETIMEDOUT)) + return; + + spin_lock(&m->read_latency_lock); + __update_latency(&m->total_reads, &m->read_latency_sum, + &m->read_latency_min, &m->read_latency_max, + &m->read_latency_sq_sum, lat); + spin_unlock(&m->read_latency_lock); +} + +void ceph_update_write_latency(struct ceph_client_metric *m, + ktime_t r_start, ktime_t r_end, + int rc) +{ + ktime_t lat = ktime_sub(r_end, r_start); + + if (unlikely(rc && rc != -ETIMEDOUT)) + return; + + spin_lock(&m->write_latency_lock); + __update_latency(&m->total_writes, &m->write_latency_sum, + &m->write_latency_min, &m->write_latency_max, + &m->write_latency_sq_sum, lat); + spin_unlock(&m->write_latency_lock); +} + +void ceph_update_metadata_latency(struct ceph_client_metric *m, + ktime_t r_start, ktime_t r_end, + int rc) +{ + ktime_t lat = ktime_sub(r_end, r_start); + + if (unlikely(rc && rc != -ENOENT)) + return; + + spin_lock(&m->metadata_latency_lock); + __update_latency(&m->total_metadatas, &m->metadata_latency_sum, + &m->metadata_latency_min, &m->metadata_latency_max, + &m->metadata_latency_sq_sum, lat); + spin_unlock(&m->metadata_latency_lock); +} diff --git a/fs/ceph/metric.h b/fs/ceph/metric.h new file mode 100644 index 000000000..710f3f1dc --- /dev/null +++ b/fs/ceph/metric.h @@ -0,0 +1,160 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _FS_CEPH_MDS_METRIC_H +#define _FS_CEPH_MDS_METRIC_H + +#include <linux/types.h> +#include <linux/percpu_counter.h> +#include <linux/ktime.h> + +extern bool disable_send_metrics; + +enum ceph_metric_type { + CLIENT_METRIC_TYPE_CAP_INFO, + CLIENT_METRIC_TYPE_READ_LATENCY, + CLIENT_METRIC_TYPE_WRITE_LATENCY, + CLIENT_METRIC_TYPE_METADATA_LATENCY, + CLIENT_METRIC_TYPE_DENTRY_LEASE, + + CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_DENTRY_LEASE, +}; + +/* + * This will always have the highest metric bit value + * as the last element of the array. + */ +#define CEPHFS_METRIC_SPEC_CLIENT_SUPPORTED { \ + CLIENT_METRIC_TYPE_CAP_INFO, \ + CLIENT_METRIC_TYPE_READ_LATENCY, \ + CLIENT_METRIC_TYPE_WRITE_LATENCY, \ + CLIENT_METRIC_TYPE_METADATA_LATENCY, \ + \ + CLIENT_METRIC_TYPE_MAX, \ +} + +/* metric caps header */ +struct ceph_metric_cap { + __le32 type; /* ceph metric type */ + + __u8 ver; + __u8 compat; + + __le32 data_len; /* length of sizeof(hit + mis + total) */ + __le64 hit; + __le64 mis; + __le64 total; +} __packed; + +/* metric read latency header */ +struct ceph_metric_read_latency { + __le32 type; /* ceph metric type */ + + __u8 ver; + __u8 compat; + + __le32 data_len; /* length of sizeof(sec + nsec) */ + __le32 sec; + __le32 nsec; +} __packed; + +/* metric write latency header */ +struct ceph_metric_write_latency { + __le32 type; /* ceph metric type */ + + __u8 ver; + __u8 compat; + + __le32 data_len; /* length of sizeof(sec + nsec) */ + __le32 sec; + __le32 nsec; +} __packed; + +/* metric metadata latency header */ +struct ceph_metric_metadata_latency { + __le32 type; /* ceph metric type */ + + __u8 ver; + __u8 compat; + + __le32 data_len; /* length of sizeof(sec + nsec) */ + __le32 sec; + __le32 nsec; +} __packed; + +struct ceph_metric_head { + __le32 num; /* the number of metrics that will be sent */ +} __packed; + +/* This is the global metrics */ +struct ceph_client_metric { + atomic64_t total_dentries; + struct percpu_counter d_lease_hit; + struct percpu_counter d_lease_mis; + + atomic64_t total_caps; + struct percpu_counter i_caps_hit; + struct percpu_counter i_caps_mis; + + spinlock_t read_latency_lock; + u64 total_reads; + ktime_t read_latency_sum; + ktime_t read_latency_sq_sum; + ktime_t read_latency_min; + ktime_t read_latency_max; + + spinlock_t write_latency_lock; + u64 total_writes; + ktime_t write_latency_sum; + ktime_t write_latency_sq_sum; + ktime_t write_latency_min; + ktime_t write_latency_max; + + spinlock_t metadata_latency_lock; + u64 total_metadatas; + ktime_t metadata_latency_sum; + ktime_t metadata_latency_sq_sum; + ktime_t metadata_latency_min; + ktime_t metadata_latency_max; + + /* The total number of directories and files that are opened */ + atomic64_t opened_files; + + /* The total number of inodes that have opened files or directories */ + struct percpu_counter opened_inodes; + struct percpu_counter total_inodes; + + struct ceph_mds_session *session; + struct delayed_work delayed_work; /* delayed work */ +}; + +static inline void metric_schedule_delayed(struct ceph_client_metric *m) +{ + if (disable_send_metrics) + return; + + /* per second */ + schedule_delayed_work(&m->delayed_work, round_jiffies_relative(HZ)); +} + +extern int ceph_metric_init(struct ceph_client_metric *m); +extern void ceph_metric_destroy(struct ceph_client_metric *m); + +static inline void ceph_update_cap_hit(struct ceph_client_metric *m) +{ + percpu_counter_inc(&m->i_caps_hit); +} + +static inline void ceph_update_cap_mis(struct ceph_client_metric *m) +{ + percpu_counter_inc(&m->i_caps_mis); +} + +extern void ceph_update_read_latency(struct ceph_client_metric *m, + ktime_t r_start, ktime_t r_end, + int rc); +extern void ceph_update_write_latency(struct ceph_client_metric *m, + ktime_t r_start, ktime_t r_end, + int rc); +extern void ceph_update_metadata_latency(struct ceph_client_metric *m, + ktime_t r_start, ktime_t r_end, + int rc); +#endif /* _FS_CEPH_MDS_METRIC_H */ diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c new file mode 100644 index 000000000..9b785f11e --- /dev/null +++ b/fs/ceph/quota.c @@ -0,0 +1,574 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * quota.c - CephFS quota + * + * Copyright (C) 2017-2018 SUSE + */ + +#include <linux/statfs.h> + +#include "super.h" +#include "mds_client.h" + +void ceph_adjust_quota_realms_count(struct inode *inode, bool inc) +{ + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); + if (inc) + atomic64_inc(&mdsc->quotarealms_count); + else + atomic64_dec(&mdsc->quotarealms_count); +} + +static inline bool ceph_has_realms_with_quotas(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb); + struct inode *root = d_inode(sb->s_root); + + if (atomic64_read(&mdsc->quotarealms_count) > 0) + return true; + /* if root is the real CephFS root, we don't have quota realms */ + if (root && ceph_ino(root) == CEPH_INO_ROOT) + return false; + /* otherwise, we can't know for sure */ + return true; +} + +void ceph_handle_quota(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + struct ceph_msg *msg) +{ + struct super_block *sb = mdsc->fsc->sb; + struct ceph_mds_quota *h = msg->front.iov_base; + struct ceph_vino vino; + struct inode *inode; + struct ceph_inode_info *ci; + + if (msg->front.iov_len < sizeof(*h)) { + pr_err("%s corrupt message mds%d len %d\n", __func__, + session->s_mds, (int)msg->front.iov_len); + ceph_msg_dump(msg); + return; + } + + /* increment msg sequence number */ + mutex_lock(&session->s_mutex); + inc_session_sequence(session); + mutex_unlock(&session->s_mutex); + + /* lookup inode */ + vino.ino = le64_to_cpu(h->ino); + vino.snap = CEPH_NOSNAP; + inode = ceph_find_inode(sb, vino); + if (!inode) { + pr_warn("Failed to find inode %llu\n", vino.ino); + return; + } + ci = ceph_inode(inode); + + spin_lock(&ci->i_ceph_lock); + ci->i_rbytes = le64_to_cpu(h->rbytes); + ci->i_rfiles = le64_to_cpu(h->rfiles); + ci->i_rsubdirs = le64_to_cpu(h->rsubdirs); + __ceph_update_quota(ci, le64_to_cpu(h->max_bytes), + le64_to_cpu(h->max_files)); + spin_unlock(&ci->i_ceph_lock); + + /* avoid calling iput_final() in dispatch thread */ + ceph_async_iput(inode); +} + +static struct ceph_quotarealm_inode * +find_quotarealm_inode(struct ceph_mds_client *mdsc, u64 ino) +{ + struct ceph_quotarealm_inode *qri = NULL; + struct rb_node **node, *parent = NULL; + + mutex_lock(&mdsc->quotarealms_inodes_mutex); + node = &(mdsc->quotarealms_inodes.rb_node); + while (*node) { + parent = *node; + qri = container_of(*node, struct ceph_quotarealm_inode, node); + + if (ino < qri->ino) + node = &((*node)->rb_left); + else if (ino > qri->ino) + node = &((*node)->rb_right); + else + break; + } + if (!qri || (qri->ino != ino)) { + /* Not found, create a new one and insert it */ + qri = kmalloc(sizeof(*qri), GFP_KERNEL); + if (qri) { + qri->ino = ino; + qri->inode = NULL; + qri->timeout = 0; + mutex_init(&qri->mutex); + rb_link_node(&qri->node, parent, node); + rb_insert_color(&qri->node, &mdsc->quotarealms_inodes); + } else + pr_warn("Failed to alloc quotarealms_inode\n"); + } + mutex_unlock(&mdsc->quotarealms_inodes_mutex); + + return qri; +} + +/* + * This function will try to lookup a realm inode which isn't visible in the + * filesystem mountpoint. A list of these kind of inodes (not visible) is + * maintained in the mdsc and freed only when the filesystem is umounted. + * + * Note that these inodes are kept in this list even if the lookup fails, which + * allows to prevent useless lookup requests. + */ +static struct inode *lookup_quotarealm_inode(struct ceph_mds_client *mdsc, + struct super_block *sb, + struct ceph_snap_realm *realm) +{ + struct ceph_quotarealm_inode *qri; + struct inode *in; + + qri = find_quotarealm_inode(mdsc, realm->ino); + if (!qri) + return NULL; + + mutex_lock(&qri->mutex); + if (qri->inode && ceph_is_any_caps(qri->inode)) { + /* A request has already returned the inode */ + mutex_unlock(&qri->mutex); + return qri->inode; + } + /* Check if this inode lookup has failed recently */ + if (qri->timeout && + time_before_eq(jiffies, qri->timeout)) { + mutex_unlock(&qri->mutex); + return NULL; + } + if (qri->inode) { + /* get caps */ + int ret = __ceph_do_getattr(qri->inode, NULL, + CEPH_STAT_CAP_INODE, true); + if (ret >= 0) + in = qri->inode; + else + in = ERR_PTR(ret); + } else { + in = ceph_lookup_inode(sb, realm->ino); + } + + if (IS_ERR(in)) { + dout("Can't lookup inode %llx (err: %ld)\n", + realm->ino, PTR_ERR(in)); + qri->timeout = jiffies + msecs_to_jiffies(60 * 1000); /* XXX */ + } else { + qri->timeout = 0; + qri->inode = in; + } + mutex_unlock(&qri->mutex); + + return in; +} + +void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc) +{ + struct ceph_quotarealm_inode *qri; + struct rb_node *node; + + /* + * It should now be safe to clean quotarealms_inode tree without holding + * mdsc->quotarealms_inodes_mutex... + */ + mutex_lock(&mdsc->quotarealms_inodes_mutex); + while (!RB_EMPTY_ROOT(&mdsc->quotarealms_inodes)) { + node = rb_first(&mdsc->quotarealms_inodes); + qri = rb_entry(node, struct ceph_quotarealm_inode, node); + rb_erase(node, &mdsc->quotarealms_inodes); + iput(qri->inode); + kfree(qri); + } + mutex_unlock(&mdsc->quotarealms_inodes_mutex); +} + +/* + * This function walks through the snaprealm for an inode and returns the + * ceph_snap_realm for the first snaprealm that has quotas set (either max_files + * or max_bytes). If the root is reached, return the root ceph_snap_realm + * instead. + * + * Note that the caller is responsible for calling ceph_put_snap_realm() on the + * returned realm. + * + * Callers of this function need to hold mdsc->snap_rwsem. However, if there's + * a need to do an inode lookup, this rwsem will be temporarily dropped. Hence + * the 'retry' argument: if rwsem needs to be dropped and 'retry' is 'false' + * this function will return -EAGAIN; otherwise, the snaprealms walk-through + * will be restarted. + */ +static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc, + struct inode *inode, bool retry) +{ + struct ceph_inode_info *ci = NULL; + struct ceph_snap_realm *realm, *next; + struct inode *in; + bool has_quota; + + if (ceph_snap(inode) != CEPH_NOSNAP) + return NULL; + +restart: + realm = ceph_inode(inode)->i_snap_realm; + if (realm) + ceph_get_snap_realm(mdsc, realm); + else + pr_err_ratelimited("get_quota_realm: ino (%llx.%llx) " + "null i_snap_realm\n", ceph_vinop(inode)); + while (realm) { + bool has_inode; + + spin_lock(&realm->inodes_with_caps_lock); + has_inode = realm->inode; + in = has_inode ? igrab(realm->inode) : NULL; + spin_unlock(&realm->inodes_with_caps_lock); + if (has_inode && !in) + break; + if (!in) { + up_read(&mdsc->snap_rwsem); + in = lookup_quotarealm_inode(mdsc, inode->i_sb, realm); + down_read(&mdsc->snap_rwsem); + if (IS_ERR_OR_NULL(in)) + break; + ceph_put_snap_realm(mdsc, realm); + if (!retry) + return ERR_PTR(-EAGAIN); + goto restart; + } + + ci = ceph_inode(in); + has_quota = __ceph_has_any_quota(ci); + /* avoid calling iput_final() while holding mdsc->snap_rwsem */ + ceph_async_iput(in); + + next = realm->parent; + if (has_quota || !next) + return realm; + + ceph_get_snap_realm(mdsc, next); + ceph_put_snap_realm(mdsc, realm); + realm = next; + } + if (realm) + ceph_put_snap_realm(mdsc, realm); + + return NULL; +} + +static bool ceph_quota_is_same_realm(struct inode *old, struct inode *new) +{ + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(old->i_sb); + struct ceph_snap_realm *old_realm, *new_realm; + bool is_same; + +restart: + /* + * We need to lookup 2 quota realms atomically, i.e. with snap_rwsem. + * However, get_quota_realm may drop it temporarily. By setting the + * 'retry' parameter to 'false', we'll get -EAGAIN if the rwsem was + * dropped and we can then restart the whole operation. + */ + down_read(&mdsc->snap_rwsem); + old_realm = get_quota_realm(mdsc, old, true); + new_realm = get_quota_realm(mdsc, new, false); + if (PTR_ERR(new_realm) == -EAGAIN) { + up_read(&mdsc->snap_rwsem); + if (old_realm) + ceph_put_snap_realm(mdsc, old_realm); + goto restart; + } + is_same = (old_realm == new_realm); + up_read(&mdsc->snap_rwsem); + + if (old_realm) + ceph_put_snap_realm(mdsc, old_realm); + if (new_realm) + ceph_put_snap_realm(mdsc, new_realm); + + return is_same; +} + +enum quota_check_op { + QUOTA_CHECK_MAX_FILES_OP, /* check quota max_files limit */ + QUOTA_CHECK_MAX_BYTES_OP, /* check quota max_files limit */ + QUOTA_CHECK_MAX_BYTES_APPROACHING_OP /* check if quota max_files + limit is approaching */ +}; + +/* + * check_quota_exceeded() will walk up the snaprealm hierarchy and, for each + * realm, it will execute quota check operation defined by the 'op' parameter. + * The snaprealm walk is interrupted if the quota check detects that the quota + * is exceeded or if the root inode is reached. + */ +static bool check_quota_exceeded(struct inode *inode, enum quota_check_op op, + loff_t delta) +{ + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); + struct ceph_inode_info *ci; + struct ceph_snap_realm *realm, *next; + struct inode *in; + u64 max, rvalue; + bool exceeded = false; + + if (ceph_snap(inode) != CEPH_NOSNAP) + return false; + + down_read(&mdsc->snap_rwsem); +restart: + realm = ceph_inode(inode)->i_snap_realm; + if (realm) + ceph_get_snap_realm(mdsc, realm); + else + pr_err_ratelimited("check_quota_exceeded: ino (%llx.%llx) " + "null i_snap_realm\n", ceph_vinop(inode)); + while (realm) { + bool has_inode; + + spin_lock(&realm->inodes_with_caps_lock); + has_inode = realm->inode; + in = has_inode ? igrab(realm->inode) : NULL; + spin_unlock(&realm->inodes_with_caps_lock); + if (has_inode && !in) + break; + if (!in) { + up_read(&mdsc->snap_rwsem); + in = lookup_quotarealm_inode(mdsc, inode->i_sb, realm); + down_read(&mdsc->snap_rwsem); + if (IS_ERR_OR_NULL(in)) + break; + ceph_put_snap_realm(mdsc, realm); + goto restart; + } + ci = ceph_inode(in); + spin_lock(&ci->i_ceph_lock); + if (op == QUOTA_CHECK_MAX_FILES_OP) { + max = ci->i_max_files; + rvalue = ci->i_rfiles + ci->i_rsubdirs; + } else { + max = ci->i_max_bytes; + rvalue = ci->i_rbytes; + } + spin_unlock(&ci->i_ceph_lock); + switch (op) { + case QUOTA_CHECK_MAX_FILES_OP: + case QUOTA_CHECK_MAX_BYTES_OP: + exceeded = (max && (rvalue + delta > max)); + break; + case QUOTA_CHECK_MAX_BYTES_APPROACHING_OP: + if (max) { + if (rvalue >= max) + exceeded = true; + else { + /* + * when we're writing more that 1/16th + * of the available space + */ + exceeded = + (((max - rvalue) >> 4) < delta); + } + } + break; + default: + /* Shouldn't happen */ + pr_warn("Invalid quota check op (%d)\n", op); + exceeded = true; /* Just break the loop */ + } + /* avoid calling iput_final() while holding mdsc->snap_rwsem */ + ceph_async_iput(in); + + next = realm->parent; + if (exceeded || !next) + break; + ceph_get_snap_realm(mdsc, next); + ceph_put_snap_realm(mdsc, realm); + realm = next; + } + if (realm) + ceph_put_snap_realm(mdsc, realm); + up_read(&mdsc->snap_rwsem); + + return exceeded; +} + +/* + * ceph_quota_is_max_files_exceeded - check if we can create a new file + * @inode: directory where a new file is being created + * + * This functions returns true is max_files quota allows a new file to be + * created. It is necessary to walk through the snaprealm hierarchy (until the + * FS root) to check all realms with quotas set. + */ +bool ceph_quota_is_max_files_exceeded(struct inode *inode) +{ + if (!ceph_has_realms_with_quotas(inode)) + return false; + + WARN_ON(!S_ISDIR(inode->i_mode)); + + return check_quota_exceeded(inode, QUOTA_CHECK_MAX_FILES_OP, 1); +} + +/* + * ceph_quota_is_max_bytes_exceeded - check if we can write to a file + * @inode: inode being written + * @newsize: new size if write succeeds + * + * This functions returns true is max_bytes quota allows a file size to reach + * @newsize; it returns false otherwise. + */ +bool ceph_quota_is_max_bytes_exceeded(struct inode *inode, loff_t newsize) +{ + loff_t size = i_size_read(inode); + + if (!ceph_has_realms_with_quotas(inode)) + return false; + + /* return immediately if we're decreasing file size */ + if (newsize <= size) + return false; + + return check_quota_exceeded(inode, QUOTA_CHECK_MAX_BYTES_OP, (newsize - size)); +} + +/* + * ceph_quota_is_max_bytes_approaching - check if we're reaching max_bytes + * @inode: inode being written + * @newsize: new size if write succeeds + * + * This function returns true if the new file size @newsize will be consuming + * more than 1/16th of the available quota space; it returns false otherwise. + */ +bool ceph_quota_is_max_bytes_approaching(struct inode *inode, loff_t newsize) +{ + loff_t size = ceph_inode(inode)->i_reported_size; + + if (!ceph_has_realms_with_quotas(inode)) + return false; + + /* return immediately if we're decreasing file size */ + if (newsize <= size) + return false; + + return check_quota_exceeded(inode, QUOTA_CHECK_MAX_BYTES_APPROACHING_OP, + (newsize - size)); +} + +/* + * ceph_quota_update_statfs - if root has quota update statfs with quota status + * @fsc: filesystem client instance + * @buf: statfs to update + * + * If the mounted filesystem root has max_bytes quota set, update the filesystem + * statistics with the quota status. + * + * This function returns true if the stats have been updated, false otherwise. + */ +bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, struct kstatfs *buf) +{ + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_inode_info *ci; + struct ceph_snap_realm *realm; + struct inode *in; + u64 total = 0, used, free; + bool is_updated = false; + + down_read(&mdsc->snap_rwsem); + realm = get_quota_realm(mdsc, d_inode(fsc->sb->s_root), true); + up_read(&mdsc->snap_rwsem); + if (!realm) + return false; + + spin_lock(&realm->inodes_with_caps_lock); + in = realm->inode ? igrab(realm->inode) : NULL; + spin_unlock(&realm->inodes_with_caps_lock); + if (in) { + ci = ceph_inode(in); + spin_lock(&ci->i_ceph_lock); + if (ci->i_max_bytes) { + total = ci->i_max_bytes >> CEPH_BLOCK_SHIFT; + used = ci->i_rbytes >> CEPH_BLOCK_SHIFT; + /* It is possible for a quota to be exceeded. + * Report 'zero' in that case + */ + free = total > used ? total - used : 0; + } + spin_unlock(&ci->i_ceph_lock); + if (total) { + buf->f_blocks = total; + buf->f_bfree = free; + buf->f_bavail = free; + is_updated = true; + } + iput(in); + } + ceph_put_snap_realm(mdsc, realm); + + return is_updated; +} + +/* + * ceph_quota_check_rename - check if a rename can be executed + * @mdsc: MDS client instance + * @old: inode to be copied + * @new: destination inode (directory) + * + * This function verifies if a rename (e.g. moving a file or directory) can be + * executed. It forces an rstat update in the @new target directory (and in the + * source @old as well, if it's a directory). The actual check is done both for + * max_files and max_bytes. + * + * This function returns 0 if it's OK to do the rename, or, if quotas are + * exceeded, -EXDEV (if @old is a directory) or -EDQUOT. + */ +int ceph_quota_check_rename(struct ceph_mds_client *mdsc, + struct inode *old, struct inode *new) +{ + struct ceph_inode_info *ci_old = ceph_inode(old); + int ret = 0; + + if (ceph_quota_is_same_realm(old, new)) + return 0; + + /* + * Get the latest rstat for target directory (and for source, if a + * directory) + */ + ret = ceph_do_getattr(new, CEPH_STAT_RSTAT, false); + if (ret) + return ret; + + if (S_ISDIR(old->i_mode)) { + ret = ceph_do_getattr(old, CEPH_STAT_RSTAT, false); + if (ret) + return ret; + ret = check_quota_exceeded(new, QUOTA_CHECK_MAX_BYTES_OP, + ci_old->i_rbytes); + if (!ret) + ret = check_quota_exceeded(new, + QUOTA_CHECK_MAX_FILES_OP, + ci_old->i_rfiles + + ci_old->i_rsubdirs); + if (ret) + ret = -EXDEV; + } else { + ret = check_quota_exceeded(new, QUOTA_CHECK_MAX_BYTES_OP, + i_size_read(old)); + if (!ret) + ret = check_quota_exceeded(new, + QUOTA_CHECK_MAX_FILES_OP, 1); + if (ret) + ret = -EDQUOT; + } + + return ret; +} diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c new file mode 100644 index 000000000..db464682b --- /dev/null +++ b/fs/ceph/snap.c @@ -0,0 +1,1205 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ceph/ceph_debug.h> + +#include <linux/sort.h> +#include <linux/slab.h> +#include <linux/iversion.h> +#include "super.h" +#include "mds_client.h" +#include <linux/ceph/decode.h> + +/* unused map expires after 5 minutes */ +#define CEPH_SNAPID_MAP_TIMEOUT (5 * 60 * HZ) + +/* + * Snapshots in ceph are driven in large part by cooperation from the + * client. In contrast to local file systems or file servers that + * implement snapshots at a single point in the system, ceph's + * distributed access to storage requires clients to help decide + * whether a write logically occurs before or after a recently created + * snapshot. + * + * This provides a perfect instantanous client-wide snapshot. Between + * clients, however, snapshots may appear to be applied at slightly + * different points in time, depending on delays in delivering the + * snapshot notification. + * + * Snapshots are _not_ file system-wide. Instead, each snapshot + * applies to the subdirectory nested beneath some directory. This + * effectively divides the hierarchy into multiple "realms," where all + * of the files contained by each realm share the same set of + * snapshots. An individual realm's snap set contains snapshots + * explicitly created on that realm, as well as any snaps in its + * parent's snap set _after_ the point at which the parent became it's + * parent (due to, say, a rename). Similarly, snaps from prior parents + * during the time intervals during which they were the parent are included. + * + * The client is spared most of this detail, fortunately... it must only + * maintains a hierarchy of realms reflecting the current parent/child + * realm relationship, and for each realm has an explicit list of snaps + * inherited from prior parents. + * + * A snap_realm struct is maintained for realms containing every inode + * with an open cap in the system. (The needed snap realm information is + * provided by the MDS whenever a cap is issued, i.e., on open.) A 'seq' + * version number is used to ensure that as realm parameters change (new + * snapshot, new parent, etc.) the client's realm hierarchy is updated. + * + * The realm hierarchy drives the generation of a 'snap context' for each + * realm, which simply lists the resulting set of snaps for the realm. This + * is attached to any writes sent to OSDs. + */ +/* + * Unfortunately error handling is a bit mixed here. If we get a snap + * update, but don't have enough memory to update our realm hierarchy, + * it's not clear what we can do about it (besides complaining to the + * console). + */ + + +/* + * increase ref count for the realm + * + * caller must hold snap_rwsem. + */ +void ceph_get_snap_realm(struct ceph_mds_client *mdsc, + struct ceph_snap_realm *realm) +{ + lockdep_assert_held(&mdsc->snap_rwsem); + + /* + * The 0->1 and 1->0 transitions must take the snap_empty_lock + * atomically with the refcount change. Go ahead and bump the + * nref here, unless it's 0, in which case we take the spinlock + * and then do the increment and remove it from the list. + */ + if (atomic_inc_not_zero(&realm->nref)) + return; + + spin_lock(&mdsc->snap_empty_lock); + if (atomic_inc_return(&realm->nref) == 1) + list_del_init(&realm->empty_item); + spin_unlock(&mdsc->snap_empty_lock); +} + +static void __insert_snap_realm(struct rb_root *root, + struct ceph_snap_realm *new) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct ceph_snap_realm *r = NULL; + + while (*p) { + parent = *p; + r = rb_entry(parent, struct ceph_snap_realm, node); + if (new->ino < r->ino) + p = &(*p)->rb_left; + else if (new->ino > r->ino) + p = &(*p)->rb_right; + else + BUG(); + } + + rb_link_node(&new->node, parent, p); + rb_insert_color(&new->node, root); +} + +/* + * create and get the realm rooted at @ino and bump its ref count. + * + * caller must hold snap_rwsem for write. + */ +static struct ceph_snap_realm *ceph_create_snap_realm( + struct ceph_mds_client *mdsc, + u64 ino) +{ + struct ceph_snap_realm *realm; + + lockdep_assert_held_write(&mdsc->snap_rwsem); + + realm = kzalloc(sizeof(*realm), GFP_NOFS); + if (!realm) + return ERR_PTR(-ENOMEM); + + atomic_set(&realm->nref, 1); /* for caller */ + realm->ino = ino; + INIT_LIST_HEAD(&realm->children); + INIT_LIST_HEAD(&realm->child_item); + INIT_LIST_HEAD(&realm->empty_item); + INIT_LIST_HEAD(&realm->dirty_item); + INIT_LIST_HEAD(&realm->inodes_with_caps); + spin_lock_init(&realm->inodes_with_caps_lock); + __insert_snap_realm(&mdsc->snap_realms, realm); + mdsc->num_snap_realms++; + + dout("create_snap_realm %llx %p\n", realm->ino, realm); + return realm; +} + +/* + * lookup the realm rooted at @ino. + * + * caller must hold snap_rwsem. + */ +static struct ceph_snap_realm *__lookup_snap_realm(struct ceph_mds_client *mdsc, + u64 ino) +{ + struct rb_node *n = mdsc->snap_realms.rb_node; + struct ceph_snap_realm *r; + + lockdep_assert_held(&mdsc->snap_rwsem); + + while (n) { + r = rb_entry(n, struct ceph_snap_realm, node); + if (ino < r->ino) + n = n->rb_left; + else if (ino > r->ino) + n = n->rb_right; + else { + dout("lookup_snap_realm %llx %p\n", r->ino, r); + return r; + } + } + return NULL; +} + +struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc, + u64 ino) +{ + struct ceph_snap_realm *r; + r = __lookup_snap_realm(mdsc, ino); + if (r) + ceph_get_snap_realm(mdsc, r); + return r; +} + +static void __put_snap_realm(struct ceph_mds_client *mdsc, + struct ceph_snap_realm *realm); + +/* + * called with snap_rwsem (write) + */ +static void __destroy_snap_realm(struct ceph_mds_client *mdsc, + struct ceph_snap_realm *realm) +{ + lockdep_assert_held_write(&mdsc->snap_rwsem); + + dout("__destroy_snap_realm %p %llx\n", realm, realm->ino); + + rb_erase(&realm->node, &mdsc->snap_realms); + mdsc->num_snap_realms--; + + if (realm->parent) { + list_del_init(&realm->child_item); + __put_snap_realm(mdsc, realm->parent); + } + + kfree(realm->prior_parent_snaps); + kfree(realm->snaps); + ceph_put_snap_context(realm->cached_context); + kfree(realm); +} + +/* + * caller holds snap_rwsem (write) + */ +static void __put_snap_realm(struct ceph_mds_client *mdsc, + struct ceph_snap_realm *realm) +{ + lockdep_assert_held_write(&mdsc->snap_rwsem); + + /* + * We do not require the snap_empty_lock here, as any caller that + * increments the value must hold the snap_rwsem. + */ + if (atomic_dec_and_test(&realm->nref)) + __destroy_snap_realm(mdsc, realm); +} + +/* + * See comments in ceph_get_snap_realm. Caller needn't hold any locks. + */ +void ceph_put_snap_realm(struct ceph_mds_client *mdsc, + struct ceph_snap_realm *realm) +{ + if (!atomic_dec_and_lock(&realm->nref, &mdsc->snap_empty_lock)) + return; + + if (down_write_trylock(&mdsc->snap_rwsem)) { + spin_unlock(&mdsc->snap_empty_lock); + __destroy_snap_realm(mdsc, realm); + up_write(&mdsc->snap_rwsem); + } else { + list_add(&realm->empty_item, &mdsc->snap_empty); + spin_unlock(&mdsc->snap_empty_lock); + } +} + +/* + * Clean up any realms whose ref counts have dropped to zero. Note + * that this does not include realms who were created but not yet + * used. + * + * Called under snap_rwsem (write) + */ +static void __cleanup_empty_realms(struct ceph_mds_client *mdsc) +{ + struct ceph_snap_realm *realm; + + lockdep_assert_held_write(&mdsc->snap_rwsem); + + spin_lock(&mdsc->snap_empty_lock); + while (!list_empty(&mdsc->snap_empty)) { + realm = list_first_entry(&mdsc->snap_empty, + struct ceph_snap_realm, empty_item); + list_del(&realm->empty_item); + spin_unlock(&mdsc->snap_empty_lock); + __destroy_snap_realm(mdsc, realm); + spin_lock(&mdsc->snap_empty_lock); + } + spin_unlock(&mdsc->snap_empty_lock); +} + +void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc) +{ + down_write(&mdsc->snap_rwsem); + __cleanup_empty_realms(mdsc); + up_write(&mdsc->snap_rwsem); +} + +/* + * adjust the parent realm of a given @realm. adjust child list, and parent + * pointers, and ref counts appropriately. + * + * return true if parent was changed, 0 if unchanged, <0 on error. + * + * caller must hold snap_rwsem for write. + */ +static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc, + struct ceph_snap_realm *realm, + u64 parentino) +{ + struct ceph_snap_realm *parent; + + lockdep_assert_held_write(&mdsc->snap_rwsem); + + if (realm->parent_ino == parentino) + return 0; + + parent = ceph_lookup_snap_realm(mdsc, parentino); + if (!parent) { + parent = ceph_create_snap_realm(mdsc, parentino); + if (IS_ERR(parent)) + return PTR_ERR(parent); + } + dout("adjust_snap_realm_parent %llx %p: %llx %p -> %llx %p\n", + realm->ino, realm, realm->parent_ino, realm->parent, + parentino, parent); + if (realm->parent) { + list_del_init(&realm->child_item); + ceph_put_snap_realm(mdsc, realm->parent); + } + realm->parent_ino = parentino; + realm->parent = parent; + list_add(&realm->child_item, &parent->children); + return 1; +} + + +static int cmpu64_rev(const void *a, const void *b) +{ + if (*(u64 *)a < *(u64 *)b) + return 1; + if (*(u64 *)a > *(u64 *)b) + return -1; + return 0; +} + + +/* + * build the snap context for a given realm. + */ +static int build_snap_context(struct ceph_snap_realm *realm, + struct list_head* dirty_realms) +{ + struct ceph_snap_realm *parent = realm->parent; + struct ceph_snap_context *snapc; + int err = 0; + u32 num = realm->num_prior_parent_snaps + realm->num_snaps; + + /* + * build parent context, if it hasn't been built. + * conservatively estimate that all parent snaps might be + * included by us. + */ + if (parent) { + if (!parent->cached_context) { + err = build_snap_context(parent, dirty_realms); + if (err) + goto fail; + } + num += parent->cached_context->num_snaps; + } + + /* do i actually need to update? not if my context seq + matches realm seq, and my parents' does to. (this works + because we rebuild_snap_realms() works _downward_ in + hierarchy after each update.) */ + if (realm->cached_context && + realm->cached_context->seq == realm->seq && + (!parent || + realm->cached_context->seq >= parent->cached_context->seq)) { + dout("build_snap_context %llx %p: %p seq %lld (%u snaps)" + " (unchanged)\n", + realm->ino, realm, realm->cached_context, + realm->cached_context->seq, + (unsigned int)realm->cached_context->num_snaps); + return 0; + } + + /* alloc new snap context */ + err = -ENOMEM; + if (num > (SIZE_MAX - sizeof(*snapc)) / sizeof(u64)) + goto fail; + snapc = ceph_create_snap_context(num, GFP_NOFS); + if (!snapc) + goto fail; + + /* build (reverse sorted) snap vector */ + num = 0; + snapc->seq = realm->seq; + if (parent) { + u32 i; + + /* include any of parent's snaps occurring _after_ my + parent became my parent */ + for (i = 0; i < parent->cached_context->num_snaps; i++) + if (parent->cached_context->snaps[i] >= + realm->parent_since) + snapc->snaps[num++] = + parent->cached_context->snaps[i]; + if (parent->cached_context->seq > snapc->seq) + snapc->seq = parent->cached_context->seq; + } + memcpy(snapc->snaps + num, realm->snaps, + sizeof(u64)*realm->num_snaps); + num += realm->num_snaps; + memcpy(snapc->snaps + num, realm->prior_parent_snaps, + sizeof(u64)*realm->num_prior_parent_snaps); + num += realm->num_prior_parent_snaps; + + sort(snapc->snaps, num, sizeof(u64), cmpu64_rev, NULL); + snapc->num_snaps = num; + dout("build_snap_context %llx %p: %p seq %lld (%u snaps)\n", + realm->ino, realm, snapc, snapc->seq, + (unsigned int) snapc->num_snaps); + + ceph_put_snap_context(realm->cached_context); + realm->cached_context = snapc; + /* queue realm for cap_snap creation */ + list_add_tail(&realm->dirty_item, dirty_realms); + return 0; + +fail: + /* + * if we fail, clear old (incorrect) cached_context... hopefully + * we'll have better luck building it later + */ + if (realm->cached_context) { + ceph_put_snap_context(realm->cached_context); + realm->cached_context = NULL; + } + pr_err("build_snap_context %llx %p fail %d\n", realm->ino, + realm, err); + return err; +} + +/* + * rebuild snap context for the given realm and all of its children. + */ +static void rebuild_snap_realms(struct ceph_snap_realm *realm, + struct list_head *dirty_realms) +{ + struct ceph_snap_realm *child; + + dout("rebuild_snap_realms %llx %p\n", realm->ino, realm); + build_snap_context(realm, dirty_realms); + + list_for_each_entry(child, &realm->children, child_item) + rebuild_snap_realms(child, dirty_realms); +} + + +/* + * helper to allocate and decode an array of snapids. free prior + * instance, if any. + */ +static int dup_array(u64 **dst, __le64 *src, u32 num) +{ + u32 i; + + kfree(*dst); + if (num) { + *dst = kcalloc(num, sizeof(u64), GFP_NOFS); + if (!*dst) + return -ENOMEM; + for (i = 0; i < num; i++) + (*dst)[i] = get_unaligned_le64(src + i); + } else { + *dst = NULL; + } + return 0; +} + +static bool has_new_snaps(struct ceph_snap_context *o, + struct ceph_snap_context *n) +{ + if (n->num_snaps == 0) + return false; + /* snaps are in descending order */ + return n->snaps[0] > o->seq; +} + +/* + * When a snapshot is applied, the size/mtime inode metadata is queued + * in a ceph_cap_snap (one for each snapshot) until writeback + * completes and the metadata can be flushed back to the MDS. + * + * However, if a (sync) write is currently in-progress when we apply + * the snapshot, we have to wait until the write succeeds or fails + * (and a final size/mtime is known). In this case the + * cap_snap->writing = 1, and is said to be "pending." When the write + * finishes, we __ceph_finish_cap_snap(). + * + * Caller must hold snap_rwsem for read (i.e., the realm topology won't + * change). + */ +void ceph_queue_cap_snap(struct ceph_inode_info *ci) +{ + struct inode *inode = &ci->vfs_inode; + struct ceph_cap_snap *capsnap; + struct ceph_snap_context *old_snapc, *new_snapc; + struct ceph_buffer *old_blob = NULL; + int used, dirty; + + capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS); + if (!capsnap) { + pr_err("ENOMEM allocating ceph_cap_snap on %p\n", inode); + return; + } + capsnap->cap_flush.is_capsnap = true; + INIT_LIST_HEAD(&capsnap->cap_flush.i_list); + INIT_LIST_HEAD(&capsnap->cap_flush.g_list); + + spin_lock(&ci->i_ceph_lock); + used = __ceph_caps_used(ci); + dirty = __ceph_caps_dirty(ci); + + old_snapc = ci->i_head_snapc; + new_snapc = ci->i_snap_realm->cached_context; + + /* + * If there is a write in progress, treat that as a dirty Fw, + * even though it hasn't completed yet; by the time we finish + * up this capsnap it will be. + */ + if (used & CEPH_CAP_FILE_WR) + dirty |= CEPH_CAP_FILE_WR; + + if (__ceph_have_pending_cap_snap(ci)) { + /* there is no point in queuing multiple "pending" cap_snaps, + as no new writes are allowed to start when pending, so any + writes in progress now were started before the previous + cap_snap. lucky us. */ + dout("queue_cap_snap %p already pending\n", inode); + goto update_snapc; + } + if (ci->i_wrbuffer_ref_head == 0 && + !(dirty & (CEPH_CAP_ANY_EXCL|CEPH_CAP_FILE_WR))) { + dout("queue_cap_snap %p nothing dirty|writing\n", inode); + goto update_snapc; + } + + BUG_ON(!old_snapc); + + /* + * There is no need to send FLUSHSNAP message to MDS if there is + * no new snapshot. But when there is dirty pages or on-going + * writes, we still need to create cap_snap. cap_snap is needed + * by the write path and page writeback path. + * + * also see ceph_try_drop_cap_snap() + */ + if (has_new_snaps(old_snapc, new_snapc)) { + if (dirty & (CEPH_CAP_ANY_EXCL|CEPH_CAP_FILE_WR)) + capsnap->need_flush = true; + } else { + if (!(used & CEPH_CAP_FILE_WR) && + ci->i_wrbuffer_ref_head == 0) { + dout("queue_cap_snap %p " + "no new_snap|dirty_page|writing\n", inode); + goto update_snapc; + } + } + + dout("queue_cap_snap %p cap_snap %p queuing under %p %s %s\n", + inode, capsnap, old_snapc, ceph_cap_string(dirty), + capsnap->need_flush ? "" : "no_flush"); + ihold(inode); + + refcount_set(&capsnap->nref, 1); + INIT_LIST_HEAD(&capsnap->ci_item); + + capsnap->follows = old_snapc->seq; + capsnap->issued = __ceph_caps_issued(ci, NULL); + capsnap->dirty = dirty; + + capsnap->mode = inode->i_mode; + capsnap->uid = inode->i_uid; + capsnap->gid = inode->i_gid; + + if (dirty & CEPH_CAP_XATTR_EXCL) { + old_blob = __ceph_build_xattrs_blob(ci); + capsnap->xattr_blob = + ceph_buffer_get(ci->i_xattrs.blob); + capsnap->xattr_version = ci->i_xattrs.version; + } else { + capsnap->xattr_blob = NULL; + capsnap->xattr_version = 0; + } + + capsnap->inline_data = ci->i_inline_version != CEPH_INLINE_NONE; + + /* dirty page count moved from _head to this cap_snap; + all subsequent writes page dirties occur _after_ this + snapshot. */ + capsnap->dirty_pages = ci->i_wrbuffer_ref_head; + ci->i_wrbuffer_ref_head = 0; + capsnap->context = old_snapc; + list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps); + + if (used & CEPH_CAP_FILE_WR) { + dout("queue_cap_snap %p cap_snap %p snapc %p" + " seq %llu used WR, now pending\n", inode, + capsnap, old_snapc, old_snapc->seq); + capsnap->writing = 1; + } else { + /* note mtime, size NOW. */ + __ceph_finish_cap_snap(ci, capsnap); + } + capsnap = NULL; + old_snapc = NULL; + +update_snapc: + if (ci->i_wrbuffer_ref_head == 0 && + ci->i_wr_ref == 0 && + ci->i_dirty_caps == 0 && + ci->i_flushing_caps == 0) { + ci->i_head_snapc = NULL; + } else { + ci->i_head_snapc = ceph_get_snap_context(new_snapc); + dout(" new snapc is %p\n", new_snapc); + } + spin_unlock(&ci->i_ceph_lock); + + ceph_buffer_put(old_blob); + kfree(capsnap); + ceph_put_snap_context(old_snapc); +} + +/* + * Finalize the size, mtime for a cap_snap.. that is, settle on final values + * to be used for the snapshot, to be flushed back to the mds. + * + * If capsnap can now be flushed, add to snap_flush list, and return 1. + * + * Caller must hold i_ceph_lock. + */ +int __ceph_finish_cap_snap(struct ceph_inode_info *ci, + struct ceph_cap_snap *capsnap) +{ + struct inode *inode = &ci->vfs_inode; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); + + BUG_ON(capsnap->writing); + capsnap->size = inode->i_size; + capsnap->mtime = inode->i_mtime; + capsnap->atime = inode->i_atime; + capsnap->ctime = inode->i_ctime; + capsnap->btime = ci->i_btime; + capsnap->change_attr = inode_peek_iversion_raw(inode); + capsnap->time_warp_seq = ci->i_time_warp_seq; + capsnap->truncate_size = ci->i_truncate_size; + capsnap->truncate_seq = ci->i_truncate_seq; + if (capsnap->dirty_pages) { + dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu " + "still has %d dirty pages\n", inode, capsnap, + capsnap->context, capsnap->context->seq, + ceph_cap_string(capsnap->dirty), capsnap->size, + capsnap->dirty_pages); + return 0; + } + + ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS; + dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu\n", + inode, capsnap, capsnap->context, + capsnap->context->seq, ceph_cap_string(capsnap->dirty), + capsnap->size); + + spin_lock(&mdsc->snap_flush_lock); + if (list_empty(&ci->i_snap_flush_item)) { + ihold(inode); + list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list); + } + spin_unlock(&mdsc->snap_flush_lock); + return 1; /* caller may want to ceph_flush_snaps */ +} + +/* + * Queue cap_snaps for snap writeback for this realm and its children. + * Called under snap_rwsem, so realm topology won't change. + */ +static void queue_realm_cap_snaps(struct ceph_snap_realm *realm) +{ + struct ceph_inode_info *ci; + struct inode *lastinode = NULL; + + dout("queue_realm_cap_snaps %p %llx inodes\n", realm, realm->ino); + + spin_lock(&realm->inodes_with_caps_lock); + list_for_each_entry(ci, &realm->inodes_with_caps, i_snap_realm_item) { + struct inode *inode = igrab(&ci->vfs_inode); + if (!inode) + continue; + spin_unlock(&realm->inodes_with_caps_lock); + /* avoid calling iput_final() while holding + * mdsc->snap_rwsem or in mds dispatch threads */ + ceph_async_iput(lastinode); + lastinode = inode; + ceph_queue_cap_snap(ci); + spin_lock(&realm->inodes_with_caps_lock); + } + spin_unlock(&realm->inodes_with_caps_lock); + ceph_async_iput(lastinode); + + dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino); +} + +/* + * Parse and apply a snapblob "snap trace" from the MDS. This specifies + * the snap realm parameters from a given realm and all of its ancestors, + * up to the root. + * + * Caller must hold snap_rwsem for write. + */ +int ceph_update_snap_trace(struct ceph_mds_client *mdsc, + void *p, void *e, bool deletion, + struct ceph_snap_realm **realm_ret) +{ + struct ceph_mds_snap_realm *ri; /* encoded */ + __le64 *snaps; /* encoded */ + __le64 *prior_parent_snaps; /* encoded */ + struct ceph_snap_realm *realm; + struct ceph_snap_realm *first_realm = NULL; + struct ceph_snap_realm *realm_to_rebuild = NULL; + int rebuild_snapcs; + int err = -ENOMEM; + LIST_HEAD(dirty_realms); + + lockdep_assert_held_write(&mdsc->snap_rwsem); + + dout("update_snap_trace deletion=%d\n", deletion); +more: + realm = NULL; + rebuild_snapcs = 0; + ceph_decode_need(&p, e, sizeof(*ri), bad); + ri = p; + p += sizeof(*ri); + ceph_decode_need(&p, e, sizeof(u64)*(le32_to_cpu(ri->num_snaps) + + le32_to_cpu(ri->num_prior_parent_snaps)), bad); + snaps = p; + p += sizeof(u64) * le32_to_cpu(ri->num_snaps); + prior_parent_snaps = p; + p += sizeof(u64) * le32_to_cpu(ri->num_prior_parent_snaps); + + realm = ceph_lookup_snap_realm(mdsc, le64_to_cpu(ri->ino)); + if (!realm) { + realm = ceph_create_snap_realm(mdsc, le64_to_cpu(ri->ino)); + if (IS_ERR(realm)) { + err = PTR_ERR(realm); + goto fail; + } + } + + /* ensure the parent is correct */ + err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent)); + if (err < 0) + goto fail; + rebuild_snapcs += err; + + if (le64_to_cpu(ri->seq) > realm->seq) { + dout("update_snap_trace updating %llx %p %lld -> %lld\n", + realm->ino, realm, realm->seq, le64_to_cpu(ri->seq)); + /* update realm parameters, snap lists */ + realm->seq = le64_to_cpu(ri->seq); + realm->created = le64_to_cpu(ri->created); + realm->parent_since = le64_to_cpu(ri->parent_since); + + realm->num_snaps = le32_to_cpu(ri->num_snaps); + err = dup_array(&realm->snaps, snaps, realm->num_snaps); + if (err < 0) + goto fail; + + realm->num_prior_parent_snaps = + le32_to_cpu(ri->num_prior_parent_snaps); + err = dup_array(&realm->prior_parent_snaps, prior_parent_snaps, + realm->num_prior_parent_snaps); + if (err < 0) + goto fail; + + if (realm->seq > mdsc->last_snap_seq) + mdsc->last_snap_seq = realm->seq; + + rebuild_snapcs = 1; + } else if (!realm->cached_context) { + dout("update_snap_trace %llx %p seq %lld new\n", + realm->ino, realm, realm->seq); + rebuild_snapcs = 1; + } else { + dout("update_snap_trace %llx %p seq %lld unchanged\n", + realm->ino, realm, realm->seq); + } + + dout("done with %llx %p, rebuild_snapcs=%d, %p %p\n", realm->ino, + realm, rebuild_snapcs, p, e); + + /* + * this will always track the uppest parent realm from which + * we need to rebuild the snapshot contexts _downward_ in + * hierarchy. + */ + if (rebuild_snapcs) + realm_to_rebuild = realm; + + /* rebuild_snapcs when we reach the _end_ (root) of the trace */ + if (realm_to_rebuild && p >= e) + rebuild_snap_realms(realm_to_rebuild, &dirty_realms); + + if (!first_realm) + first_realm = realm; + else + ceph_put_snap_realm(mdsc, realm); + + if (p < e) + goto more; + + /* + * queue cap snaps _after_ we've built the new snap contexts, + * so that i_head_snapc can be set appropriately. + */ + while (!list_empty(&dirty_realms)) { + realm = list_first_entry(&dirty_realms, struct ceph_snap_realm, + dirty_item); + list_del_init(&realm->dirty_item); + queue_realm_cap_snaps(realm); + } + + if (realm_ret) + *realm_ret = first_realm; + else + ceph_put_snap_realm(mdsc, first_realm); + + __cleanup_empty_realms(mdsc); + return 0; + +bad: + err = -EINVAL; +fail: + if (realm && !IS_ERR(realm)) + ceph_put_snap_realm(mdsc, realm); + if (first_realm) + ceph_put_snap_realm(mdsc, first_realm); + pr_err("update_snap_trace error %d\n", err); + return err; +} + + +/* + * Send any cap_snaps that are queued for flush. Try to carry + * s_mutex across multiple snap flushes to avoid locking overhead. + * + * Caller holds no locks. + */ +static void flush_snaps(struct ceph_mds_client *mdsc) +{ + struct ceph_inode_info *ci; + struct inode *inode; + struct ceph_mds_session *session = NULL; + + dout("flush_snaps\n"); + spin_lock(&mdsc->snap_flush_lock); + while (!list_empty(&mdsc->snap_flush_list)) { + ci = list_first_entry(&mdsc->snap_flush_list, + struct ceph_inode_info, i_snap_flush_item); + inode = &ci->vfs_inode; + ihold(inode); + spin_unlock(&mdsc->snap_flush_lock); + ceph_flush_snaps(ci, &session); + /* avoid calling iput_final() while holding + * session->s_mutex or in mds dispatch threads */ + ceph_async_iput(inode); + spin_lock(&mdsc->snap_flush_lock); + } + spin_unlock(&mdsc->snap_flush_lock); + + if (session) { + mutex_unlock(&session->s_mutex); + ceph_put_mds_session(session); + } + dout("flush_snaps done\n"); +} + + +/* + * Handle a snap notification from the MDS. + * + * This can take two basic forms: the simplest is just a snap creation + * or deletion notification on an existing realm. This should update the + * realm and its children. + * + * The more difficult case is realm creation, due to snap creation at a + * new point in the file hierarchy, or due to a rename that moves a file or + * directory into another realm. + */ +void ceph_handle_snap(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + struct ceph_msg *msg) +{ + struct super_block *sb = mdsc->fsc->sb; + int mds = session->s_mds; + u64 split; + int op; + int trace_len; + struct ceph_snap_realm *realm = NULL; + void *p = msg->front.iov_base; + void *e = p + msg->front.iov_len; + struct ceph_mds_snap_head *h; + int num_split_inos, num_split_realms; + __le64 *split_inos = NULL, *split_realms = NULL; + int i; + int locked_rwsem = 0; + + /* decode */ + if (msg->front.iov_len < sizeof(*h)) + goto bad; + h = p; + op = le32_to_cpu(h->op); + split = le64_to_cpu(h->split); /* non-zero if we are splitting an + * existing realm */ + num_split_inos = le32_to_cpu(h->num_split_inos); + num_split_realms = le32_to_cpu(h->num_split_realms); + trace_len = le32_to_cpu(h->trace_len); + p += sizeof(*h); + + dout("handle_snap from mds%d op %s split %llx tracelen %d\n", mds, + ceph_snap_op_name(op), split, trace_len); + + mutex_lock(&session->s_mutex); + inc_session_sequence(session); + mutex_unlock(&session->s_mutex); + + down_write(&mdsc->snap_rwsem); + locked_rwsem = 1; + + if (op == CEPH_SNAP_OP_SPLIT) { + struct ceph_mds_snap_realm *ri; + + /* + * A "split" breaks part of an existing realm off into + * a new realm. The MDS provides a list of inodes + * (with caps) and child realms that belong to the new + * child. + */ + split_inos = p; + p += sizeof(u64) * num_split_inos; + split_realms = p; + p += sizeof(u64) * num_split_realms; + ceph_decode_need(&p, e, sizeof(*ri), bad); + /* we will peek at realm info here, but will _not_ + * advance p, as the realm update will occur below in + * ceph_update_snap_trace. */ + ri = p; + + realm = ceph_lookup_snap_realm(mdsc, split); + if (!realm) { + realm = ceph_create_snap_realm(mdsc, split); + if (IS_ERR(realm)) + goto out; + } + + dout("splitting snap_realm %llx %p\n", realm->ino, realm); + for (i = 0; i < num_split_inos; i++) { + struct ceph_vino vino = { + .ino = le64_to_cpu(split_inos[i]), + .snap = CEPH_NOSNAP, + }; + struct inode *inode = ceph_find_inode(sb, vino); + struct ceph_inode_info *ci; + struct ceph_snap_realm *oldrealm; + + if (!inode) + continue; + ci = ceph_inode(inode); + + spin_lock(&ci->i_ceph_lock); + if (!ci->i_snap_realm) + goto skip_inode; + /* + * If this inode belongs to a realm that was + * created after our new realm, we experienced + * a race (due to another split notifications + * arriving from a different MDS). So skip + * this inode. + */ + if (ci->i_snap_realm->created > + le64_to_cpu(ri->created)) { + dout(" leaving %p in newer realm %llx %p\n", + inode, ci->i_snap_realm->ino, + ci->i_snap_realm); + goto skip_inode; + } + dout(" will move %p to split realm %llx %p\n", + inode, realm->ino, realm); + /* + * Move the inode to the new realm + */ + oldrealm = ci->i_snap_realm; + spin_lock(&oldrealm->inodes_with_caps_lock); + list_del_init(&ci->i_snap_realm_item); + spin_unlock(&oldrealm->inodes_with_caps_lock); + + spin_lock(&realm->inodes_with_caps_lock); + list_add(&ci->i_snap_realm_item, + &realm->inodes_with_caps); + ci->i_snap_realm = realm; + if (realm->ino == ci->i_vino.ino) + realm->inode = inode; + spin_unlock(&realm->inodes_with_caps_lock); + + spin_unlock(&ci->i_ceph_lock); + + ceph_get_snap_realm(mdsc, realm); + ceph_put_snap_realm(mdsc, oldrealm); + + /* avoid calling iput_final() while holding + * mdsc->snap_rwsem or mds in dispatch threads */ + ceph_async_iput(inode); + continue; + +skip_inode: + spin_unlock(&ci->i_ceph_lock); + ceph_async_iput(inode); + } + + /* we may have taken some of the old realm's children. */ + for (i = 0; i < num_split_realms; i++) { + struct ceph_snap_realm *child = + __lookup_snap_realm(mdsc, + le64_to_cpu(split_realms[i])); + if (!child) + continue; + adjust_snap_realm_parent(mdsc, child, realm->ino); + } + } else { + /* + * In the non-split case both 'num_split_inos' and + * 'num_split_realms' should be 0, making this a no-op. + * However the MDS happens to populate 'split_realms' list + * in one of the UPDATE op cases by mistake. + * + * Skip both lists just in case to ensure that 'p' is + * positioned at the start of realm info, as expected by + * ceph_update_snap_trace(). + */ + p += sizeof(u64) * num_split_inos; + p += sizeof(u64) * num_split_realms; + } + + /* + * update using the provided snap trace. if we are deleting a + * snap, we can avoid queueing cap_snaps. + */ + ceph_update_snap_trace(mdsc, p, e, + op == CEPH_SNAP_OP_DESTROY, NULL); + + if (op == CEPH_SNAP_OP_SPLIT) + /* we took a reference when we created the realm, above */ + ceph_put_snap_realm(mdsc, realm); + + __cleanup_empty_realms(mdsc); + + up_write(&mdsc->snap_rwsem); + + flush_snaps(mdsc); + return; + +bad: + pr_err("corrupt snap message from mds%d\n", mds); + ceph_msg_dump(msg); +out: + if (locked_rwsem) + up_write(&mdsc->snap_rwsem); + return; +} + +struct ceph_snapid_map* ceph_get_snapid_map(struct ceph_mds_client *mdsc, + u64 snap) +{ + struct ceph_snapid_map *sm, *exist; + struct rb_node **p, *parent; + int ret; + + exist = NULL; + spin_lock(&mdsc->snapid_map_lock); + p = &mdsc->snapid_map_tree.rb_node; + while (*p) { + exist = rb_entry(*p, struct ceph_snapid_map, node); + if (snap > exist->snap) { + p = &(*p)->rb_left; + } else if (snap < exist->snap) { + p = &(*p)->rb_right; + } else { + if (atomic_inc_return(&exist->ref) == 1) + list_del_init(&exist->lru); + break; + } + exist = NULL; + } + spin_unlock(&mdsc->snapid_map_lock); + if (exist) { + dout("found snapid map %llx -> %x\n", exist->snap, exist->dev); + return exist; + } + + sm = kmalloc(sizeof(*sm), GFP_NOFS); + if (!sm) + return NULL; + + ret = get_anon_bdev(&sm->dev); + if (ret < 0) { + kfree(sm); + return NULL; + } + + INIT_LIST_HEAD(&sm->lru); + atomic_set(&sm->ref, 1); + sm->snap = snap; + + exist = NULL; + parent = NULL; + p = &mdsc->snapid_map_tree.rb_node; + spin_lock(&mdsc->snapid_map_lock); + while (*p) { + parent = *p; + exist = rb_entry(*p, struct ceph_snapid_map, node); + if (snap > exist->snap) + p = &(*p)->rb_left; + else if (snap < exist->snap) + p = &(*p)->rb_right; + else + break; + exist = NULL; + } + if (exist) { + if (atomic_inc_return(&exist->ref) == 1) + list_del_init(&exist->lru); + } else { + rb_link_node(&sm->node, parent, p); + rb_insert_color(&sm->node, &mdsc->snapid_map_tree); + } + spin_unlock(&mdsc->snapid_map_lock); + if (exist) { + free_anon_bdev(sm->dev); + kfree(sm); + dout("found snapid map %llx -> %x\n", exist->snap, exist->dev); + return exist; + } + + dout("create snapid map %llx -> %x\n", sm->snap, sm->dev); + return sm; +} + +void ceph_put_snapid_map(struct ceph_mds_client* mdsc, + struct ceph_snapid_map *sm) +{ + if (!sm) + return; + if (atomic_dec_and_lock(&sm->ref, &mdsc->snapid_map_lock)) { + if (!RB_EMPTY_NODE(&sm->node)) { + sm->last_used = jiffies; + list_add_tail(&sm->lru, &mdsc->snapid_map_lru); + spin_unlock(&mdsc->snapid_map_lock); + } else { + /* already cleaned up by + * ceph_cleanup_snapid_map() */ + spin_unlock(&mdsc->snapid_map_lock); + kfree(sm); + } + } +} + +void ceph_trim_snapid_map(struct ceph_mds_client *mdsc) +{ + struct ceph_snapid_map *sm; + unsigned long now; + LIST_HEAD(to_free); + + spin_lock(&mdsc->snapid_map_lock); + now = jiffies; + + while (!list_empty(&mdsc->snapid_map_lru)) { + sm = list_first_entry(&mdsc->snapid_map_lru, + struct ceph_snapid_map, lru); + if (time_after(sm->last_used + CEPH_SNAPID_MAP_TIMEOUT, now)) + break; + + rb_erase(&sm->node, &mdsc->snapid_map_tree); + list_move(&sm->lru, &to_free); + } + spin_unlock(&mdsc->snapid_map_lock); + + while (!list_empty(&to_free)) { + sm = list_first_entry(&to_free, struct ceph_snapid_map, lru); + list_del(&sm->lru); + dout("trim snapid map %llx -> %x\n", sm->snap, sm->dev); + free_anon_bdev(sm->dev); + kfree(sm); + } +} + +void ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc) +{ + struct ceph_snapid_map *sm; + struct rb_node *p; + LIST_HEAD(to_free); + + spin_lock(&mdsc->snapid_map_lock); + while ((p = rb_first(&mdsc->snapid_map_tree))) { + sm = rb_entry(p, struct ceph_snapid_map, node); + rb_erase(p, &mdsc->snapid_map_tree); + RB_CLEAR_NODE(p); + list_move(&sm->lru, &to_free); + } + spin_unlock(&mdsc->snapid_map_lock); + + while (!list_empty(&to_free)) { + sm = list_first_entry(&to_free, struct ceph_snapid_map, lru); + list_del(&sm->lru); + free_anon_bdev(sm->dev); + if (WARN_ON_ONCE(atomic_read(&sm->ref))) { + pr_err("snapid map %llx -> %x still in use\n", + sm->snap, sm->dev); + } + kfree(sm); + } +} diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c new file mode 100644 index 000000000..573bb9556 --- /dev/null +++ b/fs/ceph/strings.c @@ -0,0 +1,129 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Ceph fs string constants + */ +#include <linux/module.h> +#include <linux/ceph/types.h> + + +const char *ceph_mds_state_name(int s) +{ + switch (s) { + /* down and out */ + case CEPH_MDS_STATE_DNE: return "down:dne"; + case CEPH_MDS_STATE_STOPPED: return "down:stopped"; + /* up and out */ + case CEPH_MDS_STATE_BOOT: return "up:boot"; + case CEPH_MDS_STATE_STANDBY: return "up:standby"; + case CEPH_MDS_STATE_STANDBY_REPLAY: return "up:standby-replay"; + case CEPH_MDS_STATE_REPLAYONCE: return "up:oneshot-replay"; + case CEPH_MDS_STATE_CREATING: return "up:creating"; + case CEPH_MDS_STATE_STARTING: return "up:starting"; + /* up and in */ + case CEPH_MDS_STATE_REPLAY: return "up:replay"; + case CEPH_MDS_STATE_RESOLVE: return "up:resolve"; + case CEPH_MDS_STATE_RECONNECT: return "up:reconnect"; + case CEPH_MDS_STATE_REJOIN: return "up:rejoin"; + case CEPH_MDS_STATE_CLIENTREPLAY: return "up:clientreplay"; + case CEPH_MDS_STATE_ACTIVE: return "up:active"; + case CEPH_MDS_STATE_STOPPING: return "up:stopping"; + } + return "???"; +} + +const char *ceph_session_op_name(int op) +{ + switch (op) { + case CEPH_SESSION_REQUEST_OPEN: return "request_open"; + case CEPH_SESSION_OPEN: return "open"; + case CEPH_SESSION_REQUEST_CLOSE: return "request_close"; + case CEPH_SESSION_CLOSE: return "close"; + case CEPH_SESSION_REQUEST_RENEWCAPS: return "request_renewcaps"; + case CEPH_SESSION_RENEWCAPS: return "renewcaps"; + case CEPH_SESSION_STALE: return "stale"; + case CEPH_SESSION_RECALL_STATE: return "recall_state"; + case CEPH_SESSION_FLUSHMSG: return "flushmsg"; + case CEPH_SESSION_FLUSHMSG_ACK: return "flushmsg_ack"; + case CEPH_SESSION_FORCE_RO: return "force_ro"; + case CEPH_SESSION_REJECT: return "reject"; + case CEPH_SESSION_REQUEST_FLUSH_MDLOG: return "flush_mdlog"; + } + return "???"; +} + +const char *ceph_mds_op_name(int op) +{ + switch (op) { + case CEPH_MDS_OP_LOOKUP: return "lookup"; + case CEPH_MDS_OP_LOOKUPHASH: return "lookuphash"; + case CEPH_MDS_OP_LOOKUPPARENT: return "lookupparent"; + case CEPH_MDS_OP_LOOKUPINO: return "lookupino"; + case CEPH_MDS_OP_LOOKUPNAME: return "lookupname"; + case CEPH_MDS_OP_GETATTR: return "getattr"; + case CEPH_MDS_OP_SETXATTR: return "setxattr"; + case CEPH_MDS_OP_SETATTR: return "setattr"; + case CEPH_MDS_OP_RMXATTR: return "rmxattr"; + case CEPH_MDS_OP_SETLAYOUT: return "setlayou"; + case CEPH_MDS_OP_SETDIRLAYOUT: return "setdirlayout"; + case CEPH_MDS_OP_READDIR: return "readdir"; + case CEPH_MDS_OP_MKNOD: return "mknod"; + case CEPH_MDS_OP_LINK: return "link"; + case CEPH_MDS_OP_UNLINK: return "unlink"; + case CEPH_MDS_OP_RENAME: return "rename"; + case CEPH_MDS_OP_MKDIR: return "mkdir"; + case CEPH_MDS_OP_RMDIR: return "rmdir"; + case CEPH_MDS_OP_SYMLINK: return "symlink"; + case CEPH_MDS_OP_CREATE: return "create"; + case CEPH_MDS_OP_OPEN: return "open"; + case CEPH_MDS_OP_LOOKUPSNAP: return "lookupsnap"; + case CEPH_MDS_OP_LSSNAP: return "lssnap"; + case CEPH_MDS_OP_MKSNAP: return "mksnap"; + case CEPH_MDS_OP_RMSNAP: return "rmsnap"; + case CEPH_MDS_OP_RENAMESNAP: return "renamesnap"; + case CEPH_MDS_OP_SETFILELOCK: return "setfilelock"; + case CEPH_MDS_OP_GETFILELOCK: return "getfilelock"; + } + return "???"; +} + +const char *ceph_cap_op_name(int op) +{ + switch (op) { + case CEPH_CAP_OP_GRANT: return "grant"; + case CEPH_CAP_OP_REVOKE: return "revoke"; + case CEPH_CAP_OP_TRUNC: return "trunc"; + case CEPH_CAP_OP_EXPORT: return "export"; + case CEPH_CAP_OP_IMPORT: return "import"; + case CEPH_CAP_OP_UPDATE: return "update"; + case CEPH_CAP_OP_DROP: return "drop"; + case CEPH_CAP_OP_FLUSH: return "flush"; + case CEPH_CAP_OP_FLUSH_ACK: return "flush_ack"; + case CEPH_CAP_OP_FLUSHSNAP: return "flushsnap"; + case CEPH_CAP_OP_FLUSHSNAP_ACK: return "flushsnap_ack"; + case CEPH_CAP_OP_RELEASE: return "release"; + case CEPH_CAP_OP_RENEW: return "renew"; + } + return "???"; +} + +const char *ceph_lease_op_name(int o) +{ + switch (o) { + case CEPH_MDS_LEASE_REVOKE: return "revoke"; + case CEPH_MDS_LEASE_RELEASE: return "release"; + case CEPH_MDS_LEASE_RENEW: return "renew"; + case CEPH_MDS_LEASE_REVOKE_ACK: return "revoke_ack"; + } + return "???"; +} + +const char *ceph_snap_op_name(int o) +{ + switch (o) { + case CEPH_SNAP_OP_UPDATE: return "update"; + case CEPH_SNAP_OP_CREATE: return "create"; + case CEPH_SNAP_OP_DESTROY: return "destroy"; + case CEPH_SNAP_OP_SPLIT: return "split"; + } + return "???"; +} diff --git a/fs/ceph/super.c b/fs/ceph/super.c new file mode 100644 index 000000000..f2aff9734 --- /dev/null +++ b/fs/ceph/super.c @@ -0,0 +1,1347 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/ceph/ceph_debug.h> + +#include <linux/backing-dev.h> +#include <linux/ctype.h> +#include <linux/fs.h> +#include <linux/inet.h> +#include <linux/in6.h> +#include <linux/module.h> +#include <linux/mount.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h> +#include <linux/sched.h> +#include <linux/seq_file.h> +#include <linux/slab.h> +#include <linux/statfs.h> +#include <linux/string.h> + +#include "super.h" +#include "mds_client.h" +#include "cache.h" + +#include <linux/ceph/ceph_features.h> +#include <linux/ceph/decode.h> +#include <linux/ceph/mon_client.h> +#include <linux/ceph/auth.h> +#include <linux/ceph/debugfs.h> + +static DEFINE_SPINLOCK(ceph_fsc_lock); +static LIST_HEAD(ceph_fsc_list); + +/* + * Ceph superblock operations + * + * Handle the basics of mounting, unmounting. + */ + +/* + * super ops + */ +static void ceph_put_super(struct super_block *s) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(s); + + dout("put_super\n"); + ceph_mdsc_close_sessions(fsc->mdsc); +} + +static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct ceph_fs_client *fsc = ceph_inode_to_client(d_inode(dentry)); + struct ceph_mon_client *monc = &fsc->client->monc; + struct ceph_statfs st; + int i, err; + u64 data_pool; + + if (fsc->mdsc->mdsmap->m_num_data_pg_pools == 1) { + data_pool = fsc->mdsc->mdsmap->m_data_pg_pools[0]; + } else { + data_pool = CEPH_NOPOOL; + } + + dout("statfs\n"); + err = ceph_monc_do_statfs(monc, data_pool, &st); + if (err < 0) + return err; + + /* fill in kstatfs */ + buf->f_type = CEPH_SUPER_MAGIC; /* ?? */ + + /* + * express utilization in terms of large blocks to avoid + * overflow on 32-bit machines. + * + * NOTE: for the time being, we make bsize == frsize to humor + * not-yet-ancient versions of glibc that are broken. + * Someday, we will probably want to report a real block + * size... whatever that may mean for a network file system! + */ + buf->f_bsize = 1 << CEPH_BLOCK_SHIFT; + buf->f_frsize = 1 << CEPH_BLOCK_SHIFT; + + /* + * By default use root quota for stats; fallback to overall filesystem + * usage if using 'noquotadf' mount option or if the root dir doesn't + * have max_bytes quota set. + */ + if (ceph_test_mount_opt(fsc, NOQUOTADF) || + !ceph_quota_update_statfs(fsc, buf)) { + buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10); + buf->f_bfree = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10); + buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10); + } + + buf->f_files = le64_to_cpu(st.num_objects); + buf->f_ffree = -1; + buf->f_namelen = NAME_MAX; + + /* Must convert the fsid, for consistent values across arches */ + buf->f_fsid.val[0] = 0; + mutex_lock(&monc->mutex); + for (i = 0 ; i < sizeof(monc->monmap->fsid) / sizeof(__le32) ; ++i) + buf->f_fsid.val[0] ^= le32_to_cpu(((__le32 *)&monc->monmap->fsid)[i]); + mutex_unlock(&monc->mutex); + + /* fold the fs_cluster_id into the upper bits */ + buf->f_fsid.val[1] = monc->fs_cluster_id; + + return 0; +} + +static int ceph_sync_fs(struct super_block *sb, int wait) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(sb); + + if (!wait) { + dout("sync_fs (non-blocking)\n"); + ceph_flush_dirty_caps(fsc->mdsc); + dout("sync_fs (non-blocking) done\n"); + return 0; + } + + dout("sync_fs (blocking)\n"); + ceph_osdc_sync(&fsc->client->osdc); + ceph_mdsc_sync(fsc->mdsc); + dout("sync_fs (blocking) done\n"); + return 0; +} + +/* + * mount options + */ +enum { + Opt_wsize, + Opt_rsize, + Opt_rasize, + Opt_caps_wanted_delay_min, + Opt_caps_wanted_delay_max, + Opt_caps_max, + Opt_readdir_max_entries, + Opt_readdir_max_bytes, + Opt_congestion_kb, + /* int args above */ + Opt_snapdirname, + Opt_mds_namespace, + Opt_recover_session, + Opt_source, + /* string args above */ + Opt_dirstat, + Opt_rbytes, + Opt_asyncreaddir, + Opt_dcache, + Opt_ino32, + Opt_fscache, + Opt_poolperm, + Opt_require_active_mds, + Opt_acl, + Opt_quotadf, + Opt_copyfrom, + Opt_wsync, +}; + +enum ceph_recover_session_mode { + ceph_recover_session_no, + ceph_recover_session_clean +}; + +static const struct constant_table ceph_param_recover[] = { + { "no", ceph_recover_session_no }, + { "clean", ceph_recover_session_clean }, + {} +}; + +static const struct fs_parameter_spec ceph_mount_parameters[] = { + fsparam_flag_no ("acl", Opt_acl), + fsparam_flag_no ("asyncreaddir", Opt_asyncreaddir), + fsparam_s32 ("caps_max", Opt_caps_max), + fsparam_u32 ("caps_wanted_delay_max", Opt_caps_wanted_delay_max), + fsparam_u32 ("caps_wanted_delay_min", Opt_caps_wanted_delay_min), + fsparam_u32 ("write_congestion_kb", Opt_congestion_kb), + fsparam_flag_no ("copyfrom", Opt_copyfrom), + fsparam_flag_no ("dcache", Opt_dcache), + fsparam_flag_no ("dirstat", Opt_dirstat), + fsparam_flag_no ("fsc", Opt_fscache), // fsc|nofsc + fsparam_string ("fsc", Opt_fscache), // fsc=... + fsparam_flag_no ("ino32", Opt_ino32), + fsparam_string ("mds_namespace", Opt_mds_namespace), + fsparam_flag_no ("poolperm", Opt_poolperm), + fsparam_flag_no ("quotadf", Opt_quotadf), + fsparam_u32 ("rasize", Opt_rasize), + fsparam_flag_no ("rbytes", Opt_rbytes), + fsparam_u32 ("readdir_max_bytes", Opt_readdir_max_bytes), + fsparam_u32 ("readdir_max_entries", Opt_readdir_max_entries), + fsparam_enum ("recover_session", Opt_recover_session, ceph_param_recover), + fsparam_flag_no ("require_active_mds", Opt_require_active_mds), + fsparam_u32 ("rsize", Opt_rsize), + fsparam_string ("snapdirname", Opt_snapdirname), + fsparam_string ("source", Opt_source), + fsparam_u32 ("wsize", Opt_wsize), + fsparam_flag_no ("wsync", Opt_wsync), + {} +}; + +struct ceph_parse_opts_ctx { + struct ceph_options *copts; + struct ceph_mount_options *opts; +}; + +/* + * Remove adjacent slashes and then the trailing slash, unless it is + * the only remaining character. + * + * E.g. "//dir1////dir2///" --> "/dir1/dir2", "///" --> "/". + */ +static void canonicalize_path(char *path) +{ + int i, j = 0; + + for (i = 0; path[i] != '\0'; i++) { + if (path[i] != '/' || j < 1 || path[j - 1] != '/') + path[j++] = path[i]; + } + + if (j > 1 && path[j - 1] == '/') + j--; + path[j] = '\0'; +} + +/* + * Parse the source parameter. Distinguish the server list from the path. + * + * The source will look like: + * <server_spec>[,<server_spec>...]:[<path>] + * where + * <server_spec> is <ip>[:<port>] + * <path> is optional, but if present must begin with '/' + */ +static int ceph_parse_source(struct fs_parameter *param, struct fs_context *fc) +{ + struct ceph_parse_opts_ctx *pctx = fc->fs_private; + struct ceph_mount_options *fsopt = pctx->opts; + char *dev_name = param->string, *dev_name_end; + int ret; + + dout("%s '%s'\n", __func__, dev_name); + if (!dev_name || !*dev_name) + return invalfc(fc, "Empty source"); + + dev_name_end = strchr(dev_name, '/'); + if (dev_name_end) { + /* + * The server_path will include the whole chars from userland + * including the leading '/'. + */ + kfree(fsopt->server_path); + fsopt->server_path = kstrdup(dev_name_end, GFP_KERNEL); + if (!fsopt->server_path) + return -ENOMEM; + + canonicalize_path(fsopt->server_path); + } else { + dev_name_end = dev_name + strlen(dev_name); + } + + dev_name_end--; /* back up to ':' separator */ + if (dev_name_end < dev_name || *dev_name_end != ':') + return invalfc(fc, "No path or : separator in source"); + + dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name); + if (fsopt->server_path) + dout("server path '%s'\n", fsopt->server_path); + + ret = ceph_parse_mon_ips(param->string, dev_name_end - dev_name, + pctx->copts, fc->log.log); + if (ret) + return ret; + + fc->source = param->string; + param->string = NULL; + return 0; +} + +static int ceph_parse_mount_param(struct fs_context *fc, + struct fs_parameter *param) +{ + struct ceph_parse_opts_ctx *pctx = fc->fs_private; + struct ceph_mount_options *fsopt = pctx->opts; + struct fs_parse_result result; + unsigned int mode; + int token, ret; + + ret = ceph_parse_param(param, pctx->copts, fc->log.log); + if (ret != -ENOPARAM) + return ret; + + token = fs_parse(fc, ceph_mount_parameters, param, &result); + dout("%s fs_parse '%s' token %d\n", __func__, param->key, token); + if (token < 0) + return token; + + switch (token) { + case Opt_snapdirname: + kfree(fsopt->snapdir_name); + fsopt->snapdir_name = param->string; + param->string = NULL; + break; + case Opt_mds_namespace: + kfree(fsopt->mds_namespace); + fsopt->mds_namespace = param->string; + param->string = NULL; + break; + case Opt_recover_session: + mode = result.uint_32; + if (mode == ceph_recover_session_no) + fsopt->flags &= ~CEPH_MOUNT_OPT_CLEANRECOVER; + else if (mode == ceph_recover_session_clean) + fsopt->flags |= CEPH_MOUNT_OPT_CLEANRECOVER; + else + BUG(); + break; + case Opt_source: + if (fc->source) + return invalfc(fc, "Multiple sources specified"); + return ceph_parse_source(param, fc); + case Opt_wsize: + if (result.uint_32 < PAGE_SIZE || + result.uint_32 > CEPH_MAX_WRITE_SIZE) + goto out_of_range; + fsopt->wsize = ALIGN(result.uint_32, PAGE_SIZE); + break; + case Opt_rsize: + if (result.uint_32 < PAGE_SIZE || + result.uint_32 > CEPH_MAX_READ_SIZE) + goto out_of_range; + fsopt->rsize = ALIGN(result.uint_32, PAGE_SIZE); + break; + case Opt_rasize: + fsopt->rasize = ALIGN(result.uint_32, PAGE_SIZE); + break; + case Opt_caps_wanted_delay_min: + if (result.uint_32 < 1) + goto out_of_range; + fsopt->caps_wanted_delay_min = result.uint_32; + break; + case Opt_caps_wanted_delay_max: + if (result.uint_32 < 1) + goto out_of_range; + fsopt->caps_wanted_delay_max = result.uint_32; + break; + case Opt_caps_max: + if (result.int_32 < 0) + goto out_of_range; + fsopt->caps_max = result.int_32; + break; + case Opt_readdir_max_entries: + if (result.uint_32 < 1) + goto out_of_range; + fsopt->max_readdir = result.uint_32; + break; + case Opt_readdir_max_bytes: + if (result.uint_32 < PAGE_SIZE && result.uint_32 != 0) + goto out_of_range; + fsopt->max_readdir_bytes = result.uint_32; + break; + case Opt_congestion_kb: + if (result.uint_32 < 1024) /* at least 1M */ + goto out_of_range; + fsopt->congestion_kb = result.uint_32; + break; + case Opt_dirstat: + if (!result.negated) + fsopt->flags |= CEPH_MOUNT_OPT_DIRSTAT; + else + fsopt->flags &= ~CEPH_MOUNT_OPT_DIRSTAT; + break; + case Opt_rbytes: + if (!result.negated) + fsopt->flags |= CEPH_MOUNT_OPT_RBYTES; + else + fsopt->flags &= ~CEPH_MOUNT_OPT_RBYTES; + break; + case Opt_asyncreaddir: + if (!result.negated) + fsopt->flags &= ~CEPH_MOUNT_OPT_NOASYNCREADDIR; + else + fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR; + break; + case Opt_dcache: + if (!result.negated) + fsopt->flags |= CEPH_MOUNT_OPT_DCACHE; + else + fsopt->flags &= ~CEPH_MOUNT_OPT_DCACHE; + break; + case Opt_ino32: + if (!result.negated) + fsopt->flags |= CEPH_MOUNT_OPT_INO32; + else + fsopt->flags &= ~CEPH_MOUNT_OPT_INO32; + break; + + case Opt_fscache: +#ifdef CONFIG_CEPH_FSCACHE + kfree(fsopt->fscache_uniq); + fsopt->fscache_uniq = NULL; + if (result.negated) { + fsopt->flags &= ~CEPH_MOUNT_OPT_FSCACHE; + } else { + fsopt->flags |= CEPH_MOUNT_OPT_FSCACHE; + fsopt->fscache_uniq = param->string; + param->string = NULL; + } + break; +#else + return invalfc(fc, "fscache support is disabled"); +#endif + case Opt_poolperm: + if (!result.negated) + fsopt->flags &= ~CEPH_MOUNT_OPT_NOPOOLPERM; + else + fsopt->flags |= CEPH_MOUNT_OPT_NOPOOLPERM; + break; + case Opt_require_active_mds: + if (!result.negated) + fsopt->flags &= ~CEPH_MOUNT_OPT_MOUNTWAIT; + else + fsopt->flags |= CEPH_MOUNT_OPT_MOUNTWAIT; + break; + case Opt_quotadf: + if (!result.negated) + fsopt->flags &= ~CEPH_MOUNT_OPT_NOQUOTADF; + else + fsopt->flags |= CEPH_MOUNT_OPT_NOQUOTADF; + break; + case Opt_copyfrom: + if (!result.negated) + fsopt->flags &= ~CEPH_MOUNT_OPT_NOCOPYFROM; + else + fsopt->flags |= CEPH_MOUNT_OPT_NOCOPYFROM; + break; + case Opt_acl: + if (!result.negated) { +#ifdef CONFIG_CEPH_FS_POSIX_ACL + fc->sb_flags |= SB_POSIXACL; +#else + return invalfc(fc, "POSIX ACL support is disabled"); +#endif + } else { + fc->sb_flags &= ~SB_POSIXACL; + } + break; + case Opt_wsync: + if (!result.negated) + fsopt->flags &= ~CEPH_MOUNT_OPT_ASYNC_DIROPS; + else + fsopt->flags |= CEPH_MOUNT_OPT_ASYNC_DIROPS; + break; + default: + BUG(); + } + return 0; + +out_of_range: + return invalfc(fc, "%s out of range", param->key); +} + +static void destroy_mount_options(struct ceph_mount_options *args) +{ + dout("destroy_mount_options %p\n", args); + if (!args) + return; + + kfree(args->snapdir_name); + kfree(args->mds_namespace); + kfree(args->server_path); + kfree(args->fscache_uniq); + kfree(args); +} + +static int strcmp_null(const char *s1, const char *s2) +{ + if (!s1 && !s2) + return 0; + if (s1 && !s2) + return -1; + if (!s1 && s2) + return 1; + return strcmp(s1, s2); +} + +static int compare_mount_options(struct ceph_mount_options *new_fsopt, + struct ceph_options *new_opt, + struct ceph_fs_client *fsc) +{ + struct ceph_mount_options *fsopt1 = new_fsopt; + struct ceph_mount_options *fsopt2 = fsc->mount_options; + int ofs = offsetof(struct ceph_mount_options, snapdir_name); + int ret; + + ret = memcmp(fsopt1, fsopt2, ofs); + if (ret) + return ret; + + ret = strcmp_null(fsopt1->snapdir_name, fsopt2->snapdir_name); + if (ret) + return ret; + + ret = strcmp_null(fsopt1->mds_namespace, fsopt2->mds_namespace); + if (ret) + return ret; + + ret = strcmp_null(fsopt1->server_path, fsopt2->server_path); + if (ret) + return ret; + + ret = strcmp_null(fsopt1->fscache_uniq, fsopt2->fscache_uniq); + if (ret) + return ret; + + return ceph_compare_options(new_opt, fsc->client); +} + +/** + * ceph_show_options - Show mount options in /proc/mounts + * @m: seq_file to write to + * @root: root of that (sub)tree + */ +static int ceph_show_options(struct seq_file *m, struct dentry *root) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(root->d_sb); + struct ceph_mount_options *fsopt = fsc->mount_options; + size_t pos; + int ret; + + /* a comma between MNT/MS and client options */ + seq_putc(m, ','); + pos = m->count; + + ret = ceph_print_client_options(m, fsc->client, false); + if (ret) + return ret; + + /* retract our comma if no client options */ + if (m->count == pos) + m->count--; + + if (fsopt->flags & CEPH_MOUNT_OPT_DIRSTAT) + seq_puts(m, ",dirstat"); + if ((fsopt->flags & CEPH_MOUNT_OPT_RBYTES)) + seq_puts(m, ",rbytes"); + if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR) + seq_puts(m, ",noasyncreaddir"); + if ((fsopt->flags & CEPH_MOUNT_OPT_DCACHE) == 0) + seq_puts(m, ",nodcache"); + if (fsopt->flags & CEPH_MOUNT_OPT_INO32) + seq_puts(m, ",ino32"); + if (fsopt->flags & CEPH_MOUNT_OPT_FSCACHE) { + seq_show_option(m, "fsc", fsopt->fscache_uniq); + } + if (fsopt->flags & CEPH_MOUNT_OPT_NOPOOLPERM) + seq_puts(m, ",nopoolperm"); + if (fsopt->flags & CEPH_MOUNT_OPT_NOQUOTADF) + seq_puts(m, ",noquotadf"); + +#ifdef CONFIG_CEPH_FS_POSIX_ACL + if (root->d_sb->s_flags & SB_POSIXACL) + seq_puts(m, ",acl"); + else + seq_puts(m, ",noacl"); +#endif + + if ((fsopt->flags & CEPH_MOUNT_OPT_NOCOPYFROM) == 0) + seq_puts(m, ",copyfrom"); + + if (fsopt->mds_namespace) + seq_show_option(m, "mds_namespace", fsopt->mds_namespace); + + if (fsopt->flags & CEPH_MOUNT_OPT_CLEANRECOVER) + seq_show_option(m, "recover_session", "clean"); + + if (fsopt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS) + seq_puts(m, ",nowsync"); + + if (fsopt->wsize != CEPH_MAX_WRITE_SIZE) + seq_printf(m, ",wsize=%u", fsopt->wsize); + if (fsopt->rsize != CEPH_MAX_READ_SIZE) + seq_printf(m, ",rsize=%u", fsopt->rsize); + if (fsopt->rasize != CEPH_RASIZE_DEFAULT) + seq_printf(m, ",rasize=%u", fsopt->rasize); + if (fsopt->congestion_kb != default_congestion_kb()) + seq_printf(m, ",write_congestion_kb=%u", fsopt->congestion_kb); + if (fsopt->caps_max) + seq_printf(m, ",caps_max=%d", fsopt->caps_max); + if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT) + seq_printf(m, ",caps_wanted_delay_min=%u", + fsopt->caps_wanted_delay_min); + if (fsopt->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT) + seq_printf(m, ",caps_wanted_delay_max=%u", + fsopt->caps_wanted_delay_max); + if (fsopt->max_readdir != CEPH_MAX_READDIR_DEFAULT) + seq_printf(m, ",readdir_max_entries=%u", fsopt->max_readdir); + if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT) + seq_printf(m, ",readdir_max_bytes=%u", fsopt->max_readdir_bytes); + if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT)) + seq_show_option(m, "snapdirname", fsopt->snapdir_name); + + return 0; +} + +/* + * handle any mon messages the standard library doesn't understand. + * return error if we don't either. + */ +static int extra_mon_dispatch(struct ceph_client *client, struct ceph_msg *msg) +{ + struct ceph_fs_client *fsc = client->private; + int type = le16_to_cpu(msg->hdr.type); + + switch (type) { + case CEPH_MSG_MDS_MAP: + ceph_mdsc_handle_mdsmap(fsc->mdsc, msg); + return 0; + case CEPH_MSG_FS_MAP_USER: + ceph_mdsc_handle_fsmap(fsc->mdsc, msg); + return 0; + default: + return -1; + } +} + +/* + * create a new fs client + * + * Success or not, this function consumes @fsopt and @opt. + */ +static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, + struct ceph_options *opt) +{ + struct ceph_fs_client *fsc; + int err; + + fsc = kzalloc(sizeof(*fsc), GFP_KERNEL); + if (!fsc) { + err = -ENOMEM; + goto fail; + } + + fsc->client = ceph_create_client(opt, fsc); + if (IS_ERR(fsc->client)) { + err = PTR_ERR(fsc->client); + goto fail; + } + opt = NULL; /* fsc->client now owns this */ + + fsc->client->extra_mon_dispatch = extra_mon_dispatch; + ceph_set_opt(fsc->client, ABORT_ON_FULL); + + if (!fsopt->mds_namespace) { + ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP, + 0, true); + } else { + ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_FSMAP, + 0, false); + } + + fsc->mount_options = fsopt; + + fsc->sb = NULL; + fsc->mount_state = CEPH_MOUNT_MOUNTING; + fsc->filp_gen = 1; + fsc->have_copy_from2 = true; + + atomic_long_set(&fsc->writeback_count, 0); + + err = -ENOMEM; + /* + * The number of concurrent works can be high but they don't need + * to be processed in parallel, limit concurrency. + */ + fsc->inode_wq = alloc_workqueue("ceph-inode", WQ_UNBOUND, 0); + if (!fsc->inode_wq) + goto fail_client; + fsc->cap_wq = alloc_workqueue("ceph-cap", 0, 1); + if (!fsc->cap_wq) + goto fail_inode_wq; + + spin_lock(&ceph_fsc_lock); + list_add_tail(&fsc->metric_wakeup, &ceph_fsc_list); + spin_unlock(&ceph_fsc_lock); + + return fsc; + +fail_inode_wq: + destroy_workqueue(fsc->inode_wq); +fail_client: + ceph_destroy_client(fsc->client); +fail: + kfree(fsc); + if (opt) + ceph_destroy_options(opt); + destroy_mount_options(fsopt); + return ERR_PTR(err); +} + +static void flush_fs_workqueues(struct ceph_fs_client *fsc) +{ + flush_workqueue(fsc->inode_wq); + flush_workqueue(fsc->cap_wq); +} + +static void destroy_fs_client(struct ceph_fs_client *fsc) +{ + dout("destroy_fs_client %p\n", fsc); + + spin_lock(&ceph_fsc_lock); + list_del(&fsc->metric_wakeup); + spin_unlock(&ceph_fsc_lock); + + ceph_mdsc_destroy(fsc); + destroy_workqueue(fsc->inode_wq); + destroy_workqueue(fsc->cap_wq); + + destroy_mount_options(fsc->mount_options); + + ceph_destroy_client(fsc->client); + + kfree(fsc); + dout("destroy_fs_client %p done\n", fsc); +} + +/* + * caches + */ +struct kmem_cache *ceph_inode_cachep; +struct kmem_cache *ceph_cap_cachep; +struct kmem_cache *ceph_cap_flush_cachep; +struct kmem_cache *ceph_dentry_cachep; +struct kmem_cache *ceph_file_cachep; +struct kmem_cache *ceph_dir_file_cachep; +struct kmem_cache *ceph_mds_request_cachep; +mempool_t *ceph_wb_pagevec_pool; + +static void ceph_inode_init_once(void *foo) +{ + struct ceph_inode_info *ci = foo; + inode_init_once(&ci->vfs_inode); +} + +static int __init init_caches(void) +{ + int error = -ENOMEM; + + ceph_inode_cachep = kmem_cache_create("ceph_inode_info", + sizeof(struct ceph_inode_info), + __alignof__(struct ceph_inode_info), + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD| + SLAB_ACCOUNT, ceph_inode_init_once); + if (!ceph_inode_cachep) + return -ENOMEM; + + ceph_cap_cachep = KMEM_CACHE(ceph_cap, SLAB_MEM_SPREAD); + if (!ceph_cap_cachep) + goto bad_cap; + ceph_cap_flush_cachep = KMEM_CACHE(ceph_cap_flush, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); + if (!ceph_cap_flush_cachep) + goto bad_cap_flush; + + ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); + if (!ceph_dentry_cachep) + goto bad_dentry; + + ceph_file_cachep = KMEM_CACHE(ceph_file_info, SLAB_MEM_SPREAD); + if (!ceph_file_cachep) + goto bad_file; + + ceph_dir_file_cachep = KMEM_CACHE(ceph_dir_file_info, SLAB_MEM_SPREAD); + if (!ceph_dir_file_cachep) + goto bad_dir_file; + + ceph_mds_request_cachep = KMEM_CACHE(ceph_mds_request, SLAB_MEM_SPREAD); + if (!ceph_mds_request_cachep) + goto bad_mds_req; + + ceph_wb_pagevec_pool = mempool_create_kmalloc_pool(10, CEPH_MAX_WRITE_SIZE >> PAGE_SHIFT); + if (!ceph_wb_pagevec_pool) + goto bad_pagevec_pool; + + error = ceph_fscache_register(); + if (error) + goto bad_fscache; + + return 0; + +bad_fscache: + kmem_cache_destroy(ceph_mds_request_cachep); +bad_pagevec_pool: + mempool_destroy(ceph_wb_pagevec_pool); +bad_mds_req: + kmem_cache_destroy(ceph_dir_file_cachep); +bad_dir_file: + kmem_cache_destroy(ceph_file_cachep); +bad_file: + kmem_cache_destroy(ceph_dentry_cachep); +bad_dentry: + kmem_cache_destroy(ceph_cap_flush_cachep); +bad_cap_flush: + kmem_cache_destroy(ceph_cap_cachep); +bad_cap: + kmem_cache_destroy(ceph_inode_cachep); + return error; +} + +static void destroy_caches(void) +{ + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); + + kmem_cache_destroy(ceph_inode_cachep); + kmem_cache_destroy(ceph_cap_cachep); + kmem_cache_destroy(ceph_cap_flush_cachep); + kmem_cache_destroy(ceph_dentry_cachep); + kmem_cache_destroy(ceph_file_cachep); + kmem_cache_destroy(ceph_dir_file_cachep); + kmem_cache_destroy(ceph_mds_request_cachep); + mempool_destroy(ceph_wb_pagevec_pool); + + ceph_fscache_unregister(); +} + +/* + * ceph_umount_begin - initiate forced umount. Tear down the + * mount, skipping steps that may hang while waiting for server(s). + */ +static void ceph_umount_begin(struct super_block *sb) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(sb); + + dout("ceph_umount_begin - starting forced umount\n"); + if (!fsc) + return; + fsc->mount_state = CEPH_MOUNT_SHUTDOWN; + ceph_osdc_abort_requests(&fsc->client->osdc, -EIO); + ceph_mdsc_force_umount(fsc->mdsc); + fsc->filp_gen++; // invalidate open files +} + +static const struct super_operations ceph_super_ops = { + .alloc_inode = ceph_alloc_inode, + .free_inode = ceph_free_inode, + .write_inode = ceph_write_inode, + .drop_inode = generic_delete_inode, + .evict_inode = ceph_evict_inode, + .sync_fs = ceph_sync_fs, + .put_super = ceph_put_super, + .show_options = ceph_show_options, + .statfs = ceph_statfs, + .umount_begin = ceph_umount_begin, +}; + +/* + * Bootstrap mount by opening the root directory. Note the mount + * @started time from caller, and time out if this takes too long. + */ +static struct dentry *open_root_dentry(struct ceph_fs_client *fsc, + const char *path, + unsigned long started) +{ + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_request *req = NULL; + int err; + struct dentry *root; + + /* open dir */ + dout("open_root_inode opening '%s'\n", path); + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS); + if (IS_ERR(req)) + return ERR_CAST(req); + req->r_path1 = kstrdup(path, GFP_NOFS); + if (!req->r_path1) { + root = ERR_PTR(-ENOMEM); + goto out; + } + + req->r_ino1.ino = CEPH_INO_ROOT; + req->r_ino1.snap = CEPH_NOSNAP; + req->r_started = started; + req->r_timeout = fsc->client->options->mount_timeout; + req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE); + req->r_num_caps = 2; + err = ceph_mdsc_do_request(mdsc, NULL, req); + if (err == 0) { + struct inode *inode = req->r_target_inode; + req->r_target_inode = NULL; + dout("open_root_inode success\n"); + root = d_make_root(inode); + if (!root) { + root = ERR_PTR(-ENOMEM); + goto out; + } + dout("open_root_inode success, root dentry is %p\n", root); + } else { + root = ERR_PTR(err); + } +out: + ceph_mdsc_put_request(req); + return root; +} + +/* + * mount: join the ceph cluster, and open root directory. + */ +static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc, + struct fs_context *fc) +{ + int err; + unsigned long started = jiffies; /* note the start time */ + struct dentry *root; + + dout("mount start %p\n", fsc); + mutex_lock(&fsc->client->mount_mutex); + + if (!fsc->sb->s_root) { + const char *path = fsc->mount_options->server_path ? + fsc->mount_options->server_path + 1 : ""; + + err = __ceph_open_session(fsc->client, started); + if (err < 0) + goto out; + + /* setup fscache */ + if (fsc->mount_options->flags & CEPH_MOUNT_OPT_FSCACHE) { + err = ceph_fscache_register_fs(fsc, fc); + if (err < 0) + goto out; + } + + dout("mount opening path '%s'\n", path); + + ceph_fs_debugfs_init(fsc); + + root = open_root_dentry(fsc, path, started); + if (IS_ERR(root)) { + err = PTR_ERR(root); + goto out; + } + fsc->sb->s_root = dget(root); + } else { + root = dget(fsc->sb->s_root); + } + + fsc->mount_state = CEPH_MOUNT_MOUNTED; + dout("mount success\n"); + mutex_unlock(&fsc->client->mount_mutex); + return root; + +out: + mutex_unlock(&fsc->client->mount_mutex); + return ERR_PTR(err); +} + +static int ceph_set_super(struct super_block *s, struct fs_context *fc) +{ + struct ceph_fs_client *fsc = s->s_fs_info; + int ret; + + dout("set_super %p\n", s); + + s->s_maxbytes = MAX_LFS_FILESIZE; + + s->s_xattr = ceph_xattr_handlers; + fsc->sb = s; + fsc->max_file_size = 1ULL << 40; /* temp value until we get mdsmap */ + + s->s_op = &ceph_super_ops; + s->s_d_op = &ceph_dentry_ops; + s->s_export_op = &ceph_export_ops; + + s->s_time_gran = 1; + s->s_time_min = 0; + s->s_time_max = U32_MAX; + + ret = set_anon_super_fc(s, fc); + if (ret != 0) + fsc->sb = NULL; + return ret; +} + +/* + * share superblock if same fs AND options + */ +static int ceph_compare_super(struct super_block *sb, struct fs_context *fc) +{ + struct ceph_fs_client *new = fc->s_fs_info; + struct ceph_mount_options *fsopt = new->mount_options; + struct ceph_options *opt = new->client->options; + struct ceph_fs_client *fsc = ceph_sb_to_client(sb); + + dout("ceph_compare_super %p\n", sb); + + if (compare_mount_options(fsopt, opt, fsc)) { + dout("monitor(s)/mount options don't match\n"); + return 0; + } + if ((opt->flags & CEPH_OPT_FSID) && + ceph_fsid_compare(&opt->fsid, &fsc->client->fsid)) { + dout("fsid doesn't match\n"); + return 0; + } + if (fc->sb_flags != (sb->s_flags & ~SB_BORN)) { + dout("flags differ\n"); + return 0; + } + + if (fsc->blocklisted && !ceph_test_mount_opt(fsc, CLEANRECOVER)) { + dout("client is blocklisted (and CLEANRECOVER is not set)\n"); + return 0; + } + + if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) { + dout("client has been forcibly unmounted\n"); + return 0; + } + + return 1; +} + +/* + * construct our own bdi so we can control readahead, etc. + */ +static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); + +static int ceph_setup_bdi(struct super_block *sb, struct ceph_fs_client *fsc) +{ + int err; + + err = super_setup_bdi_name(sb, "ceph-%ld", + atomic_long_inc_return(&bdi_seq)); + if (err) + return err; + + /* set ra_pages based on rasize mount option? */ + sb->s_bdi->ra_pages = fsc->mount_options->rasize >> PAGE_SHIFT; + + /* set io_pages based on max osd read size */ + sb->s_bdi->io_pages = fsc->mount_options->rsize >> PAGE_SHIFT; + + return 0; +} + +static int ceph_get_tree(struct fs_context *fc) +{ + struct ceph_parse_opts_ctx *pctx = fc->fs_private; + struct super_block *sb; + struct ceph_fs_client *fsc; + struct dentry *res; + int (*compare_super)(struct super_block *, struct fs_context *) = + ceph_compare_super; + int err; + + dout("ceph_get_tree\n"); + + if (!fc->source) + return invalfc(fc, "No source"); + + /* create client (which we may/may not use) */ + fsc = create_fs_client(pctx->opts, pctx->copts); + pctx->opts = NULL; + pctx->copts = NULL; + if (IS_ERR(fsc)) { + err = PTR_ERR(fsc); + goto out_final; + } + + err = ceph_mdsc_init(fsc); + if (err < 0) + goto out; + + if (ceph_test_opt(fsc->client, NOSHARE)) + compare_super = NULL; + + fc->s_fs_info = fsc; + sb = sget_fc(fc, compare_super, ceph_set_super); + fc->s_fs_info = NULL; + if (IS_ERR(sb)) { + err = PTR_ERR(sb); + goto out; + } + + if (ceph_sb_to_client(sb) != fsc) { + destroy_fs_client(fsc); + fsc = ceph_sb_to_client(sb); + dout("get_sb got existing client %p\n", fsc); + } else { + dout("get_sb using new client %p\n", fsc); + err = ceph_setup_bdi(sb, fsc); + if (err < 0) + goto out_splat; + } + + res = ceph_real_mount(fsc, fc); + if (IS_ERR(res)) { + err = PTR_ERR(res); + goto out_splat; + } + dout("root %p inode %p ino %llx.%llx\n", res, + d_inode(res), ceph_vinop(d_inode(res))); + fc->root = fsc->sb->s_root; + return 0; + +out_splat: + if (!ceph_mdsmap_is_cluster_available(fsc->mdsc->mdsmap)) { + pr_info("No mds server is up or the cluster is laggy\n"); + err = -EHOSTUNREACH; + } + + ceph_mdsc_close_sessions(fsc->mdsc); + deactivate_locked_super(sb); + goto out_final; + +out: + destroy_fs_client(fsc); +out_final: + dout("ceph_get_tree fail %d\n", err); + return err; +} + +static void ceph_free_fc(struct fs_context *fc) +{ + struct ceph_parse_opts_ctx *pctx = fc->fs_private; + + if (pctx) { + destroy_mount_options(pctx->opts); + ceph_destroy_options(pctx->copts); + kfree(pctx); + } +} + +static int ceph_reconfigure_fc(struct fs_context *fc) +{ + struct ceph_parse_opts_ctx *pctx = fc->fs_private; + struct ceph_mount_options *fsopt = pctx->opts; + struct ceph_fs_client *fsc = ceph_sb_to_client(fc->root->d_sb); + + if (fsopt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS) + ceph_set_mount_opt(fsc, ASYNC_DIROPS); + else + ceph_clear_mount_opt(fsc, ASYNC_DIROPS); + + sync_filesystem(fc->root->d_sb); + return 0; +} + +static const struct fs_context_operations ceph_context_ops = { + .free = ceph_free_fc, + .parse_param = ceph_parse_mount_param, + .get_tree = ceph_get_tree, + .reconfigure = ceph_reconfigure_fc, +}; + +/* + * Set up the filesystem mount context. + */ +static int ceph_init_fs_context(struct fs_context *fc) +{ + struct ceph_parse_opts_ctx *pctx; + struct ceph_mount_options *fsopt; + + pctx = kzalloc(sizeof(*pctx), GFP_KERNEL); + if (!pctx) + return -ENOMEM; + + pctx->copts = ceph_alloc_options(); + if (!pctx->copts) + goto nomem; + + pctx->opts = kzalloc(sizeof(*pctx->opts), GFP_KERNEL); + if (!pctx->opts) + goto nomem; + + fsopt = pctx->opts; + fsopt->flags = CEPH_MOUNT_OPT_DEFAULT; + + fsopt->wsize = CEPH_MAX_WRITE_SIZE; + fsopt->rsize = CEPH_MAX_READ_SIZE; + fsopt->rasize = CEPH_RASIZE_DEFAULT; + fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); + if (!fsopt->snapdir_name) + goto nomem; + + fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT; + fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; + fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT; + fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; + fsopt->congestion_kb = default_congestion_kb(); + +#ifdef CONFIG_CEPH_FS_POSIX_ACL + fc->sb_flags |= SB_POSIXACL; +#endif + + fc->fs_private = pctx; + fc->ops = &ceph_context_ops; + return 0; + +nomem: + destroy_mount_options(pctx->opts); + ceph_destroy_options(pctx->copts); + kfree(pctx); + return -ENOMEM; +} + +static void ceph_kill_sb(struct super_block *s) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(s); + + dout("kill_sb %p\n", s); + + ceph_mdsc_pre_umount(fsc->mdsc); + flush_fs_workqueues(fsc); + + /* + * Though the kill_anon_super() will finally trigger the + * sync_filesystem() anyway, we still need to do it here + * and then bump the stage of shutdown to stop the work + * queue as earlier as possible. + */ + sync_filesystem(s); + + fsc->mdsc->stopping = CEPH_MDSC_STOPPING_FLUSHED; + + kill_anon_super(s); + + fsc->client->extra_mon_dispatch = NULL; + ceph_fs_debugfs_cleanup(fsc); + + ceph_fscache_unregister_fs(fsc); + + destroy_fs_client(fsc); +} + +static struct file_system_type ceph_fs_type = { + .owner = THIS_MODULE, + .name = "ceph", + .init_fs_context = ceph_init_fs_context, + .kill_sb = ceph_kill_sb, + .fs_flags = FS_RENAME_DOES_D_MOVE, +}; +MODULE_ALIAS_FS("ceph"); + +int ceph_force_reconnect(struct super_block *sb) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(sb); + int err = 0; + + ceph_umount_begin(sb); + + /* Make sure all page caches get invalidated. + * see remove_session_caps_cb() */ + flush_workqueue(fsc->inode_wq); + + /* In case that we were blocklisted. This also reset + * all mon/osd connections */ + ceph_reset_client_addr(fsc->client); + + ceph_osdc_clear_abort_err(&fsc->client->osdc); + + fsc->blocklisted = false; + fsc->mount_state = CEPH_MOUNT_MOUNTED; + + if (sb->s_root) { + err = __ceph_do_getattr(d_inode(sb->s_root), NULL, + CEPH_STAT_CAP_INODE, true); + } + return err; +} + +static int __init init_ceph(void) +{ + int ret = init_caches(); + if (ret) + goto out; + + ceph_flock_init(); + ret = register_filesystem(&ceph_fs_type); + if (ret) + goto out_caches; + + pr_info("loaded (mds proto %d)\n", CEPH_MDSC_PROTOCOL); + + return 0; + +out_caches: + destroy_caches(); +out: + return ret; +} + +static void __exit exit_ceph(void) +{ + dout("exit_ceph\n"); + unregister_filesystem(&ceph_fs_type); + destroy_caches(); +} + +static int param_set_metrics(const char *val, const struct kernel_param *kp) +{ + struct ceph_fs_client *fsc; + int ret; + + ret = param_set_bool(val, kp); + if (ret) { + pr_err("Failed to parse sending metrics switch value '%s'\n", + val); + return ret; + } else if (!disable_send_metrics) { + // wake up all the mds clients + spin_lock(&ceph_fsc_lock); + list_for_each_entry(fsc, &ceph_fsc_list, metric_wakeup) { + metric_schedule_delayed(&fsc->mdsc->metric); + } + spin_unlock(&ceph_fsc_lock); + } + + return 0; +} + +static const struct kernel_param_ops param_ops_metrics = { + .set = param_set_metrics, + .get = param_get_bool, +}; + +bool disable_send_metrics = false; +module_param_cb(disable_send_metrics, ¶m_ops_metrics, &disable_send_metrics, 0644); +MODULE_PARM_DESC(disable_send_metrics, "Enable sending perf metrics to ceph cluster (default: on)"); + +module_init(init_ceph); +module_exit(exit_ceph); + +MODULE_AUTHOR("Sage Weil <sage@newdream.net>"); +MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>"); +MODULE_AUTHOR("Patience Warnick <patience@newdream.net>"); +MODULE_DESCRIPTION("Ceph filesystem for Linux"); +MODULE_LICENSE("GPL"); diff --git a/fs/ceph/super.h b/fs/ceph/super.h new file mode 100644 index 000000000..8716cb618 --- /dev/null +++ b/fs/ceph/super.h @@ -0,0 +1,1262 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _FS_CEPH_SUPER_H +#define _FS_CEPH_SUPER_H + +#include <linux/ceph/ceph_debug.h> + +#include <asm/unaligned.h> +#include <linux/backing-dev.h> +#include <linux/completion.h> +#include <linux/exportfs.h> +#include <linux/fs.h> +#include <linux/mempool.h> +#include <linux/pagemap.h> +#include <linux/wait.h> +#include <linux/writeback.h> +#include <linux/slab.h> +#include <linux/posix_acl.h> +#include <linux/refcount.h> +#include <linux/security.h> + +#include <linux/ceph/libceph.h> + +#ifdef CONFIG_CEPH_FSCACHE +#include <linux/fscache.h> +#endif + +/* f_type in struct statfs */ +#define CEPH_SUPER_MAGIC 0x00c36400 + +/* large granularity for statfs utilization stats to facilitate + * large volume sizes on 32-bit machines. */ +#define CEPH_BLOCK_SHIFT 22 /* 4 MB */ +#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT) + +#define CEPH_MOUNT_OPT_CLEANRECOVER (1<<1) /* auto reonnect (clean mode) after blocklisted */ +#define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */ +#define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */ +#define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */ +#define CEPH_MOUNT_OPT_INO32 (1<<8) /* 32 bit inos */ +#define CEPH_MOUNT_OPT_DCACHE (1<<9) /* use dcache for readdir etc */ +#define CEPH_MOUNT_OPT_FSCACHE (1<<10) /* use fscache */ +#define CEPH_MOUNT_OPT_NOPOOLPERM (1<<11) /* no pool permission check */ +#define CEPH_MOUNT_OPT_MOUNTWAIT (1<<12) /* mount waits if no mds is up */ +#define CEPH_MOUNT_OPT_NOQUOTADF (1<<13) /* no root dir quota in statfs */ +#define CEPH_MOUNT_OPT_NOCOPYFROM (1<<14) /* don't use RADOS 'copy-from' op */ +#define CEPH_MOUNT_OPT_ASYNC_DIROPS (1<<15) /* allow async directory ops */ + +#define CEPH_MOUNT_OPT_DEFAULT \ + (CEPH_MOUNT_OPT_DCACHE | \ + CEPH_MOUNT_OPT_NOCOPYFROM) + +#define ceph_set_mount_opt(fsc, opt) \ + (fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt +#define ceph_clear_mount_opt(fsc, opt) \ + (fsc)->mount_options->flags &= ~CEPH_MOUNT_OPT_##opt +#define ceph_test_mount_opt(fsc, opt) \ + (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt)) + +/* max size of osd read request, limited by libceph */ +#define CEPH_MAX_READ_SIZE CEPH_MSG_MAX_DATA_LEN +/* osd has a configurable limitaion of max write size. + * CEPH_MSG_MAX_DATA_LEN should be small enough. */ +#define CEPH_MAX_WRITE_SIZE CEPH_MSG_MAX_DATA_LEN +#define CEPH_RASIZE_DEFAULT (8192*1024) /* max readahead */ +#define CEPH_MAX_READDIR_DEFAULT 1024 +#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024) +#define CEPH_SNAPDIRNAME_DEFAULT ".snap" + +/* + * Delay telling the MDS we no longer want caps, in case we reopen + * the file. Delay a minimum amount of time, even if we send a cap + * message for some other reason. Otherwise, take the oppotunity to + * update the mds to avoid sending another message later. + */ +#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */ +#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */ + +struct ceph_mount_options { + unsigned int flags; + + unsigned int wsize; /* max write size */ + unsigned int rsize; /* max read size */ + unsigned int rasize; /* max readahead */ + unsigned int congestion_kb; /* max writeback in flight */ + unsigned int caps_wanted_delay_min, caps_wanted_delay_max; + int caps_max; + unsigned int max_readdir; /* max readdir result (entries) */ + unsigned int max_readdir_bytes; /* max readdir result (bytes) */ + + /* + * everything above this point can be memcmp'd; everything below + * is handled in compare_mount_options() + */ + + char *snapdir_name; /* default ".snap" */ + char *mds_namespace; /* default NULL */ + char *server_path; /* default NULL (means "/") */ + char *fscache_uniq; /* default NULL */ +}; + +struct ceph_fs_client { + struct super_block *sb; + + struct list_head metric_wakeup; + + struct ceph_mount_options *mount_options; + struct ceph_client *client; + + unsigned long mount_state; + + unsigned long last_auto_reconnect; + bool blocklisted; + + bool have_copy_from2; + + u32 filp_gen; + loff_t max_file_size; + + struct ceph_mds_client *mdsc; + + atomic_long_t writeback_count; + + struct workqueue_struct *inode_wq; + struct workqueue_struct *cap_wq; + +#ifdef CONFIG_DEBUG_FS + struct dentry *debugfs_dentry_lru, *debugfs_caps; + struct dentry *debugfs_congestion_kb; + struct dentry *debugfs_bdi; + struct dentry *debugfs_mdsc, *debugfs_mdsmap; + struct dentry *debugfs_metric; + struct dentry *debugfs_mds_sessions; +#endif + +#ifdef CONFIG_CEPH_FSCACHE + struct fscache_cookie *fscache; +#endif +}; + + +/* + * File i/o capability. This tracks shared state with the metadata + * server that allows us to cache or writeback attributes or to read + * and write data. For any given inode, we should have one or more + * capabilities, one issued by each metadata server, and our + * cumulative access is the OR of all issued capabilities. + * + * Each cap is referenced by the inode's i_caps rbtree and by per-mds + * session capability lists. + */ +struct ceph_cap { + struct ceph_inode_info *ci; + struct rb_node ci_node; /* per-ci cap tree */ + struct ceph_mds_session *session; + struct list_head session_caps; /* per-session caplist */ + u64 cap_id; /* unique cap id (mds provided) */ + union { + /* in-use caps */ + struct { + int issued; /* latest, from the mds */ + int implemented; /* implemented superset of + issued (for revocation) */ + int mds; /* mds index for this cap */ + int mds_wanted; /* caps wanted from this mds */ + }; + /* caps to release */ + struct { + u64 cap_ino; + int queue_release; + }; + }; + u32 seq, issue_seq, mseq; + u32 cap_gen; /* active/stale cycle */ + unsigned long last_used; + struct list_head caps_item; +}; + +#define CHECK_CAPS_AUTHONLY 1 /* only check auth cap */ +#define CHECK_CAPS_FLUSH 2 /* flush any dirty caps */ +#define CHECK_CAPS_NOINVAL 4 /* don't invalidate pagecache */ + +struct ceph_cap_flush { + u64 tid; + int caps; + bool wake; /* wake up flush waiters when finish ? */ + bool is_capsnap; /* true means capsnap */ + struct list_head g_list; // global + struct list_head i_list; // per inode +}; + +/* + * Snapped cap state that is pending flush to mds. When a snapshot occurs, + * we first complete any in-process sync writes and writeback any dirty + * data before flushing the snapped state (tracked here) back to the MDS. + */ +struct ceph_cap_snap { + refcount_t nref; + struct list_head ci_item; + + struct ceph_cap_flush cap_flush; + + u64 follows; + int issued, dirty; + struct ceph_snap_context *context; + + umode_t mode; + kuid_t uid; + kgid_t gid; + + struct ceph_buffer *xattr_blob; + u64 xattr_version; + + u64 size; + u64 change_attr; + struct timespec64 mtime, atime, ctime, btime; + u64 time_warp_seq; + u64 truncate_size; + u32 truncate_seq; + int writing; /* a sync write is still in progress */ + int dirty_pages; /* dirty pages awaiting writeback */ + bool inline_data; + bool need_flush; +}; + +static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap) +{ + if (refcount_dec_and_test(&capsnap->nref)) { + if (capsnap->xattr_blob) + ceph_buffer_put(capsnap->xattr_blob); + kfree(capsnap); + } +} + +/* + * The frag tree describes how a directory is fragmented, potentially across + * multiple metadata servers. It is also used to indicate points where + * metadata authority is delegated, and whether/where metadata is replicated. + * + * A _leaf_ frag will be present in the i_fragtree IFF there is + * delegation info. That is, if mds >= 0 || ndist > 0. + */ +#define CEPH_MAX_DIRFRAG_REP 4 + +struct ceph_inode_frag { + struct rb_node node; + + /* fragtree state */ + u32 frag; + int split_by; /* i.e. 2^(split_by) children */ + + /* delegation and replication info */ + int mds; /* -1 if same authority as parent */ + int ndist; /* >0 if replicated */ + int dist[CEPH_MAX_DIRFRAG_REP]; +}; + +/* + * We cache inode xattrs as an encoded blob until they are first used, + * at which point we parse them into an rbtree. + */ +struct ceph_inode_xattr { + struct rb_node node; + + const char *name; + int name_len; + const char *val; + int val_len; + int dirty; + + int should_free_name; + int should_free_val; +}; + +/* + * Ceph dentry state + */ +struct ceph_dentry_info { + struct dentry *dentry; + struct ceph_mds_session *lease_session; + struct list_head lease_list; + unsigned flags; + int lease_shared_gen; + u32 lease_gen; + u32 lease_seq; + unsigned long lease_renew_after, lease_renew_from; + unsigned long time; + u64 offset; +}; + +#define CEPH_DENTRY_REFERENCED 1 +#define CEPH_DENTRY_LEASE_LIST 2 +#define CEPH_DENTRY_SHRINK_LIST 4 +#define CEPH_DENTRY_PRIMARY_LINK 8 + +struct ceph_inode_xattrs_info { + /* + * (still encoded) xattr blob. we avoid the overhead of parsing + * this until someone actually calls getxattr, etc. + * + * blob->vec.iov_len == 4 implies there are no xattrs; blob == + * NULL means we don't know. + */ + struct ceph_buffer *blob, *prealloc_blob; + + struct rb_root index; + bool dirty; + int count; + int names_size; + int vals_size; + u64 version, index_version; +}; + +/* + * Ceph inode. + */ +struct ceph_inode_info { + struct ceph_vino i_vino; /* ceph ino + snap */ + + spinlock_t i_ceph_lock; + + u64 i_version; + u64 i_inline_version; + u32 i_time_warp_seq; + + unsigned long i_ceph_flags; + atomic64_t i_release_count; + atomic64_t i_ordered_count; + atomic64_t i_complete_seq[2]; + + struct ceph_dir_layout i_dir_layout; + struct ceph_file_layout i_layout; + struct ceph_file_layout i_cached_layout; // for async creates + char *i_symlink; + + /* for dirs */ + struct timespec64 i_rctime; + u64 i_rbytes, i_rfiles, i_rsubdirs; + u64 i_files, i_subdirs; + + /* quotas */ + u64 i_max_bytes, i_max_files; + + s32 i_dir_pin; + + struct rb_root i_fragtree; + int i_fragtree_nsplits; + struct mutex i_fragtree_mutex; + + struct ceph_inode_xattrs_info i_xattrs; + + /* capabilities. protected _both_ by i_ceph_lock and cap->session's + * s_mutex. */ + struct rb_root i_caps; /* cap list */ + struct ceph_cap *i_auth_cap; /* authoritative cap, if any */ + unsigned i_dirty_caps, i_flushing_caps; /* mask of dirtied fields */ + + /* + * Link to the auth cap's session's s_cap_dirty list. s_cap_dirty + * is protected by the mdsc->cap_dirty_lock, but each individual item + * is also protected by the inode's i_ceph_lock. Walking s_cap_dirty + * requires the mdsc->cap_dirty_lock. List presence for an item can + * be tested under the i_ceph_lock. Changing anything requires both. + */ + struct list_head i_dirty_item; + + /* + * Link to session's s_cap_flushing list. Protected in a similar + * fashion to i_dirty_item, but also by the s_mutex for changes. The + * s_cap_flushing list can be walked while holding either the s_mutex + * or msdc->cap_dirty_lock. List presence can also be checked while + * holding the i_ceph_lock for this inode. + */ + struct list_head i_flushing_item; + + /* we need to track cap writeback on a per-cap-bit basis, to allow + * overlapping, pipelined cap flushes to the mds. we can probably + * reduce the tid to 8 bits if we're concerned about inode size. */ + struct ceph_cap_flush *i_prealloc_cap_flush; + struct list_head i_cap_flush_list; + wait_queue_head_t i_cap_wq; /* threads waiting on a capability */ + unsigned long i_hold_caps_max; /* jiffies */ + struct list_head i_cap_delay_list; /* for delayed cap release to mds */ + struct ceph_cap_reservation i_cap_migration_resv; + struct list_head i_cap_snaps; /* snapped state pending flush to mds */ + struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 or + dirty|flushing caps */ + unsigned i_snap_caps; /* cap bits for snapped files */ + + unsigned long i_last_rd; + unsigned long i_last_wr; + int i_nr_by_mode[CEPH_FILE_MODE_BITS]; /* open file counts */ + + struct mutex i_truncate_mutex; + u32 i_truncate_seq; /* last truncate to smaller size */ + u64 i_truncate_size; /* and the size we last truncated down to */ + int i_truncate_pending; /* still need to call vmtruncate */ + + u64 i_max_size; /* max file size authorized by mds */ + u64 i_reported_size; /* (max_)size reported to or requested of mds */ + u64 i_wanted_max_size; /* offset we'd like to write too */ + u64 i_requested_max_size; /* max_size we've requested */ + + /* held references to caps */ + int i_pin_ref; + int i_rd_ref, i_rdcache_ref, i_wr_ref, i_wb_ref, i_fx_ref; + int i_wrbuffer_ref, i_wrbuffer_ref_head; + atomic_t i_filelock_ref; + atomic_t i_shared_gen; /* increment each time we get FILE_SHARED */ + u32 i_rdcache_gen; /* incremented each time we get FILE_CACHE. */ + u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */ + + struct list_head i_unsafe_dirops; /* uncommitted mds dir ops */ + struct list_head i_unsafe_iops; /* uncommitted mds inode ops */ + spinlock_t i_unsafe_lock; + + union { + struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */ + struct ceph_snapid_map *i_snapid_map; /* snapid -> dev_t */ + }; + int i_snap_realm_counter; /* snap realm (if caps) */ + struct list_head i_snap_realm_item; + struct list_head i_snap_flush_item; + struct timespec64 i_btime; + struct timespec64 i_snap_btime; + + struct work_struct i_work; + unsigned long i_work_mask; + +#ifdef CONFIG_CEPH_FSCACHE + struct fscache_cookie *fscache; + u32 i_fscache_gen; +#endif + struct inode vfs_inode; /* at end */ +}; + +static inline struct ceph_inode_info * +ceph_inode(const struct inode *inode) +{ + return container_of(inode, struct ceph_inode_info, vfs_inode); +} + +static inline struct ceph_fs_client * +ceph_inode_to_client(const struct inode *inode) +{ + return (struct ceph_fs_client *)inode->i_sb->s_fs_info; +} + +static inline struct ceph_fs_client * +ceph_sb_to_client(const struct super_block *sb) +{ + return (struct ceph_fs_client *)sb->s_fs_info; +} + +static inline struct ceph_mds_client * +ceph_sb_to_mdsc(const struct super_block *sb) +{ + return (struct ceph_mds_client *)ceph_sb_to_client(sb)->mdsc; +} + +static inline struct ceph_vino +ceph_vino(const struct inode *inode) +{ + return ceph_inode(inode)->i_vino; +} + +static inline u32 ceph_ino_to_ino32(u64 vino) +{ + u32 ino = vino & 0xffffffff; + ino ^= vino >> 32; + if (!ino) + ino = 2; + return ino; +} + +/* + * Inode numbers in cephfs are 64 bits, but inode->i_ino is 32-bits on + * some arches. We generally do not use this value inside the ceph driver, but + * we do want to set it to something, so that generic vfs code has an + * appropriate value for tracepoints and the like. + */ +static inline ino_t ceph_vino_to_ino_t(struct ceph_vino vino) +{ + if (sizeof(ino_t) == sizeof(u32)) + return ceph_ino_to_ino32(vino.ino); + return (ino_t)vino.ino; +} + +/* for printf-style formatting */ +#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap + +static inline u64 ceph_ino(struct inode *inode) +{ + return ceph_inode(inode)->i_vino.ino; +} + +static inline u64 ceph_snap(struct inode *inode) +{ + return ceph_inode(inode)->i_vino.snap; +} + +/** + * ceph_present_ino - format an inode number for presentation to userland + * @sb: superblock where the inode lives + * @ino: inode number to (possibly) convert + * + * If the user mounted with the ino32 option, then the 64-bit value needs + * to be converted to something that can fit inside 32 bits. Note that + * internal kernel code never uses this value, so this is entirely for + * userland consumption. + */ +static inline u64 ceph_present_ino(struct super_block *sb, u64 ino) +{ + if (unlikely(ceph_test_mount_opt(ceph_sb_to_client(sb), INO32))) + return ceph_ino_to_ino32(ino); + return ino; +} + +static inline u64 ceph_present_inode(struct inode *inode) +{ + return ceph_present_ino(inode->i_sb, ceph_ino(inode)); +} + +static inline int ceph_ino_compare(struct inode *inode, void *data) +{ + struct ceph_vino *pvino = (struct ceph_vino *)data; + struct ceph_inode_info *ci = ceph_inode(inode); + return ci->i_vino.ino == pvino->ino && + ci->i_vino.snap == pvino->snap; +} + +/* + * The MDS reserves a set of inodes for its own usage. These should never + * be accessible by clients, and so the MDS has no reason to ever hand these + * out. The range is CEPH_MDS_INO_MDSDIR_OFFSET..CEPH_INO_SYSTEM_BASE. + * + * These come from src/mds/mdstypes.h in the ceph sources. + */ +#define CEPH_MAX_MDS 0x100 +#define CEPH_NUM_STRAY 10 +#define CEPH_MDS_INO_MDSDIR_OFFSET (1 * CEPH_MAX_MDS) +#define CEPH_INO_SYSTEM_BASE ((6*CEPH_MAX_MDS) + (CEPH_MAX_MDS * CEPH_NUM_STRAY)) + +static inline bool ceph_vino_is_reserved(const struct ceph_vino vino) +{ + if (vino.ino < CEPH_INO_SYSTEM_BASE && + vino.ino >= CEPH_MDS_INO_MDSDIR_OFFSET) { + WARN_RATELIMIT(1, "Attempt to access reserved inode number 0x%llx", vino.ino); + return true; + } + return false; +} + +static inline struct inode *ceph_find_inode(struct super_block *sb, + struct ceph_vino vino) +{ + if (ceph_vino_is_reserved(vino)) + return NULL; + + /* + * NB: The hashval will be run through the fs/inode.c hash function + * anyway, so there is no need to squash the inode number down to + * 32-bits first. Just use low-order bits on arches with 32-bit long. + */ + return ilookup5(sb, (unsigned long)vino.ino, ceph_ino_compare, &vino); +} + + +/* + * Ceph inode. + */ +#define CEPH_I_DIR_ORDERED (1 << 0) /* dentries in dir are ordered */ +#define CEPH_I_FLUSH (1 << 2) /* do not delay flush of dirty metadata */ +#define CEPH_I_POOL_PERM (1 << 3) /* pool rd/wr bits are valid */ +#define CEPH_I_POOL_RD (1 << 4) /* can read from pool */ +#define CEPH_I_POOL_WR (1 << 5) /* can write to pool */ +#define CEPH_I_SEC_INITED (1 << 6) /* security initialized */ +#define CEPH_I_KICK_FLUSH (1 << 7) /* kick flushing caps */ +#define CEPH_I_FLUSH_SNAPS (1 << 8) /* need flush snapss */ +#define CEPH_I_ERROR_WRITE (1 << 9) /* have seen write errors */ +#define CEPH_I_ERROR_FILELOCK (1 << 10) /* have seen file lock errors */ +#define CEPH_I_ODIRECT (1 << 11) /* inode in direct I/O mode */ +#define CEPH_ASYNC_CREATE_BIT (12) /* async create in flight for this */ +#define CEPH_I_ASYNC_CREATE (1 << CEPH_ASYNC_CREATE_BIT) + +/* + * Masks of ceph inode work. + */ +#define CEPH_I_WORK_WRITEBACK 0 /* writeback */ +#define CEPH_I_WORK_INVALIDATE_PAGES 1 /* invalidate pages */ +#define CEPH_I_WORK_VMTRUNCATE 2 /* vmtruncate */ + +/* + * We set the ERROR_WRITE bit when we start seeing write errors on an inode + * and then clear it when they start succeeding. Note that we do a lockless + * check first, and only take the lock if it looks like it needs to be changed. + * The write submission code just takes this as a hint, so we're not too + * worried if a few slip through in either direction. + */ +static inline void ceph_set_error_write(struct ceph_inode_info *ci) +{ + if (!(READ_ONCE(ci->i_ceph_flags) & CEPH_I_ERROR_WRITE)) { + spin_lock(&ci->i_ceph_lock); + ci->i_ceph_flags |= CEPH_I_ERROR_WRITE; + spin_unlock(&ci->i_ceph_lock); + } +} + +static inline void ceph_clear_error_write(struct ceph_inode_info *ci) +{ + if (READ_ONCE(ci->i_ceph_flags) & CEPH_I_ERROR_WRITE) { + spin_lock(&ci->i_ceph_lock); + ci->i_ceph_flags &= ~CEPH_I_ERROR_WRITE; + spin_unlock(&ci->i_ceph_lock); + } +} + +static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci, + long long release_count, + long long ordered_count) +{ + /* + * Makes sure operations that setup readdir cache (update page + * cache and i_size) are strongly ordered w.r.t. the following + * atomic64_set() operations. + */ + smp_mb(); + atomic64_set(&ci->i_complete_seq[0], release_count); + atomic64_set(&ci->i_complete_seq[1], ordered_count); +} + +static inline void __ceph_dir_clear_complete(struct ceph_inode_info *ci) +{ + atomic64_inc(&ci->i_release_count); +} + +static inline void __ceph_dir_clear_ordered(struct ceph_inode_info *ci) +{ + atomic64_inc(&ci->i_ordered_count); +} + +static inline bool __ceph_dir_is_complete(struct ceph_inode_info *ci) +{ + return atomic64_read(&ci->i_complete_seq[0]) == + atomic64_read(&ci->i_release_count); +} + +static inline bool __ceph_dir_is_complete_ordered(struct ceph_inode_info *ci) +{ + return atomic64_read(&ci->i_complete_seq[0]) == + atomic64_read(&ci->i_release_count) && + atomic64_read(&ci->i_complete_seq[1]) == + atomic64_read(&ci->i_ordered_count); +} + +static inline void ceph_dir_clear_complete(struct inode *inode) +{ + __ceph_dir_clear_complete(ceph_inode(inode)); +} + +static inline void ceph_dir_clear_ordered(struct inode *inode) +{ + __ceph_dir_clear_ordered(ceph_inode(inode)); +} + +static inline bool ceph_dir_is_complete_ordered(struct inode *inode) +{ + bool ret = __ceph_dir_is_complete_ordered(ceph_inode(inode)); + smp_rmb(); + return ret; +} + +/* find a specific frag @f */ +extern struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, + u32 f); + +/* + * choose fragment for value @v. copy frag content to pfrag, if leaf + * exists + */ +extern u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v, + struct ceph_inode_frag *pfrag, + int *found); + +static inline struct ceph_dentry_info *ceph_dentry(const struct dentry *dentry) +{ + return (struct ceph_dentry_info *)dentry->d_fsdata; +} + +/* + * caps helpers + */ +static inline bool __ceph_is_any_real_caps(struct ceph_inode_info *ci) +{ + return !RB_EMPTY_ROOT(&ci->i_caps); +} + +extern int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented); +extern int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int t); +extern int __ceph_caps_issued_mask_metric(struct ceph_inode_info *ci, int mask, + int t); +extern int __ceph_caps_issued_other(struct ceph_inode_info *ci, + struct ceph_cap *cap); + +static inline int ceph_caps_issued(struct ceph_inode_info *ci) +{ + int issued; + spin_lock(&ci->i_ceph_lock); + issued = __ceph_caps_issued(ci, NULL); + spin_unlock(&ci->i_ceph_lock); + return issued; +} + +static inline int ceph_caps_issued_mask_metric(struct ceph_inode_info *ci, + int mask, int touch) +{ + int r; + spin_lock(&ci->i_ceph_lock); + r = __ceph_caps_issued_mask_metric(ci, mask, touch); + spin_unlock(&ci->i_ceph_lock); + return r; +} + +static inline int __ceph_caps_dirty(struct ceph_inode_info *ci) +{ + return ci->i_dirty_caps | ci->i_flushing_caps; +} +extern struct ceph_cap_flush *ceph_alloc_cap_flush(void); +extern void ceph_free_cap_flush(struct ceph_cap_flush *cf); +extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask, + struct ceph_cap_flush **pcf); + +extern int __ceph_caps_revoking_other(struct ceph_inode_info *ci, + struct ceph_cap *ocap, int mask); +extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask); +extern int __ceph_caps_used(struct ceph_inode_info *ci); + +static inline bool __ceph_is_file_opened(struct ceph_inode_info *ci) +{ + return ci->i_nr_by_mode[0]; +} +extern int __ceph_caps_file_wanted(struct ceph_inode_info *ci); +extern int __ceph_caps_wanted(struct ceph_inode_info *ci); + +/* what the mds thinks we want */ +extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check); + +extern void ceph_caps_init(struct ceph_mds_client *mdsc); +extern void ceph_caps_finalize(struct ceph_mds_client *mdsc); +extern void ceph_adjust_caps_max_min(struct ceph_mds_client *mdsc, + struct ceph_mount_options *fsopt); +extern int ceph_reserve_caps(struct ceph_mds_client *mdsc, + struct ceph_cap_reservation *ctx, int need); +extern void ceph_unreserve_caps(struct ceph_mds_client *mdsc, + struct ceph_cap_reservation *ctx); +extern void ceph_reservation_status(struct ceph_fs_client *client, + int *total, int *avail, int *used, + int *reserved, int *min); + + + +/* + * we keep buffered readdir results attached to file->private_data + */ +#define CEPH_F_SYNC 1 +#define CEPH_F_ATEND 2 + +struct ceph_file_info { + short fmode; /* initialized on open */ + short flags; /* CEPH_F_* */ + + spinlock_t rw_contexts_lock; + struct list_head rw_contexts; + + u32 filp_gen; +}; + +struct ceph_dir_file_info { + struct ceph_file_info file_info; + + /* readdir: position within the dir */ + u32 frag; + struct ceph_mds_request *last_readdir; + + /* readdir: position within a frag */ + unsigned next_offset; /* offset of next chunk (last_name's + 1) */ + char *last_name; /* last entry in previous chunk */ + long long dir_release_count; + long long dir_ordered_count; + int readdir_cache_idx; + + /* used for -o dirstat read() on directory thing */ + char *dir_info; + int dir_info_len; +}; + +struct ceph_rw_context { + struct list_head list; + struct task_struct *thread; + int caps; +}; + +#define CEPH_DEFINE_RW_CONTEXT(_name, _caps) \ + struct ceph_rw_context _name = { \ + .thread = current, \ + .caps = _caps, \ + } + +static inline void ceph_add_rw_context(struct ceph_file_info *cf, + struct ceph_rw_context *ctx) +{ + spin_lock(&cf->rw_contexts_lock); + list_add(&ctx->list, &cf->rw_contexts); + spin_unlock(&cf->rw_contexts_lock); +} + +static inline void ceph_del_rw_context(struct ceph_file_info *cf, + struct ceph_rw_context *ctx) +{ + spin_lock(&cf->rw_contexts_lock); + list_del(&ctx->list); + spin_unlock(&cf->rw_contexts_lock); +} + +static inline struct ceph_rw_context* +ceph_find_rw_context(struct ceph_file_info *cf) +{ + struct ceph_rw_context *ctx, *found = NULL; + spin_lock(&cf->rw_contexts_lock); + list_for_each_entry(ctx, &cf->rw_contexts, list) { + if (ctx->thread == current) { + found = ctx; + break; + } + } + spin_unlock(&cf->rw_contexts_lock); + return found; +} + +struct ceph_readdir_cache_control { + struct page *page; + struct dentry **dentries; + int index; +}; + +/* + * A "snap realm" describes a subset of the file hierarchy sharing + * the same set of snapshots that apply to it. The realms themselves + * are organized into a hierarchy, such that children inherit (some of) + * the snapshots of their parents. + * + * All inodes within the realm that have capabilities are linked into a + * per-realm list. + */ +struct ceph_snap_realm { + u64 ino; + struct inode *inode; + atomic_t nref; + struct rb_node node; + + u64 created, seq; + u64 parent_ino; + u64 parent_since; /* snapid when our current parent became so */ + + u64 *prior_parent_snaps; /* snaps inherited from any parents we */ + u32 num_prior_parent_snaps; /* had prior to parent_since */ + u64 *snaps; /* snaps specific to this realm */ + u32 num_snaps; + + struct ceph_snap_realm *parent; + struct list_head children; /* list of child realms */ + struct list_head child_item; + + struct list_head empty_item; /* if i have ref==0 */ + + struct list_head dirty_item; /* if realm needs new context */ + + /* the current set of snaps for this realm */ + struct ceph_snap_context *cached_context; + + struct list_head inodes_with_caps; + spinlock_t inodes_with_caps_lock; +}; + +static inline int default_congestion_kb(void) +{ + int congestion_kb; + + /* + * Copied from NFS + * + * congestion size, scale with available memory. + * + * 64MB: 8192k + * 128MB: 11585k + * 256MB: 16384k + * 512MB: 23170k + * 1GB: 32768k + * 2GB: 46340k + * 4GB: 65536k + * 8GB: 92681k + * 16GB: 131072k + * + * This allows larger machines to have larger/more transfers. + * Limit the default to 256M + */ + congestion_kb = (16*int_sqrt(totalram_pages())) << (PAGE_SHIFT-10); + if (congestion_kb > 256*1024) + congestion_kb = 256*1024; + + return congestion_kb; +} + + +/* super.c */ +extern int ceph_force_reconnect(struct super_block *sb); +/* snap.c */ +struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc, + u64 ino); +extern void ceph_get_snap_realm(struct ceph_mds_client *mdsc, + struct ceph_snap_realm *realm); +extern void ceph_put_snap_realm(struct ceph_mds_client *mdsc, + struct ceph_snap_realm *realm); +extern int ceph_update_snap_trace(struct ceph_mds_client *m, + void *p, void *e, bool deletion, + struct ceph_snap_realm **realm_ret); +extern void ceph_handle_snap(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + struct ceph_msg *msg); +extern void ceph_queue_cap_snap(struct ceph_inode_info *ci); +extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci, + struct ceph_cap_snap *capsnap); +extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc); + +extern struct ceph_snapid_map *ceph_get_snapid_map(struct ceph_mds_client *mdsc, + u64 snap); +extern void ceph_put_snapid_map(struct ceph_mds_client* mdsc, + struct ceph_snapid_map *sm); +extern void ceph_trim_snapid_map(struct ceph_mds_client *mdsc); +extern void ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc); + + +/* + * a cap_snap is "pending" if it is still awaiting an in-progress + * sync write (that may/may not still update size, mtime, etc.). + */ +static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci) +{ + return !list_empty(&ci->i_cap_snaps) && + list_last_entry(&ci->i_cap_snaps, struct ceph_cap_snap, + ci_item)->writing; +} + +/* inode.c */ +struct ceph_mds_reply_info_in; +struct ceph_mds_reply_dirfrag; + +extern const struct inode_operations ceph_file_iops; + +extern struct inode *ceph_alloc_inode(struct super_block *sb); +extern void ceph_evict_inode(struct inode *inode); +extern void ceph_free_inode(struct inode *inode); + +extern struct inode *ceph_get_inode(struct super_block *sb, + struct ceph_vino vino); +extern struct inode *ceph_get_snapdir(struct inode *parent); +extern int ceph_fill_file_size(struct inode *inode, int issued, + u32 truncate_seq, u64 truncate_size, u64 size); +extern void ceph_fill_file_time(struct inode *inode, int issued, + u64 time_warp_seq, struct timespec64 *ctime, + struct timespec64 *mtime, + struct timespec64 *atime); +extern int ceph_fill_inode(struct inode *inode, struct page *locked_page, + struct ceph_mds_reply_info_in *iinfo, + struct ceph_mds_reply_dirfrag *dirinfo, + struct ceph_mds_session *session, int cap_fmode, + struct ceph_cap_reservation *caps_reservation); +extern int ceph_fill_trace(struct super_block *sb, + struct ceph_mds_request *req); +extern int ceph_readdir_prepopulate(struct ceph_mds_request *req, + struct ceph_mds_session *session); + +extern int ceph_inode_holds_cap(struct inode *inode, int mask); + +extern bool ceph_inode_set_size(struct inode *inode, loff_t size); +extern void __ceph_do_pending_vmtruncate(struct inode *inode); +extern void ceph_queue_vmtruncate(struct inode *inode); +extern void ceph_queue_invalidate(struct inode *inode); +extern void ceph_queue_writeback(struct inode *inode); +extern void ceph_async_iput(struct inode *inode); + +extern int __ceph_do_getattr(struct inode *inode, struct page *locked_page, + int mask, bool force); +static inline int ceph_do_getattr(struct inode *inode, int mask, bool force) +{ + return __ceph_do_getattr(inode, NULL, mask, force); +} +extern int ceph_permission(struct inode *inode, int mask); +extern int __ceph_setattr(struct inode *inode, struct iattr *attr); +extern int ceph_setattr(struct dentry *dentry, struct iattr *attr); +extern int ceph_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags); + +/* xattr.c */ +int __ceph_setxattr(struct inode *, const char *, const void *, size_t, int); +ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t); +extern ssize_t ceph_listxattr(struct dentry *, char *, size_t); +extern struct ceph_buffer *__ceph_build_xattrs_blob(struct ceph_inode_info *ci); +extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci); +extern const struct xattr_handler *ceph_xattr_handlers[]; + +struct ceph_acl_sec_ctx { +#ifdef CONFIG_CEPH_FS_POSIX_ACL + void *default_acl; + void *acl; +#endif +#ifdef CONFIG_CEPH_FS_SECURITY_LABEL + void *sec_ctx; + u32 sec_ctxlen; +#endif + struct ceph_pagelist *pagelist; +}; + +#ifdef CONFIG_SECURITY +extern bool ceph_security_xattr_deadlock(struct inode *in); +extern bool ceph_security_xattr_wanted(struct inode *in); +#else +static inline bool ceph_security_xattr_deadlock(struct inode *in) +{ + return false; +} +static inline bool ceph_security_xattr_wanted(struct inode *in) +{ + return false; +} +#endif + +#ifdef CONFIG_CEPH_FS_SECURITY_LABEL +extern int ceph_security_init_secctx(struct dentry *dentry, umode_t mode, + struct ceph_acl_sec_ctx *ctx); +static inline void ceph_security_invalidate_secctx(struct inode *inode) +{ + security_inode_invalidate_secctx(inode); +} +#else +static inline int ceph_security_init_secctx(struct dentry *dentry, umode_t mode, + struct ceph_acl_sec_ctx *ctx) +{ + return 0; +} +static inline void ceph_security_invalidate_secctx(struct inode *inode) +{ +} +#endif + +void ceph_release_acl_sec_ctx(struct ceph_acl_sec_ctx *as_ctx); + +/* acl.c */ +#ifdef CONFIG_CEPH_FS_POSIX_ACL + +struct posix_acl *ceph_get_acl(struct inode *, int); +int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type); +int ceph_pre_init_acls(struct inode *dir, umode_t *mode, + struct ceph_acl_sec_ctx *as_ctx); +void ceph_init_inode_acls(struct inode *inode, + struct ceph_acl_sec_ctx *as_ctx); + +static inline void ceph_forget_all_cached_acls(struct inode *inode) +{ + forget_all_cached_acls(inode); +} + +#else + +#define ceph_get_acl NULL +#define ceph_set_acl NULL + +static inline int ceph_pre_init_acls(struct inode *dir, umode_t *mode, + struct ceph_acl_sec_ctx *as_ctx) +{ + return 0; +} +static inline void ceph_init_inode_acls(struct inode *inode, + struct ceph_acl_sec_ctx *as_ctx) +{ +} +static inline int ceph_acl_chmod(struct dentry *dentry, struct inode *inode) +{ + return 0; +} + +static inline void ceph_forget_all_cached_acls(struct inode *inode) +{ +} + +#endif + +/* caps.c */ +extern const char *ceph_cap_string(int c); +extern void ceph_handle_caps(struct ceph_mds_session *session, + struct ceph_msg *msg); +extern struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc, + struct ceph_cap_reservation *ctx); +extern void ceph_add_cap(struct inode *inode, + struct ceph_mds_session *session, u64 cap_id, + unsigned issued, unsigned wanted, + unsigned cap, unsigned seq, u64 realmino, int flags, + struct ceph_cap **new_cap); +extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release); +extern void __ceph_remove_caps(struct ceph_inode_info *ci); +extern void ceph_put_cap(struct ceph_mds_client *mdsc, + struct ceph_cap *cap); +extern int ceph_is_any_caps(struct inode *inode); + +extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc); +extern int ceph_fsync(struct file *file, loff_t start, loff_t end, + int datasync); +extern void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session); +extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session); +void ceph_kick_flushing_inode_caps(struct ceph_mds_session *session, + struct ceph_inode_info *ci); +extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, + int mds); +extern void ceph_take_cap_refs(struct ceph_inode_info *ci, int caps, + bool snap_rwsem_locked); +extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps); +extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had); +extern void ceph_put_cap_refs_no_check_caps(struct ceph_inode_info *ci, + int had); +extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, + struct ceph_snap_context *snapc); +extern void __ceph_remove_capsnap(struct inode *inode, + struct ceph_cap_snap *capsnap, + bool *wake_ci, bool *wake_mdsc); +extern void ceph_remove_capsnap(struct inode *inode, + struct ceph_cap_snap *capsnap, + bool *wake_ci, bool *wake_mdsc); +extern void ceph_flush_snaps(struct ceph_inode_info *ci, + struct ceph_mds_session **psession); +extern bool __ceph_should_report_size(struct ceph_inode_info *ci); +extern void ceph_check_caps(struct ceph_inode_info *ci, int flags, + struct ceph_mds_session *session); +extern unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc); +extern void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc); +extern int ceph_drop_caps_for_unlink(struct inode *inode); +extern int ceph_encode_inode_release(void **p, struct inode *inode, + int mds, int drop, int unless, int force); +extern int ceph_encode_dentry_release(void **p, struct dentry *dn, + struct inode *dir, + int mds, int drop, int unless); + +extern int ceph_get_caps(struct file *filp, int need, int want, + loff_t endoff, int *got, struct page **pinned_page); +extern int ceph_try_get_caps(struct inode *inode, + int need, int want, bool nonblock, int *got); + +/* for counting open files by mode */ +extern void ceph_get_fmode(struct ceph_inode_info *ci, int mode, int count); +extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode, int count); +extern void __ceph_touch_fmode(struct ceph_inode_info *ci, + struct ceph_mds_client *mdsc, int fmode); + +/* addr.c */ +extern const struct address_space_operations ceph_aops; +extern int ceph_mmap(struct file *file, struct vm_area_struct *vma); +extern int ceph_uninline_data(struct file *filp, struct page *locked_page); +extern int ceph_pool_perm_check(struct inode *inode, int need); +extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc); + +/* file.c */ +extern const struct file_operations ceph_file_fops; + +extern int ceph_renew_caps(struct inode *inode, int fmode); +extern int ceph_open(struct inode *inode, struct file *file); +extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry, + struct file *file, unsigned flags, umode_t mode); +extern int ceph_release(struct inode *inode, struct file *filp); +extern void ceph_fill_inline_data(struct inode *inode, struct page *locked_page, + char *data, size_t len); + +/* dir.c */ +extern const struct file_operations ceph_dir_fops; +extern const struct file_operations ceph_snapdir_fops; +extern const struct inode_operations ceph_dir_iops; +extern const struct inode_operations ceph_snapdir_iops; +extern const struct dentry_operations ceph_dentry_ops; + +extern loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order); +extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry); +extern int ceph_handle_snapdir(struct ceph_mds_request *req, + struct dentry *dentry, int err); +extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, + struct dentry *dentry, int err); + +extern void __ceph_dentry_lease_touch(struct ceph_dentry_info *di); +extern void __ceph_dentry_dir_lease_touch(struct ceph_dentry_info *di); +extern void ceph_invalidate_dentry_lease(struct dentry *dentry); +extern int ceph_trim_dentries(struct ceph_mds_client *mdsc); +extern unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn); +extern void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl); + +/* ioctl.c */ +extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg); + +/* export.c */ +extern const struct export_operations ceph_export_ops; +struct inode *ceph_lookup_inode(struct super_block *sb, u64 ino); + +/* locks.c */ +extern __init void ceph_flock_init(void); +extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl); +extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl); +extern void ceph_count_locks(struct inode *inode, int *p_num, int *f_num); +extern int ceph_encode_locks_to_buffer(struct inode *inode, + struct ceph_filelock *flocks, + int num_fcntl_locks, + int num_flock_locks); +extern int ceph_locks_to_pagelist(struct ceph_filelock *flocks, + struct ceph_pagelist *pagelist, + int num_fcntl_locks, int num_flock_locks); + +/* debugfs.c */ +extern void ceph_fs_debugfs_init(struct ceph_fs_client *client); +extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client); + +/* quota.c */ +static inline bool __ceph_has_any_quota(struct ceph_inode_info *ci) +{ + return ci->i_max_files || ci->i_max_bytes; +} + +extern void ceph_adjust_quota_realms_count(struct inode *inode, bool inc); + +static inline void __ceph_update_quota(struct ceph_inode_info *ci, + u64 max_bytes, u64 max_files) +{ + bool had_quota, has_quota; + had_quota = __ceph_has_any_quota(ci); + ci->i_max_bytes = max_bytes; + ci->i_max_files = max_files; + has_quota = __ceph_has_any_quota(ci); + + if (had_quota != has_quota) + ceph_adjust_quota_realms_count(&ci->vfs_inode, has_quota); +} + +extern void ceph_handle_quota(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + struct ceph_msg *msg); +extern bool ceph_quota_is_max_files_exceeded(struct inode *inode); +extern bool ceph_quota_is_max_bytes_exceeded(struct inode *inode, + loff_t newlen); +extern bool ceph_quota_is_max_bytes_approaching(struct inode *inode, + loff_t newlen); +extern bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, + struct kstatfs *buf); +extern int ceph_quota_check_rename(struct ceph_mds_client *mdsc, + struct inode *old, struct inode *new); +extern void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc); + +#endif /* _FS_CEPH_SUPER_H */ diff --git a/fs/ceph/util.c b/fs/ceph/util.c new file mode 100644 index 000000000..2c3487567 --- /dev/null +++ b/fs/ceph/util.c @@ -0,0 +1,100 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Some non-inline ceph helpers + */ +#include <linux/module.h> +#include <linux/ceph/types.h> + +/* + * return true if @layout appears to be valid + */ +int ceph_file_layout_is_valid(const struct ceph_file_layout *layout) +{ + __u32 su = layout->stripe_unit; + __u32 sc = layout->stripe_count; + __u32 os = layout->object_size; + + /* stripe unit, object size must be non-zero, 64k increment */ + if (!su || (su & (CEPH_MIN_STRIPE_UNIT-1))) + return 0; + if (!os || (os & (CEPH_MIN_STRIPE_UNIT-1))) + return 0; + /* object size must be a multiple of stripe unit */ + if (os < su || os % su) + return 0; + /* stripe count must be non-zero */ + if (!sc) + return 0; + return 1; +} + +void ceph_file_layout_from_legacy(struct ceph_file_layout *fl, + struct ceph_file_layout_legacy *legacy) +{ + fl->stripe_unit = le32_to_cpu(legacy->fl_stripe_unit); + fl->stripe_count = le32_to_cpu(legacy->fl_stripe_count); + fl->object_size = le32_to_cpu(legacy->fl_object_size); + fl->pool_id = le32_to_cpu(legacy->fl_pg_pool); + if (fl->pool_id == 0 && fl->stripe_unit == 0 && + fl->stripe_count == 0 && fl->object_size == 0) + fl->pool_id = -1; +} + +void ceph_file_layout_to_legacy(struct ceph_file_layout *fl, + struct ceph_file_layout_legacy *legacy) +{ + legacy->fl_stripe_unit = cpu_to_le32(fl->stripe_unit); + legacy->fl_stripe_count = cpu_to_le32(fl->stripe_count); + legacy->fl_object_size = cpu_to_le32(fl->object_size); + if (fl->pool_id >= 0) + legacy->fl_pg_pool = cpu_to_le32(fl->pool_id); + else + legacy->fl_pg_pool = 0; +} + +int ceph_flags_to_mode(int flags) +{ + int mode; + +#ifdef O_DIRECTORY /* fixme */ + if ((flags & O_DIRECTORY) == O_DIRECTORY) + return CEPH_FILE_MODE_PIN; +#endif + + switch (flags & O_ACCMODE) { + case O_WRONLY: + mode = CEPH_FILE_MODE_WR; + break; + case O_RDONLY: + mode = CEPH_FILE_MODE_RD; + break; + case O_RDWR: + case O_ACCMODE: /* this is what the VFS does */ + mode = CEPH_FILE_MODE_RDWR; + break; + } +#ifdef O_LAZY + if (flags & O_LAZY) + mode |= CEPH_FILE_MODE_LAZY; +#endif + + return mode; +} + +int ceph_caps_for_mode(int mode) +{ + int caps = CEPH_CAP_PIN; + + if (mode & CEPH_FILE_MODE_RD) + caps |= CEPH_CAP_FILE_SHARED | + CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE; + if (mode & CEPH_FILE_MODE_WR) + caps |= CEPH_CAP_FILE_EXCL | + CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER | + CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL | + CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL; + if (mode & CEPH_FILE_MODE_LAZY) + caps |= CEPH_CAP_FILE_LAZYIO; + + return caps; +} diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c new file mode 100644 index 000000000..76322c0f6 --- /dev/null +++ b/fs/ceph/xattr.c @@ -0,0 +1,1306 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ceph/ceph_debug.h> +#include <linux/ceph/pagelist.h> + +#include "super.h" +#include "mds_client.h" + +#include <linux/ceph/decode.h> + +#include <linux/xattr.h> +#include <linux/security.h> +#include <linux/posix_acl_xattr.h> +#include <linux/slab.h> + +#define XATTR_CEPH_PREFIX "ceph." +#define XATTR_CEPH_PREFIX_LEN (sizeof (XATTR_CEPH_PREFIX) - 1) + +static int __remove_xattr(struct ceph_inode_info *ci, + struct ceph_inode_xattr *xattr); + +static bool ceph_is_valid_xattr(const char *name) +{ + return !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) || + !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) || + !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || + !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); +} + +/* + * These define virtual xattrs exposing the recursive directory + * statistics and layout metadata. + */ +struct ceph_vxattr { + char *name; + size_t name_size; /* strlen(name) + 1 (for '\0') */ + ssize_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val, + size_t size); + bool (*exists_cb)(struct ceph_inode_info *ci); + unsigned int flags; +}; + +#define VXATTR_FLAG_READONLY (1<<0) +#define VXATTR_FLAG_HIDDEN (1<<1) +#define VXATTR_FLAG_RSTAT (1<<2) + +/* layouts */ + +static bool ceph_vxattrcb_layout_exists(struct ceph_inode_info *ci) +{ + struct ceph_file_layout *fl = &ci->i_layout; + return (fl->stripe_unit > 0 || fl->stripe_count > 0 || + fl->object_size > 0 || fl->pool_id >= 0 || + rcu_dereference_raw(fl->pool_ns) != NULL); +} + +static ssize_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, + size_t size) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); + struct ceph_osd_client *osdc = &fsc->client->osdc; + struct ceph_string *pool_ns; + s64 pool = ci->i_layout.pool_id; + const char *pool_name; + const char *ns_field = " pool_namespace="; + char buf[128]; + size_t len, total_len = 0; + ssize_t ret; + + pool_ns = ceph_try_get_string(ci->i_layout.pool_ns); + + dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode); + down_read(&osdc->lock); + pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool); + if (pool_name) { + len = snprintf(buf, sizeof(buf), + "stripe_unit=%u stripe_count=%u object_size=%u pool=", + ci->i_layout.stripe_unit, ci->i_layout.stripe_count, + ci->i_layout.object_size); + total_len = len + strlen(pool_name); + } else { + len = snprintf(buf, sizeof(buf), + "stripe_unit=%u stripe_count=%u object_size=%u pool=%lld", + ci->i_layout.stripe_unit, ci->i_layout.stripe_count, + ci->i_layout.object_size, pool); + total_len = len; + } + + if (pool_ns) + total_len += strlen(ns_field) + pool_ns->len; + + ret = total_len; + if (size >= total_len) { + memcpy(val, buf, len); + ret = len; + if (pool_name) { + len = strlen(pool_name); + memcpy(val + ret, pool_name, len); + ret += len; + } + if (pool_ns) { + len = strlen(ns_field); + memcpy(val + ret, ns_field, len); + ret += len; + memcpy(val + ret, pool_ns->str, pool_ns->len); + ret += pool_ns->len; + } + } + up_read(&osdc->lock); + ceph_put_string(pool_ns); + return ret; +} + +/* + * The convention with strings in xattrs is that they should not be NULL + * terminated, since we're returning the length with them. snprintf always + * NULL terminates however, so call it on a temporary buffer and then memcpy + * the result into place. + */ +static __printf(3, 4) +int ceph_fmt_xattr(char *val, size_t size, const char *fmt, ...) +{ + int ret; + va_list args; + char buf[96]; /* NB: reevaluate size if new vxattrs are added */ + + va_start(args, fmt); + ret = vsnprintf(buf, size ? sizeof(buf) : 0, fmt, args); + va_end(args); + + /* Sanity check */ + if (size && ret + 1 > sizeof(buf)) { + WARN_ONCE(true, "Returned length too big (%d)", ret); + return -E2BIG; + } + + if (ret <= size) + memcpy(val, buf, ret); + return ret; +} + +static ssize_t ceph_vxattrcb_layout_stripe_unit(struct ceph_inode_info *ci, + char *val, size_t size) +{ + return ceph_fmt_xattr(val, size, "%u", ci->i_layout.stripe_unit); +} + +static ssize_t ceph_vxattrcb_layout_stripe_count(struct ceph_inode_info *ci, + char *val, size_t size) +{ + return ceph_fmt_xattr(val, size, "%u", ci->i_layout.stripe_count); +} + +static ssize_t ceph_vxattrcb_layout_object_size(struct ceph_inode_info *ci, + char *val, size_t size) +{ + return ceph_fmt_xattr(val, size, "%u", ci->i_layout.object_size); +} + +static ssize_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci, + char *val, size_t size) +{ + ssize_t ret; + struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); + struct ceph_osd_client *osdc = &fsc->client->osdc; + s64 pool = ci->i_layout.pool_id; + const char *pool_name; + + down_read(&osdc->lock); + pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool); + if (pool_name) { + ret = strlen(pool_name); + if (ret <= size) + memcpy(val, pool_name, ret); + } else { + ret = ceph_fmt_xattr(val, size, "%lld", pool); + } + up_read(&osdc->lock); + return ret; +} + +static ssize_t ceph_vxattrcb_layout_pool_namespace(struct ceph_inode_info *ci, + char *val, size_t size) +{ + ssize_t ret = 0; + struct ceph_string *ns = ceph_try_get_string(ci->i_layout.pool_ns); + + if (ns) { + ret = ns->len; + if (ret <= size) + memcpy(val, ns->str, ret); + ceph_put_string(ns); + } + return ret; +} + +/* directories */ + +static ssize_t ceph_vxattrcb_dir_entries(struct ceph_inode_info *ci, char *val, + size_t size) +{ + return ceph_fmt_xattr(val, size, "%lld", ci->i_files + ci->i_subdirs); +} + +static ssize_t ceph_vxattrcb_dir_files(struct ceph_inode_info *ci, char *val, + size_t size) +{ + return ceph_fmt_xattr(val, size, "%lld", ci->i_files); +} + +static ssize_t ceph_vxattrcb_dir_subdirs(struct ceph_inode_info *ci, char *val, + size_t size) +{ + return ceph_fmt_xattr(val, size, "%lld", ci->i_subdirs); +} + +static ssize_t ceph_vxattrcb_dir_rentries(struct ceph_inode_info *ci, char *val, + size_t size) +{ + return ceph_fmt_xattr(val, size, "%lld", + ci->i_rfiles + ci->i_rsubdirs); +} + +static ssize_t ceph_vxattrcb_dir_rfiles(struct ceph_inode_info *ci, char *val, + size_t size) +{ + return ceph_fmt_xattr(val, size, "%lld", ci->i_rfiles); +} + +static ssize_t ceph_vxattrcb_dir_rsubdirs(struct ceph_inode_info *ci, char *val, + size_t size) +{ + return ceph_fmt_xattr(val, size, "%lld", ci->i_rsubdirs); +} + +static ssize_t ceph_vxattrcb_dir_rbytes(struct ceph_inode_info *ci, char *val, + size_t size) +{ + return ceph_fmt_xattr(val, size, "%lld", ci->i_rbytes); +} + +static ssize_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val, + size_t size) +{ + return ceph_fmt_xattr(val, size, "%lld.%09ld", ci->i_rctime.tv_sec, + ci->i_rctime.tv_nsec); +} + +/* dir pin */ +static bool ceph_vxattrcb_dir_pin_exists(struct ceph_inode_info *ci) +{ + return ci->i_dir_pin != -ENODATA; +} + +static ssize_t ceph_vxattrcb_dir_pin(struct ceph_inode_info *ci, char *val, + size_t size) +{ + return ceph_fmt_xattr(val, size, "%d", (int)ci->i_dir_pin); +} + +/* quotas */ +static bool ceph_vxattrcb_quota_exists(struct ceph_inode_info *ci) +{ + bool ret = false; + spin_lock(&ci->i_ceph_lock); + if ((ci->i_max_files || ci->i_max_bytes) && + ci->i_vino.snap == CEPH_NOSNAP && + ci->i_snap_realm && + ci->i_snap_realm->ino == ci->i_vino.ino) + ret = true; + spin_unlock(&ci->i_ceph_lock); + return ret; +} + +static ssize_t ceph_vxattrcb_quota(struct ceph_inode_info *ci, char *val, + size_t size) +{ + return ceph_fmt_xattr(val, size, "max_bytes=%llu max_files=%llu", + ci->i_max_bytes, ci->i_max_files); +} + +static ssize_t ceph_vxattrcb_quota_max_bytes(struct ceph_inode_info *ci, + char *val, size_t size) +{ + return ceph_fmt_xattr(val, size, "%llu", ci->i_max_bytes); +} + +static ssize_t ceph_vxattrcb_quota_max_files(struct ceph_inode_info *ci, + char *val, size_t size) +{ + return ceph_fmt_xattr(val, size, "%llu", ci->i_max_files); +} + +/* snapshots */ +static bool ceph_vxattrcb_snap_btime_exists(struct ceph_inode_info *ci) +{ + return (ci->i_snap_btime.tv_sec != 0 || ci->i_snap_btime.tv_nsec != 0); +} + +static ssize_t ceph_vxattrcb_snap_btime(struct ceph_inode_info *ci, char *val, + size_t size) +{ + return ceph_fmt_xattr(val, size, "%lld.%09ld", ci->i_snap_btime.tv_sec, + ci->i_snap_btime.tv_nsec); +} + +#define CEPH_XATTR_NAME(_type, _name) XATTR_CEPH_PREFIX #_type "." #_name +#define CEPH_XATTR_NAME2(_type, _name, _name2) \ + XATTR_CEPH_PREFIX #_type "." #_name "." #_name2 + +#define XATTR_NAME_CEPH(_type, _name, _flags) \ + { \ + .name = CEPH_XATTR_NAME(_type, _name), \ + .name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \ + .getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \ + .exists_cb = NULL, \ + .flags = (VXATTR_FLAG_READONLY | _flags), \ + } +#define XATTR_RSTAT_FIELD(_type, _name) \ + XATTR_NAME_CEPH(_type, _name, VXATTR_FLAG_RSTAT) +#define XATTR_RSTAT_FIELD_UPDATABLE(_type, _name) \ + { \ + .name = CEPH_XATTR_NAME(_type, _name), \ + .name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \ + .getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \ + .exists_cb = NULL, \ + .flags = VXATTR_FLAG_RSTAT, \ + } +#define XATTR_LAYOUT_FIELD(_type, _name, _field) \ + { \ + .name = CEPH_XATTR_NAME2(_type, _name, _field), \ + .name_size = sizeof (CEPH_XATTR_NAME2(_type, _name, _field)), \ + .getxattr_cb = ceph_vxattrcb_ ## _name ## _ ## _field, \ + .exists_cb = ceph_vxattrcb_layout_exists, \ + .flags = VXATTR_FLAG_HIDDEN, \ + } +#define XATTR_QUOTA_FIELD(_type, _name) \ + { \ + .name = CEPH_XATTR_NAME(_type, _name), \ + .name_size = sizeof(CEPH_XATTR_NAME(_type, _name)), \ + .getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \ + .exists_cb = ceph_vxattrcb_quota_exists, \ + .flags = VXATTR_FLAG_HIDDEN, \ + } + +static struct ceph_vxattr ceph_dir_vxattrs[] = { + { + .name = "ceph.dir.layout", + .name_size = sizeof("ceph.dir.layout"), + .getxattr_cb = ceph_vxattrcb_layout, + .exists_cb = ceph_vxattrcb_layout_exists, + .flags = VXATTR_FLAG_HIDDEN, + }, + XATTR_LAYOUT_FIELD(dir, layout, stripe_unit), + XATTR_LAYOUT_FIELD(dir, layout, stripe_count), + XATTR_LAYOUT_FIELD(dir, layout, object_size), + XATTR_LAYOUT_FIELD(dir, layout, pool), + XATTR_LAYOUT_FIELD(dir, layout, pool_namespace), + XATTR_NAME_CEPH(dir, entries, 0), + XATTR_NAME_CEPH(dir, files, 0), + XATTR_NAME_CEPH(dir, subdirs, 0), + XATTR_RSTAT_FIELD(dir, rentries), + XATTR_RSTAT_FIELD(dir, rfiles), + XATTR_RSTAT_FIELD(dir, rsubdirs), + XATTR_RSTAT_FIELD(dir, rbytes), + XATTR_RSTAT_FIELD_UPDATABLE(dir, rctime), + { + .name = "ceph.dir.pin", + .name_size = sizeof("ceph.dir.pin"), + .getxattr_cb = ceph_vxattrcb_dir_pin, + .exists_cb = ceph_vxattrcb_dir_pin_exists, + .flags = VXATTR_FLAG_HIDDEN, + }, + { + .name = "ceph.quota", + .name_size = sizeof("ceph.quota"), + .getxattr_cb = ceph_vxattrcb_quota, + .exists_cb = ceph_vxattrcb_quota_exists, + .flags = VXATTR_FLAG_HIDDEN, + }, + XATTR_QUOTA_FIELD(quota, max_bytes), + XATTR_QUOTA_FIELD(quota, max_files), + { + .name = "ceph.snap.btime", + .name_size = sizeof("ceph.snap.btime"), + .getxattr_cb = ceph_vxattrcb_snap_btime, + .exists_cb = ceph_vxattrcb_snap_btime_exists, + .flags = VXATTR_FLAG_READONLY, + }, + { .name = NULL, 0 } /* Required table terminator */ +}; + +/* files */ + +static struct ceph_vxattr ceph_file_vxattrs[] = { + { + .name = "ceph.file.layout", + .name_size = sizeof("ceph.file.layout"), + .getxattr_cb = ceph_vxattrcb_layout, + .exists_cb = ceph_vxattrcb_layout_exists, + .flags = VXATTR_FLAG_HIDDEN, + }, + XATTR_LAYOUT_FIELD(file, layout, stripe_unit), + XATTR_LAYOUT_FIELD(file, layout, stripe_count), + XATTR_LAYOUT_FIELD(file, layout, object_size), + XATTR_LAYOUT_FIELD(file, layout, pool), + XATTR_LAYOUT_FIELD(file, layout, pool_namespace), + { + .name = "ceph.snap.btime", + .name_size = sizeof("ceph.snap.btime"), + .getxattr_cb = ceph_vxattrcb_snap_btime, + .exists_cb = ceph_vxattrcb_snap_btime_exists, + .flags = VXATTR_FLAG_READONLY, + }, + { .name = NULL, 0 } /* Required table terminator */ +}; + +static struct ceph_vxattr *ceph_inode_vxattrs(struct inode *inode) +{ + if (S_ISDIR(inode->i_mode)) + return ceph_dir_vxattrs; + else if (S_ISREG(inode->i_mode)) + return ceph_file_vxattrs; + return NULL; +} + +static struct ceph_vxattr *ceph_match_vxattr(struct inode *inode, + const char *name) +{ + struct ceph_vxattr *vxattr = ceph_inode_vxattrs(inode); + + if (vxattr) { + while (vxattr->name) { + if (!strcmp(vxattr->name, name)) + return vxattr; + vxattr++; + } + } + + return NULL; +} + +static int __set_xattr(struct ceph_inode_info *ci, + const char *name, int name_len, + const char *val, int val_len, + int flags, int update_xattr, + struct ceph_inode_xattr **newxattr) +{ + struct rb_node **p; + struct rb_node *parent = NULL; + struct ceph_inode_xattr *xattr = NULL; + int c; + int new = 0; + + p = &ci->i_xattrs.index.rb_node; + while (*p) { + parent = *p; + xattr = rb_entry(parent, struct ceph_inode_xattr, node); + c = strncmp(name, xattr->name, min(name_len, xattr->name_len)); + if (c < 0) + p = &(*p)->rb_left; + else if (c > 0) + p = &(*p)->rb_right; + else { + if (name_len == xattr->name_len) + break; + else if (name_len < xattr->name_len) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + } + xattr = NULL; + } + + if (update_xattr) { + int err = 0; + + if (xattr && (flags & XATTR_CREATE)) + err = -EEXIST; + else if (!xattr && (flags & XATTR_REPLACE)) + err = -ENODATA; + if (err) { + kfree(name); + kfree(val); + kfree(*newxattr); + return err; + } + if (update_xattr < 0) { + if (xattr) + __remove_xattr(ci, xattr); + kfree(name); + kfree(*newxattr); + return 0; + } + } + + if (!xattr) { + new = 1; + xattr = *newxattr; + xattr->name = name; + xattr->name_len = name_len; + xattr->should_free_name = update_xattr; + + ci->i_xattrs.count++; + dout("__set_xattr count=%d\n", ci->i_xattrs.count); + } else { + kfree(*newxattr); + *newxattr = NULL; + if (xattr->should_free_val) + kfree(xattr->val); + + if (update_xattr) { + kfree(name); + name = xattr->name; + } + ci->i_xattrs.names_size -= xattr->name_len; + ci->i_xattrs.vals_size -= xattr->val_len; + } + ci->i_xattrs.names_size += name_len; + ci->i_xattrs.vals_size += val_len; + if (val) + xattr->val = val; + else + xattr->val = ""; + + xattr->val_len = val_len; + xattr->dirty = update_xattr; + xattr->should_free_val = (val && update_xattr); + + if (new) { + rb_link_node(&xattr->node, parent, p); + rb_insert_color(&xattr->node, &ci->i_xattrs.index); + dout("__set_xattr_val p=%p\n", p); + } + + dout("__set_xattr_val added %llx.%llx xattr %p %.*s=%.*s\n", + ceph_vinop(&ci->vfs_inode), xattr, name_len, name, val_len, val); + + return 0; +} + +static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci, + const char *name) +{ + struct rb_node **p; + struct rb_node *parent = NULL; + struct ceph_inode_xattr *xattr = NULL; + int name_len = strlen(name); + int c; + + p = &ci->i_xattrs.index.rb_node; + while (*p) { + parent = *p; + xattr = rb_entry(parent, struct ceph_inode_xattr, node); + c = strncmp(name, xattr->name, xattr->name_len); + if (c == 0 && name_len > xattr->name_len) + c = 1; + if (c < 0) + p = &(*p)->rb_left; + else if (c > 0) + p = &(*p)->rb_right; + else { + dout("__get_xattr %s: found %.*s\n", name, + xattr->val_len, xattr->val); + return xattr; + } + } + + dout("__get_xattr %s: not found\n", name); + + return NULL; +} + +static void __free_xattr(struct ceph_inode_xattr *xattr) +{ + BUG_ON(!xattr); + + if (xattr->should_free_name) + kfree(xattr->name); + if (xattr->should_free_val) + kfree(xattr->val); + + kfree(xattr); +} + +static int __remove_xattr(struct ceph_inode_info *ci, + struct ceph_inode_xattr *xattr) +{ + if (!xattr) + return -ENODATA; + + rb_erase(&xattr->node, &ci->i_xattrs.index); + + if (xattr->should_free_name) + kfree(xattr->name); + if (xattr->should_free_val) + kfree(xattr->val); + + ci->i_xattrs.names_size -= xattr->name_len; + ci->i_xattrs.vals_size -= xattr->val_len; + ci->i_xattrs.count--; + kfree(xattr); + + return 0; +} + +static char *__copy_xattr_names(struct ceph_inode_info *ci, + char *dest) +{ + struct rb_node *p; + struct ceph_inode_xattr *xattr = NULL; + + p = rb_first(&ci->i_xattrs.index); + dout("__copy_xattr_names count=%d\n", ci->i_xattrs.count); + + while (p) { + xattr = rb_entry(p, struct ceph_inode_xattr, node); + memcpy(dest, xattr->name, xattr->name_len); + dest[xattr->name_len] = '\0'; + + dout("dest=%s %p (%s) (%d/%d)\n", dest, xattr, xattr->name, + xattr->name_len, ci->i_xattrs.names_size); + + dest += xattr->name_len + 1; + p = rb_next(p); + } + + return dest; +} + +void __ceph_destroy_xattrs(struct ceph_inode_info *ci) +{ + struct rb_node *p, *tmp; + struct ceph_inode_xattr *xattr = NULL; + + p = rb_first(&ci->i_xattrs.index); + + dout("__ceph_destroy_xattrs p=%p\n", p); + + while (p) { + xattr = rb_entry(p, struct ceph_inode_xattr, node); + tmp = p; + p = rb_next(tmp); + dout("__ceph_destroy_xattrs next p=%p (%.*s)\n", p, + xattr->name_len, xattr->name); + rb_erase(tmp, &ci->i_xattrs.index); + + __free_xattr(xattr); + } + + ci->i_xattrs.names_size = 0; + ci->i_xattrs.vals_size = 0; + ci->i_xattrs.index_version = 0; + ci->i_xattrs.count = 0; + ci->i_xattrs.index = RB_ROOT; +} + +static int __build_xattrs(struct inode *inode) + __releases(ci->i_ceph_lock) + __acquires(ci->i_ceph_lock) +{ + u32 namelen; + u32 numattr = 0; + void *p, *end; + u32 len; + const char *name, *val; + struct ceph_inode_info *ci = ceph_inode(inode); + u64 xattr_version; + struct ceph_inode_xattr **xattrs = NULL; + int err = 0; + int i; + + dout("__build_xattrs() len=%d\n", + ci->i_xattrs.blob ? (int)ci->i_xattrs.blob->vec.iov_len : 0); + + if (ci->i_xattrs.index_version >= ci->i_xattrs.version) + return 0; /* already built */ + + __ceph_destroy_xattrs(ci); + +start: + /* updated internal xattr rb tree */ + if (ci->i_xattrs.blob && ci->i_xattrs.blob->vec.iov_len > 4) { + p = ci->i_xattrs.blob->vec.iov_base; + end = p + ci->i_xattrs.blob->vec.iov_len; + ceph_decode_32_safe(&p, end, numattr, bad); + xattr_version = ci->i_xattrs.version; + spin_unlock(&ci->i_ceph_lock); + + xattrs = kcalloc(numattr, sizeof(struct ceph_inode_xattr *), + GFP_NOFS); + err = -ENOMEM; + if (!xattrs) + goto bad_lock; + + for (i = 0; i < numattr; i++) { + xattrs[i] = kmalloc(sizeof(struct ceph_inode_xattr), + GFP_NOFS); + if (!xattrs[i]) + goto bad_lock; + } + + spin_lock(&ci->i_ceph_lock); + if (ci->i_xattrs.version != xattr_version) { + /* lost a race, retry */ + for (i = 0; i < numattr; i++) + kfree(xattrs[i]); + kfree(xattrs); + xattrs = NULL; + goto start; + } + err = -EIO; + while (numattr--) { + ceph_decode_32_safe(&p, end, len, bad); + namelen = len; + name = p; + p += len; + ceph_decode_32_safe(&p, end, len, bad); + val = p; + p += len; + + err = __set_xattr(ci, name, namelen, val, len, + 0, 0, &xattrs[numattr]); + + if (err < 0) + goto bad; + } + kfree(xattrs); + } + ci->i_xattrs.index_version = ci->i_xattrs.version; + ci->i_xattrs.dirty = false; + + return err; +bad_lock: + spin_lock(&ci->i_ceph_lock); +bad: + if (xattrs) { + for (i = 0; i < numattr; i++) + kfree(xattrs[i]); + kfree(xattrs); + } + ci->i_xattrs.names_size = 0; + return err; +} + +static int __get_required_blob_size(struct ceph_inode_info *ci, int name_size, + int val_size) +{ + /* + * 4 bytes for the length, and additional 4 bytes per each xattr name, + * 4 bytes per each value + */ + int size = 4 + ci->i_xattrs.count*(4 + 4) + + ci->i_xattrs.names_size + + ci->i_xattrs.vals_size; + dout("__get_required_blob_size c=%d names.size=%d vals.size=%d\n", + ci->i_xattrs.count, ci->i_xattrs.names_size, + ci->i_xattrs.vals_size); + + if (name_size) + size += 4 + 4 + name_size + val_size; + + return size; +} + +/* + * If there are dirty xattrs, reencode xattrs into the prealloc_blob + * and swap into place. It returns the old i_xattrs.blob (or NULL) so + * that it can be freed by the caller as the i_ceph_lock is likely to be + * held. + */ +struct ceph_buffer *__ceph_build_xattrs_blob(struct ceph_inode_info *ci) +{ + struct rb_node *p; + struct ceph_inode_xattr *xattr = NULL; + struct ceph_buffer *old_blob = NULL; + void *dest; + + dout("__build_xattrs_blob %p\n", &ci->vfs_inode); + if (ci->i_xattrs.dirty) { + int need = __get_required_blob_size(ci, 0, 0); + + BUG_ON(need > ci->i_xattrs.prealloc_blob->alloc_len); + + p = rb_first(&ci->i_xattrs.index); + dest = ci->i_xattrs.prealloc_blob->vec.iov_base; + + ceph_encode_32(&dest, ci->i_xattrs.count); + while (p) { + xattr = rb_entry(p, struct ceph_inode_xattr, node); + + ceph_encode_32(&dest, xattr->name_len); + memcpy(dest, xattr->name, xattr->name_len); + dest += xattr->name_len; + ceph_encode_32(&dest, xattr->val_len); + memcpy(dest, xattr->val, xattr->val_len); + dest += xattr->val_len; + + p = rb_next(p); + } + + /* adjust buffer len; it may be larger than we need */ + ci->i_xattrs.prealloc_blob->vec.iov_len = + dest - ci->i_xattrs.prealloc_blob->vec.iov_base; + + if (ci->i_xattrs.blob) + old_blob = ci->i_xattrs.blob; + ci->i_xattrs.blob = ci->i_xattrs.prealloc_blob; + ci->i_xattrs.prealloc_blob = NULL; + ci->i_xattrs.dirty = false; + ci->i_xattrs.version++; + } + + return old_blob; +} + +static inline int __get_request_mask(struct inode *in) { + struct ceph_mds_request *req = current->journal_info; + int mask = 0; + if (req && req->r_target_inode == in) { + if (req->r_op == CEPH_MDS_OP_LOOKUP || + req->r_op == CEPH_MDS_OP_LOOKUPINO || + req->r_op == CEPH_MDS_OP_LOOKUPPARENT || + req->r_op == CEPH_MDS_OP_GETATTR) { + mask = le32_to_cpu(req->r_args.getattr.mask); + } else if (req->r_op == CEPH_MDS_OP_OPEN || + req->r_op == CEPH_MDS_OP_CREATE) { + mask = le32_to_cpu(req->r_args.open.mask); + } + } + return mask; +} + +ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value, + size_t size) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_inode_xattr *xattr; + struct ceph_vxattr *vxattr = NULL; + int req_mask; + ssize_t err; + + /* let's see if a virtual xattr was requested */ + vxattr = ceph_match_vxattr(inode, name); + if (vxattr) { + int mask = 0; + if (vxattr->flags & VXATTR_FLAG_RSTAT) + mask |= CEPH_STAT_RSTAT; + err = ceph_do_getattr(inode, mask, true); + if (err) + return err; + err = -ENODATA; + if (!(vxattr->exists_cb && !vxattr->exists_cb(ci))) { + err = vxattr->getxattr_cb(ci, value, size); + if (size && size < err) + err = -ERANGE; + } + return err; + } + + req_mask = __get_request_mask(inode); + + spin_lock(&ci->i_ceph_lock); + dout("getxattr %p name '%s' ver=%lld index_ver=%lld\n", inode, name, + ci->i_xattrs.version, ci->i_xattrs.index_version); + + if (ci->i_xattrs.version == 0 || + !((req_mask & CEPH_CAP_XATTR_SHARED) || + __ceph_caps_issued_mask_metric(ci, CEPH_CAP_XATTR_SHARED, 1))) { + spin_unlock(&ci->i_ceph_lock); + + /* security module gets xattr while filling trace */ + if (current->journal_info) { + pr_warn_ratelimited("sync getxattr %p " + "during filling trace\n", inode); + return -EBUSY; + } + + /* get xattrs from mds (if we don't already have them) */ + err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR, true); + if (err) + return err; + spin_lock(&ci->i_ceph_lock); + } + + err = __build_xattrs(inode); + if (err < 0) + goto out; + + err = -ENODATA; /* == ENOATTR */ + xattr = __get_xattr(ci, name); + if (!xattr) + goto out; + + err = -ERANGE; + if (size && size < xattr->val_len) + goto out; + + err = xattr->val_len; + if (size == 0) + goto out; + + memcpy(value, xattr->val, xattr->val_len); + + if (current->journal_info && + !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) && + security_ismaclabel(name + XATTR_SECURITY_PREFIX_LEN)) + ci->i_ceph_flags |= CEPH_I_SEC_INITED; +out: + spin_unlock(&ci->i_ceph_lock); + return err; +} + +ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) +{ + struct inode *inode = d_inode(dentry); + struct ceph_inode_info *ci = ceph_inode(inode); + bool len_only = (size == 0); + u32 namelen; + int err; + + spin_lock(&ci->i_ceph_lock); + dout("listxattr %p ver=%lld index_ver=%lld\n", inode, + ci->i_xattrs.version, ci->i_xattrs.index_version); + + if (ci->i_xattrs.version == 0 || + !__ceph_caps_issued_mask_metric(ci, CEPH_CAP_XATTR_SHARED, 1)) { + spin_unlock(&ci->i_ceph_lock); + err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR, true); + if (err) + return err; + spin_lock(&ci->i_ceph_lock); + } + + err = __build_xattrs(inode); + if (err < 0) + goto out; + + /* add 1 byte for each xattr due to the null termination */ + namelen = ci->i_xattrs.names_size + ci->i_xattrs.count; + if (!len_only) { + if (namelen > size) { + err = -ERANGE; + goto out; + } + names = __copy_xattr_names(ci, names); + size -= namelen; + } + err = namelen; +out: + spin_unlock(&ci->i_ceph_lock); + return err; +} + +static int ceph_sync_setxattr(struct inode *inode, const char *name, + const char *value, size_t size, int flags) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mds_request *req; + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_pagelist *pagelist = NULL; + int op = CEPH_MDS_OP_SETXATTR; + int err; + + if (size > 0) { + /* copy value into pagelist */ + pagelist = ceph_pagelist_alloc(GFP_NOFS); + if (!pagelist) + return -ENOMEM; + + err = ceph_pagelist_append(pagelist, value, size); + if (err) + goto out; + } else if (!value) { + if (flags & CEPH_XATTR_REPLACE) + op = CEPH_MDS_OP_RMXATTR; + else + flags |= CEPH_XATTR_REMOVE; + } + + dout("setxattr value=%.*s\n", (int)size, value); + + /* do request */ + req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto out; + } + + req->r_path2 = kstrdup(name, GFP_NOFS); + if (!req->r_path2) { + ceph_mdsc_put_request(req); + err = -ENOMEM; + goto out; + } + + if (op == CEPH_MDS_OP_SETXATTR) { + req->r_args.setxattr.flags = cpu_to_le32(flags); + req->r_pagelist = pagelist; + pagelist = NULL; + } + + req->r_inode = inode; + ihold(inode); + req->r_num_caps = 1; + req->r_inode_drop = CEPH_CAP_XATTR_SHARED; + + dout("xattr.ver (before): %lld\n", ci->i_xattrs.version); + err = ceph_mdsc_do_request(mdsc, NULL, req); + ceph_mdsc_put_request(req); + dout("xattr.ver (after): %lld\n", ci->i_xattrs.version); + +out: + if (pagelist) + ceph_pagelist_release(pagelist); + return err; +} + +int __ceph_setxattr(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + struct ceph_vxattr *vxattr; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_cap_flush *prealloc_cf = NULL; + struct ceph_buffer *old_blob = NULL; + int issued; + int err; + int dirty = 0; + int name_len = strlen(name); + int val_len = size; + char *newname = NULL; + char *newval = NULL; + struct ceph_inode_xattr *xattr = NULL; + int required_blob_size; + bool check_realm = false; + bool lock_snap_rwsem = false; + + if (ceph_snap(inode) != CEPH_NOSNAP) + return -EROFS; + + vxattr = ceph_match_vxattr(inode, name); + if (vxattr) { + if (vxattr->flags & VXATTR_FLAG_READONLY) + return -EOPNOTSUPP; + if (value && !strncmp(vxattr->name, "ceph.quota", 10)) + check_realm = true; + } + + /* pass any unhandled ceph.* xattrs through to the MDS */ + if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN)) + goto do_sync_unlocked; + + /* preallocate memory for xattr name, value, index node */ + err = -ENOMEM; + newname = kmemdup(name, name_len + 1, GFP_NOFS); + if (!newname) + goto out; + + if (val_len) { + newval = kmemdup(value, val_len, GFP_NOFS); + if (!newval) + goto out; + } + + xattr = kmalloc(sizeof(struct ceph_inode_xattr), GFP_NOFS); + if (!xattr) + goto out; + + prealloc_cf = ceph_alloc_cap_flush(); + if (!prealloc_cf) + goto out; + + spin_lock(&ci->i_ceph_lock); +retry: + issued = __ceph_caps_issued(ci, NULL); + if (ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) + goto do_sync; + + if (!lock_snap_rwsem && !ci->i_head_snapc) { + lock_snap_rwsem = true; + if (!down_read_trylock(&mdsc->snap_rwsem)) { + spin_unlock(&ci->i_ceph_lock); + down_read(&mdsc->snap_rwsem); + spin_lock(&ci->i_ceph_lock); + goto retry; + } + } + + dout("setxattr %p name '%s' issued %s\n", inode, name, + ceph_cap_string(issued)); + __build_xattrs(inode); + + required_blob_size = __get_required_blob_size(ci, name_len, val_len); + + if (!ci->i_xattrs.prealloc_blob || + required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) { + struct ceph_buffer *blob; + + spin_unlock(&ci->i_ceph_lock); + ceph_buffer_put(old_blob); /* Shouldn't be required */ + dout(" pre-allocating new blob size=%d\n", required_blob_size); + blob = ceph_buffer_new(required_blob_size, GFP_NOFS); + if (!blob) + goto do_sync_unlocked; + spin_lock(&ci->i_ceph_lock); + /* prealloc_blob can't be released while holding i_ceph_lock */ + if (ci->i_xattrs.prealloc_blob) + old_blob = ci->i_xattrs.prealloc_blob; + ci->i_xattrs.prealloc_blob = blob; + goto retry; + } + + err = __set_xattr(ci, newname, name_len, newval, val_len, + flags, value ? 1 : -1, &xattr); + + if (!err) { + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL, + &prealloc_cf); + ci->i_xattrs.dirty = true; + inode->i_ctime = current_time(inode); + } + + spin_unlock(&ci->i_ceph_lock); + ceph_buffer_put(old_blob); + if (lock_snap_rwsem) + up_read(&mdsc->snap_rwsem); + if (dirty) + __mark_inode_dirty(inode, dirty); + ceph_free_cap_flush(prealloc_cf); + return err; + +do_sync: + spin_unlock(&ci->i_ceph_lock); +do_sync_unlocked: + if (lock_snap_rwsem) + up_read(&mdsc->snap_rwsem); + + /* security module set xattr while filling trace */ + if (current->journal_info) { + pr_warn_ratelimited("sync setxattr %p " + "during filling trace\n", inode); + err = -EBUSY; + } else { + err = ceph_sync_setxattr(inode, name, value, size, flags); + if (err >= 0 && check_realm) { + /* check if snaprealm was created for quota inode */ + spin_lock(&ci->i_ceph_lock); + if ((ci->i_max_files || ci->i_max_bytes) && + !(ci->i_snap_realm && + ci->i_snap_realm->ino == ci->i_vino.ino)) + err = -EOPNOTSUPP; + spin_unlock(&ci->i_ceph_lock); + } + } +out: + ceph_free_cap_flush(prealloc_cf); + kfree(newname); + kfree(newval); + kfree(xattr); + return err; +} + +static int ceph_get_xattr_handler(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, void *value, size_t size) +{ + if (!ceph_is_valid_xattr(name)) + return -EOPNOTSUPP; + return __ceph_getxattr(inode, name, value, size); +} + +static int ceph_set_xattr_handler(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) +{ + if (!ceph_is_valid_xattr(name)) + return -EOPNOTSUPP; + return __ceph_setxattr(inode, name, value, size, flags); +} + +static const struct xattr_handler ceph_other_xattr_handler = { + .prefix = "", /* match any name => handlers called with full name */ + .get = ceph_get_xattr_handler, + .set = ceph_set_xattr_handler, +}; + +#ifdef CONFIG_SECURITY +bool ceph_security_xattr_wanted(struct inode *in) +{ + return in->i_security != NULL; +} + +bool ceph_security_xattr_deadlock(struct inode *in) +{ + struct ceph_inode_info *ci; + bool ret; + if (!in->i_security) + return false; + ci = ceph_inode(in); + spin_lock(&ci->i_ceph_lock); + ret = !(ci->i_ceph_flags & CEPH_I_SEC_INITED) && + !(ci->i_xattrs.version > 0 && + __ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0)); + spin_unlock(&ci->i_ceph_lock); + return ret; +} + +#ifdef CONFIG_CEPH_FS_SECURITY_LABEL +int ceph_security_init_secctx(struct dentry *dentry, umode_t mode, + struct ceph_acl_sec_ctx *as_ctx) +{ + struct ceph_pagelist *pagelist = as_ctx->pagelist; + const char *name; + size_t name_len; + int err; + + err = security_dentry_init_security(dentry, mode, &dentry->d_name, + &as_ctx->sec_ctx, + &as_ctx->sec_ctxlen); + if (err < 0) { + WARN_ON_ONCE(err != -EOPNOTSUPP); + err = 0; /* do nothing */ + goto out; + } + + err = -ENOMEM; + if (!pagelist) { + pagelist = ceph_pagelist_alloc(GFP_KERNEL); + if (!pagelist) + goto out; + err = ceph_pagelist_reserve(pagelist, PAGE_SIZE); + if (err) + goto out; + ceph_pagelist_encode_32(pagelist, 1); + } + + /* + * FIXME: Make security_dentry_init_security() generic. Currently + * It only supports single security module and only selinux has + * dentry_init_security hook. + */ + name = XATTR_NAME_SELINUX; + name_len = strlen(name); + err = ceph_pagelist_reserve(pagelist, + 4 * 2 + name_len + as_ctx->sec_ctxlen); + if (err) + goto out; + + if (as_ctx->pagelist) { + /* update count of KV pairs */ + BUG_ON(pagelist->length <= sizeof(__le32)); + if (list_is_singular(&pagelist->head)) { + le32_add_cpu((__le32*)pagelist->mapped_tail, 1); + } else { + struct page *page = list_first_entry(&pagelist->head, + struct page, lru); + void *addr = kmap_atomic(page); + le32_add_cpu((__le32*)addr, 1); + kunmap_atomic(addr); + } + } else { + as_ctx->pagelist = pagelist; + } + + ceph_pagelist_encode_32(pagelist, name_len); + ceph_pagelist_append(pagelist, name, name_len); + + ceph_pagelist_encode_32(pagelist, as_ctx->sec_ctxlen); + ceph_pagelist_append(pagelist, as_ctx->sec_ctx, as_ctx->sec_ctxlen); + + err = 0; +out: + if (pagelist && !as_ctx->pagelist) + ceph_pagelist_release(pagelist); + return err; +} +#endif /* CONFIG_CEPH_FS_SECURITY_LABEL */ +#endif /* CONFIG_SECURITY */ + +void ceph_release_acl_sec_ctx(struct ceph_acl_sec_ctx *as_ctx) +{ +#ifdef CONFIG_CEPH_FS_POSIX_ACL + posix_acl_release(as_ctx->acl); + posix_acl_release(as_ctx->default_acl); +#endif +#ifdef CONFIG_CEPH_FS_SECURITY_LABEL + security_release_secctx(as_ctx->sec_ctx, as_ctx->sec_ctxlen); +#endif + if (as_ctx->pagelist) + ceph_pagelist_release(as_ctx->pagelist); +} + +/* + * List of handlers for synthetic system.* attributes. Other + * attributes are handled directly. + */ +const struct xattr_handler *ceph_xattr_handlers[] = { +#ifdef CONFIG_CEPH_FS_POSIX_ACL + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, +#endif + &ceph_other_xattr_handler, + NULL, +}; |