summaryrefslogtreecommitdiffstats
path: root/fs/ceph
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ceph')
-rw-r--r--fs/ceph/Kconfig37
-rw-r--r--fs/ceph/Makefile14
-rw-r--r--fs/ceph/acl.c279
-rw-r--r--fs/ceph/addr.c2059
-rw-r--r--fs/ceph/cache.c364
-rw-r--r--fs/ceph/cache.h190
-rw-r--r--fs/ceph/caps.c4240
-rw-r--r--fs/ceph/ceph_frag.c23
-rw-r--r--fs/ceph/debugfs.c321
-rw-r--r--fs/ceph/dir.c1541
-rw-r--r--fs/ceph/export.c248
-rw-r--r--fs/ceph/file.c1830
-rw-r--r--fs/ceph/inode.c2312
-rw-r--r--fs/ceph/ioctl.c293
-rw-r--r--fs/ceph/ioctl.h101
-rw-r--r--fs/ceph/locks.c479
-rw-r--r--fs/ceph/mds_client.c4331
-rw-r--r--fs/ceph/mds_client.h466
-rw-r--r--fs/ceph/mdsmap.c396
-rw-r--r--fs/ceph/quota.c362
-rw-r--r--fs/ceph/snap.c995
-rw-r--r--fs/ceph/strings.c128
-rw-r--r--fs/ceph/super.c1214
-rw-r--r--fs/ceph/super.h1118
-rw-r--r--fs/ceph/xattr.c1209
25 files changed, 24550 insertions, 0 deletions
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
new file mode 100644
index 000000000..52095f473
--- /dev/null
+++ b/fs/ceph/Kconfig
@@ -0,0 +1,37 @@
+config CEPH_FS
+ tristate "Ceph distributed file system"
+ depends on INET
+ select CEPH_LIB
+ select LIBCRC32C
+ select CRYPTO_AES
+ select CRYPTO
+ default n
+ help
+ Choose Y or M here to include support for mounting the
+ experimental Ceph distributed file system. Ceph is an extremely
+ scalable file system designed to provide high performance,
+ reliable access to petabytes of storage.
+
+ More information at http://ceph.newdream.net/.
+
+ If unsure, say N.
+
+if CEPH_FS
+config CEPH_FSCACHE
+ bool "Enable Ceph client caching support"
+ depends on CEPH_FS=m && FSCACHE || CEPH_FS=y && FSCACHE=y
+ help
+ Choose Y here to enable persistent, read-only local
+ caching support for Ceph clients using FS-Cache
+
+endif
+
+config CEPH_FS_POSIX_ACL
+ bool "Ceph POSIX Access Control Lists"
+ depends on CEPH_FS
+ select FS_POSIX_ACL
+ help
+ POSIX Access Control Lists (ACLs) support permissions for users and
+ groups beyond the owner/group/world scheme.
+
+ If you don't know what Access Control Lists are, say N
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
new file mode 100644
index 000000000..a699e3203
--- /dev/null
+++ b/fs/ceph/Makefile
@@ -0,0 +1,14 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for CEPH filesystem.
+#
+
+obj-$(CONFIG_CEPH_FS) += ceph.o
+
+ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
+ export.o caps.o snap.o xattr.o quota.o \
+ mds_client.o mdsmap.o strings.o ceph_frag.o \
+ debugfs.o
+
+ceph-$(CONFIG_CEPH_FSCACHE) += cache.o
+ceph-$(CONFIG_CEPH_FS_POSIX_ACL) += acl.o
diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c
new file mode 100644
index 000000000..027408d55
--- /dev/null
+++ b/fs/ceph/acl.c
@@ -0,0 +1,279 @@
+/*
+ * linux/fs/ceph/acl.c
+ *
+ * Copyright (C) 2013 Guangliang Zhao, <lucienchao@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/ceph/ceph_debug.h>
+#include <linux/fs.h>
+#include <linux/string.h>
+#include <linux/xattr.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/posix_acl.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+#include "super.h"
+
+static inline void ceph_set_cached_acl(struct inode *inode,
+ int type, struct posix_acl *acl)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+
+ spin_lock(&ci->i_ceph_lock);
+ if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0))
+ set_cached_acl(inode, type, acl);
+ else
+ forget_cached_acl(inode, type);
+ spin_unlock(&ci->i_ceph_lock);
+}
+
+struct posix_acl *ceph_get_acl(struct inode *inode, int type)
+{
+ int size;
+ unsigned int retry_cnt = 0;
+ const char *name;
+ char *value = NULL;
+ struct posix_acl *acl;
+
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ name = XATTR_NAME_POSIX_ACL_ACCESS;
+ break;
+ case ACL_TYPE_DEFAULT:
+ name = XATTR_NAME_POSIX_ACL_DEFAULT;
+ break;
+ default:
+ BUG();
+ }
+
+retry:
+ size = __ceph_getxattr(inode, name, "", 0);
+ if (size > 0) {
+ value = kzalloc(size, GFP_NOFS);
+ if (!value)
+ return ERR_PTR(-ENOMEM);
+ size = __ceph_getxattr(inode, name, value, size);
+ }
+
+ if (size == -ERANGE && retry_cnt < 10) {
+ retry_cnt++;
+ kfree(value);
+ value = NULL;
+ goto retry;
+ }
+
+ if (size > 0) {
+ acl = posix_acl_from_xattr(&init_user_ns, value, size);
+ } else if (size == -ENODATA || size == 0) {
+ acl = NULL;
+ } else {
+ pr_err_ratelimited("get acl %llx.%llx failed, err=%d\n",
+ ceph_vinop(inode), size);
+ acl = ERR_PTR(-EIO);
+ }
+
+ kfree(value);
+
+ if (!IS_ERR(acl))
+ ceph_set_cached_acl(inode, type, acl);
+
+ return acl;
+}
+
+int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+{
+ int ret = 0, size = 0;
+ const char *name = NULL;
+ char *value = NULL;
+ struct iattr newattrs;
+ struct timespec64 old_ctime = inode->i_ctime;
+ umode_t new_mode = inode->i_mode, old_mode = inode->i_mode;
+
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ name = XATTR_NAME_POSIX_ACL_ACCESS;
+ if (acl) {
+ ret = posix_acl_update_mode(inode, &new_mode, &acl);
+ if (ret)
+ goto out;
+ }
+ break;
+ case ACL_TYPE_DEFAULT:
+ if (!S_ISDIR(inode->i_mode)) {
+ ret = acl ? -EINVAL : 0;
+ goto out;
+ }
+ name = XATTR_NAME_POSIX_ACL_DEFAULT;
+ break;
+ default:
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (acl) {
+ size = posix_acl_xattr_size(acl->a_count);
+ value = kmalloc(size, GFP_NOFS);
+ if (!value) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = posix_acl_to_xattr(&init_user_ns, acl, value, size);
+ if (ret < 0)
+ goto out_free;
+ }
+
+ if (ceph_snap(inode) != CEPH_NOSNAP) {
+ ret = -EROFS;
+ goto out_free;
+ }
+
+ if (new_mode != old_mode) {
+ newattrs.ia_ctime = current_time(inode);
+ newattrs.ia_mode = new_mode;
+ newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
+ ret = __ceph_setattr(inode, &newattrs);
+ if (ret)
+ goto out_free;
+ }
+
+ ret = __ceph_setxattr(inode, name, value, size, 0);
+ if (ret) {
+ if (new_mode != old_mode) {
+ newattrs.ia_ctime = old_ctime;
+ newattrs.ia_mode = old_mode;
+ newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
+ __ceph_setattr(inode, &newattrs);
+ }
+ goto out_free;
+ }
+
+ ceph_set_cached_acl(inode, type, acl);
+
+out_free:
+ kfree(value);
+out:
+ return ret;
+}
+
+int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
+ struct ceph_acls_info *info)
+{
+ struct posix_acl *acl, *default_acl;
+ size_t val_size1 = 0, val_size2 = 0;
+ struct ceph_pagelist *pagelist = NULL;
+ void *tmp_buf = NULL;
+ int err;
+
+ err = posix_acl_create(dir, mode, &default_acl, &acl);
+ if (err)
+ return err;
+
+ if (acl) {
+ err = posix_acl_equiv_mode(acl, mode);
+ if (err < 0)
+ goto out_err;
+ if (err == 0) {
+ posix_acl_release(acl);
+ acl = NULL;
+ }
+ }
+
+ if (!default_acl && !acl)
+ return 0;
+
+ if (acl)
+ val_size1 = posix_acl_xattr_size(acl->a_count);
+ if (default_acl)
+ val_size2 = posix_acl_xattr_size(default_acl->a_count);
+
+ err = -ENOMEM;
+ tmp_buf = kmalloc(max(val_size1, val_size2), GFP_KERNEL);
+ if (!tmp_buf)
+ goto out_err;
+ pagelist = kmalloc(sizeof(struct ceph_pagelist), GFP_KERNEL);
+ if (!pagelist)
+ goto out_err;
+ ceph_pagelist_init(pagelist);
+
+ err = ceph_pagelist_reserve(pagelist, PAGE_SIZE);
+ if (err)
+ goto out_err;
+
+ ceph_pagelist_encode_32(pagelist, acl && default_acl ? 2 : 1);
+
+ if (acl) {
+ size_t len = strlen(XATTR_NAME_POSIX_ACL_ACCESS);
+ err = ceph_pagelist_reserve(pagelist, len + val_size1 + 8);
+ if (err)
+ goto out_err;
+ ceph_pagelist_encode_string(pagelist, XATTR_NAME_POSIX_ACL_ACCESS,
+ len);
+ err = posix_acl_to_xattr(&init_user_ns, acl,
+ tmp_buf, val_size1);
+ if (err < 0)
+ goto out_err;
+ ceph_pagelist_encode_32(pagelist, val_size1);
+ ceph_pagelist_append(pagelist, tmp_buf, val_size1);
+ }
+ if (default_acl) {
+ size_t len = strlen(XATTR_NAME_POSIX_ACL_DEFAULT);
+ err = ceph_pagelist_reserve(pagelist, len + val_size2 + 8);
+ if (err)
+ goto out_err;
+ err = ceph_pagelist_encode_string(pagelist,
+ XATTR_NAME_POSIX_ACL_DEFAULT, len);
+ err = posix_acl_to_xattr(&init_user_ns, default_acl,
+ tmp_buf, val_size2);
+ if (err < 0)
+ goto out_err;
+ ceph_pagelist_encode_32(pagelist, val_size2);
+ ceph_pagelist_append(pagelist, tmp_buf, val_size2);
+ }
+
+ kfree(tmp_buf);
+
+ info->acl = acl;
+ info->default_acl = default_acl;
+ info->pagelist = pagelist;
+ return 0;
+
+out_err:
+ posix_acl_release(acl);
+ posix_acl_release(default_acl);
+ kfree(tmp_buf);
+ if (pagelist)
+ ceph_pagelist_release(pagelist);
+ return err;
+}
+
+void ceph_init_inode_acls(struct inode* inode, struct ceph_acls_info *info)
+{
+ if (!inode)
+ return;
+ ceph_set_cached_acl(inode, ACL_TYPE_ACCESS, info->acl);
+ ceph_set_cached_acl(inode, ACL_TYPE_DEFAULT, info->default_acl);
+}
+
+void ceph_release_acls_info(struct ceph_acls_info *info)
+{
+ posix_acl_release(info->acl);
+ posix_acl_release(info->default_acl);
+ if (info->pagelist)
+ ceph_pagelist_release(info->pagelist);
+}
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
new file mode 100644
index 000000000..de10899da
--- /dev/null
+++ b/fs/ceph/addr.c
@@ -0,0 +1,2059 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/backing-dev.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h> /* generic_writepages */
+#include <linux/slab.h>
+#include <linux/pagevec.h>
+#include <linux/task_io_accounting_ops.h>
+#include <linux/signal.h>
+
+#include "super.h"
+#include "mds_client.h"
+#include "cache.h"
+#include <linux/ceph/osd_client.h>
+#include <linux/ceph/striper.h>
+
+/*
+ * Ceph address space ops.
+ *
+ * There are a few funny things going on here.
+ *
+ * The page->private field is used to reference a struct
+ * ceph_snap_context for _every_ dirty page. This indicates which
+ * snapshot the page was logically dirtied in, and thus which snap
+ * context needs to be associated with the osd write during writeback.
+ *
+ * Similarly, struct ceph_inode_info maintains a set of counters to
+ * count dirty pages on the inode. In the absence of snapshots,
+ * i_wrbuffer_ref == i_wrbuffer_ref_head == the dirty page count.
+ *
+ * When a snapshot is taken (that is, when the client receives
+ * notification that a snapshot was taken), each inode with caps and
+ * with dirty pages (dirty pages implies there is a cap) gets a new
+ * ceph_cap_snap in the i_cap_snaps list (which is sorted in ascending
+ * order, new snaps go to the tail). The i_wrbuffer_ref_head count is
+ * moved to capsnap->dirty. (Unless a sync write is currently in
+ * progress. In that case, the capsnap is said to be "pending", new
+ * writes cannot start, and the capsnap isn't "finalized" until the
+ * write completes (or fails) and a final size/mtime for the inode for
+ * that snap can be settled upon.) i_wrbuffer_ref_head is reset to 0.
+ *
+ * On writeback, we must submit writes to the osd IN SNAP ORDER. So,
+ * we look for the first capsnap in i_cap_snaps and write out pages in
+ * that snap context _only_. Then we move on to the next capsnap,
+ * eventually reaching the "live" or "head" context (i.e., pages that
+ * are not yet snapped) and are writing the most recently dirtied
+ * pages.
+ *
+ * Invalidate and so forth must take care to ensure the dirty page
+ * accounting is preserved.
+ */
+
+#define CONGESTION_ON_THRESH(congestion_kb) (congestion_kb >> (PAGE_SHIFT-10))
+#define CONGESTION_OFF_THRESH(congestion_kb) \
+ (CONGESTION_ON_THRESH(congestion_kb) - \
+ (CONGESTION_ON_THRESH(congestion_kb) >> 2))
+
+static inline struct ceph_snap_context *page_snap_context(struct page *page)
+{
+ if (PagePrivate(page))
+ return (void *)page->private;
+ return NULL;
+}
+
+/*
+ * Dirty a page. Optimistically adjust accounting, on the assumption
+ * that we won't race with invalidate. If we do, readjust.
+ */
+static int ceph_set_page_dirty(struct page *page)
+{
+ struct address_space *mapping = page->mapping;
+ struct inode *inode;
+ struct ceph_inode_info *ci;
+ struct ceph_snap_context *snapc;
+
+ if (PageDirty(page)) {
+ dout("%p set_page_dirty %p idx %lu -- already dirty\n",
+ mapping->host, page, page->index);
+ BUG_ON(!PagePrivate(page));
+ return 0;
+ }
+
+ inode = mapping->host;
+ ci = ceph_inode(inode);
+
+ /* dirty the head */
+ spin_lock(&ci->i_ceph_lock);
+ BUG_ON(ci->i_wr_ref == 0); // caller should hold Fw reference
+ if (__ceph_have_pending_cap_snap(ci)) {
+ struct ceph_cap_snap *capsnap =
+ list_last_entry(&ci->i_cap_snaps,
+ struct ceph_cap_snap,
+ ci_item);
+ snapc = ceph_get_snap_context(capsnap->context);
+ capsnap->dirty_pages++;
+ } else {
+ BUG_ON(!ci->i_head_snapc);
+ snapc = ceph_get_snap_context(ci->i_head_snapc);
+ ++ci->i_wrbuffer_ref_head;
+ }
+ if (ci->i_wrbuffer_ref == 0)
+ ihold(inode);
+ ++ci->i_wrbuffer_ref;
+ dout("%p set_page_dirty %p idx %lu head %d/%d -> %d/%d "
+ "snapc %p seq %lld (%d snaps)\n",
+ mapping->host, page, page->index,
+ ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1,
+ ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
+ snapc, snapc->seq, snapc->num_snaps);
+ spin_unlock(&ci->i_ceph_lock);
+
+ /*
+ * Reference snap context in page->private. Also set
+ * PagePrivate so that we get invalidatepage callback.
+ */
+ BUG_ON(PagePrivate(page));
+ page->private = (unsigned long)snapc;
+ SetPagePrivate(page);
+
+ return __set_page_dirty_nobuffers(page);
+}
+
+/*
+ * If we are truncating the full page (i.e. offset == 0), adjust the
+ * dirty page counters appropriately. Only called if there is private
+ * data on the page.
+ */
+static void ceph_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
+{
+ struct inode *inode;
+ struct ceph_inode_info *ci;
+ struct ceph_snap_context *snapc = page_snap_context(page);
+
+ inode = page->mapping->host;
+ ci = ceph_inode(inode);
+
+ if (offset != 0 || length != PAGE_SIZE) {
+ dout("%p invalidatepage %p idx %lu partial dirty page %u~%u\n",
+ inode, page, page->index, offset, length);
+ return;
+ }
+
+ ceph_invalidate_fscache_page(inode, page);
+
+ WARN_ON(!PageLocked(page));
+ if (!PagePrivate(page))
+ return;
+
+ ClearPageChecked(page);
+
+ dout("%p invalidatepage %p idx %lu full dirty page\n",
+ inode, page, page->index);
+
+ ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
+ ceph_put_snap_context(snapc);
+ page->private = 0;
+ ClearPagePrivate(page);
+}
+
+static int ceph_releasepage(struct page *page, gfp_t g)
+{
+ dout("%p releasepage %p idx %lu (%sdirty)\n", page->mapping->host,
+ page, page->index, PageDirty(page) ? "" : "not ");
+
+ /* Can we release the page from the cache? */
+ if (!ceph_release_fscache_page(page, g))
+ return 0;
+
+ return !PagePrivate(page);
+}
+
+/*
+ * read a single page, without unlocking it.
+ */
+static int ceph_do_readpage(struct file *filp, struct page *page)
+{
+ struct inode *inode = file_inode(filp);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_osd_client *osdc =
+ &ceph_inode_to_client(inode)->client->osdc;
+ int err = 0;
+ u64 off = page_offset(page);
+ u64 len = PAGE_SIZE;
+
+ if (off >= i_size_read(inode)) {
+ zero_user_segment(page, 0, PAGE_SIZE);
+ SetPageUptodate(page);
+ return 0;
+ }
+
+ if (ci->i_inline_version != CEPH_INLINE_NONE) {
+ /*
+ * Uptodate inline data should have been added
+ * into page cache while getting Fcr caps.
+ */
+ if (off == 0)
+ return -EINVAL;
+ zero_user_segment(page, 0, PAGE_SIZE);
+ SetPageUptodate(page);
+ return 0;
+ }
+
+ err = ceph_readpage_from_fscache(inode, page);
+ if (err == 0)
+ return -EINPROGRESS;
+
+ dout("readpage inode %p file %p page %p index %lu\n",
+ inode, filp, page, page->index);
+ err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
+ off, &len,
+ ci->i_truncate_seq, ci->i_truncate_size,
+ &page, 1, 0);
+ if (err == -ENOENT)
+ err = 0;
+ if (err < 0) {
+ SetPageError(page);
+ ceph_fscache_readpage_cancel(inode, page);
+ goto out;
+ }
+ if (err < PAGE_SIZE)
+ /* zero fill remainder of page */
+ zero_user_segment(page, err, PAGE_SIZE);
+ else
+ flush_dcache_page(page);
+
+ SetPageUptodate(page);
+ ceph_readpage_to_fscache(inode, page);
+
+out:
+ return err < 0 ? err : 0;
+}
+
+static int ceph_readpage(struct file *filp, struct page *page)
+{
+ int r = ceph_do_readpage(filp, page);
+ if (r != -EINPROGRESS)
+ unlock_page(page);
+ else
+ r = 0;
+ return r;
+}
+
+/*
+ * Finish an async read(ahead) op.
+ */
+static void finish_read(struct ceph_osd_request *req)
+{
+ struct inode *inode = req->r_inode;
+ struct ceph_osd_data *osd_data;
+ int rc = req->r_result <= 0 ? req->r_result : 0;
+ int bytes = req->r_result >= 0 ? req->r_result : 0;
+ int num_pages;
+ int i;
+
+ dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes);
+
+ /* unlock all pages, zeroing any data we didn't read */
+ osd_data = osd_req_op_extent_osd_data(req, 0);
+ BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
+ num_pages = calc_pages_for((u64)osd_data->alignment,
+ (u64)osd_data->length);
+ for (i = 0; i < num_pages; i++) {
+ struct page *page = osd_data->pages[i];
+
+ if (rc < 0 && rc != -ENOENT) {
+ ceph_fscache_readpage_cancel(inode, page);
+ goto unlock;
+ }
+ if (bytes < (int)PAGE_SIZE) {
+ /* zero (remainder of) page */
+ int s = bytes < 0 ? 0 : bytes;
+ zero_user_segment(page, s, PAGE_SIZE);
+ }
+ dout("finish_read %p uptodate %p idx %lu\n", inode, page,
+ page->index);
+ flush_dcache_page(page);
+ SetPageUptodate(page);
+ ceph_readpage_to_fscache(inode, page);
+unlock:
+ unlock_page(page);
+ put_page(page);
+ bytes -= PAGE_SIZE;
+ }
+ kfree(osd_data->pages);
+}
+
+/*
+ * start an async read(ahead) operation. return nr_pages we submitted
+ * a read for on success, or negative error code.
+ */
+static int start_read(struct inode *inode, struct ceph_rw_context *rw_ctx,
+ struct list_head *page_list, int max)
+{
+ struct ceph_osd_client *osdc =
+ &ceph_inode_to_client(inode)->client->osdc;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct page *page = list_entry(page_list->prev, struct page, lru);
+ struct ceph_vino vino;
+ struct ceph_osd_request *req;
+ u64 off;
+ u64 len;
+ int i;
+ struct page **pages;
+ pgoff_t next_index;
+ int nr_pages = 0;
+ int got = 0;
+ int ret = 0;
+
+ if (!rw_ctx) {
+ /* caller of readpages does not hold buffer and read caps
+ * (fadvise, madvise and readahead cases) */
+ int want = CEPH_CAP_FILE_CACHE;
+ ret = ceph_try_get_caps(ci, CEPH_CAP_FILE_RD, want, &got);
+ if (ret < 0) {
+ dout("start_read %p, error getting cap\n", inode);
+ } else if (!(got & want)) {
+ dout("start_read %p, no cache cap\n", inode);
+ ret = 0;
+ }
+ if (ret <= 0) {
+ if (got)
+ ceph_put_cap_refs(ci, got);
+ while (!list_empty(page_list)) {
+ page = list_entry(page_list->prev,
+ struct page, lru);
+ list_del(&page->lru);
+ put_page(page);
+ }
+ return ret;
+ }
+ }
+
+ off = (u64) page_offset(page);
+
+ /* count pages */
+ next_index = page->index;
+ list_for_each_entry_reverse(page, page_list, lru) {
+ if (page->index != next_index)
+ break;
+ nr_pages++;
+ next_index++;
+ if (max && nr_pages == max)
+ break;
+ }
+ len = nr_pages << PAGE_SHIFT;
+ dout("start_read %p nr_pages %d is %lld~%lld\n", inode, nr_pages,
+ off, len);
+ vino = ceph_vino(inode);
+ req = ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len,
+ 0, 1, CEPH_OSD_OP_READ,
+ CEPH_OSD_FLAG_READ, NULL,
+ ci->i_truncate_seq, ci->i_truncate_size,
+ false);
+ if (IS_ERR(req)) {
+ ret = PTR_ERR(req);
+ goto out;
+ }
+
+ /* build page vector */
+ nr_pages = calc_pages_for(0, len);
+ pages = kmalloc_array(nr_pages, sizeof(*pages), GFP_KERNEL);
+ if (!pages) {
+ ret = -ENOMEM;
+ goto out_put;
+ }
+ for (i = 0; i < nr_pages; ++i) {
+ page = list_entry(page_list->prev, struct page, lru);
+ BUG_ON(PageLocked(page));
+ list_del(&page->lru);
+
+ dout("start_read %p adding %p idx %lu\n", inode, page,
+ page->index);
+ if (add_to_page_cache_lru(page, &inode->i_data, page->index,
+ GFP_KERNEL)) {
+ ceph_fscache_uncache_page(inode, page);
+ put_page(page);
+ dout("start_read %p add_to_page_cache failed %p\n",
+ inode, page);
+ nr_pages = i;
+ if (nr_pages > 0) {
+ len = nr_pages << PAGE_SHIFT;
+ osd_req_op_extent_update(req, 0, len);
+ break;
+ }
+ goto out_pages;
+ }
+ pages[i] = page;
+ }
+ osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, false);
+ req->r_callback = finish_read;
+ req->r_inode = inode;
+
+ dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len);
+ ret = ceph_osdc_start_request(osdc, req, false);
+ if (ret < 0)
+ goto out_pages;
+ ceph_osdc_put_request(req);
+
+ /* After adding locked pages to page cache, the inode holds cache cap.
+ * So we can drop our cap refs. */
+ if (got)
+ ceph_put_cap_refs(ci, got);
+
+ return nr_pages;
+
+out_pages:
+ for (i = 0; i < nr_pages; ++i) {
+ ceph_fscache_readpage_cancel(inode, pages[i]);
+ unlock_page(pages[i]);
+ }
+ ceph_put_page_vector(pages, nr_pages, false);
+out_put:
+ ceph_osdc_put_request(req);
+out:
+ if (got)
+ ceph_put_cap_refs(ci, got);
+ return ret;
+}
+
+
+/*
+ * Read multiple pages. Leave pages we don't read + unlock in page_list;
+ * the caller (VM) cleans them up.
+ */
+static int ceph_readpages(struct file *file, struct address_space *mapping,
+ struct list_head *page_list, unsigned nr_pages)
+{
+ struct inode *inode = file_inode(file);
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_file_info *fi = file->private_data;
+ struct ceph_rw_context *rw_ctx;
+ int rc = 0;
+ int max = 0;
+
+ if (ceph_inode(inode)->i_inline_version != CEPH_INLINE_NONE)
+ return -EINVAL;
+
+ rc = ceph_readpages_from_fscache(mapping->host, mapping, page_list,
+ &nr_pages);
+
+ if (rc == 0)
+ goto out;
+
+ rw_ctx = ceph_find_rw_context(fi);
+ max = fsc->mount_options->rsize >> PAGE_SHIFT;
+ dout("readpages %p file %p ctx %p nr_pages %d max %d\n",
+ inode, file, rw_ctx, nr_pages, max);
+ while (!list_empty(page_list)) {
+ rc = start_read(inode, rw_ctx, page_list, max);
+ if (rc < 0)
+ goto out;
+ }
+out:
+ ceph_fscache_readpages_cancel(inode, page_list);
+
+ dout("readpages %p file %p ret %d\n", inode, file, rc);
+ return rc;
+}
+
+struct ceph_writeback_ctl
+{
+ loff_t i_size;
+ u64 truncate_size;
+ u32 truncate_seq;
+ bool size_stable;
+ bool head_snapc;
+};
+
+/*
+ * Get ref for the oldest snapc for an inode with dirty data... that is, the
+ * only snap context we are allowed to write back.
+ */
+static struct ceph_snap_context *
+get_oldest_context(struct inode *inode, struct ceph_writeback_ctl *ctl,
+ struct ceph_snap_context *page_snapc)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_snap_context *snapc = NULL;
+ struct ceph_cap_snap *capsnap = NULL;
+
+ spin_lock(&ci->i_ceph_lock);
+ list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
+ dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap,
+ capsnap->context, capsnap->dirty_pages);
+ if (!capsnap->dirty_pages)
+ continue;
+
+ /* get i_size, truncate_{seq,size} for page_snapc? */
+ if (snapc && capsnap->context != page_snapc)
+ continue;
+
+ if (ctl) {
+ if (capsnap->writing) {
+ ctl->i_size = i_size_read(inode);
+ ctl->size_stable = false;
+ } else {
+ ctl->i_size = capsnap->size;
+ ctl->size_stable = true;
+ }
+ ctl->truncate_size = capsnap->truncate_size;
+ ctl->truncate_seq = capsnap->truncate_seq;
+ ctl->head_snapc = false;
+ }
+
+ if (snapc)
+ break;
+
+ snapc = ceph_get_snap_context(capsnap->context);
+ if (!page_snapc ||
+ page_snapc == snapc ||
+ page_snapc->seq > snapc->seq)
+ break;
+ }
+ if (!snapc && ci->i_wrbuffer_ref_head) {
+ snapc = ceph_get_snap_context(ci->i_head_snapc);
+ dout(" head snapc %p has %d dirty pages\n",
+ snapc, ci->i_wrbuffer_ref_head);
+ if (ctl) {
+ ctl->i_size = i_size_read(inode);
+ ctl->truncate_size = ci->i_truncate_size;
+ ctl->truncate_seq = ci->i_truncate_seq;
+ ctl->size_stable = false;
+ ctl->head_snapc = true;
+ }
+ }
+ spin_unlock(&ci->i_ceph_lock);
+ return snapc;
+}
+
+static u64 get_writepages_data_length(struct inode *inode,
+ struct page *page, u64 start)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_snap_context *snapc = page_snap_context(page);
+ struct ceph_cap_snap *capsnap = NULL;
+ u64 end = i_size_read(inode);
+
+ if (snapc != ci->i_head_snapc) {
+ bool found = false;
+ spin_lock(&ci->i_ceph_lock);
+ list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
+ if (capsnap->context == snapc) {
+ if (!capsnap->writing)
+ end = capsnap->size;
+ found = true;
+ break;
+ }
+ }
+ spin_unlock(&ci->i_ceph_lock);
+ WARN_ON(!found);
+ }
+ if (end > page_offset(page) + PAGE_SIZE)
+ end = page_offset(page) + PAGE_SIZE;
+ return end > start ? end - start : 0;
+}
+
+/*
+ * Write a single page, but leave the page locked.
+ *
+ * If we get a write error, set the page error bit, but still adjust the
+ * dirty page accounting (i.e., page is no longer dirty).
+ */
+static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
+{
+ struct inode *inode;
+ struct ceph_inode_info *ci;
+ struct ceph_fs_client *fsc;
+ struct ceph_snap_context *snapc, *oldest;
+ loff_t page_off = page_offset(page);
+ int err, len = PAGE_SIZE;
+ struct ceph_writeback_ctl ceph_wbc;
+
+ dout("writepage %p idx %lu\n", page, page->index);
+
+ inode = page->mapping->host;
+ ci = ceph_inode(inode);
+ fsc = ceph_inode_to_client(inode);
+
+ /* verify this is a writeable snap context */
+ snapc = page_snap_context(page);
+ if (!snapc) {
+ dout("writepage %p page %p not dirty?\n", inode, page);
+ return 0;
+ }
+ oldest = get_oldest_context(inode, &ceph_wbc, snapc);
+ if (snapc->seq > oldest->seq) {
+ dout("writepage %p page %p snapc %p not writeable - noop\n",
+ inode, page, snapc);
+ /* we should only noop if called by kswapd */
+ WARN_ON(!(current->flags & PF_MEMALLOC));
+ ceph_put_snap_context(oldest);
+ redirty_page_for_writepage(wbc, page);
+ return 0;
+ }
+ ceph_put_snap_context(oldest);
+
+ /* is this a partial page at end of file? */
+ if (page_off >= ceph_wbc.i_size) {
+ dout("%p page eof %llu\n", page, ceph_wbc.i_size);
+ page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE);
+ return 0;
+ }
+
+ if (ceph_wbc.i_size < page_off + len)
+ len = ceph_wbc.i_size - page_off;
+
+ dout("writepage %p page %p index %lu on %llu~%u snapc %p seq %lld\n",
+ inode, page, page->index, page_off, len, snapc, snapc->seq);
+
+ if (atomic_long_inc_return(&fsc->writeback_count) >
+ CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))
+ set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
+
+ set_page_writeback(page);
+ err = ceph_osdc_writepages(&fsc->client->osdc, ceph_vino(inode),
+ &ci->i_layout, snapc, page_off, len,
+ ceph_wbc.truncate_seq,
+ ceph_wbc.truncate_size,
+ &inode->i_mtime, &page, 1);
+ if (err < 0) {
+ struct writeback_control tmp_wbc;
+ if (!wbc)
+ wbc = &tmp_wbc;
+ if (err == -ERESTARTSYS) {
+ /* killed by SIGKILL */
+ dout("writepage interrupted page %p\n", page);
+ redirty_page_for_writepage(wbc, page);
+ end_page_writeback(page);
+ return err;
+ }
+ dout("writepage setting page/mapping error %d %p\n",
+ err, page);
+ SetPageError(page);
+ mapping_set_error(&inode->i_data, err);
+ wbc->pages_skipped++;
+ } else {
+ dout("writepage cleaned page %p\n", page);
+ err = 0; /* vfs expects us to return 0 */
+ }
+ page->private = 0;
+ ClearPagePrivate(page);
+ end_page_writeback(page);
+ ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
+ ceph_put_snap_context(snapc); /* page's reference */
+
+ if (atomic_long_dec_return(&fsc->writeback_count) <
+ CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb))
+ clear_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
+
+ return err;
+}
+
+static int ceph_writepage(struct page *page, struct writeback_control *wbc)
+{
+ int err;
+ struct inode *inode = page->mapping->host;
+ BUG_ON(!inode);
+ ihold(inode);
+ err = writepage_nounlock(page, wbc);
+ if (err == -ERESTARTSYS) {
+ /* direct memory reclaimer was killed by SIGKILL. return 0
+ * to prevent caller from setting mapping/page error */
+ err = 0;
+ }
+ unlock_page(page);
+ iput(inode);
+ return err;
+}
+
+/*
+ * lame release_pages helper. release_pages() isn't exported to
+ * modules.
+ */
+static void ceph_release_pages(struct page **pages, int num)
+{
+ struct pagevec pvec;
+ int i;
+
+ pagevec_init(&pvec);
+ for (i = 0; i < num; i++) {
+ if (pagevec_add(&pvec, pages[i]) == 0)
+ pagevec_release(&pvec);
+ }
+ pagevec_release(&pvec);
+}
+
+/*
+ * async writeback completion handler.
+ *
+ * If we get an error, set the mapping error bit, but not the individual
+ * page error bits.
+ */
+static void writepages_finish(struct ceph_osd_request *req)
+{
+ struct inode *inode = req->r_inode;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_osd_data *osd_data;
+ struct page *page;
+ int num_pages, total_pages = 0;
+ int i, j;
+ int rc = req->r_result;
+ struct ceph_snap_context *snapc = req->r_snapc;
+ struct address_space *mapping = inode->i_mapping;
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ bool remove_page;
+
+ dout("writepages_finish %p rc %d\n", inode, rc);
+ if (rc < 0) {
+ mapping_set_error(mapping, rc);
+ ceph_set_error_write(ci);
+ } else {
+ ceph_clear_error_write(ci);
+ }
+
+ /*
+ * We lost the cache cap, need to truncate the page before
+ * it is unlocked, otherwise we'd truncate it later in the
+ * page truncation thread, possibly losing some data that
+ * raced its way in
+ */
+ remove_page = !(ceph_caps_issued(ci) &
+ (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO));
+
+ /* clean all pages */
+ for (i = 0; i < req->r_num_ops; i++) {
+ if (req->r_ops[i].op != CEPH_OSD_OP_WRITE)
+ break;
+
+ osd_data = osd_req_op_extent_osd_data(req, i);
+ BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
+ num_pages = calc_pages_for((u64)osd_data->alignment,
+ (u64)osd_data->length);
+ total_pages += num_pages;
+ for (j = 0; j < num_pages; j++) {
+ page = osd_data->pages[j];
+ BUG_ON(!page);
+ WARN_ON(!PageUptodate(page));
+
+ if (atomic_long_dec_return(&fsc->writeback_count) <
+ CONGESTION_OFF_THRESH(
+ fsc->mount_options->congestion_kb))
+ clear_bdi_congested(inode_to_bdi(inode),
+ BLK_RW_ASYNC);
+
+ ceph_put_snap_context(page_snap_context(page));
+ page->private = 0;
+ ClearPagePrivate(page);
+ dout("unlocking %p\n", page);
+ end_page_writeback(page);
+
+ if (remove_page)
+ generic_error_remove_page(inode->i_mapping,
+ page);
+
+ unlock_page(page);
+ }
+ dout("writepages_finish %p wrote %llu bytes cleaned %d pages\n",
+ inode, osd_data->length, rc >= 0 ? num_pages : 0);
+
+ ceph_release_pages(osd_data->pages, num_pages);
+ }
+
+ ceph_put_wrbuffer_cap_refs(ci, total_pages, snapc);
+
+ osd_data = osd_req_op_extent_osd_data(req, 0);
+ if (osd_data->pages_from_pool)
+ mempool_free(osd_data->pages,
+ ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool);
+ else
+ kfree(osd_data->pages);
+ ceph_osdc_put_request(req);
+}
+
+/*
+ * initiate async writeback
+ */
+static int ceph_writepages_start(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct inode *inode = mapping->host;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_vino vino = ceph_vino(inode);
+ pgoff_t index, start_index, end = -1;
+ struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc;
+ struct pagevec pvec;
+ int rc = 0;
+ unsigned int wsize = i_blocksize(inode);
+ struct ceph_osd_request *req = NULL;
+ struct ceph_writeback_ctl ceph_wbc;
+ bool should_loop, range_whole = false;
+ bool done = false;
+
+ dout("writepages_start %p (mode=%s)\n", inode,
+ wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
+ (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
+
+ if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
+ if (ci->i_wrbuffer_ref > 0) {
+ pr_warn_ratelimited(
+ "writepage_start %p %lld forced umount\n",
+ inode, ceph_ino(inode));
+ }
+ mapping_set_error(mapping, -EIO);
+ return -EIO; /* we're in a forced umount, don't write! */
+ }
+ if (fsc->mount_options->wsize < wsize)
+ wsize = fsc->mount_options->wsize;
+
+ pagevec_init(&pvec);
+
+ start_index = wbc->range_cyclic ? mapping->writeback_index : 0;
+ index = start_index;
+
+retry:
+ /* find oldest snap context with dirty data */
+ snapc = get_oldest_context(inode, &ceph_wbc, NULL);
+ if (!snapc) {
+ /* hmm, why does writepages get called when there
+ is no dirty data? */
+ dout(" no snap context with dirty data?\n");
+ goto out;
+ }
+ dout(" oldest snapc is %p seq %lld (%d snaps)\n",
+ snapc, snapc->seq, snapc->num_snaps);
+
+ should_loop = false;
+ if (ceph_wbc.head_snapc && snapc != last_snapc) {
+ /* where to start/end? */
+ if (wbc->range_cyclic) {
+ index = start_index;
+ end = -1;
+ if (index > 0)
+ should_loop = true;
+ dout(" cyclic, start at %lu\n", index);
+ } else {
+ index = wbc->range_start >> PAGE_SHIFT;
+ end = wbc->range_end >> PAGE_SHIFT;
+ if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+ range_whole = true;
+ dout(" not cyclic, %lu to %lu\n", index, end);
+ }
+ } else if (!ceph_wbc.head_snapc) {
+ /* Do not respect wbc->range_{start,end}. Dirty pages
+ * in that range can be associated with newer snapc.
+ * They are not writeable until we write all dirty pages
+ * associated with 'snapc' get written */
+ if (index > 0)
+ should_loop = true;
+ dout(" non-head snapc, range whole\n");
+ }
+
+ ceph_put_snap_context(last_snapc);
+ last_snapc = snapc;
+
+ while (!done && index <= end) {
+ int num_ops = 0, op_idx;
+ unsigned i, pvec_pages, max_pages, locked_pages = 0;
+ struct page **pages = NULL, **data_pages;
+ mempool_t *pool = NULL; /* Becomes non-null if mempool used */
+ struct page *page;
+ pgoff_t strip_unit_end = 0;
+ u64 offset = 0, len = 0;
+
+ max_pages = wsize >> PAGE_SHIFT;
+
+get_more_pages:
+ pvec_pages = pagevec_lookup_range_nr_tag(&pvec, mapping, &index,
+ end, PAGECACHE_TAG_DIRTY,
+ max_pages - locked_pages);
+ dout("pagevec_lookup_range_tag got %d\n", pvec_pages);
+ if (!pvec_pages && !locked_pages)
+ break;
+ for (i = 0; i < pvec_pages && locked_pages < max_pages; i++) {
+ page = pvec.pages[i];
+ dout("? %p idx %lu\n", page, page->index);
+ if (locked_pages == 0)
+ lock_page(page); /* first page */
+ else if (!trylock_page(page))
+ break;
+
+ /* only dirty pages, or our accounting breaks */
+ if (unlikely(!PageDirty(page)) ||
+ unlikely(page->mapping != mapping)) {
+ dout("!dirty or !mapping %p\n", page);
+ unlock_page(page);
+ continue;
+ }
+ /* only if matching snap context */
+ pgsnapc = page_snap_context(page);
+ if (pgsnapc != snapc) {
+ dout("page snapc %p %lld != oldest %p %lld\n",
+ pgsnapc, pgsnapc->seq, snapc, snapc->seq);
+ if (!should_loop &&
+ !ceph_wbc.head_snapc &&
+ wbc->sync_mode != WB_SYNC_NONE)
+ should_loop = true;
+ unlock_page(page);
+ continue;
+ }
+ if (page_offset(page) >= ceph_wbc.i_size) {
+ dout("%p page eof %llu\n",
+ page, ceph_wbc.i_size);
+ if ((ceph_wbc.size_stable ||
+ page_offset(page) >= i_size_read(inode)) &&
+ clear_page_dirty_for_io(page))
+ mapping->a_ops->invalidatepage(page,
+ 0, PAGE_SIZE);
+ unlock_page(page);
+ continue;
+ }
+ if (strip_unit_end && (page->index > strip_unit_end)) {
+ dout("end of strip unit %p\n", page);
+ unlock_page(page);
+ break;
+ }
+ if (PageWriteback(page)) {
+ if (wbc->sync_mode == WB_SYNC_NONE) {
+ dout("%p under writeback\n", page);
+ unlock_page(page);
+ continue;
+ }
+ dout("waiting on writeback %p\n", page);
+ wait_on_page_writeback(page);
+ }
+
+ if (!clear_page_dirty_for_io(page)) {
+ dout("%p !clear_page_dirty_for_io\n", page);
+ unlock_page(page);
+ continue;
+ }
+
+ /*
+ * We have something to write. If this is
+ * the first locked page this time through,
+ * calculate max possinle write size and
+ * allocate a page array
+ */
+ if (locked_pages == 0) {
+ u64 objnum;
+ u64 objoff;
+ u32 xlen;
+
+ /* prepare async write request */
+ offset = (u64)page_offset(page);
+ ceph_calc_file_object_mapping(&ci->i_layout,
+ offset, wsize,
+ &objnum, &objoff,
+ &xlen);
+ len = xlen;
+
+ num_ops = 1;
+ strip_unit_end = page->index +
+ ((len - 1) >> PAGE_SHIFT);
+
+ BUG_ON(pages);
+ max_pages = calc_pages_for(0, (u64)len);
+ pages = kmalloc_array(max_pages,
+ sizeof(*pages),
+ GFP_NOFS);
+ if (!pages) {
+ pool = fsc->wb_pagevec_pool;
+ pages = mempool_alloc(pool, GFP_NOFS);
+ BUG_ON(!pages);
+ }
+
+ len = 0;
+ } else if (page->index !=
+ (offset + len) >> PAGE_SHIFT) {
+ if (num_ops >= (pool ? CEPH_OSD_SLAB_OPS :
+ CEPH_OSD_MAX_OPS)) {
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ break;
+ }
+
+ num_ops++;
+ offset = (u64)page_offset(page);
+ len = 0;
+ }
+
+ /* note position of first page in pvec */
+ dout("%p will write page %p idx %lu\n",
+ inode, page, page->index);
+
+ if (atomic_long_inc_return(&fsc->writeback_count) >
+ CONGESTION_ON_THRESH(
+ fsc->mount_options->congestion_kb)) {
+ set_bdi_congested(inode_to_bdi(inode),
+ BLK_RW_ASYNC);
+ }
+
+
+ pages[locked_pages++] = page;
+ pvec.pages[i] = NULL;
+
+ len += PAGE_SIZE;
+ }
+
+ /* did we get anything? */
+ if (!locked_pages)
+ goto release_pvec_pages;
+ if (i) {
+ unsigned j, n = 0;
+ /* shift unused page to beginning of pvec */
+ for (j = 0; j < pvec_pages; j++) {
+ if (!pvec.pages[j])
+ continue;
+ if (n < j)
+ pvec.pages[n] = pvec.pages[j];
+ n++;
+ }
+ pvec.nr = n;
+
+ if (pvec_pages && i == pvec_pages &&
+ locked_pages < max_pages) {
+ dout("reached end pvec, trying for more\n");
+ pagevec_release(&pvec);
+ goto get_more_pages;
+ }
+ }
+
+new_request:
+ offset = page_offset(pages[0]);
+ len = wsize;
+
+ req = ceph_osdc_new_request(&fsc->client->osdc,
+ &ci->i_layout, vino,
+ offset, &len, 0, num_ops,
+ CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE,
+ snapc, ceph_wbc.truncate_seq,
+ ceph_wbc.truncate_size, false);
+ if (IS_ERR(req)) {
+ req = ceph_osdc_new_request(&fsc->client->osdc,
+ &ci->i_layout, vino,
+ offset, &len, 0,
+ min(num_ops,
+ CEPH_OSD_SLAB_OPS),
+ CEPH_OSD_OP_WRITE,
+ CEPH_OSD_FLAG_WRITE,
+ snapc, ceph_wbc.truncate_seq,
+ ceph_wbc.truncate_size, true);
+ BUG_ON(IS_ERR(req));
+ }
+ BUG_ON(len < page_offset(pages[locked_pages - 1]) +
+ PAGE_SIZE - offset);
+
+ req->r_callback = writepages_finish;
+ req->r_inode = inode;
+
+ /* Format the osd request message and submit the write */
+ len = 0;
+ data_pages = pages;
+ op_idx = 0;
+ for (i = 0; i < locked_pages; i++) {
+ u64 cur_offset = page_offset(pages[i]);
+ if (offset + len != cur_offset) {
+ if (op_idx + 1 == req->r_num_ops)
+ break;
+ osd_req_op_extent_dup_last(req, op_idx,
+ cur_offset - offset);
+ dout("writepages got pages at %llu~%llu\n",
+ offset, len);
+ osd_req_op_extent_osd_data_pages(req, op_idx,
+ data_pages, len, 0,
+ !!pool, false);
+ osd_req_op_extent_update(req, op_idx, len);
+
+ len = 0;
+ offset = cur_offset;
+ data_pages = pages + i;
+ op_idx++;
+ }
+
+ set_page_writeback(pages[i]);
+ len += PAGE_SIZE;
+ }
+
+ if (ceph_wbc.size_stable) {
+ len = min(len, ceph_wbc.i_size - offset);
+ } else if (i == locked_pages) {
+ /* writepages_finish() clears writeback pages
+ * according to the data length, so make sure
+ * data length covers all locked pages */
+ u64 min_len = len + 1 - PAGE_SIZE;
+ len = get_writepages_data_length(inode, pages[i - 1],
+ offset);
+ len = max(len, min_len);
+ }
+ dout("writepages got pages at %llu~%llu\n", offset, len);
+
+ osd_req_op_extent_osd_data_pages(req, op_idx, data_pages, len,
+ 0, !!pool, false);
+ osd_req_op_extent_update(req, op_idx, len);
+
+ BUG_ON(op_idx + 1 != req->r_num_ops);
+
+ pool = NULL;
+ if (i < locked_pages) {
+ BUG_ON(num_ops <= req->r_num_ops);
+ num_ops -= req->r_num_ops;
+ locked_pages -= i;
+
+ /* allocate new pages array for next request */
+ data_pages = pages;
+ pages = kmalloc_array(locked_pages, sizeof(*pages),
+ GFP_NOFS);
+ if (!pages) {
+ pool = fsc->wb_pagevec_pool;
+ pages = mempool_alloc(pool, GFP_NOFS);
+ BUG_ON(!pages);
+ }
+ memcpy(pages, data_pages + i,
+ locked_pages * sizeof(*pages));
+ memset(data_pages + i, 0,
+ locked_pages * sizeof(*pages));
+ } else {
+ BUG_ON(num_ops != req->r_num_ops);
+ index = pages[i - 1]->index + 1;
+ /* request message now owns the pages array */
+ pages = NULL;
+ }
+
+ req->r_mtime = inode->i_mtime;
+ rc = ceph_osdc_start_request(&fsc->client->osdc, req, true);
+ BUG_ON(rc);
+ req = NULL;
+
+ wbc->nr_to_write -= i;
+ if (pages)
+ goto new_request;
+
+ /*
+ * We stop writing back only if we are not doing
+ * integrity sync. In case of integrity sync we have to
+ * keep going until we have written all the pages
+ * we tagged for writeback prior to entering this loop.
+ */
+ if (wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE)
+ done = true;
+
+release_pvec_pages:
+ dout("pagevec_release on %d pages (%p)\n", (int)pvec.nr,
+ pvec.nr ? pvec.pages[0] : NULL);
+ pagevec_release(&pvec);
+ }
+
+ if (should_loop && !done) {
+ /* more to do; loop back to beginning of file */
+ dout("writepages looping back to beginning of file\n");
+ end = start_index - 1; /* OK even when start_index == 0 */
+
+ /* to write dirty pages associated with next snapc,
+ * we need to wait until current writes complete */
+ if (wbc->sync_mode != WB_SYNC_NONE &&
+ start_index == 0 && /* all dirty pages were checked */
+ !ceph_wbc.head_snapc) {
+ struct page *page;
+ unsigned i, nr;
+ index = 0;
+ while ((index <= end) &&
+ (nr = pagevec_lookup_tag(&pvec, mapping, &index,
+ PAGECACHE_TAG_WRITEBACK))) {
+ for (i = 0; i < nr; i++) {
+ page = pvec.pages[i];
+ if (page_snap_context(page) != snapc)
+ continue;
+ wait_on_page_writeback(page);
+ }
+ pagevec_release(&pvec);
+ cond_resched();
+ }
+ }
+
+ start_index = 0;
+ index = 0;
+ goto retry;
+ }
+
+ if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
+ mapping->writeback_index = index;
+
+out:
+ ceph_osdc_put_request(req);
+ ceph_put_snap_context(last_snapc);
+ dout("writepages dend - startone, rc = %d\n", rc);
+ return rc;
+}
+
+
+
+/*
+ * See if a given @snapc is either writeable, or already written.
+ */
+static int context_is_writeable_or_written(struct inode *inode,
+ struct ceph_snap_context *snapc)
+{
+ struct ceph_snap_context *oldest = get_oldest_context(inode, NULL, NULL);
+ int ret = !oldest || snapc->seq <= oldest->seq;
+
+ ceph_put_snap_context(oldest);
+ return ret;
+}
+
+/*
+ * We are only allowed to write into/dirty the page if the page is
+ * clean, or already dirty within the same snap context.
+ *
+ * called with page locked.
+ * return success with page locked,
+ * or any failure (incl -EAGAIN) with page unlocked.
+ */
+static int ceph_update_writeable_page(struct file *file,
+ loff_t pos, unsigned len,
+ struct page *page)
+{
+ struct inode *inode = file_inode(file);
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ loff_t page_off = pos & PAGE_MASK;
+ int pos_in_page = pos & ~PAGE_MASK;
+ int end_in_page = pos_in_page + len;
+ loff_t i_size;
+ int r;
+ struct ceph_snap_context *snapc, *oldest;
+
+ if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
+ dout(" page %p forced umount\n", page);
+ unlock_page(page);
+ return -EIO;
+ }
+
+retry_locked:
+ /* writepages currently holds page lock, but if we change that later, */
+ wait_on_page_writeback(page);
+
+ snapc = page_snap_context(page);
+ if (snapc && snapc != ci->i_head_snapc) {
+ /*
+ * this page is already dirty in another (older) snap
+ * context! is it writeable now?
+ */
+ oldest = get_oldest_context(inode, NULL, NULL);
+ if (snapc->seq > oldest->seq) {
+ ceph_put_snap_context(oldest);
+ dout(" page %p snapc %p not current or oldest\n",
+ page, snapc);
+ /*
+ * queue for writeback, and wait for snapc to
+ * be writeable or written
+ */
+ snapc = ceph_get_snap_context(snapc);
+ unlock_page(page);
+ ceph_queue_writeback(inode);
+ r = wait_event_killable(ci->i_cap_wq,
+ context_is_writeable_or_written(inode, snapc));
+ ceph_put_snap_context(snapc);
+ if (r == -ERESTARTSYS)
+ return r;
+ return -EAGAIN;
+ }
+ ceph_put_snap_context(oldest);
+
+ /* yay, writeable, do it now (without dropping page lock) */
+ dout(" page %p snapc %p not current, but oldest\n",
+ page, snapc);
+ if (!clear_page_dirty_for_io(page))
+ goto retry_locked;
+ r = writepage_nounlock(page, NULL);
+ if (r < 0)
+ goto fail_unlock;
+ goto retry_locked;
+ }
+
+ if (PageUptodate(page)) {
+ dout(" page %p already uptodate\n", page);
+ return 0;
+ }
+
+ /* full page? */
+ if (pos_in_page == 0 && len == PAGE_SIZE)
+ return 0;
+
+ /* past end of file? */
+ i_size = i_size_read(inode);
+
+ if (page_off >= i_size ||
+ (pos_in_page == 0 && (pos+len) >= i_size &&
+ end_in_page - pos_in_page != PAGE_SIZE)) {
+ dout(" zeroing %p 0 - %d and %d - %d\n",
+ page, pos_in_page, end_in_page, (int)PAGE_SIZE);
+ zero_user_segments(page,
+ 0, pos_in_page,
+ end_in_page, PAGE_SIZE);
+ return 0;
+ }
+
+ /* we need to read it. */
+ r = ceph_do_readpage(file, page);
+ if (r < 0) {
+ if (r == -EINPROGRESS)
+ return -EAGAIN;
+ goto fail_unlock;
+ }
+ goto retry_locked;
+fail_unlock:
+ unlock_page(page);
+ return r;
+}
+
+/*
+ * We are only allowed to write into/dirty the page if the page is
+ * clean, or already dirty within the same snap context.
+ */
+static int ceph_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata)
+{
+ struct inode *inode = file_inode(file);
+ struct page *page;
+ pgoff_t index = pos >> PAGE_SHIFT;
+ int r;
+
+ do {
+ /* get a page */
+ page = grab_cache_page_write_begin(mapping, index, 0);
+ if (!page)
+ return -ENOMEM;
+
+ dout("write_begin file %p inode %p page %p %d~%d\n", file,
+ inode, page, (int)pos, (int)len);
+
+ r = ceph_update_writeable_page(file, pos, len, page);
+ if (r < 0)
+ put_page(page);
+ else
+ *pagep = page;
+ } while (r == -EAGAIN);
+
+ return r;
+}
+
+/*
+ * we don't do anything in here that simple_write_end doesn't do
+ * except adjust dirty page accounting
+ */
+static int ceph_write_end(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
+{
+ struct inode *inode = file_inode(file);
+ bool check_cap = false;
+
+ dout("write_end file %p inode %p page %p %d~%d (%d)\n", file,
+ inode, page, (int)pos, (int)copied, (int)len);
+
+ /* zero the stale part of the page if we did a short copy */
+ if (!PageUptodate(page)) {
+ if (copied < len) {
+ copied = 0;
+ goto out;
+ }
+ SetPageUptodate(page);
+ }
+
+ /* did file size increase? */
+ if (pos+copied > i_size_read(inode))
+ check_cap = ceph_inode_set_size(inode, pos+copied);
+
+ set_page_dirty(page);
+
+out:
+ unlock_page(page);
+ put_page(page);
+
+ if (check_cap)
+ ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL);
+
+ return copied;
+}
+
+/*
+ * we set .direct_IO to indicate direct io is supported, but since we
+ * intercept O_DIRECT reads and writes early, this function should
+ * never get called.
+ */
+static ssize_t ceph_direct_io(struct kiocb *iocb, struct iov_iter *iter)
+{
+ WARN_ON(1);
+ return -EINVAL;
+}
+
+const struct address_space_operations ceph_aops = {
+ .readpage = ceph_readpage,
+ .readpages = ceph_readpages,
+ .writepage = ceph_writepage,
+ .writepages = ceph_writepages_start,
+ .write_begin = ceph_write_begin,
+ .write_end = ceph_write_end,
+ .set_page_dirty = ceph_set_page_dirty,
+ .invalidatepage = ceph_invalidatepage,
+ .releasepage = ceph_releasepage,
+ .direct_IO = ceph_direct_io,
+};
+
+static void ceph_block_sigs(sigset_t *oldset)
+{
+ sigset_t mask;
+ siginitsetinv(&mask, sigmask(SIGKILL));
+ sigprocmask(SIG_BLOCK, &mask, oldset);
+}
+
+static void ceph_restore_sigs(sigset_t *oldset)
+{
+ sigprocmask(SIG_SETMASK, oldset, NULL);
+}
+
+/*
+ * vm ops
+ */
+static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ struct inode *inode = file_inode(vma->vm_file);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_file_info *fi = vma->vm_file->private_data;
+ struct page *pinned_page = NULL;
+ loff_t off = (loff_t)vmf->pgoff << PAGE_SHIFT;
+ int want, got, err;
+ sigset_t oldset;
+ vm_fault_t ret = VM_FAULT_SIGBUS;
+
+ ceph_block_sigs(&oldset);
+
+ dout("filemap_fault %p %llx.%llx %llu~%zd trying to get caps\n",
+ inode, ceph_vinop(inode), off, (size_t)PAGE_SIZE);
+ if (fi->fmode & CEPH_FILE_MODE_LAZY)
+ want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
+ else
+ want = CEPH_CAP_FILE_CACHE;
+
+ got = 0;
+ err = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page);
+ if (err < 0)
+ goto out_restore;
+
+ dout("filemap_fault %p %llu~%zd got cap refs on %s\n",
+ inode, off, (size_t)PAGE_SIZE, ceph_cap_string(got));
+
+ if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) ||
+ ci->i_inline_version == CEPH_INLINE_NONE) {
+ CEPH_DEFINE_RW_CONTEXT(rw_ctx, got);
+ ceph_add_rw_context(fi, &rw_ctx);
+ ret = filemap_fault(vmf);
+ ceph_del_rw_context(fi, &rw_ctx);
+ dout("filemap_fault %p %llu~%zd drop cap refs %s ret %x\n",
+ inode, off, (size_t)PAGE_SIZE,
+ ceph_cap_string(got), ret);
+ } else
+ err = -EAGAIN;
+
+ if (pinned_page)
+ put_page(pinned_page);
+ ceph_put_cap_refs(ci, got);
+
+ if (err != -EAGAIN)
+ goto out_restore;
+
+ /* read inline data */
+ if (off >= PAGE_SIZE) {
+ /* does not support inline data > PAGE_SIZE */
+ ret = VM_FAULT_SIGBUS;
+ } else {
+ struct address_space *mapping = inode->i_mapping;
+ struct page *page = find_or_create_page(mapping, 0,
+ mapping_gfp_constraint(mapping,
+ ~__GFP_FS));
+ if (!page) {
+ ret = VM_FAULT_OOM;
+ goto out_inline;
+ }
+ err = __ceph_do_getattr(inode, page,
+ CEPH_STAT_CAP_INLINE_DATA, true);
+ if (err < 0 || off >= i_size_read(inode)) {
+ unlock_page(page);
+ put_page(page);
+ if (err == -ENOMEM)
+ ret = VM_FAULT_OOM;
+ else
+ ret = VM_FAULT_SIGBUS;
+ goto out_inline;
+ }
+ if (err < PAGE_SIZE)
+ zero_user_segment(page, err, PAGE_SIZE);
+ else
+ flush_dcache_page(page);
+ SetPageUptodate(page);
+ vmf->page = page;
+ ret = VM_FAULT_MAJOR | VM_FAULT_LOCKED;
+out_inline:
+ dout("filemap_fault %p %llu~%zd read inline data ret %x\n",
+ inode, off, (size_t)PAGE_SIZE, ret);
+ }
+out_restore:
+ ceph_restore_sigs(&oldset);
+ if (err < 0)
+ ret = vmf_error(err);
+
+ return ret;
+}
+
+/*
+ * Reuse write_begin here for simplicity.
+ */
+static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ struct inode *inode = file_inode(vma->vm_file);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_file_info *fi = vma->vm_file->private_data;
+ struct ceph_cap_flush *prealloc_cf;
+ struct page *page = vmf->page;
+ loff_t off = page_offset(page);
+ loff_t size = i_size_read(inode);
+ size_t len;
+ int want, got, err;
+ sigset_t oldset;
+ vm_fault_t ret = VM_FAULT_SIGBUS;
+
+ prealloc_cf = ceph_alloc_cap_flush();
+ if (!prealloc_cf)
+ return VM_FAULT_OOM;
+
+ ceph_block_sigs(&oldset);
+
+ if (ci->i_inline_version != CEPH_INLINE_NONE) {
+ struct page *locked_page = NULL;
+ if (off == 0) {
+ lock_page(page);
+ locked_page = page;
+ }
+ err = ceph_uninline_data(vma->vm_file, locked_page);
+ if (locked_page)
+ unlock_page(locked_page);
+ if (err < 0)
+ goto out_free;
+ }
+
+ if (off + PAGE_SIZE <= size)
+ len = PAGE_SIZE;
+ else
+ len = size & ~PAGE_MASK;
+
+ dout("page_mkwrite %p %llx.%llx %llu~%zd getting caps i_size %llu\n",
+ inode, ceph_vinop(inode), off, len, size);
+ if (fi->fmode & CEPH_FILE_MODE_LAZY)
+ want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
+ else
+ want = CEPH_CAP_FILE_BUFFER;
+
+ got = 0;
+ err = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, off + len,
+ &got, NULL);
+ if (err < 0)
+ goto out_free;
+
+ dout("page_mkwrite %p %llu~%zd got cap refs on %s\n",
+ inode, off, len, ceph_cap_string(got));
+
+ /* Update time before taking page lock */
+ file_update_time(vma->vm_file);
+
+ do {
+ lock_page(page);
+
+ if ((off > size) || (page->mapping != inode->i_mapping)) {
+ unlock_page(page);
+ ret = VM_FAULT_NOPAGE;
+ break;
+ }
+
+ err = ceph_update_writeable_page(vma->vm_file, off, len, page);
+ if (err >= 0) {
+ /* success. we'll keep the page locked. */
+ set_page_dirty(page);
+ ret = VM_FAULT_LOCKED;
+ }
+ } while (err == -EAGAIN);
+
+ if (ret == VM_FAULT_LOCKED ||
+ ci->i_inline_version != CEPH_INLINE_NONE) {
+ int dirty;
+ spin_lock(&ci->i_ceph_lock);
+ ci->i_inline_version = CEPH_INLINE_NONE;
+ dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
+ &prealloc_cf);
+ spin_unlock(&ci->i_ceph_lock);
+ if (dirty)
+ __mark_inode_dirty(inode, dirty);
+ }
+
+ dout("page_mkwrite %p %llu~%zd dropping cap refs on %s ret %x\n",
+ inode, off, len, ceph_cap_string(got), ret);
+ ceph_put_cap_refs(ci, got);
+out_free:
+ ceph_restore_sigs(&oldset);
+ ceph_free_cap_flush(prealloc_cf);
+ if (err < 0)
+ ret = vmf_error(err);
+ return ret;
+}
+
+void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
+ char *data, size_t len)
+{
+ struct address_space *mapping = inode->i_mapping;
+ struct page *page;
+
+ if (locked_page) {
+ page = locked_page;
+ } else {
+ if (i_size_read(inode) == 0)
+ return;
+ page = find_or_create_page(mapping, 0,
+ mapping_gfp_constraint(mapping,
+ ~__GFP_FS));
+ if (!page)
+ return;
+ if (PageUptodate(page)) {
+ unlock_page(page);
+ put_page(page);
+ return;
+ }
+ }
+
+ dout("fill_inline_data %p %llx.%llx len %zu locked_page %p\n",
+ inode, ceph_vinop(inode), len, locked_page);
+
+ if (len > 0) {
+ void *kaddr = kmap_atomic(page);
+ memcpy(kaddr, data, len);
+ kunmap_atomic(kaddr);
+ }
+
+ if (page != locked_page) {
+ if (len < PAGE_SIZE)
+ zero_user_segment(page, len, PAGE_SIZE);
+ else
+ flush_dcache_page(page);
+
+ SetPageUptodate(page);
+ unlock_page(page);
+ put_page(page);
+ }
+}
+
+int ceph_uninline_data(struct file *filp, struct page *locked_page)
+{
+ struct inode *inode = file_inode(filp);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_osd_request *req;
+ struct page *page = NULL;
+ u64 len, inline_version;
+ int err = 0;
+ bool from_pagecache = false;
+
+ spin_lock(&ci->i_ceph_lock);
+ inline_version = ci->i_inline_version;
+ spin_unlock(&ci->i_ceph_lock);
+
+ dout("uninline_data %p %llx.%llx inline_version %llu\n",
+ inode, ceph_vinop(inode), inline_version);
+
+ if (inline_version == 1 || /* initial version, no data */
+ inline_version == CEPH_INLINE_NONE)
+ goto out;
+
+ if (locked_page) {
+ page = locked_page;
+ WARN_ON(!PageUptodate(page));
+ } else if (ceph_caps_issued(ci) &
+ (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) {
+ page = find_get_page(inode->i_mapping, 0);
+ if (page) {
+ if (PageUptodate(page)) {
+ from_pagecache = true;
+ lock_page(page);
+ } else {
+ put_page(page);
+ page = NULL;
+ }
+ }
+ }
+
+ if (page) {
+ len = i_size_read(inode);
+ if (len > PAGE_SIZE)
+ len = PAGE_SIZE;
+ } else {
+ page = __page_cache_alloc(GFP_NOFS);
+ if (!page) {
+ err = -ENOMEM;
+ goto out;
+ }
+ err = __ceph_do_getattr(inode, page,
+ CEPH_STAT_CAP_INLINE_DATA, true);
+ if (err < 0) {
+ /* no inline data */
+ if (err == -ENODATA)
+ err = 0;
+ goto out;
+ }
+ len = err;
+ }
+
+ req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
+ ceph_vino(inode), 0, &len, 0, 1,
+ CEPH_OSD_OP_CREATE, CEPH_OSD_FLAG_WRITE,
+ NULL, 0, 0, false);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto out;
+ }
+
+ req->r_mtime = inode->i_mtime;
+ err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
+ if (!err)
+ err = ceph_osdc_wait_request(&fsc->client->osdc, req);
+ ceph_osdc_put_request(req);
+ if (err < 0)
+ goto out;
+
+ req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
+ ceph_vino(inode), 0, &len, 1, 3,
+ CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE,
+ NULL, ci->i_truncate_seq,
+ ci->i_truncate_size, false);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto out;
+ }
+
+ osd_req_op_extent_osd_data_pages(req, 1, &page, len, 0, false, false);
+
+ {
+ __le64 xattr_buf = cpu_to_le64(inline_version);
+ err = osd_req_op_xattr_init(req, 0, CEPH_OSD_OP_CMPXATTR,
+ "inline_version", &xattr_buf,
+ sizeof(xattr_buf),
+ CEPH_OSD_CMPXATTR_OP_GT,
+ CEPH_OSD_CMPXATTR_MODE_U64);
+ if (err)
+ goto out_put;
+ }
+
+ {
+ char xattr_buf[32];
+ int xattr_len = snprintf(xattr_buf, sizeof(xattr_buf),
+ "%llu", inline_version);
+ err = osd_req_op_xattr_init(req, 2, CEPH_OSD_OP_SETXATTR,
+ "inline_version",
+ xattr_buf, xattr_len, 0, 0);
+ if (err)
+ goto out_put;
+ }
+
+ req->r_mtime = inode->i_mtime;
+ err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
+ if (!err)
+ err = ceph_osdc_wait_request(&fsc->client->osdc, req);
+out_put:
+ ceph_osdc_put_request(req);
+ if (err == -ECANCELED)
+ err = 0;
+out:
+ if (page && page != locked_page) {
+ if (from_pagecache) {
+ unlock_page(page);
+ put_page(page);
+ } else
+ __free_pages(page, 0);
+ }
+
+ dout("uninline_data %p %llx.%llx inline_version %llu = %d\n",
+ inode, ceph_vinop(inode), inline_version, err);
+ return err;
+}
+
+static const struct vm_operations_struct ceph_vmops = {
+ .fault = ceph_filemap_fault,
+ .page_mkwrite = ceph_page_mkwrite,
+};
+
+int ceph_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct address_space *mapping = file->f_mapping;
+
+ if (!mapping->a_ops->readpage)
+ return -ENOEXEC;
+ file_accessed(file);
+ vma->vm_ops = &ceph_vmops;
+ return 0;
+}
+
+enum {
+ POOL_READ = 1,
+ POOL_WRITE = 2,
+};
+
+static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
+ s64 pool, struct ceph_string *pool_ns)
+{
+ struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode);
+ struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_osd_request *rd_req = NULL, *wr_req = NULL;
+ struct rb_node **p, *parent;
+ struct ceph_pool_perm *perm;
+ struct page **pages;
+ size_t pool_ns_len;
+ int err = 0, err2 = 0, have = 0;
+
+ down_read(&mdsc->pool_perm_rwsem);
+ p = &mdsc->pool_perm_tree.rb_node;
+ while (*p) {
+ perm = rb_entry(*p, struct ceph_pool_perm, node);
+ if (pool < perm->pool)
+ p = &(*p)->rb_left;
+ else if (pool > perm->pool)
+ p = &(*p)->rb_right;
+ else {
+ int ret = ceph_compare_string(pool_ns,
+ perm->pool_ns,
+ perm->pool_ns_len);
+ if (ret < 0)
+ p = &(*p)->rb_left;
+ else if (ret > 0)
+ p = &(*p)->rb_right;
+ else {
+ have = perm->perm;
+ break;
+ }
+ }
+ }
+ up_read(&mdsc->pool_perm_rwsem);
+ if (*p)
+ goto out;
+
+ if (pool_ns)
+ dout("__ceph_pool_perm_get pool %lld ns %.*s no perm cached\n",
+ pool, (int)pool_ns->len, pool_ns->str);
+ else
+ dout("__ceph_pool_perm_get pool %lld no perm cached\n", pool);
+
+ down_write(&mdsc->pool_perm_rwsem);
+ p = &mdsc->pool_perm_tree.rb_node;
+ parent = NULL;
+ while (*p) {
+ parent = *p;
+ perm = rb_entry(parent, struct ceph_pool_perm, node);
+ if (pool < perm->pool)
+ p = &(*p)->rb_left;
+ else if (pool > perm->pool)
+ p = &(*p)->rb_right;
+ else {
+ int ret = ceph_compare_string(pool_ns,
+ perm->pool_ns,
+ perm->pool_ns_len);
+ if (ret < 0)
+ p = &(*p)->rb_left;
+ else if (ret > 0)
+ p = &(*p)->rb_right;
+ else {
+ have = perm->perm;
+ break;
+ }
+ }
+ }
+ if (*p) {
+ up_write(&mdsc->pool_perm_rwsem);
+ goto out;
+ }
+
+ rd_req = ceph_osdc_alloc_request(&fsc->client->osdc, NULL,
+ 1, false, GFP_NOFS);
+ if (!rd_req) {
+ err = -ENOMEM;
+ goto out_unlock;
+ }
+
+ rd_req->r_flags = CEPH_OSD_FLAG_READ;
+ osd_req_op_init(rd_req, 0, CEPH_OSD_OP_STAT, 0);
+ rd_req->r_base_oloc.pool = pool;
+ if (pool_ns)
+ rd_req->r_base_oloc.pool_ns = ceph_get_string(pool_ns);
+ ceph_oid_printf(&rd_req->r_base_oid, "%llx.00000000", ci->i_vino.ino);
+
+ err = ceph_osdc_alloc_messages(rd_req, GFP_NOFS);
+ if (err)
+ goto out_unlock;
+
+ wr_req = ceph_osdc_alloc_request(&fsc->client->osdc, NULL,
+ 1, false, GFP_NOFS);
+ if (!wr_req) {
+ err = -ENOMEM;
+ goto out_unlock;
+ }
+
+ wr_req->r_flags = CEPH_OSD_FLAG_WRITE;
+ osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL);
+ ceph_oloc_copy(&wr_req->r_base_oloc, &rd_req->r_base_oloc);
+ ceph_oid_copy(&wr_req->r_base_oid, &rd_req->r_base_oid);
+
+ err = ceph_osdc_alloc_messages(wr_req, GFP_NOFS);
+ if (err)
+ goto out_unlock;
+
+ /* one page should be large enough for STAT data */
+ pages = ceph_alloc_page_vector(1, GFP_KERNEL);
+ if (IS_ERR(pages)) {
+ err = PTR_ERR(pages);
+ goto out_unlock;
+ }
+
+ osd_req_op_raw_data_in_pages(rd_req, 0, pages, PAGE_SIZE,
+ 0, false, true);
+ err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false);
+
+ wr_req->r_mtime = ci->vfs_inode.i_mtime;
+ err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false);
+
+ if (!err)
+ err = ceph_osdc_wait_request(&fsc->client->osdc, rd_req);
+ if (!err2)
+ err2 = ceph_osdc_wait_request(&fsc->client->osdc, wr_req);
+
+ if (err >= 0 || err == -ENOENT)
+ have |= POOL_READ;
+ else if (err != -EPERM)
+ goto out_unlock;
+
+ if (err2 == 0 || err2 == -EEXIST)
+ have |= POOL_WRITE;
+ else if (err2 != -EPERM) {
+ err = err2;
+ goto out_unlock;
+ }
+
+ pool_ns_len = pool_ns ? pool_ns->len : 0;
+ perm = kmalloc(sizeof(*perm) + pool_ns_len + 1, GFP_NOFS);
+ if (!perm) {
+ err = -ENOMEM;
+ goto out_unlock;
+ }
+
+ perm->pool = pool;
+ perm->perm = have;
+ perm->pool_ns_len = pool_ns_len;
+ if (pool_ns_len > 0)
+ memcpy(perm->pool_ns, pool_ns->str, pool_ns_len);
+ perm->pool_ns[pool_ns_len] = 0;
+
+ rb_link_node(&perm->node, parent, p);
+ rb_insert_color(&perm->node, &mdsc->pool_perm_tree);
+ err = 0;
+out_unlock:
+ up_write(&mdsc->pool_perm_rwsem);
+
+ ceph_osdc_put_request(rd_req);
+ ceph_osdc_put_request(wr_req);
+out:
+ if (!err)
+ err = have;
+ if (pool_ns)
+ dout("__ceph_pool_perm_get pool %lld ns %.*s result = %d\n",
+ pool, (int)pool_ns->len, pool_ns->str, err);
+ else
+ dout("__ceph_pool_perm_get pool %lld result = %d\n", pool, err);
+ return err;
+}
+
+int ceph_pool_perm_check(struct ceph_inode_info *ci, int need)
+{
+ s64 pool;
+ struct ceph_string *pool_ns;
+ int ret, flags;
+
+ if (ci->i_vino.snap != CEPH_NOSNAP) {
+ /*
+ * Pool permission check needs to write to the first object.
+ * But for snapshot, head of the first object may have alread
+ * been deleted. Skip check to avoid creating orphan object.
+ */
+ return 0;
+ }
+
+ if (ceph_test_mount_opt(ceph_inode_to_client(&ci->vfs_inode),
+ NOPOOLPERM))
+ return 0;
+
+ spin_lock(&ci->i_ceph_lock);
+ flags = ci->i_ceph_flags;
+ pool = ci->i_layout.pool_id;
+ spin_unlock(&ci->i_ceph_lock);
+check:
+ if (flags & CEPH_I_POOL_PERM) {
+ if ((need & CEPH_CAP_FILE_RD) && !(flags & CEPH_I_POOL_RD)) {
+ dout("ceph_pool_perm_check pool %lld no read perm\n",
+ pool);
+ return -EPERM;
+ }
+ if ((need & CEPH_CAP_FILE_WR) && !(flags & CEPH_I_POOL_WR)) {
+ dout("ceph_pool_perm_check pool %lld no write perm\n",
+ pool);
+ return -EPERM;
+ }
+ return 0;
+ }
+
+ pool_ns = ceph_try_get_string(ci->i_layout.pool_ns);
+ ret = __ceph_pool_perm_get(ci, pool, pool_ns);
+ ceph_put_string(pool_ns);
+ if (ret < 0)
+ return ret;
+
+ flags = CEPH_I_POOL_PERM;
+ if (ret & POOL_READ)
+ flags |= CEPH_I_POOL_RD;
+ if (ret & POOL_WRITE)
+ flags |= CEPH_I_POOL_WR;
+
+ spin_lock(&ci->i_ceph_lock);
+ if (pool == ci->i_layout.pool_id &&
+ pool_ns == rcu_dereference_raw(ci->i_layout.pool_ns)) {
+ ci->i_ceph_flags |= flags;
+ } else {
+ pool = ci->i_layout.pool_id;
+ flags = ci->i_ceph_flags;
+ }
+ spin_unlock(&ci->i_ceph_lock);
+ goto check;
+}
+
+void ceph_pool_perm_destroy(struct ceph_mds_client *mdsc)
+{
+ struct ceph_pool_perm *perm;
+ struct rb_node *n;
+
+ while (!RB_EMPTY_ROOT(&mdsc->pool_perm_tree)) {
+ n = rb_first(&mdsc->pool_perm_tree);
+ perm = rb_entry(n, struct ceph_pool_perm, node);
+ rb_erase(n, &mdsc->pool_perm_tree);
+ kfree(perm);
+ }
+}
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
new file mode 100644
index 000000000..1bf3502bd
--- /dev/null
+++ b/fs/ceph/cache.c
@@ -0,0 +1,364 @@
+/*
+ * Ceph cache definitions.
+ *
+ * Copyright (C) 2013 by Adfin Solutions, Inc. All Rights Reserved.
+ * Written by Milosz Tanski (milosz@adfin.com)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to:
+ * Free Software Foundation
+ * 51 Franklin Street, Fifth Floor
+ * Boston, MA 02111-1301 USA
+ *
+ */
+
+#include "super.h"
+#include "cache.h"
+
+struct ceph_aux_inode {
+ u64 version;
+ u64 mtime_sec;
+ u64 mtime_nsec;
+};
+
+struct fscache_netfs ceph_cache_netfs = {
+ .name = "ceph",
+ .version = 0,
+};
+
+static DEFINE_MUTEX(ceph_fscache_lock);
+static LIST_HEAD(ceph_fscache_list);
+
+struct ceph_fscache_entry {
+ struct list_head list;
+ struct fscache_cookie *fscache;
+ size_t uniq_len;
+ /* The following members must be last */
+ struct ceph_fsid fsid;
+ char uniquifier[0];
+};
+
+static const struct fscache_cookie_def ceph_fscache_fsid_object_def = {
+ .name = "CEPH.fsid",
+ .type = FSCACHE_COOKIE_TYPE_INDEX,
+};
+
+int __init ceph_fscache_register(void)
+{
+ return fscache_register_netfs(&ceph_cache_netfs);
+}
+
+void ceph_fscache_unregister(void)
+{
+ fscache_unregister_netfs(&ceph_cache_netfs);
+}
+
+int ceph_fscache_register_fs(struct ceph_fs_client* fsc)
+{
+ const struct ceph_fsid *fsid = &fsc->client->fsid;
+ const char *fscache_uniq = fsc->mount_options->fscache_uniq;
+ size_t uniq_len = fscache_uniq ? strlen(fscache_uniq) : 0;
+ struct ceph_fscache_entry *ent;
+ int err = 0;
+
+ mutex_lock(&ceph_fscache_lock);
+ list_for_each_entry(ent, &ceph_fscache_list, list) {
+ if (memcmp(&ent->fsid, fsid, sizeof(*fsid)))
+ continue;
+ if (ent->uniq_len != uniq_len)
+ continue;
+ if (uniq_len && memcmp(ent->uniquifier, fscache_uniq, uniq_len))
+ continue;
+
+ pr_err("fscache cookie already registered for fsid %pU\n", fsid);
+ pr_err(" use fsc=%%s mount option to specify a uniquifier\n");
+ err = -EBUSY;
+ goto out_unlock;
+ }
+
+ ent = kzalloc(sizeof(*ent) + uniq_len, GFP_KERNEL);
+ if (!ent) {
+ err = -ENOMEM;
+ goto out_unlock;
+ }
+
+ memcpy(&ent->fsid, fsid, sizeof(*fsid));
+ if (uniq_len > 0) {
+ memcpy(&ent->uniquifier, fscache_uniq, uniq_len);
+ ent->uniq_len = uniq_len;
+ }
+
+ fsc->fscache = fscache_acquire_cookie(ceph_cache_netfs.primary_index,
+ &ceph_fscache_fsid_object_def,
+ &ent->fsid, sizeof(ent->fsid) + uniq_len,
+ NULL, 0,
+ fsc, 0, true);
+
+ if (fsc->fscache) {
+ ent->fscache = fsc->fscache;
+ list_add_tail(&ent->list, &ceph_fscache_list);
+ } else {
+ kfree(ent);
+ pr_err("unable to register fscache cookie for fsid %pU\n",
+ fsid);
+ /* all other fs ignore this error */
+ }
+out_unlock:
+ mutex_unlock(&ceph_fscache_lock);
+ return err;
+}
+
+static enum fscache_checkaux ceph_fscache_inode_check_aux(
+ void *cookie_netfs_data, const void *data, uint16_t dlen,
+ loff_t object_size)
+{
+ struct ceph_aux_inode aux;
+ struct ceph_inode_info* ci = cookie_netfs_data;
+ struct inode* inode = &ci->vfs_inode;
+
+ if (dlen != sizeof(aux) ||
+ i_size_read(inode) != object_size)
+ return FSCACHE_CHECKAUX_OBSOLETE;
+
+ memset(&aux, 0, sizeof(aux));
+ aux.version = ci->i_version;
+ aux.mtime_sec = inode->i_mtime.tv_sec;
+ aux.mtime_nsec = inode->i_mtime.tv_nsec;
+
+ if (memcmp(data, &aux, sizeof(aux)) != 0)
+ return FSCACHE_CHECKAUX_OBSOLETE;
+
+ dout("ceph inode 0x%p cached okay\n", ci);
+ return FSCACHE_CHECKAUX_OKAY;
+}
+
+static const struct fscache_cookie_def ceph_fscache_inode_object_def = {
+ .name = "CEPH.inode",
+ .type = FSCACHE_COOKIE_TYPE_DATAFILE,
+ .check_aux = ceph_fscache_inode_check_aux,
+};
+
+void ceph_fscache_register_inode_cookie(struct inode *inode)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_aux_inode aux;
+
+ /* No caching for filesystem */
+ if (!fsc->fscache)
+ return;
+
+ /* Only cache for regular files that are read only */
+ if (!S_ISREG(inode->i_mode))
+ return;
+
+ inode_lock_nested(inode, I_MUTEX_CHILD);
+ if (!ci->fscache) {
+ memset(&aux, 0, sizeof(aux));
+ aux.version = ci->i_version;
+ aux.mtime_sec = inode->i_mtime.tv_sec;
+ aux.mtime_nsec = inode->i_mtime.tv_nsec;
+ ci->fscache = fscache_acquire_cookie(fsc->fscache,
+ &ceph_fscache_inode_object_def,
+ &ci->i_vino, sizeof(ci->i_vino),
+ &aux, sizeof(aux),
+ ci, i_size_read(inode), false);
+ }
+ inode_unlock(inode);
+}
+
+void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci)
+{
+ struct fscache_cookie* cookie;
+
+ if ((cookie = ci->fscache) == NULL)
+ return;
+
+ ci->fscache = NULL;
+
+ fscache_uncache_all_inode_pages(cookie, &ci->vfs_inode);
+ fscache_relinquish_cookie(cookie, &ci->i_vino, false);
+}
+
+static bool ceph_fscache_can_enable(void *data)
+{
+ struct inode *inode = data;
+ return !inode_is_open_for_write(inode);
+}
+
+void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+
+ if (!fscache_cookie_valid(ci->fscache))
+ return;
+
+ if (inode_is_open_for_write(inode)) {
+ dout("fscache_file_set_cookie %p %p disabling cache\n",
+ inode, filp);
+ fscache_disable_cookie(ci->fscache, &ci->i_vino, false);
+ fscache_uncache_all_inode_pages(ci->fscache, inode);
+ } else {
+ fscache_enable_cookie(ci->fscache, &ci->i_vino, i_size_read(inode),
+ ceph_fscache_can_enable, inode);
+ if (fscache_cookie_enabled(ci->fscache)) {
+ dout("fscache_file_set_cookie %p %p enabling cache\n",
+ inode, filp);
+ }
+ }
+}
+
+static void ceph_readpage_from_fscache_complete(struct page *page, void *data, int error)
+{
+ if (!error)
+ SetPageUptodate(page);
+
+ unlock_page(page);
+}
+
+static inline bool cache_valid(struct ceph_inode_info *ci)
+{
+ return ci->i_fscache_gen == ci->i_rdcache_gen;
+}
+
+
+/* Atempt to read from the fscache,
+ *
+ * This function is called from the readpage_nounlock context. DO NOT attempt to
+ * unlock the page here (or in the callback).
+ */
+int ceph_readpage_from_fscache(struct inode *inode, struct page *page)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int ret;
+
+ if (!cache_valid(ci))
+ return -ENOBUFS;
+
+ ret = fscache_read_or_alloc_page(ci->fscache, page,
+ ceph_readpage_from_fscache_complete, NULL,
+ GFP_KERNEL);
+
+ switch (ret) {
+ case 0: /* Page found */
+ dout("page read submitted\n");
+ return 0;
+ case -ENOBUFS: /* Pages were not found, and can't be */
+ case -ENODATA: /* Pages were not found */
+ dout("page/inode not in cache\n");
+ return ret;
+ default:
+ dout("%s: unknown error ret = %i\n", __func__, ret);
+ return ret;
+ }
+}
+
+int ceph_readpages_from_fscache(struct inode *inode,
+ struct address_space *mapping,
+ struct list_head *pages,
+ unsigned *nr_pages)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int ret;
+
+ if (!cache_valid(ci))
+ return -ENOBUFS;
+
+ ret = fscache_read_or_alloc_pages(ci->fscache, mapping, pages, nr_pages,
+ ceph_readpage_from_fscache_complete,
+ NULL, mapping_gfp_mask(mapping));
+
+ switch (ret) {
+ case 0: /* All pages found */
+ dout("all-page read submitted\n");
+ return 0;
+ case -ENOBUFS: /* Some pages were not found, and can't be */
+ case -ENODATA: /* some pages were not found */
+ dout("page/inode not in cache\n");
+ return ret;
+ default:
+ dout("%s: unknown error ret = %i\n", __func__, ret);
+ return ret;
+ }
+}
+
+void ceph_readpage_to_fscache(struct inode *inode, struct page *page)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int ret;
+
+ if (!PageFsCache(page))
+ return;
+
+ if (!cache_valid(ci))
+ return;
+
+ ret = fscache_write_page(ci->fscache, page, i_size_read(inode),
+ GFP_KERNEL);
+ if (ret)
+ fscache_uncache_page(ci->fscache, page);
+}
+
+void ceph_invalidate_fscache_page(struct inode* inode, struct page *page)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+
+ if (!PageFsCache(page))
+ return;
+
+ fscache_wait_on_page_write(ci->fscache, page);
+ fscache_uncache_page(ci->fscache, page);
+}
+
+void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc)
+{
+ if (fscache_cookie_valid(fsc->fscache)) {
+ struct ceph_fscache_entry *ent;
+ bool found = false;
+
+ mutex_lock(&ceph_fscache_lock);
+ list_for_each_entry(ent, &ceph_fscache_list, list) {
+ if (ent->fscache == fsc->fscache) {
+ list_del(&ent->list);
+ kfree(ent);
+ found = true;
+ break;
+ }
+ }
+ WARN_ON_ONCE(!found);
+ mutex_unlock(&ceph_fscache_lock);
+
+ __fscache_relinquish_cookie(fsc->fscache, NULL, false);
+ }
+ fsc->fscache = NULL;
+}
+
+/*
+ * caller should hold CEPH_CAP_FILE_{RD,CACHE}
+ */
+void ceph_fscache_revalidate_cookie(struct ceph_inode_info *ci)
+{
+ if (cache_valid(ci))
+ return;
+
+ /* resue i_truncate_mutex. There should be no pending
+ * truncate while the caller holds CEPH_CAP_FILE_RD */
+ mutex_lock(&ci->i_truncate_mutex);
+ if (!cache_valid(ci)) {
+ if (fscache_check_consistency(ci->fscache, &ci->i_vino))
+ fscache_invalidate(ci->fscache);
+ spin_lock(&ci->i_ceph_lock);
+ ci->i_fscache_gen = ci->i_rdcache_gen;
+ spin_unlock(&ci->i_ceph_lock);
+ }
+ mutex_unlock(&ci->i_truncate_mutex);
+}
diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h
new file mode 100644
index 000000000..7e72c7594
--- /dev/null
+++ b/fs/ceph/cache.h
@@ -0,0 +1,190 @@
+/*
+ * Ceph cache definitions.
+ *
+ * Copyright (C) 2013 by Adfin Solutions, Inc. All Rights Reserved.
+ * Written by Milosz Tanski (milosz@adfin.com)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to:
+ * Free Software Foundation
+ * 51 Franklin Street, Fifth Floor
+ * Boston, MA 02111-1301 USA
+ *
+ */
+
+#ifndef _CEPH_CACHE_H
+#define _CEPH_CACHE_H
+
+#ifdef CONFIG_CEPH_FSCACHE
+
+extern struct fscache_netfs ceph_cache_netfs;
+
+int ceph_fscache_register(void);
+void ceph_fscache_unregister(void);
+
+int ceph_fscache_register_fs(struct ceph_fs_client* fsc);
+void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc);
+
+void ceph_fscache_register_inode_cookie(struct inode *inode);
+void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci);
+void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp);
+void ceph_fscache_revalidate_cookie(struct ceph_inode_info *ci);
+
+int ceph_readpage_from_fscache(struct inode *inode, struct page *page);
+int ceph_readpages_from_fscache(struct inode *inode,
+ struct address_space *mapping,
+ struct list_head *pages,
+ unsigned *nr_pages);
+void ceph_readpage_to_fscache(struct inode *inode, struct page *page);
+void ceph_invalidate_fscache_page(struct inode* inode, struct page *page);
+
+static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci)
+{
+ ci->fscache = NULL;
+ ci->i_fscache_gen = 0;
+}
+
+static inline void ceph_fscache_invalidate(struct inode *inode)
+{
+ fscache_invalidate(ceph_inode(inode)->fscache);
+}
+
+static inline void ceph_fscache_uncache_page(struct inode *inode,
+ struct page *page)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ return fscache_uncache_page(ci->fscache, page);
+}
+
+static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp)
+{
+ struct inode* inode = page->mapping->host;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ return fscache_maybe_release_page(ci->fscache, page, gfp);
+}
+
+static inline void ceph_fscache_readpage_cancel(struct inode *inode,
+ struct page *page)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ if (fscache_cookie_valid(ci->fscache) && PageFsCache(page))
+ __fscache_uncache_page(ci->fscache, page);
+}
+
+static inline void ceph_fscache_readpages_cancel(struct inode *inode,
+ struct list_head *pages)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ return fscache_readpages_cancel(ci->fscache, pages);
+}
+
+static inline void ceph_disable_fscache_readpage(struct ceph_inode_info *ci)
+{
+ ci->i_fscache_gen = ci->i_rdcache_gen - 1;
+}
+
+#else
+
+static inline int ceph_fscache_register(void)
+{
+ return 0;
+}
+
+static inline void ceph_fscache_unregister(void)
+{
+}
+
+static inline int ceph_fscache_register_fs(struct ceph_fs_client* fsc)
+{
+ return 0;
+}
+
+static inline void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc)
+{
+}
+
+static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci)
+{
+}
+
+static inline void ceph_fscache_register_inode_cookie(struct inode *inode)
+{
+}
+
+static inline void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci)
+{
+}
+
+static inline void ceph_fscache_file_set_cookie(struct inode *inode,
+ struct file *filp)
+{
+}
+
+static inline void ceph_fscache_revalidate_cookie(struct ceph_inode_info *ci)
+{
+}
+
+static inline void ceph_fscache_uncache_page(struct inode *inode,
+ struct page *pages)
+{
+}
+
+static inline int ceph_readpage_from_fscache(struct inode* inode,
+ struct page *page)
+{
+ return -ENOBUFS;
+}
+
+static inline int ceph_readpages_from_fscache(struct inode *inode,
+ struct address_space *mapping,
+ struct list_head *pages,
+ unsigned *nr_pages)
+{
+ return -ENOBUFS;
+}
+
+static inline void ceph_readpage_to_fscache(struct inode *inode,
+ struct page *page)
+{
+}
+
+static inline void ceph_fscache_invalidate(struct inode *inode)
+{
+}
+
+static inline void ceph_invalidate_fscache_page(struct inode *inode,
+ struct page *page)
+{
+}
+
+static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp)
+{
+ return 1;
+}
+
+static inline void ceph_fscache_readpage_cancel(struct inode *inode,
+ struct page *page)
+{
+}
+
+static inline void ceph_fscache_readpages_cancel(struct inode *inode,
+ struct list_head *pages)
+{
+}
+
+static inline void ceph_disable_fscache_readpage(struct ceph_inode_info *ci)
+{
+}
+
+#endif
+
+#endif
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
new file mode 100644
index 000000000..6443ba1e6
--- /dev/null
+++ b/fs/ceph/caps.c
@@ -0,0 +1,4240 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/sched/signal.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/wait.h>
+#include <linux/writeback.h>
+
+#include "super.h"
+#include "mds_client.h"
+#include "cache.h"
+#include <linux/ceph/decode.h>
+#include <linux/ceph/messenger.h>
+
+/*
+ * Capability management
+ *
+ * The Ceph metadata servers control client access to inode metadata
+ * and file data by issuing capabilities, granting clients permission
+ * to read and/or write both inode field and file data to OSDs
+ * (storage nodes). Each capability consists of a set of bits
+ * indicating which operations are allowed.
+ *
+ * If the client holds a *_SHARED cap, the client has a coherent value
+ * that can be safely read from the cached inode.
+ *
+ * In the case of a *_EXCL (exclusive) or FILE_WR capabilities, the
+ * client is allowed to change inode attributes (e.g., file size,
+ * mtime), note its dirty state in the ceph_cap, and asynchronously
+ * flush that metadata change to the MDS.
+ *
+ * In the event of a conflicting operation (perhaps by another
+ * client), the MDS will revoke the conflicting client capabilities.
+ *
+ * In order for a client to cache an inode, it must hold a capability
+ * with at least one MDS server. When inodes are released, release
+ * notifications are batched and periodically sent en masse to the MDS
+ * cluster to release server state.
+ */
+
+static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc);
+static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session,
+ struct ceph_inode_info *ci,
+ u64 oldest_flush_tid);
+
+/*
+ * Generate readable cap strings for debugging output.
+ */
+#define MAX_CAP_STR 20
+static char cap_str[MAX_CAP_STR][40];
+static DEFINE_SPINLOCK(cap_str_lock);
+static int last_cap_str;
+
+static char *gcap_string(char *s, int c)
+{
+ if (c & CEPH_CAP_GSHARED)
+ *s++ = 's';
+ if (c & CEPH_CAP_GEXCL)
+ *s++ = 'x';
+ if (c & CEPH_CAP_GCACHE)
+ *s++ = 'c';
+ if (c & CEPH_CAP_GRD)
+ *s++ = 'r';
+ if (c & CEPH_CAP_GWR)
+ *s++ = 'w';
+ if (c & CEPH_CAP_GBUFFER)
+ *s++ = 'b';
+ if (c & CEPH_CAP_GWREXTEND)
+ *s++ = 'a';
+ if (c & CEPH_CAP_GLAZYIO)
+ *s++ = 'l';
+ return s;
+}
+
+const char *ceph_cap_string(int caps)
+{
+ int i;
+ char *s;
+ int c;
+
+ spin_lock(&cap_str_lock);
+ i = last_cap_str++;
+ if (last_cap_str == MAX_CAP_STR)
+ last_cap_str = 0;
+ spin_unlock(&cap_str_lock);
+
+ s = cap_str[i];
+
+ if (caps & CEPH_CAP_PIN)
+ *s++ = 'p';
+
+ c = (caps >> CEPH_CAP_SAUTH) & 3;
+ if (c) {
+ *s++ = 'A';
+ s = gcap_string(s, c);
+ }
+
+ c = (caps >> CEPH_CAP_SLINK) & 3;
+ if (c) {
+ *s++ = 'L';
+ s = gcap_string(s, c);
+ }
+
+ c = (caps >> CEPH_CAP_SXATTR) & 3;
+ if (c) {
+ *s++ = 'X';
+ s = gcap_string(s, c);
+ }
+
+ c = caps >> CEPH_CAP_SFILE;
+ if (c) {
+ *s++ = 'F';
+ s = gcap_string(s, c);
+ }
+
+ if (s == cap_str[i])
+ *s++ = '-';
+ *s = 0;
+ return cap_str[i];
+}
+
+void ceph_caps_init(struct ceph_mds_client *mdsc)
+{
+ INIT_LIST_HEAD(&mdsc->caps_list);
+ spin_lock_init(&mdsc->caps_list_lock);
+}
+
+void ceph_caps_finalize(struct ceph_mds_client *mdsc)
+{
+ struct ceph_cap *cap;
+
+ spin_lock(&mdsc->caps_list_lock);
+ while (!list_empty(&mdsc->caps_list)) {
+ cap = list_first_entry(&mdsc->caps_list,
+ struct ceph_cap, caps_item);
+ list_del(&cap->caps_item);
+ kmem_cache_free(ceph_cap_cachep, cap);
+ }
+ mdsc->caps_total_count = 0;
+ mdsc->caps_avail_count = 0;
+ mdsc->caps_use_count = 0;
+ mdsc->caps_reserve_count = 0;
+ mdsc->caps_min_count = 0;
+ spin_unlock(&mdsc->caps_list_lock);
+}
+
+void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta)
+{
+ spin_lock(&mdsc->caps_list_lock);
+ mdsc->caps_min_count += delta;
+ BUG_ON(mdsc->caps_min_count < 0);
+ spin_unlock(&mdsc->caps_list_lock);
+}
+
+static void __ceph_unreserve_caps(struct ceph_mds_client *mdsc, int nr_caps)
+{
+ struct ceph_cap *cap;
+ int i;
+
+ if (nr_caps) {
+ BUG_ON(mdsc->caps_reserve_count < nr_caps);
+ mdsc->caps_reserve_count -= nr_caps;
+ if (mdsc->caps_avail_count >=
+ mdsc->caps_reserve_count + mdsc->caps_min_count) {
+ mdsc->caps_total_count -= nr_caps;
+ for (i = 0; i < nr_caps; i++) {
+ cap = list_first_entry(&mdsc->caps_list,
+ struct ceph_cap, caps_item);
+ list_del(&cap->caps_item);
+ kmem_cache_free(ceph_cap_cachep, cap);
+ }
+ } else {
+ mdsc->caps_avail_count += nr_caps;
+ }
+
+ dout("%s: caps %d = %d used + %d resv + %d avail\n",
+ __func__,
+ mdsc->caps_total_count, mdsc->caps_use_count,
+ mdsc->caps_reserve_count, mdsc->caps_avail_count);
+ BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
+ mdsc->caps_reserve_count +
+ mdsc->caps_avail_count);
+ }
+}
+
+/*
+ * Called under mdsc->mutex.
+ */
+int ceph_reserve_caps(struct ceph_mds_client *mdsc,
+ struct ceph_cap_reservation *ctx, int need)
+{
+ int i, j;
+ struct ceph_cap *cap;
+ int have;
+ int alloc = 0;
+ int max_caps;
+ int err = 0;
+ bool trimmed = false;
+ struct ceph_mds_session *s;
+ LIST_HEAD(newcaps);
+
+ dout("reserve caps ctx=%p need=%d\n", ctx, need);
+
+ /* first reserve any caps that are already allocated */
+ spin_lock(&mdsc->caps_list_lock);
+ if (mdsc->caps_avail_count >= need)
+ have = need;
+ else
+ have = mdsc->caps_avail_count;
+ mdsc->caps_avail_count -= have;
+ mdsc->caps_reserve_count += have;
+ BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
+ mdsc->caps_reserve_count +
+ mdsc->caps_avail_count);
+ spin_unlock(&mdsc->caps_list_lock);
+
+ for (i = have; i < need; ) {
+ cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
+ if (cap) {
+ list_add(&cap->caps_item, &newcaps);
+ alloc++;
+ i++;
+ continue;
+ }
+
+ if (!trimmed) {
+ for (j = 0; j < mdsc->max_sessions; j++) {
+ s = __ceph_lookup_mds_session(mdsc, j);
+ if (!s)
+ continue;
+ mutex_unlock(&mdsc->mutex);
+
+ mutex_lock(&s->s_mutex);
+ max_caps = s->s_nr_caps - (need - i);
+ ceph_trim_caps(mdsc, s, max_caps);
+ mutex_unlock(&s->s_mutex);
+
+ ceph_put_mds_session(s);
+ mutex_lock(&mdsc->mutex);
+ }
+ trimmed = true;
+
+ spin_lock(&mdsc->caps_list_lock);
+ if (mdsc->caps_avail_count) {
+ int more_have;
+ if (mdsc->caps_avail_count >= need - i)
+ more_have = need - i;
+ else
+ more_have = mdsc->caps_avail_count;
+
+ i += more_have;
+ have += more_have;
+ mdsc->caps_avail_count -= more_have;
+ mdsc->caps_reserve_count += more_have;
+
+ }
+ spin_unlock(&mdsc->caps_list_lock);
+
+ continue;
+ }
+
+ pr_warn("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
+ ctx, need, have + alloc);
+ err = -ENOMEM;
+ break;
+ }
+
+ if (!err) {
+ BUG_ON(have + alloc != need);
+ ctx->count = need;
+ }
+
+ spin_lock(&mdsc->caps_list_lock);
+ mdsc->caps_total_count += alloc;
+ mdsc->caps_reserve_count += alloc;
+ list_splice(&newcaps, &mdsc->caps_list);
+
+ BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
+ mdsc->caps_reserve_count +
+ mdsc->caps_avail_count);
+
+ if (err)
+ __ceph_unreserve_caps(mdsc, have + alloc);
+
+ spin_unlock(&mdsc->caps_list_lock);
+
+ dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n",
+ ctx, mdsc->caps_total_count, mdsc->caps_use_count,
+ mdsc->caps_reserve_count, mdsc->caps_avail_count);
+ return err;
+}
+
+void ceph_unreserve_caps(struct ceph_mds_client *mdsc,
+ struct ceph_cap_reservation *ctx)
+{
+ dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count);
+ spin_lock(&mdsc->caps_list_lock);
+ __ceph_unreserve_caps(mdsc, ctx->count);
+ ctx->count = 0;
+ spin_unlock(&mdsc->caps_list_lock);
+}
+
+struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc,
+ struct ceph_cap_reservation *ctx)
+{
+ struct ceph_cap *cap = NULL;
+
+ /* temporary, until we do something about cap import/export */
+ if (!ctx) {
+ cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
+ if (cap) {
+ spin_lock(&mdsc->caps_list_lock);
+ mdsc->caps_use_count++;
+ mdsc->caps_total_count++;
+ spin_unlock(&mdsc->caps_list_lock);
+ } else {
+ spin_lock(&mdsc->caps_list_lock);
+ if (mdsc->caps_avail_count) {
+ BUG_ON(list_empty(&mdsc->caps_list));
+
+ mdsc->caps_avail_count--;
+ mdsc->caps_use_count++;
+ cap = list_first_entry(&mdsc->caps_list,
+ struct ceph_cap, caps_item);
+ list_del(&cap->caps_item);
+
+ BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
+ mdsc->caps_reserve_count + mdsc->caps_avail_count);
+ }
+ spin_unlock(&mdsc->caps_list_lock);
+ }
+
+ return cap;
+ }
+
+ spin_lock(&mdsc->caps_list_lock);
+ dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n",
+ ctx, ctx->count, mdsc->caps_total_count, mdsc->caps_use_count,
+ mdsc->caps_reserve_count, mdsc->caps_avail_count);
+ BUG_ON(!ctx->count);
+ BUG_ON(ctx->count > mdsc->caps_reserve_count);
+ BUG_ON(list_empty(&mdsc->caps_list));
+
+ ctx->count--;
+ mdsc->caps_reserve_count--;
+ mdsc->caps_use_count++;
+
+ cap = list_first_entry(&mdsc->caps_list, struct ceph_cap, caps_item);
+ list_del(&cap->caps_item);
+
+ BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
+ mdsc->caps_reserve_count + mdsc->caps_avail_count);
+ spin_unlock(&mdsc->caps_list_lock);
+ return cap;
+}
+
+void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap)
+{
+ spin_lock(&mdsc->caps_list_lock);
+ dout("put_cap %p %d = %d used + %d resv + %d avail\n",
+ cap, mdsc->caps_total_count, mdsc->caps_use_count,
+ mdsc->caps_reserve_count, mdsc->caps_avail_count);
+ mdsc->caps_use_count--;
+ /*
+ * Keep some preallocated caps around (ceph_min_count), to
+ * avoid lots of free/alloc churn.
+ */
+ if (mdsc->caps_avail_count >= mdsc->caps_reserve_count +
+ mdsc->caps_min_count) {
+ mdsc->caps_total_count--;
+ kmem_cache_free(ceph_cap_cachep, cap);
+ } else {
+ mdsc->caps_avail_count++;
+ list_add(&cap->caps_item, &mdsc->caps_list);
+ }
+
+ BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
+ mdsc->caps_reserve_count + mdsc->caps_avail_count);
+ spin_unlock(&mdsc->caps_list_lock);
+}
+
+void ceph_reservation_status(struct ceph_fs_client *fsc,
+ int *total, int *avail, int *used, int *reserved,
+ int *min)
+{
+ struct ceph_mds_client *mdsc = fsc->mdsc;
+
+ spin_lock(&mdsc->caps_list_lock);
+
+ if (total)
+ *total = mdsc->caps_total_count;
+ if (avail)
+ *avail = mdsc->caps_avail_count;
+ if (used)
+ *used = mdsc->caps_use_count;
+ if (reserved)
+ *reserved = mdsc->caps_reserve_count;
+ if (min)
+ *min = mdsc->caps_min_count;
+
+ spin_unlock(&mdsc->caps_list_lock);
+}
+
+/*
+ * Find ceph_cap for given mds, if any.
+ *
+ * Called with i_ceph_lock held.
+ */
+static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds)
+{
+ struct ceph_cap *cap;
+ struct rb_node *n = ci->i_caps.rb_node;
+
+ while (n) {
+ cap = rb_entry(n, struct ceph_cap, ci_node);
+ if (mds < cap->mds)
+ n = n->rb_left;
+ else if (mds > cap->mds)
+ n = n->rb_right;
+ else
+ return cap;
+ }
+ return NULL;
+}
+
+struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds)
+{
+ struct ceph_cap *cap;
+
+ spin_lock(&ci->i_ceph_lock);
+ cap = __get_cap_for_mds(ci, mds);
+ spin_unlock(&ci->i_ceph_lock);
+ return cap;
+}
+
+/*
+ * Return id of any MDS with a cap, preferably FILE_WR|BUFFER|EXCL, else -1.
+ */
+static int __ceph_get_cap_mds(struct ceph_inode_info *ci)
+{
+ struct ceph_cap *cap;
+ int mds = -1;
+ struct rb_node *p;
+
+ /* prefer mds with WR|BUFFER|EXCL caps */
+ for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+ cap = rb_entry(p, struct ceph_cap, ci_node);
+ mds = cap->mds;
+ if (cap->issued & (CEPH_CAP_FILE_WR |
+ CEPH_CAP_FILE_BUFFER |
+ CEPH_CAP_FILE_EXCL))
+ break;
+ }
+ return mds;
+}
+
+int ceph_get_cap_mds(struct inode *inode)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int mds;
+ spin_lock(&ci->i_ceph_lock);
+ mds = __ceph_get_cap_mds(ceph_inode(inode));
+ spin_unlock(&ci->i_ceph_lock);
+ return mds;
+}
+
+/*
+ * Called under i_ceph_lock.
+ */
+static void __insert_cap_node(struct ceph_inode_info *ci,
+ struct ceph_cap *new)
+{
+ struct rb_node **p = &ci->i_caps.rb_node;
+ struct rb_node *parent = NULL;
+ struct ceph_cap *cap = NULL;
+
+ while (*p) {
+ parent = *p;
+ cap = rb_entry(parent, struct ceph_cap, ci_node);
+ if (new->mds < cap->mds)
+ p = &(*p)->rb_left;
+ else if (new->mds > cap->mds)
+ p = &(*p)->rb_right;
+ else
+ BUG();
+ }
+
+ rb_link_node(&new->ci_node, parent, p);
+ rb_insert_color(&new->ci_node, &ci->i_caps);
+}
+
+/*
+ * (re)set cap hold timeouts, which control the delayed release
+ * of unused caps back to the MDS. Should be called on cap use.
+ */
+static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
+ struct ceph_inode_info *ci)
+{
+ struct ceph_mount_options *ma = mdsc->fsc->mount_options;
+
+ ci->i_hold_caps_min = round_jiffies(jiffies +
+ ma->caps_wanted_delay_min * HZ);
+ ci->i_hold_caps_max = round_jiffies(jiffies +
+ ma->caps_wanted_delay_max * HZ);
+ dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode,
+ ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies);
+}
+
+/*
+ * (Re)queue cap at the end of the delayed cap release list.
+ *
+ * If I_FLUSH is set, leave the inode at the front of the list.
+ *
+ * Caller holds i_ceph_lock
+ * -> we take mdsc->cap_delay_lock
+ */
+static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
+ struct ceph_inode_info *ci)
+{
+ __cap_set_timeouts(mdsc, ci);
+ dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode,
+ ci->i_ceph_flags, ci->i_hold_caps_max);
+ if (!mdsc->stopping) {
+ spin_lock(&mdsc->cap_delay_lock);
+ if (!list_empty(&ci->i_cap_delay_list)) {
+ if (ci->i_ceph_flags & CEPH_I_FLUSH)
+ goto no_change;
+ list_del_init(&ci->i_cap_delay_list);
+ }
+ list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
+no_change:
+ spin_unlock(&mdsc->cap_delay_lock);
+ }
+}
+
+/*
+ * Queue an inode for immediate writeback. Mark inode with I_FLUSH,
+ * indicating we should send a cap message to flush dirty metadata
+ * asap, and move to the front of the delayed cap list.
+ */
+static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
+ struct ceph_inode_info *ci)
+{
+ dout("__cap_delay_requeue_front %p\n", &ci->vfs_inode);
+ spin_lock(&mdsc->cap_delay_lock);
+ ci->i_ceph_flags |= CEPH_I_FLUSH;
+ if (!list_empty(&ci->i_cap_delay_list))
+ list_del_init(&ci->i_cap_delay_list);
+ list_add(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
+ spin_unlock(&mdsc->cap_delay_lock);
+}
+
+/*
+ * Cancel delayed work on cap.
+ *
+ * Caller must hold i_ceph_lock.
+ */
+static void __cap_delay_cancel(struct ceph_mds_client *mdsc,
+ struct ceph_inode_info *ci)
+{
+ dout("__cap_delay_cancel %p\n", &ci->vfs_inode);
+ if (list_empty(&ci->i_cap_delay_list))
+ return;
+ spin_lock(&mdsc->cap_delay_lock);
+ list_del_init(&ci->i_cap_delay_list);
+ spin_unlock(&mdsc->cap_delay_lock);
+}
+
+/*
+ * Common issue checks for add_cap, handle_cap_grant.
+ */
+static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
+ unsigned issued)
+{
+ unsigned had = __ceph_caps_issued(ci, NULL);
+
+ /*
+ * Each time we receive FILE_CACHE anew, we increment
+ * i_rdcache_gen.
+ */
+ if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
+ (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) {
+ ci->i_rdcache_gen++;
+ }
+
+ /*
+ * If FILE_SHARED is newly issued, mark dir not complete. We don't
+ * know what happened to this directory while we didn't have the cap.
+ * If FILE_SHARED is being revoked, also mark dir not complete. It
+ * stops on-going cached readdir.
+ */
+ if ((issued & CEPH_CAP_FILE_SHARED) != (had & CEPH_CAP_FILE_SHARED)) {
+ if (issued & CEPH_CAP_FILE_SHARED)
+ atomic_inc(&ci->i_shared_gen);
+ if (S_ISDIR(ci->vfs_inode.i_mode)) {
+ dout(" marking %p NOT complete\n", &ci->vfs_inode);
+ __ceph_dir_clear_complete(ci);
+ }
+ }
+}
+
+/*
+ * Add a capability under the given MDS session.
+ *
+ * Caller should hold session snap_rwsem (read) and s_mutex.
+ *
+ * @fmode is the open file mode, if we are opening a file, otherwise
+ * it is < 0. (This is so we can atomically add the cap and add an
+ * open file reference to it.)
+ */
+void ceph_add_cap(struct inode *inode,
+ struct ceph_mds_session *session, u64 cap_id,
+ int fmode, unsigned issued, unsigned wanted,
+ unsigned seq, unsigned mseq, u64 realmino, int flags,
+ struct ceph_cap **new_cap)
+{
+ struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_cap *cap;
+ int mds = session->s_mds;
+ int actual_wanted;
+
+ dout("add_cap %p mds%d cap %llx %s seq %d\n", inode,
+ session->s_mds, cap_id, ceph_cap_string(issued), seq);
+
+ /*
+ * If we are opening the file, include file mode wanted bits
+ * in wanted.
+ */
+ if (fmode >= 0)
+ wanted |= ceph_caps_for_mode(fmode);
+
+ cap = __get_cap_for_mds(ci, mds);
+ if (!cap) {
+ cap = *new_cap;
+ *new_cap = NULL;
+
+ cap->issued = 0;
+ cap->implemented = 0;
+ cap->mds = mds;
+ cap->mds_wanted = 0;
+ cap->mseq = 0;
+
+ cap->ci = ci;
+ __insert_cap_node(ci, cap);
+
+ /* add to session cap list */
+ cap->session = session;
+ spin_lock(&session->s_cap_lock);
+ list_add_tail(&cap->session_caps, &session->s_caps);
+ session->s_nr_caps++;
+ spin_unlock(&session->s_cap_lock);
+ } else {
+ /*
+ * auth mds of the inode changed. we received the cap export
+ * message, but still haven't received the cap import message.
+ * handle_cap_export() updated the new auth MDS' cap.
+ *
+ * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
+ * a message that was send before the cap import message. So
+ * don't remove caps.
+ */
+ if (ceph_seq_cmp(seq, cap->seq) <= 0) {
+ WARN_ON(cap != ci->i_auth_cap);
+ WARN_ON(cap->cap_id != cap_id);
+ seq = cap->seq;
+ mseq = cap->mseq;
+ issued |= cap->issued;
+ flags |= CEPH_CAP_FLAG_AUTH;
+ }
+ }
+
+ if (!ci->i_snap_realm ||
+ ((flags & CEPH_CAP_FLAG_AUTH) &&
+ realmino != (u64)-1 && ci->i_snap_realm->ino != realmino)) {
+ /*
+ * add this inode to the appropriate snap realm
+ */
+ struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc,
+ realmino);
+ if (realm) {
+ struct ceph_snap_realm *oldrealm = ci->i_snap_realm;
+ if (oldrealm) {
+ spin_lock(&oldrealm->inodes_with_caps_lock);
+ list_del_init(&ci->i_snap_realm_item);
+ spin_unlock(&oldrealm->inodes_with_caps_lock);
+ }
+
+ spin_lock(&realm->inodes_with_caps_lock);
+ list_add(&ci->i_snap_realm_item,
+ &realm->inodes_with_caps);
+ ci->i_snap_realm = realm;
+ if (realm->ino == ci->i_vino.ino)
+ realm->inode = inode;
+ spin_unlock(&realm->inodes_with_caps_lock);
+
+ if (oldrealm)
+ ceph_put_snap_realm(mdsc, oldrealm);
+ } else {
+ pr_err("ceph_add_cap: couldn't find snap realm %llx\n",
+ realmino);
+ WARN_ON(!realm);
+ }
+ }
+
+ __check_cap_issue(ci, cap, issued);
+
+ /*
+ * If we are issued caps we don't want, or the mds' wanted
+ * value appears to be off, queue a check so we'll release
+ * later and/or update the mds wanted value.
+ */
+ actual_wanted = __ceph_caps_wanted(ci);
+ if ((wanted & ~actual_wanted) ||
+ (issued & ~actual_wanted & CEPH_CAP_ANY_WR)) {
+ dout(" issued %s, mds wanted %s, actual %s, queueing\n",
+ ceph_cap_string(issued), ceph_cap_string(wanted),
+ ceph_cap_string(actual_wanted));
+ __cap_delay_requeue(mdsc, ci);
+ }
+
+ if (flags & CEPH_CAP_FLAG_AUTH) {
+ if (!ci->i_auth_cap ||
+ ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) {
+ ci->i_auth_cap = cap;
+ cap->mds_wanted = wanted;
+ }
+ } else {
+ WARN_ON(ci->i_auth_cap == cap);
+ }
+
+ dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n",
+ inode, ceph_vinop(inode), cap, ceph_cap_string(issued),
+ ceph_cap_string(issued|cap->issued), seq, mds);
+ cap->cap_id = cap_id;
+ cap->issued = issued;
+ cap->implemented |= issued;
+ if (ceph_seq_cmp(mseq, cap->mseq) > 0)
+ cap->mds_wanted = wanted;
+ else
+ cap->mds_wanted |= wanted;
+ cap->seq = seq;
+ cap->issue_seq = seq;
+ cap->mseq = mseq;
+ cap->cap_gen = session->s_cap_gen;
+
+ if (fmode >= 0)
+ __ceph_get_fmode(ci, fmode);
+}
+
+/*
+ * Return true if cap has not timed out and belongs to the current
+ * generation of the MDS session (i.e. has not gone 'stale' due to
+ * us losing touch with the mds).
+ */
+static int __cap_is_valid(struct ceph_cap *cap)
+{
+ unsigned long ttl;
+ u32 gen;
+
+ spin_lock(&cap->session->s_gen_ttl_lock);
+ gen = cap->session->s_cap_gen;
+ ttl = cap->session->s_cap_ttl;
+ spin_unlock(&cap->session->s_gen_ttl_lock);
+
+ if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) {
+ dout("__cap_is_valid %p cap %p issued %s "
+ "but STALE (gen %u vs %u)\n", &cap->ci->vfs_inode,
+ cap, ceph_cap_string(cap->issued), cap->cap_gen, gen);
+ return 0;
+ }
+
+ return 1;
+}
+
+/*
+ * Return set of valid cap bits issued to us. Note that caps time
+ * out, and may be invalidated in bulk if the client session times out
+ * and session->s_cap_gen is bumped.
+ */
+int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
+{
+ int have = ci->i_snap_caps;
+ struct ceph_cap *cap;
+ struct rb_node *p;
+
+ if (implemented)
+ *implemented = 0;
+ for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+ cap = rb_entry(p, struct ceph_cap, ci_node);
+ if (!__cap_is_valid(cap))
+ continue;
+ dout("__ceph_caps_issued %p cap %p issued %s\n",
+ &ci->vfs_inode, cap, ceph_cap_string(cap->issued));
+ have |= cap->issued;
+ if (implemented)
+ *implemented |= cap->implemented;
+ }
+ /*
+ * exclude caps issued by non-auth MDS, but are been revoking
+ * by the auth MDS. The non-auth MDS should be revoking/exporting
+ * these caps, but the message is delayed.
+ */
+ if (ci->i_auth_cap) {
+ cap = ci->i_auth_cap;
+ have &= ~cap->implemented | cap->issued;
+ }
+ return have;
+}
+
+/*
+ * Get cap bits issued by caps other than @ocap
+ */
+int __ceph_caps_issued_other(struct ceph_inode_info *ci, struct ceph_cap *ocap)
+{
+ int have = ci->i_snap_caps;
+ struct ceph_cap *cap;
+ struct rb_node *p;
+
+ for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+ cap = rb_entry(p, struct ceph_cap, ci_node);
+ if (cap == ocap)
+ continue;
+ if (!__cap_is_valid(cap))
+ continue;
+ have |= cap->issued;
+ }
+ return have;
+}
+
+/*
+ * Move a cap to the end of the LRU (oldest caps at list head, newest
+ * at list tail).
+ */
+static void __touch_cap(struct ceph_cap *cap)
+{
+ struct ceph_mds_session *s = cap->session;
+
+ spin_lock(&s->s_cap_lock);
+ if (!s->s_cap_iterator) {
+ dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap,
+ s->s_mds);
+ list_move_tail(&cap->session_caps, &s->s_caps);
+ } else {
+ dout("__touch_cap %p cap %p mds%d NOP, iterating over caps\n",
+ &cap->ci->vfs_inode, cap, s->s_mds);
+ }
+ spin_unlock(&s->s_cap_lock);
+}
+
+/*
+ * Check if we hold the given mask. If so, move the cap(s) to the
+ * front of their respective LRUs. (This is the preferred way for
+ * callers to check for caps they want.)
+ */
+int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
+{
+ struct ceph_cap *cap;
+ struct rb_node *p;
+ int have = ci->i_snap_caps;
+
+ if ((have & mask) == mask) {
+ dout("__ceph_caps_issued_mask %p snap issued %s"
+ " (mask %s)\n", &ci->vfs_inode,
+ ceph_cap_string(have),
+ ceph_cap_string(mask));
+ return 1;
+ }
+
+ for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+ cap = rb_entry(p, struct ceph_cap, ci_node);
+ if (!__cap_is_valid(cap))
+ continue;
+ if ((cap->issued & mask) == mask) {
+ dout("__ceph_caps_issued_mask %p cap %p issued %s"
+ " (mask %s)\n", &ci->vfs_inode, cap,
+ ceph_cap_string(cap->issued),
+ ceph_cap_string(mask));
+ if (touch)
+ __touch_cap(cap);
+ return 1;
+ }
+
+ /* does a combination of caps satisfy mask? */
+ have |= cap->issued;
+ if ((have & mask) == mask) {
+ dout("__ceph_caps_issued_mask %p combo issued %s"
+ " (mask %s)\n", &ci->vfs_inode,
+ ceph_cap_string(cap->issued),
+ ceph_cap_string(mask));
+ if (touch) {
+ struct rb_node *q;
+
+ /* touch this + preceding caps */
+ __touch_cap(cap);
+ for (q = rb_first(&ci->i_caps); q != p;
+ q = rb_next(q)) {
+ cap = rb_entry(q, struct ceph_cap,
+ ci_node);
+ if (!__cap_is_valid(cap))
+ continue;
+ __touch_cap(cap);
+ }
+ }
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Return true if mask caps are currently being revoked by an MDS.
+ */
+int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
+ struct ceph_cap *ocap, int mask)
+{
+ struct ceph_cap *cap;
+ struct rb_node *p;
+
+ for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+ cap = rb_entry(p, struct ceph_cap, ci_node);
+ if (cap != ocap &&
+ (cap->implemented & ~cap->issued & mask))
+ return 1;
+ }
+ return 0;
+}
+
+int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
+{
+ struct inode *inode = &ci->vfs_inode;
+ int ret;
+
+ spin_lock(&ci->i_ceph_lock);
+ ret = __ceph_caps_revoking_other(ci, NULL, mask);
+ spin_unlock(&ci->i_ceph_lock);
+ dout("ceph_caps_revoking %p %s = %d\n", inode,
+ ceph_cap_string(mask), ret);
+ return ret;
+}
+
+int __ceph_caps_used(struct ceph_inode_info *ci)
+{
+ int used = 0;
+ if (ci->i_pin_ref)
+ used |= CEPH_CAP_PIN;
+ if (ci->i_rd_ref)
+ used |= CEPH_CAP_FILE_RD;
+ if (ci->i_rdcache_ref ||
+ (!S_ISDIR(ci->vfs_inode.i_mode) && /* ignore readdir cache */
+ ci->vfs_inode.i_data.nrpages))
+ used |= CEPH_CAP_FILE_CACHE;
+ if (ci->i_wr_ref)
+ used |= CEPH_CAP_FILE_WR;
+ if (ci->i_wb_ref || ci->i_wrbuffer_ref)
+ used |= CEPH_CAP_FILE_BUFFER;
+ return used;
+}
+
+/*
+ * wanted, by virtue of open file modes
+ */
+int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
+{
+ int i, bits = 0;
+ for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
+ if (ci->i_nr_by_mode[i])
+ bits |= 1 << i;
+ }
+ if (bits == 0)
+ return 0;
+ return ceph_caps_for_mode(bits >> 1);
+}
+
+/*
+ * Return caps we have registered with the MDS(s) as 'wanted'.
+ */
+int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check)
+{
+ struct ceph_cap *cap;
+ struct rb_node *p;
+ int mds_wanted = 0;
+
+ for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+ cap = rb_entry(p, struct ceph_cap, ci_node);
+ if (check && !__cap_is_valid(cap))
+ continue;
+ if (cap == ci->i_auth_cap)
+ mds_wanted |= cap->mds_wanted;
+ else
+ mds_wanted |= (cap->mds_wanted & ~CEPH_CAP_ANY_FILE_WR);
+ }
+ return mds_wanted;
+}
+
+/*
+ * called under i_ceph_lock
+ */
+static int __ceph_is_single_caps(struct ceph_inode_info *ci)
+{
+ return rb_first(&ci->i_caps) == rb_last(&ci->i_caps);
+}
+
+static int __ceph_is_any_caps(struct ceph_inode_info *ci)
+{
+ return !RB_EMPTY_ROOT(&ci->i_caps);
+}
+
+int ceph_is_any_caps(struct inode *inode)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int ret;
+
+ spin_lock(&ci->i_ceph_lock);
+ ret = __ceph_is_any_caps(ci);
+ spin_unlock(&ci->i_ceph_lock);
+
+ return ret;
+}
+
+static void drop_inode_snap_realm(struct ceph_inode_info *ci)
+{
+ struct ceph_snap_realm *realm = ci->i_snap_realm;
+ spin_lock(&realm->inodes_with_caps_lock);
+ list_del_init(&ci->i_snap_realm_item);
+ ci->i_snap_realm_counter++;
+ ci->i_snap_realm = NULL;
+ if (realm->ino == ci->i_vino.ino)
+ realm->inode = NULL;
+ spin_unlock(&realm->inodes_with_caps_lock);
+ ceph_put_snap_realm(ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc,
+ realm);
+}
+
+/*
+ * Remove a cap. Take steps to deal with a racing iterate_session_caps.
+ *
+ * caller should hold i_ceph_lock.
+ * caller will not hold session s_mutex if called from destroy_inode.
+ */
+void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
+{
+ struct ceph_mds_session *session = cap->session;
+ struct ceph_inode_info *ci = cap->ci;
+ struct ceph_mds_client *mdsc;
+ int removed = 0;
+
+ /* 'ci' being NULL means the remove have already occurred */
+ if (!ci) {
+ dout("%s: cap inode is NULL\n", __func__);
+ return;
+ }
+
+ dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
+
+ mdsc = ceph_inode_to_client(&ci->vfs_inode)->mdsc;
+
+ /* remove from inode's cap rbtree, and clear auth cap */
+ rb_erase(&cap->ci_node, &ci->i_caps);
+ if (ci->i_auth_cap == cap)
+ ci->i_auth_cap = NULL;
+
+ /* remove from session list */
+ spin_lock(&session->s_cap_lock);
+ if (session->s_cap_iterator == cap) {
+ /* not yet, we are iterating over this very cap */
+ dout("__ceph_remove_cap delaying %p removal from session %p\n",
+ cap, cap->session);
+ } else {
+ list_del_init(&cap->session_caps);
+ session->s_nr_caps--;
+ cap->session = NULL;
+ removed = 1;
+ }
+ /* protect backpointer with s_cap_lock: see iterate_session_caps */
+ cap->ci = NULL;
+
+ /*
+ * s_cap_reconnect is protected by s_cap_lock. no one changes
+ * s_cap_gen while session is in the reconnect state.
+ */
+ if (queue_release &&
+ (!session->s_cap_reconnect || cap->cap_gen == session->s_cap_gen)) {
+ cap->queue_release = 1;
+ if (removed) {
+ list_add_tail(&cap->session_caps,
+ &session->s_cap_releases);
+ session->s_num_cap_releases++;
+ removed = 0;
+ }
+ } else {
+ cap->queue_release = 0;
+ }
+ cap->cap_ino = ci->i_vino.ino;
+
+ spin_unlock(&session->s_cap_lock);
+
+ if (removed)
+ ceph_put_cap(mdsc, cap);
+
+ /* when reconnect denied, we remove session caps forcibly,
+ * i_wr_ref can be non-zero. If there are ongoing write,
+ * keep i_snap_realm.
+ */
+ if (!__ceph_is_any_caps(ci) && ci->i_wr_ref == 0 && ci->i_snap_realm)
+ drop_inode_snap_realm(ci);
+
+ if (!__ceph_is_any_real_caps(ci))
+ __cap_delay_cancel(mdsc, ci);
+}
+
+struct cap_msg_args {
+ struct ceph_mds_session *session;
+ u64 ino, cid, follows;
+ u64 flush_tid, oldest_flush_tid, size, max_size;
+ u64 xattr_version;
+ struct ceph_buffer *xattr_buf;
+ struct timespec64 atime, mtime, ctime;
+ int op, caps, wanted, dirty;
+ u32 seq, issue_seq, mseq, time_warp_seq;
+ u32 flags;
+ kuid_t uid;
+ kgid_t gid;
+ umode_t mode;
+ bool inline_data;
+};
+
+/*
+ * Build and send a cap message to the given MDS.
+ *
+ * Caller should be holding s_mutex.
+ */
+static int send_cap_msg(struct cap_msg_args *arg)
+{
+ struct ceph_mds_caps *fc;
+ struct ceph_msg *msg;
+ void *p;
+ size_t extra_len;
+ struct timespec64 zerotime = {0};
+ struct ceph_osd_client *osdc = &arg->session->s_mdsc->fsc->client->osdc;
+
+ dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s"
+ " seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu"
+ " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(arg->op),
+ arg->cid, arg->ino, ceph_cap_string(arg->caps),
+ ceph_cap_string(arg->wanted), ceph_cap_string(arg->dirty),
+ arg->seq, arg->issue_seq, arg->flush_tid, arg->oldest_flush_tid,
+ arg->mseq, arg->follows, arg->size, arg->max_size,
+ arg->xattr_version,
+ arg->xattr_buf ? (int)arg->xattr_buf->vec.iov_len : 0);
+
+ /* flock buffer size + inline version + inline data size +
+ * osd_epoch_barrier + oldest_flush_tid */
+ extra_len = 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4;
+ msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc) + extra_len,
+ GFP_NOFS, false);
+ if (!msg)
+ return -ENOMEM;
+
+ msg->hdr.version = cpu_to_le16(10);
+ msg->hdr.tid = cpu_to_le64(arg->flush_tid);
+
+ fc = msg->front.iov_base;
+ memset(fc, 0, sizeof(*fc));
+
+ fc->cap_id = cpu_to_le64(arg->cid);
+ fc->op = cpu_to_le32(arg->op);
+ fc->seq = cpu_to_le32(arg->seq);
+ fc->issue_seq = cpu_to_le32(arg->issue_seq);
+ fc->migrate_seq = cpu_to_le32(arg->mseq);
+ fc->caps = cpu_to_le32(arg->caps);
+ fc->wanted = cpu_to_le32(arg->wanted);
+ fc->dirty = cpu_to_le32(arg->dirty);
+ fc->ino = cpu_to_le64(arg->ino);
+ fc->snap_follows = cpu_to_le64(arg->follows);
+
+ fc->size = cpu_to_le64(arg->size);
+ fc->max_size = cpu_to_le64(arg->max_size);
+ ceph_encode_timespec64(&fc->mtime, &arg->mtime);
+ ceph_encode_timespec64(&fc->atime, &arg->atime);
+ ceph_encode_timespec64(&fc->ctime, &arg->ctime);
+ fc->time_warp_seq = cpu_to_le32(arg->time_warp_seq);
+
+ fc->uid = cpu_to_le32(from_kuid(&init_user_ns, arg->uid));
+ fc->gid = cpu_to_le32(from_kgid(&init_user_ns, arg->gid));
+ fc->mode = cpu_to_le32(arg->mode);
+
+ fc->xattr_version = cpu_to_le64(arg->xattr_version);
+ if (arg->xattr_buf) {
+ msg->middle = ceph_buffer_get(arg->xattr_buf);
+ fc->xattr_len = cpu_to_le32(arg->xattr_buf->vec.iov_len);
+ msg->hdr.middle_len = cpu_to_le32(arg->xattr_buf->vec.iov_len);
+ }
+
+ p = fc + 1;
+ /* flock buffer size (version 2) */
+ ceph_encode_32(&p, 0);
+ /* inline version (version 4) */
+ ceph_encode_64(&p, arg->inline_data ? 0 : CEPH_INLINE_NONE);
+ /* inline data size */
+ ceph_encode_32(&p, 0);
+ /*
+ * osd_epoch_barrier (version 5)
+ * The epoch_barrier is protected osdc->lock, so READ_ONCE here in
+ * case it was recently changed
+ */
+ ceph_encode_32(&p, READ_ONCE(osdc->epoch_barrier));
+ /* oldest_flush_tid (version 6) */
+ ceph_encode_64(&p, arg->oldest_flush_tid);
+
+ /*
+ * caller_uid/caller_gid (version 7)
+ *
+ * Currently, we don't properly track which caller dirtied the caps
+ * last, and force a flush of them when there is a conflict. For now,
+ * just set this to 0:0, to emulate how the MDS has worked up to now.
+ */
+ ceph_encode_32(&p, 0);
+ ceph_encode_32(&p, 0);
+
+ /* pool namespace (version 8) (mds always ignores this) */
+ ceph_encode_32(&p, 0);
+
+ /*
+ * btime and change_attr (version 9)
+ *
+ * We just zero these out for now, as the MDS ignores them unless
+ * the requisite feature flags are set (which we don't do yet).
+ */
+ ceph_encode_timespec64(p, &zerotime);
+ p += sizeof(struct ceph_timespec);
+ ceph_encode_64(&p, 0);
+
+ /* Advisory flags (version 10) */
+ ceph_encode_32(&p, arg->flags);
+
+ ceph_con_send(&arg->session->s_con, msg);
+ return 0;
+}
+
+/*
+ * Queue cap releases when an inode is dropped from our cache.
+ */
+void ceph_queue_caps_release(struct inode *inode)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct rb_node *p;
+
+ /* lock i_ceph_lock, because ceph_d_revalidate(..., LOOKUP_RCU)
+ * may call __ceph_caps_issued_mask() on a freeing inode. */
+ spin_lock(&ci->i_ceph_lock);
+ p = rb_first(&ci->i_caps);
+ while (p) {
+ struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
+ p = rb_next(p);
+ __ceph_remove_cap(cap, true);
+ }
+ spin_unlock(&ci->i_ceph_lock);
+}
+
+/*
+ * Send a cap msg on the given inode. Update our caps state, then
+ * drop i_ceph_lock and send the message.
+ *
+ * Make note of max_size reported/requested from mds, revoked caps
+ * that have now been implemented.
+ *
+ * Make half-hearted attempt ot to invalidate page cache if we are
+ * dropping RDCACHE. Note that this will leave behind locked pages
+ * that we'll then need to deal with elsewhere.
+ *
+ * Return non-zero if delayed release, or we experienced an error
+ * such that the caller should requeue + retry later.
+ *
+ * called with i_ceph_lock, then drops it.
+ * caller should hold snap_rwsem (read), s_mutex.
+ */
+static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
+ int op, bool sync, int used, int want, int retain,
+ int flushing, u64 flush_tid, u64 oldest_flush_tid)
+ __releases(cap->ci->i_ceph_lock)
+{
+ struct ceph_inode_info *ci = cap->ci;
+ struct inode *inode = &ci->vfs_inode;
+ struct ceph_buffer *old_blob = NULL;
+ struct cap_msg_args arg;
+ int held, revoking;
+ int wake = 0;
+ int delayed = 0;
+ int ret;
+
+ held = cap->issued | cap->implemented;
+ revoking = cap->implemented & ~cap->issued;
+ retain &= ~revoking;
+
+ dout("__send_cap %p cap %p session %p %s -> %s (revoking %s)\n",
+ inode, cap, cap->session,
+ ceph_cap_string(held), ceph_cap_string(held & retain),
+ ceph_cap_string(revoking));
+ BUG_ON((retain & CEPH_CAP_PIN) == 0);
+
+ arg.session = cap->session;
+
+ /* don't release wanted unless we've waited a bit. */
+ if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
+ time_before(jiffies, ci->i_hold_caps_min)) {
+ dout(" delaying issued %s -> %s, wanted %s -> %s on send\n",
+ ceph_cap_string(cap->issued),
+ ceph_cap_string(cap->issued & retain),
+ ceph_cap_string(cap->mds_wanted),
+ ceph_cap_string(want));
+ want |= cap->mds_wanted;
+ retain |= cap->issued;
+ delayed = 1;
+ }
+ ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH);
+ if (want & ~cap->mds_wanted) {
+ /* user space may open/close single file frequently.
+ * This avoids droping mds_wanted immediately after
+ * requesting new mds_wanted.
+ */
+ __cap_set_timeouts(mdsc, ci);
+ }
+
+ cap->issued &= retain; /* drop bits we don't want */
+ if (cap->implemented & ~cap->issued) {
+ /*
+ * Wake up any waiters on wanted -> needed transition.
+ * This is due to the weird transition from buffered
+ * to sync IO... we need to flush dirty pages _before_
+ * allowing sync writes to avoid reordering.
+ */
+ wake = 1;
+ }
+ cap->implemented &= cap->issued | used;
+ cap->mds_wanted = want;
+
+ arg.ino = ceph_vino(inode).ino;
+ arg.cid = cap->cap_id;
+ arg.follows = flushing ? ci->i_head_snapc->seq : 0;
+ arg.flush_tid = flush_tid;
+ arg.oldest_flush_tid = oldest_flush_tid;
+
+ arg.size = inode->i_size;
+ ci->i_reported_size = arg.size;
+ arg.max_size = ci->i_wanted_max_size;
+ ci->i_requested_max_size = arg.max_size;
+
+ if (flushing & CEPH_CAP_XATTR_EXCL) {
+ old_blob = __ceph_build_xattrs_blob(ci);
+ arg.xattr_version = ci->i_xattrs.version;
+ arg.xattr_buf = ci->i_xattrs.blob;
+ } else {
+ arg.xattr_buf = NULL;
+ }
+
+ arg.mtime = inode->i_mtime;
+ arg.atime = inode->i_atime;
+ arg.ctime = inode->i_ctime;
+
+ arg.op = op;
+ arg.caps = cap->implemented;
+ arg.wanted = want;
+ arg.dirty = flushing;
+
+ arg.seq = cap->seq;
+ arg.issue_seq = cap->issue_seq;
+ arg.mseq = cap->mseq;
+ arg.time_warp_seq = ci->i_time_warp_seq;
+
+ arg.uid = inode->i_uid;
+ arg.gid = inode->i_gid;
+ arg.mode = inode->i_mode;
+
+ arg.inline_data = ci->i_inline_version != CEPH_INLINE_NONE;
+ if (list_empty(&ci->i_cap_snaps))
+ arg.flags = CEPH_CLIENT_CAPS_NO_CAPSNAP;
+ else
+ arg.flags = CEPH_CLIENT_CAPS_PENDING_CAPSNAP;
+ if (sync)
+ arg.flags |= CEPH_CLIENT_CAPS_SYNC;
+
+ spin_unlock(&ci->i_ceph_lock);
+
+ ceph_buffer_put(old_blob);
+
+ ret = send_cap_msg(&arg);
+ if (ret < 0) {
+ dout("error sending cap msg, must requeue %p\n", inode);
+ delayed = 1;
+ }
+
+ if (wake)
+ wake_up_all(&ci->i_cap_wq);
+
+ return delayed;
+}
+
+static inline int __send_flush_snap(struct inode *inode,
+ struct ceph_mds_session *session,
+ struct ceph_cap_snap *capsnap,
+ u32 mseq, u64 oldest_flush_tid)
+{
+ struct cap_msg_args arg;
+
+ arg.session = session;
+ arg.ino = ceph_vino(inode).ino;
+ arg.cid = 0;
+ arg.follows = capsnap->follows;
+ arg.flush_tid = capsnap->cap_flush.tid;
+ arg.oldest_flush_tid = oldest_flush_tid;
+
+ arg.size = capsnap->size;
+ arg.max_size = 0;
+ arg.xattr_version = capsnap->xattr_version;
+ arg.xattr_buf = capsnap->xattr_blob;
+
+ arg.atime = capsnap->atime;
+ arg.mtime = capsnap->mtime;
+ arg.ctime = capsnap->ctime;
+
+ arg.op = CEPH_CAP_OP_FLUSHSNAP;
+ arg.caps = capsnap->issued;
+ arg.wanted = 0;
+ arg.dirty = capsnap->dirty;
+
+ arg.seq = 0;
+ arg.issue_seq = 0;
+ arg.mseq = mseq;
+ arg.time_warp_seq = capsnap->time_warp_seq;
+
+ arg.uid = capsnap->uid;
+ arg.gid = capsnap->gid;
+ arg.mode = capsnap->mode;
+
+ arg.inline_data = capsnap->inline_data;
+ arg.flags = 0;
+
+ return send_cap_msg(&arg);
+}
+
+/*
+ * When a snapshot is taken, clients accumulate dirty metadata on
+ * inodes with capabilities in ceph_cap_snaps to describe the file
+ * state at the time the snapshot was taken. This must be flushed
+ * asynchronously back to the MDS once sync writes complete and dirty
+ * data is written out.
+ *
+ * Called under i_ceph_lock. Takes s_mutex as needed.
+ */
+static void __ceph_flush_snaps(struct ceph_inode_info *ci,
+ struct ceph_mds_session *session)
+ __releases(ci->i_ceph_lock)
+ __acquires(ci->i_ceph_lock)
+{
+ struct inode *inode = &ci->vfs_inode;
+ struct ceph_mds_client *mdsc = session->s_mdsc;
+ struct ceph_cap_snap *capsnap;
+ u64 oldest_flush_tid = 0;
+ u64 first_tid = 1, last_tid = 0;
+
+ dout("__flush_snaps %p session %p\n", inode, session);
+
+ list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
+ /*
+ * we need to wait for sync writes to complete and for dirty
+ * pages to be written out.
+ */
+ if (capsnap->dirty_pages || capsnap->writing)
+ break;
+
+ /* should be removed by ceph_try_drop_cap_snap() */
+ BUG_ON(!capsnap->need_flush);
+
+ /* only flush each capsnap once */
+ if (capsnap->cap_flush.tid > 0) {
+ dout(" already flushed %p, skipping\n", capsnap);
+ continue;
+ }
+
+ spin_lock(&mdsc->cap_dirty_lock);
+ capsnap->cap_flush.tid = ++mdsc->last_cap_flush_tid;
+ list_add_tail(&capsnap->cap_flush.g_list,
+ &mdsc->cap_flush_list);
+ if (oldest_flush_tid == 0)
+ oldest_flush_tid = __get_oldest_flush_tid(mdsc);
+ if (list_empty(&ci->i_flushing_item)) {
+ list_add_tail(&ci->i_flushing_item,
+ &session->s_cap_flushing);
+ }
+ spin_unlock(&mdsc->cap_dirty_lock);
+
+ list_add_tail(&capsnap->cap_flush.i_list,
+ &ci->i_cap_flush_list);
+
+ if (first_tid == 1)
+ first_tid = capsnap->cap_flush.tid;
+ last_tid = capsnap->cap_flush.tid;
+ }
+
+ ci->i_ceph_flags &= ~CEPH_I_FLUSH_SNAPS;
+
+ while (first_tid <= last_tid) {
+ struct ceph_cap *cap = ci->i_auth_cap;
+ struct ceph_cap_flush *cf;
+ int ret;
+
+ if (!(cap && cap->session == session)) {
+ dout("__flush_snaps %p auth cap %p not mds%d, "
+ "stop\n", inode, cap, session->s_mds);
+ break;
+ }
+
+ ret = -ENOENT;
+ list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) {
+ if (cf->tid >= first_tid) {
+ ret = 0;
+ break;
+ }
+ }
+ if (ret < 0)
+ break;
+
+ first_tid = cf->tid + 1;
+
+ capsnap = container_of(cf, struct ceph_cap_snap, cap_flush);
+ refcount_inc(&capsnap->nref);
+ spin_unlock(&ci->i_ceph_lock);
+
+ dout("__flush_snaps %p capsnap %p tid %llu %s\n",
+ inode, capsnap, cf->tid, ceph_cap_string(capsnap->dirty));
+
+ ret = __send_flush_snap(inode, session, capsnap, cap->mseq,
+ oldest_flush_tid);
+ if (ret < 0) {
+ pr_err("__flush_snaps: error sending cap flushsnap, "
+ "ino (%llx.%llx) tid %llu follows %llu\n",
+ ceph_vinop(inode), cf->tid, capsnap->follows);
+ }
+
+ ceph_put_cap_snap(capsnap);
+ spin_lock(&ci->i_ceph_lock);
+ }
+}
+
+void ceph_flush_snaps(struct ceph_inode_info *ci,
+ struct ceph_mds_session **psession)
+{
+ struct inode *inode = &ci->vfs_inode;
+ struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+ struct ceph_mds_session *session = NULL;
+ int mds;
+
+ dout("ceph_flush_snaps %p\n", inode);
+ if (psession)
+ session = *psession;
+retry:
+ spin_lock(&ci->i_ceph_lock);
+ if (!(ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)) {
+ dout(" no capsnap needs flush, doing nothing\n");
+ goto out;
+ }
+ if (!ci->i_auth_cap) {
+ dout(" no auth cap (migrating?), doing nothing\n");
+ goto out;
+ }
+
+ mds = ci->i_auth_cap->session->s_mds;
+ if (session && session->s_mds != mds) {
+ dout(" oops, wrong session %p mutex\n", session);
+ mutex_unlock(&session->s_mutex);
+ ceph_put_mds_session(session);
+ session = NULL;
+ }
+ if (!session) {
+ spin_unlock(&ci->i_ceph_lock);
+ mutex_lock(&mdsc->mutex);
+ session = __ceph_lookup_mds_session(mdsc, mds);
+ mutex_unlock(&mdsc->mutex);
+ if (session) {
+ dout(" inverting session/ino locks on %p\n", session);
+ mutex_lock(&session->s_mutex);
+ }
+ goto retry;
+ }
+
+ // make sure flushsnap messages are sent in proper order.
+ if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) {
+ __kick_flushing_caps(mdsc, session, ci, 0);
+ ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
+ }
+
+ __ceph_flush_snaps(ci, session);
+out:
+ spin_unlock(&ci->i_ceph_lock);
+
+ if (psession) {
+ *psession = session;
+ } else if (session) {
+ mutex_unlock(&session->s_mutex);
+ ceph_put_mds_session(session);
+ }
+ /* we flushed them all; remove this inode from the queue */
+ spin_lock(&mdsc->snap_flush_lock);
+ list_del_init(&ci->i_snap_flush_item);
+ spin_unlock(&mdsc->snap_flush_lock);
+}
+
+/*
+ * Mark caps dirty. If inode is newly dirty, return the dirty flags.
+ * Caller is then responsible for calling __mark_inode_dirty with the
+ * returned flags value.
+ */
+int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
+ struct ceph_cap_flush **pcf)
+{
+ struct ceph_mds_client *mdsc =
+ ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
+ struct inode *inode = &ci->vfs_inode;
+ int was = ci->i_dirty_caps;
+ int dirty = 0;
+
+ if (!ci->i_auth_cap) {
+ pr_warn("__mark_dirty_caps %p %llx mask %s, "
+ "but no auth cap (session was closed?)\n",
+ inode, ceph_ino(inode), ceph_cap_string(mask));
+ return 0;
+ }
+
+ dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode,
+ ceph_cap_string(mask), ceph_cap_string(was),
+ ceph_cap_string(was | mask));
+ ci->i_dirty_caps |= mask;
+ if (was == 0) {
+ WARN_ON_ONCE(ci->i_prealloc_cap_flush);
+ swap(ci->i_prealloc_cap_flush, *pcf);
+
+ if (!ci->i_head_snapc) {
+ WARN_ON_ONCE(!rwsem_is_locked(&mdsc->snap_rwsem));
+ ci->i_head_snapc = ceph_get_snap_context(
+ ci->i_snap_realm->cached_context);
+ }
+ dout(" inode %p now dirty snapc %p auth cap %p\n",
+ &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap);
+ BUG_ON(!list_empty(&ci->i_dirty_item));
+ spin_lock(&mdsc->cap_dirty_lock);
+ list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
+ spin_unlock(&mdsc->cap_dirty_lock);
+ if (ci->i_flushing_caps == 0) {
+ ihold(inode);
+ dirty |= I_DIRTY_SYNC;
+ }
+ } else {
+ WARN_ON_ONCE(!ci->i_prealloc_cap_flush);
+ }
+ BUG_ON(list_empty(&ci->i_dirty_item));
+ if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
+ (mask & CEPH_CAP_FILE_BUFFER))
+ dirty |= I_DIRTY_DATASYNC;
+ __cap_delay_requeue(mdsc, ci);
+ return dirty;
+}
+
+struct ceph_cap_flush *ceph_alloc_cap_flush(void)
+{
+ return kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL);
+}
+
+void ceph_free_cap_flush(struct ceph_cap_flush *cf)
+{
+ if (cf)
+ kmem_cache_free(ceph_cap_flush_cachep, cf);
+}
+
+static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc)
+{
+ if (!list_empty(&mdsc->cap_flush_list)) {
+ struct ceph_cap_flush *cf =
+ list_first_entry(&mdsc->cap_flush_list,
+ struct ceph_cap_flush, g_list);
+ return cf->tid;
+ }
+ return 0;
+}
+
+/*
+ * Remove cap_flush from the mdsc's or inode's flushing cap list.
+ * Return true if caller needs to wake up flush waiters.
+ */
+static bool __finish_cap_flush(struct ceph_mds_client *mdsc,
+ struct ceph_inode_info *ci,
+ struct ceph_cap_flush *cf)
+{
+ struct ceph_cap_flush *prev;
+ bool wake = cf->wake;
+ if (mdsc) {
+ /* are there older pending cap flushes? */
+ if (wake && cf->g_list.prev != &mdsc->cap_flush_list) {
+ prev = list_prev_entry(cf, g_list);
+ prev->wake = true;
+ wake = false;
+ }
+ list_del(&cf->g_list);
+ } else if (ci) {
+ if (wake && cf->i_list.prev != &ci->i_cap_flush_list) {
+ prev = list_prev_entry(cf, i_list);
+ prev->wake = true;
+ wake = false;
+ }
+ list_del(&cf->i_list);
+ } else {
+ BUG_ON(1);
+ }
+ return wake;
+}
+
+/*
+ * Add dirty inode to the flushing list. Assigned a seq number so we
+ * can wait for caps to flush without starving.
+ *
+ * Called under i_ceph_lock.
+ */
+static int __mark_caps_flushing(struct inode *inode,
+ struct ceph_mds_session *session, bool wake,
+ u64 *flush_tid, u64 *oldest_flush_tid)
+{
+ struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_cap_flush *cf = NULL;
+ int flushing;
+
+ BUG_ON(ci->i_dirty_caps == 0);
+ BUG_ON(list_empty(&ci->i_dirty_item));
+ BUG_ON(!ci->i_prealloc_cap_flush);
+
+ flushing = ci->i_dirty_caps;
+ dout("__mark_caps_flushing flushing %s, flushing_caps %s -> %s\n",
+ ceph_cap_string(flushing),
+ ceph_cap_string(ci->i_flushing_caps),
+ ceph_cap_string(ci->i_flushing_caps | flushing));
+ ci->i_flushing_caps |= flushing;
+ ci->i_dirty_caps = 0;
+ dout(" inode %p now !dirty\n", inode);
+
+ swap(cf, ci->i_prealloc_cap_flush);
+ cf->caps = flushing;
+ cf->wake = wake;
+
+ spin_lock(&mdsc->cap_dirty_lock);
+ list_del_init(&ci->i_dirty_item);
+
+ cf->tid = ++mdsc->last_cap_flush_tid;
+ list_add_tail(&cf->g_list, &mdsc->cap_flush_list);
+ *oldest_flush_tid = __get_oldest_flush_tid(mdsc);
+
+ if (list_empty(&ci->i_flushing_item)) {
+ list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
+ mdsc->num_cap_flushing++;
+ }
+ spin_unlock(&mdsc->cap_dirty_lock);
+
+ list_add_tail(&cf->i_list, &ci->i_cap_flush_list);
+
+ *flush_tid = cf->tid;
+ return flushing;
+}
+
+/*
+ * try to invalidate mapping pages without blocking.
+ */
+static int try_nonblocking_invalidate(struct inode *inode)
+ __releases(ci->i_ceph_lock)
+ __acquires(ci->i_ceph_lock)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ u32 invalidating_gen = ci->i_rdcache_gen;
+
+ spin_unlock(&ci->i_ceph_lock);
+ ceph_fscache_invalidate(inode);
+ invalidate_mapping_pages(&inode->i_data, 0, -1);
+ spin_lock(&ci->i_ceph_lock);
+
+ if (inode->i_data.nrpages == 0 &&
+ invalidating_gen == ci->i_rdcache_gen) {
+ /* success. */
+ dout("try_nonblocking_invalidate %p success\n", inode);
+ /* save any racing async invalidate some trouble */
+ ci->i_rdcache_revoking = ci->i_rdcache_gen - 1;
+ return 0;
+ }
+ dout("try_nonblocking_invalidate %p failed\n", inode);
+ return -1;
+}
+
+bool __ceph_should_report_size(struct ceph_inode_info *ci)
+{
+ loff_t size = ci->vfs_inode.i_size;
+ /* mds will adjust max size according to the reported size */
+ if (ci->i_flushing_caps & CEPH_CAP_FILE_WR)
+ return false;
+ if (size >= ci->i_max_size)
+ return true;
+ /* half of previous max_size increment has been used */
+ if (ci->i_max_size > ci->i_reported_size &&
+ (size << 1) >= ci->i_max_size + ci->i_reported_size)
+ return true;
+ return false;
+}
+
+/*
+ * Swiss army knife function to examine currently used and wanted
+ * versus held caps. Release, flush, ack revoked caps to mds as
+ * appropriate.
+ *
+ * CHECK_CAPS_NODELAY - caller is delayed work and we should not delay
+ * cap release further.
+ * CHECK_CAPS_AUTHONLY - we should only check the auth cap
+ * CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without
+ * further delay.
+ */
+void ceph_check_caps(struct ceph_inode_info *ci, int flags,
+ struct ceph_mds_session *session)
+{
+ struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode);
+ struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct inode *inode = &ci->vfs_inode;
+ struct ceph_cap *cap;
+ u64 flush_tid, oldest_flush_tid;
+ int file_wanted, used, cap_used;
+ int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */
+ int issued, implemented, want, retain, revoking, flushing = 0;
+ int mds = -1; /* keep track of how far we've gone through i_caps list
+ to avoid an infinite loop on retry */
+ struct rb_node *p;
+ int delayed = 0, sent = 0;
+ bool no_delay = flags & CHECK_CAPS_NODELAY;
+ bool queue_invalidate = false;
+ bool tried_invalidate = false;
+
+ /* if we are unmounting, flush any unused caps immediately. */
+ if (mdsc->stopping)
+ no_delay = true;
+
+ spin_lock(&ci->i_ceph_lock);
+
+ if (ci->i_ceph_flags & CEPH_I_FLUSH)
+ flags |= CHECK_CAPS_FLUSH;
+
+ if (!(flags & CHECK_CAPS_AUTHONLY) ||
+ (ci->i_auth_cap && __ceph_is_single_caps(ci)))
+ __cap_delay_cancel(mdsc, ci);
+
+ goto retry_locked;
+retry:
+ spin_lock(&ci->i_ceph_lock);
+retry_locked:
+ file_wanted = __ceph_caps_file_wanted(ci);
+ used = __ceph_caps_used(ci);
+ issued = __ceph_caps_issued(ci, &implemented);
+ revoking = implemented & ~issued;
+
+ want = file_wanted;
+ retain = file_wanted | used | CEPH_CAP_PIN;
+ if (!mdsc->stopping && inode->i_nlink > 0) {
+ if (file_wanted) {
+ retain |= CEPH_CAP_ANY; /* be greedy */
+ } else if (S_ISDIR(inode->i_mode) &&
+ (issued & CEPH_CAP_FILE_SHARED) &&
+ __ceph_dir_is_complete(ci)) {
+ /*
+ * If a directory is complete, we want to keep
+ * the exclusive cap. So that MDS does not end up
+ * revoking the shared cap on every create/unlink
+ * operation.
+ */
+ want = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
+ retain |= want;
+ } else {
+
+ retain |= CEPH_CAP_ANY_SHARED;
+ /*
+ * keep RD only if we didn't have the file open RW,
+ * because then the mds would revoke it anyway to
+ * journal max_size=0.
+ */
+ if (ci->i_max_size == 0)
+ retain |= CEPH_CAP_ANY_RD;
+ }
+ }
+
+ dout("check_caps %p file_want %s used %s dirty %s flushing %s"
+ " issued %s revoking %s retain %s %s%s%s\n", inode,
+ ceph_cap_string(file_wanted),
+ ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps),
+ ceph_cap_string(ci->i_flushing_caps),
+ ceph_cap_string(issued), ceph_cap_string(revoking),
+ ceph_cap_string(retain),
+ (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "",
+ (flags & CHECK_CAPS_NODELAY) ? " NODELAY" : "",
+ (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "");
+
+ /*
+ * If we no longer need to hold onto old our caps, and we may
+ * have cached pages, but don't want them, then try to invalidate.
+ * If we fail, it's because pages are locked.... try again later.
+ */
+ if ((!no_delay || mdsc->stopping) &&
+ !S_ISDIR(inode->i_mode) && /* ignore readdir cache */
+ !(ci->i_wb_ref || ci->i_wrbuffer_ref) && /* no dirty pages... */
+ inode->i_data.nrpages && /* have cached pages */
+ (revoking & (CEPH_CAP_FILE_CACHE|
+ CEPH_CAP_FILE_LAZYIO)) && /* or revoking cache */
+ !tried_invalidate) {
+ dout("check_caps trying to invalidate on %p\n", inode);
+ if (try_nonblocking_invalidate(inode) < 0) {
+ dout("check_caps queuing invalidate\n");
+ queue_invalidate = true;
+ ci->i_rdcache_revoking = ci->i_rdcache_gen;
+ }
+ tried_invalidate = true;
+ goto retry_locked;
+ }
+
+ for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+ cap = rb_entry(p, struct ceph_cap, ci_node);
+
+ /* avoid looping forever */
+ if (mds >= cap->mds ||
+ ((flags & CHECK_CAPS_AUTHONLY) && cap != ci->i_auth_cap))
+ continue;
+
+ /* NOTE: no side-effects allowed, until we take s_mutex */
+
+ cap_used = used;
+ if (ci->i_auth_cap && cap != ci->i_auth_cap)
+ cap_used &= ~ci->i_auth_cap->issued;
+
+ revoking = cap->implemented & ~cap->issued;
+ dout(" mds%d cap %p used %s issued %s implemented %s revoking %s\n",
+ cap->mds, cap, ceph_cap_string(cap_used),
+ ceph_cap_string(cap->issued),
+ ceph_cap_string(cap->implemented),
+ ceph_cap_string(revoking));
+
+ if (cap == ci->i_auth_cap &&
+ (cap->issued & CEPH_CAP_FILE_WR)) {
+ /* request larger max_size from MDS? */
+ if (ci->i_wanted_max_size > ci->i_max_size &&
+ ci->i_wanted_max_size > ci->i_requested_max_size) {
+ dout("requesting new max_size\n");
+ goto ack;
+ }
+
+ /* approaching file_max? */
+ if (__ceph_should_report_size(ci)) {
+ dout("i_size approaching max_size\n");
+ goto ack;
+ }
+ }
+ /* flush anything dirty? */
+ if (cap == ci->i_auth_cap) {
+ if ((flags & CHECK_CAPS_FLUSH) && ci->i_dirty_caps) {
+ dout("flushing dirty caps\n");
+ goto ack;
+ }
+ if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS) {
+ dout("flushing snap caps\n");
+ goto ack;
+ }
+ }
+
+ /* completed revocation? going down and there are no caps? */
+ if (revoking && (revoking & cap_used) == 0) {
+ dout("completed revocation of %s\n",
+ ceph_cap_string(cap->implemented & ~cap->issued));
+ goto ack;
+ }
+
+ /* want more caps from mds? */
+ if (want & ~cap->mds_wanted) {
+ if (want & ~(cap->mds_wanted | cap->issued))
+ goto ack;
+ if (!__cap_is_valid(cap))
+ goto ack;
+ }
+
+ /* things we might delay */
+ if ((cap->issued & ~retain) == 0 &&
+ cap->mds_wanted == want)
+ continue; /* nope, all good */
+
+ if (no_delay)
+ goto ack;
+
+ /* delay? */
+ if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
+ time_before(jiffies, ci->i_hold_caps_max)) {
+ dout(" delaying issued %s -> %s, wanted %s -> %s\n",
+ ceph_cap_string(cap->issued),
+ ceph_cap_string(cap->issued & retain),
+ ceph_cap_string(cap->mds_wanted),
+ ceph_cap_string(want));
+ delayed++;
+ continue;
+ }
+
+ack:
+ if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
+ dout(" skipping %p I_NOFLUSH set\n", inode);
+ continue;
+ }
+
+ if (session && session != cap->session) {
+ dout("oops, wrong session %p mutex\n", session);
+ mutex_unlock(&session->s_mutex);
+ session = NULL;
+ }
+ if (!session) {
+ session = cap->session;
+ if (mutex_trylock(&session->s_mutex) == 0) {
+ dout("inverting session/ino locks on %p\n",
+ session);
+ session = ceph_get_mds_session(session);
+ spin_unlock(&ci->i_ceph_lock);
+ if (took_snap_rwsem) {
+ up_read(&mdsc->snap_rwsem);
+ took_snap_rwsem = 0;
+ }
+ if (session) {
+ mutex_lock(&session->s_mutex);
+ ceph_put_mds_session(session);
+ } else {
+ /*
+ * Because we take the reference while
+ * holding the i_ceph_lock, it should
+ * never be NULL. Throw a warning if it
+ * ever is.
+ */
+ WARN_ON_ONCE(true);
+ }
+ goto retry;
+ }
+ }
+
+ /* kick flushing and flush snaps before sending normal
+ * cap message */
+ if (cap == ci->i_auth_cap &&
+ (ci->i_ceph_flags &
+ (CEPH_I_KICK_FLUSH | CEPH_I_FLUSH_SNAPS))) {
+ if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) {
+ __kick_flushing_caps(mdsc, session, ci, 0);
+ ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
+ }
+ if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)
+ __ceph_flush_snaps(ci, session);
+
+ goto retry_locked;
+ }
+
+ /* take snap_rwsem after session mutex */
+ if (!took_snap_rwsem) {
+ if (down_read_trylock(&mdsc->snap_rwsem) == 0) {
+ dout("inverting snap/in locks on %p\n",
+ inode);
+ spin_unlock(&ci->i_ceph_lock);
+ down_read(&mdsc->snap_rwsem);
+ took_snap_rwsem = 1;
+ goto retry;
+ }
+ took_snap_rwsem = 1;
+ }
+
+ if (cap == ci->i_auth_cap && ci->i_dirty_caps) {
+ flushing = __mark_caps_flushing(inode, session, false,
+ &flush_tid,
+ &oldest_flush_tid);
+ } else {
+ flushing = 0;
+ flush_tid = 0;
+ spin_lock(&mdsc->cap_dirty_lock);
+ oldest_flush_tid = __get_oldest_flush_tid(mdsc);
+ spin_unlock(&mdsc->cap_dirty_lock);
+ }
+
+ mds = cap->mds; /* remember mds, so we don't repeat */
+ sent++;
+
+ /* __send_cap drops i_ceph_lock */
+ delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, false,
+ cap_used, want, retain, flushing,
+ flush_tid, oldest_flush_tid);
+ goto retry; /* retake i_ceph_lock and restart our cap scan. */
+ }
+
+ /* Reschedule delayed caps release if we delayed anything */
+ if (delayed)
+ __cap_delay_requeue(mdsc, ci);
+
+ spin_unlock(&ci->i_ceph_lock);
+
+ if (queue_invalidate)
+ ceph_queue_invalidate(inode);
+
+ if (session)
+ mutex_unlock(&session->s_mutex);
+ if (took_snap_rwsem)
+ up_read(&mdsc->snap_rwsem);
+}
+
+/*
+ * Try to flush dirty caps back to the auth mds.
+ */
+static int try_flush_caps(struct inode *inode, u64 *ptid)
+{
+ struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_mds_session *session = NULL;
+ int flushing = 0;
+ u64 flush_tid = 0, oldest_flush_tid = 0;
+
+retry:
+ spin_lock(&ci->i_ceph_lock);
+ if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
+ spin_unlock(&ci->i_ceph_lock);
+ dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode);
+ goto out;
+ }
+ if (ci->i_dirty_caps && ci->i_auth_cap) {
+ struct ceph_cap *cap = ci->i_auth_cap;
+ int used = __ceph_caps_used(ci);
+ int want = __ceph_caps_wanted(ci);
+ int delayed;
+
+ if (!session || session != cap->session) {
+ spin_unlock(&ci->i_ceph_lock);
+ if (session)
+ mutex_unlock(&session->s_mutex);
+ session = cap->session;
+ mutex_lock(&session->s_mutex);
+ goto retry;
+ }
+ if (cap->session->s_state < CEPH_MDS_SESSION_OPEN) {
+ spin_unlock(&ci->i_ceph_lock);
+ goto out;
+ }
+
+ flushing = __mark_caps_flushing(inode, session, true,
+ &flush_tid, &oldest_flush_tid);
+
+ /* __send_cap drops i_ceph_lock */
+ delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, true,
+ used, want, (cap->issued | cap->implemented),
+ flushing, flush_tid, oldest_flush_tid);
+
+ if (delayed) {
+ spin_lock(&ci->i_ceph_lock);
+ __cap_delay_requeue(mdsc, ci);
+ spin_unlock(&ci->i_ceph_lock);
+ }
+ } else {
+ if (!list_empty(&ci->i_cap_flush_list)) {
+ struct ceph_cap_flush *cf =
+ list_last_entry(&ci->i_cap_flush_list,
+ struct ceph_cap_flush, i_list);
+ cf->wake = true;
+ flush_tid = cf->tid;
+ }
+ flushing = ci->i_flushing_caps;
+ spin_unlock(&ci->i_ceph_lock);
+ }
+out:
+ if (session)
+ mutex_unlock(&session->s_mutex);
+
+ *ptid = flush_tid;
+ return flushing;
+}
+
+/*
+ * Return true if we've flushed caps through the given flush_tid.
+ */
+static int caps_are_flushed(struct inode *inode, u64 flush_tid)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int ret = 1;
+
+ spin_lock(&ci->i_ceph_lock);
+ if (!list_empty(&ci->i_cap_flush_list)) {
+ struct ceph_cap_flush * cf =
+ list_first_entry(&ci->i_cap_flush_list,
+ struct ceph_cap_flush, i_list);
+ if (cf->tid <= flush_tid)
+ ret = 0;
+ }
+ spin_unlock(&ci->i_ceph_lock);
+ return ret;
+}
+
+/*
+ * wait for any unsafe requests to complete.
+ */
+static int unsafe_request_wait(struct inode *inode)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_mds_request *req1 = NULL, *req2 = NULL;
+ int ret, err = 0;
+
+ spin_lock(&ci->i_unsafe_lock);
+ if (S_ISDIR(inode->i_mode) && !list_empty(&ci->i_unsafe_dirops)) {
+ req1 = list_last_entry(&ci->i_unsafe_dirops,
+ struct ceph_mds_request,
+ r_unsafe_dir_item);
+ ceph_mdsc_get_request(req1);
+ }
+ if (!list_empty(&ci->i_unsafe_iops)) {
+ req2 = list_last_entry(&ci->i_unsafe_iops,
+ struct ceph_mds_request,
+ r_unsafe_target_item);
+ ceph_mdsc_get_request(req2);
+ }
+ spin_unlock(&ci->i_unsafe_lock);
+
+ dout("unsafe_request_wait %p wait on tid %llu %llu\n",
+ inode, req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL);
+ if (req1) {
+ ret = !wait_for_completion_timeout(&req1->r_safe_completion,
+ ceph_timeout_jiffies(req1->r_timeout));
+ if (ret)
+ err = -EIO;
+ ceph_mdsc_put_request(req1);
+ }
+ if (req2) {
+ ret = !wait_for_completion_timeout(&req2->r_safe_completion,
+ ceph_timeout_jiffies(req2->r_timeout));
+ if (ret)
+ err = -EIO;
+ ceph_mdsc_put_request(req2);
+ }
+ return err;
+}
+
+int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
+{
+ struct inode *inode = file->f_mapping->host;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ u64 flush_tid;
+ int ret;
+ int dirty;
+
+ dout("fsync %p%s\n", inode, datasync ? " datasync" : "");
+
+ ret = file_write_and_wait_range(file, start, end);
+ if (ret < 0)
+ goto out;
+
+ if (datasync)
+ goto out;
+
+ inode_lock(inode);
+
+ dirty = try_flush_caps(inode, &flush_tid);
+ dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
+
+ ret = unsafe_request_wait(inode);
+
+ /*
+ * only wait on non-file metadata writeback (the mds
+ * can recover size and mtime, so we don't need to
+ * wait for that)
+ */
+ if (!ret && (dirty & ~CEPH_CAP_ANY_FILE_WR)) {
+ ret = wait_event_interruptible(ci->i_cap_wq,
+ caps_are_flushed(inode, flush_tid));
+ }
+ inode_unlock(inode);
+out:
+ dout("fsync %p%s result=%d\n", inode, datasync ? " datasync" : "", ret);
+ return ret;
+}
+
+/*
+ * Flush any dirty caps back to the mds. If we aren't asked to wait,
+ * queue inode for flush but don't do so immediately, because we can
+ * get by with fewer MDS messages if we wait for data writeback to
+ * complete first.
+ */
+int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ u64 flush_tid;
+ int err = 0;
+ int dirty;
+ int wait = (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync);
+
+ dout("write_inode %p wait=%d\n", inode, wait);
+ if (wait) {
+ dirty = try_flush_caps(inode, &flush_tid);
+ if (dirty)
+ err = wait_event_interruptible(ci->i_cap_wq,
+ caps_are_flushed(inode, flush_tid));
+ } else {
+ struct ceph_mds_client *mdsc =
+ ceph_sb_to_client(inode->i_sb)->mdsc;
+
+ spin_lock(&ci->i_ceph_lock);
+ if (__ceph_caps_dirty(ci))
+ __cap_delay_requeue_front(mdsc, ci);
+ spin_unlock(&ci->i_ceph_lock);
+ }
+ return err;
+}
+
+static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session,
+ struct ceph_inode_info *ci,
+ u64 oldest_flush_tid)
+ __releases(ci->i_ceph_lock)
+ __acquires(ci->i_ceph_lock)
+{
+ struct inode *inode = &ci->vfs_inode;
+ struct ceph_cap *cap;
+ struct ceph_cap_flush *cf;
+ int ret;
+ u64 first_tid = 0;
+
+ list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) {
+ if (cf->tid < first_tid)
+ continue;
+
+ cap = ci->i_auth_cap;
+ if (!(cap && cap->session == session)) {
+ pr_err("%p auth cap %p not mds%d ???\n",
+ inode, cap, session->s_mds);
+ break;
+ }
+
+ first_tid = cf->tid + 1;
+
+ if (cf->caps) {
+ dout("kick_flushing_caps %p cap %p tid %llu %s\n",
+ inode, cap, cf->tid, ceph_cap_string(cf->caps));
+ ci->i_ceph_flags |= CEPH_I_NODELAY;
+ ret = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
+ false, __ceph_caps_used(ci),
+ __ceph_caps_wanted(ci),
+ cap->issued | cap->implemented,
+ cf->caps, cf->tid, oldest_flush_tid);
+ if (ret) {
+ pr_err("kick_flushing_caps: error sending "
+ "cap flush, ino (%llx.%llx) "
+ "tid %llu flushing %s\n",
+ ceph_vinop(inode), cf->tid,
+ ceph_cap_string(cf->caps));
+ }
+ } else {
+ struct ceph_cap_snap *capsnap =
+ container_of(cf, struct ceph_cap_snap,
+ cap_flush);
+ dout("kick_flushing_caps %p capsnap %p tid %llu %s\n",
+ inode, capsnap, cf->tid,
+ ceph_cap_string(capsnap->dirty));
+
+ refcount_inc(&capsnap->nref);
+ spin_unlock(&ci->i_ceph_lock);
+
+ ret = __send_flush_snap(inode, session, capsnap, cap->mseq,
+ oldest_flush_tid);
+ if (ret < 0) {
+ pr_err("kick_flushing_caps: error sending "
+ "cap flushsnap, ino (%llx.%llx) "
+ "tid %llu follows %llu\n",
+ ceph_vinop(inode), cf->tid,
+ capsnap->follows);
+ }
+
+ ceph_put_cap_snap(capsnap);
+ }
+
+ spin_lock(&ci->i_ceph_lock);
+ }
+}
+
+void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session)
+{
+ struct ceph_inode_info *ci;
+ struct ceph_cap *cap;
+ u64 oldest_flush_tid;
+
+ dout("early_kick_flushing_caps mds%d\n", session->s_mds);
+
+ spin_lock(&mdsc->cap_dirty_lock);
+ oldest_flush_tid = __get_oldest_flush_tid(mdsc);
+ spin_unlock(&mdsc->cap_dirty_lock);
+
+ list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
+ spin_lock(&ci->i_ceph_lock);
+ cap = ci->i_auth_cap;
+ if (!(cap && cap->session == session)) {
+ pr_err("%p auth cap %p not mds%d ???\n",
+ &ci->vfs_inode, cap, session->s_mds);
+ spin_unlock(&ci->i_ceph_lock);
+ continue;
+ }
+
+
+ /*
+ * if flushing caps were revoked, we re-send the cap flush
+ * in client reconnect stage. This guarantees MDS * processes
+ * the cap flush message before issuing the flushing caps to
+ * other client.
+ */
+ if ((cap->issued & ci->i_flushing_caps) !=
+ ci->i_flushing_caps) {
+ ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
+ __kick_flushing_caps(mdsc, session, ci,
+ oldest_flush_tid);
+ } else {
+ ci->i_ceph_flags |= CEPH_I_KICK_FLUSH;
+ }
+
+ spin_unlock(&ci->i_ceph_lock);
+ }
+}
+
+void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session)
+{
+ struct ceph_inode_info *ci;
+ struct ceph_cap *cap;
+ u64 oldest_flush_tid;
+
+ dout("kick_flushing_caps mds%d\n", session->s_mds);
+
+ spin_lock(&mdsc->cap_dirty_lock);
+ oldest_flush_tid = __get_oldest_flush_tid(mdsc);
+ spin_unlock(&mdsc->cap_dirty_lock);
+
+ list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
+ spin_lock(&ci->i_ceph_lock);
+ cap = ci->i_auth_cap;
+ if (!(cap && cap->session == session)) {
+ pr_err("%p auth cap %p not mds%d ???\n",
+ &ci->vfs_inode, cap, session->s_mds);
+ spin_unlock(&ci->i_ceph_lock);
+ continue;
+ }
+ if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) {
+ ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
+ __kick_flushing_caps(mdsc, session, ci,
+ oldest_flush_tid);
+ }
+ spin_unlock(&ci->i_ceph_lock);
+ }
+}
+
+static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session,
+ struct inode *inode)
+ __releases(ci->i_ceph_lock)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_cap *cap;
+
+ cap = ci->i_auth_cap;
+ dout("kick_flushing_inode_caps %p flushing %s\n", inode,
+ ceph_cap_string(ci->i_flushing_caps));
+
+ if (!list_empty(&ci->i_cap_flush_list)) {
+ u64 oldest_flush_tid;
+ spin_lock(&mdsc->cap_dirty_lock);
+ list_move_tail(&ci->i_flushing_item,
+ &cap->session->s_cap_flushing);
+ oldest_flush_tid = __get_oldest_flush_tid(mdsc);
+ spin_unlock(&mdsc->cap_dirty_lock);
+
+ ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
+ __kick_flushing_caps(mdsc, session, ci, oldest_flush_tid);
+ spin_unlock(&ci->i_ceph_lock);
+ } else {
+ spin_unlock(&ci->i_ceph_lock);
+ }
+}
+
+
+/*
+ * Take references to capabilities we hold, so that we don't release
+ * them to the MDS prematurely.
+ *
+ * Protected by i_ceph_lock.
+ */
+static void __take_cap_refs(struct ceph_inode_info *ci, int got,
+ bool snap_rwsem_locked)
+{
+ if (got & CEPH_CAP_PIN)
+ ci->i_pin_ref++;
+ if (got & CEPH_CAP_FILE_RD)
+ ci->i_rd_ref++;
+ if (got & CEPH_CAP_FILE_CACHE)
+ ci->i_rdcache_ref++;
+ if (got & CEPH_CAP_FILE_WR) {
+ if (ci->i_wr_ref == 0 && !ci->i_head_snapc) {
+ BUG_ON(!snap_rwsem_locked);
+ ci->i_head_snapc = ceph_get_snap_context(
+ ci->i_snap_realm->cached_context);
+ }
+ ci->i_wr_ref++;
+ }
+ if (got & CEPH_CAP_FILE_BUFFER) {
+ if (ci->i_wb_ref == 0)
+ ihold(&ci->vfs_inode);
+ ci->i_wb_ref++;
+ dout("__take_cap_refs %p wb %d -> %d (?)\n",
+ &ci->vfs_inode, ci->i_wb_ref-1, ci->i_wb_ref);
+ }
+}
+
+/*
+ * Try to grab cap references. Specify those refs we @want, and the
+ * minimal set we @need. Also include the larger offset we are writing
+ * to (when applicable), and check against max_size here as well.
+ * Note that caller is responsible for ensuring max_size increases are
+ * requested from the MDS.
+ */
+static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
+ loff_t endoff, bool nonblock, int *got, int *err)
+{
+ struct inode *inode = &ci->vfs_inode;
+ struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+ int ret = 0;
+ int have, implemented;
+ int file_wanted;
+ bool snap_rwsem_locked = false;
+
+ dout("get_cap_refs %p need %s want %s\n", inode,
+ ceph_cap_string(need), ceph_cap_string(want));
+
+again:
+ spin_lock(&ci->i_ceph_lock);
+
+ /* make sure file is actually open */
+ file_wanted = __ceph_caps_file_wanted(ci);
+ if ((file_wanted & need) != need) {
+ dout("try_get_cap_refs need %s file_wanted %s, EBADF\n",
+ ceph_cap_string(need), ceph_cap_string(file_wanted));
+ *err = -EBADF;
+ ret = 1;
+ goto out_unlock;
+ }
+
+ /* finish pending truncate */
+ while (ci->i_truncate_pending) {
+ spin_unlock(&ci->i_ceph_lock);
+ if (snap_rwsem_locked) {
+ up_read(&mdsc->snap_rwsem);
+ snap_rwsem_locked = false;
+ }
+ __ceph_do_pending_vmtruncate(inode);
+ spin_lock(&ci->i_ceph_lock);
+ }
+
+ have = __ceph_caps_issued(ci, &implemented);
+
+ if (have & need & CEPH_CAP_FILE_WR) {
+ if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) {
+ dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
+ inode, endoff, ci->i_max_size);
+ if (endoff > ci->i_requested_max_size) {
+ *err = -EAGAIN;
+ ret = 1;
+ }
+ goto out_unlock;
+ }
+ /*
+ * If a sync write is in progress, we must wait, so that we
+ * can get a final snapshot value for size+mtime.
+ */
+ if (__ceph_have_pending_cap_snap(ci)) {
+ dout("get_cap_refs %p cap_snap_pending\n", inode);
+ goto out_unlock;
+ }
+ }
+
+ if ((have & need) == need) {
+ /*
+ * Look at (implemented & ~have & not) so that we keep waiting
+ * on transition from wanted -> needed caps. This is needed
+ * for WRBUFFER|WR -> WR to avoid a new WR sync write from
+ * going before a prior buffered writeback happens.
+ */
+ int not = want & ~(have & need);
+ int revoking = implemented & ~have;
+ dout("get_cap_refs %p have %s but not %s (revoking %s)\n",
+ inode, ceph_cap_string(have), ceph_cap_string(not),
+ ceph_cap_string(revoking));
+ if ((revoking & not) == 0) {
+ if (!snap_rwsem_locked &&
+ !ci->i_head_snapc &&
+ (need & CEPH_CAP_FILE_WR)) {
+ if (!down_read_trylock(&mdsc->snap_rwsem)) {
+ /*
+ * we can not call down_read() when
+ * task isn't in TASK_RUNNING state
+ */
+ if (nonblock) {
+ *err = -EAGAIN;
+ ret = 1;
+ goto out_unlock;
+ }
+
+ spin_unlock(&ci->i_ceph_lock);
+ down_read(&mdsc->snap_rwsem);
+ snap_rwsem_locked = true;
+ goto again;
+ }
+ snap_rwsem_locked = true;
+ }
+ *got = need | (have & want);
+ if ((need & CEPH_CAP_FILE_RD) &&
+ !(*got & CEPH_CAP_FILE_CACHE))
+ ceph_disable_fscache_readpage(ci);
+ __take_cap_refs(ci, *got, true);
+ ret = 1;
+ }
+ } else {
+ int session_readonly = false;
+ if ((need & CEPH_CAP_FILE_WR) && ci->i_auth_cap) {
+ struct ceph_mds_session *s = ci->i_auth_cap->session;
+ spin_lock(&s->s_cap_lock);
+ session_readonly = s->s_readonly;
+ spin_unlock(&s->s_cap_lock);
+ }
+ if (session_readonly) {
+ dout("get_cap_refs %p needed %s but mds%d readonly\n",
+ inode, ceph_cap_string(need), ci->i_auth_cap->mds);
+ *err = -EROFS;
+ ret = 1;
+ goto out_unlock;
+ }
+
+ if (ci->i_ceph_flags & CEPH_I_CAP_DROPPED) {
+ int mds_wanted;
+ if (READ_ONCE(mdsc->fsc->mount_state) ==
+ CEPH_MOUNT_SHUTDOWN) {
+ dout("get_cap_refs %p forced umount\n", inode);
+ *err = -EIO;
+ ret = 1;
+ goto out_unlock;
+ }
+ mds_wanted = __ceph_caps_mds_wanted(ci, false);
+ if (need & ~(mds_wanted & need)) {
+ dout("get_cap_refs %p caps were dropped"
+ " (session killed?)\n", inode);
+ *err = -ESTALE;
+ ret = 1;
+ goto out_unlock;
+ }
+ if (!(file_wanted & ~mds_wanted))
+ ci->i_ceph_flags &= ~CEPH_I_CAP_DROPPED;
+ }
+
+ dout("get_cap_refs %p have %s needed %s\n", inode,
+ ceph_cap_string(have), ceph_cap_string(need));
+ }
+out_unlock:
+ spin_unlock(&ci->i_ceph_lock);
+ if (snap_rwsem_locked)
+ up_read(&mdsc->snap_rwsem);
+
+ dout("get_cap_refs %p ret %d got %s\n", inode,
+ ret, ceph_cap_string(*got));
+ return ret;
+}
+
+/*
+ * Check the offset we are writing up to against our current
+ * max_size. If necessary, tell the MDS we want to write to
+ * a larger offset.
+ */
+static void check_max_size(struct inode *inode, loff_t endoff)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int check = 0;
+
+ /* do we need to explicitly request a larger max_size? */
+ spin_lock(&ci->i_ceph_lock);
+ if (endoff >= ci->i_max_size && endoff > ci->i_wanted_max_size) {
+ dout("write %p at large endoff %llu, req max_size\n",
+ inode, endoff);
+ ci->i_wanted_max_size = endoff;
+ }
+ /* duplicate ceph_check_caps()'s logic */
+ if (ci->i_auth_cap &&
+ (ci->i_auth_cap->issued & CEPH_CAP_FILE_WR) &&
+ ci->i_wanted_max_size > ci->i_max_size &&
+ ci->i_wanted_max_size > ci->i_requested_max_size)
+ check = 1;
+ spin_unlock(&ci->i_ceph_lock);
+ if (check)
+ ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
+}
+
+int ceph_try_get_caps(struct ceph_inode_info *ci, int need, int want, int *got)
+{
+ int ret, err = 0;
+
+ BUG_ON(need & ~CEPH_CAP_FILE_RD);
+ BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO));
+ ret = ceph_pool_perm_check(ci, need);
+ if (ret < 0)
+ return ret;
+
+ ret = try_get_cap_refs(ci, need, want, 0, true, got, &err);
+ if (ret) {
+ if (err == -EAGAIN) {
+ ret = 0;
+ } else if (err < 0) {
+ ret = err;
+ }
+ }
+ return ret;
+}
+
+/*
+ * Wait for caps, and take cap references. If we can't get a WR cap
+ * due to a small max_size, make sure we check_max_size (and possibly
+ * ask the mds) so we don't get hung up indefinitely.
+ */
+int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
+ loff_t endoff, int *got, struct page **pinned_page)
+{
+ int _got, ret, err = 0;
+
+ ret = ceph_pool_perm_check(ci, need);
+ if (ret < 0)
+ return ret;
+
+ while (true) {
+ if (endoff > 0)
+ check_max_size(&ci->vfs_inode, endoff);
+
+ err = 0;
+ _got = 0;
+ ret = try_get_cap_refs(ci, need, want, endoff,
+ false, &_got, &err);
+ if (ret) {
+ if (err == -EAGAIN)
+ continue;
+ if (err < 0)
+ ret = err;
+ } else {
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+ add_wait_queue(&ci->i_cap_wq, &wait);
+
+ while (!try_get_cap_refs(ci, need, want, endoff,
+ true, &_got, &err)) {
+ if (signal_pending(current)) {
+ ret = -ERESTARTSYS;
+ break;
+ }
+ wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
+ }
+
+ remove_wait_queue(&ci->i_cap_wq, &wait);
+
+ if (err == -EAGAIN)
+ continue;
+ if (err < 0)
+ ret = err;
+ }
+ if (ret < 0) {
+ if (err == -ESTALE) {
+ /* session was killed, try renew caps */
+ ret = ceph_renew_caps(&ci->vfs_inode);
+ if (ret == 0)
+ continue;
+ }
+ return ret;
+ }
+
+ if (ci->i_inline_version != CEPH_INLINE_NONE &&
+ (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
+ i_size_read(&ci->vfs_inode) > 0) {
+ struct page *page =
+ find_get_page(ci->vfs_inode.i_mapping, 0);
+ if (page) {
+ if (PageUptodate(page)) {
+ *pinned_page = page;
+ break;
+ }
+ put_page(page);
+ }
+ /*
+ * drop cap refs first because getattr while
+ * holding * caps refs can cause deadlock.
+ */
+ ceph_put_cap_refs(ci, _got);
+ _got = 0;
+
+ /*
+ * getattr request will bring inline data into
+ * page cache
+ */
+ ret = __ceph_do_getattr(&ci->vfs_inode, NULL,
+ CEPH_STAT_CAP_INLINE_DATA,
+ true);
+ if (ret < 0)
+ return ret;
+ continue;
+ }
+ break;
+ }
+
+ if ((_got & CEPH_CAP_FILE_RD) && (_got & CEPH_CAP_FILE_CACHE))
+ ceph_fscache_revalidate_cookie(ci);
+
+ *got = _got;
+ return 0;
+}
+
+/*
+ * Take cap refs. Caller must already know we hold at least one ref
+ * on the caps in question or we don't know this is safe.
+ */
+void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
+{
+ spin_lock(&ci->i_ceph_lock);
+ __take_cap_refs(ci, caps, false);
+ spin_unlock(&ci->i_ceph_lock);
+}
+
+
+/*
+ * drop cap_snap that is not associated with any snapshot.
+ * we don't need to send FLUSHSNAP message for it.
+ */
+static int ceph_try_drop_cap_snap(struct ceph_inode_info *ci,
+ struct ceph_cap_snap *capsnap)
+{
+ if (!capsnap->need_flush &&
+ !capsnap->writing && !capsnap->dirty_pages) {
+ dout("dropping cap_snap %p follows %llu\n",
+ capsnap, capsnap->follows);
+ BUG_ON(capsnap->cap_flush.tid > 0);
+ ceph_put_snap_context(capsnap->context);
+ if (!list_is_last(&capsnap->ci_item, &ci->i_cap_snaps))
+ ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS;
+
+ list_del(&capsnap->ci_item);
+ ceph_put_cap_snap(capsnap);
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Release cap refs.
+ *
+ * If we released the last ref on any given cap, call ceph_check_caps
+ * to release (or schedule a release).
+ *
+ * If we are releasing a WR cap (from a sync write), finalize any affected
+ * cap_snap, and wake up any waiters.
+ */
+void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
+{
+ struct inode *inode = &ci->vfs_inode;
+ int last = 0, put = 0, flushsnaps = 0, wake = 0;
+
+ spin_lock(&ci->i_ceph_lock);
+ if (had & CEPH_CAP_PIN)
+ --ci->i_pin_ref;
+ if (had & CEPH_CAP_FILE_RD)
+ if (--ci->i_rd_ref == 0)
+ last++;
+ if (had & CEPH_CAP_FILE_CACHE)
+ if (--ci->i_rdcache_ref == 0)
+ last++;
+ if (had & CEPH_CAP_FILE_BUFFER) {
+ if (--ci->i_wb_ref == 0) {
+ last++;
+ put++;
+ }
+ dout("put_cap_refs %p wb %d -> %d (?)\n",
+ inode, ci->i_wb_ref+1, ci->i_wb_ref);
+ }
+ if (had & CEPH_CAP_FILE_WR)
+ if (--ci->i_wr_ref == 0) {
+ last++;
+ if (__ceph_have_pending_cap_snap(ci)) {
+ struct ceph_cap_snap *capsnap =
+ list_last_entry(&ci->i_cap_snaps,
+ struct ceph_cap_snap,
+ ci_item);
+ capsnap->writing = 0;
+ if (ceph_try_drop_cap_snap(ci, capsnap))
+ put++;
+ else if (__ceph_finish_cap_snap(ci, capsnap))
+ flushsnaps = 1;
+ wake = 1;
+ }
+ if (ci->i_wrbuffer_ref_head == 0 &&
+ ci->i_dirty_caps == 0 &&
+ ci->i_flushing_caps == 0) {
+ BUG_ON(!ci->i_head_snapc);
+ ceph_put_snap_context(ci->i_head_snapc);
+ ci->i_head_snapc = NULL;
+ }
+ /* see comment in __ceph_remove_cap() */
+ if (!__ceph_is_any_caps(ci) && ci->i_snap_realm)
+ drop_inode_snap_realm(ci);
+ }
+ spin_unlock(&ci->i_ceph_lock);
+
+ dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had),
+ last ? " last" : "", put ? " put" : "");
+
+ if (last && !flushsnaps)
+ ceph_check_caps(ci, 0, NULL);
+ else if (flushsnaps)
+ ceph_flush_snaps(ci, NULL);
+ if (wake)
+ wake_up_all(&ci->i_cap_wq);
+ while (put-- > 0)
+ iput(inode);
+}
+
+/*
+ * Release @nr WRBUFFER refs on dirty pages for the given @snapc snap
+ * context. Adjust per-snap dirty page accounting as appropriate.
+ * Once all dirty data for a cap_snap is flushed, flush snapped file
+ * metadata back to the MDS. If we dropped the last ref, call
+ * ceph_check_caps.
+ */
+void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
+ struct ceph_snap_context *snapc)
+{
+ struct inode *inode = &ci->vfs_inode;
+ struct ceph_cap_snap *capsnap = NULL;
+ int put = 0;
+ bool last = false;
+ bool found = false;
+ bool flush_snaps = false;
+ bool complete_capsnap = false;
+
+ spin_lock(&ci->i_ceph_lock);
+ ci->i_wrbuffer_ref -= nr;
+ if (ci->i_wrbuffer_ref == 0) {
+ last = true;
+ put++;
+ }
+
+ if (ci->i_head_snapc == snapc) {
+ ci->i_wrbuffer_ref_head -= nr;
+ if (ci->i_wrbuffer_ref_head == 0 &&
+ ci->i_wr_ref == 0 &&
+ ci->i_dirty_caps == 0 &&
+ ci->i_flushing_caps == 0) {
+ BUG_ON(!ci->i_head_snapc);
+ ceph_put_snap_context(ci->i_head_snapc);
+ ci->i_head_snapc = NULL;
+ }
+ dout("put_wrbuffer_cap_refs on %p head %d/%d -> %d/%d %s\n",
+ inode,
+ ci->i_wrbuffer_ref+nr, ci->i_wrbuffer_ref_head+nr,
+ ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
+ last ? " LAST" : "");
+ } else {
+ list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
+ if (capsnap->context == snapc) {
+ found = true;
+ break;
+ }
+ }
+ BUG_ON(!found);
+ capsnap->dirty_pages -= nr;
+ if (capsnap->dirty_pages == 0) {
+ complete_capsnap = true;
+ if (!capsnap->writing) {
+ if (ceph_try_drop_cap_snap(ci, capsnap)) {
+ put++;
+ } else {
+ ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS;
+ flush_snaps = true;
+ }
+ }
+ }
+ dout("put_wrbuffer_cap_refs on %p cap_snap %p "
+ " snap %lld %d/%d -> %d/%d %s%s\n",
+ inode, capsnap, capsnap->context->seq,
+ ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr,
+ ci->i_wrbuffer_ref, capsnap->dirty_pages,
+ last ? " (wrbuffer last)" : "",
+ complete_capsnap ? " (complete capsnap)" : "");
+ }
+
+ spin_unlock(&ci->i_ceph_lock);
+
+ if (last) {
+ ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
+ } else if (flush_snaps) {
+ ceph_flush_snaps(ci, NULL);
+ }
+ if (complete_capsnap)
+ wake_up_all(&ci->i_cap_wq);
+ while (put-- > 0)
+ iput(inode);
+}
+
+/*
+ * Invalidate unlinked inode's aliases, so we can drop the inode ASAP.
+ */
+static void invalidate_aliases(struct inode *inode)
+{
+ struct dentry *dn, *prev = NULL;
+
+ dout("invalidate_aliases inode %p\n", inode);
+ d_prune_aliases(inode);
+ /*
+ * For non-directory inode, d_find_alias() only returns
+ * hashed dentry. After calling d_invalidate(), the
+ * dentry becomes unhashed.
+ *
+ * For directory inode, d_find_alias() can return
+ * unhashed dentry. But directory inode should have
+ * one alias at most.
+ */
+ while ((dn = d_find_alias(inode))) {
+ if (dn == prev) {
+ dput(dn);
+ break;
+ }
+ d_invalidate(dn);
+ if (prev)
+ dput(prev);
+ prev = dn;
+ }
+ if (prev)
+ dput(prev);
+}
+
+struct cap_extra_info {
+ struct ceph_string *pool_ns;
+ /* inline data */
+ u64 inline_version;
+ void *inline_data;
+ u32 inline_len;
+ /* dirstat */
+ bool dirstat_valid;
+ u64 nfiles;
+ u64 nsubdirs;
+ /* currently issued */
+ int issued;
+};
+
+/*
+ * Handle a cap GRANT message from the MDS. (Note that a GRANT may
+ * actually be a revocation if it specifies a smaller cap set.)
+ *
+ * caller holds s_mutex and i_ceph_lock, we drop both.
+ */
+static void handle_cap_grant(struct inode *inode,
+ struct ceph_mds_session *session,
+ struct ceph_cap *cap,
+ struct ceph_mds_caps *grant,
+ struct ceph_buffer *xattr_buf,
+ struct cap_extra_info *extra_info)
+ __releases(ci->i_ceph_lock)
+ __releases(session->s_mdsc->snap_rwsem)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int seq = le32_to_cpu(grant->seq);
+ int newcaps = le32_to_cpu(grant->caps);
+ int used, wanted, dirty;
+ u64 size = le64_to_cpu(grant->size);
+ u64 max_size = le64_to_cpu(grant->max_size);
+ int check_caps = 0;
+ bool wake = false;
+ bool writeback = false;
+ bool queue_trunc = false;
+ bool queue_invalidate = false;
+ bool deleted_inode = false;
+ bool fill_inline = false;
+
+ dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
+ inode, cap, session->s_mds, seq, ceph_cap_string(newcaps));
+ dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
+ inode->i_size);
+
+
+ /*
+ * auth mds of the inode changed. we received the cap export message,
+ * but still haven't received the cap import message. handle_cap_export
+ * updated the new auth MDS' cap.
+ *
+ * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing a message
+ * that was sent before the cap import message. So don't remove caps.
+ */
+ if (ceph_seq_cmp(seq, cap->seq) <= 0) {
+ WARN_ON(cap != ci->i_auth_cap);
+ WARN_ON(cap->cap_id != le64_to_cpu(grant->cap_id));
+ seq = cap->seq;
+ newcaps |= cap->issued;
+ }
+
+ /*
+ * If CACHE is being revoked, and we have no dirty buffers,
+ * try to invalidate (once). (If there are dirty buffers, we
+ * will invalidate _after_ writeback.)
+ */
+ if (!S_ISDIR(inode->i_mode) && /* don't invalidate readdir cache */
+ ((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
+ (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
+ !(ci->i_wrbuffer_ref || ci->i_wb_ref)) {
+ if (try_nonblocking_invalidate(inode)) {
+ /* there were locked pages.. invalidate later
+ in a separate thread. */
+ if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
+ queue_invalidate = true;
+ ci->i_rdcache_revoking = ci->i_rdcache_gen;
+ }
+ }
+ }
+
+ /* side effects now are allowed */
+ cap->cap_gen = session->s_cap_gen;
+ cap->seq = seq;
+
+ __check_cap_issue(ci, cap, newcaps);
+
+ if ((newcaps & CEPH_CAP_AUTH_SHARED) &&
+ (extra_info->issued & CEPH_CAP_AUTH_EXCL) == 0) {
+ inode->i_mode = le32_to_cpu(grant->mode);
+ inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(grant->uid));
+ inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(grant->gid));
+ dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
+ from_kuid(&init_user_ns, inode->i_uid),
+ from_kgid(&init_user_ns, inode->i_gid));
+ }
+
+ if ((newcaps & CEPH_CAP_LINK_SHARED) &&
+ (extra_info->issued & CEPH_CAP_LINK_EXCL) == 0) {
+ set_nlink(inode, le32_to_cpu(grant->nlink));
+ if (inode->i_nlink == 0 &&
+ (newcaps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL)))
+ deleted_inode = true;
+ }
+
+ if ((extra_info->issued & CEPH_CAP_XATTR_EXCL) == 0 &&
+ grant->xattr_len) {
+ int len = le32_to_cpu(grant->xattr_len);
+ u64 version = le64_to_cpu(grant->xattr_version);
+
+ if (version > ci->i_xattrs.version) {
+ dout(" got new xattrs v%llu on %p len %d\n",
+ version, inode, len);
+ if (ci->i_xattrs.blob)
+ ceph_buffer_put(ci->i_xattrs.blob);
+ ci->i_xattrs.blob = ceph_buffer_get(xattr_buf);
+ ci->i_xattrs.version = version;
+ ceph_forget_all_cached_acls(inode);
+ }
+ }
+
+ if (newcaps & CEPH_CAP_ANY_RD) {
+ struct timespec64 mtime, atime, ctime;
+ /* ctime/mtime/atime? */
+ ceph_decode_timespec64(&mtime, &grant->mtime);
+ ceph_decode_timespec64(&atime, &grant->atime);
+ ceph_decode_timespec64(&ctime, &grant->ctime);
+ ceph_fill_file_time(inode, extra_info->issued,
+ le32_to_cpu(grant->time_warp_seq),
+ &ctime, &mtime, &atime);
+ }
+
+ if ((newcaps & CEPH_CAP_FILE_SHARED) && extra_info->dirstat_valid) {
+ ci->i_files = extra_info->nfiles;
+ ci->i_subdirs = extra_info->nsubdirs;
+ }
+
+ if (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
+ /* file layout may have changed */
+ s64 old_pool = ci->i_layout.pool_id;
+ struct ceph_string *old_ns;
+
+ ceph_file_layout_from_legacy(&ci->i_layout, &grant->layout);
+ old_ns = rcu_dereference_protected(ci->i_layout.pool_ns,
+ lockdep_is_held(&ci->i_ceph_lock));
+ rcu_assign_pointer(ci->i_layout.pool_ns, extra_info->pool_ns);
+
+ if (ci->i_layout.pool_id != old_pool ||
+ extra_info->pool_ns != old_ns)
+ ci->i_ceph_flags &= ~CEPH_I_POOL_PERM;
+
+ extra_info->pool_ns = old_ns;
+
+ /* size/truncate_seq? */
+ queue_trunc = ceph_fill_file_size(inode, extra_info->issued,
+ le32_to_cpu(grant->truncate_seq),
+ le64_to_cpu(grant->truncate_size),
+ size);
+ }
+
+ if (ci->i_auth_cap == cap && (newcaps & CEPH_CAP_ANY_FILE_WR)) {
+ if (max_size != ci->i_max_size) {
+ dout("max_size %lld -> %llu\n",
+ ci->i_max_size, max_size);
+ ci->i_max_size = max_size;
+ if (max_size >= ci->i_wanted_max_size) {
+ ci->i_wanted_max_size = 0; /* reset */
+ ci->i_requested_max_size = 0;
+ }
+ wake = true;
+ } else if (ci->i_wanted_max_size > ci->i_max_size &&
+ ci->i_wanted_max_size > ci->i_requested_max_size) {
+ /* CEPH_CAP_OP_IMPORT */
+ wake = true;
+ }
+ }
+
+ /* check cap bits */
+ wanted = __ceph_caps_wanted(ci);
+ used = __ceph_caps_used(ci);
+ dirty = __ceph_caps_dirty(ci);
+ dout(" my wanted = %s, used = %s, dirty %s\n",
+ ceph_cap_string(wanted),
+ ceph_cap_string(used),
+ ceph_cap_string(dirty));
+ if (wanted != le32_to_cpu(grant->wanted)) {
+ dout("mds wanted %s -> %s\n",
+ ceph_cap_string(le32_to_cpu(grant->wanted)),
+ ceph_cap_string(wanted));
+ /* imported cap may not have correct mds_wanted */
+ if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT)
+ check_caps = 1;
+ }
+
+ /* revocation, grant, or no-op? */
+ if (cap->issued & ~newcaps) {
+ int revoking = cap->issued & ~newcaps;
+
+ dout("revocation: %s -> %s (revoking %s)\n",
+ ceph_cap_string(cap->issued),
+ ceph_cap_string(newcaps),
+ ceph_cap_string(revoking));
+ if (revoking & used & CEPH_CAP_FILE_BUFFER)
+ writeback = true; /* initiate writeback; will delay ack */
+ else if (revoking == CEPH_CAP_FILE_CACHE &&
+ (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
+ queue_invalidate)
+ ; /* do nothing yet, invalidation will be queued */
+ else if (cap == ci->i_auth_cap)
+ check_caps = 1; /* check auth cap only */
+ else
+ check_caps = 2; /* check all caps */
+ cap->issued = newcaps;
+ cap->implemented |= newcaps;
+ } else if (cap->issued == newcaps) {
+ dout("caps unchanged: %s -> %s\n",
+ ceph_cap_string(cap->issued), ceph_cap_string(newcaps));
+ } else {
+ dout("grant: %s -> %s\n", ceph_cap_string(cap->issued),
+ ceph_cap_string(newcaps));
+ /* non-auth MDS is revoking the newly grant caps ? */
+ if (cap == ci->i_auth_cap &&
+ __ceph_caps_revoking_other(ci, cap, newcaps))
+ check_caps = 2;
+
+ cap->issued = newcaps;
+ cap->implemented |= newcaps; /* add bits only, to
+ * avoid stepping on a
+ * pending revocation */
+ wake = true;
+ }
+ BUG_ON(cap->issued & ~cap->implemented);
+
+ if (extra_info->inline_version > 0 &&
+ extra_info->inline_version >= ci->i_inline_version) {
+ ci->i_inline_version = extra_info->inline_version;
+ if (ci->i_inline_version != CEPH_INLINE_NONE &&
+ (newcaps & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)))
+ fill_inline = true;
+ }
+
+ if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) {
+ if (newcaps & ~extra_info->issued)
+ wake = true;
+ kick_flushing_inode_caps(session->s_mdsc, session, inode);
+ up_read(&session->s_mdsc->snap_rwsem);
+ } else {
+ spin_unlock(&ci->i_ceph_lock);
+ }
+
+ if (fill_inline)
+ ceph_fill_inline_data(inode, NULL, extra_info->inline_data,
+ extra_info->inline_len);
+
+ if (queue_trunc)
+ ceph_queue_vmtruncate(inode);
+
+ if (writeback)
+ /*
+ * queue inode for writeback: we can't actually call
+ * filemap_write_and_wait, etc. from message handler
+ * context.
+ */
+ ceph_queue_writeback(inode);
+ if (queue_invalidate)
+ ceph_queue_invalidate(inode);
+ if (deleted_inode)
+ invalidate_aliases(inode);
+ if (wake)
+ wake_up_all(&ci->i_cap_wq);
+
+ if (check_caps == 1)
+ ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY,
+ session);
+ else if (check_caps == 2)
+ ceph_check_caps(ci, CHECK_CAPS_NODELAY, session);
+ else
+ mutex_unlock(&session->s_mutex);
+}
+
+/*
+ * Handle FLUSH_ACK from MDS, indicating that metadata we sent to the
+ * MDS has been safely committed.
+ */
+static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
+ struct ceph_mds_caps *m,
+ struct ceph_mds_session *session,
+ struct ceph_cap *cap)
+ __releases(ci->i_ceph_lock)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_cap_flush *cf, *tmp_cf;
+ LIST_HEAD(to_remove);
+ unsigned seq = le32_to_cpu(m->seq);
+ int dirty = le32_to_cpu(m->dirty);
+ int cleaned = 0;
+ bool drop = false;
+ bool wake_ci = false;
+ bool wake_mdsc = false;
+
+ list_for_each_entry_safe(cf, tmp_cf, &ci->i_cap_flush_list, i_list) {
+ if (cf->tid == flush_tid)
+ cleaned = cf->caps;
+ if (cf->caps == 0) /* capsnap */
+ continue;
+ if (cf->tid <= flush_tid) {
+ if (__finish_cap_flush(NULL, ci, cf))
+ wake_ci = true;
+ list_add_tail(&cf->i_list, &to_remove);
+ } else {
+ cleaned &= ~cf->caps;
+ if (!cleaned)
+ break;
+ }
+ }
+
+ dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s,"
+ " flushing %s -> %s\n",
+ inode, session->s_mds, seq, ceph_cap_string(dirty),
+ ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps),
+ ceph_cap_string(ci->i_flushing_caps & ~cleaned));
+
+ if (list_empty(&to_remove) && !cleaned)
+ goto out;
+
+ ci->i_flushing_caps &= ~cleaned;
+
+ spin_lock(&mdsc->cap_dirty_lock);
+
+ list_for_each_entry(cf, &to_remove, i_list) {
+ if (__finish_cap_flush(mdsc, NULL, cf))
+ wake_mdsc = true;
+ }
+
+ if (ci->i_flushing_caps == 0) {
+ if (list_empty(&ci->i_cap_flush_list)) {
+ list_del_init(&ci->i_flushing_item);
+ if (!list_empty(&session->s_cap_flushing)) {
+ dout(" mds%d still flushing cap on %p\n",
+ session->s_mds,
+ &list_first_entry(&session->s_cap_flushing,
+ struct ceph_inode_info,
+ i_flushing_item)->vfs_inode);
+ }
+ }
+ mdsc->num_cap_flushing--;
+ dout(" inode %p now !flushing\n", inode);
+
+ if (ci->i_dirty_caps == 0) {
+ dout(" inode %p now clean\n", inode);
+ BUG_ON(!list_empty(&ci->i_dirty_item));
+ drop = true;
+ if (ci->i_wr_ref == 0 &&
+ ci->i_wrbuffer_ref_head == 0) {
+ BUG_ON(!ci->i_head_snapc);
+ ceph_put_snap_context(ci->i_head_snapc);
+ ci->i_head_snapc = NULL;
+ }
+ } else {
+ BUG_ON(list_empty(&ci->i_dirty_item));
+ }
+ }
+ spin_unlock(&mdsc->cap_dirty_lock);
+
+out:
+ spin_unlock(&ci->i_ceph_lock);
+
+ while (!list_empty(&to_remove)) {
+ cf = list_first_entry(&to_remove,
+ struct ceph_cap_flush, i_list);
+ list_del(&cf->i_list);
+ ceph_free_cap_flush(cf);
+ }
+
+ if (wake_ci)
+ wake_up_all(&ci->i_cap_wq);
+ if (wake_mdsc)
+ wake_up_all(&mdsc->cap_flushing_wq);
+ if (drop)
+ iput(inode);
+}
+
+/*
+ * Handle FLUSHSNAP_ACK. MDS has flushed snap data to disk and we can
+ * throw away our cap_snap.
+ *
+ * Caller hold s_mutex.
+ */
+static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
+ struct ceph_mds_caps *m,
+ struct ceph_mds_session *session)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+ u64 follows = le64_to_cpu(m->snap_follows);
+ struct ceph_cap_snap *capsnap;
+ bool flushed = false;
+ bool wake_ci = false;
+ bool wake_mdsc = false;
+
+ dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n",
+ inode, ci, session->s_mds, follows);
+
+ spin_lock(&ci->i_ceph_lock);
+ list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
+ if (capsnap->follows == follows) {
+ if (capsnap->cap_flush.tid != flush_tid) {
+ dout(" cap_snap %p follows %lld tid %lld !="
+ " %lld\n", capsnap, follows,
+ flush_tid, capsnap->cap_flush.tid);
+ break;
+ }
+ flushed = true;
+ break;
+ } else {
+ dout(" skipping cap_snap %p follows %lld\n",
+ capsnap, capsnap->follows);
+ }
+ }
+ if (flushed) {
+ WARN_ON(capsnap->dirty_pages || capsnap->writing);
+ dout(" removing %p cap_snap %p follows %lld\n",
+ inode, capsnap, follows);
+ list_del(&capsnap->ci_item);
+ if (__finish_cap_flush(NULL, ci, &capsnap->cap_flush))
+ wake_ci = true;
+
+ spin_lock(&mdsc->cap_dirty_lock);
+
+ if (list_empty(&ci->i_cap_flush_list))
+ list_del_init(&ci->i_flushing_item);
+
+ if (__finish_cap_flush(mdsc, NULL, &capsnap->cap_flush))
+ wake_mdsc = true;
+
+ spin_unlock(&mdsc->cap_dirty_lock);
+ }
+ spin_unlock(&ci->i_ceph_lock);
+ if (flushed) {
+ ceph_put_snap_context(capsnap->context);
+ ceph_put_cap_snap(capsnap);
+ if (wake_ci)
+ wake_up_all(&ci->i_cap_wq);
+ if (wake_mdsc)
+ wake_up_all(&mdsc->cap_flushing_wq);
+ iput(inode);
+ }
+}
+
+/*
+ * Handle TRUNC from MDS, indicating file truncation.
+ *
+ * caller hold s_mutex.
+ */
+static void handle_cap_trunc(struct inode *inode,
+ struct ceph_mds_caps *trunc,
+ struct ceph_mds_session *session)
+ __releases(ci->i_ceph_lock)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int mds = session->s_mds;
+ int seq = le32_to_cpu(trunc->seq);
+ u32 truncate_seq = le32_to_cpu(trunc->truncate_seq);
+ u64 truncate_size = le64_to_cpu(trunc->truncate_size);
+ u64 size = le64_to_cpu(trunc->size);
+ int implemented = 0;
+ int dirty = __ceph_caps_dirty(ci);
+ int issued = __ceph_caps_issued(ceph_inode(inode), &implemented);
+ int queue_trunc = 0;
+
+ issued |= implemented | dirty;
+
+ dout("handle_cap_trunc inode %p mds%d seq %d to %lld seq %d\n",
+ inode, mds, seq, truncate_size, truncate_seq);
+ queue_trunc = ceph_fill_file_size(inode, issued,
+ truncate_seq, truncate_size, size);
+ spin_unlock(&ci->i_ceph_lock);
+
+ if (queue_trunc)
+ ceph_queue_vmtruncate(inode);
+}
+
+/*
+ * Handle EXPORT from MDS. Cap is being migrated _from_ this mds to a
+ * different one. If we are the most recent migration we've seen (as
+ * indicated by mseq), make note of the migrating cap bits for the
+ * duration (until we see the corresponding IMPORT).
+ *
+ * caller holds s_mutex
+ */
+static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
+ struct ceph_mds_cap_peer *ph,
+ struct ceph_mds_session *session)
+{
+ struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+ struct ceph_mds_session *tsession = NULL;
+ struct ceph_cap *cap, *tcap, *new_cap = NULL;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ u64 t_cap_id;
+ unsigned mseq = le32_to_cpu(ex->migrate_seq);
+ unsigned t_seq, t_mseq;
+ int target, issued;
+ int mds = session->s_mds;
+
+ if (ph) {
+ t_cap_id = le64_to_cpu(ph->cap_id);
+ t_seq = le32_to_cpu(ph->seq);
+ t_mseq = le32_to_cpu(ph->mseq);
+ target = le32_to_cpu(ph->mds);
+ } else {
+ t_cap_id = t_seq = t_mseq = 0;
+ target = -1;
+ }
+
+ dout("handle_cap_export inode %p ci %p mds%d mseq %d target %d\n",
+ inode, ci, mds, mseq, target);
+retry:
+ spin_lock(&ci->i_ceph_lock);
+ cap = __get_cap_for_mds(ci, mds);
+ if (!cap || cap->cap_id != le64_to_cpu(ex->cap_id))
+ goto out_unlock;
+
+ if (target < 0) {
+ __ceph_remove_cap(cap, false);
+ if (!ci->i_auth_cap)
+ ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
+ goto out_unlock;
+ }
+
+ /*
+ * now we know we haven't received the cap import message yet
+ * because the exported cap still exist.
+ */
+
+ issued = cap->issued;
+ if (issued != cap->implemented)
+ pr_err_ratelimited("handle_cap_export: issued != implemented: "
+ "ino (%llx.%llx) mds%d seq %d mseq %d "
+ "issued %s implemented %s\n",
+ ceph_vinop(inode), mds, cap->seq, cap->mseq,
+ ceph_cap_string(issued),
+ ceph_cap_string(cap->implemented));
+
+
+ tcap = __get_cap_for_mds(ci, target);
+ if (tcap) {
+ /* already have caps from the target */
+ if (tcap->cap_id == t_cap_id &&
+ ceph_seq_cmp(tcap->seq, t_seq) < 0) {
+ dout(" updating import cap %p mds%d\n", tcap, target);
+ tcap->cap_id = t_cap_id;
+ tcap->seq = t_seq - 1;
+ tcap->issue_seq = t_seq - 1;
+ tcap->issued |= issued;
+ tcap->implemented |= issued;
+ if (cap == ci->i_auth_cap)
+ ci->i_auth_cap = tcap;
+
+ if (!list_empty(&ci->i_cap_flush_list) &&
+ ci->i_auth_cap == tcap) {
+ spin_lock(&mdsc->cap_dirty_lock);
+ list_move_tail(&ci->i_flushing_item,
+ &tcap->session->s_cap_flushing);
+ spin_unlock(&mdsc->cap_dirty_lock);
+ }
+ }
+ __ceph_remove_cap(cap, false);
+ goto out_unlock;
+ } else if (tsession) {
+ /* add placeholder for the export tagert */
+ int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0;
+ tcap = new_cap;
+ ceph_add_cap(inode, tsession, t_cap_id, -1, issued, 0,
+ t_seq - 1, t_mseq, (u64)-1, flag, &new_cap);
+
+ if (!list_empty(&ci->i_cap_flush_list) &&
+ ci->i_auth_cap == tcap) {
+ spin_lock(&mdsc->cap_dirty_lock);
+ list_move_tail(&ci->i_flushing_item,
+ &tcap->session->s_cap_flushing);
+ spin_unlock(&mdsc->cap_dirty_lock);
+ }
+
+ __ceph_remove_cap(cap, false);
+ goto out_unlock;
+ }
+
+ spin_unlock(&ci->i_ceph_lock);
+ mutex_unlock(&session->s_mutex);
+
+ /* open target session */
+ tsession = ceph_mdsc_open_export_target_session(mdsc, target);
+ if (!IS_ERR(tsession)) {
+ if (mds > target) {
+ mutex_lock(&session->s_mutex);
+ mutex_lock_nested(&tsession->s_mutex,
+ SINGLE_DEPTH_NESTING);
+ } else {
+ mutex_lock(&tsession->s_mutex);
+ mutex_lock_nested(&session->s_mutex,
+ SINGLE_DEPTH_NESTING);
+ }
+ new_cap = ceph_get_cap(mdsc, NULL);
+ } else {
+ WARN_ON(1);
+ tsession = NULL;
+ target = -1;
+ mutex_lock(&session->s_mutex);
+ }
+ goto retry;
+
+out_unlock:
+ spin_unlock(&ci->i_ceph_lock);
+ mutex_unlock(&session->s_mutex);
+ if (tsession) {
+ mutex_unlock(&tsession->s_mutex);
+ ceph_put_mds_session(tsession);
+ }
+ if (new_cap)
+ ceph_put_cap(mdsc, new_cap);
+}
+
+/*
+ * Handle cap IMPORT.
+ *
+ * caller holds s_mutex. acquires i_ceph_lock
+ */
+static void handle_cap_import(struct ceph_mds_client *mdsc,
+ struct inode *inode, struct ceph_mds_caps *im,
+ struct ceph_mds_cap_peer *ph,
+ struct ceph_mds_session *session,
+ struct ceph_cap **target_cap, int *old_issued)
+ __acquires(ci->i_ceph_lock)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_cap *cap, *ocap, *new_cap = NULL;
+ int mds = session->s_mds;
+ int issued;
+ unsigned caps = le32_to_cpu(im->caps);
+ unsigned wanted = le32_to_cpu(im->wanted);
+ unsigned seq = le32_to_cpu(im->seq);
+ unsigned mseq = le32_to_cpu(im->migrate_seq);
+ u64 realmino = le64_to_cpu(im->realm);
+ u64 cap_id = le64_to_cpu(im->cap_id);
+ u64 p_cap_id;
+ int peer;
+
+ if (ph) {
+ p_cap_id = le64_to_cpu(ph->cap_id);
+ peer = le32_to_cpu(ph->mds);
+ } else {
+ p_cap_id = 0;
+ peer = -1;
+ }
+
+ dout("handle_cap_import inode %p ci %p mds%d mseq %d peer %d\n",
+ inode, ci, mds, mseq, peer);
+
+retry:
+ spin_lock(&ci->i_ceph_lock);
+ cap = __get_cap_for_mds(ci, mds);
+ if (!cap) {
+ if (!new_cap) {
+ spin_unlock(&ci->i_ceph_lock);
+ new_cap = ceph_get_cap(mdsc, NULL);
+ goto retry;
+ }
+ cap = new_cap;
+ } else {
+ if (new_cap) {
+ ceph_put_cap(mdsc, new_cap);
+ new_cap = NULL;
+ }
+ }
+
+ __ceph_caps_issued(ci, &issued);
+ issued |= __ceph_caps_dirty(ci);
+
+ ceph_add_cap(inode, session, cap_id, -1, caps, wanted, seq, mseq,
+ realmino, CEPH_CAP_FLAG_AUTH, &new_cap);
+
+ ocap = peer >= 0 ? __get_cap_for_mds(ci, peer) : NULL;
+ if (ocap && ocap->cap_id == p_cap_id) {
+ dout(" remove export cap %p mds%d flags %d\n",
+ ocap, peer, ph->flags);
+ if ((ph->flags & CEPH_CAP_FLAG_AUTH) &&
+ (ocap->seq != le32_to_cpu(ph->seq) ||
+ ocap->mseq != le32_to_cpu(ph->mseq))) {
+ pr_err_ratelimited("handle_cap_import: "
+ "mismatched seq/mseq: ino (%llx.%llx) "
+ "mds%d seq %d mseq %d importer mds%d "
+ "has peer seq %d mseq %d\n",
+ ceph_vinop(inode), peer, ocap->seq,
+ ocap->mseq, mds, le32_to_cpu(ph->seq),
+ le32_to_cpu(ph->mseq));
+ }
+ __ceph_remove_cap(ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE));
+ }
+
+ /* make sure we re-request max_size, if necessary */
+ ci->i_requested_max_size = 0;
+
+ *old_issued = issued;
+ *target_cap = cap;
+}
+
+/*
+ * Handle a caps message from the MDS.
+ *
+ * Identify the appropriate session, inode, and call the right handler
+ * based on the cap op.
+ */
+void ceph_handle_caps(struct ceph_mds_session *session,
+ struct ceph_msg *msg)
+{
+ struct ceph_mds_client *mdsc = session->s_mdsc;
+ struct inode *inode;
+ struct ceph_inode_info *ci;
+ struct ceph_cap *cap;
+ struct ceph_mds_caps *h;
+ struct ceph_mds_cap_peer *peer = NULL;
+ struct ceph_snap_realm *realm = NULL;
+ int op;
+ int msg_version = le16_to_cpu(msg->hdr.version);
+ u32 seq, mseq;
+ struct ceph_vino vino;
+ void *snaptrace;
+ size_t snaptrace_len;
+ void *p, *end;
+ struct cap_extra_info extra_info = {};
+
+ dout("handle_caps from mds%d\n", session->s_mds);
+
+ /* decode */
+ end = msg->front.iov_base + msg->front.iov_len;
+ if (msg->front.iov_len < sizeof(*h))
+ goto bad;
+ h = msg->front.iov_base;
+ op = le32_to_cpu(h->op);
+ vino.ino = le64_to_cpu(h->ino);
+ vino.snap = CEPH_NOSNAP;
+ seq = le32_to_cpu(h->seq);
+ mseq = le32_to_cpu(h->migrate_seq);
+
+ snaptrace = h + 1;
+ snaptrace_len = le32_to_cpu(h->snap_trace_len);
+ p = snaptrace + snaptrace_len;
+
+ if (msg_version >= 2) {
+ u32 flock_len;
+ ceph_decode_32_safe(&p, end, flock_len, bad);
+ if (p + flock_len > end)
+ goto bad;
+ p += flock_len;
+ }
+
+ if (msg_version >= 3) {
+ if (op == CEPH_CAP_OP_IMPORT) {
+ if (p + sizeof(*peer) > end)
+ goto bad;
+ peer = p;
+ p += sizeof(*peer);
+ } else if (op == CEPH_CAP_OP_EXPORT) {
+ /* recorded in unused fields */
+ peer = (void *)&h->size;
+ }
+ }
+
+ if (msg_version >= 4) {
+ ceph_decode_64_safe(&p, end, extra_info.inline_version, bad);
+ ceph_decode_32_safe(&p, end, extra_info.inline_len, bad);
+ if (p + extra_info.inline_len > end)
+ goto bad;
+ extra_info.inline_data = p;
+ p += extra_info.inline_len;
+ }
+
+ if (msg_version >= 5) {
+ struct ceph_osd_client *osdc = &mdsc->fsc->client->osdc;
+ u32 epoch_barrier;
+
+ ceph_decode_32_safe(&p, end, epoch_barrier, bad);
+ ceph_osdc_update_epoch_barrier(osdc, epoch_barrier);
+ }
+
+ if (msg_version >= 8) {
+ u64 flush_tid;
+ u32 caller_uid, caller_gid;
+ u32 pool_ns_len;
+
+ /* version >= 6 */
+ ceph_decode_64_safe(&p, end, flush_tid, bad);
+ /* version >= 7 */
+ ceph_decode_32_safe(&p, end, caller_uid, bad);
+ ceph_decode_32_safe(&p, end, caller_gid, bad);
+ /* version >= 8 */
+ ceph_decode_32_safe(&p, end, pool_ns_len, bad);
+ if (pool_ns_len > 0) {
+ ceph_decode_need(&p, end, pool_ns_len, bad);
+ extra_info.pool_ns =
+ ceph_find_or_create_string(p, pool_ns_len);
+ p += pool_ns_len;
+ }
+ }
+
+ if (msg_version >= 11) {
+ struct ceph_timespec *btime;
+ u64 change_attr;
+ u32 flags;
+
+ /* version >= 9 */
+ if (p + sizeof(*btime) > end)
+ goto bad;
+ btime = p;
+ p += sizeof(*btime);
+ ceph_decode_64_safe(&p, end, change_attr, bad);
+ /* version >= 10 */
+ ceph_decode_32_safe(&p, end, flags, bad);
+ /* version >= 11 */
+ extra_info.dirstat_valid = true;
+ ceph_decode_64_safe(&p, end, extra_info.nfiles, bad);
+ ceph_decode_64_safe(&p, end, extra_info.nsubdirs, bad);
+ }
+
+ /* lookup ino */
+ inode = ceph_find_inode(mdsc->fsc->sb, vino);
+ ci = ceph_inode(inode);
+ dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino,
+ vino.snap, inode);
+
+ mutex_lock(&session->s_mutex);
+ session->s_seq++;
+ dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
+ (unsigned)seq);
+
+ if (!inode) {
+ dout(" i don't have ino %llx\n", vino.ino);
+
+ if (op == CEPH_CAP_OP_IMPORT) {
+ cap = ceph_get_cap(mdsc, NULL);
+ cap->cap_ino = vino.ino;
+ cap->queue_release = 1;
+ cap->cap_id = le64_to_cpu(h->cap_id);
+ cap->mseq = mseq;
+ cap->seq = seq;
+ cap->issue_seq = seq;
+ spin_lock(&session->s_cap_lock);
+ list_add_tail(&cap->session_caps,
+ &session->s_cap_releases);
+ session->s_num_cap_releases++;
+ spin_unlock(&session->s_cap_lock);
+ }
+ goto flush_cap_releases;
+ }
+
+ /* these will work even if we don't have a cap yet */
+ switch (op) {
+ case CEPH_CAP_OP_FLUSHSNAP_ACK:
+ handle_cap_flushsnap_ack(inode, le64_to_cpu(msg->hdr.tid),
+ h, session);
+ goto done;
+
+ case CEPH_CAP_OP_EXPORT:
+ handle_cap_export(inode, h, peer, session);
+ goto done_unlocked;
+
+ case CEPH_CAP_OP_IMPORT:
+ realm = NULL;
+ if (snaptrace_len) {
+ down_write(&mdsc->snap_rwsem);
+ ceph_update_snap_trace(mdsc, snaptrace,
+ snaptrace + snaptrace_len,
+ false, &realm);
+ downgrade_write(&mdsc->snap_rwsem);
+ } else {
+ down_read(&mdsc->snap_rwsem);
+ }
+ handle_cap_import(mdsc, inode, h, peer, session,
+ &cap, &extra_info.issued);
+ handle_cap_grant(inode, session, cap,
+ h, msg->middle, &extra_info);
+ if (realm)
+ ceph_put_snap_realm(mdsc, realm);
+ goto done_unlocked;
+ }
+
+ /* the rest require a cap */
+ spin_lock(&ci->i_ceph_lock);
+ cap = __get_cap_for_mds(ceph_inode(inode), session->s_mds);
+ if (!cap) {
+ dout(" no cap on %p ino %llx.%llx from mds%d\n",
+ inode, ceph_ino(inode), ceph_snap(inode),
+ session->s_mds);
+ spin_unlock(&ci->i_ceph_lock);
+ goto flush_cap_releases;
+ }
+
+ /* note that each of these drops i_ceph_lock for us */
+ switch (op) {
+ case CEPH_CAP_OP_REVOKE:
+ case CEPH_CAP_OP_GRANT:
+ __ceph_caps_issued(ci, &extra_info.issued);
+ extra_info.issued |= __ceph_caps_dirty(ci);
+ handle_cap_grant(inode, session, cap,
+ h, msg->middle, &extra_info);
+ goto done_unlocked;
+
+ case CEPH_CAP_OP_FLUSH_ACK:
+ handle_cap_flush_ack(inode, le64_to_cpu(msg->hdr.tid),
+ h, session, cap);
+ break;
+
+ case CEPH_CAP_OP_TRUNC:
+ handle_cap_trunc(inode, h, session);
+ break;
+
+ default:
+ spin_unlock(&ci->i_ceph_lock);
+ pr_err("ceph_handle_caps: unknown cap op %d %s\n", op,
+ ceph_cap_op_name(op));
+ }
+
+ goto done;
+
+flush_cap_releases:
+ /*
+ * send any cap release message to try to move things
+ * along for the mds (who clearly thinks we still have this
+ * cap).
+ */
+ ceph_send_cap_releases(mdsc, session);
+
+done:
+ mutex_unlock(&session->s_mutex);
+done_unlocked:
+ iput(inode);
+ ceph_put_string(extra_info.pool_ns);
+ return;
+
+bad:
+ pr_err("ceph_handle_caps: corrupt message\n");
+ ceph_msg_dump(msg);
+ return;
+}
+
+/*
+ * Delayed work handler to process end of delayed cap release LRU list.
+ */
+void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
+{
+ struct inode *inode;
+ struct ceph_inode_info *ci;
+ int flags = CHECK_CAPS_NODELAY;
+
+ dout("check_delayed_caps\n");
+ while (1) {
+ spin_lock(&mdsc->cap_delay_lock);
+ if (list_empty(&mdsc->cap_delay_list))
+ break;
+ ci = list_first_entry(&mdsc->cap_delay_list,
+ struct ceph_inode_info,
+ i_cap_delay_list);
+ if ((ci->i_ceph_flags & CEPH_I_FLUSH) == 0 &&
+ time_before(jiffies, ci->i_hold_caps_max))
+ break;
+ list_del_init(&ci->i_cap_delay_list);
+
+ inode = igrab(&ci->vfs_inode);
+ spin_unlock(&mdsc->cap_delay_lock);
+
+ if (inode) {
+ dout("check_delayed_caps on %p\n", inode);
+ ceph_check_caps(ci, flags, NULL);
+ iput(inode);
+ }
+ }
+ spin_unlock(&mdsc->cap_delay_lock);
+}
+
+/*
+ * Flush all dirty caps to the mds
+ */
+void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
+{
+ struct ceph_inode_info *ci;
+ struct inode *inode;
+
+ dout("flush_dirty_caps\n");
+ spin_lock(&mdsc->cap_dirty_lock);
+ while (!list_empty(&mdsc->cap_dirty)) {
+ ci = list_first_entry(&mdsc->cap_dirty, struct ceph_inode_info,
+ i_dirty_item);
+ inode = &ci->vfs_inode;
+ ihold(inode);
+ dout("flush_dirty_caps %p\n", inode);
+ spin_unlock(&mdsc->cap_dirty_lock);
+ ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH, NULL);
+ iput(inode);
+ spin_lock(&mdsc->cap_dirty_lock);
+ }
+ spin_unlock(&mdsc->cap_dirty_lock);
+ dout("flush_dirty_caps done\n");
+}
+
+void __ceph_get_fmode(struct ceph_inode_info *ci, int fmode)
+{
+ int i;
+ int bits = (fmode << 1) | 1;
+ for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
+ if (bits & (1 << i))
+ ci->i_nr_by_mode[i]++;
+ }
+}
+
+/*
+ * Drop open file reference. If we were the last open file,
+ * we may need to release capabilities to the MDS (or schedule
+ * their delayed release).
+ */
+void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)
+{
+ int i, last = 0;
+ int bits = (fmode << 1) | 1;
+ spin_lock(&ci->i_ceph_lock);
+ for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
+ if (bits & (1 << i)) {
+ BUG_ON(ci->i_nr_by_mode[i] == 0);
+ if (--ci->i_nr_by_mode[i] == 0)
+ last++;
+ }
+ }
+ dout("put_fmode %p fmode %d {%d,%d,%d,%d}\n",
+ &ci->vfs_inode, fmode,
+ ci->i_nr_by_mode[0], ci->i_nr_by_mode[1],
+ ci->i_nr_by_mode[2], ci->i_nr_by_mode[3]);
+ spin_unlock(&ci->i_ceph_lock);
+
+ if (last && ci->i_vino.snap == CEPH_NOSNAP)
+ ceph_check_caps(ci, 0, NULL);
+}
+
+/*
+ * For a soon-to-be unlinked file, drop the AUTH_RDCACHE caps. If it
+ * looks like the link count will hit 0, drop any other caps (other
+ * than PIN) we don't specifically want (due to the file still being
+ * open).
+ */
+int ceph_drop_caps_for_unlink(struct inode *inode)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
+
+ spin_lock(&ci->i_ceph_lock);
+ if (inode->i_nlink == 1) {
+ drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN);
+
+ ci->i_ceph_flags |= CEPH_I_NODELAY;
+ if (__ceph_caps_dirty(ci)) {
+ struct ceph_mds_client *mdsc =
+ ceph_inode_to_client(inode)->mdsc;
+ __cap_delay_requeue_front(mdsc, ci);
+ }
+ }
+ spin_unlock(&ci->i_ceph_lock);
+ return drop;
+}
+
+/*
+ * Helpers for embedding cap and dentry lease releases into mds
+ * requests.
+ *
+ * @force is used by dentry_release (below) to force inclusion of a
+ * record for the directory inode, even when there aren't any caps to
+ * drop.
+ */
+int ceph_encode_inode_release(void **p, struct inode *inode,
+ int mds, int drop, int unless, int force)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_cap *cap;
+ struct ceph_mds_request_release *rel = *p;
+ int used, dirty;
+ int ret = 0;
+
+ spin_lock(&ci->i_ceph_lock);
+ used = __ceph_caps_used(ci);
+ dirty = __ceph_caps_dirty(ci);
+
+ dout("encode_inode_release %p mds%d used|dirty %s drop %s unless %s\n",
+ inode, mds, ceph_cap_string(used|dirty), ceph_cap_string(drop),
+ ceph_cap_string(unless));
+
+ /* only drop unused, clean caps */
+ drop &= ~(used | dirty);
+
+ cap = __get_cap_for_mds(ci, mds);
+ if (cap && __cap_is_valid(cap)) {
+ unless &= cap->issued;
+ if (unless) {
+ if (unless & CEPH_CAP_AUTH_EXCL)
+ drop &= ~CEPH_CAP_AUTH_SHARED;
+ if (unless & CEPH_CAP_LINK_EXCL)
+ drop &= ~CEPH_CAP_LINK_SHARED;
+ if (unless & CEPH_CAP_XATTR_EXCL)
+ drop &= ~CEPH_CAP_XATTR_SHARED;
+ if (unless & CEPH_CAP_FILE_EXCL)
+ drop &= ~CEPH_CAP_FILE_SHARED;
+ }
+
+ if (force || (cap->issued & drop)) {
+ if (cap->issued & drop) {
+ int wanted = __ceph_caps_wanted(ci);
+ if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0)
+ wanted |= cap->mds_wanted;
+ dout("encode_inode_release %p cap %p "
+ "%s -> %s, wanted %s -> %s\n", inode, cap,
+ ceph_cap_string(cap->issued),
+ ceph_cap_string(cap->issued & ~drop),
+ ceph_cap_string(cap->mds_wanted),
+ ceph_cap_string(wanted));
+
+ cap->issued &= ~drop;
+ cap->implemented &= ~drop;
+ cap->mds_wanted = wanted;
+ } else {
+ dout("encode_inode_release %p cap %p %s"
+ " (force)\n", inode, cap,
+ ceph_cap_string(cap->issued));
+ }
+
+ rel->ino = cpu_to_le64(ceph_ino(inode));
+ rel->cap_id = cpu_to_le64(cap->cap_id);
+ rel->seq = cpu_to_le32(cap->seq);
+ rel->issue_seq = cpu_to_le32(cap->issue_seq);
+ rel->mseq = cpu_to_le32(cap->mseq);
+ rel->caps = cpu_to_le32(cap->implemented);
+ rel->wanted = cpu_to_le32(cap->mds_wanted);
+ rel->dname_len = 0;
+ rel->dname_seq = 0;
+ *p += sizeof(*rel);
+ ret = 1;
+ } else {
+ dout("encode_inode_release %p cap %p %s (noop)\n",
+ inode, cap, ceph_cap_string(cap->issued));
+ }
+ }
+ spin_unlock(&ci->i_ceph_lock);
+ return ret;
+}
+
+int ceph_encode_dentry_release(void **p, struct dentry *dentry,
+ struct inode *dir,
+ int mds, int drop, int unless)
+{
+ struct dentry *parent = NULL;
+ struct ceph_mds_request_release *rel = *p;
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
+ int force = 0;
+ int ret;
+
+ /*
+ * force an record for the directory caps if we have a dentry lease.
+ * this is racy (can't take i_ceph_lock and d_lock together), but it
+ * doesn't have to be perfect; the mds will revoke anything we don't
+ * release.
+ */
+ spin_lock(&dentry->d_lock);
+ if (di->lease_session && di->lease_session->s_mds == mds)
+ force = 1;
+ if (!dir) {
+ parent = dget(dentry->d_parent);
+ dir = d_inode(parent);
+ }
+ spin_unlock(&dentry->d_lock);
+
+ ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force);
+ dput(parent);
+
+ spin_lock(&dentry->d_lock);
+ if (ret && di->lease_session && di->lease_session->s_mds == mds) {
+ dout("encode_dentry_release %p mds%d seq %d\n",
+ dentry, mds, (int)di->lease_seq);
+ rel->dname_len = cpu_to_le32(dentry->d_name.len);
+ memcpy(*p, dentry->d_name.name, dentry->d_name.len);
+ *p += dentry->d_name.len;
+ rel->dname_seq = cpu_to_le32(di->lease_seq);
+ __ceph_mdsc_drop_dentry_lease(dentry);
+ }
+ spin_unlock(&dentry->d_lock);
+ return ret;
+}
diff --git a/fs/ceph/ceph_frag.c b/fs/ceph/ceph_frag.c
new file mode 100644
index 000000000..6f67d5b88
--- /dev/null
+++ b/fs/ceph/ceph_frag.c
@@ -0,0 +1,23 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Ceph 'frag' type
+ */
+#include <linux/module.h>
+#include <linux/ceph/types.h>
+
+int ceph_frag_compare(__u32 a, __u32 b)
+{
+ unsigned va = ceph_frag_value(a);
+ unsigned vb = ceph_frag_value(b);
+ if (va < vb)
+ return -1;
+ if (va > vb)
+ return 1;
+ va = ceph_frag_bits(a);
+ vb = ceph_frag_bits(b);
+ if (va < vb)
+ return -1;
+ if (va > vb)
+ return 1;
+ return 0;
+}
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
new file mode 100644
index 000000000..e6b7d43b5
--- /dev/null
+++ b/fs/ceph/debugfs.c
@@ -0,0 +1,321 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/mon_client.h>
+#include <linux/ceph/auth.h>
+#include <linux/ceph/debugfs.h>
+
+#include "super.h"
+
+#ifdef CONFIG_DEBUG_FS
+
+#include "mds_client.h"
+
+static int mdsmap_show(struct seq_file *s, void *p)
+{
+ int i;
+ struct ceph_fs_client *fsc = s->private;
+ struct ceph_mdsmap *mdsmap;
+
+ if (!fsc->mdsc || !fsc->mdsc->mdsmap)
+ return 0;
+ mdsmap = fsc->mdsc->mdsmap;
+ seq_printf(s, "epoch %d\n", mdsmap->m_epoch);
+ seq_printf(s, "root %d\n", mdsmap->m_root);
+ seq_printf(s, "max_mds %d\n", mdsmap->m_max_mds);
+ seq_printf(s, "session_timeout %d\n", mdsmap->m_session_timeout);
+ seq_printf(s, "session_autoclose %d\n", mdsmap->m_session_autoclose);
+ for (i = 0; i < mdsmap->m_num_mds; i++) {
+ struct ceph_entity_addr *addr = &mdsmap->m_info[i].addr;
+ int state = mdsmap->m_info[i].state;
+ seq_printf(s, "\tmds%d\t%s\t(%s)\n", i,
+ ceph_pr_addr(&addr->in_addr),
+ ceph_mds_state_name(state));
+ }
+ return 0;
+}
+
+/*
+ * mdsc debugfs
+ */
+static int mdsc_show(struct seq_file *s, void *p)
+{
+ struct ceph_fs_client *fsc = s->private;
+ struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_mds_request *req;
+ struct rb_node *rp;
+ int pathlen;
+ u64 pathbase;
+ char *path;
+
+ mutex_lock(&mdsc->mutex);
+ for (rp = rb_first(&mdsc->request_tree); rp; rp = rb_next(rp)) {
+ req = rb_entry(rp, struct ceph_mds_request, r_node);
+
+ if (req->r_request && req->r_session)
+ seq_printf(s, "%lld\tmds%d\t", req->r_tid,
+ req->r_session->s_mds);
+ else if (!req->r_request)
+ seq_printf(s, "%lld\t(no request)\t", req->r_tid);
+ else
+ seq_printf(s, "%lld\t(no session)\t", req->r_tid);
+
+ seq_printf(s, "%s", ceph_mds_op_name(req->r_op));
+
+ if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
+ seq_puts(s, "\t(unsafe)");
+ else
+ seq_puts(s, "\t");
+
+ if (req->r_inode) {
+ seq_printf(s, " #%llx", ceph_ino(req->r_inode));
+ } else if (req->r_dentry) {
+ path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
+ &pathbase, 0);
+ if (IS_ERR(path))
+ path = NULL;
+ spin_lock(&req->r_dentry->d_lock);
+ seq_printf(s, " #%llx/%pd (%s)",
+ ceph_ino(d_inode(req->r_dentry->d_parent)),
+ req->r_dentry,
+ path ? path : "");
+ spin_unlock(&req->r_dentry->d_lock);
+ kfree(path);
+ } else if (req->r_path1) {
+ seq_printf(s, " #%llx/%s", req->r_ino1.ino,
+ req->r_path1);
+ } else {
+ seq_printf(s, " #%llx", req->r_ino1.ino);
+ }
+
+ if (req->r_old_dentry) {
+ path = ceph_mdsc_build_path(req->r_old_dentry, &pathlen,
+ &pathbase, 0);
+ if (IS_ERR(path))
+ path = NULL;
+ spin_lock(&req->r_old_dentry->d_lock);
+ seq_printf(s, " #%llx/%pd (%s)",
+ req->r_old_dentry_dir ?
+ ceph_ino(req->r_old_dentry_dir) : 0,
+ req->r_old_dentry,
+ path ? path : "");
+ spin_unlock(&req->r_old_dentry->d_lock);
+ kfree(path);
+ } else if (req->r_path2 && req->r_op != CEPH_MDS_OP_SYMLINK) {
+ if (req->r_ino2.ino)
+ seq_printf(s, " #%llx/%s", req->r_ino2.ino,
+ req->r_path2);
+ else
+ seq_printf(s, " %s", req->r_path2);
+ }
+
+ seq_puts(s, "\n");
+ }
+ mutex_unlock(&mdsc->mutex);
+
+ return 0;
+}
+
+static int caps_show(struct seq_file *s, void *p)
+{
+ struct ceph_fs_client *fsc = s->private;
+ int total, avail, used, reserved, min;
+
+ ceph_reservation_status(fsc, &total, &avail, &used, &reserved, &min);
+ seq_printf(s, "total\t\t%d\n"
+ "avail\t\t%d\n"
+ "used\t\t%d\n"
+ "reserved\t%d\n"
+ "min\t%d\n",
+ total, avail, used, reserved, min);
+ return 0;
+}
+
+static int dentry_lru_show(struct seq_file *s, void *ptr)
+{
+ struct ceph_fs_client *fsc = s->private;
+ struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_dentry_info *di;
+
+ spin_lock(&mdsc->dentry_lru_lock);
+ list_for_each_entry(di, &mdsc->dentry_lru, lru) {
+ struct dentry *dentry = di->dentry;
+ seq_printf(s, "%p %p\t%pd\n",
+ di, dentry, dentry);
+ }
+ spin_unlock(&mdsc->dentry_lru_lock);
+
+ return 0;
+}
+
+static int mds_sessions_show(struct seq_file *s, void *ptr)
+{
+ struct ceph_fs_client *fsc = s->private;
+ struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_auth_client *ac = fsc->client->monc.auth;
+ struct ceph_options *opt = fsc->client->options;
+ int mds = -1;
+
+ mutex_lock(&mdsc->mutex);
+
+ /* The 'num' portion of an 'entity name' */
+ seq_printf(s, "global_id %llu\n", ac->global_id);
+
+ /* The -o name mount argument */
+ seq_printf(s, "name \"%s\"\n", opt->name ? opt->name : "");
+
+ /* The list of MDS session rank+state */
+ for (mds = 0; mds < mdsc->max_sessions; mds++) {
+ struct ceph_mds_session *session =
+ __ceph_lookup_mds_session(mdsc, mds);
+ if (!session) {
+ continue;
+ }
+ mutex_unlock(&mdsc->mutex);
+ seq_printf(s, "mds.%d %s\n",
+ session->s_mds,
+ ceph_session_state_name(session->s_state));
+
+ ceph_put_mds_session(session);
+ mutex_lock(&mdsc->mutex);
+ }
+ mutex_unlock(&mdsc->mutex);
+
+ return 0;
+}
+
+CEPH_DEFINE_SHOW_FUNC(mdsmap_show)
+CEPH_DEFINE_SHOW_FUNC(mdsc_show)
+CEPH_DEFINE_SHOW_FUNC(caps_show)
+CEPH_DEFINE_SHOW_FUNC(dentry_lru_show)
+CEPH_DEFINE_SHOW_FUNC(mds_sessions_show)
+
+
+/*
+ * debugfs
+ */
+static int congestion_kb_set(void *data, u64 val)
+{
+ struct ceph_fs_client *fsc = (struct ceph_fs_client *)data;
+
+ fsc->mount_options->congestion_kb = (int)val;
+ return 0;
+}
+
+static int congestion_kb_get(void *data, u64 *val)
+{
+ struct ceph_fs_client *fsc = (struct ceph_fs_client *)data;
+
+ *val = (u64)fsc->mount_options->congestion_kb;
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get,
+ congestion_kb_set, "%llu\n");
+
+
+void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
+{
+ dout("ceph_fs_debugfs_cleanup\n");
+ debugfs_remove(fsc->debugfs_bdi);
+ debugfs_remove(fsc->debugfs_congestion_kb);
+ debugfs_remove(fsc->debugfs_mdsmap);
+ debugfs_remove(fsc->debugfs_mds_sessions);
+ debugfs_remove(fsc->debugfs_caps);
+ debugfs_remove(fsc->debugfs_mdsc);
+ debugfs_remove(fsc->debugfs_dentry_lru);
+}
+
+int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
+{
+ char name[100];
+ int err = -ENOMEM;
+
+ dout("ceph_fs_debugfs_init\n");
+ BUG_ON(!fsc->client->debugfs_dir);
+ fsc->debugfs_congestion_kb =
+ debugfs_create_file("writeback_congestion_kb",
+ 0600,
+ fsc->client->debugfs_dir,
+ fsc,
+ &congestion_kb_fops);
+ if (!fsc->debugfs_congestion_kb)
+ goto out;
+
+ snprintf(name, sizeof(name), "../../bdi/%s",
+ bdi_dev_name(fsc->sb->s_bdi));
+ fsc->debugfs_bdi =
+ debugfs_create_symlink("bdi",
+ fsc->client->debugfs_dir,
+ name);
+ if (!fsc->debugfs_bdi)
+ goto out;
+
+ fsc->debugfs_mdsmap = debugfs_create_file("mdsmap",
+ 0400,
+ fsc->client->debugfs_dir,
+ fsc,
+ &mdsmap_show_fops);
+ if (!fsc->debugfs_mdsmap)
+ goto out;
+
+ fsc->debugfs_mds_sessions = debugfs_create_file("mds_sessions",
+ 0400,
+ fsc->client->debugfs_dir,
+ fsc,
+ &mds_sessions_show_fops);
+ if (!fsc->debugfs_mds_sessions)
+ goto out;
+
+ fsc->debugfs_mdsc = debugfs_create_file("mdsc",
+ 0400,
+ fsc->client->debugfs_dir,
+ fsc,
+ &mdsc_show_fops);
+ if (!fsc->debugfs_mdsc)
+ goto out;
+
+ fsc->debugfs_caps = debugfs_create_file("caps",
+ 0400,
+ fsc->client->debugfs_dir,
+ fsc,
+ &caps_show_fops);
+ if (!fsc->debugfs_caps)
+ goto out;
+
+ fsc->debugfs_dentry_lru = debugfs_create_file("dentry_lru",
+ 0400,
+ fsc->client->debugfs_dir,
+ fsc,
+ &dentry_lru_show_fops);
+ if (!fsc->debugfs_dentry_lru)
+ goto out;
+
+ return 0;
+
+out:
+ ceph_fs_debugfs_cleanup(fsc);
+ return err;
+}
+
+
+#else /* CONFIG_DEBUG_FS */
+
+int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
+{
+ return 0;
+}
+
+void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
+{
+}
+
+#endif /* CONFIG_DEBUG_FS */
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
new file mode 100644
index 000000000..7f3f64ba4
--- /dev/null
+++ b/fs/ceph/dir.c
@@ -0,0 +1,1541 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/spinlock.h>
+#include <linux/namei.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/xattr.h>
+
+#include "super.h"
+#include "mds_client.h"
+
+/*
+ * Directory operations: readdir, lookup, create, link, unlink,
+ * rename, etc.
+ */
+
+/*
+ * Ceph MDS operations are specified in terms of a base ino and
+ * relative path. Thus, the client can specify an operation on a
+ * specific inode (e.g., a getattr due to fstat(2)), or as a path
+ * relative to, say, the root directory.
+ *
+ * Normally, we limit ourselves to strict inode ops (no path component)
+ * or dentry operations (a single path component relative to an ino). The
+ * exception to this is open_root_dentry(), which will open the mount
+ * point by name.
+ */
+
+const struct dentry_operations ceph_dentry_ops;
+
+/*
+ * Initialize ceph dentry state.
+ */
+static int ceph_d_init(struct dentry *dentry)
+{
+ struct ceph_dentry_info *di;
+
+ di = kmem_cache_zalloc(ceph_dentry_cachep, GFP_KERNEL);
+ if (!di)
+ return -ENOMEM; /* oh well */
+
+ di->dentry = dentry;
+ di->lease_session = NULL;
+ di->time = jiffies;
+ dentry->d_fsdata = di;
+ ceph_dentry_lru_add(dentry);
+ return 0;
+}
+
+/*
+ * for f_pos for readdir:
+ * - hash order:
+ * (0xff << 52) | ((24 bits hash) << 28) |
+ * (the nth entry has hash collision);
+ * - frag+name order;
+ * ((frag value) << 28) | (the nth entry in frag);
+ */
+#define OFFSET_BITS 28
+#define OFFSET_MASK ((1 << OFFSET_BITS) - 1)
+#define HASH_ORDER (0xffull << (OFFSET_BITS + 24))
+loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order)
+{
+ loff_t fpos = ((loff_t)high << 28) | (loff_t)off;
+ if (hash_order)
+ fpos |= HASH_ORDER;
+ return fpos;
+}
+
+static bool is_hash_order(loff_t p)
+{
+ return (p & HASH_ORDER) == HASH_ORDER;
+}
+
+static unsigned fpos_frag(loff_t p)
+{
+ return p >> OFFSET_BITS;
+}
+
+static unsigned fpos_hash(loff_t p)
+{
+ return ceph_frag_value(fpos_frag(p));
+}
+
+static unsigned fpos_off(loff_t p)
+{
+ return p & OFFSET_MASK;
+}
+
+static int fpos_cmp(loff_t l, loff_t r)
+{
+ int v = ceph_frag_compare(fpos_frag(l), fpos_frag(r));
+ if (v)
+ return v;
+ return (int)(fpos_off(l) - fpos_off(r));
+}
+
+/*
+ * make note of the last dentry we read, so we can
+ * continue at the same lexicographical point,
+ * regardless of what dir changes take place on the
+ * server.
+ */
+static int note_last_dentry(struct ceph_dir_file_info *dfi, const char *name,
+ int len, unsigned next_offset)
+{
+ char *buf = kmalloc(len+1, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+ kfree(dfi->last_name);
+ dfi->last_name = buf;
+ memcpy(dfi->last_name, name, len);
+ dfi->last_name[len] = 0;
+ dfi->next_offset = next_offset;
+ dout("note_last_dentry '%s'\n", dfi->last_name);
+ return 0;
+}
+
+
+static struct dentry *
+__dcache_find_get_entry(struct dentry *parent, u64 idx,
+ struct ceph_readdir_cache_control *cache_ctl)
+{
+ struct inode *dir = d_inode(parent);
+ struct dentry *dentry;
+ unsigned idx_mask = (PAGE_SIZE / sizeof(struct dentry *)) - 1;
+ loff_t ptr_pos = idx * sizeof(struct dentry *);
+ pgoff_t ptr_pgoff = ptr_pos >> PAGE_SHIFT;
+
+ if (ptr_pos >= i_size_read(dir))
+ return NULL;
+
+ if (!cache_ctl->page || ptr_pgoff != page_index(cache_ctl->page)) {
+ ceph_readdir_cache_release(cache_ctl);
+ cache_ctl->page = find_lock_page(&dir->i_data, ptr_pgoff);
+ if (!cache_ctl->page) {
+ dout(" page %lu not found\n", ptr_pgoff);
+ return ERR_PTR(-EAGAIN);
+ }
+ /* reading/filling the cache are serialized by
+ i_mutex, no need to use page lock */
+ unlock_page(cache_ctl->page);
+ cache_ctl->dentries = kmap(cache_ctl->page);
+ }
+
+ cache_ctl->index = idx & idx_mask;
+
+ rcu_read_lock();
+ spin_lock(&parent->d_lock);
+ /* check i_size again here, because empty directory can be
+ * marked as complete while not holding the i_mutex. */
+ if (ceph_dir_is_complete_ordered(dir) && ptr_pos < i_size_read(dir))
+ dentry = cache_ctl->dentries[cache_ctl->index];
+ else
+ dentry = NULL;
+ spin_unlock(&parent->d_lock);
+ if (dentry && !lockref_get_not_dead(&dentry->d_lockref))
+ dentry = NULL;
+ rcu_read_unlock();
+ return dentry ? : ERR_PTR(-EAGAIN);
+}
+
+/*
+ * When possible, we try to satisfy a readdir by peeking at the
+ * dcache. We make this work by carefully ordering dentries on
+ * d_child when we initially get results back from the MDS, and
+ * falling back to a "normal" sync readdir if any dentries in the dir
+ * are dropped.
+ *
+ * Complete dir indicates that we have all dentries in the dir. It is
+ * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by
+ * the MDS if/when the directory is modified).
+ */
+static int __dcache_readdir(struct file *file, struct dir_context *ctx,
+ int shared_gen)
+{
+ struct ceph_dir_file_info *dfi = file->private_data;
+ struct dentry *parent = file->f_path.dentry;
+ struct inode *dir = d_inode(parent);
+ struct dentry *dentry, *last = NULL;
+ struct ceph_dentry_info *di;
+ struct ceph_readdir_cache_control cache_ctl = {};
+ u64 idx = 0;
+ int err = 0;
+
+ dout("__dcache_readdir %p v%u at %llx\n", dir, (unsigned)shared_gen, ctx->pos);
+
+ /* search start position */
+ if (ctx->pos > 2) {
+ u64 count = div_u64(i_size_read(dir), sizeof(struct dentry *));
+ while (count > 0) {
+ u64 step = count >> 1;
+ dentry = __dcache_find_get_entry(parent, idx + step,
+ &cache_ctl);
+ if (!dentry) {
+ /* use linar search */
+ idx = 0;
+ break;
+ }
+ if (IS_ERR(dentry)) {
+ err = PTR_ERR(dentry);
+ goto out;
+ }
+ di = ceph_dentry(dentry);
+ spin_lock(&dentry->d_lock);
+ if (fpos_cmp(di->offset, ctx->pos) < 0) {
+ idx += step + 1;
+ count -= step + 1;
+ } else {
+ count = step;
+ }
+ spin_unlock(&dentry->d_lock);
+ dput(dentry);
+ }
+
+ dout("__dcache_readdir %p cache idx %llu\n", dir, idx);
+ }
+
+
+ for (;;) {
+ bool emit_dentry = false;
+ dentry = __dcache_find_get_entry(parent, idx++, &cache_ctl);
+ if (!dentry) {
+ dfi->file_info.flags |= CEPH_F_ATEND;
+ err = 0;
+ break;
+ }
+ if (IS_ERR(dentry)) {
+ err = PTR_ERR(dentry);
+ goto out;
+ }
+
+ spin_lock(&dentry->d_lock);
+ di = ceph_dentry(dentry);
+ if (d_unhashed(dentry) ||
+ d_really_is_negative(dentry) ||
+ di->lease_shared_gen != shared_gen) {
+ spin_unlock(&dentry->d_lock);
+ dput(dentry);
+ err = -EAGAIN;
+ goto out;
+ }
+ if (fpos_cmp(ctx->pos, di->offset) <= 0) {
+ emit_dentry = true;
+ }
+ spin_unlock(&dentry->d_lock);
+
+ if (emit_dentry) {
+ dout(" %llx dentry %p %pd %p\n", di->offset,
+ dentry, dentry, d_inode(dentry));
+ ctx->pos = di->offset;
+ if (!dir_emit(ctx, dentry->d_name.name,
+ dentry->d_name.len,
+ ceph_translate_ino(dentry->d_sb,
+ d_inode(dentry)->i_ino),
+ d_inode(dentry)->i_mode >> 12)) {
+ dput(dentry);
+ err = 0;
+ break;
+ }
+ ctx->pos++;
+
+ if (last)
+ dput(last);
+ last = dentry;
+ } else {
+ dput(dentry);
+ }
+ }
+out:
+ ceph_readdir_cache_release(&cache_ctl);
+ if (last) {
+ int ret;
+ di = ceph_dentry(last);
+ ret = note_last_dentry(dfi, last->d_name.name, last->d_name.len,
+ fpos_off(di->offset) + 1);
+ if (ret < 0)
+ err = ret;
+ dput(last);
+ /* last_name no longer match cache index */
+ if (dfi->readdir_cache_idx >= 0) {
+ dfi->readdir_cache_idx = -1;
+ dfi->dir_release_count = 0;
+ }
+ }
+ return err;
+}
+
+static bool need_send_readdir(struct ceph_dir_file_info *dfi, loff_t pos)
+{
+ if (!dfi->last_readdir)
+ return true;
+ if (is_hash_order(pos))
+ return !ceph_frag_contains_value(dfi->frag, fpos_hash(pos));
+ else
+ return dfi->frag != fpos_frag(pos);
+}
+
+static int ceph_readdir(struct file *file, struct dir_context *ctx)
+{
+ struct ceph_dir_file_info *dfi = file->private_data;
+ struct inode *inode = file_inode(file);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_mds_client *mdsc = fsc->mdsc;
+ int i;
+ int err;
+ unsigned frag = -1;
+ struct ceph_mds_reply_info_parsed *rinfo;
+
+ dout("readdir %p file %p pos %llx\n", inode, file, ctx->pos);
+ if (dfi->file_info.flags & CEPH_F_ATEND)
+ return 0;
+
+ /* always start with . and .. */
+ if (ctx->pos == 0) {
+ dout("readdir off 0 -> '.'\n");
+ if (!dir_emit(ctx, ".", 1,
+ ceph_translate_ino(inode->i_sb, inode->i_ino),
+ inode->i_mode >> 12))
+ return 0;
+ ctx->pos = 1;
+ }
+ if (ctx->pos == 1) {
+ ino_t ino = parent_ino(file->f_path.dentry);
+ dout("readdir off 1 -> '..'\n");
+ if (!dir_emit(ctx, "..", 2,
+ ceph_translate_ino(inode->i_sb, ino),
+ inode->i_mode >> 12))
+ return 0;
+ ctx->pos = 2;
+ }
+
+ /* can we use the dcache? */
+ spin_lock(&ci->i_ceph_lock);
+ if (ceph_test_mount_opt(fsc, DCACHE) &&
+ !ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&
+ ceph_snap(inode) != CEPH_SNAPDIR &&
+ __ceph_dir_is_complete_ordered(ci) &&
+ __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
+ int shared_gen = atomic_read(&ci->i_shared_gen);
+ spin_unlock(&ci->i_ceph_lock);
+ err = __dcache_readdir(file, ctx, shared_gen);
+ if (err != -EAGAIN)
+ return err;
+ } else {
+ spin_unlock(&ci->i_ceph_lock);
+ }
+
+ /* proceed with a normal readdir */
+more:
+ /* do we have the correct frag content buffered? */
+ if (need_send_readdir(dfi, ctx->pos)) {
+ struct ceph_mds_request *req;
+ int op = ceph_snap(inode) == CEPH_SNAPDIR ?
+ CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR;
+
+ /* discard old result, if any */
+ if (dfi->last_readdir) {
+ ceph_mdsc_put_request(dfi->last_readdir);
+ dfi->last_readdir = NULL;
+ }
+
+ if (is_hash_order(ctx->pos)) {
+ /* fragtree isn't always accurate. choose frag
+ * based on previous reply when possible. */
+ if (frag == (unsigned)-1)
+ frag = ceph_choose_frag(ci, fpos_hash(ctx->pos),
+ NULL, NULL);
+ } else {
+ frag = fpos_frag(ctx->pos);
+ }
+
+ dout("readdir fetching %llx.%llx frag %x offset '%s'\n",
+ ceph_vinop(inode), frag, dfi->last_name);
+ req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+ err = ceph_alloc_readdir_reply_buffer(req, inode);
+ if (err) {
+ ceph_mdsc_put_request(req);
+ return err;
+ }
+ /* hints to request -> mds selection code */
+ req->r_direct_mode = USE_AUTH_MDS;
+ if (op == CEPH_MDS_OP_READDIR) {
+ req->r_direct_hash = ceph_frag_value(frag);
+ __set_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags);
+ req->r_inode_drop = CEPH_CAP_FILE_EXCL;
+ }
+ if (dfi->last_name) {
+ req->r_path2 = kstrdup(dfi->last_name, GFP_KERNEL);
+ if (!req->r_path2) {
+ ceph_mdsc_put_request(req);
+ return -ENOMEM;
+ }
+ } else if (is_hash_order(ctx->pos)) {
+ req->r_args.readdir.offset_hash =
+ cpu_to_le32(fpos_hash(ctx->pos));
+ }
+
+ req->r_dir_release_cnt = dfi->dir_release_count;
+ req->r_dir_ordered_cnt = dfi->dir_ordered_count;
+ req->r_readdir_cache_idx = dfi->readdir_cache_idx;
+ req->r_readdir_offset = dfi->next_offset;
+ req->r_args.readdir.frag = cpu_to_le32(frag);
+ req->r_args.readdir.flags =
+ cpu_to_le16(CEPH_READDIR_REPLY_BITFLAGS);
+
+ req->r_inode = inode;
+ ihold(inode);
+ req->r_dentry = dget(file->f_path.dentry);
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
+ if (err < 0) {
+ ceph_mdsc_put_request(req);
+ return err;
+ }
+ dout("readdir got and parsed readdir result=%d on "
+ "frag %x, end=%d, complete=%d, hash_order=%d\n",
+ err, frag,
+ (int)req->r_reply_info.dir_end,
+ (int)req->r_reply_info.dir_complete,
+ (int)req->r_reply_info.hash_order);
+
+ rinfo = &req->r_reply_info;
+ if (le32_to_cpu(rinfo->dir_dir->frag) != frag) {
+ frag = le32_to_cpu(rinfo->dir_dir->frag);
+ if (!rinfo->hash_order) {
+ dfi->next_offset = req->r_readdir_offset;
+ /* adjust ctx->pos to beginning of frag */
+ ctx->pos = ceph_make_fpos(frag,
+ dfi->next_offset,
+ false);
+ }
+ }
+
+ dfi->frag = frag;
+ dfi->last_readdir = req;
+
+ if (test_bit(CEPH_MDS_R_DID_PREPOPULATE, &req->r_req_flags)) {
+ dfi->readdir_cache_idx = req->r_readdir_cache_idx;
+ if (dfi->readdir_cache_idx < 0) {
+ /* preclude from marking dir ordered */
+ dfi->dir_ordered_count = 0;
+ } else if (ceph_frag_is_leftmost(frag) &&
+ dfi->next_offset == 2) {
+ /* note dir version at start of readdir so
+ * we can tell if any dentries get dropped */
+ dfi->dir_release_count = req->r_dir_release_cnt;
+ dfi->dir_ordered_count = req->r_dir_ordered_cnt;
+ }
+ } else {
+ dout("readdir !did_prepopulate\n");
+ /* disable readdir cache */
+ dfi->readdir_cache_idx = -1;
+ /* preclude from marking dir complete */
+ dfi->dir_release_count = 0;
+ }
+
+ /* note next offset and last dentry name */
+ if (rinfo->dir_nr > 0) {
+ struct ceph_mds_reply_dir_entry *rde =
+ rinfo->dir_entries + (rinfo->dir_nr-1);
+ unsigned next_offset = req->r_reply_info.dir_end ?
+ 2 : (fpos_off(rde->offset) + 1);
+ err = note_last_dentry(dfi, rde->name, rde->name_len,
+ next_offset);
+ if (err)
+ return err;
+ } else if (req->r_reply_info.dir_end) {
+ dfi->next_offset = 2;
+ /* keep last name */
+ }
+ }
+
+ rinfo = &dfi->last_readdir->r_reply_info;
+ dout("readdir frag %x num %d pos %llx chunk first %llx\n",
+ dfi->frag, rinfo->dir_nr, ctx->pos,
+ rinfo->dir_nr ? rinfo->dir_entries[0].offset : 0LL);
+
+ i = 0;
+ /* search start position */
+ if (rinfo->dir_nr > 0) {
+ int step, nr = rinfo->dir_nr;
+ while (nr > 0) {
+ step = nr >> 1;
+ if (rinfo->dir_entries[i + step].offset < ctx->pos) {
+ i += step + 1;
+ nr -= step + 1;
+ } else {
+ nr = step;
+ }
+ }
+ }
+ for (; i < rinfo->dir_nr; i++) {
+ struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
+ struct ceph_vino vino;
+ ino_t ino;
+ u32 ftype;
+
+ BUG_ON(rde->offset < ctx->pos);
+
+ ctx->pos = rde->offset;
+ dout("readdir (%d/%d) -> %llx '%.*s' %p\n",
+ i, rinfo->dir_nr, ctx->pos,
+ rde->name_len, rde->name, &rde->inode.in);
+
+ BUG_ON(!rde->inode.in);
+ ftype = le32_to_cpu(rde->inode.in->mode) >> 12;
+ vino.ino = le64_to_cpu(rde->inode.in->ino);
+ vino.snap = le64_to_cpu(rde->inode.in->snapid);
+ ino = ceph_vino_to_ino(vino);
+
+ if (!dir_emit(ctx, rde->name, rde->name_len,
+ ceph_translate_ino(inode->i_sb, ino), ftype)) {
+ dout("filldir stopping us...\n");
+ return 0;
+ }
+ ctx->pos++;
+ }
+
+ ceph_mdsc_put_request(dfi->last_readdir);
+ dfi->last_readdir = NULL;
+
+ if (dfi->next_offset > 2) {
+ frag = dfi->frag;
+ goto more;
+ }
+
+ /* more frags? */
+ if (!ceph_frag_is_rightmost(dfi->frag)) {
+ frag = ceph_frag_next(dfi->frag);
+ if (is_hash_order(ctx->pos)) {
+ loff_t new_pos = ceph_make_fpos(ceph_frag_value(frag),
+ dfi->next_offset, true);
+ if (new_pos > ctx->pos)
+ ctx->pos = new_pos;
+ /* keep last_name */
+ } else {
+ ctx->pos = ceph_make_fpos(frag, dfi->next_offset,
+ false);
+ kfree(dfi->last_name);
+ dfi->last_name = NULL;
+ }
+ dout("readdir next frag is %x\n", frag);
+ goto more;
+ }
+ dfi->file_info.flags |= CEPH_F_ATEND;
+
+ /*
+ * if dir_release_count still matches the dir, no dentries
+ * were released during the whole readdir, and we should have
+ * the complete dir contents in our cache.
+ */
+ if (atomic64_read(&ci->i_release_count) ==
+ dfi->dir_release_count) {
+ spin_lock(&ci->i_ceph_lock);
+ if (dfi->dir_ordered_count ==
+ atomic64_read(&ci->i_ordered_count)) {
+ dout(" marking %p complete and ordered\n", inode);
+ /* use i_size to track number of entries in
+ * readdir cache */
+ BUG_ON(dfi->readdir_cache_idx < 0);
+ i_size_write(inode, dfi->readdir_cache_idx *
+ sizeof(struct dentry*));
+ } else {
+ dout(" marking %p complete\n", inode);
+ }
+ __ceph_dir_set_complete(ci, dfi->dir_release_count,
+ dfi->dir_ordered_count);
+ spin_unlock(&ci->i_ceph_lock);
+ }
+
+ dout("readdir %p file %p done.\n", inode, file);
+ return 0;
+}
+
+static void reset_readdir(struct ceph_dir_file_info *dfi)
+{
+ if (dfi->last_readdir) {
+ ceph_mdsc_put_request(dfi->last_readdir);
+ dfi->last_readdir = NULL;
+ }
+ kfree(dfi->last_name);
+ dfi->last_name = NULL;
+ dfi->dir_release_count = 0;
+ dfi->readdir_cache_idx = -1;
+ dfi->next_offset = 2; /* compensate for . and .. */
+ dfi->file_info.flags &= ~CEPH_F_ATEND;
+}
+
+/*
+ * discard buffered readdir content on seekdir(0), or seek to new frag,
+ * or seek prior to current chunk
+ */
+static bool need_reset_readdir(struct ceph_dir_file_info *dfi, loff_t new_pos)
+{
+ struct ceph_mds_reply_info_parsed *rinfo;
+ loff_t chunk_offset;
+ if (new_pos == 0)
+ return true;
+ if (is_hash_order(new_pos)) {
+ /* no need to reset last_name for a forward seek when
+ * dentries are sotred in hash order */
+ } else if (dfi->frag != fpos_frag(new_pos)) {
+ return true;
+ }
+ rinfo = dfi->last_readdir ? &dfi->last_readdir->r_reply_info : NULL;
+ if (!rinfo || !rinfo->dir_nr)
+ return true;
+ chunk_offset = rinfo->dir_entries[0].offset;
+ return new_pos < chunk_offset ||
+ is_hash_order(new_pos) != is_hash_order(chunk_offset);
+}
+
+static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
+{
+ struct ceph_dir_file_info *dfi = file->private_data;
+ struct inode *inode = file->f_mapping->host;
+ loff_t retval;
+
+ inode_lock(inode);
+ retval = -EINVAL;
+ switch (whence) {
+ case SEEK_CUR:
+ offset += file->f_pos;
+ case SEEK_SET:
+ break;
+ case SEEK_END:
+ retval = -EOPNOTSUPP;
+ default:
+ goto out;
+ }
+
+ if (offset >= 0) {
+ if (need_reset_readdir(dfi, offset)) {
+ dout("dir_llseek dropping %p content\n", file);
+ reset_readdir(dfi);
+ } else if (is_hash_order(offset) && offset > file->f_pos) {
+ /* for hash offset, we don't know if a forward seek
+ * is within same frag */
+ dfi->dir_release_count = 0;
+ dfi->readdir_cache_idx = -1;
+ }
+
+ if (offset != file->f_pos) {
+ file->f_pos = offset;
+ file->f_version = 0;
+ dfi->file_info.flags &= ~CEPH_F_ATEND;
+ }
+ retval = offset;
+ }
+out:
+ inode_unlock(inode);
+ return retval;
+}
+
+/*
+ * Handle lookups for the hidden .snap directory.
+ */
+int ceph_handle_snapdir(struct ceph_mds_request *req,
+ struct dentry *dentry, int err)
+{
+ struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
+ struct inode *parent = d_inode(dentry->d_parent); /* we hold i_mutex */
+
+ /* .snap dir? */
+ if (err == -ENOENT &&
+ ceph_snap(parent) == CEPH_NOSNAP &&
+ strcmp(dentry->d_name.name,
+ fsc->mount_options->snapdir_name) == 0) {
+ struct inode *inode = ceph_get_snapdir(parent);
+ dout("ENOENT on snapdir %p '%pd', linking to snapdir %p\n",
+ dentry, dentry, inode);
+ BUG_ON(!d_unhashed(dentry));
+ d_add(dentry, inode);
+ err = 0;
+ }
+ return err;
+}
+
+/*
+ * Figure out final result of a lookup/open request.
+ *
+ * Mainly, make sure we return the final req->r_dentry (if it already
+ * existed) in place of the original VFS-provided dentry when they
+ * differ.
+ *
+ * Gracefully handle the case where the MDS replies with -ENOENT and
+ * no trace (which it may do, at its discretion, e.g., if it doesn't
+ * care to issue a lease on the negative dentry).
+ */
+struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
+ struct dentry *dentry, int err)
+{
+ if (err == -ENOENT) {
+ /* no trace? */
+ err = 0;
+ if (!req->r_reply_info.head->is_dentry) {
+ dout("ENOENT and no trace, dentry %p inode %p\n",
+ dentry, d_inode(dentry));
+ if (d_really_is_positive(dentry)) {
+ d_drop(dentry);
+ err = -ENOENT;
+ } else {
+ d_add(dentry, NULL);
+ }
+ }
+ }
+ if (err)
+ dentry = ERR_PTR(err);
+ else if (dentry != req->r_dentry)
+ dentry = dget(req->r_dentry); /* we got spliced */
+ else
+ dentry = NULL;
+ return dentry;
+}
+
+static bool is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
+{
+ return ceph_ino(inode) == CEPH_INO_ROOT &&
+ strncmp(dentry->d_name.name, ".ceph", 5) == 0;
+}
+
+/*
+ * Look up a single dir entry. If there is a lookup intent, inform
+ * the MDS so that it gets our 'caps wanted' value in a single op.
+ */
+static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
+ unsigned int flags)
+{
+ struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+ struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_mds_request *req;
+ int op;
+ int mask;
+ int err;
+
+ dout("lookup %p dentry %p '%pd'\n",
+ dir, dentry, dentry);
+
+ if (dentry->d_name.len > NAME_MAX)
+ return ERR_PTR(-ENAMETOOLONG);
+
+ /* can we conclude ENOENT locally? */
+ if (d_really_is_negative(dentry)) {
+ struct ceph_inode_info *ci = ceph_inode(dir);
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
+
+ spin_lock(&ci->i_ceph_lock);
+ dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);
+ if (strncmp(dentry->d_name.name,
+ fsc->mount_options->snapdir_name,
+ dentry->d_name.len) &&
+ !is_root_ceph_dentry(dir, dentry) &&
+ ceph_test_mount_opt(fsc, DCACHE) &&
+ __ceph_dir_is_complete(ci) &&
+ (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
+ spin_unlock(&ci->i_ceph_lock);
+ dout(" dir %p complete, -ENOENT\n", dir);
+ d_add(dentry, NULL);
+ di->lease_shared_gen = atomic_read(&ci->i_shared_gen);
+ return NULL;
+ }
+ spin_unlock(&ci->i_ceph_lock);
+ }
+
+ op = ceph_snap(dir) == CEPH_SNAPDIR ?
+ CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
+ req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
+ if (IS_ERR(req))
+ return ERR_CAST(req);
+ req->r_dentry = dget(dentry);
+ req->r_num_caps = 2;
+
+ mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
+ if (ceph_security_xattr_wanted(dir))
+ mask |= CEPH_CAP_XATTR_SHARED;
+ req->r_args.getattr.mask = cpu_to_le32(mask);
+
+ req->r_parent = dir;
+ set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
+ err = ceph_handle_snapdir(req, dentry, err);
+ dentry = ceph_finish_lookup(req, dentry, err);
+ ceph_mdsc_put_request(req); /* will dput(dentry) */
+ dout("lookup result=%p\n", dentry);
+ return dentry;
+}
+
+/*
+ * If we do a create but get no trace back from the MDS, follow up with
+ * a lookup (the VFS expects us to link up the provided dentry).
+ */
+int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)
+{
+ struct dentry *result = ceph_lookup(dir, dentry, 0);
+
+ if (result && !IS_ERR(result)) {
+ /*
+ * We created the item, then did a lookup, and found
+ * it was already linked to another inode we already
+ * had in our cache (and thus got spliced). To not
+ * confuse VFS (especially when inode is a directory),
+ * we don't link our dentry to that inode, return an
+ * error instead.
+ *
+ * This event should be rare and it happens only when
+ * we talk to old MDS. Recent MDS does not send traceless
+ * reply for request that creates new inode.
+ */
+ d_drop(result);
+ return -ESTALE;
+ }
+ return PTR_ERR(result);
+}
+
+static int ceph_mknod(struct inode *dir, struct dentry *dentry,
+ umode_t mode, dev_t rdev)
+{
+ struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+ struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_mds_request *req;
+ struct ceph_acls_info acls = {};
+ int err;
+
+ if (ceph_snap(dir) != CEPH_NOSNAP)
+ return -EROFS;
+
+ if (ceph_quota_is_max_files_exceeded(dir)) {
+ err = -EDQUOT;
+ goto out;
+ }
+
+ err = ceph_pre_init_acls(dir, &mode, &acls);
+ if (err < 0)
+ goto out;
+
+ dout("mknod in dir %p dentry %p mode 0%ho rdev %d\n",
+ dir, dentry, mode, rdev);
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_MKNOD, USE_AUTH_MDS);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto out;
+ }
+ req->r_dentry = dget(dentry);
+ req->r_num_caps = 2;
+ req->r_parent = dir;
+ set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
+ req->r_args.mknod.mode = cpu_to_le32(mode);
+ req->r_args.mknod.rdev = cpu_to_le32(rdev);
+ req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL;
+ req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
+ if (acls.pagelist) {
+ req->r_pagelist = acls.pagelist;
+ acls.pagelist = NULL;
+ }
+ err = ceph_mdsc_do_request(mdsc, dir, req);
+ if (!err && !req->r_reply_info.head->is_dentry)
+ err = ceph_handle_notrace_create(dir, dentry);
+ ceph_mdsc_put_request(req);
+out:
+ if (!err)
+ ceph_init_inode_acls(d_inode(dentry), &acls);
+ else
+ d_drop(dentry);
+ ceph_release_acls_info(&acls);
+ return err;
+}
+
+static int ceph_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+ bool excl)
+{
+ return ceph_mknod(dir, dentry, mode, 0);
+}
+
+static int ceph_symlink(struct inode *dir, struct dentry *dentry,
+ const char *dest)
+{
+ struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+ struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_mds_request *req;
+ int err;
+
+ if (ceph_snap(dir) != CEPH_NOSNAP)
+ return -EROFS;
+
+ if (ceph_quota_is_max_files_exceeded(dir)) {
+ err = -EDQUOT;
+ goto out;
+ }
+
+ dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest);
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto out;
+ }
+ req->r_path2 = kstrdup(dest, GFP_KERNEL);
+ if (!req->r_path2) {
+ err = -ENOMEM;
+ ceph_mdsc_put_request(req);
+ goto out;
+ }
+ req->r_parent = dir;
+ set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
+ req->r_dentry = dget(dentry);
+ req->r_num_caps = 2;
+ req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL;
+ req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
+ err = ceph_mdsc_do_request(mdsc, dir, req);
+ if (!err && !req->r_reply_info.head->is_dentry)
+ err = ceph_handle_notrace_create(dir, dentry);
+ ceph_mdsc_put_request(req);
+out:
+ if (err)
+ d_drop(dentry);
+ return err;
+}
+
+static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+ struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+ struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_mds_request *req;
+ struct ceph_acls_info acls = {};
+ int err = -EROFS;
+ int op;
+
+ if (ceph_snap(dir) == CEPH_SNAPDIR) {
+ /* mkdir .snap/foo is a MKSNAP */
+ op = CEPH_MDS_OP_MKSNAP;
+ dout("mksnap dir %p snap '%pd' dn %p\n", dir,
+ dentry, dentry);
+ } else if (ceph_snap(dir) == CEPH_NOSNAP) {
+ dout("mkdir dir %p dn %p mode 0%ho\n", dir, dentry, mode);
+ op = CEPH_MDS_OP_MKDIR;
+ } else {
+ goto out;
+ }
+
+ if (op == CEPH_MDS_OP_MKDIR &&
+ ceph_quota_is_max_files_exceeded(dir)) {
+ err = -EDQUOT;
+ goto out;
+ }
+
+ mode |= S_IFDIR;
+ err = ceph_pre_init_acls(dir, &mode, &acls);
+ if (err < 0)
+ goto out;
+
+ req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto out;
+ }
+
+ req->r_dentry = dget(dentry);
+ req->r_num_caps = 2;
+ req->r_parent = dir;
+ set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
+ req->r_args.mkdir.mode = cpu_to_le32(mode);
+ req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL;
+ req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
+ if (acls.pagelist) {
+ req->r_pagelist = acls.pagelist;
+ acls.pagelist = NULL;
+ }
+ err = ceph_mdsc_do_request(mdsc, dir, req);
+ if (!err &&
+ !req->r_reply_info.head->is_target &&
+ !req->r_reply_info.head->is_dentry)
+ err = ceph_handle_notrace_create(dir, dentry);
+ ceph_mdsc_put_request(req);
+out:
+ if (!err)
+ ceph_init_inode_acls(d_inode(dentry), &acls);
+ else
+ d_drop(dentry);
+ ceph_release_acls_info(&acls);
+ return err;
+}
+
+static int ceph_link(struct dentry *old_dentry, struct inode *dir,
+ struct dentry *dentry)
+{
+ struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+ struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_mds_request *req;
+ int err;
+
+ if (ceph_snap(dir) != CEPH_NOSNAP)
+ return -EROFS;
+
+ dout("link in dir %p old_dentry %p dentry %p\n", dir,
+ old_dentry, dentry);
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LINK, USE_AUTH_MDS);
+ if (IS_ERR(req)) {
+ d_drop(dentry);
+ return PTR_ERR(req);
+ }
+ req->r_dentry = dget(dentry);
+ req->r_num_caps = 2;
+ req->r_old_dentry = dget(old_dentry);
+ req->r_parent = dir;
+ set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
+ req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
+ req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
+ /* release LINK_SHARED on source inode (mds will lock it) */
+ req->r_old_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
+ err = ceph_mdsc_do_request(mdsc, dir, req);
+ if (err) {
+ d_drop(dentry);
+ } else if (!req->r_reply_info.head->is_dentry) {
+ ihold(d_inode(old_dentry));
+ d_instantiate(dentry, d_inode(old_dentry));
+ }
+ ceph_mdsc_put_request(req);
+ return err;
+}
+
+/*
+ * rmdir and unlink are differ only by the metadata op code
+ */
+static int ceph_unlink(struct inode *dir, struct dentry *dentry)
+{
+ struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+ struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct inode *inode = d_inode(dentry);
+ struct ceph_mds_request *req;
+ int err = -EROFS;
+ int op;
+
+ if (ceph_snap(dir) == CEPH_SNAPDIR) {
+ /* rmdir .snap/foo is RMSNAP */
+ dout("rmsnap dir %p '%pd' dn %p\n", dir, dentry, dentry);
+ op = CEPH_MDS_OP_RMSNAP;
+ } else if (ceph_snap(dir) == CEPH_NOSNAP) {
+ dout("unlink/rmdir dir %p dn %p inode %p\n",
+ dir, dentry, inode);
+ op = d_is_dir(dentry) ?
+ CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK;
+ } else
+ goto out;
+ req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto out;
+ }
+ req->r_dentry = dget(dentry);
+ req->r_num_caps = 2;
+ req->r_parent = dir;
+ set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
+ req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
+ req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
+ req->r_inode_drop = ceph_drop_caps_for_unlink(inode);
+ err = ceph_mdsc_do_request(mdsc, dir, req);
+ if (!err && !req->r_reply_info.head->is_dentry)
+ d_delete(dentry);
+ ceph_mdsc_put_request(req);
+out:
+ return err;
+}
+
+static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
+{
+ struct ceph_fs_client *fsc = ceph_sb_to_client(old_dir->i_sb);
+ struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_mds_request *req;
+ int op = CEPH_MDS_OP_RENAME;
+ int err;
+
+ if (flags)
+ return -EINVAL;
+
+ if (ceph_snap(old_dir) != ceph_snap(new_dir))
+ return -EXDEV;
+ if (ceph_snap(old_dir) != CEPH_NOSNAP) {
+ if (old_dir == new_dir && ceph_snap(old_dir) == CEPH_SNAPDIR)
+ op = CEPH_MDS_OP_RENAMESNAP;
+ else
+ return -EROFS;
+ }
+ /* don't allow cross-quota renames */
+ if ((old_dir != new_dir) &&
+ (!ceph_quota_is_same_realm(old_dir, new_dir)))
+ return -EXDEV;
+
+ dout("rename dir %p dentry %p to dir %p dentry %p\n",
+ old_dir, old_dentry, new_dir, new_dentry);
+ req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+ ihold(old_dir);
+ req->r_dentry = dget(new_dentry);
+ req->r_num_caps = 2;
+ req->r_old_dentry = dget(old_dentry);
+ req->r_old_dentry_dir = old_dir;
+ req->r_parent = new_dir;
+ set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
+ req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED;
+ req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL;
+ req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
+ req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
+ /* release LINK_RDCACHE on source inode (mds will lock it) */
+ req->r_old_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
+ if (d_really_is_positive(new_dentry)) {
+ req->r_inode_drop =
+ ceph_drop_caps_for_unlink(d_inode(new_dentry));
+ }
+ err = ceph_mdsc_do_request(mdsc, old_dir, req);
+ if (!err && !req->r_reply_info.head->is_dentry) {
+ /*
+ * Normally d_move() is done by fill_trace (called by
+ * do_request, above). If there is no trace, we need
+ * to do it here.
+ */
+ d_move(old_dentry, new_dentry);
+ }
+ ceph_mdsc_put_request(req);
+ return err;
+}
+
+/*
+ * Ensure a dentry lease will no longer revalidate.
+ */
+void ceph_invalidate_dentry_lease(struct dentry *dentry)
+{
+ spin_lock(&dentry->d_lock);
+ ceph_dentry(dentry)->time = jiffies;
+ ceph_dentry(dentry)->lease_shared_gen = 0;
+ spin_unlock(&dentry->d_lock);
+}
+
+/*
+ * Check if dentry lease is valid. If not, delete the lease. Try to
+ * renew if the least is more than half up.
+ */
+static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags,
+ struct inode *dir)
+{
+ struct ceph_dentry_info *di;
+ struct ceph_mds_session *s;
+ int valid = 0;
+ u32 gen;
+ unsigned long ttl;
+ struct ceph_mds_session *session = NULL;
+ u32 seq = 0;
+
+ spin_lock(&dentry->d_lock);
+ di = ceph_dentry(dentry);
+ if (di && di->lease_session) {
+ s = di->lease_session;
+ spin_lock(&s->s_gen_ttl_lock);
+ gen = s->s_cap_gen;
+ ttl = s->s_cap_ttl;
+ spin_unlock(&s->s_gen_ttl_lock);
+
+ if (di->lease_gen == gen &&
+ time_before(jiffies, di->time) &&
+ time_before(jiffies, ttl)) {
+ valid = 1;
+ if (di->lease_renew_after &&
+ time_after(jiffies, di->lease_renew_after)) {
+ /*
+ * We should renew. If we're in RCU walk mode
+ * though, we can't do that so just return
+ * -ECHILD.
+ */
+ if (flags & LOOKUP_RCU) {
+ valid = -ECHILD;
+ } else {
+ session = ceph_get_mds_session(s);
+ seq = di->lease_seq;
+ di->lease_renew_after = 0;
+ di->lease_renew_from = jiffies;
+ }
+ }
+ }
+ }
+ spin_unlock(&dentry->d_lock);
+
+ if (session) {
+ ceph_mdsc_lease_send_msg(session, dir, dentry,
+ CEPH_MDS_LEASE_RENEW, seq);
+ ceph_put_mds_session(session);
+ }
+ dout("dentry_lease_is_valid - dentry %p = %d\n", dentry, valid);
+ return valid;
+}
+
+/*
+ * Check if directory-wide content lease/cap is valid.
+ */
+static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
+{
+ struct ceph_inode_info *ci = ceph_inode(dir);
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
+ int valid = 0;
+
+ spin_lock(&ci->i_ceph_lock);
+ if (atomic_read(&ci->i_shared_gen) == di->lease_shared_gen)
+ valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1);
+ spin_unlock(&ci->i_ceph_lock);
+ dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n",
+ dir, (unsigned)atomic_read(&ci->i_shared_gen),
+ dentry, (unsigned)di->lease_shared_gen, valid);
+ return valid;
+}
+
+/*
+ * Check if cached dentry can be trusted.
+ */
+static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
+{
+ int valid = 0;
+ struct dentry *parent;
+ struct inode *dir;
+
+ if (flags & LOOKUP_RCU) {
+ parent = READ_ONCE(dentry->d_parent);
+ dir = d_inode_rcu(parent);
+ if (!dir)
+ return -ECHILD;
+ } else {
+ parent = dget_parent(dentry);
+ dir = d_inode(parent);
+ }
+
+ dout("d_revalidate %p '%pd' inode %p offset %lld\n", dentry,
+ dentry, d_inode(dentry), ceph_dentry(dentry)->offset);
+
+ /* always trust cached snapped dentries, snapdir dentry */
+ if (ceph_snap(dir) != CEPH_NOSNAP) {
+ dout("d_revalidate %p '%pd' inode %p is SNAPPED\n", dentry,
+ dentry, d_inode(dentry));
+ valid = 1;
+ } else if (d_really_is_positive(dentry) &&
+ ceph_snap(d_inode(dentry)) == CEPH_SNAPDIR) {
+ valid = 1;
+ } else {
+ valid = dentry_lease_is_valid(dentry, flags, dir);
+ if (valid == -ECHILD)
+ return valid;
+ if (valid || dir_lease_is_valid(dir, dentry)) {
+ if (d_really_is_positive(dentry))
+ valid = ceph_is_any_caps(d_inode(dentry));
+ else
+ valid = 1;
+ }
+ }
+
+ if (!valid) {
+ struct ceph_mds_client *mdsc =
+ ceph_sb_to_client(dir->i_sb)->mdsc;
+ struct ceph_mds_request *req;
+ int op, err;
+ u32 mask;
+
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
+
+ op = ceph_snap(dir) == CEPH_SNAPDIR ?
+ CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
+ req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
+ if (!IS_ERR(req)) {
+ req->r_dentry = dget(dentry);
+ req->r_num_caps = 2;
+ req->r_parent = dir;
+
+ mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
+ if (ceph_security_xattr_wanted(dir))
+ mask |= CEPH_CAP_XATTR_SHARED;
+ req->r_args.getattr.mask = cpu_to_le32(mask);
+
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
+ switch (err) {
+ case 0:
+ if (d_really_is_positive(dentry) &&
+ d_inode(dentry) == req->r_target_inode)
+ valid = 1;
+ break;
+ case -ENOENT:
+ if (d_really_is_negative(dentry))
+ valid = 1;
+ /* Fallthrough */
+ default:
+ break;
+ }
+ ceph_mdsc_put_request(req);
+ dout("d_revalidate %p lookup result=%d\n",
+ dentry, err);
+ }
+ }
+
+ dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid");
+ if (valid) {
+ ceph_dentry_lru_touch(dentry);
+ } else {
+ ceph_dir_clear_complete(dir);
+ }
+
+ if (!(flags & LOOKUP_RCU))
+ dput(parent);
+ return valid;
+}
+
+/*
+ * Release our ceph_dentry_info.
+ */
+static void ceph_d_release(struct dentry *dentry)
+{
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
+
+ dout("d_release %p\n", dentry);
+ ceph_dentry_lru_del(dentry);
+
+ spin_lock(&dentry->d_lock);
+ dentry->d_fsdata = NULL;
+ spin_unlock(&dentry->d_lock);
+
+ if (di->lease_session)
+ ceph_put_mds_session(di->lease_session);
+ kmem_cache_free(ceph_dentry_cachep, di);
+}
+
+/*
+ * When the VFS prunes a dentry from the cache, we need to clear the
+ * complete flag on the parent directory.
+ *
+ * Called under dentry->d_lock.
+ */
+static void ceph_d_prune(struct dentry *dentry)
+{
+ struct ceph_inode_info *dir_ci;
+ struct ceph_dentry_info *di;
+
+ dout("ceph_d_prune %pd %p\n", dentry, dentry);
+
+ /* do we have a valid parent? */
+ if (IS_ROOT(dentry))
+ return;
+
+ /* we hold d_lock, so d_parent is stable */
+ dir_ci = ceph_inode(d_inode(dentry->d_parent));
+ if (dir_ci->i_vino.snap == CEPH_SNAPDIR)
+ return;
+
+ /* who calls d_delete() should also disable dcache readdir */
+ if (d_really_is_negative(dentry))
+ return;
+
+ /* d_fsdata does not get cleared until d_release */
+ if (!d_unhashed(dentry)) {
+ __ceph_dir_clear_complete(dir_ci);
+ return;
+ }
+
+ /* Disable dcache readdir just in case that someone called d_drop()
+ * or d_invalidate(), but MDS didn't revoke CEPH_CAP_FILE_SHARED
+ * properly (dcache readdir is still enabled) */
+ di = ceph_dentry(dentry);
+ if (di->offset > 0 &&
+ di->lease_shared_gen == atomic_read(&dir_ci->i_shared_gen))
+ __ceph_dir_clear_ordered(dir_ci);
+}
+
+/*
+ * read() on a dir. This weird interface hack only works if mounted
+ * with '-o dirstat'.
+ */
+static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
+ loff_t *ppos)
+{
+ struct ceph_dir_file_info *dfi = file->private_data;
+ struct inode *inode = file_inode(file);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int left;
+ const int bufsize = 1024;
+
+ if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
+ return -EISDIR;
+
+ if (!dfi->dir_info) {
+ dfi->dir_info = kmalloc(bufsize, GFP_KERNEL);
+ if (!dfi->dir_info)
+ return -ENOMEM;
+ dfi->dir_info_len =
+ snprintf(dfi->dir_info, bufsize,
+ "entries: %20lld\n"
+ " files: %20lld\n"
+ " subdirs: %20lld\n"
+ "rentries: %20lld\n"
+ " rfiles: %20lld\n"
+ " rsubdirs: %20lld\n"
+ "rbytes: %20lld\n"
+ "rctime: %10lld.%09ld\n",
+ ci->i_files + ci->i_subdirs,
+ ci->i_files,
+ ci->i_subdirs,
+ ci->i_rfiles + ci->i_rsubdirs,
+ ci->i_rfiles,
+ ci->i_rsubdirs,
+ ci->i_rbytes,
+ ci->i_rctime.tv_sec,
+ ci->i_rctime.tv_nsec);
+ }
+
+ if (*ppos >= dfi->dir_info_len)
+ return 0;
+ size = min_t(unsigned, size, dfi->dir_info_len-*ppos);
+ left = copy_to_user(buf, dfi->dir_info + *ppos, size);
+ if (left == size)
+ return -EFAULT;
+ *ppos += (size - left);
+ return size - left;
+}
+
+/*
+ * We maintain a private dentry LRU.
+ *
+ * FIXME: this needs to be changed to a per-mds lru to be useful.
+ */
+void ceph_dentry_lru_add(struct dentry *dn)
+{
+ struct ceph_dentry_info *di = ceph_dentry(dn);
+ struct ceph_mds_client *mdsc;
+
+ dout("dentry_lru_add %p %p '%pd'\n", di, dn, dn);
+ mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
+ spin_lock(&mdsc->dentry_lru_lock);
+ list_add_tail(&di->lru, &mdsc->dentry_lru);
+ mdsc->num_dentry++;
+ spin_unlock(&mdsc->dentry_lru_lock);
+}
+
+void ceph_dentry_lru_touch(struct dentry *dn)
+{
+ struct ceph_dentry_info *di = ceph_dentry(dn);
+ struct ceph_mds_client *mdsc;
+
+ dout("dentry_lru_touch %p %p '%pd' (offset %lld)\n", di, dn, dn,
+ di->offset);
+ mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
+ spin_lock(&mdsc->dentry_lru_lock);
+ list_move_tail(&di->lru, &mdsc->dentry_lru);
+ spin_unlock(&mdsc->dentry_lru_lock);
+}
+
+void ceph_dentry_lru_del(struct dentry *dn)
+{
+ struct ceph_dentry_info *di = ceph_dentry(dn);
+ struct ceph_mds_client *mdsc;
+
+ dout("dentry_lru_del %p %p '%pd'\n", di, dn, dn);
+ mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
+ spin_lock(&mdsc->dentry_lru_lock);
+ list_del_init(&di->lru);
+ mdsc->num_dentry--;
+ spin_unlock(&mdsc->dentry_lru_lock);
+}
+
+/*
+ * Return name hash for a given dentry. This is dependent on
+ * the parent directory's hash function.
+ */
+unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn)
+{
+ struct ceph_inode_info *dci = ceph_inode(dir);
+ unsigned hash;
+
+ switch (dci->i_dir_layout.dl_dir_hash) {
+ case 0: /* for backward compat */
+ case CEPH_STR_HASH_LINUX:
+ return dn->d_name.hash;
+
+ default:
+ spin_lock(&dn->d_lock);
+ hash = ceph_str_hash(dci->i_dir_layout.dl_dir_hash,
+ dn->d_name.name, dn->d_name.len);
+ spin_unlock(&dn->d_lock);
+ return hash;
+ }
+}
+
+const struct file_operations ceph_dir_fops = {
+ .read = ceph_read_dir,
+ .iterate = ceph_readdir,
+ .llseek = ceph_dir_llseek,
+ .open = ceph_open,
+ .release = ceph_release,
+ .unlocked_ioctl = ceph_ioctl,
+ .fsync = ceph_fsync,
+ .lock = ceph_lock,
+ .flock = ceph_flock,
+};
+
+const struct file_operations ceph_snapdir_fops = {
+ .iterate = ceph_readdir,
+ .llseek = ceph_dir_llseek,
+ .open = ceph_open,
+ .release = ceph_release,
+};
+
+const struct inode_operations ceph_dir_iops = {
+ .lookup = ceph_lookup,
+ .permission = ceph_permission,
+ .getattr = ceph_getattr,
+ .setattr = ceph_setattr,
+ .listxattr = ceph_listxattr,
+ .get_acl = ceph_get_acl,
+ .set_acl = ceph_set_acl,
+ .mknod = ceph_mknod,
+ .symlink = ceph_symlink,
+ .mkdir = ceph_mkdir,
+ .link = ceph_link,
+ .unlink = ceph_unlink,
+ .rmdir = ceph_unlink,
+ .rename = ceph_rename,
+ .create = ceph_create,
+ .atomic_open = ceph_atomic_open,
+};
+
+const struct inode_operations ceph_snapdir_iops = {
+ .lookup = ceph_lookup,
+ .permission = ceph_permission,
+ .getattr = ceph_getattr,
+ .mkdir = ceph_mkdir,
+ .rmdir = ceph_unlink,
+ .rename = ceph_rename,
+};
+
+const struct dentry_operations ceph_dentry_ops = {
+ .d_revalidate = ceph_d_revalidate,
+ .d_release = ceph_d_release,
+ .d_prune = ceph_d_prune,
+ .d_init = ceph_d_init,
+};
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
new file mode 100644
index 000000000..4cfe1154d
--- /dev/null
+++ b/fs/ceph/export.c
@@ -0,0 +1,248 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/exportfs.h>
+#include <linux/slab.h>
+#include <asm/unaligned.h>
+
+#include "super.h"
+#include "mds_client.h"
+
+/*
+ * Basic fh
+ */
+struct ceph_nfs_fh {
+ u64 ino;
+} __attribute__ ((packed));
+
+/*
+ * Larger fh that includes parent ino.
+ */
+struct ceph_nfs_confh {
+ u64 ino, parent_ino;
+} __attribute__ ((packed));
+
+static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
+ struct inode *parent_inode)
+{
+ int type;
+ struct ceph_nfs_fh *fh = (void *)rawfh;
+ struct ceph_nfs_confh *cfh = (void *)rawfh;
+ int connected_handle_length = sizeof(*cfh)/4;
+ int handle_length = sizeof(*fh)/4;
+
+ /* don't re-export snaps */
+ if (ceph_snap(inode) != CEPH_NOSNAP)
+ return -EINVAL;
+
+ if (parent_inode && (*max_len < connected_handle_length)) {
+ *max_len = connected_handle_length;
+ return FILEID_INVALID;
+ } else if (*max_len < handle_length) {
+ *max_len = handle_length;
+ return FILEID_INVALID;
+ }
+
+ if (parent_inode) {
+ dout("encode_fh %llx with parent %llx\n",
+ ceph_ino(inode), ceph_ino(parent_inode));
+ cfh->ino = ceph_ino(inode);
+ cfh->parent_ino = ceph_ino(parent_inode);
+ *max_len = connected_handle_length;
+ type = FILEID_INO32_GEN_PARENT;
+ } else {
+ dout("encode_fh %llx\n", ceph_ino(inode));
+ fh->ino = ceph_ino(inode);
+ *max_len = handle_length;
+ type = FILEID_INO32_GEN;
+ }
+ return type;
+}
+
+static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino)
+{
+ struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
+ struct inode *inode;
+ struct ceph_vino vino;
+ int err;
+
+ vino.ino = ino;
+ vino.snap = CEPH_NOSNAP;
+ inode = ceph_find_inode(sb, vino);
+ if (!inode) {
+ struct ceph_mds_request *req;
+ int mask;
+
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPINO,
+ USE_ANY_MDS);
+ if (IS_ERR(req))
+ return ERR_CAST(req);
+
+ mask = CEPH_STAT_CAP_INODE;
+ if (ceph_security_xattr_wanted(d_inode(sb->s_root)))
+ mask |= CEPH_CAP_XATTR_SHARED;
+ req->r_args.getattr.mask = cpu_to_le32(mask);
+
+ req->r_ino1 = vino;
+ req->r_num_caps = 1;
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
+ inode = req->r_target_inode;
+ if (inode)
+ ihold(inode);
+ ceph_mdsc_put_request(req);
+ if (!inode)
+ return ERR_PTR(-ESTALE);
+ if (inode->i_nlink == 0) {
+ iput(inode);
+ return ERR_PTR(-ESTALE);
+ }
+ }
+
+ return d_obtain_alias(inode);
+}
+
+/*
+ * convert regular fh to dentry
+ */
+static struct dentry *ceph_fh_to_dentry(struct super_block *sb,
+ struct fid *fid,
+ int fh_len, int fh_type)
+{
+ struct ceph_nfs_fh *fh = (void *)fid->raw;
+
+ if (fh_type != FILEID_INO32_GEN &&
+ fh_type != FILEID_INO32_GEN_PARENT)
+ return NULL;
+ if (fh_len < sizeof(*fh) / 4)
+ return NULL;
+
+ dout("fh_to_dentry %llx\n", fh->ino);
+ return __fh_to_dentry(sb, fh->ino);
+}
+
+static struct dentry *__get_parent(struct super_block *sb,
+ struct dentry *child, u64 ino)
+{
+ struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
+ struct ceph_mds_request *req;
+ struct inode *inode;
+ int mask;
+ int err;
+
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPPARENT,
+ USE_ANY_MDS);
+ if (IS_ERR(req))
+ return ERR_CAST(req);
+
+ if (child) {
+ req->r_inode = d_inode(child);
+ ihold(d_inode(child));
+ } else {
+ req->r_ino1 = (struct ceph_vino) {
+ .ino = ino,
+ .snap = CEPH_NOSNAP,
+ };
+ }
+
+ mask = CEPH_STAT_CAP_INODE;
+ if (ceph_security_xattr_wanted(d_inode(sb->s_root)))
+ mask |= CEPH_CAP_XATTR_SHARED;
+ req->r_args.getattr.mask = cpu_to_le32(mask);
+
+ req->r_num_caps = 1;
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
+ if (err) {
+ ceph_mdsc_put_request(req);
+ return ERR_PTR(err);
+ }
+
+ inode = req->r_target_inode;
+ if (inode)
+ ihold(inode);
+ ceph_mdsc_put_request(req);
+ if (!inode)
+ return ERR_PTR(-ENOENT);
+
+ return d_obtain_alias(inode);
+}
+
+static struct dentry *ceph_get_parent(struct dentry *child)
+{
+ /* don't re-export snaps */
+ if (ceph_snap(d_inode(child)) != CEPH_NOSNAP)
+ return ERR_PTR(-EINVAL);
+
+ dout("get_parent %p ino %llx.%llx\n",
+ child, ceph_vinop(d_inode(child)));
+ return __get_parent(child->d_sb, child, 0);
+}
+
+/*
+ * convert regular fh to parent
+ */
+static struct dentry *ceph_fh_to_parent(struct super_block *sb,
+ struct fid *fid,
+ int fh_len, int fh_type)
+{
+ struct ceph_nfs_confh *cfh = (void *)fid->raw;
+ struct dentry *dentry;
+
+ if (fh_type != FILEID_INO32_GEN_PARENT)
+ return NULL;
+ if (fh_len < sizeof(*cfh) / 4)
+ return NULL;
+
+ dout("fh_to_parent %llx\n", cfh->parent_ino);
+ dentry = __get_parent(sb, NULL, cfh->ino);
+ if (unlikely(dentry == ERR_PTR(-ENOENT)))
+ dentry = __fh_to_dentry(sb, cfh->parent_ino);
+ return dentry;
+}
+
+static int ceph_get_name(struct dentry *parent, char *name,
+ struct dentry *child)
+{
+ struct ceph_mds_client *mdsc;
+ struct ceph_mds_request *req;
+ int err;
+
+ mdsc = ceph_inode_to_client(d_inode(child))->mdsc;
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPNAME,
+ USE_ANY_MDS);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ inode_lock(d_inode(parent));
+
+ req->r_inode = d_inode(child);
+ ihold(d_inode(child));
+ req->r_ino2 = ceph_vino(d_inode(parent));
+ req->r_parent = d_inode(parent);
+ set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
+ req->r_num_caps = 2;
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
+
+ inode_unlock(d_inode(parent));
+
+ if (!err) {
+ struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
+ memcpy(name, rinfo->dname, rinfo->dname_len);
+ name[rinfo->dname_len] = 0;
+ dout("get_name %p ino %llx.%llx name %s\n",
+ child, ceph_vinop(d_inode(child)), name);
+ } else {
+ dout("get_name %p ino %llx.%llx err %d\n",
+ child, ceph_vinop(d_inode(child)), err);
+ }
+
+ ceph_mdsc_put_request(req);
+ return err;
+}
+
+const struct export_operations ceph_export_ops = {
+ .encode_fh = ceph_encode_fh,
+ .fh_to_dentry = ceph_fh_to_dentry,
+ .fh_to_parent = ceph_fh_to_parent,
+ .get_parent = ceph_get_parent,
+ .get_name = ceph_get_name,
+};
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
new file mode 100644
index 000000000..4ce2752c8
--- /dev/null
+++ b/fs/ceph/file.c
@@ -0,0 +1,1830 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/writeback.h>
+#include <linux/falloc.h>
+
+#include "super.h"
+#include "mds_client.h"
+#include "cache.h"
+
+static __le32 ceph_flags_sys2wire(u32 flags)
+{
+ u32 wire_flags = 0;
+
+ switch (flags & O_ACCMODE) {
+ case O_RDONLY:
+ wire_flags |= CEPH_O_RDONLY;
+ break;
+ case O_WRONLY:
+ wire_flags |= CEPH_O_WRONLY;
+ break;
+ case O_RDWR:
+ wire_flags |= CEPH_O_RDWR;
+ break;
+ }
+
+ flags &= ~O_ACCMODE;
+
+#define ceph_sys2wire(a) if (flags & a) { wire_flags |= CEPH_##a; flags &= ~a; }
+
+ ceph_sys2wire(O_CREAT);
+ ceph_sys2wire(O_EXCL);
+ ceph_sys2wire(O_TRUNC);
+ ceph_sys2wire(O_DIRECTORY);
+ ceph_sys2wire(O_NOFOLLOW);
+
+#undef ceph_sys2wire
+
+ if (flags)
+ dout("unused open flags: %x\n", flags);
+
+ return cpu_to_le32(wire_flags);
+}
+
+/*
+ * Ceph file operations
+ *
+ * Implement basic open/close functionality, and implement
+ * read/write.
+ *
+ * We implement three modes of file I/O:
+ * - buffered uses the generic_file_aio_{read,write} helpers
+ *
+ * - synchronous is used when there is multi-client read/write
+ * sharing, avoids the page cache, and synchronously waits for an
+ * ack from the OSD.
+ *
+ * - direct io takes the variant of the sync path that references
+ * user pages directly.
+ *
+ * fsync() flushes and waits on dirty pages, but just queues metadata
+ * for writeback: since the MDS can recover size and mtime there is no
+ * need to wait for MDS acknowledgement.
+ */
+
+/*
+ * How many pages to get in one call to iov_iter_get_pages(). This
+ * determines the size of the on-stack array used as a buffer.
+ */
+#define ITER_GET_BVECS_PAGES 64
+
+static ssize_t __iter_get_bvecs(struct iov_iter *iter, size_t maxsize,
+ struct bio_vec *bvecs)
+{
+ size_t size = 0;
+ int bvec_idx = 0;
+
+ if (maxsize > iov_iter_count(iter))
+ maxsize = iov_iter_count(iter);
+
+ while (size < maxsize) {
+ struct page *pages[ITER_GET_BVECS_PAGES];
+ ssize_t bytes;
+ size_t start;
+ int idx = 0;
+
+ bytes = iov_iter_get_pages(iter, pages, maxsize - size,
+ ITER_GET_BVECS_PAGES, &start);
+ if (bytes < 0)
+ return size ?: bytes;
+
+ iov_iter_advance(iter, bytes);
+ size += bytes;
+
+ for ( ; bytes; idx++, bvec_idx++) {
+ struct bio_vec bv = {
+ .bv_page = pages[idx],
+ .bv_len = min_t(int, bytes, PAGE_SIZE - start),
+ .bv_offset = start,
+ };
+
+ bvecs[bvec_idx] = bv;
+ bytes -= bv.bv_len;
+ start = 0;
+ }
+ }
+
+ return size;
+}
+
+/*
+ * iov_iter_get_pages() only considers one iov_iter segment, no matter
+ * what maxsize or maxpages are given. For ITER_BVEC that is a single
+ * page.
+ *
+ * Attempt to get up to @maxsize bytes worth of pages from @iter.
+ * Return the number of bytes in the created bio_vec array, or an error.
+ */
+static ssize_t iter_get_bvecs_alloc(struct iov_iter *iter, size_t maxsize,
+ struct bio_vec **bvecs, int *num_bvecs)
+{
+ struct bio_vec *bv;
+ size_t orig_count = iov_iter_count(iter);
+ ssize_t bytes;
+ int npages;
+
+ iov_iter_truncate(iter, maxsize);
+ npages = iov_iter_npages(iter, INT_MAX);
+ iov_iter_reexpand(iter, orig_count);
+
+ /*
+ * __iter_get_bvecs() may populate only part of the array -- zero it
+ * out.
+ */
+ bv = kvmalloc_array(npages, sizeof(*bv), GFP_KERNEL | __GFP_ZERO);
+ if (!bv)
+ return -ENOMEM;
+
+ bytes = __iter_get_bvecs(iter, maxsize, bv);
+ if (bytes < 0) {
+ /*
+ * No pages were pinned -- just free the array.
+ */
+ kvfree(bv);
+ return bytes;
+ }
+
+ *bvecs = bv;
+ *num_bvecs = npages;
+ return bytes;
+}
+
+static void put_bvecs(struct bio_vec *bvecs, int num_bvecs, bool should_dirty)
+{
+ int i;
+
+ for (i = 0; i < num_bvecs; i++) {
+ if (bvecs[i].bv_page) {
+ if (should_dirty)
+ set_page_dirty_lock(bvecs[i].bv_page);
+ put_page(bvecs[i].bv_page);
+ }
+ }
+ kvfree(bvecs);
+}
+
+/*
+ * Prepare an open request. Preallocate ceph_cap to avoid an
+ * inopportune ENOMEM later.
+ */
+static struct ceph_mds_request *
+prepare_open_request(struct super_block *sb, int flags, int create_mode)
+{
+ struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
+ struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_mds_request *req;
+ int want_auth = USE_ANY_MDS;
+ int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN;
+
+ if (flags & (O_WRONLY|O_RDWR|O_CREAT|O_TRUNC))
+ want_auth = USE_AUTH_MDS;
+
+ req = ceph_mdsc_create_request(mdsc, op, want_auth);
+ if (IS_ERR(req))
+ goto out;
+ req->r_fmode = ceph_flags_to_mode(flags);
+ req->r_args.open.flags = ceph_flags_sys2wire(flags);
+ req->r_args.open.mode = cpu_to_le32(create_mode);
+out:
+ return req;
+}
+
+static int ceph_init_file_info(struct inode *inode, struct file *file,
+ int fmode, bool isdir)
+{
+ struct ceph_file_info *fi;
+
+ dout("%s %p %p 0%o (%s)\n", __func__, inode, file,
+ inode->i_mode, isdir ? "dir" : "regular");
+ BUG_ON(inode->i_fop->release != ceph_release);
+
+ if (isdir) {
+ struct ceph_dir_file_info *dfi =
+ kmem_cache_zalloc(ceph_dir_file_cachep, GFP_KERNEL);
+ if (!dfi) {
+ ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
+ return -ENOMEM;
+ }
+
+ file->private_data = dfi;
+ fi = &dfi->file_info;
+ dfi->next_offset = 2;
+ dfi->readdir_cache_idx = -1;
+ } else {
+ fi = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL);
+ if (!fi) {
+ ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
+ return -ENOMEM;
+ }
+
+ file->private_data = fi;
+ }
+
+ fi->fmode = fmode;
+ spin_lock_init(&fi->rw_contexts_lock);
+ INIT_LIST_HEAD(&fi->rw_contexts);
+
+ return 0;
+}
+
+/*
+ * initialize private struct file data.
+ * if we fail, clean up by dropping fmode reference on the ceph_inode
+ */
+static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
+{
+ int ret = 0;
+
+ switch (inode->i_mode & S_IFMT) {
+ case S_IFREG:
+ ceph_fscache_register_inode_cookie(inode);
+ ceph_fscache_file_set_cookie(inode, file);
+ case S_IFDIR:
+ ret = ceph_init_file_info(inode, file, fmode,
+ S_ISDIR(inode->i_mode));
+ if (ret)
+ return ret;
+ break;
+
+ case S_IFLNK:
+ dout("init_file %p %p 0%o (symlink)\n", inode, file,
+ inode->i_mode);
+ ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
+ break;
+
+ default:
+ dout("init_file %p %p 0%o (special)\n", inode, file,
+ inode->i_mode);
+ /*
+ * we need to drop the open ref now, since we don't
+ * have .release set to ceph_release.
+ */
+ ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
+ BUG_ON(inode->i_fop->release == ceph_release);
+
+ /* call the proper open fop */
+ ret = inode->i_fop->open(inode, file);
+ }
+ return ret;
+}
+
+/*
+ * try renew caps after session gets killed.
+ */
+int ceph_renew_caps(struct inode *inode)
+{
+ struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_mds_request *req;
+ int err, flags, wanted;
+
+ spin_lock(&ci->i_ceph_lock);
+ wanted = __ceph_caps_file_wanted(ci);
+ if (__ceph_is_any_real_caps(ci) &&
+ (!(wanted & CEPH_CAP_ANY_WR) || ci->i_auth_cap)) {
+ int issued = __ceph_caps_issued(ci, NULL);
+ spin_unlock(&ci->i_ceph_lock);
+ dout("renew caps %p want %s issued %s updating mds_wanted\n",
+ inode, ceph_cap_string(wanted), ceph_cap_string(issued));
+ ceph_check_caps(ci, 0, NULL);
+ return 0;
+ }
+ spin_unlock(&ci->i_ceph_lock);
+
+ flags = 0;
+ if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
+ flags = O_RDWR;
+ else if (wanted & CEPH_CAP_FILE_RD)
+ flags = O_RDONLY;
+ else if (wanted & CEPH_CAP_FILE_WR)
+ flags = O_WRONLY;
+#ifdef O_LAZY
+ if (wanted & CEPH_CAP_FILE_LAZYIO)
+ flags |= O_LAZY;
+#endif
+
+ req = prepare_open_request(inode->i_sb, flags, 0);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto out;
+ }
+
+ req->r_inode = inode;
+ ihold(inode);
+ req->r_num_caps = 1;
+ req->r_fmode = -1;
+
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
+ ceph_mdsc_put_request(req);
+out:
+ dout("renew caps %p open result=%d\n", inode, err);
+ return err < 0 ? err : 0;
+}
+
+/*
+ * If we already have the requisite capabilities, we can satisfy
+ * the open request locally (no need to request new caps from the
+ * MDS). We do, however, need to inform the MDS (asynchronously)
+ * if our wanted caps set expands.
+ */
+int ceph_open(struct inode *inode, struct file *file)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
+ struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_mds_request *req;
+ struct ceph_file_info *fi = file->private_data;
+ int err;
+ int flags, fmode, wanted;
+
+ if (fi) {
+ dout("open file %p is already opened\n", file);
+ return 0;
+ }
+
+ /* filter out O_CREAT|O_EXCL; vfs did that already. yuck. */
+ flags = file->f_flags & ~(O_CREAT|O_EXCL);
+ if (S_ISDIR(inode->i_mode))
+ flags = O_DIRECTORY; /* mds likes to know */
+
+ dout("open inode %p ino %llx.%llx file %p flags %d (%d)\n", inode,
+ ceph_vinop(inode), file, flags, file->f_flags);
+ fmode = ceph_flags_to_mode(flags);
+ wanted = ceph_caps_for_mode(fmode);
+
+ /* snapped files are read-only */
+ if (ceph_snap(inode) != CEPH_NOSNAP && (file->f_mode & FMODE_WRITE))
+ return -EROFS;
+
+ /* trivially open snapdir */
+ if (ceph_snap(inode) == CEPH_SNAPDIR) {
+ spin_lock(&ci->i_ceph_lock);
+ __ceph_get_fmode(ci, fmode);
+ spin_unlock(&ci->i_ceph_lock);
+ return ceph_init_file(inode, file, fmode);
+ }
+
+ /*
+ * No need to block if we have caps on the auth MDS (for
+ * write) or any MDS (for read). Update wanted set
+ * asynchronously.
+ */
+ spin_lock(&ci->i_ceph_lock);
+ if (__ceph_is_any_real_caps(ci) &&
+ (((fmode & CEPH_FILE_MODE_WR) == 0) || ci->i_auth_cap)) {
+ int mds_wanted = __ceph_caps_mds_wanted(ci, true);
+ int issued = __ceph_caps_issued(ci, NULL);
+
+ dout("open %p fmode %d want %s issued %s using existing\n",
+ inode, fmode, ceph_cap_string(wanted),
+ ceph_cap_string(issued));
+ __ceph_get_fmode(ci, fmode);
+ spin_unlock(&ci->i_ceph_lock);
+
+ /* adjust wanted? */
+ if ((issued & wanted) != wanted &&
+ (mds_wanted & wanted) != wanted &&
+ ceph_snap(inode) != CEPH_SNAPDIR)
+ ceph_check_caps(ci, 0, NULL);
+
+ return ceph_init_file(inode, file, fmode);
+ } else if (ceph_snap(inode) != CEPH_NOSNAP &&
+ (ci->i_snap_caps & wanted) == wanted) {
+ __ceph_get_fmode(ci, fmode);
+ spin_unlock(&ci->i_ceph_lock);
+ return ceph_init_file(inode, file, fmode);
+ }
+
+ spin_unlock(&ci->i_ceph_lock);
+
+ dout("open fmode %d wants %s\n", fmode, ceph_cap_string(wanted));
+ req = prepare_open_request(inode->i_sb, flags, 0);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto out;
+ }
+ req->r_inode = inode;
+ ihold(inode);
+
+ req->r_num_caps = 1;
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
+ if (!err)
+ err = ceph_init_file(inode, file, req->r_fmode);
+ ceph_mdsc_put_request(req);
+ dout("open result=%d on %llx.%llx\n", err, ceph_vinop(inode));
+out:
+ return err;
+}
+
+
+/*
+ * Do a lookup + open with a single request. If we get a non-existent
+ * file or symlink, return 1 so the VFS can retry.
+ */
+int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
+ struct file *file, unsigned flags, umode_t mode)
+{
+ struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+ struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_mds_request *req;
+ struct dentry *dn;
+ struct ceph_acls_info acls = {};
+ int mask;
+ int err;
+
+ dout("atomic_open %p dentry %p '%pd' %s flags %d mode 0%o\n",
+ dir, dentry, dentry,
+ d_unhashed(dentry) ? "unhashed" : "hashed", flags, mode);
+
+ if (dentry->d_name.len > NAME_MAX)
+ return -ENAMETOOLONG;
+
+ if (flags & O_CREAT) {
+ if (ceph_quota_is_max_files_exceeded(dir))
+ return -EDQUOT;
+ err = ceph_pre_init_acls(dir, &mode, &acls);
+ if (err < 0)
+ return err;
+ }
+
+ /* do the open */
+ req = prepare_open_request(dir->i_sb, flags, mode);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto out_acl;
+ }
+ req->r_dentry = dget(dentry);
+ req->r_num_caps = 2;
+ if (flags & O_CREAT) {
+ req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL;
+ req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
+ if (acls.pagelist) {
+ req->r_pagelist = acls.pagelist;
+ acls.pagelist = NULL;
+ }
+ }
+
+ mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
+ if (ceph_security_xattr_wanted(dir))
+ mask |= CEPH_CAP_XATTR_SHARED;
+ req->r_args.open.mask = cpu_to_le32(mask);
+
+ req->r_parent = dir;
+ set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
+ err = ceph_mdsc_do_request(mdsc,
+ (flags & (O_CREAT|O_TRUNC)) ? dir : NULL,
+ req);
+ err = ceph_handle_snapdir(req, dentry, err);
+ if (err)
+ goto out_req;
+
+ if ((flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
+ err = ceph_handle_notrace_create(dir, dentry);
+
+ if (d_in_lookup(dentry)) {
+ dn = ceph_finish_lookup(req, dentry, err);
+ if (IS_ERR(dn))
+ err = PTR_ERR(dn);
+ } else {
+ /* we were given a hashed negative dentry */
+ dn = NULL;
+ }
+ if (err)
+ goto out_req;
+ if (dn || d_really_is_negative(dentry) || d_is_symlink(dentry)) {
+ /* make vfs retry on splice, ENOENT, or symlink */
+ dout("atomic_open finish_no_open on dn %p\n", dn);
+ err = finish_no_open(file, dn);
+ } else {
+ dout("atomic_open finish_open on dn %p\n", dn);
+ if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) {
+ ceph_init_inode_acls(d_inode(dentry), &acls);
+ file->f_mode |= FMODE_CREATED;
+ }
+ err = finish_open(file, dentry, ceph_open);
+ }
+out_req:
+ if (!req->r_err && req->r_target_inode)
+ ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode);
+ ceph_mdsc_put_request(req);
+out_acl:
+ ceph_release_acls_info(&acls);
+ dout("atomic_open result=%d\n", err);
+ return err;
+}
+
+int ceph_release(struct inode *inode, struct file *file)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+
+ if (S_ISDIR(inode->i_mode)) {
+ struct ceph_dir_file_info *dfi = file->private_data;
+ dout("release inode %p dir file %p\n", inode, file);
+ WARN_ON(!list_empty(&dfi->file_info.rw_contexts));
+
+ ceph_put_fmode(ci, dfi->file_info.fmode);
+
+ if (dfi->last_readdir)
+ ceph_mdsc_put_request(dfi->last_readdir);
+ kfree(dfi->last_name);
+ kfree(dfi->dir_info);
+ kmem_cache_free(ceph_dir_file_cachep, dfi);
+ } else {
+ struct ceph_file_info *fi = file->private_data;
+ dout("release inode %p regular file %p\n", inode, file);
+ WARN_ON(!list_empty(&fi->rw_contexts));
+
+ ceph_put_fmode(ci, fi->fmode);
+ kmem_cache_free(ceph_file_cachep, fi);
+ }
+
+ /* wake up anyone waiting for caps on this inode */
+ wake_up_all(&ci->i_cap_wq);
+ return 0;
+}
+
+enum {
+ HAVE_RETRIED = 1,
+ CHECK_EOF = 2,
+ READ_INLINE = 3,
+};
+
+/*
+ * Read a range of bytes striped over one or more objects. Iterate over
+ * objects we stripe over. (That's not atomic, but good enough for now.)
+ *
+ * If we get a short result from the OSD, check against i_size; we need to
+ * only return a short read to the caller if we hit EOF.
+ */
+static int striped_read(struct inode *inode,
+ u64 pos, u64 len,
+ struct page **pages, int num_pages,
+ int page_align, int *checkeof)
+{
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ u64 this_len;
+ loff_t i_size;
+ int page_idx;
+ int ret, read = 0;
+ bool hit_stripe, was_short;
+
+ /*
+ * we may need to do multiple reads. not atomic, unfortunately.
+ */
+more:
+ this_len = len;
+ page_idx = (page_align + read) >> PAGE_SHIFT;
+ ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
+ &ci->i_layout, pos, &this_len,
+ ci->i_truncate_seq, ci->i_truncate_size,
+ pages + page_idx, num_pages - page_idx,
+ ((page_align + read) & ~PAGE_MASK));
+ if (ret == -ENOENT)
+ ret = 0;
+ hit_stripe = this_len < len;
+ was_short = ret >= 0 && ret < this_len;
+ dout("striped_read %llu~%llu (read %u) got %d%s%s\n", pos, len, read,
+ ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : "");
+
+ i_size = i_size_read(inode);
+ if (ret >= 0) {
+ if (was_short && (pos + ret < i_size)) {
+ int zlen = min(this_len - ret, i_size - pos - ret);
+ int zoff = page_align + read + ret;
+ dout(" zero gap %llu to %llu\n",
+ pos + ret, pos + ret + zlen);
+ ceph_zero_page_vector_range(zoff, zlen, pages);
+ ret += zlen;
+ }
+
+ read += ret;
+ pos += ret;
+ len -= ret;
+
+ /* hit stripe and need continue*/
+ if (len && hit_stripe && pos < i_size)
+ goto more;
+ }
+
+ if (read > 0) {
+ ret = read;
+ /* did we bounce off eof? */
+ if (pos + len > i_size)
+ *checkeof = CHECK_EOF;
+ }
+
+ dout("striped_read returns %d\n", ret);
+ return ret;
+}
+
+/*
+ * Completely synchronous read and write methods. Direct from __user
+ * buffer to osd, or directly to user pages (if O_DIRECT).
+ *
+ * If the read spans object boundary, just do multiple reads.
+ */
+static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
+ int *checkeof)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
+ struct page **pages;
+ u64 off = iocb->ki_pos;
+ int num_pages;
+ ssize_t ret;
+ size_t len = iov_iter_count(to);
+
+ dout("sync_read on file %p %llu~%u %s\n", file, off, (unsigned)len,
+ (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
+
+ if (!len)
+ return 0;
+ /*
+ * flush any page cache pages in this range. this
+ * will make concurrent normal and sync io slow,
+ * but it will at least behave sensibly when they are
+ * in sequence.
+ */
+ ret = filemap_write_and_wait_range(inode->i_mapping, off,
+ off + len);
+ if (ret < 0)
+ return ret;
+
+ if (unlikely(to->type & ITER_PIPE)) {
+ size_t page_off;
+ ret = iov_iter_get_pages_alloc(to, &pages, len,
+ &page_off);
+ if (ret <= 0)
+ return -ENOMEM;
+ num_pages = DIV_ROUND_UP(ret + page_off, PAGE_SIZE);
+
+ ret = striped_read(inode, off, ret, pages, num_pages,
+ page_off, checkeof);
+ if (ret > 0) {
+ iov_iter_advance(to, ret);
+ off += ret;
+ } else {
+ iov_iter_advance(to, 0);
+ }
+ ceph_put_page_vector(pages, num_pages, false);
+ } else {
+ num_pages = calc_pages_for(off, len);
+ pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
+ if (IS_ERR(pages))
+ return PTR_ERR(pages);
+
+ ret = striped_read(inode, off, len, pages, num_pages,
+ (off & ~PAGE_MASK), checkeof);
+ if (ret > 0) {
+ int l, k = 0;
+ size_t left = ret;
+
+ while (left) {
+ size_t page_off = off & ~PAGE_MASK;
+ size_t copy = min_t(size_t, left,
+ PAGE_SIZE - page_off);
+ l = copy_page_to_iter(pages[k++], page_off,
+ copy, to);
+ off += l;
+ left -= l;
+ if (l < copy)
+ break;
+ }
+ }
+ ceph_release_page_vector(pages, num_pages);
+ }
+
+ if (off > iocb->ki_pos) {
+ ret = off - iocb->ki_pos;
+ iocb->ki_pos = off;
+ }
+
+ dout("sync_read result %zd\n", ret);
+ return ret;
+}
+
+struct ceph_aio_request {
+ struct kiocb *iocb;
+ size_t total_len;
+ bool write;
+ bool should_dirty;
+ int error;
+ struct list_head osd_reqs;
+ unsigned num_reqs;
+ atomic_t pending_reqs;
+ struct timespec64 mtime;
+ struct ceph_cap_flush *prealloc_cf;
+};
+
+struct ceph_aio_work {
+ struct work_struct work;
+ struct ceph_osd_request *req;
+};
+
+static void ceph_aio_retry_work(struct work_struct *work);
+
+static void ceph_aio_complete(struct inode *inode,
+ struct ceph_aio_request *aio_req)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int ret;
+
+ if (!atomic_dec_and_test(&aio_req->pending_reqs))
+ return;
+
+ ret = aio_req->error;
+ if (!ret)
+ ret = aio_req->total_len;
+
+ dout("ceph_aio_complete %p rc %d\n", inode, ret);
+
+ if (ret >= 0 && aio_req->write) {
+ int dirty;
+
+ loff_t endoff = aio_req->iocb->ki_pos + aio_req->total_len;
+ if (endoff > i_size_read(inode)) {
+ if (ceph_inode_set_size(inode, endoff))
+ ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
+ }
+
+ spin_lock(&ci->i_ceph_lock);
+ ci->i_inline_version = CEPH_INLINE_NONE;
+ dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
+ &aio_req->prealloc_cf);
+ spin_unlock(&ci->i_ceph_lock);
+ if (dirty)
+ __mark_inode_dirty(inode, dirty);
+
+ }
+
+ ceph_put_cap_refs(ci, (aio_req->write ? CEPH_CAP_FILE_WR :
+ CEPH_CAP_FILE_RD));
+
+ aio_req->iocb->ki_complete(aio_req->iocb, ret, 0);
+
+ ceph_free_cap_flush(aio_req->prealloc_cf);
+ kfree(aio_req);
+}
+
+static void ceph_aio_complete_req(struct ceph_osd_request *req)
+{
+ int rc = req->r_result;
+ struct inode *inode = req->r_inode;
+ struct ceph_aio_request *aio_req = req->r_priv;
+ struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0);
+
+ BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_BVECS);
+ BUG_ON(!osd_data->num_bvecs);
+
+ dout("ceph_aio_complete_req %p rc %d bytes %u\n",
+ inode, rc, osd_data->bvec_pos.iter.bi_size);
+
+ if (rc == -EOLDSNAPC) {
+ struct ceph_aio_work *aio_work;
+ BUG_ON(!aio_req->write);
+
+ aio_work = kmalloc(sizeof(*aio_work), GFP_NOFS);
+ if (aio_work) {
+ INIT_WORK(&aio_work->work, ceph_aio_retry_work);
+ aio_work->req = req;
+ queue_work(ceph_inode_to_client(inode)->wb_wq,
+ &aio_work->work);
+ return;
+ }
+ rc = -ENOMEM;
+ } else if (!aio_req->write) {
+ if (rc == -ENOENT)
+ rc = 0;
+ if (rc >= 0 && osd_data->bvec_pos.iter.bi_size > rc) {
+ struct iov_iter i;
+ int zlen = osd_data->bvec_pos.iter.bi_size - rc;
+
+ /*
+ * If read is satisfied by single OSD request,
+ * it can pass EOF. Otherwise read is within
+ * i_size.
+ */
+ if (aio_req->num_reqs == 1) {
+ loff_t i_size = i_size_read(inode);
+ loff_t endoff = aio_req->iocb->ki_pos + rc;
+ if (endoff < i_size)
+ zlen = min_t(size_t, zlen,
+ i_size - endoff);
+ aio_req->total_len = rc + zlen;
+ }
+
+ iov_iter_bvec(&i, ITER_BVEC, osd_data->bvec_pos.bvecs,
+ osd_data->num_bvecs,
+ osd_data->bvec_pos.iter.bi_size);
+ iov_iter_advance(&i, rc);
+ iov_iter_zero(zlen, &i);
+ }
+ }
+
+ put_bvecs(osd_data->bvec_pos.bvecs, osd_data->num_bvecs,
+ aio_req->should_dirty);
+ ceph_osdc_put_request(req);
+
+ if (rc < 0)
+ cmpxchg(&aio_req->error, 0, rc);
+
+ ceph_aio_complete(inode, aio_req);
+ return;
+}
+
+static void ceph_aio_retry_work(struct work_struct *work)
+{
+ struct ceph_aio_work *aio_work =
+ container_of(work, struct ceph_aio_work, work);
+ struct ceph_osd_request *orig_req = aio_work->req;
+ struct ceph_aio_request *aio_req = orig_req->r_priv;
+ struct inode *inode = orig_req->r_inode;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_snap_context *snapc;
+ struct ceph_osd_request *req;
+ int ret;
+
+ spin_lock(&ci->i_ceph_lock);
+ if (__ceph_have_pending_cap_snap(ci)) {
+ struct ceph_cap_snap *capsnap =
+ list_last_entry(&ci->i_cap_snaps,
+ struct ceph_cap_snap,
+ ci_item);
+ snapc = ceph_get_snap_context(capsnap->context);
+ } else {
+ BUG_ON(!ci->i_head_snapc);
+ snapc = ceph_get_snap_context(ci->i_head_snapc);
+ }
+ spin_unlock(&ci->i_ceph_lock);
+
+ req = ceph_osdc_alloc_request(orig_req->r_osdc, snapc, 2,
+ false, GFP_NOFS);
+ if (!req) {
+ ret = -ENOMEM;
+ req = orig_req;
+ goto out;
+ }
+
+ req->r_flags = /* CEPH_OSD_FLAG_ORDERSNAP | */ CEPH_OSD_FLAG_WRITE;
+ ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc);
+ ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid);
+
+ ret = ceph_osdc_alloc_messages(req, GFP_NOFS);
+ if (ret) {
+ ceph_osdc_put_request(req);
+ req = orig_req;
+ goto out;
+ }
+
+ req->r_ops[0] = orig_req->r_ops[0];
+
+ req->r_mtime = aio_req->mtime;
+ req->r_data_offset = req->r_ops[0].extent.offset;
+
+ ceph_osdc_put_request(orig_req);
+
+ req->r_callback = ceph_aio_complete_req;
+ req->r_inode = inode;
+ req->r_priv = aio_req;
+
+ ret = ceph_osdc_start_request(req->r_osdc, req, false);
+out:
+ if (ret < 0) {
+ req->r_result = ret;
+ ceph_aio_complete_req(req);
+ }
+
+ ceph_put_snap_context(snapc);
+ kfree(aio_work);
+}
+
+static ssize_t
+ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
+ struct ceph_snap_context *snapc,
+ struct ceph_cap_flush **pcf)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_vino vino;
+ struct ceph_osd_request *req;
+ struct bio_vec *bvecs;
+ struct ceph_aio_request *aio_req = NULL;
+ int num_pages = 0;
+ int flags;
+ int ret;
+ struct timespec64 mtime = current_time(inode);
+ size_t count = iov_iter_count(iter);
+ loff_t pos = iocb->ki_pos;
+ bool write = iov_iter_rw(iter) == WRITE;
+ bool should_dirty = !write && iter_is_iovec(iter);
+
+ if (write && ceph_snap(file_inode(file)) != CEPH_NOSNAP)
+ return -EROFS;
+
+ dout("sync_direct_%s on file %p %lld~%u snapc %p seq %lld\n",
+ (write ? "write" : "read"), file, pos, (unsigned)count,
+ snapc, snapc->seq);
+
+ ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count);
+ if (ret < 0)
+ return ret;
+
+ if (write) {
+ int ret2 = invalidate_inode_pages2_range(inode->i_mapping,
+ pos >> PAGE_SHIFT,
+ (pos + count) >> PAGE_SHIFT);
+ if (ret2 < 0)
+ dout("invalidate_inode_pages2_range returned %d\n", ret2);
+
+ flags = /* CEPH_OSD_FLAG_ORDERSNAP | */ CEPH_OSD_FLAG_WRITE;
+ } else {
+ flags = CEPH_OSD_FLAG_READ;
+ }
+
+ while (iov_iter_count(iter) > 0) {
+ u64 size = iov_iter_count(iter);
+ ssize_t len;
+
+ if (write)
+ size = min_t(u64, size, fsc->mount_options->wsize);
+ else
+ size = min_t(u64, size, fsc->mount_options->rsize);
+
+ vino = ceph_vino(inode);
+ req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
+ vino, pos, &size, 0,
+ 1,
+ write ? CEPH_OSD_OP_WRITE :
+ CEPH_OSD_OP_READ,
+ flags, snapc,
+ ci->i_truncate_seq,
+ ci->i_truncate_size,
+ false);
+ if (IS_ERR(req)) {
+ ret = PTR_ERR(req);
+ break;
+ }
+
+ len = iter_get_bvecs_alloc(iter, size, &bvecs, &num_pages);
+ if (len < 0) {
+ ceph_osdc_put_request(req);
+ ret = len;
+ break;
+ }
+ if (len != size)
+ osd_req_op_extent_update(req, 0, len);
+
+ /*
+ * To simplify error handling, allow AIO when IO within i_size
+ * or IO can be satisfied by single OSD request.
+ */
+ if (pos == iocb->ki_pos && !is_sync_kiocb(iocb) &&
+ (len == count || pos + count <= i_size_read(inode))) {
+ aio_req = kzalloc(sizeof(*aio_req), GFP_KERNEL);
+ if (aio_req) {
+ aio_req->iocb = iocb;
+ aio_req->write = write;
+ aio_req->should_dirty = should_dirty;
+ INIT_LIST_HEAD(&aio_req->osd_reqs);
+ if (write) {
+ aio_req->mtime = mtime;
+ swap(aio_req->prealloc_cf, *pcf);
+ }
+ }
+ /* ignore error */
+ }
+
+ if (write) {
+ /*
+ * throw out any page cache pages in this range. this
+ * may block.
+ */
+ truncate_inode_pages_range(inode->i_mapping, pos,
+ (pos+len) | (PAGE_SIZE - 1));
+
+ req->r_mtime = mtime;
+ }
+
+ osd_req_op_extent_osd_data_bvecs(req, 0, bvecs, num_pages, len);
+
+ if (aio_req) {
+ aio_req->total_len += len;
+ aio_req->num_reqs++;
+ atomic_inc(&aio_req->pending_reqs);
+
+ req->r_callback = ceph_aio_complete_req;
+ req->r_inode = inode;
+ req->r_priv = aio_req;
+ list_add_tail(&req->r_unsafe_item, &aio_req->osd_reqs);
+
+ pos += len;
+ continue;
+ }
+
+ ret = ceph_osdc_start_request(req->r_osdc, req, false);
+ if (!ret)
+ ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
+
+ size = i_size_read(inode);
+ if (!write) {
+ if (ret == -ENOENT)
+ ret = 0;
+ if (ret >= 0 && ret < len && pos + ret < size) {
+ struct iov_iter i;
+ int zlen = min_t(size_t, len - ret,
+ size - pos - ret);
+
+ iov_iter_bvec(&i, ITER_BVEC, bvecs, num_pages,
+ len);
+ iov_iter_advance(&i, ret);
+ iov_iter_zero(zlen, &i);
+ ret += zlen;
+ }
+ if (ret >= 0)
+ len = ret;
+ }
+
+ put_bvecs(bvecs, num_pages, should_dirty);
+ ceph_osdc_put_request(req);
+ if (ret < 0)
+ break;
+
+ pos += len;
+ if (!write && pos >= size)
+ break;
+
+ if (write && pos > size) {
+ if (ceph_inode_set_size(inode, pos))
+ ceph_check_caps(ceph_inode(inode),
+ CHECK_CAPS_AUTHONLY,
+ NULL);
+ }
+ }
+
+ if (aio_req) {
+ LIST_HEAD(osd_reqs);
+
+ if (aio_req->num_reqs == 0) {
+ kfree(aio_req);
+ return ret;
+ }
+
+ ceph_get_cap_refs(ci, write ? CEPH_CAP_FILE_WR :
+ CEPH_CAP_FILE_RD);
+
+ list_splice(&aio_req->osd_reqs, &osd_reqs);
+ while (!list_empty(&osd_reqs)) {
+ req = list_first_entry(&osd_reqs,
+ struct ceph_osd_request,
+ r_unsafe_item);
+ list_del_init(&req->r_unsafe_item);
+ if (ret >= 0)
+ ret = ceph_osdc_start_request(req->r_osdc,
+ req, false);
+ if (ret < 0) {
+ req->r_result = ret;
+ ceph_aio_complete_req(req);
+ }
+ }
+ return -EIOCBQUEUED;
+ }
+
+ if (ret != -EOLDSNAPC && pos > iocb->ki_pos) {
+ ret = pos - iocb->ki_pos;
+ iocb->ki_pos = pos;
+ }
+ return ret;
+}
+
+/*
+ * Synchronous write, straight from __user pointer or user pages.
+ *
+ * If write spans object boundary, just do multiple writes. (For a
+ * correct atomic write, we should e.g. take write locks on all
+ * objects, rollback on failure, etc.)
+ */
+static ssize_t
+ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
+ struct ceph_snap_context *snapc)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_vino vino;
+ struct ceph_osd_request *req;
+ struct page **pages;
+ u64 len;
+ int num_pages;
+ int written = 0;
+ int flags;
+ int ret;
+ bool check_caps = false;
+ struct timespec64 mtime = current_time(inode);
+ size_t count = iov_iter_count(from);
+
+ if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)
+ return -EROFS;
+
+ dout("sync_write on file %p %lld~%u snapc %p seq %lld\n",
+ file, pos, (unsigned)count, snapc, snapc->seq);
+
+ ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count);
+ if (ret < 0)
+ return ret;
+
+ ret = invalidate_inode_pages2_range(inode->i_mapping,
+ pos >> PAGE_SHIFT,
+ (pos + count) >> PAGE_SHIFT);
+ if (ret < 0)
+ dout("invalidate_inode_pages2_range returned %d\n", ret);
+
+ flags = /* CEPH_OSD_FLAG_ORDERSNAP | */ CEPH_OSD_FLAG_WRITE;
+
+ while ((len = iov_iter_count(from)) > 0) {
+ size_t left;
+ int n;
+
+ vino = ceph_vino(inode);
+ req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
+ vino, pos, &len, 0, 1,
+ CEPH_OSD_OP_WRITE, flags, snapc,
+ ci->i_truncate_seq,
+ ci->i_truncate_size,
+ false);
+ if (IS_ERR(req)) {
+ ret = PTR_ERR(req);
+ break;
+ }
+
+ /*
+ * write from beginning of first page,
+ * regardless of io alignment
+ */
+ num_pages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
+ pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
+ if (IS_ERR(pages)) {
+ ret = PTR_ERR(pages);
+ goto out;
+ }
+
+ left = len;
+ for (n = 0; n < num_pages; n++) {
+ size_t plen = min_t(size_t, left, PAGE_SIZE);
+ ret = copy_page_from_iter(pages[n], 0, plen, from);
+ if (ret != plen) {
+ ret = -EFAULT;
+ break;
+ }
+ left -= ret;
+ }
+
+ if (ret < 0) {
+ ceph_release_page_vector(pages, num_pages);
+ goto out;
+ }
+
+ req->r_inode = inode;
+
+ osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0,
+ false, true);
+
+ req->r_mtime = mtime;
+ ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
+ if (!ret)
+ ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
+
+out:
+ ceph_osdc_put_request(req);
+ if (ret != 0) {
+ ceph_set_error_write(ci);
+ break;
+ }
+
+ ceph_clear_error_write(ci);
+ pos += len;
+ written += len;
+ if (pos > i_size_read(inode)) {
+ check_caps = ceph_inode_set_size(inode, pos);
+ if (check_caps)
+ ceph_check_caps(ceph_inode(inode),
+ CHECK_CAPS_AUTHONLY,
+ NULL);
+ }
+
+ }
+
+ if (ret != -EOLDSNAPC && written > 0) {
+ ret = written;
+ iocb->ki_pos = pos;
+ }
+ return ret;
+}
+
+/*
+ * Wrap generic_file_aio_read with checks for cap bits on the inode.
+ * Atomically grab references, so that those bits are not released
+ * back to the MDS mid-read.
+ *
+ * Hmm, the sync read case isn't actually async... should it be?
+ */
+static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+ struct file *filp = iocb->ki_filp;
+ struct ceph_file_info *fi = filp->private_data;
+ size_t len = iov_iter_count(to);
+ struct inode *inode = file_inode(filp);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct page *pinned_page = NULL;
+ ssize_t ret;
+ int want, got = 0;
+ int retry_op = 0, read = 0;
+
+again:
+ dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
+ inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, inode);
+
+ if (fi->fmode & CEPH_FILE_MODE_LAZY)
+ want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
+ else
+ want = CEPH_CAP_FILE_CACHE;
+ ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page);
+ if (ret < 0)
+ return ret;
+
+ if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 ||
+ (iocb->ki_flags & IOCB_DIRECT) ||
+ (fi->flags & CEPH_F_SYNC)) {
+
+ dout("aio_sync_read %p %llx.%llx %llu~%u got cap refs on %s\n",
+ inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
+ ceph_cap_string(got));
+
+ if (ci->i_inline_version == CEPH_INLINE_NONE) {
+ if (!retry_op && (iocb->ki_flags & IOCB_DIRECT)) {
+ ret = ceph_direct_read_write(iocb, to,
+ NULL, NULL);
+ if (ret >= 0 && ret < len)
+ retry_op = CHECK_EOF;
+ } else {
+ ret = ceph_sync_read(iocb, to, &retry_op);
+ }
+ } else {
+ retry_op = READ_INLINE;
+ }
+ } else {
+ CEPH_DEFINE_RW_CONTEXT(rw_ctx, got);
+ dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n",
+ inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
+ ceph_cap_string(got));
+ ceph_add_rw_context(fi, &rw_ctx);
+ ret = generic_file_read_iter(iocb, to);
+ ceph_del_rw_context(fi, &rw_ctx);
+ }
+ dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
+ inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret);
+ if (pinned_page) {
+ put_page(pinned_page);
+ pinned_page = NULL;
+ }
+ ceph_put_cap_refs(ci, got);
+ if (retry_op > HAVE_RETRIED && ret >= 0) {
+ int statret;
+ struct page *page = NULL;
+ loff_t i_size;
+ if (retry_op == READ_INLINE) {
+ page = __page_cache_alloc(GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
+ }
+
+ statret = __ceph_do_getattr(inode, page,
+ CEPH_STAT_CAP_INLINE_DATA, !!page);
+ if (statret < 0) {
+ if (page)
+ __free_page(page);
+ if (statret == -ENODATA) {
+ BUG_ON(retry_op != READ_INLINE);
+ goto again;
+ }
+ return statret;
+ }
+
+ i_size = i_size_read(inode);
+ if (retry_op == READ_INLINE) {
+ BUG_ON(ret > 0 || read > 0);
+ if (iocb->ki_pos < i_size &&
+ iocb->ki_pos < PAGE_SIZE) {
+ loff_t end = min_t(loff_t, i_size,
+ iocb->ki_pos + len);
+ end = min_t(loff_t, end, PAGE_SIZE);
+ if (statret < end)
+ zero_user_segment(page, statret, end);
+ ret = copy_page_to_iter(page,
+ iocb->ki_pos & ~PAGE_MASK,
+ end - iocb->ki_pos, to);
+ iocb->ki_pos += ret;
+ read += ret;
+ }
+ if (iocb->ki_pos < i_size && read < len) {
+ size_t zlen = min_t(size_t, len - read,
+ i_size - iocb->ki_pos);
+ ret = iov_iter_zero(zlen, to);
+ iocb->ki_pos += ret;
+ read += ret;
+ }
+ __free_pages(page, 0);
+ return read;
+ }
+
+ /* hit EOF or hole? */
+ if (retry_op == CHECK_EOF && iocb->ki_pos < i_size &&
+ ret < len) {
+ dout("sync_read hit hole, ppos %lld < size %lld"
+ ", reading more\n", iocb->ki_pos, i_size);
+
+ read += ret;
+ len -= ret;
+ retry_op = HAVE_RETRIED;
+ goto again;
+ }
+ }
+
+ if (ret >= 0)
+ ret += read;
+
+ return ret;
+}
+
+/*
+ * Take cap references to avoid releasing caps to MDS mid-write.
+ *
+ * If we are synchronous, and write with an old snap context, the OSD
+ * may return EOLDSNAPC. In that case, retry the write.. _after_
+ * dropping our cap refs and allowing the pending snap to logically
+ * complete _before_ this write occurs.
+ *
+ * If we are near ENOSPC, write synchronously.
+ */
+static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct file *file = iocb->ki_filp;
+ struct ceph_file_info *fi = file->private_data;
+ struct inode *inode = file_inode(file);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_osd_client *osdc = &fsc->client->osdc;
+ struct ceph_cap_flush *prealloc_cf;
+ ssize_t count, written = 0;
+ int err, want, got;
+ u32 map_flags;
+ u64 pool_flags;
+ loff_t pos;
+ loff_t limit = max(i_size_read(inode), fsc->max_file_size);
+
+ if (ceph_snap(inode) != CEPH_NOSNAP)
+ return -EROFS;
+
+ prealloc_cf = ceph_alloc_cap_flush();
+ if (!prealloc_cf)
+ return -ENOMEM;
+
+retry_snap:
+ inode_lock(inode);
+
+ /* We can write back this queue in page reclaim */
+ current->backing_dev_info = inode_to_bdi(inode);
+
+ if (iocb->ki_flags & IOCB_APPEND) {
+ err = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false);
+ if (err < 0)
+ goto out;
+ }
+
+ err = generic_write_checks(iocb, from);
+ if (err <= 0)
+ goto out;
+
+ pos = iocb->ki_pos;
+ if (unlikely(pos >= limit)) {
+ err = -EFBIG;
+ goto out;
+ } else {
+ iov_iter_truncate(from, limit - pos);
+ }
+
+ count = iov_iter_count(from);
+ if (ceph_quota_is_max_bytes_exceeded(inode, pos + count)) {
+ err = -EDQUOT;
+ goto out;
+ }
+
+ err = file_remove_privs(file);
+ if (err)
+ goto out;
+
+ err = file_update_time(file);
+ if (err)
+ goto out;
+
+ if (ci->i_inline_version != CEPH_INLINE_NONE) {
+ err = ceph_uninline_data(file, NULL);
+ if (err < 0)
+ goto out;
+ }
+
+ down_read(&osdc->lock);
+ map_flags = osdc->osdmap->flags;
+ pool_flags = ceph_pg_pool_flags(osdc->osdmap, ci->i_layout.pool_id);
+ up_read(&osdc->lock);
+ if ((map_flags & CEPH_OSDMAP_FULL) ||
+ (pool_flags & CEPH_POOL_FLAG_FULL)) {
+ err = -ENOSPC;
+ goto out;
+ }
+
+ dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n",
+ inode, ceph_vinop(inode), pos, count, i_size_read(inode));
+ if (fi->fmode & CEPH_FILE_MODE_LAZY)
+ want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
+ else
+ want = CEPH_CAP_FILE_BUFFER;
+ got = 0;
+ err = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, pos + count,
+ &got, NULL);
+ if (err < 0)
+ goto out;
+
+ dout("aio_write %p %llx.%llx %llu~%zd got cap refs on %s\n",
+ inode, ceph_vinop(inode), pos, count, ceph_cap_string(got));
+
+ if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 ||
+ (iocb->ki_flags & IOCB_DIRECT) || (fi->flags & CEPH_F_SYNC) ||
+ (ci->i_ceph_flags & CEPH_I_ERROR_WRITE)) {
+ struct ceph_snap_context *snapc;
+ struct iov_iter data;
+ inode_unlock(inode);
+
+ spin_lock(&ci->i_ceph_lock);
+ if (__ceph_have_pending_cap_snap(ci)) {
+ struct ceph_cap_snap *capsnap =
+ list_last_entry(&ci->i_cap_snaps,
+ struct ceph_cap_snap,
+ ci_item);
+ snapc = ceph_get_snap_context(capsnap->context);
+ } else {
+ BUG_ON(!ci->i_head_snapc);
+ snapc = ceph_get_snap_context(ci->i_head_snapc);
+ }
+ spin_unlock(&ci->i_ceph_lock);
+
+ /* we might need to revert back to that point */
+ data = *from;
+ if (iocb->ki_flags & IOCB_DIRECT)
+ written = ceph_direct_read_write(iocb, &data, snapc,
+ &prealloc_cf);
+ else
+ written = ceph_sync_write(iocb, &data, pos, snapc);
+ if (written > 0)
+ iov_iter_advance(from, written);
+ ceph_put_snap_context(snapc);
+ } else {
+ /*
+ * No need to acquire the i_truncate_mutex. Because
+ * the MDS revokes Fwb caps before sending truncate
+ * message to us. We can't get Fwb cap while there
+ * are pending vmtruncate. So write and vmtruncate
+ * can not run at the same time
+ */
+ written = generic_perform_write(file, from, pos);
+ if (likely(written >= 0))
+ iocb->ki_pos = pos + written;
+ inode_unlock(inode);
+ }
+
+ if (written >= 0) {
+ int dirty;
+
+ spin_lock(&ci->i_ceph_lock);
+ ci->i_inline_version = CEPH_INLINE_NONE;
+ dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
+ &prealloc_cf);
+ spin_unlock(&ci->i_ceph_lock);
+ if (dirty)
+ __mark_inode_dirty(inode, dirty);
+ if (ceph_quota_is_max_bytes_approaching(inode, iocb->ki_pos))
+ ceph_check_caps(ci, CHECK_CAPS_NODELAY, NULL);
+ }
+
+ dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n",
+ inode, ceph_vinop(inode), pos, (unsigned)count,
+ ceph_cap_string(got));
+ ceph_put_cap_refs(ci, got);
+
+ if (written == -EOLDSNAPC) {
+ dout("aio_write %p %llx.%llx %llu~%u" "got EOLDSNAPC, retrying\n",
+ inode, ceph_vinop(inode), pos, (unsigned)count);
+ goto retry_snap;
+ }
+
+ if (written >= 0) {
+ if ((map_flags & CEPH_OSDMAP_NEARFULL) ||
+ (pool_flags & CEPH_POOL_FLAG_NEARFULL))
+ iocb->ki_flags |= IOCB_DSYNC;
+ written = generic_write_sync(iocb, written);
+ }
+
+ goto out_unlocked;
+
+out:
+ inode_unlock(inode);
+out_unlocked:
+ ceph_free_cap_flush(prealloc_cf);
+ current->backing_dev_info = NULL;
+ return written ? written : err;
+}
+
+/*
+ * llseek. be sure to verify file size on SEEK_END.
+ */
+static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
+{
+ struct inode *inode = file->f_mapping->host;
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ loff_t i_size;
+ loff_t ret;
+
+ inode_lock(inode);
+
+ if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) {
+ ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false);
+ if (ret < 0)
+ goto out;
+ }
+
+ i_size = i_size_read(inode);
+ switch (whence) {
+ case SEEK_END:
+ offset += i_size;
+ break;
+ case SEEK_CUR:
+ /*
+ * Here we special-case the lseek(fd, 0, SEEK_CUR)
+ * position-querying operation. Avoid rewriting the "same"
+ * f_pos value back to the file because a concurrent read(),
+ * write() or lseek() might have altered it
+ */
+ if (offset == 0) {
+ ret = file->f_pos;
+ goto out;
+ }
+ offset += file->f_pos;
+ break;
+ case SEEK_DATA:
+ if (offset < 0 || offset >= i_size) {
+ ret = -ENXIO;
+ goto out;
+ }
+ break;
+ case SEEK_HOLE:
+ if (offset < 0 || offset >= i_size) {
+ ret = -ENXIO;
+ goto out;
+ }
+ offset = i_size;
+ break;
+ }
+
+ ret = vfs_setpos(file, offset, max(i_size, fsc->max_file_size));
+
+out:
+ inode_unlock(inode);
+ return ret;
+}
+
+static inline void ceph_zero_partial_page(
+ struct inode *inode, loff_t offset, unsigned size)
+{
+ struct page *page;
+ pgoff_t index = offset >> PAGE_SHIFT;
+
+ page = find_lock_page(inode->i_mapping, index);
+ if (page) {
+ wait_on_page_writeback(page);
+ zero_user(page, offset & (PAGE_SIZE - 1), size);
+ unlock_page(page);
+ put_page(page);
+ }
+}
+
+static void ceph_zero_pagecache_range(struct inode *inode, loff_t offset,
+ loff_t length)
+{
+ loff_t nearly = round_up(offset, PAGE_SIZE);
+ if (offset < nearly) {
+ loff_t size = nearly - offset;
+ if (length < size)
+ size = length;
+ ceph_zero_partial_page(inode, offset, size);
+ offset += size;
+ length -= size;
+ }
+ if (length >= PAGE_SIZE) {
+ loff_t size = round_down(length, PAGE_SIZE);
+ truncate_pagecache_range(inode, offset, offset + size - 1);
+ offset += size;
+ length -= size;
+ }
+ if (length)
+ ceph_zero_partial_page(inode, offset, length);
+}
+
+static int ceph_zero_partial_object(struct inode *inode,
+ loff_t offset, loff_t *length)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_osd_request *req;
+ int ret = 0;
+ loff_t zero = 0;
+ int op;
+
+ if (!length) {
+ op = offset ? CEPH_OSD_OP_DELETE : CEPH_OSD_OP_TRUNCATE;
+ length = &zero;
+ } else {
+ op = CEPH_OSD_OP_ZERO;
+ }
+
+ req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
+ ceph_vino(inode),
+ offset, length,
+ 0, 1, op,
+ CEPH_OSD_FLAG_WRITE,
+ NULL, 0, 0, false);
+ if (IS_ERR(req)) {
+ ret = PTR_ERR(req);
+ goto out;
+ }
+
+ req->r_mtime = inode->i_mtime;
+ ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
+ if (!ret) {
+ ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
+ if (ret == -ENOENT)
+ ret = 0;
+ }
+ ceph_osdc_put_request(req);
+
+out:
+ return ret;
+}
+
+static int ceph_zero_objects(struct inode *inode, loff_t offset, loff_t length)
+{
+ int ret = 0;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ s32 stripe_unit = ci->i_layout.stripe_unit;
+ s32 stripe_count = ci->i_layout.stripe_count;
+ s32 object_size = ci->i_layout.object_size;
+ u64 object_set_size = object_size * stripe_count;
+ u64 nearly, t;
+
+ /* round offset up to next period boundary */
+ nearly = offset + object_set_size - 1;
+ t = nearly;
+ nearly -= do_div(t, object_set_size);
+
+ while (length && offset < nearly) {
+ loff_t size = length;
+ ret = ceph_zero_partial_object(inode, offset, &size);
+ if (ret < 0)
+ return ret;
+ offset += size;
+ length -= size;
+ }
+ while (length >= object_set_size) {
+ int i;
+ loff_t pos = offset;
+ for (i = 0; i < stripe_count; ++i) {
+ ret = ceph_zero_partial_object(inode, pos, NULL);
+ if (ret < 0)
+ return ret;
+ pos += stripe_unit;
+ }
+ offset += object_set_size;
+ length -= object_set_size;
+ }
+ while (length) {
+ loff_t size = length;
+ ret = ceph_zero_partial_object(inode, offset, &size);
+ if (ret < 0)
+ return ret;
+ offset += size;
+ length -= size;
+ }
+ return ret;
+}
+
+static long ceph_fallocate(struct file *file, int mode,
+ loff_t offset, loff_t length)
+{
+ struct ceph_file_info *fi = file->private_data;
+ struct inode *inode = file_inode(file);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_cap_flush *prealloc_cf;
+ int want, got = 0;
+ int dirty;
+ int ret = 0;
+ loff_t endoff = 0;
+ loff_t size;
+
+ if (mode != (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+ return -EOPNOTSUPP;
+
+ if (!S_ISREG(inode->i_mode))
+ return -EOPNOTSUPP;
+
+ prealloc_cf = ceph_alloc_cap_flush();
+ if (!prealloc_cf)
+ return -ENOMEM;
+
+ inode_lock(inode);
+
+ if (ceph_snap(inode) != CEPH_NOSNAP) {
+ ret = -EROFS;
+ goto unlock;
+ }
+
+ if (ci->i_inline_version != CEPH_INLINE_NONE) {
+ ret = ceph_uninline_data(file, NULL);
+ if (ret < 0)
+ goto unlock;
+ }
+
+ size = i_size_read(inode);
+
+ /* Are we punching a hole beyond EOF? */
+ if (offset >= size)
+ goto unlock;
+ if ((offset + length) > size)
+ length = size - offset;
+
+ if (fi->fmode & CEPH_FILE_MODE_LAZY)
+ want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
+ else
+ want = CEPH_CAP_FILE_BUFFER;
+
+ ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, endoff, &got, NULL);
+ if (ret < 0)
+ goto unlock;
+
+ ceph_zero_pagecache_range(inode, offset, length);
+ ret = ceph_zero_objects(inode, offset, length);
+
+ if (!ret) {
+ spin_lock(&ci->i_ceph_lock);
+ ci->i_inline_version = CEPH_INLINE_NONE;
+ dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
+ &prealloc_cf);
+ spin_unlock(&ci->i_ceph_lock);
+ if (dirty)
+ __mark_inode_dirty(inode, dirty);
+ }
+
+ ceph_put_cap_refs(ci, got);
+unlock:
+ inode_unlock(inode);
+ ceph_free_cap_flush(prealloc_cf);
+ return ret;
+}
+
+const struct file_operations ceph_file_fops = {
+ .open = ceph_open,
+ .release = ceph_release,
+ .llseek = ceph_llseek,
+ .read_iter = ceph_read_iter,
+ .write_iter = ceph_write_iter,
+ .mmap = ceph_mmap,
+ .fsync = ceph_fsync,
+ .lock = ceph_lock,
+ .setlease = simple_nosetlease,
+ .flock = ceph_flock,
+ .splice_read = generic_file_splice_read,
+ .splice_write = iter_file_splice_write,
+ .unlocked_ioctl = ceph_ioctl,
+ .compat_ioctl = ceph_ioctl,
+ .fallocate = ceph_fallocate,
+};
+
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
new file mode 100644
index 000000000..5f041fede
--- /dev/null
+++ b/fs/ceph/inode.c
@@ -0,0 +1,2312 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/uaccess.h>
+#include <linux/kernel.h>
+#include <linux/writeback.h>
+#include <linux/vmalloc.h>
+#include <linux/xattr.h>
+#include <linux/posix_acl.h>
+#include <linux/random.h>
+#include <linux/sort.h>
+
+#include "super.h"
+#include "mds_client.h"
+#include "cache.h"
+#include <linux/ceph/decode.h>
+
+/*
+ * Ceph inode operations
+ *
+ * Implement basic inode helpers (get, alloc) and inode ops (getattr,
+ * setattr, etc.), xattr helpers, and helpers for assimilating
+ * metadata returned by the MDS into our cache.
+ *
+ * Also define helpers for doing asynchronous writeback, invalidation,
+ * and truncation for the benefit of those who can't afford to block
+ * (typically because they are in the message handler path).
+ */
+
+static const struct inode_operations ceph_symlink_iops;
+
+static void ceph_invalidate_work(struct work_struct *work);
+static void ceph_writeback_work(struct work_struct *work);
+static void ceph_vmtruncate_work(struct work_struct *work);
+
+/*
+ * find or create an inode, given the ceph ino number
+ */
+static int ceph_set_ino_cb(struct inode *inode, void *data)
+{
+ ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
+ inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data);
+ return 0;
+}
+
+struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino)
+{
+ struct inode *inode;
+ ino_t t = ceph_vino_to_ino(vino);
+
+ inode = iget5_locked(sb, t, ceph_ino_compare, ceph_set_ino_cb, &vino);
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+ if (inode->i_state & I_NEW) {
+ dout("get_inode created new inode %p %llx.%llx ino %llx\n",
+ inode, ceph_vinop(inode), (u64)inode->i_ino);
+ unlock_new_inode(inode);
+ }
+
+ dout("get_inode on %lu=%llx.%llx got %p\n", inode->i_ino, vino.ino,
+ vino.snap, inode);
+ return inode;
+}
+
+/*
+ * get/constuct snapdir inode for a given directory
+ */
+struct inode *ceph_get_snapdir(struct inode *parent)
+{
+ struct ceph_vino vino = {
+ .ino = ceph_ino(parent),
+ .snap = CEPH_SNAPDIR,
+ };
+ struct inode *inode = ceph_get_inode(parent->i_sb, vino);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+
+ BUG_ON(!S_ISDIR(parent->i_mode));
+ if (IS_ERR(inode))
+ return inode;
+ inode->i_mode = parent->i_mode;
+ inode->i_uid = parent->i_uid;
+ inode->i_gid = parent->i_gid;
+ inode->i_op = &ceph_snapdir_iops;
+ inode->i_fop = &ceph_snapdir_fops;
+ ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */
+ ci->i_rbytes = 0;
+ return inode;
+}
+
+const struct inode_operations ceph_file_iops = {
+ .permission = ceph_permission,
+ .setattr = ceph_setattr,
+ .getattr = ceph_getattr,
+ .listxattr = ceph_listxattr,
+ .get_acl = ceph_get_acl,
+ .set_acl = ceph_set_acl,
+};
+
+
+/*
+ * We use a 'frag tree' to keep track of the MDS's directory fragments
+ * for a given inode (usually there is just a single fragment). We
+ * need to know when a child frag is delegated to a new MDS, or when
+ * it is flagged as replicated, so we can direct our requests
+ * accordingly.
+ */
+
+/*
+ * find/create a frag in the tree
+ */
+static struct ceph_inode_frag *__get_or_create_frag(struct ceph_inode_info *ci,
+ u32 f)
+{
+ struct rb_node **p;
+ struct rb_node *parent = NULL;
+ struct ceph_inode_frag *frag;
+ int c;
+
+ p = &ci->i_fragtree.rb_node;
+ while (*p) {
+ parent = *p;
+ frag = rb_entry(parent, struct ceph_inode_frag, node);
+ c = ceph_frag_compare(f, frag->frag);
+ if (c < 0)
+ p = &(*p)->rb_left;
+ else if (c > 0)
+ p = &(*p)->rb_right;
+ else
+ return frag;
+ }
+
+ frag = kmalloc(sizeof(*frag), GFP_NOFS);
+ if (!frag)
+ return ERR_PTR(-ENOMEM);
+
+ frag->frag = f;
+ frag->split_by = 0;
+ frag->mds = -1;
+ frag->ndist = 0;
+
+ rb_link_node(&frag->node, parent, p);
+ rb_insert_color(&frag->node, &ci->i_fragtree);
+
+ dout("get_or_create_frag added %llx.%llx frag %x\n",
+ ceph_vinop(&ci->vfs_inode), f);
+ return frag;
+}
+
+/*
+ * find a specific frag @f
+ */
+struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, u32 f)
+{
+ struct rb_node *n = ci->i_fragtree.rb_node;
+
+ while (n) {
+ struct ceph_inode_frag *frag =
+ rb_entry(n, struct ceph_inode_frag, node);
+ int c = ceph_frag_compare(f, frag->frag);
+ if (c < 0)
+ n = n->rb_left;
+ else if (c > 0)
+ n = n->rb_right;
+ else
+ return frag;
+ }
+ return NULL;
+}
+
+/*
+ * Choose frag containing the given value @v. If @pfrag is
+ * specified, copy the frag delegation info to the caller if
+ * it is present.
+ */
+static u32 __ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
+ struct ceph_inode_frag *pfrag, int *found)
+{
+ u32 t = ceph_frag_make(0, 0);
+ struct ceph_inode_frag *frag;
+ unsigned nway, i;
+ u32 n;
+
+ if (found)
+ *found = 0;
+
+ while (1) {
+ WARN_ON(!ceph_frag_contains_value(t, v));
+ frag = __ceph_find_frag(ci, t);
+ if (!frag)
+ break; /* t is a leaf */
+ if (frag->split_by == 0) {
+ if (pfrag)
+ memcpy(pfrag, frag, sizeof(*pfrag));
+ if (found)
+ *found = 1;
+ break;
+ }
+
+ /* choose child */
+ nway = 1 << frag->split_by;
+ dout("choose_frag(%x) %x splits by %d (%d ways)\n", v, t,
+ frag->split_by, nway);
+ for (i = 0; i < nway; i++) {
+ n = ceph_frag_make_child(t, frag->split_by, i);
+ if (ceph_frag_contains_value(n, v)) {
+ t = n;
+ break;
+ }
+ }
+ BUG_ON(i == nway);
+ }
+ dout("choose_frag(%x) = %x\n", v, t);
+
+ return t;
+}
+
+u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
+ struct ceph_inode_frag *pfrag, int *found)
+{
+ u32 ret;
+ mutex_lock(&ci->i_fragtree_mutex);
+ ret = __ceph_choose_frag(ci, v, pfrag, found);
+ mutex_unlock(&ci->i_fragtree_mutex);
+ return ret;
+}
+
+/*
+ * Process dirfrag (delegation) info from the mds. Include leaf
+ * fragment in tree ONLY if ndist > 0. Otherwise, only
+ * branches/splits are included in i_fragtree)
+ */
+static int ceph_fill_dirfrag(struct inode *inode,
+ struct ceph_mds_reply_dirfrag *dirinfo)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_inode_frag *frag;
+ u32 id = le32_to_cpu(dirinfo->frag);
+ int mds = le32_to_cpu(dirinfo->auth);
+ int ndist = le32_to_cpu(dirinfo->ndist);
+ int diri_auth = -1;
+ int i;
+ int err = 0;
+
+ spin_lock(&ci->i_ceph_lock);
+ if (ci->i_auth_cap)
+ diri_auth = ci->i_auth_cap->mds;
+ spin_unlock(&ci->i_ceph_lock);
+
+ if (mds == -1) /* CDIR_AUTH_PARENT */
+ mds = diri_auth;
+
+ mutex_lock(&ci->i_fragtree_mutex);
+ if (ndist == 0 && mds == diri_auth) {
+ /* no delegation info needed. */
+ frag = __ceph_find_frag(ci, id);
+ if (!frag)
+ goto out;
+ if (frag->split_by == 0) {
+ /* tree leaf, remove */
+ dout("fill_dirfrag removed %llx.%llx frag %x"
+ " (no ref)\n", ceph_vinop(inode), id);
+ rb_erase(&frag->node, &ci->i_fragtree);
+ kfree(frag);
+ } else {
+ /* tree branch, keep and clear */
+ dout("fill_dirfrag cleared %llx.%llx frag %x"
+ " referral\n", ceph_vinop(inode), id);
+ frag->mds = -1;
+ frag->ndist = 0;
+ }
+ goto out;
+ }
+
+
+ /* find/add this frag to store mds delegation info */
+ frag = __get_or_create_frag(ci, id);
+ if (IS_ERR(frag)) {
+ /* this is not the end of the world; we can continue
+ with bad/inaccurate delegation info */
+ pr_err("fill_dirfrag ENOMEM on mds ref %llx.%llx fg %x\n",
+ ceph_vinop(inode), le32_to_cpu(dirinfo->frag));
+ err = -ENOMEM;
+ goto out;
+ }
+
+ frag->mds = mds;
+ frag->ndist = min_t(u32, ndist, CEPH_MAX_DIRFRAG_REP);
+ for (i = 0; i < frag->ndist; i++)
+ frag->dist[i] = le32_to_cpu(dirinfo->dist[i]);
+ dout("fill_dirfrag %llx.%llx frag %x ndist=%d\n",
+ ceph_vinop(inode), frag->frag, frag->ndist);
+
+out:
+ mutex_unlock(&ci->i_fragtree_mutex);
+ return err;
+}
+
+static int frag_tree_split_cmp(const void *l, const void *r)
+{
+ struct ceph_frag_tree_split *ls = (struct ceph_frag_tree_split*)l;
+ struct ceph_frag_tree_split *rs = (struct ceph_frag_tree_split*)r;
+ return ceph_frag_compare(le32_to_cpu(ls->frag),
+ le32_to_cpu(rs->frag));
+}
+
+static bool is_frag_child(u32 f, struct ceph_inode_frag *frag)
+{
+ if (!frag)
+ return f == ceph_frag_make(0, 0);
+ if (ceph_frag_bits(f) != ceph_frag_bits(frag->frag) + frag->split_by)
+ return false;
+ return ceph_frag_contains_value(frag->frag, ceph_frag_value(f));
+}
+
+static int ceph_fill_fragtree(struct inode *inode,
+ struct ceph_frag_tree_head *fragtree,
+ struct ceph_mds_reply_dirfrag *dirinfo)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_inode_frag *frag, *prev_frag = NULL;
+ struct rb_node *rb_node;
+ unsigned i, split_by, nsplits;
+ u32 id;
+ bool update = false;
+
+ mutex_lock(&ci->i_fragtree_mutex);
+ nsplits = le32_to_cpu(fragtree->nsplits);
+ if (nsplits != ci->i_fragtree_nsplits) {
+ update = true;
+ } else if (nsplits) {
+ i = prandom_u32() % nsplits;
+ id = le32_to_cpu(fragtree->splits[i].frag);
+ if (!__ceph_find_frag(ci, id))
+ update = true;
+ } else if (!RB_EMPTY_ROOT(&ci->i_fragtree)) {
+ rb_node = rb_first(&ci->i_fragtree);
+ frag = rb_entry(rb_node, struct ceph_inode_frag, node);
+ if (frag->frag != ceph_frag_make(0, 0) || rb_next(rb_node))
+ update = true;
+ }
+ if (!update && dirinfo) {
+ id = le32_to_cpu(dirinfo->frag);
+ if (id != __ceph_choose_frag(ci, id, NULL, NULL))
+ update = true;
+ }
+ if (!update)
+ goto out_unlock;
+
+ if (nsplits > 1) {
+ sort(fragtree->splits, nsplits, sizeof(fragtree->splits[0]),
+ frag_tree_split_cmp, NULL);
+ }
+
+ dout("fill_fragtree %llx.%llx\n", ceph_vinop(inode));
+ rb_node = rb_first(&ci->i_fragtree);
+ for (i = 0; i < nsplits; i++) {
+ id = le32_to_cpu(fragtree->splits[i].frag);
+ split_by = le32_to_cpu(fragtree->splits[i].by);
+ if (split_by == 0 || ceph_frag_bits(id) + split_by > 24) {
+ pr_err("fill_fragtree %llx.%llx invalid split %d/%u, "
+ "frag %x split by %d\n", ceph_vinop(inode),
+ i, nsplits, id, split_by);
+ continue;
+ }
+ frag = NULL;
+ while (rb_node) {
+ frag = rb_entry(rb_node, struct ceph_inode_frag, node);
+ if (ceph_frag_compare(frag->frag, id) >= 0) {
+ if (frag->frag != id)
+ frag = NULL;
+ else
+ rb_node = rb_next(rb_node);
+ break;
+ }
+ rb_node = rb_next(rb_node);
+ /* delete stale split/leaf node */
+ if (frag->split_by > 0 ||
+ !is_frag_child(frag->frag, prev_frag)) {
+ rb_erase(&frag->node, &ci->i_fragtree);
+ if (frag->split_by > 0)
+ ci->i_fragtree_nsplits--;
+ kfree(frag);
+ }
+ frag = NULL;
+ }
+ if (!frag) {
+ frag = __get_or_create_frag(ci, id);
+ if (IS_ERR(frag))
+ continue;
+ }
+ if (frag->split_by == 0)
+ ci->i_fragtree_nsplits++;
+ frag->split_by = split_by;
+ dout(" frag %x split by %d\n", frag->frag, frag->split_by);
+ prev_frag = frag;
+ }
+ while (rb_node) {
+ frag = rb_entry(rb_node, struct ceph_inode_frag, node);
+ rb_node = rb_next(rb_node);
+ /* delete stale split/leaf node */
+ if (frag->split_by > 0 ||
+ !is_frag_child(frag->frag, prev_frag)) {
+ rb_erase(&frag->node, &ci->i_fragtree);
+ if (frag->split_by > 0)
+ ci->i_fragtree_nsplits--;
+ kfree(frag);
+ }
+ }
+out_unlock:
+ mutex_unlock(&ci->i_fragtree_mutex);
+ return 0;
+}
+
+/*
+ * initialize a newly allocated inode.
+ */
+struct inode *ceph_alloc_inode(struct super_block *sb)
+{
+ struct ceph_inode_info *ci;
+ int i;
+
+ ci = kmem_cache_alloc(ceph_inode_cachep, GFP_NOFS);
+ if (!ci)
+ return NULL;
+
+ dout("alloc_inode %p\n", &ci->vfs_inode);
+
+ spin_lock_init(&ci->i_ceph_lock);
+
+ ci->i_version = 0;
+ ci->i_inline_version = 0;
+ ci->i_time_warp_seq = 0;
+ ci->i_ceph_flags = 0;
+ atomic64_set(&ci->i_ordered_count, 1);
+ atomic64_set(&ci->i_release_count, 1);
+ atomic64_set(&ci->i_complete_seq[0], 0);
+ atomic64_set(&ci->i_complete_seq[1], 0);
+ ci->i_symlink = NULL;
+
+ ci->i_max_bytes = 0;
+ ci->i_max_files = 0;
+
+ memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout));
+ RCU_INIT_POINTER(ci->i_layout.pool_ns, NULL);
+
+ ci->i_fragtree = RB_ROOT;
+ mutex_init(&ci->i_fragtree_mutex);
+
+ ci->i_xattrs.blob = NULL;
+ ci->i_xattrs.prealloc_blob = NULL;
+ ci->i_xattrs.dirty = false;
+ ci->i_xattrs.index = RB_ROOT;
+ ci->i_xattrs.count = 0;
+ ci->i_xattrs.names_size = 0;
+ ci->i_xattrs.vals_size = 0;
+ ci->i_xattrs.version = 0;
+ ci->i_xattrs.index_version = 0;
+
+ ci->i_caps = RB_ROOT;
+ ci->i_auth_cap = NULL;
+ ci->i_dirty_caps = 0;
+ ci->i_flushing_caps = 0;
+ INIT_LIST_HEAD(&ci->i_dirty_item);
+ INIT_LIST_HEAD(&ci->i_flushing_item);
+ ci->i_prealloc_cap_flush = NULL;
+ INIT_LIST_HEAD(&ci->i_cap_flush_list);
+ init_waitqueue_head(&ci->i_cap_wq);
+ ci->i_hold_caps_min = 0;
+ ci->i_hold_caps_max = 0;
+ INIT_LIST_HEAD(&ci->i_cap_delay_list);
+ INIT_LIST_HEAD(&ci->i_cap_snaps);
+ ci->i_head_snapc = NULL;
+ ci->i_snap_caps = 0;
+
+ for (i = 0; i < CEPH_FILE_MODE_BITS; i++)
+ ci->i_nr_by_mode[i] = 0;
+
+ mutex_init(&ci->i_truncate_mutex);
+ ci->i_truncate_seq = 0;
+ ci->i_truncate_size = 0;
+ ci->i_truncate_pending = 0;
+
+ ci->i_max_size = 0;
+ ci->i_reported_size = 0;
+ ci->i_wanted_max_size = 0;
+ ci->i_requested_max_size = 0;
+
+ ci->i_pin_ref = 0;
+ ci->i_rd_ref = 0;
+ ci->i_rdcache_ref = 0;
+ ci->i_wr_ref = 0;
+ ci->i_wb_ref = 0;
+ ci->i_wrbuffer_ref = 0;
+ ci->i_wrbuffer_ref_head = 0;
+ atomic_set(&ci->i_filelock_ref, 0);
+ atomic_set(&ci->i_shared_gen, 0);
+ ci->i_rdcache_gen = 0;
+ ci->i_rdcache_revoking = 0;
+
+ INIT_LIST_HEAD(&ci->i_unsafe_dirops);
+ INIT_LIST_HEAD(&ci->i_unsafe_iops);
+ spin_lock_init(&ci->i_unsafe_lock);
+
+ ci->i_snap_realm = NULL;
+ INIT_LIST_HEAD(&ci->i_snap_realm_item);
+ INIT_LIST_HEAD(&ci->i_snap_flush_item);
+
+ INIT_WORK(&ci->i_wb_work, ceph_writeback_work);
+ INIT_WORK(&ci->i_pg_inv_work, ceph_invalidate_work);
+
+ INIT_WORK(&ci->i_vmtruncate_work, ceph_vmtruncate_work);
+
+ ceph_fscache_inode_init(ci);
+
+ return &ci->vfs_inode;
+}
+
+static void ceph_i_callback(struct rcu_head *head)
+{
+ struct inode *inode = container_of(head, struct inode, i_rcu);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+
+ kfree(ci->i_symlink);
+ kmem_cache_free(ceph_inode_cachep, ci);
+}
+
+void ceph_evict_inode(struct inode *inode)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_inode_frag *frag;
+ struct rb_node *n;
+
+ dout("evict_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode));
+
+ truncate_inode_pages_final(&inode->i_data);
+ clear_inode(inode);
+
+ ceph_fscache_unregister_inode_cookie(ci);
+
+ ceph_queue_caps_release(inode);
+
+ if (__ceph_has_any_quota(ci))
+ ceph_adjust_quota_realms_count(inode, false);
+
+ /*
+ * we may still have a snap_realm reference if there are stray
+ * caps in i_snap_caps.
+ */
+ if (ci->i_snap_realm) {
+ struct ceph_mds_client *mdsc =
+ ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
+ struct ceph_snap_realm *realm = ci->i_snap_realm;
+
+ dout(" dropping residual ref to snap realm %p\n", realm);
+ spin_lock(&realm->inodes_with_caps_lock);
+ list_del_init(&ci->i_snap_realm_item);
+ ci->i_snap_realm = NULL;
+ if (realm->ino == ci->i_vino.ino)
+ realm->inode = NULL;
+ spin_unlock(&realm->inodes_with_caps_lock);
+ ceph_put_snap_realm(mdsc, realm);
+ }
+
+ while ((n = rb_first(&ci->i_fragtree)) != NULL) {
+ frag = rb_entry(n, struct ceph_inode_frag, node);
+ rb_erase(n, &ci->i_fragtree);
+ kfree(frag);
+ }
+ ci->i_fragtree_nsplits = 0;
+
+ __ceph_destroy_xattrs(ci);
+ if (ci->i_xattrs.blob)
+ ceph_buffer_put(ci->i_xattrs.blob);
+ if (ci->i_xattrs.prealloc_blob)
+ ceph_buffer_put(ci->i_xattrs.prealloc_blob);
+
+ ceph_put_string(rcu_dereference_raw(ci->i_layout.pool_ns));
+}
+
+void ceph_destroy_inode(struct inode *inode)
+{
+ call_rcu(&inode->i_rcu, ceph_i_callback);
+}
+
+int ceph_drop_inode(struct inode *inode)
+{
+ /*
+ * Positve dentry and corresponding inode are always accompanied
+ * in MDS reply. So no need to keep inode in the cache after
+ * dropping all its aliases.
+ */
+ return 1;
+}
+
+static inline blkcnt_t calc_inode_blocks(u64 size)
+{
+ return (size + (1<<9) - 1) >> 9;
+}
+
+/*
+ * Helpers to fill in size, ctime, mtime, and atime. We have to be
+ * careful because either the client or MDS may have more up to date
+ * info, depending on which capabilities are held, and whether
+ * time_warp_seq or truncate_seq have increased. (Ordinarily, mtime
+ * and size are monotonically increasing, except when utimes() or
+ * truncate() increments the corresponding _seq values.)
+ */
+int ceph_fill_file_size(struct inode *inode, int issued,
+ u32 truncate_seq, u64 truncate_size, u64 size)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int queue_trunc = 0;
+
+ if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 ||
+ (truncate_seq == ci->i_truncate_seq && size > inode->i_size)) {
+ dout("size %lld -> %llu\n", inode->i_size, size);
+ if (size > 0 && S_ISDIR(inode->i_mode)) {
+ pr_err("fill_file_size non-zero size for directory\n");
+ size = 0;
+ }
+ i_size_write(inode, size);
+ inode->i_blocks = calc_inode_blocks(size);
+ ci->i_reported_size = size;
+ if (truncate_seq != ci->i_truncate_seq) {
+ dout("truncate_seq %u -> %u\n",
+ ci->i_truncate_seq, truncate_seq);
+ ci->i_truncate_seq = truncate_seq;
+
+ /* the MDS should have revoked these caps */
+ WARN_ON_ONCE(issued & (CEPH_CAP_FILE_EXCL |
+ CEPH_CAP_FILE_RD |
+ CEPH_CAP_FILE_WR |
+ CEPH_CAP_FILE_LAZYIO));
+ /*
+ * If we hold relevant caps, or in the case where we're
+ * not the only client referencing this file and we
+ * don't hold those caps, then we need to check whether
+ * the file is either opened or mmaped
+ */
+ if ((issued & (CEPH_CAP_FILE_CACHE|
+ CEPH_CAP_FILE_BUFFER)) ||
+ mapping_mapped(inode->i_mapping) ||
+ __ceph_caps_file_wanted(ci)) {
+ ci->i_truncate_pending++;
+ queue_trunc = 1;
+ }
+ }
+ }
+ if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) >= 0 &&
+ ci->i_truncate_size != truncate_size) {
+ dout("truncate_size %lld -> %llu\n", ci->i_truncate_size,
+ truncate_size);
+ ci->i_truncate_size = truncate_size;
+ }
+
+ if (queue_trunc)
+ ceph_fscache_invalidate(inode);
+
+ return queue_trunc;
+}
+
+void ceph_fill_file_time(struct inode *inode, int issued,
+ u64 time_warp_seq, struct timespec64 *ctime,
+ struct timespec64 *mtime, struct timespec64 *atime)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int warn = 0;
+
+ if (issued & (CEPH_CAP_FILE_EXCL|
+ CEPH_CAP_FILE_WR|
+ CEPH_CAP_FILE_BUFFER|
+ CEPH_CAP_AUTH_EXCL|
+ CEPH_CAP_XATTR_EXCL)) {
+ if (ci->i_version == 0 ||
+ timespec64_compare(ctime, &inode->i_ctime) > 0) {
+ dout("ctime %lld.%09ld -> %lld.%09ld inc w/ cap\n",
+ inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
+ ctime->tv_sec, ctime->tv_nsec);
+ inode->i_ctime = *ctime;
+ }
+ if (ci->i_version == 0 ||
+ ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) {
+ /* the MDS did a utimes() */
+ dout("mtime %lld.%09ld -> %lld.%09ld "
+ "tw %d -> %d\n",
+ inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
+ mtime->tv_sec, mtime->tv_nsec,
+ ci->i_time_warp_seq, (int)time_warp_seq);
+
+ inode->i_mtime = *mtime;
+ inode->i_atime = *atime;
+ ci->i_time_warp_seq = time_warp_seq;
+ } else if (time_warp_seq == ci->i_time_warp_seq) {
+ /* nobody did utimes(); take the max */
+ if (timespec64_compare(mtime, &inode->i_mtime) > 0) {
+ dout("mtime %lld.%09ld -> %lld.%09ld inc\n",
+ inode->i_mtime.tv_sec,
+ inode->i_mtime.tv_nsec,
+ mtime->tv_sec, mtime->tv_nsec);
+ inode->i_mtime = *mtime;
+ }
+ if (timespec64_compare(atime, &inode->i_atime) > 0) {
+ dout("atime %lld.%09ld -> %lld.%09ld inc\n",
+ inode->i_atime.tv_sec,
+ inode->i_atime.tv_nsec,
+ atime->tv_sec, atime->tv_nsec);
+ inode->i_atime = *atime;
+ }
+ } else if (issued & CEPH_CAP_FILE_EXCL) {
+ /* we did a utimes(); ignore mds values */
+ } else {
+ warn = 1;
+ }
+ } else {
+ /* we have no write|excl caps; whatever the MDS says is true */
+ if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) {
+ inode->i_ctime = *ctime;
+ inode->i_mtime = *mtime;
+ inode->i_atime = *atime;
+ ci->i_time_warp_seq = time_warp_seq;
+ } else {
+ warn = 1;
+ }
+ }
+ if (warn) /* time_warp_seq shouldn't go backwards */
+ dout("%p mds time_warp_seq %llu < %u\n",
+ inode, time_warp_seq, ci->i_time_warp_seq);
+}
+
+/*
+ * Populate an inode based on info from mds. May be called on new or
+ * existing inodes.
+ */
+static int fill_inode(struct inode *inode, struct page *locked_page,
+ struct ceph_mds_reply_info_in *iinfo,
+ struct ceph_mds_reply_dirfrag *dirinfo,
+ struct ceph_mds_session *session,
+ unsigned long ttl_from, int cap_fmode,
+ struct ceph_cap_reservation *caps_reservation)
+{
+ struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+ struct ceph_mds_reply_inode *info = iinfo->in;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int issued, new_issued, info_caps;
+ struct timespec64 mtime, atime, ctime;
+ struct ceph_buffer *xattr_blob = NULL;
+ struct ceph_buffer *old_blob = NULL;
+ struct ceph_string *pool_ns = NULL;
+ struct ceph_cap *new_cap = NULL;
+ int err = 0;
+ bool wake = false;
+ bool queue_trunc = false;
+ bool new_version = false;
+ bool fill_inline = false;
+
+ dout("fill_inode %p ino %llx.%llx v %llu had %llu\n",
+ inode, ceph_vinop(inode), le64_to_cpu(info->version),
+ ci->i_version);
+
+ info_caps = le32_to_cpu(info->cap.caps);
+
+ /* prealloc new cap struct */
+ if (info_caps && ceph_snap(inode) == CEPH_NOSNAP) {
+ new_cap = ceph_get_cap(mdsc, caps_reservation);
+ if (!new_cap)
+ return -ENOMEM;
+ }
+
+ /*
+ * prealloc xattr data, if it looks like we'll need it. only
+ * if len > 4 (meaning there are actually xattrs; the first 4
+ * bytes are the xattr count).
+ */
+ if (iinfo->xattr_len > 4) {
+ xattr_blob = ceph_buffer_new(iinfo->xattr_len, GFP_NOFS);
+ if (!xattr_blob)
+ pr_err("fill_inode ENOMEM xattr blob %d bytes\n",
+ iinfo->xattr_len);
+ }
+
+ if (iinfo->pool_ns_len > 0)
+ pool_ns = ceph_find_or_create_string(iinfo->pool_ns_data,
+ iinfo->pool_ns_len);
+
+ spin_lock(&ci->i_ceph_lock);
+
+ /*
+ * provided version will be odd if inode value is projected,
+ * even if stable. skip the update if we have newer stable
+ * info (ours>=theirs, e.g. due to racing mds replies), unless
+ * we are getting projected (unstable) info (in which case the
+ * version is odd, and we want ours>theirs).
+ * us them
+ * 2 2 skip
+ * 3 2 skip
+ * 3 3 update
+ */
+ if (ci->i_version == 0 ||
+ ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
+ le64_to_cpu(info->version) > (ci->i_version & ~1)))
+ new_version = true;
+
+ __ceph_caps_issued(ci, &issued);
+ issued |= __ceph_caps_dirty(ci);
+ new_issued = ~issued & info_caps;
+
+ /* update inode */
+ inode->i_rdev = le32_to_cpu(info->rdev);
+ /* directories have fl_stripe_unit set to zero */
+ if (le32_to_cpu(info->layout.fl_stripe_unit))
+ inode->i_blkbits =
+ fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
+ else
+ inode->i_blkbits = CEPH_BLOCK_SHIFT;
+
+ __ceph_update_quota(ci, iinfo->max_bytes, iinfo->max_files);
+
+ if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
+ (issued & CEPH_CAP_AUTH_EXCL) == 0) {
+ inode->i_mode = le32_to_cpu(info->mode);
+ inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(info->uid));
+ inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(info->gid));
+ dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
+ from_kuid(&init_user_ns, inode->i_uid),
+ from_kgid(&init_user_ns, inode->i_gid));
+ }
+
+ if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) &&
+ (issued & CEPH_CAP_LINK_EXCL) == 0)
+ set_nlink(inode, le32_to_cpu(info->nlink));
+
+ if (new_version || (new_issued & CEPH_CAP_ANY_RD)) {
+ /* be careful with mtime, atime, size */
+ ceph_decode_timespec64(&atime, &info->atime);
+ ceph_decode_timespec64(&mtime, &info->mtime);
+ ceph_decode_timespec64(&ctime, &info->ctime);
+ ceph_fill_file_time(inode, issued,
+ le32_to_cpu(info->time_warp_seq),
+ &ctime, &mtime, &atime);
+ }
+
+ if (new_version || (info_caps & CEPH_CAP_FILE_SHARED)) {
+ ci->i_files = le64_to_cpu(info->files);
+ ci->i_subdirs = le64_to_cpu(info->subdirs);
+ }
+
+ if (new_version ||
+ (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
+ s64 old_pool = ci->i_layout.pool_id;
+ struct ceph_string *old_ns;
+
+ ceph_file_layout_from_legacy(&ci->i_layout, &info->layout);
+ old_ns = rcu_dereference_protected(ci->i_layout.pool_ns,
+ lockdep_is_held(&ci->i_ceph_lock));
+ rcu_assign_pointer(ci->i_layout.pool_ns, pool_ns);
+
+ if (ci->i_layout.pool_id != old_pool || pool_ns != old_ns)
+ ci->i_ceph_flags &= ~CEPH_I_POOL_PERM;
+
+ pool_ns = old_ns;
+
+ queue_trunc = ceph_fill_file_size(inode, issued,
+ le32_to_cpu(info->truncate_seq),
+ le64_to_cpu(info->truncate_size),
+ le64_to_cpu(info->size));
+ /* only update max_size on auth cap */
+ if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
+ ci->i_max_size != le64_to_cpu(info->max_size)) {
+ dout("max_size %lld -> %llu\n", ci->i_max_size,
+ le64_to_cpu(info->max_size));
+ ci->i_max_size = le64_to_cpu(info->max_size);
+ }
+ }
+
+ /* layout and rstat are not tracked by capability, update them if
+ * the inode info is from auth mds */
+ if (new_version || (info->cap.flags & CEPH_CAP_FLAG_AUTH)) {
+ if (S_ISDIR(inode->i_mode)) {
+ ci->i_dir_layout = iinfo->dir_layout;
+ ci->i_rbytes = le64_to_cpu(info->rbytes);
+ ci->i_rfiles = le64_to_cpu(info->rfiles);
+ ci->i_rsubdirs = le64_to_cpu(info->rsubdirs);
+ ceph_decode_timespec64(&ci->i_rctime, &info->rctime);
+ }
+ }
+
+ /* xattrs */
+ /* note that if i_xattrs.len <= 4, i_xattrs.data will still be NULL. */
+ if ((ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) &&
+ le64_to_cpu(info->xattr_version) > ci->i_xattrs.version) {
+ if (ci->i_xattrs.blob)
+ old_blob = ci->i_xattrs.blob;
+ ci->i_xattrs.blob = xattr_blob;
+ if (xattr_blob)
+ memcpy(ci->i_xattrs.blob->vec.iov_base,
+ iinfo->xattr_data, iinfo->xattr_len);
+ ci->i_xattrs.version = le64_to_cpu(info->xattr_version);
+ ceph_forget_all_cached_acls(inode);
+ xattr_blob = NULL;
+ }
+
+ /* finally update i_version */
+ if (le64_to_cpu(info->version) > ci->i_version)
+ ci->i_version = le64_to_cpu(info->version);
+
+ inode->i_mapping->a_ops = &ceph_aops;
+
+ switch (inode->i_mode & S_IFMT) {
+ case S_IFIFO:
+ case S_IFBLK:
+ case S_IFCHR:
+ case S_IFSOCK:
+ init_special_inode(inode, inode->i_mode, inode->i_rdev);
+ inode->i_op = &ceph_file_iops;
+ break;
+ case S_IFREG:
+ inode->i_op = &ceph_file_iops;
+ inode->i_fop = &ceph_file_fops;
+ break;
+ case S_IFLNK:
+ inode->i_op = &ceph_symlink_iops;
+ if (!ci->i_symlink) {
+ u32 symlen = iinfo->symlink_len;
+ char *sym;
+
+ spin_unlock(&ci->i_ceph_lock);
+
+ if (symlen != i_size_read(inode)) {
+ pr_err("fill_inode %llx.%llx BAD symlink "
+ "size %lld\n", ceph_vinop(inode),
+ i_size_read(inode));
+ i_size_write(inode, symlen);
+ inode->i_blocks = calc_inode_blocks(symlen);
+ }
+
+ err = -ENOMEM;
+ sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS);
+ if (!sym)
+ goto out;
+
+ spin_lock(&ci->i_ceph_lock);
+ if (!ci->i_symlink)
+ ci->i_symlink = sym;
+ else
+ kfree(sym); /* lost a race */
+ }
+ inode->i_link = ci->i_symlink;
+ break;
+ case S_IFDIR:
+ inode->i_op = &ceph_dir_iops;
+ inode->i_fop = &ceph_dir_fops;
+ break;
+ default:
+ pr_err("fill_inode %llx.%llx BAD mode 0%o\n",
+ ceph_vinop(inode), inode->i_mode);
+ }
+
+ /* were we issued a capability? */
+ if (info_caps) {
+ if (ceph_snap(inode) == CEPH_NOSNAP) {
+ ceph_add_cap(inode, session,
+ le64_to_cpu(info->cap.cap_id),
+ cap_fmode, info_caps,
+ le32_to_cpu(info->cap.wanted),
+ le32_to_cpu(info->cap.seq),
+ le32_to_cpu(info->cap.mseq),
+ le64_to_cpu(info->cap.realm),
+ info->cap.flags, &new_cap);
+
+ /* set dir completion flag? */
+ if (S_ISDIR(inode->i_mode) &&
+ ci->i_files == 0 && ci->i_subdirs == 0 &&
+ (info_caps & CEPH_CAP_FILE_SHARED) &&
+ (issued & CEPH_CAP_FILE_EXCL) == 0 &&
+ !__ceph_dir_is_complete(ci)) {
+ dout(" marking %p complete (empty)\n", inode);
+ i_size_write(inode, 0);
+ __ceph_dir_set_complete(ci,
+ atomic64_read(&ci->i_release_count),
+ atomic64_read(&ci->i_ordered_count));
+ }
+
+ wake = true;
+ } else {
+ dout(" %p got snap_caps %s\n", inode,
+ ceph_cap_string(info_caps));
+ ci->i_snap_caps |= info_caps;
+ if (cap_fmode >= 0)
+ __ceph_get_fmode(ci, cap_fmode);
+ }
+ } else if (cap_fmode >= 0) {
+ pr_warn("mds issued no caps on %llx.%llx\n",
+ ceph_vinop(inode));
+ __ceph_get_fmode(ci, cap_fmode);
+ }
+
+ if (iinfo->inline_version > 0 &&
+ iinfo->inline_version >= ci->i_inline_version) {
+ int cache_caps = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
+ ci->i_inline_version = iinfo->inline_version;
+ if (ci->i_inline_version != CEPH_INLINE_NONE &&
+ (locked_page || (info_caps & cache_caps)))
+ fill_inline = true;
+ }
+
+ spin_unlock(&ci->i_ceph_lock);
+
+ if (fill_inline)
+ ceph_fill_inline_data(inode, locked_page,
+ iinfo->inline_data, iinfo->inline_len);
+
+ if (wake)
+ wake_up_all(&ci->i_cap_wq);
+
+ /* queue truncate if we saw i_size decrease */
+ if (queue_trunc)
+ ceph_queue_vmtruncate(inode);
+
+ /* populate frag tree */
+ if (S_ISDIR(inode->i_mode))
+ ceph_fill_fragtree(inode, &info->fragtree, dirinfo);
+
+ /* update delegation info? */
+ if (dirinfo)
+ ceph_fill_dirfrag(inode, dirinfo);
+
+ err = 0;
+out:
+ if (new_cap)
+ ceph_put_cap(mdsc, new_cap);
+ ceph_buffer_put(old_blob);
+ ceph_buffer_put(xattr_blob);
+ ceph_put_string(pool_ns);
+ return err;
+}
+
+/*
+ * caller should hold session s_mutex.
+ */
+static void update_dentry_lease(struct dentry *dentry,
+ struct ceph_mds_reply_lease *lease,
+ struct ceph_mds_session *session,
+ unsigned long from_time,
+ struct ceph_vino *tgt_vino,
+ struct ceph_vino *dir_vino)
+{
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
+ long unsigned duration = le32_to_cpu(lease->duration_ms);
+ long unsigned ttl = from_time + (duration * HZ) / 1000;
+ long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000;
+ struct inode *dir;
+ struct ceph_mds_session *old_lease_session = NULL;
+
+ /*
+ * Make sure dentry's inode matches tgt_vino. NULL tgt_vino means that
+ * we expect a negative dentry.
+ */
+ if (!tgt_vino && d_really_is_positive(dentry))
+ return;
+
+ if (tgt_vino && (d_really_is_negative(dentry) ||
+ !ceph_ino_compare(d_inode(dentry), tgt_vino)))
+ return;
+
+ spin_lock(&dentry->d_lock);
+ dout("update_dentry_lease %p duration %lu ms ttl %lu\n",
+ dentry, duration, ttl);
+
+ dir = d_inode(dentry->d_parent);
+
+ /* make sure parent matches dir_vino */
+ if (!ceph_ino_compare(dir, dir_vino))
+ goto out_unlock;
+
+ /* only track leases on regular dentries */
+ if (ceph_snap(dir) != CEPH_NOSNAP)
+ goto out_unlock;
+
+ di->lease_shared_gen = atomic_read(&ceph_inode(dir)->i_shared_gen);
+
+ if (duration == 0)
+ goto out_unlock;
+
+ if (di->lease_gen == session->s_cap_gen &&
+ time_before(ttl, di->time))
+ goto out_unlock; /* we already have a newer lease. */
+
+ if (di->lease_session && di->lease_session != session) {
+ old_lease_session = di->lease_session;
+ di->lease_session = NULL;
+ }
+
+ ceph_dentry_lru_touch(dentry);
+
+ if (!di->lease_session)
+ di->lease_session = ceph_get_mds_session(session);
+ di->lease_gen = session->s_cap_gen;
+ di->lease_seq = le32_to_cpu(lease->seq);
+ di->lease_renew_after = half_ttl;
+ di->lease_renew_from = 0;
+ di->time = ttl;
+out_unlock:
+ spin_unlock(&dentry->d_lock);
+ if (old_lease_session)
+ ceph_put_mds_session(old_lease_session);
+}
+
+/*
+ * splice a dentry to an inode.
+ * caller must hold directory i_mutex for this to be safe.
+ */
+static struct dentry *splice_dentry(struct dentry *dn, struct inode *in)
+{
+ struct dentry *realdn;
+
+ BUG_ON(d_inode(dn));
+
+ if (S_ISDIR(in->i_mode)) {
+ /* If inode is directory, d_splice_alias() below will remove
+ * 'realdn' from its origin parent. We need to ensure that
+ * origin parent's readdir cache will not reference 'realdn'
+ */
+ realdn = d_find_any_alias(in);
+ if (realdn) {
+ struct ceph_dentry_info *di = ceph_dentry(realdn);
+ spin_lock(&realdn->d_lock);
+
+ realdn->d_op->d_prune(realdn);
+
+ di->time = jiffies;
+ di->lease_shared_gen = 0;
+ di->offset = 0;
+
+ spin_unlock(&realdn->d_lock);
+ dput(realdn);
+ }
+ }
+
+ /* dn must be unhashed */
+ if (!d_unhashed(dn))
+ d_drop(dn);
+ realdn = d_splice_alias(in, dn);
+ if (IS_ERR(realdn)) {
+ pr_err("splice_dentry error %ld %p inode %p ino %llx.%llx\n",
+ PTR_ERR(realdn), dn, in, ceph_vinop(in));
+ dn = realdn;
+ /*
+ * Caller should release 'dn' in the case of error.
+ * If 'req->r_dentry' is passed to this function,
+ * caller should leave 'req->r_dentry' untouched.
+ */
+ goto out;
+ } else if (realdn) {
+ dout("dn %p (%d) spliced with %p (%d) "
+ "inode %p ino %llx.%llx\n",
+ dn, d_count(dn),
+ realdn, d_count(realdn),
+ d_inode(realdn), ceph_vinop(d_inode(realdn)));
+ dput(dn);
+ dn = realdn;
+ } else {
+ BUG_ON(!ceph_dentry(dn));
+ dout("dn %p attached to %p ino %llx.%llx\n",
+ dn, d_inode(dn), ceph_vinop(d_inode(dn)));
+ }
+out:
+ return dn;
+}
+
+/*
+ * Incorporate results into the local cache. This is either just
+ * one inode, or a directory, dentry, and possibly linked-to inode (e.g.,
+ * after a lookup).
+ *
+ * A reply may contain
+ * a directory inode along with a dentry.
+ * and/or a target inode
+ *
+ * Called with snap_rwsem (read).
+ */
+int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
+{
+ struct ceph_mds_session *session = req->r_session;
+ struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
+ struct inode *in = NULL;
+ struct ceph_vino tvino, dvino;
+ struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
+ int err = 0;
+
+ dout("fill_trace %p is_dentry %d is_target %d\n", req,
+ rinfo->head->is_dentry, rinfo->head->is_target);
+
+ if (!rinfo->head->is_target && !rinfo->head->is_dentry) {
+ dout("fill_trace reply is empty!\n");
+ if (rinfo->head->result == 0 && req->r_parent)
+ ceph_invalidate_dir_request(req);
+ return 0;
+ }
+
+ if (rinfo->head->is_dentry) {
+ struct inode *dir = req->r_parent;
+
+ if (dir) {
+ err = fill_inode(dir, NULL,
+ &rinfo->diri, rinfo->dirfrag,
+ session, req->r_request_started, -1,
+ &req->r_caps_reservation);
+ if (err < 0)
+ goto done;
+ } else {
+ WARN_ON_ONCE(1);
+ }
+
+ if (dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME) {
+ struct qstr dname;
+ struct dentry *dn, *parent;
+
+ BUG_ON(!rinfo->head->is_target);
+ BUG_ON(req->r_dentry);
+
+ parent = d_find_any_alias(dir);
+ BUG_ON(!parent);
+
+ dname.name = rinfo->dname;
+ dname.len = rinfo->dname_len;
+ dname.hash = full_name_hash(parent, dname.name, dname.len);
+ tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
+ tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
+retry_lookup:
+ dn = d_lookup(parent, &dname);
+ dout("d_lookup on parent=%p name=%.*s got %p\n",
+ parent, dname.len, dname.name, dn);
+
+ if (!dn) {
+ dn = d_alloc(parent, &dname);
+ dout("d_alloc %p '%.*s' = %p\n", parent,
+ dname.len, dname.name, dn);
+ if (!dn) {
+ dput(parent);
+ err = -ENOMEM;
+ goto done;
+ }
+ err = 0;
+ } else if (d_really_is_positive(dn) &&
+ (ceph_ino(d_inode(dn)) != tvino.ino ||
+ ceph_snap(d_inode(dn)) != tvino.snap)) {
+ dout(" dn %p points to wrong inode %p\n",
+ dn, d_inode(dn));
+ ceph_dir_clear_ordered(dir);
+ d_delete(dn);
+ dput(dn);
+ goto retry_lookup;
+ }
+
+ req->r_dentry = dn;
+ dput(parent);
+ }
+ }
+
+ if (rinfo->head->is_target) {
+ tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
+ tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
+
+ in = ceph_get_inode(sb, tvino);
+ if (IS_ERR(in)) {
+ err = PTR_ERR(in);
+ goto done;
+ }
+ req->r_target_inode = in;
+
+ err = fill_inode(in, req->r_locked_page, &rinfo->targeti, NULL,
+ session, req->r_request_started,
+ (!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) &&
+ rinfo->head->result == 0) ? req->r_fmode : -1,
+ &req->r_caps_reservation);
+ if (err < 0) {
+ pr_err("fill_inode badness %p %llx.%llx\n",
+ in, ceph_vinop(in));
+ goto done;
+ }
+ }
+
+ /*
+ * ignore null lease/binding on snapdir ENOENT, or else we
+ * will have trouble splicing in the virtual snapdir later
+ */
+ if (rinfo->head->is_dentry &&
+ !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) &&
+ test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) &&
+ (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name,
+ fsc->mount_options->snapdir_name,
+ req->r_dentry->d_name.len))) {
+ /*
+ * lookup link rename : null -> possibly existing inode
+ * mknod symlink mkdir : null -> new inode
+ * unlink : linked -> null
+ */
+ struct inode *dir = req->r_parent;
+ struct dentry *dn = req->r_dentry;
+ bool have_dir_cap, have_lease;
+
+ BUG_ON(!dn);
+ BUG_ON(!dir);
+ BUG_ON(d_inode(dn->d_parent) != dir);
+
+ dvino.ino = le64_to_cpu(rinfo->diri.in->ino);
+ dvino.snap = le64_to_cpu(rinfo->diri.in->snapid);
+
+ BUG_ON(ceph_ino(dir) != dvino.ino);
+ BUG_ON(ceph_snap(dir) != dvino.snap);
+
+ /* do we have a lease on the whole dir? */
+ have_dir_cap =
+ (le32_to_cpu(rinfo->diri.in->cap.caps) &
+ CEPH_CAP_FILE_SHARED);
+
+ /* do we have a dn lease? */
+ have_lease = have_dir_cap ||
+ le32_to_cpu(rinfo->dlease->duration_ms);
+ if (!have_lease)
+ dout("fill_trace no dentry lease or dir cap\n");
+
+ /* rename? */
+ if (req->r_old_dentry && req->r_op == CEPH_MDS_OP_RENAME) {
+ struct inode *olddir = req->r_old_dentry_dir;
+ BUG_ON(!olddir);
+
+ dout(" src %p '%pd' dst %p '%pd'\n",
+ req->r_old_dentry,
+ req->r_old_dentry,
+ dn, dn);
+ dout("fill_trace doing d_move %p -> %p\n",
+ req->r_old_dentry, dn);
+
+ /* d_move screws up sibling dentries' offsets */
+ ceph_dir_clear_ordered(dir);
+ ceph_dir_clear_ordered(olddir);
+
+ d_move(req->r_old_dentry, dn);
+ dout(" src %p '%pd' dst %p '%pd'\n",
+ req->r_old_dentry,
+ req->r_old_dentry,
+ dn, dn);
+
+ /* ensure target dentry is invalidated, despite
+ rehashing bug in vfs_rename_dir */
+ ceph_invalidate_dentry_lease(dn);
+
+ dout("dn %p gets new offset %lld\n", req->r_old_dentry,
+ ceph_dentry(req->r_old_dentry)->offset);
+
+ dn = req->r_old_dentry; /* use old_dentry */
+ }
+
+ /* null dentry? */
+ if (!rinfo->head->is_target) {
+ dout("fill_trace null dentry\n");
+ if (d_really_is_positive(dn)) {
+ dout("d_delete %p\n", dn);
+ ceph_dir_clear_ordered(dir);
+ d_delete(dn);
+ } else if (have_lease) {
+ if (d_unhashed(dn))
+ d_add(dn, NULL);
+ update_dentry_lease(dn, rinfo->dlease,
+ session,
+ req->r_request_started,
+ NULL, &dvino);
+ }
+ goto done;
+ }
+
+ /* attach proper inode */
+ if (d_really_is_negative(dn)) {
+ ceph_dir_clear_ordered(dir);
+ ihold(in);
+ dn = splice_dentry(dn, in);
+ if (IS_ERR(dn)) {
+ err = PTR_ERR(dn);
+ goto done;
+ }
+ req->r_dentry = dn; /* may have spliced */
+ } else if (d_really_is_positive(dn) && d_inode(dn) != in) {
+ dout(" %p links to %p %llx.%llx, not %llx.%llx\n",
+ dn, d_inode(dn), ceph_vinop(d_inode(dn)),
+ ceph_vinop(in));
+ d_invalidate(dn);
+ have_lease = false;
+ }
+
+ if (have_lease) {
+ tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
+ tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
+ update_dentry_lease(dn, rinfo->dlease, session,
+ req->r_request_started,
+ &tvino, &dvino);
+ }
+ dout(" final dn %p\n", dn);
+ } else if ((req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
+ req->r_op == CEPH_MDS_OP_MKSNAP) &&
+ test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) &&
+ !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
+ struct dentry *dn = req->r_dentry;
+ struct inode *dir = req->r_parent;
+
+ /* fill out a snapdir LOOKUPSNAP dentry */
+ BUG_ON(!dn);
+ BUG_ON(!dir);
+ BUG_ON(ceph_snap(dir) != CEPH_SNAPDIR);
+ dout(" linking snapped dir %p to dn %p\n", in, dn);
+ ceph_dir_clear_ordered(dir);
+ ihold(in);
+ dn = splice_dentry(dn, in);
+ if (IS_ERR(dn)) {
+ err = PTR_ERR(dn);
+ goto done;
+ }
+ req->r_dentry = dn; /* may have spliced */
+ } else if (rinfo->head->is_dentry) {
+ struct ceph_vino *ptvino = NULL;
+
+ if ((le32_to_cpu(rinfo->diri.in->cap.caps) & CEPH_CAP_FILE_SHARED) ||
+ le32_to_cpu(rinfo->dlease->duration_ms)) {
+ dvino.ino = le64_to_cpu(rinfo->diri.in->ino);
+ dvino.snap = le64_to_cpu(rinfo->diri.in->snapid);
+
+ if (rinfo->head->is_target) {
+ tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
+ tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
+ ptvino = &tvino;
+ }
+
+ update_dentry_lease(req->r_dentry, rinfo->dlease,
+ session, req->r_request_started, ptvino,
+ &dvino);
+ } else {
+ dout("%s: no dentry lease or dir cap\n", __func__);
+ }
+ }
+done:
+ dout("fill_trace done err=%d\n", err);
+ return err;
+}
+
+/*
+ * Prepopulate our cache with readdir results, leases, etc.
+ */
+static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
+ struct ceph_mds_session *session)
+{
+ struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
+ int i, err = 0;
+
+ for (i = 0; i < rinfo->dir_nr; i++) {
+ struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
+ struct ceph_vino vino;
+ struct inode *in;
+ int rc;
+
+ vino.ino = le64_to_cpu(rde->inode.in->ino);
+ vino.snap = le64_to_cpu(rde->inode.in->snapid);
+
+ in = ceph_get_inode(req->r_dentry->d_sb, vino);
+ if (IS_ERR(in)) {
+ err = PTR_ERR(in);
+ dout("new_inode badness got %d\n", err);
+ continue;
+ }
+ rc = fill_inode(in, NULL, &rde->inode, NULL, session,
+ req->r_request_started, -1,
+ &req->r_caps_reservation);
+ if (rc < 0) {
+ pr_err("fill_inode badness on %p got %d\n", in, rc);
+ err = rc;
+ }
+ iput(in);
+ }
+
+ return err;
+}
+
+void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl)
+{
+ if (ctl->page) {
+ kunmap(ctl->page);
+ put_page(ctl->page);
+ ctl->page = NULL;
+ }
+}
+
+static int fill_readdir_cache(struct inode *dir, struct dentry *dn,
+ struct ceph_readdir_cache_control *ctl,
+ struct ceph_mds_request *req)
+{
+ struct ceph_inode_info *ci = ceph_inode(dir);
+ unsigned nsize = PAGE_SIZE / sizeof(struct dentry*);
+ unsigned idx = ctl->index % nsize;
+ pgoff_t pgoff = ctl->index / nsize;
+
+ if (!ctl->page || pgoff != page_index(ctl->page)) {
+ ceph_readdir_cache_release(ctl);
+ if (idx == 0)
+ ctl->page = grab_cache_page(&dir->i_data, pgoff);
+ else
+ ctl->page = find_lock_page(&dir->i_data, pgoff);
+ if (!ctl->page) {
+ ctl->index = -1;
+ return idx == 0 ? -ENOMEM : 0;
+ }
+ /* reading/filling the cache are serialized by
+ * i_mutex, no need to use page lock */
+ unlock_page(ctl->page);
+ ctl->dentries = kmap(ctl->page);
+ if (idx == 0)
+ memset(ctl->dentries, 0, PAGE_SIZE);
+ }
+
+ if (req->r_dir_release_cnt == atomic64_read(&ci->i_release_count) &&
+ req->r_dir_ordered_cnt == atomic64_read(&ci->i_ordered_count)) {
+ dout("readdir cache dn %p idx %d\n", dn, ctl->index);
+ ctl->dentries[idx] = dn;
+ ctl->index++;
+ } else {
+ dout("disable readdir cache\n");
+ ctl->index = -1;
+ }
+ return 0;
+}
+
+int ceph_readdir_prepopulate(struct ceph_mds_request *req,
+ struct ceph_mds_session *session)
+{
+ struct dentry *parent = req->r_dentry;
+ struct ceph_inode_info *ci = ceph_inode(d_inode(parent));
+ struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
+ struct qstr dname;
+ struct dentry *dn;
+ struct inode *in;
+ int err = 0, skipped = 0, ret, i;
+ struct ceph_mds_request_head *rhead = req->r_request->front.iov_base;
+ u32 frag = le32_to_cpu(rhead->args.readdir.frag);
+ u32 last_hash = 0;
+ u32 fpos_offset;
+ struct ceph_readdir_cache_control cache_ctl = {};
+
+ if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags))
+ return readdir_prepopulate_inodes_only(req, session);
+
+ if (rinfo->hash_order) {
+ if (req->r_path2) {
+ last_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash,
+ req->r_path2,
+ strlen(req->r_path2));
+ last_hash = ceph_frag_value(last_hash);
+ } else if (rinfo->offset_hash) {
+ /* mds understands offset_hash */
+ WARN_ON_ONCE(req->r_readdir_offset != 2);
+ last_hash = le32_to_cpu(rhead->args.readdir.offset_hash);
+ }
+ }
+
+ if (rinfo->dir_dir &&
+ le32_to_cpu(rinfo->dir_dir->frag) != frag) {
+ dout("readdir_prepopulate got new frag %x -> %x\n",
+ frag, le32_to_cpu(rinfo->dir_dir->frag));
+ frag = le32_to_cpu(rinfo->dir_dir->frag);
+ if (!rinfo->hash_order)
+ req->r_readdir_offset = 2;
+ }
+
+ if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) {
+ dout("readdir_prepopulate %d items under SNAPDIR dn %p\n",
+ rinfo->dir_nr, parent);
+ } else {
+ dout("readdir_prepopulate %d items under dn %p\n",
+ rinfo->dir_nr, parent);
+ if (rinfo->dir_dir)
+ ceph_fill_dirfrag(d_inode(parent), rinfo->dir_dir);
+
+ if (ceph_frag_is_leftmost(frag) &&
+ req->r_readdir_offset == 2 &&
+ !(rinfo->hash_order && last_hash)) {
+ /* note dir version at start of readdir so we can
+ * tell if any dentries get dropped */
+ req->r_dir_release_cnt =
+ atomic64_read(&ci->i_release_count);
+ req->r_dir_ordered_cnt =
+ atomic64_read(&ci->i_ordered_count);
+ req->r_readdir_cache_idx = 0;
+ }
+ }
+
+ cache_ctl.index = req->r_readdir_cache_idx;
+ fpos_offset = req->r_readdir_offset;
+
+ /* FIXME: release caps/leases if error occurs */
+ for (i = 0; i < rinfo->dir_nr; i++) {
+ struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
+ struct ceph_vino tvino, dvino;
+
+ dname.name = rde->name;
+ dname.len = rde->name_len;
+ dname.hash = full_name_hash(parent, dname.name, dname.len);
+
+ tvino.ino = le64_to_cpu(rde->inode.in->ino);
+ tvino.snap = le64_to_cpu(rde->inode.in->snapid);
+
+ if (rinfo->hash_order) {
+ u32 hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash,
+ rde->name, rde->name_len);
+ hash = ceph_frag_value(hash);
+ if (hash != last_hash)
+ fpos_offset = 2;
+ last_hash = hash;
+ rde->offset = ceph_make_fpos(hash, fpos_offset++, true);
+ } else {
+ rde->offset = ceph_make_fpos(frag, fpos_offset++, false);
+ }
+
+retry_lookup:
+ dn = d_lookup(parent, &dname);
+ dout("d_lookup on parent=%p name=%.*s got %p\n",
+ parent, dname.len, dname.name, dn);
+
+ if (!dn) {
+ dn = d_alloc(parent, &dname);
+ dout("d_alloc %p '%.*s' = %p\n", parent,
+ dname.len, dname.name, dn);
+ if (!dn) {
+ dout("d_alloc badness\n");
+ err = -ENOMEM;
+ goto out;
+ }
+ } else if (d_really_is_positive(dn) &&
+ (ceph_ino(d_inode(dn)) != tvino.ino ||
+ ceph_snap(d_inode(dn)) != tvino.snap)) {
+ struct ceph_dentry_info *di = ceph_dentry(dn);
+ dout(" dn %p points to wrong inode %p\n",
+ dn, d_inode(dn));
+
+ spin_lock(&dn->d_lock);
+ if (di->offset > 0 &&
+ di->lease_shared_gen ==
+ atomic_read(&ci->i_shared_gen)) {
+ __ceph_dir_clear_ordered(ci);
+ di->offset = 0;
+ }
+ spin_unlock(&dn->d_lock);
+
+ d_delete(dn);
+ dput(dn);
+ goto retry_lookup;
+ }
+
+ /* inode */
+ if (d_really_is_positive(dn)) {
+ in = d_inode(dn);
+ } else {
+ in = ceph_get_inode(parent->d_sb, tvino);
+ if (IS_ERR(in)) {
+ dout("new_inode badness\n");
+ d_drop(dn);
+ dput(dn);
+ err = PTR_ERR(in);
+ goto out;
+ }
+ }
+
+ ret = fill_inode(in, NULL, &rde->inode, NULL, session,
+ req->r_request_started, -1,
+ &req->r_caps_reservation);
+ if (ret < 0) {
+ pr_err("fill_inode badness on %p\n", in);
+ if (d_really_is_negative(dn))
+ iput(in);
+ d_drop(dn);
+ err = ret;
+ goto next_item;
+ }
+
+ if (d_really_is_negative(dn)) {
+ struct dentry *realdn;
+
+ if (ceph_security_xattr_deadlock(in)) {
+ dout(" skip splicing dn %p to inode %p"
+ " (security xattr deadlock)\n", dn, in);
+ iput(in);
+ skipped++;
+ goto next_item;
+ }
+
+ realdn = splice_dentry(dn, in);
+ if (IS_ERR(realdn)) {
+ err = PTR_ERR(realdn);
+ d_drop(dn);
+ goto next_item;
+ }
+ dn = realdn;
+ }
+
+ ceph_dentry(dn)->offset = rde->offset;
+
+ dvino = ceph_vino(d_inode(parent));
+ update_dentry_lease(dn, rde->lease, req->r_session,
+ req->r_request_started, &tvino, &dvino);
+
+ if (err == 0 && skipped == 0 && cache_ctl.index >= 0) {
+ ret = fill_readdir_cache(d_inode(parent), dn,
+ &cache_ctl, req);
+ if (ret < 0)
+ err = ret;
+ }
+next_item:
+ if (dn)
+ dput(dn);
+ }
+out:
+ if (err == 0 && skipped == 0) {
+ set_bit(CEPH_MDS_R_DID_PREPOPULATE, &req->r_req_flags);
+ req->r_readdir_cache_idx = cache_ctl.index;
+ }
+ ceph_readdir_cache_release(&cache_ctl);
+ dout("readdir_prepopulate done\n");
+ return err;
+}
+
+bool ceph_inode_set_size(struct inode *inode, loff_t size)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ bool ret;
+
+ spin_lock(&ci->i_ceph_lock);
+ dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size);
+ i_size_write(inode, size);
+ inode->i_blocks = calc_inode_blocks(size);
+
+ ret = __ceph_should_report_size(ci);
+
+ spin_unlock(&ci->i_ceph_lock);
+ return ret;
+}
+
+/*
+ * Write back inode data in a worker thread. (This can't be done
+ * in the message handler context.)
+ */
+void ceph_queue_writeback(struct inode *inode)
+{
+ ihold(inode);
+ if (queue_work(ceph_inode_to_client(inode)->wb_wq,
+ &ceph_inode(inode)->i_wb_work)) {
+ dout("ceph_queue_writeback %p\n", inode);
+ } else {
+ dout("ceph_queue_writeback %p failed\n", inode);
+ iput(inode);
+ }
+}
+
+static void ceph_writeback_work(struct work_struct *work)
+{
+ struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
+ i_wb_work);
+ struct inode *inode = &ci->vfs_inode;
+
+ dout("writeback %p\n", inode);
+ filemap_fdatawrite(&inode->i_data);
+ iput(inode);
+}
+
+/*
+ * queue an async invalidation
+ */
+void ceph_queue_invalidate(struct inode *inode)
+{
+ ihold(inode);
+ if (queue_work(ceph_inode_to_client(inode)->pg_inv_wq,
+ &ceph_inode(inode)->i_pg_inv_work)) {
+ dout("ceph_queue_invalidate %p\n", inode);
+ } else {
+ dout("ceph_queue_invalidate %p failed\n", inode);
+ iput(inode);
+ }
+}
+
+/*
+ * Invalidate inode pages in a worker thread. (This can't be done
+ * in the message handler context.)
+ */
+static void ceph_invalidate_work(struct work_struct *work)
+{
+ struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
+ i_pg_inv_work);
+ struct inode *inode = &ci->vfs_inode;
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ u32 orig_gen;
+ int check = 0;
+
+ mutex_lock(&ci->i_truncate_mutex);
+
+ if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
+ pr_warn_ratelimited("invalidate_pages %p %lld forced umount\n",
+ inode, ceph_ino(inode));
+ mapping_set_error(inode->i_mapping, -EIO);
+ truncate_pagecache(inode, 0);
+ mutex_unlock(&ci->i_truncate_mutex);
+ goto out;
+ }
+
+ spin_lock(&ci->i_ceph_lock);
+ dout("invalidate_pages %p gen %d revoking %d\n", inode,
+ ci->i_rdcache_gen, ci->i_rdcache_revoking);
+ if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
+ if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE))
+ check = 1;
+ spin_unlock(&ci->i_ceph_lock);
+ mutex_unlock(&ci->i_truncate_mutex);
+ goto out;
+ }
+ orig_gen = ci->i_rdcache_gen;
+ spin_unlock(&ci->i_ceph_lock);
+
+ ceph_fscache_invalidate(inode);
+ if (invalidate_inode_pages2(inode->i_mapping) < 0) {
+ pr_err("invalidate_pages %p fails\n", inode);
+ }
+
+ spin_lock(&ci->i_ceph_lock);
+ if (orig_gen == ci->i_rdcache_gen &&
+ orig_gen == ci->i_rdcache_revoking) {
+ dout("invalidate_pages %p gen %d successful\n", inode,
+ ci->i_rdcache_gen);
+ ci->i_rdcache_revoking--;
+ check = 1;
+ } else {
+ dout("invalidate_pages %p gen %d raced, now %d revoking %d\n",
+ inode, orig_gen, ci->i_rdcache_gen,
+ ci->i_rdcache_revoking);
+ if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE))
+ check = 1;
+ }
+ spin_unlock(&ci->i_ceph_lock);
+ mutex_unlock(&ci->i_truncate_mutex);
+out:
+ if (check)
+ ceph_check_caps(ci, 0, NULL);
+ iput(inode);
+}
+
+
+/*
+ * called by trunc_wq;
+ *
+ * We also truncate in a separate thread as well.
+ */
+static void ceph_vmtruncate_work(struct work_struct *work)
+{
+ struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
+ i_vmtruncate_work);
+ struct inode *inode = &ci->vfs_inode;
+
+ dout("vmtruncate_work %p\n", inode);
+ __ceph_do_pending_vmtruncate(inode);
+ iput(inode);
+}
+
+/*
+ * Queue an async vmtruncate. If we fail to queue work, we will handle
+ * the truncation the next time we call __ceph_do_pending_vmtruncate.
+ */
+void ceph_queue_vmtruncate(struct inode *inode)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+
+ ihold(inode);
+
+ if (queue_work(ceph_sb_to_client(inode->i_sb)->trunc_wq,
+ &ci->i_vmtruncate_work)) {
+ dout("ceph_queue_vmtruncate %p\n", inode);
+ } else {
+ dout("ceph_queue_vmtruncate %p failed, pending=%d\n",
+ inode, ci->i_truncate_pending);
+ iput(inode);
+ }
+}
+
+/*
+ * Make sure any pending truncation is applied before doing anything
+ * that may depend on it.
+ */
+void __ceph_do_pending_vmtruncate(struct inode *inode)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ u64 to;
+ int wrbuffer_refs, finish = 0;
+
+ mutex_lock(&ci->i_truncate_mutex);
+retry:
+ spin_lock(&ci->i_ceph_lock);
+ if (ci->i_truncate_pending == 0) {
+ dout("__do_pending_vmtruncate %p none pending\n", inode);
+ spin_unlock(&ci->i_ceph_lock);
+ mutex_unlock(&ci->i_truncate_mutex);
+ return;
+ }
+
+ /*
+ * make sure any dirty snapped pages are flushed before we
+ * possibly truncate them.. so write AND block!
+ */
+ if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) {
+ spin_unlock(&ci->i_ceph_lock);
+ dout("__do_pending_vmtruncate %p flushing snaps first\n",
+ inode);
+ filemap_write_and_wait_range(&inode->i_data, 0,
+ inode->i_sb->s_maxbytes);
+ goto retry;
+ }
+
+ /* there should be no reader or writer */
+ WARN_ON_ONCE(ci->i_rd_ref || ci->i_wr_ref);
+
+ to = ci->i_truncate_size;
+ wrbuffer_refs = ci->i_wrbuffer_ref;
+ dout("__do_pending_vmtruncate %p (%d) to %lld\n", inode,
+ ci->i_truncate_pending, to);
+ spin_unlock(&ci->i_ceph_lock);
+
+ truncate_pagecache(inode, to);
+
+ spin_lock(&ci->i_ceph_lock);
+ if (to == ci->i_truncate_size) {
+ ci->i_truncate_pending = 0;
+ finish = 1;
+ }
+ spin_unlock(&ci->i_ceph_lock);
+ if (!finish)
+ goto retry;
+
+ mutex_unlock(&ci->i_truncate_mutex);
+
+ if (wrbuffer_refs == 0)
+ ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
+
+ wake_up_all(&ci->i_cap_wq);
+}
+
+/*
+ * symlinks
+ */
+static const struct inode_operations ceph_symlink_iops = {
+ .get_link = simple_get_link,
+ .setattr = ceph_setattr,
+ .getattr = ceph_getattr,
+ .listxattr = ceph_listxattr,
+};
+
+int __ceph_setattr(struct inode *inode, struct iattr *attr)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ const unsigned int ia_valid = attr->ia_valid;
+ struct ceph_mds_request *req;
+ struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_cap_flush *prealloc_cf;
+ int issued;
+ int release = 0, dirtied = 0;
+ int mask = 0;
+ int err = 0;
+ int inode_dirty_flags = 0;
+ bool lock_snap_rwsem = false;
+
+ prealloc_cf = ceph_alloc_cap_flush();
+ if (!prealloc_cf)
+ return -ENOMEM;
+
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETATTR,
+ USE_AUTH_MDS);
+ if (IS_ERR(req)) {
+ ceph_free_cap_flush(prealloc_cf);
+ return PTR_ERR(req);
+ }
+
+ spin_lock(&ci->i_ceph_lock);
+ issued = __ceph_caps_issued(ci, NULL);
+
+ if (!ci->i_head_snapc &&
+ (issued & (CEPH_CAP_ANY_EXCL | CEPH_CAP_FILE_WR))) {
+ lock_snap_rwsem = true;
+ if (!down_read_trylock(&mdsc->snap_rwsem)) {
+ spin_unlock(&ci->i_ceph_lock);
+ down_read(&mdsc->snap_rwsem);
+ spin_lock(&ci->i_ceph_lock);
+ issued = __ceph_caps_issued(ci, NULL);
+ }
+ }
+
+ dout("setattr %p issued %s\n", inode, ceph_cap_string(issued));
+
+ if (ia_valid & ATTR_UID) {
+ dout("setattr %p uid %d -> %d\n", inode,
+ from_kuid(&init_user_ns, inode->i_uid),
+ from_kuid(&init_user_ns, attr->ia_uid));
+ if (issued & CEPH_CAP_AUTH_EXCL) {
+ inode->i_uid = attr->ia_uid;
+ dirtied |= CEPH_CAP_AUTH_EXCL;
+ } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
+ !uid_eq(attr->ia_uid, inode->i_uid)) {
+ req->r_args.setattr.uid = cpu_to_le32(
+ from_kuid(&init_user_ns, attr->ia_uid));
+ mask |= CEPH_SETATTR_UID;
+ release |= CEPH_CAP_AUTH_SHARED;
+ }
+ }
+ if (ia_valid & ATTR_GID) {
+ dout("setattr %p gid %d -> %d\n", inode,
+ from_kgid(&init_user_ns, inode->i_gid),
+ from_kgid(&init_user_ns, attr->ia_gid));
+ if (issued & CEPH_CAP_AUTH_EXCL) {
+ inode->i_gid = attr->ia_gid;
+ dirtied |= CEPH_CAP_AUTH_EXCL;
+ } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
+ !gid_eq(attr->ia_gid, inode->i_gid)) {
+ req->r_args.setattr.gid = cpu_to_le32(
+ from_kgid(&init_user_ns, attr->ia_gid));
+ mask |= CEPH_SETATTR_GID;
+ release |= CEPH_CAP_AUTH_SHARED;
+ }
+ }
+ if (ia_valid & ATTR_MODE) {
+ dout("setattr %p mode 0%o -> 0%o\n", inode, inode->i_mode,
+ attr->ia_mode);
+ if (issued & CEPH_CAP_AUTH_EXCL) {
+ inode->i_mode = attr->ia_mode;
+ dirtied |= CEPH_CAP_AUTH_EXCL;
+ } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
+ attr->ia_mode != inode->i_mode) {
+ inode->i_mode = attr->ia_mode;
+ req->r_args.setattr.mode = cpu_to_le32(attr->ia_mode);
+ mask |= CEPH_SETATTR_MODE;
+ release |= CEPH_CAP_AUTH_SHARED;
+ }
+ }
+
+ if (ia_valid & ATTR_ATIME) {
+ dout("setattr %p atime %lld.%ld -> %lld.%ld\n", inode,
+ inode->i_atime.tv_sec, inode->i_atime.tv_nsec,
+ attr->ia_atime.tv_sec, attr->ia_atime.tv_nsec);
+ if (issued & CEPH_CAP_FILE_EXCL) {
+ ci->i_time_warp_seq++;
+ inode->i_atime = attr->ia_atime;
+ dirtied |= CEPH_CAP_FILE_EXCL;
+ } else if ((issued & CEPH_CAP_FILE_WR) &&
+ timespec64_compare(&inode->i_atime,
+ &attr->ia_atime) < 0) {
+ inode->i_atime = attr->ia_atime;
+ dirtied |= CEPH_CAP_FILE_WR;
+ } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
+ !timespec64_equal(&inode->i_atime, &attr->ia_atime)) {
+ ceph_encode_timespec64(&req->r_args.setattr.atime,
+ &attr->ia_atime);
+ mask |= CEPH_SETATTR_ATIME;
+ release |= CEPH_CAP_FILE_SHARED |
+ CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
+ }
+ }
+ if (ia_valid & ATTR_MTIME) {
+ dout("setattr %p mtime %lld.%ld -> %lld.%ld\n", inode,
+ inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
+ attr->ia_mtime.tv_sec, attr->ia_mtime.tv_nsec);
+ if (issued & CEPH_CAP_FILE_EXCL) {
+ ci->i_time_warp_seq++;
+ inode->i_mtime = attr->ia_mtime;
+ dirtied |= CEPH_CAP_FILE_EXCL;
+ } else if ((issued & CEPH_CAP_FILE_WR) &&
+ timespec64_compare(&inode->i_mtime,
+ &attr->ia_mtime) < 0) {
+ inode->i_mtime = attr->ia_mtime;
+ dirtied |= CEPH_CAP_FILE_WR;
+ } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
+ !timespec64_equal(&inode->i_mtime, &attr->ia_mtime)) {
+ ceph_encode_timespec64(&req->r_args.setattr.mtime,
+ &attr->ia_mtime);
+ mask |= CEPH_SETATTR_MTIME;
+ release |= CEPH_CAP_FILE_SHARED |
+ CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
+ }
+ }
+ if (ia_valid & ATTR_SIZE) {
+ dout("setattr %p size %lld -> %lld\n", inode,
+ inode->i_size, attr->ia_size);
+ if ((issued & CEPH_CAP_FILE_EXCL) &&
+ attr->ia_size > inode->i_size) {
+ i_size_write(inode, attr->ia_size);
+ inode->i_blocks = calc_inode_blocks(attr->ia_size);
+ ci->i_reported_size = attr->ia_size;
+ dirtied |= CEPH_CAP_FILE_EXCL;
+ } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
+ attr->ia_size != inode->i_size) {
+ req->r_args.setattr.size = cpu_to_le64(attr->ia_size);
+ req->r_args.setattr.old_size =
+ cpu_to_le64(inode->i_size);
+ mask |= CEPH_SETATTR_SIZE;
+ release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL |
+ CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
+ }
+ }
+
+ /* these do nothing */
+ if (ia_valid & ATTR_CTIME) {
+ bool only = (ia_valid & (ATTR_SIZE|ATTR_MTIME|ATTR_ATIME|
+ ATTR_MODE|ATTR_UID|ATTR_GID)) == 0;
+ dout("setattr %p ctime %lld.%ld -> %lld.%ld (%s)\n", inode,
+ inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
+ attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec,
+ only ? "ctime only" : "ignored");
+ if (only) {
+ /*
+ * if kernel wants to dirty ctime but nothing else,
+ * we need to choose a cap to dirty under, or do
+ * a almost-no-op setattr
+ */
+ if (issued & CEPH_CAP_AUTH_EXCL)
+ dirtied |= CEPH_CAP_AUTH_EXCL;
+ else if (issued & CEPH_CAP_FILE_EXCL)
+ dirtied |= CEPH_CAP_FILE_EXCL;
+ else if (issued & CEPH_CAP_XATTR_EXCL)
+ dirtied |= CEPH_CAP_XATTR_EXCL;
+ else
+ mask |= CEPH_SETATTR_CTIME;
+ }
+ }
+ if (ia_valid & ATTR_FILE)
+ dout("setattr %p ATTR_FILE ... hrm!\n", inode);
+
+ if (dirtied) {
+ inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied,
+ &prealloc_cf);
+ inode->i_ctime = attr->ia_ctime;
+ }
+
+ release &= issued;
+ spin_unlock(&ci->i_ceph_lock);
+ if (lock_snap_rwsem)
+ up_read(&mdsc->snap_rwsem);
+
+ if (inode_dirty_flags)
+ __mark_inode_dirty(inode, inode_dirty_flags);
+
+
+ if (mask) {
+ req->r_inode = inode;
+ ihold(inode);
+ req->r_inode_drop = release;
+ req->r_args.setattr.mask = cpu_to_le32(mask);
+ req->r_num_caps = 1;
+ req->r_stamp = attr->ia_ctime;
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
+ }
+ dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err,
+ ceph_cap_string(dirtied), mask);
+
+ ceph_mdsc_put_request(req);
+ ceph_free_cap_flush(prealloc_cf);
+
+ if (err >= 0 && (mask & CEPH_SETATTR_SIZE))
+ __ceph_do_pending_vmtruncate(inode);
+
+ return err;
+}
+
+/*
+ * setattr
+ */
+int ceph_setattr(struct dentry *dentry, struct iattr *attr)
+{
+ struct inode *inode = d_inode(dentry);
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ int err;
+
+ if (ceph_snap(inode) != CEPH_NOSNAP)
+ return -EROFS;
+
+ err = setattr_prepare(dentry, attr);
+ if (err != 0)
+ return err;
+
+ if ((attr->ia_valid & ATTR_SIZE) &&
+ attr->ia_size > max(inode->i_size, fsc->max_file_size))
+ return -EFBIG;
+
+ if ((attr->ia_valid & ATTR_SIZE) &&
+ ceph_quota_is_max_bytes_exceeded(inode, attr->ia_size))
+ return -EDQUOT;
+
+ err = __ceph_setattr(inode, attr);
+
+ if (err >= 0 && (attr->ia_valid & ATTR_MODE))
+ err = posix_acl_chmod(inode, attr->ia_mode);
+
+ return err;
+}
+
+/*
+ * Verify that we have a lease on the given mask. If not,
+ * do a getattr against an mds.
+ */
+int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
+ int mask, bool force)
+{
+ struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
+ struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_mds_request *req;
+ int mode;
+ int err;
+
+ if (ceph_snap(inode) == CEPH_SNAPDIR) {
+ dout("do_getattr inode %p SNAPDIR\n", inode);
+ return 0;
+ }
+
+ dout("do_getattr inode %p mask %s mode 0%o\n",
+ inode, ceph_cap_string(mask), inode->i_mode);
+ if (!force && ceph_caps_issued_mask(ceph_inode(inode), mask, 1))
+ return 0;
+
+ mode = (mask & CEPH_STAT_RSTAT) ? USE_AUTH_MDS : USE_ANY_MDS;
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+ req->r_inode = inode;
+ ihold(inode);
+ req->r_num_caps = 1;
+ req->r_args.getattr.mask = cpu_to_le32(mask);
+ req->r_locked_page = locked_page;
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
+ if (locked_page && err == 0) {
+ u64 inline_version = req->r_reply_info.targeti.inline_version;
+ if (inline_version == 0) {
+ /* the reply is supposed to contain inline data */
+ err = -EINVAL;
+ } else if (inline_version == CEPH_INLINE_NONE) {
+ err = -ENODATA;
+ } else {
+ err = req->r_reply_info.targeti.inline_len;
+ }
+ }
+ ceph_mdsc_put_request(req);
+ dout("do_getattr result=%d\n", err);
+ return err;
+}
+
+
+/*
+ * Check inode permissions. We verify we have a valid value for
+ * the AUTH cap, then call the generic handler.
+ */
+int ceph_permission(struct inode *inode, int mask)
+{
+ int err;
+
+ if (mask & MAY_NOT_BLOCK)
+ return -ECHILD;
+
+ err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED, false);
+
+ if (!err)
+ err = generic_permission(inode, mask);
+ return err;
+}
+
+/*
+ * Get all attributes. Hopefully somedata we'll have a statlite()
+ * and can limit the fields we require to be accurate.
+ */
+int ceph_getattr(const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned int flags)
+{
+ struct inode *inode = d_inode(path->dentry);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int err;
+
+ err = ceph_do_getattr(inode, CEPH_STAT_CAP_INODE_ALL, false);
+ if (!err) {
+ generic_fillattr(inode, stat);
+ stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino);
+ if (ceph_snap(inode) != CEPH_NOSNAP)
+ stat->dev = ceph_snap(inode);
+ else
+ stat->dev = 0;
+ if (S_ISDIR(inode->i_mode)) {
+ if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb),
+ RBYTES))
+ stat->size = ci->i_rbytes;
+ else
+ stat->size = ci->i_files + ci->i_subdirs;
+ stat->blocks = 0;
+ stat->blksize = 65536;
+ /*
+ * Some applications rely on the number of st_nlink
+ * value on directories to be either 0 (if unlinked)
+ * or 2 + number of subdirectories.
+ */
+ if (stat->nlink == 1)
+ /* '.' + '..' + subdirs */
+ stat->nlink = 1 + 1 + ci->i_subdirs;
+ }
+ }
+ return err;
+}
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
new file mode 100644
index 000000000..c90f03beb
--- /dev/null
+++ b/fs/ceph/ioctl.c
@@ -0,0 +1,293 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ceph/ceph_debug.h>
+#include <linux/in.h>
+
+#include "super.h"
+#include "mds_client.h"
+#include "ioctl.h"
+#include <linux/ceph/striper.h>
+
+/*
+ * ioctls
+ */
+
+/*
+ * get and set the file layout
+ */
+static long ceph_ioctl_get_layout(struct file *file, void __user *arg)
+{
+ struct ceph_inode_info *ci = ceph_inode(file_inode(file));
+ struct ceph_ioctl_layout l;
+ int err;
+
+ err = ceph_do_getattr(file_inode(file), CEPH_STAT_CAP_LAYOUT, false);
+ if (!err) {
+ l.stripe_unit = ci->i_layout.stripe_unit;
+ l.stripe_count = ci->i_layout.stripe_count;
+ l.object_size = ci->i_layout.object_size;
+ l.data_pool = ci->i_layout.pool_id;
+ l.preferred_osd = -1;
+ if (copy_to_user(arg, &l, sizeof(l)))
+ return -EFAULT;
+ }
+
+ return err;
+}
+
+static long __validate_layout(struct ceph_mds_client *mdsc,
+ struct ceph_ioctl_layout *l)
+{
+ int i, err;
+
+ /* validate striping parameters */
+ if ((l->object_size & ~PAGE_MASK) ||
+ (l->stripe_unit & ~PAGE_MASK) ||
+ ((unsigned)l->stripe_unit != 0 &&
+ ((unsigned)l->object_size % (unsigned)l->stripe_unit)))
+ return -EINVAL;
+
+ /* make sure it's a valid data pool */
+ mutex_lock(&mdsc->mutex);
+ err = -EINVAL;
+ for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++)
+ if (mdsc->mdsmap->m_data_pg_pools[i] == l->data_pool) {
+ err = 0;
+ break;
+ }
+ mutex_unlock(&mdsc->mutex);
+ if (err)
+ return err;
+
+ return 0;
+}
+
+static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
+{
+ struct inode *inode = file_inode(file);
+ struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_mds_request *req;
+ struct ceph_ioctl_layout l;
+ struct ceph_inode_info *ci = ceph_inode(file_inode(file));
+ struct ceph_ioctl_layout nl;
+ int err;
+
+ if (copy_from_user(&l, arg, sizeof(l)))
+ return -EFAULT;
+
+ /* validate changed params against current layout */
+ err = ceph_do_getattr(file_inode(file), CEPH_STAT_CAP_LAYOUT, false);
+ if (err)
+ return err;
+
+ memset(&nl, 0, sizeof(nl));
+ if (l.stripe_count)
+ nl.stripe_count = l.stripe_count;
+ else
+ nl.stripe_count = ci->i_layout.stripe_count;
+ if (l.stripe_unit)
+ nl.stripe_unit = l.stripe_unit;
+ else
+ nl.stripe_unit = ci->i_layout.stripe_unit;
+ if (l.object_size)
+ nl.object_size = l.object_size;
+ else
+ nl.object_size = ci->i_layout.object_size;
+ if (l.data_pool)
+ nl.data_pool = l.data_pool;
+ else
+ nl.data_pool = ci->i_layout.pool_id;
+
+ /* this is obsolete, and always -1 */
+ nl.preferred_osd = -1;
+
+ err = __validate_layout(mdsc, &nl);
+ if (err)
+ return err;
+
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETLAYOUT,
+ USE_AUTH_MDS);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+ req->r_inode = inode;
+ ihold(inode);
+ req->r_num_caps = 1;
+
+ req->r_inode_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL;
+
+ req->r_args.setlayout.layout.fl_stripe_unit =
+ cpu_to_le32(l.stripe_unit);
+ req->r_args.setlayout.layout.fl_stripe_count =
+ cpu_to_le32(l.stripe_count);
+ req->r_args.setlayout.layout.fl_object_size =
+ cpu_to_le32(l.object_size);
+ req->r_args.setlayout.layout.fl_pg_pool = cpu_to_le32(l.data_pool);
+
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
+ ceph_mdsc_put_request(req);
+ return err;
+}
+
+/*
+ * Set a layout policy on a directory inode. All items in the tree
+ * rooted at this inode will inherit this layout on creation,
+ * (It doesn't apply retroactively )
+ * unless a subdirectory has its own layout policy.
+ */
+static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg)
+{
+ struct inode *inode = file_inode(file);
+ struct ceph_mds_request *req;
+ struct ceph_ioctl_layout l;
+ int err;
+ struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+
+ /* copy and validate */
+ if (copy_from_user(&l, arg, sizeof(l)))
+ return -EFAULT;
+
+ err = __validate_layout(mdsc, &l);
+ if (err)
+ return err;
+
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETDIRLAYOUT,
+ USE_AUTH_MDS);
+
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+ req->r_inode = inode;
+ ihold(inode);
+ req->r_num_caps = 1;
+
+ req->r_args.setlayout.layout.fl_stripe_unit =
+ cpu_to_le32(l.stripe_unit);
+ req->r_args.setlayout.layout.fl_stripe_count =
+ cpu_to_le32(l.stripe_count);
+ req->r_args.setlayout.layout.fl_object_size =
+ cpu_to_le32(l.object_size);
+ req->r_args.setlayout.layout.fl_pg_pool =
+ cpu_to_le32(l.data_pool);
+
+ err = ceph_mdsc_do_request(mdsc, inode, req);
+ ceph_mdsc_put_request(req);
+ return err;
+}
+
+/*
+ * Return object name, size/offset information, and location (OSD
+ * number, network address) for a given file offset.
+ */
+static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
+{
+ struct ceph_ioctl_dataloc dl;
+ struct inode *inode = file_inode(file);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_osd_client *osdc =
+ &ceph_sb_to_client(inode->i_sb)->client->osdc;
+ struct ceph_object_locator oloc;
+ CEPH_DEFINE_OID_ONSTACK(oid);
+ u32 xlen;
+ u64 tmp;
+ struct ceph_pg pgid;
+ int r;
+
+ /* copy and validate */
+ if (copy_from_user(&dl, arg, sizeof(dl)))
+ return -EFAULT;
+
+ down_read(&osdc->lock);
+ ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, 1,
+ &dl.object_no, &dl.object_offset, &xlen);
+ dl.file_offset -= dl.object_offset;
+ dl.object_size = ci->i_layout.object_size;
+ dl.block_size = ci->i_layout.stripe_unit;
+
+ /* block_offset = object_offset % block_size */
+ tmp = dl.object_offset;
+ dl.block_offset = do_div(tmp, dl.block_size);
+
+ snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx",
+ ceph_ino(inode), dl.object_no);
+
+ oloc.pool = ci->i_layout.pool_id;
+ oloc.pool_ns = ceph_try_get_string(ci->i_layout.pool_ns);
+ ceph_oid_printf(&oid, "%s", dl.object_name);
+
+ r = ceph_object_locator_to_pg(osdc->osdmap, &oid, &oloc, &pgid);
+
+ ceph_oloc_destroy(&oloc);
+ if (r < 0) {
+ up_read(&osdc->lock);
+ return r;
+ }
+
+ dl.osd = ceph_pg_to_acting_primary(osdc->osdmap, &pgid);
+ if (dl.osd >= 0) {
+ struct ceph_entity_addr *a =
+ ceph_osd_addr(osdc->osdmap, dl.osd);
+ if (a)
+ memcpy(&dl.osd_addr, &a->in_addr, sizeof(dl.osd_addr));
+ } else {
+ memset(&dl.osd_addr, 0, sizeof(dl.osd_addr));
+ }
+ up_read(&osdc->lock);
+
+ /* send result back to user */
+ if (copy_to_user(arg, &dl, sizeof(dl)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static long ceph_ioctl_lazyio(struct file *file)
+{
+ struct ceph_file_info *fi = file->private_data;
+ struct inode *inode = file_inode(file);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+
+ if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) {
+ spin_lock(&ci->i_ceph_lock);
+ fi->fmode |= CEPH_FILE_MODE_LAZY;
+ ci->i_nr_by_mode[ffs(CEPH_FILE_MODE_LAZY)]++;
+ spin_unlock(&ci->i_ceph_lock);
+ dout("ioctl_layzio: file %p marked lazy\n", file);
+
+ ceph_check_caps(ci, 0, NULL);
+ } else {
+ dout("ioctl_layzio: file %p already lazy\n", file);
+ }
+ return 0;
+}
+
+static long ceph_ioctl_syncio(struct file *file)
+{
+ struct ceph_file_info *fi = file->private_data;
+
+ fi->flags |= CEPH_F_SYNC;
+ return 0;
+}
+
+long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ dout("ioctl file %p cmd %u arg %lu\n", file, cmd, arg);
+ switch (cmd) {
+ case CEPH_IOC_GET_LAYOUT:
+ return ceph_ioctl_get_layout(file, (void __user *)arg);
+
+ case CEPH_IOC_SET_LAYOUT:
+ return ceph_ioctl_set_layout(file, (void __user *)arg);
+
+ case CEPH_IOC_SET_LAYOUT_POLICY:
+ return ceph_ioctl_set_layout_policy(file, (void __user *)arg);
+
+ case CEPH_IOC_GET_DATALOC:
+ return ceph_ioctl_get_dataloc(file, (void __user *)arg);
+
+ case CEPH_IOC_LAZYIO:
+ return ceph_ioctl_lazyio(file);
+
+ case CEPH_IOC_SYNCIO:
+ return ceph_ioctl_syncio(file);
+ }
+
+ return -ENOTTY;
+}
diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h
new file mode 100644
index 000000000..51f7f1d39
--- /dev/null
+++ b/fs/ceph/ioctl.h
@@ -0,0 +1,101 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef FS_CEPH_IOCTL_H
+#define FS_CEPH_IOCTL_H
+
+#include <linux/ioctl.h>
+#include <linux/types.h>
+
+#define CEPH_IOCTL_MAGIC 0x97
+
+/*
+ * CEPH_IOC_GET_LAYOUT - get file layout or dir layout policy
+ * CEPH_IOC_SET_LAYOUT - set file layout
+ * CEPH_IOC_SET_LAYOUT_POLICY - set dir layout policy
+ *
+ * The file layout specifies how file data is striped over objects in
+ * the distributed object store, which object pool they belong to (if
+ * it differs from the default), and an optional 'preferred osd' to
+ * store them on.
+ *
+ * Files get a new layout based on the policy set on the containing
+ * directory or one of its ancestors. The GET_LAYOUT ioctl will let
+ * you examine the layout for a file or the policy on a directory.
+ *
+ * SET_LAYOUT will let you set a layout on a newly created file. This
+ * only works immediately after the file is created and before any
+ * data is written to it.
+ *
+ * SET_LAYOUT_POLICY will let you set a layout policy (default layout)
+ * on a directory that will apply to any new files created in that
+ * directory (or any child directory that doesn't specify a layout of
+ * its own).
+ */
+
+/* use u64 to align sanely on all archs */
+struct ceph_ioctl_layout {
+ __u64 stripe_unit, stripe_count, object_size;
+ __u64 data_pool;
+
+ /* obsolete. new values ignored, always return -1 */
+ __s64 preferred_osd;
+};
+
+#define CEPH_IOC_GET_LAYOUT _IOR(CEPH_IOCTL_MAGIC, 1, \
+ struct ceph_ioctl_layout)
+#define CEPH_IOC_SET_LAYOUT _IOW(CEPH_IOCTL_MAGIC, 2, \
+ struct ceph_ioctl_layout)
+#define CEPH_IOC_SET_LAYOUT_POLICY _IOW(CEPH_IOCTL_MAGIC, 5, \
+ struct ceph_ioctl_layout)
+
+/*
+ * CEPH_IOC_GET_DATALOC - get location of file data in the cluster
+ *
+ * Extract identity, address of the OSD and object storing a given
+ * file offset.
+ */
+struct ceph_ioctl_dataloc {
+ __u64 file_offset; /* in+out: file offset */
+ __u64 object_offset; /* out: offset in object */
+ __u64 object_no; /* out: object # */
+ __u64 object_size; /* out: object size */
+ char object_name[64]; /* out: object name */
+ __u64 block_offset; /* out: offset in block */
+ __u64 block_size; /* out: block length */
+ __s64 osd; /* out: osd # */
+ struct sockaddr_storage osd_addr; /* out: osd address */
+};
+
+#define CEPH_IOC_GET_DATALOC _IOWR(CEPH_IOCTL_MAGIC, 3, \
+ struct ceph_ioctl_dataloc)
+
+/*
+ * CEPH_IOC_LAZYIO - relax consistency
+ *
+ * Normally Ceph switches to synchronous IO when multiple clients have
+ * the file open (and or more for write). Reads and writes bypass the
+ * page cache and go directly to the OSD. Setting this flag on a file
+ * descriptor will allow buffered IO for this file in cases where the
+ * application knows it won't interfere with other nodes (or doesn't
+ * care).
+ */
+#define CEPH_IOC_LAZYIO _IO(CEPH_IOCTL_MAGIC, 4)
+
+/*
+ * CEPH_IOC_SYNCIO - force synchronous IO
+ *
+ * This ioctl sets a file flag that forces the synchronous IO that
+ * bypasses the page cache, even if it is not necessary. This is
+ * essentially the opposite behavior of IOC_LAZYIO. This forces the
+ * same read/write path as a file opened by multiple clients when one
+ * or more of those clients is opened for write.
+ *
+ * Note that this type of sync IO takes a different path than a file
+ * opened with O_SYNC/D_SYNC (writes hit the page cache and are
+ * immediately flushed on page boundaries). It is very similar to
+ * O_DIRECT (writes bypass the page cache) excep that O_DIRECT writes
+ * are not copied (user page must remain stable) and O_DIRECT writes
+ * have alignment restrictions (on the buffer and file offset).
+ */
+#define CEPH_IOC_SYNCIO _IO(CEPH_IOCTL_MAGIC, 5)
+
+#endif
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
new file mode 100644
index 000000000..6a8f4a995
--- /dev/null
+++ b/fs/ceph/locks.c
@@ -0,0 +1,479 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/file.h>
+#include <linux/namei.h>
+#include <linux/random.h>
+
+#include "super.h"
+#include "mds_client.h"
+#include <linux/ceph/pagelist.h>
+
+static u64 lock_secret;
+static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *req);
+
+static inline u64 secure_addr(void *addr)
+{
+ u64 v = lock_secret ^ (u64)(unsigned long)addr;
+ /*
+ * Set the most significant bit, so that MDS knows the 'owner'
+ * is sufficient to identify the owner of lock. (old code uses
+ * both 'owner' and 'pid')
+ */
+ v |= (1ULL << 63);
+ return v;
+}
+
+void __init ceph_flock_init(void)
+{
+ get_random_bytes(&lock_secret, sizeof(lock_secret));
+}
+
+static void ceph_fl_copy_lock(struct file_lock *dst, struct file_lock *src)
+{
+ struct inode *inode = file_inode(src->fl_file);
+ atomic_inc(&ceph_inode(inode)->i_filelock_ref);
+}
+
+static void ceph_fl_release_lock(struct file_lock *fl)
+{
+ struct inode *inode = file_inode(fl->fl_file);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ if (atomic_dec_and_test(&ci->i_filelock_ref)) {
+ /* clear error when all locks are released */
+ spin_lock(&ci->i_ceph_lock);
+ ci->i_ceph_flags &= ~CEPH_I_ERROR_FILELOCK;
+ spin_unlock(&ci->i_ceph_lock);
+ }
+}
+
+static const struct file_lock_operations ceph_fl_lock_ops = {
+ .fl_copy_lock = ceph_fl_copy_lock,
+ .fl_release_private = ceph_fl_release_lock,
+};
+
+/**
+ * Implement fcntl and flock locking functions.
+ */
+static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode,
+ int cmd, u8 wait, struct file_lock *fl)
+{
+ struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_mds_request *req;
+ int err;
+ u64 length = 0;
+ u64 owner;
+
+ if (operation == CEPH_MDS_OP_SETFILELOCK) {
+ /*
+ * increasing i_filelock_ref closes race window between
+ * handling request reply and adding file_lock struct to
+ * inode. Otherwise, auth caps may get trimmed in the
+ * window. Caller function will decrease the counter.
+ */
+ fl->fl_ops = &ceph_fl_lock_ops;
+ atomic_inc(&ceph_inode(inode)->i_filelock_ref);
+ }
+
+ if (operation != CEPH_MDS_OP_SETFILELOCK || cmd == CEPH_LOCK_UNLOCK)
+ wait = 0;
+
+ req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+ req->r_inode = inode;
+ ihold(inode);
+ req->r_num_caps = 1;
+
+ /* mds requires start and length rather than start and end */
+ if (LLONG_MAX == fl->fl_end)
+ length = 0;
+ else
+ length = fl->fl_end - fl->fl_start + 1;
+
+ owner = secure_addr(fl->fl_owner);
+
+ dout("ceph_lock_message: rule: %d, op: %d, owner: %llx, pid: %llu, "
+ "start: %llu, length: %llu, wait: %d, type: %d\n", (int)lock_type,
+ (int)operation, owner, (u64)fl->fl_pid, fl->fl_start, length,
+ wait, fl->fl_type);
+
+ req->r_args.filelock_change.rule = lock_type;
+ req->r_args.filelock_change.type = cmd;
+ req->r_args.filelock_change.owner = cpu_to_le64(owner);
+ req->r_args.filelock_change.pid = cpu_to_le64((u64)fl->fl_pid);
+ req->r_args.filelock_change.start = cpu_to_le64(fl->fl_start);
+ req->r_args.filelock_change.length = cpu_to_le64(length);
+ req->r_args.filelock_change.wait = wait;
+
+ if (wait)
+ req->r_wait_for_completion = ceph_lock_wait_for_completion;
+
+ err = ceph_mdsc_do_request(mdsc, inode, req);
+ if (!err && operation == CEPH_MDS_OP_GETFILELOCK) {
+ fl->fl_pid = -le64_to_cpu(req->r_reply_info.filelock_reply->pid);
+ if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type)
+ fl->fl_type = F_RDLCK;
+ else if (CEPH_LOCK_EXCL == req->r_reply_info.filelock_reply->type)
+ fl->fl_type = F_WRLCK;
+ else
+ fl->fl_type = F_UNLCK;
+
+ fl->fl_start = le64_to_cpu(req->r_reply_info.filelock_reply->start);
+ length = le64_to_cpu(req->r_reply_info.filelock_reply->start) +
+ le64_to_cpu(req->r_reply_info.filelock_reply->length);
+ if (length >= 1)
+ fl->fl_end = length -1;
+ else
+ fl->fl_end = 0;
+
+ }
+ ceph_mdsc_put_request(req);
+ dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, "
+ "length: %llu, wait: %d, type: %d, err code %d\n", (int)lock_type,
+ (int)operation, (u64)fl->fl_pid, fl->fl_start,
+ length, wait, fl->fl_type, err);
+ return err;
+}
+
+static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *req)
+{
+ struct ceph_mds_request *intr_req;
+ struct inode *inode = req->r_inode;
+ int err, lock_type;
+
+ BUG_ON(req->r_op != CEPH_MDS_OP_SETFILELOCK);
+ if (req->r_args.filelock_change.rule == CEPH_LOCK_FCNTL)
+ lock_type = CEPH_LOCK_FCNTL_INTR;
+ else if (req->r_args.filelock_change.rule == CEPH_LOCK_FLOCK)
+ lock_type = CEPH_LOCK_FLOCK_INTR;
+ else
+ BUG_ON(1);
+ BUG_ON(req->r_args.filelock_change.type == CEPH_LOCK_UNLOCK);
+
+ err = wait_for_completion_interruptible(&req->r_completion);
+ if (!err)
+ return 0;
+
+ dout("ceph_lock_wait_for_completion: request %llu was interrupted\n",
+ req->r_tid);
+
+ mutex_lock(&mdsc->mutex);
+ if (test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) {
+ err = 0;
+ } else {
+ /*
+ * ensure we aren't running concurrently with
+ * ceph_fill_trace or ceph_readdir_prepopulate, which
+ * rely on locks (dir mutex) held by our caller.
+ */
+ mutex_lock(&req->r_fill_mutex);
+ req->r_err = err;
+ set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
+ mutex_unlock(&req->r_fill_mutex);
+
+ if (!req->r_session) {
+ // haven't sent the request
+ err = 0;
+ }
+ }
+ mutex_unlock(&mdsc->mutex);
+ if (!err)
+ return 0;
+
+ intr_req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETFILELOCK,
+ USE_AUTH_MDS);
+ if (IS_ERR(intr_req))
+ return PTR_ERR(intr_req);
+
+ intr_req->r_inode = inode;
+ ihold(inode);
+ intr_req->r_num_caps = 1;
+
+ intr_req->r_args.filelock_change = req->r_args.filelock_change;
+ intr_req->r_args.filelock_change.rule = lock_type;
+ intr_req->r_args.filelock_change.type = CEPH_LOCK_UNLOCK;
+
+ err = ceph_mdsc_do_request(mdsc, inode, intr_req);
+ ceph_mdsc_put_request(intr_req);
+
+ if (err && err != -ERESTARTSYS)
+ return err;
+
+ wait_for_completion_killable(&req->r_safe_completion);
+ return 0;
+}
+
+/**
+ * Attempt to set an fcntl lock.
+ * For now, this just goes away to the server. Later it may be more awesome.
+ */
+int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
+{
+ struct inode *inode = file_inode(file);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int err = 0;
+ u16 op = CEPH_MDS_OP_SETFILELOCK;
+ u8 wait = 0;
+ u8 lock_cmd;
+
+ if (!(fl->fl_flags & FL_POSIX))
+ return -ENOLCK;
+ /* No mandatory locks */
+ if (__mandatory_lock(file->f_mapping->host) && fl->fl_type != F_UNLCK)
+ return -ENOLCK;
+
+ dout("ceph_lock, fl_owner: %p\n", fl->fl_owner);
+
+ /* set wait bit as appropriate, then make command as Ceph expects it*/
+ if (IS_GETLK(cmd))
+ op = CEPH_MDS_OP_GETFILELOCK;
+ else if (IS_SETLKW(cmd))
+ wait = 1;
+
+ spin_lock(&ci->i_ceph_lock);
+ if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) {
+ err = -EIO;
+ } else if (op == CEPH_MDS_OP_SETFILELOCK) {
+ /*
+ * increasing i_filelock_ref closes race window between
+ * handling request reply and adding file_lock struct to
+ * inode. Otherwise, i_auth_cap may get trimmed in the
+ * window. Caller function will decrease the counter.
+ */
+ fl->fl_ops = &ceph_fl_lock_ops;
+ atomic_inc(&ci->i_filelock_ref);
+ }
+ spin_unlock(&ci->i_ceph_lock);
+ if (err < 0) {
+ if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK == fl->fl_type)
+ posix_lock_file(file, fl, NULL);
+ return err;
+ }
+
+ if (F_RDLCK == fl->fl_type)
+ lock_cmd = CEPH_LOCK_SHARED;
+ else if (F_WRLCK == fl->fl_type)
+ lock_cmd = CEPH_LOCK_EXCL;
+ else
+ lock_cmd = CEPH_LOCK_UNLOCK;
+
+ err = ceph_lock_message(CEPH_LOCK_FCNTL, op, inode, lock_cmd, wait, fl);
+ if (!err) {
+ if (op == CEPH_MDS_OP_SETFILELOCK) {
+ dout("mds locked, locking locally\n");
+ err = posix_lock_file(file, fl, NULL);
+ if (err) {
+ /* undo! This should only happen if
+ * the kernel detects local
+ * deadlock. */
+ ceph_lock_message(CEPH_LOCK_FCNTL, op, inode,
+ CEPH_LOCK_UNLOCK, 0, fl);
+ dout("got %d on posix_lock_file, undid lock\n",
+ err);
+ }
+ }
+ }
+ return err;
+}
+
+int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
+{
+ struct inode *inode = file_inode(file);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int err = 0;
+ u8 wait = 0;
+ u8 lock_cmd;
+
+ if (!(fl->fl_flags & FL_FLOCK))
+ return -ENOLCK;
+ /* No mandatory locks */
+ if (fl->fl_type & LOCK_MAND)
+ return -EOPNOTSUPP;
+
+ dout("ceph_flock, fl_file: %p\n", fl->fl_file);
+
+ spin_lock(&ci->i_ceph_lock);
+ if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) {
+ err = -EIO;
+ } else {
+ /* see comment in ceph_lock */
+ fl->fl_ops = &ceph_fl_lock_ops;
+ atomic_inc(&ci->i_filelock_ref);
+ }
+ spin_unlock(&ci->i_ceph_lock);
+ if (err < 0) {
+ if (F_UNLCK == fl->fl_type)
+ locks_lock_file_wait(file, fl);
+ return err;
+ }
+
+ if (IS_SETLKW(cmd))
+ wait = 1;
+
+ if (F_RDLCK == fl->fl_type)
+ lock_cmd = CEPH_LOCK_SHARED;
+ else if (F_WRLCK == fl->fl_type)
+ lock_cmd = CEPH_LOCK_EXCL;
+ else
+ lock_cmd = CEPH_LOCK_UNLOCK;
+
+ err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK,
+ inode, lock_cmd, wait, fl);
+ if (!err) {
+ err = locks_lock_file_wait(file, fl);
+ if (err) {
+ ceph_lock_message(CEPH_LOCK_FLOCK,
+ CEPH_MDS_OP_SETFILELOCK,
+ inode, CEPH_LOCK_UNLOCK, 0, fl);
+ dout("got %d on locks_lock_file_wait, undid lock\n", err);
+ }
+ }
+ return err;
+}
+
+/*
+ * Fills in the passed counter variables, so you can prepare pagelist metadata
+ * before calling ceph_encode_locks.
+ */
+void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
+{
+ struct file_lock *lock;
+ struct file_lock_context *ctx;
+
+ *fcntl_count = 0;
+ *flock_count = 0;
+
+ ctx = inode->i_flctx;
+ if (ctx) {
+ spin_lock(&ctx->flc_lock);
+ list_for_each_entry(lock, &ctx->flc_posix, fl_list)
+ ++(*fcntl_count);
+ list_for_each_entry(lock, &ctx->flc_flock, fl_list)
+ ++(*flock_count);
+ spin_unlock(&ctx->flc_lock);
+ }
+ dout("counted %d flock locks and %d fcntl locks\n",
+ *flock_count, *fcntl_count);
+}
+
+/*
+ * Given a pointer to a lock, convert it to a ceph filelock
+ */
+static int lock_to_ceph_filelock(struct file_lock *lock,
+ struct ceph_filelock *cephlock)
+{
+ int err = 0;
+ cephlock->start = cpu_to_le64(lock->fl_start);
+ cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1);
+ cephlock->client = cpu_to_le64(0);
+ cephlock->pid = cpu_to_le64((u64)lock->fl_pid);
+ cephlock->owner = cpu_to_le64(secure_addr(lock->fl_owner));
+
+ switch (lock->fl_type) {
+ case F_RDLCK:
+ cephlock->type = CEPH_LOCK_SHARED;
+ break;
+ case F_WRLCK:
+ cephlock->type = CEPH_LOCK_EXCL;
+ break;
+ case F_UNLCK:
+ cephlock->type = CEPH_LOCK_UNLOCK;
+ break;
+ default:
+ dout("Have unknown lock type %d\n", lock->fl_type);
+ err = -EINVAL;
+ }
+
+ return err;
+}
+
+/**
+ * Encode the flock and fcntl locks for the given inode into the ceph_filelock
+ * array. Must be called with inode->i_lock already held.
+ * If we encounter more of a specific lock type than expected, return -ENOSPC.
+ */
+int ceph_encode_locks_to_buffer(struct inode *inode,
+ struct ceph_filelock *flocks,
+ int num_fcntl_locks, int num_flock_locks)
+{
+ struct file_lock *lock;
+ struct file_lock_context *ctx = inode->i_flctx;
+ int err = 0;
+ int seen_fcntl = 0;
+ int seen_flock = 0;
+ int l = 0;
+
+ dout("encoding %d flock and %d fcntl locks\n", num_flock_locks,
+ num_fcntl_locks);
+
+ if (!ctx)
+ return 0;
+
+ spin_lock(&ctx->flc_lock);
+ list_for_each_entry(lock, &ctx->flc_posix, fl_list) {
+ ++seen_fcntl;
+ if (seen_fcntl > num_fcntl_locks) {
+ err = -ENOSPC;
+ goto fail;
+ }
+ err = lock_to_ceph_filelock(lock, &flocks[l]);
+ if (err)
+ goto fail;
+ ++l;
+ }
+ list_for_each_entry(lock, &ctx->flc_flock, fl_list) {
+ ++seen_flock;
+ if (seen_flock > num_flock_locks) {
+ err = -ENOSPC;
+ goto fail;
+ }
+ err = lock_to_ceph_filelock(lock, &flocks[l]);
+ if (err)
+ goto fail;
+ ++l;
+ }
+fail:
+ spin_unlock(&ctx->flc_lock);
+ return err;
+}
+
+/**
+ * Copy the encoded flock and fcntl locks into the pagelist.
+ * Format is: #fcntl locks, sequential fcntl locks, #flock locks,
+ * sequential flock locks.
+ * Returns zero on success.
+ */
+int ceph_locks_to_pagelist(struct ceph_filelock *flocks,
+ struct ceph_pagelist *pagelist,
+ int num_fcntl_locks, int num_flock_locks)
+{
+ int err = 0;
+ __le32 nlocks;
+
+ nlocks = cpu_to_le32(num_fcntl_locks);
+ err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks));
+ if (err)
+ goto out_fail;
+
+ if (num_fcntl_locks > 0) {
+ err = ceph_pagelist_append(pagelist, flocks,
+ num_fcntl_locks * sizeof(*flocks));
+ if (err)
+ goto out_fail;
+ }
+
+ nlocks = cpu_to_le32(num_flock_locks);
+ err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks));
+ if (err)
+ goto out_fail;
+
+ if (num_flock_locks > 0) {
+ err = ceph_pagelist_append(pagelist, &flocks[num_fcntl_locks],
+ num_flock_locks * sizeof(*flocks));
+ }
+out_fail:
+ return err;
+}
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
new file mode 100644
index 000000000..5f3707a90
--- /dev/null
+++ b/fs/ceph/mds_client.c
@@ -0,0 +1,4331 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/fs.h>
+#include <linux/wait.h>
+#include <linux/slab.h>
+#include <linux/gfp.h>
+#include <linux/sched.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/ratelimit.h>
+
+#include "super.h"
+#include "mds_client.h"
+
+#include <linux/ceph/ceph_features.h>
+#include <linux/ceph/messenger.h>
+#include <linux/ceph/decode.h>
+#include <linux/ceph/pagelist.h>
+#include <linux/ceph/auth.h>
+#include <linux/ceph/debugfs.h>
+
+/*
+ * A cluster of MDS (metadata server) daemons is responsible for
+ * managing the file system namespace (the directory hierarchy and
+ * inodes) and for coordinating shared access to storage. Metadata is
+ * partitioning hierarchically across a number of servers, and that
+ * partition varies over time as the cluster adjusts the distribution
+ * in order to balance load.
+ *
+ * The MDS client is primarily responsible to managing synchronous
+ * metadata requests for operations like open, unlink, and so forth.
+ * If there is a MDS failure, we find out about it when we (possibly
+ * request and) receive a new MDS map, and can resubmit affected
+ * requests.
+ *
+ * For the most part, though, we take advantage of a lossless
+ * communications channel to the MDS, and do not need to worry about
+ * timing out or resubmitting requests.
+ *
+ * We maintain a stateful "session" with each MDS we interact with.
+ * Within each session, we sent periodic heartbeat messages to ensure
+ * any capabilities or leases we have been issues remain valid. If
+ * the session times out and goes stale, our leases and capabilities
+ * are no longer valid.
+ */
+
+struct ceph_reconnect_state {
+ int nr_caps;
+ struct ceph_pagelist *pagelist;
+ unsigned msg_version;
+};
+
+static void __wake_requests(struct ceph_mds_client *mdsc,
+ struct list_head *head);
+
+static const struct ceph_connection_operations mds_con_ops;
+
+
+/*
+ * mds reply parsing
+ */
+
+/*
+ * parse individual inode info
+ */
+static int parse_reply_info_in(void **p, void *end,
+ struct ceph_mds_reply_info_in *info,
+ u64 features)
+{
+ int err = -EIO;
+
+ info->in = *p;
+ *p += sizeof(struct ceph_mds_reply_inode) +
+ sizeof(*info->in->fragtree.splits) *
+ le32_to_cpu(info->in->fragtree.nsplits);
+
+ ceph_decode_32_safe(p, end, info->symlink_len, bad);
+ ceph_decode_need(p, end, info->symlink_len, bad);
+ info->symlink = *p;
+ *p += info->symlink_len;
+
+ if (features & CEPH_FEATURE_DIRLAYOUTHASH)
+ ceph_decode_copy_safe(p, end, &info->dir_layout,
+ sizeof(info->dir_layout), bad);
+ else
+ memset(&info->dir_layout, 0, sizeof(info->dir_layout));
+
+ ceph_decode_32_safe(p, end, info->xattr_len, bad);
+ ceph_decode_need(p, end, info->xattr_len, bad);
+ info->xattr_data = *p;
+ *p += info->xattr_len;
+
+ if (features & CEPH_FEATURE_MDS_INLINE_DATA) {
+ ceph_decode_64_safe(p, end, info->inline_version, bad);
+ ceph_decode_32_safe(p, end, info->inline_len, bad);
+ ceph_decode_need(p, end, info->inline_len, bad);
+ info->inline_data = *p;
+ *p += info->inline_len;
+ } else
+ info->inline_version = CEPH_INLINE_NONE;
+
+ if (features & CEPH_FEATURE_MDS_QUOTA) {
+ u8 struct_v, struct_compat;
+ u32 struct_len;
+
+ /*
+ * both struct_v and struct_compat are expected to be >= 1
+ */
+ ceph_decode_8_safe(p, end, struct_v, bad);
+ ceph_decode_8_safe(p, end, struct_compat, bad);
+ if (!struct_v || !struct_compat)
+ goto bad;
+ ceph_decode_32_safe(p, end, struct_len, bad);
+ ceph_decode_need(p, end, struct_len, bad);
+ ceph_decode_64_safe(p, end, info->max_bytes, bad);
+ ceph_decode_64_safe(p, end, info->max_files, bad);
+ } else {
+ info->max_bytes = 0;
+ info->max_files = 0;
+ }
+
+ info->pool_ns_len = 0;
+ info->pool_ns_data = NULL;
+ if (features & CEPH_FEATURE_FS_FILE_LAYOUT_V2) {
+ ceph_decode_32_safe(p, end, info->pool_ns_len, bad);
+ if (info->pool_ns_len > 0) {
+ ceph_decode_need(p, end, info->pool_ns_len, bad);
+ info->pool_ns_data = *p;
+ *p += info->pool_ns_len;
+ }
+ }
+
+ return 0;
+bad:
+ return err;
+}
+
+/*
+ * parse a normal reply, which may contain a (dir+)dentry and/or a
+ * target inode.
+ */
+static int parse_reply_info_trace(void **p, void *end,
+ struct ceph_mds_reply_info_parsed *info,
+ u64 features)
+{
+ int err;
+
+ if (info->head->is_dentry) {
+ err = parse_reply_info_in(p, end, &info->diri, features);
+ if (err < 0)
+ goto out_bad;
+
+ if (unlikely(*p + sizeof(*info->dirfrag) > end))
+ goto bad;
+ info->dirfrag = *p;
+ *p += sizeof(*info->dirfrag) +
+ sizeof(u32)*le32_to_cpu(info->dirfrag->ndist);
+ if (unlikely(*p > end))
+ goto bad;
+
+ ceph_decode_32_safe(p, end, info->dname_len, bad);
+ ceph_decode_need(p, end, info->dname_len, bad);
+ info->dname = *p;
+ *p += info->dname_len;
+ info->dlease = *p;
+ *p += sizeof(*info->dlease);
+ }
+
+ if (info->head->is_target) {
+ err = parse_reply_info_in(p, end, &info->targeti, features);
+ if (err < 0)
+ goto out_bad;
+ }
+
+ if (unlikely(*p != end))
+ goto bad;
+ return 0;
+
+bad:
+ err = -EIO;
+out_bad:
+ pr_err("problem parsing mds trace %d\n", err);
+ return err;
+}
+
+/*
+ * parse readdir results
+ */
+static int parse_reply_info_dir(void **p, void *end,
+ struct ceph_mds_reply_info_parsed *info,
+ u64 features)
+{
+ u32 num, i = 0;
+ int err;
+
+ info->dir_dir = *p;
+ if (*p + sizeof(*info->dir_dir) > end)
+ goto bad;
+ *p += sizeof(*info->dir_dir) +
+ sizeof(u32)*le32_to_cpu(info->dir_dir->ndist);
+ if (*p > end)
+ goto bad;
+
+ ceph_decode_need(p, end, sizeof(num) + 2, bad);
+ num = ceph_decode_32(p);
+ {
+ u16 flags = ceph_decode_16(p);
+ info->dir_end = !!(flags & CEPH_READDIR_FRAG_END);
+ info->dir_complete = !!(flags & CEPH_READDIR_FRAG_COMPLETE);
+ info->hash_order = !!(flags & CEPH_READDIR_HASH_ORDER);
+ info->offset_hash = !!(flags & CEPH_READDIR_OFFSET_HASH);
+ }
+ if (num == 0)
+ goto done;
+
+ BUG_ON(!info->dir_entries);
+ if ((unsigned long)(info->dir_entries + num) >
+ (unsigned long)info->dir_entries + info->dir_buf_size) {
+ pr_err("dir contents are larger than expected\n");
+ WARN_ON(1);
+ goto bad;
+ }
+
+ info->dir_nr = num;
+ while (num) {
+ struct ceph_mds_reply_dir_entry *rde = info->dir_entries + i;
+ /* dentry */
+ ceph_decode_need(p, end, sizeof(u32)*2, bad);
+ rde->name_len = ceph_decode_32(p);
+ ceph_decode_need(p, end, rde->name_len, bad);
+ rde->name = *p;
+ *p += rde->name_len;
+ dout("parsed dir dname '%.*s'\n", rde->name_len, rde->name);
+ rde->lease = *p;
+ *p += sizeof(struct ceph_mds_reply_lease);
+
+ /* inode */
+ err = parse_reply_info_in(p, end, &rde->inode, features);
+ if (err < 0)
+ goto out_bad;
+ /* ceph_readdir_prepopulate() will update it */
+ rde->offset = 0;
+ i++;
+ num--;
+ }
+
+done:
+ if (*p != end)
+ goto bad;
+ return 0;
+
+bad:
+ err = -EIO;
+out_bad:
+ pr_err("problem parsing dir contents %d\n", err);
+ return err;
+}
+
+/*
+ * parse fcntl F_GETLK results
+ */
+static int parse_reply_info_filelock(void **p, void *end,
+ struct ceph_mds_reply_info_parsed *info,
+ u64 features)
+{
+ if (*p + sizeof(*info->filelock_reply) > end)
+ goto bad;
+
+ info->filelock_reply = *p;
+ *p += sizeof(*info->filelock_reply);
+
+ if (unlikely(*p != end))
+ goto bad;
+ return 0;
+
+bad:
+ return -EIO;
+}
+
+/*
+ * parse create results
+ */
+static int parse_reply_info_create(void **p, void *end,
+ struct ceph_mds_reply_info_parsed *info,
+ u64 features)
+{
+ if (features & CEPH_FEATURE_REPLY_CREATE_INODE) {
+ if (*p == end) {
+ info->has_create_ino = false;
+ } else {
+ info->has_create_ino = true;
+ info->ino = ceph_decode_64(p);
+ }
+ }
+
+ if (unlikely(*p != end))
+ goto bad;
+ return 0;
+
+bad:
+ return -EIO;
+}
+
+/*
+ * parse extra results
+ */
+static int parse_reply_info_extra(void **p, void *end,
+ struct ceph_mds_reply_info_parsed *info,
+ u64 features)
+{
+ u32 op = le32_to_cpu(info->head->op);
+
+ if (op == CEPH_MDS_OP_GETFILELOCK)
+ return parse_reply_info_filelock(p, end, info, features);
+ else if (op == CEPH_MDS_OP_READDIR || op == CEPH_MDS_OP_LSSNAP)
+ return parse_reply_info_dir(p, end, info, features);
+ else if (op == CEPH_MDS_OP_CREATE)
+ return parse_reply_info_create(p, end, info, features);
+ else
+ return -EIO;
+}
+
+/*
+ * parse entire mds reply
+ */
+static int parse_reply_info(struct ceph_msg *msg,
+ struct ceph_mds_reply_info_parsed *info,
+ u64 features)
+{
+ void *p, *end;
+ u32 len;
+ int err;
+
+ info->head = msg->front.iov_base;
+ p = msg->front.iov_base + sizeof(struct ceph_mds_reply_head);
+ end = p + msg->front.iov_len - sizeof(struct ceph_mds_reply_head);
+
+ /* trace */
+ ceph_decode_32_safe(&p, end, len, bad);
+ if (len > 0) {
+ ceph_decode_need(&p, end, len, bad);
+ err = parse_reply_info_trace(&p, p+len, info, features);
+ if (err < 0)
+ goto out_bad;
+ }
+
+ /* extra */
+ ceph_decode_32_safe(&p, end, len, bad);
+ if (len > 0) {
+ ceph_decode_need(&p, end, len, bad);
+ err = parse_reply_info_extra(&p, p+len, info, features);
+ if (err < 0)
+ goto out_bad;
+ }
+
+ /* snap blob */
+ ceph_decode_32_safe(&p, end, len, bad);
+ info->snapblob_len = len;
+ info->snapblob = p;
+ p += len;
+
+ if (p != end)
+ goto bad;
+ return 0;
+
+bad:
+ err = -EIO;
+out_bad:
+ pr_err("mds parse_reply err %d\n", err);
+ return err;
+}
+
+static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)
+{
+ if (!info->dir_entries)
+ return;
+ free_pages((unsigned long)info->dir_entries, get_order(info->dir_buf_size));
+}
+
+
+/*
+ * sessions
+ */
+const char *ceph_session_state_name(int s)
+{
+ switch (s) {
+ case CEPH_MDS_SESSION_NEW: return "new";
+ case CEPH_MDS_SESSION_OPENING: return "opening";
+ case CEPH_MDS_SESSION_OPEN: return "open";
+ case CEPH_MDS_SESSION_HUNG: return "hung";
+ case CEPH_MDS_SESSION_CLOSING: return "closing";
+ case CEPH_MDS_SESSION_RESTARTING: return "restarting";
+ case CEPH_MDS_SESSION_RECONNECTING: return "reconnecting";
+ case CEPH_MDS_SESSION_REJECTED: return "rejected";
+ default: return "???";
+ }
+}
+
+static struct ceph_mds_session *get_session(struct ceph_mds_session *s)
+{
+ if (refcount_inc_not_zero(&s->s_ref)) {
+ dout("mdsc get_session %p %d -> %d\n", s,
+ refcount_read(&s->s_ref)-1, refcount_read(&s->s_ref));
+ return s;
+ } else {
+ dout("mdsc get_session %p 0 -- FAIL\n", s);
+ return NULL;
+ }
+}
+
+void ceph_put_mds_session(struct ceph_mds_session *s)
+{
+ dout("mdsc put_session %p %d -> %d\n", s,
+ refcount_read(&s->s_ref), refcount_read(&s->s_ref)-1);
+ if (refcount_dec_and_test(&s->s_ref)) {
+ if (s->s_auth.authorizer)
+ ceph_auth_destroy_authorizer(s->s_auth.authorizer);
+ kfree(s);
+ }
+}
+
+/*
+ * called under mdsc->mutex
+ */
+struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc,
+ int mds)
+{
+ struct ceph_mds_session *session;
+
+ if (mds >= mdsc->max_sessions || !mdsc->sessions[mds])
+ return NULL;
+ session = mdsc->sessions[mds];
+ dout("lookup_mds_session %p %d\n", session,
+ refcount_read(&session->s_ref));
+ get_session(session);
+ return session;
+}
+
+static bool __have_session(struct ceph_mds_client *mdsc, int mds)
+{
+ if (mds >= mdsc->max_sessions || !mdsc->sessions[mds])
+ return false;
+ else
+ return true;
+}
+
+static int __verify_registered_session(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *s)
+{
+ if (s->s_mds >= mdsc->max_sessions ||
+ mdsc->sessions[s->s_mds] != s)
+ return -ENOENT;
+ return 0;
+}
+
+/*
+ * create+register a new session for given mds.
+ * called under mdsc->mutex.
+ */
+static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
+ int mds)
+{
+ struct ceph_mds_session *s;
+
+ if (mds >= mdsc->mdsmap->m_num_mds)
+ return ERR_PTR(-EINVAL);
+
+ s = kzalloc(sizeof(*s), GFP_NOFS);
+ if (!s)
+ return ERR_PTR(-ENOMEM);
+
+ if (mds >= mdsc->max_sessions) {
+ int newmax = 1 << get_count_order(mds + 1);
+ struct ceph_mds_session **sa;
+
+ dout("%s: realloc to %d\n", __func__, newmax);
+ sa = kcalloc(newmax, sizeof(void *), GFP_NOFS);
+ if (!sa)
+ goto fail_realloc;
+ if (mdsc->sessions) {
+ memcpy(sa, mdsc->sessions,
+ mdsc->max_sessions * sizeof(void *));
+ kfree(mdsc->sessions);
+ }
+ mdsc->sessions = sa;
+ mdsc->max_sessions = newmax;
+ }
+
+ dout("%s: mds%d\n", __func__, mds);
+ s->s_mdsc = mdsc;
+ s->s_mds = mds;
+ s->s_state = CEPH_MDS_SESSION_NEW;
+ s->s_ttl = 0;
+ s->s_seq = 0;
+ mutex_init(&s->s_mutex);
+
+ ceph_con_init(&s->s_con, s, &mds_con_ops, &mdsc->fsc->client->msgr);
+
+ spin_lock_init(&s->s_gen_ttl_lock);
+ s->s_cap_gen = 0;
+ s->s_cap_ttl = jiffies - 1;
+
+ spin_lock_init(&s->s_cap_lock);
+ s->s_renew_requested = 0;
+ s->s_renew_seq = 0;
+ INIT_LIST_HEAD(&s->s_caps);
+ s->s_nr_caps = 0;
+ s->s_trim_caps = 0;
+ refcount_set(&s->s_ref, 1);
+ INIT_LIST_HEAD(&s->s_waiting);
+ INIT_LIST_HEAD(&s->s_unsafe);
+ s->s_num_cap_releases = 0;
+ s->s_cap_reconnect = 0;
+ s->s_cap_iterator = NULL;
+ INIT_LIST_HEAD(&s->s_cap_releases);
+ INIT_LIST_HEAD(&s->s_cap_flushing);
+
+ mdsc->sessions[mds] = s;
+ atomic_inc(&mdsc->num_sessions);
+ refcount_inc(&s->s_ref); /* one ref to sessions[], one to caller */
+
+ ceph_con_open(&s->s_con, CEPH_ENTITY_TYPE_MDS, mds,
+ ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
+
+ return s;
+
+fail_realloc:
+ kfree(s);
+ return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * called under mdsc->mutex
+ */
+static void __unregister_session(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *s)
+{
+ dout("__unregister_session mds%d %p\n", s->s_mds, s);
+ BUG_ON(mdsc->sessions[s->s_mds] != s);
+ mdsc->sessions[s->s_mds] = NULL;
+ ceph_con_close(&s->s_con);
+ ceph_put_mds_session(s);
+ atomic_dec(&mdsc->num_sessions);
+}
+
+/*
+ * drop session refs in request.
+ *
+ * should be last request ref, or hold mdsc->mutex
+ */
+static void put_request_session(struct ceph_mds_request *req)
+{
+ if (req->r_session) {
+ ceph_put_mds_session(req->r_session);
+ req->r_session = NULL;
+ }
+}
+
+void ceph_mdsc_release_request(struct kref *kref)
+{
+ struct ceph_mds_request *req = container_of(kref,
+ struct ceph_mds_request,
+ r_kref);
+ destroy_reply_info(&req->r_reply_info);
+ if (req->r_request)
+ ceph_msg_put(req->r_request);
+ if (req->r_reply)
+ ceph_msg_put(req->r_reply);
+ if (req->r_inode) {
+ ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
+ iput(req->r_inode);
+ }
+ if (req->r_parent)
+ ceph_put_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN);
+ iput(req->r_target_inode);
+ if (req->r_dentry)
+ dput(req->r_dentry);
+ if (req->r_old_dentry)
+ dput(req->r_old_dentry);
+ if (req->r_old_dentry_dir) {
+ /*
+ * track (and drop pins for) r_old_dentry_dir
+ * separately, since r_old_dentry's d_parent may have
+ * changed between the dir mutex being dropped and
+ * this request being freed.
+ */
+ ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir),
+ CEPH_CAP_PIN);
+ iput(req->r_old_dentry_dir);
+ }
+ kfree(req->r_path1);
+ kfree(req->r_path2);
+ if (req->r_pagelist)
+ ceph_pagelist_release(req->r_pagelist);
+ put_request_session(req);
+ ceph_unreserve_caps(req->r_mdsc, &req->r_caps_reservation);
+ kfree(req);
+}
+
+DEFINE_RB_FUNCS(request, struct ceph_mds_request, r_tid, r_node)
+
+/*
+ * lookup session, bump ref if found.
+ *
+ * called under mdsc->mutex.
+ */
+static struct ceph_mds_request *
+lookup_get_request(struct ceph_mds_client *mdsc, u64 tid)
+{
+ struct ceph_mds_request *req;
+
+ req = lookup_request(&mdsc->request_tree, tid);
+ if (req)
+ ceph_mdsc_get_request(req);
+
+ return req;
+}
+
+/*
+ * Register an in-flight request, and assign a tid. Link to directory
+ * are modifying (if any).
+ *
+ * Called under mdsc->mutex.
+ */
+static void __register_request(struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *req,
+ struct inode *dir)
+{
+ int ret = 0;
+
+ req->r_tid = ++mdsc->last_tid;
+ if (req->r_num_caps) {
+ ret = ceph_reserve_caps(mdsc, &req->r_caps_reservation,
+ req->r_num_caps);
+ if (ret < 0) {
+ pr_err("__register_request %p "
+ "failed to reserve caps: %d\n", req, ret);
+ /* set req->r_err to fail early from __do_request */
+ req->r_err = ret;
+ return;
+ }
+ }
+ dout("__register_request %p tid %lld\n", req, req->r_tid);
+ ceph_mdsc_get_request(req);
+ insert_request(&mdsc->request_tree, req);
+
+ req->r_uid = current_fsuid();
+ req->r_gid = current_fsgid();
+
+ if (mdsc->oldest_tid == 0 && req->r_op != CEPH_MDS_OP_SETFILELOCK)
+ mdsc->oldest_tid = req->r_tid;
+
+ if (dir) {
+ ihold(dir);
+ req->r_unsafe_dir = dir;
+ }
+}
+
+static void __unregister_request(struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *req)
+{
+ dout("__unregister_request %p tid %lld\n", req, req->r_tid);
+
+ /* Never leave an unregistered request on an unsafe list! */
+ list_del_init(&req->r_unsafe_item);
+
+ if (req->r_tid == mdsc->oldest_tid) {
+ struct rb_node *p = rb_next(&req->r_node);
+ mdsc->oldest_tid = 0;
+ while (p) {
+ struct ceph_mds_request *next_req =
+ rb_entry(p, struct ceph_mds_request, r_node);
+ if (next_req->r_op != CEPH_MDS_OP_SETFILELOCK) {
+ mdsc->oldest_tid = next_req->r_tid;
+ break;
+ }
+ p = rb_next(p);
+ }
+ }
+
+ erase_request(&mdsc->request_tree, req);
+
+ if (req->r_unsafe_dir &&
+ test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
+ struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
+ spin_lock(&ci->i_unsafe_lock);
+ list_del_init(&req->r_unsafe_dir_item);
+ spin_unlock(&ci->i_unsafe_lock);
+ }
+ if (req->r_target_inode &&
+ test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
+ struct ceph_inode_info *ci = ceph_inode(req->r_target_inode);
+ spin_lock(&ci->i_unsafe_lock);
+ list_del_init(&req->r_unsafe_target_item);
+ spin_unlock(&ci->i_unsafe_lock);
+ }
+
+ if (req->r_unsafe_dir) {
+ iput(req->r_unsafe_dir);
+ req->r_unsafe_dir = NULL;
+ }
+
+ complete_all(&req->r_safe_completion);
+
+ ceph_mdsc_put_request(req);
+}
+
+/*
+ * Walk back up the dentry tree until we hit a dentry representing a
+ * non-snapshot inode. We do this using the rcu_read_lock (which must be held
+ * when calling this) to ensure that the objects won't disappear while we're
+ * working with them. Once we hit a candidate dentry, we attempt to take a
+ * reference to it, and return that as the result.
+ */
+static struct inode *get_nonsnap_parent(struct dentry *dentry)
+{
+ struct inode *inode = NULL;
+
+ while (dentry && !IS_ROOT(dentry)) {
+ inode = d_inode_rcu(dentry);
+ if (!inode || ceph_snap(inode) == CEPH_NOSNAP)
+ break;
+ dentry = dentry->d_parent;
+ }
+ if (inode)
+ inode = igrab(inode);
+ return inode;
+}
+
+/*
+ * Choose mds to send request to next. If there is a hint set in the
+ * request (e.g., due to a prior forward hint from the mds), use that.
+ * Otherwise, consult frag tree and/or caps to identify the
+ * appropriate mds. If all else fails, choose randomly.
+ *
+ * Called under mdsc->mutex.
+ */
+static int __choose_mds(struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *req)
+{
+ struct inode *inode;
+ struct ceph_inode_info *ci;
+ struct ceph_cap *cap;
+ int mode = req->r_direct_mode;
+ int mds = -1;
+ u32 hash = req->r_direct_hash;
+ bool is_hash = test_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags);
+
+ /*
+ * is there a specific mds we should try? ignore hint if we have
+ * no session and the mds is not up (active or recovering).
+ */
+ if (req->r_resend_mds >= 0 &&
+ (__have_session(mdsc, req->r_resend_mds) ||
+ ceph_mdsmap_get_state(mdsc->mdsmap, req->r_resend_mds) > 0)) {
+ dout("choose_mds using resend_mds mds%d\n",
+ req->r_resend_mds);
+ return req->r_resend_mds;
+ }
+
+ if (mode == USE_RANDOM_MDS)
+ goto random;
+
+ inode = NULL;
+ if (req->r_inode) {
+ if (ceph_snap(req->r_inode) != CEPH_SNAPDIR) {
+ inode = req->r_inode;
+ ihold(inode);
+ } else {
+ /* req->r_dentry is non-null for LSSNAP request */
+ rcu_read_lock();
+ inode = get_nonsnap_parent(req->r_dentry);
+ rcu_read_unlock();
+ dout("__choose_mds using snapdir's parent %p\n", inode);
+ }
+ } else if (req->r_dentry) {
+ /* ignore race with rename; old or new d_parent is okay */
+ struct dentry *parent;
+ struct inode *dir;
+
+ rcu_read_lock();
+ parent = req->r_dentry->d_parent;
+ dir = req->r_parent ? : d_inode_rcu(parent);
+
+ if (!dir || dir->i_sb != mdsc->fsc->sb) {
+ /* not this fs or parent went negative */
+ inode = d_inode(req->r_dentry);
+ if (inode)
+ ihold(inode);
+ } else if (ceph_snap(dir) != CEPH_NOSNAP) {
+ /* direct snapped/virtual snapdir requests
+ * based on parent dir inode */
+ inode = get_nonsnap_parent(parent);
+ dout("__choose_mds using nonsnap parent %p\n", inode);
+ } else {
+ /* dentry target */
+ inode = d_inode(req->r_dentry);
+ if (!inode || mode == USE_AUTH_MDS) {
+ /* dir + name */
+ inode = igrab(dir);
+ hash = ceph_dentry_hash(dir, req->r_dentry);
+ is_hash = true;
+ } else {
+ ihold(inode);
+ }
+ }
+ rcu_read_unlock();
+ }
+
+ dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash,
+ (int)hash, mode);
+ if (!inode)
+ goto random;
+ ci = ceph_inode(inode);
+
+ if (is_hash && S_ISDIR(inode->i_mode)) {
+ struct ceph_inode_frag frag;
+ int found;
+
+ ceph_choose_frag(ci, hash, &frag, &found);
+ if (found) {
+ if (mode == USE_ANY_MDS && frag.ndist > 0) {
+ u8 r;
+
+ /* choose a random replica */
+ get_random_bytes(&r, 1);
+ r %= frag.ndist;
+ mds = frag.dist[r];
+ dout("choose_mds %p %llx.%llx "
+ "frag %u mds%d (%d/%d)\n",
+ inode, ceph_vinop(inode),
+ frag.frag, mds,
+ (int)r, frag.ndist);
+ if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
+ CEPH_MDS_STATE_ACTIVE)
+ goto out;
+ }
+
+ /* since this file/dir wasn't known to be
+ * replicated, then we want to look for the
+ * authoritative mds. */
+ mode = USE_AUTH_MDS;
+ if (frag.mds >= 0) {
+ /* choose auth mds */
+ mds = frag.mds;
+ dout("choose_mds %p %llx.%llx "
+ "frag %u mds%d (auth)\n",
+ inode, ceph_vinop(inode), frag.frag, mds);
+ if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
+ CEPH_MDS_STATE_ACTIVE)
+ goto out;
+ }
+ }
+ }
+
+ spin_lock(&ci->i_ceph_lock);
+ cap = NULL;
+ if (mode == USE_AUTH_MDS)
+ cap = ci->i_auth_cap;
+ if (!cap && !RB_EMPTY_ROOT(&ci->i_caps))
+ cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node);
+ if (!cap) {
+ spin_unlock(&ci->i_ceph_lock);
+ iput(inode);
+ goto random;
+ }
+ mds = cap->session->s_mds;
+ dout("choose_mds %p %llx.%llx mds%d (%scap %p)\n",
+ inode, ceph_vinop(inode), mds,
+ cap == ci->i_auth_cap ? "auth " : "", cap);
+ spin_unlock(&ci->i_ceph_lock);
+out:
+ iput(inode);
+ return mds;
+
+random:
+ mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap);
+ dout("choose_mds chose random mds%d\n", mds);
+ return mds;
+}
+
+
+/*
+ * session messages
+ */
+static struct ceph_msg *create_session_msg(u32 op, u64 seq)
+{
+ struct ceph_msg *msg;
+ struct ceph_mds_session_head *h;
+
+ msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS,
+ false);
+ if (!msg) {
+ pr_err("create_session_msg ENOMEM creating msg\n");
+ return NULL;
+ }
+ h = msg->front.iov_base;
+ h->op = cpu_to_le32(op);
+ h->seq = cpu_to_le64(seq);
+
+ return msg;
+}
+
+static void encode_supported_features(void **p, void *end)
+{
+ static const unsigned char bits[] = CEPHFS_FEATURES_CLIENT_SUPPORTED;
+ static const size_t count = ARRAY_SIZE(bits);
+
+ if (count > 0) {
+ size_t i;
+ size_t size = ((size_t)bits[count - 1] + 64) / 64 * 8;
+
+ BUG_ON(*p + 4 + size > end);
+ ceph_encode_32(p, size);
+ memset(*p, 0, size);
+ for (i = 0; i < count; i++)
+ ((unsigned char*)(*p))[i / 8] |= 1 << (bits[i] % 8);
+ *p += size;
+ } else {
+ BUG_ON(*p + 4 > end);
+ ceph_encode_32(p, 0);
+ }
+}
+
+/*
+ * session message, specialization for CEPH_SESSION_REQUEST_OPEN
+ * to include additional client metadata fields.
+ */
+static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u64 seq)
+{
+ struct ceph_msg *msg;
+ struct ceph_mds_session_head *h;
+ int i = -1;
+ int extra_bytes = 0;
+ int metadata_key_count = 0;
+ struct ceph_options *opt = mdsc->fsc->client->options;
+ struct ceph_mount_options *fsopt = mdsc->fsc->mount_options;
+ void *p, *end;
+
+ const char* metadata[][2] = {
+ {"hostname", mdsc->nodename},
+ {"kernel_version", init_utsname()->release},
+ {"entity_id", opt->name ? : ""},
+ {"root", fsopt->server_path ? : "/"},
+ {NULL, NULL}
+ };
+
+ /* Calculate serialized length of metadata */
+ extra_bytes = 4; /* map length */
+ for (i = 0; metadata[i][0]; ++i) {
+ extra_bytes += 8 + strlen(metadata[i][0]) +
+ strlen(metadata[i][1]);
+ metadata_key_count++;
+ }
+ /* supported feature */
+ extra_bytes += 4 + 8;
+
+ /* Allocate the message */
+ msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h) + extra_bytes,
+ GFP_NOFS, false);
+ if (!msg) {
+ pr_err("create_session_msg ENOMEM creating msg\n");
+ return NULL;
+ }
+ p = msg->front.iov_base;
+ end = p + msg->front.iov_len;
+
+ h = p;
+ h->op = cpu_to_le32(CEPH_SESSION_REQUEST_OPEN);
+ h->seq = cpu_to_le64(seq);
+
+ /*
+ * Serialize client metadata into waiting buffer space, using
+ * the format that userspace expects for map<string, string>
+ *
+ * ClientSession messages with metadata are v2
+ */
+ msg->hdr.version = cpu_to_le16(3);
+ msg->hdr.compat_version = cpu_to_le16(1);
+
+ /* The write pointer, following the session_head structure */
+ p += sizeof(*h);
+
+ /* Number of entries in the map */
+ ceph_encode_32(&p, metadata_key_count);
+
+ /* Two length-prefixed strings for each entry in the map */
+ for (i = 0; metadata[i][0]; ++i) {
+ size_t const key_len = strlen(metadata[i][0]);
+ size_t const val_len = strlen(metadata[i][1]);
+
+ ceph_encode_32(&p, key_len);
+ memcpy(p, metadata[i][0], key_len);
+ p += key_len;
+ ceph_encode_32(&p, val_len);
+ memcpy(p, metadata[i][1], val_len);
+ p += val_len;
+ }
+
+ encode_supported_features(&p, end);
+ msg->front.iov_len = p - msg->front.iov_base;
+ msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
+
+ return msg;
+}
+
+/*
+ * send session open request.
+ *
+ * called under mdsc->mutex
+ */
+static int __open_session(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session)
+{
+ struct ceph_msg *msg;
+ int mstate;
+ int mds = session->s_mds;
+
+ /* wait for mds to go active? */
+ mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds);
+ dout("open_session to mds%d (%s)\n", mds,
+ ceph_mds_state_name(mstate));
+ session->s_state = CEPH_MDS_SESSION_OPENING;
+ session->s_renew_requested = jiffies;
+
+ /* send connect message */
+ msg = create_session_open_msg(mdsc, session->s_seq);
+ if (!msg)
+ return -ENOMEM;
+ ceph_con_send(&session->s_con, msg);
+ return 0;
+}
+
+/*
+ * open sessions for any export targets for the given mds
+ *
+ * called under mdsc->mutex
+ */
+static struct ceph_mds_session *
+__open_export_target_session(struct ceph_mds_client *mdsc, int target)
+{
+ struct ceph_mds_session *session;
+
+ session = __ceph_lookup_mds_session(mdsc, target);
+ if (!session) {
+ session = register_session(mdsc, target);
+ if (IS_ERR(session))
+ return session;
+ }
+ if (session->s_state == CEPH_MDS_SESSION_NEW ||
+ session->s_state == CEPH_MDS_SESSION_CLOSING)
+ __open_session(mdsc, session);
+
+ return session;
+}
+
+struct ceph_mds_session *
+ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target)
+{
+ struct ceph_mds_session *session;
+
+ dout("open_export_target_session to mds%d\n", target);
+
+ mutex_lock(&mdsc->mutex);
+ session = __open_export_target_session(mdsc, target);
+ mutex_unlock(&mdsc->mutex);
+
+ return session;
+}
+
+static void __open_export_target_sessions(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session)
+{
+ struct ceph_mds_info *mi;
+ struct ceph_mds_session *ts;
+ int i, mds = session->s_mds;
+
+ if (mds >= mdsc->mdsmap->m_num_mds)
+ return;
+
+ mi = &mdsc->mdsmap->m_info[mds];
+ dout("open_export_target_sessions for mds%d (%d targets)\n",
+ session->s_mds, mi->num_export_targets);
+
+ for (i = 0; i < mi->num_export_targets; i++) {
+ ts = __open_export_target_session(mdsc, mi->export_targets[i]);
+ if (!IS_ERR(ts))
+ ceph_put_mds_session(ts);
+ }
+}
+
+void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session)
+{
+ mutex_lock(&mdsc->mutex);
+ __open_export_target_sessions(mdsc, session);
+ mutex_unlock(&mdsc->mutex);
+}
+
+/*
+ * session caps
+ */
+
+static void detach_cap_releases(struct ceph_mds_session *session,
+ struct list_head *target)
+{
+ lockdep_assert_held(&session->s_cap_lock);
+
+ list_splice_init(&session->s_cap_releases, target);
+ session->s_num_cap_releases = 0;
+ dout("dispose_cap_releases mds%d\n", session->s_mds);
+}
+
+static void dispose_cap_releases(struct ceph_mds_client *mdsc,
+ struct list_head *dispose)
+{
+ while (!list_empty(dispose)) {
+ struct ceph_cap *cap;
+ /* zero out the in-progress message */
+ cap = list_first_entry(dispose, struct ceph_cap, session_caps);
+ list_del(&cap->session_caps);
+ ceph_put_cap(mdsc, cap);
+ }
+}
+
+static void cleanup_session_requests(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session)
+{
+ struct ceph_mds_request *req;
+ struct rb_node *p;
+
+ dout("cleanup_session_requests mds%d\n", session->s_mds);
+ mutex_lock(&mdsc->mutex);
+ while (!list_empty(&session->s_unsafe)) {
+ req = list_first_entry(&session->s_unsafe,
+ struct ceph_mds_request, r_unsafe_item);
+ pr_warn_ratelimited(" dropping unsafe request %llu\n",
+ req->r_tid);
+ __unregister_request(mdsc, req);
+ }
+ /* zero r_attempts, so kick_requests() will re-send requests */
+ p = rb_first(&mdsc->request_tree);
+ while (p) {
+ req = rb_entry(p, struct ceph_mds_request, r_node);
+ p = rb_next(p);
+ if (req->r_session &&
+ req->r_session->s_mds == session->s_mds)
+ req->r_attempts = 0;
+ }
+ mutex_unlock(&mdsc->mutex);
+}
+
+/*
+ * Helper to safely iterate over all caps associated with a session, with
+ * special care taken to handle a racing __ceph_remove_cap().
+ *
+ * Caller must hold session s_mutex.
+ */
+static int iterate_session_caps(struct ceph_mds_session *session,
+ int (*cb)(struct inode *, struct ceph_cap *,
+ void *), void *arg)
+{
+ struct list_head *p;
+ struct ceph_cap *cap;
+ struct inode *inode, *last_inode = NULL;
+ struct ceph_cap *old_cap = NULL;
+ int ret;
+
+ dout("iterate_session_caps %p mds%d\n", session, session->s_mds);
+ spin_lock(&session->s_cap_lock);
+ p = session->s_caps.next;
+ while (p != &session->s_caps) {
+ cap = list_entry(p, struct ceph_cap, session_caps);
+ inode = igrab(&cap->ci->vfs_inode);
+ if (!inode) {
+ p = p->next;
+ continue;
+ }
+ session->s_cap_iterator = cap;
+ spin_unlock(&session->s_cap_lock);
+
+ if (last_inode) {
+ iput(last_inode);
+ last_inode = NULL;
+ }
+ if (old_cap) {
+ ceph_put_cap(session->s_mdsc, old_cap);
+ old_cap = NULL;
+ }
+
+ ret = cb(inode, cap, arg);
+ last_inode = inode;
+
+ spin_lock(&session->s_cap_lock);
+ p = p->next;
+ if (!cap->ci) {
+ dout("iterate_session_caps finishing cap %p removal\n",
+ cap);
+ BUG_ON(cap->session != session);
+ cap->session = NULL;
+ list_del_init(&cap->session_caps);
+ session->s_nr_caps--;
+ if (cap->queue_release) {
+ list_add_tail(&cap->session_caps,
+ &session->s_cap_releases);
+ session->s_num_cap_releases++;
+ } else {
+ old_cap = cap; /* put_cap it w/o locks held */
+ }
+ }
+ if (ret < 0)
+ goto out;
+ }
+ ret = 0;
+out:
+ session->s_cap_iterator = NULL;
+ spin_unlock(&session->s_cap_lock);
+
+ iput(last_inode);
+ if (old_cap)
+ ceph_put_cap(session->s_mdsc, old_cap);
+
+ return ret;
+}
+
+static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
+ void *arg)
+{
+ struct ceph_fs_client *fsc = (struct ceph_fs_client *)arg;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ LIST_HEAD(to_remove);
+ bool drop = false;
+ bool invalidate = false;
+
+ dout("removing cap %p, ci is %p, inode is %p\n",
+ cap, ci, &ci->vfs_inode);
+ spin_lock(&ci->i_ceph_lock);
+ __ceph_remove_cap(cap, false);
+ if (!ci->i_auth_cap) {
+ struct ceph_cap_flush *cf;
+ struct ceph_mds_client *mdsc = fsc->mdsc;
+
+ ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
+
+ if (ci->i_wrbuffer_ref > 0 &&
+ READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
+ invalidate = true;
+
+ while (!list_empty(&ci->i_cap_flush_list)) {
+ cf = list_first_entry(&ci->i_cap_flush_list,
+ struct ceph_cap_flush, i_list);
+ list_move(&cf->i_list, &to_remove);
+ }
+
+ spin_lock(&mdsc->cap_dirty_lock);
+
+ list_for_each_entry(cf, &to_remove, i_list)
+ list_del(&cf->g_list);
+
+ if (!list_empty(&ci->i_dirty_item)) {
+ pr_warn_ratelimited(
+ " dropping dirty %s state for %p %lld\n",
+ ceph_cap_string(ci->i_dirty_caps),
+ inode, ceph_ino(inode));
+ ci->i_dirty_caps = 0;
+ list_del_init(&ci->i_dirty_item);
+ drop = true;
+ }
+ if (!list_empty(&ci->i_flushing_item)) {
+ pr_warn_ratelimited(
+ " dropping dirty+flushing %s state for %p %lld\n",
+ ceph_cap_string(ci->i_flushing_caps),
+ inode, ceph_ino(inode));
+ ci->i_flushing_caps = 0;
+ list_del_init(&ci->i_flushing_item);
+ mdsc->num_cap_flushing--;
+ drop = true;
+ }
+ spin_unlock(&mdsc->cap_dirty_lock);
+
+ if (atomic_read(&ci->i_filelock_ref) > 0) {
+ /* make further file lock syscall return -EIO */
+ ci->i_ceph_flags |= CEPH_I_ERROR_FILELOCK;
+ pr_warn_ratelimited(" dropping file locks for %p %lld\n",
+ inode, ceph_ino(inode));
+ }
+
+ if (!ci->i_dirty_caps && ci->i_prealloc_cap_flush) {
+ list_add(&ci->i_prealloc_cap_flush->i_list, &to_remove);
+ ci->i_prealloc_cap_flush = NULL;
+ }
+
+ if (drop &&
+ ci->i_wrbuffer_ref_head == 0 &&
+ ci->i_wr_ref == 0 &&
+ ci->i_dirty_caps == 0 &&
+ ci->i_flushing_caps == 0) {
+ ceph_put_snap_context(ci->i_head_snapc);
+ ci->i_head_snapc = NULL;
+ }
+ }
+ spin_unlock(&ci->i_ceph_lock);
+ while (!list_empty(&to_remove)) {
+ struct ceph_cap_flush *cf;
+ cf = list_first_entry(&to_remove,
+ struct ceph_cap_flush, i_list);
+ list_del(&cf->i_list);
+ ceph_free_cap_flush(cf);
+ }
+
+ wake_up_all(&ci->i_cap_wq);
+ if (invalidate)
+ ceph_queue_invalidate(inode);
+ if (drop)
+ iput(inode);
+ return 0;
+}
+
+/*
+ * caller must hold session s_mutex
+ */
+static void remove_session_caps(struct ceph_mds_session *session)
+{
+ struct ceph_fs_client *fsc = session->s_mdsc->fsc;
+ struct super_block *sb = fsc->sb;
+ LIST_HEAD(dispose);
+
+ dout("remove_session_caps on %p\n", session);
+ iterate_session_caps(session, remove_session_caps_cb, fsc);
+
+ wake_up_all(&fsc->mdsc->cap_flushing_wq);
+
+ spin_lock(&session->s_cap_lock);
+ if (session->s_nr_caps > 0) {
+ struct inode *inode;
+ struct ceph_cap *cap, *prev = NULL;
+ struct ceph_vino vino;
+ /*
+ * iterate_session_caps() skips inodes that are being
+ * deleted, we need to wait until deletions are complete.
+ * __wait_on_freeing_inode() is designed for the job,
+ * but it is not exported, so use lookup inode function
+ * to access it.
+ */
+ while (!list_empty(&session->s_caps)) {
+ cap = list_entry(session->s_caps.next,
+ struct ceph_cap, session_caps);
+ if (cap == prev)
+ break;
+ prev = cap;
+ vino = cap->ci->i_vino;
+ spin_unlock(&session->s_cap_lock);
+
+ inode = ceph_find_inode(sb, vino);
+ iput(inode);
+
+ spin_lock(&session->s_cap_lock);
+ }
+ }
+
+ // drop cap expires and unlock s_cap_lock
+ detach_cap_releases(session, &dispose);
+
+ BUG_ON(session->s_nr_caps > 0);
+ BUG_ON(!list_empty(&session->s_cap_flushing));
+ spin_unlock(&session->s_cap_lock);
+ dispose_cap_releases(session->s_mdsc, &dispose);
+}
+
+/*
+ * wake up any threads waiting on this session's caps. if the cap is
+ * old (didn't get renewed on the client reconnect), remove it now.
+ *
+ * caller must hold s_mutex.
+ */
+static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
+ void *arg)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+
+ if (arg) {
+ spin_lock(&ci->i_ceph_lock);
+ ci->i_wanted_max_size = 0;
+ ci->i_requested_max_size = 0;
+ spin_unlock(&ci->i_ceph_lock);
+ }
+ wake_up_all(&ci->i_cap_wq);
+ return 0;
+}
+
+static void wake_up_session_caps(struct ceph_mds_session *session,
+ int reconnect)
+{
+ dout("wake_up_session_caps %p mds%d\n", session, session->s_mds);
+ iterate_session_caps(session, wake_up_session_cb,
+ (void *)(unsigned long)reconnect);
+}
+
+/*
+ * Send periodic message to MDS renewing all currently held caps. The
+ * ack will reset the expiration for all caps from this session.
+ *
+ * caller holds s_mutex
+ */
+static int send_renew_caps(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session)
+{
+ struct ceph_msg *msg;
+ int state;
+
+ if (time_after_eq(jiffies, session->s_cap_ttl) &&
+ time_after_eq(session->s_cap_ttl, session->s_renew_requested))
+ pr_info("mds%d caps stale\n", session->s_mds);
+ session->s_renew_requested = jiffies;
+
+ /* do not try to renew caps until a recovering mds has reconnected
+ * with its clients. */
+ state = ceph_mdsmap_get_state(mdsc->mdsmap, session->s_mds);
+ if (state < CEPH_MDS_STATE_RECONNECT) {
+ dout("send_renew_caps ignoring mds%d (%s)\n",
+ session->s_mds, ceph_mds_state_name(state));
+ return 0;
+ }
+
+ dout("send_renew_caps to mds%d (%s)\n", session->s_mds,
+ ceph_mds_state_name(state));
+ msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
+ ++session->s_renew_seq);
+ if (!msg)
+ return -ENOMEM;
+ ceph_con_send(&session->s_con, msg);
+ return 0;
+}
+
+static int send_flushmsg_ack(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session, u64 seq)
+{
+ struct ceph_msg *msg;
+
+ dout("send_flushmsg_ack to mds%d (%s)s seq %lld\n",
+ session->s_mds, ceph_session_state_name(session->s_state), seq);
+ msg = create_session_msg(CEPH_SESSION_FLUSHMSG_ACK, seq);
+ if (!msg)
+ return -ENOMEM;
+ ceph_con_send(&session->s_con, msg);
+ return 0;
+}
+
+
+/*
+ * Note new cap ttl, and any transition from stale -> not stale (fresh?).
+ *
+ * Called under session->s_mutex
+ */
+static void renewed_caps(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session, int is_renew)
+{
+ int was_stale;
+ int wake = 0;
+
+ spin_lock(&session->s_cap_lock);
+ was_stale = is_renew && time_after_eq(jiffies, session->s_cap_ttl);
+
+ session->s_cap_ttl = session->s_renew_requested +
+ mdsc->mdsmap->m_session_timeout*HZ;
+
+ if (was_stale) {
+ if (time_before(jiffies, session->s_cap_ttl)) {
+ pr_info("mds%d caps renewed\n", session->s_mds);
+ wake = 1;
+ } else {
+ pr_info("mds%d caps still stale\n", session->s_mds);
+ }
+ }
+ dout("renewed_caps mds%d ttl now %lu, was %s, now %s\n",
+ session->s_mds, session->s_cap_ttl, was_stale ? "stale" : "fresh",
+ time_before(jiffies, session->s_cap_ttl) ? "stale" : "fresh");
+ spin_unlock(&session->s_cap_lock);
+
+ if (wake)
+ wake_up_session_caps(session, 0);
+}
+
+/*
+ * send a session close request
+ */
+static int request_close_session(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session)
+{
+ struct ceph_msg *msg;
+
+ dout("request_close_session mds%d state %s seq %lld\n",
+ session->s_mds, ceph_session_state_name(session->s_state),
+ session->s_seq);
+ msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq);
+ if (!msg)
+ return -ENOMEM;
+ ceph_con_send(&session->s_con, msg);
+ return 1;
+}
+
+/*
+ * Called with s_mutex held.
+ */
+static int __close_session(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session)
+{
+ if (session->s_state >= CEPH_MDS_SESSION_CLOSING)
+ return 0;
+ session->s_state = CEPH_MDS_SESSION_CLOSING;
+ return request_close_session(mdsc, session);
+}
+
+static bool drop_negative_children(struct dentry *dentry)
+{
+ struct dentry *child;
+ bool all_negative = true;
+
+ if (!d_is_dir(dentry))
+ goto out;
+
+ spin_lock(&dentry->d_lock);
+ list_for_each_entry(child, &dentry->d_subdirs, d_child) {
+ if (d_really_is_positive(child)) {
+ all_negative = false;
+ break;
+ }
+ }
+ spin_unlock(&dentry->d_lock);
+
+ if (all_negative)
+ shrink_dcache_parent(dentry);
+out:
+ return all_negative;
+}
+
+/*
+ * Trim old(er) caps.
+ *
+ * Because we can't cache an inode without one or more caps, we do
+ * this indirectly: if a cap is unused, we prune its aliases, at which
+ * point the inode will hopefully get dropped to.
+ *
+ * Yes, this is a bit sloppy. Our only real goal here is to respond to
+ * memory pressure from the MDS, though, so it needn't be perfect.
+ */
+static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
+{
+ struct ceph_mds_session *session = arg;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int used, wanted, oissued, mine;
+
+ if (session->s_trim_caps <= 0)
+ return -1;
+
+ spin_lock(&ci->i_ceph_lock);
+ mine = cap->issued | cap->implemented;
+ used = __ceph_caps_used(ci);
+ wanted = __ceph_caps_file_wanted(ci);
+ oissued = __ceph_caps_issued_other(ci, cap);
+
+ dout("trim_caps_cb %p cap %p mine %s oissued %s used %s wanted %s\n",
+ inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued),
+ ceph_cap_string(used), ceph_cap_string(wanted));
+ if (cap == ci->i_auth_cap) {
+ if (ci->i_dirty_caps || ci->i_flushing_caps ||
+ !list_empty(&ci->i_cap_snaps))
+ goto out;
+ if ((used | wanted) & CEPH_CAP_ANY_WR)
+ goto out;
+ /* Note: it's possible that i_filelock_ref becomes non-zero
+ * after dropping auth caps. It doesn't hurt because reply
+ * of lock mds request will re-add auth caps. */
+ if (atomic_read(&ci->i_filelock_ref) > 0)
+ goto out;
+ }
+ /* The inode has cached pages, but it's no longer used.
+ * we can safely drop it */
+ if (wanted == 0 && used == CEPH_CAP_FILE_CACHE &&
+ !(oissued & CEPH_CAP_FILE_CACHE)) {
+ used = 0;
+ oissued = 0;
+ }
+ if ((used | wanted) & ~oissued & mine)
+ goto out; /* we need these caps */
+
+ if (oissued) {
+ /* we aren't the only cap.. just remove us */
+ __ceph_remove_cap(cap, true);
+ session->s_trim_caps--;
+ } else {
+ struct dentry *dentry;
+ /* try dropping referring dentries */
+ spin_unlock(&ci->i_ceph_lock);
+ dentry = d_find_any_alias(inode);
+ if (dentry && drop_negative_children(dentry)) {
+ int count;
+ dput(dentry);
+ d_prune_aliases(inode);
+ count = atomic_read(&inode->i_count);
+ if (count == 1)
+ session->s_trim_caps--;
+ dout("trim_caps_cb %p cap %p pruned, count now %d\n",
+ inode, cap, count);
+ } else {
+ dput(dentry);
+ }
+ return 0;
+ }
+
+out:
+ spin_unlock(&ci->i_ceph_lock);
+ return 0;
+}
+
+/*
+ * Trim session cap count down to some max number.
+ */
+int ceph_trim_caps(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session,
+ int max_caps)
+{
+ int trim_caps = session->s_nr_caps - max_caps;
+
+ dout("trim_caps mds%d start: %d / %d, trim %d\n",
+ session->s_mds, session->s_nr_caps, max_caps, trim_caps);
+ if (trim_caps > 0) {
+ session->s_trim_caps = trim_caps;
+ iterate_session_caps(session, trim_caps_cb, session);
+ dout("trim_caps mds%d done: %d / %d, trimmed %d\n",
+ session->s_mds, session->s_nr_caps, max_caps,
+ trim_caps - session->s_trim_caps);
+ session->s_trim_caps = 0;
+ }
+
+ ceph_send_cap_releases(mdsc, session);
+ return 0;
+}
+
+static int check_caps_flush(struct ceph_mds_client *mdsc,
+ u64 want_flush_tid)
+{
+ int ret = 1;
+
+ spin_lock(&mdsc->cap_dirty_lock);
+ if (!list_empty(&mdsc->cap_flush_list)) {
+ struct ceph_cap_flush *cf =
+ list_first_entry(&mdsc->cap_flush_list,
+ struct ceph_cap_flush, g_list);
+ if (cf->tid <= want_flush_tid) {
+ dout("check_caps_flush still flushing tid "
+ "%llu <= %llu\n", cf->tid, want_flush_tid);
+ ret = 0;
+ }
+ }
+ spin_unlock(&mdsc->cap_dirty_lock);
+ return ret;
+}
+
+/*
+ * flush all dirty inode data to disk.
+ *
+ * returns true if we've flushed through want_flush_tid
+ */
+static void wait_caps_flush(struct ceph_mds_client *mdsc,
+ u64 want_flush_tid)
+{
+ dout("check_caps_flush want %llu\n", want_flush_tid);
+
+ wait_event(mdsc->cap_flushing_wq,
+ check_caps_flush(mdsc, want_flush_tid));
+
+ dout("check_caps_flush ok, flushed thru %llu\n", want_flush_tid);
+}
+
+/*
+ * called under s_mutex
+ */
+void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session)
+{
+ struct ceph_msg *msg = NULL;
+ struct ceph_mds_cap_release *head;
+ struct ceph_mds_cap_item *item;
+ struct ceph_osd_client *osdc = &mdsc->fsc->client->osdc;
+ struct ceph_cap *cap;
+ LIST_HEAD(tmp_list);
+ int num_cap_releases;
+ __le32 barrier, *cap_barrier;
+
+ down_read(&osdc->lock);
+ barrier = cpu_to_le32(osdc->epoch_barrier);
+ up_read(&osdc->lock);
+
+ spin_lock(&session->s_cap_lock);
+again:
+ list_splice_init(&session->s_cap_releases, &tmp_list);
+ num_cap_releases = session->s_num_cap_releases;
+ session->s_num_cap_releases = 0;
+ spin_unlock(&session->s_cap_lock);
+
+ while (!list_empty(&tmp_list)) {
+ if (!msg) {
+ msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE,
+ PAGE_SIZE, GFP_NOFS, false);
+ if (!msg)
+ goto out_err;
+ head = msg->front.iov_base;
+ head->num = cpu_to_le32(0);
+ msg->front.iov_len = sizeof(*head);
+
+ msg->hdr.version = cpu_to_le16(2);
+ msg->hdr.compat_version = cpu_to_le16(1);
+ }
+
+ cap = list_first_entry(&tmp_list, struct ceph_cap,
+ session_caps);
+ list_del(&cap->session_caps);
+ num_cap_releases--;
+
+ head = msg->front.iov_base;
+ le32_add_cpu(&head->num, 1);
+ item = msg->front.iov_base + msg->front.iov_len;
+ item->ino = cpu_to_le64(cap->cap_ino);
+ item->cap_id = cpu_to_le64(cap->cap_id);
+ item->migrate_seq = cpu_to_le32(cap->mseq);
+ item->seq = cpu_to_le32(cap->issue_seq);
+ msg->front.iov_len += sizeof(*item);
+
+ ceph_put_cap(mdsc, cap);
+
+ if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
+ // Append cap_barrier field
+ cap_barrier = msg->front.iov_base + msg->front.iov_len;
+ *cap_barrier = barrier;
+ msg->front.iov_len += sizeof(*cap_barrier);
+
+ msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
+ dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
+ ceph_con_send(&session->s_con, msg);
+ msg = NULL;
+ }
+ }
+
+ BUG_ON(num_cap_releases != 0);
+
+ spin_lock(&session->s_cap_lock);
+ if (!list_empty(&session->s_cap_releases))
+ goto again;
+ spin_unlock(&session->s_cap_lock);
+
+ if (msg) {
+ // Append cap_barrier field
+ cap_barrier = msg->front.iov_base + msg->front.iov_len;
+ *cap_barrier = barrier;
+ msg->front.iov_len += sizeof(*cap_barrier);
+
+ msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
+ dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
+ ceph_con_send(&session->s_con, msg);
+ }
+ return;
+out_err:
+ pr_err("send_cap_releases mds%d, failed to allocate message\n",
+ session->s_mds);
+ spin_lock(&session->s_cap_lock);
+ list_splice(&tmp_list, &session->s_cap_releases);
+ session->s_num_cap_releases += num_cap_releases;
+ spin_unlock(&session->s_cap_lock);
+}
+
+/*
+ * requests
+ */
+
+int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
+ struct inode *dir)
+{
+ struct ceph_inode_info *ci = ceph_inode(dir);
+ struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
+ struct ceph_mount_options *opt = req->r_mdsc->fsc->mount_options;
+ size_t size = sizeof(struct ceph_mds_reply_dir_entry);
+ int order, num_entries;
+
+ spin_lock(&ci->i_ceph_lock);
+ num_entries = ci->i_files + ci->i_subdirs;
+ spin_unlock(&ci->i_ceph_lock);
+ num_entries = max(num_entries, 1);
+ num_entries = min(num_entries, opt->max_readdir);
+
+ order = get_order(size * num_entries);
+ while (order >= 0) {
+ rinfo->dir_entries = (void*)__get_free_pages(GFP_KERNEL |
+ __GFP_NOWARN,
+ order);
+ if (rinfo->dir_entries)
+ break;
+ order--;
+ }
+ if (!rinfo->dir_entries)
+ return -ENOMEM;
+
+ num_entries = (PAGE_SIZE << order) / size;
+ num_entries = min(num_entries, opt->max_readdir);
+
+ rinfo->dir_buf_size = PAGE_SIZE << order;
+ req->r_num_caps = num_entries + 1;
+ req->r_args.readdir.max_entries = cpu_to_le32(num_entries);
+ req->r_args.readdir.max_bytes = cpu_to_le32(opt->max_readdir_bytes);
+ return 0;
+}
+
+/*
+ * Create an mds request.
+ */
+struct ceph_mds_request *
+ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
+{
+ struct ceph_mds_request *req = kzalloc(sizeof(*req), GFP_NOFS);
+ struct timespec64 ts;
+
+ if (!req)
+ return ERR_PTR(-ENOMEM);
+
+ mutex_init(&req->r_fill_mutex);
+ req->r_mdsc = mdsc;
+ req->r_started = jiffies;
+ req->r_resend_mds = -1;
+ INIT_LIST_HEAD(&req->r_unsafe_dir_item);
+ INIT_LIST_HEAD(&req->r_unsafe_target_item);
+ req->r_fmode = -1;
+ kref_init(&req->r_kref);
+ RB_CLEAR_NODE(&req->r_node);
+ INIT_LIST_HEAD(&req->r_wait);
+ init_completion(&req->r_completion);
+ init_completion(&req->r_safe_completion);
+ INIT_LIST_HEAD(&req->r_unsafe_item);
+
+ ktime_get_coarse_real_ts64(&ts);
+ req->r_stamp = timespec64_trunc(ts, mdsc->fsc->sb->s_time_gran);
+
+ req->r_op = op;
+ req->r_direct_mode = mode;
+ return req;
+}
+
+/*
+ * return oldest (lowest) request, tid in request tree, 0 if none.
+ *
+ * called under mdsc->mutex.
+ */
+static struct ceph_mds_request *__get_oldest_req(struct ceph_mds_client *mdsc)
+{
+ if (RB_EMPTY_ROOT(&mdsc->request_tree))
+ return NULL;
+ return rb_entry(rb_first(&mdsc->request_tree),
+ struct ceph_mds_request, r_node);
+}
+
+static inline u64 __get_oldest_tid(struct ceph_mds_client *mdsc)
+{
+ return mdsc->oldest_tid;
+}
+
+/*
+ * Build a dentry's path. Allocate on heap; caller must kfree. Based
+ * on build_path_from_dentry in fs/cifs/dir.c.
+ *
+ * If @stop_on_nosnap, generate path relative to the first non-snapped
+ * inode.
+ *
+ * Encode hidden .snap dirs as a double /, i.e.
+ * foo/.snap/bar -> foo//bar
+ */
+char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
+ int stop_on_nosnap)
+{
+ struct dentry *temp;
+ char *path;
+ int len, pos;
+ unsigned seq;
+
+ if (!dentry)
+ return ERR_PTR(-EINVAL);
+
+retry:
+ len = 0;
+ seq = read_seqbegin(&rename_lock);
+ rcu_read_lock();
+ for (temp = dentry; !IS_ROOT(temp);) {
+ struct inode *inode = d_inode(temp);
+ if (inode && ceph_snap(inode) == CEPH_SNAPDIR)
+ len++; /* slash only */
+ else if (stop_on_nosnap && inode &&
+ ceph_snap(inode) == CEPH_NOSNAP)
+ break;
+ else
+ len += 1 + temp->d_name.len;
+ temp = temp->d_parent;
+ }
+ rcu_read_unlock();
+ if (len)
+ len--; /* no leading '/' */
+
+ path = kmalloc(len+1, GFP_NOFS);
+ if (!path)
+ return ERR_PTR(-ENOMEM);
+ pos = len;
+ path[pos] = 0; /* trailing null */
+ rcu_read_lock();
+ for (temp = dentry; !IS_ROOT(temp) && pos != 0; ) {
+ struct inode *inode;
+
+ spin_lock(&temp->d_lock);
+ inode = d_inode(temp);
+ if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
+ dout("build_path path+%d: %p SNAPDIR\n",
+ pos, temp);
+ } else if (stop_on_nosnap && inode &&
+ ceph_snap(inode) == CEPH_NOSNAP) {
+ spin_unlock(&temp->d_lock);
+ break;
+ } else {
+ pos -= temp->d_name.len;
+ if (pos < 0) {
+ spin_unlock(&temp->d_lock);
+ break;
+ }
+ strncpy(path + pos, temp->d_name.name,
+ temp->d_name.len);
+ }
+ spin_unlock(&temp->d_lock);
+ if (pos)
+ path[--pos] = '/';
+ temp = temp->d_parent;
+ }
+ rcu_read_unlock();
+ if (pos != 0 || read_seqretry(&rename_lock, seq)) {
+ pr_err("build_path did not end path lookup where "
+ "expected, namelen is %d, pos is %d\n", len, pos);
+ /* presumably this is only possible if racing with a
+ rename of one of the parent directories (we can not
+ lock the dentries above us to prevent this, but
+ retrying should be harmless) */
+ kfree(path);
+ goto retry;
+ }
+
+ *base = ceph_ino(d_inode(temp));
+ *plen = len;
+ dout("build_path on %p %d built %llx '%.*s'\n",
+ dentry, d_count(dentry), *base, len, path);
+ return path;
+}
+
+/* Duplicate the dentry->d_name.name safely */
+static int clone_dentry_name(struct dentry *dentry, const char **ppath,
+ int *ppathlen)
+{
+ u32 len;
+ char *name;
+
+retry:
+ len = READ_ONCE(dentry->d_name.len);
+ name = kmalloc(len + 1, GFP_NOFS);
+ if (!name)
+ return -ENOMEM;
+
+ spin_lock(&dentry->d_lock);
+ if (dentry->d_name.len != len) {
+ spin_unlock(&dentry->d_lock);
+ kfree(name);
+ goto retry;
+ }
+ memcpy(name, dentry->d_name.name, len);
+ spin_unlock(&dentry->d_lock);
+
+ name[len] = '\0';
+ *ppath = name;
+ *ppathlen = len;
+ return 0;
+}
+
+static int build_dentry_path(struct dentry *dentry, struct inode *dir,
+ const char **ppath, int *ppathlen, u64 *pino,
+ bool *pfreepath, bool parent_locked)
+{
+ int ret;
+ char *path;
+
+ rcu_read_lock();
+ if (!dir)
+ dir = d_inode_rcu(dentry->d_parent);
+ if (dir && ceph_snap(dir) == CEPH_NOSNAP) {
+ *pino = ceph_ino(dir);
+ rcu_read_unlock();
+ if (parent_locked) {
+ *ppath = dentry->d_name.name;
+ *ppathlen = dentry->d_name.len;
+ } else {
+ ret = clone_dentry_name(dentry, ppath, ppathlen);
+ if (ret)
+ return ret;
+ *pfreepath = true;
+ }
+ return 0;
+ }
+ rcu_read_unlock();
+ path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
+ *ppath = path;
+ *pfreepath = true;
+ return 0;
+}
+
+static int build_inode_path(struct inode *inode,
+ const char **ppath, int *ppathlen, u64 *pino,
+ bool *pfreepath)
+{
+ struct dentry *dentry;
+ char *path;
+
+ if (ceph_snap(inode) == CEPH_NOSNAP) {
+ *pino = ceph_ino(inode);
+ *ppathlen = 0;
+ return 0;
+ }
+ dentry = d_find_alias(inode);
+ path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
+ dput(dentry);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
+ *ppath = path;
+ *pfreepath = true;
+ return 0;
+}
+
+/*
+ * request arguments may be specified via an inode *, a dentry *, or
+ * an explicit ino+path.
+ */
+static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
+ struct inode *rdiri, const char *rpath,
+ u64 rino, const char **ppath, int *pathlen,
+ u64 *ino, bool *freepath, bool parent_locked)
+{
+ int r = 0;
+
+ if (rinode) {
+ r = build_inode_path(rinode, ppath, pathlen, ino, freepath);
+ dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
+ ceph_snap(rinode));
+ } else if (rdentry) {
+ r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino,
+ freepath, parent_locked);
+ dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen,
+ *ppath);
+ } else if (rpath || rino) {
+ *ino = rino;
+ *ppath = rpath;
+ *pathlen = rpath ? strlen(rpath) : 0;
+ dout(" path %.*s\n", *pathlen, rpath);
+ }
+
+ return r;
+}
+
+/*
+ * called under mdsc->mutex
+ */
+static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *req,
+ int mds, bool drop_cap_releases)
+{
+ struct ceph_msg *msg;
+ struct ceph_mds_request_head *head;
+ const char *path1 = NULL;
+ const char *path2 = NULL;
+ u64 ino1 = 0, ino2 = 0;
+ int pathlen1 = 0, pathlen2 = 0;
+ bool freepath1 = false, freepath2 = false;
+ int len;
+ u16 releases;
+ void *p, *end;
+ int ret;
+
+ ret = set_request_path_attr(req->r_inode, req->r_dentry,
+ req->r_parent, req->r_path1, req->r_ino1.ino,
+ &path1, &pathlen1, &ino1, &freepath1,
+ test_bit(CEPH_MDS_R_PARENT_LOCKED,
+ &req->r_req_flags));
+ if (ret < 0) {
+ msg = ERR_PTR(ret);
+ goto out;
+ }
+
+ /* If r_old_dentry is set, then assume that its parent is locked */
+ ret = set_request_path_attr(NULL, req->r_old_dentry,
+ req->r_old_dentry_dir,
+ req->r_path2, req->r_ino2.ino,
+ &path2, &pathlen2, &ino2, &freepath2, true);
+ if (ret < 0) {
+ msg = ERR_PTR(ret);
+ goto out_free1;
+ }
+
+ len = sizeof(*head) +
+ pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)) +
+ sizeof(struct ceph_timespec);
+
+ /* calculate (max) length for cap releases */
+ len += sizeof(struct ceph_mds_request_release) *
+ (!!req->r_inode_drop + !!req->r_dentry_drop +
+ !!req->r_old_inode_drop + !!req->r_old_dentry_drop);
+ if (req->r_dentry_drop)
+ len += req->r_dentry->d_name.len;
+ if (req->r_old_dentry_drop)
+ len += req->r_old_dentry->d_name.len;
+
+ msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS, false);
+ if (!msg) {
+ msg = ERR_PTR(-ENOMEM);
+ goto out_free2;
+ }
+
+ msg->hdr.version = cpu_to_le16(2);
+ msg->hdr.tid = cpu_to_le64(req->r_tid);
+
+ head = msg->front.iov_base;
+ p = msg->front.iov_base + sizeof(*head);
+ end = msg->front.iov_base + msg->front.iov_len;
+
+ head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
+ head->op = cpu_to_le32(req->r_op);
+ head->caller_uid = cpu_to_le32(from_kuid(&init_user_ns, req->r_uid));
+ head->caller_gid = cpu_to_le32(from_kgid(&init_user_ns, req->r_gid));
+ head->args = req->r_args;
+
+ ceph_encode_filepath(&p, end, ino1, path1);
+ ceph_encode_filepath(&p, end, ino2, path2);
+
+ /* make note of release offset, in case we need to replay */
+ req->r_request_release_offset = p - msg->front.iov_base;
+
+ /* cap releases */
+ releases = 0;
+ if (req->r_inode_drop)
+ releases += ceph_encode_inode_release(&p,
+ req->r_inode ? req->r_inode : d_inode(req->r_dentry),
+ mds, req->r_inode_drop, req->r_inode_unless, 0);
+ if (req->r_dentry_drop)
+ releases += ceph_encode_dentry_release(&p, req->r_dentry,
+ req->r_parent, mds, req->r_dentry_drop,
+ req->r_dentry_unless);
+ if (req->r_old_dentry_drop)
+ releases += ceph_encode_dentry_release(&p, req->r_old_dentry,
+ req->r_old_dentry_dir, mds,
+ req->r_old_dentry_drop,
+ req->r_old_dentry_unless);
+ if (req->r_old_inode_drop)
+ releases += ceph_encode_inode_release(&p,
+ d_inode(req->r_old_dentry),
+ mds, req->r_old_inode_drop, req->r_old_inode_unless, 0);
+
+ if (drop_cap_releases) {
+ releases = 0;
+ p = msg->front.iov_base + req->r_request_release_offset;
+ }
+
+ head->num_releases = cpu_to_le16(releases);
+
+ /* time stamp */
+ {
+ struct ceph_timespec ts;
+ ceph_encode_timespec64(&ts, &req->r_stamp);
+ ceph_encode_copy(&p, &ts, sizeof(ts));
+ }
+
+ BUG_ON(p > end);
+ msg->front.iov_len = p - msg->front.iov_base;
+ msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
+
+ if (req->r_pagelist) {
+ struct ceph_pagelist *pagelist = req->r_pagelist;
+ refcount_inc(&pagelist->refcnt);
+ ceph_msg_data_add_pagelist(msg, pagelist);
+ msg->hdr.data_len = cpu_to_le32(pagelist->length);
+ } else {
+ msg->hdr.data_len = 0;
+ }
+
+ msg->hdr.data_off = cpu_to_le16(0);
+
+out_free2:
+ if (freepath2)
+ kfree((char *)path2);
+out_free1:
+ if (freepath1)
+ kfree((char *)path1);
+out:
+ return msg;
+}
+
+/*
+ * called under mdsc->mutex if error, under no mutex if
+ * success.
+ */
+static void complete_request(struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *req)
+{
+ if (req->r_callback)
+ req->r_callback(mdsc, req);
+ else
+ complete_all(&req->r_completion);
+}
+
+/*
+ * called under mdsc->mutex
+ */
+static int __prepare_send_request(struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *req,
+ int mds, bool drop_cap_releases)
+{
+ struct ceph_mds_request_head *rhead;
+ struct ceph_msg *msg;
+ int flags = 0;
+
+ req->r_attempts++;
+ if (req->r_inode) {
+ struct ceph_cap *cap =
+ ceph_get_cap_for_mds(ceph_inode(req->r_inode), mds);
+
+ if (cap)
+ req->r_sent_on_mseq = cap->mseq;
+ else
+ req->r_sent_on_mseq = -1;
+ }
+ dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req,
+ req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);
+
+ if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
+ void *p;
+ /*
+ * Replay. Do not regenerate message (and rebuild
+ * paths, etc.); just use the original message.
+ * Rebuilding paths will break for renames because
+ * d_move mangles the src name.
+ */
+ msg = req->r_request;
+ rhead = msg->front.iov_base;
+
+ flags = le32_to_cpu(rhead->flags);
+ flags |= CEPH_MDS_FLAG_REPLAY;
+ rhead->flags = cpu_to_le32(flags);
+
+ if (req->r_target_inode)
+ rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
+
+ rhead->num_retry = req->r_attempts - 1;
+
+ /* remove cap/dentry releases from message */
+ rhead->num_releases = 0;
+
+ /* time stamp */
+ p = msg->front.iov_base + req->r_request_release_offset;
+ {
+ struct ceph_timespec ts;
+ ceph_encode_timespec64(&ts, &req->r_stamp);
+ ceph_encode_copy(&p, &ts, sizeof(ts));
+ }
+
+ msg->front.iov_len = p - msg->front.iov_base;
+ msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
+ return 0;
+ }
+
+ if (req->r_request) {
+ ceph_msg_put(req->r_request);
+ req->r_request = NULL;
+ }
+ msg = create_request_message(mdsc, req, mds, drop_cap_releases);
+ if (IS_ERR(msg)) {
+ req->r_err = PTR_ERR(msg);
+ return PTR_ERR(msg);
+ }
+ req->r_request = msg;
+
+ rhead = msg->front.iov_base;
+ rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
+ if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
+ flags |= CEPH_MDS_FLAG_REPLAY;
+ if (req->r_parent)
+ flags |= CEPH_MDS_FLAG_WANT_DENTRY;
+ rhead->flags = cpu_to_le32(flags);
+ rhead->num_fwd = req->r_num_fwd;
+ rhead->num_retry = req->r_attempts - 1;
+ rhead->ino = 0;
+
+ dout(" r_parent = %p\n", req->r_parent);
+ return 0;
+}
+
+/*
+ * send request, or put it on the appropriate wait list.
+ */
+static void __do_request(struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *req)
+{
+ struct ceph_mds_session *session = NULL;
+ int mds = -1;
+ int err = 0;
+
+ if (req->r_err || test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) {
+ if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags))
+ __unregister_request(mdsc, req);
+ return;
+ }
+
+ if (req->r_timeout &&
+ time_after_eq(jiffies, req->r_started + req->r_timeout)) {
+ dout("do_request timed out\n");
+ err = -EIO;
+ goto finish;
+ }
+ if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
+ dout("do_request forced umount\n");
+ err = -EIO;
+ goto finish;
+ }
+ if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_MOUNTING) {
+ if (mdsc->mdsmap_err) {
+ err = mdsc->mdsmap_err;
+ dout("do_request mdsmap err %d\n", err);
+ goto finish;
+ }
+ if (mdsc->mdsmap->m_epoch == 0) {
+ dout("do_request no mdsmap, waiting for map\n");
+ list_add(&req->r_wait, &mdsc->waiting_for_map);
+ return;
+ }
+ if (!(mdsc->fsc->mount_options->flags &
+ CEPH_MOUNT_OPT_MOUNTWAIT) &&
+ !ceph_mdsmap_is_cluster_available(mdsc->mdsmap)) {
+ err = -EHOSTUNREACH;
+ goto finish;
+ }
+ }
+
+ put_request_session(req);
+
+ mds = __choose_mds(mdsc, req);
+ if (mds < 0 ||
+ ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) {
+ dout("do_request no mds or not active, waiting for map\n");
+ list_add(&req->r_wait, &mdsc->waiting_for_map);
+ return;
+ }
+
+ /* get, open session */
+ session = __ceph_lookup_mds_session(mdsc, mds);
+ if (!session) {
+ session = register_session(mdsc, mds);
+ if (IS_ERR(session)) {
+ err = PTR_ERR(session);
+ goto finish;
+ }
+ }
+ req->r_session = get_session(session);
+
+ dout("do_request mds%d session %p state %s\n", mds, session,
+ ceph_session_state_name(session->s_state));
+ if (session->s_state != CEPH_MDS_SESSION_OPEN &&
+ session->s_state != CEPH_MDS_SESSION_HUNG) {
+ if (session->s_state == CEPH_MDS_SESSION_REJECTED) {
+ err = -EACCES;
+ goto out_session;
+ }
+ if (session->s_state == CEPH_MDS_SESSION_NEW ||
+ session->s_state == CEPH_MDS_SESSION_CLOSING)
+ __open_session(mdsc, session);
+ list_add(&req->r_wait, &session->s_waiting);
+ goto out_session;
+ }
+
+ /* send request */
+ req->r_resend_mds = -1; /* forget any previous mds hint */
+
+ if (req->r_request_started == 0) /* note request start time */
+ req->r_request_started = jiffies;
+
+ err = __prepare_send_request(mdsc, req, mds, false);
+ if (!err) {
+ ceph_msg_get(req->r_request);
+ ceph_con_send(&session->s_con, req->r_request);
+ }
+
+out_session:
+ ceph_put_mds_session(session);
+finish:
+ if (err) {
+ dout("__do_request early error %d\n", err);
+ req->r_err = err;
+ complete_request(mdsc, req);
+ __unregister_request(mdsc, req);
+ }
+ return;
+}
+
+/*
+ * called under mdsc->mutex
+ */
+static void __wake_requests(struct ceph_mds_client *mdsc,
+ struct list_head *head)
+{
+ struct ceph_mds_request *req;
+ LIST_HEAD(tmp_list);
+
+ list_splice_init(head, &tmp_list);
+
+ while (!list_empty(&tmp_list)) {
+ req = list_entry(tmp_list.next,
+ struct ceph_mds_request, r_wait);
+ list_del_init(&req->r_wait);
+ dout(" wake request %p tid %llu\n", req, req->r_tid);
+ __do_request(mdsc, req);
+ }
+}
+
+/*
+ * Wake up threads with requests pending for @mds, so that they can
+ * resubmit their requests to a possibly different mds.
+ */
+static void kick_requests(struct ceph_mds_client *mdsc, int mds)
+{
+ struct ceph_mds_request *req;
+ struct rb_node *p = rb_first(&mdsc->request_tree);
+
+ dout("kick_requests mds%d\n", mds);
+ while (p) {
+ req = rb_entry(p, struct ceph_mds_request, r_node);
+ p = rb_next(p);
+ if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
+ continue;
+ if (req->r_attempts > 0)
+ continue; /* only new requests */
+ if (req->r_session &&
+ req->r_session->s_mds == mds) {
+ dout(" kicking tid %llu\n", req->r_tid);
+ list_del_init(&req->r_wait);
+ __do_request(mdsc, req);
+ }
+ }
+}
+
+void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *req)
+{
+ dout("submit_request on %p\n", req);
+ mutex_lock(&mdsc->mutex);
+ __register_request(mdsc, req, NULL);
+ __do_request(mdsc, req);
+ mutex_unlock(&mdsc->mutex);
+}
+
+/*
+ * Synchrously perform an mds request. Take care of all of the
+ * session setup, forwarding, retry details.
+ */
+int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
+ struct inode *dir,
+ struct ceph_mds_request *req)
+{
+ int err;
+
+ dout("do_request on %p\n", req);
+
+ /* take CAP_PIN refs for r_inode, r_parent, r_old_dentry */
+ if (req->r_inode)
+ ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
+ if (req->r_parent)
+ ceph_get_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN);
+ if (req->r_old_dentry_dir)
+ ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir),
+ CEPH_CAP_PIN);
+
+ /* issue */
+ mutex_lock(&mdsc->mutex);
+ __register_request(mdsc, req, dir);
+ __do_request(mdsc, req);
+
+ if (req->r_err) {
+ err = req->r_err;
+ goto out;
+ }
+
+ /* wait */
+ mutex_unlock(&mdsc->mutex);
+ dout("do_request waiting\n");
+ if (!req->r_timeout && req->r_wait_for_completion) {
+ err = req->r_wait_for_completion(mdsc, req);
+ } else {
+ long timeleft = wait_for_completion_killable_timeout(
+ &req->r_completion,
+ ceph_timeout_jiffies(req->r_timeout));
+ if (timeleft > 0)
+ err = 0;
+ else if (!timeleft)
+ err = -EIO; /* timed out */
+ else
+ err = timeleft; /* killed */
+ }
+ dout("do_request waited, got %d\n", err);
+ mutex_lock(&mdsc->mutex);
+
+ /* only abort if we didn't race with a real reply */
+ if (test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) {
+ err = le32_to_cpu(req->r_reply_info.head->result);
+ } else if (err < 0) {
+ dout("aborted request %lld with %d\n", req->r_tid, err);
+
+ /*
+ * ensure we aren't running concurrently with
+ * ceph_fill_trace or ceph_readdir_prepopulate, which
+ * rely on locks (dir mutex) held by our caller.
+ */
+ mutex_lock(&req->r_fill_mutex);
+ req->r_err = err;
+ set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
+ mutex_unlock(&req->r_fill_mutex);
+
+ if (req->r_parent &&
+ (req->r_op & CEPH_MDS_OP_WRITE))
+ ceph_invalidate_dir_request(req);
+ } else {
+ err = req->r_err;
+ }
+
+out:
+ mutex_unlock(&mdsc->mutex);
+ dout("do_request %p done, result %d\n", req, err);
+ return err;
+}
+
+/*
+ * Invalidate dir's completeness, dentry lease state on an aborted MDS
+ * namespace request.
+ */
+void ceph_invalidate_dir_request(struct ceph_mds_request *req)
+{
+ struct inode *dir = req->r_parent;
+ struct inode *old_dir = req->r_old_dentry_dir;
+
+ dout("invalidate_dir_request %p %p (complete, lease(s))\n", dir, old_dir);
+
+ ceph_dir_clear_complete(dir);
+ if (old_dir)
+ ceph_dir_clear_complete(old_dir);
+ if (req->r_dentry)
+ ceph_invalidate_dentry_lease(req->r_dentry);
+ if (req->r_old_dentry)
+ ceph_invalidate_dentry_lease(req->r_old_dentry);
+}
+
+/*
+ * Handle mds reply.
+ *
+ * We take the session mutex and parse and process the reply immediately.
+ * This preserves the logical ordering of replies, capabilities, etc., sent
+ * by the MDS as they are applied to our local cache.
+ */
+static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
+{
+ struct ceph_mds_client *mdsc = session->s_mdsc;
+ struct ceph_mds_request *req;
+ struct ceph_mds_reply_head *head = msg->front.iov_base;
+ struct ceph_mds_reply_info_parsed *rinfo; /* parsed reply info */
+ struct ceph_snap_realm *realm;
+ u64 tid;
+ int err, result;
+ int mds = session->s_mds;
+
+ if (msg->front.iov_len < sizeof(*head)) {
+ pr_err("mdsc_handle_reply got corrupt (short) reply\n");
+ ceph_msg_dump(msg);
+ return;
+ }
+
+ /* get request, session */
+ tid = le64_to_cpu(msg->hdr.tid);
+ mutex_lock(&mdsc->mutex);
+ req = lookup_get_request(mdsc, tid);
+ if (!req) {
+ dout("handle_reply on unknown tid %llu\n", tid);
+ mutex_unlock(&mdsc->mutex);
+ return;
+ }
+ dout("handle_reply %p\n", req);
+
+ /* correct session? */
+ if (req->r_session != session) {
+ pr_err("mdsc_handle_reply got %llu on session mds%d"
+ " not mds%d\n", tid, session->s_mds,
+ req->r_session ? req->r_session->s_mds : -1);
+ mutex_unlock(&mdsc->mutex);
+ goto out;
+ }
+
+ /* dup? */
+ if ((test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags) && !head->safe) ||
+ (test_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags) && head->safe)) {
+ pr_warn("got a dup %s reply on %llu from mds%d\n",
+ head->safe ? "safe" : "unsafe", tid, mds);
+ mutex_unlock(&mdsc->mutex);
+ goto out;
+ }
+ if (test_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags)) {
+ pr_warn("got unsafe after safe on %llu from mds%d\n",
+ tid, mds);
+ mutex_unlock(&mdsc->mutex);
+ goto out;
+ }
+
+ result = le32_to_cpu(head->result);
+
+ /*
+ * Handle an ESTALE
+ * if we're not talking to the authority, send to them
+ * if the authority has changed while we weren't looking,
+ * send to new authority
+ * Otherwise we just have to return an ESTALE
+ */
+ if (result == -ESTALE) {
+ dout("got ESTALE on request %llu\n", req->r_tid);
+ req->r_resend_mds = -1;
+ if (req->r_direct_mode != USE_AUTH_MDS) {
+ dout("not using auth, setting for that now\n");
+ req->r_direct_mode = USE_AUTH_MDS;
+ __do_request(mdsc, req);
+ mutex_unlock(&mdsc->mutex);
+ goto out;
+ } else {
+ int mds = __choose_mds(mdsc, req);
+ if (mds >= 0 && mds != req->r_session->s_mds) {
+ dout("but auth changed, so resending\n");
+ __do_request(mdsc, req);
+ mutex_unlock(&mdsc->mutex);
+ goto out;
+ }
+ }
+ dout("have to return ESTALE on request %llu\n", req->r_tid);
+ }
+
+
+ if (head->safe) {
+ set_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags);
+ __unregister_request(mdsc, req);
+
+ if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
+ /*
+ * We already handled the unsafe response, now do the
+ * cleanup. No need to examine the response; the MDS
+ * doesn't include any result info in the safe
+ * response. And even if it did, there is nothing
+ * useful we could do with a revised return value.
+ */
+ dout("got safe reply %llu, mds%d\n", tid, mds);
+
+ /* last unsafe request during umount? */
+ if (mdsc->stopping && !__get_oldest_req(mdsc))
+ complete_all(&mdsc->safe_umount_waiters);
+ mutex_unlock(&mdsc->mutex);
+ goto out;
+ }
+ } else {
+ set_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags);
+ list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
+ if (req->r_unsafe_dir) {
+ struct ceph_inode_info *ci =
+ ceph_inode(req->r_unsafe_dir);
+ spin_lock(&ci->i_unsafe_lock);
+ list_add_tail(&req->r_unsafe_dir_item,
+ &ci->i_unsafe_dirops);
+ spin_unlock(&ci->i_unsafe_lock);
+ }
+ }
+
+ dout("handle_reply tid %lld result %d\n", tid, result);
+ rinfo = &req->r_reply_info;
+ err = parse_reply_info(msg, rinfo, session->s_con.peer_features);
+ mutex_unlock(&mdsc->mutex);
+
+ mutex_lock(&session->s_mutex);
+ if (err < 0) {
+ pr_err("mdsc_handle_reply got corrupt reply mds%d(tid:%lld)\n", mds, tid);
+ ceph_msg_dump(msg);
+ goto out_err;
+ }
+
+ /* snap trace */
+ realm = NULL;
+ if (rinfo->snapblob_len) {
+ down_write(&mdsc->snap_rwsem);
+ ceph_update_snap_trace(mdsc, rinfo->snapblob,
+ rinfo->snapblob + rinfo->snapblob_len,
+ le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP,
+ &realm);
+ downgrade_write(&mdsc->snap_rwsem);
+ } else {
+ down_read(&mdsc->snap_rwsem);
+ }
+
+ /* insert trace into our cache */
+ mutex_lock(&req->r_fill_mutex);
+ current->journal_info = req;
+ err = ceph_fill_trace(mdsc->fsc->sb, req);
+ if (err == 0) {
+ if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR ||
+ req->r_op == CEPH_MDS_OP_LSSNAP))
+ ceph_readdir_prepopulate(req, req->r_session);
+ ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
+ }
+ current->journal_info = NULL;
+ mutex_unlock(&req->r_fill_mutex);
+
+ up_read(&mdsc->snap_rwsem);
+ if (realm)
+ ceph_put_snap_realm(mdsc, realm);
+
+ if (err == 0 && req->r_target_inode &&
+ test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
+ struct ceph_inode_info *ci = ceph_inode(req->r_target_inode);
+ spin_lock(&ci->i_unsafe_lock);
+ list_add_tail(&req->r_unsafe_target_item, &ci->i_unsafe_iops);
+ spin_unlock(&ci->i_unsafe_lock);
+ }
+out_err:
+ mutex_lock(&mdsc->mutex);
+ if (!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
+ if (err) {
+ req->r_err = err;
+ } else {
+ req->r_reply = ceph_msg_get(msg);
+ set_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags);
+ }
+ } else {
+ dout("reply arrived after request %lld was aborted\n", tid);
+ }
+ mutex_unlock(&mdsc->mutex);
+
+ mutex_unlock(&session->s_mutex);
+
+ /* kick calling process */
+ complete_request(mdsc, req);
+out:
+ ceph_mdsc_put_request(req);
+ return;
+}
+
+
+
+/*
+ * handle mds notification that our request has been forwarded.
+ */
+static void handle_forward(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session,
+ struct ceph_msg *msg)
+{
+ struct ceph_mds_request *req;
+ u64 tid = le64_to_cpu(msg->hdr.tid);
+ u32 next_mds;
+ u32 fwd_seq;
+ int err = -EINVAL;
+ void *p = msg->front.iov_base;
+ void *end = p + msg->front.iov_len;
+
+ ceph_decode_need(&p, end, 2*sizeof(u32), bad);
+ next_mds = ceph_decode_32(&p);
+ fwd_seq = ceph_decode_32(&p);
+
+ mutex_lock(&mdsc->mutex);
+ req = lookup_get_request(mdsc, tid);
+ if (!req) {
+ dout("forward tid %llu to mds%d - req dne\n", tid, next_mds);
+ goto out; /* dup reply? */
+ }
+
+ if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
+ dout("forward tid %llu aborted, unregistering\n", tid);
+ __unregister_request(mdsc, req);
+ } else if (fwd_seq <= req->r_num_fwd) {
+ dout("forward tid %llu to mds%d - old seq %d <= %d\n",
+ tid, next_mds, req->r_num_fwd, fwd_seq);
+ } else {
+ /* resend. forward race not possible; mds would drop */
+ dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds);
+ BUG_ON(req->r_err);
+ BUG_ON(test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags));
+ req->r_attempts = 0;
+ req->r_num_fwd = fwd_seq;
+ req->r_resend_mds = next_mds;
+ put_request_session(req);
+ __do_request(mdsc, req);
+ }
+ ceph_mdsc_put_request(req);
+out:
+ mutex_unlock(&mdsc->mutex);
+ return;
+
+bad:
+ pr_err("mdsc_handle_forward decode error err=%d\n", err);
+}
+
+/*
+ * handle a mds session control message
+ */
+static void handle_session(struct ceph_mds_session *session,
+ struct ceph_msg *msg)
+{
+ struct ceph_mds_client *mdsc = session->s_mdsc;
+ u32 op;
+ u64 seq;
+ int mds = session->s_mds;
+ struct ceph_mds_session_head *h = msg->front.iov_base;
+ int wake = 0;
+
+ /* decode */
+ if (msg->front.iov_len < sizeof(*h))
+ goto bad;
+ op = le32_to_cpu(h->op);
+ seq = le64_to_cpu(h->seq);
+
+ mutex_lock(&mdsc->mutex);
+ if (op == CEPH_SESSION_CLOSE) {
+ get_session(session);
+ __unregister_session(mdsc, session);
+ }
+ /* FIXME: this ttl calculation is generous */
+ session->s_ttl = jiffies + HZ*mdsc->mdsmap->m_session_autoclose;
+ mutex_unlock(&mdsc->mutex);
+
+ mutex_lock(&session->s_mutex);
+
+ dout("handle_session mds%d %s %p state %s seq %llu\n",
+ mds, ceph_session_op_name(op), session,
+ ceph_session_state_name(session->s_state), seq);
+
+ if (session->s_state == CEPH_MDS_SESSION_HUNG) {
+ session->s_state = CEPH_MDS_SESSION_OPEN;
+ pr_info("mds%d came back\n", session->s_mds);
+ }
+
+ switch (op) {
+ case CEPH_SESSION_OPEN:
+ if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
+ pr_info("mds%d reconnect success\n", session->s_mds);
+ session->s_state = CEPH_MDS_SESSION_OPEN;
+ renewed_caps(mdsc, session, 0);
+ wake = 1;
+ if (mdsc->stopping)
+ __close_session(mdsc, session);
+ break;
+
+ case CEPH_SESSION_RENEWCAPS:
+ if (session->s_renew_seq == seq)
+ renewed_caps(mdsc, session, 1);
+ break;
+
+ case CEPH_SESSION_CLOSE:
+ if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
+ pr_info("mds%d reconnect denied\n", session->s_mds);
+ cleanup_session_requests(mdsc, session);
+ remove_session_caps(session);
+ wake = 2; /* for good measure */
+ wake_up_all(&mdsc->session_close_wq);
+ break;
+
+ case CEPH_SESSION_STALE:
+ pr_info("mds%d caps went stale, renewing\n",
+ session->s_mds);
+ spin_lock(&session->s_gen_ttl_lock);
+ session->s_cap_gen++;
+ session->s_cap_ttl = jiffies - 1;
+ spin_unlock(&session->s_gen_ttl_lock);
+ send_renew_caps(mdsc, session);
+ break;
+
+ case CEPH_SESSION_RECALL_STATE:
+ ceph_trim_caps(mdsc, session, le32_to_cpu(h->max_caps));
+ break;
+
+ case CEPH_SESSION_FLUSHMSG:
+ send_flushmsg_ack(mdsc, session, seq);
+ break;
+
+ case CEPH_SESSION_FORCE_RO:
+ dout("force_session_readonly %p\n", session);
+ spin_lock(&session->s_cap_lock);
+ session->s_readonly = true;
+ spin_unlock(&session->s_cap_lock);
+ wake_up_session_caps(session, 0);
+ break;
+
+ case CEPH_SESSION_REJECT:
+ WARN_ON(session->s_state != CEPH_MDS_SESSION_OPENING);
+ pr_info("mds%d rejected session\n", session->s_mds);
+ session->s_state = CEPH_MDS_SESSION_REJECTED;
+ cleanup_session_requests(mdsc, session);
+ remove_session_caps(session);
+ wake = 2; /* for good measure */
+ break;
+
+ default:
+ pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds);
+ WARN_ON(1);
+ }
+
+ mutex_unlock(&session->s_mutex);
+ if (wake) {
+ mutex_lock(&mdsc->mutex);
+ __wake_requests(mdsc, &session->s_waiting);
+ if (wake == 2)
+ kick_requests(mdsc, mds);
+ mutex_unlock(&mdsc->mutex);
+ }
+ if (op == CEPH_SESSION_CLOSE)
+ ceph_put_mds_session(session);
+ return;
+
+bad:
+ pr_err("mdsc_handle_session corrupt message mds%d len %d\n", mds,
+ (int)msg->front.iov_len);
+ ceph_msg_dump(msg);
+ return;
+}
+
+
+/*
+ * called under session->mutex.
+ */
+static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session)
+{
+ struct ceph_mds_request *req, *nreq;
+ struct rb_node *p;
+ int err;
+
+ dout("replay_unsafe_requests mds%d\n", session->s_mds);
+
+ mutex_lock(&mdsc->mutex);
+ list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item) {
+ err = __prepare_send_request(mdsc, req, session->s_mds, true);
+ if (!err) {
+ ceph_msg_get(req->r_request);
+ ceph_con_send(&session->s_con, req->r_request);
+ }
+ }
+
+ /*
+ * also re-send old requests when MDS enters reconnect stage. So that MDS
+ * can process completed request in clientreplay stage.
+ */
+ p = rb_first(&mdsc->request_tree);
+ while (p) {
+ req = rb_entry(p, struct ceph_mds_request, r_node);
+ p = rb_next(p);
+ if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
+ continue;
+ if (req->r_attempts == 0)
+ continue; /* only old requests */
+ if (req->r_session &&
+ req->r_session->s_mds == session->s_mds) {
+ err = __prepare_send_request(mdsc, req,
+ session->s_mds, true);
+ if (!err) {
+ ceph_msg_get(req->r_request);
+ ceph_con_send(&session->s_con, req->r_request);
+ }
+ }
+ }
+ mutex_unlock(&mdsc->mutex);
+}
+
+/*
+ * Encode information about a cap for a reconnect with the MDS.
+ */
+static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
+ void *arg)
+{
+ union {
+ struct ceph_mds_cap_reconnect v2;
+ struct ceph_mds_cap_reconnect_v1 v1;
+ } rec;
+ struct ceph_inode_info *ci = cap->ci;
+ struct ceph_reconnect_state *recon_state = arg;
+ struct ceph_pagelist *pagelist = recon_state->pagelist;
+ char *path;
+ int pathlen, err;
+ u64 pathbase;
+ u64 snap_follows;
+ struct dentry *dentry;
+
+ dout(" adding %p ino %llx.%llx cap %p %lld %s\n",
+ inode, ceph_vinop(inode), cap, cap->cap_id,
+ ceph_cap_string(cap->issued));
+ err = ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
+ if (err)
+ return err;
+
+ dentry = d_find_alias(inode);
+ if (dentry) {
+ path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase, 0);
+ if (IS_ERR(path)) {
+ err = PTR_ERR(path);
+ goto out_dput;
+ }
+ } else {
+ path = NULL;
+ pathlen = 0;
+ pathbase = 0;
+ }
+
+ spin_lock(&ci->i_ceph_lock);
+ cap->seq = 0; /* reset cap seq */
+ cap->issue_seq = 0; /* and issue_seq */
+ cap->mseq = 0; /* and migrate_seq */
+ cap->cap_gen = cap->session->s_cap_gen;
+
+ if (recon_state->msg_version >= 2) {
+ rec.v2.cap_id = cpu_to_le64(cap->cap_id);
+ rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
+ rec.v2.issued = cpu_to_le32(cap->issued);
+ rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
+ rec.v2.pathbase = cpu_to_le64(pathbase);
+ rec.v2.flock_len = (__force __le32)
+ ((ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) ? 0 : 1);
+ } else {
+ rec.v1.cap_id = cpu_to_le64(cap->cap_id);
+ rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
+ rec.v1.issued = cpu_to_le32(cap->issued);
+ rec.v1.size = cpu_to_le64(inode->i_size);
+ ceph_encode_timespec64(&rec.v1.mtime, &inode->i_mtime);
+ ceph_encode_timespec64(&rec.v1.atime, &inode->i_atime);
+ rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
+ rec.v1.pathbase = cpu_to_le64(pathbase);
+ }
+
+ if (list_empty(&ci->i_cap_snaps)) {
+ snap_follows = ci->i_head_snapc ? ci->i_head_snapc->seq : 0;
+ } else {
+ struct ceph_cap_snap *capsnap =
+ list_first_entry(&ci->i_cap_snaps,
+ struct ceph_cap_snap, ci_item);
+ snap_follows = capsnap->follows;
+ }
+ spin_unlock(&ci->i_ceph_lock);
+
+ if (recon_state->msg_version >= 2) {
+ int num_fcntl_locks, num_flock_locks;
+ struct ceph_filelock *flocks = NULL;
+ size_t struct_len, total_len = 0;
+ u8 struct_v = 0;
+
+encode_again:
+ if (rec.v2.flock_len) {
+ ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks);
+ } else {
+ num_fcntl_locks = 0;
+ num_flock_locks = 0;
+ }
+ if (num_fcntl_locks + num_flock_locks > 0) {
+ flocks = kmalloc_array(num_fcntl_locks + num_flock_locks,
+ sizeof(struct ceph_filelock),
+ GFP_NOFS);
+ if (!flocks) {
+ err = -ENOMEM;
+ goto out_free;
+ }
+ err = ceph_encode_locks_to_buffer(inode, flocks,
+ num_fcntl_locks,
+ num_flock_locks);
+ if (err) {
+ kfree(flocks);
+ flocks = NULL;
+ if (err == -ENOSPC)
+ goto encode_again;
+ goto out_free;
+ }
+ } else {
+ kfree(flocks);
+ flocks = NULL;
+ }
+
+ if (recon_state->msg_version >= 3) {
+ /* version, compat_version and struct_len */
+ total_len = 2 * sizeof(u8) + sizeof(u32);
+ struct_v = 2;
+ }
+ /*
+ * number of encoded locks is stable, so copy to pagelist
+ */
+ struct_len = 2 * sizeof(u32) +
+ (num_fcntl_locks + num_flock_locks) *
+ sizeof(struct ceph_filelock);
+ rec.v2.flock_len = cpu_to_le32(struct_len);
+
+ struct_len += sizeof(rec.v2);
+ struct_len += sizeof(u32) + pathlen;
+
+ if (struct_v >= 2)
+ struct_len += sizeof(u64); /* snap_follows */
+
+ total_len += struct_len;
+ err = ceph_pagelist_reserve(pagelist, total_len);
+
+ if (!err) {
+ if (recon_state->msg_version >= 3) {
+ ceph_pagelist_encode_8(pagelist, struct_v);
+ ceph_pagelist_encode_8(pagelist, 1);
+ ceph_pagelist_encode_32(pagelist, struct_len);
+ }
+ ceph_pagelist_encode_string(pagelist, path, pathlen);
+ ceph_pagelist_append(pagelist, &rec, sizeof(rec.v2));
+ ceph_locks_to_pagelist(flocks, pagelist,
+ num_fcntl_locks,
+ num_flock_locks);
+ if (struct_v >= 2)
+ ceph_pagelist_encode_64(pagelist, snap_follows);
+ }
+ kfree(flocks);
+ } else {
+ size_t size = sizeof(u32) + pathlen + sizeof(rec.v1);
+ err = ceph_pagelist_reserve(pagelist, size);
+ if (!err) {
+ ceph_pagelist_encode_string(pagelist, path, pathlen);
+ ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1));
+ }
+ }
+
+ recon_state->nr_caps++;
+out_free:
+ kfree(path);
+out_dput:
+ dput(dentry);
+ return err;
+}
+
+
+/*
+ * If an MDS fails and recovers, clients need to reconnect in order to
+ * reestablish shared state. This includes all caps issued through
+ * this session _and_ the snap_realm hierarchy. Because it's not
+ * clear which snap realms the mds cares about, we send everything we
+ * know about.. that ensures we'll then get any new info the
+ * recovering MDS might have.
+ *
+ * This is a relatively heavyweight operation, but it's rare.
+ *
+ * called with mdsc->mutex held.
+ */
+static void send_mds_reconnect(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session)
+{
+ struct ceph_msg *reply;
+ struct rb_node *p;
+ int mds = session->s_mds;
+ int err = -ENOMEM;
+ int s_nr_caps;
+ struct ceph_pagelist *pagelist;
+ struct ceph_reconnect_state recon_state;
+ LIST_HEAD(dispose);
+
+ pr_info("mds%d reconnect start\n", mds);
+
+ pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
+ if (!pagelist)
+ goto fail_nopagelist;
+ ceph_pagelist_init(pagelist);
+
+ reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS, false);
+ if (!reply)
+ goto fail_nomsg;
+
+ mutex_lock(&session->s_mutex);
+ session->s_state = CEPH_MDS_SESSION_RECONNECTING;
+ session->s_seq = 0;
+
+ dout("session %p state %s\n", session,
+ ceph_session_state_name(session->s_state));
+
+ spin_lock(&session->s_gen_ttl_lock);
+ session->s_cap_gen++;
+ spin_unlock(&session->s_gen_ttl_lock);
+
+ spin_lock(&session->s_cap_lock);
+ /* don't know if session is readonly */
+ session->s_readonly = 0;
+ /*
+ * notify __ceph_remove_cap() that we are composing cap reconnect.
+ * If a cap get released before being added to the cap reconnect,
+ * __ceph_remove_cap() should skip queuing cap release.
+ */
+ session->s_cap_reconnect = 1;
+ /* drop old cap expires; we're about to reestablish that state */
+ detach_cap_releases(session, &dispose);
+ spin_unlock(&session->s_cap_lock);
+ dispose_cap_releases(mdsc, &dispose);
+
+ /* trim unused caps to reduce MDS's cache rejoin time */
+ if (mdsc->fsc->sb->s_root)
+ shrink_dcache_parent(mdsc->fsc->sb->s_root);
+
+ ceph_con_close(&session->s_con);
+ ceph_con_open(&session->s_con,
+ CEPH_ENTITY_TYPE_MDS, mds,
+ ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
+
+ /* replay unsafe requests */
+ replay_unsafe_requests(mdsc, session);
+
+ down_read(&mdsc->snap_rwsem);
+
+ /* traverse this session's caps */
+ s_nr_caps = session->s_nr_caps;
+ err = ceph_pagelist_encode_32(pagelist, s_nr_caps);
+ if (err)
+ goto fail;
+
+ recon_state.nr_caps = 0;
+ recon_state.pagelist = pagelist;
+ if (session->s_con.peer_features & CEPH_FEATURE_MDSENC)
+ recon_state.msg_version = 3;
+ else if (session->s_con.peer_features & CEPH_FEATURE_FLOCK)
+ recon_state.msg_version = 2;
+ else
+ recon_state.msg_version = 1;
+ err = iterate_session_caps(session, encode_caps_cb, &recon_state);
+ if (err < 0)
+ goto fail;
+
+ spin_lock(&session->s_cap_lock);
+ session->s_cap_reconnect = 0;
+ spin_unlock(&session->s_cap_lock);
+
+ /*
+ * snaprealms. we provide mds with the ino, seq (version), and
+ * parent for all of our realms. If the mds has any newer info,
+ * it will tell us.
+ */
+ for (p = rb_first(&mdsc->snap_realms); p; p = rb_next(p)) {
+ struct ceph_snap_realm *realm =
+ rb_entry(p, struct ceph_snap_realm, node);
+ struct ceph_mds_snaprealm_reconnect sr_rec;
+
+ dout(" adding snap realm %llx seq %lld parent %llx\n",
+ realm->ino, realm->seq, realm->parent_ino);
+ sr_rec.ino = cpu_to_le64(realm->ino);
+ sr_rec.seq = cpu_to_le64(realm->seq);
+ sr_rec.parent = cpu_to_le64(realm->parent_ino);
+ err = ceph_pagelist_append(pagelist, &sr_rec, sizeof(sr_rec));
+ if (err)
+ goto fail;
+ }
+
+ reply->hdr.version = cpu_to_le16(recon_state.msg_version);
+
+ /* raced with cap release? */
+ if (s_nr_caps != recon_state.nr_caps) {
+ struct page *page = list_first_entry(&pagelist->head,
+ struct page, lru);
+ __le32 *addr = kmap_atomic(page);
+ *addr = cpu_to_le32(recon_state.nr_caps);
+ kunmap_atomic(addr);
+ }
+
+ reply->hdr.data_len = cpu_to_le32(pagelist->length);
+ ceph_msg_data_add_pagelist(reply, pagelist);
+
+ ceph_early_kick_flushing_caps(mdsc, session);
+
+ ceph_con_send(&session->s_con, reply);
+
+ mutex_unlock(&session->s_mutex);
+
+ mutex_lock(&mdsc->mutex);
+ __wake_requests(mdsc, &session->s_waiting);
+ mutex_unlock(&mdsc->mutex);
+
+ up_read(&mdsc->snap_rwsem);
+ return;
+
+fail:
+ ceph_msg_put(reply);
+ up_read(&mdsc->snap_rwsem);
+ mutex_unlock(&session->s_mutex);
+fail_nomsg:
+ ceph_pagelist_release(pagelist);
+fail_nopagelist:
+ pr_err("error %d preparing reconnect for mds%d\n", err, mds);
+ return;
+}
+
+
+/*
+ * compare old and new mdsmaps, kicking requests
+ * and closing out old connections as necessary
+ *
+ * called under mdsc->mutex.
+ */
+static void check_new_map(struct ceph_mds_client *mdsc,
+ struct ceph_mdsmap *newmap,
+ struct ceph_mdsmap *oldmap)
+{
+ int i;
+ int oldstate, newstate;
+ struct ceph_mds_session *s;
+
+ dout("check_new_map new %u old %u\n",
+ newmap->m_epoch, oldmap->m_epoch);
+
+ for (i = 0; i < oldmap->m_num_mds && i < mdsc->max_sessions; i++) {
+ if (!mdsc->sessions[i])
+ continue;
+ s = mdsc->sessions[i];
+ oldstate = ceph_mdsmap_get_state(oldmap, i);
+ newstate = ceph_mdsmap_get_state(newmap, i);
+
+ dout("check_new_map mds%d state %s%s -> %s%s (session %s)\n",
+ i, ceph_mds_state_name(oldstate),
+ ceph_mdsmap_is_laggy(oldmap, i) ? " (laggy)" : "",
+ ceph_mds_state_name(newstate),
+ ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "",
+ ceph_session_state_name(s->s_state));
+
+ if (i >= newmap->m_num_mds ||
+ memcmp(ceph_mdsmap_get_addr(oldmap, i),
+ ceph_mdsmap_get_addr(newmap, i),
+ sizeof(struct ceph_entity_addr))) {
+ if (s->s_state == CEPH_MDS_SESSION_OPENING) {
+ /* the session never opened, just close it
+ * out now */
+ get_session(s);
+ __unregister_session(mdsc, s);
+ __wake_requests(mdsc, &s->s_waiting);
+ ceph_put_mds_session(s);
+ } else if (i >= newmap->m_num_mds) {
+ /* force close session for stopped mds */
+ get_session(s);
+ __unregister_session(mdsc, s);
+ __wake_requests(mdsc, &s->s_waiting);
+ kick_requests(mdsc, i);
+ mutex_unlock(&mdsc->mutex);
+
+ mutex_lock(&s->s_mutex);
+ cleanup_session_requests(mdsc, s);
+ remove_session_caps(s);
+ mutex_unlock(&s->s_mutex);
+
+ ceph_put_mds_session(s);
+
+ mutex_lock(&mdsc->mutex);
+ } else {
+ /* just close it */
+ mutex_unlock(&mdsc->mutex);
+ mutex_lock(&s->s_mutex);
+ mutex_lock(&mdsc->mutex);
+ ceph_con_close(&s->s_con);
+ mutex_unlock(&s->s_mutex);
+ s->s_state = CEPH_MDS_SESSION_RESTARTING;
+ }
+ } else if (oldstate == newstate) {
+ continue; /* nothing new with this mds */
+ }
+
+ /*
+ * send reconnect?
+ */
+ if (s->s_state == CEPH_MDS_SESSION_RESTARTING &&
+ newstate >= CEPH_MDS_STATE_RECONNECT) {
+ mutex_unlock(&mdsc->mutex);
+ send_mds_reconnect(mdsc, s);
+ mutex_lock(&mdsc->mutex);
+ }
+
+ /*
+ * kick request on any mds that has gone active.
+ */
+ if (oldstate < CEPH_MDS_STATE_ACTIVE &&
+ newstate >= CEPH_MDS_STATE_ACTIVE) {
+ if (oldstate != CEPH_MDS_STATE_CREATING &&
+ oldstate != CEPH_MDS_STATE_STARTING)
+ pr_info("mds%d recovery completed\n", s->s_mds);
+ kick_requests(mdsc, i);
+ ceph_kick_flushing_caps(mdsc, s);
+ wake_up_session_caps(s, 1);
+ }
+ }
+
+ for (i = 0; i < newmap->m_num_mds && i < mdsc->max_sessions; i++) {
+ s = mdsc->sessions[i];
+ if (!s)
+ continue;
+ if (!ceph_mdsmap_is_laggy(newmap, i))
+ continue;
+ if (s->s_state == CEPH_MDS_SESSION_OPEN ||
+ s->s_state == CEPH_MDS_SESSION_HUNG ||
+ s->s_state == CEPH_MDS_SESSION_CLOSING) {
+ dout(" connecting to export targets of laggy mds%d\n",
+ i);
+ __open_export_target_sessions(mdsc, s);
+ }
+ }
+}
+
+
+
+/*
+ * leases
+ */
+
+/*
+ * caller must hold session s_mutex, dentry->d_lock
+ */
+void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry)
+{
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
+
+ ceph_put_mds_session(di->lease_session);
+ di->lease_session = NULL;
+}
+
+static void handle_lease(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session,
+ struct ceph_msg *msg)
+{
+ struct super_block *sb = mdsc->fsc->sb;
+ struct inode *inode;
+ struct dentry *parent, *dentry;
+ struct ceph_dentry_info *di;
+ int mds = session->s_mds;
+ struct ceph_mds_lease *h = msg->front.iov_base;
+ u32 seq;
+ struct ceph_vino vino;
+ struct qstr dname;
+ int release = 0;
+
+ dout("handle_lease from mds%d\n", mds);
+
+ /* decode */
+ if (msg->front.iov_len < sizeof(*h) + sizeof(u32))
+ goto bad;
+ vino.ino = le64_to_cpu(h->ino);
+ vino.snap = CEPH_NOSNAP;
+ seq = le32_to_cpu(h->seq);
+ dname.len = get_unaligned_le32(h + 1);
+ if (msg->front.iov_len < sizeof(*h) + sizeof(u32) + dname.len)
+ goto bad;
+ dname.name = (void *)(h + 1) + sizeof(u32);
+
+ /* lookup inode */
+ inode = ceph_find_inode(sb, vino);
+ dout("handle_lease %s, ino %llx %p %.*s\n",
+ ceph_lease_op_name(h->action), vino.ino, inode,
+ dname.len, dname.name);
+
+ mutex_lock(&session->s_mutex);
+ session->s_seq++;
+
+ if (!inode) {
+ dout("handle_lease no inode %llx\n", vino.ino);
+ goto release;
+ }
+
+ /* dentry */
+ parent = d_find_alias(inode);
+ if (!parent) {
+ dout("no parent dentry on inode %p\n", inode);
+ WARN_ON(1);
+ goto release; /* hrm... */
+ }
+ dname.hash = full_name_hash(parent, dname.name, dname.len);
+ dentry = d_lookup(parent, &dname);
+ dput(parent);
+ if (!dentry)
+ goto release;
+
+ spin_lock(&dentry->d_lock);
+ di = ceph_dentry(dentry);
+ switch (h->action) {
+ case CEPH_MDS_LEASE_REVOKE:
+ if (di->lease_session == session) {
+ if (ceph_seq_cmp(di->lease_seq, seq) > 0)
+ h->seq = cpu_to_le32(di->lease_seq);
+ __ceph_mdsc_drop_dentry_lease(dentry);
+ }
+ release = 1;
+ break;
+
+ case CEPH_MDS_LEASE_RENEW:
+ if (di->lease_session == session &&
+ di->lease_gen == session->s_cap_gen &&
+ di->lease_renew_from &&
+ di->lease_renew_after == 0) {
+ unsigned long duration =
+ msecs_to_jiffies(le32_to_cpu(h->duration_ms));
+
+ di->lease_seq = seq;
+ di->time = di->lease_renew_from + duration;
+ di->lease_renew_after = di->lease_renew_from +
+ (duration >> 1);
+ di->lease_renew_from = 0;
+ }
+ break;
+ }
+ spin_unlock(&dentry->d_lock);
+ dput(dentry);
+
+ if (!release)
+ goto out;
+
+release:
+ /* let's just reuse the same message */
+ h->action = CEPH_MDS_LEASE_REVOKE_ACK;
+ ceph_msg_get(msg);
+ ceph_con_send(&session->s_con, msg);
+
+out:
+ iput(inode);
+ mutex_unlock(&session->s_mutex);
+ return;
+
+bad:
+ pr_err("corrupt lease message\n");
+ ceph_msg_dump(msg);
+}
+
+void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
+ struct inode *inode,
+ struct dentry *dentry, char action,
+ u32 seq)
+{
+ struct ceph_msg *msg;
+ struct ceph_mds_lease *lease;
+ int len = sizeof(*lease) + sizeof(u32);
+ int dnamelen = 0;
+
+ dout("lease_send_msg inode %p dentry %p %s to mds%d\n",
+ inode, dentry, ceph_lease_op_name(action), session->s_mds);
+ dnamelen = dentry->d_name.len;
+ len += dnamelen;
+
+ msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS, false);
+ if (!msg)
+ return;
+ lease = msg->front.iov_base;
+ lease->action = action;
+ lease->ino = cpu_to_le64(ceph_vino(inode).ino);
+ lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap);
+ lease->seq = cpu_to_le32(seq);
+ put_unaligned_le32(dnamelen, lease + 1);
+ memcpy((void *)(lease + 1) + 4, dentry->d_name.name, dnamelen);
+
+ /*
+ * if this is a preemptive lease RELEASE, no need to
+ * flush request stream, since the actual request will
+ * soon follow.
+ */
+ msg->more_to_follow = (action == CEPH_MDS_LEASE_RELEASE);
+
+ ceph_con_send(&session->s_con, msg);
+}
+
+/*
+ * lock unlock sessions, to wait ongoing session activities
+ */
+static void lock_unlock_sessions(struct ceph_mds_client *mdsc)
+{
+ int i;
+
+ mutex_lock(&mdsc->mutex);
+ for (i = 0; i < mdsc->max_sessions; i++) {
+ struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
+ if (!s)
+ continue;
+ mutex_unlock(&mdsc->mutex);
+ mutex_lock(&s->s_mutex);
+ mutex_unlock(&s->s_mutex);
+ ceph_put_mds_session(s);
+ mutex_lock(&mdsc->mutex);
+ }
+ mutex_unlock(&mdsc->mutex);
+}
+
+
+
+/*
+ * delayed work -- periodically trim expired leases, renew caps with mds
+ */
+static void schedule_delayed(struct ceph_mds_client *mdsc)
+{
+ int delay = 5;
+ unsigned hz = round_jiffies_relative(HZ * delay);
+ schedule_delayed_work(&mdsc->delayed_work, hz);
+}
+
+static void delayed_work(struct work_struct *work)
+{
+ int i;
+ struct ceph_mds_client *mdsc =
+ container_of(work, struct ceph_mds_client, delayed_work.work);
+ int renew_interval;
+ int renew_caps;
+
+ dout("mdsc delayed_work\n");
+ ceph_check_delayed_caps(mdsc);
+
+ if (mdsc->stopping)
+ return;
+
+ mutex_lock(&mdsc->mutex);
+ renew_interval = mdsc->mdsmap->m_session_timeout >> 2;
+ renew_caps = time_after_eq(jiffies, HZ*renew_interval +
+ mdsc->last_renew_caps);
+ if (renew_caps)
+ mdsc->last_renew_caps = jiffies;
+
+ for (i = 0; i < mdsc->max_sessions; i++) {
+ struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
+ if (!s)
+ continue;
+ if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
+ dout("resending session close request for mds%d\n",
+ s->s_mds);
+ request_close_session(mdsc, s);
+ ceph_put_mds_session(s);
+ continue;
+ }
+ if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
+ if (s->s_state == CEPH_MDS_SESSION_OPEN) {
+ s->s_state = CEPH_MDS_SESSION_HUNG;
+ pr_info("mds%d hung\n", s->s_mds);
+ }
+ }
+ if (s->s_state == CEPH_MDS_SESSION_NEW ||
+ s->s_state == CEPH_MDS_SESSION_RESTARTING ||
+ s->s_state == CEPH_MDS_SESSION_REJECTED) {
+ /* this mds is failed or recovering, just wait */
+ ceph_put_mds_session(s);
+ continue;
+ }
+ mutex_unlock(&mdsc->mutex);
+
+ mutex_lock(&s->s_mutex);
+ if (renew_caps)
+ send_renew_caps(mdsc, s);
+ else
+ ceph_con_keepalive(&s->s_con);
+ if (s->s_state == CEPH_MDS_SESSION_OPEN ||
+ s->s_state == CEPH_MDS_SESSION_HUNG)
+ ceph_send_cap_releases(mdsc, s);
+ mutex_unlock(&s->s_mutex);
+ ceph_put_mds_session(s);
+
+ mutex_lock(&mdsc->mutex);
+ }
+ mutex_unlock(&mdsc->mutex);
+
+ schedule_delayed(mdsc);
+}
+
+int ceph_mdsc_init(struct ceph_fs_client *fsc)
+
+{
+ struct ceph_mds_client *mdsc;
+
+ mdsc = kzalloc(sizeof(struct ceph_mds_client), GFP_NOFS);
+ if (!mdsc)
+ return -ENOMEM;
+ mdsc->fsc = fsc;
+ mutex_init(&mdsc->mutex);
+ mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
+ if (!mdsc->mdsmap) {
+ kfree(mdsc);
+ return -ENOMEM;
+ }
+
+ init_completion(&mdsc->safe_umount_waiters);
+ init_waitqueue_head(&mdsc->session_close_wq);
+ INIT_LIST_HEAD(&mdsc->waiting_for_map);
+ mdsc->sessions = NULL;
+ atomic_set(&mdsc->num_sessions, 0);
+ mdsc->max_sessions = 0;
+ mdsc->stopping = 0;
+ atomic64_set(&mdsc->quotarealms_count, 0);
+ mdsc->last_snap_seq = 0;
+ init_rwsem(&mdsc->snap_rwsem);
+ mdsc->snap_realms = RB_ROOT;
+ INIT_LIST_HEAD(&mdsc->snap_empty);
+ spin_lock_init(&mdsc->snap_empty_lock);
+ mdsc->last_tid = 0;
+ mdsc->oldest_tid = 0;
+ mdsc->request_tree = RB_ROOT;
+ INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work);
+ mdsc->last_renew_caps = jiffies;
+ INIT_LIST_HEAD(&mdsc->cap_delay_list);
+ spin_lock_init(&mdsc->cap_delay_lock);
+ INIT_LIST_HEAD(&mdsc->snap_flush_list);
+ spin_lock_init(&mdsc->snap_flush_lock);
+ mdsc->last_cap_flush_tid = 1;
+ INIT_LIST_HEAD(&mdsc->cap_flush_list);
+ INIT_LIST_HEAD(&mdsc->cap_dirty);
+ INIT_LIST_HEAD(&mdsc->cap_dirty_migrating);
+ mdsc->num_cap_flushing = 0;
+ spin_lock_init(&mdsc->cap_dirty_lock);
+ init_waitqueue_head(&mdsc->cap_flushing_wq);
+ spin_lock_init(&mdsc->dentry_lru_lock);
+ INIT_LIST_HEAD(&mdsc->dentry_lru);
+
+ ceph_caps_init(mdsc);
+ ceph_adjust_min_caps(mdsc, fsc->min_caps);
+
+ init_rwsem(&mdsc->pool_perm_rwsem);
+ mdsc->pool_perm_tree = RB_ROOT;
+
+ strscpy(mdsc->nodename, utsname()->nodename,
+ sizeof(mdsc->nodename));
+
+ fsc->mdsc = mdsc;
+ return 0;
+}
+
+/*
+ * Wait for safe replies on open mds requests. If we time out, drop
+ * all requests from the tree to avoid dangling dentry refs.
+ */
+static void wait_requests(struct ceph_mds_client *mdsc)
+{
+ struct ceph_options *opts = mdsc->fsc->client->options;
+ struct ceph_mds_request *req;
+
+ mutex_lock(&mdsc->mutex);
+ if (__get_oldest_req(mdsc)) {
+ mutex_unlock(&mdsc->mutex);
+
+ dout("wait_requests waiting for requests\n");
+ wait_for_completion_timeout(&mdsc->safe_umount_waiters,
+ ceph_timeout_jiffies(opts->mount_timeout));
+
+ /* tear down remaining requests */
+ mutex_lock(&mdsc->mutex);
+ while ((req = __get_oldest_req(mdsc))) {
+ dout("wait_requests timed out on tid %llu\n",
+ req->r_tid);
+ __unregister_request(mdsc, req);
+ }
+ }
+ mutex_unlock(&mdsc->mutex);
+ dout("wait_requests done\n");
+}
+
+/*
+ * called before mount is ro, and before dentries are torn down.
+ * (hmm, does this still race with new lookups?)
+ */
+void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
+{
+ dout("pre_umount\n");
+ mdsc->stopping = 1;
+
+ lock_unlock_sessions(mdsc);
+ ceph_flush_dirty_caps(mdsc);
+ wait_requests(mdsc);
+
+ /*
+ * wait for reply handlers to drop their request refs and
+ * their inode/dcache refs
+ */
+ ceph_msgr_flush();
+}
+
+/*
+ * wait for all write mds requests to flush.
+ */
+static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid)
+{
+ struct ceph_mds_request *req = NULL, *nextreq;
+ struct rb_node *n;
+
+ mutex_lock(&mdsc->mutex);
+ dout("wait_unsafe_requests want %lld\n", want_tid);
+restart:
+ req = __get_oldest_req(mdsc);
+ while (req && req->r_tid <= want_tid) {
+ /* find next request */
+ n = rb_next(&req->r_node);
+ if (n)
+ nextreq = rb_entry(n, struct ceph_mds_request, r_node);
+ else
+ nextreq = NULL;
+ if (req->r_op != CEPH_MDS_OP_SETFILELOCK &&
+ (req->r_op & CEPH_MDS_OP_WRITE)) {
+ /* write op */
+ ceph_mdsc_get_request(req);
+ if (nextreq)
+ ceph_mdsc_get_request(nextreq);
+ mutex_unlock(&mdsc->mutex);
+ dout("wait_unsafe_requests wait on %llu (want %llu)\n",
+ req->r_tid, want_tid);
+ wait_for_completion(&req->r_safe_completion);
+ mutex_lock(&mdsc->mutex);
+ ceph_mdsc_put_request(req);
+ if (!nextreq)
+ break; /* next dne before, so we're done! */
+ if (RB_EMPTY_NODE(&nextreq->r_node)) {
+ /* next request was removed from tree */
+ ceph_mdsc_put_request(nextreq);
+ goto restart;
+ }
+ ceph_mdsc_put_request(nextreq); /* won't go away */
+ }
+ req = nextreq;
+ }
+ mutex_unlock(&mdsc->mutex);
+ dout("wait_unsafe_requests done\n");
+}
+
+void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
+{
+ u64 want_tid, want_flush;
+
+ if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
+ return;
+
+ dout("sync\n");
+ mutex_lock(&mdsc->mutex);
+ want_tid = mdsc->last_tid;
+ mutex_unlock(&mdsc->mutex);
+
+ ceph_flush_dirty_caps(mdsc);
+ spin_lock(&mdsc->cap_dirty_lock);
+ want_flush = mdsc->last_cap_flush_tid;
+ if (!list_empty(&mdsc->cap_flush_list)) {
+ struct ceph_cap_flush *cf =
+ list_last_entry(&mdsc->cap_flush_list,
+ struct ceph_cap_flush, g_list);
+ cf->wake = true;
+ }
+ spin_unlock(&mdsc->cap_dirty_lock);
+
+ dout("sync want tid %lld flush_seq %lld\n",
+ want_tid, want_flush);
+
+ wait_unsafe_requests(mdsc, want_tid);
+ wait_caps_flush(mdsc, want_flush);
+}
+
+/*
+ * true if all sessions are closed, or we force unmount
+ */
+static bool done_closing_sessions(struct ceph_mds_client *mdsc, int skipped)
+{
+ if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
+ return true;
+ return atomic_read(&mdsc->num_sessions) <= skipped;
+}
+
+/*
+ * called after sb is ro.
+ */
+void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
+{
+ struct ceph_options *opts = mdsc->fsc->client->options;
+ struct ceph_mds_session *session;
+ int i;
+ int skipped = 0;
+
+ dout("close_sessions\n");
+
+ /* close sessions */
+ mutex_lock(&mdsc->mutex);
+ for (i = 0; i < mdsc->max_sessions; i++) {
+ session = __ceph_lookup_mds_session(mdsc, i);
+ if (!session)
+ continue;
+ mutex_unlock(&mdsc->mutex);
+ mutex_lock(&session->s_mutex);
+ if (__close_session(mdsc, session) <= 0)
+ skipped++;
+ mutex_unlock(&session->s_mutex);
+ ceph_put_mds_session(session);
+ mutex_lock(&mdsc->mutex);
+ }
+ mutex_unlock(&mdsc->mutex);
+
+ dout("waiting for sessions to close\n");
+ wait_event_timeout(mdsc->session_close_wq,
+ done_closing_sessions(mdsc, skipped),
+ ceph_timeout_jiffies(opts->mount_timeout));
+
+ /* tear down remaining sessions */
+ mutex_lock(&mdsc->mutex);
+ for (i = 0; i < mdsc->max_sessions; i++) {
+ if (mdsc->sessions[i]) {
+ session = get_session(mdsc->sessions[i]);
+ __unregister_session(mdsc, session);
+ mutex_unlock(&mdsc->mutex);
+ mutex_lock(&session->s_mutex);
+ remove_session_caps(session);
+ mutex_unlock(&session->s_mutex);
+ ceph_put_mds_session(session);
+ mutex_lock(&mdsc->mutex);
+ }
+ }
+ WARN_ON(!list_empty(&mdsc->cap_delay_list));
+ mutex_unlock(&mdsc->mutex);
+
+ ceph_cleanup_empty_realms(mdsc);
+
+ cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
+
+ dout("stopped\n");
+}
+
+void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)
+{
+ struct ceph_mds_session *session;
+ int mds;
+
+ dout("force umount\n");
+
+ mutex_lock(&mdsc->mutex);
+ for (mds = 0; mds < mdsc->max_sessions; mds++) {
+ session = __ceph_lookup_mds_session(mdsc, mds);
+ if (!session)
+ continue;
+ mutex_unlock(&mdsc->mutex);
+ mutex_lock(&session->s_mutex);
+ __close_session(mdsc, session);
+ if (session->s_state == CEPH_MDS_SESSION_CLOSING) {
+ cleanup_session_requests(mdsc, session);
+ remove_session_caps(session);
+ }
+ mutex_unlock(&session->s_mutex);
+ ceph_put_mds_session(session);
+ mutex_lock(&mdsc->mutex);
+ kick_requests(mdsc, mds);
+ }
+ __wake_requests(mdsc, &mdsc->waiting_for_map);
+ mutex_unlock(&mdsc->mutex);
+}
+
+static void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
+{
+ dout("stop\n");
+ /*
+ * Make sure the delayed work stopped before releasing
+ * the resources.
+ *
+ * Because the cancel_delayed_work_sync() will only
+ * guarantee that the work finishes executing. But the
+ * delayed work will re-arm itself again after that.
+ */
+ flush_delayed_work(&mdsc->delayed_work);
+
+ if (mdsc->mdsmap)
+ ceph_mdsmap_destroy(mdsc->mdsmap);
+ kfree(mdsc->sessions);
+ ceph_caps_finalize(mdsc);
+ ceph_pool_perm_destroy(mdsc);
+}
+
+void ceph_mdsc_destroy(struct ceph_fs_client *fsc)
+{
+ struct ceph_mds_client *mdsc = fsc->mdsc;
+ dout("mdsc_destroy %p\n", mdsc);
+
+ if (!mdsc)
+ return;
+
+ /* flush out any connection work with references to us */
+ ceph_msgr_flush();
+
+ ceph_mdsc_stop(mdsc);
+
+ fsc->mdsc = NULL;
+ kfree(mdsc);
+ dout("mdsc_destroy %p done\n", mdsc);
+}
+
+void ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
+{
+ struct ceph_fs_client *fsc = mdsc->fsc;
+ const char *mds_namespace = fsc->mount_options->mds_namespace;
+ void *p = msg->front.iov_base;
+ void *end = p + msg->front.iov_len;
+ u32 epoch;
+ u32 map_len;
+ u32 num_fs;
+ u32 mount_fscid = (u32)-1;
+ u8 struct_v, struct_cv;
+ int err = -EINVAL;
+
+ ceph_decode_need(&p, end, sizeof(u32), bad);
+ epoch = ceph_decode_32(&p);
+
+ dout("handle_fsmap epoch %u\n", epoch);
+
+ ceph_decode_need(&p, end, 2 + sizeof(u32), bad);
+ struct_v = ceph_decode_8(&p);
+ struct_cv = ceph_decode_8(&p);
+ map_len = ceph_decode_32(&p);
+
+ ceph_decode_need(&p, end, sizeof(u32) * 3, bad);
+ p += sizeof(u32) * 2; /* skip epoch and legacy_client_fscid */
+
+ num_fs = ceph_decode_32(&p);
+ while (num_fs-- > 0) {
+ void *info_p, *info_end;
+ u32 info_len;
+ u8 info_v, info_cv;
+ u32 fscid, namelen;
+
+ ceph_decode_need(&p, end, 2 + sizeof(u32), bad);
+ info_v = ceph_decode_8(&p);
+ info_cv = ceph_decode_8(&p);
+ info_len = ceph_decode_32(&p);
+ ceph_decode_need(&p, end, info_len, bad);
+ info_p = p;
+ info_end = p + info_len;
+ p = info_end;
+
+ ceph_decode_need(&info_p, info_end, sizeof(u32) * 2, bad);
+ fscid = ceph_decode_32(&info_p);
+ namelen = ceph_decode_32(&info_p);
+ ceph_decode_need(&info_p, info_end, namelen, bad);
+
+ if (mds_namespace &&
+ strlen(mds_namespace) == namelen &&
+ !strncmp(mds_namespace, (char *)info_p, namelen)) {
+ mount_fscid = fscid;
+ break;
+ }
+ }
+
+ ceph_monc_got_map(&fsc->client->monc, CEPH_SUB_FSMAP, epoch);
+ if (mount_fscid != (u32)-1) {
+ fsc->client->monc.fs_cluster_id = mount_fscid;
+ ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP,
+ 0, true);
+ ceph_monc_renew_subs(&fsc->client->monc);
+ } else {
+ err = -ENOENT;
+ goto err_out;
+ }
+ return;
+
+bad:
+ pr_err("error decoding fsmap\n");
+err_out:
+ mutex_lock(&mdsc->mutex);
+ mdsc->mdsmap_err = err;
+ __wake_requests(mdsc, &mdsc->waiting_for_map);
+ mutex_unlock(&mdsc->mutex);
+}
+
+/*
+ * handle mds map update.
+ */
+void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
+{
+ u32 epoch;
+ u32 maplen;
+ void *p = msg->front.iov_base;
+ void *end = p + msg->front.iov_len;
+ struct ceph_mdsmap *newmap, *oldmap;
+ struct ceph_fsid fsid;
+ int err = -EINVAL;
+
+ ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad);
+ ceph_decode_copy(&p, &fsid, sizeof(fsid));
+ if (ceph_check_fsid(mdsc->fsc->client, &fsid) < 0)
+ return;
+ epoch = ceph_decode_32(&p);
+ maplen = ceph_decode_32(&p);
+ dout("handle_map epoch %u len %d\n", epoch, (int)maplen);
+
+ /* do we need it? */
+ mutex_lock(&mdsc->mutex);
+ if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) {
+ dout("handle_map epoch %u <= our %u\n",
+ epoch, mdsc->mdsmap->m_epoch);
+ mutex_unlock(&mdsc->mutex);
+ return;
+ }
+
+ newmap = ceph_mdsmap_decode(&p, end);
+ if (IS_ERR(newmap)) {
+ err = PTR_ERR(newmap);
+ goto bad_unlock;
+ }
+
+ /* swap into place */
+ if (mdsc->mdsmap) {
+ oldmap = mdsc->mdsmap;
+ mdsc->mdsmap = newmap;
+ check_new_map(mdsc, newmap, oldmap);
+ ceph_mdsmap_destroy(oldmap);
+ } else {
+ mdsc->mdsmap = newmap; /* first mds map */
+ }
+ mdsc->fsc->max_file_size = min((loff_t)mdsc->mdsmap->m_max_file_size,
+ MAX_LFS_FILESIZE);
+
+ __wake_requests(mdsc, &mdsc->waiting_for_map);
+ ceph_monc_got_map(&mdsc->fsc->client->monc, CEPH_SUB_MDSMAP,
+ mdsc->mdsmap->m_epoch);
+
+ mutex_unlock(&mdsc->mutex);
+ schedule_delayed(mdsc);
+ return;
+
+bad_unlock:
+ mutex_unlock(&mdsc->mutex);
+bad:
+ pr_err("error decoding mdsmap %d\n", err);
+ return;
+}
+
+static struct ceph_connection *con_get(struct ceph_connection *con)
+{
+ struct ceph_mds_session *s = con->private;
+
+ if (get_session(s)) {
+ dout("mdsc con_get %p ok (%d)\n", s, refcount_read(&s->s_ref));
+ return con;
+ }
+ dout("mdsc con_get %p FAIL\n", s);
+ return NULL;
+}
+
+static void con_put(struct ceph_connection *con)
+{
+ struct ceph_mds_session *s = con->private;
+
+ dout("mdsc con_put %p (%d)\n", s, refcount_read(&s->s_ref) - 1);
+ ceph_put_mds_session(s);
+}
+
+/*
+ * if the client is unresponsive for long enough, the mds will kill
+ * the session entirely.
+ */
+static void peer_reset(struct ceph_connection *con)
+{
+ struct ceph_mds_session *s = con->private;
+ struct ceph_mds_client *mdsc = s->s_mdsc;
+
+ pr_warn("mds%d closed our session\n", s->s_mds);
+ send_mds_reconnect(mdsc, s);
+}
+
+static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
+{
+ struct ceph_mds_session *s = con->private;
+ struct ceph_mds_client *mdsc = s->s_mdsc;
+ int type = le16_to_cpu(msg->hdr.type);
+
+ mutex_lock(&mdsc->mutex);
+ if (__verify_registered_session(mdsc, s) < 0) {
+ mutex_unlock(&mdsc->mutex);
+ goto out;
+ }
+ mutex_unlock(&mdsc->mutex);
+
+ switch (type) {
+ case CEPH_MSG_MDS_MAP:
+ ceph_mdsc_handle_mdsmap(mdsc, msg);
+ break;
+ case CEPH_MSG_FS_MAP_USER:
+ ceph_mdsc_handle_fsmap(mdsc, msg);
+ break;
+ case CEPH_MSG_CLIENT_SESSION:
+ handle_session(s, msg);
+ break;
+ case CEPH_MSG_CLIENT_REPLY:
+ handle_reply(s, msg);
+ break;
+ case CEPH_MSG_CLIENT_REQUEST_FORWARD:
+ handle_forward(mdsc, s, msg);
+ break;
+ case CEPH_MSG_CLIENT_CAPS:
+ ceph_handle_caps(s, msg);
+ break;
+ case CEPH_MSG_CLIENT_SNAP:
+ ceph_handle_snap(mdsc, s, msg);
+ break;
+ case CEPH_MSG_CLIENT_LEASE:
+ handle_lease(mdsc, s, msg);
+ break;
+ case CEPH_MSG_CLIENT_QUOTA:
+ ceph_handle_quota(mdsc, s, msg);
+ break;
+
+ default:
+ pr_err("received unknown message type %d %s\n", type,
+ ceph_msg_type_name(type));
+ }
+out:
+ ceph_msg_put(msg);
+}
+
+/*
+ * authentication
+ */
+
+/*
+ * Note: returned pointer is the address of a structure that's
+ * managed separately. Caller must *not* attempt to free it.
+ */
+static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con,
+ int *proto, int force_new)
+{
+ struct ceph_mds_session *s = con->private;
+ struct ceph_mds_client *mdsc = s->s_mdsc;
+ struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
+ struct ceph_auth_handshake *auth = &s->s_auth;
+
+ if (force_new && auth->authorizer) {
+ ceph_auth_destroy_authorizer(auth->authorizer);
+ auth->authorizer = NULL;
+ }
+ if (!auth->authorizer) {
+ int ret = ceph_auth_create_authorizer(ac, CEPH_ENTITY_TYPE_MDS,
+ auth);
+ if (ret)
+ return ERR_PTR(ret);
+ } else {
+ int ret = ceph_auth_update_authorizer(ac, CEPH_ENTITY_TYPE_MDS,
+ auth);
+ if (ret)
+ return ERR_PTR(ret);
+ }
+ *proto = ac->protocol;
+
+ return auth;
+}
+
+static int add_authorizer_challenge(struct ceph_connection *con,
+ void *challenge_buf, int challenge_buf_len)
+{
+ struct ceph_mds_session *s = con->private;
+ struct ceph_mds_client *mdsc = s->s_mdsc;
+ struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
+
+ return ceph_auth_add_authorizer_challenge(ac, s->s_auth.authorizer,
+ challenge_buf, challenge_buf_len);
+}
+
+static int verify_authorizer_reply(struct ceph_connection *con)
+{
+ struct ceph_mds_session *s = con->private;
+ struct ceph_mds_client *mdsc = s->s_mdsc;
+ struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
+
+ return ceph_auth_verify_authorizer_reply(ac, s->s_auth.authorizer);
+}
+
+static int invalidate_authorizer(struct ceph_connection *con)
+{
+ struct ceph_mds_session *s = con->private;
+ struct ceph_mds_client *mdsc = s->s_mdsc;
+ struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
+
+ ceph_auth_invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS);
+
+ return ceph_monc_validate_auth(&mdsc->fsc->client->monc);
+}
+
+static struct ceph_msg *mds_alloc_msg(struct ceph_connection *con,
+ struct ceph_msg_header *hdr, int *skip)
+{
+ struct ceph_msg *msg;
+ int type = (int) le16_to_cpu(hdr->type);
+ int front_len = (int) le32_to_cpu(hdr->front_len);
+
+ if (con->in_msg)
+ return con->in_msg;
+
+ *skip = 0;
+ msg = ceph_msg_new(type, front_len, GFP_NOFS, false);
+ if (!msg) {
+ pr_err("unable to allocate msg type %d len %d\n",
+ type, front_len);
+ return NULL;
+ }
+
+ return msg;
+}
+
+static int mds_sign_message(struct ceph_msg *msg)
+{
+ struct ceph_mds_session *s = msg->con->private;
+ struct ceph_auth_handshake *auth = &s->s_auth;
+
+ return ceph_auth_sign_message(auth, msg);
+}
+
+static int mds_check_message_signature(struct ceph_msg *msg)
+{
+ struct ceph_mds_session *s = msg->con->private;
+ struct ceph_auth_handshake *auth = &s->s_auth;
+
+ return ceph_auth_check_message_signature(auth, msg);
+}
+
+static const struct ceph_connection_operations mds_con_ops = {
+ .get = con_get,
+ .put = con_put,
+ .dispatch = dispatch,
+ .get_authorizer = get_authorizer,
+ .add_authorizer_challenge = add_authorizer_challenge,
+ .verify_authorizer_reply = verify_authorizer_reply,
+ .invalidate_authorizer = invalidate_authorizer,
+ .peer_reset = peer_reset,
+ .alloc_msg = mds_alloc_msg,
+ .sign_message = mds_sign_message,
+ .check_message_signature = mds_check_message_signature,
+};
+
+/* eof */
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
new file mode 100644
index 000000000..32fcce0d4
--- /dev/null
+++ b/fs/ceph/mds_client.h
@@ -0,0 +1,466 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _FS_CEPH_MDS_CLIENT_H
+#define _FS_CEPH_MDS_CLIENT_H
+
+#include <linux/completion.h>
+#include <linux/kref.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/rbtree.h>
+#include <linux/spinlock.h>
+#include <linux/refcount.h>
+#include <linux/utsname.h>
+
+#include <linux/ceph/types.h>
+#include <linux/ceph/messenger.h>
+#include <linux/ceph/mdsmap.h>
+#include <linux/ceph/auth.h>
+
+/* The first 8 bits are reserved for old ceph releases */
+#define CEPHFS_FEATURE_MIMIC 8
+
+#define CEPHFS_FEATURES_ALL { \
+ 0, 1, 2, 3, 4, 5, 6, 7, \
+ CEPHFS_FEATURE_MIMIC, \
+}
+
+#define CEPHFS_FEATURES_CLIENT_SUPPORTED CEPHFS_FEATURES_ALL
+#define CEPHFS_FEATURES_CLIENT_REQUIRED {}
+
+
+/*
+ * Some lock dependencies:
+ *
+ * session->s_mutex
+ * mdsc->mutex
+ *
+ * mdsc->snap_rwsem
+ *
+ * ci->i_ceph_lock
+ * mdsc->snap_flush_lock
+ * mdsc->cap_delay_lock
+ *
+ */
+
+struct ceph_fs_client;
+struct ceph_cap;
+
+/*
+ * parsed info about a single inode. pointers are into the encoded
+ * on-wire structures within the mds reply message payload.
+ */
+struct ceph_mds_reply_info_in {
+ struct ceph_mds_reply_inode *in;
+ struct ceph_dir_layout dir_layout;
+ u32 symlink_len;
+ char *symlink;
+ u32 xattr_len;
+ char *xattr_data;
+ u64 inline_version;
+ u32 inline_len;
+ char *inline_data;
+ u32 pool_ns_len;
+ char *pool_ns_data;
+ u64 max_bytes;
+ u64 max_files;
+};
+
+struct ceph_mds_reply_dir_entry {
+ char *name;
+ u32 name_len;
+ struct ceph_mds_reply_lease *lease;
+ struct ceph_mds_reply_info_in inode;
+ loff_t offset;
+};
+
+/*
+ * parsed info about an mds reply, including information about
+ * either: 1) the target inode and/or its parent directory and dentry,
+ * and directory contents (for readdir results), or
+ * 2) the file range lock info (for fcntl F_GETLK results).
+ */
+struct ceph_mds_reply_info_parsed {
+ struct ceph_mds_reply_head *head;
+
+ /* trace */
+ struct ceph_mds_reply_info_in diri, targeti;
+ struct ceph_mds_reply_dirfrag *dirfrag;
+ char *dname;
+ u32 dname_len;
+ struct ceph_mds_reply_lease *dlease;
+
+ /* extra */
+ union {
+ /* for fcntl F_GETLK results */
+ struct ceph_filelock *filelock_reply;
+
+ /* for readdir results */
+ struct {
+ struct ceph_mds_reply_dirfrag *dir_dir;
+ size_t dir_buf_size;
+ int dir_nr;
+ bool dir_end;
+ bool dir_complete;
+ bool hash_order;
+ bool offset_hash;
+ struct ceph_mds_reply_dir_entry *dir_entries;
+ };
+
+ /* for create results */
+ struct {
+ bool has_create_ino;
+ u64 ino;
+ };
+ };
+
+ /* encoded blob describing snapshot contexts for certain
+ operations (e.g., open) */
+ void *snapblob;
+ int snapblob_len;
+};
+
+
+/*
+ * cap releases are batched and sent to the MDS en masse.
+ *
+ * Account for per-message overhead of mds_cap_release header
+ * and __le32 for osd epoch barrier trailing field.
+ */
+#define CEPH_CAPS_PER_RELEASE ((PAGE_SIZE - sizeof(u32) - \
+ sizeof(struct ceph_mds_cap_release)) / \
+ sizeof(struct ceph_mds_cap_item))
+
+
+/*
+ * state associated with each MDS<->client session
+ */
+enum {
+ CEPH_MDS_SESSION_NEW = 1,
+ CEPH_MDS_SESSION_OPENING = 2,
+ CEPH_MDS_SESSION_OPEN = 3,
+ CEPH_MDS_SESSION_HUNG = 4,
+ CEPH_MDS_SESSION_CLOSING = 5,
+ CEPH_MDS_SESSION_RESTARTING = 6,
+ CEPH_MDS_SESSION_RECONNECTING = 7,
+ CEPH_MDS_SESSION_REJECTED = 8,
+};
+
+struct ceph_mds_session {
+ struct ceph_mds_client *s_mdsc;
+ int s_mds;
+ int s_state;
+ unsigned long s_ttl; /* time until mds kills us */
+ u64 s_seq; /* incoming msg seq # */
+ struct mutex s_mutex; /* serialize session messages */
+
+ struct ceph_connection s_con;
+
+ struct ceph_auth_handshake s_auth;
+
+ /* protected by s_gen_ttl_lock */
+ spinlock_t s_gen_ttl_lock;
+ u32 s_cap_gen; /* inc each time we get mds stale msg */
+ unsigned long s_cap_ttl; /* when session caps expire */
+
+ /* protected by s_cap_lock */
+ spinlock_t s_cap_lock;
+ struct list_head s_caps; /* all caps issued by this session */
+ int s_nr_caps, s_trim_caps;
+ int s_num_cap_releases;
+ int s_cap_reconnect;
+ int s_readonly;
+ struct list_head s_cap_releases; /* waiting cap_release messages */
+ struct ceph_cap *s_cap_iterator;
+
+ /* protected by mutex */
+ struct list_head s_cap_flushing; /* inodes w/ flushing caps */
+ unsigned long s_renew_requested; /* last time we sent a renew req */
+ u64 s_renew_seq;
+
+ refcount_t s_ref;
+ struct list_head s_waiting; /* waiting requests */
+ struct list_head s_unsafe; /* unsafe requests */
+};
+
+/*
+ * modes of choosing which MDS to send a request to
+ */
+enum {
+ USE_ANY_MDS,
+ USE_RANDOM_MDS,
+ USE_AUTH_MDS, /* prefer authoritative mds for this metadata item */
+};
+
+struct ceph_mds_request;
+struct ceph_mds_client;
+
+/*
+ * request completion callback
+ */
+typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *req);
+/*
+ * wait for request completion callback
+ */
+typedef int (*ceph_mds_request_wait_callback_t) (struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *req);
+
+/*
+ * an in-flight mds request
+ */
+struct ceph_mds_request {
+ u64 r_tid; /* transaction id */
+ struct rb_node r_node;
+ struct ceph_mds_client *r_mdsc;
+
+ int r_op; /* mds op code */
+
+ /* operation on what? */
+ struct inode *r_inode; /* arg1 */
+ struct dentry *r_dentry; /* arg1 */
+ struct dentry *r_old_dentry; /* arg2: rename from or link from */
+ struct inode *r_old_dentry_dir; /* arg2: old dentry's parent dir */
+ char *r_path1, *r_path2;
+ struct ceph_vino r_ino1, r_ino2;
+
+ struct inode *r_parent; /* parent dir inode */
+ struct inode *r_target_inode; /* resulting inode */
+
+#define CEPH_MDS_R_DIRECT_IS_HASH (1) /* r_direct_hash is valid */
+#define CEPH_MDS_R_ABORTED (2) /* call was aborted */
+#define CEPH_MDS_R_GOT_UNSAFE (3) /* got an unsafe reply */
+#define CEPH_MDS_R_GOT_SAFE (4) /* got a safe reply */
+#define CEPH_MDS_R_GOT_RESULT (5) /* got a result */
+#define CEPH_MDS_R_DID_PREPOPULATE (6) /* prepopulated readdir */
+#define CEPH_MDS_R_PARENT_LOCKED (7) /* is r_parent->i_rwsem wlocked? */
+ unsigned long r_req_flags;
+
+ struct mutex r_fill_mutex;
+
+ union ceph_mds_request_args r_args;
+ int r_fmode; /* file mode, if expecting cap */
+ kuid_t r_uid;
+ kgid_t r_gid;
+ struct timespec64 r_stamp;
+
+ /* for choosing which mds to send this request to */
+ int r_direct_mode;
+ u32 r_direct_hash; /* choose dir frag based on this dentry hash */
+
+ /* data payload is used for xattr ops */
+ struct ceph_pagelist *r_pagelist;
+
+ /* what caps shall we drop? */
+ int r_inode_drop, r_inode_unless;
+ int r_dentry_drop, r_dentry_unless;
+ int r_old_dentry_drop, r_old_dentry_unless;
+ struct inode *r_old_inode;
+ int r_old_inode_drop, r_old_inode_unless;
+
+ struct ceph_msg *r_request; /* original request */
+ int r_request_release_offset;
+ struct ceph_msg *r_reply;
+ struct ceph_mds_reply_info_parsed r_reply_info;
+ struct page *r_locked_page;
+ int r_err;
+
+ unsigned long r_timeout; /* optional. jiffies, 0 is "wait forever" */
+ unsigned long r_started; /* start time to measure timeout against */
+ unsigned long r_request_started; /* start time for mds request only,
+ used to measure lease durations */
+
+ /* link unsafe requests to parent directory, for fsync */
+ struct inode *r_unsafe_dir;
+ struct list_head r_unsafe_dir_item;
+
+ /* unsafe requests that modify the target inode */
+ struct list_head r_unsafe_target_item;
+
+ struct ceph_mds_session *r_session;
+
+ int r_attempts; /* resend attempts */
+ int r_num_fwd; /* number of forward attempts */
+ int r_resend_mds; /* mds to resend to next, if any*/
+ u32 r_sent_on_mseq; /* cap mseq request was sent at*/
+
+ struct kref r_kref;
+ struct list_head r_wait;
+ struct completion r_completion;
+ struct completion r_safe_completion;
+ ceph_mds_request_callback_t r_callback;
+ ceph_mds_request_wait_callback_t r_wait_for_completion;
+ struct list_head r_unsafe_item; /* per-session unsafe list item */
+
+ long long r_dir_release_cnt;
+ long long r_dir_ordered_cnt;
+ int r_readdir_cache_idx;
+ u32 r_readdir_offset;
+
+ struct ceph_cap_reservation r_caps_reservation;
+ int r_num_caps;
+};
+
+struct ceph_pool_perm {
+ struct rb_node node;
+ int perm;
+ s64 pool;
+ size_t pool_ns_len;
+ char pool_ns[];
+};
+
+/*
+ * mds client state
+ */
+struct ceph_mds_client {
+ struct ceph_fs_client *fsc;
+ struct mutex mutex; /* all nested structures */
+
+ struct ceph_mdsmap *mdsmap;
+ struct completion safe_umount_waiters;
+ wait_queue_head_t session_close_wq;
+ struct list_head waiting_for_map;
+ int mdsmap_err;
+
+ struct ceph_mds_session **sessions; /* NULL for mds if no session */
+ atomic_t num_sessions;
+ int max_sessions; /* len of s_mds_sessions */
+ int stopping; /* true if shutting down */
+
+ atomic64_t quotarealms_count; /* # realms with quota */
+
+ /*
+ * snap_rwsem will cover cap linkage into snaprealms, and
+ * realm snap contexts. (later, we can do per-realm snap
+ * contexts locks..) the empty list contains realms with no
+ * references (implying they contain no inodes with caps) that
+ * should be destroyed.
+ */
+ u64 last_snap_seq;
+ struct rw_semaphore snap_rwsem;
+ struct rb_root snap_realms;
+ struct list_head snap_empty;
+ spinlock_t snap_empty_lock; /* protect snap_empty */
+
+ u64 last_tid; /* most recent mds request */
+ u64 oldest_tid; /* oldest incomplete mds request,
+ excluding setfilelock requests */
+ struct rb_root request_tree; /* pending mds requests */
+ struct delayed_work delayed_work; /* delayed work */
+ unsigned long last_renew_caps; /* last time we renewed our caps */
+ struct list_head cap_delay_list; /* caps with delayed release */
+ spinlock_t cap_delay_lock; /* protects cap_delay_list */
+ struct list_head snap_flush_list; /* cap_snaps ready to flush */
+ spinlock_t snap_flush_lock;
+
+ u64 last_cap_flush_tid;
+ struct list_head cap_flush_list;
+ struct list_head cap_dirty; /* inodes with dirty caps */
+ struct list_head cap_dirty_migrating; /* ...that are migration... */
+ int num_cap_flushing; /* # caps we are flushing */
+ spinlock_t cap_dirty_lock; /* protects above items */
+ wait_queue_head_t cap_flushing_wq;
+
+ /*
+ * Cap reservations
+ *
+ * Maintain a global pool of preallocated struct ceph_caps, referenced
+ * by struct ceph_caps_reservations. This ensures that we preallocate
+ * memory needed to successfully process an MDS response. (If an MDS
+ * sends us cap information and we fail to process it, we will have
+ * problems due to the client and MDS being out of sync.)
+ *
+ * Reservations are 'owned' by a ceph_cap_reservation context.
+ */
+ spinlock_t caps_list_lock;
+ struct list_head caps_list; /* unused (reserved or
+ unreserved) */
+ int caps_total_count; /* total caps allocated */
+ int caps_use_count; /* in use */
+ int caps_reserve_count; /* unused, reserved */
+ int caps_avail_count; /* unused, unreserved */
+ int caps_min_count; /* keep at least this many
+ (unreserved) */
+ spinlock_t dentry_lru_lock;
+ struct list_head dentry_lru;
+ int num_dentry;
+
+ struct rw_semaphore pool_perm_rwsem;
+ struct rb_root pool_perm_tree;
+
+ char nodename[__NEW_UTS_LEN + 1];
+};
+
+extern const char *ceph_mds_op_name(int op);
+
+extern struct ceph_mds_session *
+__ceph_lookup_mds_session(struct ceph_mds_client *, int mds);
+
+static inline struct ceph_mds_session *
+ceph_get_mds_session(struct ceph_mds_session *s)
+{
+ refcount_inc(&s->s_ref);
+ return s;
+}
+
+extern const char *ceph_session_state_name(int s);
+
+extern void ceph_put_mds_session(struct ceph_mds_session *s);
+
+extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
+ struct ceph_msg *msg, int mds);
+
+extern int ceph_mdsc_init(struct ceph_fs_client *fsc);
+extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
+extern void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc);
+extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc);
+
+extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
+
+extern void ceph_invalidate_dir_request(struct ceph_mds_request *req);
+extern int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
+ struct inode *dir);
+extern struct ceph_mds_request *
+ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
+extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *req);
+extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
+ struct inode *dir,
+ struct ceph_mds_request *req);
+static inline void ceph_mdsc_get_request(struct ceph_mds_request *req)
+{
+ kref_get(&req->r_kref);
+}
+extern void ceph_mdsc_release_request(struct kref *kref);
+static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
+{
+ kref_put(&req->r_kref, ceph_mdsc_release_request);
+}
+
+extern void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session);
+
+extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
+
+extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
+ int stop_on_nosnap);
+
+extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry);
+extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
+ struct inode *inode,
+ struct dentry *dentry, char action,
+ u32 seq);
+
+extern void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc,
+ struct ceph_msg *msg);
+extern void ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc,
+ struct ceph_msg *msg);
+
+extern struct ceph_mds_session *
+ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target);
+extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session);
+
+extern int ceph_trim_caps(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session,
+ int max_caps);
+#endif
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
new file mode 100644
index 000000000..44e53abeb
--- /dev/null
+++ b/fs/ceph/mdsmap.c
@@ -0,0 +1,396 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/bug.h>
+#include <linux/err.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+
+#include <linux/ceph/mdsmap.h>
+#include <linux/ceph/messenger.h>
+#include <linux/ceph/decode.h>
+
+#include "super.h"
+
+
+/*
+ * choose a random mds that is "up" (i.e. has a state > 0), or -1.
+ */
+int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m)
+{
+ int n = 0;
+ int i;
+
+ /* special case for one mds */
+ if (1 == m->m_num_mds && m->m_info[0].state > 0)
+ return 0;
+
+ /* count */
+ for (i = 0; i < m->m_num_mds; i++)
+ if (m->m_info[i].state > 0)
+ n++;
+ if (n == 0)
+ return -1;
+
+ /* pick */
+ n = prandom_u32() % n;
+ i = 0;
+ for (i = 0; n > 0; i++, n--)
+ while (m->m_info[i].state <= 0)
+ i++;
+
+ return i;
+}
+
+#define __decode_and_drop_type(p, end, type, bad) \
+ do { \
+ if (*p + sizeof(type) > end) \
+ goto bad; \
+ *p += sizeof(type); \
+ } while (0)
+
+#define __decode_and_drop_set(p, end, type, bad) \
+ do { \
+ u32 n; \
+ size_t need; \
+ ceph_decode_32_safe(p, end, n, bad); \
+ need = sizeof(type) * n; \
+ ceph_decode_need(p, end, need, bad); \
+ *p += need; \
+ } while (0)
+
+#define __decode_and_drop_map(p, end, ktype, vtype, bad) \
+ do { \
+ u32 n; \
+ size_t need; \
+ ceph_decode_32_safe(p, end, n, bad); \
+ need = (sizeof(ktype) + sizeof(vtype)) * n; \
+ ceph_decode_need(p, end, need, bad); \
+ *p += need; \
+ } while (0)
+
+
+static int __decode_and_drop_compat_set(void **p, void* end)
+{
+ int i;
+ /* compat, ro_compat, incompat*/
+ for (i = 0; i < 3; i++) {
+ u32 n;
+ ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
+ /* mask */
+ *p += sizeof(u64);
+ /* names (map<u64, string>) */
+ n = ceph_decode_32(p);
+ while (n-- > 0) {
+ u32 len;
+ ceph_decode_need(p, end, sizeof(u64) + sizeof(u32),
+ bad);
+ *p += sizeof(u64);
+ len = ceph_decode_32(p);
+ ceph_decode_need(p, end, len, bad);
+ *p += len;
+ }
+ }
+ return 0;
+bad:
+ return -1;
+}
+
+/*
+ * Decode an MDS map
+ *
+ * Ignore any fields we don't care about (there are quite a few of
+ * them).
+ */
+struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
+{
+ struct ceph_mdsmap *m;
+ const void *start = *p;
+ int i, j, n;
+ int err = -EINVAL;
+ u8 mdsmap_v, mdsmap_cv;
+ u16 mdsmap_ev;
+
+ m = kzalloc(sizeof(*m), GFP_NOFS);
+ if (!m)
+ return ERR_PTR(-ENOMEM);
+
+ ceph_decode_need(p, end, 1 + 1, bad);
+ mdsmap_v = ceph_decode_8(p);
+ mdsmap_cv = ceph_decode_8(p);
+ if (mdsmap_v >= 4) {
+ u32 mdsmap_len;
+ ceph_decode_32_safe(p, end, mdsmap_len, bad);
+ if (end < *p + mdsmap_len)
+ goto bad;
+ end = *p + mdsmap_len;
+ }
+
+ ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad);
+ m->m_epoch = ceph_decode_32(p);
+ m->m_client_epoch = ceph_decode_32(p);
+ m->m_last_failure = ceph_decode_32(p);
+ m->m_root = ceph_decode_32(p);
+ m->m_session_timeout = ceph_decode_32(p);
+ m->m_session_autoclose = ceph_decode_32(p);
+ m->m_max_file_size = ceph_decode_64(p);
+ m->m_max_mds = ceph_decode_32(p);
+ m->m_num_mds = m->m_max_mds;
+
+ m->m_info = kcalloc(m->m_num_mds, sizeof(*m->m_info), GFP_NOFS);
+ if (!m->m_info)
+ goto nomem;
+
+ /* pick out active nodes from mds_info (state > 0) */
+ n = ceph_decode_32(p);
+ for (i = 0; i < n; i++) {
+ u64 global_id;
+ u32 namelen;
+ s32 mds, inc, state;
+ u64 state_seq;
+ u8 info_v;
+ void *info_end = NULL;
+ struct ceph_entity_addr addr;
+ u32 num_export_targets;
+ void *pexport_targets = NULL;
+ struct ceph_timespec laggy_since;
+ struct ceph_mds_info *info;
+
+ ceph_decode_need(p, end, sizeof(u64) + 1, bad);
+ global_id = ceph_decode_64(p);
+ info_v= ceph_decode_8(p);
+ if (info_v >= 4) {
+ u32 info_len;
+ u8 info_cv;
+ ceph_decode_need(p, end, 1 + sizeof(u32), bad);
+ info_cv = ceph_decode_8(p);
+ info_len = ceph_decode_32(p);
+ info_end = *p + info_len;
+ if (info_end > end)
+ goto bad;
+ }
+
+ ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
+ *p += sizeof(u64);
+ namelen = ceph_decode_32(p); /* skip mds name */
+ *p += namelen;
+
+ ceph_decode_need(p, end,
+ 4*sizeof(u32) + sizeof(u64) +
+ sizeof(addr) + sizeof(struct ceph_timespec),
+ bad);
+ mds = ceph_decode_32(p);
+ inc = ceph_decode_32(p);
+ state = ceph_decode_32(p);
+ state_seq = ceph_decode_64(p);
+ ceph_decode_copy(p, &addr, sizeof(addr));
+ ceph_decode_addr(&addr);
+ ceph_decode_copy(p, &laggy_since, sizeof(laggy_since));
+ *p += sizeof(u32);
+ ceph_decode_32_safe(p, end, namelen, bad);
+ *p += namelen;
+ if (info_v >= 2) {
+ ceph_decode_32_safe(p, end, num_export_targets, bad);
+ pexport_targets = *p;
+ *p += num_export_targets * sizeof(u32);
+ } else {
+ num_export_targets = 0;
+ }
+
+ if (info_end && *p != info_end) {
+ if (*p > info_end)
+ goto bad;
+ *p = info_end;
+ }
+
+ dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n",
+ i+1, n, global_id, mds, inc,
+ ceph_pr_addr(&addr.in_addr),
+ ceph_mds_state_name(state));
+
+ if (mds < 0 || state <= 0)
+ continue;
+
+ if (mds >= m->m_num_mds) {
+ int new_num = max(mds + 1, m->m_num_mds * 2);
+ void *new_m_info = krealloc(m->m_info,
+ new_num * sizeof(*m->m_info),
+ GFP_NOFS | __GFP_ZERO);
+ if (!new_m_info)
+ goto nomem;
+ m->m_info = new_m_info;
+ m->m_num_mds = new_num;
+ }
+
+ info = &m->m_info[mds];
+ info->global_id = global_id;
+ info->state = state;
+ info->addr = addr;
+ info->laggy = (laggy_since.tv_sec != 0 ||
+ laggy_since.tv_nsec != 0);
+ info->num_export_targets = num_export_targets;
+ if (num_export_targets) {
+ info->export_targets = kcalloc(num_export_targets,
+ sizeof(u32), GFP_NOFS);
+ if (!info->export_targets)
+ goto nomem;
+ for (j = 0; j < num_export_targets; j++)
+ info->export_targets[j] =
+ ceph_decode_32(&pexport_targets);
+ } else {
+ info->export_targets = NULL;
+ }
+ }
+ if (m->m_num_mds > m->m_max_mds) {
+ /* find max up mds */
+ for (i = m->m_num_mds; i >= m->m_max_mds; i--) {
+ if (i == 0 || m->m_info[i-1].state > 0)
+ break;
+ }
+ m->m_num_mds = i;
+ }
+
+ /* pg_pools */
+ ceph_decode_32_safe(p, end, n, bad);
+ m->m_num_data_pg_pools = n;
+ m->m_data_pg_pools = kcalloc(n, sizeof(u64), GFP_NOFS);
+ if (!m->m_data_pg_pools)
+ goto nomem;
+ ceph_decode_need(p, end, sizeof(u64)*(n+1), bad);
+ for (i = 0; i < n; i++)
+ m->m_data_pg_pools[i] = ceph_decode_64(p);
+ m->m_cas_pg_pool = ceph_decode_64(p);
+ m->m_enabled = m->m_epoch > 1;
+
+ mdsmap_ev = 1;
+ if (mdsmap_v >= 2) {
+ ceph_decode_16_safe(p, end, mdsmap_ev, bad_ext);
+ }
+ if (mdsmap_ev >= 3) {
+ if (__decode_and_drop_compat_set(p, end) < 0)
+ goto bad_ext;
+ }
+ /* metadata_pool */
+ if (mdsmap_ev < 5) {
+ __decode_and_drop_type(p, end, u32, bad_ext);
+ } else {
+ __decode_and_drop_type(p, end, u64, bad_ext);
+ }
+
+ /* created + modified + tableserver */
+ __decode_and_drop_type(p, end, struct ceph_timespec, bad_ext);
+ __decode_and_drop_type(p, end, struct ceph_timespec, bad_ext);
+ __decode_and_drop_type(p, end, u32, bad_ext);
+
+ /* in */
+ {
+ int num_laggy = 0;
+ ceph_decode_32_safe(p, end, n, bad_ext);
+ ceph_decode_need(p, end, sizeof(u32) * n, bad_ext);
+
+ for (i = 0; i < n; i++) {
+ s32 mds = ceph_decode_32(p);
+ if (mds >= 0 && mds < m->m_num_mds) {
+ if (m->m_info[mds].laggy)
+ num_laggy++;
+ }
+ }
+ m->m_num_laggy = num_laggy;
+
+ if (n > m->m_num_mds) {
+ void *new_m_info = krealloc(m->m_info,
+ n * sizeof(*m->m_info),
+ GFP_NOFS | __GFP_ZERO);
+ if (!new_m_info)
+ goto nomem;
+ m->m_info = new_m_info;
+ }
+ m->m_num_mds = n;
+ }
+
+ /* inc */
+ __decode_and_drop_map(p, end, u32, u32, bad_ext);
+ /* up */
+ __decode_and_drop_map(p, end, u32, u64, bad_ext);
+ /* failed */
+ __decode_and_drop_set(p, end, u32, bad_ext);
+ /* stopped */
+ __decode_and_drop_set(p, end, u32, bad_ext);
+
+ if (mdsmap_ev >= 4) {
+ /* last_failure_osd_epoch */
+ __decode_and_drop_type(p, end, u32, bad_ext);
+ }
+ if (mdsmap_ev >= 6) {
+ /* ever_allowed_snaps */
+ __decode_and_drop_type(p, end, u8, bad_ext);
+ /* explicitly_allowed_snaps */
+ __decode_and_drop_type(p, end, u8, bad_ext);
+ }
+ if (mdsmap_ev >= 7) {
+ /* inline_data_enabled */
+ __decode_and_drop_type(p, end, u8, bad_ext);
+ }
+ if (mdsmap_ev >= 8) {
+ u32 name_len;
+ /* enabled */
+ ceph_decode_8_safe(p, end, m->m_enabled, bad_ext);
+ ceph_decode_32_safe(p, end, name_len, bad_ext);
+ ceph_decode_need(p, end, name_len, bad_ext);
+ *p += name_len;
+ }
+ /* damaged */
+ if (mdsmap_ev >= 9) {
+ size_t need;
+ ceph_decode_32_safe(p, end, n, bad_ext);
+ need = sizeof(u32) * n;
+ ceph_decode_need(p, end, need, bad_ext);
+ *p += need;
+ m->m_damaged = n > 0;
+ } else {
+ m->m_damaged = false;
+ }
+bad_ext:
+ *p = end;
+ dout("mdsmap_decode success epoch %u\n", m->m_epoch);
+ return m;
+nomem:
+ err = -ENOMEM;
+ goto out_err;
+bad:
+ pr_err("corrupt mdsmap\n");
+ print_hex_dump(KERN_DEBUG, "mdsmap: ",
+ DUMP_PREFIX_OFFSET, 16, 1,
+ start, end - start, true);
+out_err:
+ ceph_mdsmap_destroy(m);
+ return ERR_PTR(err);
+}
+
+void ceph_mdsmap_destroy(struct ceph_mdsmap *m)
+{
+ int i;
+
+ for (i = 0; i < m->m_num_mds; i++)
+ kfree(m->m_info[i].export_targets);
+ kfree(m->m_info);
+ kfree(m->m_data_pg_pools);
+ kfree(m);
+}
+
+bool ceph_mdsmap_is_cluster_available(struct ceph_mdsmap *m)
+{
+ int i, nr_active = 0;
+ if (!m->m_enabled)
+ return false;
+ if (m->m_damaged)
+ return false;
+ if (m->m_num_laggy > 0)
+ return false;
+ for (i = 0; i < m->m_num_mds; i++) {
+ if (m->m_info[i].state == CEPH_MDS_STATE_ACTIVE)
+ nr_active++;
+ }
+ return nr_active > 0;
+}
diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c
new file mode 100644
index 000000000..03f4d24db
--- /dev/null
+++ b/fs/ceph/quota.c
@@ -0,0 +1,362 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * quota.c - CephFS quota
+ *
+ * Copyright (C) 2017-2018 SUSE
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/statfs.h>
+
+#include "super.h"
+#include "mds_client.h"
+
+void ceph_adjust_quota_realms_count(struct inode *inode, bool inc)
+{
+ struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+ if (inc)
+ atomic64_inc(&mdsc->quotarealms_count);
+ else
+ atomic64_dec(&mdsc->quotarealms_count);
+}
+
+static inline bool ceph_has_realms_with_quotas(struct inode *inode)
+{
+ struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+ return atomic64_read(&mdsc->quotarealms_count) > 0;
+}
+
+void ceph_handle_quota(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session,
+ struct ceph_msg *msg)
+{
+ struct super_block *sb = mdsc->fsc->sb;
+ struct ceph_mds_quota *h = msg->front.iov_base;
+ struct ceph_vino vino;
+ struct inode *inode;
+ struct ceph_inode_info *ci;
+
+ if (msg->front.iov_len < sizeof(*h)) {
+ pr_err("%s corrupt message mds%d len %d\n", __func__,
+ session->s_mds, (int)msg->front.iov_len);
+ ceph_msg_dump(msg);
+ return;
+ }
+
+ /* increment msg sequence number */
+ mutex_lock(&session->s_mutex);
+ session->s_seq++;
+ mutex_unlock(&session->s_mutex);
+
+ /* lookup inode */
+ vino.ino = le64_to_cpu(h->ino);
+ vino.snap = CEPH_NOSNAP;
+ inode = ceph_find_inode(sb, vino);
+ if (!inode) {
+ pr_warn("Failed to find inode %llu\n", vino.ino);
+ return;
+ }
+ ci = ceph_inode(inode);
+
+ spin_lock(&ci->i_ceph_lock);
+ ci->i_rbytes = le64_to_cpu(h->rbytes);
+ ci->i_rfiles = le64_to_cpu(h->rfiles);
+ ci->i_rsubdirs = le64_to_cpu(h->rsubdirs);
+ __ceph_update_quota(ci, le64_to_cpu(h->max_bytes),
+ le64_to_cpu(h->max_files));
+ spin_unlock(&ci->i_ceph_lock);
+
+ iput(inode);
+}
+
+/*
+ * This function walks through the snaprealm for an inode and returns the
+ * ceph_snap_realm for the first snaprealm that has quotas set (either max_files
+ * or max_bytes). If the root is reached, return the root ceph_snap_realm
+ * instead.
+ *
+ * Note that the caller is responsible for calling ceph_put_snap_realm() on the
+ * returned realm.
+ */
+static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc,
+ struct inode *inode)
+{
+ struct ceph_inode_info *ci = NULL;
+ struct ceph_snap_realm *realm, *next;
+ struct inode *in;
+ bool has_quota;
+
+ if (ceph_snap(inode) != CEPH_NOSNAP)
+ return NULL;
+
+ realm = ceph_inode(inode)->i_snap_realm;
+ if (realm)
+ ceph_get_snap_realm(mdsc, realm);
+ else
+ pr_err_ratelimited("get_quota_realm: ino (%llx.%llx) "
+ "null i_snap_realm\n", ceph_vinop(inode));
+ while (realm) {
+ spin_lock(&realm->inodes_with_caps_lock);
+ in = realm->inode ? igrab(realm->inode) : NULL;
+ spin_unlock(&realm->inodes_with_caps_lock);
+ if (!in)
+ break;
+
+ ci = ceph_inode(in);
+ has_quota = __ceph_has_any_quota(ci);
+ iput(in);
+
+ next = realm->parent;
+ if (has_quota || !next)
+ return realm;
+
+ ceph_get_snap_realm(mdsc, next);
+ ceph_put_snap_realm(mdsc, realm);
+ realm = next;
+ }
+ if (realm)
+ ceph_put_snap_realm(mdsc, realm);
+
+ return NULL;
+}
+
+bool ceph_quota_is_same_realm(struct inode *old, struct inode *new)
+{
+ struct ceph_mds_client *mdsc = ceph_inode_to_client(old)->mdsc;
+ struct ceph_snap_realm *old_realm, *new_realm;
+ bool is_same;
+
+ down_read(&mdsc->snap_rwsem);
+ old_realm = get_quota_realm(mdsc, old);
+ new_realm = get_quota_realm(mdsc, new);
+ is_same = (old_realm == new_realm);
+ up_read(&mdsc->snap_rwsem);
+
+ if (old_realm)
+ ceph_put_snap_realm(mdsc, old_realm);
+ if (new_realm)
+ ceph_put_snap_realm(mdsc, new_realm);
+
+ return is_same;
+}
+
+enum quota_check_op {
+ QUOTA_CHECK_MAX_FILES_OP, /* check quota max_files limit */
+ QUOTA_CHECK_MAX_BYTES_OP, /* check quota max_files limit */
+ QUOTA_CHECK_MAX_BYTES_APPROACHING_OP /* check if quota max_files
+ limit is approaching */
+};
+
+/*
+ * check_quota_exceeded() will walk up the snaprealm hierarchy and, for each
+ * realm, it will execute quota check operation defined by the 'op' parameter.
+ * The snaprealm walk is interrupted if the quota check detects that the quota
+ * is exceeded or if the root inode is reached.
+ */
+static bool check_quota_exceeded(struct inode *inode, enum quota_check_op op,
+ loff_t delta)
+{
+ struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+ struct ceph_inode_info *ci;
+ struct ceph_snap_realm *realm, *next;
+ struct inode *in;
+ u64 max, rvalue;
+ bool exceeded = false;
+
+ if (ceph_snap(inode) != CEPH_NOSNAP)
+ return false;
+
+ down_read(&mdsc->snap_rwsem);
+ realm = ceph_inode(inode)->i_snap_realm;
+ if (realm)
+ ceph_get_snap_realm(mdsc, realm);
+ else
+ pr_err_ratelimited("check_quota_exceeded: ino (%llx.%llx) "
+ "null i_snap_realm\n", ceph_vinop(inode));
+ while (realm) {
+ spin_lock(&realm->inodes_with_caps_lock);
+ in = realm->inode ? igrab(realm->inode) : NULL;
+ spin_unlock(&realm->inodes_with_caps_lock);
+ if (!in)
+ break;
+
+ ci = ceph_inode(in);
+ spin_lock(&ci->i_ceph_lock);
+ if (op == QUOTA_CHECK_MAX_FILES_OP) {
+ max = ci->i_max_files;
+ rvalue = ci->i_rfiles + ci->i_rsubdirs;
+ } else {
+ max = ci->i_max_bytes;
+ rvalue = ci->i_rbytes;
+ }
+ spin_unlock(&ci->i_ceph_lock);
+ switch (op) {
+ case QUOTA_CHECK_MAX_FILES_OP:
+ exceeded = (max && (rvalue >= max));
+ break;
+ case QUOTA_CHECK_MAX_BYTES_OP:
+ exceeded = (max && (rvalue + delta > max));
+ break;
+ case QUOTA_CHECK_MAX_BYTES_APPROACHING_OP:
+ if (max) {
+ if (rvalue >= max)
+ exceeded = true;
+ else {
+ /*
+ * when we're writing more that 1/16th
+ * of the available space
+ */
+ exceeded =
+ (((max - rvalue) >> 4) < delta);
+ }
+ }
+ break;
+ default:
+ /* Shouldn't happen */
+ pr_warn("Invalid quota check op (%d)\n", op);
+ exceeded = true; /* Just break the loop */
+ }
+ iput(in);
+
+ next = realm->parent;
+ if (exceeded || !next)
+ break;
+ ceph_get_snap_realm(mdsc, next);
+ ceph_put_snap_realm(mdsc, realm);
+ realm = next;
+ }
+ if (realm)
+ ceph_put_snap_realm(mdsc, realm);
+ up_read(&mdsc->snap_rwsem);
+
+ return exceeded;
+}
+
+/*
+ * ceph_quota_is_max_files_exceeded - check if we can create a new file
+ * @inode: directory where a new file is being created
+ *
+ * This functions returns true is max_files quota allows a new file to be
+ * created. It is necessary to walk through the snaprealm hierarchy (until the
+ * FS root) to check all realms with quotas set.
+ */
+bool ceph_quota_is_max_files_exceeded(struct inode *inode)
+{
+ if (!ceph_has_realms_with_quotas(inode))
+ return false;
+
+ WARN_ON(!S_ISDIR(inode->i_mode));
+
+ return check_quota_exceeded(inode, QUOTA_CHECK_MAX_FILES_OP, 0);
+}
+
+/*
+ * ceph_quota_is_max_bytes_exceeded - check if we can write to a file
+ * @inode: inode being written
+ * @newsize: new size if write succeeds
+ *
+ * This functions returns true is max_bytes quota allows a file size to reach
+ * @newsize; it returns false otherwise.
+ */
+bool ceph_quota_is_max_bytes_exceeded(struct inode *inode, loff_t newsize)
+{
+ loff_t size = i_size_read(inode);
+
+ if (!ceph_has_realms_with_quotas(inode))
+ return false;
+
+ /* return immediately if we're decreasing file size */
+ if (newsize <= size)
+ return false;
+
+ return check_quota_exceeded(inode, QUOTA_CHECK_MAX_BYTES_OP, (newsize - size));
+}
+
+/*
+ * ceph_quota_is_max_bytes_approaching - check if we're reaching max_bytes
+ * @inode: inode being written
+ * @newsize: new size if write succeeds
+ *
+ * This function returns true if the new file size @newsize will be consuming
+ * more than 1/16th of the available quota space; it returns false otherwise.
+ */
+bool ceph_quota_is_max_bytes_approaching(struct inode *inode, loff_t newsize)
+{
+ loff_t size = ceph_inode(inode)->i_reported_size;
+
+ if (!ceph_has_realms_with_quotas(inode))
+ return false;
+
+ /* return immediately if we're decreasing file size */
+ if (newsize <= size)
+ return false;
+
+ return check_quota_exceeded(inode, QUOTA_CHECK_MAX_BYTES_APPROACHING_OP,
+ (newsize - size));
+}
+
+/*
+ * ceph_quota_update_statfs - if root has quota update statfs with quota status
+ * @fsc: filesystem client instance
+ * @buf: statfs to update
+ *
+ * If the mounted filesystem root has max_bytes quota set, update the filesystem
+ * statistics with the quota status.
+ *
+ * This function returns true if the stats have been updated, false otherwise.
+ */
+bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, struct kstatfs *buf)
+{
+ struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_inode_info *ci;
+ struct ceph_snap_realm *realm;
+ struct inode *in;
+ u64 total = 0, used, free;
+ bool is_updated = false;
+
+ down_read(&mdsc->snap_rwsem);
+ realm = get_quota_realm(mdsc, d_inode(fsc->sb->s_root));
+ up_read(&mdsc->snap_rwsem);
+ if (!realm)
+ return false;
+
+ spin_lock(&realm->inodes_with_caps_lock);
+ in = realm->inode ? igrab(realm->inode) : NULL;
+ spin_unlock(&realm->inodes_with_caps_lock);
+ if (in) {
+ ci = ceph_inode(in);
+ spin_lock(&ci->i_ceph_lock);
+ if (ci->i_max_bytes) {
+ total = ci->i_max_bytes >> CEPH_BLOCK_SHIFT;
+ used = ci->i_rbytes >> CEPH_BLOCK_SHIFT;
+ /* It is possible for a quota to be exceeded.
+ * Report 'zero' in that case
+ */
+ free = total > used ? total - used : 0;
+ }
+ spin_unlock(&ci->i_ceph_lock);
+ if (total) {
+ buf->f_blocks = total;
+ buf->f_bfree = free;
+ buf->f_bavail = free;
+ is_updated = true;
+ }
+ iput(in);
+ }
+ ceph_put_snap_realm(mdsc, realm);
+
+ return is_updated;
+}
+
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
new file mode 100644
index 000000000..5cf7b5f4d
--- /dev/null
+++ b/fs/ceph/snap.c
@@ -0,0 +1,995 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/sort.h>
+#include <linux/slab.h>
+
+#include "super.h"
+#include "mds_client.h"
+
+#include <linux/ceph/decode.h>
+
+/*
+ * Snapshots in ceph are driven in large part by cooperation from the
+ * client. In contrast to local file systems or file servers that
+ * implement snapshots at a single point in the system, ceph's
+ * distributed access to storage requires clients to help decide
+ * whether a write logically occurs before or after a recently created
+ * snapshot.
+ *
+ * This provides a perfect instantanous client-wide snapshot. Between
+ * clients, however, snapshots may appear to be applied at slightly
+ * different points in time, depending on delays in delivering the
+ * snapshot notification.
+ *
+ * Snapshots are _not_ file system-wide. Instead, each snapshot
+ * applies to the subdirectory nested beneath some directory. This
+ * effectively divides the hierarchy into multiple "realms," where all
+ * of the files contained by each realm share the same set of
+ * snapshots. An individual realm's snap set contains snapshots
+ * explicitly created on that realm, as well as any snaps in its
+ * parent's snap set _after_ the point at which the parent became it's
+ * parent (due to, say, a rename). Similarly, snaps from prior parents
+ * during the time intervals during which they were the parent are included.
+ *
+ * The client is spared most of this detail, fortunately... it must only
+ * maintains a hierarchy of realms reflecting the current parent/child
+ * realm relationship, and for each realm has an explicit list of snaps
+ * inherited from prior parents.
+ *
+ * A snap_realm struct is maintained for realms containing every inode
+ * with an open cap in the system. (The needed snap realm information is
+ * provided by the MDS whenever a cap is issued, i.e., on open.) A 'seq'
+ * version number is used to ensure that as realm parameters change (new
+ * snapshot, new parent, etc.) the client's realm hierarchy is updated.
+ *
+ * The realm hierarchy drives the generation of a 'snap context' for each
+ * realm, which simply lists the resulting set of snaps for the realm. This
+ * is attached to any writes sent to OSDs.
+ */
+/*
+ * Unfortunately error handling is a bit mixed here. If we get a snap
+ * update, but don't have enough memory to update our realm hierarchy,
+ * it's not clear what we can do about it (besides complaining to the
+ * console).
+ */
+
+
+/*
+ * increase ref count for the realm
+ *
+ * caller must hold snap_rwsem for write.
+ */
+void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
+ struct ceph_snap_realm *realm)
+{
+ dout("get_realm %p %d -> %d\n", realm,
+ atomic_read(&realm->nref), atomic_read(&realm->nref)+1);
+ /*
+ * since we _only_ increment realm refs or empty the empty
+ * list with snap_rwsem held, adjusting the empty list here is
+ * safe. we do need to protect against concurrent empty list
+ * additions, however.
+ */
+ if (atomic_inc_return(&realm->nref) == 1) {
+ spin_lock(&mdsc->snap_empty_lock);
+ list_del_init(&realm->empty_item);
+ spin_unlock(&mdsc->snap_empty_lock);
+ }
+}
+
+static void __insert_snap_realm(struct rb_root *root,
+ struct ceph_snap_realm *new)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent = NULL;
+ struct ceph_snap_realm *r = NULL;
+
+ while (*p) {
+ parent = *p;
+ r = rb_entry(parent, struct ceph_snap_realm, node);
+ if (new->ino < r->ino)
+ p = &(*p)->rb_left;
+ else if (new->ino > r->ino)
+ p = &(*p)->rb_right;
+ else
+ BUG();
+ }
+
+ rb_link_node(&new->node, parent, p);
+ rb_insert_color(&new->node, root);
+}
+
+/*
+ * create and get the realm rooted at @ino and bump its ref count.
+ *
+ * caller must hold snap_rwsem for write.
+ */
+static struct ceph_snap_realm *ceph_create_snap_realm(
+ struct ceph_mds_client *mdsc,
+ u64 ino)
+{
+ struct ceph_snap_realm *realm;
+
+ realm = kzalloc(sizeof(*realm), GFP_NOFS);
+ if (!realm)
+ return ERR_PTR(-ENOMEM);
+
+ atomic_set(&realm->nref, 1); /* for caller */
+ realm->ino = ino;
+ INIT_LIST_HEAD(&realm->children);
+ INIT_LIST_HEAD(&realm->child_item);
+ INIT_LIST_HEAD(&realm->empty_item);
+ INIT_LIST_HEAD(&realm->dirty_item);
+ INIT_LIST_HEAD(&realm->inodes_with_caps);
+ spin_lock_init(&realm->inodes_with_caps_lock);
+ __insert_snap_realm(&mdsc->snap_realms, realm);
+ dout("create_snap_realm %llx %p\n", realm->ino, realm);
+ return realm;
+}
+
+/*
+ * lookup the realm rooted at @ino.
+ *
+ * caller must hold snap_rwsem for write.
+ */
+static struct ceph_snap_realm *__lookup_snap_realm(struct ceph_mds_client *mdsc,
+ u64 ino)
+{
+ struct rb_node *n = mdsc->snap_realms.rb_node;
+ struct ceph_snap_realm *r;
+
+ while (n) {
+ r = rb_entry(n, struct ceph_snap_realm, node);
+ if (ino < r->ino)
+ n = n->rb_left;
+ else if (ino > r->ino)
+ n = n->rb_right;
+ else {
+ dout("lookup_snap_realm %llx %p\n", r->ino, r);
+ return r;
+ }
+ }
+ return NULL;
+}
+
+struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
+ u64 ino)
+{
+ struct ceph_snap_realm *r;
+ r = __lookup_snap_realm(mdsc, ino);
+ if (r)
+ ceph_get_snap_realm(mdsc, r);
+ return r;
+}
+
+static void __put_snap_realm(struct ceph_mds_client *mdsc,
+ struct ceph_snap_realm *realm);
+
+/*
+ * called with snap_rwsem (write)
+ */
+static void __destroy_snap_realm(struct ceph_mds_client *mdsc,
+ struct ceph_snap_realm *realm)
+{
+ dout("__destroy_snap_realm %p %llx\n", realm, realm->ino);
+
+ rb_erase(&realm->node, &mdsc->snap_realms);
+
+ if (realm->parent) {
+ list_del_init(&realm->child_item);
+ __put_snap_realm(mdsc, realm->parent);
+ }
+
+ kfree(realm->prior_parent_snaps);
+ kfree(realm->snaps);
+ ceph_put_snap_context(realm->cached_context);
+ kfree(realm);
+}
+
+/*
+ * caller holds snap_rwsem (write)
+ */
+static void __put_snap_realm(struct ceph_mds_client *mdsc,
+ struct ceph_snap_realm *realm)
+{
+ dout("__put_snap_realm %llx %p %d -> %d\n", realm->ino, realm,
+ atomic_read(&realm->nref), atomic_read(&realm->nref)-1);
+ if (atomic_dec_and_test(&realm->nref))
+ __destroy_snap_realm(mdsc, realm);
+}
+
+/*
+ * caller needn't hold any locks
+ */
+void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
+ struct ceph_snap_realm *realm)
+{
+ dout("put_snap_realm %llx %p %d -> %d\n", realm->ino, realm,
+ atomic_read(&realm->nref), atomic_read(&realm->nref)-1);
+ if (!atomic_dec_and_test(&realm->nref))
+ return;
+
+ if (down_write_trylock(&mdsc->snap_rwsem)) {
+ __destroy_snap_realm(mdsc, realm);
+ up_write(&mdsc->snap_rwsem);
+ } else {
+ spin_lock(&mdsc->snap_empty_lock);
+ list_add(&realm->empty_item, &mdsc->snap_empty);
+ spin_unlock(&mdsc->snap_empty_lock);
+ }
+}
+
+/*
+ * Clean up any realms whose ref counts have dropped to zero. Note
+ * that this does not include realms who were created but not yet
+ * used.
+ *
+ * Called under snap_rwsem (write)
+ */
+static void __cleanup_empty_realms(struct ceph_mds_client *mdsc)
+{
+ struct ceph_snap_realm *realm;
+
+ spin_lock(&mdsc->snap_empty_lock);
+ while (!list_empty(&mdsc->snap_empty)) {
+ realm = list_first_entry(&mdsc->snap_empty,
+ struct ceph_snap_realm, empty_item);
+ list_del(&realm->empty_item);
+ spin_unlock(&mdsc->snap_empty_lock);
+ __destroy_snap_realm(mdsc, realm);
+ spin_lock(&mdsc->snap_empty_lock);
+ }
+ spin_unlock(&mdsc->snap_empty_lock);
+}
+
+void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc)
+{
+ down_write(&mdsc->snap_rwsem);
+ __cleanup_empty_realms(mdsc);
+ up_write(&mdsc->snap_rwsem);
+}
+
+/*
+ * adjust the parent realm of a given @realm. adjust child list, and parent
+ * pointers, and ref counts appropriately.
+ *
+ * return true if parent was changed, 0 if unchanged, <0 on error.
+ *
+ * caller must hold snap_rwsem for write.
+ */
+static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc,
+ struct ceph_snap_realm *realm,
+ u64 parentino)
+{
+ struct ceph_snap_realm *parent;
+
+ if (realm->parent_ino == parentino)
+ return 0;
+
+ parent = ceph_lookup_snap_realm(mdsc, parentino);
+ if (!parent) {
+ parent = ceph_create_snap_realm(mdsc, parentino);
+ if (IS_ERR(parent))
+ return PTR_ERR(parent);
+ }
+ dout("adjust_snap_realm_parent %llx %p: %llx %p -> %llx %p\n",
+ realm->ino, realm, realm->parent_ino, realm->parent,
+ parentino, parent);
+ if (realm->parent) {
+ list_del_init(&realm->child_item);
+ ceph_put_snap_realm(mdsc, realm->parent);
+ }
+ realm->parent_ino = parentino;
+ realm->parent = parent;
+ list_add(&realm->child_item, &parent->children);
+ return 1;
+}
+
+
+static int cmpu64_rev(const void *a, const void *b)
+{
+ if (*(u64 *)a < *(u64 *)b)
+ return 1;
+ if (*(u64 *)a > *(u64 *)b)
+ return -1;
+ return 0;
+}
+
+
+/*
+ * build the snap context for a given realm.
+ */
+static int build_snap_context(struct ceph_snap_realm *realm,
+ struct list_head* dirty_realms)
+{
+ struct ceph_snap_realm *parent = realm->parent;
+ struct ceph_snap_context *snapc;
+ int err = 0;
+ u32 num = realm->num_prior_parent_snaps + realm->num_snaps;
+
+ /*
+ * build parent context, if it hasn't been built.
+ * conservatively estimate that all parent snaps might be
+ * included by us.
+ */
+ if (parent) {
+ if (!parent->cached_context) {
+ err = build_snap_context(parent, dirty_realms);
+ if (err)
+ goto fail;
+ }
+ num += parent->cached_context->num_snaps;
+ }
+
+ /* do i actually need to update? not if my context seq
+ matches realm seq, and my parents' does to. (this works
+ because we rebuild_snap_realms() works _downward_ in
+ hierarchy after each update.) */
+ if (realm->cached_context &&
+ realm->cached_context->seq == realm->seq &&
+ (!parent ||
+ realm->cached_context->seq >= parent->cached_context->seq)) {
+ dout("build_snap_context %llx %p: %p seq %lld (%u snaps)"
+ " (unchanged)\n",
+ realm->ino, realm, realm->cached_context,
+ realm->cached_context->seq,
+ (unsigned int)realm->cached_context->num_snaps);
+ return 0;
+ }
+
+ /* alloc new snap context */
+ err = -ENOMEM;
+ if (num > (SIZE_MAX - sizeof(*snapc)) / sizeof(u64))
+ goto fail;
+ snapc = ceph_create_snap_context(num, GFP_NOFS);
+ if (!snapc)
+ goto fail;
+
+ /* build (reverse sorted) snap vector */
+ num = 0;
+ snapc->seq = realm->seq;
+ if (parent) {
+ u32 i;
+
+ /* include any of parent's snaps occurring _after_ my
+ parent became my parent */
+ for (i = 0; i < parent->cached_context->num_snaps; i++)
+ if (parent->cached_context->snaps[i] >=
+ realm->parent_since)
+ snapc->snaps[num++] =
+ parent->cached_context->snaps[i];
+ if (parent->cached_context->seq > snapc->seq)
+ snapc->seq = parent->cached_context->seq;
+ }
+ memcpy(snapc->snaps + num, realm->snaps,
+ sizeof(u64)*realm->num_snaps);
+ num += realm->num_snaps;
+ memcpy(snapc->snaps + num, realm->prior_parent_snaps,
+ sizeof(u64)*realm->num_prior_parent_snaps);
+ num += realm->num_prior_parent_snaps;
+
+ sort(snapc->snaps, num, sizeof(u64), cmpu64_rev, NULL);
+ snapc->num_snaps = num;
+ dout("build_snap_context %llx %p: %p seq %lld (%u snaps)\n",
+ realm->ino, realm, snapc, snapc->seq,
+ (unsigned int) snapc->num_snaps);
+
+ ceph_put_snap_context(realm->cached_context);
+ realm->cached_context = snapc;
+ /* queue realm for cap_snap creation */
+ list_add_tail(&realm->dirty_item, dirty_realms);
+ return 0;
+
+fail:
+ /*
+ * if we fail, clear old (incorrect) cached_context... hopefully
+ * we'll have better luck building it later
+ */
+ if (realm->cached_context) {
+ ceph_put_snap_context(realm->cached_context);
+ realm->cached_context = NULL;
+ }
+ pr_err("build_snap_context %llx %p fail %d\n", realm->ino,
+ realm, err);
+ return err;
+}
+
+/*
+ * rebuild snap context for the given realm and all of its children.
+ */
+static void rebuild_snap_realms(struct ceph_snap_realm *realm,
+ struct list_head *dirty_realms)
+{
+ struct ceph_snap_realm *child;
+
+ dout("rebuild_snap_realms %llx %p\n", realm->ino, realm);
+ build_snap_context(realm, dirty_realms);
+
+ list_for_each_entry(child, &realm->children, child_item)
+ rebuild_snap_realms(child, dirty_realms);
+}
+
+
+/*
+ * helper to allocate and decode an array of snapids. free prior
+ * instance, if any.
+ */
+static int dup_array(u64 **dst, __le64 *src, u32 num)
+{
+ u32 i;
+
+ kfree(*dst);
+ if (num) {
+ *dst = kcalloc(num, sizeof(u64), GFP_NOFS);
+ if (!*dst)
+ return -ENOMEM;
+ for (i = 0; i < num; i++)
+ (*dst)[i] = get_unaligned_le64(src + i);
+ } else {
+ *dst = NULL;
+ }
+ return 0;
+}
+
+static bool has_new_snaps(struct ceph_snap_context *o,
+ struct ceph_snap_context *n)
+{
+ if (n->num_snaps == 0)
+ return false;
+ /* snaps are in descending order */
+ return n->snaps[0] > o->seq;
+}
+
+/*
+ * When a snapshot is applied, the size/mtime inode metadata is queued
+ * in a ceph_cap_snap (one for each snapshot) until writeback
+ * completes and the metadata can be flushed back to the MDS.
+ *
+ * However, if a (sync) write is currently in-progress when we apply
+ * the snapshot, we have to wait until the write succeeds or fails
+ * (and a final size/mtime is known). In this case the
+ * cap_snap->writing = 1, and is said to be "pending." When the write
+ * finishes, we __ceph_finish_cap_snap().
+ *
+ * Caller must hold snap_rwsem for read (i.e., the realm topology won't
+ * change).
+ */
+void ceph_queue_cap_snap(struct ceph_inode_info *ci)
+{
+ struct inode *inode = &ci->vfs_inode;
+ struct ceph_cap_snap *capsnap;
+ struct ceph_snap_context *old_snapc, *new_snapc;
+ struct ceph_buffer *old_blob = NULL;
+ int used, dirty;
+
+ capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS);
+ if (!capsnap) {
+ pr_err("ENOMEM allocating ceph_cap_snap on %p\n", inode);
+ return;
+ }
+
+ spin_lock(&ci->i_ceph_lock);
+ used = __ceph_caps_used(ci);
+ dirty = __ceph_caps_dirty(ci);
+
+ old_snapc = ci->i_head_snapc;
+ new_snapc = ci->i_snap_realm->cached_context;
+
+ /*
+ * If there is a write in progress, treat that as a dirty Fw,
+ * even though it hasn't completed yet; by the time we finish
+ * up this capsnap it will be.
+ */
+ if (used & CEPH_CAP_FILE_WR)
+ dirty |= CEPH_CAP_FILE_WR;
+
+ if (__ceph_have_pending_cap_snap(ci)) {
+ /* there is no point in queuing multiple "pending" cap_snaps,
+ as no new writes are allowed to start when pending, so any
+ writes in progress now were started before the previous
+ cap_snap. lucky us. */
+ dout("queue_cap_snap %p already pending\n", inode);
+ goto update_snapc;
+ }
+ if (ci->i_wrbuffer_ref_head == 0 &&
+ !(dirty & (CEPH_CAP_ANY_EXCL|CEPH_CAP_FILE_WR))) {
+ dout("queue_cap_snap %p nothing dirty|writing\n", inode);
+ goto update_snapc;
+ }
+
+ BUG_ON(!old_snapc);
+
+ /*
+ * There is no need to send FLUSHSNAP message to MDS if there is
+ * no new snapshot. But when there is dirty pages or on-going
+ * writes, we still need to create cap_snap. cap_snap is needed
+ * by the write path and page writeback path.
+ *
+ * also see ceph_try_drop_cap_snap()
+ */
+ if (has_new_snaps(old_snapc, new_snapc)) {
+ if (dirty & (CEPH_CAP_ANY_EXCL|CEPH_CAP_FILE_WR))
+ capsnap->need_flush = true;
+ } else {
+ if (!(used & CEPH_CAP_FILE_WR) &&
+ ci->i_wrbuffer_ref_head == 0) {
+ dout("queue_cap_snap %p "
+ "no new_snap|dirty_page|writing\n", inode);
+ goto update_snapc;
+ }
+ }
+
+ dout("queue_cap_snap %p cap_snap %p queuing under %p %s %s\n",
+ inode, capsnap, old_snapc, ceph_cap_string(dirty),
+ capsnap->need_flush ? "" : "no_flush");
+ ihold(inode);
+
+ refcount_set(&capsnap->nref, 1);
+ INIT_LIST_HEAD(&capsnap->ci_item);
+
+ capsnap->follows = old_snapc->seq;
+ capsnap->issued = __ceph_caps_issued(ci, NULL);
+ capsnap->dirty = dirty;
+
+ capsnap->mode = inode->i_mode;
+ capsnap->uid = inode->i_uid;
+ capsnap->gid = inode->i_gid;
+
+ if (dirty & CEPH_CAP_XATTR_EXCL) {
+ old_blob = __ceph_build_xattrs_blob(ci);
+ capsnap->xattr_blob =
+ ceph_buffer_get(ci->i_xattrs.blob);
+ capsnap->xattr_version = ci->i_xattrs.version;
+ } else {
+ capsnap->xattr_blob = NULL;
+ capsnap->xattr_version = 0;
+ }
+
+ capsnap->inline_data = ci->i_inline_version != CEPH_INLINE_NONE;
+
+ /* dirty page count moved from _head to this cap_snap;
+ all subsequent writes page dirties occur _after_ this
+ snapshot. */
+ capsnap->dirty_pages = ci->i_wrbuffer_ref_head;
+ ci->i_wrbuffer_ref_head = 0;
+ capsnap->context = old_snapc;
+ list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
+
+ if (used & CEPH_CAP_FILE_WR) {
+ dout("queue_cap_snap %p cap_snap %p snapc %p"
+ " seq %llu used WR, now pending\n", inode,
+ capsnap, old_snapc, old_snapc->seq);
+ capsnap->writing = 1;
+ } else {
+ /* note mtime, size NOW. */
+ __ceph_finish_cap_snap(ci, capsnap);
+ }
+ capsnap = NULL;
+ old_snapc = NULL;
+
+update_snapc:
+ if (ci->i_wrbuffer_ref_head == 0 &&
+ ci->i_wr_ref == 0 &&
+ ci->i_dirty_caps == 0 &&
+ ci->i_flushing_caps == 0) {
+ ci->i_head_snapc = NULL;
+ } else {
+ ci->i_head_snapc = ceph_get_snap_context(new_snapc);
+ dout(" new snapc is %p\n", new_snapc);
+ }
+ spin_unlock(&ci->i_ceph_lock);
+
+ ceph_buffer_put(old_blob);
+ kfree(capsnap);
+ ceph_put_snap_context(old_snapc);
+}
+
+/*
+ * Finalize the size, mtime for a cap_snap.. that is, settle on final values
+ * to be used for the snapshot, to be flushed back to the mds.
+ *
+ * If capsnap can now be flushed, add to snap_flush list, and return 1.
+ *
+ * Caller must hold i_ceph_lock.
+ */
+int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
+ struct ceph_cap_snap *capsnap)
+{
+ struct inode *inode = &ci->vfs_inode;
+ struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+
+ BUG_ON(capsnap->writing);
+ capsnap->size = inode->i_size;
+ capsnap->mtime = inode->i_mtime;
+ capsnap->atime = inode->i_atime;
+ capsnap->ctime = inode->i_ctime;
+ capsnap->time_warp_seq = ci->i_time_warp_seq;
+ capsnap->truncate_size = ci->i_truncate_size;
+ capsnap->truncate_seq = ci->i_truncate_seq;
+ if (capsnap->dirty_pages) {
+ dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu "
+ "still has %d dirty pages\n", inode, capsnap,
+ capsnap->context, capsnap->context->seq,
+ ceph_cap_string(capsnap->dirty), capsnap->size,
+ capsnap->dirty_pages);
+ return 0;
+ }
+
+ ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS;
+ dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu\n",
+ inode, capsnap, capsnap->context,
+ capsnap->context->seq, ceph_cap_string(capsnap->dirty),
+ capsnap->size);
+
+ spin_lock(&mdsc->snap_flush_lock);
+ if (list_empty(&ci->i_snap_flush_item))
+ list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list);
+ spin_unlock(&mdsc->snap_flush_lock);
+ return 1; /* caller may want to ceph_flush_snaps */
+}
+
+/*
+ * Queue cap_snaps for snap writeback for this realm and its children.
+ * Called under snap_rwsem, so realm topology won't change.
+ */
+static void queue_realm_cap_snaps(struct ceph_snap_realm *realm)
+{
+ struct ceph_inode_info *ci;
+ struct inode *lastinode = NULL;
+
+ dout("queue_realm_cap_snaps %p %llx inodes\n", realm, realm->ino);
+
+ spin_lock(&realm->inodes_with_caps_lock);
+ list_for_each_entry(ci, &realm->inodes_with_caps, i_snap_realm_item) {
+ struct inode *inode = igrab(&ci->vfs_inode);
+ if (!inode)
+ continue;
+ spin_unlock(&realm->inodes_with_caps_lock);
+ iput(lastinode);
+ lastinode = inode;
+ ceph_queue_cap_snap(ci);
+ spin_lock(&realm->inodes_with_caps_lock);
+ }
+ spin_unlock(&realm->inodes_with_caps_lock);
+ iput(lastinode);
+
+ dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino);
+}
+
+/*
+ * Parse and apply a snapblob "snap trace" from the MDS. This specifies
+ * the snap realm parameters from a given realm and all of its ancestors,
+ * up to the root.
+ *
+ * Caller must hold snap_rwsem for write.
+ */
+int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
+ void *p, void *e, bool deletion,
+ struct ceph_snap_realm **realm_ret)
+{
+ struct ceph_mds_snap_realm *ri; /* encoded */
+ __le64 *snaps; /* encoded */
+ __le64 *prior_parent_snaps; /* encoded */
+ struct ceph_snap_realm *realm = NULL;
+ struct ceph_snap_realm *first_realm = NULL;
+ int invalidate = 0;
+ int err = -ENOMEM;
+ LIST_HEAD(dirty_realms);
+
+ dout("update_snap_trace deletion=%d\n", deletion);
+more:
+ ceph_decode_need(&p, e, sizeof(*ri), bad);
+ ri = p;
+ p += sizeof(*ri);
+ ceph_decode_need(&p, e, sizeof(u64)*(le32_to_cpu(ri->num_snaps) +
+ le32_to_cpu(ri->num_prior_parent_snaps)), bad);
+ snaps = p;
+ p += sizeof(u64) * le32_to_cpu(ri->num_snaps);
+ prior_parent_snaps = p;
+ p += sizeof(u64) * le32_to_cpu(ri->num_prior_parent_snaps);
+
+ realm = ceph_lookup_snap_realm(mdsc, le64_to_cpu(ri->ino));
+ if (!realm) {
+ realm = ceph_create_snap_realm(mdsc, le64_to_cpu(ri->ino));
+ if (IS_ERR(realm)) {
+ err = PTR_ERR(realm);
+ goto fail;
+ }
+ }
+
+ /* ensure the parent is correct */
+ err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent));
+ if (err < 0)
+ goto fail;
+ invalidate += err;
+
+ if (le64_to_cpu(ri->seq) > realm->seq) {
+ dout("update_snap_trace updating %llx %p %lld -> %lld\n",
+ realm->ino, realm, realm->seq, le64_to_cpu(ri->seq));
+ /* update realm parameters, snap lists */
+ realm->seq = le64_to_cpu(ri->seq);
+ realm->created = le64_to_cpu(ri->created);
+ realm->parent_since = le64_to_cpu(ri->parent_since);
+
+ realm->num_snaps = le32_to_cpu(ri->num_snaps);
+ err = dup_array(&realm->snaps, snaps, realm->num_snaps);
+ if (err < 0)
+ goto fail;
+
+ realm->num_prior_parent_snaps =
+ le32_to_cpu(ri->num_prior_parent_snaps);
+ err = dup_array(&realm->prior_parent_snaps, prior_parent_snaps,
+ realm->num_prior_parent_snaps);
+ if (err < 0)
+ goto fail;
+
+ if (realm->seq > mdsc->last_snap_seq)
+ mdsc->last_snap_seq = realm->seq;
+
+ invalidate = 1;
+ } else if (!realm->cached_context) {
+ dout("update_snap_trace %llx %p seq %lld new\n",
+ realm->ino, realm, realm->seq);
+ invalidate = 1;
+ } else {
+ dout("update_snap_trace %llx %p seq %lld unchanged\n",
+ realm->ino, realm, realm->seq);
+ }
+
+ dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino,
+ realm, invalidate, p, e);
+
+ /* invalidate when we reach the _end_ (root) of the trace */
+ if (invalidate && p >= e)
+ rebuild_snap_realms(realm, &dirty_realms);
+
+ if (!first_realm)
+ first_realm = realm;
+ else
+ ceph_put_snap_realm(mdsc, realm);
+
+ if (p < e)
+ goto more;
+
+ /*
+ * queue cap snaps _after_ we've built the new snap contexts,
+ * so that i_head_snapc can be set appropriately.
+ */
+ while (!list_empty(&dirty_realms)) {
+ realm = list_first_entry(&dirty_realms, struct ceph_snap_realm,
+ dirty_item);
+ list_del_init(&realm->dirty_item);
+ queue_realm_cap_snaps(realm);
+ }
+
+ if (realm_ret)
+ *realm_ret = first_realm;
+ else
+ ceph_put_snap_realm(mdsc, first_realm);
+
+ __cleanup_empty_realms(mdsc);
+ return 0;
+
+bad:
+ err = -EINVAL;
+fail:
+ if (realm && !IS_ERR(realm))
+ ceph_put_snap_realm(mdsc, realm);
+ if (first_realm)
+ ceph_put_snap_realm(mdsc, first_realm);
+ pr_err("update_snap_trace error %d\n", err);
+ return err;
+}
+
+
+/*
+ * Send any cap_snaps that are queued for flush. Try to carry
+ * s_mutex across multiple snap flushes to avoid locking overhead.
+ *
+ * Caller holds no locks.
+ */
+static void flush_snaps(struct ceph_mds_client *mdsc)
+{
+ struct ceph_inode_info *ci;
+ struct inode *inode;
+ struct ceph_mds_session *session = NULL;
+
+ dout("flush_snaps\n");
+ spin_lock(&mdsc->snap_flush_lock);
+ while (!list_empty(&mdsc->snap_flush_list)) {
+ ci = list_first_entry(&mdsc->snap_flush_list,
+ struct ceph_inode_info, i_snap_flush_item);
+ inode = &ci->vfs_inode;
+ ihold(inode);
+ spin_unlock(&mdsc->snap_flush_lock);
+ ceph_flush_snaps(ci, &session);
+ iput(inode);
+ spin_lock(&mdsc->snap_flush_lock);
+ }
+ spin_unlock(&mdsc->snap_flush_lock);
+
+ if (session) {
+ mutex_unlock(&session->s_mutex);
+ ceph_put_mds_session(session);
+ }
+ dout("flush_snaps done\n");
+}
+
+
+/*
+ * Handle a snap notification from the MDS.
+ *
+ * This can take two basic forms: the simplest is just a snap creation
+ * or deletion notification on an existing realm. This should update the
+ * realm and its children.
+ *
+ * The more difficult case is realm creation, due to snap creation at a
+ * new point in the file hierarchy, or due to a rename that moves a file or
+ * directory into another realm.
+ */
+void ceph_handle_snap(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session,
+ struct ceph_msg *msg)
+{
+ struct super_block *sb = mdsc->fsc->sb;
+ int mds = session->s_mds;
+ u64 split;
+ int op;
+ int trace_len;
+ struct ceph_snap_realm *realm = NULL;
+ void *p = msg->front.iov_base;
+ void *e = p + msg->front.iov_len;
+ struct ceph_mds_snap_head *h;
+ int num_split_inos, num_split_realms;
+ __le64 *split_inos = NULL, *split_realms = NULL;
+ int i;
+ int locked_rwsem = 0;
+
+ /* decode */
+ if (msg->front.iov_len < sizeof(*h))
+ goto bad;
+ h = p;
+ op = le32_to_cpu(h->op);
+ split = le64_to_cpu(h->split); /* non-zero if we are splitting an
+ * existing realm */
+ num_split_inos = le32_to_cpu(h->num_split_inos);
+ num_split_realms = le32_to_cpu(h->num_split_realms);
+ trace_len = le32_to_cpu(h->trace_len);
+ p += sizeof(*h);
+
+ dout("handle_snap from mds%d op %s split %llx tracelen %d\n", mds,
+ ceph_snap_op_name(op), split, trace_len);
+
+ mutex_lock(&session->s_mutex);
+ session->s_seq++;
+ mutex_unlock(&session->s_mutex);
+
+ down_write(&mdsc->snap_rwsem);
+ locked_rwsem = 1;
+
+ if (op == CEPH_SNAP_OP_SPLIT) {
+ struct ceph_mds_snap_realm *ri;
+
+ /*
+ * A "split" breaks part of an existing realm off into
+ * a new realm. The MDS provides a list of inodes
+ * (with caps) and child realms that belong to the new
+ * child.
+ */
+ split_inos = p;
+ p += sizeof(u64) * num_split_inos;
+ split_realms = p;
+ p += sizeof(u64) * num_split_realms;
+ ceph_decode_need(&p, e, sizeof(*ri), bad);
+ /* we will peek at realm info here, but will _not_
+ * advance p, as the realm update will occur below in
+ * ceph_update_snap_trace. */
+ ri = p;
+
+ realm = ceph_lookup_snap_realm(mdsc, split);
+ if (!realm) {
+ realm = ceph_create_snap_realm(mdsc, split);
+ if (IS_ERR(realm))
+ goto out;
+ }
+
+ dout("splitting snap_realm %llx %p\n", realm->ino, realm);
+ for (i = 0; i < num_split_inos; i++) {
+ struct ceph_vino vino = {
+ .ino = le64_to_cpu(split_inos[i]),
+ .snap = CEPH_NOSNAP,
+ };
+ struct inode *inode = ceph_find_inode(sb, vino);
+ struct ceph_inode_info *ci;
+ struct ceph_snap_realm *oldrealm;
+
+ if (!inode)
+ continue;
+ ci = ceph_inode(inode);
+
+ spin_lock(&ci->i_ceph_lock);
+ if (!ci->i_snap_realm)
+ goto skip_inode;
+ /*
+ * If this inode belongs to a realm that was
+ * created after our new realm, we experienced
+ * a race (due to another split notifications
+ * arriving from a different MDS). So skip
+ * this inode.
+ */
+ if (ci->i_snap_realm->created >
+ le64_to_cpu(ri->created)) {
+ dout(" leaving %p in newer realm %llx %p\n",
+ inode, ci->i_snap_realm->ino,
+ ci->i_snap_realm);
+ goto skip_inode;
+ }
+ dout(" will move %p to split realm %llx %p\n",
+ inode, realm->ino, realm);
+ /*
+ * Move the inode to the new realm
+ */
+ oldrealm = ci->i_snap_realm;
+ spin_lock(&oldrealm->inodes_with_caps_lock);
+ list_del_init(&ci->i_snap_realm_item);
+ spin_unlock(&oldrealm->inodes_with_caps_lock);
+
+ spin_lock(&realm->inodes_with_caps_lock);
+ list_add(&ci->i_snap_realm_item,
+ &realm->inodes_with_caps);
+ ci->i_snap_realm = realm;
+ if (realm->ino == ci->i_vino.ino)
+ realm->inode = inode;
+ spin_unlock(&realm->inodes_with_caps_lock);
+
+ spin_unlock(&ci->i_ceph_lock);
+
+ ceph_get_snap_realm(mdsc, realm);
+ ceph_put_snap_realm(mdsc, oldrealm);
+
+ iput(inode);
+ continue;
+
+skip_inode:
+ spin_unlock(&ci->i_ceph_lock);
+ iput(inode);
+ }
+
+ /* we may have taken some of the old realm's children. */
+ for (i = 0; i < num_split_realms; i++) {
+ struct ceph_snap_realm *child =
+ __lookup_snap_realm(mdsc,
+ le64_to_cpu(split_realms[i]));
+ if (!child)
+ continue;
+ adjust_snap_realm_parent(mdsc, child, realm->ino);
+ }
+ }
+
+ /*
+ * update using the provided snap trace. if we are deleting a
+ * snap, we can avoid queueing cap_snaps.
+ */
+ ceph_update_snap_trace(mdsc, p, e,
+ op == CEPH_SNAP_OP_DESTROY, NULL);
+
+ if (op == CEPH_SNAP_OP_SPLIT)
+ /* we took a reference when we created the realm, above */
+ ceph_put_snap_realm(mdsc, realm);
+
+ __cleanup_empty_realms(mdsc);
+
+ up_write(&mdsc->snap_rwsem);
+
+ flush_snaps(mdsc);
+ return;
+
+bad:
+ pr_err("corrupt snap message from mds%d\n", mds);
+ ceph_msg_dump(msg);
+out:
+ if (locked_rwsem)
+ up_write(&mdsc->snap_rwsem);
+ return;
+}
diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c
new file mode 100644
index 000000000..4a79f3632
--- /dev/null
+++ b/fs/ceph/strings.c
@@ -0,0 +1,128 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Ceph fs string constants
+ */
+#include <linux/module.h>
+#include <linux/ceph/types.h>
+
+
+const char *ceph_mds_state_name(int s)
+{
+ switch (s) {
+ /* down and out */
+ case CEPH_MDS_STATE_DNE: return "down:dne";
+ case CEPH_MDS_STATE_STOPPED: return "down:stopped";
+ /* up and out */
+ case CEPH_MDS_STATE_BOOT: return "up:boot";
+ case CEPH_MDS_STATE_STANDBY: return "up:standby";
+ case CEPH_MDS_STATE_STANDBY_REPLAY: return "up:standby-replay";
+ case CEPH_MDS_STATE_REPLAYONCE: return "up:oneshot-replay";
+ case CEPH_MDS_STATE_CREATING: return "up:creating";
+ case CEPH_MDS_STATE_STARTING: return "up:starting";
+ /* up and in */
+ case CEPH_MDS_STATE_REPLAY: return "up:replay";
+ case CEPH_MDS_STATE_RESOLVE: return "up:resolve";
+ case CEPH_MDS_STATE_RECONNECT: return "up:reconnect";
+ case CEPH_MDS_STATE_REJOIN: return "up:rejoin";
+ case CEPH_MDS_STATE_CLIENTREPLAY: return "up:clientreplay";
+ case CEPH_MDS_STATE_ACTIVE: return "up:active";
+ case CEPH_MDS_STATE_STOPPING: return "up:stopping";
+ }
+ return "???";
+}
+
+const char *ceph_session_op_name(int op)
+{
+ switch (op) {
+ case CEPH_SESSION_REQUEST_OPEN: return "request_open";
+ case CEPH_SESSION_OPEN: return "open";
+ case CEPH_SESSION_REQUEST_CLOSE: return "request_close";
+ case CEPH_SESSION_CLOSE: return "close";
+ case CEPH_SESSION_REQUEST_RENEWCAPS: return "request_renewcaps";
+ case CEPH_SESSION_RENEWCAPS: return "renewcaps";
+ case CEPH_SESSION_STALE: return "stale";
+ case CEPH_SESSION_RECALL_STATE: return "recall_state";
+ case CEPH_SESSION_FLUSHMSG: return "flushmsg";
+ case CEPH_SESSION_FLUSHMSG_ACK: return "flushmsg_ack";
+ case CEPH_SESSION_FORCE_RO: return "force_ro";
+ case CEPH_SESSION_REJECT: return "reject";
+ }
+ return "???";
+}
+
+const char *ceph_mds_op_name(int op)
+{
+ switch (op) {
+ case CEPH_MDS_OP_LOOKUP: return "lookup";
+ case CEPH_MDS_OP_LOOKUPHASH: return "lookuphash";
+ case CEPH_MDS_OP_LOOKUPPARENT: return "lookupparent";
+ case CEPH_MDS_OP_LOOKUPINO: return "lookupino";
+ case CEPH_MDS_OP_LOOKUPNAME: return "lookupname";
+ case CEPH_MDS_OP_GETATTR: return "getattr";
+ case CEPH_MDS_OP_SETXATTR: return "setxattr";
+ case CEPH_MDS_OP_SETATTR: return "setattr";
+ case CEPH_MDS_OP_RMXATTR: return "rmxattr";
+ case CEPH_MDS_OP_SETLAYOUT: return "setlayou";
+ case CEPH_MDS_OP_SETDIRLAYOUT: return "setdirlayout";
+ case CEPH_MDS_OP_READDIR: return "readdir";
+ case CEPH_MDS_OP_MKNOD: return "mknod";
+ case CEPH_MDS_OP_LINK: return "link";
+ case CEPH_MDS_OP_UNLINK: return "unlink";
+ case CEPH_MDS_OP_RENAME: return "rename";
+ case CEPH_MDS_OP_MKDIR: return "mkdir";
+ case CEPH_MDS_OP_RMDIR: return "rmdir";
+ case CEPH_MDS_OP_SYMLINK: return "symlink";
+ case CEPH_MDS_OP_CREATE: return "create";
+ case CEPH_MDS_OP_OPEN: return "open";
+ case CEPH_MDS_OP_LOOKUPSNAP: return "lookupsnap";
+ case CEPH_MDS_OP_LSSNAP: return "lssnap";
+ case CEPH_MDS_OP_MKSNAP: return "mksnap";
+ case CEPH_MDS_OP_RMSNAP: return "rmsnap";
+ case CEPH_MDS_OP_RENAMESNAP: return "renamesnap";
+ case CEPH_MDS_OP_SETFILELOCK: return "setfilelock";
+ case CEPH_MDS_OP_GETFILELOCK: return "getfilelock";
+ }
+ return "???";
+}
+
+const char *ceph_cap_op_name(int op)
+{
+ switch (op) {
+ case CEPH_CAP_OP_GRANT: return "grant";
+ case CEPH_CAP_OP_REVOKE: return "revoke";
+ case CEPH_CAP_OP_TRUNC: return "trunc";
+ case CEPH_CAP_OP_EXPORT: return "export";
+ case CEPH_CAP_OP_IMPORT: return "import";
+ case CEPH_CAP_OP_UPDATE: return "update";
+ case CEPH_CAP_OP_DROP: return "drop";
+ case CEPH_CAP_OP_FLUSH: return "flush";
+ case CEPH_CAP_OP_FLUSH_ACK: return "flush_ack";
+ case CEPH_CAP_OP_FLUSHSNAP: return "flushsnap";
+ case CEPH_CAP_OP_FLUSHSNAP_ACK: return "flushsnap_ack";
+ case CEPH_CAP_OP_RELEASE: return "release";
+ case CEPH_CAP_OP_RENEW: return "renew";
+ }
+ return "???";
+}
+
+const char *ceph_lease_op_name(int o)
+{
+ switch (o) {
+ case CEPH_MDS_LEASE_REVOKE: return "revoke";
+ case CEPH_MDS_LEASE_RELEASE: return "release";
+ case CEPH_MDS_LEASE_RENEW: return "renew";
+ case CEPH_MDS_LEASE_REVOKE_ACK: return "revoke_ack";
+ }
+ return "???";
+}
+
+const char *ceph_snap_op_name(int o)
+{
+ switch (o) {
+ case CEPH_SNAP_OP_UPDATE: return "update";
+ case CEPH_SNAP_OP_CREATE: return "create";
+ case CEPH_SNAP_OP_DESTROY: return "destroy";
+ case CEPH_SNAP_OP_SPLIT: return "split";
+ }
+ return "???";
+}
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
new file mode 100644
index 000000000..18e967089
--- /dev/null
+++ b/fs/ceph/super.c
@@ -0,0 +1,1214 @@
+
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/backing-dev.h>
+#include <linux/ctype.h>
+#include <linux/fs.h>
+#include <linux/inet.h>
+#include <linux/in6.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/parser.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/statfs.h>
+#include <linux/string.h>
+
+#include "super.h"
+#include "mds_client.h"
+#include "cache.h"
+
+#include <linux/ceph/ceph_features.h>
+#include <linux/ceph/decode.h>
+#include <linux/ceph/mon_client.h>
+#include <linux/ceph/auth.h>
+#include <linux/ceph/debugfs.h>
+
+/*
+ * Ceph superblock operations
+ *
+ * Handle the basics of mounting, unmounting.
+ */
+
+/*
+ * super ops
+ */
+static void ceph_put_super(struct super_block *s)
+{
+ struct ceph_fs_client *fsc = ceph_sb_to_client(s);
+
+ dout("put_super\n");
+ ceph_mdsc_close_sessions(fsc->mdsc);
+}
+
+static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+ struct ceph_fs_client *fsc = ceph_inode_to_client(d_inode(dentry));
+ struct ceph_mon_client *monc = &fsc->client->monc;
+ struct ceph_statfs st;
+ u64 fsid;
+ int err;
+ u64 data_pool;
+
+ if (fsc->mdsc->mdsmap->m_num_data_pg_pools == 1) {
+ data_pool = fsc->mdsc->mdsmap->m_data_pg_pools[0];
+ } else {
+ data_pool = CEPH_NOPOOL;
+ }
+
+ dout("statfs\n");
+ err = ceph_monc_do_statfs(monc, data_pool, &st);
+ if (err < 0)
+ return err;
+
+ /* fill in kstatfs */
+ buf->f_type = CEPH_SUPER_MAGIC; /* ?? */
+
+ /*
+ * express utilization in terms of large blocks to avoid
+ * overflow on 32-bit machines.
+ *
+ * NOTE: for the time being, we make bsize == frsize to humor
+ * not-yet-ancient versions of glibc that are broken.
+ * Someday, we will probably want to report a real block
+ * size... whatever that may mean for a network file system!
+ */
+ buf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
+ buf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
+
+ /*
+ * By default use root quota for stats; fallback to overall filesystem
+ * usage if using 'noquotadf' mount option or if the root dir doesn't
+ * have max_bytes quota set.
+ */
+ if (ceph_test_mount_opt(fsc, NOQUOTADF) ||
+ !ceph_quota_update_statfs(fsc, buf)) {
+ buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10);
+ buf->f_bfree = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
+ buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
+ }
+
+ buf->f_files = le64_to_cpu(st.num_objects);
+ buf->f_ffree = -1;
+ buf->f_namelen = NAME_MAX;
+
+ /* Must convert the fsid, for consistent values across arches */
+ mutex_lock(&monc->mutex);
+ fsid = le64_to_cpu(*(__le64 *)(&monc->monmap->fsid)) ^
+ le64_to_cpu(*((__le64 *)&monc->monmap->fsid + 1));
+ mutex_unlock(&monc->mutex);
+
+ buf->f_fsid.val[0] = fsid & 0xffffffff;
+ buf->f_fsid.val[1] = fsid >> 32;
+
+ return 0;
+}
+
+static int ceph_sync_fs(struct super_block *sb, int wait)
+{
+ struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
+
+ if (!wait) {
+ dout("sync_fs (non-blocking)\n");
+ ceph_flush_dirty_caps(fsc->mdsc);
+ dout("sync_fs (non-blocking) done\n");
+ return 0;
+ }
+
+ dout("sync_fs (blocking)\n");
+ ceph_osdc_sync(&fsc->client->osdc);
+ ceph_mdsc_sync(fsc->mdsc);
+ dout("sync_fs (blocking) done\n");
+ return 0;
+}
+
+/*
+ * mount options
+ */
+enum {
+ Opt_wsize,
+ Opt_rsize,
+ Opt_rasize,
+ Opt_caps_wanted_delay_min,
+ Opt_caps_wanted_delay_max,
+ Opt_readdir_max_entries,
+ Opt_readdir_max_bytes,
+ Opt_congestion_kb,
+ Opt_last_int,
+ /* int args above */
+ Opt_snapdirname,
+ Opt_mds_namespace,
+ Opt_fscache_uniq,
+ Opt_last_string,
+ /* string args above */
+ Opt_dirstat,
+ Opt_nodirstat,
+ Opt_rbytes,
+ Opt_norbytes,
+ Opt_asyncreaddir,
+ Opt_noasyncreaddir,
+ Opt_dcache,
+ Opt_nodcache,
+ Opt_ino32,
+ Opt_noino32,
+ Opt_fscache,
+ Opt_nofscache,
+ Opt_poolperm,
+ Opt_nopoolperm,
+ Opt_require_active_mds,
+ Opt_norequire_active_mds,
+#ifdef CONFIG_CEPH_FS_POSIX_ACL
+ Opt_acl,
+#endif
+ Opt_noacl,
+ Opt_quotadf,
+ Opt_noquotadf,
+};
+
+static match_table_t fsopt_tokens = {
+ {Opt_wsize, "wsize=%d"},
+ {Opt_rsize, "rsize=%d"},
+ {Opt_rasize, "rasize=%d"},
+ {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
+ {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
+ {Opt_readdir_max_entries, "readdir_max_entries=%d"},
+ {Opt_readdir_max_bytes, "readdir_max_bytes=%d"},
+ {Opt_congestion_kb, "write_congestion_kb=%d"},
+ /* int args above */
+ {Opt_snapdirname, "snapdirname=%s"},
+ {Opt_mds_namespace, "mds_namespace=%s"},
+ {Opt_fscache_uniq, "fsc=%s"},
+ /* string args above */
+ {Opt_dirstat, "dirstat"},
+ {Opt_nodirstat, "nodirstat"},
+ {Opt_rbytes, "rbytes"},
+ {Opt_norbytes, "norbytes"},
+ {Opt_asyncreaddir, "asyncreaddir"},
+ {Opt_noasyncreaddir, "noasyncreaddir"},
+ {Opt_dcache, "dcache"},
+ {Opt_nodcache, "nodcache"},
+ {Opt_ino32, "ino32"},
+ {Opt_noino32, "noino32"},
+ {Opt_fscache, "fsc"},
+ {Opt_nofscache, "nofsc"},
+ {Opt_poolperm, "poolperm"},
+ {Opt_nopoolperm, "nopoolperm"},
+ {Opt_require_active_mds, "require_active_mds"},
+ {Opt_norequire_active_mds, "norequire_active_mds"},
+#ifdef CONFIG_CEPH_FS_POSIX_ACL
+ {Opt_acl, "acl"},
+#endif
+ {Opt_noacl, "noacl"},
+ {Opt_quotadf, "quotadf"},
+ {Opt_noquotadf, "noquotadf"},
+ {-1, NULL}
+};
+
+/*
+ * Remove adjacent slashes and then the trailing slash, unless it is
+ * the only remaining character.
+ *
+ * E.g. "//dir1////dir2///" --> "/dir1/dir2", "///" --> "/".
+ */
+static void canonicalize_path(char *path)
+{
+ int i, j = 0;
+
+ for (i = 0; path[i] != '\0'; i++) {
+ if (path[i] != '/' || j < 1 || path[j - 1] != '/')
+ path[j++] = path[i];
+ }
+
+ if (j > 1 && path[j - 1] == '/')
+ j--;
+ path[j] = '\0';
+}
+
+static int parse_fsopt_token(char *c, void *private)
+{
+ struct ceph_mount_options *fsopt = private;
+ substring_t argstr[MAX_OPT_ARGS];
+ int token, intval, ret;
+
+ token = match_token((char *)c, fsopt_tokens, argstr);
+ if (token < 0)
+ return -EINVAL;
+
+ if (token < Opt_last_int) {
+ ret = match_int(&argstr[0], &intval);
+ if (ret < 0) {
+ pr_err("bad option arg (not int) at '%s'\n", c);
+ return ret;
+ }
+ dout("got int token %d val %d\n", token, intval);
+ } else if (token > Opt_last_int && token < Opt_last_string) {
+ dout("got string token %d val %s\n", token,
+ argstr[0].from);
+ } else {
+ dout("got token %d\n", token);
+ }
+
+ switch (token) {
+ case Opt_snapdirname:
+ kfree(fsopt->snapdir_name);
+ fsopt->snapdir_name = kstrndup(argstr[0].from,
+ argstr[0].to-argstr[0].from,
+ GFP_KERNEL);
+ if (!fsopt->snapdir_name)
+ return -ENOMEM;
+ break;
+ case Opt_mds_namespace:
+ kfree(fsopt->mds_namespace);
+ fsopt->mds_namespace = kstrndup(argstr[0].from,
+ argstr[0].to-argstr[0].from,
+ GFP_KERNEL);
+ if (!fsopt->mds_namespace)
+ return -ENOMEM;
+ break;
+ case Opt_fscache_uniq:
+#ifdef CONFIG_CEPH_FSCACHE
+ kfree(fsopt->fscache_uniq);
+ fsopt->fscache_uniq = kstrndup(argstr[0].from,
+ argstr[0].to-argstr[0].from,
+ GFP_KERNEL);
+ if (!fsopt->fscache_uniq)
+ return -ENOMEM;
+ fsopt->flags |= CEPH_MOUNT_OPT_FSCACHE;
+ break;
+#else
+ pr_err("fscache support is disabled\n");
+ return -EINVAL;
+#endif
+ case Opt_wsize:
+ if (intval < (int)PAGE_SIZE || intval > CEPH_MAX_WRITE_SIZE)
+ return -EINVAL;
+ fsopt->wsize = ALIGN(intval, PAGE_SIZE);
+ break;
+ case Opt_rsize:
+ if (intval < (int)PAGE_SIZE || intval > CEPH_MAX_READ_SIZE)
+ return -EINVAL;
+ fsopt->rsize = ALIGN(intval, PAGE_SIZE);
+ break;
+ case Opt_rasize:
+ if (intval < 0)
+ return -EINVAL;
+ fsopt->rasize = ALIGN(intval, PAGE_SIZE);
+ break;
+ case Opt_caps_wanted_delay_min:
+ if (intval < 1)
+ return -EINVAL;
+ fsopt->caps_wanted_delay_min = intval;
+ break;
+ case Opt_caps_wanted_delay_max:
+ if (intval < 1)
+ return -EINVAL;
+ fsopt->caps_wanted_delay_max = intval;
+ break;
+ case Opt_readdir_max_entries:
+ if (intval < 1)
+ return -EINVAL;
+ fsopt->max_readdir = intval;
+ break;
+ case Opt_readdir_max_bytes:
+ if (intval < (int)PAGE_SIZE && intval != 0)
+ return -EINVAL;
+ fsopt->max_readdir_bytes = intval;
+ break;
+ case Opt_congestion_kb:
+ if (intval < 1024) /* at least 1M */
+ return -EINVAL;
+ fsopt->congestion_kb = intval;
+ break;
+ case Opt_dirstat:
+ fsopt->flags |= CEPH_MOUNT_OPT_DIRSTAT;
+ break;
+ case Opt_nodirstat:
+ fsopt->flags &= ~CEPH_MOUNT_OPT_DIRSTAT;
+ break;
+ case Opt_rbytes:
+ fsopt->flags |= CEPH_MOUNT_OPT_RBYTES;
+ break;
+ case Opt_norbytes:
+ fsopt->flags &= ~CEPH_MOUNT_OPT_RBYTES;
+ break;
+ case Opt_asyncreaddir:
+ fsopt->flags &= ~CEPH_MOUNT_OPT_NOASYNCREADDIR;
+ break;
+ case Opt_noasyncreaddir:
+ fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR;
+ break;
+ case Opt_dcache:
+ fsopt->flags |= CEPH_MOUNT_OPT_DCACHE;
+ break;
+ case Opt_nodcache:
+ fsopt->flags &= ~CEPH_MOUNT_OPT_DCACHE;
+ break;
+ case Opt_ino32:
+ fsopt->flags |= CEPH_MOUNT_OPT_INO32;
+ break;
+ case Opt_noino32:
+ fsopt->flags &= ~CEPH_MOUNT_OPT_INO32;
+ break;
+ case Opt_fscache:
+#ifdef CONFIG_CEPH_FSCACHE
+ fsopt->flags |= CEPH_MOUNT_OPT_FSCACHE;
+ kfree(fsopt->fscache_uniq);
+ fsopt->fscache_uniq = NULL;
+ break;
+#else
+ pr_err("fscache support is disabled\n");
+ return -EINVAL;
+#endif
+ case Opt_nofscache:
+ fsopt->flags &= ~CEPH_MOUNT_OPT_FSCACHE;
+ kfree(fsopt->fscache_uniq);
+ fsopt->fscache_uniq = NULL;
+ break;
+ case Opt_poolperm:
+ fsopt->flags &= ~CEPH_MOUNT_OPT_NOPOOLPERM;
+ break;
+ case Opt_nopoolperm:
+ fsopt->flags |= CEPH_MOUNT_OPT_NOPOOLPERM;
+ break;
+ case Opt_require_active_mds:
+ fsopt->flags &= ~CEPH_MOUNT_OPT_MOUNTWAIT;
+ break;
+ case Opt_norequire_active_mds:
+ fsopt->flags |= CEPH_MOUNT_OPT_MOUNTWAIT;
+ break;
+ case Opt_quotadf:
+ fsopt->flags &= ~CEPH_MOUNT_OPT_NOQUOTADF;
+ break;
+ case Opt_noquotadf:
+ fsopt->flags |= CEPH_MOUNT_OPT_NOQUOTADF;
+ break;
+#ifdef CONFIG_CEPH_FS_POSIX_ACL
+ case Opt_acl:
+ fsopt->sb_flags |= SB_POSIXACL;
+ break;
+#endif
+ case Opt_noacl:
+ fsopt->sb_flags &= ~SB_POSIXACL;
+ break;
+ default:
+ BUG_ON(token);
+ }
+ return 0;
+}
+
+static void destroy_mount_options(struct ceph_mount_options *args)
+{
+ dout("destroy_mount_options %p\n", args);
+ kfree(args->snapdir_name);
+ kfree(args->mds_namespace);
+ kfree(args->server_path);
+ kfree(args->fscache_uniq);
+ kfree(args);
+}
+
+static int strcmp_null(const char *s1, const char *s2)
+{
+ if (!s1 && !s2)
+ return 0;
+ if (s1 && !s2)
+ return -1;
+ if (!s1 && s2)
+ return 1;
+ return strcmp(s1, s2);
+}
+
+static int compare_mount_options(struct ceph_mount_options *new_fsopt,
+ struct ceph_options *new_opt,
+ struct ceph_fs_client *fsc)
+{
+ struct ceph_mount_options *fsopt1 = new_fsopt;
+ struct ceph_mount_options *fsopt2 = fsc->mount_options;
+ int ofs = offsetof(struct ceph_mount_options, snapdir_name);
+ int ret;
+
+ ret = memcmp(fsopt1, fsopt2, ofs);
+ if (ret)
+ return ret;
+
+ ret = strcmp_null(fsopt1->snapdir_name, fsopt2->snapdir_name);
+ if (ret)
+ return ret;
+
+ ret = strcmp_null(fsopt1->mds_namespace, fsopt2->mds_namespace);
+ if (ret)
+ return ret;
+
+ ret = strcmp_null(fsopt1->server_path, fsopt2->server_path);
+ if (ret)
+ return ret;
+
+ ret = strcmp_null(fsopt1->fscache_uniq, fsopt2->fscache_uniq);
+ if (ret)
+ return ret;
+
+ return ceph_compare_options(new_opt, fsc->client);
+}
+
+static int parse_mount_options(struct ceph_mount_options **pfsopt,
+ struct ceph_options **popt,
+ int flags, char *options,
+ const char *dev_name)
+{
+ struct ceph_mount_options *fsopt;
+ const char *dev_name_end;
+ int err;
+
+ if (!dev_name || !*dev_name)
+ return -EINVAL;
+
+ fsopt = kzalloc(sizeof(*fsopt), GFP_KERNEL);
+ if (!fsopt)
+ return -ENOMEM;
+
+ dout("parse_mount_options %p, dev_name '%s'\n", fsopt, dev_name);
+
+ fsopt->sb_flags = flags;
+ fsopt->flags = CEPH_MOUNT_OPT_DEFAULT;
+
+ fsopt->wsize = CEPH_MAX_WRITE_SIZE;
+ fsopt->rsize = CEPH_MAX_READ_SIZE;
+ fsopt->rasize = CEPH_RASIZE_DEFAULT;
+ fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
+ if (!fsopt->snapdir_name) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
+ fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
+ fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT;
+ fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
+ fsopt->congestion_kb = default_congestion_kb();
+
+ /*
+ * Distinguish the server list from the path in "dev_name".
+ * Internally we do not include the leading '/' in the path.
+ *
+ * "dev_name" will look like:
+ * <server_spec>[,<server_spec>...]:[<path>]
+ * where
+ * <server_spec> is <ip>[:<port>]
+ * <path> is optional, but if present must begin with '/'
+ */
+ dev_name_end = strchr(dev_name, '/');
+ if (dev_name_end) {
+ /*
+ * The server_path will include the whole chars from userland
+ * including the leading '/'.
+ */
+ fsopt->server_path = kstrdup(dev_name_end, GFP_KERNEL);
+ if (!fsopt->server_path) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ canonicalize_path(fsopt->server_path);
+ } else {
+ dev_name_end = dev_name + strlen(dev_name);
+ }
+ err = -EINVAL;
+ dev_name_end--; /* back up to ':' separator */
+ if (dev_name_end < dev_name || *dev_name_end != ':') {
+ pr_err("device name is missing path (no : separator in %s)\n",
+ dev_name);
+ goto out;
+ }
+ dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name);
+ if (fsopt->server_path)
+ dout("server path '%s'\n", fsopt->server_path);
+
+ *popt = ceph_parse_options(options, dev_name, dev_name_end,
+ parse_fsopt_token, (void *)fsopt);
+ if (IS_ERR(*popt)) {
+ err = PTR_ERR(*popt);
+ goto out;
+ }
+
+ /* success */
+ *pfsopt = fsopt;
+ return 0;
+
+out:
+ destroy_mount_options(fsopt);
+ return err;
+}
+
+/**
+ * ceph_show_options - Show mount options in /proc/mounts
+ * @m: seq_file to write to
+ * @root: root of that (sub)tree
+ */
+static int ceph_show_options(struct seq_file *m, struct dentry *root)
+{
+ struct ceph_fs_client *fsc = ceph_sb_to_client(root->d_sb);
+ struct ceph_mount_options *fsopt = fsc->mount_options;
+ size_t pos;
+ int ret;
+
+ /* a comma between MNT/MS and client options */
+ seq_putc(m, ',');
+ pos = m->count;
+
+ ret = ceph_print_client_options(m, fsc->client);
+ if (ret)
+ return ret;
+
+ /* retract our comma if no client options */
+ if (m->count == pos)
+ m->count--;
+
+ if (fsopt->flags & CEPH_MOUNT_OPT_DIRSTAT)
+ seq_puts(m, ",dirstat");
+ if ((fsopt->flags & CEPH_MOUNT_OPT_RBYTES))
+ seq_puts(m, ",rbytes");
+ if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR)
+ seq_puts(m, ",noasyncreaddir");
+ if ((fsopt->flags & CEPH_MOUNT_OPT_DCACHE) == 0)
+ seq_puts(m, ",nodcache");
+ if (fsopt->flags & CEPH_MOUNT_OPT_INO32)
+ seq_puts(m, ",ino32");
+ if (fsopt->flags & CEPH_MOUNT_OPT_FSCACHE) {
+ seq_show_option(m, "fsc", fsopt->fscache_uniq);
+ }
+ if (fsopt->flags & CEPH_MOUNT_OPT_NOPOOLPERM)
+ seq_puts(m, ",nopoolperm");
+ if (fsopt->flags & CEPH_MOUNT_OPT_NOQUOTADF)
+ seq_puts(m, ",noquotadf");
+
+#ifdef CONFIG_CEPH_FS_POSIX_ACL
+ if (fsopt->sb_flags & SB_POSIXACL)
+ seq_puts(m, ",acl");
+ else
+ seq_puts(m, ",noacl");
+#endif
+
+ if (fsopt->mds_namespace)
+ seq_show_option(m, "mds_namespace", fsopt->mds_namespace);
+ if (fsopt->wsize != CEPH_MAX_WRITE_SIZE)
+ seq_printf(m, ",wsize=%d", fsopt->wsize);
+ if (fsopt->rsize != CEPH_MAX_READ_SIZE)
+ seq_printf(m, ",rsize=%d", fsopt->rsize);
+ if (fsopt->rasize != CEPH_RASIZE_DEFAULT)
+ seq_printf(m, ",rasize=%d", fsopt->rasize);
+ if (fsopt->congestion_kb != default_congestion_kb())
+ seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb);
+ if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
+ seq_printf(m, ",caps_wanted_delay_min=%d",
+ fsopt->caps_wanted_delay_min);
+ if (fsopt->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT)
+ seq_printf(m, ",caps_wanted_delay_max=%d",
+ fsopt->caps_wanted_delay_max);
+ if (fsopt->max_readdir != CEPH_MAX_READDIR_DEFAULT)
+ seq_printf(m, ",readdir_max_entries=%d", fsopt->max_readdir);
+ if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
+ seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes);
+ if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
+ seq_show_option(m, "snapdirname", fsopt->snapdir_name);
+
+ return 0;
+}
+
+/*
+ * handle any mon messages the standard library doesn't understand.
+ * return error if we don't either.
+ */
+static int extra_mon_dispatch(struct ceph_client *client, struct ceph_msg *msg)
+{
+ struct ceph_fs_client *fsc = client->private;
+ int type = le16_to_cpu(msg->hdr.type);
+
+ switch (type) {
+ case CEPH_MSG_MDS_MAP:
+ ceph_mdsc_handle_mdsmap(fsc->mdsc, msg);
+ return 0;
+ case CEPH_MSG_FS_MAP_USER:
+ ceph_mdsc_handle_fsmap(fsc->mdsc, msg);
+ return 0;
+ default:
+ return -1;
+ }
+}
+
+/*
+ * create a new fs client
+ *
+ * Success or not, this function consumes @fsopt and @opt.
+ */
+static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
+ struct ceph_options *opt)
+{
+ struct ceph_fs_client *fsc;
+ int page_count;
+ size_t size;
+ int err;
+
+ fsc = kzalloc(sizeof(*fsc), GFP_KERNEL);
+ if (!fsc) {
+ err = -ENOMEM;
+ goto fail;
+ }
+
+ fsc->client = ceph_create_client(opt, fsc);
+ if (IS_ERR(fsc->client)) {
+ err = PTR_ERR(fsc->client);
+ goto fail;
+ }
+ opt = NULL; /* fsc->client now owns this */
+
+ fsc->client->extra_mon_dispatch = extra_mon_dispatch;
+ fsc->client->osdc.abort_on_full = true;
+
+ if (!fsopt->mds_namespace) {
+ ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP,
+ 0, true);
+ } else {
+ ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_FSMAP,
+ 0, false);
+ }
+
+ fsc->mount_options = fsopt;
+
+ fsc->sb = NULL;
+ fsc->mount_state = CEPH_MOUNT_MOUNTING;
+
+ atomic_long_set(&fsc->writeback_count, 0);
+
+ err = -ENOMEM;
+ /*
+ * The number of concurrent works can be high but they don't need
+ * to be processed in parallel, limit concurrency.
+ */
+ fsc->wb_wq = alloc_workqueue("ceph-writeback", 0, 1);
+ if (!fsc->wb_wq)
+ goto fail_client;
+ fsc->pg_inv_wq = alloc_workqueue("ceph-pg-invalid", 0, 1);
+ if (!fsc->pg_inv_wq)
+ goto fail_wb_wq;
+ fsc->trunc_wq = alloc_workqueue("ceph-trunc", 0, 1);
+ if (!fsc->trunc_wq)
+ goto fail_pg_inv_wq;
+
+ /* set up mempools */
+ err = -ENOMEM;
+ page_count = fsc->mount_options->wsize >> PAGE_SHIFT;
+ size = sizeof (struct page *) * (page_count ? page_count : 1);
+ fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10, size);
+ if (!fsc->wb_pagevec_pool)
+ goto fail_trunc_wq;
+
+ /* caps */
+ fsc->min_caps = fsopt->max_readdir;
+
+ return fsc;
+
+fail_trunc_wq:
+ destroy_workqueue(fsc->trunc_wq);
+fail_pg_inv_wq:
+ destroy_workqueue(fsc->pg_inv_wq);
+fail_wb_wq:
+ destroy_workqueue(fsc->wb_wq);
+fail_client:
+ ceph_destroy_client(fsc->client);
+fail:
+ kfree(fsc);
+ if (opt)
+ ceph_destroy_options(opt);
+ destroy_mount_options(fsopt);
+ return ERR_PTR(err);
+}
+
+static void flush_fs_workqueues(struct ceph_fs_client *fsc)
+{
+ flush_workqueue(fsc->wb_wq);
+ flush_workqueue(fsc->pg_inv_wq);
+ flush_workqueue(fsc->trunc_wq);
+}
+
+static void destroy_fs_client(struct ceph_fs_client *fsc)
+{
+ dout("destroy_fs_client %p\n", fsc);
+
+ destroy_workqueue(fsc->wb_wq);
+ destroy_workqueue(fsc->pg_inv_wq);
+ destroy_workqueue(fsc->trunc_wq);
+
+ mempool_destroy(fsc->wb_pagevec_pool);
+
+ destroy_mount_options(fsc->mount_options);
+
+ ceph_destroy_client(fsc->client);
+
+ kfree(fsc);
+ dout("destroy_fs_client %p done\n", fsc);
+}
+
+/*
+ * caches
+ */
+struct kmem_cache *ceph_inode_cachep;
+struct kmem_cache *ceph_cap_cachep;
+struct kmem_cache *ceph_cap_flush_cachep;
+struct kmem_cache *ceph_dentry_cachep;
+struct kmem_cache *ceph_file_cachep;
+struct kmem_cache *ceph_dir_file_cachep;
+
+static void ceph_inode_init_once(void *foo)
+{
+ struct ceph_inode_info *ci = foo;
+ inode_init_once(&ci->vfs_inode);
+}
+
+static int __init init_caches(void)
+{
+ int error = -ENOMEM;
+
+ ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
+ sizeof(struct ceph_inode_info),
+ __alignof__(struct ceph_inode_info),
+ SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
+ SLAB_ACCOUNT, ceph_inode_init_once);
+ if (!ceph_inode_cachep)
+ return -ENOMEM;
+
+ ceph_cap_cachep = KMEM_CACHE(ceph_cap, SLAB_MEM_SPREAD);
+ if (!ceph_cap_cachep)
+ goto bad_cap;
+ ceph_cap_flush_cachep = KMEM_CACHE(ceph_cap_flush,
+ SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+ if (!ceph_cap_flush_cachep)
+ goto bad_cap_flush;
+
+ ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
+ SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+ if (!ceph_dentry_cachep)
+ goto bad_dentry;
+
+ ceph_file_cachep = KMEM_CACHE(ceph_file_info, SLAB_MEM_SPREAD);
+ if (!ceph_file_cachep)
+ goto bad_file;
+
+ ceph_dir_file_cachep = KMEM_CACHE(ceph_dir_file_info, SLAB_MEM_SPREAD);
+ if (!ceph_dir_file_cachep)
+ goto bad_dir_file;
+
+ error = ceph_fscache_register();
+ if (error)
+ goto bad_fscache;
+
+ return 0;
+
+bad_fscache:
+ kmem_cache_destroy(ceph_dir_file_cachep);
+bad_dir_file:
+ kmem_cache_destroy(ceph_file_cachep);
+bad_file:
+ kmem_cache_destroy(ceph_dentry_cachep);
+bad_dentry:
+ kmem_cache_destroy(ceph_cap_flush_cachep);
+bad_cap_flush:
+ kmem_cache_destroy(ceph_cap_cachep);
+bad_cap:
+ kmem_cache_destroy(ceph_inode_cachep);
+ return error;
+}
+
+static void destroy_caches(void)
+{
+ /*
+ * Make sure all delayed rcu free inodes are flushed before we
+ * destroy cache.
+ */
+ rcu_barrier();
+
+ kmem_cache_destroy(ceph_inode_cachep);
+ kmem_cache_destroy(ceph_cap_cachep);
+ kmem_cache_destroy(ceph_cap_flush_cachep);
+ kmem_cache_destroy(ceph_dentry_cachep);
+ kmem_cache_destroy(ceph_file_cachep);
+ kmem_cache_destroy(ceph_dir_file_cachep);
+
+ ceph_fscache_unregister();
+}
+
+/*
+ * ceph_umount_begin - initiate forced umount. Tear down down the
+ * mount, skipping steps that may hang while waiting for server(s).
+ */
+static void ceph_umount_begin(struct super_block *sb)
+{
+ struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
+
+ dout("ceph_umount_begin - starting forced umount\n");
+ if (!fsc)
+ return;
+ fsc->mount_state = CEPH_MOUNT_SHUTDOWN;
+ ceph_osdc_abort_requests(&fsc->client->osdc, -EIO);
+ ceph_mdsc_force_umount(fsc->mdsc);
+ return;
+}
+
+static int ceph_remount(struct super_block *sb, int *flags, char *data)
+{
+ sync_filesystem(sb);
+ return 0;
+}
+
+static const struct super_operations ceph_super_ops = {
+ .alloc_inode = ceph_alloc_inode,
+ .destroy_inode = ceph_destroy_inode,
+ .write_inode = ceph_write_inode,
+ .drop_inode = ceph_drop_inode,
+ .evict_inode = ceph_evict_inode,
+ .sync_fs = ceph_sync_fs,
+ .put_super = ceph_put_super,
+ .remount_fs = ceph_remount,
+ .show_options = ceph_show_options,
+ .statfs = ceph_statfs,
+ .umount_begin = ceph_umount_begin,
+};
+
+/*
+ * Bootstrap mount by opening the root directory. Note the mount
+ * @started time from caller, and time out if this takes too long.
+ */
+static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
+ const char *path,
+ unsigned long started)
+{
+ struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_mds_request *req = NULL;
+ int err;
+ struct dentry *root;
+
+ /* open dir */
+ dout("open_root_inode opening '%s'\n", path);
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
+ if (IS_ERR(req))
+ return ERR_CAST(req);
+ req->r_path1 = kstrdup(path, GFP_NOFS);
+ if (!req->r_path1) {
+ root = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+
+ req->r_ino1.ino = CEPH_INO_ROOT;
+ req->r_ino1.snap = CEPH_NOSNAP;
+ req->r_started = started;
+ req->r_timeout = fsc->client->options->mount_timeout;
+ req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
+ req->r_num_caps = 2;
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
+ if (err == 0) {
+ struct inode *inode = req->r_target_inode;
+ req->r_target_inode = NULL;
+ dout("open_root_inode success\n");
+ root = d_make_root(inode);
+ if (!root) {
+ root = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+ dout("open_root_inode success, root dentry is %p\n", root);
+ } else {
+ root = ERR_PTR(err);
+ }
+out:
+ ceph_mdsc_put_request(req);
+ return root;
+}
+
+/*
+ * mount: join the ceph cluster, and open root directory.
+ */
+static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc)
+{
+ int err;
+ unsigned long started = jiffies; /* note the start time */
+ struct dentry *root;
+
+ dout("mount start %p\n", fsc);
+ mutex_lock(&fsc->client->mount_mutex);
+
+ if (!fsc->sb->s_root) {
+ const char *path = fsc->mount_options->server_path ?
+ fsc->mount_options->server_path + 1 : "";
+
+ err = __ceph_open_session(fsc->client, started);
+ if (err < 0)
+ goto out;
+
+ /* setup fscache */
+ if (fsc->mount_options->flags & CEPH_MOUNT_OPT_FSCACHE) {
+ err = ceph_fscache_register_fs(fsc);
+ if (err < 0)
+ goto out;
+ }
+
+ dout("mount opening path '%s'\n", path);
+
+ err = ceph_fs_debugfs_init(fsc);
+ if (err < 0)
+ goto out;
+
+ root = open_root_dentry(fsc, path, started);
+ if (IS_ERR(root)) {
+ err = PTR_ERR(root);
+ goto out;
+ }
+ fsc->sb->s_root = dget(root);
+ } else {
+ root = dget(fsc->sb->s_root);
+ }
+
+ fsc->mount_state = CEPH_MOUNT_MOUNTED;
+ dout("mount success\n");
+ mutex_unlock(&fsc->client->mount_mutex);
+ return root;
+
+out:
+ mutex_unlock(&fsc->client->mount_mutex);
+ return ERR_PTR(err);
+}
+
+static int ceph_set_super(struct super_block *s, void *data)
+{
+ struct ceph_fs_client *fsc = data;
+ int ret;
+
+ dout("set_super %p data %p\n", s, data);
+
+ s->s_flags = fsc->mount_options->sb_flags;
+ s->s_maxbytes = MAX_LFS_FILESIZE;
+
+ s->s_xattr = ceph_xattr_handlers;
+ s->s_fs_info = fsc;
+ fsc->sb = s;
+ fsc->max_file_size = 1ULL << 40; /* temp value until we get mdsmap */
+
+ s->s_op = &ceph_super_ops;
+ s->s_d_op = &ceph_dentry_ops;
+ s->s_export_op = &ceph_export_ops;
+
+ s->s_time_gran = 1000; /* 1000 ns == 1 us */
+
+ ret = set_anon_super(s, NULL); /* what is that second arg for? */
+ if (ret != 0)
+ goto fail;
+
+ return ret;
+
+fail:
+ s->s_fs_info = NULL;
+ fsc->sb = NULL;
+ return ret;
+}
+
+/*
+ * share superblock if same fs AND options
+ */
+static int ceph_compare_super(struct super_block *sb, void *data)
+{
+ struct ceph_fs_client *new = data;
+ struct ceph_mount_options *fsopt = new->mount_options;
+ struct ceph_options *opt = new->client->options;
+ struct ceph_fs_client *other = ceph_sb_to_client(sb);
+
+ dout("ceph_compare_super %p\n", sb);
+
+ if (compare_mount_options(fsopt, opt, other)) {
+ dout("monitor(s)/mount options don't match\n");
+ return 0;
+ }
+ if ((opt->flags & CEPH_OPT_FSID) &&
+ ceph_fsid_compare(&opt->fsid, &other->client->fsid)) {
+ dout("fsid doesn't match\n");
+ return 0;
+ }
+ if (fsopt->sb_flags != other->mount_options->sb_flags) {
+ dout("flags differ\n");
+ return 0;
+ }
+ return 1;
+}
+
+/*
+ * construct our own bdi so we can control readahead, etc.
+ */
+static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
+
+static int ceph_setup_bdi(struct super_block *sb, struct ceph_fs_client *fsc)
+{
+ int err;
+
+ err = super_setup_bdi_name(sb, "ceph-%ld",
+ atomic_long_inc_return(&bdi_seq));
+ if (err)
+ return err;
+
+ /* set ra_pages based on rasize mount option? */
+ sb->s_bdi->ra_pages = fsc->mount_options->rasize >> PAGE_SHIFT;
+
+ /* set io_pages based on max osd read size */
+ sb->s_bdi->io_pages = fsc->mount_options->rsize >> PAGE_SHIFT;
+
+ return 0;
+}
+
+static struct dentry *ceph_mount(struct file_system_type *fs_type,
+ int flags, const char *dev_name, void *data)
+{
+ struct super_block *sb;
+ struct ceph_fs_client *fsc;
+ struct dentry *res;
+ int err;
+ int (*compare_super)(struct super_block *, void *) = ceph_compare_super;
+ struct ceph_mount_options *fsopt = NULL;
+ struct ceph_options *opt = NULL;
+
+ dout("ceph_mount\n");
+
+#ifdef CONFIG_CEPH_FS_POSIX_ACL
+ flags |= SB_POSIXACL;
+#endif
+ err = parse_mount_options(&fsopt, &opt, flags, data, dev_name);
+ if (err < 0) {
+ res = ERR_PTR(err);
+ goto out_final;
+ }
+
+ /* create client (which we may/may not use) */
+ fsc = create_fs_client(fsopt, opt);
+ if (IS_ERR(fsc)) {
+ res = ERR_CAST(fsc);
+ goto out_final;
+ }
+
+ err = ceph_mdsc_init(fsc);
+ if (err < 0) {
+ res = ERR_PTR(err);
+ goto out;
+ }
+
+ if (ceph_test_opt(fsc->client, NOSHARE))
+ compare_super = NULL;
+ sb = sget(fs_type, compare_super, ceph_set_super, flags, fsc);
+ if (IS_ERR(sb)) {
+ res = ERR_CAST(sb);
+ goto out;
+ }
+
+ if (ceph_sb_to_client(sb) != fsc) {
+ ceph_mdsc_destroy(fsc);
+ destroy_fs_client(fsc);
+ fsc = ceph_sb_to_client(sb);
+ dout("get_sb got existing client %p\n", fsc);
+ } else {
+ dout("get_sb using new client %p\n", fsc);
+ err = ceph_setup_bdi(sb, fsc);
+ if (err < 0) {
+ res = ERR_PTR(err);
+ goto out_splat;
+ }
+ }
+
+ res = ceph_real_mount(fsc);
+ if (IS_ERR(res))
+ goto out_splat;
+ dout("root %p inode %p ino %llx.%llx\n", res,
+ d_inode(res), ceph_vinop(d_inode(res)));
+ return res;
+
+out_splat:
+ if (!ceph_mdsmap_is_cluster_available(fsc->mdsc->mdsmap)) {
+ pr_info("No mds server is up or the cluster is laggy\n");
+ err = -EHOSTUNREACH;
+ }
+
+ ceph_mdsc_close_sessions(fsc->mdsc);
+ deactivate_locked_super(sb);
+ goto out_final;
+
+out:
+ ceph_mdsc_destroy(fsc);
+ destroy_fs_client(fsc);
+out_final:
+ dout("ceph_mount fail %ld\n", PTR_ERR(res));
+ return res;
+}
+
+static void ceph_kill_sb(struct super_block *s)
+{
+ struct ceph_fs_client *fsc = ceph_sb_to_client(s);
+ dev_t dev = s->s_dev;
+
+ dout("kill_sb %p\n", s);
+
+ ceph_mdsc_pre_umount(fsc->mdsc);
+ flush_fs_workqueues(fsc);
+
+ generic_shutdown_super(s);
+
+ fsc->client->extra_mon_dispatch = NULL;
+ ceph_fs_debugfs_cleanup(fsc);
+
+ ceph_fscache_unregister_fs(fsc);
+
+ ceph_mdsc_destroy(fsc);
+
+ destroy_fs_client(fsc);
+ free_anon_bdev(dev);
+}
+
+static struct file_system_type ceph_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "ceph",
+ .mount = ceph_mount,
+ .kill_sb = ceph_kill_sb,
+ .fs_flags = FS_RENAME_DOES_D_MOVE,
+};
+MODULE_ALIAS_FS("ceph");
+
+static int __init init_ceph(void)
+{
+ int ret = init_caches();
+ if (ret)
+ goto out;
+
+ ceph_flock_init();
+ ceph_xattr_init();
+ ret = register_filesystem(&ceph_fs_type);
+ if (ret)
+ goto out_xattr;
+
+ pr_info("loaded (mds proto %d)\n", CEPH_MDSC_PROTOCOL);
+
+ return 0;
+
+out_xattr:
+ ceph_xattr_exit();
+ destroy_caches();
+out:
+ return ret;
+}
+
+static void __exit exit_ceph(void)
+{
+ dout("exit_ceph\n");
+ unregister_filesystem(&ceph_fs_type);
+ ceph_xattr_exit();
+ destroy_caches();
+}
+
+module_init(init_ceph);
+module_exit(exit_ceph);
+
+MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
+MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
+MODULE_AUTHOR("Patience Warnick <patience@newdream.net>");
+MODULE_DESCRIPTION("Ceph filesystem for Linux");
+MODULE_LICENSE("GPL");
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
new file mode 100644
index 000000000..65da12ff5
--- /dev/null
+++ b/fs/ceph/super.h
@@ -0,0 +1,1118 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _FS_CEPH_SUPER_H
+#define _FS_CEPH_SUPER_H
+
+#include <linux/ceph/ceph_debug.h>
+
+#include <asm/unaligned.h>
+#include <linux/backing-dev.h>
+#include <linux/completion.h>
+#include <linux/exportfs.h>
+#include <linux/fs.h>
+#include <linux/mempool.h>
+#include <linux/pagemap.h>
+#include <linux/wait.h>
+#include <linux/writeback.h>
+#include <linux/slab.h>
+#include <linux/posix_acl.h>
+#include <linux/refcount.h>
+
+#include <linux/ceph/libceph.h>
+
+#ifdef CONFIG_CEPH_FSCACHE
+#include <linux/fscache.h>
+#endif
+
+/* f_type in struct statfs */
+#define CEPH_SUPER_MAGIC 0x00c36400
+
+/* large granularity for statfs utilization stats to facilitate
+ * large volume sizes on 32-bit machines. */
+#define CEPH_BLOCK_SHIFT 22 /* 4 MB */
+#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT)
+
+#define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */
+#define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */
+#define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */
+#define CEPH_MOUNT_OPT_INO32 (1<<8) /* 32 bit inos */
+#define CEPH_MOUNT_OPT_DCACHE (1<<9) /* use dcache for readdir etc */
+#define CEPH_MOUNT_OPT_FSCACHE (1<<10) /* use fscache */
+#define CEPH_MOUNT_OPT_NOPOOLPERM (1<<11) /* no pool permission check */
+#define CEPH_MOUNT_OPT_MOUNTWAIT (1<<12) /* mount waits if no mds is up */
+#define CEPH_MOUNT_OPT_NOQUOTADF (1<<13) /* no root dir quota in statfs */
+
+#define CEPH_MOUNT_OPT_DEFAULT CEPH_MOUNT_OPT_DCACHE
+
+#define ceph_set_mount_opt(fsc, opt) \
+ (fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt;
+#define ceph_test_mount_opt(fsc, opt) \
+ (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt))
+
+/* max size of osd read request, limited by libceph */
+#define CEPH_MAX_READ_SIZE CEPH_MSG_MAX_DATA_LEN
+/* osd has a configurable limitaion of max write size.
+ * CEPH_MSG_MAX_DATA_LEN should be small enough. */
+#define CEPH_MAX_WRITE_SIZE CEPH_MSG_MAX_DATA_LEN
+#define CEPH_RASIZE_DEFAULT (8192*1024) /* max readahead */
+#define CEPH_MAX_READDIR_DEFAULT 1024
+#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024)
+#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
+
+/*
+ * Delay telling the MDS we no longer want caps, in case we reopen
+ * the file. Delay a minimum amount of time, even if we send a cap
+ * message for some other reason. Otherwise, take the oppotunity to
+ * update the mds to avoid sending another message later.
+ */
+#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */
+#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */
+
+struct ceph_mount_options {
+ int flags;
+ int sb_flags;
+
+ int wsize; /* max write size */
+ int rsize; /* max read size */
+ int rasize; /* max readahead */
+ int congestion_kb; /* max writeback in flight */
+ int caps_wanted_delay_min, caps_wanted_delay_max;
+ int max_readdir; /* max readdir result (entires) */
+ int max_readdir_bytes; /* max readdir result (bytes) */
+
+ /*
+ * everything above this point can be memcmp'd; everything below
+ * is handled in compare_mount_options()
+ */
+
+ char *snapdir_name; /* default ".snap" */
+ char *mds_namespace; /* default NULL */
+ char *server_path; /* default NULL (means "/") */
+ char *fscache_uniq; /* default NULL */
+};
+
+struct ceph_fs_client {
+ struct super_block *sb;
+
+ struct ceph_mount_options *mount_options;
+ struct ceph_client *client;
+
+ unsigned long mount_state;
+ int min_caps; /* min caps i added */
+ loff_t max_file_size;
+
+ struct ceph_mds_client *mdsc;
+
+ /* writeback */
+ mempool_t *wb_pagevec_pool;
+ struct workqueue_struct *wb_wq;
+ struct workqueue_struct *pg_inv_wq;
+ struct workqueue_struct *trunc_wq;
+ atomic_long_t writeback_count;
+
+#ifdef CONFIG_DEBUG_FS
+ struct dentry *debugfs_dentry_lru, *debugfs_caps;
+ struct dentry *debugfs_congestion_kb;
+ struct dentry *debugfs_bdi;
+ struct dentry *debugfs_mdsc, *debugfs_mdsmap;
+ struct dentry *debugfs_mds_sessions;
+#endif
+
+#ifdef CONFIG_CEPH_FSCACHE
+ struct fscache_cookie *fscache;
+#endif
+};
+
+
+/*
+ * File i/o capability. This tracks shared state with the metadata
+ * server that allows us to cache or writeback attributes or to read
+ * and write data. For any given inode, we should have one or more
+ * capabilities, one issued by each metadata server, and our
+ * cumulative access is the OR of all issued capabilities.
+ *
+ * Each cap is referenced by the inode's i_caps rbtree and by per-mds
+ * session capability lists.
+ */
+struct ceph_cap {
+ struct ceph_inode_info *ci;
+ struct rb_node ci_node; /* per-ci cap tree */
+ struct ceph_mds_session *session;
+ struct list_head session_caps; /* per-session caplist */
+ u64 cap_id; /* unique cap id (mds provided) */
+ union {
+ /* in-use caps */
+ struct {
+ int issued; /* latest, from the mds */
+ int implemented; /* implemented superset of
+ issued (for revocation) */
+ int mds, mds_wanted;
+ };
+ /* caps to release */
+ struct {
+ u64 cap_ino;
+ int queue_release;
+ };
+ };
+ u32 seq, issue_seq, mseq;
+ u32 cap_gen; /* active/stale cycle */
+ unsigned long last_used;
+ struct list_head caps_item;
+};
+
+#define CHECK_CAPS_NODELAY 1 /* do not delay any further */
+#define CHECK_CAPS_AUTHONLY 2 /* only check auth cap */
+#define CHECK_CAPS_FLUSH 4 /* flush any dirty caps */
+
+struct ceph_cap_flush {
+ u64 tid;
+ int caps; /* 0 means capsnap */
+ bool wake; /* wake up flush waiters when finish ? */
+ struct list_head g_list; // global
+ struct list_head i_list; // per inode
+};
+
+/*
+ * Snapped cap state that is pending flush to mds. When a snapshot occurs,
+ * we first complete any in-process sync writes and writeback any dirty
+ * data before flushing the snapped state (tracked here) back to the MDS.
+ */
+struct ceph_cap_snap {
+ refcount_t nref;
+ struct list_head ci_item;
+
+ struct ceph_cap_flush cap_flush;
+
+ u64 follows;
+ int issued, dirty;
+ struct ceph_snap_context *context;
+
+ umode_t mode;
+ kuid_t uid;
+ kgid_t gid;
+
+ struct ceph_buffer *xattr_blob;
+ u64 xattr_version;
+
+ u64 size;
+ struct timespec64 mtime, atime, ctime;
+ u64 time_warp_seq;
+ u64 truncate_size;
+ u32 truncate_seq;
+ int writing; /* a sync write is still in progress */
+ int dirty_pages; /* dirty pages awaiting writeback */
+ bool inline_data;
+ bool need_flush;
+};
+
+static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
+{
+ if (refcount_dec_and_test(&capsnap->nref)) {
+ if (capsnap->xattr_blob)
+ ceph_buffer_put(capsnap->xattr_blob);
+ kfree(capsnap);
+ }
+}
+
+/*
+ * The frag tree describes how a directory is fragmented, potentially across
+ * multiple metadata servers. It is also used to indicate points where
+ * metadata authority is delegated, and whether/where metadata is replicated.
+ *
+ * A _leaf_ frag will be present in the i_fragtree IFF there is
+ * delegation info. That is, if mds >= 0 || ndist > 0.
+ */
+#define CEPH_MAX_DIRFRAG_REP 4
+
+struct ceph_inode_frag {
+ struct rb_node node;
+
+ /* fragtree state */
+ u32 frag;
+ int split_by; /* i.e. 2^(split_by) children */
+
+ /* delegation and replication info */
+ int mds; /* -1 if same authority as parent */
+ int ndist; /* >0 if replicated */
+ int dist[CEPH_MAX_DIRFRAG_REP];
+};
+
+/*
+ * We cache inode xattrs as an encoded blob until they are first used,
+ * at which point we parse them into an rbtree.
+ */
+struct ceph_inode_xattr {
+ struct rb_node node;
+
+ const char *name;
+ int name_len;
+ const char *val;
+ int val_len;
+ int dirty;
+
+ int should_free_name;
+ int should_free_val;
+};
+
+/*
+ * Ceph dentry state
+ */
+struct ceph_dentry_info {
+ struct ceph_mds_session *lease_session;
+ int lease_shared_gen;
+ u32 lease_gen;
+ u32 lease_seq;
+ unsigned long lease_renew_after, lease_renew_from;
+ struct list_head lru;
+ struct dentry *dentry;
+ unsigned long time;
+ u64 offset;
+};
+
+struct ceph_inode_xattrs_info {
+ /*
+ * (still encoded) xattr blob. we avoid the overhead of parsing
+ * this until someone actually calls getxattr, etc.
+ *
+ * blob->vec.iov_len == 4 implies there are no xattrs; blob ==
+ * NULL means we don't know.
+ */
+ struct ceph_buffer *blob, *prealloc_blob;
+
+ struct rb_root index;
+ bool dirty;
+ int count;
+ int names_size;
+ int vals_size;
+ u64 version, index_version;
+};
+
+/*
+ * Ceph inode.
+ */
+struct ceph_inode_info {
+ struct ceph_vino i_vino; /* ceph ino + snap */
+
+ spinlock_t i_ceph_lock;
+
+ u64 i_version;
+ u64 i_inline_version;
+ u32 i_time_warp_seq;
+
+ unsigned i_ceph_flags;
+ atomic64_t i_release_count;
+ atomic64_t i_ordered_count;
+ atomic64_t i_complete_seq[2];
+
+ struct ceph_dir_layout i_dir_layout;
+ struct ceph_file_layout i_layout;
+ char *i_symlink;
+
+ /* for dirs */
+ struct timespec64 i_rctime;
+ u64 i_rbytes, i_rfiles, i_rsubdirs;
+ u64 i_files, i_subdirs;
+
+ /* quotas */
+ u64 i_max_bytes, i_max_files;
+
+ struct rb_root i_fragtree;
+ int i_fragtree_nsplits;
+ struct mutex i_fragtree_mutex;
+
+ struct ceph_inode_xattrs_info i_xattrs;
+
+ /* capabilities. protected _both_ by i_ceph_lock and cap->session's
+ * s_mutex. */
+ struct rb_root i_caps; /* cap list */
+ struct ceph_cap *i_auth_cap; /* authoritative cap, if any */
+ unsigned i_dirty_caps, i_flushing_caps; /* mask of dirtied fields */
+ struct list_head i_dirty_item, i_flushing_item;
+ /* we need to track cap writeback on a per-cap-bit basis, to allow
+ * overlapping, pipelined cap flushes to the mds. we can probably
+ * reduce the tid to 8 bits if we're concerned about inode size. */
+ struct ceph_cap_flush *i_prealloc_cap_flush;
+ struct list_head i_cap_flush_list;
+ wait_queue_head_t i_cap_wq; /* threads waiting on a capability */
+ unsigned long i_hold_caps_min; /* jiffies */
+ unsigned long i_hold_caps_max; /* jiffies */
+ struct list_head i_cap_delay_list; /* for delayed cap release to mds */
+ struct ceph_cap_reservation i_cap_migration_resv;
+ struct list_head i_cap_snaps; /* snapped state pending flush to mds */
+ struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 or
+ dirty|flushing caps */
+ unsigned i_snap_caps; /* cap bits for snapped files */
+
+ int i_nr_by_mode[CEPH_FILE_MODE_BITS]; /* open file counts */
+
+ struct mutex i_truncate_mutex;
+ u32 i_truncate_seq; /* last truncate to smaller size */
+ u64 i_truncate_size; /* and the size we last truncated down to */
+ int i_truncate_pending; /* still need to call vmtruncate */
+
+ u64 i_max_size; /* max file size authorized by mds */
+ u64 i_reported_size; /* (max_)size reported to or requested of mds */
+ u64 i_wanted_max_size; /* offset we'd like to write too */
+ u64 i_requested_max_size; /* max_size we've requested */
+
+ /* held references to caps */
+ int i_pin_ref;
+ int i_rd_ref, i_rdcache_ref, i_wr_ref, i_wb_ref;
+ int i_wrbuffer_ref, i_wrbuffer_ref_head;
+ atomic_t i_filelock_ref;
+ atomic_t i_shared_gen; /* increment each time we get FILE_SHARED */
+ u32 i_rdcache_gen; /* incremented each time we get FILE_CACHE. */
+ u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */
+
+ struct list_head i_unsafe_dirops; /* uncommitted mds dir ops */
+ struct list_head i_unsafe_iops; /* uncommitted mds inode ops */
+ spinlock_t i_unsafe_lock;
+
+ struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */
+ int i_snap_realm_counter; /* snap realm (if caps) */
+ struct list_head i_snap_realm_item;
+ struct list_head i_snap_flush_item;
+
+ struct work_struct i_wb_work; /* writeback work */
+ struct work_struct i_pg_inv_work; /* page invalidation work */
+
+ struct work_struct i_vmtruncate_work;
+
+#ifdef CONFIG_CEPH_FSCACHE
+ struct fscache_cookie *fscache;
+ u32 i_fscache_gen;
+#endif
+ struct inode vfs_inode; /* at end */
+};
+
+static inline struct ceph_inode_info *ceph_inode(struct inode *inode)
+{
+ return container_of(inode, struct ceph_inode_info, vfs_inode);
+}
+
+static inline struct ceph_fs_client *ceph_inode_to_client(struct inode *inode)
+{
+ return (struct ceph_fs_client *)inode->i_sb->s_fs_info;
+}
+
+static inline struct ceph_fs_client *ceph_sb_to_client(struct super_block *sb)
+{
+ return (struct ceph_fs_client *)sb->s_fs_info;
+}
+
+static inline struct ceph_vino ceph_vino(struct inode *inode)
+{
+ return ceph_inode(inode)->i_vino;
+}
+
+/*
+ * ino_t is <64 bits on many architectures, blech.
+ *
+ * i_ino (kernel inode) st_ino (userspace)
+ * i386 32 32
+ * x86_64+ino32 64 32
+ * x86_64 64 64
+ */
+static inline u32 ceph_ino_to_ino32(__u64 vino)
+{
+ u32 ino = vino & 0xffffffff;
+ ino ^= vino >> 32;
+ if (!ino)
+ ino = 2;
+ return ino;
+}
+
+/*
+ * kernel i_ino value
+ */
+static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
+{
+#if BITS_PER_LONG == 32
+ return ceph_ino_to_ino32(vino.ino);
+#else
+ return (ino_t)vino.ino;
+#endif
+}
+
+/*
+ * user-visible ino (stat, filldir)
+ */
+#if BITS_PER_LONG == 32
+static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino)
+{
+ return ino;
+}
+#else
+static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino)
+{
+ if (ceph_test_mount_opt(ceph_sb_to_client(sb), INO32))
+ ino = ceph_ino_to_ino32(ino);
+ return ino;
+}
+#endif
+
+
+/* for printf-style formatting */
+#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
+
+static inline u64 ceph_ino(struct inode *inode)
+{
+ return ceph_inode(inode)->i_vino.ino;
+}
+static inline u64 ceph_snap(struct inode *inode)
+{
+ return ceph_inode(inode)->i_vino.snap;
+}
+
+static inline int ceph_ino_compare(struct inode *inode, void *data)
+{
+ struct ceph_vino *pvino = (struct ceph_vino *)data;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ return ci->i_vino.ino == pvino->ino &&
+ ci->i_vino.snap == pvino->snap;
+}
+
+static inline struct inode *ceph_find_inode(struct super_block *sb,
+ struct ceph_vino vino)
+{
+ ino_t t = ceph_vino_to_ino(vino);
+ return ilookup5(sb, t, ceph_ino_compare, &vino);
+}
+
+
+/*
+ * Ceph inode.
+ */
+#define CEPH_I_DIR_ORDERED (1 << 0) /* dentries in dir are ordered */
+#define CEPH_I_NODELAY (1 << 1) /* do not delay cap release */
+#define CEPH_I_FLUSH (1 << 2) /* do not delay flush of dirty metadata */
+#define CEPH_I_NOFLUSH (1 << 3) /* do not flush dirty caps */
+#define CEPH_I_POOL_PERM (1 << 4) /* pool rd/wr bits are valid */
+#define CEPH_I_POOL_RD (1 << 5) /* can read from pool */
+#define CEPH_I_POOL_WR (1 << 6) /* can write to pool */
+#define CEPH_I_SEC_INITED (1 << 7) /* security initialized */
+#define CEPH_I_CAP_DROPPED (1 << 8) /* caps were forcibly dropped */
+#define CEPH_I_KICK_FLUSH (1 << 9) /* kick flushing caps */
+#define CEPH_I_FLUSH_SNAPS (1 << 10) /* need flush snapss */
+#define CEPH_I_ERROR_WRITE (1 << 11) /* have seen write errors */
+#define CEPH_I_ERROR_FILELOCK (1 << 12) /* have seen file lock errors */
+
+
+/*
+ * We set the ERROR_WRITE bit when we start seeing write errors on an inode
+ * and then clear it when they start succeeding. Note that we do a lockless
+ * check first, and only take the lock if it looks like it needs to be changed.
+ * The write submission code just takes this as a hint, so we're not too
+ * worried if a few slip through in either direction.
+ */
+static inline void ceph_set_error_write(struct ceph_inode_info *ci)
+{
+ if (!(READ_ONCE(ci->i_ceph_flags) & CEPH_I_ERROR_WRITE)) {
+ spin_lock(&ci->i_ceph_lock);
+ ci->i_ceph_flags |= CEPH_I_ERROR_WRITE;
+ spin_unlock(&ci->i_ceph_lock);
+ }
+}
+
+static inline void ceph_clear_error_write(struct ceph_inode_info *ci)
+{
+ if (READ_ONCE(ci->i_ceph_flags) & CEPH_I_ERROR_WRITE) {
+ spin_lock(&ci->i_ceph_lock);
+ ci->i_ceph_flags &= ~CEPH_I_ERROR_WRITE;
+ spin_unlock(&ci->i_ceph_lock);
+ }
+}
+
+static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci,
+ long long release_count,
+ long long ordered_count)
+{
+ /*
+ * Makes sure operations that setup readdir cache (update page
+ * cache and i_size) are strongly ordered w.r.t. the following
+ * atomic64_set() operations.
+ */
+ smp_mb();
+ atomic64_set(&ci->i_complete_seq[0], release_count);
+ atomic64_set(&ci->i_complete_seq[1], ordered_count);
+}
+
+static inline void __ceph_dir_clear_complete(struct ceph_inode_info *ci)
+{
+ atomic64_inc(&ci->i_release_count);
+}
+
+static inline void __ceph_dir_clear_ordered(struct ceph_inode_info *ci)
+{
+ atomic64_inc(&ci->i_ordered_count);
+}
+
+static inline bool __ceph_dir_is_complete(struct ceph_inode_info *ci)
+{
+ return atomic64_read(&ci->i_complete_seq[0]) ==
+ atomic64_read(&ci->i_release_count);
+}
+
+static inline bool __ceph_dir_is_complete_ordered(struct ceph_inode_info *ci)
+{
+ return atomic64_read(&ci->i_complete_seq[0]) ==
+ atomic64_read(&ci->i_release_count) &&
+ atomic64_read(&ci->i_complete_seq[1]) ==
+ atomic64_read(&ci->i_ordered_count);
+}
+
+static inline void ceph_dir_clear_complete(struct inode *inode)
+{
+ __ceph_dir_clear_complete(ceph_inode(inode));
+}
+
+static inline void ceph_dir_clear_ordered(struct inode *inode)
+{
+ __ceph_dir_clear_ordered(ceph_inode(inode));
+}
+
+static inline bool ceph_dir_is_complete_ordered(struct inode *inode)
+{
+ bool ret = __ceph_dir_is_complete_ordered(ceph_inode(inode));
+ smp_rmb();
+ return ret;
+}
+
+/* find a specific frag @f */
+extern struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci,
+ u32 f);
+
+/*
+ * choose fragment for value @v. copy frag content to pfrag, if leaf
+ * exists
+ */
+extern u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
+ struct ceph_inode_frag *pfrag,
+ int *found);
+
+static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry)
+{
+ return (struct ceph_dentry_info *)dentry->d_fsdata;
+}
+
+/*
+ * caps helpers
+ */
+static inline bool __ceph_is_any_real_caps(struct ceph_inode_info *ci)
+{
+ return !RB_EMPTY_ROOT(&ci->i_caps);
+}
+
+extern int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented);
+extern int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int t);
+extern int __ceph_caps_issued_other(struct ceph_inode_info *ci,
+ struct ceph_cap *cap);
+
+static inline int ceph_caps_issued(struct ceph_inode_info *ci)
+{
+ int issued;
+ spin_lock(&ci->i_ceph_lock);
+ issued = __ceph_caps_issued(ci, NULL);
+ spin_unlock(&ci->i_ceph_lock);
+ return issued;
+}
+
+static inline int ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask,
+ int touch)
+{
+ int r;
+ spin_lock(&ci->i_ceph_lock);
+ r = __ceph_caps_issued_mask(ci, mask, touch);
+ spin_unlock(&ci->i_ceph_lock);
+ return r;
+}
+
+static inline int __ceph_caps_dirty(struct ceph_inode_info *ci)
+{
+ return ci->i_dirty_caps | ci->i_flushing_caps;
+}
+extern struct ceph_cap_flush *ceph_alloc_cap_flush(void);
+extern void ceph_free_cap_flush(struct ceph_cap_flush *cf);
+extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
+ struct ceph_cap_flush **pcf);
+
+extern int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
+ struct ceph_cap *ocap, int mask);
+extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask);
+extern int __ceph_caps_used(struct ceph_inode_info *ci);
+
+extern int __ceph_caps_file_wanted(struct ceph_inode_info *ci);
+
+/*
+ * wanted, by virtue of open file modes AND cap refs (buffered/cached data)
+ */
+static inline int __ceph_caps_wanted(struct ceph_inode_info *ci)
+{
+ int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci);
+ if (w & CEPH_CAP_FILE_BUFFER)
+ w |= CEPH_CAP_FILE_EXCL; /* we want EXCL if dirty data */
+ return w;
+}
+
+/* what the mds thinks we want */
+extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check);
+
+extern void ceph_caps_init(struct ceph_mds_client *mdsc);
+extern void ceph_caps_finalize(struct ceph_mds_client *mdsc);
+extern void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta);
+extern int ceph_reserve_caps(struct ceph_mds_client *mdsc,
+ struct ceph_cap_reservation *ctx, int need);
+extern void ceph_unreserve_caps(struct ceph_mds_client *mdsc,
+ struct ceph_cap_reservation *ctx);
+extern void ceph_reservation_status(struct ceph_fs_client *client,
+ int *total, int *avail, int *used,
+ int *reserved, int *min);
+
+
+
+/*
+ * we keep buffered readdir results attached to file->private_data
+ */
+#define CEPH_F_SYNC 1
+#define CEPH_F_ATEND 2
+
+struct ceph_file_info {
+ short fmode; /* initialized on open */
+ short flags; /* CEPH_F_* */
+
+ spinlock_t rw_contexts_lock;
+ struct list_head rw_contexts;
+};
+
+struct ceph_dir_file_info {
+ struct ceph_file_info file_info;
+
+ /* readdir: position within the dir */
+ u32 frag;
+ struct ceph_mds_request *last_readdir;
+
+ /* readdir: position within a frag */
+ unsigned next_offset; /* offset of next chunk (last_name's + 1) */
+ char *last_name; /* last entry in previous chunk */
+ long long dir_release_count;
+ long long dir_ordered_count;
+ int readdir_cache_idx;
+
+ /* used for -o dirstat read() on directory thing */
+ char *dir_info;
+ int dir_info_len;
+};
+
+struct ceph_rw_context {
+ struct list_head list;
+ struct task_struct *thread;
+ int caps;
+};
+
+#define CEPH_DEFINE_RW_CONTEXT(_name, _caps) \
+ struct ceph_rw_context _name = { \
+ .thread = current, \
+ .caps = _caps, \
+ }
+
+static inline void ceph_add_rw_context(struct ceph_file_info *cf,
+ struct ceph_rw_context *ctx)
+{
+ spin_lock(&cf->rw_contexts_lock);
+ list_add(&ctx->list, &cf->rw_contexts);
+ spin_unlock(&cf->rw_contexts_lock);
+}
+
+static inline void ceph_del_rw_context(struct ceph_file_info *cf,
+ struct ceph_rw_context *ctx)
+{
+ spin_lock(&cf->rw_contexts_lock);
+ list_del(&ctx->list);
+ spin_unlock(&cf->rw_contexts_lock);
+}
+
+static inline struct ceph_rw_context*
+ceph_find_rw_context(struct ceph_file_info *cf)
+{
+ struct ceph_rw_context *ctx, *found = NULL;
+ spin_lock(&cf->rw_contexts_lock);
+ list_for_each_entry(ctx, &cf->rw_contexts, list) {
+ if (ctx->thread == current) {
+ found = ctx;
+ break;
+ }
+ }
+ spin_unlock(&cf->rw_contexts_lock);
+ return found;
+}
+
+struct ceph_readdir_cache_control {
+ struct page *page;
+ struct dentry **dentries;
+ int index;
+};
+
+/*
+ * A "snap realm" describes a subset of the file hierarchy sharing
+ * the same set of snapshots that apply to it. The realms themselves
+ * are organized into a hierarchy, such that children inherit (some of)
+ * the snapshots of their parents.
+ *
+ * All inodes within the realm that have capabilities are linked into a
+ * per-realm list.
+ */
+struct ceph_snap_realm {
+ u64 ino;
+ struct inode *inode;
+ atomic_t nref;
+ struct rb_node node;
+
+ u64 created, seq;
+ u64 parent_ino;
+ u64 parent_since; /* snapid when our current parent became so */
+
+ u64 *prior_parent_snaps; /* snaps inherited from any parents we */
+ u32 num_prior_parent_snaps; /* had prior to parent_since */
+ u64 *snaps; /* snaps specific to this realm */
+ u32 num_snaps;
+
+ struct ceph_snap_realm *parent;
+ struct list_head children; /* list of child realms */
+ struct list_head child_item;
+
+ struct list_head empty_item; /* if i have ref==0 */
+
+ struct list_head dirty_item; /* if realm needs new context */
+
+ /* the current set of snaps for this realm */
+ struct ceph_snap_context *cached_context;
+
+ struct list_head inodes_with_caps;
+ spinlock_t inodes_with_caps_lock;
+};
+
+static inline int default_congestion_kb(void)
+{
+ int congestion_kb;
+
+ /*
+ * Copied from NFS
+ *
+ * congestion size, scale with available memory.
+ *
+ * 64MB: 8192k
+ * 128MB: 11585k
+ * 256MB: 16384k
+ * 512MB: 23170k
+ * 1GB: 32768k
+ * 2GB: 46340k
+ * 4GB: 65536k
+ * 8GB: 92681k
+ * 16GB: 131072k
+ *
+ * This allows larger machines to have larger/more transfers.
+ * Limit the default to 256M
+ */
+ congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
+ if (congestion_kb > 256*1024)
+ congestion_kb = 256*1024;
+
+ return congestion_kb;
+}
+
+
+
+/* snap.c */
+struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
+ u64 ino);
+extern void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
+ struct ceph_snap_realm *realm);
+extern void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
+ struct ceph_snap_realm *realm);
+extern int ceph_update_snap_trace(struct ceph_mds_client *m,
+ void *p, void *e, bool deletion,
+ struct ceph_snap_realm **realm_ret);
+extern void ceph_handle_snap(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session,
+ struct ceph_msg *msg);
+extern void ceph_queue_cap_snap(struct ceph_inode_info *ci);
+extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
+ struct ceph_cap_snap *capsnap);
+extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc);
+
+/*
+ * a cap_snap is "pending" if it is still awaiting an in-progress
+ * sync write (that may/may not still update size, mtime, etc.).
+ */
+static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci)
+{
+ return !list_empty(&ci->i_cap_snaps) &&
+ list_last_entry(&ci->i_cap_snaps, struct ceph_cap_snap,
+ ci_item)->writing;
+}
+
+/* inode.c */
+extern const struct inode_operations ceph_file_iops;
+
+extern struct inode *ceph_alloc_inode(struct super_block *sb);
+extern void ceph_evict_inode(struct inode *inode);
+extern void ceph_destroy_inode(struct inode *inode);
+extern int ceph_drop_inode(struct inode *inode);
+
+extern struct inode *ceph_get_inode(struct super_block *sb,
+ struct ceph_vino vino);
+extern struct inode *ceph_get_snapdir(struct inode *parent);
+extern int ceph_fill_file_size(struct inode *inode, int issued,
+ u32 truncate_seq, u64 truncate_size, u64 size);
+extern void ceph_fill_file_time(struct inode *inode, int issued,
+ u64 time_warp_seq, struct timespec64 *ctime,
+ struct timespec64 *mtime,
+ struct timespec64 *atime);
+extern int ceph_fill_trace(struct super_block *sb,
+ struct ceph_mds_request *req);
+extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,
+ struct ceph_mds_session *session);
+
+extern int ceph_inode_holds_cap(struct inode *inode, int mask);
+
+extern bool ceph_inode_set_size(struct inode *inode, loff_t size);
+extern void __ceph_do_pending_vmtruncate(struct inode *inode);
+extern void ceph_queue_vmtruncate(struct inode *inode);
+
+extern void ceph_queue_invalidate(struct inode *inode);
+extern void ceph_queue_writeback(struct inode *inode);
+
+extern int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
+ int mask, bool force);
+static inline int ceph_do_getattr(struct inode *inode, int mask, bool force)
+{
+ return __ceph_do_getattr(inode, NULL, mask, force);
+}
+extern int ceph_permission(struct inode *inode, int mask);
+extern int __ceph_setattr(struct inode *inode, struct iattr *attr);
+extern int ceph_setattr(struct dentry *dentry, struct iattr *attr);
+extern int ceph_getattr(const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned int flags);
+
+/* xattr.c */
+int __ceph_setxattr(struct inode *, const char *, const void *, size_t, int);
+ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t);
+extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
+extern struct ceph_buffer *__ceph_build_xattrs_blob(struct ceph_inode_info *ci);
+extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci);
+extern void __init ceph_xattr_init(void);
+extern void ceph_xattr_exit(void);
+extern const struct xattr_handler *ceph_xattr_handlers[];
+
+#ifdef CONFIG_SECURITY
+extern bool ceph_security_xattr_deadlock(struct inode *in);
+extern bool ceph_security_xattr_wanted(struct inode *in);
+#else
+static inline bool ceph_security_xattr_deadlock(struct inode *in)
+{
+ return false;
+}
+static inline bool ceph_security_xattr_wanted(struct inode *in)
+{
+ return false;
+}
+#endif
+
+/* acl.c */
+struct ceph_acls_info {
+ void *default_acl;
+ void *acl;
+ struct ceph_pagelist *pagelist;
+};
+
+#ifdef CONFIG_CEPH_FS_POSIX_ACL
+
+struct posix_acl *ceph_get_acl(struct inode *, int);
+int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
+ struct ceph_acls_info *info);
+void ceph_init_inode_acls(struct inode *inode, struct ceph_acls_info *info);
+void ceph_release_acls_info(struct ceph_acls_info *info);
+
+static inline void ceph_forget_all_cached_acls(struct inode *inode)
+{
+ forget_all_cached_acls(inode);
+}
+
+#else
+
+#define ceph_get_acl NULL
+#define ceph_set_acl NULL
+
+static inline int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
+ struct ceph_acls_info *info)
+{
+ return 0;
+}
+static inline void ceph_init_inode_acls(struct inode *inode,
+ struct ceph_acls_info *info)
+{
+}
+static inline void ceph_release_acls_info(struct ceph_acls_info *info)
+{
+}
+static inline int ceph_acl_chmod(struct dentry *dentry, struct inode *inode)
+{
+ return 0;
+}
+
+static inline void ceph_forget_all_cached_acls(struct inode *inode)
+{
+}
+
+#endif
+
+/* caps.c */
+extern const char *ceph_cap_string(int c);
+extern void ceph_handle_caps(struct ceph_mds_session *session,
+ struct ceph_msg *msg);
+extern struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc,
+ struct ceph_cap_reservation *ctx);
+extern void ceph_add_cap(struct inode *inode,
+ struct ceph_mds_session *session, u64 cap_id,
+ int fmode, unsigned issued, unsigned wanted,
+ unsigned cap, unsigned seq, u64 realmino, int flags,
+ struct ceph_cap **new_cap);
+extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release);
+extern void ceph_put_cap(struct ceph_mds_client *mdsc,
+ struct ceph_cap *cap);
+extern int ceph_is_any_caps(struct inode *inode);
+
+extern void ceph_queue_caps_release(struct inode *inode);
+extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc);
+extern int ceph_fsync(struct file *file, loff_t start, loff_t end,
+ int datasync);
+extern void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session);
+extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session);
+extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci,
+ int mds);
+extern int ceph_get_cap_mds(struct inode *inode);
+extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps);
+extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had);
+extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
+ struct ceph_snap_context *snapc);
+extern void ceph_flush_snaps(struct ceph_inode_info *ci,
+ struct ceph_mds_session **psession);
+extern bool __ceph_should_report_size(struct ceph_inode_info *ci);
+extern void ceph_check_caps(struct ceph_inode_info *ci, int flags,
+ struct ceph_mds_session *session);
+extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc);
+extern void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc);
+extern int ceph_drop_caps_for_unlink(struct inode *inode);
+extern int ceph_encode_inode_release(void **p, struct inode *inode,
+ int mds, int drop, int unless, int force);
+extern int ceph_encode_dentry_release(void **p, struct dentry *dn,
+ struct inode *dir,
+ int mds, int drop, int unless);
+
+extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
+ loff_t endoff, int *got, struct page **pinned_page);
+extern int ceph_try_get_caps(struct ceph_inode_info *ci,
+ int need, int want, int *got);
+
+/* for counting open files by mode */
+extern void __ceph_get_fmode(struct ceph_inode_info *ci, int mode);
+extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode);
+
+/* addr.c */
+extern const struct address_space_operations ceph_aops;
+extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
+extern int ceph_uninline_data(struct file *filp, struct page *locked_page);
+extern int ceph_pool_perm_check(struct ceph_inode_info *ci, int need);
+extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc);
+
+/* file.c */
+extern const struct file_operations ceph_file_fops;
+
+extern int ceph_renew_caps(struct inode *inode);
+extern int ceph_open(struct inode *inode, struct file *file);
+extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
+ struct file *file, unsigned flags, umode_t mode);
+extern int ceph_release(struct inode *inode, struct file *filp);
+extern void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
+ char *data, size_t len);
+
+/* dir.c */
+extern const struct file_operations ceph_dir_fops;
+extern const struct file_operations ceph_snapdir_fops;
+extern const struct inode_operations ceph_dir_iops;
+extern const struct inode_operations ceph_snapdir_iops;
+extern const struct dentry_operations ceph_dentry_ops;
+
+extern loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order);
+extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry);
+extern int ceph_handle_snapdir(struct ceph_mds_request *req,
+ struct dentry *dentry, int err);
+extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
+ struct dentry *dentry, int err);
+
+extern void ceph_dentry_lru_add(struct dentry *dn);
+extern void ceph_dentry_lru_touch(struct dentry *dn);
+extern void ceph_dentry_lru_del(struct dentry *dn);
+extern void ceph_invalidate_dentry_lease(struct dentry *dentry);
+extern unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn);
+extern void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl);
+
+/* ioctl.c */
+extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+
+/* export.c */
+extern const struct export_operations ceph_export_ops;
+
+/* locks.c */
+extern __init void ceph_flock_init(void);
+extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl);
+extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl);
+extern void ceph_count_locks(struct inode *inode, int *p_num, int *f_num);
+extern int ceph_encode_locks_to_buffer(struct inode *inode,
+ struct ceph_filelock *flocks,
+ int num_fcntl_locks,
+ int num_flock_locks);
+extern int ceph_locks_to_pagelist(struct ceph_filelock *flocks,
+ struct ceph_pagelist *pagelist,
+ int num_fcntl_locks, int num_flock_locks);
+
+/* debugfs.c */
+extern int ceph_fs_debugfs_init(struct ceph_fs_client *client);
+extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client);
+
+/* quota.c */
+static inline bool __ceph_has_any_quota(struct ceph_inode_info *ci)
+{
+ return ci->i_max_files || ci->i_max_bytes;
+}
+
+extern void ceph_adjust_quota_realms_count(struct inode *inode, bool inc);
+
+static inline void __ceph_update_quota(struct ceph_inode_info *ci,
+ u64 max_bytes, u64 max_files)
+{
+ bool had_quota, has_quota;
+ had_quota = __ceph_has_any_quota(ci);
+ ci->i_max_bytes = max_bytes;
+ ci->i_max_files = max_files;
+ has_quota = __ceph_has_any_quota(ci);
+
+ if (had_quota != has_quota)
+ ceph_adjust_quota_realms_count(&ci->vfs_inode, has_quota);
+}
+
+extern void ceph_handle_quota(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session,
+ struct ceph_msg *msg);
+extern bool ceph_quota_is_max_files_exceeded(struct inode *inode);
+extern bool ceph_quota_is_same_realm(struct inode *old, struct inode *new);
+extern bool ceph_quota_is_max_bytes_exceeded(struct inode *inode,
+ loff_t newlen);
+extern bool ceph_quota_is_max_bytes_approaching(struct inode *inode,
+ loff_t newlen);
+extern bool ceph_quota_update_statfs(struct ceph_fs_client *fsc,
+ struct kstatfs *buf);
+
+#endif /* _FS_CEPH_SUPER_H */
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
new file mode 100644
index 000000000..6fa9a7846
--- /dev/null
+++ b/fs/ceph/xattr.c
@@ -0,0 +1,1209 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ceph/ceph_debug.h>
+#include <linux/ceph/pagelist.h>
+
+#include "super.h"
+#include "mds_client.h"
+
+#include <linux/ceph/decode.h>
+
+#include <linux/xattr.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/slab.h>
+
+#define XATTR_CEPH_PREFIX "ceph."
+#define XATTR_CEPH_PREFIX_LEN (sizeof (XATTR_CEPH_PREFIX) - 1)
+
+static int __remove_xattr(struct ceph_inode_info *ci,
+ struct ceph_inode_xattr *xattr);
+
+static const struct xattr_handler ceph_other_xattr_handler;
+
+/*
+ * List of handlers for synthetic system.* attributes. Other
+ * attributes are handled directly.
+ */
+const struct xattr_handler *ceph_xattr_handlers[] = {
+#ifdef CONFIG_CEPH_FS_POSIX_ACL
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
+#endif
+ &ceph_other_xattr_handler,
+ NULL,
+};
+
+static bool ceph_is_valid_xattr(const char *name)
+{
+ return !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) ||
+ !strncmp(name, XATTR_SECURITY_PREFIX,
+ XATTR_SECURITY_PREFIX_LEN) ||
+ !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
+ !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
+}
+
+/*
+ * These define virtual xattrs exposing the recursive directory
+ * statistics and layout metadata.
+ */
+struct ceph_vxattr {
+ char *name;
+ size_t name_size; /* strlen(name) + 1 (for '\0') */
+ size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val,
+ size_t size);
+ bool (*exists_cb)(struct ceph_inode_info *ci);
+ unsigned int flags;
+};
+
+#define VXATTR_FLAG_READONLY (1<<0)
+#define VXATTR_FLAG_HIDDEN (1<<1)
+#define VXATTR_FLAG_RSTAT (1<<2)
+
+/* layouts */
+
+static bool ceph_vxattrcb_layout_exists(struct ceph_inode_info *ci)
+{
+ struct ceph_file_layout *fl = &ci->i_layout;
+ return (fl->stripe_unit > 0 || fl->stripe_count > 0 ||
+ fl->object_size > 0 || fl->pool_id >= 0 ||
+ rcu_dereference_raw(fl->pool_ns) != NULL);
+}
+
+static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
+ size_t size)
+{
+ struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
+ struct ceph_osd_client *osdc = &fsc->client->osdc;
+ struct ceph_string *pool_ns;
+ s64 pool = ci->i_layout.pool_id;
+ const char *pool_name;
+ const char *ns_field = " pool_namespace=";
+ char buf[128];
+ size_t len, total_len = 0;
+ ssize_t ret;
+
+ pool_ns = ceph_try_get_string(ci->i_layout.pool_ns);
+
+ dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode);
+ down_read(&osdc->lock);
+ pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
+ if (pool_name) {
+ len = snprintf(buf, sizeof(buf),
+ "stripe_unit=%u stripe_count=%u object_size=%u pool=",
+ ci->i_layout.stripe_unit, ci->i_layout.stripe_count,
+ ci->i_layout.object_size);
+ total_len = len + strlen(pool_name);
+ } else {
+ len = snprintf(buf, sizeof(buf),
+ "stripe_unit=%u stripe_count=%u object_size=%u pool=%lld",
+ ci->i_layout.stripe_unit, ci->i_layout.stripe_count,
+ ci->i_layout.object_size, (unsigned long long)pool);
+ total_len = len;
+ }
+
+ if (pool_ns)
+ total_len += strlen(ns_field) + pool_ns->len;
+
+ ret = total_len;
+ if (size >= total_len) {
+ memcpy(val, buf, len);
+ ret = len;
+ if (pool_name) {
+ len = strlen(pool_name);
+ memcpy(val + ret, pool_name, len);
+ ret += len;
+ }
+ if (pool_ns) {
+ len = strlen(ns_field);
+ memcpy(val + ret, ns_field, len);
+ ret += len;
+ memcpy(val + ret, pool_ns->str, pool_ns->len);
+ ret += pool_ns->len;
+ }
+ }
+ up_read(&osdc->lock);
+ ceph_put_string(pool_ns);
+ return ret;
+}
+
+static size_t ceph_vxattrcb_layout_stripe_unit(struct ceph_inode_info *ci,
+ char *val, size_t size)
+{
+ return snprintf(val, size, "%u", ci->i_layout.stripe_unit);
+}
+
+static size_t ceph_vxattrcb_layout_stripe_count(struct ceph_inode_info *ci,
+ char *val, size_t size)
+{
+ return snprintf(val, size, "%u", ci->i_layout.stripe_count);
+}
+
+static size_t ceph_vxattrcb_layout_object_size(struct ceph_inode_info *ci,
+ char *val, size_t size)
+{
+ return snprintf(val, size, "%u", ci->i_layout.object_size);
+}
+
+static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci,
+ char *val, size_t size)
+{
+ int ret;
+ struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
+ struct ceph_osd_client *osdc = &fsc->client->osdc;
+ s64 pool = ci->i_layout.pool_id;
+ const char *pool_name;
+
+ down_read(&osdc->lock);
+ pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
+ if (pool_name)
+ ret = snprintf(val, size, "%s", pool_name);
+ else
+ ret = snprintf(val, size, "%lld", (unsigned long long)pool);
+ up_read(&osdc->lock);
+ return ret;
+}
+
+static size_t ceph_vxattrcb_layout_pool_namespace(struct ceph_inode_info *ci,
+ char *val, size_t size)
+{
+ int ret = 0;
+ struct ceph_string *ns = ceph_try_get_string(ci->i_layout.pool_ns);
+ if (ns) {
+ ret = snprintf(val, size, "%.*s", (int)ns->len, ns->str);
+ ceph_put_string(ns);
+ }
+ return ret;
+}
+
+/* directories */
+
+static size_t ceph_vxattrcb_dir_entries(struct ceph_inode_info *ci, char *val,
+ size_t size)
+{
+ return snprintf(val, size, "%lld", ci->i_files + ci->i_subdirs);
+}
+
+static size_t ceph_vxattrcb_dir_files(struct ceph_inode_info *ci, char *val,
+ size_t size)
+{
+ return snprintf(val, size, "%lld", ci->i_files);
+}
+
+static size_t ceph_vxattrcb_dir_subdirs(struct ceph_inode_info *ci, char *val,
+ size_t size)
+{
+ return snprintf(val, size, "%lld", ci->i_subdirs);
+}
+
+static size_t ceph_vxattrcb_dir_rentries(struct ceph_inode_info *ci, char *val,
+ size_t size)
+{
+ return snprintf(val, size, "%lld", ci->i_rfiles + ci->i_rsubdirs);
+}
+
+static size_t ceph_vxattrcb_dir_rfiles(struct ceph_inode_info *ci, char *val,
+ size_t size)
+{
+ return snprintf(val, size, "%lld", ci->i_rfiles);
+}
+
+static size_t ceph_vxattrcb_dir_rsubdirs(struct ceph_inode_info *ci, char *val,
+ size_t size)
+{
+ return snprintf(val, size, "%lld", ci->i_rsubdirs);
+}
+
+static size_t ceph_vxattrcb_dir_rbytes(struct ceph_inode_info *ci, char *val,
+ size_t size)
+{
+ return snprintf(val, size, "%lld", ci->i_rbytes);
+}
+
+static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val,
+ size_t size)
+{
+ return snprintf(val, size, "%lld.%09ld", ci->i_rctime.tv_sec,
+ ci->i_rctime.tv_nsec);
+}
+
+/* quotas */
+
+static bool ceph_vxattrcb_quota_exists(struct ceph_inode_info *ci)
+{
+ bool ret = false;
+ spin_lock(&ci->i_ceph_lock);
+ if ((ci->i_max_files || ci->i_max_bytes) &&
+ ci->i_vino.snap == CEPH_NOSNAP &&
+ ci->i_snap_realm &&
+ ci->i_snap_realm->ino == ci->i_vino.ino)
+ ret = true;
+ spin_unlock(&ci->i_ceph_lock);
+ return ret;
+}
+
+static size_t ceph_vxattrcb_quota(struct ceph_inode_info *ci, char *val,
+ size_t size)
+{
+ return snprintf(val, size, "max_bytes=%llu max_files=%llu",
+ ci->i_max_bytes, ci->i_max_files);
+}
+
+static size_t ceph_vxattrcb_quota_max_bytes(struct ceph_inode_info *ci,
+ char *val, size_t size)
+{
+ return snprintf(val, size, "%llu", ci->i_max_bytes);
+}
+
+static size_t ceph_vxattrcb_quota_max_files(struct ceph_inode_info *ci,
+ char *val, size_t size)
+{
+ return snprintf(val, size, "%llu", ci->i_max_files);
+}
+
+#define CEPH_XATTR_NAME(_type, _name) XATTR_CEPH_PREFIX #_type "." #_name
+#define CEPH_XATTR_NAME2(_type, _name, _name2) \
+ XATTR_CEPH_PREFIX #_type "." #_name "." #_name2
+
+#define XATTR_NAME_CEPH(_type, _name, _flags) \
+ { \
+ .name = CEPH_XATTR_NAME(_type, _name), \
+ .name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \
+ .getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \
+ .exists_cb = NULL, \
+ .flags = (VXATTR_FLAG_READONLY | _flags), \
+ }
+#define XATTR_RSTAT_FIELD(_type, _name) \
+ XATTR_NAME_CEPH(_type, _name, VXATTR_FLAG_RSTAT)
+#define XATTR_RSTAT_FIELD_UPDATABLE(_type, _name) \
+ { \
+ .name = CEPH_XATTR_NAME(_type, _name), \
+ .name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \
+ .getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \
+ .exists_cb = NULL, \
+ .flags = VXATTR_FLAG_RSTAT, \
+ }
+#define XATTR_LAYOUT_FIELD(_type, _name, _field) \
+ { \
+ .name = CEPH_XATTR_NAME2(_type, _name, _field), \
+ .name_size = sizeof (CEPH_XATTR_NAME2(_type, _name, _field)), \
+ .getxattr_cb = ceph_vxattrcb_ ## _name ## _ ## _field, \
+ .exists_cb = ceph_vxattrcb_layout_exists, \
+ .flags = VXATTR_FLAG_HIDDEN, \
+ }
+#define XATTR_QUOTA_FIELD(_type, _name) \
+ { \
+ .name = CEPH_XATTR_NAME(_type, _name), \
+ .name_size = sizeof(CEPH_XATTR_NAME(_type, _name)), \
+ .getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \
+ .exists_cb = ceph_vxattrcb_quota_exists, \
+ .flags = VXATTR_FLAG_HIDDEN, \
+ }
+
+static struct ceph_vxattr ceph_dir_vxattrs[] = {
+ {
+ .name = "ceph.dir.layout",
+ .name_size = sizeof("ceph.dir.layout"),
+ .getxattr_cb = ceph_vxattrcb_layout,
+ .exists_cb = ceph_vxattrcb_layout_exists,
+ .flags = VXATTR_FLAG_HIDDEN,
+ },
+ XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
+ XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
+ XATTR_LAYOUT_FIELD(dir, layout, object_size),
+ XATTR_LAYOUT_FIELD(dir, layout, pool),
+ XATTR_LAYOUT_FIELD(dir, layout, pool_namespace),
+ XATTR_NAME_CEPH(dir, entries, 0),
+ XATTR_NAME_CEPH(dir, files, 0),
+ XATTR_NAME_CEPH(dir, subdirs, 0),
+ XATTR_RSTAT_FIELD(dir, rentries),
+ XATTR_RSTAT_FIELD(dir, rfiles),
+ XATTR_RSTAT_FIELD(dir, rsubdirs),
+ XATTR_RSTAT_FIELD(dir, rbytes),
+ XATTR_RSTAT_FIELD_UPDATABLE(dir, rctime),
+ {
+ .name = "ceph.quota",
+ .name_size = sizeof("ceph.quota"),
+ .getxattr_cb = ceph_vxattrcb_quota,
+ .exists_cb = ceph_vxattrcb_quota_exists,
+ .flags = VXATTR_FLAG_HIDDEN,
+ },
+ XATTR_QUOTA_FIELD(quota, max_bytes),
+ XATTR_QUOTA_FIELD(quota, max_files),
+ { .name = NULL, 0 } /* Required table terminator */
+};
+static size_t ceph_dir_vxattrs_name_size; /* total size of all names */
+
+/* files */
+
+static struct ceph_vxattr ceph_file_vxattrs[] = {
+ {
+ .name = "ceph.file.layout",
+ .name_size = sizeof("ceph.file.layout"),
+ .getxattr_cb = ceph_vxattrcb_layout,
+ .exists_cb = ceph_vxattrcb_layout_exists,
+ .flags = VXATTR_FLAG_HIDDEN,
+ },
+ XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
+ XATTR_LAYOUT_FIELD(file, layout, stripe_count),
+ XATTR_LAYOUT_FIELD(file, layout, object_size),
+ XATTR_LAYOUT_FIELD(file, layout, pool),
+ XATTR_LAYOUT_FIELD(file, layout, pool_namespace),
+ { .name = NULL, 0 } /* Required table terminator */
+};
+static size_t ceph_file_vxattrs_name_size; /* total size of all names */
+
+static struct ceph_vxattr *ceph_inode_vxattrs(struct inode *inode)
+{
+ if (S_ISDIR(inode->i_mode))
+ return ceph_dir_vxattrs;
+ else if (S_ISREG(inode->i_mode))
+ return ceph_file_vxattrs;
+ return NULL;
+}
+
+static size_t ceph_vxattrs_name_size(struct ceph_vxattr *vxattrs)
+{
+ if (vxattrs == ceph_dir_vxattrs)
+ return ceph_dir_vxattrs_name_size;
+ if (vxattrs == ceph_file_vxattrs)
+ return ceph_file_vxattrs_name_size;
+ BUG_ON(vxattrs);
+ return 0;
+}
+
+/*
+ * Compute the aggregate size (including terminating '\0') of all
+ * virtual extended attribute names in the given vxattr table.
+ */
+static size_t __init vxattrs_name_size(struct ceph_vxattr *vxattrs)
+{
+ struct ceph_vxattr *vxattr;
+ size_t size = 0;
+
+ for (vxattr = vxattrs; vxattr->name; vxattr++) {
+ if (!(vxattr->flags & VXATTR_FLAG_HIDDEN))
+ size += vxattr->name_size;
+ }
+
+ return size;
+}
+
+/* Routines called at initialization and exit time */
+
+void __init ceph_xattr_init(void)
+{
+ ceph_dir_vxattrs_name_size = vxattrs_name_size(ceph_dir_vxattrs);
+ ceph_file_vxattrs_name_size = vxattrs_name_size(ceph_file_vxattrs);
+}
+
+void ceph_xattr_exit(void)
+{
+ ceph_dir_vxattrs_name_size = 0;
+ ceph_file_vxattrs_name_size = 0;
+}
+
+static struct ceph_vxattr *ceph_match_vxattr(struct inode *inode,
+ const char *name)
+{
+ struct ceph_vxattr *vxattr = ceph_inode_vxattrs(inode);
+
+ if (vxattr) {
+ while (vxattr->name) {
+ if (!strcmp(vxattr->name, name))
+ return vxattr;
+ vxattr++;
+ }
+ }
+
+ return NULL;
+}
+
+static int __set_xattr(struct ceph_inode_info *ci,
+ const char *name, int name_len,
+ const char *val, int val_len,
+ int flags, int update_xattr,
+ struct ceph_inode_xattr **newxattr)
+{
+ struct rb_node **p;
+ struct rb_node *parent = NULL;
+ struct ceph_inode_xattr *xattr = NULL;
+ int c;
+ int new = 0;
+
+ p = &ci->i_xattrs.index.rb_node;
+ while (*p) {
+ parent = *p;
+ xattr = rb_entry(parent, struct ceph_inode_xattr, node);
+ c = strncmp(name, xattr->name, min(name_len, xattr->name_len));
+ if (c < 0)
+ p = &(*p)->rb_left;
+ else if (c > 0)
+ p = &(*p)->rb_right;
+ else {
+ if (name_len == xattr->name_len)
+ break;
+ else if (name_len < xattr->name_len)
+ p = &(*p)->rb_left;
+ else
+ p = &(*p)->rb_right;
+ }
+ xattr = NULL;
+ }
+
+ if (update_xattr) {
+ int err = 0;
+
+ if (xattr && (flags & XATTR_CREATE))
+ err = -EEXIST;
+ else if (!xattr && (flags & XATTR_REPLACE))
+ err = -ENODATA;
+ if (err) {
+ kfree(name);
+ kfree(val);
+ kfree(*newxattr);
+ return err;
+ }
+ if (update_xattr < 0) {
+ if (xattr)
+ __remove_xattr(ci, xattr);
+ kfree(name);
+ kfree(*newxattr);
+ return 0;
+ }
+ }
+
+ if (!xattr) {
+ new = 1;
+ xattr = *newxattr;
+ xattr->name = name;
+ xattr->name_len = name_len;
+ xattr->should_free_name = update_xattr;
+
+ ci->i_xattrs.count++;
+ dout("__set_xattr count=%d\n", ci->i_xattrs.count);
+ } else {
+ kfree(*newxattr);
+ *newxattr = NULL;
+ if (xattr->should_free_val)
+ kfree((void *)xattr->val);
+
+ if (update_xattr) {
+ kfree((void *)name);
+ name = xattr->name;
+ }
+ ci->i_xattrs.names_size -= xattr->name_len;
+ ci->i_xattrs.vals_size -= xattr->val_len;
+ }
+ ci->i_xattrs.names_size += name_len;
+ ci->i_xattrs.vals_size += val_len;
+ if (val)
+ xattr->val = val;
+ else
+ xattr->val = "";
+
+ xattr->val_len = val_len;
+ xattr->dirty = update_xattr;
+ xattr->should_free_val = (val && update_xattr);
+
+ if (new) {
+ rb_link_node(&xattr->node, parent, p);
+ rb_insert_color(&xattr->node, &ci->i_xattrs.index);
+ dout("__set_xattr_val p=%p\n", p);
+ }
+
+ dout("__set_xattr_val added %llx.%llx xattr %p %s=%.*s\n",
+ ceph_vinop(&ci->vfs_inode), xattr, name, val_len, val);
+
+ return 0;
+}
+
+static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci,
+ const char *name)
+{
+ struct rb_node **p;
+ struct rb_node *parent = NULL;
+ struct ceph_inode_xattr *xattr = NULL;
+ int name_len = strlen(name);
+ int c;
+
+ p = &ci->i_xattrs.index.rb_node;
+ while (*p) {
+ parent = *p;
+ xattr = rb_entry(parent, struct ceph_inode_xattr, node);
+ c = strncmp(name, xattr->name, xattr->name_len);
+ if (c == 0 && name_len > xattr->name_len)
+ c = 1;
+ if (c < 0)
+ p = &(*p)->rb_left;
+ else if (c > 0)
+ p = &(*p)->rb_right;
+ else {
+ dout("__get_xattr %s: found %.*s\n", name,
+ xattr->val_len, xattr->val);
+ return xattr;
+ }
+ }
+
+ dout("__get_xattr %s: not found\n", name);
+
+ return NULL;
+}
+
+static void __free_xattr(struct ceph_inode_xattr *xattr)
+{
+ BUG_ON(!xattr);
+
+ if (xattr->should_free_name)
+ kfree((void *)xattr->name);
+ if (xattr->should_free_val)
+ kfree((void *)xattr->val);
+
+ kfree(xattr);
+}
+
+static int __remove_xattr(struct ceph_inode_info *ci,
+ struct ceph_inode_xattr *xattr)
+{
+ if (!xattr)
+ return -ENODATA;
+
+ rb_erase(&xattr->node, &ci->i_xattrs.index);
+
+ if (xattr->should_free_name)
+ kfree((void *)xattr->name);
+ if (xattr->should_free_val)
+ kfree((void *)xattr->val);
+
+ ci->i_xattrs.names_size -= xattr->name_len;
+ ci->i_xattrs.vals_size -= xattr->val_len;
+ ci->i_xattrs.count--;
+ kfree(xattr);
+
+ return 0;
+}
+
+static char *__copy_xattr_names(struct ceph_inode_info *ci,
+ char *dest)
+{
+ struct rb_node *p;
+ struct ceph_inode_xattr *xattr = NULL;
+
+ p = rb_first(&ci->i_xattrs.index);
+ dout("__copy_xattr_names count=%d\n", ci->i_xattrs.count);
+
+ while (p) {
+ xattr = rb_entry(p, struct ceph_inode_xattr, node);
+ memcpy(dest, xattr->name, xattr->name_len);
+ dest[xattr->name_len] = '\0';
+
+ dout("dest=%s %p (%s) (%d/%d)\n", dest, xattr, xattr->name,
+ xattr->name_len, ci->i_xattrs.names_size);
+
+ dest += xattr->name_len + 1;
+ p = rb_next(p);
+ }
+
+ return dest;
+}
+
+void __ceph_destroy_xattrs(struct ceph_inode_info *ci)
+{
+ struct rb_node *p, *tmp;
+ struct ceph_inode_xattr *xattr = NULL;
+
+ p = rb_first(&ci->i_xattrs.index);
+
+ dout("__ceph_destroy_xattrs p=%p\n", p);
+
+ while (p) {
+ xattr = rb_entry(p, struct ceph_inode_xattr, node);
+ tmp = p;
+ p = rb_next(tmp);
+ dout("__ceph_destroy_xattrs next p=%p (%.*s)\n", p,
+ xattr->name_len, xattr->name);
+ rb_erase(tmp, &ci->i_xattrs.index);
+
+ __free_xattr(xattr);
+ }
+
+ ci->i_xattrs.names_size = 0;
+ ci->i_xattrs.vals_size = 0;
+ ci->i_xattrs.index_version = 0;
+ ci->i_xattrs.count = 0;
+ ci->i_xattrs.index = RB_ROOT;
+}
+
+static int __build_xattrs(struct inode *inode)
+ __releases(ci->i_ceph_lock)
+ __acquires(ci->i_ceph_lock)
+{
+ u32 namelen;
+ u32 numattr = 0;
+ void *p, *end;
+ u32 len;
+ const char *name, *val;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int xattr_version;
+ struct ceph_inode_xattr **xattrs = NULL;
+ int err = 0;
+ int i;
+
+ dout("__build_xattrs() len=%d\n",
+ ci->i_xattrs.blob ? (int)ci->i_xattrs.blob->vec.iov_len : 0);
+
+ if (ci->i_xattrs.index_version >= ci->i_xattrs.version)
+ return 0; /* already built */
+
+ __ceph_destroy_xattrs(ci);
+
+start:
+ /* updated internal xattr rb tree */
+ if (ci->i_xattrs.blob && ci->i_xattrs.blob->vec.iov_len > 4) {
+ p = ci->i_xattrs.blob->vec.iov_base;
+ end = p + ci->i_xattrs.blob->vec.iov_len;
+ ceph_decode_32_safe(&p, end, numattr, bad);
+ xattr_version = ci->i_xattrs.version;
+ spin_unlock(&ci->i_ceph_lock);
+
+ xattrs = kcalloc(numattr, sizeof(struct ceph_inode_xattr *),
+ GFP_NOFS);
+ err = -ENOMEM;
+ if (!xattrs)
+ goto bad_lock;
+
+ for (i = 0; i < numattr; i++) {
+ xattrs[i] = kmalloc(sizeof(struct ceph_inode_xattr),
+ GFP_NOFS);
+ if (!xattrs[i])
+ goto bad_lock;
+ }
+
+ spin_lock(&ci->i_ceph_lock);
+ if (ci->i_xattrs.version != xattr_version) {
+ /* lost a race, retry */
+ for (i = 0; i < numattr; i++)
+ kfree(xattrs[i]);
+ kfree(xattrs);
+ xattrs = NULL;
+ goto start;
+ }
+ err = -EIO;
+ while (numattr--) {
+ ceph_decode_32_safe(&p, end, len, bad);
+ namelen = len;
+ name = p;
+ p += len;
+ ceph_decode_32_safe(&p, end, len, bad);
+ val = p;
+ p += len;
+
+ err = __set_xattr(ci, name, namelen, val, len,
+ 0, 0, &xattrs[numattr]);
+
+ if (err < 0)
+ goto bad;
+ }
+ kfree(xattrs);
+ }
+ ci->i_xattrs.index_version = ci->i_xattrs.version;
+ ci->i_xattrs.dirty = false;
+
+ return err;
+bad_lock:
+ spin_lock(&ci->i_ceph_lock);
+bad:
+ if (xattrs) {
+ for (i = 0; i < numattr; i++)
+ kfree(xattrs[i]);
+ kfree(xattrs);
+ }
+ ci->i_xattrs.names_size = 0;
+ return err;
+}
+
+static int __get_required_blob_size(struct ceph_inode_info *ci, int name_size,
+ int val_size)
+{
+ /*
+ * 4 bytes for the length, and additional 4 bytes per each xattr name,
+ * 4 bytes per each value
+ */
+ int size = 4 + ci->i_xattrs.count*(4 + 4) +
+ ci->i_xattrs.names_size +
+ ci->i_xattrs.vals_size;
+ dout("__get_required_blob_size c=%d names.size=%d vals.size=%d\n",
+ ci->i_xattrs.count, ci->i_xattrs.names_size,
+ ci->i_xattrs.vals_size);
+
+ if (name_size)
+ size += 4 + 4 + name_size + val_size;
+
+ return size;
+}
+
+/*
+ * If there are dirty xattrs, reencode xattrs into the prealloc_blob
+ * and swap into place. It returns the old i_xattrs.blob (or NULL) so
+ * that it can be freed by the caller as the i_ceph_lock is likely to be
+ * held.
+ */
+struct ceph_buffer *__ceph_build_xattrs_blob(struct ceph_inode_info *ci)
+{
+ struct rb_node *p;
+ struct ceph_inode_xattr *xattr = NULL;
+ struct ceph_buffer *old_blob = NULL;
+ void *dest;
+
+ dout("__build_xattrs_blob %p\n", &ci->vfs_inode);
+ if (ci->i_xattrs.dirty) {
+ int need = __get_required_blob_size(ci, 0, 0);
+
+ BUG_ON(need > ci->i_xattrs.prealloc_blob->alloc_len);
+
+ p = rb_first(&ci->i_xattrs.index);
+ dest = ci->i_xattrs.prealloc_blob->vec.iov_base;
+
+ ceph_encode_32(&dest, ci->i_xattrs.count);
+ while (p) {
+ xattr = rb_entry(p, struct ceph_inode_xattr, node);
+
+ ceph_encode_32(&dest, xattr->name_len);
+ memcpy(dest, xattr->name, xattr->name_len);
+ dest += xattr->name_len;
+ ceph_encode_32(&dest, xattr->val_len);
+ memcpy(dest, xattr->val, xattr->val_len);
+ dest += xattr->val_len;
+
+ p = rb_next(p);
+ }
+
+ /* adjust buffer len; it may be larger than we need */
+ ci->i_xattrs.prealloc_blob->vec.iov_len =
+ dest - ci->i_xattrs.prealloc_blob->vec.iov_base;
+
+ if (ci->i_xattrs.blob)
+ old_blob = ci->i_xattrs.blob;
+ ci->i_xattrs.blob = ci->i_xattrs.prealloc_blob;
+ ci->i_xattrs.prealloc_blob = NULL;
+ ci->i_xattrs.dirty = false;
+ ci->i_xattrs.version++;
+ }
+
+ return old_blob;
+}
+
+static inline int __get_request_mask(struct inode *in) {
+ struct ceph_mds_request *req = current->journal_info;
+ int mask = 0;
+ if (req && req->r_target_inode == in) {
+ if (req->r_op == CEPH_MDS_OP_LOOKUP ||
+ req->r_op == CEPH_MDS_OP_LOOKUPINO ||
+ req->r_op == CEPH_MDS_OP_LOOKUPPARENT ||
+ req->r_op == CEPH_MDS_OP_GETATTR) {
+ mask = le32_to_cpu(req->r_args.getattr.mask);
+ } else if (req->r_op == CEPH_MDS_OP_OPEN ||
+ req->r_op == CEPH_MDS_OP_CREATE) {
+ mask = le32_to_cpu(req->r_args.open.mask);
+ }
+ }
+ return mask;
+}
+
+ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
+ size_t size)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_inode_xattr *xattr;
+ struct ceph_vxattr *vxattr = NULL;
+ int req_mask;
+ int err;
+
+ /* let's see if a virtual xattr was requested */
+ vxattr = ceph_match_vxattr(inode, name);
+ if (vxattr) {
+ int mask = 0;
+ if (vxattr->flags & VXATTR_FLAG_RSTAT)
+ mask |= CEPH_STAT_RSTAT;
+ err = ceph_do_getattr(inode, mask, true);
+ if (err)
+ return err;
+ err = -ENODATA;
+ if (!(vxattr->exists_cb && !vxattr->exists_cb(ci))) {
+ err = vxattr->getxattr_cb(ci, value, size);
+ if (size && size < err)
+ err = -ERANGE;
+ }
+ return err;
+ }
+
+ req_mask = __get_request_mask(inode);
+
+ spin_lock(&ci->i_ceph_lock);
+ dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
+ ci->i_xattrs.version, ci->i_xattrs.index_version);
+
+ if (ci->i_xattrs.version == 0 ||
+ !((req_mask & CEPH_CAP_XATTR_SHARED) ||
+ __ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1))) {
+ spin_unlock(&ci->i_ceph_lock);
+
+ /* security module gets xattr while filling trace */
+ if (current->journal_info) {
+ pr_warn_ratelimited("sync getxattr %p "
+ "during filling trace\n", inode);
+ return -EBUSY;
+ }
+
+ /* get xattrs from mds (if we don't already have them) */
+ err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR, true);
+ if (err)
+ return err;
+ spin_lock(&ci->i_ceph_lock);
+ }
+
+ err = __build_xattrs(inode);
+ if (err < 0)
+ goto out;
+
+ err = -ENODATA; /* == ENOATTR */
+ xattr = __get_xattr(ci, name);
+ if (!xattr)
+ goto out;
+
+ err = -ERANGE;
+ if (size && size < xattr->val_len)
+ goto out;
+
+ err = xattr->val_len;
+ if (size == 0)
+ goto out;
+
+ memcpy(value, xattr->val, xattr->val_len);
+
+ if (current->journal_info &&
+ !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN))
+ ci->i_ceph_flags |= CEPH_I_SEC_INITED;
+out:
+ spin_unlock(&ci->i_ceph_lock);
+ return err;
+}
+
+ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
+{
+ struct inode *inode = d_inode(dentry);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_vxattr *vxattrs = ceph_inode_vxattrs(inode);
+ u32 vir_namelen = 0;
+ u32 namelen;
+ int err;
+ u32 len;
+ int i;
+
+ spin_lock(&ci->i_ceph_lock);
+ dout("listxattr %p ver=%lld index_ver=%lld\n", inode,
+ ci->i_xattrs.version, ci->i_xattrs.index_version);
+
+ if (ci->i_xattrs.version == 0 ||
+ !__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1)) {
+ spin_unlock(&ci->i_ceph_lock);
+ err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR, true);
+ if (err)
+ return err;
+ spin_lock(&ci->i_ceph_lock);
+ }
+
+ err = __build_xattrs(inode);
+ if (err < 0)
+ goto out;
+ /*
+ * Start with virtual dir xattr names (if any) (including
+ * terminating '\0' characters for each).
+ */
+ vir_namelen = ceph_vxattrs_name_size(vxattrs);
+
+ /* adding 1 byte per each variable due to the null termination */
+ namelen = ci->i_xattrs.names_size + ci->i_xattrs.count;
+ err = -ERANGE;
+ if (size && vir_namelen + namelen > size)
+ goto out;
+
+ err = namelen + vir_namelen;
+ if (size == 0)
+ goto out;
+
+ names = __copy_xattr_names(ci, names);
+
+ /* virtual xattr names, too */
+ err = namelen;
+ if (vxattrs) {
+ for (i = 0; vxattrs[i].name; i++) {
+ if (!(vxattrs[i].flags & VXATTR_FLAG_HIDDEN) &&
+ !(vxattrs[i].exists_cb &&
+ !vxattrs[i].exists_cb(ci))) {
+ len = sprintf(names, "%s", vxattrs[i].name);
+ names += len + 1;
+ err += len + 1;
+ }
+ }
+ }
+
+out:
+ spin_unlock(&ci->i_ceph_lock);
+ return err;
+}
+
+static int ceph_sync_setxattr(struct inode *inode, const char *name,
+ const char *value, size_t size, int flags)
+{
+ struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_mds_request *req;
+ struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_pagelist *pagelist = NULL;
+ int op = CEPH_MDS_OP_SETXATTR;
+ int err;
+
+ if (size > 0) {
+ /* copy value into pagelist */
+ pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
+ if (!pagelist)
+ return -ENOMEM;
+
+ ceph_pagelist_init(pagelist);
+ err = ceph_pagelist_append(pagelist, value, size);
+ if (err)
+ goto out;
+ } else if (!value) {
+ if (flags & CEPH_XATTR_REPLACE)
+ op = CEPH_MDS_OP_RMXATTR;
+ else
+ flags |= CEPH_XATTR_REMOVE;
+ }
+
+ dout("setxattr value=%.*s\n", (int)size, value);
+
+ /* do request */
+ req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto out;
+ }
+
+ req->r_path2 = kstrdup(name, GFP_NOFS);
+ if (!req->r_path2) {
+ ceph_mdsc_put_request(req);
+ err = -ENOMEM;
+ goto out;
+ }
+
+ if (op == CEPH_MDS_OP_SETXATTR) {
+ req->r_args.setxattr.flags = cpu_to_le32(flags);
+ req->r_pagelist = pagelist;
+ pagelist = NULL;
+ }
+
+ req->r_inode = inode;
+ ihold(inode);
+ req->r_num_caps = 1;
+ req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
+
+ dout("xattr.ver (before): %lld\n", ci->i_xattrs.version);
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
+ ceph_mdsc_put_request(req);
+ dout("xattr.ver (after): %lld\n", ci->i_xattrs.version);
+
+out:
+ if (pagelist)
+ ceph_pagelist_release(pagelist);
+ return err;
+}
+
+int __ceph_setxattr(struct inode *inode, const char *name,
+ const void *value, size_t size, int flags)
+{
+ struct ceph_vxattr *vxattr;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_cap_flush *prealloc_cf = NULL;
+ struct ceph_buffer *old_blob = NULL;
+ int issued;
+ int err;
+ int dirty = 0;
+ int name_len = strlen(name);
+ int val_len = size;
+ char *newname = NULL;
+ char *newval = NULL;
+ struct ceph_inode_xattr *xattr = NULL;
+ int required_blob_size;
+ bool check_realm = false;
+ bool lock_snap_rwsem = false;
+
+ if (ceph_snap(inode) != CEPH_NOSNAP)
+ return -EROFS;
+
+ vxattr = ceph_match_vxattr(inode, name);
+ if (vxattr) {
+ if (vxattr->flags & VXATTR_FLAG_READONLY)
+ return -EOPNOTSUPP;
+ if (value && !strncmp(vxattr->name, "ceph.quota", 10))
+ check_realm = true;
+ }
+
+ /* pass any unhandled ceph.* xattrs through to the MDS */
+ if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN))
+ goto do_sync_unlocked;
+
+ /* preallocate memory for xattr name, value, index node */
+ err = -ENOMEM;
+ newname = kmemdup(name, name_len + 1, GFP_NOFS);
+ if (!newname)
+ goto out;
+
+ if (val_len) {
+ newval = kmemdup(value, val_len, GFP_NOFS);
+ if (!newval)
+ goto out;
+ }
+
+ xattr = kmalloc(sizeof(struct ceph_inode_xattr), GFP_NOFS);
+ if (!xattr)
+ goto out;
+
+ prealloc_cf = ceph_alloc_cap_flush();
+ if (!prealloc_cf)
+ goto out;
+
+ spin_lock(&ci->i_ceph_lock);
+retry:
+ issued = __ceph_caps_issued(ci, NULL);
+ if (ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL))
+ goto do_sync;
+
+ if (!lock_snap_rwsem && !ci->i_head_snapc) {
+ lock_snap_rwsem = true;
+ if (!down_read_trylock(&mdsc->snap_rwsem)) {
+ spin_unlock(&ci->i_ceph_lock);
+ down_read(&mdsc->snap_rwsem);
+ spin_lock(&ci->i_ceph_lock);
+ goto retry;
+ }
+ }
+
+ dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued));
+ __build_xattrs(inode);
+
+ required_blob_size = __get_required_blob_size(ci, name_len, val_len);
+
+ if (!ci->i_xattrs.prealloc_blob ||
+ required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) {
+ struct ceph_buffer *blob;
+
+ spin_unlock(&ci->i_ceph_lock);
+ ceph_buffer_put(old_blob); /* Shouldn't be required */
+ dout(" pre-allocating new blob size=%d\n", required_blob_size);
+ blob = ceph_buffer_new(required_blob_size, GFP_NOFS);
+ if (!blob)
+ goto do_sync_unlocked;
+ spin_lock(&ci->i_ceph_lock);
+ /* prealloc_blob can't be released while holding i_ceph_lock */
+ if (ci->i_xattrs.prealloc_blob)
+ old_blob = ci->i_xattrs.prealloc_blob;
+ ci->i_xattrs.prealloc_blob = blob;
+ goto retry;
+ }
+
+ err = __set_xattr(ci, newname, name_len, newval, val_len,
+ flags, value ? 1 : -1, &xattr);
+
+ if (!err) {
+ dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL,
+ &prealloc_cf);
+ ci->i_xattrs.dirty = true;
+ inode->i_ctime = current_time(inode);
+ }
+
+ spin_unlock(&ci->i_ceph_lock);
+ ceph_buffer_put(old_blob);
+ if (lock_snap_rwsem)
+ up_read(&mdsc->snap_rwsem);
+ if (dirty)
+ __mark_inode_dirty(inode, dirty);
+ ceph_free_cap_flush(prealloc_cf);
+ return err;
+
+do_sync:
+ spin_unlock(&ci->i_ceph_lock);
+do_sync_unlocked:
+ if (lock_snap_rwsem)
+ up_read(&mdsc->snap_rwsem);
+
+ /* security module set xattr while filling trace */
+ if (current->journal_info) {
+ pr_warn_ratelimited("sync setxattr %p "
+ "during filling trace\n", inode);
+ err = -EBUSY;
+ } else {
+ err = ceph_sync_setxattr(inode, name, value, size, flags);
+ if (err >= 0 && check_realm) {
+ /* check if snaprealm was created for quota inode */
+ spin_lock(&ci->i_ceph_lock);
+ if ((ci->i_max_files || ci->i_max_bytes) &&
+ !(ci->i_snap_realm &&
+ ci->i_snap_realm->ino == ci->i_vino.ino))
+ err = -EOPNOTSUPP;
+ spin_unlock(&ci->i_ceph_lock);
+ }
+ }
+out:
+ ceph_free_cap_flush(prealloc_cf);
+ kfree(newname);
+ kfree(newval);
+ kfree(xattr);
+ return err;
+}
+
+static int ceph_get_xattr_handler(const struct xattr_handler *handler,
+ struct dentry *dentry, struct inode *inode,
+ const char *name, void *value, size_t size)
+{
+ if (!ceph_is_valid_xattr(name))
+ return -EOPNOTSUPP;
+ return __ceph_getxattr(inode, name, value, size);
+}
+
+static int ceph_set_xattr_handler(const struct xattr_handler *handler,
+ struct dentry *unused, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
+{
+ if (!ceph_is_valid_xattr(name))
+ return -EOPNOTSUPP;
+ return __ceph_setxattr(inode, name, value, size, flags);
+}
+
+static const struct xattr_handler ceph_other_xattr_handler = {
+ .prefix = "", /* match any name => handlers called with full name */
+ .get = ceph_get_xattr_handler,
+ .set = ceph_set_xattr_handler,
+};
+
+#ifdef CONFIG_SECURITY
+bool ceph_security_xattr_wanted(struct inode *in)
+{
+ return in->i_security != NULL;
+}
+
+bool ceph_security_xattr_deadlock(struct inode *in)
+{
+ struct ceph_inode_info *ci;
+ bool ret;
+ if (!in->i_security)
+ return false;
+ ci = ceph_inode(in);
+ spin_lock(&ci->i_ceph_lock);
+ ret = !(ci->i_ceph_flags & CEPH_I_SEC_INITED) &&
+ !(ci->i_xattrs.version > 0 &&
+ __ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0));
+ spin_unlock(&ci->i_ceph_lock);
+ return ret;
+}
+#endif