summaryrefslogtreecommitdiffstats
path: root/fs/netfs
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-18 18:50:03 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-18 18:50:03 +0000
commit01a69402cf9d38ff180345d55c2ee51c7e89fbc7 (patch)
treeb406c5242a088c4f59c6e4b719b783f43aca6ae9 /fs/netfs
parentAdding upstream version 6.7.12. (diff)
downloadlinux-01a69402cf9d38ff180345d55c2ee51c7e89fbc7.tar.xz
linux-01a69402cf9d38ff180345d55c2ee51c7e89fbc7.zip
Adding upstream version 6.8.9.upstream/6.8.9
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
-rw-r--r--fs/netfs/Kconfig39
-rw-r--r--fs/netfs/Makefile22
-rw-r--r--fs/netfs/buffered_read.c237
-rw-r--r--fs/netfs/buffered_write.c1258
-rw-r--r--fs/netfs/direct_read.c125
-rw-r--r--fs/netfs/direct_write.c174
-rw-r--r--fs/netfs/fscache_cache.c (renamed from fs/fscache/cache.c)0
-rw-r--r--fs/netfs/fscache_cookie.c (renamed from fs/fscache/cookie.c)0
-rw-r--r--fs/netfs/fscache_internal.h14
-rw-r--r--fs/netfs/fscache_io.c (renamed from fs/fscache/io.c)42
-rw-r--r--fs/netfs/fscache_main.c (renamed from fs/fscache/main.c)25
-rw-r--r--fs/netfs/fscache_proc.c (renamed from fs/fscache/proc.c)23
-rw-r--r--fs/netfs/fscache_stats.c (renamed from fs/fscache/stats.c)13
-rw-r--r--fs/netfs/fscache_volume.c (renamed from fs/fscache/volume.c)0
-rw-r--r--fs/netfs/internal.h284
-rw-r--r--fs/netfs/io.c217
-rw-r--r--fs/netfs/iterator.c97
-rw-r--r--fs/netfs/locking.c216
-rw-r--r--fs/netfs/main.c109
-rw-r--r--fs/netfs/misc.c260
-rw-r--r--fs/netfs/objects.c59
-rw-r--r--fs/netfs/output.c478
-rw-r--r--fs/netfs/stats.c42
23 files changed, 3561 insertions, 173 deletions
diff --git a/fs/netfs/Kconfig b/fs/netfs/Kconfig
index b4db21022c..bec805e0c4 100644
--- a/fs/netfs/Kconfig
+++ b/fs/netfs/Kconfig
@@ -21,3 +21,42 @@ config NETFS_STATS
multi-CPU system these may be on cachelines that keep bouncing
between CPUs. On the other hand, the stats are very useful for
debugging purposes. Saying 'Y' here is recommended.
+
+config FSCACHE
+ bool "General filesystem local caching manager"
+ depends on NETFS_SUPPORT
+ help
+ This option enables a generic filesystem caching manager that can be
+ used by various network and other filesystems to cache data locally.
+ Different sorts of caches can be plugged in, depending on the
+ resources available.
+
+ See Documentation/filesystems/caching/fscache.rst for more information.
+
+config FSCACHE_STATS
+ bool "Gather statistical information on local caching"
+ depends on FSCACHE && PROC_FS
+ select NETFS_STATS
+ help
+ This option causes statistical information to be gathered on local
+ caching and exported through file:
+
+ /proc/fs/fscache/stats
+
+ The gathering of statistics adds a certain amount of overhead to
+ execution as there are a quite a few stats gathered, and on a
+ multi-CPU system these may be on cachelines that keep bouncing
+ between CPUs. On the other hand, the stats are very useful for
+ debugging purposes. Saying 'Y' here is recommended.
+
+ See Documentation/filesystems/caching/fscache.rst for more information.
+
+config FSCACHE_DEBUG
+ bool "Debug FS-Cache"
+ depends on FSCACHE
+ help
+ This permits debugging to be dynamically enabled in the local caching
+ management module. If this is set, the debugging output may be
+ enabled by setting bits in /sys/modules/fscache/parameter/debug.
+
+ See Documentation/filesystems/caching/fscache.rst for more information.
diff --git a/fs/netfs/Makefile b/fs/netfs/Makefile
index 386d6fb927..d4d1d79981 100644
--- a/fs/netfs/Makefile
+++ b/fs/netfs/Makefile
@@ -2,11 +2,29 @@
netfs-y := \
buffered_read.o \
+ buffered_write.o \
+ direct_read.o \
+ direct_write.o \
io.o \
iterator.o \
+ locking.o \
main.o \
- objects.o
+ misc.o \
+ objects.o \
+ output.o
netfs-$(CONFIG_NETFS_STATS) += stats.o
-obj-$(CONFIG_NETFS_SUPPORT) := netfs.o
+netfs-$(CONFIG_FSCACHE) += \
+ fscache_cache.o \
+ fscache_cookie.o \
+ fscache_io.o \
+ fscache_main.o \
+ fscache_volume.o
+
+ifeq ($(CONFIG_PROC_FS),y)
+netfs-$(CONFIG_FSCACHE) += fscache_proc.o
+endif
+netfs-$(CONFIG_FSCACHE_STATS) += fscache_stats.o
+
+obj-$(CONFIG_NETFS_SUPPORT) += netfs.o
diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c
index 2cd3ccf4c4..3298c29b55 100644
--- a/fs/netfs/buffered_read.c
+++ b/fs/netfs/buffered_read.c
@@ -16,6 +16,7 @@
void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
{
struct netfs_io_subrequest *subreq;
+ struct netfs_folio *finfo;
struct folio *folio;
pgoff_t start_page = rreq->start / PAGE_SIZE;
pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1;
@@ -63,6 +64,7 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
break;
}
if (!folio_started && test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) {
+ trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache);
folio_start_fscache(folio);
folio_started = true;
}
@@ -86,11 +88,20 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
if (!pg_failed) {
flush_dcache_folio(folio);
+ finfo = netfs_folio_info(folio);
+ if (finfo) {
+ trace_netfs_folio(folio, netfs_folio_trace_filled_gaps);
+ if (finfo->netfs_group)
+ folio_change_private(folio, finfo->netfs_group);
+ else
+ folio_detach_private(folio);
+ kfree(finfo);
+ }
folio_mark_uptodate(folio);
}
if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) {
- if (folio_index(folio) == rreq->no_unlock_folio &&
+ if (folio->index == rreq->no_unlock_folio &&
test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags))
_debug("no unlock");
else
@@ -147,6 +158,15 @@ static void netfs_rreq_expand(struct netfs_io_request *rreq,
}
}
+/*
+ * Begin an operation, and fetch the stored zero point value from the cookie if
+ * available.
+ */
+static int netfs_begin_cache_read(struct netfs_io_request *rreq, struct netfs_inode *ctx)
+{
+ return fscache_begin_read_operation(&rreq->cache_resources, netfs_i_cookie(ctx));
+}
+
/**
* netfs_readahead - Helper to manage a read request
* @ractl: The description of the readahead request
@@ -180,11 +200,9 @@ void netfs_readahead(struct readahead_control *ractl)
if (IS_ERR(rreq))
return;
- if (ctx->ops->begin_cache_operation) {
- ret = ctx->ops->begin_cache_operation(rreq);
- if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
- goto cleanup_free;
- }
+ ret = netfs_begin_cache_read(rreq, ctx);
+ if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
+ goto cleanup_free;
netfs_stat(&netfs_n_rh_readahead);
trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl),
@@ -192,6 +210,10 @@ void netfs_readahead(struct readahead_control *ractl)
netfs_rreq_expand(rreq, ractl);
+ /* Set up the output buffer */
+ iov_iter_xarray(&rreq->iter, ITER_DEST, &ractl->mapping->i_pages,
+ rreq->start, rreq->len);
+
/* Drop the refs on the folios here rather than in the cache or
* filesystem. The locks will be dropped in netfs_rreq_unlock().
*/
@@ -199,6 +221,7 @@ void netfs_readahead(struct readahead_control *ractl)
;
netfs_begin_read(rreq, false);
+ netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
return;
cleanup_free:
@@ -223,12 +246,13 @@ EXPORT_SYMBOL(netfs_readahead);
*/
int netfs_read_folio(struct file *file, struct folio *folio)
{
- struct address_space *mapping = folio_file_mapping(folio);
+ struct address_space *mapping = folio->mapping;
struct netfs_io_request *rreq;
struct netfs_inode *ctx = netfs_inode(mapping->host);
+ struct folio *sink = NULL;
int ret;
- _enter("%lx", folio_index(folio));
+ _enter("%lx", folio->index);
rreq = netfs_alloc_request(mapping, file,
folio_file_pos(folio), folio_size(folio),
@@ -238,15 +262,64 @@ int netfs_read_folio(struct file *file, struct folio *folio)
goto alloc_error;
}
- if (ctx->ops->begin_cache_operation) {
- ret = ctx->ops->begin_cache_operation(rreq);
- if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
- goto discard;
- }
+ ret = netfs_begin_cache_read(rreq, ctx);
+ if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
+ goto discard;
netfs_stat(&netfs_n_rh_readpage);
trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_readpage);
- return netfs_begin_read(rreq, true);
+
+ /* Set up the output buffer */
+ if (folio_test_dirty(folio)) {
+ /* Handle someone trying to read from an unflushed streaming
+ * write. We fiddle the buffer so that a gap at the beginning
+ * and/or a gap at the end get copied to, but the middle is
+ * discarded.
+ */
+ struct netfs_folio *finfo = netfs_folio_info(folio);
+ struct bio_vec *bvec;
+ unsigned int from = finfo->dirty_offset;
+ unsigned int to = from + finfo->dirty_len;
+ unsigned int off = 0, i = 0;
+ size_t flen = folio_size(folio);
+ size_t nr_bvec = flen / PAGE_SIZE + 2;
+ size_t part;
+
+ ret = -ENOMEM;
+ bvec = kmalloc_array(nr_bvec, sizeof(*bvec), GFP_KERNEL);
+ if (!bvec)
+ goto discard;
+
+ sink = folio_alloc(GFP_KERNEL, 0);
+ if (!sink)
+ goto discard;
+
+ trace_netfs_folio(folio, netfs_folio_trace_read_gaps);
+
+ rreq->direct_bv = bvec;
+ rreq->direct_bv_count = nr_bvec;
+ if (from > 0) {
+ bvec_set_folio(&bvec[i++], folio, from, 0);
+ off = from;
+ }
+ while (off < to) {
+ part = min_t(size_t, to - off, PAGE_SIZE);
+ bvec_set_folio(&bvec[i++], sink, part, 0);
+ off += part;
+ }
+ if (to < flen)
+ bvec_set_folio(&bvec[i++], folio, flen - to, to);
+ iov_iter_bvec(&rreq->iter, ITER_DEST, bvec, i, rreq->len);
+ } else {
+ iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages,
+ rreq->start, rreq->len);
+ }
+
+ ret = netfs_begin_read(rreq, true);
+ if (sink)
+ folio_put(sink);
+ netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
+ return ret < 0 ? ret : 0;
discard:
netfs_put_request(rreq, false, netfs_rreq_trace_put_discard);
@@ -387,14 +460,12 @@ retry:
ret = PTR_ERR(rreq);
goto error;
}
- rreq->no_unlock_folio = folio_index(folio);
+ rreq->no_unlock_folio = folio->index;
__set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags);
- if (ctx->ops->begin_cache_operation) {
- ret = ctx->ops->begin_cache_operation(rreq);
- if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
- goto error_put;
- }
+ ret = netfs_begin_cache_read(rreq, ctx);
+ if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
+ goto error_put;
netfs_stat(&netfs_n_rh_write_begin);
trace_netfs_read(rreq, pos, len, netfs_read_trace_write_begin);
@@ -405,6 +476,10 @@ retry:
ractl._nr_pages = folio_nr_pages(folio);
netfs_rreq_expand(rreq, &ractl);
+ /* Set up the output buffer */
+ iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages,
+ rreq->start, rreq->len);
+
/* We hold the folio locks, so we can drop the references */
folio_get(folio);
while (readahead_folio(&ractl))
@@ -413,6 +488,7 @@ retry:
ret = netfs_begin_read(rreq, true);
if (ret < 0)
goto error;
+ netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
have_folio:
ret = folio_wait_fscache_killable(folio);
@@ -434,3 +510,124 @@ error:
return ret;
}
EXPORT_SYMBOL(netfs_write_begin);
+
+/*
+ * Preload the data into a page we're proposing to write into.
+ */
+int netfs_prefetch_for_write(struct file *file, struct folio *folio,
+ size_t offset, size_t len)
+{
+ struct netfs_io_request *rreq;
+ struct address_space *mapping = folio->mapping;
+ struct netfs_inode *ctx = netfs_inode(mapping->host);
+ unsigned long long start = folio_pos(folio);
+ size_t flen = folio_size(folio);
+ int ret;
+
+ _enter("%zx @%llx", flen, start);
+
+ ret = -ENOMEM;
+
+ rreq = netfs_alloc_request(mapping, file, start, flen,
+ NETFS_READ_FOR_WRITE);
+ if (IS_ERR(rreq)) {
+ ret = PTR_ERR(rreq);
+ goto error;
+ }
+
+ rreq->no_unlock_folio = folio->index;
+ __set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags);
+ ret = netfs_begin_cache_read(rreq, ctx);
+ if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
+ goto error_put;
+
+ netfs_stat(&netfs_n_rh_write_begin);
+ trace_netfs_read(rreq, start, flen, netfs_read_trace_prefetch_for_write);
+
+ /* Set up the output buffer */
+ iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages,
+ rreq->start, rreq->len);
+
+ ret = netfs_begin_read(rreq, true);
+ netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
+ return ret;
+
+error_put:
+ netfs_put_request(rreq, false, netfs_rreq_trace_put_discard);
+error:
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/**
+ * netfs_buffered_read_iter - Filesystem buffered I/O read routine
+ * @iocb: kernel I/O control block
+ * @iter: destination for the data read
+ *
+ * This is the ->read_iter() routine for all filesystems that can use the page
+ * cache directly.
+ *
+ * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall be
+ * returned when no data can be read without waiting for I/O requests to
+ * complete; it doesn't prevent readahead.
+ *
+ * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O requests
+ * shall be made for the read or for readahead. When no data can be read,
+ * -EAGAIN shall be returned. When readahead would be triggered, a partial,
+ * possibly empty read shall be returned.
+ *
+ * Return:
+ * * number of bytes copied, even for partial reads
+ * * negative error code (or 0 if IOCB_NOIO) if nothing was read
+ */
+ssize_t netfs_buffered_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ struct netfs_inode *ictx = netfs_inode(inode);
+ ssize_t ret;
+
+ if (WARN_ON_ONCE((iocb->ki_flags & IOCB_DIRECT) ||
+ test_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags)))
+ return -EINVAL;
+
+ ret = netfs_start_io_read(inode);
+ if (ret == 0) {
+ ret = filemap_read(iocb, iter, 0);
+ netfs_end_io_read(inode);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(netfs_buffered_read_iter);
+
+/**
+ * netfs_file_read_iter - Generic filesystem read routine
+ * @iocb: kernel I/O control block
+ * @iter: destination for the data read
+ *
+ * This is the ->read_iter() routine for all filesystems that can use the page
+ * cache directly.
+ *
+ * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall be
+ * returned when no data can be read without waiting for I/O requests to
+ * complete; it doesn't prevent readahead.
+ *
+ * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O requests
+ * shall be made for the read or for readahead. When no data can be read,
+ * -EAGAIN shall be returned. When readahead would be triggered, a partial,
+ * possibly empty read shall be returned.
+ *
+ * Return:
+ * * number of bytes copied, even for partial reads
+ * * negative error code (or 0 if IOCB_NOIO) if nothing was read
+ */
+ssize_t netfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct netfs_inode *ictx = netfs_inode(iocb->ki_filp->f_mapping->host);
+
+ if ((iocb->ki_flags & IOCB_DIRECT) ||
+ test_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags))
+ return netfs_unbuffered_read_iter(iocb, iter);
+
+ return netfs_buffered_read_iter(iocb, iter);
+}
+EXPORT_SYMBOL(netfs_file_read_iter);
diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c
new file mode 100644
index 0000000000..267b622d92
--- /dev/null
+++ b/fs/netfs/buffered_write.c
@@ -0,0 +1,1258 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Network filesystem high-level write support.
+ *
+ * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/export.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/pagevec.h>
+#include "internal.h"
+
+/*
+ * Determined write method. Adjust netfs_folio_traces if this is changed.
+ */
+enum netfs_how_to_modify {
+ NETFS_FOLIO_IS_UPTODATE, /* Folio is uptodate already */
+ NETFS_JUST_PREFETCH, /* We have to read the folio anyway */
+ NETFS_WHOLE_FOLIO_MODIFY, /* We're going to overwrite the whole folio */
+ NETFS_MODIFY_AND_CLEAR, /* We can assume there is no data to be downloaded. */
+ NETFS_STREAMING_WRITE, /* Store incomplete data in non-uptodate page. */
+ NETFS_STREAMING_WRITE_CONT, /* Continue streaming write. */
+ NETFS_FLUSH_CONTENT, /* Flush incompatible content. */
+};
+
+static void netfs_cleanup_buffered_write(struct netfs_io_request *wreq);
+
+static void netfs_set_group(struct folio *folio, struct netfs_group *netfs_group)
+{
+ if (netfs_group && !folio_get_private(folio))
+ folio_attach_private(folio, netfs_get_group(netfs_group));
+}
+
+#if IS_ENABLED(CONFIG_FSCACHE)
+static void netfs_folio_start_fscache(bool caching, struct folio *folio)
+{
+ if (caching)
+ folio_start_fscache(folio);
+}
+#else
+static void netfs_folio_start_fscache(bool caching, struct folio *folio)
+{
+}
+#endif
+
+/*
+ * Decide how we should modify a folio. We might be attempting to do
+ * write-streaming, in which case we don't want to a local RMW cycle if we can
+ * avoid it. If we're doing local caching or content crypto, we award that
+ * priority over avoiding RMW. If the file is open readably, then we also
+ * assume that we may want to read what we wrote.
+ */
+static enum netfs_how_to_modify netfs_how_to_modify(struct netfs_inode *ctx,
+ struct file *file,
+ struct folio *folio,
+ void *netfs_group,
+ size_t flen,
+ size_t offset,
+ size_t len,
+ bool maybe_trouble)
+{
+ struct netfs_folio *finfo = netfs_folio_info(folio);
+ loff_t pos = folio_file_pos(folio);
+
+ _enter("");
+
+ if (netfs_folio_group(folio) != netfs_group)
+ return NETFS_FLUSH_CONTENT;
+
+ if (folio_test_uptodate(folio))
+ return NETFS_FOLIO_IS_UPTODATE;
+
+ if (pos >= ctx->zero_point)
+ return NETFS_MODIFY_AND_CLEAR;
+
+ if (!maybe_trouble && offset == 0 && len >= flen)
+ return NETFS_WHOLE_FOLIO_MODIFY;
+
+ if (file->f_mode & FMODE_READ)
+ goto no_write_streaming;
+ if (test_bit(NETFS_ICTX_NO_WRITE_STREAMING, &ctx->flags))
+ goto no_write_streaming;
+
+ if (netfs_is_cache_enabled(ctx)) {
+ /* We don't want to get a streaming write on a file that loses
+ * caching service temporarily because the backing store got
+ * culled.
+ */
+ if (!test_bit(NETFS_ICTX_NO_WRITE_STREAMING, &ctx->flags))
+ set_bit(NETFS_ICTX_NO_WRITE_STREAMING, &ctx->flags);
+ goto no_write_streaming;
+ }
+
+ if (!finfo)
+ return NETFS_STREAMING_WRITE;
+
+ /* We can continue a streaming write only if it continues on from the
+ * previous. If it overlaps, we must flush lest we suffer a partial
+ * copy and disjoint dirty regions.
+ */
+ if (offset == finfo->dirty_offset + finfo->dirty_len)
+ return NETFS_STREAMING_WRITE_CONT;
+ return NETFS_FLUSH_CONTENT;
+
+no_write_streaming:
+ if (finfo) {
+ netfs_stat(&netfs_n_wh_wstream_conflict);
+ return NETFS_FLUSH_CONTENT;
+ }
+ return NETFS_JUST_PREFETCH;
+}
+
+/*
+ * Grab a folio for writing and lock it. Attempt to allocate as large a folio
+ * as possible to hold as much of the remaining length as possible in one go.
+ */
+static struct folio *netfs_grab_folio_for_write(struct address_space *mapping,
+ loff_t pos, size_t part)
+{
+ pgoff_t index = pos / PAGE_SIZE;
+ fgf_t fgp_flags = FGP_WRITEBEGIN;
+
+ if (mapping_large_folio_support(mapping))
+ fgp_flags |= fgf_set_order(pos % PAGE_SIZE + part);
+
+ return __filemap_get_folio(mapping, index, fgp_flags,
+ mapping_gfp_mask(mapping));
+}
+
+/**
+ * netfs_perform_write - Copy data into the pagecache.
+ * @iocb: The operation parameters
+ * @iter: The source buffer
+ * @netfs_group: Grouping for dirty pages (eg. ceph snaps).
+ *
+ * Copy data into pagecache pages attached to the inode specified by @iocb.
+ * The caller must hold appropriate inode locks.
+ *
+ * Dirty pages are tagged with a netfs_folio struct if they're not up to date
+ * to indicate the range modified. Dirty pages may also be tagged with a
+ * netfs-specific grouping such that data from an old group gets flushed before
+ * a new one is started.
+ */
+ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
+ struct netfs_group *netfs_group)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
+ struct address_space *mapping = inode->i_mapping;
+ struct netfs_inode *ctx = netfs_inode(inode);
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_NONE,
+ .for_sync = true,
+ .nr_to_write = LONG_MAX,
+ .range_start = iocb->ki_pos,
+ .range_end = iocb->ki_pos + iter->count,
+ };
+ struct netfs_io_request *wreq = NULL;
+ struct netfs_folio *finfo;
+ struct folio *folio;
+ enum netfs_how_to_modify howto;
+ enum netfs_folio_trace trace;
+ unsigned int bdp_flags = (iocb->ki_flags & IOCB_SYNC) ? 0: BDP_ASYNC;
+ ssize_t written = 0, ret, ret2;
+ loff_t i_size, pos = iocb->ki_pos, from, to;
+ size_t max_chunk = PAGE_SIZE << MAX_PAGECACHE_ORDER;
+ bool maybe_trouble = false;
+
+ if (unlikely(test_bit(NETFS_ICTX_WRITETHROUGH, &ctx->flags) ||
+ iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC))
+ ) {
+ wbc_attach_fdatawrite_inode(&wbc, mapping->host);
+
+ ret = filemap_write_and_wait_range(mapping, pos, pos + iter->count);
+ if (ret < 0) {
+ wbc_detach_inode(&wbc);
+ goto out;
+ }
+
+ wreq = netfs_begin_writethrough(iocb, iter->count);
+ if (IS_ERR(wreq)) {
+ wbc_detach_inode(&wbc);
+ ret = PTR_ERR(wreq);
+ wreq = NULL;
+ goto out;
+ }
+ if (!is_sync_kiocb(iocb))
+ wreq->iocb = iocb;
+ wreq->cleanup = netfs_cleanup_buffered_write;
+ }
+
+ do {
+ size_t flen;
+ size_t offset; /* Offset into pagecache folio */
+ size_t part; /* Bytes to write to folio */
+ size_t copied; /* Bytes copied from user */
+
+ ret = balance_dirty_pages_ratelimited_flags(mapping, bdp_flags);
+ if (unlikely(ret < 0))
+ break;
+
+ offset = pos & (max_chunk - 1);
+ part = min(max_chunk - offset, iov_iter_count(iter));
+
+ /* Bring in the user pages that we will copy from _first_ lest
+ * we hit a nasty deadlock on copying from the same page as
+ * we're writing to, without it being marked uptodate.
+ *
+ * Not only is this an optimisation, but it is also required to
+ * check that the address is actually valid, when atomic
+ * usercopies are used below.
+ *
+ * We rely on the page being held onto long enough by the LRU
+ * that we can grab it below if this causes it to be read.
+ */
+ ret = -EFAULT;
+ if (unlikely(fault_in_iov_iter_readable(iter, part) == part))
+ break;
+
+ folio = netfs_grab_folio_for_write(mapping, pos, part);
+ if (IS_ERR(folio)) {
+ ret = PTR_ERR(folio);
+ break;
+ }
+
+ flen = folio_size(folio);
+ offset = pos & (flen - 1);
+ part = min_t(size_t, flen - offset, part);
+
+ if (signal_pending(current)) {
+ ret = written ? -EINTR : -ERESTARTSYS;
+ goto error_folio_unlock;
+ }
+
+ /* See if we need to prefetch the area we're going to modify.
+ * We need to do this before we get a lock on the folio in case
+ * there's more than one writer competing for the same cache
+ * block.
+ */
+ howto = netfs_how_to_modify(ctx, file, folio, netfs_group,
+ flen, offset, part, maybe_trouble);
+ _debug("howto %u", howto);
+ switch (howto) {
+ case NETFS_JUST_PREFETCH:
+ ret = netfs_prefetch_for_write(file, folio, offset, part);
+ if (ret < 0) {
+ _debug("prefetch = %zd", ret);
+ goto error_folio_unlock;
+ }
+ break;
+ case NETFS_FOLIO_IS_UPTODATE:
+ case NETFS_WHOLE_FOLIO_MODIFY:
+ case NETFS_STREAMING_WRITE_CONT:
+ break;
+ case NETFS_MODIFY_AND_CLEAR:
+ zero_user_segment(&folio->page, 0, offset);
+ break;
+ case NETFS_STREAMING_WRITE:
+ ret = -EIO;
+ if (WARN_ON(folio_get_private(folio)))
+ goto error_folio_unlock;
+ break;
+ case NETFS_FLUSH_CONTENT:
+ trace_netfs_folio(folio, netfs_flush_content);
+ from = folio_pos(folio);
+ to = from + folio_size(folio) - 1;
+ folio_unlock(folio);
+ folio_put(folio);
+ ret = filemap_write_and_wait_range(mapping, from, to);
+ if (ret < 0)
+ goto error_folio_unlock;
+ continue;
+ }
+
+ if (mapping_writably_mapped(mapping))
+ flush_dcache_folio(folio);
+
+ copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
+
+ flush_dcache_folio(folio);
+
+ /* Deal with a (partially) failed copy */
+ if (copied == 0) {
+ ret = -EFAULT;
+ goto error_folio_unlock;
+ }
+
+ trace = (enum netfs_folio_trace)howto;
+ switch (howto) {
+ case NETFS_FOLIO_IS_UPTODATE:
+ case NETFS_JUST_PREFETCH:
+ netfs_set_group(folio, netfs_group);
+ break;
+ case NETFS_MODIFY_AND_CLEAR:
+ zero_user_segment(&folio->page, offset + copied, flen);
+ netfs_set_group(folio, netfs_group);
+ folio_mark_uptodate(folio);
+ break;
+ case NETFS_WHOLE_FOLIO_MODIFY:
+ if (unlikely(copied < part)) {
+ maybe_trouble = true;
+ iov_iter_revert(iter, copied);
+ copied = 0;
+ goto retry;
+ }
+ netfs_set_group(folio, netfs_group);
+ folio_mark_uptodate(folio);
+ break;
+ case NETFS_STREAMING_WRITE:
+ if (offset == 0 && copied == flen) {
+ netfs_set_group(folio, netfs_group);
+ folio_mark_uptodate(folio);
+ trace = netfs_streaming_filled_page;
+ break;
+ }
+ finfo = kzalloc(sizeof(*finfo), GFP_KERNEL);
+ if (!finfo) {
+ iov_iter_revert(iter, copied);
+ ret = -ENOMEM;
+ goto error_folio_unlock;
+ }
+ finfo->netfs_group = netfs_get_group(netfs_group);
+ finfo->dirty_offset = offset;
+ finfo->dirty_len = copied;
+ folio_attach_private(folio, (void *)((unsigned long)finfo |
+ NETFS_FOLIO_INFO));
+ break;
+ case NETFS_STREAMING_WRITE_CONT:
+ finfo = netfs_folio_info(folio);
+ finfo->dirty_len += copied;
+ if (finfo->dirty_offset == 0 && finfo->dirty_len == flen) {
+ if (finfo->netfs_group)
+ folio_change_private(folio, finfo->netfs_group);
+ else
+ folio_detach_private(folio);
+ folio_mark_uptodate(folio);
+ kfree(finfo);
+ trace = netfs_streaming_cont_filled_page;
+ }
+ break;
+ default:
+ WARN(true, "Unexpected modify type %u ix=%lx\n",
+ howto, folio->index);
+ ret = -EIO;
+ goto error_folio_unlock;
+ }
+
+ trace_netfs_folio(folio, trace);
+
+ /* Update the inode size if we moved the EOF marker */
+ i_size = i_size_read(inode);
+ pos += copied;
+ if (pos > i_size) {
+ if (ctx->ops->update_i_size) {
+ ctx->ops->update_i_size(inode, pos);
+ } else {
+ i_size_write(inode, pos);
+#if IS_ENABLED(CONFIG_FSCACHE)
+ fscache_update_cookie(ctx->cache, NULL, &pos);
+#endif
+ }
+ }
+ written += copied;
+
+ if (likely(!wreq)) {
+ folio_mark_dirty(folio);
+ } else {
+ if (folio_test_dirty(folio))
+ /* Sigh. mmap. */
+ folio_clear_dirty_for_io(folio);
+ /* We make multiple writes to the folio... */
+ if (!folio_test_writeback(folio)) {
+ folio_wait_fscache(folio);
+ folio_start_writeback(folio);
+ folio_start_fscache(folio);
+ if (wreq->iter.count == 0)
+ trace_netfs_folio(folio, netfs_folio_trace_wthru);
+ else
+ trace_netfs_folio(folio, netfs_folio_trace_wthru_plus);
+ }
+ netfs_advance_writethrough(wreq, copied,
+ offset + copied == flen);
+ }
+ retry:
+ folio_unlock(folio);
+ folio_put(folio);
+ folio = NULL;
+
+ cond_resched();
+ } while (iov_iter_count(iter));
+
+out:
+ if (unlikely(wreq)) {
+ ret2 = netfs_end_writethrough(wreq, iocb);
+ wbc_detach_inode(&wbc);
+ if (ret2 == -EIOCBQUEUED)
+ return ret2;
+ if (ret == 0)
+ ret = ret2;
+ }
+
+ iocb->ki_pos += written;
+ _leave(" = %zd [%zd]", written, ret);
+ return written ? written : ret;
+
+error_folio_unlock:
+ folio_unlock(folio);
+ folio_put(folio);
+ goto out;
+}
+EXPORT_SYMBOL(netfs_perform_write);
+
+/**
+ * netfs_buffered_write_iter_locked - write data to a file
+ * @iocb: IO state structure (file, offset, etc.)
+ * @from: iov_iter with data to write
+ * @netfs_group: Grouping for dirty pages (eg. ceph snaps).
+ *
+ * This function does all the work needed for actually writing data to a
+ * file. It does all basic checks, removes SUID from the file, updates
+ * modification times and calls proper subroutines depending on whether we
+ * do direct IO or a standard buffered write.
+ *
+ * The caller must hold appropriate locks around this function and have called
+ * generic_write_checks() already. The caller is also responsible for doing
+ * any necessary syncing afterwards.
+ *
+ * This function does *not* take care of syncing data in case of O_SYNC write.
+ * A caller has to handle it. This is mainly due to the fact that we want to
+ * avoid syncing under i_rwsem.
+ *
+ * Return:
+ * * number of bytes written, even for truncated writes
+ * * negative error code if no data has been written at all
+ */
+ssize_t netfs_buffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *from,
+ struct netfs_group *netfs_group)
+{
+ struct file *file = iocb->ki_filp;
+ ssize_t ret;
+
+ trace_netfs_write_iter(iocb, from);
+
+ ret = file_remove_privs(file);
+ if (ret)
+ return ret;
+
+ ret = file_update_time(file);
+ if (ret)
+ return ret;
+
+ return netfs_perform_write(iocb, from, netfs_group);
+}
+EXPORT_SYMBOL(netfs_buffered_write_iter_locked);
+
+/**
+ * netfs_file_write_iter - write data to a file
+ * @iocb: IO state structure
+ * @from: iov_iter with data to write
+ *
+ * Perform a write to a file, writing into the pagecache if possible and doing
+ * an unbuffered write instead if not.
+ *
+ * Return:
+ * * Negative error code if no data has been written at all of
+ * vfs_fsync_range() failed for a synchronous write
+ * * Number of bytes written, even for truncated writes
+ */
+ssize_t netfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_mapping->host;
+ struct netfs_inode *ictx = netfs_inode(inode);
+ ssize_t ret;
+
+ _enter("%llx,%zx,%llx", iocb->ki_pos, iov_iter_count(from), i_size_read(inode));
+
+ if (!iov_iter_count(from))
+ return 0;
+
+ if ((iocb->ki_flags & IOCB_DIRECT) ||
+ test_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags))
+ return netfs_unbuffered_write_iter(iocb, from);
+
+ ret = netfs_start_io_write(inode);
+ if (ret < 0)
+ return ret;
+
+ ret = generic_write_checks(iocb, from);
+ if (ret > 0)
+ ret = netfs_buffered_write_iter_locked(iocb, from, NULL);
+ netfs_end_io_write(inode);
+ if (ret > 0)
+ ret = generic_write_sync(iocb, ret);
+ return ret;
+}
+EXPORT_SYMBOL(netfs_file_write_iter);
+
+/*
+ * Notification that a previously read-only page is about to become writable.
+ * Note that the caller indicates a single page of a multipage folio.
+ */
+vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_group)
+{
+ struct folio *folio = page_folio(vmf->page);
+ struct file *file = vmf->vma->vm_file;
+ struct inode *inode = file_inode(file);
+ vm_fault_t ret = VM_FAULT_RETRY;
+ int err;
+
+ _enter("%lx", folio->index);
+
+ sb_start_pagefault(inode->i_sb);
+
+ if (folio_wait_writeback_killable(folio))
+ goto out;
+
+ if (folio_lock_killable(folio) < 0)
+ goto out;
+
+ /* Can we see a streaming write here? */
+ if (WARN_ON(!folio_test_uptodate(folio))) {
+ ret = VM_FAULT_SIGBUS | VM_FAULT_LOCKED;
+ goto out;
+ }
+
+ if (netfs_folio_group(folio) != netfs_group) {
+ folio_unlock(folio);
+ err = filemap_fdatawait_range(inode->i_mapping,
+ folio_pos(folio),
+ folio_pos(folio) + folio_size(folio));
+ switch (err) {
+ case 0:
+ ret = VM_FAULT_RETRY;
+ goto out;
+ case -ENOMEM:
+ ret = VM_FAULT_OOM;
+ goto out;
+ default:
+ ret = VM_FAULT_SIGBUS;
+ goto out;
+ }
+ }
+
+ if (folio_test_dirty(folio))
+ trace_netfs_folio(folio, netfs_folio_trace_mkwrite_plus);
+ else
+ trace_netfs_folio(folio, netfs_folio_trace_mkwrite);
+ netfs_set_group(folio, netfs_group);
+ file_update_time(file);
+ ret = VM_FAULT_LOCKED;
+out:
+ sb_end_pagefault(inode->i_sb);
+ return ret;
+}
+EXPORT_SYMBOL(netfs_page_mkwrite);
+
+/*
+ * Kill all the pages in the given range
+ */
+static void netfs_kill_pages(struct address_space *mapping,
+ loff_t start, loff_t len)
+{
+ struct folio *folio;
+ pgoff_t index = start / PAGE_SIZE;
+ pgoff_t last = (start + len - 1) / PAGE_SIZE, next;
+
+ _enter("%llx-%llx", start, start + len - 1);
+
+ do {
+ _debug("kill %lx (to %lx)", index, last);
+
+ folio = filemap_get_folio(mapping, index);
+ if (IS_ERR(folio)) {
+ next = index + 1;
+ continue;
+ }
+
+ next = folio_next_index(folio);
+
+ trace_netfs_folio(folio, netfs_folio_trace_kill);
+ folio_clear_uptodate(folio);
+ if (folio_test_fscache(folio))
+ folio_end_fscache(folio);
+ folio_end_writeback(folio);
+ folio_lock(folio);
+ generic_error_remove_folio(mapping, folio);
+ folio_unlock(folio);
+ folio_put(folio);
+
+ } while (index = next, index <= last);
+
+ _leave("");
+}
+
+/*
+ * Redirty all the pages in a given range.
+ */
+static void netfs_redirty_pages(struct address_space *mapping,
+ loff_t start, loff_t len)
+{
+ struct folio *folio;
+ pgoff_t index = start / PAGE_SIZE;
+ pgoff_t last = (start + len - 1) / PAGE_SIZE, next;
+
+ _enter("%llx-%llx", start, start + len - 1);
+
+ do {
+ _debug("redirty %llx @%llx", len, start);
+
+ folio = filemap_get_folio(mapping, index);
+ if (IS_ERR(folio)) {
+ next = index + 1;
+ continue;
+ }
+
+ next = folio_next_index(folio);
+ trace_netfs_folio(folio, netfs_folio_trace_redirty);
+ filemap_dirty_folio(mapping, folio);
+ if (folio_test_fscache(folio))
+ folio_end_fscache(folio);
+ folio_end_writeback(folio);
+ folio_put(folio);
+ } while (index = next, index <= last);
+
+ balance_dirty_pages_ratelimited(mapping);
+
+ _leave("");
+}
+
+/*
+ * Completion of write to server
+ */
+static void netfs_pages_written_back(struct netfs_io_request *wreq)
+{
+ struct address_space *mapping = wreq->mapping;
+ struct netfs_folio *finfo;
+ struct netfs_group *group = NULL;
+ struct folio *folio;
+ pgoff_t last;
+ int gcount = 0;
+
+ XA_STATE(xas, &mapping->i_pages, wreq->start / PAGE_SIZE);
+
+ _enter("%llx-%llx", wreq->start, wreq->start + wreq->len);
+
+ rcu_read_lock();
+
+ last = (wreq->start + wreq->len - 1) / PAGE_SIZE;
+ xas_for_each(&xas, folio, last) {
+ WARN(!folio_test_writeback(folio),
+ "bad %zx @%llx page %lx %lx\n",
+ wreq->len, wreq->start, folio->index, last);
+
+ if ((finfo = netfs_folio_info(folio))) {
+ /* Streaming writes cannot be redirtied whilst under
+ * writeback, so discard the streaming record.
+ */
+ folio_detach_private(folio);
+ group = finfo->netfs_group;
+ gcount++;
+ trace_netfs_folio(folio, netfs_folio_trace_clear_s);
+ kfree(finfo);
+ } else if ((group = netfs_folio_group(folio))) {
+ /* Need to detach the group pointer if the page didn't
+ * get redirtied. If it has been redirtied, then it
+ * must be within the same group.
+ */
+ if (folio_test_dirty(folio)) {
+ trace_netfs_folio(folio, netfs_folio_trace_redirtied);
+ goto end_wb;
+ }
+ if (folio_trylock(folio)) {
+ if (!folio_test_dirty(folio)) {
+ folio_detach_private(folio);
+ gcount++;
+ trace_netfs_folio(folio, netfs_folio_trace_clear_g);
+ } else {
+ trace_netfs_folio(folio, netfs_folio_trace_redirtied);
+ }
+ folio_unlock(folio);
+ goto end_wb;
+ }
+
+ xas_pause(&xas);
+ rcu_read_unlock();
+ folio_lock(folio);
+ if (!folio_test_dirty(folio)) {
+ folio_detach_private(folio);
+ gcount++;
+ trace_netfs_folio(folio, netfs_folio_trace_clear_g);
+ } else {
+ trace_netfs_folio(folio, netfs_folio_trace_redirtied);
+ }
+ folio_unlock(folio);
+ rcu_read_lock();
+ } else {
+ trace_netfs_folio(folio, netfs_folio_trace_clear);
+ }
+ end_wb:
+ if (folio_test_fscache(folio))
+ folio_end_fscache(folio);
+ xas_advance(&xas, folio_next_index(folio) - 1);
+ folio_end_writeback(folio);
+ }
+
+ rcu_read_unlock();
+ netfs_put_group_many(group, gcount);
+ _leave("");
+}
+
+/*
+ * Deal with the disposition of the folios that are under writeback to close
+ * out the operation.
+ */
+static void netfs_cleanup_buffered_write(struct netfs_io_request *wreq)
+{
+ struct address_space *mapping = wreq->mapping;
+
+ _enter("");
+
+ switch (wreq->error) {
+ case 0:
+ netfs_pages_written_back(wreq);
+ break;
+
+ default:
+ pr_notice("R=%08x Unexpected error %d\n", wreq->debug_id, wreq->error);
+ fallthrough;
+ case -EACCES:
+ case -EPERM:
+ case -ENOKEY:
+ case -EKEYEXPIRED:
+ case -EKEYREJECTED:
+ case -EKEYREVOKED:
+ case -ENETRESET:
+ case -EDQUOT:
+ case -ENOSPC:
+ netfs_redirty_pages(mapping, wreq->start, wreq->len);
+ break;
+
+ case -EROFS:
+ case -EIO:
+ case -EREMOTEIO:
+ case -EFBIG:
+ case -ENOENT:
+ case -ENOMEDIUM:
+ case -ENXIO:
+ netfs_kill_pages(mapping, wreq->start, wreq->len);
+ break;
+ }
+
+ if (wreq->error)
+ mapping_set_error(mapping, wreq->error);
+ if (wreq->netfs_ops->done)
+ wreq->netfs_ops->done(wreq);
+}
+
+/*
+ * Extend the region to be written back to include subsequent contiguously
+ * dirty pages if possible, but don't sleep while doing so.
+ *
+ * If this page holds new content, then we can include filler zeros in the
+ * writeback.
+ */
+static void netfs_extend_writeback(struct address_space *mapping,
+ struct netfs_group *group,
+ struct xa_state *xas,
+ long *_count,
+ loff_t start,
+ loff_t max_len,
+ bool caching,
+ size_t *_len,
+ size_t *_top)
+{
+ struct netfs_folio *finfo;
+ struct folio_batch fbatch;
+ struct folio *folio;
+ unsigned int i;
+ pgoff_t index = (start + *_len) / PAGE_SIZE;
+ size_t len;
+ void *priv;
+ bool stop = true;
+
+ folio_batch_init(&fbatch);
+
+ do {
+ /* Firstly, we gather up a batch of contiguous dirty pages
+ * under the RCU read lock - but we can't clear the dirty flags
+ * there if any of those pages are mapped.
+ */
+ rcu_read_lock();
+
+ xas_for_each(xas, folio, ULONG_MAX) {
+ stop = true;
+ if (xas_retry(xas, folio))
+ continue;
+ if (xa_is_value(folio))
+ break;
+ if (folio->index != index) {
+ xas_reset(xas);
+ break;
+ }
+
+ if (!folio_try_get_rcu(folio)) {
+ xas_reset(xas);
+ continue;
+ }
+
+ /* Has the folio moved or been split? */
+ if (unlikely(folio != xas_reload(xas))) {
+ folio_put(folio);
+ xas_reset(xas);
+ break;
+ }
+
+ if (!folio_trylock(folio)) {
+ folio_put(folio);
+ xas_reset(xas);
+ break;
+ }
+ if (!folio_test_dirty(folio) ||
+ folio_test_writeback(folio) ||
+ folio_test_fscache(folio)) {
+ folio_unlock(folio);
+ folio_put(folio);
+ xas_reset(xas);
+ break;
+ }
+
+ stop = false;
+ len = folio_size(folio);
+ priv = folio_get_private(folio);
+ if ((const struct netfs_group *)priv != group) {
+ stop = true;
+ finfo = netfs_folio_info(folio);
+ if (finfo->netfs_group != group ||
+ finfo->dirty_offset > 0) {
+ folio_unlock(folio);
+ folio_put(folio);
+ xas_reset(xas);
+ break;
+ }
+ len = finfo->dirty_len;
+ }
+
+ *_top += folio_size(folio);
+ index += folio_nr_pages(folio);
+ *_count -= folio_nr_pages(folio);
+ *_len += len;
+ if (*_len >= max_len || *_count <= 0)
+ stop = true;
+
+ if (!folio_batch_add(&fbatch, folio))
+ break;
+ if (stop)
+ break;
+ }
+
+ xas_pause(xas);
+ rcu_read_unlock();
+
+ /* Now, if we obtained any folios, we can shift them to being
+ * writable and mark them for caching.
+ */
+ if (!folio_batch_count(&fbatch))
+ break;
+
+ for (i = 0; i < folio_batch_count(&fbatch); i++) {
+ folio = fbatch.folios[i];
+ trace_netfs_folio(folio, netfs_folio_trace_store_plus);
+
+ if (!folio_clear_dirty_for_io(folio))
+ BUG();
+ folio_start_writeback(folio);
+ netfs_folio_start_fscache(caching, folio);
+ folio_unlock(folio);
+ }
+
+ folio_batch_release(&fbatch);
+ cond_resched();
+ } while (!stop);
+}
+
+/*
+ * Synchronously write back the locked page and any subsequent non-locked dirty
+ * pages.
+ */
+static ssize_t netfs_write_back_from_locked_folio(struct address_space *mapping,
+ struct writeback_control *wbc,
+ struct netfs_group *group,
+ struct xa_state *xas,
+ struct folio *folio,
+ unsigned long long start,
+ unsigned long long end)
+{
+ struct netfs_io_request *wreq;
+ struct netfs_folio *finfo;
+ struct netfs_inode *ctx = netfs_inode(mapping->host);
+ unsigned long long i_size = i_size_read(&ctx->inode);
+ size_t len, max_len;
+ bool caching = netfs_is_cache_enabled(ctx);
+ long count = wbc->nr_to_write;
+ int ret;
+
+ _enter(",%lx,%llx-%llx,%u", folio->index, start, end, caching);
+
+ wreq = netfs_alloc_request(mapping, NULL, start, folio_size(folio),
+ NETFS_WRITEBACK);
+ if (IS_ERR(wreq)) {
+ folio_unlock(folio);
+ return PTR_ERR(wreq);
+ }
+
+ if (!folio_clear_dirty_for_io(folio))
+ BUG();
+ folio_start_writeback(folio);
+ netfs_folio_start_fscache(caching, folio);
+
+ count -= folio_nr_pages(folio);
+
+ /* Find all consecutive lockable dirty pages that have contiguous
+ * written regions, stopping when we find a page that is not
+ * immediately lockable, is not dirty or is missing, or we reach the
+ * end of the range.
+ */
+ trace_netfs_folio(folio, netfs_folio_trace_store);
+
+ len = wreq->len;
+ finfo = netfs_folio_info(folio);
+ if (finfo) {
+ start += finfo->dirty_offset;
+ if (finfo->dirty_offset + finfo->dirty_len != len) {
+ len = finfo->dirty_len;
+ goto cant_expand;
+ }
+ len = finfo->dirty_len;
+ }
+
+ if (start < i_size) {
+ /* Trim the write to the EOF; the extra data is ignored. Also
+ * put an upper limit on the size of a single storedata op.
+ */
+ max_len = 65536 * 4096;
+ max_len = min_t(unsigned long long, max_len, end - start + 1);
+ max_len = min_t(unsigned long long, max_len, i_size - start);
+
+ if (len < max_len)
+ netfs_extend_writeback(mapping, group, xas, &count, start,
+ max_len, caching, &len, &wreq->upper_len);
+ }
+
+cant_expand:
+ len = min_t(unsigned long long, len, i_size - start);
+
+ /* We now have a contiguous set of dirty pages, each with writeback
+ * set; the first page is still locked at this point, but all the rest
+ * have been unlocked.
+ */
+ folio_unlock(folio);
+ wreq->start = start;
+ wreq->len = len;
+
+ if (start < i_size) {
+ _debug("write back %zx @%llx [%llx]", len, start, i_size);
+
+ /* Speculatively write to the cache. We have to fix this up
+ * later if the store fails.
+ */
+ wreq->cleanup = netfs_cleanup_buffered_write;
+
+ iov_iter_xarray(&wreq->iter, ITER_SOURCE, &mapping->i_pages, start,
+ wreq->upper_len);
+ __set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
+ ret = netfs_begin_write(wreq, true, netfs_write_trace_writeback);
+ if (ret == 0 || ret == -EIOCBQUEUED)
+ wbc->nr_to_write -= len / PAGE_SIZE;
+ } else {
+ _debug("write discard %zx @%llx [%llx]", len, start, i_size);
+
+ /* The dirty region was entirely beyond the EOF. */
+ fscache_clear_page_bits(mapping, start, len, caching);
+ netfs_pages_written_back(wreq);
+ ret = 0;
+ }
+
+ netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
+ _leave(" = 1");
+ return 1;
+}
+
+/*
+ * Write a region of pages back to the server
+ */
+static ssize_t netfs_writepages_begin(struct address_space *mapping,
+ struct writeback_control *wbc,
+ struct netfs_group *group,
+ struct xa_state *xas,
+ unsigned long long *_start,
+ unsigned long long end)
+{
+ const struct netfs_folio *finfo;
+ struct folio *folio;
+ unsigned long long start = *_start;
+ ssize_t ret;
+ void *priv;
+ int skips = 0;
+
+ _enter("%llx,%llx,", start, end);
+
+search_again:
+ /* Find the first dirty page in the group. */
+ rcu_read_lock();
+
+ for (;;) {
+ folio = xas_find_marked(xas, end / PAGE_SIZE, PAGECACHE_TAG_DIRTY);
+ if (xas_retry(xas, folio) || xa_is_value(folio))
+ continue;
+ if (!folio)
+ break;
+
+ if (!folio_try_get_rcu(folio)) {
+ xas_reset(xas);
+ continue;
+ }
+
+ if (unlikely(folio != xas_reload(xas))) {
+ folio_put(folio);
+ xas_reset(xas);
+ continue;
+ }
+
+ /* Skip any dirty folio that's not in the group of interest. */
+ priv = folio_get_private(folio);
+ if ((const struct netfs_group *)priv != group) {
+ finfo = netfs_folio_info(folio);
+ if (finfo->netfs_group != group) {
+ folio_put(folio);
+ continue;
+ }
+ }
+
+ xas_pause(xas);
+ break;
+ }
+ rcu_read_unlock();
+ if (!folio)
+ return 0;
+
+ start = folio_pos(folio); /* May regress with THPs */
+
+ _debug("wback %lx", folio->index);
+
+ /* At this point we hold neither the i_pages lock nor the page lock:
+ * the page may be truncated or invalidated (changing page->mapping to
+ * NULL), or even swizzled back from swapper_space to tmpfs file
+ * mapping
+ */
+lock_again:
+ if (wbc->sync_mode != WB_SYNC_NONE) {
+ ret = folio_lock_killable(folio);
+ if (ret < 0)
+ return ret;
+ } else {
+ if (!folio_trylock(folio))
+ goto search_again;
+ }
+
+ if (folio->mapping != mapping ||
+ !folio_test_dirty(folio)) {
+ start += folio_size(folio);
+ folio_unlock(folio);
+ goto search_again;
+ }
+
+ if (folio_test_writeback(folio) ||
+ folio_test_fscache(folio)) {
+ folio_unlock(folio);
+ if (wbc->sync_mode != WB_SYNC_NONE) {
+ folio_wait_writeback(folio);
+#ifdef CONFIG_FSCACHE
+ folio_wait_fscache(folio);
+#endif
+ goto lock_again;
+ }
+
+ start += folio_size(folio);
+ if (wbc->sync_mode == WB_SYNC_NONE) {
+ if (skips >= 5 || need_resched()) {
+ ret = 0;
+ goto out;
+ }
+ skips++;
+ }
+ goto search_again;
+ }
+
+ ret = netfs_write_back_from_locked_folio(mapping, wbc, group, xas,
+ folio, start, end);
+out:
+ if (ret > 0)
+ *_start = start + ret;
+ _leave(" = %zd [%llx]", ret, *_start);
+ return ret;
+}
+
+/*
+ * Write a region of pages back to the server
+ */
+static int netfs_writepages_region(struct address_space *mapping,
+ struct writeback_control *wbc,
+ struct netfs_group *group,
+ unsigned long long *_start,
+ unsigned long long end)
+{
+ ssize_t ret;
+
+ XA_STATE(xas, &mapping->i_pages, *_start / PAGE_SIZE);
+
+ do {
+ ret = netfs_writepages_begin(mapping, wbc, group, &xas,
+ _start, end);
+ if (ret > 0 && wbc->nr_to_write > 0)
+ cond_resched();
+ } while (ret > 0 && wbc->nr_to_write > 0);
+
+ return ret > 0 ? 0 : ret;
+}
+
+/*
+ * write some of the pending data back to the server
+ */
+int netfs_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct netfs_group *group = NULL;
+ loff_t start, end;
+ int ret;
+
+ _enter("");
+
+ /* We have to be careful as we can end up racing with setattr()
+ * truncating the pagecache since the caller doesn't take a lock here
+ * to prevent it.
+ */
+
+ if (wbc->range_cyclic && mapping->writeback_index) {
+ start = mapping->writeback_index * PAGE_SIZE;
+ ret = netfs_writepages_region(mapping, wbc, group,
+ &start, LLONG_MAX);
+ if (ret < 0)
+ goto out;
+
+ if (wbc->nr_to_write <= 0) {
+ mapping->writeback_index = start / PAGE_SIZE;
+ goto out;
+ }
+
+ start = 0;
+ end = mapping->writeback_index * PAGE_SIZE;
+ mapping->writeback_index = 0;
+ ret = netfs_writepages_region(mapping, wbc, group, &start, end);
+ if (ret == 0)
+ mapping->writeback_index = start / PAGE_SIZE;
+ } else if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) {
+ start = 0;
+ ret = netfs_writepages_region(mapping, wbc, group,
+ &start, LLONG_MAX);
+ if (wbc->nr_to_write > 0 && ret == 0)
+ mapping->writeback_index = start / PAGE_SIZE;
+ } else {
+ start = wbc->range_start;
+ ret = netfs_writepages_region(mapping, wbc, group,
+ &start, wbc->range_end);
+ }
+
+out:
+ _leave(" = %d", ret);
+ return ret;
+}
+EXPORT_SYMBOL(netfs_writepages);
+
+/*
+ * Deal with the disposition of a laundered folio.
+ */
+static void netfs_cleanup_launder_folio(struct netfs_io_request *wreq)
+{
+ if (wreq->error) {
+ pr_notice("R=%08x Laundering error %d\n", wreq->debug_id, wreq->error);
+ mapping_set_error(wreq->mapping, wreq->error);
+ }
+}
+
+/**
+ * netfs_launder_folio - Clean up a dirty folio that's being invalidated
+ * @folio: The folio to clean
+ *
+ * This is called to write back a folio that's being invalidated when an inode
+ * is getting torn down. Ideally, writepages would be used instead.
+ */
+int netfs_launder_folio(struct folio *folio)
+{
+ struct netfs_io_request *wreq;
+ struct address_space *mapping = folio->mapping;
+ struct netfs_folio *finfo = netfs_folio_info(folio);
+ struct netfs_group *group = netfs_folio_group(folio);
+ struct bio_vec bvec;
+ unsigned long long i_size = i_size_read(mapping->host);
+ unsigned long long start = folio_pos(folio);
+ size_t offset = 0, len;
+ int ret = 0;
+
+ if (finfo) {
+ offset = finfo->dirty_offset;
+ start += offset;
+ len = finfo->dirty_len;
+ } else {
+ len = folio_size(folio);
+ }
+ len = min_t(unsigned long long, len, i_size - start);
+
+ wreq = netfs_alloc_request(mapping, NULL, start, len, NETFS_LAUNDER_WRITE);
+ if (IS_ERR(wreq)) {
+ ret = PTR_ERR(wreq);
+ goto out;
+ }
+
+ if (!folio_clear_dirty_for_io(folio))
+ goto out_put;
+
+ trace_netfs_folio(folio, netfs_folio_trace_launder);
+
+ _debug("launder %llx-%llx", start, start + len - 1);
+
+ /* Speculatively write to the cache. We have to fix this up later if
+ * the store fails.
+ */
+ wreq->cleanup = netfs_cleanup_launder_folio;
+
+ bvec_set_folio(&bvec, folio, len, offset);
+ iov_iter_bvec(&wreq->iter, ITER_SOURCE, &bvec, 1, len);
+ __set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
+ ret = netfs_begin_write(wreq, true, netfs_write_trace_launder);
+
+out_put:
+ folio_detach_private(folio);
+ netfs_put_group(group);
+ kfree(finfo);
+ netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
+out:
+ folio_wait_fscache(folio);
+ _leave(" = %d", ret);
+ return ret;
+}
+EXPORT_SYMBOL(netfs_launder_folio);
diff --git a/fs/netfs/direct_read.c b/fs/netfs/direct_read.c
new file mode 100644
index 0000000000..ad4370b393
--- /dev/null
+++ b/fs/netfs/direct_read.c
@@ -0,0 +1,125 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Direct I/O support.
+ *
+ * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/export.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/uio.h>
+#include <linux/sched/mm.h>
+#include <linux/task_io_accounting_ops.h>
+#include <linux/netfs.h>
+#include "internal.h"
+
+/**
+ * netfs_unbuffered_read_iter_locked - Perform an unbuffered or direct I/O read
+ * @iocb: The I/O control descriptor describing the read
+ * @iter: The output buffer (also specifies read length)
+ *
+ * Perform an unbuffered I/O or direct I/O from the file in @iocb to the
+ * output buffer. No use is made of the pagecache.
+ *
+ * The caller must hold any appropriate locks.
+ */
+static ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct netfs_io_request *rreq;
+ ssize_t ret;
+ size_t orig_count = iov_iter_count(iter);
+ bool async = !is_sync_kiocb(iocb);
+
+ _enter("");
+
+ if (!orig_count)
+ return 0; /* Don't update atime */
+
+ ret = kiocb_write_and_wait(iocb, orig_count);
+ if (ret < 0)
+ return ret;
+ file_accessed(iocb->ki_filp);
+
+ rreq = netfs_alloc_request(iocb->ki_filp->f_mapping, iocb->ki_filp,
+ iocb->ki_pos, orig_count,
+ NETFS_DIO_READ);
+ if (IS_ERR(rreq))
+ return PTR_ERR(rreq);
+
+ netfs_stat(&netfs_n_rh_dio_read);
+ trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_dio_read);
+
+ /* If this is an async op, we have to keep track of the destination
+ * buffer for ourselves as the caller's iterator will be trashed when
+ * we return.
+ *
+ * In such a case, extract an iterator to represent as much of the the
+ * output buffer as we can manage. Note that the extraction might not
+ * be able to allocate a sufficiently large bvec array and may shorten
+ * the request.
+ */
+ if (user_backed_iter(iter)) {
+ ret = netfs_extract_user_iter(iter, rreq->len, &rreq->iter, 0);
+ if (ret < 0)
+ goto out;
+ rreq->direct_bv = (struct bio_vec *)rreq->iter.bvec;
+ rreq->direct_bv_count = ret;
+ rreq->direct_bv_unpin = iov_iter_extract_will_pin(iter);
+ rreq->len = iov_iter_count(&rreq->iter);
+ } else {
+ rreq->iter = *iter;
+ rreq->len = orig_count;
+ rreq->direct_bv_unpin = false;
+ iov_iter_advance(iter, orig_count);
+ }
+
+ // TODO: Set up bounce buffer if needed
+
+ if (async)
+ rreq->iocb = iocb;
+
+ ret = netfs_begin_read(rreq, is_sync_kiocb(iocb));
+ if (ret < 0)
+ goto out; /* May be -EIOCBQUEUED */
+ if (!async) {
+ // TODO: Copy from bounce buffer
+ iocb->ki_pos += rreq->transferred;
+ ret = rreq->transferred;
+ }
+
+out:
+ netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
+ if (ret > 0)
+ orig_count -= ret;
+ if (ret != -EIOCBQUEUED)
+ iov_iter_revert(iter, orig_count - iov_iter_count(iter));
+ return ret;
+}
+
+/**
+ * netfs_unbuffered_read_iter - Perform an unbuffered or direct I/O read
+ * @iocb: The I/O control descriptor describing the read
+ * @iter: The output buffer (also specifies read length)
+ *
+ * Perform an unbuffered I/O or direct I/O from the file in @iocb to the
+ * output buffer. No use is made of the pagecache.
+ */
+ssize_t netfs_unbuffered_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ ssize_t ret;
+
+ if (!iter->count)
+ return 0; /* Don't update atime */
+
+ ret = netfs_start_io_direct(inode);
+ if (ret == 0) {
+ ret = netfs_unbuffered_read_iter_locked(iocb, iter);
+ netfs_end_io_direct(inode);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(netfs_unbuffered_read_iter);
diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c
new file mode 100644
index 0000000000..bee047e20f
--- /dev/null
+++ b/fs/netfs/direct_write.c
@@ -0,0 +1,174 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Unbuffered and direct write support.
+ *
+ * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/export.h>
+#include <linux/uio.h>
+#include "internal.h"
+
+static void netfs_cleanup_dio_write(struct netfs_io_request *wreq)
+{
+ struct inode *inode = wreq->inode;
+ unsigned long long end = wreq->start + wreq->len;
+
+ if (!wreq->error &&
+ i_size_read(inode) < end) {
+ if (wreq->netfs_ops->update_i_size)
+ wreq->netfs_ops->update_i_size(inode, end);
+ else
+ i_size_write(inode, end);
+ }
+}
+
+/*
+ * Perform an unbuffered write where we may have to do an RMW operation on an
+ * encrypted file. This can also be used for direct I/O writes.
+ */
+static ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *iter,
+ struct netfs_group *netfs_group)
+{
+ struct netfs_io_request *wreq;
+ unsigned long long start = iocb->ki_pos;
+ unsigned long long end = start + iov_iter_count(iter);
+ ssize_t ret, n;
+ bool async = !is_sync_kiocb(iocb);
+
+ _enter("");
+
+ /* We're going to need a bounce buffer if what we transmit is going to
+ * be different in some way to the source buffer, e.g. because it gets
+ * encrypted/compressed or because it needs expanding to a block size.
+ */
+ // TODO
+
+ _debug("uw %llx-%llx", start, end);
+
+ wreq = netfs_alloc_request(iocb->ki_filp->f_mapping, iocb->ki_filp,
+ start, end - start,
+ iocb->ki_flags & IOCB_DIRECT ?
+ NETFS_DIO_WRITE : NETFS_UNBUFFERED_WRITE);
+ if (IS_ERR(wreq))
+ return PTR_ERR(wreq);
+
+ {
+ /* If this is an async op and we're not using a bounce buffer,
+ * we have to save the source buffer as the iterator is only
+ * good until we return. In such a case, extract an iterator
+ * to represent as much of the the output buffer as we can
+ * manage. Note that the extraction might not be able to
+ * allocate a sufficiently large bvec array and may shorten the
+ * request.
+ */
+ if (async || user_backed_iter(iter)) {
+ n = netfs_extract_user_iter(iter, wreq->len, &wreq->iter, 0);
+ if (n < 0) {
+ ret = n;
+ goto out;
+ }
+ wreq->direct_bv = (struct bio_vec *)wreq->iter.bvec;
+ wreq->direct_bv_count = n;
+ wreq->direct_bv_unpin = iov_iter_extract_will_pin(iter);
+ wreq->len = iov_iter_count(&wreq->iter);
+ } else {
+ wreq->iter = *iter;
+ }
+
+ wreq->io_iter = wreq->iter;
+ }
+
+ /* Copy the data into the bounce buffer and encrypt it. */
+ // TODO
+
+ /* Dispatch the write. */
+ __set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
+ if (async)
+ wreq->iocb = iocb;
+ wreq->cleanup = netfs_cleanup_dio_write;
+ ret = netfs_begin_write(wreq, is_sync_kiocb(iocb),
+ iocb->ki_flags & IOCB_DIRECT ?
+ netfs_write_trace_dio_write :
+ netfs_write_trace_unbuffered_write);
+ if (ret < 0) {
+ _debug("begin = %zd", ret);
+ goto out;
+ }
+
+ if (!async) {
+ trace_netfs_rreq(wreq, netfs_rreq_trace_wait_ip);
+ wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS,
+ TASK_UNINTERRUPTIBLE);
+
+ ret = wreq->error;
+ _debug("waited = %zd", ret);
+ if (ret == 0) {
+ ret = wreq->transferred;
+ iocb->ki_pos += ret;
+ }
+ } else {
+ ret = -EIOCBQUEUED;
+ }
+
+out:
+ netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
+ return ret;
+}
+
+/**
+ * netfs_unbuffered_write_iter - Unbuffered write to a file
+ * @iocb: IO state structure
+ * @from: iov_iter with data to write
+ *
+ * Do an unbuffered write to a file, writing the data directly to the server
+ * and not lodging the data in the pagecache.
+ *
+ * Return:
+ * * Negative error code if no data has been written at all of
+ * vfs_fsync_range() failed for a synchronous write
+ * * Number of bytes written, even for truncated writes
+ */
+ssize_t netfs_unbuffered_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_mapping->host;
+ struct netfs_inode *ictx = netfs_inode(inode);
+ unsigned long long end;
+ ssize_t ret;
+
+ _enter("%llx,%zx,%llx", iocb->ki_pos, iov_iter_count(from), i_size_read(inode));
+
+ if (!iov_iter_count(from))
+ return 0;
+
+ trace_netfs_write_iter(iocb, from);
+ netfs_stat(&netfs_n_rh_dio_write);
+
+ ret = netfs_start_io_direct(inode);
+ if (ret < 0)
+ return ret;
+ ret = generic_write_checks(iocb, from);
+ if (ret <= 0)
+ goto out;
+ ret = file_remove_privs(file);
+ if (ret < 0)
+ goto out;
+ ret = file_update_time(file);
+ if (ret < 0)
+ goto out;
+ ret = kiocb_invalidate_pages(iocb, iov_iter_count(from));
+ if (ret < 0)
+ goto out;
+ end = iocb->ki_pos + iov_iter_count(from);
+ if (end > ictx->zero_point)
+ ictx->zero_point = end;
+
+ fscache_invalidate(netfs_i_cookie(ictx), NULL, i_size_read(inode),
+ FSCACHE_INVAL_DIO_WRITE);
+ ret = netfs_unbuffered_write_iter_locked(iocb, from, NULL);
+out:
+ netfs_end_io_direct(inode);
+ return ret;
+}
+EXPORT_SYMBOL(netfs_unbuffered_write_iter);
diff --git a/fs/fscache/cache.c b/fs/netfs/fscache_cache.c
index 9397ed39b0..9397ed39b0 100644
--- a/fs/fscache/cache.c
+++ b/fs/netfs/fscache_cache.c
diff --git a/fs/fscache/cookie.c b/fs/netfs/fscache_cookie.c
index bce2492186..bce2492186 100644
--- a/fs/fscache/cookie.c
+++ b/fs/netfs/fscache_cookie.c
diff --git a/fs/netfs/fscache_internal.h b/fs/netfs/fscache_internal.h
new file mode 100644
index 0000000000..a09b948fce
--- /dev/null
+++ b/fs/netfs/fscache_internal.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* Internal definitions for FS-Cache
+ *
+ * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include "internal.h"
+
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+
+#define pr_fmt(fmt) "FS-Cache: " fmt
diff --git a/fs/fscache/io.c b/fs/netfs/fscache_io.c
index 0d2b8dec8f..ad572f7ee8 100644
--- a/fs/fscache/io.c
+++ b/fs/netfs/fscache_io.c
@@ -158,46 +158,6 @@ int __fscache_begin_write_operation(struct netfs_cache_resources *cres,
}
EXPORT_SYMBOL(__fscache_begin_write_operation);
-/**
- * fscache_dirty_folio - Mark folio dirty and pin a cache object for writeback
- * @mapping: The mapping the folio belongs to.
- * @folio: The folio being dirtied.
- * @cookie: The cookie referring to the cache object
- *
- * Set the dirty flag on a folio and pin an in-use cache object in memory
- * so that writeback can later write to it. This is intended
- * to be called from the filesystem's ->dirty_folio() method.
- *
- * Return: true if the dirty flag was set on the folio, false otherwise.
- */
-bool fscache_dirty_folio(struct address_space *mapping, struct folio *folio,
- struct fscache_cookie *cookie)
-{
- struct inode *inode = mapping->host;
- bool need_use = false;
-
- _enter("");
-
- if (!filemap_dirty_folio(mapping, folio))
- return false;
- if (!fscache_cookie_valid(cookie))
- return true;
-
- if (!(inode->i_state & I_PINNING_FSCACHE_WB)) {
- spin_lock(&inode->i_lock);
- if (!(inode->i_state & I_PINNING_FSCACHE_WB)) {
- inode->i_state |= I_PINNING_FSCACHE_WB;
- need_use = true;
- }
- spin_unlock(&inode->i_lock);
-
- if (need_use)
- fscache_use_cookie(cookie, true);
- }
- return true;
-}
-EXPORT_SYMBOL(fscache_dirty_folio);
-
struct fscache_write_request {
struct netfs_cache_resources cache_resources;
struct address_space *mapping;
@@ -277,7 +237,7 @@ void __fscache_write_to_cache(struct fscache_cookie *cookie,
fscache_access_io_write) < 0)
goto abandon_free;
- ret = cres->ops->prepare_write(cres, &start, &len, i_size, false);
+ ret = cres->ops->prepare_write(cres, &start, &len, len, i_size, false);
if (ret < 0)
goto abandon_end;
diff --git a/fs/fscache/main.c b/fs/netfs/fscache_main.c
index dad85fd84f..42e98bb523 100644
--- a/fs/fscache/main.c
+++ b/fs/netfs/fscache_main.c
@@ -8,18 +8,9 @@
#define FSCACHE_DEBUG_LEVEL CACHE
#include <linux/module.h>
#include <linux/init.h>
-#define CREATE_TRACE_POINTS
#include "internal.h"
-
-MODULE_DESCRIPTION("FS Cache Manager");
-MODULE_AUTHOR("Red Hat, Inc.");
-MODULE_LICENSE("GPL");
-
-unsigned fscache_debug;
-module_param_named(debug, fscache_debug, uint,
- S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(fscache_debug,
- "FS-Cache debugging mask");
+#define CREATE_TRACE_POINTS
+#include <trace/events/fscache.h>
EXPORT_TRACEPOINT_SYMBOL(fscache_access_cache);
EXPORT_TRACEPOINT_SYMBOL(fscache_access_volume);
@@ -71,7 +62,7 @@ unsigned int fscache_hash(unsigned int salt, const void *data, size_t len)
/*
* initialise the fs caching module
*/
-static int __init fscache_init(void)
+int __init fscache_init(void)
{
int ret = -ENOMEM;
@@ -92,7 +83,7 @@ static int __init fscache_init(void)
goto error_cookie_jar;
}
- pr_notice("Loaded\n");
+ pr_notice("FS-Cache loaded\n");
return 0;
error_cookie_jar:
@@ -103,19 +94,15 @@ error_wq:
return ret;
}
-fs_initcall(fscache_init);
-
/*
* clean up on module removal
*/
-static void __exit fscache_exit(void)
+void __exit fscache_exit(void)
{
_enter("");
kmem_cache_destroy(fscache_cookie_jar);
fscache_proc_cleanup();
destroy_workqueue(fscache_wq);
- pr_notice("Unloaded\n");
+ pr_notice("FS-Cache unloaded\n");
}
-
-module_exit(fscache_exit);
diff --git a/fs/fscache/proc.c b/fs/netfs/fscache_proc.c
index dc3b0e9c8c..874d951bc3 100644
--- a/fs/fscache/proc.c
+++ b/fs/netfs/fscache_proc.c
@@ -12,41 +12,34 @@
#include "internal.h"
/*
- * initialise the /proc/fs/fscache/ directory
+ * Add files to /proc/fs/netfs/.
*/
int __init fscache_proc_init(void)
{
- if (!proc_mkdir("fs/fscache", NULL))
- goto error_dir;
+ if (!proc_symlink("fs/fscache", NULL, "netfs"))
+ goto error_sym;
- if (!proc_create_seq("fs/fscache/caches", S_IFREG | 0444, NULL,
+ if (!proc_create_seq("fs/netfs/caches", S_IFREG | 0444, NULL,
&fscache_caches_seq_ops))
goto error;
- if (!proc_create_seq("fs/fscache/volumes", S_IFREG | 0444, NULL,
+ if (!proc_create_seq("fs/netfs/volumes", S_IFREG | 0444, NULL,
&fscache_volumes_seq_ops))
goto error;
- if (!proc_create_seq("fs/fscache/cookies", S_IFREG | 0444, NULL,
+ if (!proc_create_seq("fs/netfs/cookies", S_IFREG | 0444, NULL,
&fscache_cookies_seq_ops))
goto error;
-
-#ifdef CONFIG_FSCACHE_STATS
- if (!proc_create_single("fs/fscache/stats", S_IFREG | 0444, NULL,
- fscache_stats_show))
- goto error;
-#endif
-
return 0;
error:
remove_proc_entry("fs/fscache", NULL);
-error_dir:
+error_sym:
return -ENOMEM;
}
/*
- * clean up the /proc/fs/fscache/ directory
+ * Clean up the /proc/fs/fscache symlink.
*/
void fscache_proc_cleanup(void)
{
diff --git a/fs/fscache/stats.c b/fs/netfs/fscache_stats.c
index fc94e5e79f..add21abdf7 100644
--- a/fs/fscache/stats.c
+++ b/fs/netfs/fscache_stats.c
@@ -48,13 +48,15 @@ atomic_t fscache_n_no_create_space;
EXPORT_SYMBOL(fscache_n_no_create_space);
atomic_t fscache_n_culled;
EXPORT_SYMBOL(fscache_n_culled);
+atomic_t fscache_n_dio_misfit;
+EXPORT_SYMBOL(fscache_n_dio_misfit);
/*
* display the general statistics
*/
-int fscache_stats_show(struct seq_file *m, void *v)
+int fscache_stats_show(struct seq_file *m)
{
- seq_puts(m, "FS-Cache statistics\n");
+ seq_puts(m, "-- FS-Cache statistics --\n");
seq_printf(m, "Cookies: n=%d v=%d vcol=%u voom=%u\n",
atomic_read(&fscache_n_cookies),
atomic_read(&fscache_n_volumes),
@@ -93,10 +95,9 @@ int fscache_stats_show(struct seq_file *m, void *v)
atomic_read(&fscache_n_no_create_space),
atomic_read(&fscache_n_culled));
- seq_printf(m, "IO : rd=%u wr=%u\n",
+ seq_printf(m, "IO : rd=%u wr=%u mis=%u\n",
atomic_read(&fscache_n_read),
- atomic_read(&fscache_n_write));
-
- netfs_stats_show(m);
+ atomic_read(&fscache_n_write),
+ atomic_read(&fscache_n_dio_misfit));
return 0;
}
diff --git a/fs/fscache/volume.c b/fs/netfs/fscache_volume.c
index cdf991bdd9..cdf991bdd9 100644
--- a/fs/fscache/volume.c
+++ b/fs/netfs/fscache_volume.c
diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h
index 43fac1b14e..ec7045d244 100644
--- a/fs/netfs/internal.h
+++ b/fs/netfs/internal.h
@@ -5,9 +5,13 @@
* Written by David Howells (dhowells@redhat.com)
*/
+#include <linux/slab.h>
+#include <linux/seq_file.h>
#include <linux/netfs.h>
#include <linux/fscache.h>
+#include <linux/fscache-cache.h>
#include <trace/events/netfs.h>
+#include <trace/events/fscache.h>
#ifdef pr_fmt
#undef pr_fmt
@@ -19,6 +23,8 @@
* buffered_read.c
*/
void netfs_rreq_unlock_folios(struct netfs_io_request *rreq);
+int netfs_prefetch_for_write(struct file *file, struct folio *folio,
+ size_t offset, size_t len);
/*
* io.c
@@ -29,6 +35,41 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync);
* main.c
*/
extern unsigned int netfs_debug;
+extern struct list_head netfs_io_requests;
+extern spinlock_t netfs_proc_lock;
+
+#ifdef CONFIG_PROC_FS
+static inline void netfs_proc_add_rreq(struct netfs_io_request *rreq)
+{
+ spin_lock(&netfs_proc_lock);
+ list_add_tail_rcu(&rreq->proc_link, &netfs_io_requests);
+ spin_unlock(&netfs_proc_lock);
+}
+static inline void netfs_proc_del_rreq(struct netfs_io_request *rreq)
+{
+ if (!list_empty(&rreq->proc_link)) {
+ spin_lock(&netfs_proc_lock);
+ list_del_rcu(&rreq->proc_link);
+ spin_unlock(&netfs_proc_lock);
+ }
+}
+#else
+static inline void netfs_proc_add_rreq(struct netfs_io_request *rreq) {}
+static inline void netfs_proc_del_rreq(struct netfs_io_request *rreq) {}
+#endif
+
+/*
+ * misc.c
+ */
+#define NETFS_FLAG_PUT_MARK BIT(0)
+#define NETFS_FLAG_PAGECACHE_MARK BIT(1)
+int netfs_xa_store_and_mark(struct xarray *xa, unsigned long index,
+ struct folio *folio, unsigned int flags,
+ gfp_t gfp_mask);
+int netfs_add_folios_to_buffer(struct xarray *buffer,
+ struct address_space *mapping,
+ pgoff_t index, pgoff_t to, gfp_t gfp_mask);
+void netfs_clear_buffer(struct xarray *buffer);
/*
* objects.c
@@ -50,9 +91,20 @@ static inline void netfs_see_request(struct netfs_io_request *rreq,
}
/*
+ * output.c
+ */
+int netfs_begin_write(struct netfs_io_request *wreq, bool may_wait,
+ enum netfs_write_trace what);
+struct netfs_io_request *netfs_begin_writethrough(struct kiocb *iocb, size_t len);
+int netfs_advance_writethrough(struct netfs_io_request *wreq, size_t copied, bool to_page_end);
+int netfs_end_writethrough(struct netfs_io_request *wreq, struct kiocb *iocb);
+
+/*
* stats.c
*/
#ifdef CONFIG_NETFS_STATS
+extern atomic_t netfs_n_rh_dio_read;
+extern atomic_t netfs_n_rh_dio_write;
extern atomic_t netfs_n_rh_readahead;
extern atomic_t netfs_n_rh_readpage;
extern atomic_t netfs_n_rh_rreq;
@@ -71,7 +123,15 @@ extern atomic_t netfs_n_rh_write_begin;
extern atomic_t netfs_n_rh_write_done;
extern atomic_t netfs_n_rh_write_failed;
extern atomic_t netfs_n_rh_write_zskip;
+extern atomic_t netfs_n_wh_wstream_conflict;
+extern atomic_t netfs_n_wh_upload;
+extern atomic_t netfs_n_wh_upload_done;
+extern atomic_t netfs_n_wh_upload_failed;
+extern atomic_t netfs_n_wh_write;
+extern atomic_t netfs_n_wh_write_done;
+extern atomic_t netfs_n_wh_write_failed;
+int netfs_stats_show(struct seq_file *m, void *v);
static inline void netfs_stat(atomic_t *stat)
{
@@ -103,6 +163,176 @@ static inline bool netfs_is_cache_enabled(struct netfs_inode *ctx)
#endif
}
+/*
+ * Get a ref on a netfs group attached to a dirty page (e.g. a ceph snap).
+ */
+static inline struct netfs_group *netfs_get_group(struct netfs_group *netfs_group)
+{
+ if (netfs_group)
+ refcount_inc(&netfs_group->ref);
+ return netfs_group;
+}
+
+/*
+ * Dispose of a netfs group attached to a dirty page (e.g. a ceph snap).
+ */
+static inline void netfs_put_group(struct netfs_group *netfs_group)
+{
+ if (netfs_group && refcount_dec_and_test(&netfs_group->ref))
+ netfs_group->free(netfs_group);
+}
+
+/*
+ * Dispose of a netfs group attached to a dirty page (e.g. a ceph snap).
+ */
+static inline void netfs_put_group_many(struct netfs_group *netfs_group, int nr)
+{
+ if (netfs_group && refcount_sub_and_test(nr, &netfs_group->ref))
+ netfs_group->free(netfs_group);
+}
+
+/*
+ * fscache-cache.c
+ */
+#ifdef CONFIG_PROC_FS
+extern const struct seq_operations fscache_caches_seq_ops;
+#endif
+bool fscache_begin_cache_access(struct fscache_cache *cache, enum fscache_access_trace why);
+void fscache_end_cache_access(struct fscache_cache *cache, enum fscache_access_trace why);
+struct fscache_cache *fscache_lookup_cache(const char *name, bool is_cache);
+void fscache_put_cache(struct fscache_cache *cache, enum fscache_cache_trace where);
+
+static inline enum fscache_cache_state fscache_cache_state(const struct fscache_cache *cache)
+{
+ return smp_load_acquire(&cache->state);
+}
+
+static inline bool fscache_cache_is_live(const struct fscache_cache *cache)
+{
+ return fscache_cache_state(cache) == FSCACHE_CACHE_IS_ACTIVE;
+}
+
+static inline void fscache_set_cache_state(struct fscache_cache *cache,
+ enum fscache_cache_state new_state)
+{
+ smp_store_release(&cache->state, new_state);
+
+}
+
+static inline bool fscache_set_cache_state_maybe(struct fscache_cache *cache,
+ enum fscache_cache_state old_state,
+ enum fscache_cache_state new_state)
+{
+ return try_cmpxchg_release(&cache->state, &old_state, new_state);
+}
+
+/*
+ * fscache-cookie.c
+ */
+extern struct kmem_cache *fscache_cookie_jar;
+#ifdef CONFIG_PROC_FS
+extern const struct seq_operations fscache_cookies_seq_ops;
+#endif
+extern struct timer_list fscache_cookie_lru_timer;
+
+extern void fscache_print_cookie(struct fscache_cookie *cookie, char prefix);
+extern bool fscache_begin_cookie_access(struct fscache_cookie *cookie,
+ enum fscache_access_trace why);
+
+static inline void fscache_see_cookie(struct fscache_cookie *cookie,
+ enum fscache_cookie_trace where)
+{
+ trace_fscache_cookie(cookie->debug_id, refcount_read(&cookie->ref),
+ where);
+}
+
+/*
+ * fscache-main.c
+ */
+extern unsigned int fscache_hash(unsigned int salt, const void *data, size_t len);
+#ifdef CONFIG_FSCACHE
+int __init fscache_init(void);
+void __exit fscache_exit(void);
+#else
+static inline int fscache_init(void) { return 0; }
+static inline void fscache_exit(void) {}
+#endif
+
+/*
+ * fscache-proc.c
+ */
+#ifdef CONFIG_PROC_FS
+extern int __init fscache_proc_init(void);
+extern void fscache_proc_cleanup(void);
+#else
+#define fscache_proc_init() (0)
+#define fscache_proc_cleanup() do {} while (0)
+#endif
+
+/*
+ * fscache-stats.c
+ */
+#ifdef CONFIG_FSCACHE_STATS
+extern atomic_t fscache_n_volumes;
+extern atomic_t fscache_n_volumes_collision;
+extern atomic_t fscache_n_volumes_nomem;
+extern atomic_t fscache_n_cookies;
+extern atomic_t fscache_n_cookies_lru;
+extern atomic_t fscache_n_cookies_lru_expired;
+extern atomic_t fscache_n_cookies_lru_removed;
+extern atomic_t fscache_n_cookies_lru_dropped;
+
+extern atomic_t fscache_n_acquires;
+extern atomic_t fscache_n_acquires_ok;
+extern atomic_t fscache_n_acquires_oom;
+
+extern atomic_t fscache_n_invalidates;
+
+extern atomic_t fscache_n_relinquishes;
+extern atomic_t fscache_n_relinquishes_retire;
+extern atomic_t fscache_n_relinquishes_dropped;
+
+extern atomic_t fscache_n_resizes;
+extern atomic_t fscache_n_resizes_null;
+
+static inline void fscache_stat(atomic_t *stat)
+{
+ atomic_inc(stat);
+}
+
+static inline void fscache_stat_d(atomic_t *stat)
+{
+ atomic_dec(stat);
+}
+
+#define __fscache_stat(stat) (stat)
+
+int fscache_stats_show(struct seq_file *m);
+#else
+
+#define __fscache_stat(stat) (NULL)
+#define fscache_stat(stat) do {} while (0)
+#define fscache_stat_d(stat) do {} while (0)
+
+static inline int fscache_stats_show(struct seq_file *m) { return 0; }
+#endif
+
+/*
+ * fscache-volume.c
+ */
+#ifdef CONFIG_PROC_FS
+extern const struct seq_operations fscache_volumes_seq_ops;
+#endif
+
+struct fscache_volume *fscache_get_volume(struct fscache_volume *volume,
+ enum fscache_volume_trace where);
+void fscache_put_volume(struct fscache_volume *volume,
+ enum fscache_volume_trace where);
+bool fscache_begin_volume_access(struct fscache_volume *volume,
+ struct fscache_cookie *cookie,
+ enum fscache_access_trace why);
+void fscache_create_volume(struct fscache_volume *volume, bool wait);
+
/*****************************************************************************/
/*
* debug tracing
@@ -143,3 +373,57 @@ do { \
#define _leave(FMT, ...) no_printk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
#define _debug(FMT, ...) no_printk(FMT, ##__VA_ARGS__)
#endif
+
+/*
+ * assertions
+ */
+#if 1 /* defined(__KDEBUGALL) */
+
+#define ASSERT(X) \
+do { \
+ if (unlikely(!(X))) { \
+ pr_err("\n"); \
+ pr_err("Assertion failed\n"); \
+ BUG(); \
+ } \
+} while (0)
+
+#define ASSERTCMP(X, OP, Y) \
+do { \
+ if (unlikely(!((X) OP (Y)))) { \
+ pr_err("\n"); \
+ pr_err("Assertion failed\n"); \
+ pr_err("%lx " #OP " %lx is false\n", \
+ (unsigned long)(X), (unsigned long)(Y)); \
+ BUG(); \
+ } \
+} while (0)
+
+#define ASSERTIF(C, X) \
+do { \
+ if (unlikely((C) && !(X))) { \
+ pr_err("\n"); \
+ pr_err("Assertion failed\n"); \
+ BUG(); \
+ } \
+} while (0)
+
+#define ASSERTIFCMP(C, X, OP, Y) \
+do { \
+ if (unlikely((C) && !((X) OP (Y)))) { \
+ pr_err("\n"); \
+ pr_err("Assertion failed\n"); \
+ pr_err("%lx " #OP " %lx is false\n", \
+ (unsigned long)(X), (unsigned long)(Y)); \
+ BUG(); \
+ } \
+} while (0)
+
+#else
+
+#define ASSERT(X) do {} while (0)
+#define ASSERTCMP(X, OP, Y) do {} while (0)
+#define ASSERTIF(C, X) do {} while (0)
+#define ASSERTIFCMP(C, X, OP, Y) do {} while (0)
+
+#endif /* assert or not */
diff --git a/fs/netfs/io.c b/fs/netfs/io.c
index 7f753380e0..4261ad6c55 100644
--- a/fs/netfs/io.c
+++ b/fs/netfs/io.c
@@ -21,12 +21,7 @@
*/
static void netfs_clear_unread(struct netfs_io_subrequest *subreq)
{
- struct iov_iter iter;
-
- iov_iter_xarray(&iter, ITER_DEST, &subreq->rreq->mapping->i_pages,
- subreq->start + subreq->transferred,
- subreq->len - subreq->transferred);
- iov_iter_zero(iov_iter_count(&iter), &iter);
+ iov_iter_zero(iov_iter_count(&subreq->io_iter), &subreq->io_iter);
}
static void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error,
@@ -46,14 +41,9 @@ static void netfs_read_from_cache(struct netfs_io_request *rreq,
enum netfs_read_from_hole read_hole)
{
struct netfs_cache_resources *cres = &rreq->cache_resources;
- struct iov_iter iter;
netfs_stat(&netfs_n_rh_read);
- iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages,
- subreq->start + subreq->transferred,
- subreq->len - subreq->transferred);
-
- cres->ops->read(cres, subreq->start, &iter, read_hole,
+ cres->ops->read(cres, subreq->start, &subreq->io_iter, read_hole,
netfs_cache_read_terminated, subreq);
}
@@ -88,6 +78,13 @@ static void netfs_read_from_server(struct netfs_io_request *rreq,
struct netfs_io_subrequest *subreq)
{
netfs_stat(&netfs_n_rh_download);
+
+ if (rreq->origin != NETFS_DIO_READ &&
+ iov_iter_count(&subreq->io_iter) != subreq->len - subreq->transferred)
+ pr_warn("R=%08x[%u] ITER PRE-MISMATCH %zx != %zx-%zx %lx\n",
+ rreq->debug_id, subreq->debug_index,
+ iov_iter_count(&subreq->io_iter), subreq->len,
+ subreq->transferred, subreq->flags);
rreq->netfs_ops->issue_read(subreq);
}
@@ -127,9 +124,10 @@ static void netfs_rreq_unmark_after_write(struct netfs_io_request *rreq,
/* We might have multiple writes from the same huge
* folio, but we mustn't unlock a folio more than once.
*/
- if (have_unlocked && folio_index(folio) <= unlocked)
+ if (have_unlocked && folio->index <= unlocked)
continue;
- unlocked = folio_index(folio);
+ unlocked = folio_next_index(folio) - 1;
+ trace_netfs_folio(folio, netfs_folio_trace_end_copy);
folio_end_fscache(folio);
have_unlocked = true;
}
@@ -201,7 +199,7 @@ static void netfs_rreq_do_write_to_cache(struct netfs_io_request *rreq)
}
ret = cres->ops->prepare_write(cres, &subreq->start, &subreq->len,
- rreq->i_size, true);
+ subreq->len, rreq->i_size, true);
if (ret < 0) {
trace_netfs_failure(rreq, subreq, ret, netfs_fail_prepare_write);
trace_netfs_sreq(subreq, netfs_sreq_trace_write_skip);
@@ -260,6 +258,30 @@ static void netfs_rreq_short_read(struct netfs_io_request *rreq,
}
/*
+ * Reset the subrequest iterator prior to resubmission.
+ */
+static void netfs_reset_subreq_iter(struct netfs_io_request *rreq,
+ struct netfs_io_subrequest *subreq)
+{
+ size_t remaining = subreq->len - subreq->transferred;
+ size_t count = iov_iter_count(&subreq->io_iter);
+
+ if (count == remaining)
+ return;
+
+ _debug("R=%08x[%u] ITER RESUB-MISMATCH %zx != %zx-%zx-%llx %x\n",
+ rreq->debug_id, subreq->debug_index,
+ iov_iter_count(&subreq->io_iter), subreq->transferred,
+ subreq->len, rreq->i_size,
+ subreq->io_iter.iter_type);
+
+ if (count < remaining)
+ iov_iter_revert(&subreq->io_iter, remaining - count);
+ else
+ iov_iter_advance(&subreq->io_iter, count - remaining);
+}
+
+/*
* Resubmit any short or failed operations. Returns true if we got the rreq
* ref back.
*/
@@ -287,6 +309,7 @@ static bool netfs_rreq_perform_resubmissions(struct netfs_io_request *rreq)
trace_netfs_sreq(subreq, netfs_sreq_trace_download_instead);
netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
atomic_inc(&rreq->nr_outstanding);
+ netfs_reset_subreq_iter(rreq, subreq);
netfs_read_from_server(rreq, subreq);
} else if (test_bit(NETFS_SREQ_SHORT_IO, &subreq->flags)) {
netfs_rreq_short_read(rreq, subreq);
@@ -321,6 +344,43 @@ static void netfs_rreq_is_still_valid(struct netfs_io_request *rreq)
}
/*
+ * Determine how much we can admit to having read from a DIO read.
+ */
+static void netfs_rreq_assess_dio(struct netfs_io_request *rreq)
+{
+ struct netfs_io_subrequest *subreq;
+ unsigned int i;
+ size_t transferred = 0;
+
+ for (i = 0; i < rreq->direct_bv_count; i++)
+ flush_dcache_page(rreq->direct_bv[i].bv_page);
+
+ list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
+ if (subreq->error || subreq->transferred == 0)
+ break;
+ transferred += subreq->transferred;
+ if (subreq->transferred < subreq->len)
+ break;
+ }
+
+ for (i = 0; i < rreq->direct_bv_count; i++)
+ flush_dcache_page(rreq->direct_bv[i].bv_page);
+
+ rreq->transferred = transferred;
+ task_io_account_read(transferred);
+
+ if (rreq->iocb) {
+ rreq->iocb->ki_pos += transferred;
+ if (rreq->iocb->ki_complete)
+ rreq->iocb->ki_complete(
+ rreq->iocb, rreq->error ? rreq->error : transferred);
+ }
+ if (rreq->netfs_ops->done)
+ rreq->netfs_ops->done(rreq);
+ inode_dio_end(rreq->inode);
+}
+
+/*
* Assess the state of a read request and decide what to do next.
*
* Note that we could be in an ordinary kernel thread, on a workqueue or in
@@ -340,8 +400,12 @@ again:
return;
}
- netfs_rreq_unlock_folios(rreq);
+ if (rreq->origin != NETFS_DIO_READ)
+ netfs_rreq_unlock_folios(rreq);
+ else
+ netfs_rreq_assess_dio(rreq);
+ trace_netfs_rreq(rreq, netfs_rreq_trace_wake_ip);
clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
wake_up_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS);
@@ -399,9 +463,9 @@ void netfs_subreq_terminated(struct netfs_io_subrequest *subreq,
struct netfs_io_request *rreq = subreq->rreq;
int u;
- _enter("[%u]{%llx,%lx},%zd",
- subreq->debug_index, subreq->start, subreq->flags,
- transferred_or_error);
+ _enter("R=%x[%x]{%llx,%lx},%zd",
+ rreq->debug_id, subreq->debug_index,
+ subreq->start, subreq->flags, transferred_or_error);
switch (subreq->source) {
case NETFS_READ_FROM_CACHE:
@@ -501,15 +565,20 @@ static enum netfs_io_source netfs_cache_prepare_read(struct netfs_io_subrequest
*/
static enum netfs_io_source
netfs_rreq_prepare_read(struct netfs_io_request *rreq,
- struct netfs_io_subrequest *subreq)
+ struct netfs_io_subrequest *subreq,
+ struct iov_iter *io_iter)
{
- enum netfs_io_source source;
+ enum netfs_io_source source = NETFS_DOWNLOAD_FROM_SERVER;
+ struct netfs_inode *ictx = netfs_inode(rreq->inode);
+ size_t lsize;
_enter("%llx-%llx,%llx", subreq->start, subreq->start + subreq->len, rreq->i_size);
- source = netfs_cache_prepare_read(subreq, rreq->i_size);
- if (source == NETFS_INVALID_READ)
- goto out;
+ if (rreq->origin != NETFS_DIO_READ) {
+ source = netfs_cache_prepare_read(subreq, rreq->i_size);
+ if (source == NETFS_INVALID_READ)
+ goto out;
+ }
if (source == NETFS_DOWNLOAD_FROM_SERVER) {
/* Call out to the netfs to let it shrink the request to fit
@@ -518,19 +587,52 @@ netfs_rreq_prepare_read(struct netfs_io_request *rreq,
* to make serial calls, it can indicate a short read and then
* we will call it again.
*/
+ if (rreq->origin != NETFS_DIO_READ) {
+ if (subreq->start >= ictx->zero_point) {
+ source = NETFS_FILL_WITH_ZEROES;
+ goto set;
+ }
+ if (subreq->len > ictx->zero_point - subreq->start)
+ subreq->len = ictx->zero_point - subreq->start;
+ }
if (subreq->len > rreq->i_size - subreq->start)
subreq->len = rreq->i_size - subreq->start;
+ if (rreq->rsize && subreq->len > rreq->rsize)
+ subreq->len = rreq->rsize;
if (rreq->netfs_ops->clamp_length &&
!rreq->netfs_ops->clamp_length(subreq)) {
source = NETFS_INVALID_READ;
goto out;
}
+
+ if (subreq->max_nr_segs) {
+ lsize = netfs_limit_iter(io_iter, 0, subreq->len,
+ subreq->max_nr_segs);
+ if (subreq->len > lsize) {
+ subreq->len = lsize;
+ trace_netfs_sreq(subreq, netfs_sreq_trace_limited);
+ }
+ }
}
- if (WARN_ON(subreq->len == 0))
+set:
+ if (subreq->len > rreq->len)
+ pr_warn("R=%08x[%u] SREQ>RREQ %zx > %zx\n",
+ rreq->debug_id, subreq->debug_index,
+ subreq->len, rreq->len);
+
+ if (WARN_ON(subreq->len == 0)) {
source = NETFS_INVALID_READ;
+ goto out;
+ }
+ subreq->source = source;
+ trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);
+
+ subreq->io_iter = *io_iter;
+ iov_iter_truncate(&subreq->io_iter, subreq->len);
+ iov_iter_advance(io_iter, subreq->len);
out:
subreq->source = source;
trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);
@@ -541,6 +643,7 @@ out:
* Slice off a piece of a read request and submit an I/O request for it.
*/
static bool netfs_rreq_submit_slice(struct netfs_io_request *rreq,
+ struct iov_iter *io_iter,
unsigned int *_debug_index)
{
struct netfs_io_subrequest *subreq;
@@ -552,7 +655,7 @@ static bool netfs_rreq_submit_slice(struct netfs_io_request *rreq,
subreq->debug_index = (*_debug_index)++;
subreq->start = rreq->start + rreq->submitted;
- subreq->len = rreq->len - rreq->submitted;
+ subreq->len = io_iter->count;
_debug("slice %llx,%zx,%zx", subreq->start, subreq->len, rreq->submitted);
list_add_tail(&subreq->rreq_link, &rreq->subrequests);
@@ -565,7 +668,7 @@ static bool netfs_rreq_submit_slice(struct netfs_io_request *rreq,
* (the starts must coincide), in which case, we go around the loop
* again and ask it to download the next piece.
*/
- source = netfs_rreq_prepare_read(rreq, subreq);
+ source = netfs_rreq_prepare_read(rreq, subreq, io_iter);
if (source == NETFS_INVALID_READ)
goto subreq_failed;
@@ -603,6 +706,7 @@ subreq_failed:
*/
int netfs_begin_read(struct netfs_io_request *rreq, bool sync)
{
+ struct iov_iter io_iter;
unsigned int debug_index = 0;
int ret;
@@ -611,50 +715,73 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync)
if (rreq->len == 0) {
pr_err("Zero-sized read [R=%x]\n", rreq->debug_id);
- netfs_put_request(rreq, false, netfs_rreq_trace_put_zero_len);
return -EIO;
}
- INIT_WORK(&rreq->work, netfs_rreq_work);
+ if (rreq->origin == NETFS_DIO_READ)
+ inode_dio_begin(rreq->inode);
- if (sync)
- netfs_get_request(rreq, netfs_rreq_trace_get_hold);
+ // TODO: Use bounce buffer if requested
+ rreq->io_iter = rreq->iter;
+
+ INIT_WORK(&rreq->work, netfs_rreq_work);
/* Chop the read into slices according to what the cache and the netfs
* want and submit each one.
*/
+ netfs_get_request(rreq, netfs_rreq_trace_get_for_outstanding);
atomic_set(&rreq->nr_outstanding, 1);
+ io_iter = rreq->io_iter;
do {
- if (!netfs_rreq_submit_slice(rreq, &debug_index))
+ _debug("submit %llx + %zx >= %llx",
+ rreq->start, rreq->submitted, rreq->i_size);
+ if (rreq->origin == NETFS_DIO_READ &&
+ rreq->start + rreq->submitted >= rreq->i_size)
+ break;
+ if (!netfs_rreq_submit_slice(rreq, &io_iter, &debug_index))
+ break;
+ if (test_bit(NETFS_RREQ_BLOCKED, &rreq->flags) &&
+ test_bit(NETFS_RREQ_NONBLOCK, &rreq->flags))
break;
} while (rreq->submitted < rreq->len);
+ if (!rreq->submitted) {
+ netfs_put_request(rreq, false, netfs_rreq_trace_put_no_submit);
+ if (rreq->origin == NETFS_DIO_READ)
+ inode_dio_end(rreq->inode);
+ ret = 0;
+ goto out;
+ }
+
if (sync) {
- /* Keep nr_outstanding incremented so that the ref always belongs to
- * us, and the service code isn't punted off to a random thread pool to
- * process.
+ /* Keep nr_outstanding incremented so that the ref always
+ * belongs to us, and the service code isn't punted off to a
+ * random thread pool to process. Note that this might start
+ * further work, such as writing to the cache.
*/
- for (;;) {
- wait_var_event(&rreq->nr_outstanding,
- atomic_read(&rreq->nr_outstanding) == 1);
+ wait_var_event(&rreq->nr_outstanding,
+ atomic_read(&rreq->nr_outstanding) == 1);
+ if (atomic_dec_and_test(&rreq->nr_outstanding))
netfs_rreq_assess(rreq, false);
- if (!test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags))
- break;
- cond_resched();
- }
+
+ trace_netfs_rreq(rreq, netfs_rreq_trace_wait_ip);
+ wait_on_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS,
+ TASK_UNINTERRUPTIBLE);
ret = rreq->error;
- if (ret == 0 && rreq->submitted < rreq->len) {
+ if (ret == 0 && rreq->submitted < rreq->len &&
+ rreq->origin != NETFS_DIO_READ) {
trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_read);
ret = -EIO;
}
- netfs_put_request(rreq, false, netfs_rreq_trace_put_hold);
} else {
/* If we decrement nr_outstanding to 0, the ref belongs to us. */
if (atomic_dec_and_test(&rreq->nr_outstanding))
netfs_rreq_assess(rreq, false);
- ret = 0;
+ ret = -EIOCBQUEUED;
}
+
+out:
return ret;
}
diff --git a/fs/netfs/iterator.c b/fs/netfs/iterator.c
index 2ff07ba655..b781bbbf1d 100644
--- a/fs/netfs/iterator.c
+++ b/fs/netfs/iterator.c
@@ -101,3 +101,100 @@ ssize_t netfs_extract_user_iter(struct iov_iter *orig, size_t orig_len,
return npages;
}
EXPORT_SYMBOL_GPL(netfs_extract_user_iter);
+
+/*
+ * Select the span of a bvec iterator we're going to use. Limit it by both maximum
+ * size and maximum number of segments. Returns the size of the span in bytes.
+ */
+static size_t netfs_limit_bvec(const struct iov_iter *iter, size_t start_offset,
+ size_t max_size, size_t max_segs)
+{
+ const struct bio_vec *bvecs = iter->bvec;
+ unsigned int nbv = iter->nr_segs, ix = 0, nsegs = 0;
+ size_t len, span = 0, n = iter->count;
+ size_t skip = iter->iov_offset + start_offset;
+
+ if (WARN_ON(!iov_iter_is_bvec(iter)) ||
+ WARN_ON(start_offset > n) ||
+ n == 0)
+ return 0;
+
+ while (n && ix < nbv && skip) {
+ len = bvecs[ix].bv_len;
+ if (skip < len)
+ break;
+ skip -= len;
+ n -= len;
+ ix++;
+ }
+
+ while (n && ix < nbv) {
+ len = min3(n, bvecs[ix].bv_len - skip, max_size);
+ span += len;
+ nsegs++;
+ ix++;
+ if (span >= max_size || nsegs >= max_segs)
+ break;
+ skip = 0;
+ n -= len;
+ }
+
+ return min(span, max_size);
+}
+
+/*
+ * Select the span of an xarray iterator we're going to use. Limit it by both
+ * maximum size and maximum number of segments. It is assumed that segments
+ * can be larger than a page in size, provided they're physically contiguous.
+ * Returns the size of the span in bytes.
+ */
+static size_t netfs_limit_xarray(const struct iov_iter *iter, size_t start_offset,
+ size_t max_size, size_t max_segs)
+{
+ struct folio *folio;
+ unsigned int nsegs = 0;
+ loff_t pos = iter->xarray_start + iter->iov_offset;
+ pgoff_t index = pos / PAGE_SIZE;
+ size_t span = 0, n = iter->count;
+
+ XA_STATE(xas, iter->xarray, index);
+
+ if (WARN_ON(!iov_iter_is_xarray(iter)) ||
+ WARN_ON(start_offset > n) ||
+ n == 0)
+ return 0;
+ max_size = min(max_size, n - start_offset);
+
+ rcu_read_lock();
+ xas_for_each(&xas, folio, ULONG_MAX) {
+ size_t offset, flen, len;
+ if (xas_retry(&xas, folio))
+ continue;
+ if (WARN_ON(xa_is_value(folio)))
+ break;
+ if (WARN_ON(folio_test_hugetlb(folio)))
+ break;
+
+ flen = folio_size(folio);
+ offset = offset_in_folio(folio, pos);
+ len = min(max_size, flen - offset);
+ span += len;
+ nsegs++;
+ if (span >= max_size || nsegs >= max_segs)
+ break;
+ }
+
+ rcu_read_unlock();
+ return min(span, max_size);
+}
+
+size_t netfs_limit_iter(const struct iov_iter *iter, size_t start_offset,
+ size_t max_size, size_t max_segs)
+{
+ if (iov_iter_is_bvec(iter))
+ return netfs_limit_bvec(iter, start_offset, max_size, max_segs);
+ if (iov_iter_is_xarray(iter))
+ return netfs_limit_xarray(iter, start_offset, max_size, max_segs);
+ BUG();
+}
+EXPORT_SYMBOL(netfs_limit_iter);
diff --git a/fs/netfs/locking.c b/fs/netfs/locking.c
new file mode 100644
index 0000000000..75dc52a49b
--- /dev/null
+++ b/fs/netfs/locking.c
@@ -0,0 +1,216 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * I/O and data path helper functionality.
+ *
+ * Borrowed from NFS Copyright (c) 2016 Trond Myklebust
+ */
+
+#include <linux/kernel.h>
+#include <linux/netfs.h>
+#include "internal.h"
+
+/*
+ * inode_dio_wait_interruptible - wait for outstanding DIO requests to finish
+ * @inode: inode to wait for
+ *
+ * Waits for all pending direct I/O requests to finish so that we can
+ * proceed with a truncate or equivalent operation.
+ *
+ * Must be called under a lock that serializes taking new references
+ * to i_dio_count, usually by inode->i_mutex.
+ */
+static int inode_dio_wait_interruptible(struct inode *inode)
+{
+ if (!atomic_read(&inode->i_dio_count))
+ return 0;
+
+ wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_DIO_WAKEUP);
+ DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP);
+
+ for (;;) {
+ prepare_to_wait(wq, &q.wq_entry, TASK_INTERRUPTIBLE);
+ if (!atomic_read(&inode->i_dio_count))
+ break;
+ if (signal_pending(current))
+ break;
+ schedule();
+ }
+ finish_wait(wq, &q.wq_entry);
+
+ return atomic_read(&inode->i_dio_count) ? -ERESTARTSYS : 0;
+}
+
+/* Call with exclusively locked inode->i_rwsem */
+static int netfs_block_o_direct(struct netfs_inode *ictx)
+{
+ if (!test_bit(NETFS_ICTX_ODIRECT, &ictx->flags))
+ return 0;
+ clear_bit(NETFS_ICTX_ODIRECT, &ictx->flags);
+ return inode_dio_wait_interruptible(&ictx->inode);
+}
+
+/**
+ * netfs_start_io_read - declare the file is being used for buffered reads
+ * @inode: file inode
+ *
+ * Declare that a buffered read operation is about to start, and ensure
+ * that we block all direct I/O.
+ * On exit, the function ensures that the NETFS_ICTX_ODIRECT flag is unset,
+ * and holds a shared lock on inode->i_rwsem to ensure that the flag
+ * cannot be changed.
+ * In practice, this means that buffered read operations are allowed to
+ * execute in parallel, thanks to the shared lock, whereas direct I/O
+ * operations need to wait to grab an exclusive lock in order to set
+ * NETFS_ICTX_ODIRECT.
+ * Note that buffered writes and truncates both take a write lock on
+ * inode->i_rwsem, meaning that those are serialised w.r.t. the reads.
+ */
+int netfs_start_io_read(struct inode *inode)
+ __acquires(inode->i_rwsem)
+{
+ struct netfs_inode *ictx = netfs_inode(inode);
+
+ /* Be an optimist! */
+ if (down_read_interruptible(&inode->i_rwsem) < 0)
+ return -ERESTARTSYS;
+ if (test_bit(NETFS_ICTX_ODIRECT, &ictx->flags) == 0)
+ return 0;
+ up_read(&inode->i_rwsem);
+
+ /* Slow path.... */
+ if (down_write_killable(&inode->i_rwsem) < 0)
+ return -ERESTARTSYS;
+ if (netfs_block_o_direct(ictx) < 0) {
+ up_write(&inode->i_rwsem);
+ return -ERESTARTSYS;
+ }
+ downgrade_write(&inode->i_rwsem);
+ return 0;
+}
+EXPORT_SYMBOL(netfs_start_io_read);
+
+/**
+ * netfs_end_io_read - declare that the buffered read operation is done
+ * @inode: file inode
+ *
+ * Declare that a buffered read operation is done, and release the shared
+ * lock on inode->i_rwsem.
+ */
+void netfs_end_io_read(struct inode *inode)
+ __releases(inode->i_rwsem)
+{
+ up_read(&inode->i_rwsem);
+}
+EXPORT_SYMBOL(netfs_end_io_read);
+
+/**
+ * netfs_start_io_write - declare the file is being used for buffered writes
+ * @inode: file inode
+ *
+ * Declare that a buffered read operation is about to start, and ensure
+ * that we block all direct I/O.
+ */
+int netfs_start_io_write(struct inode *inode)
+ __acquires(inode->i_rwsem)
+{
+ struct netfs_inode *ictx = netfs_inode(inode);
+
+ if (down_write_killable(&inode->i_rwsem) < 0)
+ return -ERESTARTSYS;
+ if (netfs_block_o_direct(ictx) < 0) {
+ up_write(&inode->i_rwsem);
+ return -ERESTARTSYS;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(netfs_start_io_write);
+
+/**
+ * netfs_end_io_write - declare that the buffered write operation is done
+ * @inode: file inode
+ *
+ * Declare that a buffered write operation is done, and release the
+ * lock on inode->i_rwsem.
+ */
+void netfs_end_io_write(struct inode *inode)
+ __releases(inode->i_rwsem)
+{
+ up_write(&inode->i_rwsem);
+}
+EXPORT_SYMBOL(netfs_end_io_write);
+
+/* Call with exclusively locked inode->i_rwsem */
+static int netfs_block_buffered(struct inode *inode)
+{
+ struct netfs_inode *ictx = netfs_inode(inode);
+ int ret;
+
+ if (!test_bit(NETFS_ICTX_ODIRECT, &ictx->flags)) {
+ set_bit(NETFS_ICTX_ODIRECT, &ictx->flags);
+ if (inode->i_mapping->nrpages != 0) {
+ unmap_mapping_range(inode->i_mapping, 0, 0, 0);
+ ret = filemap_fdatawait(inode->i_mapping);
+ if (ret < 0) {
+ clear_bit(NETFS_ICTX_ODIRECT, &ictx->flags);
+ return ret;
+ }
+ }
+ }
+ return 0;
+}
+
+/**
+ * netfs_start_io_direct - declare the file is being used for direct i/o
+ * @inode: file inode
+ *
+ * Declare that a direct I/O operation is about to start, and ensure
+ * that we block all buffered I/O.
+ * On exit, the function ensures that the NETFS_ICTX_ODIRECT flag is set,
+ * and holds a shared lock on inode->i_rwsem to ensure that the flag
+ * cannot be changed.
+ * In practice, this means that direct I/O operations are allowed to
+ * execute in parallel, thanks to the shared lock, whereas buffered I/O
+ * operations need to wait to grab an exclusive lock in order to clear
+ * NETFS_ICTX_ODIRECT.
+ * Note that buffered writes and truncates both take a write lock on
+ * inode->i_rwsem, meaning that those are serialised w.r.t. O_DIRECT.
+ */
+int netfs_start_io_direct(struct inode *inode)
+ __acquires(inode->i_rwsem)
+{
+ struct netfs_inode *ictx = netfs_inode(inode);
+ int ret;
+
+ /* Be an optimist! */
+ if (down_read_interruptible(&inode->i_rwsem) < 0)
+ return -ERESTARTSYS;
+ if (test_bit(NETFS_ICTX_ODIRECT, &ictx->flags) != 0)
+ return 0;
+ up_read(&inode->i_rwsem);
+
+ /* Slow path.... */
+ if (down_write_killable(&inode->i_rwsem) < 0)
+ return -ERESTARTSYS;
+ ret = netfs_block_buffered(inode);
+ if (ret < 0) {
+ up_write(&inode->i_rwsem);
+ return ret;
+ }
+ downgrade_write(&inode->i_rwsem);
+ return 0;
+}
+EXPORT_SYMBOL(netfs_start_io_direct);
+
+/**
+ * netfs_end_io_direct - declare that the direct i/o operation is done
+ * @inode: file inode
+ *
+ * Declare that a direct I/O operation is done, and release the shared
+ * lock on inode->i_rwsem.
+ */
+void netfs_end_io_direct(struct inode *inode)
+ __releases(inode->i_rwsem)
+{
+ up_read(&inode->i_rwsem);
+}
+EXPORT_SYMBOL(netfs_end_io_direct);
diff --git a/fs/netfs/main.c b/fs/netfs/main.c
index 0685687029..5e77618a79 100644
--- a/fs/netfs/main.c
+++ b/fs/netfs/main.c
@@ -7,6 +7,8 @@
#include <linux/module.h>
#include <linux/export.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
#include "internal.h"
#define CREATE_TRACE_POINTS
#include <trace/events/netfs.h>
@@ -15,6 +17,113 @@ MODULE_DESCRIPTION("Network fs support");
MODULE_AUTHOR("Red Hat, Inc.");
MODULE_LICENSE("GPL");
+EXPORT_TRACEPOINT_SYMBOL(netfs_sreq);
+
unsigned netfs_debug;
module_param_named(debug, netfs_debug, uint, S_IWUSR | S_IRUGO);
MODULE_PARM_DESC(netfs_debug, "Netfs support debugging mask");
+
+#ifdef CONFIG_PROC_FS
+LIST_HEAD(netfs_io_requests);
+DEFINE_SPINLOCK(netfs_proc_lock);
+
+static const char *netfs_origins[nr__netfs_io_origin] = {
+ [NETFS_READAHEAD] = "RA",
+ [NETFS_READPAGE] = "RP",
+ [NETFS_READ_FOR_WRITE] = "RW",
+ [NETFS_WRITEBACK] = "WB",
+ [NETFS_WRITETHROUGH] = "WT",
+ [NETFS_LAUNDER_WRITE] = "LW",
+ [NETFS_UNBUFFERED_WRITE] = "UW",
+ [NETFS_DIO_READ] = "DR",
+ [NETFS_DIO_WRITE] = "DW",
+};
+
+/*
+ * Generate a list of I/O requests in /proc/fs/netfs/requests
+ */
+static int netfs_requests_seq_show(struct seq_file *m, void *v)
+{
+ struct netfs_io_request *rreq;
+
+ if (v == &netfs_io_requests) {
+ seq_puts(m,
+ "REQUEST OR REF FL ERR OPS COVERAGE\n"
+ "======== == === == ==== === =========\n"
+ );
+ return 0;
+ }
+
+ rreq = list_entry(v, struct netfs_io_request, proc_link);
+ seq_printf(m,
+ "%08x %s %3d %2lx %4d %3d @%04llx %zx/%zx",
+ rreq->debug_id,
+ netfs_origins[rreq->origin],
+ refcount_read(&rreq->ref),
+ rreq->flags,
+ rreq->error,
+ atomic_read(&rreq->nr_outstanding),
+ rreq->start, rreq->submitted, rreq->len);
+ seq_putc(m, '\n');
+ return 0;
+}
+
+static void *netfs_requests_seq_start(struct seq_file *m, loff_t *_pos)
+ __acquires(rcu)
+{
+ rcu_read_lock();
+ return seq_list_start_head(&netfs_io_requests, *_pos);
+}
+
+static void *netfs_requests_seq_next(struct seq_file *m, void *v, loff_t *_pos)
+{
+ return seq_list_next(v, &netfs_io_requests, _pos);
+}
+
+static void netfs_requests_seq_stop(struct seq_file *m, void *v)
+ __releases(rcu)
+{
+ rcu_read_unlock();
+}
+
+static const struct seq_operations netfs_requests_seq_ops = {
+ .start = netfs_requests_seq_start,
+ .next = netfs_requests_seq_next,
+ .stop = netfs_requests_seq_stop,
+ .show = netfs_requests_seq_show,
+};
+#endif /* CONFIG_PROC_FS */
+
+static int __init netfs_init(void)
+{
+ int ret = -ENOMEM;
+
+ if (!proc_mkdir("fs/netfs", NULL))
+ goto error;
+ if (!proc_create_seq("fs/netfs/requests", S_IFREG | 0444, NULL,
+ &netfs_requests_seq_ops))
+ goto error_proc;
+#ifdef CONFIG_FSCACHE_STATS
+ if (!proc_create_single("fs/netfs/stats", S_IFREG | 0444, NULL,
+ netfs_stats_show))
+ goto error_proc;
+#endif
+
+ ret = fscache_init();
+ if (ret < 0)
+ goto error_proc;
+ return 0;
+
+error_proc:
+ remove_proc_entry("fs/netfs", NULL);
+error:
+ return ret;
+}
+fs_initcall(netfs_init);
+
+static void __exit netfs_exit(void)
+{
+ fscache_exit();
+ remove_proc_entry("fs/netfs", NULL);
+}
+module_exit(netfs_exit);
diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c
new file mode 100644
index 0000000000..90051ced8e
--- /dev/null
+++ b/fs/netfs/misc.c
@@ -0,0 +1,260 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Miscellaneous routines.
+ *
+ * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/swap.h>
+#include "internal.h"
+
+/*
+ * Attach a folio to the buffer and maybe set marks on it to say that we need
+ * to put the folio later and twiddle the pagecache flags.
+ */
+int netfs_xa_store_and_mark(struct xarray *xa, unsigned long index,
+ struct folio *folio, unsigned int flags,
+ gfp_t gfp_mask)
+{
+ XA_STATE_ORDER(xas, xa, index, folio_order(folio));
+
+retry:
+ xas_lock(&xas);
+ for (;;) {
+ xas_store(&xas, folio);
+ if (!xas_error(&xas))
+ break;
+ xas_unlock(&xas);
+ if (!xas_nomem(&xas, gfp_mask))
+ return xas_error(&xas);
+ goto retry;
+ }
+
+ if (flags & NETFS_FLAG_PUT_MARK)
+ xas_set_mark(&xas, NETFS_BUF_PUT_MARK);
+ if (flags & NETFS_FLAG_PAGECACHE_MARK)
+ xas_set_mark(&xas, NETFS_BUF_PAGECACHE_MARK);
+ xas_unlock(&xas);
+ return xas_error(&xas);
+}
+
+/*
+ * Create the specified range of folios in the buffer attached to the read
+ * request. The folios are marked with NETFS_BUF_PUT_MARK so that we know that
+ * these need freeing later.
+ */
+int netfs_add_folios_to_buffer(struct xarray *buffer,
+ struct address_space *mapping,
+ pgoff_t index, pgoff_t to, gfp_t gfp_mask)
+{
+ struct folio *folio;
+ int ret;
+
+ if (to + 1 == index) /* Page range is inclusive */
+ return 0;
+
+ do {
+ /* TODO: Figure out what order folio can be allocated here */
+ folio = filemap_alloc_folio(readahead_gfp_mask(mapping), 0);
+ if (!folio)
+ return -ENOMEM;
+ folio->index = index;
+ ret = netfs_xa_store_and_mark(buffer, index, folio,
+ NETFS_FLAG_PUT_MARK, gfp_mask);
+ if (ret < 0) {
+ folio_put(folio);
+ return ret;
+ }
+
+ index += folio_nr_pages(folio);
+ } while (index <= to && index != 0);
+
+ return 0;
+}
+
+/*
+ * Clear an xarray buffer, putting a ref on the folios that have
+ * NETFS_BUF_PUT_MARK set.
+ */
+void netfs_clear_buffer(struct xarray *buffer)
+{
+ struct folio *folio;
+ XA_STATE(xas, buffer, 0);
+
+ rcu_read_lock();
+ xas_for_each_marked(&xas, folio, ULONG_MAX, NETFS_BUF_PUT_MARK) {
+ folio_put(folio);
+ }
+ rcu_read_unlock();
+ xa_destroy(buffer);
+}
+
+/**
+ * netfs_dirty_folio - Mark folio dirty and pin a cache object for writeback
+ * @mapping: The mapping the folio belongs to.
+ * @folio: The folio being dirtied.
+ *
+ * Set the dirty flag on a folio and pin an in-use cache object in memory so
+ * that writeback can later write to it. This is intended to be called from
+ * the filesystem's ->dirty_folio() method.
+ *
+ * Return: true if the dirty flag was set on the folio, false otherwise.
+ */
+bool netfs_dirty_folio(struct address_space *mapping, struct folio *folio)
+{
+ struct inode *inode = mapping->host;
+ struct netfs_inode *ictx = netfs_inode(inode);
+ struct fscache_cookie *cookie = netfs_i_cookie(ictx);
+ bool need_use = false;
+
+ _enter("");
+
+ if (!filemap_dirty_folio(mapping, folio))
+ return false;
+ if (!fscache_cookie_valid(cookie))
+ return true;
+
+ if (!(inode->i_state & I_PINNING_NETFS_WB)) {
+ spin_lock(&inode->i_lock);
+ if (!(inode->i_state & I_PINNING_NETFS_WB)) {
+ inode->i_state |= I_PINNING_NETFS_WB;
+ need_use = true;
+ }
+ spin_unlock(&inode->i_lock);
+
+ if (need_use)
+ fscache_use_cookie(cookie, true);
+ }
+ return true;
+}
+EXPORT_SYMBOL(netfs_dirty_folio);
+
+/**
+ * netfs_unpin_writeback - Unpin writeback resources
+ * @inode: The inode on which the cookie resides
+ * @wbc: The writeback control
+ *
+ * Unpin the writeback resources pinned by netfs_dirty_folio(). This is
+ * intended to be called as/by the netfs's ->write_inode() method.
+ */
+int netfs_unpin_writeback(struct inode *inode, struct writeback_control *wbc)
+{
+ struct fscache_cookie *cookie = netfs_i_cookie(netfs_inode(inode));
+
+ if (wbc->unpinned_netfs_wb)
+ fscache_unuse_cookie(cookie, NULL, NULL);
+ return 0;
+}
+EXPORT_SYMBOL(netfs_unpin_writeback);
+
+/**
+ * netfs_clear_inode_writeback - Clear writeback resources pinned by an inode
+ * @inode: The inode to clean up
+ * @aux: Auxiliary data to apply to the inode
+ *
+ * Clear any writeback resources held by an inode when the inode is evicted.
+ * This must be called before clear_inode() is called.
+ */
+void netfs_clear_inode_writeback(struct inode *inode, const void *aux)
+{
+ struct fscache_cookie *cookie = netfs_i_cookie(netfs_inode(inode));
+
+ if (inode->i_state & I_PINNING_NETFS_WB) {
+ loff_t i_size = i_size_read(inode);
+ fscache_unuse_cookie(cookie, aux, &i_size);
+ }
+}
+EXPORT_SYMBOL(netfs_clear_inode_writeback);
+
+/**
+ * netfs_invalidate_folio - Invalidate or partially invalidate a folio
+ * @folio: Folio proposed for release
+ * @offset: Offset of the invalidated region
+ * @length: Length of the invalidated region
+ *
+ * Invalidate part or all of a folio for a network filesystem. The folio will
+ * be removed afterwards if the invalidated region covers the entire folio.
+ */
+void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length)
+{
+ struct netfs_folio *finfo = NULL;
+ size_t flen = folio_size(folio);
+
+ _enter("{%lx},%zx,%zx", folio->index, offset, length);
+
+ folio_wait_fscache(folio);
+
+ if (!folio_test_private(folio))
+ return;
+
+ finfo = netfs_folio_info(folio);
+
+ if (offset == 0 && length >= flen)
+ goto erase_completely;
+
+ if (finfo) {
+ /* We have a partially uptodate page from a streaming write. */
+ unsigned int fstart = finfo->dirty_offset;
+ unsigned int fend = fstart + finfo->dirty_len;
+ unsigned int end = offset + length;
+
+ if (offset >= fend)
+ return;
+ if (end <= fstart)
+ return;
+ if (offset <= fstart && end >= fend)
+ goto erase_completely;
+ if (offset <= fstart && end > fstart)
+ goto reduce_len;
+ if (offset > fstart && end >= fend)
+ goto move_start;
+ /* A partial write was split. The caller has already zeroed
+ * it, so just absorb the hole.
+ */
+ }
+ return;
+
+erase_completely:
+ netfs_put_group(netfs_folio_group(folio));
+ folio_detach_private(folio);
+ folio_clear_uptodate(folio);
+ kfree(finfo);
+ return;
+reduce_len:
+ finfo->dirty_len = offset + length - finfo->dirty_offset;
+ return;
+move_start:
+ finfo->dirty_len -= offset - finfo->dirty_offset;
+ finfo->dirty_offset = offset;
+}
+EXPORT_SYMBOL(netfs_invalidate_folio);
+
+/**
+ * netfs_release_folio - Try to release a folio
+ * @folio: Folio proposed for release
+ * @gfp: Flags qualifying the release
+ *
+ * Request release of a folio and clean up its private state if it's not busy.
+ * Returns true if the folio can now be released, false if not
+ */
+bool netfs_release_folio(struct folio *folio, gfp_t gfp)
+{
+ struct netfs_inode *ctx = netfs_inode(folio_inode(folio));
+ unsigned long long end;
+
+ end = folio_pos(folio) + folio_size(folio);
+ if (end > ctx->zero_point)
+ ctx->zero_point = end;
+
+ if (folio_test_private(folio))
+ return false;
+ if (folio_test_fscache(folio)) {
+ if (current_is_kswapd() || !(gfp & __GFP_FS))
+ return false;
+ folio_wait_fscache(folio);
+ }
+
+ fscache_note_page_release(netfs_i_cookie(ctx));
+ return true;
+}
+EXPORT_SYMBOL(netfs_release_folio);
diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c
index e17cdf53f6..610ceb5bd8 100644
--- a/fs/netfs/objects.c
+++ b/fs/netfs/objects.c
@@ -20,14 +20,20 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
struct inode *inode = file ? file_inode(file) : mapping->host;
struct netfs_inode *ctx = netfs_inode(inode);
struct netfs_io_request *rreq;
+ bool is_unbuffered = (origin == NETFS_UNBUFFERED_WRITE ||
+ origin == NETFS_DIO_READ ||
+ origin == NETFS_DIO_WRITE);
+ bool cached = !is_unbuffered && netfs_is_cache_enabled(ctx);
int ret;
- rreq = kzalloc(sizeof(struct netfs_io_request), GFP_KERNEL);
+ rreq = kzalloc(ctx->ops->io_request_size ?: sizeof(struct netfs_io_request),
+ GFP_KERNEL);
if (!rreq)
return ERR_PTR(-ENOMEM);
rreq->start = start;
rreq->len = len;
+ rreq->upper_len = len;
rreq->origin = origin;
rreq->netfs_ops = ctx->ops;
rreq->mapping = mapping;
@@ -35,8 +41,14 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
rreq->i_size = i_size_read(inode);
rreq->debug_id = atomic_inc_return(&debug_ids);
INIT_LIST_HEAD(&rreq->subrequests);
+ INIT_WORK(&rreq->work, NULL);
refcount_set(&rreq->ref, 1);
+
__set_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
+ if (cached)
+ __set_bit(NETFS_RREQ_WRITE_TO_CACHE, &rreq->flags);
+ if (file && file->f_flags & O_NONBLOCK)
+ __set_bit(NETFS_RREQ_NONBLOCK, &rreq->flags);
if (rreq->netfs_ops->init_request) {
ret = rreq->netfs_ops->init_request(rreq, file);
if (ret < 0) {
@@ -45,6 +57,8 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
}
}
+ trace_netfs_rreq_ref(rreq->debug_id, 1, netfs_rreq_trace_new);
+ netfs_proc_add_rreq(rreq);
netfs_stat(&netfs_n_rh_rreq);
return rreq;
}
@@ -74,33 +88,47 @@ static void netfs_free_request(struct work_struct *work)
{
struct netfs_io_request *rreq =
container_of(work, struct netfs_io_request, work);
+ unsigned int i;
trace_netfs_rreq(rreq, netfs_rreq_trace_free);
+ netfs_proc_del_rreq(rreq);
netfs_clear_subrequests(rreq, false);
if (rreq->netfs_ops->free_request)
rreq->netfs_ops->free_request(rreq);
if (rreq->cache_resources.ops)
rreq->cache_resources.ops->end_operation(&rreq->cache_resources);
- kfree(rreq);
+ if (rreq->direct_bv) {
+ for (i = 0; i < rreq->direct_bv_count; i++) {
+ if (rreq->direct_bv[i].bv_page) {
+ if (rreq->direct_bv_unpin)
+ unpin_user_page(rreq->direct_bv[i].bv_page);
+ }
+ }
+ kvfree(rreq->direct_bv);
+ }
+ kfree_rcu(rreq, rcu);
netfs_stat_d(&netfs_n_rh_rreq);
}
void netfs_put_request(struct netfs_io_request *rreq, bool was_async,
enum netfs_rreq_ref_trace what)
{
- unsigned int debug_id = rreq->debug_id;
+ unsigned int debug_id;
bool dead;
int r;
- dead = __refcount_dec_and_test(&rreq->ref, &r);
- trace_netfs_rreq_ref(debug_id, r - 1, what);
- if (dead) {
- if (was_async) {
- rreq->work.func = netfs_free_request;
- if (!queue_work(system_unbound_wq, &rreq->work))
- BUG();
- } else {
- netfs_free_request(&rreq->work);
+ if (rreq) {
+ debug_id = rreq->debug_id;
+ dead = __refcount_dec_and_test(&rreq->ref, &r);
+ trace_netfs_rreq_ref(debug_id, r - 1, what);
+ if (dead) {
+ if (was_async) {
+ rreq->work.func = netfs_free_request;
+ if (!queue_work(system_unbound_wq, &rreq->work))
+ BUG();
+ } else {
+ netfs_free_request(&rreq->work);
+ }
}
}
}
@@ -112,8 +140,11 @@ struct netfs_io_subrequest *netfs_alloc_subrequest(struct netfs_io_request *rreq
{
struct netfs_io_subrequest *subreq;
- subreq = kzalloc(sizeof(struct netfs_io_subrequest), GFP_KERNEL);
+ subreq = kzalloc(rreq->netfs_ops->io_subrequest_size ?:
+ sizeof(struct netfs_io_subrequest),
+ GFP_KERNEL);
if (subreq) {
+ INIT_WORK(&subreq->work, NULL);
INIT_LIST_HEAD(&subreq->rreq_link);
refcount_set(&subreq->ref, 2);
subreq->rreq = rreq;
@@ -140,6 +171,8 @@ static void netfs_free_subrequest(struct netfs_io_subrequest *subreq,
struct netfs_io_request *rreq = subreq->rreq;
trace_netfs_sreq(subreq, netfs_sreq_trace_free);
+ if (rreq->netfs_ops->free_subrequest)
+ rreq->netfs_ops->free_subrequest(subreq);
kfree(subreq);
netfs_stat_d(&netfs_n_rh_sreq);
netfs_put_request(rreq, was_async, netfs_rreq_trace_put_subreq);
diff --git a/fs/netfs/output.c b/fs/netfs/output.c
new file mode 100644
index 0000000000..625eb68f3e
--- /dev/null
+++ b/fs/netfs/output.c
@@ -0,0 +1,478 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Network filesystem high-level write support.
+ *
+ * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/writeback.h>
+#include <linux/pagevec.h>
+#include "internal.h"
+
+/**
+ * netfs_create_write_request - Create a write operation.
+ * @wreq: The write request this is storing from.
+ * @dest: The destination type
+ * @start: Start of the region this write will modify
+ * @len: Length of the modification
+ * @worker: The worker function to handle the write(s)
+ *
+ * Allocate a write operation, set it up and add it to the list on a write
+ * request.
+ */
+struct netfs_io_subrequest *netfs_create_write_request(struct netfs_io_request *wreq,
+ enum netfs_io_source dest,
+ loff_t start, size_t len,
+ work_func_t worker)
+{
+ struct netfs_io_subrequest *subreq;
+
+ subreq = netfs_alloc_subrequest(wreq);
+ if (subreq) {
+ INIT_WORK(&subreq->work, worker);
+ subreq->source = dest;
+ subreq->start = start;
+ subreq->len = len;
+ subreq->debug_index = wreq->subreq_counter++;
+
+ switch (subreq->source) {
+ case NETFS_UPLOAD_TO_SERVER:
+ netfs_stat(&netfs_n_wh_upload);
+ break;
+ case NETFS_WRITE_TO_CACHE:
+ netfs_stat(&netfs_n_wh_write);
+ break;
+ default:
+ BUG();
+ }
+
+ subreq->io_iter = wreq->io_iter;
+ iov_iter_advance(&subreq->io_iter, subreq->start - wreq->start);
+ iov_iter_truncate(&subreq->io_iter, subreq->len);
+
+ trace_netfs_sreq_ref(wreq->debug_id, subreq->debug_index,
+ refcount_read(&subreq->ref),
+ netfs_sreq_trace_new);
+ atomic_inc(&wreq->nr_outstanding);
+ list_add_tail(&subreq->rreq_link, &wreq->subrequests);
+ trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);
+ }
+
+ return subreq;
+}
+EXPORT_SYMBOL(netfs_create_write_request);
+
+/*
+ * Process a completed write request once all the component operations have
+ * been completed.
+ */
+static void netfs_write_terminated(struct netfs_io_request *wreq, bool was_async)
+{
+ struct netfs_io_subrequest *subreq;
+ struct netfs_inode *ctx = netfs_inode(wreq->inode);
+ size_t transferred = 0;
+
+ _enter("R=%x[]", wreq->debug_id);
+
+ trace_netfs_rreq(wreq, netfs_rreq_trace_write_done);
+
+ list_for_each_entry(subreq, &wreq->subrequests, rreq_link) {
+ if (subreq->error || subreq->transferred == 0)
+ break;
+ transferred += subreq->transferred;
+ if (subreq->transferred < subreq->len)
+ break;
+ }
+ wreq->transferred = transferred;
+
+ list_for_each_entry(subreq, &wreq->subrequests, rreq_link) {
+ if (!subreq->error)
+ continue;
+ switch (subreq->source) {
+ case NETFS_UPLOAD_TO_SERVER:
+ /* Depending on the type of failure, this may prevent
+ * writeback completion unless we're in disconnected
+ * mode.
+ */
+ if (!wreq->error)
+ wreq->error = subreq->error;
+ break;
+
+ case NETFS_WRITE_TO_CACHE:
+ /* Failure doesn't prevent writeback completion unless
+ * we're in disconnected mode.
+ */
+ if (subreq->error != -ENOBUFS)
+ ctx->ops->invalidate_cache(wreq);
+ break;
+
+ default:
+ WARN_ON_ONCE(1);
+ if (!wreq->error)
+ wreq->error = -EIO;
+ return;
+ }
+ }
+
+ wreq->cleanup(wreq);
+
+ if (wreq->origin == NETFS_DIO_WRITE &&
+ wreq->mapping->nrpages) {
+ pgoff_t first = wreq->start >> PAGE_SHIFT;
+ pgoff_t last = (wreq->start + wreq->transferred - 1) >> PAGE_SHIFT;
+ invalidate_inode_pages2_range(wreq->mapping, first, last);
+ }
+
+ if (wreq->origin == NETFS_DIO_WRITE)
+ inode_dio_end(wreq->inode);
+
+ _debug("finished");
+ trace_netfs_rreq(wreq, netfs_rreq_trace_wake_ip);
+ clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &wreq->flags);
+ wake_up_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS);
+
+ if (wreq->iocb) {
+ wreq->iocb->ki_pos += transferred;
+ if (wreq->iocb->ki_complete)
+ wreq->iocb->ki_complete(
+ wreq->iocb, wreq->error ? wreq->error : transferred);
+ }
+
+ netfs_clear_subrequests(wreq, was_async);
+ netfs_put_request(wreq, was_async, netfs_rreq_trace_put_complete);
+}
+
+/*
+ * Deal with the completion of writing the data to the cache.
+ */
+void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error,
+ bool was_async)
+{
+ struct netfs_io_subrequest *subreq = _op;
+ struct netfs_io_request *wreq = subreq->rreq;
+ unsigned int u;
+
+ _enter("%x[%x] %zd", wreq->debug_id, subreq->debug_index, transferred_or_error);
+
+ switch (subreq->source) {
+ case NETFS_UPLOAD_TO_SERVER:
+ netfs_stat(&netfs_n_wh_upload_done);
+ break;
+ case NETFS_WRITE_TO_CACHE:
+ netfs_stat(&netfs_n_wh_write_done);
+ break;
+ case NETFS_INVALID_WRITE:
+ break;
+ default:
+ BUG();
+ }
+
+ if (IS_ERR_VALUE(transferred_or_error)) {
+ subreq->error = transferred_or_error;
+ trace_netfs_failure(wreq, subreq, transferred_or_error,
+ netfs_fail_write);
+ goto failed;
+ }
+
+ if (WARN(transferred_or_error > subreq->len - subreq->transferred,
+ "Subreq excess write: R%x[%x] %zd > %zu - %zu",
+ wreq->debug_id, subreq->debug_index,
+ transferred_or_error, subreq->len, subreq->transferred))
+ transferred_or_error = subreq->len - subreq->transferred;
+
+ subreq->error = 0;
+ subreq->transferred += transferred_or_error;
+
+ if (iov_iter_count(&subreq->io_iter) != subreq->len - subreq->transferred)
+ pr_warn("R=%08x[%u] ITER POST-MISMATCH %zx != %zx-%zx %x\n",
+ wreq->debug_id, subreq->debug_index,
+ iov_iter_count(&subreq->io_iter), subreq->len,
+ subreq->transferred, subreq->io_iter.iter_type);
+
+ if (subreq->transferred < subreq->len)
+ goto incomplete;
+
+ __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
+out:
+ trace_netfs_sreq(subreq, netfs_sreq_trace_terminated);
+
+ /* If we decrement nr_outstanding to 0, the ref belongs to us. */
+ u = atomic_dec_return(&wreq->nr_outstanding);
+ if (u == 0)
+ netfs_write_terminated(wreq, was_async);
+ else if (u == 1)
+ wake_up_var(&wreq->nr_outstanding);
+
+ netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated);
+ return;
+
+incomplete:
+ if (transferred_or_error == 0) {
+ if (__test_and_set_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags)) {
+ subreq->error = -ENODATA;
+ goto failed;
+ }
+ } else {
+ __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
+ }
+
+ __set_bit(NETFS_SREQ_SHORT_IO, &subreq->flags);
+ set_bit(NETFS_RREQ_INCOMPLETE_IO, &wreq->flags);
+ goto out;
+
+failed:
+ switch (subreq->source) {
+ case NETFS_WRITE_TO_CACHE:
+ netfs_stat(&netfs_n_wh_write_failed);
+ set_bit(NETFS_RREQ_INCOMPLETE_IO, &wreq->flags);
+ break;
+ case NETFS_UPLOAD_TO_SERVER:
+ netfs_stat(&netfs_n_wh_upload_failed);
+ set_bit(NETFS_RREQ_FAILED, &wreq->flags);
+ wreq->error = subreq->error;
+ break;
+ default:
+ break;
+ }
+ goto out;
+}
+EXPORT_SYMBOL(netfs_write_subrequest_terminated);
+
+static void netfs_write_to_cache_op(struct netfs_io_subrequest *subreq)
+{
+ struct netfs_io_request *wreq = subreq->rreq;
+ struct netfs_cache_resources *cres = &wreq->cache_resources;
+
+ trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
+
+ cres->ops->write(cres, subreq->start, &subreq->io_iter,
+ netfs_write_subrequest_terminated, subreq);
+}
+
+static void netfs_write_to_cache_op_worker(struct work_struct *work)
+{
+ struct netfs_io_subrequest *subreq =
+ container_of(work, struct netfs_io_subrequest, work);
+
+ netfs_write_to_cache_op(subreq);
+}
+
+/**
+ * netfs_queue_write_request - Queue a write request for attention
+ * @subreq: The write request to be queued
+ *
+ * Queue the specified write request for processing by a worker thread. We
+ * pass the caller's ref on the request to the worker thread.
+ */
+void netfs_queue_write_request(struct netfs_io_subrequest *subreq)
+{
+ if (!queue_work(system_unbound_wq, &subreq->work))
+ netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_wip);
+}
+EXPORT_SYMBOL(netfs_queue_write_request);
+
+/*
+ * Set up a op for writing to the cache.
+ */
+static void netfs_set_up_write_to_cache(struct netfs_io_request *wreq)
+{
+ struct netfs_cache_resources *cres = &wreq->cache_resources;
+ struct netfs_io_subrequest *subreq;
+ struct netfs_inode *ctx = netfs_inode(wreq->inode);
+ struct fscache_cookie *cookie = netfs_i_cookie(ctx);
+ loff_t start = wreq->start;
+ size_t len = wreq->len;
+ int ret;
+
+ if (!fscache_cookie_enabled(cookie)) {
+ clear_bit(NETFS_RREQ_WRITE_TO_CACHE, &wreq->flags);
+ return;
+ }
+
+ _debug("write to cache");
+ ret = fscache_begin_write_operation(cres, cookie);
+ if (ret < 0)
+ return;
+
+ ret = cres->ops->prepare_write(cres, &start, &len, wreq->upper_len,
+ i_size_read(wreq->inode), true);
+ if (ret < 0)
+ return;
+
+ subreq = netfs_create_write_request(wreq, NETFS_WRITE_TO_CACHE, start, len,
+ netfs_write_to_cache_op_worker);
+ if (!subreq)
+ return;
+
+ netfs_write_to_cache_op(subreq);
+}
+
+/*
+ * Begin the process of writing out a chunk of data.
+ *
+ * We are given a write request that holds a series of dirty regions and
+ * (partially) covers a sequence of folios, all of which are present. The
+ * pages must have been marked as writeback as appropriate.
+ *
+ * We need to perform the following steps:
+ *
+ * (1) If encrypting, create an output buffer and encrypt each block of the
+ * data into it, otherwise the output buffer will point to the original
+ * folios.
+ *
+ * (2) If the data is to be cached, set up a write op for the entire output
+ * buffer to the cache, if the cache wants to accept it.
+ *
+ * (3) If the data is to be uploaded (ie. not merely cached):
+ *
+ * (a) If the data is to be compressed, create a compression buffer and
+ * compress the data into it.
+ *
+ * (b) For each destination we want to upload to, set up write ops to write
+ * to that destination. We may need multiple writes if the data is not
+ * contiguous or the span exceeds wsize for a server.
+ */
+int netfs_begin_write(struct netfs_io_request *wreq, bool may_wait,
+ enum netfs_write_trace what)
+{
+ struct netfs_inode *ctx = netfs_inode(wreq->inode);
+
+ _enter("R=%x %llx-%llx f=%lx",
+ wreq->debug_id, wreq->start, wreq->start + wreq->len - 1,
+ wreq->flags);
+
+ trace_netfs_write(wreq, what);
+ if (wreq->len == 0 || wreq->iter.count == 0) {
+ pr_err("Zero-sized write [R=%x]\n", wreq->debug_id);
+ return -EIO;
+ }
+
+ if (wreq->origin == NETFS_DIO_WRITE)
+ inode_dio_begin(wreq->inode);
+
+ wreq->io_iter = wreq->iter;
+
+ /* ->outstanding > 0 carries a ref */
+ netfs_get_request(wreq, netfs_rreq_trace_get_for_outstanding);
+ atomic_set(&wreq->nr_outstanding, 1);
+
+ /* Start the encryption/compression going. We can do that in the
+ * background whilst we generate a list of write ops that we want to
+ * perform.
+ */
+ // TODO: Encrypt or compress the region as appropriate
+
+ /* We need to write all of the region to the cache */
+ if (test_bit(NETFS_RREQ_WRITE_TO_CACHE, &wreq->flags))
+ netfs_set_up_write_to_cache(wreq);
+
+ /* However, we don't necessarily write all of the region to the server.
+ * Caching of reads is being managed this way also.
+ */
+ if (test_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags))
+ ctx->ops->create_write_requests(wreq, wreq->start, wreq->len);
+
+ if (atomic_dec_and_test(&wreq->nr_outstanding))
+ netfs_write_terminated(wreq, false);
+
+ if (!may_wait)
+ return -EIOCBQUEUED;
+
+ wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS,
+ TASK_UNINTERRUPTIBLE);
+ return wreq->error;
+}
+
+/*
+ * Begin a write operation for writing through the pagecache.
+ */
+struct netfs_io_request *netfs_begin_writethrough(struct kiocb *iocb, size_t len)
+{
+ struct netfs_io_request *wreq;
+ struct file *file = iocb->ki_filp;
+
+ wreq = netfs_alloc_request(file->f_mapping, file, iocb->ki_pos, len,
+ NETFS_WRITETHROUGH);
+ if (IS_ERR(wreq))
+ return wreq;
+
+ trace_netfs_write(wreq, netfs_write_trace_writethrough);
+
+ __set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
+ iov_iter_xarray(&wreq->iter, ITER_SOURCE, &wreq->mapping->i_pages, wreq->start, 0);
+ wreq->io_iter = wreq->iter;
+
+ /* ->outstanding > 0 carries a ref */
+ netfs_get_request(wreq, netfs_rreq_trace_get_for_outstanding);
+ atomic_set(&wreq->nr_outstanding, 1);
+ return wreq;
+}
+
+static void netfs_submit_writethrough(struct netfs_io_request *wreq, bool final)
+{
+ struct netfs_inode *ictx = netfs_inode(wreq->inode);
+ unsigned long long start;
+ size_t len;
+
+ if (!test_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags))
+ return;
+
+ start = wreq->start + wreq->submitted;
+ len = wreq->iter.count - wreq->submitted;
+ if (!final) {
+ len /= wreq->wsize; /* Round to number of maximum packets */
+ len *= wreq->wsize;
+ }
+
+ ictx->ops->create_write_requests(wreq, start, len);
+ wreq->submitted += len;
+}
+
+/*
+ * Advance the state of the write operation used when writing through the
+ * pagecache. Data has been copied into the pagecache that we need to append
+ * to the request. If we've added more than wsize then we need to create a new
+ * subrequest.
+ */
+int netfs_advance_writethrough(struct netfs_io_request *wreq, size_t copied, bool to_page_end)
+{
+ _enter("ic=%zu sb=%zu ws=%u cp=%zu tp=%u",
+ wreq->iter.count, wreq->submitted, wreq->wsize, copied, to_page_end);
+
+ wreq->iter.count += copied;
+ wreq->io_iter.count += copied;
+ if (to_page_end && wreq->io_iter.count - wreq->submitted >= wreq->wsize)
+ netfs_submit_writethrough(wreq, false);
+
+ return wreq->error;
+}
+
+/*
+ * End a write operation used when writing through the pagecache.
+ */
+int netfs_end_writethrough(struct netfs_io_request *wreq, struct kiocb *iocb)
+{
+ int ret = -EIOCBQUEUED;
+
+ _enter("ic=%zu sb=%zu ws=%u",
+ wreq->iter.count, wreq->submitted, wreq->wsize);
+
+ if (wreq->submitted < wreq->io_iter.count)
+ netfs_submit_writethrough(wreq, true);
+
+ if (atomic_dec_and_test(&wreq->nr_outstanding))
+ netfs_write_terminated(wreq, false);
+
+ if (is_sync_kiocb(iocb)) {
+ wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS,
+ TASK_UNINTERRUPTIBLE);
+ ret = wreq->error;
+ }
+
+ netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
+ return ret;
+}
diff --git a/fs/netfs/stats.c b/fs/netfs/stats.c
index 5510a7a14a..deeba9f9dc 100644
--- a/fs/netfs/stats.c
+++ b/fs/netfs/stats.c
@@ -9,6 +9,8 @@
#include <linux/seq_file.h>
#include "internal.h"
+atomic_t netfs_n_rh_dio_read;
+atomic_t netfs_n_rh_dio_write;
atomic_t netfs_n_rh_readahead;
atomic_t netfs_n_rh_readpage;
atomic_t netfs_n_rh_rreq;
@@ -27,32 +29,48 @@ atomic_t netfs_n_rh_write_begin;
atomic_t netfs_n_rh_write_done;
atomic_t netfs_n_rh_write_failed;
atomic_t netfs_n_rh_write_zskip;
+atomic_t netfs_n_wh_wstream_conflict;
+atomic_t netfs_n_wh_upload;
+atomic_t netfs_n_wh_upload_done;
+atomic_t netfs_n_wh_upload_failed;
+atomic_t netfs_n_wh_write;
+atomic_t netfs_n_wh_write_done;
+atomic_t netfs_n_wh_write_failed;
-void netfs_stats_show(struct seq_file *m)
+int netfs_stats_show(struct seq_file *m, void *v)
{
- seq_printf(m, "RdHelp : RA=%u RP=%u WB=%u WBZ=%u rr=%u sr=%u\n",
+ seq_printf(m, "Netfs : DR=%u DW=%u RA=%u RP=%u WB=%u WBZ=%u\n",
+ atomic_read(&netfs_n_rh_dio_read),
+ atomic_read(&netfs_n_rh_dio_write),
atomic_read(&netfs_n_rh_readahead),
atomic_read(&netfs_n_rh_readpage),
atomic_read(&netfs_n_rh_write_begin),
- atomic_read(&netfs_n_rh_write_zskip),
- atomic_read(&netfs_n_rh_rreq),
- atomic_read(&netfs_n_rh_sreq));
- seq_printf(m, "RdHelp : ZR=%u sh=%u sk=%u\n",
+ atomic_read(&netfs_n_rh_write_zskip));
+ seq_printf(m, "Netfs : ZR=%u sh=%u sk=%u\n",
atomic_read(&netfs_n_rh_zero),
atomic_read(&netfs_n_rh_short_read),
atomic_read(&netfs_n_rh_write_zskip));
- seq_printf(m, "RdHelp : DL=%u ds=%u df=%u di=%u\n",
+ seq_printf(m, "Netfs : DL=%u ds=%u df=%u di=%u\n",
atomic_read(&netfs_n_rh_download),
atomic_read(&netfs_n_rh_download_done),
atomic_read(&netfs_n_rh_download_failed),
atomic_read(&netfs_n_rh_download_instead));
- seq_printf(m, "RdHelp : RD=%u rs=%u rf=%u\n",
+ seq_printf(m, "Netfs : RD=%u rs=%u rf=%u\n",
atomic_read(&netfs_n_rh_read),
atomic_read(&netfs_n_rh_read_done),
atomic_read(&netfs_n_rh_read_failed));
- seq_printf(m, "RdHelp : WR=%u ws=%u wf=%u\n",
- atomic_read(&netfs_n_rh_write),
- atomic_read(&netfs_n_rh_write_done),
- atomic_read(&netfs_n_rh_write_failed));
+ seq_printf(m, "Netfs : UL=%u us=%u uf=%u\n",
+ atomic_read(&netfs_n_wh_upload),
+ atomic_read(&netfs_n_wh_upload_done),
+ atomic_read(&netfs_n_wh_upload_failed));
+ seq_printf(m, "Netfs : WR=%u ws=%u wf=%u\n",
+ atomic_read(&netfs_n_wh_write),
+ atomic_read(&netfs_n_wh_write_done),
+ atomic_read(&netfs_n_wh_write_failed));
+ seq_printf(m, "Netfs : rr=%u sr=%u wsc=%u\n",
+ atomic_read(&netfs_n_rh_rreq),
+ atomic_read(&netfs_n_rh_sreq),
+ atomic_read(&netfs_n_wh_wstream_conflict));
+ return fscache_stats_show(m);
}
EXPORT_SYMBOL(netfs_stats_show);