diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-08-07 13:17:52 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-08-07 13:17:52 +0000 |
commit | 3afb00d3f86d3d924f88b56fa8285d4e9db85852 (patch) | |
tree | 95a985d3019522cea546b7d8df621369bc44fc6c /fs/netfs | |
parent | Adding debian version 6.9.12-1. (diff) | |
download | linux-3afb00d3f86d3d924f88b56fa8285d4e9db85852.tar.xz linux-3afb00d3f86d3d924f88b56fa8285d4e9db85852.zip |
Merging upstream version 6.10.3.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'fs/netfs')
-rw-r--r-- | fs/netfs/Makefile | 3 | ||||
-rw-r--r-- | fs/netfs/buffered_read.c | 54 | ||||
-rw-r--r-- | fs/netfs/buffered_write.c | 843 | ||||
-rw-r--r-- | fs/netfs/direct_read.c | 5 | ||||
-rw-r--r-- | fs/netfs/direct_write.c | 68 | ||||
-rw-r--r-- | fs/netfs/fscache_cache.c | 4 | ||||
-rw-r--r-- | fs/netfs/fscache_cookie.c | 28 | ||||
-rw-r--r-- | fs/netfs/fscache_io.c | 26 | ||||
-rw-r--r-- | fs/netfs/fscache_main.c | 2 | ||||
-rw-r--r-- | fs/netfs/fscache_volume.c | 4 | ||||
-rw-r--r-- | fs/netfs/internal.h | 97 | ||||
-rw-r--r-- | fs/netfs/io.c | 170 | ||||
-rw-r--r-- | fs/netfs/main.c | 57 | ||||
-rw-r--r-- | fs/netfs/misc.c | 95 | ||||
-rw-r--r-- | fs/netfs/objects.c | 86 | ||||
-rw-r--r-- | fs/netfs/output.c | 478 | ||||
-rw-r--r-- | fs/netfs/stats.c | 17 | ||||
-rw-r--r-- | fs/netfs/write_collect.c | 809 | ||||
-rw-r--r-- | fs/netfs/write_issue.c | 690 |
19 files changed, 1884 insertions, 1652 deletions
diff --git a/fs/netfs/Makefile b/fs/netfs/Makefile index d4d1d79981..8e6781e0b1 100644 --- a/fs/netfs/Makefile +++ b/fs/netfs/Makefile @@ -11,7 +11,8 @@ netfs-y := \ main.o \ misc.o \ objects.o \ - output.o + write_collect.o \ + write_issue.o netfs-$(CONFIG_NETFS_STATS) += stats.o diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index 3298c29b55..4c0401dbbf 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -10,8 +10,11 @@ #include "internal.h" /* - * Unlock the folios in a read operation. We need to set PG_fscache on any + * Unlock the folios in a read operation. We need to set PG_writeback on any * folios we're going to write back before we unlock them. + * + * Note that if the deprecated NETFS_RREQ_USE_PGPRIV2 is set then we use + * PG_private_2 and do a direct write to the cache from here instead. */ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) { @@ -48,14 +51,14 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) xas_for_each(&xas, folio, last_page) { loff_t pg_end; bool pg_failed = false; - bool folio_started; + bool wback_to_cache = false; + bool folio_started = false; if (xas_retry(&xas, folio)) continue; pg_end = folio_pos(folio) + folio_size(folio) - 1; - folio_started = false; for (;;) { loff_t sreq_end; @@ -63,10 +66,16 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) pg_failed = true; break; } - if (!folio_started && test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) { - trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache); - folio_start_fscache(folio); - folio_started = true; + if (test_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags)) { + if (!folio_started && test_bit(NETFS_SREQ_COPY_TO_CACHE, + &subreq->flags)) { + trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache); + folio_start_private_2(folio); + folio_started = true; + } + } else { + wback_to_cache |= + test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags); } pg_failed |= subreq_failed; sreq_end = subreq->start + subreq->len - 1; @@ -98,12 +107,17 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) kfree(finfo); } folio_mark_uptodate(folio); + if (wback_to_cache && !WARN_ON_ONCE(folio_get_private(folio) != NULL)) { + trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache); + folio_attach_private(folio, NETFS_FOLIO_COPY_TO_CACHE); + filemap_dirty_folio(folio->mapping, folio); + } } if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) { if (folio->index == rreq->no_unlock_folio && test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags)) - _debug("no unlock"); + kdebug("no unlock"); else folio_unlock(folio); } @@ -116,7 +130,9 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) } static void netfs_cache_expand_readahead(struct netfs_io_request *rreq, - loff_t *_start, size_t *_len, loff_t i_size) + unsigned long long *_start, + unsigned long long *_len, + unsigned long long i_size) { struct netfs_cache_resources *cres = &rreq->cache_resources; @@ -188,7 +204,7 @@ void netfs_readahead(struct readahead_control *ractl) struct netfs_inode *ctx = netfs_inode(ractl->mapping->host); int ret; - _enter("%lx,%x", readahead_index(ractl), readahead_count(ractl)); + kenter("%lx,%x", readahead_index(ractl), readahead_count(ractl)); if (readahead_count(ractl) == 0) return; @@ -252,7 +268,7 @@ int netfs_read_folio(struct file *file, struct folio *folio) struct folio *sink = NULL; int ret; - _enter("%lx", folio->index); + kenter("%lx", folio->index); rreq = netfs_alloc_request(mapping, file, folio_file_pos(folio), folio_size(folio), @@ -266,7 +282,7 @@ int netfs_read_folio(struct file *file, struct folio *folio) if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) goto discard; - netfs_stat(&netfs_n_rh_readpage); + netfs_stat(&netfs_n_rh_read_folio); trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_readpage); /* Set up the output buffer */ @@ -450,7 +466,7 @@ retry: if (!netfs_is_cache_enabled(ctx) && netfs_skip_folio_read(folio, pos, len, false)) { netfs_stat(&netfs_n_rh_write_zskip); - goto have_folio_no_wait; + goto have_folio; } rreq = netfs_alloc_request(mapping, file, @@ -491,12 +507,8 @@ retry: netfs_put_request(rreq, false, netfs_rreq_trace_put_return); have_folio: - ret = folio_wait_fscache_killable(folio); - if (ret < 0) - goto error; -have_folio_no_wait: *_folio = folio; - _leave(" = 0"); + kleave(" = 0"); return 0; error_put: @@ -506,7 +518,7 @@ error: folio_unlock(folio); folio_put(folio); } - _leave(" = %d", ret); + kleave(" = %d", ret); return ret; } EXPORT_SYMBOL(netfs_write_begin); @@ -524,7 +536,7 @@ int netfs_prefetch_for_write(struct file *file, struct folio *folio, size_t flen = folio_size(folio); int ret; - _enter("%zx @%llx", flen, start); + kenter("%zx @%llx", flen, start); ret = -ENOMEM; @@ -555,7 +567,7 @@ int netfs_prefetch_for_write(struct file *file, struct folio *folio, error_put: netfs_put_request(rreq, false, netfs_rreq_trace_put_discard); error: - _leave(" = %d", ret); + kleave(" = %d", ret); return ret; } diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c index e6066ddbb3..ecbc99ec7d 100644 --- a/fs/netfs/buffered_write.c +++ b/fs/netfs/buffered_write.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* Network filesystem high-level write support. +/* Network filesystem high-level buffered write support. * * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) @@ -26,25 +26,15 @@ enum netfs_how_to_modify { NETFS_FLUSH_CONTENT, /* Flush incompatible content. */ }; -static void netfs_cleanup_buffered_write(struct netfs_io_request *wreq); - static void netfs_set_group(struct folio *folio, struct netfs_group *netfs_group) { - if (netfs_group && !folio_get_private(folio)) - folio_attach_private(folio, netfs_get_group(netfs_group)); -} + void *priv = folio_get_private(folio); -#if IS_ENABLED(CONFIG_FSCACHE) -static void netfs_folio_start_fscache(bool caching, struct folio *folio) -{ - if (caching) - folio_start_fscache(folio); -} -#else -static void netfs_folio_start_fscache(bool caching, struct folio *folio) -{ + if (netfs_group && (!priv || priv == NETFS_FOLIO_COPY_TO_CACHE)) + folio_attach_private(folio, netfs_get_group(netfs_group)); + else if (!netfs_group && priv == NETFS_FOLIO_COPY_TO_CACHE) + folio_detach_private(folio); } -#endif /* * Decide how we should modify a folio. We might be attempting to do @@ -63,11 +53,12 @@ static enum netfs_how_to_modify netfs_how_to_modify(struct netfs_inode *ctx, bool maybe_trouble) { struct netfs_folio *finfo = netfs_folio_info(folio); + struct netfs_group *group = netfs_folio_group(folio); loff_t pos = folio_file_pos(folio); - _enter(""); + kenter(""); - if (netfs_folio_group(folio) != netfs_group) + if (group != netfs_group && group != NETFS_FOLIO_COPY_TO_CACHE) return NETFS_FLUSH_CONTENT; if (folio_test_uptodate(folio)) @@ -81,16 +72,12 @@ static enum netfs_how_to_modify netfs_how_to_modify(struct netfs_inode *ctx, if (file->f_mode & FMODE_READ) goto no_write_streaming; - if (test_bit(NETFS_ICTX_NO_WRITE_STREAMING, &ctx->flags)) - goto no_write_streaming; if (netfs_is_cache_enabled(ctx)) { /* We don't want to get a streaming write on a file that loses * caching service temporarily because the backing store got * culled. */ - if (!test_bit(NETFS_ICTX_NO_WRITE_STREAMING, &ctx->flags)) - set_bit(NETFS_ICTX_NO_WRITE_STREAMING, &ctx->flags); goto no_write_streaming; } @@ -130,6 +117,37 @@ static struct folio *netfs_grab_folio_for_write(struct address_space *mapping, mapping_gfp_mask(mapping)); } +/* + * Update i_size and estimate the update to i_blocks to reflect the additional + * data written into the pagecache until we can find out from the server what + * the values actually are. + */ +static void netfs_update_i_size(struct netfs_inode *ctx, struct inode *inode, + loff_t i_size, loff_t pos, size_t copied) +{ + blkcnt_t add; + size_t gap; + + if (ctx->ops->update_i_size) { + ctx->ops->update_i_size(inode, pos); + return; + } + + i_size_write(inode, pos); +#if IS_ENABLED(CONFIG_FSCACHE) + fscache_update_cookie(ctx->cache, NULL, &pos); +#endif + + gap = SECTOR_SIZE - (i_size & (SECTOR_SIZE - 1)); + if (copied > gap) { + add = DIV_ROUND_UP(copied - gap, SECTOR_SIZE); + + inode->i_blocks = min_t(blkcnt_t, + DIV_ROUND_UP(pos, SECTOR_SIZE), + inode->i_blocks + add); + } +} + /** * netfs_perform_write - Copy data into the pagecache. * @iocb: The operation parameters @@ -160,7 +178,7 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, }; struct netfs_io_request *wreq = NULL; struct netfs_folio *finfo; - struct folio *folio; + struct folio *folio, *writethrough = NULL; enum netfs_how_to_modify howto; enum netfs_folio_trace trace; unsigned int bdp_flags = (iocb->ki_flags & IOCB_NOWAIT) ? BDP_ASYNC : 0; @@ -189,7 +207,9 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, } if (!is_sync_kiocb(iocb)) wreq->iocb = iocb; - wreq->cleanup = netfs_cleanup_buffered_write; + netfs_stat(&netfs_n_wh_writethrough); + } else { + netfs_stat(&netfs_n_wh_buffered_write); } do { @@ -230,6 +250,16 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, offset = pos & (flen - 1); part = min_t(size_t, flen - offset, part); + /* Wait for writeback to complete. The writeback engine owns + * the info in folio->private and may change it until it + * removes the WB mark. + */ + if (folio_get_private(folio) && + folio_wait_writeback_killable(folio)) { + ret = written ? -EINTR : -ERESTARTSYS; + goto error_folio_unlock; + } + if (signal_pending(current)) { ret = written ? -EINTR : -ERESTARTSYS; goto error_folio_unlock; @@ -242,12 +272,12 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, */ howto = netfs_how_to_modify(ctx, file, folio, netfs_group, flen, offset, part, maybe_trouble); - _debug("howto %u", howto); + kdebug("howto %u", howto); switch (howto) { case NETFS_JUST_PREFETCH: ret = netfs_prefetch_for_write(file, folio, offset, part); if (ret < 0) { - _debug("prefetch = %zd", ret); + kdebug("prefetch = %zd", ret); goto error_folio_unlock; } break; @@ -304,6 +334,7 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, maybe_trouble = true; iov_iter_revert(iter, copied); copied = 0; + folio_unlock(folio); goto retry; } netfs_set_group(folio, netfs_group); @@ -351,41 +382,22 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, trace_netfs_folio(folio, trace); /* Update the inode size if we moved the EOF marker */ - i_size = i_size_read(inode); pos += copied; - if (pos > i_size) { - if (ctx->ops->update_i_size) { - ctx->ops->update_i_size(inode, pos); - } else { - i_size_write(inode, pos); -#if IS_ENABLED(CONFIG_FSCACHE) - fscache_update_cookie(ctx->cache, NULL, &pos); -#endif - } - } + i_size = i_size_read(inode); + if (pos > i_size) + netfs_update_i_size(ctx, inode, i_size, pos, copied); written += copied; if (likely(!wreq)) { folio_mark_dirty(folio); + folio_unlock(folio); } else { - if (folio_test_dirty(folio)) - /* Sigh. mmap. */ - folio_clear_dirty_for_io(folio); - /* We make multiple writes to the folio... */ - if (!folio_test_writeback(folio)) { - folio_wait_fscache(folio); - folio_start_writeback(folio); - folio_start_fscache(folio); - if (wreq->iter.count == 0) - trace_netfs_folio(folio, netfs_folio_trace_wthru); - else - trace_netfs_folio(folio, netfs_folio_trace_wthru_plus); - } - netfs_advance_writethrough(wreq, copied, - offset + copied == flen); + netfs_advance_writethrough(wreq, &wbc, folio, copied, + offset + copied == flen, + &writethrough); + /* Folio unlocked */ } retry: - folio_unlock(folio); folio_put(folio); folio = NULL; @@ -393,8 +405,11 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, } while (iov_iter_count(iter)); out: + if (likely(written) && ctx->ops->post_modify) + ctx->ops->post_modify(inode); + if (unlikely(wreq)) { - ret2 = netfs_end_writethrough(wreq, iocb); + ret2 = netfs_end_writethrough(wreq, &wbc, writethrough); wbc_detach_inode(&wbc); if (ret2 == -EIOCBQUEUED) return ret2; @@ -403,7 +418,7 @@ out: } iocb->ki_pos += written; - _leave(" = %zd [%zd]", written, ret); + kleave(" = %zd [%zd]", written, ret); return written ? written : ret; error_folio_unlock: @@ -476,7 +491,7 @@ ssize_t netfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) struct netfs_inode *ictx = netfs_inode(inode); ssize_t ret; - _enter("%llx,%zx,%llx", iocb->ki_pos, iov_iter_count(from), i_size_read(inode)); + kenter("%llx,%zx,%llx", iocb->ki_pos, iov_iter_count(from), i_size_read(inode)); if (!iov_iter_count(from)) return 0; @@ -505,20 +520,19 @@ EXPORT_SYMBOL(netfs_file_write_iter); */ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_group) { + struct netfs_group *group; struct folio *folio = page_folio(vmf->page); struct file *file = vmf->vma->vm_file; struct address_space *mapping = file->f_mapping; struct inode *inode = file_inode(file); + struct netfs_inode *ictx = netfs_inode(inode); vm_fault_t ret = VM_FAULT_RETRY; int err; - _enter("%lx", folio->index); + kenter("%lx", folio->index); sb_start_pagefault(inode->i_sb); - if (folio_wait_writeback_killable(folio)) - goto out; - if (folio_lock_killable(folio) < 0) goto out; if (folio->mapping != mapping) { @@ -527,13 +541,19 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr goto out; } + if (folio_wait_writeback_killable(folio)) { + ret = VM_FAULT_LOCKED; + goto out; + } + /* Can we see a streaming write here? */ if (WARN_ON(!folio_test_uptodate(folio))) { ret = VM_FAULT_SIGBUS | VM_FAULT_LOCKED; goto out; } - if (netfs_folio_group(folio) != netfs_group) { + group = netfs_folio_group(folio); + if (group != netfs_group && group != NETFS_FOLIO_COPY_TO_CACHE) { folio_unlock(folio); err = filemap_fdatawrite_range(mapping, folio_pos(folio), @@ -557,708 +577,11 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr trace_netfs_folio(folio, netfs_folio_trace_mkwrite); netfs_set_group(folio, netfs_group); file_update_time(file); + if (ictx->ops->post_modify) + ictx->ops->post_modify(inode); ret = VM_FAULT_LOCKED; out: sb_end_pagefault(inode->i_sb); return ret; } EXPORT_SYMBOL(netfs_page_mkwrite); - -/* - * Kill all the pages in the given range - */ -static void netfs_kill_pages(struct address_space *mapping, - loff_t start, loff_t len) -{ - struct folio *folio; - pgoff_t index = start / PAGE_SIZE; - pgoff_t last = (start + len - 1) / PAGE_SIZE, next; - - _enter("%llx-%llx", start, start + len - 1); - - do { - _debug("kill %lx (to %lx)", index, last); - - folio = filemap_get_folio(mapping, index); - if (IS_ERR(folio)) { - next = index + 1; - continue; - } - - next = folio_next_index(folio); - - trace_netfs_folio(folio, netfs_folio_trace_kill); - folio_clear_uptodate(folio); - if (folio_test_fscache(folio)) - folio_end_fscache(folio); - folio_end_writeback(folio); - folio_lock(folio); - generic_error_remove_folio(mapping, folio); - folio_unlock(folio); - folio_put(folio); - - } while (index = next, index <= last); - - _leave(""); -} - -/* - * Redirty all the pages in a given range. - */ -static void netfs_redirty_pages(struct address_space *mapping, - loff_t start, loff_t len) -{ - struct folio *folio; - pgoff_t index = start / PAGE_SIZE; - pgoff_t last = (start + len - 1) / PAGE_SIZE, next; - - _enter("%llx-%llx", start, start + len - 1); - - do { - _debug("redirty %llx @%llx", len, start); - - folio = filemap_get_folio(mapping, index); - if (IS_ERR(folio)) { - next = index + 1; - continue; - } - - next = folio_next_index(folio); - trace_netfs_folio(folio, netfs_folio_trace_redirty); - filemap_dirty_folio(mapping, folio); - if (folio_test_fscache(folio)) - folio_end_fscache(folio); - folio_end_writeback(folio); - folio_put(folio); - } while (index = next, index <= last); - - balance_dirty_pages_ratelimited(mapping); - - _leave(""); -} - -/* - * Completion of write to server - */ -static void netfs_pages_written_back(struct netfs_io_request *wreq) -{ - struct address_space *mapping = wreq->mapping; - struct netfs_folio *finfo; - struct netfs_group *group = NULL; - struct folio *folio; - pgoff_t last; - int gcount = 0; - - XA_STATE(xas, &mapping->i_pages, wreq->start / PAGE_SIZE); - - _enter("%llx-%llx", wreq->start, wreq->start + wreq->len); - - rcu_read_lock(); - - last = (wreq->start + wreq->len - 1) / PAGE_SIZE; - xas_for_each(&xas, folio, last) { - WARN(!folio_test_writeback(folio), - "bad %zx @%llx page %lx %lx\n", - wreq->len, wreq->start, folio->index, last); - - if ((finfo = netfs_folio_info(folio))) { - /* Streaming writes cannot be redirtied whilst under - * writeback, so discard the streaming record. - */ - folio_detach_private(folio); - group = finfo->netfs_group; - gcount++; - trace_netfs_folio(folio, netfs_folio_trace_clear_s); - kfree(finfo); - } else if ((group = netfs_folio_group(folio))) { - /* Need to detach the group pointer if the page didn't - * get redirtied. If it has been redirtied, then it - * must be within the same group. - */ - if (folio_test_dirty(folio)) { - trace_netfs_folio(folio, netfs_folio_trace_redirtied); - goto end_wb; - } - if (folio_trylock(folio)) { - if (!folio_test_dirty(folio)) { - folio_detach_private(folio); - gcount++; - trace_netfs_folio(folio, netfs_folio_trace_clear_g); - } else { - trace_netfs_folio(folio, netfs_folio_trace_redirtied); - } - folio_unlock(folio); - goto end_wb; - } - - xas_pause(&xas); - rcu_read_unlock(); - folio_lock(folio); - if (!folio_test_dirty(folio)) { - folio_detach_private(folio); - gcount++; - trace_netfs_folio(folio, netfs_folio_trace_clear_g); - } else { - trace_netfs_folio(folio, netfs_folio_trace_redirtied); - } - folio_unlock(folio); - rcu_read_lock(); - } else { - trace_netfs_folio(folio, netfs_folio_trace_clear); - } - end_wb: - if (folio_test_fscache(folio)) - folio_end_fscache(folio); - xas_advance(&xas, folio_next_index(folio) - 1); - folio_end_writeback(folio); - } - - rcu_read_unlock(); - netfs_put_group_many(group, gcount); - _leave(""); -} - -/* - * Deal with the disposition of the folios that are under writeback to close - * out the operation. - */ -static void netfs_cleanup_buffered_write(struct netfs_io_request *wreq) -{ - struct address_space *mapping = wreq->mapping; - - _enter(""); - - switch (wreq->error) { - case 0: - netfs_pages_written_back(wreq); - break; - - default: - pr_notice("R=%08x Unexpected error %d\n", wreq->debug_id, wreq->error); - fallthrough; - case -EACCES: - case -EPERM: - case -ENOKEY: - case -EKEYEXPIRED: - case -EKEYREJECTED: - case -EKEYREVOKED: - case -ENETRESET: - case -EDQUOT: - case -ENOSPC: - netfs_redirty_pages(mapping, wreq->start, wreq->len); - break; - - case -EROFS: - case -EIO: - case -EREMOTEIO: - case -EFBIG: - case -ENOENT: - case -ENOMEDIUM: - case -ENXIO: - netfs_kill_pages(mapping, wreq->start, wreq->len); - break; - } - - if (wreq->error) - mapping_set_error(mapping, wreq->error); - if (wreq->netfs_ops->done) - wreq->netfs_ops->done(wreq); -} - -/* - * Extend the region to be written back to include subsequent contiguously - * dirty pages if possible, but don't sleep while doing so. - * - * If this page holds new content, then we can include filler zeros in the - * writeback. - */ -static void netfs_extend_writeback(struct address_space *mapping, - struct netfs_group *group, - struct xa_state *xas, - long *_count, - loff_t start, - loff_t max_len, - bool caching, - size_t *_len, - size_t *_top) -{ - struct netfs_folio *finfo; - struct folio_batch fbatch; - struct folio *folio; - unsigned int i; - pgoff_t index = (start + *_len) / PAGE_SIZE; - size_t len; - void *priv; - bool stop = true; - - folio_batch_init(&fbatch); - - do { - /* Firstly, we gather up a batch of contiguous dirty pages - * under the RCU read lock - but we can't clear the dirty flags - * there if any of those pages are mapped. - */ - rcu_read_lock(); - - xas_for_each(xas, folio, ULONG_MAX) { - stop = true; - if (xas_retry(xas, folio)) - continue; - if (xa_is_value(folio)) - break; - if (folio->index != index) { - xas_reset(xas); - break; - } - - if (!folio_try_get(folio)) { - xas_reset(xas); - continue; - } - - /* Has the folio moved or been split? */ - if (unlikely(folio != xas_reload(xas))) { - folio_put(folio); - xas_reset(xas); - break; - } - - if (!folio_trylock(folio)) { - folio_put(folio); - xas_reset(xas); - break; - } - if (!folio_test_dirty(folio) || - folio_test_writeback(folio) || - folio_test_fscache(folio)) { - folio_unlock(folio); - folio_put(folio); - xas_reset(xas); - break; - } - - stop = false; - len = folio_size(folio); - priv = folio_get_private(folio); - if ((const struct netfs_group *)priv != group) { - stop = true; - finfo = netfs_folio_info(folio); - if (finfo->netfs_group != group || - finfo->dirty_offset > 0) { - folio_unlock(folio); - folio_put(folio); - xas_reset(xas); - break; - } - len = finfo->dirty_len; - } - - *_top += folio_size(folio); - index += folio_nr_pages(folio); - *_count -= folio_nr_pages(folio); - *_len += len; - if (*_len >= max_len || *_count <= 0) - stop = true; - - if (!folio_batch_add(&fbatch, folio)) - break; - if (stop) - break; - } - - xas_pause(xas); - rcu_read_unlock(); - - /* Now, if we obtained any folios, we can shift them to being - * writable and mark them for caching. - */ - if (!folio_batch_count(&fbatch)) - break; - - for (i = 0; i < folio_batch_count(&fbatch); i++) { - folio = fbatch.folios[i]; - trace_netfs_folio(folio, netfs_folio_trace_store_plus); - - if (!folio_clear_dirty_for_io(folio)) - BUG(); - folio_start_writeback(folio); - netfs_folio_start_fscache(caching, folio); - folio_unlock(folio); - } - - folio_batch_release(&fbatch); - cond_resched(); - } while (!stop); -} - -/* - * Synchronously write back the locked page and any subsequent non-locked dirty - * pages. - */ -static ssize_t netfs_write_back_from_locked_folio(struct address_space *mapping, - struct writeback_control *wbc, - struct netfs_group *group, - struct xa_state *xas, - struct folio *folio, - unsigned long long start, - unsigned long long end) -{ - struct netfs_io_request *wreq; - struct netfs_folio *finfo; - struct netfs_inode *ctx = netfs_inode(mapping->host); - unsigned long long i_size = i_size_read(&ctx->inode); - size_t len, max_len; - bool caching = netfs_is_cache_enabled(ctx); - long count = wbc->nr_to_write; - int ret; - - _enter(",%lx,%llx-%llx,%u", folio->index, start, end, caching); - - wreq = netfs_alloc_request(mapping, NULL, start, folio_size(folio), - NETFS_WRITEBACK); - if (IS_ERR(wreq)) { - folio_unlock(folio); - return PTR_ERR(wreq); - } - - if (!folio_clear_dirty_for_io(folio)) - BUG(); - folio_start_writeback(folio); - netfs_folio_start_fscache(caching, folio); - - count -= folio_nr_pages(folio); - - /* Find all consecutive lockable dirty pages that have contiguous - * written regions, stopping when we find a page that is not - * immediately lockable, is not dirty or is missing, or we reach the - * end of the range. - */ - trace_netfs_folio(folio, netfs_folio_trace_store); - - len = wreq->len; - finfo = netfs_folio_info(folio); - if (finfo) { - start += finfo->dirty_offset; - if (finfo->dirty_offset + finfo->dirty_len != len) { - len = finfo->dirty_len; - goto cant_expand; - } - len = finfo->dirty_len; - } - - if (start < i_size) { - /* Trim the write to the EOF; the extra data is ignored. Also - * put an upper limit on the size of a single storedata op. - */ - max_len = 65536 * 4096; - max_len = min_t(unsigned long long, max_len, end - start + 1); - max_len = min_t(unsigned long long, max_len, i_size - start); - - if (len < max_len) - netfs_extend_writeback(mapping, group, xas, &count, start, - max_len, caching, &len, &wreq->upper_len); - } - -cant_expand: - len = min_t(unsigned long long, len, i_size - start); - - /* We now have a contiguous set of dirty pages, each with writeback - * set; the first page is still locked at this point, but all the rest - * have been unlocked. - */ - folio_unlock(folio); - wreq->start = start; - wreq->len = len; - - if (start < i_size) { - _debug("write back %zx @%llx [%llx]", len, start, i_size); - - /* Speculatively write to the cache. We have to fix this up - * later if the store fails. - */ - wreq->cleanup = netfs_cleanup_buffered_write; - - iov_iter_xarray(&wreq->iter, ITER_SOURCE, &mapping->i_pages, start, - wreq->upper_len); - __set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags); - ret = netfs_begin_write(wreq, true, netfs_write_trace_writeback); - if (ret == 0 || ret == -EIOCBQUEUED) - wbc->nr_to_write -= len / PAGE_SIZE; - } else { - _debug("write discard %zx @%llx [%llx]", len, start, i_size); - - /* The dirty region was entirely beyond the EOF. */ - fscache_clear_page_bits(mapping, start, len, caching); - netfs_pages_written_back(wreq); - ret = 0; - } - - netfs_put_request(wreq, false, netfs_rreq_trace_put_return); - _leave(" = 1"); - return 1; -} - -/* - * Write a region of pages back to the server - */ -static ssize_t netfs_writepages_begin(struct address_space *mapping, - struct writeback_control *wbc, - struct netfs_group *group, - struct xa_state *xas, - unsigned long long *_start, - unsigned long long end) -{ - const struct netfs_folio *finfo; - struct folio *folio; - unsigned long long start = *_start; - ssize_t ret; - void *priv; - int skips = 0; - - _enter("%llx,%llx,", start, end); - -search_again: - /* Find the first dirty page in the group. */ - rcu_read_lock(); - - for (;;) { - folio = xas_find_marked(xas, end / PAGE_SIZE, PAGECACHE_TAG_DIRTY); - if (xas_retry(xas, folio) || xa_is_value(folio)) - continue; - if (!folio) - break; - - if (!folio_try_get(folio)) { - xas_reset(xas); - continue; - } - - if (unlikely(folio != xas_reload(xas))) { - folio_put(folio); - xas_reset(xas); - continue; - } - - /* Skip any dirty folio that's not in the group of interest. */ - priv = folio_get_private(folio); - if ((const struct netfs_group *)priv != group) { - finfo = netfs_folio_info(folio); - if (finfo->netfs_group != group) { - folio_put(folio); - continue; - } - } - - xas_pause(xas); - break; - } - rcu_read_unlock(); - if (!folio) - return 0; - - start = folio_pos(folio); /* May regress with THPs */ - - _debug("wback %lx", folio->index); - - /* At this point we hold neither the i_pages lock nor the page lock: - * the page may be truncated or invalidated (changing page->mapping to - * NULL), or even swizzled back from swapper_space to tmpfs file - * mapping - */ -lock_again: - if (wbc->sync_mode != WB_SYNC_NONE) { - ret = folio_lock_killable(folio); - if (ret < 0) - return ret; - } else { - if (!folio_trylock(folio)) - goto search_again; - } - - if (folio->mapping != mapping || - !folio_test_dirty(folio)) { - start += folio_size(folio); - folio_unlock(folio); - goto search_again; - } - - if (folio_test_writeback(folio) || - folio_test_fscache(folio)) { - folio_unlock(folio); - if (wbc->sync_mode != WB_SYNC_NONE) { - folio_wait_writeback(folio); -#ifdef CONFIG_FSCACHE - folio_wait_fscache(folio); -#endif - goto lock_again; - } - - start += folio_size(folio); - if (wbc->sync_mode == WB_SYNC_NONE) { - if (skips >= 5 || need_resched()) { - ret = 0; - goto out; - } - skips++; - } - goto search_again; - } - - ret = netfs_write_back_from_locked_folio(mapping, wbc, group, xas, - folio, start, end); -out: - if (ret > 0) - *_start = start + ret; - _leave(" = %zd [%llx]", ret, *_start); - return ret; -} - -/* - * Write a region of pages back to the server - */ -static int netfs_writepages_region(struct address_space *mapping, - struct writeback_control *wbc, - struct netfs_group *group, - unsigned long long *_start, - unsigned long long end) -{ - ssize_t ret; - - XA_STATE(xas, &mapping->i_pages, *_start / PAGE_SIZE); - - do { - ret = netfs_writepages_begin(mapping, wbc, group, &xas, - _start, end); - if (ret > 0 && wbc->nr_to_write > 0) - cond_resched(); - } while (ret > 0 && wbc->nr_to_write > 0); - - return ret > 0 ? 0 : ret; -} - -/* - * write some of the pending data back to the server - */ -int netfs_writepages(struct address_space *mapping, - struct writeback_control *wbc) -{ - struct netfs_group *group = NULL; - loff_t start, end; - int ret; - - _enter(""); - - /* We have to be careful as we can end up racing with setattr() - * truncating the pagecache since the caller doesn't take a lock here - * to prevent it. - */ - - if (wbc->range_cyclic && mapping->writeback_index) { - start = mapping->writeback_index * PAGE_SIZE; - ret = netfs_writepages_region(mapping, wbc, group, - &start, LLONG_MAX); - if (ret < 0) - goto out; - - if (wbc->nr_to_write <= 0) { - mapping->writeback_index = start / PAGE_SIZE; - goto out; - } - - start = 0; - end = mapping->writeback_index * PAGE_SIZE; - mapping->writeback_index = 0; - ret = netfs_writepages_region(mapping, wbc, group, &start, end); - if (ret == 0) - mapping->writeback_index = start / PAGE_SIZE; - } else if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) { - start = 0; - ret = netfs_writepages_region(mapping, wbc, group, - &start, LLONG_MAX); - if (wbc->nr_to_write > 0 && ret == 0) - mapping->writeback_index = start / PAGE_SIZE; - } else { - start = wbc->range_start; - ret = netfs_writepages_region(mapping, wbc, group, - &start, wbc->range_end); - } - -out: - _leave(" = %d", ret); - return ret; -} -EXPORT_SYMBOL(netfs_writepages); - -/* - * Deal with the disposition of a laundered folio. - */ -static void netfs_cleanup_launder_folio(struct netfs_io_request *wreq) -{ - if (wreq->error) { - pr_notice("R=%08x Laundering error %d\n", wreq->debug_id, wreq->error); - mapping_set_error(wreq->mapping, wreq->error); - } -} - -/** - * netfs_launder_folio - Clean up a dirty folio that's being invalidated - * @folio: The folio to clean - * - * This is called to write back a folio that's being invalidated when an inode - * is getting torn down. Ideally, writepages would be used instead. - */ -int netfs_launder_folio(struct folio *folio) -{ - struct netfs_io_request *wreq; - struct address_space *mapping = folio->mapping; - struct netfs_folio *finfo = netfs_folio_info(folio); - struct netfs_group *group = netfs_folio_group(folio); - struct bio_vec bvec; - unsigned long long i_size = i_size_read(mapping->host); - unsigned long long start = folio_pos(folio); - size_t offset = 0, len; - int ret = 0; - - if (finfo) { - offset = finfo->dirty_offset; - start += offset; - len = finfo->dirty_len; - } else { - len = folio_size(folio); - } - len = min_t(unsigned long long, len, i_size - start); - - wreq = netfs_alloc_request(mapping, NULL, start, len, NETFS_LAUNDER_WRITE); - if (IS_ERR(wreq)) { - ret = PTR_ERR(wreq); - goto out; - } - - if (!folio_clear_dirty_for_io(folio)) - goto out_put; - - trace_netfs_folio(folio, netfs_folio_trace_launder); - - _debug("launder %llx-%llx", start, start + len - 1); - - /* Speculatively write to the cache. We have to fix this up later if - * the store fails. - */ - wreq->cleanup = netfs_cleanup_launder_folio; - - bvec_set_folio(&bvec, folio, len, offset); - iov_iter_bvec(&wreq->iter, ITER_SOURCE, &bvec, 1, len); - __set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags); - ret = netfs_begin_write(wreq, true, netfs_write_trace_launder); - -out_put: - folio_detach_private(folio); - netfs_put_group(group); - kfree(finfo); - netfs_put_request(wreq, false, netfs_rreq_trace_put_return); -out: - folio_wait_fscache(folio); - _leave(" = %d", ret); - return ret; -} -EXPORT_SYMBOL(netfs_launder_folio); diff --git a/fs/netfs/direct_read.c b/fs/netfs/direct_read.c index ad4370b393..b6debac620 100644 --- a/fs/netfs/direct_read.c +++ b/fs/netfs/direct_read.c @@ -26,14 +26,14 @@ * * The caller must hold any appropriate locks. */ -static ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *iter) +ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *iter) { struct netfs_io_request *rreq; ssize_t ret; size_t orig_count = iov_iter_count(iter); bool async = !is_sync_kiocb(iocb); - _enter(""); + kenter(""); if (!orig_count) return 0; /* Don't update atime */ @@ -98,6 +98,7 @@ out: iov_iter_revert(iter, orig_count - iov_iter_count(iter)); return ret; } +EXPORT_SYMBOL(netfs_unbuffered_read_iter_locked); /** * netfs_unbuffered_read_iter - Perform an unbuffered or direct I/O read diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c index bee047e20f..792ef17bae 100644 --- a/fs/netfs/direct_write.c +++ b/fs/netfs/direct_write.c @@ -12,7 +12,7 @@ static void netfs_cleanup_dio_write(struct netfs_io_request *wreq) { struct inode *inode = wreq->inode; - unsigned long long end = wreq->start + wreq->len; + unsigned long long end = wreq->start + wreq->transferred; if (!wreq->error && i_size_read(inode) < end) { @@ -27,16 +27,17 @@ static void netfs_cleanup_dio_write(struct netfs_io_request *wreq) * Perform an unbuffered write where we may have to do an RMW operation on an * encrypted file. This can also be used for direct I/O writes. */ -static ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *iter, +ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *iter, struct netfs_group *netfs_group) { struct netfs_io_request *wreq; unsigned long long start = iocb->ki_pos; unsigned long long end = start + iov_iter_count(iter); ssize_t ret, n; + size_t len = iov_iter_count(iter); bool async = !is_sync_kiocb(iocb); - _enter(""); + kenter(""); /* We're going to need a bounce buffer if what we transmit is going to * be different in some way to the source buffer, e.g. because it gets @@ -44,15 +45,19 @@ static ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov */ // TODO - _debug("uw %llx-%llx", start, end); + kdebug("uw %llx-%llx", start, end); - wreq = netfs_alloc_request(iocb->ki_filp->f_mapping, iocb->ki_filp, - start, end - start, - iocb->ki_flags & IOCB_DIRECT ? - NETFS_DIO_WRITE : NETFS_UNBUFFERED_WRITE); + wreq = netfs_create_write_req(iocb->ki_filp->f_mapping, iocb->ki_filp, start, + iocb->ki_flags & IOCB_DIRECT ? + NETFS_DIO_WRITE : NETFS_UNBUFFERED_WRITE); if (IS_ERR(wreq)) return PTR_ERR(wreq); + wreq->io_streams[0].avail = true; + trace_netfs_write(wreq, (iocb->ki_flags & IOCB_DIRECT ? + netfs_write_trace_dio_write : + netfs_write_trace_unbuffered_write)); + { /* If this is an async op and we're not using a bounce buffer, * we have to save the source buffer as the iterator is only @@ -63,7 +68,7 @@ static ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov * request. */ if (async || user_backed_iter(iter)) { - n = netfs_extract_user_iter(iter, wreq->len, &wreq->iter, 0); + n = netfs_extract_user_iter(iter, len, &wreq->iter, 0); if (n < 0) { ret = n; goto out; @@ -71,7 +76,6 @@ static ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov wreq->direct_bv = (struct bio_vec *)wreq->iter.bvec; wreq->direct_bv_count = n; wreq->direct_bv_unpin = iov_iter_extract_will_pin(iter); - wreq->len = iov_iter_count(&wreq->iter); } else { wreq->iter = *iter; } @@ -79,6 +83,8 @@ static ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov wreq->io_iter = wreq->iter; } + __set_bit(NETFS_RREQ_USE_IO_ITER, &wreq->flags); + /* Copy the data into the bounce buffer and encrypt it. */ // TODO @@ -86,13 +92,11 @@ static ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov __set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags); if (async) wreq->iocb = iocb; + wreq->len = iov_iter_count(&wreq->io_iter); wreq->cleanup = netfs_cleanup_dio_write; - ret = netfs_begin_write(wreq, is_sync_kiocb(iocb), - iocb->ki_flags & IOCB_DIRECT ? - netfs_write_trace_dio_write : - netfs_write_trace_unbuffered_write); + ret = netfs_unbuffered_write(wreq, is_sync_kiocb(iocb), wreq->len); if (ret < 0) { - _debug("begin = %zd", ret); + kdebug("begin = %zd", ret); goto out; } @@ -100,9 +104,8 @@ static ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov trace_netfs_rreq(wreq, netfs_rreq_trace_wait_ip); wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS, TASK_UNINTERRUPTIBLE); - + smp_rmb(); /* Read error/transferred after RIP flag */ ret = wreq->error; - _debug("waited = %zd", ret); if (ret == 0) { ret = wreq->transferred; iocb->ki_pos += ret; @@ -115,6 +118,7 @@ out: netfs_put_request(wreq, false, netfs_rreq_trace_put_return); return ret; } +EXPORT_SYMBOL(netfs_unbuffered_write_iter_locked); /** * netfs_unbuffered_write_iter - Unbuffered write to a file @@ -132,18 +136,20 @@ out: ssize_t netfs_unbuffered_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; struct netfs_inode *ictx = netfs_inode(inode); - unsigned long long end; ssize_t ret; + loff_t pos = iocb->ki_pos; + unsigned long long end = pos + iov_iter_count(from) - 1; - _enter("%llx,%zx,%llx", iocb->ki_pos, iov_iter_count(from), i_size_read(inode)); + kenter("%llx,%zx,%llx", pos, iov_iter_count(from), i_size_read(inode)); if (!iov_iter_count(from)) return 0; trace_netfs_write_iter(iocb, from); - netfs_stat(&netfs_n_rh_dio_write); + netfs_stat(&netfs_n_wh_dio_write); ret = netfs_start_io_direct(inode); if (ret < 0) @@ -157,7 +163,25 @@ ssize_t netfs_unbuffered_write_iter(struct kiocb *iocb, struct iov_iter *from) ret = file_update_time(file); if (ret < 0) goto out; - ret = kiocb_invalidate_pages(iocb, iov_iter_count(from)); + if (iocb->ki_flags & IOCB_NOWAIT) { + /* We could block if there are any pages in the range. */ + ret = -EAGAIN; + if (filemap_range_has_page(mapping, pos, end)) + if (filemap_invalidate_inode(inode, true, pos, end)) + goto out; + } else { + ret = filemap_write_and_wait_range(mapping, pos, end); + if (ret < 0) + goto out; + } + + /* + * After a write we want buffered reads to be sure to go to disk to get + * the new data. We invalidate clean cached page from the region we're + * about to write. We do this *before* the write so that we can return + * without clobbering -EIOCBQUEUED from ->direct_IO(). + */ + ret = filemap_invalidate_inode(inode, true, pos, end); if (ret < 0) goto out; end = iocb->ki_pos + iov_iter_count(from); diff --git a/fs/netfs/fscache_cache.c b/fs/netfs/fscache_cache.c index 9397ed39b0..288a73c307 100644 --- a/fs/netfs/fscache_cache.c +++ b/fs/netfs/fscache_cache.c @@ -237,7 +237,7 @@ int fscache_add_cache(struct fscache_cache *cache, { int n_accesses; - _enter("{%s,%s}", ops->name, cache->name); + kenter("{%s,%s}", ops->name, cache->name); BUG_ON(fscache_cache_state(cache) != FSCACHE_CACHE_IS_PREPARING); @@ -257,7 +257,7 @@ int fscache_add_cache(struct fscache_cache *cache, up_write(&fscache_addremove_sem); pr_notice("Cache \"%s\" added (type %s)\n", cache->name, ops->name); - _leave(" = 0 [%s]", cache->name); + kleave(" = 0 [%s]", cache->name); return 0; } EXPORT_SYMBOL(fscache_add_cache); diff --git a/fs/netfs/fscache_cookie.c b/fs/netfs/fscache_cookie.c index bce2492186..4d1e8bf4c6 100644 --- a/fs/netfs/fscache_cookie.c +++ b/fs/netfs/fscache_cookie.c @@ -456,7 +456,7 @@ struct fscache_cookie *__fscache_acquire_cookie( { struct fscache_cookie *cookie; - _enter("V=%x", volume->debug_id); + kenter("V=%x", volume->debug_id); if (!index_key || !index_key_len || index_key_len > 255 || aux_data_len > 255) return NULL; @@ -484,7 +484,7 @@ struct fscache_cookie *__fscache_acquire_cookie( trace_fscache_acquire(cookie); fscache_stat(&fscache_n_acquires_ok); - _leave(" = c=%08x", cookie->debug_id); + kleave(" = c=%08x", cookie->debug_id); return cookie; } EXPORT_SYMBOL(__fscache_acquire_cookie); @@ -505,7 +505,7 @@ static void fscache_perform_lookup(struct fscache_cookie *cookie) enum fscache_access_trace trace = fscache_access_lookup_cookie_end_failed; bool need_withdraw = false; - _enter(""); + kenter(""); if (!cookie->volume->cache_priv) { fscache_create_volume(cookie->volume, true); @@ -519,7 +519,7 @@ static void fscache_perform_lookup(struct fscache_cookie *cookie) if (cookie->state != FSCACHE_COOKIE_STATE_FAILED) fscache_set_cookie_state(cookie, FSCACHE_COOKIE_STATE_QUIESCENT); need_withdraw = true; - _leave(" [fail]"); + kleave(" [fail]"); goto out; } @@ -572,7 +572,7 @@ void __fscache_use_cookie(struct fscache_cookie *cookie, bool will_modify) bool queue = false; int n_active; - _enter("c=%08x", cookie->debug_id); + kenter("c=%08x", cookie->debug_id); if (WARN(test_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags), "Trying to use relinquished cookie\n")) @@ -636,7 +636,7 @@ again: spin_unlock(&cookie->lock); if (queue) fscache_queue_cookie(cookie, fscache_cookie_get_use_work); - _leave(""); + kleave(""); } EXPORT_SYMBOL(__fscache_use_cookie); @@ -702,7 +702,7 @@ static void fscache_cookie_state_machine(struct fscache_cookie *cookie) enum fscache_cookie_state state; bool wake = false; - _enter("c=%x", cookie->debug_id); + kenter("c=%x", cookie->debug_id); again: spin_lock(&cookie->lock); @@ -820,7 +820,7 @@ out: spin_unlock(&cookie->lock); if (wake) wake_up_cookie_state(cookie); - _leave(""); + kleave(""); } static void fscache_cookie_worker(struct work_struct *work) @@ -867,7 +867,7 @@ static void fscache_cookie_lru_do_one(struct fscache_cookie *cookie) set_bit(FSCACHE_COOKIE_DO_LRU_DISCARD, &cookie->flags); spin_unlock(&cookie->lock); fscache_stat(&fscache_n_cookies_lru_expired); - _debug("lru c=%x", cookie->debug_id); + kdebug("lru c=%x", cookie->debug_id); __fscache_withdraw_cookie(cookie); } @@ -971,7 +971,7 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, bool retire) if (retire) fscache_stat(&fscache_n_relinquishes_retire); - _enter("c=%08x{%d},%d", + kenter("c=%08x{%d},%d", cookie->debug_id, atomic_read(&cookie->n_active), retire); if (WARN(test_and_set_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags), @@ -1050,7 +1050,7 @@ void __fscache_invalidate(struct fscache_cookie *cookie, { bool is_caching; - _enter("c=%x", cookie->debug_id); + kenter("c=%x", cookie->debug_id); fscache_stat(&fscache_n_invalidates); @@ -1072,7 +1072,7 @@ void __fscache_invalidate(struct fscache_cookie *cookie, case FSCACHE_COOKIE_STATE_INVALIDATING: /* is_still_valid will catch it */ default: spin_unlock(&cookie->lock); - _leave(" [no %u]", cookie->state); + kleave(" [no %u]", cookie->state); return; case FSCACHE_COOKIE_STATE_LOOKING_UP: @@ -1081,7 +1081,7 @@ void __fscache_invalidate(struct fscache_cookie *cookie, fallthrough; case FSCACHE_COOKIE_STATE_CREATING: spin_unlock(&cookie->lock); - _leave(" [look %x]", cookie->inval_counter); + kleave(" [look %x]", cookie->inval_counter); return; case FSCACHE_COOKIE_STATE_ACTIVE: @@ -1094,7 +1094,7 @@ void __fscache_invalidate(struct fscache_cookie *cookie, if (is_caching) fscache_queue_cookie(cookie, fscache_cookie_get_inval_work); - _leave(" [inv]"); + kleave(" [inv]"); return; } } diff --git a/fs/netfs/fscache_io.c b/fs/netfs/fscache_io.c index 43a651ed82..bf4eaeec44 100644 --- a/fs/netfs/fscache_io.c +++ b/fs/netfs/fscache_io.c @@ -28,12 +28,12 @@ bool fscache_wait_for_operation(struct netfs_cache_resources *cres, again: if (!fscache_cache_is_live(cookie->volume->cache)) { - _leave(" [broken]"); + kleave(" [broken]"); return false; } state = fscache_cookie_state(cookie); - _enter("c=%08x{%u},%x", cookie->debug_id, state, want_state); + kenter("c=%08x{%u},%x", cookie->debug_id, state, want_state); switch (state) { case FSCACHE_COOKIE_STATE_CREATING: @@ -52,7 +52,7 @@ again: case FSCACHE_COOKIE_STATE_DROPPED: case FSCACHE_COOKIE_STATE_RELINQUISHING: default: - _leave(" [not live]"); + kleave(" [not live]"); return false; } @@ -92,7 +92,7 @@ again: spin_lock(&cookie->lock); state = fscache_cookie_state(cookie); - _enter("c=%08x{%u},%x", cookie->debug_id, state, want_state); + kenter("c=%08x{%u},%x", cookie->debug_id, state, want_state); switch (state) { case FSCACHE_COOKIE_STATE_LOOKING_UP: @@ -140,7 +140,7 @@ failed: cres->cache_priv = NULL; cres->ops = NULL; fscache_end_cookie_access(cookie, fscache_access_io_not_live); - _leave(" = -ENOBUFS"); + kleave(" = -ENOBUFS"); return -ENOBUFS; } @@ -166,6 +166,7 @@ struct fscache_write_request { loff_t start; size_t len; bool set_bits; + bool using_pgpriv2; netfs_io_terminated_t term_func; void *term_func_priv; }; @@ -182,7 +183,7 @@ void __fscache_clear_page_bits(struct address_space *mapping, rcu_read_lock(); xas_for_each(&xas, page, last) { - end_page_fscache(page); + folio_end_private_2(page_folio(page)); } rcu_read_unlock(); } @@ -197,8 +198,9 @@ static void fscache_wreq_done(void *priv, ssize_t transferred_or_error, { struct fscache_write_request *wreq = priv; - fscache_clear_page_bits(wreq->mapping, wreq->start, wreq->len, - wreq->set_bits); + if (wreq->using_pgpriv2) + fscache_clear_page_bits(wreq->mapping, wreq->start, wreq->len, + wreq->set_bits); if (wreq->term_func) wreq->term_func(wreq->term_func_priv, transferred_or_error, @@ -212,7 +214,7 @@ void __fscache_write_to_cache(struct fscache_cookie *cookie, loff_t start, size_t len, loff_t i_size, netfs_io_terminated_t term_func, void *term_func_priv, - bool cond) + bool using_pgpriv2, bool cond) { struct fscache_write_request *wreq; struct netfs_cache_resources *cres; @@ -222,7 +224,7 @@ void __fscache_write_to_cache(struct fscache_cookie *cookie, if (len == 0) goto abandon; - _enter("%llx,%zx", start, len); + kenter("%llx,%zx", start, len); wreq = kzalloc(sizeof(struct fscache_write_request), GFP_NOFS); if (!wreq) @@ -230,6 +232,7 @@ void __fscache_write_to_cache(struct fscache_cookie *cookie, wreq->mapping = mapping; wreq->start = start; wreq->len = len; + wreq->using_pgpriv2 = using_pgpriv2; wreq->set_bits = cond; wreq->term_func = term_func; wreq->term_func_priv = term_func_priv; @@ -257,7 +260,8 @@ abandon_end: abandon_free: kfree(wreq); abandon: - fscache_clear_page_bits(mapping, start, len, cond); + if (using_pgpriv2) + fscache_clear_page_bits(mapping, start, len, cond); if (term_func) term_func(term_func_priv, ret, false); } diff --git a/fs/netfs/fscache_main.c b/fs/netfs/fscache_main.c index 42e98bb523..bf9b33d26e 100644 --- a/fs/netfs/fscache_main.c +++ b/fs/netfs/fscache_main.c @@ -99,7 +99,7 @@ error_wq: */ void __exit fscache_exit(void) { - _enter(""); + kenter(""); kmem_cache_destroy(fscache_cookie_jar); fscache_proc_cleanup(); diff --git a/fs/netfs/fscache_volume.c b/fs/netfs/fscache_volume.c index cb75c07b52..2e2a405ca9 100644 --- a/fs/netfs/fscache_volume.c +++ b/fs/netfs/fscache_volume.c @@ -264,7 +264,7 @@ static struct fscache_volume *fscache_alloc_volume(const char *volume_key, fscache_see_volume(volume, fscache_volume_new_acquire); fscache_stat(&fscache_n_volumes); up_write(&fscache_addremove_sem); - _leave(" = v=%x", volume->debug_id); + kleave(" = v=%x", volume->debug_id); return volume; err_vol: @@ -466,7 +466,7 @@ void fscache_withdraw_volume(struct fscache_volume *volume) { int n_accesses; - _debug("withdraw V=%x", volume->debug_id); + kdebug("withdraw V=%x", volume->debug_id); /* Allow wakeups on dec-to-0 */ n_accesses = atomic_dec_return(&volume->n_accesses); diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h index edf9b2a180..21e46bc9aa 100644 --- a/fs/netfs/internal.h +++ b/fs/netfs/internal.h @@ -34,9 +34,10 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync); /* * main.c */ -extern unsigned int netfs_debug; extern struct list_head netfs_io_requests; extern spinlock_t netfs_proc_lock; +extern mempool_t netfs_request_pool; +extern mempool_t netfs_subrequest_pool; #ifdef CONFIG_PROC_FS static inline void netfs_proc_add_rreq(struct netfs_io_request *rreq) @@ -61,15 +62,6 @@ static inline void netfs_proc_del_rreq(struct netfs_io_request *rreq) {} /* * misc.c */ -#define NETFS_FLAG_PUT_MARK BIT(0) -#define NETFS_FLAG_PAGECACHE_MARK BIT(1) -int netfs_xa_store_and_mark(struct xarray *xa, unsigned long index, - struct folio *folio, unsigned int flags, - gfp_t gfp_mask); -int netfs_add_folios_to_buffer(struct xarray *buffer, - struct address_space *mapping, - pgoff_t index, pgoff_t to, gfp_t gfp_mask); -void netfs_clear_buffer(struct xarray *buffer); /* * objects.c @@ -91,22 +83,12 @@ static inline void netfs_see_request(struct netfs_io_request *rreq, } /* - * output.c - */ -int netfs_begin_write(struct netfs_io_request *wreq, bool may_wait, - enum netfs_write_trace what); -struct netfs_io_request *netfs_begin_writethrough(struct kiocb *iocb, size_t len); -int netfs_advance_writethrough(struct netfs_io_request *wreq, size_t copied, bool to_page_end); -int netfs_end_writethrough(struct netfs_io_request *wreq, struct kiocb *iocb); - -/* * stats.c */ #ifdef CONFIG_NETFS_STATS extern atomic_t netfs_n_rh_dio_read; -extern atomic_t netfs_n_rh_dio_write; extern atomic_t netfs_n_rh_readahead; -extern atomic_t netfs_n_rh_readpage; +extern atomic_t netfs_n_rh_read_folio; extern atomic_t netfs_n_rh_rreq; extern atomic_t netfs_n_rh_sreq; extern atomic_t netfs_n_rh_download; @@ -123,6 +105,10 @@ extern atomic_t netfs_n_rh_write_begin; extern atomic_t netfs_n_rh_write_done; extern atomic_t netfs_n_rh_write_failed; extern atomic_t netfs_n_rh_write_zskip; +extern atomic_t netfs_n_wh_buffered_write; +extern atomic_t netfs_n_wh_writethrough; +extern atomic_t netfs_n_wh_dio_write; +extern atomic_t netfs_n_wh_writepages; extern atomic_t netfs_n_wh_wstream_conflict; extern atomic_t netfs_n_wh_upload; extern atomic_t netfs_n_wh_upload_done; @@ -149,6 +135,33 @@ static inline void netfs_stat_d(atomic_t *stat) #endif /* + * write_collect.c + */ +int netfs_folio_written_back(struct folio *folio); +void netfs_write_collection_worker(struct work_struct *work); +void netfs_wake_write_collector(struct netfs_io_request *wreq, bool was_async); + +/* + * write_issue.c + */ +struct netfs_io_request *netfs_create_write_req(struct address_space *mapping, + struct file *file, + loff_t start, + enum netfs_io_origin origin); +void netfs_reissue_write(struct netfs_io_stream *stream, + struct netfs_io_subrequest *subreq); +int netfs_advance_write(struct netfs_io_request *wreq, + struct netfs_io_stream *stream, + loff_t start, size_t len, bool to_eof); +struct netfs_io_request *netfs_begin_writethrough(struct kiocb *iocb, size_t len); +int netfs_advance_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc, + struct folio *folio, size_t copied, bool to_page_end, + struct folio **writethrough_cache); +int netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc, + struct folio *writethrough_cache); +int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t len); + +/* * Miscellaneous functions. */ static inline bool netfs_is_cache_enabled(struct netfs_inode *ctx) @@ -168,7 +181,7 @@ static inline bool netfs_is_cache_enabled(struct netfs_inode *ctx) */ static inline struct netfs_group *netfs_get_group(struct netfs_group *netfs_group) { - if (netfs_group) + if (netfs_group && netfs_group != NETFS_FOLIO_COPY_TO_CACHE) refcount_inc(&netfs_group->ref); return netfs_group; } @@ -178,7 +191,9 @@ static inline struct netfs_group *netfs_get_group(struct netfs_group *netfs_grou */ static inline void netfs_put_group(struct netfs_group *netfs_group) { - if (netfs_group && refcount_dec_and_test(&netfs_group->ref)) + if (netfs_group && + netfs_group != NETFS_FOLIO_COPY_TO_CACHE && + refcount_dec_and_test(&netfs_group->ref)) netfs_group->free(netfs_group); } @@ -187,7 +202,9 @@ static inline void netfs_put_group(struct netfs_group *netfs_group) */ static inline void netfs_put_group_many(struct netfs_group *netfs_group, int nr) { - if (netfs_group && refcount_sub_and_test(nr, &netfs_group->ref)) + if (netfs_group && + netfs_group != NETFS_FOLIO_COPY_TO_CACHE && + refcount_sub_and_test(nr, &netfs_group->ref)) netfs_group->free(netfs_group); } @@ -336,42 +353,12 @@ void fscache_create_volume(struct fscache_volume *volume, bool wait); * debug tracing */ #define dbgprintk(FMT, ...) \ - printk("[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__) + pr_debug("[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__) #define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__) #define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__) #define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__) -#ifdef __KDEBUG -#define _enter(FMT, ...) kenter(FMT, ##__VA_ARGS__) -#define _leave(FMT, ...) kleave(FMT, ##__VA_ARGS__) -#define _debug(FMT, ...) kdebug(FMT, ##__VA_ARGS__) - -#elif defined(CONFIG_NETFS_DEBUG) -#define _enter(FMT, ...) \ -do { \ - if (netfs_debug) \ - kenter(FMT, ##__VA_ARGS__); \ -} while (0) - -#define _leave(FMT, ...) \ -do { \ - if (netfs_debug) \ - kleave(FMT, ##__VA_ARGS__); \ -} while (0) - -#define _debug(FMT, ...) \ -do { \ - if (netfs_debug) \ - kdebug(FMT, ##__VA_ARGS__); \ -} while (0) - -#else -#define _enter(FMT, ...) no_printk("==> %s("FMT")", __func__, ##__VA_ARGS__) -#define _leave(FMT, ...) no_printk("<== %s()"FMT"", __func__, ##__VA_ARGS__) -#define _debug(FMT, ...) no_printk(FMT, ##__VA_ARGS__) -#endif - /* * assertions */ diff --git a/fs/netfs/io.c b/fs/netfs/io.c index 4261ad6c55..c7576481c3 100644 --- a/fs/netfs/io.c +++ b/fs/netfs/io.c @@ -99,145 +99,6 @@ static void netfs_rreq_completed(struct netfs_io_request *rreq, bool was_async) } /* - * Deal with the completion of writing the data to the cache. We have to clear - * the PG_fscache bits on the folios involved and release the caller's ref. - * - * May be called in softirq mode and we inherit a ref from the caller. - */ -static void netfs_rreq_unmark_after_write(struct netfs_io_request *rreq, - bool was_async) -{ - struct netfs_io_subrequest *subreq; - struct folio *folio; - pgoff_t unlocked = 0; - bool have_unlocked = false; - - rcu_read_lock(); - - list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { - XA_STATE(xas, &rreq->mapping->i_pages, subreq->start / PAGE_SIZE); - - xas_for_each(&xas, folio, (subreq->start + subreq->len - 1) / PAGE_SIZE) { - if (xas_retry(&xas, folio)) - continue; - - /* We might have multiple writes from the same huge - * folio, but we mustn't unlock a folio more than once. - */ - if (have_unlocked && folio->index <= unlocked) - continue; - unlocked = folio_next_index(folio) - 1; - trace_netfs_folio(folio, netfs_folio_trace_end_copy); - folio_end_fscache(folio); - have_unlocked = true; - } - } - - rcu_read_unlock(); - netfs_rreq_completed(rreq, was_async); -} - -static void netfs_rreq_copy_terminated(void *priv, ssize_t transferred_or_error, - bool was_async) -{ - struct netfs_io_subrequest *subreq = priv; - struct netfs_io_request *rreq = subreq->rreq; - - if (IS_ERR_VALUE(transferred_or_error)) { - netfs_stat(&netfs_n_rh_write_failed); - trace_netfs_failure(rreq, subreq, transferred_or_error, - netfs_fail_copy_to_cache); - } else { - netfs_stat(&netfs_n_rh_write_done); - } - - trace_netfs_sreq(subreq, netfs_sreq_trace_write_term); - - /* If we decrement nr_copy_ops to 0, the ref belongs to us. */ - if (atomic_dec_and_test(&rreq->nr_copy_ops)) - netfs_rreq_unmark_after_write(rreq, was_async); - - netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated); -} - -/* - * Perform any outstanding writes to the cache. We inherit a ref from the - * caller. - */ -static void netfs_rreq_do_write_to_cache(struct netfs_io_request *rreq) -{ - struct netfs_cache_resources *cres = &rreq->cache_resources; - struct netfs_io_subrequest *subreq, *next, *p; - struct iov_iter iter; - int ret; - - trace_netfs_rreq(rreq, netfs_rreq_trace_copy); - - /* We don't want terminating writes trying to wake us up whilst we're - * still going through the list. - */ - atomic_inc(&rreq->nr_copy_ops); - - list_for_each_entry_safe(subreq, p, &rreq->subrequests, rreq_link) { - if (!test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) { - list_del_init(&subreq->rreq_link); - netfs_put_subrequest(subreq, false, - netfs_sreq_trace_put_no_copy); - } - } - - list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { - /* Amalgamate adjacent writes */ - while (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) { - next = list_next_entry(subreq, rreq_link); - if (next->start != subreq->start + subreq->len) - break; - subreq->len += next->len; - list_del_init(&next->rreq_link); - netfs_put_subrequest(next, false, - netfs_sreq_trace_put_merged); - } - - ret = cres->ops->prepare_write(cres, &subreq->start, &subreq->len, - subreq->len, rreq->i_size, true); - if (ret < 0) { - trace_netfs_failure(rreq, subreq, ret, netfs_fail_prepare_write); - trace_netfs_sreq(subreq, netfs_sreq_trace_write_skip); - continue; - } - - iov_iter_xarray(&iter, ITER_SOURCE, &rreq->mapping->i_pages, - subreq->start, subreq->len); - - atomic_inc(&rreq->nr_copy_ops); - netfs_stat(&netfs_n_rh_write); - netfs_get_subrequest(subreq, netfs_sreq_trace_get_copy_to_cache); - trace_netfs_sreq(subreq, netfs_sreq_trace_write); - cres->ops->write(cres, subreq->start, &iter, - netfs_rreq_copy_terminated, subreq); - } - - /* If we decrement nr_copy_ops to 0, the usage ref belongs to us. */ - if (atomic_dec_and_test(&rreq->nr_copy_ops)) - netfs_rreq_unmark_after_write(rreq, false); -} - -static void netfs_rreq_write_to_cache_work(struct work_struct *work) -{ - struct netfs_io_request *rreq = - container_of(work, struct netfs_io_request, work); - - netfs_rreq_do_write_to_cache(rreq); -} - -static void netfs_rreq_write_to_cache(struct netfs_io_request *rreq) -{ - rreq->work.func = netfs_rreq_write_to_cache_work; - if (!queue_work(system_unbound_wq, &rreq->work)) - BUG(); -} - -/* * Handle a short read. */ static void netfs_rreq_short_read(struct netfs_io_request *rreq, @@ -269,7 +130,7 @@ static void netfs_reset_subreq_iter(struct netfs_io_request *rreq, if (count == remaining) return; - _debug("R=%08x[%u] ITER RESUB-MISMATCH %zx != %zx-%zx-%llx %x\n", + kdebug("R=%08x[%u] ITER RESUB-MISMATCH %zx != %zx-%zx-%llx %x\n", rreq->debug_id, subreq->debug_index, iov_iter_count(&subreq->io_iter), subreq->transferred, subreq->len, rreq->i_size, @@ -352,8 +213,13 @@ static void netfs_rreq_assess_dio(struct netfs_io_request *rreq) unsigned int i; size_t transferred = 0; - for (i = 0; i < rreq->direct_bv_count; i++) + for (i = 0; i < rreq->direct_bv_count; i++) { flush_dcache_page(rreq->direct_bv[i].bv_page); + // TODO: cifs marks pages in the destination buffer + // dirty under some circumstances after a read. Do we + // need to do that too? + set_page_dirty(rreq->direct_bv[i].bv_page); + } list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { if (subreq->error || subreq->transferred == 0) @@ -409,9 +275,6 @@ again: clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &rreq->flags); wake_up_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS); - if (test_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags)) - return netfs_rreq_write_to_cache(rreq); - netfs_rreq_completed(rreq, was_async); } @@ -463,7 +326,7 @@ void netfs_subreq_terminated(struct netfs_io_subrequest *subreq, struct netfs_io_request *rreq = subreq->rreq; int u; - _enter("R=%x[%x]{%llx,%lx},%zd", + kenter("R=%x[%x]{%llx,%lx},%zd", rreq->debug_id, subreq->debug_index, subreq->start, subreq->flags, transferred_or_error); @@ -572,7 +435,7 @@ netfs_rreq_prepare_read(struct netfs_io_request *rreq, struct netfs_inode *ictx = netfs_inode(rreq->inode); size_t lsize; - _enter("%llx-%llx,%llx", subreq->start, subreq->start + subreq->len, rreq->i_size); + kenter("%llx-%llx,%llx", subreq->start, subreq->start + subreq->len, rreq->i_size); if (rreq->origin != NETFS_DIO_READ) { source = netfs_cache_prepare_read(subreq, rreq->i_size); @@ -618,7 +481,7 @@ netfs_rreq_prepare_read(struct netfs_io_request *rreq, set: if (subreq->len > rreq->len) - pr_warn("R=%08x[%u] SREQ>RREQ %zx > %zx\n", + pr_warn("R=%08x[%u] SREQ>RREQ %zx > %llx\n", rreq->debug_id, subreq->debug_index, subreq->len, rreq->len); @@ -643,8 +506,7 @@ out: * Slice off a piece of a read request and submit an I/O request for it. */ static bool netfs_rreq_submit_slice(struct netfs_io_request *rreq, - struct iov_iter *io_iter, - unsigned int *_debug_index) + struct iov_iter *io_iter) { struct netfs_io_subrequest *subreq; enum netfs_io_source source; @@ -653,11 +515,10 @@ static bool netfs_rreq_submit_slice(struct netfs_io_request *rreq, if (!subreq) return false; - subreq->debug_index = (*_debug_index)++; subreq->start = rreq->start + rreq->submitted; subreq->len = io_iter->count; - _debug("slice %llx,%zx,%zx", subreq->start, subreq->len, rreq->submitted); + kdebug("slice %llx,%zx,%llx", subreq->start, subreq->len, rreq->submitted); list_add_tail(&subreq->rreq_link, &rreq->subrequests); /* Call out to the cache to find out what it can do with the remaining @@ -707,10 +568,9 @@ subreq_failed: int netfs_begin_read(struct netfs_io_request *rreq, bool sync) { struct iov_iter io_iter; - unsigned int debug_index = 0; int ret; - _enter("R=%x %llx-%llx", + kenter("R=%x %llx-%llx", rreq->debug_id, rreq->start, rreq->start + rreq->len - 1); if (rreq->len == 0) { @@ -733,12 +593,12 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync) atomic_set(&rreq->nr_outstanding, 1); io_iter = rreq->io_iter; do { - _debug("submit %llx + %zx >= %llx", + kdebug("submit %llx + %llx >= %llx", rreq->start, rreq->submitted, rreq->i_size); if (rreq->origin == NETFS_DIO_READ && rreq->start + rreq->submitted >= rreq->i_size) break; - if (!netfs_rreq_submit_slice(rreq, &io_iter, &debug_index)) + if (!netfs_rreq_submit_slice(rreq, &io_iter)) break; if (test_bit(NETFS_RREQ_BLOCKED, &rreq->flags) && test_bit(NETFS_RREQ_NONBLOCK, &rreq->flags)) diff --git a/fs/netfs/main.c b/fs/netfs/main.c index 5e77618a79..db824c3728 100644 --- a/fs/netfs/main.c +++ b/fs/netfs/main.c @@ -7,6 +7,7 @@ #include <linux/module.h> #include <linux/export.h> +#include <linux/mempool.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include "internal.h" @@ -19,9 +20,10 @@ MODULE_LICENSE("GPL"); EXPORT_TRACEPOINT_SYMBOL(netfs_sreq); -unsigned netfs_debug; -module_param_named(debug, netfs_debug, uint, S_IWUSR | S_IRUGO); -MODULE_PARM_DESC(netfs_debug, "Netfs support debugging mask"); +static struct kmem_cache *netfs_request_slab; +static struct kmem_cache *netfs_subrequest_slab; +mempool_t netfs_request_pool; +mempool_t netfs_subrequest_pool; #ifdef CONFIG_PROC_FS LIST_HEAD(netfs_io_requests); @@ -31,9 +33,9 @@ static const char *netfs_origins[nr__netfs_io_origin] = { [NETFS_READAHEAD] = "RA", [NETFS_READPAGE] = "RP", [NETFS_READ_FOR_WRITE] = "RW", + [NETFS_COPY_TO_CACHE] = "CC", [NETFS_WRITEBACK] = "WB", [NETFS_WRITETHROUGH] = "WT", - [NETFS_LAUNDER_WRITE] = "LW", [NETFS_UNBUFFERED_WRITE] = "UW", [NETFS_DIO_READ] = "DR", [NETFS_DIO_WRITE] = "DW", @@ -56,7 +58,7 @@ static int netfs_requests_seq_show(struct seq_file *m, void *v) rreq = list_entry(v, struct netfs_io_request, proc_link); seq_printf(m, - "%08x %s %3d %2lx %4d %3d @%04llx %zx/%zx", + "%08x %s %3d %2lx %4d %3d @%04llx %llx/%llx", rreq->debug_id, netfs_origins[rreq->origin], refcount_read(&rreq->ref), @@ -98,25 +100,54 @@ static int __init netfs_init(void) { int ret = -ENOMEM; + netfs_request_slab = kmem_cache_create("netfs_request", + sizeof(struct netfs_io_request), 0, + SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, + NULL); + if (!netfs_request_slab) + goto error_req; + + if (mempool_init_slab_pool(&netfs_request_pool, 100, netfs_request_slab) < 0) + goto error_reqpool; + + netfs_subrequest_slab = kmem_cache_create("netfs_subrequest", + sizeof(struct netfs_io_subrequest), 0, + SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, + NULL); + if (!netfs_subrequest_slab) + goto error_subreq; + + if (mempool_init_slab_pool(&netfs_subrequest_pool, 100, netfs_subrequest_slab) < 0) + goto error_subreqpool; + if (!proc_mkdir("fs/netfs", NULL)) - goto error; + goto error_proc; if (!proc_create_seq("fs/netfs/requests", S_IFREG | 0444, NULL, &netfs_requests_seq_ops)) - goto error_proc; + goto error_procfile; #ifdef CONFIG_FSCACHE_STATS if (!proc_create_single("fs/netfs/stats", S_IFREG | 0444, NULL, netfs_stats_show)) - goto error_proc; + goto error_procfile; #endif ret = fscache_init(); if (ret < 0) - goto error_proc; + goto error_fscache; return 0; -error_proc: +error_fscache: +error_procfile: remove_proc_entry("fs/netfs", NULL); -error: +error_proc: + mempool_exit(&netfs_subrequest_pool); +error_subreqpool: + kmem_cache_destroy(netfs_subrequest_slab); +error_subreq: + mempool_exit(&netfs_request_pool); +error_reqpool: + kmem_cache_destroy(netfs_request_slab); +error_req: return ret; } fs_initcall(netfs_init); @@ -125,5 +156,9 @@ static void __exit netfs_exit(void) { fscache_exit(); remove_proc_entry("fs/netfs", NULL); + mempool_exit(&netfs_subrequest_pool); + kmem_cache_destroy(netfs_subrequest_slab); + mempool_exit(&netfs_request_pool); + kmem_cache_destroy(netfs_request_slab); } module_exit(netfs_exit); diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c index 90051ced8e..172808e83c 100644 --- a/fs/netfs/misc.c +++ b/fs/netfs/misc.c @@ -8,87 +8,6 @@ #include <linux/swap.h> #include "internal.h" -/* - * Attach a folio to the buffer and maybe set marks on it to say that we need - * to put the folio later and twiddle the pagecache flags. - */ -int netfs_xa_store_and_mark(struct xarray *xa, unsigned long index, - struct folio *folio, unsigned int flags, - gfp_t gfp_mask) -{ - XA_STATE_ORDER(xas, xa, index, folio_order(folio)); - -retry: - xas_lock(&xas); - for (;;) { - xas_store(&xas, folio); - if (!xas_error(&xas)) - break; - xas_unlock(&xas); - if (!xas_nomem(&xas, gfp_mask)) - return xas_error(&xas); - goto retry; - } - - if (flags & NETFS_FLAG_PUT_MARK) - xas_set_mark(&xas, NETFS_BUF_PUT_MARK); - if (flags & NETFS_FLAG_PAGECACHE_MARK) - xas_set_mark(&xas, NETFS_BUF_PAGECACHE_MARK); - xas_unlock(&xas); - return xas_error(&xas); -} - -/* - * Create the specified range of folios in the buffer attached to the read - * request. The folios are marked with NETFS_BUF_PUT_MARK so that we know that - * these need freeing later. - */ -int netfs_add_folios_to_buffer(struct xarray *buffer, - struct address_space *mapping, - pgoff_t index, pgoff_t to, gfp_t gfp_mask) -{ - struct folio *folio; - int ret; - - if (to + 1 == index) /* Page range is inclusive */ - return 0; - - do { - /* TODO: Figure out what order folio can be allocated here */ - folio = filemap_alloc_folio(readahead_gfp_mask(mapping), 0); - if (!folio) - return -ENOMEM; - folio->index = index; - ret = netfs_xa_store_and_mark(buffer, index, folio, - NETFS_FLAG_PUT_MARK, gfp_mask); - if (ret < 0) { - folio_put(folio); - return ret; - } - - index += folio_nr_pages(folio); - } while (index <= to && index != 0); - - return 0; -} - -/* - * Clear an xarray buffer, putting a ref on the folios that have - * NETFS_BUF_PUT_MARK set. - */ -void netfs_clear_buffer(struct xarray *buffer) -{ - struct folio *folio; - XA_STATE(xas, buffer, 0); - - rcu_read_lock(); - xas_for_each_marked(&xas, folio, ULONG_MAX, NETFS_BUF_PUT_MARK) { - folio_put(folio); - } - rcu_read_unlock(); - xa_destroy(buffer); -} - /** * netfs_dirty_folio - Mark folio dirty and pin a cache object for writeback * @mapping: The mapping the folio belongs to. @@ -107,7 +26,7 @@ bool netfs_dirty_folio(struct address_space *mapping, struct folio *folio) struct fscache_cookie *cookie = netfs_i_cookie(ictx); bool need_use = false; - _enter(""); + kenter(""); if (!filemap_dirty_folio(mapping, folio)) return false; @@ -177,12 +96,10 @@ EXPORT_SYMBOL(netfs_clear_inode_writeback); */ void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length) { - struct netfs_folio *finfo = NULL; + struct netfs_folio *finfo; size_t flen = folio_size(folio); - _enter("{%lx},%zx,%zx", folio->index, offset, length); - - folio_wait_fscache(folio); + kenter("{%lx},%zx,%zx", folio->index, offset, length); if (!folio_test_private(folio)) return; @@ -248,12 +165,6 @@ bool netfs_release_folio(struct folio *folio, gfp_t gfp) if (folio_test_private(folio)) return false; - if (folio_test_fscache(folio)) { - if (current_is_kswapd() || !(gfp & __GFP_FS)) - return false; - folio_wait_fscache(folio); - } - fscache_note_page_release(netfs_i_cookie(ctx)); return true; } diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c index 610ceb5bd8..f4a6427274 100644 --- a/fs/netfs/objects.c +++ b/fs/netfs/objects.c @@ -6,6 +6,8 @@ */ #include <linux/slab.h> +#include <linux/mempool.h> +#include <linux/delay.h> #include "internal.h" /* @@ -20,17 +22,22 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, struct inode *inode = file ? file_inode(file) : mapping->host; struct netfs_inode *ctx = netfs_inode(inode); struct netfs_io_request *rreq; + mempool_t *mempool = ctx->ops->request_pool ?: &netfs_request_pool; + struct kmem_cache *cache = mempool->pool_data; bool is_unbuffered = (origin == NETFS_UNBUFFERED_WRITE || origin == NETFS_DIO_READ || origin == NETFS_DIO_WRITE); bool cached = !is_unbuffered && netfs_is_cache_enabled(ctx); int ret; - rreq = kzalloc(ctx->ops->io_request_size ?: sizeof(struct netfs_io_request), - GFP_KERNEL); - if (!rreq) - return ERR_PTR(-ENOMEM); + for (;;) { + rreq = mempool_alloc(mempool, GFP_KERNEL); + if (rreq) + break; + msleep(10); + } + memset(rreq, 0, kmem_cache_size(cache)); rreq->start = start; rreq->len = len; rreq->upper_len = len; @@ -40,23 +47,32 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, rreq->inode = inode; rreq->i_size = i_size_read(inode); rreq->debug_id = atomic_inc_return(&debug_ids); + rreq->wsize = INT_MAX; + spin_lock_init(&rreq->lock); + INIT_LIST_HEAD(&rreq->io_streams[0].subrequests); + INIT_LIST_HEAD(&rreq->io_streams[1].subrequests); INIT_LIST_HEAD(&rreq->subrequests); INIT_WORK(&rreq->work, NULL); refcount_set(&rreq->ref, 1); __set_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags); - if (cached) + if (cached) { __set_bit(NETFS_RREQ_WRITE_TO_CACHE, &rreq->flags); + if (test_bit(NETFS_ICTX_USE_PGPRIV2, &ctx->flags)) + /* Filesystem uses deprecated PG_private_2 marking. */ + __set_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags); + } if (file && file->f_flags & O_NONBLOCK) __set_bit(NETFS_RREQ_NONBLOCK, &rreq->flags); if (rreq->netfs_ops->init_request) { ret = rreq->netfs_ops->init_request(rreq, file); if (ret < 0) { - kfree(rreq); + mempool_free(rreq, rreq->netfs_ops->request_pool ?: &netfs_request_pool); return ERR_PTR(ret); } } + atomic_inc(&ctx->io_count); trace_netfs_rreq_ref(rreq->debug_id, 1, netfs_rreq_trace_new); netfs_proc_add_rreq(rreq); netfs_stat(&netfs_n_rh_rreq); @@ -74,6 +90,8 @@ void netfs_get_request(struct netfs_io_request *rreq, enum netfs_rreq_ref_trace void netfs_clear_subrequests(struct netfs_io_request *rreq, bool was_async) { struct netfs_io_subrequest *subreq; + struct netfs_io_stream *stream; + int s; while (!list_empty(&rreq->subrequests)) { subreq = list_first_entry(&rreq->subrequests, @@ -82,12 +100,32 @@ void netfs_clear_subrequests(struct netfs_io_request *rreq, bool was_async) netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_clear); } + + for (s = 0; s < ARRAY_SIZE(rreq->io_streams); s++) { + stream = &rreq->io_streams[s]; + while (!list_empty(&stream->subrequests)) { + subreq = list_first_entry(&stream->subrequests, + struct netfs_io_subrequest, rreq_link); + list_del(&subreq->rreq_link); + netfs_put_subrequest(subreq, was_async, + netfs_sreq_trace_put_clear); + } + } +} + +static void netfs_free_request_rcu(struct rcu_head *rcu) +{ + struct netfs_io_request *rreq = container_of(rcu, struct netfs_io_request, rcu); + + mempool_free(rreq, rreq->netfs_ops->request_pool ?: &netfs_request_pool); + netfs_stat_d(&netfs_n_rh_rreq); } static void netfs_free_request(struct work_struct *work) { struct netfs_io_request *rreq = container_of(work, struct netfs_io_request, work); + struct netfs_inode *ictx = netfs_inode(rreq->inode); unsigned int i; trace_netfs_rreq(rreq, netfs_rreq_trace_free); @@ -106,8 +144,10 @@ static void netfs_free_request(struct work_struct *work) } kvfree(rreq->direct_bv); } - kfree_rcu(rreq, rcu); - netfs_stat_d(&netfs_n_rh_rreq); + + if (atomic_dec_and_test(&ictx->io_count)) + wake_up_var(&ictx->io_count); + call_rcu(&rreq->rcu, netfs_free_request_rcu); } void netfs_put_request(struct netfs_io_request *rreq, bool was_async, @@ -139,19 +179,25 @@ void netfs_put_request(struct netfs_io_request *rreq, bool was_async, struct netfs_io_subrequest *netfs_alloc_subrequest(struct netfs_io_request *rreq) { struct netfs_io_subrequest *subreq; - - subreq = kzalloc(rreq->netfs_ops->io_subrequest_size ?: - sizeof(struct netfs_io_subrequest), - GFP_KERNEL); - if (subreq) { - INIT_WORK(&subreq->work, NULL); - INIT_LIST_HEAD(&subreq->rreq_link); - refcount_set(&subreq->ref, 2); - subreq->rreq = rreq; - netfs_get_request(rreq, netfs_rreq_trace_get_subreq); - netfs_stat(&netfs_n_rh_sreq); + mempool_t *mempool = rreq->netfs_ops->subrequest_pool ?: &netfs_subrequest_pool; + struct kmem_cache *cache = mempool->pool_data; + + for (;;) { + subreq = mempool_alloc(rreq->netfs_ops->subrequest_pool ?: &netfs_subrequest_pool, + GFP_KERNEL); + if (subreq) + break; + msleep(10); } + memset(subreq, 0, kmem_cache_size(cache)); + INIT_WORK(&subreq->work, NULL); + INIT_LIST_HEAD(&subreq->rreq_link); + refcount_set(&subreq->ref, 2); + subreq->rreq = rreq; + subreq->debug_index = atomic_inc_return(&rreq->subreq_counter); + netfs_get_request(rreq, netfs_rreq_trace_get_subreq); + netfs_stat(&netfs_n_rh_sreq); return subreq; } @@ -173,7 +219,7 @@ static void netfs_free_subrequest(struct netfs_io_subrequest *subreq, trace_netfs_sreq(subreq, netfs_sreq_trace_free); if (rreq->netfs_ops->free_subrequest) rreq->netfs_ops->free_subrequest(subreq); - kfree(subreq); + mempool_free(subreq, rreq->netfs_ops->subrequest_pool ?: &netfs_subrequest_pool); netfs_stat_d(&netfs_n_rh_sreq); netfs_put_request(rreq, was_async, netfs_rreq_trace_put_subreq); } diff --git a/fs/netfs/output.c b/fs/netfs/output.c deleted file mode 100644 index 625eb68f3e..0000000000 --- a/fs/netfs/output.c +++ /dev/null @@ -1,478 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* Network filesystem high-level write support. - * - * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - */ - -#include <linux/fs.h> -#include <linux/mm.h> -#include <linux/pagemap.h> -#include <linux/slab.h> -#include <linux/writeback.h> -#include <linux/pagevec.h> -#include "internal.h" - -/** - * netfs_create_write_request - Create a write operation. - * @wreq: The write request this is storing from. - * @dest: The destination type - * @start: Start of the region this write will modify - * @len: Length of the modification - * @worker: The worker function to handle the write(s) - * - * Allocate a write operation, set it up and add it to the list on a write - * request. - */ -struct netfs_io_subrequest *netfs_create_write_request(struct netfs_io_request *wreq, - enum netfs_io_source dest, - loff_t start, size_t len, - work_func_t worker) -{ - struct netfs_io_subrequest *subreq; - - subreq = netfs_alloc_subrequest(wreq); - if (subreq) { - INIT_WORK(&subreq->work, worker); - subreq->source = dest; - subreq->start = start; - subreq->len = len; - subreq->debug_index = wreq->subreq_counter++; - - switch (subreq->source) { - case NETFS_UPLOAD_TO_SERVER: - netfs_stat(&netfs_n_wh_upload); - break; - case NETFS_WRITE_TO_CACHE: - netfs_stat(&netfs_n_wh_write); - break; - default: - BUG(); - } - - subreq->io_iter = wreq->io_iter; - iov_iter_advance(&subreq->io_iter, subreq->start - wreq->start); - iov_iter_truncate(&subreq->io_iter, subreq->len); - - trace_netfs_sreq_ref(wreq->debug_id, subreq->debug_index, - refcount_read(&subreq->ref), - netfs_sreq_trace_new); - atomic_inc(&wreq->nr_outstanding); - list_add_tail(&subreq->rreq_link, &wreq->subrequests); - trace_netfs_sreq(subreq, netfs_sreq_trace_prepare); - } - - return subreq; -} -EXPORT_SYMBOL(netfs_create_write_request); - -/* - * Process a completed write request once all the component operations have - * been completed. - */ -static void netfs_write_terminated(struct netfs_io_request *wreq, bool was_async) -{ - struct netfs_io_subrequest *subreq; - struct netfs_inode *ctx = netfs_inode(wreq->inode); - size_t transferred = 0; - - _enter("R=%x[]", wreq->debug_id); - - trace_netfs_rreq(wreq, netfs_rreq_trace_write_done); - - list_for_each_entry(subreq, &wreq->subrequests, rreq_link) { - if (subreq->error || subreq->transferred == 0) - break; - transferred += subreq->transferred; - if (subreq->transferred < subreq->len) - break; - } - wreq->transferred = transferred; - - list_for_each_entry(subreq, &wreq->subrequests, rreq_link) { - if (!subreq->error) - continue; - switch (subreq->source) { - case NETFS_UPLOAD_TO_SERVER: - /* Depending on the type of failure, this may prevent - * writeback completion unless we're in disconnected - * mode. - */ - if (!wreq->error) - wreq->error = subreq->error; - break; - - case NETFS_WRITE_TO_CACHE: - /* Failure doesn't prevent writeback completion unless - * we're in disconnected mode. - */ - if (subreq->error != -ENOBUFS) - ctx->ops->invalidate_cache(wreq); - break; - - default: - WARN_ON_ONCE(1); - if (!wreq->error) - wreq->error = -EIO; - return; - } - } - - wreq->cleanup(wreq); - - if (wreq->origin == NETFS_DIO_WRITE && - wreq->mapping->nrpages) { - pgoff_t first = wreq->start >> PAGE_SHIFT; - pgoff_t last = (wreq->start + wreq->transferred - 1) >> PAGE_SHIFT; - invalidate_inode_pages2_range(wreq->mapping, first, last); - } - - if (wreq->origin == NETFS_DIO_WRITE) - inode_dio_end(wreq->inode); - - _debug("finished"); - trace_netfs_rreq(wreq, netfs_rreq_trace_wake_ip); - clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &wreq->flags); - wake_up_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS); - - if (wreq->iocb) { - wreq->iocb->ki_pos += transferred; - if (wreq->iocb->ki_complete) - wreq->iocb->ki_complete( - wreq->iocb, wreq->error ? wreq->error : transferred); - } - - netfs_clear_subrequests(wreq, was_async); - netfs_put_request(wreq, was_async, netfs_rreq_trace_put_complete); -} - -/* - * Deal with the completion of writing the data to the cache. - */ -void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error, - bool was_async) -{ - struct netfs_io_subrequest *subreq = _op; - struct netfs_io_request *wreq = subreq->rreq; - unsigned int u; - - _enter("%x[%x] %zd", wreq->debug_id, subreq->debug_index, transferred_or_error); - - switch (subreq->source) { - case NETFS_UPLOAD_TO_SERVER: - netfs_stat(&netfs_n_wh_upload_done); - break; - case NETFS_WRITE_TO_CACHE: - netfs_stat(&netfs_n_wh_write_done); - break; - case NETFS_INVALID_WRITE: - break; - default: - BUG(); - } - - if (IS_ERR_VALUE(transferred_or_error)) { - subreq->error = transferred_or_error; - trace_netfs_failure(wreq, subreq, transferred_or_error, - netfs_fail_write); - goto failed; - } - - if (WARN(transferred_or_error > subreq->len - subreq->transferred, - "Subreq excess write: R%x[%x] %zd > %zu - %zu", - wreq->debug_id, subreq->debug_index, - transferred_or_error, subreq->len, subreq->transferred)) - transferred_or_error = subreq->len - subreq->transferred; - - subreq->error = 0; - subreq->transferred += transferred_or_error; - - if (iov_iter_count(&subreq->io_iter) != subreq->len - subreq->transferred) - pr_warn("R=%08x[%u] ITER POST-MISMATCH %zx != %zx-%zx %x\n", - wreq->debug_id, subreq->debug_index, - iov_iter_count(&subreq->io_iter), subreq->len, - subreq->transferred, subreq->io_iter.iter_type); - - if (subreq->transferred < subreq->len) - goto incomplete; - - __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags); -out: - trace_netfs_sreq(subreq, netfs_sreq_trace_terminated); - - /* If we decrement nr_outstanding to 0, the ref belongs to us. */ - u = atomic_dec_return(&wreq->nr_outstanding); - if (u == 0) - netfs_write_terminated(wreq, was_async); - else if (u == 1) - wake_up_var(&wreq->nr_outstanding); - - netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated); - return; - -incomplete: - if (transferred_or_error == 0) { - if (__test_and_set_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags)) { - subreq->error = -ENODATA; - goto failed; - } - } else { - __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags); - } - - __set_bit(NETFS_SREQ_SHORT_IO, &subreq->flags); - set_bit(NETFS_RREQ_INCOMPLETE_IO, &wreq->flags); - goto out; - -failed: - switch (subreq->source) { - case NETFS_WRITE_TO_CACHE: - netfs_stat(&netfs_n_wh_write_failed); - set_bit(NETFS_RREQ_INCOMPLETE_IO, &wreq->flags); - break; - case NETFS_UPLOAD_TO_SERVER: - netfs_stat(&netfs_n_wh_upload_failed); - set_bit(NETFS_RREQ_FAILED, &wreq->flags); - wreq->error = subreq->error; - break; - default: - break; - } - goto out; -} -EXPORT_SYMBOL(netfs_write_subrequest_terminated); - -static void netfs_write_to_cache_op(struct netfs_io_subrequest *subreq) -{ - struct netfs_io_request *wreq = subreq->rreq; - struct netfs_cache_resources *cres = &wreq->cache_resources; - - trace_netfs_sreq(subreq, netfs_sreq_trace_submit); - - cres->ops->write(cres, subreq->start, &subreq->io_iter, - netfs_write_subrequest_terminated, subreq); -} - -static void netfs_write_to_cache_op_worker(struct work_struct *work) -{ - struct netfs_io_subrequest *subreq = - container_of(work, struct netfs_io_subrequest, work); - - netfs_write_to_cache_op(subreq); -} - -/** - * netfs_queue_write_request - Queue a write request for attention - * @subreq: The write request to be queued - * - * Queue the specified write request for processing by a worker thread. We - * pass the caller's ref on the request to the worker thread. - */ -void netfs_queue_write_request(struct netfs_io_subrequest *subreq) -{ - if (!queue_work(system_unbound_wq, &subreq->work)) - netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_wip); -} -EXPORT_SYMBOL(netfs_queue_write_request); - -/* - * Set up a op for writing to the cache. - */ -static void netfs_set_up_write_to_cache(struct netfs_io_request *wreq) -{ - struct netfs_cache_resources *cres = &wreq->cache_resources; - struct netfs_io_subrequest *subreq; - struct netfs_inode *ctx = netfs_inode(wreq->inode); - struct fscache_cookie *cookie = netfs_i_cookie(ctx); - loff_t start = wreq->start; - size_t len = wreq->len; - int ret; - - if (!fscache_cookie_enabled(cookie)) { - clear_bit(NETFS_RREQ_WRITE_TO_CACHE, &wreq->flags); - return; - } - - _debug("write to cache"); - ret = fscache_begin_write_operation(cres, cookie); - if (ret < 0) - return; - - ret = cres->ops->prepare_write(cres, &start, &len, wreq->upper_len, - i_size_read(wreq->inode), true); - if (ret < 0) - return; - - subreq = netfs_create_write_request(wreq, NETFS_WRITE_TO_CACHE, start, len, - netfs_write_to_cache_op_worker); - if (!subreq) - return; - - netfs_write_to_cache_op(subreq); -} - -/* - * Begin the process of writing out a chunk of data. - * - * We are given a write request that holds a series of dirty regions and - * (partially) covers a sequence of folios, all of which are present. The - * pages must have been marked as writeback as appropriate. - * - * We need to perform the following steps: - * - * (1) If encrypting, create an output buffer and encrypt each block of the - * data into it, otherwise the output buffer will point to the original - * folios. - * - * (2) If the data is to be cached, set up a write op for the entire output - * buffer to the cache, if the cache wants to accept it. - * - * (3) If the data is to be uploaded (ie. not merely cached): - * - * (a) If the data is to be compressed, create a compression buffer and - * compress the data into it. - * - * (b) For each destination we want to upload to, set up write ops to write - * to that destination. We may need multiple writes if the data is not - * contiguous or the span exceeds wsize for a server. - */ -int netfs_begin_write(struct netfs_io_request *wreq, bool may_wait, - enum netfs_write_trace what) -{ - struct netfs_inode *ctx = netfs_inode(wreq->inode); - - _enter("R=%x %llx-%llx f=%lx", - wreq->debug_id, wreq->start, wreq->start + wreq->len - 1, - wreq->flags); - - trace_netfs_write(wreq, what); - if (wreq->len == 0 || wreq->iter.count == 0) { - pr_err("Zero-sized write [R=%x]\n", wreq->debug_id); - return -EIO; - } - - if (wreq->origin == NETFS_DIO_WRITE) - inode_dio_begin(wreq->inode); - - wreq->io_iter = wreq->iter; - - /* ->outstanding > 0 carries a ref */ - netfs_get_request(wreq, netfs_rreq_trace_get_for_outstanding); - atomic_set(&wreq->nr_outstanding, 1); - - /* Start the encryption/compression going. We can do that in the - * background whilst we generate a list of write ops that we want to - * perform. - */ - // TODO: Encrypt or compress the region as appropriate - - /* We need to write all of the region to the cache */ - if (test_bit(NETFS_RREQ_WRITE_TO_CACHE, &wreq->flags)) - netfs_set_up_write_to_cache(wreq); - - /* However, we don't necessarily write all of the region to the server. - * Caching of reads is being managed this way also. - */ - if (test_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags)) - ctx->ops->create_write_requests(wreq, wreq->start, wreq->len); - - if (atomic_dec_and_test(&wreq->nr_outstanding)) - netfs_write_terminated(wreq, false); - - if (!may_wait) - return -EIOCBQUEUED; - - wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS, - TASK_UNINTERRUPTIBLE); - return wreq->error; -} - -/* - * Begin a write operation for writing through the pagecache. - */ -struct netfs_io_request *netfs_begin_writethrough(struct kiocb *iocb, size_t len) -{ - struct netfs_io_request *wreq; - struct file *file = iocb->ki_filp; - - wreq = netfs_alloc_request(file->f_mapping, file, iocb->ki_pos, len, - NETFS_WRITETHROUGH); - if (IS_ERR(wreq)) - return wreq; - - trace_netfs_write(wreq, netfs_write_trace_writethrough); - - __set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags); - iov_iter_xarray(&wreq->iter, ITER_SOURCE, &wreq->mapping->i_pages, wreq->start, 0); - wreq->io_iter = wreq->iter; - - /* ->outstanding > 0 carries a ref */ - netfs_get_request(wreq, netfs_rreq_trace_get_for_outstanding); - atomic_set(&wreq->nr_outstanding, 1); - return wreq; -} - -static void netfs_submit_writethrough(struct netfs_io_request *wreq, bool final) -{ - struct netfs_inode *ictx = netfs_inode(wreq->inode); - unsigned long long start; - size_t len; - - if (!test_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags)) - return; - - start = wreq->start + wreq->submitted; - len = wreq->iter.count - wreq->submitted; - if (!final) { - len /= wreq->wsize; /* Round to number of maximum packets */ - len *= wreq->wsize; - } - - ictx->ops->create_write_requests(wreq, start, len); - wreq->submitted += len; -} - -/* - * Advance the state of the write operation used when writing through the - * pagecache. Data has been copied into the pagecache that we need to append - * to the request. If we've added more than wsize then we need to create a new - * subrequest. - */ -int netfs_advance_writethrough(struct netfs_io_request *wreq, size_t copied, bool to_page_end) -{ - _enter("ic=%zu sb=%zu ws=%u cp=%zu tp=%u", - wreq->iter.count, wreq->submitted, wreq->wsize, copied, to_page_end); - - wreq->iter.count += copied; - wreq->io_iter.count += copied; - if (to_page_end && wreq->io_iter.count - wreq->submitted >= wreq->wsize) - netfs_submit_writethrough(wreq, false); - - return wreq->error; -} - -/* - * End a write operation used when writing through the pagecache. - */ -int netfs_end_writethrough(struct netfs_io_request *wreq, struct kiocb *iocb) -{ - int ret = -EIOCBQUEUED; - - _enter("ic=%zu sb=%zu ws=%u", - wreq->iter.count, wreq->submitted, wreq->wsize); - - if (wreq->submitted < wreq->io_iter.count) - netfs_submit_writethrough(wreq, true); - - if (atomic_dec_and_test(&wreq->nr_outstanding)) - netfs_write_terminated(wreq, false); - - if (is_sync_kiocb(iocb)) { - wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS, - TASK_UNINTERRUPTIBLE); - ret = wreq->error; - } - - netfs_put_request(wreq, false, netfs_rreq_trace_put_return); - return ret; -} diff --git a/fs/netfs/stats.c b/fs/netfs/stats.c index deeba9f9dc..0892768eea 100644 --- a/fs/netfs/stats.c +++ b/fs/netfs/stats.c @@ -10,9 +10,8 @@ #include "internal.h" atomic_t netfs_n_rh_dio_read; -atomic_t netfs_n_rh_dio_write; atomic_t netfs_n_rh_readahead; -atomic_t netfs_n_rh_readpage; +atomic_t netfs_n_rh_read_folio; atomic_t netfs_n_rh_rreq; atomic_t netfs_n_rh_sreq; atomic_t netfs_n_rh_download; @@ -29,6 +28,10 @@ atomic_t netfs_n_rh_write_begin; atomic_t netfs_n_rh_write_done; atomic_t netfs_n_rh_write_failed; atomic_t netfs_n_rh_write_zskip; +atomic_t netfs_n_wh_buffered_write; +atomic_t netfs_n_wh_writethrough; +atomic_t netfs_n_wh_dio_write; +atomic_t netfs_n_wh_writepages; atomic_t netfs_n_wh_wstream_conflict; atomic_t netfs_n_wh_upload; atomic_t netfs_n_wh_upload_done; @@ -39,13 +42,17 @@ atomic_t netfs_n_wh_write_failed; int netfs_stats_show(struct seq_file *m, void *v) { - seq_printf(m, "Netfs : DR=%u DW=%u RA=%u RP=%u WB=%u WBZ=%u\n", + seq_printf(m, "Netfs : DR=%u RA=%u RF=%u WB=%u WBZ=%u\n", atomic_read(&netfs_n_rh_dio_read), - atomic_read(&netfs_n_rh_dio_write), atomic_read(&netfs_n_rh_readahead), - atomic_read(&netfs_n_rh_readpage), + atomic_read(&netfs_n_rh_read_folio), atomic_read(&netfs_n_rh_write_begin), atomic_read(&netfs_n_rh_write_zskip)); + seq_printf(m, "Netfs : BW=%u WT=%u DW=%u WP=%u\n", + atomic_read(&netfs_n_wh_buffered_write), + atomic_read(&netfs_n_wh_writethrough), + atomic_read(&netfs_n_wh_dio_write), + atomic_read(&netfs_n_wh_writepages)); seq_printf(m, "Netfs : ZR=%u sh=%u sk=%u\n", atomic_read(&netfs_n_rh_zero), atomic_read(&netfs_n_rh_short_read), diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c new file mode 100644 index 0000000000..488147439f --- /dev/null +++ b/fs/netfs/write_collect.c @@ -0,0 +1,809 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Network filesystem write subrequest result collection, assessment + * and retrying. + * + * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/export.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/slab.h> +#include "internal.h" + +/* Notes made in the collector */ +#define HIT_PENDING 0x01 /* A front op was still pending */ +#define SOME_EMPTY 0x02 /* One of more streams are empty */ +#define ALL_EMPTY 0x04 /* All streams are empty */ +#define MAYBE_DISCONTIG 0x08 /* A front op may be discontiguous (rounded to PAGE_SIZE) */ +#define NEED_REASSESS 0x10 /* Need to loop round and reassess */ +#define REASSESS_DISCONTIG 0x20 /* Reassess discontiguity if contiguity advances */ +#define MADE_PROGRESS 0x40 /* Made progress cleaning up a stream or the folio set */ +#define BUFFERED 0x80 /* The pagecache needs cleaning up */ +#define NEED_RETRY 0x100 /* A front op requests retrying */ +#define SAW_FAILURE 0x200 /* One stream or hit a permanent failure */ + +/* + * Successful completion of write of a folio to the server and/or cache. Note + * that we are not allowed to lock the folio here on pain of deadlocking with + * truncate. + */ +int netfs_folio_written_back(struct folio *folio) +{ + enum netfs_folio_trace why = netfs_folio_trace_clear; + struct netfs_folio *finfo; + struct netfs_group *group = NULL; + int gcount = 0; + + if ((finfo = netfs_folio_info(folio))) { + /* Streaming writes cannot be redirtied whilst under writeback, + * so discard the streaming record. + */ + folio_detach_private(folio); + group = finfo->netfs_group; + gcount++; + kfree(finfo); + why = netfs_folio_trace_clear_s; + goto end_wb; + } + + if ((group = netfs_folio_group(folio))) { + if (group == NETFS_FOLIO_COPY_TO_CACHE) { + why = netfs_folio_trace_clear_cc; + folio_detach_private(folio); + goto end_wb; + } + + /* Need to detach the group pointer if the page didn't get + * redirtied. If it has been redirtied, then it must be within + * the same group. + */ + why = netfs_folio_trace_redirtied; + if (!folio_test_dirty(folio)) { + folio_detach_private(folio); + gcount++; + why = netfs_folio_trace_clear_g; + } + } + +end_wb: + trace_netfs_folio(folio, why); + folio_end_writeback(folio); + return gcount; +} + +/* + * Get hold of a folio we have under writeback. We don't want to get the + * refcount on it. + */ +static struct folio *netfs_writeback_lookup_folio(struct netfs_io_request *wreq, loff_t pos) +{ + XA_STATE(xas, &wreq->mapping->i_pages, pos / PAGE_SIZE); + struct folio *folio; + + rcu_read_lock(); + + for (;;) { + xas_reset(&xas); + folio = xas_load(&xas); + if (xas_retry(&xas, folio)) + continue; + + if (!folio || xa_is_value(folio)) + kdebug("R=%08x: folio %lx (%llx) not present", + wreq->debug_id, xas.xa_index, pos / PAGE_SIZE); + BUG_ON(!folio || xa_is_value(folio)); + + if (folio == xas_reload(&xas)) + break; + } + + rcu_read_unlock(); + + if (WARN_ONCE(!folio_test_writeback(folio), + "R=%08x: folio %lx is not under writeback\n", + wreq->debug_id, folio->index)) { + trace_netfs_folio(folio, netfs_folio_trace_not_under_wback); + } + return folio; +} + +/* + * Unlock any folios we've finished with. + */ +static void netfs_writeback_unlock_folios(struct netfs_io_request *wreq, + unsigned long long collected_to, + unsigned int *notes) +{ + for (;;) { + struct folio *folio; + struct netfs_folio *finfo; + unsigned long long fpos, fend; + size_t fsize, flen; + + folio = netfs_writeback_lookup_folio(wreq, wreq->cleaned_to); + + fpos = folio_pos(folio); + fsize = folio_size(folio); + finfo = netfs_folio_info(folio); + flen = finfo ? finfo->dirty_offset + finfo->dirty_len : fsize; + + fend = min_t(unsigned long long, fpos + flen, wreq->i_size); + + trace_netfs_collect_folio(wreq, folio, fend, collected_to); + + if (fpos + fsize > wreq->contiguity) { + trace_netfs_collect_contig(wreq, fpos + fsize, + netfs_contig_trace_unlock); + wreq->contiguity = fpos + fsize; + } + + /* Unlock any folio we've transferred all of. */ + if (collected_to < fend) + break; + + wreq->nr_group_rel += netfs_folio_written_back(folio); + wreq->cleaned_to = fpos + fsize; + *notes |= MADE_PROGRESS; + + if (fpos + fsize >= collected_to) + break; + } +} + +/* + * Perform retries on the streams that need it. + */ +static void netfs_retry_write_stream(struct netfs_io_request *wreq, + struct netfs_io_stream *stream) +{ + struct list_head *next; + + kenter("R=%x[%x:]", wreq->debug_id, stream->stream_nr); + + if (list_empty(&stream->subrequests)) + return; + + if (stream->source == NETFS_UPLOAD_TO_SERVER && + wreq->netfs_ops->retry_request) + wreq->netfs_ops->retry_request(wreq, stream); + + if (unlikely(stream->failed)) + return; + + /* If there's no renegotiation to do, just resend each failed subreq. */ + if (!stream->prepare_write) { + struct netfs_io_subrequest *subreq; + + list_for_each_entry(subreq, &stream->subrequests, rreq_link) { + if (test_bit(NETFS_SREQ_FAILED, &subreq->flags)) + break; + if (__test_and_clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) { + __set_bit(NETFS_SREQ_RETRYING, &subreq->flags); + netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); + netfs_reissue_write(stream, subreq); + } + } + return; + } + + next = stream->subrequests.next; + + do { + struct netfs_io_subrequest *subreq = NULL, *from, *to, *tmp; + unsigned long long start, len; + size_t part; + bool boundary = false; + + /* Go through the stream and find the next span of contiguous + * data that we then rejig (cifs, for example, needs the wsize + * renegotiating) and reissue. + */ + from = list_entry(next, struct netfs_io_subrequest, rreq_link); + to = from; + start = from->start + from->transferred; + len = from->len - from->transferred; + + if (test_bit(NETFS_SREQ_FAILED, &from->flags) || + !test_bit(NETFS_SREQ_NEED_RETRY, &from->flags)) + return; + + list_for_each_continue(next, &stream->subrequests) { + subreq = list_entry(next, struct netfs_io_subrequest, rreq_link); + if (subreq->start + subreq->transferred != start + len || + test_bit(NETFS_SREQ_BOUNDARY, &subreq->flags) || + !test_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) + break; + to = subreq; + len += to->len; + } + + /* Work through the sublist. */ + subreq = from; + list_for_each_entry_from(subreq, &stream->subrequests, rreq_link) { + if (!len) + break; + /* Renegotiate max_len (wsize) */ + trace_netfs_sreq(subreq, netfs_sreq_trace_retry); + __clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); + __set_bit(NETFS_SREQ_RETRYING, &subreq->flags); + stream->prepare_write(subreq); + + part = min(len, subreq->max_len); + subreq->len = part; + subreq->start = start; + subreq->transferred = 0; + len -= part; + start += part; + if (len && subreq == to && + __test_and_clear_bit(NETFS_SREQ_BOUNDARY, &to->flags)) + boundary = true; + + netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); + netfs_reissue_write(stream, subreq); + if (subreq == to) + break; + } + + /* If we managed to use fewer subreqs, we can discard the + * excess; if we used the same number, then we're done. + */ + if (!len) { + if (subreq == to) + continue; + list_for_each_entry_safe_from(subreq, tmp, + &stream->subrequests, rreq_link) { + trace_netfs_sreq(subreq, netfs_sreq_trace_discard); + list_del(&subreq->rreq_link); + netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_done); + if (subreq == to) + break; + } + continue; + } + + /* We ran out of subrequests, so we need to allocate some more + * and insert them after. + */ + do { + subreq = netfs_alloc_subrequest(wreq); + subreq->source = to->source; + subreq->start = start; + subreq->max_len = len; + subreq->max_nr_segs = INT_MAX; + subreq->debug_index = atomic_inc_return(&wreq->subreq_counter); + subreq->stream_nr = to->stream_nr; + __set_bit(NETFS_SREQ_RETRYING, &subreq->flags); + + trace_netfs_sreq_ref(wreq->debug_id, subreq->debug_index, + refcount_read(&subreq->ref), + netfs_sreq_trace_new); + netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); + + list_add(&subreq->rreq_link, &to->rreq_link); + to = list_next_entry(to, rreq_link); + trace_netfs_sreq(subreq, netfs_sreq_trace_retry); + + switch (stream->source) { + case NETFS_UPLOAD_TO_SERVER: + netfs_stat(&netfs_n_wh_upload); + subreq->max_len = min(len, wreq->wsize); + break; + case NETFS_WRITE_TO_CACHE: + netfs_stat(&netfs_n_wh_write); + break; + default: + WARN_ON_ONCE(1); + } + + stream->prepare_write(subreq); + + part = min(len, subreq->max_len); + subreq->len = subreq->transferred + part; + len -= part; + start += part; + if (!len && boundary) { + __set_bit(NETFS_SREQ_BOUNDARY, &to->flags); + boundary = false; + } + + netfs_reissue_write(stream, subreq); + if (!len) + break; + + } while (len); + + } while (!list_is_head(next, &stream->subrequests)); +} + +/* + * Perform retries on the streams that need it. If we're doing content + * encryption and the server copy changed due to a third-party write, we may + * need to do an RMW cycle and also rewrite the data to the cache. + */ +static void netfs_retry_writes(struct netfs_io_request *wreq) +{ + struct netfs_io_subrequest *subreq; + struct netfs_io_stream *stream; + int s; + + /* Wait for all outstanding I/O to quiesce before performing retries as + * we may need to renegotiate the I/O sizes. + */ + for (s = 0; s < NR_IO_STREAMS; s++) { + stream = &wreq->io_streams[s]; + if (!stream->active) + continue; + + list_for_each_entry(subreq, &stream->subrequests, rreq_link) { + wait_on_bit(&subreq->flags, NETFS_SREQ_IN_PROGRESS, + TASK_UNINTERRUPTIBLE); + } + } + + // TODO: Enc: Fetch changed partial pages + // TODO: Enc: Reencrypt content if needed. + // TODO: Enc: Wind back transferred point. + // TODO: Enc: Mark cache pages for retry. + + for (s = 0; s < NR_IO_STREAMS; s++) { + stream = &wreq->io_streams[s]; + if (stream->need_retry) { + stream->need_retry = false; + netfs_retry_write_stream(wreq, stream); + } + } +} + +/* + * Collect and assess the results of various write subrequests. We may need to + * retry some of the results - or even do an RMW cycle for content crypto. + * + * Note that we have a number of parallel, overlapping lists of subrequests, + * one to the server and one to the local cache for example, which may not be + * the same size or starting position and may not even correspond in boundary + * alignment. + */ +static void netfs_collect_write_results(struct netfs_io_request *wreq) +{ + struct netfs_io_subrequest *front, *remove; + struct netfs_io_stream *stream; + unsigned long long collected_to; + unsigned int notes; + int s; + + kenter("%llx-%llx", wreq->start, wreq->start + wreq->len); + trace_netfs_collect(wreq); + trace_netfs_rreq(wreq, netfs_rreq_trace_collect); + +reassess_streams: + smp_rmb(); + collected_to = ULLONG_MAX; + if (wreq->origin == NETFS_WRITEBACK) + notes = ALL_EMPTY | BUFFERED | MAYBE_DISCONTIG; + else if (wreq->origin == NETFS_WRITETHROUGH) + notes = ALL_EMPTY | BUFFERED; + else + notes = ALL_EMPTY; + + /* Remove completed subrequests from the front of the streams and + * advance the completion point on each stream. We stop when we hit + * something that's in progress. The issuer thread may be adding stuff + * to the tail whilst we're doing this. + * + * We must not, however, merge in discontiguities that span whole + * folios that aren't under writeback. This is made more complicated + * by the folios in the gap being of unpredictable sizes - if they even + * exist - but we don't want to look them up. + */ + for (s = 0; s < NR_IO_STREAMS; s++) { + loff_t rstart, rend; + + stream = &wreq->io_streams[s]; + /* Read active flag before list pointers */ + if (!smp_load_acquire(&stream->active)) + continue; + + front = stream->front; + while (front) { + trace_netfs_collect_sreq(wreq, front); + //kdebug("sreq [%x] %llx %zx/%zx", + // front->debug_index, front->start, front->transferred, front->len); + + /* Stall if there may be a discontinuity. */ + rstart = round_down(front->start, PAGE_SIZE); + if (rstart > wreq->contiguity) { + if (wreq->contiguity > stream->collected_to) { + trace_netfs_collect_gap(wreq, stream, + wreq->contiguity, 'D'); + stream->collected_to = wreq->contiguity; + } + notes |= REASSESS_DISCONTIG; + break; + } + rend = round_up(front->start + front->len, PAGE_SIZE); + if (rend > wreq->contiguity) { + trace_netfs_collect_contig(wreq, rend, + netfs_contig_trace_collect); + wreq->contiguity = rend; + if (notes & REASSESS_DISCONTIG) + notes |= NEED_REASSESS; + } + notes &= ~MAYBE_DISCONTIG; + + /* Stall if the front is still undergoing I/O. */ + if (test_bit(NETFS_SREQ_IN_PROGRESS, &front->flags)) { + notes |= HIT_PENDING; + break; + } + smp_rmb(); /* Read counters after I-P flag. */ + + if (stream->failed) { + stream->collected_to = front->start + front->len; + notes |= MADE_PROGRESS | SAW_FAILURE; + goto cancel; + } + if (front->start + front->transferred > stream->collected_to) { + stream->collected_to = front->start + front->transferred; + stream->transferred = stream->collected_to - wreq->start; + notes |= MADE_PROGRESS; + } + if (test_bit(NETFS_SREQ_FAILED, &front->flags)) { + stream->failed = true; + stream->error = front->error; + if (stream->source == NETFS_UPLOAD_TO_SERVER) + mapping_set_error(wreq->mapping, front->error); + notes |= NEED_REASSESS | SAW_FAILURE; + break; + } + if (front->transferred < front->len) { + stream->need_retry = true; + notes |= NEED_RETRY | MADE_PROGRESS; + break; + } + + cancel: + /* Remove if completely consumed. */ + spin_lock(&wreq->lock); + + remove = front; + list_del_init(&front->rreq_link); + front = list_first_entry_or_null(&stream->subrequests, + struct netfs_io_subrequest, rreq_link); + stream->front = front; + if (!front) { + unsigned long long jump_to = atomic64_read(&wreq->issued_to); + + if (stream->collected_to < jump_to) { + trace_netfs_collect_gap(wreq, stream, jump_to, 'A'); + stream->collected_to = jump_to; + } + } + + spin_unlock(&wreq->lock); + netfs_put_subrequest(remove, false, + notes & SAW_FAILURE ? + netfs_sreq_trace_put_cancel : + netfs_sreq_trace_put_done); + } + + if (front) + notes &= ~ALL_EMPTY; + else + notes |= SOME_EMPTY; + + if (stream->collected_to < collected_to) + collected_to = stream->collected_to; + } + + if (collected_to != ULLONG_MAX && collected_to > wreq->collected_to) + wreq->collected_to = collected_to; + + /* If we have an empty stream, we need to jump it forward over any gap + * otherwise the collection point will never advance. + * + * Note that the issuer always adds to the stream with the lowest + * so-far submitted start, so if we see two consecutive subreqs in one + * stream with nothing between then in another stream, then the second + * stream has a gap that can be jumped. + */ + if (notes & SOME_EMPTY) { + unsigned long long jump_to = wreq->start + READ_ONCE(wreq->submitted); + + for (s = 0; s < NR_IO_STREAMS; s++) { + stream = &wreq->io_streams[s]; + if (stream->active && + stream->front && + stream->front->start < jump_to) + jump_to = stream->front->start; + } + + for (s = 0; s < NR_IO_STREAMS; s++) { + stream = &wreq->io_streams[s]; + if (stream->active && + !stream->front && + stream->collected_to < jump_to) { + trace_netfs_collect_gap(wreq, stream, jump_to, 'B'); + stream->collected_to = jump_to; + } + } + } + + for (s = 0; s < NR_IO_STREAMS; s++) { + stream = &wreq->io_streams[s]; + if (stream->active) + trace_netfs_collect_stream(wreq, stream); + } + + trace_netfs_collect_state(wreq, wreq->collected_to, notes); + + /* Unlock any folios that we have now finished with. */ + if (notes & BUFFERED) { + unsigned long long clean_to = min(wreq->collected_to, wreq->contiguity); + + if (wreq->cleaned_to < clean_to) + netfs_writeback_unlock_folios(wreq, clean_to, ¬es); + } else { + wreq->cleaned_to = wreq->collected_to; + } + + // TODO: Discard encryption buffers + + /* If all streams are discontiguous with the last folio we cleared, we + * may need to skip a set of folios. + */ + if ((notes & (MAYBE_DISCONTIG | ALL_EMPTY)) == MAYBE_DISCONTIG) { + unsigned long long jump_to = ULLONG_MAX; + + for (s = 0; s < NR_IO_STREAMS; s++) { + stream = &wreq->io_streams[s]; + if (stream->active && stream->front && + stream->front->start < jump_to) + jump_to = stream->front->start; + } + + trace_netfs_collect_contig(wreq, jump_to, netfs_contig_trace_jump); + wreq->contiguity = jump_to; + wreq->cleaned_to = jump_to; + wreq->collected_to = jump_to; + for (s = 0; s < NR_IO_STREAMS; s++) { + stream = &wreq->io_streams[s]; + if (stream->collected_to < jump_to) + stream->collected_to = jump_to; + } + //cond_resched(); + notes |= MADE_PROGRESS; + goto reassess_streams; + } + + if (notes & NEED_RETRY) + goto need_retry; + if ((notes & MADE_PROGRESS) && test_bit(NETFS_RREQ_PAUSE, &wreq->flags)) { + trace_netfs_rreq(wreq, netfs_rreq_trace_unpause); + clear_bit_unlock(NETFS_RREQ_PAUSE, &wreq->flags); + wake_up_bit(&wreq->flags, NETFS_RREQ_PAUSE); + } + + if (notes & NEED_REASSESS) { + //cond_resched(); + goto reassess_streams; + } + if (notes & MADE_PROGRESS) { + //cond_resched(); + goto reassess_streams; + } + +out: + netfs_put_group_many(wreq->group, wreq->nr_group_rel); + wreq->nr_group_rel = 0; + kleave(" = %x", notes); + return; + +need_retry: + /* Okay... We're going to have to retry one or both streams. Note + * that any partially completed op will have had any wholly transferred + * folios removed from it. + */ + kdebug("retry"); + netfs_retry_writes(wreq); + goto out; +} + +/* + * Perform the collection of subrequests, folios and encryption buffers. + */ +void netfs_write_collection_worker(struct work_struct *work) +{ + struct netfs_io_request *wreq = container_of(work, struct netfs_io_request, work); + struct netfs_inode *ictx = netfs_inode(wreq->inode); + size_t transferred; + int s; + + kenter("R=%x", wreq->debug_id); + + netfs_see_request(wreq, netfs_rreq_trace_see_work); + if (!test_bit(NETFS_RREQ_IN_PROGRESS, &wreq->flags)) { + netfs_put_request(wreq, false, netfs_rreq_trace_put_work); + return; + } + + netfs_collect_write_results(wreq); + + /* We're done when the app thread has finished posting subreqs and all + * the queues in all the streams are empty. + */ + if (!test_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags)) { + netfs_put_request(wreq, false, netfs_rreq_trace_put_work); + return; + } + smp_rmb(); /* Read ALL_QUEUED before lists. */ + + transferred = LONG_MAX; + for (s = 0; s < NR_IO_STREAMS; s++) { + struct netfs_io_stream *stream = &wreq->io_streams[s]; + if (!stream->active) + continue; + if (!list_empty(&stream->subrequests)) { + netfs_put_request(wreq, false, netfs_rreq_trace_put_work); + return; + } + if (stream->transferred < transferred) + transferred = stream->transferred; + } + + /* Okay, declare that all I/O is complete. */ + wreq->transferred = transferred; + trace_netfs_rreq(wreq, netfs_rreq_trace_write_done); + + if (wreq->io_streams[1].active && + wreq->io_streams[1].failed) { + /* Cache write failure doesn't prevent writeback completion + * unless we're in disconnected mode. + */ + ictx->ops->invalidate_cache(wreq); + } + + if (wreq->cleanup) + wreq->cleanup(wreq); + + if (wreq->origin == NETFS_DIO_WRITE && + wreq->mapping->nrpages) { + /* mmap may have got underfoot and we may now have folios + * locally covering the region we just wrote. Attempt to + * discard the folios, but leave in place any modified locally. + * ->write_iter() is prevented from interfering by the DIO + * counter. + */ + pgoff_t first = wreq->start >> PAGE_SHIFT; + pgoff_t last = (wreq->start + wreq->transferred - 1) >> PAGE_SHIFT; + invalidate_inode_pages2_range(wreq->mapping, first, last); + } + + if (wreq->origin == NETFS_DIO_WRITE) + inode_dio_end(wreq->inode); + + kdebug("finished"); + trace_netfs_rreq(wreq, netfs_rreq_trace_wake_ip); + clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &wreq->flags); + wake_up_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS); + + if (wreq->iocb) { + size_t written = min(wreq->transferred, wreq->len); + wreq->iocb->ki_pos += written; + if (wreq->iocb->ki_complete) + wreq->iocb->ki_complete( + wreq->iocb, wreq->error ? wreq->error : written); + wreq->iocb = VFS_PTR_POISON; + } + + netfs_clear_subrequests(wreq, false); + netfs_put_request(wreq, false, netfs_rreq_trace_put_work_complete); +} + +/* + * Wake the collection work item. + */ +void netfs_wake_write_collector(struct netfs_io_request *wreq, bool was_async) +{ + if (!work_pending(&wreq->work)) { + netfs_get_request(wreq, netfs_rreq_trace_get_work); + if (!queue_work(system_unbound_wq, &wreq->work)) + netfs_put_request(wreq, was_async, netfs_rreq_trace_put_work_nq); + } +} + +/** + * netfs_write_subrequest_terminated - Note the termination of a write operation. + * @_op: The I/O request that has terminated. + * @transferred_or_error: The amount of data transferred or an error code. + * @was_async: The termination was asynchronous + * + * This tells the library that a contributory write I/O operation has + * terminated, one way or another, and that it should collect the results. + * + * The caller indicates in @transferred_or_error the outcome of the operation, + * supplying a positive value to indicate the number of bytes transferred or a + * negative error code. The library will look after reissuing I/O operations + * as appropriate and writing downloaded data to the cache. + * + * If @was_async is true, the caller might be running in softirq or interrupt + * context and we can't sleep. + * + * When this is called, ownership of the subrequest is transferred back to the + * library, along with a ref. + * + * Note that %_op is a void* so that the function can be passed to + * kiocb::term_func without the need for a casting wrapper. + */ +void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error, + bool was_async) +{ + struct netfs_io_subrequest *subreq = _op; + struct netfs_io_request *wreq = subreq->rreq; + struct netfs_io_stream *stream = &wreq->io_streams[subreq->stream_nr]; + + kenter("%x[%x] %zd", wreq->debug_id, subreq->debug_index, transferred_or_error); + + switch (subreq->source) { + case NETFS_UPLOAD_TO_SERVER: + netfs_stat(&netfs_n_wh_upload_done); + break; + case NETFS_WRITE_TO_CACHE: + netfs_stat(&netfs_n_wh_write_done); + break; + case NETFS_INVALID_WRITE: + break; + default: + BUG(); + } + + if (IS_ERR_VALUE(transferred_or_error)) { + subreq->error = transferred_or_error; + if (subreq->error == -EAGAIN) + set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); + else + set_bit(NETFS_SREQ_FAILED, &subreq->flags); + trace_netfs_failure(wreq, subreq, transferred_or_error, netfs_fail_write); + + switch (subreq->source) { + case NETFS_WRITE_TO_CACHE: + netfs_stat(&netfs_n_wh_write_failed); + break; + case NETFS_UPLOAD_TO_SERVER: + netfs_stat(&netfs_n_wh_upload_failed); + break; + default: + break; + } + trace_netfs_rreq(wreq, netfs_rreq_trace_set_pause); + set_bit(NETFS_RREQ_PAUSE, &wreq->flags); + } else { + if (WARN(transferred_or_error > subreq->len - subreq->transferred, + "Subreq excess write: R=%x[%x] %zd > %zu - %zu", + wreq->debug_id, subreq->debug_index, + transferred_or_error, subreq->len, subreq->transferred)) + transferred_or_error = subreq->len - subreq->transferred; + + subreq->error = 0; + subreq->transferred += transferred_or_error; + + if (subreq->transferred < subreq->len) + set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); + } + + trace_netfs_sreq(subreq, netfs_sreq_trace_terminated); + + clear_bit_unlock(NETFS_SREQ_IN_PROGRESS, &subreq->flags); + wake_up_bit(&subreq->flags, NETFS_SREQ_IN_PROGRESS); + + /* If we are at the head of the queue, wake up the collector, + * transferring a ref to it if we were the ones to do so. + */ + if (list_is_first(&subreq->rreq_link, &stream->subrequests)) + netfs_wake_write_collector(wreq, was_async); + + netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated); +} +EXPORT_SYMBOL(netfs_write_subrequest_terminated); diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c new file mode 100644 index 0000000000..32bc88bee5 --- /dev/null +++ b/fs/netfs/write_issue.c @@ -0,0 +1,690 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Network filesystem high-level (buffered) writeback. + * + * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * + * To support network filesystems with local caching, we manage a situation + * that can be envisioned like the following: + * + * +---+---+-----+-----+---+----------+ + * Folios: | | | | | | | + * +---+---+-----+-----+---+----------+ + * + * +------+------+ +----+----+ + * Upload: | | |.....| | | + * (Stream 0) +------+------+ +----+----+ + * + * +------+------+------+------+------+ + * Cache: | | | | | | + * (Stream 1) +------+------+------+------+------+ + * + * Where we have a sequence of folios of varying sizes that we need to overlay + * with multiple parallel streams of I/O requests, where the I/O requests in a + * stream may also be of various sizes (in cifs, for example, the sizes are + * negotiated with the server; in something like ceph, they may represent the + * sizes of storage objects). + * + * The sequence in each stream may contain gaps and noncontiguous subrequests + * may be glued together into single vectored write RPCs. + */ + +#include <linux/export.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/pagemap.h> +#include "internal.h" + +/* + * Kill all dirty folios in the event of an unrecoverable error, starting with + * a locked folio we've already obtained from writeback_iter(). + */ +static void netfs_kill_dirty_pages(struct address_space *mapping, + struct writeback_control *wbc, + struct folio *folio) +{ + int error = 0; + + do { + enum netfs_folio_trace why = netfs_folio_trace_kill; + struct netfs_group *group = NULL; + struct netfs_folio *finfo = NULL; + void *priv; + + priv = folio_detach_private(folio); + if (priv) { + finfo = __netfs_folio_info(priv); + if (finfo) { + /* Kill folio from streaming write. */ + group = finfo->netfs_group; + why = netfs_folio_trace_kill_s; + } else { + group = priv; + if (group == NETFS_FOLIO_COPY_TO_CACHE) { + /* Kill copy-to-cache folio */ + why = netfs_folio_trace_kill_cc; + group = NULL; + } else { + /* Kill folio with group */ + why = netfs_folio_trace_kill_g; + } + } + } + + trace_netfs_folio(folio, why); + + folio_start_writeback(folio); + folio_unlock(folio); + folio_end_writeback(folio); + + netfs_put_group(group); + kfree(finfo); + + } while ((folio = writeback_iter(mapping, wbc, folio, &error))); +} + +/* + * Create a write request and set it up appropriately for the origin type. + */ +struct netfs_io_request *netfs_create_write_req(struct address_space *mapping, + struct file *file, + loff_t start, + enum netfs_io_origin origin) +{ + struct netfs_io_request *wreq; + struct netfs_inode *ictx; + + wreq = netfs_alloc_request(mapping, file, start, 0, origin); + if (IS_ERR(wreq)) + return wreq; + + kenter("R=%x", wreq->debug_id); + + ictx = netfs_inode(wreq->inode); + if (test_bit(NETFS_RREQ_WRITE_TO_CACHE, &wreq->flags)) + fscache_begin_write_operation(&wreq->cache_resources, netfs_i_cookie(ictx)); + + wreq->contiguity = wreq->start; + wreq->cleaned_to = wreq->start; + INIT_WORK(&wreq->work, netfs_write_collection_worker); + + wreq->io_streams[0].stream_nr = 0; + wreq->io_streams[0].source = NETFS_UPLOAD_TO_SERVER; + wreq->io_streams[0].prepare_write = ictx->ops->prepare_write; + wreq->io_streams[0].issue_write = ictx->ops->issue_write; + wreq->io_streams[0].collected_to = start; + wreq->io_streams[0].transferred = LONG_MAX; + + wreq->io_streams[1].stream_nr = 1; + wreq->io_streams[1].source = NETFS_WRITE_TO_CACHE; + wreq->io_streams[1].collected_to = start; + wreq->io_streams[1].transferred = LONG_MAX; + if (fscache_resources_valid(&wreq->cache_resources)) { + wreq->io_streams[1].avail = true; + wreq->io_streams[1].active = true; + wreq->io_streams[1].prepare_write = wreq->cache_resources.ops->prepare_write_subreq; + wreq->io_streams[1].issue_write = wreq->cache_resources.ops->issue_write; + } + + return wreq; +} + +/** + * netfs_prepare_write_failed - Note write preparation failed + * @subreq: The subrequest to mark + * + * Mark a subrequest to note that preparation for write failed. + */ +void netfs_prepare_write_failed(struct netfs_io_subrequest *subreq) +{ + __set_bit(NETFS_SREQ_FAILED, &subreq->flags); + trace_netfs_sreq(subreq, netfs_sreq_trace_prep_failed); +} +EXPORT_SYMBOL(netfs_prepare_write_failed); + +/* + * Prepare a write subrequest. We need to allocate a new subrequest + * if we don't have one. + */ +static void netfs_prepare_write(struct netfs_io_request *wreq, + struct netfs_io_stream *stream, + loff_t start) +{ + struct netfs_io_subrequest *subreq; + + subreq = netfs_alloc_subrequest(wreq); + subreq->source = stream->source; + subreq->start = start; + subreq->max_len = ULONG_MAX; + subreq->max_nr_segs = INT_MAX; + subreq->stream_nr = stream->stream_nr; + + kenter("R=%x[%x]", wreq->debug_id, subreq->debug_index); + + trace_netfs_sreq_ref(wreq->debug_id, subreq->debug_index, + refcount_read(&subreq->ref), + netfs_sreq_trace_new); + + trace_netfs_sreq(subreq, netfs_sreq_trace_prepare); + + switch (stream->source) { + case NETFS_UPLOAD_TO_SERVER: + netfs_stat(&netfs_n_wh_upload); + subreq->max_len = wreq->wsize; + break; + case NETFS_WRITE_TO_CACHE: + netfs_stat(&netfs_n_wh_write); + break; + default: + WARN_ON_ONCE(1); + break; + } + + if (stream->prepare_write) + stream->prepare_write(subreq); + + __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags); + + /* We add to the end of the list whilst the collector may be walking + * the list. The collector only goes nextwards and uses the lock to + * remove entries off of the front. + */ + spin_lock(&wreq->lock); + list_add_tail(&subreq->rreq_link, &stream->subrequests); + if (list_is_first(&subreq->rreq_link, &stream->subrequests)) { + stream->front = subreq; + if (!stream->active) { + stream->collected_to = stream->front->start; + /* Write list pointers before active flag */ + smp_store_release(&stream->active, true); + } + } + + spin_unlock(&wreq->lock); + + stream->construct = subreq; +} + +/* + * Set the I/O iterator for the filesystem/cache to use and dispatch the I/O + * operation. The operation may be asynchronous and should call + * netfs_write_subrequest_terminated() when complete. + */ +static void netfs_do_issue_write(struct netfs_io_stream *stream, + struct netfs_io_subrequest *subreq) +{ + struct netfs_io_request *wreq = subreq->rreq; + + kenter("R=%x[%x],%zx", wreq->debug_id, subreq->debug_index, subreq->len); + + if (test_bit(NETFS_SREQ_FAILED, &subreq->flags)) + return netfs_write_subrequest_terminated(subreq, subreq->error, false); + + // TODO: Use encrypted buffer + if (test_bit(NETFS_RREQ_USE_IO_ITER, &wreq->flags)) { + subreq->io_iter = wreq->io_iter; + iov_iter_advance(&subreq->io_iter, + subreq->start + subreq->transferred - wreq->start); + iov_iter_truncate(&subreq->io_iter, + subreq->len - subreq->transferred); + } else { + iov_iter_xarray(&subreq->io_iter, ITER_SOURCE, &wreq->mapping->i_pages, + subreq->start + subreq->transferred, + subreq->len - subreq->transferred); + } + + trace_netfs_sreq(subreq, netfs_sreq_trace_submit); + stream->issue_write(subreq); +} + +void netfs_reissue_write(struct netfs_io_stream *stream, + struct netfs_io_subrequest *subreq) +{ + __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags); + netfs_do_issue_write(stream, subreq); +} + +static void netfs_issue_write(struct netfs_io_request *wreq, + struct netfs_io_stream *stream) +{ + struct netfs_io_subrequest *subreq = stream->construct; + + if (!subreq) + return; + stream->construct = NULL; + + if (subreq->start + subreq->len > wreq->start + wreq->submitted) + WRITE_ONCE(wreq->submitted, subreq->start + subreq->len - wreq->start); + netfs_do_issue_write(stream, subreq); +} + +/* + * Add data to the write subrequest, dispatching each as we fill it up or if it + * is discontiguous with the previous. We only fill one part at a time so that + * we can avoid overrunning the credits obtained (cifs) and try to parallelise + * content-crypto preparation with network writes. + */ +int netfs_advance_write(struct netfs_io_request *wreq, + struct netfs_io_stream *stream, + loff_t start, size_t len, bool to_eof) +{ + struct netfs_io_subrequest *subreq = stream->construct; + size_t part; + + if (!stream->avail) { + kleave("no write"); + return len; + } + + kenter("R=%x[%x]", wreq->debug_id, subreq ? subreq->debug_index : 0); + + if (subreq && start != subreq->start + subreq->len) { + netfs_issue_write(wreq, stream); + subreq = NULL; + } + + if (!stream->construct) + netfs_prepare_write(wreq, stream, start); + subreq = stream->construct; + + part = min(subreq->max_len - subreq->len, len); + kdebug("part %zx/%zx %zx/%zx", subreq->len, subreq->max_len, part, len); + subreq->len += part; + subreq->nr_segs++; + + if (subreq->len >= subreq->max_len || + subreq->nr_segs >= subreq->max_nr_segs || + to_eof) { + netfs_issue_write(wreq, stream); + subreq = NULL; + } + + return part; +} + +/* + * Write some of a pending folio data back to the server. + */ +static int netfs_write_folio(struct netfs_io_request *wreq, + struct writeback_control *wbc, + struct folio *folio) +{ + struct netfs_io_stream *upload = &wreq->io_streams[0]; + struct netfs_io_stream *cache = &wreq->io_streams[1]; + struct netfs_io_stream *stream; + struct netfs_group *fgroup; /* TODO: Use this with ceph */ + struct netfs_folio *finfo; + size_t fsize = folio_size(folio), flen = fsize, foff = 0; + loff_t fpos = folio_pos(folio), i_size; + bool to_eof = false, streamw = false; + bool debug = false; + + kenter(""); + + /* netfs_perform_write() may shift i_size around the page or from out + * of the page to beyond it, but cannot move i_size into or through the + * page since we have it locked. + */ + i_size = i_size_read(wreq->inode); + + if (fpos >= i_size) { + /* mmap beyond eof. */ + kdebug("beyond eof"); + folio_start_writeback(folio); + folio_unlock(folio); + wreq->nr_group_rel += netfs_folio_written_back(folio); + netfs_put_group_many(wreq->group, wreq->nr_group_rel); + wreq->nr_group_rel = 0; + return 0; + } + + if (fpos + fsize > wreq->i_size) + wreq->i_size = i_size; + + fgroup = netfs_folio_group(folio); + finfo = netfs_folio_info(folio); + if (finfo) { + foff = finfo->dirty_offset; + flen = foff + finfo->dirty_len; + streamw = true; + } + + if (wreq->origin == NETFS_WRITETHROUGH) { + to_eof = false; + if (flen > i_size - fpos) + flen = i_size - fpos; + } else if (flen > i_size - fpos) { + flen = i_size - fpos; + if (!streamw) + folio_zero_segment(folio, flen, fsize); + to_eof = true; + } else if (flen == i_size - fpos) { + to_eof = true; + } + flen -= foff; + + kdebug("folio %zx %zx %zx", foff, flen, fsize); + + /* Deal with discontinuities in the stream of dirty pages. These can + * arise from a number of sources: + * + * (1) Intervening non-dirty pages from random-access writes, multiple + * flushers writing back different parts simultaneously and manual + * syncing. + * + * (2) Partially-written pages from write-streaming. + * + * (3) Pages that belong to a different write-back group (eg. Ceph + * snapshots). + * + * (4) Actually-clean pages that were marked for write to the cache + * when they were read. Note that these appear as a special + * write-back group. + */ + if (fgroup == NETFS_FOLIO_COPY_TO_CACHE) { + netfs_issue_write(wreq, upload); + } else if (fgroup != wreq->group) { + /* We can't write this page to the server yet. */ + kdebug("wrong group"); + folio_redirty_for_writepage(wbc, folio); + folio_unlock(folio); + netfs_issue_write(wreq, upload); + netfs_issue_write(wreq, cache); + return 0; + } + + if (foff > 0) + netfs_issue_write(wreq, upload); + if (streamw) + netfs_issue_write(wreq, cache); + + /* Flip the page to the writeback state and unlock. If we're called + * from write-through, then the page has already been put into the wb + * state. + */ + if (wreq->origin == NETFS_WRITEBACK) + folio_start_writeback(folio); + folio_unlock(folio); + + if (fgroup == NETFS_FOLIO_COPY_TO_CACHE) { + if (!fscache_resources_valid(&wreq->cache_resources)) { + trace_netfs_folio(folio, netfs_folio_trace_cancel_copy); + netfs_issue_write(wreq, upload); + netfs_folio_written_back(folio); + return 0; + } + trace_netfs_folio(folio, netfs_folio_trace_store_copy); + } else if (!upload->construct) { + trace_netfs_folio(folio, netfs_folio_trace_store); + } else { + trace_netfs_folio(folio, netfs_folio_trace_store_plus); + } + + /* Move the submission point forward to allow for write-streaming data + * not starting at the front of the page. We don't do write-streaming + * with the cache as the cache requires DIO alignment. + * + * Also skip uploading for data that's been read and just needs copying + * to the cache. + */ + for (int s = 0; s < NR_IO_STREAMS; s++) { + stream = &wreq->io_streams[s]; + stream->submit_max_len = fsize; + stream->submit_off = foff; + stream->submit_len = flen; + if ((stream->source == NETFS_WRITE_TO_CACHE && streamw) || + (stream->source == NETFS_UPLOAD_TO_SERVER && + fgroup == NETFS_FOLIO_COPY_TO_CACHE)) { + stream->submit_off = UINT_MAX; + stream->submit_len = 0; + stream->submit_max_len = 0; + } + } + + /* Attach the folio to one or more subrequests. For a big folio, we + * could end up with thousands of subrequests if the wsize is small - + * but we might need to wait during the creation of subrequests for + * network resources (eg. SMB credits). + */ + for (;;) { + ssize_t part; + size_t lowest_off = ULONG_MAX; + int choose_s = -1; + + /* Always add to the lowest-submitted stream first. */ + for (int s = 0; s < NR_IO_STREAMS; s++) { + stream = &wreq->io_streams[s]; + if (stream->submit_len > 0 && + stream->submit_off < lowest_off) { + lowest_off = stream->submit_off; + choose_s = s; + } + } + + if (choose_s < 0) + break; + stream = &wreq->io_streams[choose_s]; + + part = netfs_advance_write(wreq, stream, fpos + stream->submit_off, + stream->submit_len, to_eof); + atomic64_set(&wreq->issued_to, fpos + stream->submit_off); + stream->submit_off += part; + stream->submit_max_len -= part; + if (part > stream->submit_len) + stream->submit_len = 0; + else + stream->submit_len -= part; + if (part > 0) + debug = true; + } + + atomic64_set(&wreq->issued_to, fpos + fsize); + + if (!debug) + kdebug("R=%x: No submit", wreq->debug_id); + + if (foff + flen < fsize) + for (int s = 0; s < NR_IO_STREAMS; s++) + netfs_issue_write(wreq, &wreq->io_streams[s]); + + kleave(" = 0"); + return 0; +} + +/* + * Write some of the pending data back to the server + */ +int netfs_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct netfs_inode *ictx = netfs_inode(mapping->host); + struct netfs_io_request *wreq = NULL; + struct folio *folio; + int error = 0; + + if (wbc->sync_mode == WB_SYNC_ALL) + mutex_lock(&ictx->wb_lock); + else if (!mutex_trylock(&ictx->wb_lock)) + return 0; + + /* Need the first folio to be able to set up the op. */ + folio = writeback_iter(mapping, wbc, NULL, &error); + if (!folio) + goto out; + + wreq = netfs_create_write_req(mapping, NULL, folio_pos(folio), NETFS_WRITEBACK); + if (IS_ERR(wreq)) { + error = PTR_ERR(wreq); + goto couldnt_start; + } + + trace_netfs_write(wreq, netfs_write_trace_writeback); + netfs_stat(&netfs_n_wh_writepages); + + do { + kdebug("wbiter %lx %llx", folio->index, wreq->start + wreq->submitted); + + /* It appears we don't have to handle cyclic writeback wrapping. */ + WARN_ON_ONCE(wreq && folio_pos(folio) < wreq->start + wreq->submitted); + + if (netfs_folio_group(folio) != NETFS_FOLIO_COPY_TO_CACHE && + unlikely(!test_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags))) { + set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags); + wreq->netfs_ops->begin_writeback(wreq); + } + + error = netfs_write_folio(wreq, wbc, folio); + if (error < 0) + break; + } while ((folio = writeback_iter(mapping, wbc, folio, &error))); + + for (int s = 0; s < NR_IO_STREAMS; s++) + netfs_issue_write(wreq, &wreq->io_streams[s]); + smp_wmb(); /* Write lists before ALL_QUEUED. */ + set_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags); + + mutex_unlock(&ictx->wb_lock); + + netfs_put_request(wreq, false, netfs_rreq_trace_put_return); + kleave(" = %d", error); + return error; + +couldnt_start: + netfs_kill_dirty_pages(mapping, wbc, folio); +out: + mutex_unlock(&ictx->wb_lock); + kleave(" = %d", error); + return error; +} +EXPORT_SYMBOL(netfs_writepages); + +/* + * Begin a write operation for writing through the pagecache. + */ +struct netfs_io_request *netfs_begin_writethrough(struct kiocb *iocb, size_t len) +{ + struct netfs_io_request *wreq = NULL; + struct netfs_inode *ictx = netfs_inode(file_inode(iocb->ki_filp)); + + mutex_lock(&ictx->wb_lock); + + wreq = netfs_create_write_req(iocb->ki_filp->f_mapping, iocb->ki_filp, + iocb->ki_pos, NETFS_WRITETHROUGH); + if (IS_ERR(wreq)) { + mutex_unlock(&ictx->wb_lock); + return wreq; + } + + wreq->io_streams[0].avail = true; + trace_netfs_write(wreq, netfs_write_trace_writethrough); + return wreq; +} + +/* + * Advance the state of the write operation used when writing through the + * pagecache. Data has been copied into the pagecache that we need to append + * to the request. If we've added more than wsize then we need to create a new + * subrequest. + */ +int netfs_advance_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc, + struct folio *folio, size_t copied, bool to_page_end, + struct folio **writethrough_cache) +{ + kenter("R=%x ic=%zu ws=%u cp=%zu tp=%u", + wreq->debug_id, wreq->iter.count, wreq->wsize, copied, to_page_end); + + if (!*writethrough_cache) { + if (folio_test_dirty(folio)) + /* Sigh. mmap. */ + folio_clear_dirty_for_io(folio); + + /* We can make multiple writes to the folio... */ + folio_start_writeback(folio); + if (wreq->len == 0) + trace_netfs_folio(folio, netfs_folio_trace_wthru); + else + trace_netfs_folio(folio, netfs_folio_trace_wthru_plus); + *writethrough_cache = folio; + } + + wreq->len += copied; + if (!to_page_end) + return 0; + + *writethrough_cache = NULL; + return netfs_write_folio(wreq, wbc, folio); +} + +/* + * End a write operation used when writing through the pagecache. + */ +int netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc, + struct folio *writethrough_cache) +{ + struct netfs_inode *ictx = netfs_inode(wreq->inode); + int ret; + + kenter("R=%x", wreq->debug_id); + + if (writethrough_cache) + netfs_write_folio(wreq, wbc, writethrough_cache); + + netfs_issue_write(wreq, &wreq->io_streams[0]); + netfs_issue_write(wreq, &wreq->io_streams[1]); + smp_wmb(); /* Write lists before ALL_QUEUED. */ + set_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags); + + mutex_unlock(&ictx->wb_lock); + + if (wreq->iocb) { + ret = -EIOCBQUEUED; + } else { + wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS, TASK_UNINTERRUPTIBLE); + ret = wreq->error; + } + netfs_put_request(wreq, false, netfs_rreq_trace_put_return); + return ret; +} + +/* + * Write data to the server without going through the pagecache and without + * writing it to the local cache. + */ +int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t len) +{ + struct netfs_io_stream *upload = &wreq->io_streams[0]; + ssize_t part; + loff_t start = wreq->start; + int error = 0; + + kenter("%zx", len); + + if (wreq->origin == NETFS_DIO_WRITE) + inode_dio_begin(wreq->inode); + + while (len) { + // TODO: Prepare content encryption + + kdebug("unbuffered %zx", len); + part = netfs_advance_write(wreq, upload, start, len, false); + start += part; + len -= part; + if (test_bit(NETFS_RREQ_PAUSE, &wreq->flags)) { + trace_netfs_rreq(wreq, netfs_rreq_trace_wait_pause); + wait_on_bit(&wreq->flags, NETFS_RREQ_PAUSE, TASK_UNINTERRUPTIBLE); + } + if (test_bit(NETFS_RREQ_FAILED, &wreq->flags)) + break; + } + + netfs_issue_write(wreq, upload); + + smp_wmb(); /* Write lists before ALL_QUEUED. */ + set_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags); + if (list_empty(&upload->subrequests)) + netfs_wake_write_collector(wreq, false); + + kleave(" = %d", error); + return error; +} |