From 3afb00d3f86d3d924f88b56fa8285d4e9db85852 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Wed, 7 Aug 2024 15:17:52 +0200 Subject: Merging upstream version 6.10.3. Signed-off-by: Daniel Baumann --- fs/erofs/Kconfig | 15 ++ fs/erofs/Makefile | 5 +- fs/erofs/compress.h | 4 + fs/erofs/data.c | 25 ++- fs/erofs/decompressor.c | 15 +- fs/erofs/decompressor_zstd.c | 279 ++++++++++++++++++++++++++ fs/erofs/dir.c | 4 +- fs/erofs/erofs_fs.h | 15 +- fs/erofs/fscache.c | 12 +- fs/erofs/inode.c | 4 +- fs/erofs/internal.h | 37 ++-- fs/erofs/namei.c | 6 +- fs/erofs/pcpubuf.c | 148 -------------- fs/erofs/super.c | 74 ++++--- fs/erofs/utils.c | 287 --------------------------- fs/erofs/xattr.c | 37 ++-- fs/erofs/zdata.c | 8 +- fs/erofs/zmap.c | 48 ++--- fs/erofs/zutil.c | 456 +++++++++++++++++++++++++++++++++++++++++++ 19 files changed, 894 insertions(+), 585 deletions(-) create mode 100644 fs/erofs/decompressor_zstd.c delete mode 100644 fs/erofs/pcpubuf.c delete mode 100644 fs/erofs/utils.c create mode 100644 fs/erofs/zutil.c (limited to 'fs/erofs') diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig index fffd391934..7dcdce660c 100644 --- a/fs/erofs/Kconfig +++ b/fs/erofs/Kconfig @@ -112,6 +112,21 @@ config EROFS_FS_ZIP_DEFLATE If unsure, say N. +config EROFS_FS_ZIP_ZSTD + bool "EROFS Zstandard compressed data support" + depends on EROFS_FS_ZIP + select ZSTD_DECOMPRESS + help + Saying Y here includes support for reading EROFS file systems + containing Zstandard compressed data. It gives better compression + ratios than the default LZ4 format, while it costs more CPU + overhead. + + Zstandard support is an experimental feature for now and so most + file systems will be readable without selecting this option. + + If unsure, say N. + config EROFS_FS_ONDEMAND bool "EROFS fscache-based on-demand read support" depends on EROFS_FS diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile index 994d0b9ded..097d672e6b 100644 --- a/fs/erofs/Makefile +++ b/fs/erofs/Makefile @@ -1,9 +1,10 @@ # SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_EROFS_FS) += erofs.o -erofs-objs := super.o inode.o data.o namei.o dir.o utils.o sysfs.o +erofs-objs := super.o inode.o data.o namei.o dir.o sysfs.o erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o -erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o pcpubuf.o +erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o zutil.o erofs-$(CONFIG_EROFS_FS_ZIP_LZMA) += decompressor_lzma.o erofs-$(CONFIG_EROFS_FS_ZIP_DEFLATE) += decompressor_deflate.o +erofs-$(CONFIG_EROFS_FS_ZIP_ZSTD) += decompressor_zstd.o erofs-$(CONFIG_EROFS_FS_ONDEMAND) += fscache.o diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h index 333587ba61..19d53c30c8 100644 --- a/fs/erofs/compress.h +++ b/fs/erofs/compress.h @@ -90,8 +90,12 @@ int z_erofs_load_lzma_config(struct super_block *sb, struct erofs_super_block *dsb, void *data, int size); int z_erofs_load_deflate_config(struct super_block *sb, struct erofs_super_block *dsb, void *data, int size); +int z_erofs_load_zstd_config(struct super_block *sb, + struct erofs_super_block *dsb, void *data, int size); int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq, struct page **pagepool); int z_erofs_deflate_decompress(struct z_erofs_decompress_req *rq, struct page **pagepool); +int z_erofs_zstd_decompress(struct z_erofs_decompress_req *rq, + struct page **pgpl); #endif diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 52524bd969..8be60797ea 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -29,11 +29,9 @@ void erofs_put_metabuf(struct erofs_buf *buf) * Derive the block size from inode->i_blkbits to make compatible with * anonymous inode in fscache mode. */ -void *erofs_bread(struct erofs_buf *buf, erofs_blk_t blkaddr, +void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset, enum erofs_kmap_type type) { - struct inode *inode = buf->inode; - erofs_off_t offset = (erofs_off_t)blkaddr << inode->i_blkbits; pgoff_t index = offset >> PAGE_SHIFT; struct page *page = buf->page; struct folio *folio; @@ -43,7 +41,7 @@ void *erofs_bread(struct erofs_buf *buf, erofs_blk_t blkaddr, erofs_put_metabuf(buf); nofs_flag = memalloc_nofs_save(); - folio = read_cache_folio(inode->i_mapping, index, NULL, NULL); + folio = read_cache_folio(buf->mapping, index, NULL, NULL); memalloc_nofs_restore(nofs_flag); if (IS_ERR(folio)) return folio; @@ -68,16 +66,16 @@ void *erofs_bread(struct erofs_buf *buf, erofs_blk_t blkaddr, void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb) { if (erofs_is_fscache_mode(sb)) - buf->inode = EROFS_SB(sb)->s_fscache->inode; + buf->mapping = EROFS_SB(sb)->s_fscache->inode->i_mapping; else - buf->inode = sb->s_bdev->bd_inode; + buf->mapping = sb->s_bdev->bd_mapping; } void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb, - erofs_blk_t blkaddr, enum erofs_kmap_type type) + erofs_off_t offset, enum erofs_kmap_type type) { erofs_init_metabuf(buf, sb); - return erofs_bread(buf, blkaddr, type); + return erofs_bread(buf, offset, type); } static int erofs_map_blocks_flatmode(struct inode *inode, @@ -154,7 +152,7 @@ int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map) pos = ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, unit) + unit * chunknr; - kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(sb, pos), EROFS_KMAP); + kaddr = erofs_read_metabuf(&buf, sb, pos, EROFS_KMAP); if (IS_ERR(kaddr)) { err = PTR_ERR(kaddr); goto out; @@ -165,7 +163,7 @@ int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map) /* handle block map */ if (!(vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES)) { - __le32 *blkaddr = kaddr + erofs_blkoff(sb, pos); + __le32 *blkaddr = kaddr; if (le32_to_cpu(*blkaddr) == EROFS_NULL_ADDR) { map->m_flags = 0; @@ -176,7 +174,7 @@ int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map) goto out_unlock; } /* parse chunk indexes */ - idx = kaddr + erofs_blkoff(sb, pos); + idx = kaddr; switch (le32_to_cpu(idx->blkaddr)) { case EROFS_NULL_ADDR: map->m_flags = 0; @@ -296,11 +294,10 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, struct erofs_buf buf = __EROFS_BUF_INITIALIZER; iomap->type = IOMAP_INLINE; - ptr = erofs_read_metabuf(&buf, sb, - erofs_blknr(sb, mdev.m_pa), EROFS_KMAP); + ptr = erofs_read_metabuf(&buf, sb, mdev.m_pa, EROFS_KMAP); if (IS_ERR(ptr)) return PTR_ERR(ptr); - iomap->inline_data = ptr + erofs_blkoff(sb, mdev.m_pa); + iomap->inline_data = ptr; iomap->private = buf.base; } else { iomap->type = IOMAP_MAPPED; diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 2ec9b2bb62..9d85b6c11c 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -54,7 +54,7 @@ static int z_erofs_load_lz4_config(struct super_block *sb, sbi->lz4.max_distance_pages = distance ? DIV_ROUND_UP(distance, PAGE_SIZE) + 1 : LZ4_MAX_DISTANCE_PAGES; - return erofs_pcpubuf_growsize(sbi->lz4.max_pclusterblks); + return z_erofs_gbuf_growsize(sbi->lz4.max_pclusterblks); } /* @@ -111,7 +111,7 @@ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx, victim = availables[--top]; get_page(victim); } else { - victim = erofs_allocpage(pagepool, rq->gfp); + victim = __erofs_allocpage(pagepool, rq->gfp, true); if (!victim) return -ENOMEM; set_page_private(victim, Z_EROFS_SHORTLIVED_PAGE); @@ -159,7 +159,7 @@ static void *z_erofs_lz4_handle_overlap(struct z_erofs_lz4_decompress_ctx *ctx, docopy: /* Or copy compressed data which can be overlapped to per-CPU buffer */ in = rq->in; - src = erofs_get_pcpubuf(ctx->inpages); + src = z_erofs_get_gbuf(ctx->inpages); if (!src) { DBG_BUGON(1); kunmap_local(inpage); @@ -260,7 +260,7 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx, } else if (maptype == 1) { vm_unmap_ram(src, ctx->inpages); } else if (maptype == 2) { - erofs_put_pcpubuf(src); + z_erofs_put_gbuf(src); } else if (maptype != 3) { DBG_BUGON(1); return -EFAULT; @@ -399,6 +399,13 @@ const struct z_erofs_decompressor erofs_decompressors[] = { .name = "deflate" }, #endif +#ifdef CONFIG_EROFS_FS_ZIP_ZSTD + [Z_EROFS_COMPRESSION_ZSTD] = { + .config = z_erofs_load_zstd_config, + .decompress = z_erofs_zstd_decompress, + .name = "zstd" + }, +#endif }; int z_erofs_parse_cfgs(struct super_block *sb, struct erofs_super_block *dsb) diff --git a/fs/erofs/decompressor_zstd.c b/fs/erofs/decompressor_zstd.c new file mode 100644 index 0000000000..63a23cac3a --- /dev/null +++ b/fs/erofs/decompressor_zstd.c @@ -0,0 +1,279 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include +#include "compress.h" + +struct z_erofs_zstd { + struct z_erofs_zstd *next; + u8 bounce[PAGE_SIZE]; + void *wksp; + unsigned int wkspsz; +}; + +static DEFINE_SPINLOCK(z_erofs_zstd_lock); +static unsigned int z_erofs_zstd_max_dictsize; +static unsigned int z_erofs_zstd_nstrms, z_erofs_zstd_avail_strms; +static struct z_erofs_zstd *z_erofs_zstd_head; +static DECLARE_WAIT_QUEUE_HEAD(z_erofs_zstd_wq); + +module_param_named(zstd_streams, z_erofs_zstd_nstrms, uint, 0444); + +static struct z_erofs_zstd *z_erofs_isolate_strms(bool all) +{ + struct z_erofs_zstd *strm; + +again: + spin_lock(&z_erofs_zstd_lock); + strm = z_erofs_zstd_head; + if (!strm) { + spin_unlock(&z_erofs_zstd_lock); + wait_event(z_erofs_zstd_wq, READ_ONCE(z_erofs_zstd_head)); + goto again; + } + z_erofs_zstd_head = all ? NULL : strm->next; + spin_unlock(&z_erofs_zstd_lock); + return strm; +} + +void z_erofs_zstd_exit(void) +{ + while (z_erofs_zstd_avail_strms) { + struct z_erofs_zstd *strm, *n; + + for (strm = z_erofs_isolate_strms(true); strm; strm = n) { + n = strm->next; + + kvfree(strm->wksp); + kfree(strm); + --z_erofs_zstd_avail_strms; + } + } +} + +int __init z_erofs_zstd_init(void) +{ + /* by default, use # of possible CPUs instead */ + if (!z_erofs_zstd_nstrms) + z_erofs_zstd_nstrms = num_possible_cpus(); + + for (; z_erofs_zstd_avail_strms < z_erofs_zstd_nstrms; + ++z_erofs_zstd_avail_strms) { + struct z_erofs_zstd *strm; + + strm = kzalloc(sizeof(*strm), GFP_KERNEL); + if (!strm) { + z_erofs_zstd_exit(); + return -ENOMEM; + } + spin_lock(&z_erofs_zstd_lock); + strm->next = z_erofs_zstd_head; + z_erofs_zstd_head = strm; + spin_unlock(&z_erofs_zstd_lock); + } + return 0; +} + +int z_erofs_load_zstd_config(struct super_block *sb, + struct erofs_super_block *dsb, void *data, int size) +{ + static DEFINE_MUTEX(zstd_resize_mutex); + struct z_erofs_zstd_cfgs *zstd = data; + unsigned int dict_size, wkspsz; + struct z_erofs_zstd *strm, *head = NULL; + void *wksp; + + if (!zstd || size < sizeof(struct z_erofs_zstd_cfgs) || zstd->format) { + erofs_err(sb, "unsupported zstd format, size=%u", size); + return -EINVAL; + } + + if (zstd->windowlog > ilog2(Z_EROFS_ZSTD_MAX_DICT_SIZE) - 10) { + erofs_err(sb, "unsupported zstd window log %u", zstd->windowlog); + return -EINVAL; + } + dict_size = 1U << (zstd->windowlog + 10); + + /* in case 2 z_erofs_load_zstd_config() race to avoid deadlock */ + mutex_lock(&zstd_resize_mutex); + if (z_erofs_zstd_max_dictsize >= dict_size) { + mutex_unlock(&zstd_resize_mutex); + return 0; + } + + /* 1. collect/isolate all streams for the following check */ + while (z_erofs_zstd_avail_strms) { + struct z_erofs_zstd *n; + + for (strm = z_erofs_isolate_strms(true); strm; strm = n) { + n = strm->next; + strm->next = head; + head = strm; + --z_erofs_zstd_avail_strms; + } + } + + /* 2. walk each isolated stream and grow max dict_size if needed */ + wkspsz = zstd_dstream_workspace_bound(dict_size); + for (strm = head; strm; strm = strm->next) { + wksp = kvmalloc(wkspsz, GFP_KERNEL); + if (!wksp) + break; + kvfree(strm->wksp); + strm->wksp = wksp; + strm->wkspsz = wkspsz; + } + + /* 3. push back all to the global list and update max dict_size */ + spin_lock(&z_erofs_zstd_lock); + DBG_BUGON(z_erofs_zstd_head); + z_erofs_zstd_head = head; + spin_unlock(&z_erofs_zstd_lock); + z_erofs_zstd_avail_strms = z_erofs_zstd_nstrms; + wake_up_all(&z_erofs_zstd_wq); + if (!strm) + z_erofs_zstd_max_dictsize = dict_size; + mutex_unlock(&zstd_resize_mutex); + return strm ? -ENOMEM : 0; +} + +int z_erofs_zstd_decompress(struct z_erofs_decompress_req *rq, + struct page **pgpl) +{ + const unsigned int nrpages_out = + PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; + const unsigned int nrpages_in = + PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT; + zstd_dstream *stream; + struct super_block *sb = rq->sb; + unsigned int insz, outsz, pofs; + struct z_erofs_zstd *strm; + zstd_in_buffer in_buf = { NULL, 0, 0 }; + zstd_out_buffer out_buf = { NULL, 0, 0 }; + u8 *kin, *kout = NULL; + bool bounced = false; + int no = -1, ni = 0, j = 0, zerr, err; + + /* 1. get the exact compressed size */ + kin = kmap_local_page(*rq->in); + err = z_erofs_fixup_insize(rq, kin + rq->pageofs_in, + min_t(unsigned int, rq->inputsize, + sb->s_blocksize - rq->pageofs_in)); + if (err) { + kunmap_local(kin); + return err; + } + + /* 2. get an available ZSTD context */ + strm = z_erofs_isolate_strms(false); + + /* 3. multi-call decompress */ + insz = rq->inputsize; + outsz = rq->outputsize; + stream = zstd_init_dstream(z_erofs_zstd_max_dictsize, strm->wksp, strm->wkspsz); + if (!stream) { + err = -EIO; + goto failed_zinit; + } + + pofs = rq->pageofs_out; + in_buf.size = min_t(u32, insz, PAGE_SIZE - rq->pageofs_in); + insz -= in_buf.size; + in_buf.src = kin + rq->pageofs_in; + do { + if (out_buf.size == out_buf.pos) { + if (++no >= nrpages_out || !outsz) { + erofs_err(sb, "insufficient space for decompressed data"); + err = -EFSCORRUPTED; + break; + } + + if (kout) + kunmap_local(kout); + out_buf.size = min_t(u32, outsz, PAGE_SIZE - pofs); + outsz -= out_buf.size; + if (!rq->out[no]) { + rq->out[no] = erofs_allocpage(pgpl, rq->gfp); + if (!rq->out[no]) { + kout = NULL; + err = -ENOMEM; + break; + } + set_page_private(rq->out[no], + Z_EROFS_SHORTLIVED_PAGE); + } + kout = kmap_local_page(rq->out[no]); + out_buf.dst = kout + pofs; + out_buf.pos = 0; + pofs = 0; + } + + if (in_buf.size == in_buf.pos && insz) { + if (++ni >= nrpages_in) { + erofs_err(sb, "invalid compressed data"); + err = -EFSCORRUPTED; + break; + } + + if (kout) /* unlike kmap(), take care of the orders */ + kunmap_local(kout); + kunmap_local(kin); + in_buf.size = min_t(u32, insz, PAGE_SIZE); + insz -= in_buf.size; + kin = kmap_local_page(rq->in[ni]); + in_buf.src = kin; + in_buf.pos = 0; + bounced = false; + if (kout) { + j = (u8 *)out_buf.dst - kout; + kout = kmap_local_page(rq->out[no]); + out_buf.dst = kout + j; + } + } + + /* + * Handle overlapping: Use bounced buffer if the compressed + * data is under processing; Or use short-lived pages from the + * on-stack pagepool where pages share among the same request + * and not _all_ inplace I/O pages are needed to be doubled. + */ + if (!bounced && rq->out[no] == rq->in[ni]) { + memcpy(strm->bounce, in_buf.src, in_buf.size); + in_buf.src = strm->bounce; + bounced = true; + } + + for (j = ni + 1; j < nrpages_in; ++j) { + struct page *tmppage; + + if (rq->out[no] != rq->in[j]) + continue; + tmppage = erofs_allocpage(pgpl, rq->gfp); + if (!tmppage) { + err = -ENOMEM; + goto failed; + } + set_page_private(tmppage, Z_EROFS_SHORTLIVED_PAGE); + copy_highpage(tmppage, rq->in[j]); + rq->in[j] = tmppage; + } + zerr = zstd_decompress_stream(stream, &out_buf, &in_buf); + if (zstd_is_error(zerr) || (!zerr && outsz)) { + erofs_err(sb, "failed to decompress in[%u] out[%u]: %s", + rq->inputsize, rq->outputsize, + zerr ? zstd_get_error_name(zerr) : "unexpected end of stream"); + err = -EFSCORRUPTED; + break; + } + } while (outsz || out_buf.pos < out_buf.size); +failed: + if (kout) + kunmap_local(kout); +failed_zinit: + kunmap_local(kin); + /* 4. push back ZSTD stream context to the global list */ + spin_lock(&z_erofs_zstd_lock); + strm->next = z_erofs_zstd_head; + z_erofs_zstd_head = strm; + spin_unlock(&z_erofs_zstd_lock); + wake_up(&z_erofs_zstd_wq); + return err; +} diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c index b80abec053..2193a6710c 100644 --- a/fs/erofs/dir.c +++ b/fs/erofs/dir.c @@ -58,12 +58,12 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx) int err = 0; bool initial = true; - buf.inode = dir; + buf.mapping = dir->i_mapping; while (ctx->pos < dirsize) { struct erofs_dirent *de; unsigned int nameoff, maxsize; - de = erofs_bread(&buf, i, EROFS_KMAP); + de = erofs_bread(&buf, erofs_pos(sb, i), EROFS_KMAP); if (IS_ERR(de)) { erofs_err(sb, "fail to readdir of logical block %u of nid %llu", i, EROFS_I(dir)->nid); diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index a03ec70ba6..6c0c270c42 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -296,6 +296,7 @@ enum { Z_EROFS_COMPRESSION_LZ4 = 0, Z_EROFS_COMPRESSION_LZMA = 1, Z_EROFS_COMPRESSION_DEFLATE = 2, + Z_EROFS_COMPRESSION_ZSTD = 3, Z_EROFS_COMPRESSION_MAX }; #define Z_EROFS_ALL_COMPR_ALGS ((1 << Z_EROFS_COMPRESSION_MAX) - 1) @@ -322,6 +323,15 @@ struct z_erofs_deflate_cfgs { u8 reserved[5]; } __packed; +/* 6 bytes (+ length field = 8 bytes) */ +struct z_erofs_zstd_cfgs { + u8 format; + u8 windowlog; /* windowLog - ZSTD_WINDOWLOG_ABSOLUTEMIN(10) */ + u8 reserved[4]; +} __packed; + +#define Z_EROFS_ZSTD_MAX_DICT_SIZE Z_EROFS_PCLUSTER_MAX_SIZE + /* * bit 0 : COMPACTED_2B indexes (0 - off; 1 - on) * e.g. for 4k logical cluster size, 4B if compacted 2B is off; @@ -396,8 +406,7 @@ enum { Z_EROFS_LCLUSTER_TYPE_MAX }; -#define Z_EROFS_LI_LCLUSTER_TYPE_BITS 2 -#define Z_EROFS_LI_LCLUSTER_TYPE_BIT 0 +#define Z_EROFS_LI_LCLUSTER_TYPE_MASK (Z_EROFS_LCLUSTER_TYPE_MAX - 1) /* (noncompact only, HEAD) This pcluster refers to partial decompressed data */ #define Z_EROFS_LI_PARTIAL_REF (1 << 15) @@ -451,8 +460,6 @@ static inline void erofs_check_ondisk_layout_definitions(void) sizeof(struct z_erofs_lcluster_index)); BUILD_BUG_ON(sizeof(struct erofs_deviceslot) != 128); - BUILD_BUG_ON(BIT(Z_EROFS_LI_LCLUSTER_TYPE_BITS) < - Z_EROFS_LCLUSTER_TYPE_MAX - 1); /* exclude old compiler versions like gcc 7.5.0 */ BUILD_BUG_ON(__builtin_constant_p(fmh) ? fmh != cpu_to_le64(1ULL << 63) : 0); diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index 62da538d91..fda16eedaf 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -273,21 +273,15 @@ static int erofs_fscache_data_read_slice(struct erofs_fscache_rq *req) if (map.m_flags & EROFS_MAP_META) { struct erofs_buf buf = __EROFS_BUF_INITIALIZER; struct iov_iter iter; - erofs_blk_t blknr; - size_t offset, size; + size_t size = map.m_llen; void *src; - /* For tail packing layout, the offset may be non-zero. */ - offset = erofs_blkoff(sb, map.m_pa); - blknr = erofs_blknr(sb, map.m_pa); - size = map.m_llen; - - src = erofs_read_metabuf(&buf, sb, blknr, EROFS_KMAP); + src = erofs_read_metabuf(&buf, sb, map.m_pa, EROFS_KMAP); if (IS_ERR(src)) return PTR_ERR(src); iov_iter_xarray(&iter, ITER_DEST, &mapping->i_pages, pos, PAGE_SIZE); - if (copy_to_iter(src + offset, size, &iter) != size) { + if (copy_to_iter(src, size, &iter) != size) { erofs_put_metabuf(&buf); return -EFAULT; } diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 0eb0e6f933..5f6439a63a 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -26,7 +26,7 @@ static void *erofs_read_inode(struct erofs_buf *buf, blkaddr = erofs_blknr(sb, inode_loc); *ofs = erofs_blkoff(sb, inode_loc); - kaddr = erofs_read_metabuf(buf, sb, blkaddr, EROFS_KMAP); + kaddr = erofs_read_metabuf(buf, sb, erofs_pos(sb, blkaddr), EROFS_KMAP); if (IS_ERR(kaddr)) { erofs_err(sb, "failed to get inode (nid: %llu) page, err %ld", vi->nid, PTR_ERR(kaddr)); @@ -66,7 +66,7 @@ static void *erofs_read_inode(struct erofs_buf *buf, goto err_out; } memcpy(copied, dic, gotten); - kaddr = erofs_read_metabuf(buf, sb, blkaddr + 1, + kaddr = erofs_read_metabuf(buf, sb, erofs_pos(sb, blkaddr + 1), EROFS_KMAP); if (IS_ERR(kaddr)) { erofs_err(sb, "failed to get inode payload block (nid: %llu), err %ld", diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index d28ccfc035..0c1b44ac95 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -64,15 +64,12 @@ enum { }; struct erofs_mount_opts { -#ifdef CONFIG_EROFS_FS_ZIP /* current strategy of how to use managed cache */ unsigned char cache_strategy; /* strategy of sync decompression (0 - auto, 1 - force on, 2 - force off) */ unsigned int sync_decompress; - /* threshold for decompression synchronously */ unsigned int max_sync_decompress_pages; -#endif unsigned int mount_opt; }; @@ -216,7 +213,7 @@ enum erofs_kmap_type { }; struct erofs_buf { - struct inode *inode; + struct address_space *mapping; struct page *page; void *base; enum erofs_kmap_type kmap_type; @@ -402,11 +399,11 @@ void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf, erofs_off_t *offset, int *lengthp); void erofs_unmap_metabuf(struct erofs_buf *buf); void erofs_put_metabuf(struct erofs_buf *buf); -void *erofs_bread(struct erofs_buf *buf, erofs_blk_t blkaddr, +void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset, enum erofs_kmap_type type); void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb); void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb, - erofs_blk_t blkaddr, enum erofs_kmap_type type); + erofs_off_t offset, enum erofs_kmap_type type); int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *dev); int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); @@ -438,7 +435,11 @@ void erofs_unregister_sysfs(struct super_block *sb); int __init erofs_init_sysfs(void); void erofs_exit_sysfs(void); -struct page *erofs_allocpage(struct page **pagepool, gfp_t gfp); +struct page *__erofs_allocpage(struct page **pagepool, gfp_t gfp, bool tryrsv); +static inline struct page *erofs_allocpage(struct page **pagepool, gfp_t gfp) +{ + return __erofs_allocpage(pagepool, gfp, false); +} static inline void erofs_pagepool_add(struct page **pagepool, struct page *page) { set_page_private(page, (unsigned long)*pagepool); @@ -463,11 +464,11 @@ int erofs_try_to_free_all_cached_folios(struct erofs_sb_info *sbi, struct erofs_workgroup *egrp); int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map, int flags); -void *erofs_get_pcpubuf(unsigned int requiredpages); -void erofs_put_pcpubuf(void *ptr); -int erofs_pcpubuf_growsize(unsigned int nrpages); -void __init erofs_pcpubuf_init(void); -void erofs_pcpubuf_exit(void); +void *z_erofs_get_gbuf(unsigned int requiredpages); +void z_erofs_put_gbuf(void *ptr); +int z_erofs_gbuf_growsize(unsigned int nrpages); +int __init z_erofs_gbuf_init(void); +void z_erofs_gbuf_exit(void); int erofs_init_managed_cache(struct super_block *sb); int z_erofs_parse_cfgs(struct super_block *sb, struct erofs_super_block *dsb); #else @@ -477,8 +478,8 @@ static inline int erofs_init_shrinker(void) { return 0; } static inline void erofs_exit_shrinker(void) {} static inline int z_erofs_init_zip_subsystem(void) { return 0; } static inline void z_erofs_exit_zip_subsystem(void) {} -static inline void erofs_pcpubuf_init(void) {} -static inline void erofs_pcpubuf_exit(void) {} +static inline int z_erofs_gbuf_init(void) { return 0; } +static inline void z_erofs_gbuf_exit(void) {} static inline int erofs_init_managed_cache(struct super_block *sb) { return 0; } #endif /* !CONFIG_EROFS_FS_ZIP */ @@ -498,6 +499,14 @@ static inline int z_erofs_deflate_init(void) { return 0; } static inline int z_erofs_deflate_exit(void) { return 0; } #endif /* !CONFIG_EROFS_FS_ZIP_DEFLATE */ +#ifdef CONFIG_EROFS_FS_ZIP_ZSTD +int __init z_erofs_zstd_init(void); +void z_erofs_zstd_exit(void); +#else +static inline int z_erofs_zstd_init(void) { return 0; } +static inline int z_erofs_zstd_exit(void) { return 0; } +#endif /* !CONFIG_EROFS_FS_ZIP_ZSTD */ + #ifdef CONFIG_EROFS_FS_ONDEMAND int erofs_fscache_register_fs(struct super_block *sb); void erofs_fscache_unregister_fs(struct super_block *sb); diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c index f0110a78ac..c94d0c1608 100644 --- a/fs/erofs/namei.c +++ b/fs/erofs/namei.c @@ -99,8 +99,8 @@ static void *erofs_find_target_block(struct erofs_buf *target, struct erofs_buf buf = __EROFS_BUF_INITIALIZER; struct erofs_dirent *de; - buf.inode = dir; - de = erofs_bread(&buf, mid, EROFS_KMAP); + buf.mapping = dir->i_mapping; + de = erofs_bread(&buf, erofs_pos(dir->i_sb, mid), EROFS_KMAP); if (!IS_ERR(de)) { const int nameoff = nameoff_from_disk(de->nameoff, bsz); const int ndirents = nameoff / sizeof(*de); @@ -171,7 +171,7 @@ int erofs_namei(struct inode *dir, const struct qstr *name, erofs_nid_t *nid, qn.name = name->name; qn.end = name->name + name->len; - buf.inode = dir; + buf.mapping = dir->i_mapping; ndirents = 0; de = erofs_find_target_block(&buf, dir, &qn, &ndirents); diff --git a/fs/erofs/pcpubuf.c b/fs/erofs/pcpubuf.c deleted file mode 100644 index c7a4b1d770..0000000000 --- a/fs/erofs/pcpubuf.c +++ /dev/null @@ -1,148 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) Gao Xiang - * - * For low-latency decompression algorithms (e.g. lz4), reserve consecutive - * per-CPU virtual memory (in pages) in advance to store such inplace I/O - * data if inplace decompression is failed (due to unmet inplace margin for - * example). - */ -#include "internal.h" - -struct erofs_pcpubuf { - raw_spinlock_t lock; - void *ptr; - struct page **pages; - unsigned int nrpages; -}; - -static DEFINE_PER_CPU(struct erofs_pcpubuf, erofs_pcb); - -void *erofs_get_pcpubuf(unsigned int requiredpages) - __acquires(pcb->lock) -{ - struct erofs_pcpubuf *pcb = &get_cpu_var(erofs_pcb); - - raw_spin_lock(&pcb->lock); - /* check if the per-CPU buffer is too small */ - if (requiredpages > pcb->nrpages) { - raw_spin_unlock(&pcb->lock); - put_cpu_var(erofs_pcb); - /* (for sparse checker) pretend pcb->lock is still taken */ - __acquire(pcb->lock); - return NULL; - } - return pcb->ptr; -} - -void erofs_put_pcpubuf(void *ptr) __releases(pcb->lock) -{ - struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, smp_processor_id()); - - DBG_BUGON(pcb->ptr != ptr); - raw_spin_unlock(&pcb->lock); - put_cpu_var(erofs_pcb); -} - -/* the next step: support per-CPU page buffers hotplug */ -int erofs_pcpubuf_growsize(unsigned int nrpages) -{ - static DEFINE_MUTEX(pcb_resize_mutex); - static unsigned int pcb_nrpages; - struct page *pagepool = NULL; - int delta, cpu, ret, i; - - mutex_lock(&pcb_resize_mutex); - delta = nrpages - pcb_nrpages; - ret = 0; - /* avoid shrinking pcpubuf, since no idea how many fses rely on */ - if (delta <= 0) - goto out; - - for_each_possible_cpu(cpu) { - struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu); - struct page **pages, **oldpages; - void *ptr, *old_ptr; - - pages = kmalloc_array(nrpages, sizeof(*pages), GFP_KERNEL); - if (!pages) { - ret = -ENOMEM; - break; - } - - for (i = 0; i < nrpages; ++i) { - pages[i] = erofs_allocpage(&pagepool, GFP_KERNEL); - if (!pages[i]) { - ret = -ENOMEM; - oldpages = pages; - goto free_pagearray; - } - } - ptr = vmap(pages, nrpages, VM_MAP, PAGE_KERNEL); - if (!ptr) { - ret = -ENOMEM; - oldpages = pages; - goto free_pagearray; - } - raw_spin_lock(&pcb->lock); - old_ptr = pcb->ptr; - pcb->ptr = ptr; - oldpages = pcb->pages; - pcb->pages = pages; - i = pcb->nrpages; - pcb->nrpages = nrpages; - raw_spin_unlock(&pcb->lock); - - if (!oldpages) { - DBG_BUGON(old_ptr); - continue; - } - - if (old_ptr) - vunmap(old_ptr); -free_pagearray: - while (i) - erofs_pagepool_add(&pagepool, oldpages[--i]); - kfree(oldpages); - if (ret) - break; - } - pcb_nrpages = nrpages; - erofs_release_pages(&pagepool); -out: - mutex_unlock(&pcb_resize_mutex); - return ret; -} - -void __init erofs_pcpubuf_init(void) -{ - int cpu; - - for_each_possible_cpu(cpu) { - struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu); - - raw_spin_lock_init(&pcb->lock); - } -} - -void erofs_pcpubuf_exit(void) -{ - int cpu, i; - - for_each_possible_cpu(cpu) { - struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu); - - if (pcb->ptr) { - vunmap(pcb->ptr); - pcb->ptr = NULL; - } - if (!pcb->pages) - continue; - - for (i = 0; i < pcb->nrpages; ++i) - if (pcb->pages[i]) - put_page(pcb->pages[i]); - kfree(pcb->pages); - pcb->pages = NULL; - } -} diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 30b49b2eee..1b91d95130 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -132,11 +132,11 @@ void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf, int len, i, cnt; *offset = round_up(*offset, 4); - ptr = erofs_bread(buf, erofs_blknr(sb, *offset), EROFS_KMAP); + ptr = erofs_bread(buf, *offset, EROFS_KMAP); if (IS_ERR(ptr)) return ptr; - len = le16_to_cpu(*(__le16 *)&ptr[erofs_blkoff(sb, *offset)]); + len = le16_to_cpu(*(__le16 *)ptr); if (!len) len = U16_MAX + 1; buffer = kmalloc(len, GFP_KERNEL); @@ -148,12 +148,12 @@ void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf, for (i = 0; i < len; i += cnt) { cnt = min_t(int, sb->s_blocksize - erofs_blkoff(sb, *offset), len - i); - ptr = erofs_bread(buf, erofs_blknr(sb, *offset), EROFS_KMAP); + ptr = erofs_bread(buf, *offset, EROFS_KMAP); if (IS_ERR(ptr)) { kfree(buffer); return ptr; } - memcpy(buffer + i, ptr + erofs_blkoff(sb, *offset), cnt); + memcpy(buffer + i, ptr, cnt); *offset += cnt; } return buffer; @@ -178,12 +178,10 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, struct erofs_fscache *fscache; struct erofs_deviceslot *dis; struct file *bdev_file; - void *ptr; - ptr = erofs_read_metabuf(buf, sb, erofs_blknr(sb, *pos), EROFS_KMAP); - if (IS_ERR(ptr)) - return PTR_ERR(ptr); - dis = ptr + erofs_blkoff(sb, *pos); + dis = erofs_read_metabuf(buf, sb, *pos, EROFS_KMAP); + if (IS_ERR(dis)) + return PTR_ERR(dis); if (!sbi->devs->flatdev && !dif->path) { if (!dis->tag[0]) { @@ -345,7 +343,7 @@ static int erofs_read_superblock(struct super_block *sb) sbi->build_time = le64_to_cpu(dsb->build_time); sbi->build_time_nsec = le32_to_cpu(dsb->build_time_nsec); - memcpy(&sb->s_uuid, dsb->uuid, sizeof(dsb->uuid)); + super_set_uuid(sb, (void *)dsb->uuid, sizeof(dsb->uuid)); ret = strscpy(sbi->volume_name, dsb->volume_name, sizeof(dsb->volume_name)); @@ -859,7 +857,14 @@ static int __init erofs_module_init(void) if (err) goto deflate_err; - erofs_pcpubuf_init(); + err = z_erofs_zstd_init(); + if (err) + goto zstd_err; + + err = z_erofs_gbuf_init(); + if (err) + goto gbuf_err; + err = z_erofs_init_zip_subsystem(); if (err) goto zip_err; @@ -879,6 +884,10 @@ fs_err: sysfs_err: z_erofs_exit_zip_subsystem(); zip_err: + z_erofs_gbuf_exit(); +gbuf_err: + z_erofs_zstd_exit(); +zstd_err: z_erofs_deflate_exit(); deflate_err: z_erofs_lzma_exit(); @@ -898,33 +907,32 @@ static void __exit erofs_module_exit(void) erofs_exit_sysfs(); z_erofs_exit_zip_subsystem(); + z_erofs_zstd_exit(); z_erofs_deflate_exit(); z_erofs_lzma_exit(); erofs_exit_shrinker(); kmem_cache_destroy(erofs_inode_cachep); - erofs_pcpubuf_exit(); + z_erofs_gbuf_exit(); } static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct erofs_sb_info *sbi = EROFS_SB(sb); - u64 id = 0; - - if (!erofs_is_fscache_mode(sb)) - id = huge_encode_dev(sb->s_bdev->bd_dev); buf->f_type = sb->s_magic; buf->f_bsize = sb->s_blocksize; buf->f_blocks = sbi->total_blocks; buf->f_bfree = buf->f_bavail = 0; - buf->f_files = ULLONG_MAX; buf->f_ffree = ULLONG_MAX - sbi->inos; - buf->f_namelen = EROFS_NAME_LEN; - buf->f_fsid = u64_to_fsid(id); + if (uuid_is_null(&sb->s_uuid)) + buf->f_fsid = u64_to_fsid(erofs_is_fscache_mode(sb) ? 0 : + huge_encode_dev(sb->s_bdev->bd_dev)); + else + buf->f_fsid = uuid_to_fsid(sb->s_uuid.b); return 0; } @@ -933,26 +941,14 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root) struct erofs_sb_info *sbi = EROFS_SB(root->d_sb); struct erofs_mount_opts *opt = &sbi->opt; -#ifdef CONFIG_EROFS_FS_XATTR - if (test_opt(opt, XATTR_USER)) - seq_puts(seq, ",user_xattr"); - else - seq_puts(seq, ",nouser_xattr"); -#endif -#ifdef CONFIG_EROFS_FS_POSIX_ACL - if (test_opt(opt, POSIX_ACL)) - seq_puts(seq, ",acl"); - else - seq_puts(seq, ",noacl"); -#endif -#ifdef CONFIG_EROFS_FS_ZIP - if (opt->cache_strategy == EROFS_ZIP_CACHE_DISABLED) - seq_puts(seq, ",cache_strategy=disabled"); - else if (opt->cache_strategy == EROFS_ZIP_CACHE_READAHEAD) - seq_puts(seq, ",cache_strategy=readahead"); - else if (opt->cache_strategy == EROFS_ZIP_CACHE_READAROUND) - seq_puts(seq, ",cache_strategy=readaround"); -#endif + if (IS_ENABLED(CONFIG_EROFS_FS_XATTR)) + seq_puts(seq, test_opt(opt, XATTR_USER) ? + ",user_xattr" : ",nouser_xattr"); + if (IS_ENABLED(CONFIG_EROFS_FS_POSIX_ACL)) + seq_puts(seq, test_opt(opt, POSIX_ACL) ? ",acl" : ",noacl"); + if (IS_ENABLED(CONFIG_EROFS_FS_ZIP)) + seq_printf(seq, ",cache_strategy=%s", + erofs_param_cache_strategy[opt->cache_strategy].name); if (test_opt(opt, DAX_ALWAYS)) seq_puts(seq, ",dax=always"); if (test_opt(opt, DAX_NEVER)) diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c deleted file mode 100644 index 518bdd69c8..0000000000 --- a/fs/erofs/utils.c +++ /dev/null @@ -1,287 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2018 HUAWEI, Inc. - * https://www.huawei.com/ - */ -#include "internal.h" - -struct page *erofs_allocpage(struct page **pagepool, gfp_t gfp) -{ - struct page *page = *pagepool; - - if (page) { - DBG_BUGON(page_ref_count(page) != 1); - *pagepool = (struct page *)page_private(page); - } else { - page = alloc_page(gfp); - } - return page; -} - -void erofs_release_pages(struct page **pagepool) -{ - while (*pagepool) { - struct page *page = *pagepool; - - *pagepool = (struct page *)page_private(page); - put_page(page); - } -} - -#ifdef CONFIG_EROFS_FS_ZIP -/* global shrink count (for all mounted EROFS instances) */ -static atomic_long_t erofs_global_shrink_cnt; - -static bool erofs_workgroup_get(struct erofs_workgroup *grp) -{ - if (lockref_get_not_zero(&grp->lockref)) - return true; - - spin_lock(&grp->lockref.lock); - if (__lockref_is_dead(&grp->lockref)) { - spin_unlock(&grp->lockref.lock); - return false; - } - - if (!grp->lockref.count++) - atomic_long_dec(&erofs_global_shrink_cnt); - spin_unlock(&grp->lockref.lock); - return true; -} - -struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb, - pgoff_t index) -{ - struct erofs_sb_info *sbi = EROFS_SB(sb); - struct erofs_workgroup *grp; - -repeat: - rcu_read_lock(); - grp = xa_load(&sbi->managed_pslots, index); - if (grp) { - if (!erofs_workgroup_get(grp)) { - /* prefer to relax rcu read side */ - rcu_read_unlock(); - goto repeat; - } - - DBG_BUGON(index != grp->index); - } - rcu_read_unlock(); - return grp; -} - -struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb, - struct erofs_workgroup *grp) -{ - struct erofs_sb_info *const sbi = EROFS_SB(sb); - struct erofs_workgroup *pre; - - DBG_BUGON(grp->lockref.count < 1); -repeat: - xa_lock(&sbi->managed_pslots); - pre = __xa_cmpxchg(&sbi->managed_pslots, grp->index, - NULL, grp, GFP_KERNEL); - if (pre) { - if (xa_is_err(pre)) { - pre = ERR_PTR(xa_err(pre)); - } else if (!erofs_workgroup_get(pre)) { - /* try to legitimize the current in-tree one */ - xa_unlock(&sbi->managed_pslots); - cond_resched(); - goto repeat; - } - grp = pre; - } - xa_unlock(&sbi->managed_pslots); - return grp; -} - -static void __erofs_workgroup_free(struct erofs_workgroup *grp) -{ - atomic_long_dec(&erofs_global_shrink_cnt); - erofs_workgroup_free_rcu(grp); -} - -void erofs_workgroup_put(struct erofs_workgroup *grp) -{ - if (lockref_put_or_lock(&grp->lockref)) - return; - - DBG_BUGON(__lockref_is_dead(&grp->lockref)); - if (grp->lockref.count == 1) - atomic_long_inc(&erofs_global_shrink_cnt); - --grp->lockref.count; - spin_unlock(&grp->lockref.lock); -} - -static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi, - struct erofs_workgroup *grp) -{ - int free = false; - - spin_lock(&grp->lockref.lock); - if (grp->lockref.count) - goto out; - - /* - * Note that all cached pages should be detached before deleted from - * the XArray. Otherwise some cached pages could be still attached to - * the orphan old workgroup when the new one is available in the tree. - */ - if (erofs_try_to_free_all_cached_folios(sbi, grp)) - goto out; - - /* - * It's impossible to fail after the workgroup is freezed, - * however in order to avoid some race conditions, add a - * DBG_BUGON to observe this in advance. - */ - DBG_BUGON(__xa_erase(&sbi->managed_pslots, grp->index) != grp); - - lockref_mark_dead(&grp->lockref); - free = true; -out: - spin_unlock(&grp->lockref.lock); - if (free) - __erofs_workgroup_free(grp); - return free; -} - -static unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi, - unsigned long nr_shrink) -{ - struct erofs_workgroup *grp; - unsigned int freed = 0; - unsigned long index; - - xa_lock(&sbi->managed_pslots); - xa_for_each(&sbi->managed_pslots, index, grp) { - /* try to shrink each valid workgroup */ - if (!erofs_try_to_release_workgroup(sbi, grp)) - continue; - xa_unlock(&sbi->managed_pslots); - - ++freed; - if (!--nr_shrink) - return freed; - xa_lock(&sbi->managed_pslots); - } - xa_unlock(&sbi->managed_pslots); - return freed; -} - -/* protected by 'erofs_sb_list_lock' */ -static unsigned int shrinker_run_no; - -/* protects the mounted 'erofs_sb_list' */ -static DEFINE_SPINLOCK(erofs_sb_list_lock); -static LIST_HEAD(erofs_sb_list); - -void erofs_shrinker_register(struct super_block *sb) -{ - struct erofs_sb_info *sbi = EROFS_SB(sb); - - mutex_init(&sbi->umount_mutex); - - spin_lock(&erofs_sb_list_lock); - list_add(&sbi->list, &erofs_sb_list); - spin_unlock(&erofs_sb_list_lock); -} - -void erofs_shrinker_unregister(struct super_block *sb) -{ - struct erofs_sb_info *const sbi = EROFS_SB(sb); - - mutex_lock(&sbi->umount_mutex); - /* clean up all remaining workgroups in memory */ - erofs_shrink_workstation(sbi, ~0UL); - - spin_lock(&erofs_sb_list_lock); - list_del(&sbi->list); - spin_unlock(&erofs_sb_list_lock); - mutex_unlock(&sbi->umount_mutex); -} - -static unsigned long erofs_shrink_count(struct shrinker *shrink, - struct shrink_control *sc) -{ - return atomic_long_read(&erofs_global_shrink_cnt); -} - -static unsigned long erofs_shrink_scan(struct shrinker *shrink, - struct shrink_control *sc) -{ - struct erofs_sb_info *sbi; - struct list_head *p; - - unsigned long nr = sc->nr_to_scan; - unsigned int run_no; - unsigned long freed = 0; - - spin_lock(&erofs_sb_list_lock); - do { - run_no = ++shrinker_run_no; - } while (run_no == 0); - - /* Iterate over all mounted superblocks and try to shrink them */ - p = erofs_sb_list.next; - while (p != &erofs_sb_list) { - sbi = list_entry(p, struct erofs_sb_info, list); - - /* - * We move the ones we do to the end of the list, so we stop - * when we see one we have already done. - */ - if (sbi->shrinker_run_no == run_no) - break; - - if (!mutex_trylock(&sbi->umount_mutex)) { - p = p->next; - continue; - } - - spin_unlock(&erofs_sb_list_lock); - sbi->shrinker_run_no = run_no; - - freed += erofs_shrink_workstation(sbi, nr - freed); - - spin_lock(&erofs_sb_list_lock); - /* Get the next list element before we move this one */ - p = p->next; - - /* - * Move this one to the end of the list to provide some - * fairness. - */ - list_move_tail(&sbi->list, &erofs_sb_list); - mutex_unlock(&sbi->umount_mutex); - - if (freed >= nr) - break; - } - spin_unlock(&erofs_sb_list_lock); - return freed; -} - -static struct shrinker *erofs_shrinker_info; - -int __init erofs_init_shrinker(void) -{ - erofs_shrinker_info = shrinker_alloc(0, "erofs-shrinker"); - if (!erofs_shrinker_info) - return -ENOMEM; - - erofs_shrinker_info->count_objects = erofs_shrink_count; - erofs_shrinker_info->scan_objects = erofs_shrink_scan; - - shrinker_register(erofs_shrinker_info); - - return 0; -} - -void erofs_exit_shrinker(void) -{ - shrinker_free(erofs_shrinker_info); -} -#endif /* !CONFIG_EROFS_FS_ZIP */ diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index b58316b49a..a90d7d6497 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -81,13 +81,13 @@ static int erofs_init_inode_xattrs(struct inode *inode) it.pos = erofs_iloc(inode) + vi->inode_isize; /* read in shared xattr array (non-atomic, see kmalloc below) */ - it.kaddr = erofs_bread(&it.buf, erofs_blknr(sb, it.pos), EROFS_KMAP); + it.kaddr = erofs_bread(&it.buf, it.pos, EROFS_KMAP); if (IS_ERR(it.kaddr)) { ret = PTR_ERR(it.kaddr); goto out_unlock; } - ih = it.kaddr + erofs_blkoff(sb, it.pos); + ih = it.kaddr; vi->xattr_name_filter = le32_to_cpu(ih->h_name_filter); vi->xattr_shared_count = ih->h_shared_count; vi->xattr_shared_xattrs = kmalloc_array(vi->xattr_shared_count, @@ -102,16 +102,14 @@ static int erofs_init_inode_xattrs(struct inode *inode) it.pos += sizeof(struct erofs_xattr_ibody_header); for (i = 0; i < vi->xattr_shared_count; ++i) { - it.kaddr = erofs_bread(&it.buf, erofs_blknr(sb, it.pos), - EROFS_KMAP); + it.kaddr = erofs_bread(&it.buf, it.pos, EROFS_KMAP); if (IS_ERR(it.kaddr)) { kfree(vi->xattr_shared_xattrs); vi->xattr_shared_xattrs = NULL; ret = PTR_ERR(it.kaddr); goto out_unlock; } - vi->xattr_shared_xattrs[i] = le32_to_cpu(*(__le32 *) - (it.kaddr + erofs_blkoff(sb, it.pos))); + vi->xattr_shared_xattrs[i] = le32_to_cpu(*(__le32 *)it.kaddr); it.pos += sizeof(__le32); } erofs_put_metabuf(&it.buf); @@ -185,12 +183,11 @@ static int erofs_xattr_copy_to_buffer(struct erofs_xattr_iter *it, void *src; for (processed = 0; processed < len; processed += slice) { - it->kaddr = erofs_bread(&it->buf, erofs_blknr(sb, it->pos), - EROFS_KMAP); + it->kaddr = erofs_bread(&it->buf, it->pos, EROFS_KMAP); if (IS_ERR(it->kaddr)) return PTR_ERR(it->kaddr); - src = it->kaddr + erofs_blkoff(sb, it->pos); + src = it->kaddr; slice = min_t(unsigned int, sb->s_blocksize - erofs_blkoff(sb, it->pos), len - processed); memcpy(it->buffer + it->buffer_ofs, src, slice); @@ -208,8 +205,7 @@ static int erofs_listxattr_foreach(struct erofs_xattr_iter *it) int err; /* 1. handle xattr entry */ - entry = *(struct erofs_xattr_entry *) - (it->kaddr + erofs_blkoff(it->sb, it->pos)); + entry = *(struct erofs_xattr_entry *)it->kaddr; it->pos += sizeof(struct erofs_xattr_entry); base_index = entry.e_name_index; @@ -259,8 +255,7 @@ static int erofs_getxattr_foreach(struct erofs_xattr_iter *it) unsigned int slice, processed, value_sz; /* 1. handle xattr entry */ - entry = *(struct erofs_xattr_entry *) - (it->kaddr + erofs_blkoff(sb, it->pos)); + entry = *(struct erofs_xattr_entry *)it->kaddr; it->pos += sizeof(struct erofs_xattr_entry); value_sz = le16_to_cpu(entry.e_value_size); @@ -291,8 +286,7 @@ static int erofs_getxattr_foreach(struct erofs_xattr_iter *it) /* 2. handle xattr name */ for (processed = 0; processed < entry.e_name_len; processed += slice) { - it->kaddr = erofs_bread(&it->buf, erofs_blknr(sb, it->pos), - EROFS_KMAP); + it->kaddr = erofs_bread(&it->buf, it->pos, EROFS_KMAP); if (IS_ERR(it->kaddr)) return PTR_ERR(it->kaddr); @@ -300,7 +294,7 @@ static int erofs_getxattr_foreach(struct erofs_xattr_iter *it) sb->s_blocksize - erofs_blkoff(sb, it->pos), entry.e_name_len - processed); if (memcmp(it->name.name + it->infix_len + processed, - it->kaddr + erofs_blkoff(sb, it->pos), slice)) + it->kaddr, slice)) return -ENOATTR; it->pos += slice; } @@ -336,13 +330,11 @@ static int erofs_xattr_iter_inline(struct erofs_xattr_iter *it, it->pos = erofs_iloc(inode) + vi->inode_isize + xattr_header_sz; while (remaining) { - it->kaddr = erofs_bread(&it->buf, erofs_blknr(it->sb, it->pos), - EROFS_KMAP); + it->kaddr = erofs_bread(&it->buf, it->pos, EROFS_KMAP); if (IS_ERR(it->kaddr)) return PTR_ERR(it->kaddr); - entry_sz = erofs_xattr_entry_size(it->kaddr + - erofs_blkoff(it->sb, it->pos)); + entry_sz = erofs_xattr_entry_size(it->kaddr); /* xattr on-disk corruption: xattr entry beyond xattr_isize */ if (remaining < entry_sz) { DBG_BUGON(1); @@ -375,8 +367,7 @@ static int erofs_xattr_iter_shared(struct erofs_xattr_iter *it, for (i = 0; i < vi->xattr_shared_count; ++i) { it->pos = erofs_pos(sb, sbi->xattr_blkaddr) + vi->xattr_shared_xattrs[i] * sizeof(__le32); - it->kaddr = erofs_bread(&it->buf, erofs_blknr(sb, it->pos), - EROFS_KMAP); + it->kaddr = erofs_bread(&it->buf, it->pos, EROFS_KMAP); if (IS_ERR(it->kaddr)) return PTR_ERR(it->kaddr); @@ -492,7 +483,7 @@ int erofs_xattr_prefixes_init(struct super_block *sb) return -ENOMEM; if (sbi->packed_inode) - buf.inode = sbi->packed_inode; + buf.mapping = sbi->packed_inode->i_mapping; else erofs_init_metabuf(&buf, sb); diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 3216b920d3..d6fe002a4a 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -868,7 +868,7 @@ static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe) } else { void *mptr; - mptr = erofs_read_metabuf(&map->buf, sb, blknr, EROFS_NO_KMAP); + mptr = erofs_read_metabuf(&map->buf, sb, map->m_pa, EROFS_NO_KMAP); if (IS_ERR(mptr)) { ret = PTR_ERR(mptr); erofs_err(sb, "failed to get inline data %d", ret); @@ -936,16 +936,16 @@ static int z_erofs_read_fragment(struct super_block *sb, struct page *page, if (!packed_inode) return -EFSCORRUPTED; - buf.inode = packed_inode; + buf.mapping = packed_inode->i_mapping; for (; cur < end; cur += cnt, pos += cnt) { cnt = min_t(unsigned int, end - cur, sb->s_blocksize - erofs_blkoff(sb, pos)); - src = erofs_bread(&buf, erofs_blknr(sb, pos), EROFS_KMAP); + src = erofs_bread(&buf, pos, EROFS_KMAP); if (IS_ERR(src)) { erofs_put_metabuf(&buf); return PTR_ERR(src); } - memcpy_to_page(page, cur, src + erofs_blkoff(sb, pos), cnt); + memcpy_to_page(page, cur, src, cnt); } erofs_put_metabuf(&buf); return 0; diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 6bd435a565..74d3d7bffc 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -31,22 +31,20 @@ static int z_erofs_load_full_lcluster(struct z_erofs_maprecorder *m, vi->inode_isize + vi->xattr_isize) + lcn * sizeof(struct z_erofs_lcluster_index); struct z_erofs_lcluster_index *di; - unsigned int advise, type; + unsigned int advise; m->kaddr = erofs_read_metabuf(&m->map->buf, inode->i_sb, - erofs_blknr(inode->i_sb, pos), EROFS_KMAP); + pos, EROFS_KMAP); if (IS_ERR(m->kaddr)) return PTR_ERR(m->kaddr); m->nextpackoff = pos + sizeof(struct z_erofs_lcluster_index); m->lcn = lcn; - di = m->kaddr + erofs_blkoff(inode->i_sb, pos); + di = m->kaddr; advise = le16_to_cpu(di->di_advise); - type = (advise >> Z_EROFS_LI_LCLUSTER_TYPE_BIT) & - ((1 << Z_EROFS_LI_LCLUSTER_TYPE_BITS) - 1); - switch (type) { - case Z_EROFS_LCLUSTER_TYPE_NONHEAD: + m->type = advise & Z_EROFS_LI_LCLUSTER_TYPE_MASK; + if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) { m->clusterofs = 1 << vi->z_logical_clusterbits; m->delta[0] = le16_to_cpu(di->di_u.delta[0]); if (m->delta[0] & Z_EROFS_LI_D0_CBLKCNT) { @@ -60,24 +58,15 @@ static int z_erofs_load_full_lcluster(struct z_erofs_maprecorder *m, m->delta[0] = 1; } m->delta[1] = le16_to_cpu(di->di_u.delta[1]); - break; - case Z_EROFS_LCLUSTER_TYPE_PLAIN: - case Z_EROFS_LCLUSTER_TYPE_HEAD1: - case Z_EROFS_LCLUSTER_TYPE_HEAD2: - if (advise & Z_EROFS_LI_PARTIAL_REF) - m->partialref = true; + } else { + m->partialref = !!(advise & Z_EROFS_LI_PARTIAL_REF); m->clusterofs = le16_to_cpu(di->di_clusterofs); if (m->clusterofs >= 1 << vi->z_logical_clusterbits) { DBG_BUGON(1); return -EFSCORRUPTED; } m->pblk = le32_to_cpu(di->di_u.blkaddr); - break; - default: - DBG_BUGON(1); - return -EOPNOTSUPP; } - m->type = type; return 0; } @@ -120,7 +109,7 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, { struct erofs_inode *const vi = EROFS_I(m->inode); const unsigned int lclusterbits = vi->z_logical_clusterbits; - unsigned int vcnt, base, lo, lobits, encodebits, nblk, eofs; + unsigned int vcnt, lo, lobits, encodebits, nblk, bytes; int i; u8 *in, type; bool big_pcluster; @@ -138,11 +127,11 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, big_pcluster = vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1; lobits = max(lclusterbits, ilog2(Z_EROFS_LI_D0_CBLKCNT) + 1U); encodebits = ((vcnt << amortizedshift) - sizeof(__le32)) * 8 / vcnt; - eofs = erofs_blkoff(m->inode->i_sb, pos); - base = round_down(eofs, vcnt << amortizedshift); - in = m->kaddr + base; + bytes = pos & ((vcnt << amortizedshift) - 1); + + in = m->kaddr - bytes; - i = (eofs - base) >> amortizedshift; + i = bytes >> amortizedshift; lo = decode_compactedbits(lobits, in, encodebits * i, &type); m->type = type; @@ -267,7 +256,7 @@ static int z_erofs_load_compact_lcluster(struct z_erofs_maprecorder *m, out: pos += lcn * (1 << amortizedshift); m->kaddr = erofs_read_metabuf(&m->map->buf, inode->i_sb, - erofs_blknr(inode->i_sb, pos), EROFS_KMAP); + pos, EROFS_KMAP); if (IS_ERR(m->kaddr)) return PTR_ERR(m->kaddr); return unpack_compacted_index(m, amortizedshift, pos, lookahead); @@ -561,7 +550,8 @@ static int z_erofs_do_map_blocks(struct inode *inode, if ((flags & EROFS_GET_BLOCKS_FIEMAP) || ((flags & EROFS_GET_BLOCKS_READMORE) && (map->m_algorithmformat == Z_EROFS_COMPRESSION_LZMA || - map->m_algorithmformat == Z_EROFS_COMPRESSION_DEFLATE) && + map->m_algorithmformat == Z_EROFS_COMPRESSION_DEFLATE || + map->m_algorithmformat == Z_EROFS_COMPRESSION_ZSTD) && map->m_llen >= i_blocksize(inode))) { err = z_erofs_get_extent_decompressedlen(&m); if (!err) @@ -580,7 +570,6 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) int err, headnr; erofs_off_t pos; struct erofs_buf buf = __EROFS_BUF_INITIALIZER; - void *kaddr; struct z_erofs_map_header *h; if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) { @@ -600,13 +589,12 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) goto out_unlock; pos = ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, 8); - kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(sb, pos), EROFS_KMAP); - if (IS_ERR(kaddr)) { - err = PTR_ERR(kaddr); + h = erofs_read_metabuf(&buf, sb, pos, EROFS_KMAP); + if (IS_ERR(h)) { + err = PTR_ERR(h); goto out_unlock; } - h = kaddr + erofs_blkoff(sb, pos); /* * if the highest bit of the 8-byte map header is set, the whole file * is stored in the packed inode. The rest bits keeps z_fragmentoff. diff --git a/fs/erofs/zutil.c b/fs/erofs/zutil.c new file mode 100644 index 0000000000..9b53883e5c --- /dev/null +++ b/fs/erofs/zutil.c @@ -0,0 +1,456 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2018 HUAWEI, Inc. + * https://www.huawei.com/ + */ +#include "internal.h" + +struct z_erofs_gbuf { + spinlock_t lock; + void *ptr; + struct page **pages; + unsigned int nrpages; +}; + +static struct z_erofs_gbuf *z_erofs_gbufpool, *z_erofs_rsvbuf; +static unsigned int z_erofs_gbuf_count, z_erofs_gbuf_nrpages, + z_erofs_rsv_nrpages; + +module_param_named(global_buffers, z_erofs_gbuf_count, uint, 0444); +module_param_named(reserved_pages, z_erofs_rsv_nrpages, uint, 0444); + +static atomic_long_t erofs_global_shrink_cnt; /* for all mounted instances */ +/* protected by 'erofs_sb_list_lock' */ +static unsigned int shrinker_run_no; + +/* protects the mounted 'erofs_sb_list' */ +static DEFINE_SPINLOCK(erofs_sb_list_lock); +static LIST_HEAD(erofs_sb_list); +static struct shrinker *erofs_shrinker_info; + +static unsigned int z_erofs_gbuf_id(void) +{ + return raw_smp_processor_id() % z_erofs_gbuf_count; +} + +void *z_erofs_get_gbuf(unsigned int requiredpages) + __acquires(gbuf->lock) +{ + struct z_erofs_gbuf *gbuf; + + migrate_disable(); + gbuf = &z_erofs_gbufpool[z_erofs_gbuf_id()]; + spin_lock(&gbuf->lock); + /* check if the buffer is too small */ + if (requiredpages > gbuf->nrpages) { + spin_unlock(&gbuf->lock); + migrate_enable(); + /* (for sparse checker) pretend gbuf->lock is still taken */ + __acquire(gbuf->lock); + return NULL; + } + return gbuf->ptr; +} + +void z_erofs_put_gbuf(void *ptr) __releases(gbuf->lock) +{ + struct z_erofs_gbuf *gbuf; + + gbuf = &z_erofs_gbufpool[z_erofs_gbuf_id()]; + DBG_BUGON(gbuf->ptr != ptr); + spin_unlock(&gbuf->lock); + migrate_enable(); +} + +int z_erofs_gbuf_growsize(unsigned int nrpages) +{ + static DEFINE_MUTEX(gbuf_resize_mutex); + struct page **tmp_pages = NULL; + struct z_erofs_gbuf *gbuf; + void *ptr, *old_ptr; + int last, i, j; + + mutex_lock(&gbuf_resize_mutex); + /* avoid shrinking gbufs, since no idea how many fses rely on */ + if (nrpages <= z_erofs_gbuf_nrpages) { + mutex_unlock(&gbuf_resize_mutex); + return 0; + } + + for (i = 0; i < z_erofs_gbuf_count; ++i) { + gbuf = &z_erofs_gbufpool[i]; + tmp_pages = kcalloc(nrpages, sizeof(*tmp_pages), GFP_KERNEL); + if (!tmp_pages) + goto out; + + for (j = 0; j < gbuf->nrpages; ++j) + tmp_pages[j] = gbuf->pages[j]; + do { + last = j; + j = alloc_pages_bulk_array(GFP_KERNEL, nrpages, + tmp_pages); + if (last == j) + goto out; + } while (j != nrpages); + + ptr = vmap(tmp_pages, nrpages, VM_MAP, PAGE_KERNEL); + if (!ptr) + goto out; + + spin_lock(&gbuf->lock); + kfree(gbuf->pages); + gbuf->pages = tmp_pages; + old_ptr = gbuf->ptr; + gbuf->ptr = ptr; + gbuf->nrpages = nrpages; + spin_unlock(&gbuf->lock); + if (old_ptr) + vunmap(old_ptr); + } + z_erofs_gbuf_nrpages = nrpages; +out: + if (i < z_erofs_gbuf_count && tmp_pages) { + for (j = 0; j < nrpages; ++j) + if (tmp_pages[j] && tmp_pages[j] != gbuf->pages[j]) + __free_page(tmp_pages[j]); + kfree(tmp_pages); + } + mutex_unlock(&gbuf_resize_mutex); + return i < z_erofs_gbuf_count ? -ENOMEM : 0; +} + +int __init z_erofs_gbuf_init(void) +{ + unsigned int i, total = num_possible_cpus(); + + if (z_erofs_gbuf_count) + total = min(z_erofs_gbuf_count, total); + z_erofs_gbuf_count = total; + + /* The last (special) global buffer is the reserved buffer */ + total += !!z_erofs_rsv_nrpages; + + z_erofs_gbufpool = kcalloc(total, sizeof(*z_erofs_gbufpool), + GFP_KERNEL); + if (!z_erofs_gbufpool) + return -ENOMEM; + + if (z_erofs_rsv_nrpages) { + z_erofs_rsvbuf = &z_erofs_gbufpool[total - 1]; + z_erofs_rsvbuf->pages = kcalloc(z_erofs_rsv_nrpages, + sizeof(*z_erofs_rsvbuf->pages), GFP_KERNEL); + if (!z_erofs_rsvbuf->pages) { + z_erofs_rsvbuf = NULL; + z_erofs_rsv_nrpages = 0; + } + } + for (i = 0; i < total; ++i) + spin_lock_init(&z_erofs_gbufpool[i].lock); + return 0; +} + +void z_erofs_gbuf_exit(void) +{ + int i, j; + + for (i = 0; i < z_erofs_gbuf_count + (!!z_erofs_rsvbuf); ++i) { + struct z_erofs_gbuf *gbuf = &z_erofs_gbufpool[i]; + + if (gbuf->ptr) { + vunmap(gbuf->ptr); + gbuf->ptr = NULL; + } + + if (!gbuf->pages) + continue; + + for (j = 0; j < gbuf->nrpages; ++j) + if (gbuf->pages[j]) + put_page(gbuf->pages[j]); + kfree(gbuf->pages); + gbuf->pages = NULL; + } + kfree(z_erofs_gbufpool); +} + +struct page *__erofs_allocpage(struct page **pagepool, gfp_t gfp, bool tryrsv) +{ + struct page *page = *pagepool; + + if (page) { + *pagepool = (struct page *)page_private(page); + } else if (tryrsv && z_erofs_rsvbuf && z_erofs_rsvbuf->nrpages) { + spin_lock(&z_erofs_rsvbuf->lock); + if (z_erofs_rsvbuf->nrpages) + page = z_erofs_rsvbuf->pages[--z_erofs_rsvbuf->nrpages]; + spin_unlock(&z_erofs_rsvbuf->lock); + } + if (!page) + page = alloc_page(gfp); + DBG_BUGON(page && page_ref_count(page) != 1); + return page; +} + +void erofs_release_pages(struct page **pagepool) +{ + while (*pagepool) { + struct page *page = *pagepool; + + *pagepool = (struct page *)page_private(page); + /* try to fill reserved global pool first */ + if (z_erofs_rsvbuf && z_erofs_rsvbuf->nrpages < + z_erofs_rsv_nrpages) { + spin_lock(&z_erofs_rsvbuf->lock); + if (z_erofs_rsvbuf->nrpages < z_erofs_rsv_nrpages) { + z_erofs_rsvbuf->pages[z_erofs_rsvbuf->nrpages++] + = page; + spin_unlock(&z_erofs_rsvbuf->lock); + continue; + } + spin_unlock(&z_erofs_rsvbuf->lock); + } + put_page(page); + } +} + +static bool erofs_workgroup_get(struct erofs_workgroup *grp) +{ + if (lockref_get_not_zero(&grp->lockref)) + return true; + + spin_lock(&grp->lockref.lock); + if (__lockref_is_dead(&grp->lockref)) { + spin_unlock(&grp->lockref.lock); + return false; + } + + if (!grp->lockref.count++) + atomic_long_dec(&erofs_global_shrink_cnt); + spin_unlock(&grp->lockref.lock); + return true; +} + +struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb, + pgoff_t index) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); + struct erofs_workgroup *grp; + +repeat: + rcu_read_lock(); + grp = xa_load(&sbi->managed_pslots, index); + if (grp) { + if (!erofs_workgroup_get(grp)) { + /* prefer to relax rcu read side */ + rcu_read_unlock(); + goto repeat; + } + + DBG_BUGON(index != grp->index); + } + rcu_read_unlock(); + return grp; +} + +struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb, + struct erofs_workgroup *grp) +{ + struct erofs_sb_info *const sbi = EROFS_SB(sb); + struct erofs_workgroup *pre; + + DBG_BUGON(grp->lockref.count < 1); +repeat: + xa_lock(&sbi->managed_pslots); + pre = __xa_cmpxchg(&sbi->managed_pslots, grp->index, + NULL, grp, GFP_KERNEL); + if (pre) { + if (xa_is_err(pre)) { + pre = ERR_PTR(xa_err(pre)); + } else if (!erofs_workgroup_get(pre)) { + /* try to legitimize the current in-tree one */ + xa_unlock(&sbi->managed_pslots); + cond_resched(); + goto repeat; + } + grp = pre; + } + xa_unlock(&sbi->managed_pslots); + return grp; +} + +static void __erofs_workgroup_free(struct erofs_workgroup *grp) +{ + atomic_long_dec(&erofs_global_shrink_cnt); + erofs_workgroup_free_rcu(grp); +} + +void erofs_workgroup_put(struct erofs_workgroup *grp) +{ + if (lockref_put_or_lock(&grp->lockref)) + return; + + DBG_BUGON(__lockref_is_dead(&grp->lockref)); + if (grp->lockref.count == 1) + atomic_long_inc(&erofs_global_shrink_cnt); + --grp->lockref.count; + spin_unlock(&grp->lockref.lock); +} + +static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi, + struct erofs_workgroup *grp) +{ + int free = false; + + spin_lock(&grp->lockref.lock); + if (grp->lockref.count) + goto out; + + /* + * Note that all cached pages should be detached before deleted from + * the XArray. Otherwise some cached pages could be still attached to + * the orphan old workgroup when the new one is available in the tree. + */ + if (erofs_try_to_free_all_cached_folios(sbi, grp)) + goto out; + + /* + * It's impossible to fail after the workgroup is freezed, + * however in order to avoid some race conditions, add a + * DBG_BUGON to observe this in advance. + */ + DBG_BUGON(__xa_erase(&sbi->managed_pslots, grp->index) != grp); + + lockref_mark_dead(&grp->lockref); + free = true; +out: + spin_unlock(&grp->lockref.lock); + if (free) + __erofs_workgroup_free(grp); + return free; +} + +static unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi, + unsigned long nr_shrink) +{ + struct erofs_workgroup *grp; + unsigned int freed = 0; + unsigned long index; + + xa_lock(&sbi->managed_pslots); + xa_for_each(&sbi->managed_pslots, index, grp) { + /* try to shrink each valid workgroup */ + if (!erofs_try_to_release_workgroup(sbi, grp)) + continue; + xa_unlock(&sbi->managed_pslots); + + ++freed; + if (!--nr_shrink) + return freed; + xa_lock(&sbi->managed_pslots); + } + xa_unlock(&sbi->managed_pslots); + return freed; +} + +void erofs_shrinker_register(struct super_block *sb) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); + + mutex_init(&sbi->umount_mutex); + + spin_lock(&erofs_sb_list_lock); + list_add(&sbi->list, &erofs_sb_list); + spin_unlock(&erofs_sb_list_lock); +} + +void erofs_shrinker_unregister(struct super_block *sb) +{ + struct erofs_sb_info *const sbi = EROFS_SB(sb); + + mutex_lock(&sbi->umount_mutex); + /* clean up all remaining workgroups in memory */ + erofs_shrink_workstation(sbi, ~0UL); + + spin_lock(&erofs_sb_list_lock); + list_del(&sbi->list); + spin_unlock(&erofs_sb_list_lock); + mutex_unlock(&sbi->umount_mutex); +} + +static unsigned long erofs_shrink_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + return atomic_long_read(&erofs_global_shrink_cnt); +} + +static unsigned long erofs_shrink_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct erofs_sb_info *sbi; + struct list_head *p; + + unsigned long nr = sc->nr_to_scan; + unsigned int run_no; + unsigned long freed = 0; + + spin_lock(&erofs_sb_list_lock); + do { + run_no = ++shrinker_run_no; + } while (run_no == 0); + + /* Iterate over all mounted superblocks and try to shrink them */ + p = erofs_sb_list.next; + while (p != &erofs_sb_list) { + sbi = list_entry(p, struct erofs_sb_info, list); + + /* + * We move the ones we do to the end of the list, so we stop + * when we see one we have already done. + */ + if (sbi->shrinker_run_no == run_no) + break; + + if (!mutex_trylock(&sbi->umount_mutex)) { + p = p->next; + continue; + } + + spin_unlock(&erofs_sb_list_lock); + sbi->shrinker_run_no = run_no; + + freed += erofs_shrink_workstation(sbi, nr - freed); + + spin_lock(&erofs_sb_list_lock); + /* Get the next list element before we move this one */ + p = p->next; + + /* + * Move this one to the end of the list to provide some + * fairness. + */ + list_move_tail(&sbi->list, &erofs_sb_list); + mutex_unlock(&sbi->umount_mutex); + + if (freed >= nr) + break; + } + spin_unlock(&erofs_sb_list_lock); + return freed; +} + +int __init erofs_init_shrinker(void) +{ + erofs_shrinker_info = shrinker_alloc(0, "erofs-shrinker"); + if (!erofs_shrinker_info) + return -ENOMEM; + + erofs_shrinker_info->count_objects = erofs_shrink_count; + erofs_shrinker_info->scan_objects = erofs_shrink_scan; + shrinker_register(erofs_shrinker_info); + return 0; +} + +void erofs_exit_shrinker(void) +{ + shrinker_free(erofs_shrinker_info); +} -- cgit v1.2.3