author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-11 08:27:49 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-11 08:27:49 +0000
commit     ace9429bb58fd418f0c81d4c2835699bddf6bde6 (patch)
tree       b2d64bc10158fdd5497876388cd68142ca374ed3 /fs/erofs
parent     Initial commit. (diff)
Adding upstream version 6.6.15. (upstream/6.6.15)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'fs/erofs')
-rw-r--r--   fs/erofs/Kconfig                   145
-rw-r--r--   fs/erofs/Makefile                    9
-rw-r--r--   fs/erofs/compress.h                105
-rw-r--r--   fs/erofs/data.c                    452
-rw-r--r--   fs/erofs/decompressor.c            445
-rw-r--r--   fs/erofs/decompressor_deflate.c    248
-rw-r--r--   fs/erofs/decompressor_lzma.c       294
-rw-r--r--   fs/erofs/dir.c                     111
-rw-r--r--   fs/erofs/erofs_fs.h                461
-rw-r--r--   fs/erofs/fscache.c                 612
-rw-r--r--   fs/erofs/inode.c                   395
-rw-r--r--   fs/erofs/internal.h                537
-rw-r--r--   fs/erofs/namei.c                   224
-rw-r--r--   fs/erofs/pcpubuf.c                 148
-rw-r--r--   fs/erofs/super.c                  1021
-rw-r--r--   fs/erofs/sysfs.c                   277
-rw-r--r--   fs/erofs/utils.c                   282
-rw-r--r--   fs/erofs/xattr.c                   560
-rw-r--r--   fs/erofs/xattr.h                    73
-rw-r--r--   fs/erofs/zdata.c                  1894
-rw-r--r--   fs/erofs/zmap.c                    774
21 files changed, 9067 insertions, 0 deletions
diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
new file mode 100644
index 000000000..f6dc961e6
--- /dev/null
+++ b/fs/erofs/Kconfig
@@ -0,0 +1,145 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+config EROFS_FS
+ tristate "EROFS filesystem support"
+ depends on BLOCK
+ select FS_IOMAP
+ select LIBCRC32C
+ help
+ EROFS (Enhanced Read-Only File System) is a lightweight read-only
+ file system with modern designs (e.g. no buffer heads, inline
+ xattrs/data, chunk-based deduplication, multiple devices, etc.) for
+ scenarios which need high-performance read-only solutions, e.g.
+ smartphones with Android OS, LiveCDs and high-density hosts with
+ numerous containers.
+
+ It also provides fixed-sized output compression support to improve
+ storage density while keeping relatively high compression ratios, and
+ implements in-place decompression to temporarily reuse the file page
+ for compressed data with proper strategies, which is quite useful for
+ guaranteed end-to-end runtime decompression performance under extreme
+ memory pressure without extra cost.
+
+ See the documentation at <file:Documentation/filesystems/erofs.rst>
+ for more details.
+
+ If unsure, say N.
+
+config EROFS_FS_DEBUG
+ bool "EROFS debugging feature"
+ depends on EROFS_FS
+ help
+ Print debugging messages and enable more BUG_ONs which check
+ filesystem consistency and find potential issues aggressively,
+ which can be useful for Android eng builds, for example.
+
+ For daily use, say N.
+
+config EROFS_FS_XATTR
+ bool "EROFS extended attributes"
+ depends on EROFS_FS
+ select XXHASH
+ default y
+ help
+ Extended attributes are name:value pairs associated with inodes by
+ the kernel or by users (see the attr(5) manual page, or visit
+ <http://acl.bestbits.at/> for details).
+
+ If unsure, say N.
+
+config EROFS_FS_POSIX_ACL
+ bool "EROFS Access Control Lists"
+ depends on EROFS_FS_XATTR
+ select FS_POSIX_ACL
+ default y
+ help
+ Posix Access Control Lists (ACLs) support permissions for users and
+ groups beyond the owner/group/world scheme.
+
+ To learn more about Access Control Lists, visit the POSIX ACLs for
+ Linux website <http://acl.bestbits.at/>.
+
+ If you don't know what Access Control Lists are, say N.
+
+config EROFS_FS_SECURITY
+ bool "EROFS Security Labels"
+ depends on EROFS_FS_XATTR
+ default y
+ help
+ Security labels provide an access control facility used by Linux
+ Security Modules (LSMs) such as AppArmor, SELinux, Smack and TOMOYO
+ Linux. This option enables an extended attribute handler for file
+ security labels in the erofs filesystem; it therefore requires
+ extended attribute support to be enabled in advance.
+
+ If you are not using a security module, say N.
+
+config EROFS_FS_ZIP
+ bool "EROFS Data Compression Support"
+ depends on EROFS_FS
+ select LZ4_DECOMPRESS
+ default y
+ help
+ Enable fixed-sized output compression for EROFS.
+
+ If you don't want to enable compression feature, say N.
+
+config EROFS_FS_ZIP_LZMA
+ bool "EROFS LZMA compressed data support"
+ depends on EROFS_FS_ZIP
+ select XZ_DEC
+ select XZ_DEC_MICROLZMA
+ help
+ Saying Y here includes support for reading EROFS file systems
+ containing LZMA compressed data, specifically called microLZMA. It
+ gives better compression ratios than the LZ4 algorithm, at the
+ expense of more CPU overhead.
+
+ LZMA support is an experimental feature for now and so most file
+ systems will be readable without selecting this option.
+
+ If unsure, say N.
+
+config EROFS_FS_ZIP_DEFLATE
+ bool "EROFS DEFLATE compressed data support"
+ depends on EROFS_FS_ZIP
+ select ZLIB_INFLATE
+ help
+ Saying Y here includes support for reading EROFS file systems
+ containing DEFLATE compressed data. It gives better compression
+ ratios than the default LZ4 format, at the cost of more CPU
+ overhead.
+
+ DEFLATE support is an experimental feature for now and so most
+ file systems will be readable without selecting this option.
+
+ If unsure, say N.
+
+config EROFS_FS_ONDEMAND
+ bool "EROFS fscache-based on-demand read support"
+ depends on CACHEFILES_ONDEMAND && (EROFS_FS=m && FSCACHE || EROFS_FS=y && FSCACHE=y)
+ default n
+ help
+ This permits EROFS to use fscache-backed data blobs with on-demand
+ read support.
+
+ If unsure, say N.
+
+config EROFS_FS_PCPU_KTHREAD
+ bool "EROFS per-cpu decompression kthread workers"
+ depends on EROFS_FS_ZIP
+ help
+ Saying Y here enables a per-CPU kthread worker pool to carry out
+ async decompression for low latencies on some architectures.
+
+ If unsure, say N.
+
+config EROFS_FS_PCPU_KTHREAD_HIPRI
+ bool "EROFS high priority per-CPU kthread workers"
+ depends on EROFS_FS_ZIP && EROFS_FS_PCPU_KTHREAD
+ default y
+ help
+ This permits EROFS to configure per-CPU kthread workers to run
+ at higher priority.
+
+ If unsure, say N.
diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
new file mode 100644
index 000000000..994d0b9de
--- /dev/null
+++ b/fs/erofs/Makefile
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+obj-$(CONFIG_EROFS_FS) += erofs.o
+erofs-objs := super.o inode.o data.o namei.o dir.o utils.o sysfs.o
+erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
+erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o pcpubuf.o
+erofs-$(CONFIG_EROFS_FS_ZIP_LZMA) += decompressor_lzma.o
+erofs-$(CONFIG_EROFS_FS_ZIP_DEFLATE) += decompressor_deflate.o
+erofs-$(CONFIG_EROFS_FS_ONDEMAND) += fscache.o
diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h
new file mode 100644
index 000000000..279933e00
--- /dev/null
+++ b/fs/erofs/compress.h
@@ -0,0 +1,105 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2019 HUAWEI, Inc.
+ * https://www.huawei.com/
+ */
+#ifndef __EROFS_FS_COMPRESS_H
+#define __EROFS_FS_COMPRESS_H
+
+#include "internal.h"
+
+struct z_erofs_decompress_req {
+ struct super_block *sb;
+ struct page **in, **out;
+
+ unsigned short pageofs_in, pageofs_out;
+ unsigned int inputsize, outputsize;
+
+ /* indicate the algorithm to be used for decompression */
+ unsigned int alg;
+ bool inplace_io, partial_decoding, fillgaps;
+};
+
+struct z_erofs_decompressor {
+ int (*config)(struct super_block *sb, struct erofs_super_block *dsb,
+ void *data, int size);
+ int (*decompress)(struct z_erofs_decompress_req *rq,
+ struct page **pagepool);
+ char *name;
+};
+
+/* some special page->private values (unsigned long, see below) */
+#define Z_EROFS_SHORTLIVED_PAGE (-1UL << 2)
+#define Z_EROFS_PREALLOCATED_PAGE (-2UL << 2)
+
+/*
+ * For all pages in a pcluster, page->private should be one of
+ * Type                          Last 2bits      page->private
+ * short-lived page              00              Z_EROFS_SHORTLIVED_PAGE
+ * preallocated page (tryalloc)  00              Z_EROFS_PREALLOCATED_PAGE
+ * cached/managed page           00              pointer to z_erofs_pcluster
+ * online page (file-backed,     01/10/11        sub-index << 2 | count
+ *              some pages can be used for inplace I/O)
+ *
+ * page->mapping should be one of
+ * Type                  page->mapping
+ * short-lived page      NULL
+ * preallocated page     NULL
+ * cached/managed page   non-NULL or NULL (invalidated/truncated page)
+ * online page           non-NULL
+ *
+ * For all managed pages, PG_private should be set with 1 extra refcount,
+ * which is used for page reclaim / migration.
+ */
+
+/*
+ * short-lived pages are pages directly from buddy system with specific
+ * page->private (no need to set PagePrivate since these are non-LRU /
+ * non-movable pages and bypass reclaim / migration code).
+ */
+static inline bool z_erofs_is_shortlived_page(struct page *page)
+{
+ if (page->private != Z_EROFS_SHORTLIVED_PAGE)
+ return false;
+
+ DBG_BUGON(page->mapping);
+ return true;
+}
+
+static inline bool z_erofs_put_shortlivedpage(struct page **pagepool,
+ struct page *page)
+{
+ if (!z_erofs_is_shortlived_page(page))
+ return false;
+
+ /* short-lived pages should not be used by others at the same time */
+ if (page_ref_count(page) > 1) {
+ put_page(page);
+ } else {
+ /* follow the pcluster rule above. */
+ erofs_pagepool_add(pagepool, page);
+ }
+ return true;
+}
+
+#define MNGD_MAPPING(sbi) ((sbi)->managed_cache->i_mapping)
+static inline bool erofs_page_is_managed(const struct erofs_sb_info *sbi,
+ struct page *page)
+{
+ return page->mapping == MNGD_MAPPING(sbi);
+}
+
+int z_erofs_fixup_insize(struct z_erofs_decompress_req *rq, const char *padbuf,
+ unsigned int padbufsize);
+extern const struct z_erofs_decompressor erofs_decompressors[];
+
+/* prototypes for specific algorithms */
+int z_erofs_load_lzma_config(struct super_block *sb,
+ struct erofs_super_block *dsb, void *data, int size);
+int z_erofs_load_deflate_config(struct super_block *sb,
+ struct erofs_super_block *dsb, void *data, int size);
+int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq,
+ struct page **pagepool);
+int z_erofs_deflate_decompress(struct z_erofs_decompress_req *rq,
+ struct page **pagepool);
+#endif
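To make the page->private convention documented in compress.h above concrete, here is a minimal userspace sketch (not part of the patch; the classify() helper and all sample values are invented) showing how the low two bits separate the page classes:

#include <stdio.h>

#define SHORTLIVED_TAG		(-1UL << 2)	/* Z_EROFS_SHORTLIVED_PAGE */
#define PREALLOCATED_TAG	(-2UL << 2)	/* Z_EROFS_PREALLOCATED_PAGE */

static const char *classify(unsigned long priv)
{
	if (priv == SHORTLIVED_TAG)
		return "short-lived page";
	if (priv == PREALLOCATED_TAG)
		return "preallocated page";
	if (!(priv & 3))			/* pointers are 4-byte aligned */
		return "cached/managed page (pcluster pointer)";
	return "online page (sub-index << 2 | count)";
}

int main(void)
{
	unsigned long samples[] = {
		SHORTLIVED_TAG, PREALLOCATED_TAG,
		0xffff800012345600UL,		/* made-up pcluster address */
		(5UL << 2) | 1,			/* sub-index 5, count 1 */
	};

	for (int i = 0; i < 4; i++)
		printf("%#lx -> %s\n", samples[i], classify(samples[i]));
	return 0;
}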
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
new file mode 100644
index 000000000..0c2c99c58
--- /dev/null
+++ b/fs/erofs/data.c
@@ -0,0 +1,452 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2017-2018 HUAWEI, Inc.
+ * https://www.huawei.com/
+ * Copyright (C) 2021, Alibaba Cloud
+ */
+#include "internal.h"
+#include <linux/prefetch.h>
+#include <linux/sched/mm.h>
+#include <linux/dax.h>
+#include <trace/events/erofs.h>
+
+void erofs_unmap_metabuf(struct erofs_buf *buf)
+{
+ if (buf->kmap_type == EROFS_KMAP)
+ kunmap_local(buf->base);
+ buf->base = NULL;
+ buf->kmap_type = EROFS_NO_KMAP;
+}
+
+void erofs_put_metabuf(struct erofs_buf *buf)
+{
+ if (!buf->page)
+ return;
+ erofs_unmap_metabuf(buf);
+ put_page(buf->page);
+ buf->page = NULL;
+}
+
+/*
+ * Derive the block size from inode->i_blkbits to stay compatible with
+ * the anonymous inode used in fscache mode.
+ */
+void *erofs_bread(struct erofs_buf *buf, erofs_blk_t blkaddr,
+ enum erofs_kmap_type type)
+{
+ struct inode *inode = buf->inode;
+ erofs_off_t offset = (erofs_off_t)blkaddr << inode->i_blkbits;
+ pgoff_t index = offset >> PAGE_SHIFT;
+ struct page *page = buf->page;
+ struct folio *folio;
+ unsigned int nofs_flag;
+
+ if (!page || page->index != index) {
+ erofs_put_metabuf(buf);
+
+ nofs_flag = memalloc_nofs_save();
+ folio = read_cache_folio(inode->i_mapping, index, NULL, NULL);
+ memalloc_nofs_restore(nofs_flag);
+ if (IS_ERR(folio))
+ return folio;
+
+ /* should already be PageUptodate, no need to lock page */
+ page = folio_file_page(folio, index);
+ buf->page = page;
+ }
+ if (buf->kmap_type == EROFS_NO_KMAP) {
+ if (type == EROFS_KMAP)
+ buf->base = kmap_local_page(page);
+ buf->kmap_type = type;
+ } else if (buf->kmap_type != type) {
+ DBG_BUGON(1);
+ return ERR_PTR(-EFAULT);
+ }
+ if (type == EROFS_NO_KMAP)
+ return NULL;
+ return buf->base + (offset & ~PAGE_MASK);
+}
+
+void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb)
+{
+ if (erofs_is_fscache_mode(sb))
+ buf->inode = EROFS_SB(sb)->s_fscache->inode;
+ else
+ buf->inode = sb->s_bdev->bd_inode;
+}
+
+void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb,
+ erofs_blk_t blkaddr, enum erofs_kmap_type type)
+{
+ erofs_init_metabuf(buf, sb);
+ return erofs_bread(buf, blkaddr, type);
+}
+
+static int erofs_map_blocks_flatmode(struct inode *inode,
+ struct erofs_map_blocks *map)
+{
+ erofs_blk_t nblocks, lastblk;
+ u64 offset = map->m_la;
+ struct erofs_inode *vi = EROFS_I(inode);
+ struct super_block *sb = inode->i_sb;
+ bool tailendpacking = (vi->datalayout == EROFS_INODE_FLAT_INLINE);
+
+ nblocks = erofs_iblks(inode);
+ lastblk = nblocks - tailendpacking;
+
+ /* there is no hole in flatmode */
+ map->m_flags = EROFS_MAP_MAPPED;
+ if (offset < erofs_pos(sb, lastblk)) {
+ map->m_pa = erofs_pos(sb, vi->raw_blkaddr) + map->m_la;
+ map->m_plen = erofs_pos(sb, lastblk) - offset;
+ } else if (tailendpacking) {
+ map->m_pa = erofs_iloc(inode) + vi->inode_isize +
+ vi->xattr_isize + erofs_blkoff(sb, offset);
+ map->m_plen = inode->i_size - offset;
+
+ /* inline data should be located in the same meta block */
+ if (erofs_blkoff(sb, map->m_pa) + map->m_plen > sb->s_blocksize) {
+ erofs_err(sb, "inline data cross block boundary @ nid %llu",
+ vi->nid);
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
+ }
+ map->m_flags |= EROFS_MAP_META;
+ } else {
+ erofs_err(sb, "internal error @ nid: %llu (size %llu), m_la 0x%llx",
+ vi->nid, inode->i_size, map->m_la);
+ DBG_BUGON(1);
+ return -EIO;
+ }
+ return 0;
+}
+
+int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map)
+{
+ struct super_block *sb = inode->i_sb;
+ struct erofs_inode *vi = EROFS_I(inode);
+ struct erofs_inode_chunk_index *idx;
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
+ u64 chunknr;
+ unsigned int unit;
+ erofs_off_t pos;
+ void *kaddr;
+ int err = 0;
+
+ trace_erofs_map_blocks_enter(inode, map, 0);
+ map->m_deviceid = 0;
+ if (map->m_la >= inode->i_size) {
+ /* leave out-of-bound access unmapped */
+ map->m_flags = 0;
+ map->m_plen = 0;
+ goto out;
+ }
+
+ if (vi->datalayout != EROFS_INODE_CHUNK_BASED) {
+ err = erofs_map_blocks_flatmode(inode, map);
+ goto out;
+ }
+
+ if (vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES)
+ unit = sizeof(*idx); /* chunk index */
+ else
+ unit = EROFS_BLOCK_MAP_ENTRY_SIZE; /* block map */
+
+ chunknr = map->m_la >> vi->chunkbits;
+ pos = ALIGN(erofs_iloc(inode) + vi->inode_isize +
+ vi->xattr_isize, unit) + unit * chunknr;
+
+ kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(sb, pos), EROFS_KMAP);
+ if (IS_ERR(kaddr)) {
+ err = PTR_ERR(kaddr);
+ goto out;
+ }
+ map->m_la = chunknr << vi->chunkbits;
+ map->m_plen = min_t(erofs_off_t, 1UL << vi->chunkbits,
+ round_up(inode->i_size - map->m_la, sb->s_blocksize));
+
+ /* handle block map */
+ if (!(vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES)) {
+ __le32 *blkaddr = kaddr + erofs_blkoff(sb, pos);
+
+ if (le32_to_cpu(*blkaddr) == EROFS_NULL_ADDR) {
+ map->m_flags = 0;
+ } else {
+ map->m_pa = erofs_pos(sb, le32_to_cpu(*blkaddr));
+ map->m_flags = EROFS_MAP_MAPPED;
+ }
+ goto out_unlock;
+ }
+ /* parse chunk indexes */
+ idx = kaddr + erofs_blkoff(sb, pos);
+ switch (le32_to_cpu(idx->blkaddr)) {
+ case EROFS_NULL_ADDR:
+ map->m_flags = 0;
+ break;
+ default:
+ map->m_deviceid = le16_to_cpu(idx->device_id) &
+ EROFS_SB(sb)->device_id_mask;
+ map->m_pa = erofs_pos(sb, le32_to_cpu(idx->blkaddr));
+ map->m_flags = EROFS_MAP_MAPPED;
+ break;
+ }
+out_unlock:
+ erofs_put_metabuf(&buf);
+out:
+ if (!err)
+ map->m_llen = map->m_plen;
+ trace_erofs_map_blocks_exit(inode, map, 0, err);
+ return err;
+}
+
+int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
+{
+ struct erofs_dev_context *devs = EROFS_SB(sb)->devs;
+ struct erofs_device_info *dif;
+ int id;
+
+ map->m_bdev = sb->s_bdev;
+ map->m_daxdev = EROFS_SB(sb)->dax_dev;
+ map->m_dax_part_off = EROFS_SB(sb)->dax_part_off;
+ map->m_fscache = EROFS_SB(sb)->s_fscache;
+
+ if (map->m_deviceid) {
+ down_read(&devs->rwsem);
+ dif = idr_find(&devs->tree, map->m_deviceid - 1);
+ if (!dif) {
+ up_read(&devs->rwsem);
+ return -ENODEV;
+ }
+ if (devs->flatdev) {
+ map->m_pa += erofs_pos(sb, dif->mapped_blkaddr);
+ up_read(&devs->rwsem);
+ return 0;
+ }
+ map->m_bdev = dif->bdev;
+ map->m_daxdev = dif->dax_dev;
+ map->m_dax_part_off = dif->dax_part_off;
+ map->m_fscache = dif->fscache;
+ up_read(&devs->rwsem);
+ } else if (devs->extra_devices && !devs->flatdev) {
+ down_read(&devs->rwsem);
+ idr_for_each_entry(&devs->tree, dif, id) {
+ erofs_off_t startoff, length;
+
+ if (!dif->mapped_blkaddr)
+ continue;
+ startoff = erofs_pos(sb, dif->mapped_blkaddr);
+ length = erofs_pos(sb, dif->blocks);
+
+ if (map->m_pa >= startoff &&
+ map->m_pa < startoff + length) {
+ map->m_pa -= startoff;
+ map->m_bdev = dif->bdev;
+ map->m_daxdev = dif->dax_dev;
+ map->m_dax_part_off = dif->dax_part_off;
+ map->m_fscache = dif->fscache;
+ break;
+ }
+ }
+ up_read(&devs->rwsem);
+ }
+ return 0;
+}
+
+static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
+ unsigned int flags, struct iomap *iomap, struct iomap *srcmap)
+{
+ int ret;
+ struct super_block *sb = inode->i_sb;
+ struct erofs_map_blocks map;
+ struct erofs_map_dev mdev;
+
+ map.m_la = offset;
+ map.m_llen = length;
+
+ ret = erofs_map_blocks(inode, &map);
+ if (ret < 0)
+ return ret;
+
+ mdev = (struct erofs_map_dev) {
+ .m_deviceid = map.m_deviceid,
+ .m_pa = map.m_pa,
+ };
+ ret = erofs_map_dev(sb, &mdev);
+ if (ret)
+ return ret;
+
+ iomap->offset = map.m_la;
+ if (flags & IOMAP_DAX)
+ iomap->dax_dev = mdev.m_daxdev;
+ else
+ iomap->bdev = mdev.m_bdev;
+ iomap->length = map.m_llen;
+ iomap->flags = 0;
+ iomap->private = NULL;
+
+ if (!(map.m_flags & EROFS_MAP_MAPPED)) {
+ iomap->type = IOMAP_HOLE;
+ iomap->addr = IOMAP_NULL_ADDR;
+ if (!iomap->length)
+ iomap->length = length;
+ return 0;
+ }
+
+ if (map.m_flags & EROFS_MAP_META) {
+ void *ptr;
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
+
+ iomap->type = IOMAP_INLINE;
+ ptr = erofs_read_metabuf(&buf, sb,
+ erofs_blknr(sb, mdev.m_pa), EROFS_KMAP);
+ if (IS_ERR(ptr))
+ return PTR_ERR(ptr);
+ iomap->inline_data = ptr + erofs_blkoff(sb, mdev.m_pa);
+ iomap->private = buf.base;
+ } else {
+ iomap->type = IOMAP_MAPPED;
+ iomap->addr = mdev.m_pa;
+ if (flags & IOMAP_DAX)
+ iomap->addr += mdev.m_dax_part_off;
+ }
+ return 0;
+}
+
+static int erofs_iomap_end(struct inode *inode, loff_t pos, loff_t length,
+ ssize_t written, unsigned int flags, struct iomap *iomap)
+{
+ void *ptr = iomap->private;
+
+ if (ptr) {
+ struct erofs_buf buf = {
+ .page = kmap_to_page(ptr),
+ .base = ptr,
+ .kmap_type = EROFS_KMAP,
+ };
+
+ DBG_BUGON(iomap->type != IOMAP_INLINE);
+ erofs_put_metabuf(&buf);
+ } else {
+ DBG_BUGON(iomap->type == IOMAP_INLINE);
+ }
+ return written;
+}
+
+static const struct iomap_ops erofs_iomap_ops = {
+ .iomap_begin = erofs_iomap_begin,
+ .iomap_end = erofs_iomap_end,
+};
+
+int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+ u64 start, u64 len)
+{
+ if (erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout)) {
+#ifdef CONFIG_EROFS_FS_ZIP
+ return iomap_fiemap(inode, fieinfo, start, len,
+ &z_erofs_iomap_report_ops);
+#else
+ return -EOPNOTSUPP;
+#endif
+ }
+ return iomap_fiemap(inode, fieinfo, start, len, &erofs_iomap_ops);
+}
+
+/*
+ * Since there are no write or truncate flows, no inode
+ * locking needs to be held at the moment.
+ */
+static int erofs_read_folio(struct file *file, struct folio *folio)
+{
+ return iomap_read_folio(folio, &erofs_iomap_ops);
+}
+
+static void erofs_readahead(struct readahead_control *rac)
+{
+ return iomap_readahead(rac, &erofs_iomap_ops);
+}
+
+static sector_t erofs_bmap(struct address_space *mapping, sector_t block)
+{
+ return iomap_bmap(mapping, block, &erofs_iomap_ops);
+}
+
+static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+
+ /* no need to take the (shared) inode lock since it's a ro filesystem */
+ if (!iov_iter_count(to))
+ return 0;
+
+#ifdef CONFIG_FS_DAX
+ if (IS_DAX(inode))
+ return dax_iomap_rw(iocb, to, &erofs_iomap_ops);
+#endif
+ if (iocb->ki_flags & IOCB_DIRECT) {
+ struct block_device *bdev = inode->i_sb->s_bdev;
+ unsigned int blksize_mask;
+
+ if (bdev)
+ blksize_mask = bdev_logical_block_size(bdev) - 1;
+ else
+ blksize_mask = i_blocksize(inode) - 1;
+
+ if ((iocb->ki_pos | iov_iter_count(to) |
+ iov_iter_alignment(to)) & blksize_mask)
+ return -EINVAL;
+
+ return iomap_dio_rw(iocb, to, &erofs_iomap_ops,
+ NULL, 0, NULL, 0);
+ }
+ return filemap_read(iocb, to, 0);
+}
+
+/* for uncompressed (aligned) files and raw access for other files */
+const struct address_space_operations erofs_raw_access_aops = {
+ .read_folio = erofs_read_folio,
+ .readahead = erofs_readahead,
+ .bmap = erofs_bmap,
+ .direct_IO = noop_direct_IO,
+ .release_folio = iomap_release_folio,
+ .invalidate_folio = iomap_invalidate_folio,
+};
+
+#ifdef CONFIG_FS_DAX
+static vm_fault_t erofs_dax_huge_fault(struct vm_fault *vmf,
+ unsigned int order)
+{
+ return dax_iomap_fault(vmf, order, NULL, NULL, &erofs_iomap_ops);
+}
+
+static vm_fault_t erofs_dax_fault(struct vm_fault *vmf)
+{
+ return erofs_dax_huge_fault(vmf, 0);
+}
+
+static const struct vm_operations_struct erofs_dax_vm_ops = {
+ .fault = erofs_dax_fault,
+ .huge_fault = erofs_dax_huge_fault,
+};
+
+static int erofs_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ if (!IS_DAX(file_inode(file)))
+ return generic_file_readonly_mmap(file, vma);
+
+ if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
+ return -EINVAL;
+
+ vma->vm_ops = &erofs_dax_vm_ops;
+ vm_flags_set(vma, VM_HUGEPAGE);
+ return 0;
+}
+#else
+#define erofs_file_mmap generic_file_readonly_mmap
+#endif
+
+const struct file_operations erofs_file_fops = {
+ .llseek = generic_file_llseek,
+ .read_iter = erofs_file_read_iter,
+ .mmap = erofs_file_mmap,
+ .splice_read = filemap_splice_read,
+};
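The flat-mode mapping arithmetic in erofs_map_blocks_flatmode() above can be pictured with a small standalone sketch. All numbers are made up, and nblocks is derived from i_size here rather than from erofs_iblks(), so treat it as an approximation of the real helper:

#include <stdio.h>

int main(void)
{
	unsigned int blkbits = 12;		/* 4KiB blocks */
	unsigned long long i_size = 10000;	/* file size in bytes */
	unsigned long long raw_blkaddr = 100;	/* first data block */
	int tailpacking = 1;			/* EROFS_INODE_FLAT_INLINE */
	unsigned long long la = 9000;		/* logical offset to map */

	unsigned long long nblocks =
		(i_size + (1ULL << blkbits) - 1) >> blkbits;
	unsigned long long lastblk = nblocks - tailpacking;

	if (la < (lastblk << blkbits))		/* plain block-mapped range */
		printf("pa=%llu plen=%llu\n",
		       (raw_blkaddr << blkbits) + la,
		       (lastblk << blkbits) - la);
	else					/* tail lives in the metadata block */
		printf("inline tail, plen=%llu\n", i_size - la);
	return 0;
}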
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
new file mode 100644
index 000000000..d36b3963c
--- /dev/null
+++ b/fs/erofs/decompressor.c
@@ -0,0 +1,445 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2019 HUAWEI, Inc.
+ * https://www.huawei.com/
+ */
+#include "compress.h"
+#include <linux/module.h>
+#include <linux/lz4.h>
+
+#ifndef LZ4_DISTANCE_MAX /* history window size */
+#define LZ4_DISTANCE_MAX 65535 /* set to maximum value by default */
+#endif
+
+#define LZ4_MAX_DISTANCE_PAGES (DIV_ROUND_UP(LZ4_DISTANCE_MAX, PAGE_SIZE) + 1)
+#ifndef LZ4_DECOMPRESS_INPLACE_MARGIN
+#define LZ4_DECOMPRESS_INPLACE_MARGIN(srcsize) (((srcsize) >> 8) + 32)
+#endif
+
+struct z_erofs_lz4_decompress_ctx {
+ struct z_erofs_decompress_req *rq;
+ /* # of encoded, decoded pages */
+ unsigned int inpages, outpages;
+ /* decoded block total length (used for in-place decompression) */
+ unsigned int oend;
+};
+
+static int z_erofs_load_lz4_config(struct super_block *sb,
+ struct erofs_super_block *dsb, void *data, int size)
+{
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+ struct z_erofs_lz4_cfgs *lz4 = data;
+ u16 distance;
+
+ if (lz4) {
+ if (size < sizeof(struct z_erofs_lz4_cfgs)) {
+ erofs_err(sb, "invalid lz4 cfgs, size=%u", size);
+ return -EINVAL;
+ }
+ distance = le16_to_cpu(lz4->max_distance);
+
+ sbi->lz4.max_pclusterblks = le16_to_cpu(lz4->max_pclusterblks);
+ if (!sbi->lz4.max_pclusterblks) {
+ sbi->lz4.max_pclusterblks = 1; /* reserved case */
+ } else if (sbi->lz4.max_pclusterblks >
+ erofs_blknr(sb, Z_EROFS_PCLUSTER_MAX_SIZE)) {
+ erofs_err(sb, "too large lz4 pclusterblks %u",
+ sbi->lz4.max_pclusterblks);
+ return -EINVAL;
+ }
+ } else {
+ distance = le16_to_cpu(dsb->u1.lz4_max_distance);
+ sbi->lz4.max_pclusterblks = 1;
+ }
+
+ sbi->lz4.max_distance_pages = distance ?
+ DIV_ROUND_UP(distance, PAGE_SIZE) + 1 :
+ LZ4_MAX_DISTANCE_PAGES;
+ return erofs_pcpubuf_growsize(sbi->lz4.max_pclusterblks);
+}
+
+/*
+ * Fill all gaps with bounce pages if it's a sparse page list. Also check if
+ * all physical pages are consecutive, as often seen with moderate compression ratios.
+ */
+static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx,
+ struct page **pagepool)
+{
+ struct z_erofs_decompress_req *rq = ctx->rq;
+ struct page *availables[LZ4_MAX_DISTANCE_PAGES] = { NULL };
+ unsigned long bounced[DIV_ROUND_UP(LZ4_MAX_DISTANCE_PAGES,
+ BITS_PER_LONG)] = { 0 };
+ unsigned int lz4_max_distance_pages =
+ EROFS_SB(rq->sb)->lz4.max_distance_pages;
+ void *kaddr = NULL;
+ unsigned int i, j, top;
+
+ top = 0;
+ for (i = j = 0; i < ctx->outpages; ++i, ++j) {
+ struct page *const page = rq->out[i];
+ struct page *victim;
+
+ if (j >= lz4_max_distance_pages)
+ j = 0;
+
+ /* 'valid' bounced can only be tested after a complete round */
+ if (!rq->fillgaps && test_bit(j, bounced)) {
+ DBG_BUGON(i < lz4_max_distance_pages);
+ DBG_BUGON(top >= lz4_max_distance_pages);
+ availables[top++] = rq->out[i - lz4_max_distance_pages];
+ }
+
+ if (page) {
+ __clear_bit(j, bounced);
+ if (!PageHighMem(page)) {
+ if (!i) {
+ kaddr = page_address(page);
+ continue;
+ }
+ if (kaddr &&
+ kaddr + PAGE_SIZE == page_address(page)) {
+ kaddr += PAGE_SIZE;
+ continue;
+ }
+ }
+ kaddr = NULL;
+ continue;
+ }
+ kaddr = NULL;
+ __set_bit(j, bounced);
+
+ if (top) {
+ victim = availables[--top];
+ get_page(victim);
+ } else {
+ victim = erofs_allocpage(pagepool,
+ GFP_KERNEL | __GFP_NOFAIL);
+ set_page_private(victim, Z_EROFS_SHORTLIVED_PAGE);
+ }
+ rq->out[i] = victim;
+ }
+ return kaddr ? 1 : 0;
+}
+
+static void *z_erofs_lz4_handle_overlap(struct z_erofs_lz4_decompress_ctx *ctx,
+ void *inpage, void *out, unsigned int *inputmargin,
+ int *maptype, bool may_inplace)
+{
+ struct z_erofs_decompress_req *rq = ctx->rq;
+ unsigned int omargin, total, i;
+ struct page **in;
+ void *src, *tmp;
+
+ if (rq->inplace_io) {
+ omargin = PAGE_ALIGN(ctx->oend) - ctx->oend;
+ if (rq->partial_decoding || !may_inplace ||
+ omargin < LZ4_DECOMPRESS_INPLACE_MARGIN(rq->inputsize))
+ goto docopy;
+
+ for (i = 0; i < ctx->inpages; ++i)
+ if (rq->out[ctx->outpages - ctx->inpages + i] !=
+ rq->in[i])
+ goto docopy;
+ kunmap_local(inpage);
+ *maptype = 3;
+ return out + ((ctx->outpages - ctx->inpages) << PAGE_SHIFT);
+ }
+
+ if (ctx->inpages <= 1) {
+ *maptype = 0;
+ return inpage;
+ }
+ kunmap_local(inpage);
+ src = erofs_vm_map_ram(rq->in, ctx->inpages);
+ if (!src)
+ return ERR_PTR(-ENOMEM);
+ *maptype = 1;
+ return src;
+
+docopy:
+ /* Otherwise, copy compressed data which may overlap into the per-CPU buffer */
+ in = rq->in;
+ src = erofs_get_pcpubuf(ctx->inpages);
+ if (!src) {
+ DBG_BUGON(1);
+ kunmap_local(inpage);
+ return ERR_PTR(-EFAULT);
+ }
+
+ tmp = src;
+ total = rq->inputsize;
+ while (total) {
+ unsigned int page_copycnt =
+ min_t(unsigned int, total, PAGE_SIZE - *inputmargin);
+
+ if (!inpage)
+ inpage = kmap_local_page(*in);
+ memcpy(tmp, inpage + *inputmargin, page_copycnt);
+ kunmap_local(inpage);
+ inpage = NULL;
+ tmp += page_copycnt;
+ total -= page_copycnt;
+ ++in;
+ *inputmargin = 0;
+ }
+ *maptype = 2;
+ return src;
+}
+
+/*
+ * Get the exact inputsize with zero_padding feature.
+ * - For LZ4, it should work if zero_padding feature is on (5.3+);
+ * - For MicroLZMA, it'd be enabled all the time.
+ */
+int z_erofs_fixup_insize(struct z_erofs_decompress_req *rq, const char *padbuf,
+ unsigned int padbufsize)
+{
+ const char *padend;
+
+ padend = memchr_inv(padbuf, 0, padbufsize);
+ if (!padend)
+ return -EFSCORRUPTED;
+ rq->inputsize -= padend - padbuf;
+ rq->pageofs_in += padend - padbuf;
+ return 0;
+}
+
+static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx,
+ u8 *dst)
+{
+ struct z_erofs_decompress_req *rq = ctx->rq;
+ bool support_0padding = false, may_inplace = false;
+ unsigned int inputmargin;
+ u8 *out, *headpage, *src;
+ int ret, maptype;
+
+ DBG_BUGON(*rq->in == NULL);
+ headpage = kmap_local_page(*rq->in);
+
+ /* LZ4 decompression inplace is only safe if zero_padding is enabled */
+ if (erofs_sb_has_zero_padding(EROFS_SB(rq->sb))) {
+ support_0padding = true;
+ ret = z_erofs_fixup_insize(rq, headpage + rq->pageofs_in,
+ min_t(unsigned int, rq->inputsize,
+ rq->sb->s_blocksize - rq->pageofs_in));
+ if (ret) {
+ kunmap_local(headpage);
+ return ret;
+ }
+ may_inplace = !((rq->pageofs_in + rq->inputsize) &
+ (rq->sb->s_blocksize - 1));
+ }
+
+ inputmargin = rq->pageofs_in;
+ src = z_erofs_lz4_handle_overlap(ctx, headpage, dst, &inputmargin,
+ &maptype, may_inplace);
+ if (IS_ERR(src))
+ return PTR_ERR(src);
+
+ out = dst + rq->pageofs_out;
+ /* legacy format could compress extra data in a pcluster. */
+ if (rq->partial_decoding || !support_0padding)
+ ret = LZ4_decompress_safe_partial(src + inputmargin, out,
+ rq->inputsize, rq->outputsize, rq->outputsize);
+ else
+ ret = LZ4_decompress_safe(src + inputmargin, out,
+ rq->inputsize, rq->outputsize);
+
+ if (ret != rq->outputsize) {
+ erofs_err(rq->sb, "failed to decompress %d in[%u, %u] out[%u]",
+ ret, rq->inputsize, inputmargin, rq->outputsize);
+
+ print_hex_dump(KERN_DEBUG, "[ in]: ", DUMP_PREFIX_OFFSET,
+ 16, 1, src + inputmargin, rq->inputsize, true);
+ print_hex_dump(KERN_DEBUG, "[out]: ", DUMP_PREFIX_OFFSET,
+ 16, 1, out, rq->outputsize, true);
+
+ if (ret >= 0)
+ memset(out + ret, 0, rq->outputsize - ret);
+ ret = -EIO;
+ } else {
+ ret = 0;
+ }
+
+ if (maptype == 0) {
+ kunmap_local(headpage);
+ } else if (maptype == 1) {
+ vm_unmap_ram(src, ctx->inpages);
+ } else if (maptype == 2) {
+ erofs_put_pcpubuf(src);
+ } else if (maptype != 3) {
+ DBG_BUGON(1);
+ return -EFAULT;
+ }
+ return ret;
+}
+
+static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq,
+ struct page **pagepool)
+{
+ struct z_erofs_lz4_decompress_ctx ctx;
+ unsigned int dst_maptype;
+ void *dst;
+ int ret;
+
+ ctx.rq = rq;
+ ctx.oend = rq->pageofs_out + rq->outputsize;
+ ctx.outpages = PAGE_ALIGN(ctx.oend) >> PAGE_SHIFT;
+ ctx.inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT;
+
+ /* an optimized fast path only for non-bigpcluster cases for now */
+ if (ctx.inpages == 1 && ctx.outpages == 1 && !rq->inplace_io) {
+ DBG_BUGON(!*rq->out);
+ dst = kmap_local_page(*rq->out);
+ dst_maptype = 0;
+ goto dstmap_out;
+ }
+
+ /* general decoding path which can be used for all cases */
+ ret = z_erofs_lz4_prepare_dstpages(&ctx, pagepool);
+ if (ret < 0) {
+ return ret;
+ } else if (ret > 0) {
+ dst = page_address(*rq->out);
+ dst_maptype = 1;
+ } else {
+ dst = erofs_vm_map_ram(rq->out, ctx.outpages);
+ if (!dst)
+ return -ENOMEM;
+ dst_maptype = 2;
+ }
+
+dstmap_out:
+ ret = z_erofs_lz4_decompress_mem(&ctx, dst);
+ if (!dst_maptype)
+ kunmap_local(dst);
+ else if (dst_maptype == 2)
+ vm_unmap_ram(dst, ctx.outpages);
+ return ret;
+}
+
+static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq,
+ struct page **pagepool)
+{
+ const unsigned int inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT;
+ const unsigned int outpages =
+ PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
+ const unsigned int righthalf = min_t(unsigned int, rq->outputsize,
+ PAGE_SIZE - rq->pageofs_out);
+ const unsigned int lefthalf = rq->outputsize - righthalf;
+ const unsigned int interlaced_offset =
+ rq->alg == Z_EROFS_COMPRESSION_SHIFTED ? 0 : rq->pageofs_out;
+ u8 *src;
+
+ if (outpages > 2 && rq->alg == Z_EROFS_COMPRESSION_SHIFTED) {
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
+ }
+
+ if (rq->out[0] == *rq->in) {
+ DBG_BUGON(rq->pageofs_out);
+ return 0;
+ }
+
+ src = kmap_local_page(rq->in[inpages - 1]) + rq->pageofs_in;
+ if (rq->out[0])
+ memcpy_to_page(rq->out[0], rq->pageofs_out,
+ src + interlaced_offset, righthalf);
+
+ if (outpages > inpages) {
+ DBG_BUGON(!rq->out[outpages - 1]);
+ if (rq->out[outpages - 1] != rq->in[inpages - 1]) {
+ memcpy_to_page(rq->out[outpages - 1], 0, src +
+ (interlaced_offset ? 0 : righthalf),
+ lefthalf);
+ } else if (!interlaced_offset) {
+ memmove(src, src + righthalf, lefthalf);
+ flush_dcache_page(rq->in[inpages - 1]);
+ }
+ }
+ kunmap_local(src);
+ return 0;
+}
+
+const struct z_erofs_decompressor erofs_decompressors[] = {
+ [Z_EROFS_COMPRESSION_SHIFTED] = {
+ .decompress = z_erofs_transform_plain,
+ .name = "shifted"
+ },
+ [Z_EROFS_COMPRESSION_INTERLACED] = {
+ .decompress = z_erofs_transform_plain,
+ .name = "interlaced"
+ },
+ [Z_EROFS_COMPRESSION_LZ4] = {
+ .config = z_erofs_load_lz4_config,
+ .decompress = z_erofs_lz4_decompress,
+ .name = "lz4"
+ },
+#ifdef CONFIG_EROFS_FS_ZIP_LZMA
+ [Z_EROFS_COMPRESSION_LZMA] = {
+ .config = z_erofs_load_lzma_config,
+ .decompress = z_erofs_lzma_decompress,
+ .name = "lzma"
+ },
+#endif
+#ifdef CONFIG_EROFS_FS_ZIP_DEFLATE
+ [Z_EROFS_COMPRESSION_DEFLATE] = {
+ .config = z_erofs_load_deflate_config,
+ .decompress = z_erofs_deflate_decompress,
+ .name = "deflate"
+ },
+#endif
+};
+
+int z_erofs_parse_cfgs(struct super_block *sb, struct erofs_super_block *dsb)
+{
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
+ unsigned int algs, alg;
+ erofs_off_t offset;
+ int size, ret = 0;
+
+ if (!erofs_sb_has_compr_cfgs(sbi)) {
+ sbi->available_compr_algs = 1 << Z_EROFS_COMPRESSION_LZ4;
+ return z_erofs_load_lz4_config(sb, dsb, NULL, 0);
+ }
+
+ sbi->available_compr_algs = le16_to_cpu(dsb->u1.available_compr_algs);
+ if (sbi->available_compr_algs & ~Z_EROFS_ALL_COMPR_ALGS) {
+ erofs_err(sb, "unidentified algorithms %x, please upgrade kernel",
+ sbi->available_compr_algs & ~Z_EROFS_ALL_COMPR_ALGS);
+ return -EOPNOTSUPP;
+ }
+
+ erofs_init_metabuf(&buf, sb);
+ offset = EROFS_SUPER_OFFSET + sbi->sb_size;
+ alg = 0;
+ for (algs = sbi->available_compr_algs; algs; algs >>= 1, ++alg) {
+ void *data;
+
+ if (!(algs & 1))
+ continue;
+
+ data = erofs_read_metadata(sb, &buf, &offset, &size);
+ if (IS_ERR(data)) {
+ ret = PTR_ERR(data);
+ break;
+ }
+
+ if (alg >= ARRAY_SIZE(erofs_decompressors) ||
+ !erofs_decompressors[alg].config) {
+ erofs_err(sb, "algorithm %d isn't enabled on this kernel",
+ alg);
+ ret = -EOPNOTSUPP;
+ } else {
+ ret = erofs_decompressors[alg].config(sb,
+ dsb, data, size);
+ }
+
+ kfree(data);
+ if (ret)
+ break;
+ }
+ erofs_put_metabuf(&buf);
+ return ret;
+}
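z_erofs_fixup_insize() above simply trims the zero padding that precedes the right-aligned compressed data inside a block. A userspace sketch of the same idea follows; memchr_inv() is open-coded as first_nonzero() since it is a kernel helper, and the byte values are invented:

#include <stdio.h>
#include <stddef.h>

static const unsigned char *first_nonzero(const unsigned char *p, size_t n)
{
	for (size_t i = 0; i < n; i++)
		if (p[i])
			return p + i;
	return NULL;				/* all zero: corrupted pcluster */
}

int main(void)
{
	/* 4 bytes of zero padding followed by the compressed stream */
	unsigned char blk[16] = { 0, 0, 0, 0, 0xf1, 0x22, 0x33, 0x44 };
	unsigned int inputsize = sizeof(blk), pageofs_in = 0;

	const unsigned char *padend = first_nonzero(blk, sizeof(blk));
	if (!padend)
		return 1;			/* -EFSCORRUPTED in the kernel */
	inputsize -= padend - blk;
	pageofs_in += padend - blk;
	printf("inputsize=%u pageofs_in=%u\n", inputsize, pageofs_in);
	return 0;
}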
diff --git a/fs/erofs/decompressor_deflate.c b/fs/erofs/decompressor_deflate.c
new file mode 100644
index 000000000..0e1946a6b
--- /dev/null
+++ b/fs/erofs/decompressor_deflate.c
@@ -0,0 +1,248 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include <linux/module.h>
+#include <linux/zlib.h>
+#include "compress.h"
+
+struct z_erofs_deflate {
+ struct z_erofs_deflate *next;
+ struct z_stream_s z;
+ u8 bounce[PAGE_SIZE];
+};
+
+static DEFINE_SPINLOCK(z_erofs_deflate_lock);
+static unsigned int z_erofs_deflate_nstrms, z_erofs_deflate_avail_strms;
+static struct z_erofs_deflate *z_erofs_deflate_head;
+static DECLARE_WAIT_QUEUE_HEAD(z_erofs_deflate_wq);
+
+module_param_named(deflate_streams, z_erofs_deflate_nstrms, uint, 0444);
+
+void z_erofs_deflate_exit(void)
+{
+ /* there should be no running fs instance */
+ while (z_erofs_deflate_avail_strms) {
+ struct z_erofs_deflate *strm;
+
+ spin_lock(&z_erofs_deflate_lock);
+ strm = z_erofs_deflate_head;
+ if (!strm) {
+ spin_unlock(&z_erofs_deflate_lock);
+ continue;
+ }
+ z_erofs_deflate_head = NULL;
+ spin_unlock(&z_erofs_deflate_lock);
+
+ while (strm) {
+ struct z_erofs_deflate *n = strm->next;
+
+ vfree(strm->z.workspace);
+ kfree(strm);
+ --z_erofs_deflate_avail_strms;
+ strm = n;
+ }
+ }
+}
+
+int __init z_erofs_deflate_init(void)
+{
+ /* by default, use # of possible CPUs instead */
+ if (!z_erofs_deflate_nstrms)
+ z_erofs_deflate_nstrms = num_possible_cpus();
+
+ for (; z_erofs_deflate_avail_strms < z_erofs_deflate_nstrms;
+ ++z_erofs_deflate_avail_strms) {
+ struct z_erofs_deflate *strm;
+
+ strm = kzalloc(sizeof(*strm), GFP_KERNEL);
+ if (!strm)
+ goto out_failed;
+
+ /* XXX: in-kernel zlib cannot shrink windowbits currently */
+ strm->z.workspace = vmalloc(zlib_inflate_workspacesize());
+ if (!strm->z.workspace) {
+ kfree(strm);
+ goto out_failed;
+ }
+
+ spin_lock(&z_erofs_deflate_lock);
+ strm->next = z_erofs_deflate_head;
+ z_erofs_deflate_head = strm;
+ spin_unlock(&z_erofs_deflate_lock);
+ }
+ return 0;
+
+out_failed:
+ pr_err("failed to allocate zlib workspace\n");
+ z_erofs_deflate_exit();
+ return -ENOMEM;
+}
+
+int z_erofs_load_deflate_config(struct super_block *sb,
+ struct erofs_super_block *dsb, void *data, int size)
+{
+ struct z_erofs_deflate_cfgs *dfl = data;
+
+ if (!dfl || size < sizeof(struct z_erofs_deflate_cfgs)) {
+ erofs_err(sb, "invalid deflate cfgs, size=%u", size);
+ return -EINVAL;
+ }
+
+ if (dfl->windowbits > MAX_WBITS) {
+ erofs_err(sb, "unsupported windowbits %u", dfl->windowbits);
+ return -EOPNOTSUPP;
+ }
+
+ erofs_info(sb, "EXPERIMENTAL DEFLATE feature in use. Use at your own risk!");
+ return 0;
+}
+
+int z_erofs_deflate_decompress(struct z_erofs_decompress_req *rq,
+ struct page **pagepool)
+{
+ const unsigned int nrpages_out =
+ PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
+ const unsigned int nrpages_in =
+ PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT;
+ struct super_block *sb = rq->sb;
+ unsigned int insz, outsz, pofs;
+ struct z_erofs_deflate *strm;
+ u8 *kin, *kout = NULL;
+ bool bounced = false;
+ int no = -1, ni = 0, j = 0, zerr, err;
+
+ /* 1. get the exact DEFLATE compressed size */
+ kin = kmap_local_page(*rq->in);
+ err = z_erofs_fixup_insize(rq, kin + rq->pageofs_in,
+ min_t(unsigned int, rq->inputsize,
+ sb->s_blocksize - rq->pageofs_in));
+ if (err) {
+ kunmap_local(kin);
+ return err;
+ }
+
+ /* 2. get an available DEFLATE context */
+again:
+ spin_lock(&z_erofs_deflate_lock);
+ strm = z_erofs_deflate_head;
+ if (!strm) {
+ spin_unlock(&z_erofs_deflate_lock);
+ wait_event(z_erofs_deflate_wq, READ_ONCE(z_erofs_deflate_head));
+ goto again;
+ }
+ z_erofs_deflate_head = strm->next;
+ spin_unlock(&z_erofs_deflate_lock);
+
+ /* 3. multi-call decompress */
+ insz = rq->inputsize;
+ outsz = rq->outputsize;
+ zerr = zlib_inflateInit2(&strm->z, -MAX_WBITS);
+ if (zerr != Z_OK) {
+ err = -EIO;
+ goto failed_zinit;
+ }
+
+ pofs = rq->pageofs_out;
+ strm->z.avail_in = min_t(u32, insz, PAGE_SIZE - rq->pageofs_in);
+ insz -= strm->z.avail_in;
+ strm->z.next_in = kin + rq->pageofs_in;
+ strm->z.avail_out = 0;
+
+ while (1) {
+ if (!strm->z.avail_out) {
+ if (++no >= nrpages_out || !outsz) {
+ erofs_err(sb, "insufficient space for decompressed data");
+ err = -EFSCORRUPTED;
+ break;
+ }
+
+ if (kout)
+ kunmap_local(kout);
+ strm->z.avail_out = min_t(u32, outsz, PAGE_SIZE - pofs);
+ outsz -= strm->z.avail_out;
+ if (!rq->out[no]) {
+ rq->out[no] = erofs_allocpage(pagepool,
+ GFP_KERNEL | __GFP_NOFAIL);
+ set_page_private(rq->out[no],
+ Z_EROFS_SHORTLIVED_PAGE);
+ }
+ kout = kmap_local_page(rq->out[no]);
+ strm->z.next_out = kout + pofs;
+ pofs = 0;
+ }
+
+ if (!strm->z.avail_in && insz) {
+ if (++ni >= nrpages_in) {
+ erofs_err(sb, "invalid compressed data");
+ err = -EFSCORRUPTED;
+ break;
+ }
+
+ if (kout) { /* unlike kmap(), take care of the orders */
+ j = strm->z.next_out - kout;
+ kunmap_local(kout);
+ }
+ kunmap_local(kin);
+ strm->z.avail_in = min_t(u32, insz, PAGE_SIZE);
+ insz -= strm->z.avail_in;
+ kin = kmap_local_page(rq->in[ni]);
+ strm->z.next_in = kin;
+ bounced = false;
+ if (kout) {
+ kout = kmap_local_page(rq->out[no]);
+ strm->z.next_out = kout + j;
+ }
+ }
+
+ /*
+ * Handle overlapping: use the bounce buffer if the compressed
+ * data is still being processed; otherwise use short-lived pages
+ * from the on-stack pagepool, shared within the same request, so
+ * that not _all_ inplace I/O pages need to be duplicated.
+ */
+ if (!bounced && rq->out[no] == rq->in[ni]) {
+ memcpy(strm->bounce, strm->z.next_in, strm->z.avail_in);
+ strm->z.next_in = strm->bounce;
+ bounced = true;
+ }
+
+ for (j = ni + 1; j < nrpages_in; ++j) {
+ struct page *tmppage;
+
+ if (rq->out[no] != rq->in[j])
+ continue;
+
+ DBG_BUGON(erofs_page_is_managed(EROFS_SB(sb),
+ rq->in[j]));
+ tmppage = erofs_allocpage(pagepool,
+ GFP_KERNEL | __GFP_NOFAIL);
+ set_page_private(tmppage, Z_EROFS_SHORTLIVED_PAGE);
+ copy_highpage(tmppage, rq->in[j]);
+ rq->in[j] = tmppage;
+ }
+
+ zerr = zlib_inflate(&strm->z, Z_SYNC_FLUSH);
+ if (zerr != Z_OK || !(outsz + strm->z.avail_out)) {
+ if (zerr == Z_OK && rq->partial_decoding)
+ break;
+ if (zerr == Z_STREAM_END && !outsz)
+ break;
+ erofs_err(sb, "failed to decompress %d in[%u] out[%u]",
+ zerr, rq->inputsize, rq->outputsize);
+ err = -EFSCORRUPTED;
+ break;
+ }
+ }
+
+ if (zlib_inflateEnd(&strm->z) != Z_OK && !err)
+ err = -EIO;
+ if (kout)
+ kunmap_local(kout);
+failed_zinit:
+ kunmap_local(kin);
+ /* 4. push back DEFLATE stream context to the global list */
+ spin_lock(&z_erofs_deflate_lock);
+ strm->next = z_erofs_deflate_head;
+ z_erofs_deflate_head = strm;
+ spin_unlock(&z_erofs_deflate_lock);
+ wake_up(&z_erofs_deflate_wq);
+ return err;
+}
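Both this DEFLATE backend and the LZMA backend below hand out decompression contexts from a fixed-size pool: a spinlock-protected singly linked list plus a wait queue. Below is a rough userspace analogue of that pattern, using a pthread mutex and condition variable in place of the spinlock and wait queue; struct strm here is only a stand-in, not the kernel structure:

#include <pthread.h>
#include <stdio.h>

struct strm { struct strm *next; };

static struct strm *head;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t avail = PTHREAD_COND_INITIALIZER;

static struct strm *get_strm(void)
{
	pthread_mutex_lock(&lock);
	while (!head)
		pthread_cond_wait(&avail, &lock);	/* wait_event() in-kernel */
	struct strm *s = head;
	head = s->next;
	pthread_mutex_unlock(&lock);
	return s;
}

static void put_strm(struct strm *s)
{
	pthread_mutex_lock(&lock);
	s->next = head;
	head = s;
	pthread_mutex_unlock(&lock);
	pthread_cond_signal(&avail);			/* wake_up() in-kernel */
}

int main(void)
{
	struct strm pool[2];

	put_strm(&pool[0]);
	put_strm(&pool[1]);

	struct strm *s = get_strm();			/* would block if empty */
	printf("got stream %p\n", (void *)s);
	put_strm(s);
	return 0;
}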
diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c
new file mode 100644
index 000000000..ba4ec73f4
--- /dev/null
+++ b/fs/erofs/decompressor_lzma.c
@@ -0,0 +1,294 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include <linux/xz.h>
+#include <linux/module.h>
+#include "compress.h"
+
+struct z_erofs_lzma {
+ struct z_erofs_lzma *next;
+ struct xz_dec_microlzma *state;
+ struct xz_buf buf;
+ u8 bounce[PAGE_SIZE];
+};
+
+/* considering the LZMA performance, no need to use a lockless list for now */
+static DEFINE_SPINLOCK(z_erofs_lzma_lock);
+static unsigned int z_erofs_lzma_max_dictsize;
+static unsigned int z_erofs_lzma_nstrms, z_erofs_lzma_avail_strms;
+static struct z_erofs_lzma *z_erofs_lzma_head;
+static DECLARE_WAIT_QUEUE_HEAD(z_erofs_lzma_wq);
+
+module_param_named(lzma_streams, z_erofs_lzma_nstrms, uint, 0444);
+
+void z_erofs_lzma_exit(void)
+{
+ /* there should be no running fs instance */
+ while (z_erofs_lzma_avail_strms) {
+ struct z_erofs_lzma *strm;
+
+ spin_lock(&z_erofs_lzma_lock);
+ strm = z_erofs_lzma_head;
+ if (!strm) {
+ spin_unlock(&z_erofs_lzma_lock);
+ DBG_BUGON(1);
+ return;
+ }
+ z_erofs_lzma_head = NULL;
+ spin_unlock(&z_erofs_lzma_lock);
+
+ while (strm) {
+ struct z_erofs_lzma *n = strm->next;
+
+ if (strm->state)
+ xz_dec_microlzma_end(strm->state);
+ kfree(strm);
+ --z_erofs_lzma_avail_strms;
+ strm = n;
+ }
+ }
+}
+
+int __init z_erofs_lzma_init(void)
+{
+ unsigned int i;
+
+ /* by default, use # of possible CPUs instead */
+ if (!z_erofs_lzma_nstrms)
+ z_erofs_lzma_nstrms = num_possible_cpus();
+
+ for (i = 0; i < z_erofs_lzma_nstrms; ++i) {
+ struct z_erofs_lzma *strm = kzalloc(sizeof(*strm), GFP_KERNEL);
+
+ if (!strm) {
+ z_erofs_lzma_exit();
+ return -ENOMEM;
+ }
+ spin_lock(&z_erofs_lzma_lock);
+ strm->next = z_erofs_lzma_head;
+ z_erofs_lzma_head = strm;
+ spin_unlock(&z_erofs_lzma_lock);
+ ++z_erofs_lzma_avail_strms;
+ }
+ return 0;
+}
+
+int z_erofs_load_lzma_config(struct super_block *sb,
+ struct erofs_super_block *dsb, void *data, int size)
+{
+ static DEFINE_MUTEX(lzma_resize_mutex);
+ struct z_erofs_lzma_cfgs *lzma = data;
+ unsigned int dict_size, i;
+ struct z_erofs_lzma *strm, *head = NULL;
+ int err;
+
+ if (!lzma || size < sizeof(struct z_erofs_lzma_cfgs)) {
+ erofs_err(sb, "invalid lzma cfgs, size=%u", size);
+ return -EINVAL;
+ }
+ if (lzma->format) {
+ erofs_err(sb, "unidentified lzma format %x, please check kernel version",
+ le16_to_cpu(lzma->format));
+ return -EINVAL;
+ }
+ dict_size = le32_to_cpu(lzma->dict_size);
+ if (dict_size > Z_EROFS_LZMA_MAX_DICT_SIZE || dict_size < 4096) {
+ erofs_err(sb, "unsupported lzma dictionary size %u",
+ dict_size);
+ return -EINVAL;
+ }
+
+ erofs_info(sb, "EXPERIMENTAL MicroLZMA in use. Use at your own risk!");
+
+ /* serialize in case two z_erofs_load_lzma_config() calls race, to avoid deadlock */
+ mutex_lock(&lzma_resize_mutex);
+
+ if (z_erofs_lzma_max_dictsize >= dict_size) {
+ mutex_unlock(&lzma_resize_mutex);
+ return 0;
+ }
+
+ /* 1. collect/isolate all streams for the following check */
+ for (i = 0; i < z_erofs_lzma_avail_strms; ++i) {
+ struct z_erofs_lzma *last;
+
+again:
+ spin_lock(&z_erofs_lzma_lock);
+ strm = z_erofs_lzma_head;
+ if (!strm) {
+ spin_unlock(&z_erofs_lzma_lock);
+ wait_event(z_erofs_lzma_wq,
+ READ_ONCE(z_erofs_lzma_head));
+ goto again;
+ }
+ z_erofs_lzma_head = NULL;
+ spin_unlock(&z_erofs_lzma_lock);
+
+ for (last = strm; last->next; last = last->next)
+ ++i;
+ last->next = head;
+ head = strm;
+ }
+
+ err = 0;
+ /* 2. walk each isolated stream and grow max dict_size if needed */
+ for (strm = head; strm; strm = strm->next) {
+ if (strm->state)
+ xz_dec_microlzma_end(strm->state);
+ strm->state = xz_dec_microlzma_alloc(XZ_PREALLOC, dict_size);
+ if (!strm->state)
+ err = -ENOMEM;
+ }
+
+ /* 3. push back all to the global list and update max dict_size */
+ spin_lock(&z_erofs_lzma_lock);
+ DBG_BUGON(z_erofs_lzma_head);
+ z_erofs_lzma_head = head;
+ spin_unlock(&z_erofs_lzma_lock);
+ wake_up_all(&z_erofs_lzma_wq);
+
+ z_erofs_lzma_max_dictsize = dict_size;
+ mutex_unlock(&lzma_resize_mutex);
+ return err;
+}
+
+int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq,
+ struct page **pagepool)
+{
+ const unsigned int nrpages_out =
+ PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
+ const unsigned int nrpages_in =
+ PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT;
+ unsigned int inlen, outlen, pageofs;
+ struct z_erofs_lzma *strm;
+ u8 *kin;
+ bool bounced = false;
+ int no, ni, j, err = 0;
+
+ /* 1. get the exact LZMA compressed size */
+ kin = kmap(*rq->in);
+ err = z_erofs_fixup_insize(rq, kin + rq->pageofs_in,
+ min_t(unsigned int, rq->inputsize,
+ rq->sb->s_blocksize - rq->pageofs_in));
+ if (err) {
+ kunmap(*rq->in);
+ return err;
+ }
+
+ /* 2. get an available lzma context */
+again:
+ spin_lock(&z_erofs_lzma_lock);
+ strm = z_erofs_lzma_head;
+ if (!strm) {
+ spin_unlock(&z_erofs_lzma_lock);
+ wait_event(z_erofs_lzma_wq, READ_ONCE(z_erofs_lzma_head));
+ goto again;
+ }
+ z_erofs_lzma_head = strm->next;
+ spin_unlock(&z_erofs_lzma_lock);
+
+ /* 3. multi-call decompress */
+ inlen = rq->inputsize;
+ outlen = rq->outputsize;
+ xz_dec_microlzma_reset(strm->state, inlen, outlen,
+ !rq->partial_decoding);
+ pageofs = rq->pageofs_out;
+ strm->buf.in = kin + rq->pageofs_in;
+ strm->buf.in_pos = 0;
+ strm->buf.in_size = min_t(u32, inlen, PAGE_SIZE - rq->pageofs_in);
+ inlen -= strm->buf.in_size;
+ strm->buf.out = NULL;
+ strm->buf.out_pos = 0;
+ strm->buf.out_size = 0;
+
+ for (ni = 0, no = -1;;) {
+ enum xz_ret xz_err;
+
+ if (strm->buf.out_pos == strm->buf.out_size) {
+ if (strm->buf.out) {
+ kunmap(rq->out[no]);
+ strm->buf.out = NULL;
+ }
+
+ if (++no >= nrpages_out || !outlen) {
+ erofs_err(rq->sb, "decompressed buf out of bound");
+ err = -EFSCORRUPTED;
+ break;
+ }
+ strm->buf.out_pos = 0;
+ strm->buf.out_size = min_t(u32, outlen,
+ PAGE_SIZE - pageofs);
+ outlen -= strm->buf.out_size;
+ if (!rq->out[no] && rq->fillgaps) { /* deduped */
+ rq->out[no] = erofs_allocpage(pagepool,
+ GFP_KERNEL | __GFP_NOFAIL);
+ set_page_private(rq->out[no],
+ Z_EROFS_SHORTLIVED_PAGE);
+ }
+ if (rq->out[no])
+ strm->buf.out = kmap(rq->out[no]) + pageofs;
+ pageofs = 0;
+ } else if (strm->buf.in_pos == strm->buf.in_size) {
+ kunmap(rq->in[ni]);
+
+ if (++ni >= nrpages_in || !inlen) {
+ erofs_err(rq->sb, "compressed buf out of bound");
+ err = -EFSCORRUPTED;
+ break;
+ }
+ strm->buf.in_pos = 0;
+ strm->buf.in_size = min_t(u32, inlen, PAGE_SIZE);
+ inlen -= strm->buf.in_size;
+ kin = kmap(rq->in[ni]);
+ strm->buf.in = kin;
+ bounced = false;
+ }
+
+ /*
+ * Handle overlapping: use the bounce buffer if the compressed
+ * data is still being processed; otherwise use short-lived pages
+ * from the on-stack pagepool, shared within the same request.
+ */
+ if (!bounced && rq->out[no] == rq->in[ni]) {
+ memcpy(strm->bounce, strm->buf.in, strm->buf.in_size);
+ strm->buf.in = strm->bounce;
+ bounced = true;
+ }
+ for (j = ni + 1; j < nrpages_in; ++j) {
+ struct page *tmppage;
+
+ if (rq->out[no] != rq->in[j])
+ continue;
+
+ DBG_BUGON(erofs_page_is_managed(EROFS_SB(rq->sb),
+ rq->in[j]));
+ tmppage = erofs_allocpage(pagepool,
+ GFP_KERNEL | __GFP_NOFAIL);
+ set_page_private(tmppage, Z_EROFS_SHORTLIVED_PAGE);
+ copy_highpage(tmppage, rq->in[j]);
+ rq->in[j] = tmppage;
+ }
+ xz_err = xz_dec_microlzma_run(strm->state, &strm->buf);
+ DBG_BUGON(strm->buf.out_pos > strm->buf.out_size);
+ DBG_BUGON(strm->buf.in_pos > strm->buf.in_size);
+
+ if (xz_err != XZ_OK) {
+ if (xz_err == XZ_STREAM_END && !outlen)
+ break;
+ erofs_err(rq->sb, "failed to decompress %d in[%u] out[%u]",
+ xz_err, rq->inputsize, rq->outputsize);
+ err = -EFSCORRUPTED;
+ break;
+ }
+ }
+ if (no < nrpages_out && strm->buf.out)
+ kunmap(rq->out[no]);
+ if (ni < nrpages_in)
+ kunmap(rq->in[ni]);
+ /* 4. push back LZMA stream context to the global list */
+ spin_lock(&z_erofs_lzma_lock);
+ strm->next = z_erofs_lzma_head;
+ z_erofs_lzma_head = strm;
+ spin_unlock(&z_erofs_lzma_lock);
+ wake_up(&z_erofs_lzma_wq);
+ return err;
+}
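The overlap check used by both decompressors (rq->out[no] == rq->in[ni]) exists because a page may serve as compressed input and decompressed output at the same time. The toy below uses plain byte arrays and a reversing loop standing in for real decompression, purely to show why the copy to a bounce buffer is needed:

#include <stdio.h>
#include <string.h>

int main(void)
{
	unsigned char page[8] = { 'a', 'b', 'c', 'd' };	/* both input and output */
	unsigned char bounce[8];
	const unsigned char *in = page;

	if (in == page) {			/* rq->out[no] == rq->in[ni] */
		memcpy(bounce, in, sizeof(bounce));
		in = bounce;			/* decode from the copy instead */
	}

	/* "decompress" (here: reverse) into page without clobbering the input */
	for (int i = 0; i < 4; i++)
		page[i] = in[3 - i];
	printf("%.4s\n", (char *)page);		/* prints "dcba" */
	return 0;
}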
diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c
new file mode 100644
index 000000000..b80abec05
--- /dev/null
+++ b/fs/erofs/dir.c
@@ -0,0 +1,111 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2017-2018 HUAWEI, Inc.
+ * https://www.huawei.com/
+ * Copyright (C) 2022, Alibaba Cloud
+ */
+#include "internal.h"
+
+static int erofs_fill_dentries(struct inode *dir, struct dir_context *ctx,
+ void *dentry_blk, struct erofs_dirent *de,
+ unsigned int nameoff, unsigned int maxsize)
+{
+ const struct erofs_dirent *end = dentry_blk + nameoff;
+
+ while (de < end) {
+ const char *de_name;
+ unsigned int de_namelen;
+ unsigned char d_type;
+
+ d_type = fs_ftype_to_dtype(de->file_type);
+
+ nameoff = le16_to_cpu(de->nameoff);
+ de_name = (char *)dentry_blk + nameoff;
+
+ /* the last dirent in the block? */
+ if (de + 1 >= end)
+ de_namelen = strnlen(de_name, maxsize - nameoff);
+ else
+ de_namelen = le16_to_cpu(de[1].nameoff) - nameoff;
+
+ /* a corrupted entry is found */
+ if (nameoff + de_namelen > maxsize ||
+ de_namelen > EROFS_NAME_LEN) {
+ erofs_err(dir->i_sb, "bogus dirent @ nid %llu",
+ EROFS_I(dir)->nid);
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
+ }
+
+ if (!dir_emit(ctx, de_name, de_namelen,
+ le64_to_cpu(de->nid), d_type))
+ return 1;
+ ++de;
+ ctx->pos += sizeof(struct erofs_dirent);
+ }
+ return 0;
+}
+
+static int erofs_readdir(struct file *f, struct dir_context *ctx)
+{
+ struct inode *dir = file_inode(f);
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
+ struct super_block *sb = dir->i_sb;
+ unsigned long bsz = sb->s_blocksize;
+ const size_t dirsize = i_size_read(dir);
+ unsigned int i = erofs_blknr(sb, ctx->pos);
+ unsigned int ofs = erofs_blkoff(sb, ctx->pos);
+ int err = 0;
+ bool initial = true;
+
+ buf.inode = dir;
+ while (ctx->pos < dirsize) {
+ struct erofs_dirent *de;
+ unsigned int nameoff, maxsize;
+
+ de = erofs_bread(&buf, i, EROFS_KMAP);
+ if (IS_ERR(de)) {
+ erofs_err(sb, "fail to readdir of logical block %u of nid %llu",
+ i, EROFS_I(dir)->nid);
+ err = PTR_ERR(de);
+ break;
+ }
+
+ nameoff = le16_to_cpu(de->nameoff);
+ if (nameoff < sizeof(struct erofs_dirent) || nameoff >= bsz) {
+ erofs_err(sb, "invalid de[0].nameoff %u @ nid %llu",
+ nameoff, EROFS_I(dir)->nid);
+ err = -EFSCORRUPTED;
+ break;
+ }
+
+ maxsize = min_t(unsigned int, dirsize - ctx->pos + ofs, bsz);
+
+ /* search dirents at an arbitrary position */
+ if (initial) {
+ initial = false;
+
+ ofs = roundup(ofs, sizeof(struct erofs_dirent));
+ ctx->pos = erofs_pos(sb, i) + ofs;
+ if (ofs >= nameoff)
+ goto skip_this;
+ }
+
+ err = erofs_fill_dentries(dir, ctx, de, (void *)de + ofs,
+ nameoff, maxsize);
+ if (err)
+ break;
+skip_this:
+ ctx->pos = erofs_pos(sb, i) + maxsize;
+ ++i;
+ ofs = 0;
+ }
+ erofs_put_metabuf(&buf);
+ return err < 0 ? err : 0;
+}
+
+const struct file_operations erofs_dir_fops = {
+ .llseek = generic_file_llseek,
+ .read = generic_read_dir,
+ .iterate_shared = erofs_readdir,
+};
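The name-length computation in erofs_fill_dentries() above relies on the on-disk layout: a dirent array at the front of the block with names packed back-to-back behind it, so entry i's name runs from its nameoff to the next entry's nameoff, and the last name is bounded by the block end. A toy model with invented offsets:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char blk[32] = { 0 };			/* one directory block */
	unsigned short nameoff[2] = { 24, 27 };	/* two entries, names packed */
	unsigned int maxsize = sizeof(blk);

	memcpy(blk + 24, "etc", 3);
	memcpy(blk + 27, "usr", 3);

	for (int i = 0; i < 2; i++) {
		size_t len = (i + 1 < 2) ?
			(size_t)(nameoff[i + 1] - nameoff[i]) :
			strnlen(blk + nameoff[i], maxsize - nameoff[i]);
		printf("%.*s\n", (int)len, blk + nameoff[i]);
	}
	return 0;
}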
diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h
new file mode 100644
index 000000000..a03ec70ba
--- /dev/null
+++ b/fs/erofs/erofs_fs.h
@@ -0,0 +1,461 @@
+/* SPDX-License-Identifier: GPL-2.0-only OR Apache-2.0 */
+/*
+ * EROFS (Enhanced ROM File System) on-disk format definition
+ *
+ * Copyright (C) 2017-2018 HUAWEI, Inc.
+ * https://www.huawei.com/
+ * Copyright (C) 2021, Alibaba Cloud
+ */
+#ifndef __EROFS_FS_H
+#define __EROFS_FS_H
+
+#define EROFS_SUPER_OFFSET 1024
+
+#define EROFS_FEATURE_COMPAT_SB_CHKSUM 0x00000001
+#define EROFS_FEATURE_COMPAT_MTIME 0x00000002
+#define EROFS_FEATURE_COMPAT_XATTR_FILTER 0x00000004
+
+/*
+ * Any bits that aren't in EROFS_ALL_FEATURE_INCOMPAT should
+ * be incompatible with this kernel version.
+ */
+#define EROFS_FEATURE_INCOMPAT_ZERO_PADDING 0x00000001
+#define EROFS_FEATURE_INCOMPAT_COMPR_CFGS 0x00000002
+#define EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER 0x00000002
+#define EROFS_FEATURE_INCOMPAT_CHUNKED_FILE 0x00000004
+#define EROFS_FEATURE_INCOMPAT_DEVICE_TABLE 0x00000008
+#define EROFS_FEATURE_INCOMPAT_COMPR_HEAD2 0x00000008
+#define EROFS_FEATURE_INCOMPAT_ZTAILPACKING 0x00000010
+#define EROFS_FEATURE_INCOMPAT_FRAGMENTS 0x00000020
+#define EROFS_FEATURE_INCOMPAT_DEDUPE 0x00000020
+#define EROFS_FEATURE_INCOMPAT_XATTR_PREFIXES 0x00000040
+#define EROFS_ALL_FEATURE_INCOMPAT \
+ (EROFS_FEATURE_INCOMPAT_ZERO_PADDING | \
+ EROFS_FEATURE_INCOMPAT_COMPR_CFGS | \
+ EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER | \
+ EROFS_FEATURE_INCOMPAT_CHUNKED_FILE | \
+ EROFS_FEATURE_INCOMPAT_DEVICE_TABLE | \
+ EROFS_FEATURE_INCOMPAT_COMPR_HEAD2 | \
+ EROFS_FEATURE_INCOMPAT_ZTAILPACKING | \
+ EROFS_FEATURE_INCOMPAT_FRAGMENTS | \
+ EROFS_FEATURE_INCOMPAT_DEDUPE | \
+ EROFS_FEATURE_INCOMPAT_XATTR_PREFIXES)
+
+#define EROFS_SB_EXTSLOT_SIZE 16
+
+struct erofs_deviceslot {
+ u8 tag[64]; /* digest(sha256), etc. */
+ __le32 blocks; /* total fs blocks of this device */
+ __le32 mapped_blkaddr; /* map starting at mapped_blkaddr */
+ u8 reserved[56];
+};
+#define EROFS_DEVT_SLOT_SIZE sizeof(struct erofs_deviceslot)
+
+/* erofs on-disk super block (currently 128 bytes) */
+struct erofs_super_block {
+ __le32 magic; /* file system magic number */
+ __le32 checksum; /* crc32c(super_block) */
+ __le32 feature_compat;
+ __u8 blkszbits; /* filesystem block size in bit shift */
+ __u8 sb_extslots; /* superblock size = 128 + sb_extslots * 16 */
+
+ __le16 root_nid; /* nid of root directory */
+ __le64 inos; /* total valid ino # (== f_files - f_favail) */
+
+ __le64 build_time; /* compact inode time derivation */
+ __le32 build_time_nsec; /* compact inode time derivation in ns scale */
+ __le32 blocks; /* used for statfs */
+ __le32 meta_blkaddr; /* start block address of metadata area */
+ __le32 xattr_blkaddr; /* start block address of shared xattr area */
+ __u8 uuid[16]; /* 128-bit uuid for volume */
+ __u8 volume_name[16]; /* volume name */
+ __le32 feature_incompat;
+ union {
+ /* bitmap for available compression algorithms */
+ __le16 available_compr_algs;
+ /* customized sliding window size instead of 64k by default */
+ __le16 lz4_max_distance;
+ } __packed u1;
+ __le16 extra_devices; /* # of devices besides the primary device */
+ __le16 devt_slotoff; /* startoff = devt_slotoff * devt_slotsize */
+ __u8 dirblkbits; /* directory block size in bit shift */
+ __u8 xattr_prefix_count; /* # of long xattr name prefixes */
+ __le32 xattr_prefix_start; /* start of long xattr prefixes */
+ __le64 packed_nid; /* nid of the special packed inode */
+ __u8 xattr_filter_reserved; /* reserved for xattr name filter */
+ __u8 reserved2[23];
+};
+
+/*
+ * EROFS inode datalayout (i_format in on-disk inode):
+ * 0 - uncompressed flat inode without tail-packing inline data:
+ * 1 - compressed inode with non-compact indexes:
+ * 2 - uncompressed flat inode with tail-packing inline data:
+ * 3 - compressed inode with compact indexes:
+ * 4 - chunk-based inode with (optional) multi-device support:
+ * 5~7 - reserved
+ */
+enum {
+ EROFS_INODE_FLAT_PLAIN = 0,
+ EROFS_INODE_COMPRESSED_FULL = 1,
+ EROFS_INODE_FLAT_INLINE = 2,
+ EROFS_INODE_COMPRESSED_COMPACT = 3,
+ EROFS_INODE_CHUNK_BASED = 4,
+ EROFS_INODE_DATALAYOUT_MAX
+};
+
+static inline bool erofs_inode_is_data_compressed(unsigned int datamode)
+{
+ return datamode == EROFS_INODE_COMPRESSED_COMPACT ||
+ datamode == EROFS_INODE_COMPRESSED_FULL;
+}
+
+/* bit definitions of inode i_format */
+#define EROFS_I_VERSION_MASK 0x01
+#define EROFS_I_DATALAYOUT_MASK 0x07
+
+#define EROFS_I_VERSION_BIT 0
+#define EROFS_I_DATALAYOUT_BIT 1
+#define EROFS_I_ALL_BIT 4
+
+#define EROFS_I_ALL ((1 << EROFS_I_ALL_BIT) - 1)
+
+/* indicate chunk blkbits, thus 'chunksize = blocksize << chunk blkbits' */
+#define EROFS_CHUNK_FORMAT_BLKBITS_MASK 0x001F
+/* with chunk indexes or just a 4-byte blkaddr array */
+#define EROFS_CHUNK_FORMAT_INDEXES 0x0020
+
+#define EROFS_CHUNK_FORMAT_ALL \
+ (EROFS_CHUNK_FORMAT_BLKBITS_MASK | EROFS_CHUNK_FORMAT_INDEXES)
+
+/* 32-byte on-disk inode */
+#define EROFS_INODE_LAYOUT_COMPACT 0
+/* 64-byte on-disk inode */
+#define EROFS_INODE_LAYOUT_EXTENDED 1
+
+struct erofs_inode_chunk_info {
+ __le16 format; /* chunk blkbits, etc. */
+ __le16 reserved;
+};
+
+union erofs_inode_i_u {
+ /* total compressed blocks for compressed inodes */
+ __le32 compressed_blocks;
+
+ /* block address for uncompressed flat inodes */
+ __le32 raw_blkaddr;
+
+ /* for device files, used to indicate old/new device # */
+ __le32 rdev;
+
+ /* for chunk-based files, it contains the summary info */
+ struct erofs_inode_chunk_info c;
+};
+
+/* 32-byte reduced form of an ondisk inode */
+struct erofs_inode_compact {
+ __le16 i_format; /* inode format hints */
+
+/* 1 header + n-1 * 4 bytes inline xattr to keep continuity */
+ __le16 i_xattr_icount;
+ __le16 i_mode;
+ __le16 i_nlink;
+ __le32 i_size;
+ __le32 i_reserved;
+ union erofs_inode_i_u i_u;
+
+ __le32 i_ino; /* only used for 32-bit stat compatibility */
+ __le16 i_uid;
+ __le16 i_gid;
+ __le32 i_reserved2;
+};
+
+/* 64-byte complete form of an ondisk inode */
+struct erofs_inode_extended {
+ __le16 i_format; /* inode format hints */
+
+/* 1 header + n-1 * 4 bytes inline xattr to keep continuity */
+ __le16 i_xattr_icount;
+ __le16 i_mode;
+ __le16 i_reserved;
+ __le64 i_size;
+ union erofs_inode_i_u i_u;
+
+ __le32 i_ino; /* only used for 32-bit stat compatibility */
+ __le32 i_uid;
+ __le32 i_gid;
+ __le64 i_mtime;
+ __le32 i_mtime_nsec;
+ __le32 i_nlink;
+ __u8 i_reserved2[16];
+};
+
+/*
+ * inline xattrs (n == i_xattr_icount):
+ *    erofs_xattr_ibody_header(1) + (n - 1) * 4 bytes
+ *          12 bytes           /                   \
+ *                            /                     \
+ *                           /-----------------------\
+ *                           |  erofs_xattr_entries+ |
+ *                           +-----------------------+
+ * inline xattrs must start in erofs_xattr_ibody_header;
+ * for a read-only fs, there is no need to introduce h_refcount
+ */
+struct erofs_xattr_ibody_header {
+ __le32 h_name_filter; /* bit value 1 indicates not-present */
+ __u8 h_shared_count;
+ __u8 h_reserved2[7];
+ __le32 h_shared_xattrs[]; /* shared xattr id array */
+};
+
+/* Name indexes */
+#define EROFS_XATTR_INDEX_USER 1
+#define EROFS_XATTR_INDEX_POSIX_ACL_ACCESS 2
+#define EROFS_XATTR_INDEX_POSIX_ACL_DEFAULT 3
+#define EROFS_XATTR_INDEX_TRUSTED 4
+#define EROFS_XATTR_INDEX_LUSTRE 5
+#define EROFS_XATTR_INDEX_SECURITY 6
+
+/*
+ * bit 7 of e_name_index is set when it refers to a long xattr name prefix,
+ * while the remaining lower bits represent the index of the prefix.
+ */
+#define EROFS_XATTR_LONG_PREFIX 0x80
+#define EROFS_XATTR_LONG_PREFIX_MASK 0x7f
+
+#define EROFS_XATTR_FILTER_BITS 32
+#define EROFS_XATTR_FILTER_DEFAULT UINT32_MAX
+#define EROFS_XATTR_FILTER_SEED 0x25BBE08F
+
+/* xattr entry (for both inline & shared xattrs) */
+struct erofs_xattr_entry {
+ __u8 e_name_len; /* length of name */
+ __u8 e_name_index; /* attribute name index */
+ __le16 e_value_size; /* size of attribute value */
+ /* followed by e_name and e_value */
+ char e_name[]; /* attribute name */
+};
+
+/* long xattr name prefix */
+struct erofs_xattr_long_prefix {
+ __u8 base_index; /* short xattr name prefix index */
+ char infix[]; /* infix apart from short prefix */
+};
+
+static inline unsigned int erofs_xattr_ibody_size(__le16 i_xattr_icount)
+{
+ if (!i_xattr_icount)
+ return 0;
+
+ return sizeof(struct erofs_xattr_ibody_header) +
+ sizeof(__u32) * (le16_to_cpu(i_xattr_icount) - 1);
+}
+
+#define EROFS_XATTR_ALIGN(size) round_up(size, sizeof(struct erofs_xattr_entry))
+
+static inline unsigned int erofs_xattr_entry_size(struct erofs_xattr_entry *e)
+{
+ return EROFS_XATTR_ALIGN(sizeof(struct erofs_xattr_entry) +
+ e->e_name_len + le16_to_cpu(e->e_value_size));
+}
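+
+/*
+ * Worked example for the two helpers above (illustrative only):
+ *   i_xattr_icount == 4  =>  ibody size = 12 + 4 * (4 - 1) = 24 bytes;
+ *   an entry with e_name_len == 8 and e_value_size == 5
+ *     =>  4 + 8 + 5 = 17, rounded up to 20 bytes (4-byte alignment).
+ */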
+
+/* represent a zeroed chunk (hole) */
+#define EROFS_NULL_ADDR -1
+
+/* 4-byte block address array */
+#define EROFS_BLOCK_MAP_ENTRY_SIZE sizeof(__le32)
+
+/* 8-byte inode chunk indexes */
+struct erofs_inode_chunk_index {
+ __le16 advise; /* always 0, don't care for now */
+ __le16 device_id; /* back-end storage id (with bits masked) */
+ __le32 blkaddr; /* start block address of this inode chunk */
+};
+
+/* dirent sorts in alphabet order, thus we can do binary search */
+struct erofs_dirent {
+ __le64 nid; /* node number */
+ __le16 nameoff; /* start offset of file name */
+ __u8 file_type; /* file type */
+ __u8 reserved; /* reserved */
+} __packed;
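+
+/*
+ * Illustrative note on the directory block layout (see the lookup code in
+ * namei.c for the actual parsing): dirents are packed at the start of each
+ * block and the names follow, so the nameoff of the very first dirent divided
+ * by sizeof(struct erofs_dirent) (12 bytes) yields the number of dirents in
+ * that block.
+ */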
+
+/*
+ * EROFS file types should match the generic FT_* types; there seems to be
+ * no need to add BUILD_BUG_ONs since any potential mismatch would break
+ * other filesystems as well...
+ */
+
+#define EROFS_NAME_LEN 255
+
+/* maximum supported size of a physical compression cluster */
+#define Z_EROFS_PCLUSTER_MAX_SIZE (1024 * 1024)
+
+/* available compression algorithm types (for h_algorithmtype) */
+enum {
+ Z_EROFS_COMPRESSION_LZ4 = 0,
+ Z_EROFS_COMPRESSION_LZMA = 1,
+ Z_EROFS_COMPRESSION_DEFLATE = 2,
+ Z_EROFS_COMPRESSION_MAX
+};
+#define Z_EROFS_ALL_COMPR_ALGS ((1 << Z_EROFS_COMPRESSION_MAX) - 1)
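+
+/*
+ * Illustrative example: the superblock's u1.available_compr_algs is a bitmap
+ * indexed by the values above, so 0x5 (bits 0 and 2) would mean LZ4 and
+ * DEFLATE are in use, and Z_EROFS_ALL_COMPR_ALGS currently equals 0x7.
+ */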
+
+/* 14 bytes (+ length field = 16 bytes) */
+struct z_erofs_lz4_cfgs {
+ __le16 max_distance;
+ __le16 max_pclusterblks;
+ u8 reserved[10];
+} __packed;
+
+/* 14 bytes (+ length field = 16 bytes) */
+struct z_erofs_lzma_cfgs {
+ __le32 dict_size;
+ __le16 format;
+ u8 reserved[8];
+} __packed;
+
+#define Z_EROFS_LZMA_MAX_DICT_SIZE (8 * Z_EROFS_PCLUSTER_MAX_SIZE)
+
+/* 6 bytes (+ length field = 8 bytes) */
+struct z_erofs_deflate_cfgs {
+ u8 windowbits; /* 8..15 for DEFLATE */
+ u8 reserved[5];
+} __packed;
+
+/*
+ * bit 0 : COMPACTED_2B indexes (0 - off; 1 - on)
+ * e.g. for 4k logical cluster size, 4B if compacted 2B is off;
+ * (4B) + 2B + (4B) if compacted 2B is on.
+ * bit 1 : HEAD1 big pcluster (0 - off; 1 - on)
+ * bit 2 : HEAD2 big pcluster (0 - off; 1 - on)
+ * bit 3 : tailpacking inline pcluster (0 - off; 1 - on)
+ * bit 4 : interlaced plain pcluster (0 - off; 1 - on)
+ * bit 5 : fragment pcluster (0 - off; 1 - on)
+ */
+#define Z_EROFS_ADVISE_COMPACTED_2B 0x0001
+#define Z_EROFS_ADVISE_BIG_PCLUSTER_1 0x0002
+#define Z_EROFS_ADVISE_BIG_PCLUSTER_2 0x0004
+#define Z_EROFS_ADVISE_INLINE_PCLUSTER 0x0008
+#define Z_EROFS_ADVISE_INTERLACED_PCLUSTER 0x0010
+#define Z_EROFS_ADVISE_FRAGMENT_PCLUSTER 0x0020
+
+#define Z_EROFS_FRAGMENT_INODE_BIT 7
+struct z_erofs_map_header {
+ union {
+ /* fragment data offset in the packed inode */
+ __le32 h_fragmentoff;
+ struct {
+ __le16 h_reserved1;
+ /* indicates the encoded size of tailpacking data */
+ __le16 h_idata_size;
+ };
+ };
+ __le16 h_advise;
+ /*
+ * bit 0-3 : algorithm type of head 1 (logical cluster type 01);
+ * bit 4-7 : algorithm type of head 2 (logical cluster type 11).
+ */
+ __u8 h_algorithmtype;
+ /*
+ * bit 0-2 : logical cluster bits - 12, e.g. 0 for 4096;
+ * bit 3-6 : reserved;
+ * bit 7 : move the whole file into packed inode or not.
+ */
+ __u8 h_clusterbits;
+};
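+
+/*
+ * Decoding sketch for the fields above (illustrative only; the authoritative
+ * parsing lives in zmap.c):
+ *   algorithm of HEAD1 lclusters = h_algorithmtype & 0xf;
+ *   algorithm of HEAD2 lclusters = h_algorithmtype >> 4;
+ *   lclusterbits                 = 12 + (h_clusterbits & 7);
+ *   whole file in packed inode   = h_clusterbits >> Z_EROFS_FRAGMENT_INODE_BIT;
+ */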
+
+/*
+ * On-disk logical cluster type:
+ * 0 - literal (uncompressed) lcluster
+ * 1,3 - compressed lcluster (for HEAD lclusters)
+ * 2 - compressed lcluster (for NONHEAD lclusters)
+ *
+ * In detail,
+ * 0 - literal (uncompressed) lcluster,
+ * di_advise = 0
+ * di_clusterofs = the literal data offset of the lcluster
+ * di_blkaddr = the blkaddr of the literal pcluster
+ *
+ * 1,3 - compressed lcluster (for HEAD lclusters)
+ * di_advise = 1 or 3
+ * di_clusterofs = the decompressed data offset of the lcluster
+ * di_blkaddr = the blkaddr of the compressed pcluster
+ *
+ * 2 - compressed lcluster (for NONHEAD lclusters)
+ * di_advise = 2
+ * di_clusterofs =
+ * the decompressed data offset in its own HEAD lcluster
+ * di_u.delta[0] = distance to this HEAD lcluster
+ * di_u.delta[1] = distance to the next HEAD lcluster
+ */
+enum {
+ Z_EROFS_LCLUSTER_TYPE_PLAIN = 0,
+ Z_EROFS_LCLUSTER_TYPE_HEAD1 = 1,
+ Z_EROFS_LCLUSTER_TYPE_NONHEAD = 2,
+ Z_EROFS_LCLUSTER_TYPE_HEAD2 = 3,
+ Z_EROFS_LCLUSTER_TYPE_MAX
+};
+
+#define Z_EROFS_LI_LCLUSTER_TYPE_BITS 2
+#define Z_EROFS_LI_LCLUSTER_TYPE_BIT 0
+
+/* (noncompact only, HEAD) This pcluster refers to partial decompressed data */
+#define Z_EROFS_LI_PARTIAL_REF (1 << 15)
+
+/*
+ * D0_CBLKCNT will be marked _only_ at the 1st non-head lcluster to store the
+ * compressed block count of a compressed extent (in logical clusters, i.e.
+ * the block count of a pcluster).
+ */
+#define Z_EROFS_LI_D0_CBLKCNT (1 << 11)
+
+struct z_erofs_lcluster_index {
+ __le16 di_advise;
+ /* where to decompress in the head lcluster */
+ __le16 di_clusterofs;
+
+ union {
+ /* for the HEAD lclusters */
+ __le32 blkaddr;
+ /*
+ * for the NONHEAD lclusters
+ * [0] - distance to its HEAD lcluster
+ * [1] - distance to the next HEAD lcluster
+ */
+ __le16 delta[2];
+ } di_u;
+};
+
+#define Z_EROFS_FULL_INDEX_ALIGN(end) \
+ (ALIGN(end, 8) + sizeof(struct z_erofs_map_header) + 8)
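+
+/*
+ * Worked example (illustrative): with end == 100, the full (non-compact)
+ * index area starts at ALIGN(100, 8) + 8 + 8 = 104 + 16 = 120.
+ */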
+
+/* check the EROFS on-disk layout strictly at compile time */
+static inline void erofs_check_ondisk_layout_definitions(void)
+{
+ const __le64 fmh = *(__le64 *)&(struct z_erofs_map_header) {
+ .h_clusterbits = 1 << Z_EROFS_FRAGMENT_INODE_BIT
+ };
+
+ BUILD_BUG_ON(sizeof(struct erofs_super_block) != 128);
+ BUILD_BUG_ON(sizeof(struct erofs_inode_compact) != 32);
+ BUILD_BUG_ON(sizeof(struct erofs_inode_extended) != 64);
+ BUILD_BUG_ON(sizeof(struct erofs_xattr_ibody_header) != 12);
+ BUILD_BUG_ON(sizeof(struct erofs_xattr_entry) != 4);
+ BUILD_BUG_ON(sizeof(struct erofs_inode_chunk_info) != 4);
+ BUILD_BUG_ON(sizeof(struct erofs_inode_chunk_index) != 8);
+ BUILD_BUG_ON(sizeof(struct z_erofs_map_header) != 8);
+ BUILD_BUG_ON(sizeof(struct z_erofs_lcluster_index) != 8);
+ BUILD_BUG_ON(sizeof(struct erofs_dirent) != 12);
+ /* keep the two index structures in sync for better extensibility */
+ BUILD_BUG_ON(sizeof(struct erofs_inode_chunk_index) !=
+ sizeof(struct z_erofs_lcluster_index));
+ BUILD_BUG_ON(sizeof(struct erofs_deviceslot) != 128);
+
+ BUILD_BUG_ON(BIT(Z_EROFS_LI_LCLUSTER_TYPE_BITS) <
+ Z_EROFS_LCLUSTER_TYPE_MAX - 1);
+ /* exclude old compiler versions like gcc 7.5.0 */
+ BUILD_BUG_ON(__builtin_constant_p(fmh) ?
+ fmh != cpu_to_le64(1ULL << 63) : 0);
+}
+
+#endif
diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c
new file mode 100644
index 000000000..87ff35bff
--- /dev/null
+++ b/fs/erofs/fscache.c
@@ -0,0 +1,612 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2022, Alibaba Cloud
+ * Copyright (C) 2022, Bytedance Inc. All rights reserved.
+ */
+#include <linux/fscache.h>
+#include "internal.h"
+
+static DEFINE_MUTEX(erofs_domain_list_lock);
+static DEFINE_MUTEX(erofs_domain_cookies_lock);
+static LIST_HEAD(erofs_domain_list);
+static LIST_HEAD(erofs_domain_cookies_list);
+static struct vfsmount *erofs_pseudo_mnt;
+
+struct erofs_fscache_request {
+ struct erofs_fscache_request *primary;
+ struct netfs_cache_resources cache_resources;
+ struct address_space *mapping; /* The mapping being accessed */
+ loff_t start; /* Start position */
+ size_t len; /* Length of the request */
+ size_t submitted; /* Length submitted so far */
+ short error; /* 0 or error that occurred */
+ refcount_t ref;
+};
+
+static struct erofs_fscache_request *erofs_fscache_req_alloc(struct address_space *mapping,
+ loff_t start, size_t len)
+{
+ struct erofs_fscache_request *req;
+
+ req = kzalloc(sizeof(struct erofs_fscache_request), GFP_KERNEL);
+ if (!req)
+ return ERR_PTR(-ENOMEM);
+
+ req->mapping = mapping;
+ req->start = start;
+ req->len = len;
+ refcount_set(&req->ref, 1);
+
+ return req;
+}
+
+static struct erofs_fscache_request *erofs_fscache_req_chain(struct erofs_fscache_request *primary,
+ size_t len)
+{
+ struct erofs_fscache_request *req;
+
+ /* use primary request for the first submission */
+ if (!primary->submitted) {
+ refcount_inc(&primary->ref);
+ return primary;
+ }
+
+ req = erofs_fscache_req_alloc(primary->mapping,
+ primary->start + primary->submitted, len);
+ if (!IS_ERR(req)) {
+ req->primary = primary;
+ refcount_inc(&primary->ref);
+ }
+ return req;
+}
+
+static void erofs_fscache_req_complete(struct erofs_fscache_request *req)
+{
+ struct folio *folio;
+ bool failed = req->error;
+ pgoff_t start_page = req->start / PAGE_SIZE;
+ pgoff_t last_page = ((req->start + req->len) / PAGE_SIZE) - 1;
+
+ XA_STATE(xas, &req->mapping->i_pages, start_page);
+
+ rcu_read_lock();
+ xas_for_each(&xas, folio, last_page) {
+ if (xas_retry(&xas, folio))
+ continue;
+ if (!failed)
+ folio_mark_uptodate(folio);
+ folio_unlock(folio);
+ }
+ rcu_read_unlock();
+}
+
+static void erofs_fscache_req_put(struct erofs_fscache_request *req)
+{
+ if (refcount_dec_and_test(&req->ref)) {
+ if (req->cache_resources.ops)
+ req->cache_resources.ops->end_operation(&req->cache_resources);
+ if (!req->primary)
+ erofs_fscache_req_complete(req);
+ else
+ erofs_fscache_req_put(req->primary);
+ kfree(req);
+ }
+}
+
+static void erofs_fscache_subreq_complete(void *priv,
+ ssize_t transferred_or_error, bool was_async)
+{
+ struct erofs_fscache_request *req = priv;
+
+ if (IS_ERR_VALUE(transferred_or_error)) {
+ if (req->primary)
+ req->primary->error = transferred_or_error;
+ else
+ req->error = transferred_or_error;
+ }
+ erofs_fscache_req_put(req);
+}
+
+/*
+ * Read data from fscache (cookie, pstart, len), and fill the read data into
+ * page cache described by (req->mapping, lstart, len). @pstart describes the
+ * start physical address in the cache file.
+ */
+static int erofs_fscache_read_folios_async(struct fscache_cookie *cookie,
+ struct erofs_fscache_request *req, loff_t pstart, size_t len)
+{
+ enum netfs_io_source source;
+ struct super_block *sb = req->mapping->host->i_sb;
+ struct netfs_cache_resources *cres = &req->cache_resources;
+ struct iov_iter iter;
+ loff_t lstart = req->start + req->submitted;
+ size_t done = 0;
+ int ret;
+
+ DBG_BUGON(len > req->len - req->submitted);
+
+ ret = fscache_begin_read_operation(cres, cookie);
+ if (ret)
+ return ret;
+
+ while (done < len) {
+ loff_t sstart = pstart + done;
+ size_t slen = len - done;
+ unsigned long flags = 1 << NETFS_SREQ_ONDEMAND;
+
+ source = cres->ops->prepare_ondemand_read(cres,
+ sstart, &slen, LLONG_MAX, &flags, 0);
+ if (WARN_ON(slen == 0))
+ source = NETFS_INVALID_READ;
+ if (source != NETFS_READ_FROM_CACHE) {
+ erofs_err(sb, "failed to fscache prepare_read (source %d)", source);
+ return -EIO;
+ }
+
+ refcount_inc(&req->ref);
+ iov_iter_xarray(&iter, ITER_DEST, &req->mapping->i_pages,
+ lstart + done, slen);
+
+ ret = fscache_read(cres, sstart, &iter, NETFS_READ_HOLE_FAIL,
+ erofs_fscache_subreq_complete, req);
+ if (ret == -EIOCBQUEUED)
+ ret = 0;
+ if (ret) {
+ erofs_err(sb, "failed to fscache_read (ret %d)", ret);
+ return ret;
+ }
+
+ done += slen;
+ }
+ DBG_BUGON(done != len);
+ return 0;
+}
+
+static int erofs_fscache_meta_read_folio(struct file *data, struct folio *folio)
+{
+ int ret;
+ struct erofs_fscache *ctx = folio_mapping(folio)->host->i_private;
+ struct erofs_fscache_request *req;
+
+ req = erofs_fscache_req_alloc(folio_mapping(folio),
+ folio_pos(folio), folio_size(folio));
+ if (IS_ERR(req)) {
+ folio_unlock(folio);
+ return PTR_ERR(req);
+ }
+
+ ret = erofs_fscache_read_folios_async(ctx->cookie, req,
+ folio_pos(folio), folio_size(folio));
+ if (ret)
+ req->error = ret;
+
+ erofs_fscache_req_put(req);
+ return ret;
+}
+
+static int erofs_fscache_data_read_slice(struct erofs_fscache_request *primary)
+{
+ struct address_space *mapping = primary->mapping;
+ struct inode *inode = mapping->host;
+ struct super_block *sb = inode->i_sb;
+ struct erofs_fscache_request *req;
+ struct erofs_map_blocks map;
+ struct erofs_map_dev mdev;
+ struct iov_iter iter;
+ loff_t pos = primary->start + primary->submitted;
+ size_t count;
+ int ret;
+
+ map.m_la = pos;
+ ret = erofs_map_blocks(inode, &map);
+ if (ret)
+ return ret;
+
+ if (map.m_flags & EROFS_MAP_META) {
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
+ erofs_blk_t blknr;
+ size_t offset, size;
+ void *src;
+
+ /* For tail packing layout, the offset may be non-zero. */
+ offset = erofs_blkoff(sb, map.m_pa);
+ blknr = erofs_blknr(sb, map.m_pa);
+ size = map.m_llen;
+
+ src = erofs_read_metabuf(&buf, sb, blknr, EROFS_KMAP);
+ if (IS_ERR(src))
+ return PTR_ERR(src);
+
+ iov_iter_xarray(&iter, ITER_DEST, &mapping->i_pages, pos, PAGE_SIZE);
+ if (copy_to_iter(src + offset, size, &iter) != size) {
+ erofs_put_metabuf(&buf);
+ return -EFAULT;
+ }
+ iov_iter_zero(PAGE_SIZE - size, &iter);
+ erofs_put_metabuf(&buf);
+ primary->submitted += PAGE_SIZE;
+ return 0;
+ }
+
+ count = primary->len - primary->submitted;
+ if (!(map.m_flags & EROFS_MAP_MAPPED)) {
+ iov_iter_xarray(&iter, ITER_DEST, &mapping->i_pages, pos, count);
+ iov_iter_zero(count, &iter);
+ primary->submitted += count;
+ return 0;
+ }
+
+ count = min_t(size_t, map.m_llen - (pos - map.m_la), count);
+ DBG_BUGON(!count || count % PAGE_SIZE);
+
+ mdev = (struct erofs_map_dev) {
+ .m_deviceid = map.m_deviceid,
+ .m_pa = map.m_pa,
+ };
+ ret = erofs_map_dev(sb, &mdev);
+ if (ret)
+ return ret;
+
+ req = erofs_fscache_req_chain(primary, count);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ ret = erofs_fscache_read_folios_async(mdev.m_fscache->cookie,
+ req, mdev.m_pa + (pos - map.m_la), count);
+ erofs_fscache_req_put(req);
+ primary->submitted += count;
+ return ret;
+}
+
+static int erofs_fscache_data_read(struct erofs_fscache_request *req)
+{
+ int ret;
+
+ do {
+ ret = erofs_fscache_data_read_slice(req);
+ if (ret)
+ req->error = ret;
+ } while (!ret && req->submitted < req->len);
+
+ return ret;
+}
+
+static int erofs_fscache_read_folio(struct file *file, struct folio *folio)
+{
+ struct erofs_fscache_request *req;
+ int ret;
+
+ req = erofs_fscache_req_alloc(folio_mapping(folio),
+ folio_pos(folio), folio_size(folio));
+ if (IS_ERR(req)) {
+ folio_unlock(folio);
+ return PTR_ERR(req);
+ }
+
+ ret = erofs_fscache_data_read(req);
+ erofs_fscache_req_put(req);
+ return ret;
+}
+
+static void erofs_fscache_readahead(struct readahead_control *rac)
+{
+ struct erofs_fscache_request *req;
+
+ if (!readahead_count(rac))
+ return;
+
+ req = erofs_fscache_req_alloc(rac->mapping,
+ readahead_pos(rac), readahead_length(rac));
+ if (IS_ERR(req))
+ return;
+
+ /* The request completion will drop refs on the folios. */
+ while (readahead_folio(rac))
+ ;
+
+ erofs_fscache_data_read(req);
+ erofs_fscache_req_put(req);
+}
+
+static const struct address_space_operations erofs_fscache_meta_aops = {
+ .read_folio = erofs_fscache_meta_read_folio,
+};
+
+const struct address_space_operations erofs_fscache_access_aops = {
+ .read_folio = erofs_fscache_read_folio,
+ .readahead = erofs_fscache_readahead,
+};
+
+static void erofs_fscache_domain_put(struct erofs_domain *domain)
+{
+ mutex_lock(&erofs_domain_list_lock);
+ if (refcount_dec_and_test(&domain->ref)) {
+ list_del(&domain->list);
+ if (list_empty(&erofs_domain_list)) {
+ kern_unmount(erofs_pseudo_mnt);
+ erofs_pseudo_mnt = NULL;
+ }
+ fscache_relinquish_volume(domain->volume, NULL, false);
+ mutex_unlock(&erofs_domain_list_lock);
+ kfree(domain->domain_id);
+ kfree(domain);
+ return;
+ }
+ mutex_unlock(&erofs_domain_list_lock);
+}
+
+static int erofs_fscache_register_volume(struct super_block *sb)
+{
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+ char *domain_id = sbi->domain_id;
+ struct fscache_volume *volume;
+ char *name;
+ int ret = 0;
+
+ name = kasprintf(GFP_KERNEL, "erofs,%s",
+ domain_id ? domain_id : sbi->fsid);
+ if (!name)
+ return -ENOMEM;
+
+ volume = fscache_acquire_volume(name, NULL, NULL, 0);
+ if (IS_ERR_OR_NULL(volume)) {
+ erofs_err(sb, "failed to register volume for %s", name);
+ ret = volume ? PTR_ERR(volume) : -EOPNOTSUPP;
+ volume = NULL;
+ }
+
+ sbi->volume = volume;
+ kfree(name);
+ return ret;
+}
+
+static int erofs_fscache_init_domain(struct super_block *sb)
+{
+ int err;
+ struct erofs_domain *domain;
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+
+ domain = kzalloc(sizeof(struct erofs_domain), GFP_KERNEL);
+ if (!domain)
+ return -ENOMEM;
+
+ domain->domain_id = kstrdup(sbi->domain_id, GFP_KERNEL);
+ if (!domain->domain_id) {
+ kfree(domain);
+ return -ENOMEM;
+ }
+
+ err = erofs_fscache_register_volume(sb);
+ if (err)
+ goto out;
+
+ if (!erofs_pseudo_mnt) {
+ erofs_pseudo_mnt = kern_mount(&erofs_fs_type);
+ if (IS_ERR(erofs_pseudo_mnt)) {
+ err = PTR_ERR(erofs_pseudo_mnt);
+ goto out;
+ }
+ }
+
+ domain->volume = sbi->volume;
+ refcount_set(&domain->ref, 1);
+ list_add(&domain->list, &erofs_domain_list);
+ sbi->domain = domain;
+ return 0;
+out:
+ kfree(domain->domain_id);
+ kfree(domain);
+ return err;
+}
+
+static int erofs_fscache_register_domain(struct super_block *sb)
+{
+ int err;
+ struct erofs_domain *domain;
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+
+ mutex_lock(&erofs_domain_list_lock);
+ list_for_each_entry(domain, &erofs_domain_list, list) {
+ if (!strcmp(domain->domain_id, sbi->domain_id)) {
+ sbi->domain = domain;
+ sbi->volume = domain->volume;
+ refcount_inc(&domain->ref);
+ mutex_unlock(&erofs_domain_list_lock);
+ return 0;
+ }
+ }
+ err = erofs_fscache_init_domain(sb);
+ mutex_unlock(&erofs_domain_list_lock);
+ return err;
+}
+
+static struct erofs_fscache *erofs_fscache_acquire_cookie(struct super_block *sb,
+ char *name, unsigned int flags)
+{
+ struct fscache_volume *volume = EROFS_SB(sb)->volume;
+ struct erofs_fscache *ctx;
+ struct fscache_cookie *cookie;
+ struct super_block *isb;
+ struct inode *inode;
+ int ret;
+
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return ERR_PTR(-ENOMEM);
+ INIT_LIST_HEAD(&ctx->node);
+ refcount_set(&ctx->ref, 1);
+
+ cookie = fscache_acquire_cookie(volume, FSCACHE_ADV_WANT_CACHE_SIZE,
+ name, strlen(name), NULL, 0, 0);
+ if (!cookie) {
+ erofs_err(sb, "failed to get cookie for %s", name);
+ ret = -EINVAL;
+ goto err;
+ }
+ fscache_use_cookie(cookie, false);
+
+ /*
+ * Allocate an anonymous inode in the global pseudo mount for shareable
+ * blobs, so that they are accessible across erofs fs instances.
+ */
+ isb = flags & EROFS_REG_COOKIE_SHARE ? erofs_pseudo_mnt->mnt_sb : sb;
+ inode = new_inode(isb);
+ if (!inode) {
+ erofs_err(sb, "failed to get anon inode for %s", name);
+ ret = -ENOMEM;
+ goto err_cookie;
+ }
+
+ inode->i_size = OFFSET_MAX;
+ inode->i_mapping->a_ops = &erofs_fscache_meta_aops;
+ mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
+ inode->i_blkbits = EROFS_SB(sb)->blkszbits;
+ inode->i_private = ctx;
+
+ ctx->cookie = cookie;
+ ctx->inode = inode;
+ return ctx;
+
+err_cookie:
+ fscache_unuse_cookie(cookie, NULL, NULL);
+ fscache_relinquish_cookie(cookie, false);
+err:
+ kfree(ctx);
+ return ERR_PTR(ret);
+}
+
+static void erofs_fscache_relinquish_cookie(struct erofs_fscache *ctx)
+{
+ fscache_unuse_cookie(ctx->cookie, NULL, NULL);
+ fscache_relinquish_cookie(ctx->cookie, false);
+ iput(ctx->inode);
+ kfree(ctx->name);
+ kfree(ctx);
+}
+
+static struct erofs_fscache *erofs_domain_init_cookie(struct super_block *sb,
+ char *name, unsigned int flags)
+{
+ struct erofs_fscache *ctx;
+ struct erofs_domain *domain = EROFS_SB(sb)->domain;
+
+ ctx = erofs_fscache_acquire_cookie(sb, name, flags);
+ if (IS_ERR(ctx))
+ return ctx;
+
+ ctx->name = kstrdup(name, GFP_KERNEL);
+ if (!ctx->name) {
+ erofs_fscache_relinquish_cookie(ctx);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ refcount_inc(&domain->ref);
+ ctx->domain = domain;
+ list_add(&ctx->node, &erofs_domain_cookies_list);
+ return ctx;
+}
+
+static struct erofs_fscache *erofs_domain_register_cookie(struct super_block *sb,
+ char *name, unsigned int flags)
+{
+ struct erofs_fscache *ctx;
+ struct erofs_domain *domain = EROFS_SB(sb)->domain;
+
+ flags |= EROFS_REG_COOKIE_SHARE;
+ mutex_lock(&erofs_domain_cookies_lock);
+ list_for_each_entry(ctx, &erofs_domain_cookies_list, node) {
+ if (ctx->domain != domain || strcmp(ctx->name, name))
+ continue;
+ if (!(flags & EROFS_REG_COOKIE_NEED_NOEXIST)) {
+ refcount_inc(&ctx->ref);
+ } else {
+ erofs_err(sb, "%s already exists in domain %s", name,
+ domain->domain_id);
+ ctx = ERR_PTR(-EEXIST);
+ }
+ mutex_unlock(&erofs_domain_cookies_lock);
+ return ctx;
+ }
+ ctx = erofs_domain_init_cookie(sb, name, flags);
+ mutex_unlock(&erofs_domain_cookies_lock);
+ return ctx;
+}
+
+struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb,
+ char *name,
+ unsigned int flags)
+{
+ if (EROFS_SB(sb)->domain_id)
+ return erofs_domain_register_cookie(sb, name, flags);
+ return erofs_fscache_acquire_cookie(sb, name, flags);
+}
+
+void erofs_fscache_unregister_cookie(struct erofs_fscache *ctx)
+{
+ struct erofs_domain *domain = NULL;
+
+ if (!ctx)
+ return;
+ if (!ctx->domain)
+ return erofs_fscache_relinquish_cookie(ctx);
+
+ mutex_lock(&erofs_domain_cookies_lock);
+ if (refcount_dec_and_test(&ctx->ref)) {
+ domain = ctx->domain;
+ list_del(&ctx->node);
+ erofs_fscache_relinquish_cookie(ctx);
+ }
+ mutex_unlock(&erofs_domain_cookies_lock);
+ if (domain)
+ erofs_fscache_domain_put(domain);
+}
+
+int erofs_fscache_register_fs(struct super_block *sb)
+{
+ int ret;
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+ struct erofs_fscache *fscache;
+ unsigned int flags = 0;
+
+ if (sbi->domain_id)
+ ret = erofs_fscache_register_domain(sb);
+ else
+ ret = erofs_fscache_register_volume(sb);
+ if (ret)
+ return ret;
+
+ /*
+ * When the shared domain is enabled, use NEED_NOEXIST to guarantee
+ * that the primary data blob (aka fsid) is unique in the shared domain.
+ *
+ * For the non-shared-domain case, fscache_acquire_volume() invoked by
+ * erofs_fscache_register_volume() has already guaranteed the
+ * uniqueness of the primary data blob.
+ *
+ * The acquired domain/volume will be relinquished in kill_sb() on error.
+ */
+ if (sbi->domain_id)
+ flags |= EROFS_REG_COOKIE_NEED_NOEXIST;
+ fscache = erofs_fscache_register_cookie(sb, sbi->fsid, flags);
+ if (IS_ERR(fscache))
+ return PTR_ERR(fscache);
+
+ sbi->s_fscache = fscache;
+ return 0;
+}
+
+void erofs_fscache_unregister_fs(struct super_block *sb)
+{
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+
+ erofs_fscache_unregister_cookie(sbi->s_fscache);
+
+ if (sbi->domain)
+ erofs_fscache_domain_put(sbi->domain);
+ else
+ fscache_relinquish_volume(sbi->volume, NULL, false);
+
+ sbi->s_fscache = NULL;
+ sbi->volume = NULL;
+ sbi->domain = NULL;
+}
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
new file mode 100644
index 000000000..edc8ec758
--- /dev/null
+++ b/fs/erofs/inode.c
@@ -0,0 +1,395 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2017-2018 HUAWEI, Inc.
+ * https://www.huawei.com/
+ * Copyright (C) 2021, Alibaba Cloud
+ */
+#include "xattr.h"
+
+#include <trace/events/erofs.h>
+
+static void *erofs_read_inode(struct erofs_buf *buf,
+ struct inode *inode, unsigned int *ofs)
+{
+ struct super_block *sb = inode->i_sb;
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+ struct erofs_inode *vi = EROFS_I(inode);
+ const erofs_off_t inode_loc = erofs_iloc(inode);
+
+ erofs_blk_t blkaddr, nblks = 0;
+ void *kaddr;
+ struct erofs_inode_compact *dic;
+ struct erofs_inode_extended *die, *copied = NULL;
+ unsigned int ifmt;
+ int err;
+
+ blkaddr = erofs_blknr(sb, inode_loc);
+ *ofs = erofs_blkoff(sb, inode_loc);
+
+ kaddr = erofs_read_metabuf(buf, sb, blkaddr, EROFS_KMAP);
+ if (IS_ERR(kaddr)) {
+ erofs_err(sb, "failed to get inode (nid: %llu) page, err %ld",
+ vi->nid, PTR_ERR(kaddr));
+ return kaddr;
+ }
+
+ dic = kaddr + *ofs;
+ ifmt = le16_to_cpu(dic->i_format);
+
+ if (ifmt & ~EROFS_I_ALL) {
+ erofs_err(inode->i_sb, "unsupported i_format %u of nid %llu",
+ ifmt, vi->nid);
+ err = -EOPNOTSUPP;
+ goto err_out;
+ }
+
+ vi->datalayout = erofs_inode_datalayout(ifmt);
+ if (vi->datalayout >= EROFS_INODE_DATALAYOUT_MAX) {
+ erofs_err(inode->i_sb, "unsupported datalayout %u of nid %llu",
+ vi->datalayout, vi->nid);
+ err = -EOPNOTSUPP;
+ goto err_out;
+ }
+
+ switch (erofs_inode_version(ifmt)) {
+ case EROFS_INODE_LAYOUT_EXTENDED:
+ vi->inode_isize = sizeof(struct erofs_inode_extended);
+ /* check if the extended inode crosses the block boundary */
+ if (*ofs + vi->inode_isize <= sb->s_blocksize) {
+ *ofs += vi->inode_isize;
+ die = (struct erofs_inode_extended *)dic;
+ } else {
+ const unsigned int gotten = sb->s_blocksize - *ofs;
+
+ copied = kmalloc(vi->inode_isize, GFP_NOFS);
+ if (!copied) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+ memcpy(copied, dic, gotten);
+ kaddr = erofs_read_metabuf(buf, sb, blkaddr + 1,
+ EROFS_KMAP);
+ if (IS_ERR(kaddr)) {
+ erofs_err(sb, "failed to get inode payload block (nid: %llu), err %ld",
+ vi->nid, PTR_ERR(kaddr));
+ kfree(copied);
+ return kaddr;
+ }
+ *ofs = vi->inode_isize - gotten;
+ memcpy((u8 *)copied + gotten, kaddr, *ofs);
+ die = copied;
+ }
+ vi->xattr_isize = erofs_xattr_ibody_size(die->i_xattr_icount);
+
+ inode->i_mode = le16_to_cpu(die->i_mode);
+ switch (inode->i_mode & S_IFMT) {
+ case S_IFREG:
+ case S_IFDIR:
+ case S_IFLNK:
+ vi->raw_blkaddr = le32_to_cpu(die->i_u.raw_blkaddr);
+ break;
+ case S_IFCHR:
+ case S_IFBLK:
+ inode->i_rdev =
+ new_decode_dev(le32_to_cpu(die->i_u.rdev));
+ break;
+ case S_IFIFO:
+ case S_IFSOCK:
+ inode->i_rdev = 0;
+ break;
+ default:
+ goto bogusimode;
+ }
+ i_uid_write(inode, le32_to_cpu(die->i_uid));
+ i_gid_write(inode, le32_to_cpu(die->i_gid));
+ set_nlink(inode, le32_to_cpu(die->i_nlink));
+
+ /* extended inode has its own timestamp */
+ inode_set_ctime(inode, le64_to_cpu(die->i_mtime),
+ le32_to_cpu(die->i_mtime_nsec));
+
+ inode->i_size = le64_to_cpu(die->i_size);
+
+ /* total blocks for compressed files */
+ if (erofs_inode_is_data_compressed(vi->datalayout))
+ nblks = le32_to_cpu(die->i_u.compressed_blocks);
+ else if (vi->datalayout == EROFS_INODE_CHUNK_BASED)
+ /* fill chunked inode summary info */
+ vi->chunkformat = le16_to_cpu(die->i_u.c.format);
+ kfree(copied);
+ copied = NULL;
+ break;
+ case EROFS_INODE_LAYOUT_COMPACT:
+ vi->inode_isize = sizeof(struct erofs_inode_compact);
+ *ofs += vi->inode_isize;
+ vi->xattr_isize = erofs_xattr_ibody_size(dic->i_xattr_icount);
+
+ inode->i_mode = le16_to_cpu(dic->i_mode);
+ switch (inode->i_mode & S_IFMT) {
+ case S_IFREG:
+ case S_IFDIR:
+ case S_IFLNK:
+ vi->raw_blkaddr = le32_to_cpu(dic->i_u.raw_blkaddr);
+ break;
+ case S_IFCHR:
+ case S_IFBLK:
+ inode->i_rdev =
+ new_decode_dev(le32_to_cpu(dic->i_u.rdev));
+ break;
+ case S_IFIFO:
+ case S_IFSOCK:
+ inode->i_rdev = 0;
+ break;
+ default:
+ goto bogusimode;
+ }
+ i_uid_write(inode, le16_to_cpu(dic->i_uid));
+ i_gid_write(inode, le16_to_cpu(dic->i_gid));
+ set_nlink(inode, le16_to_cpu(dic->i_nlink));
+
+ /* use build time for compact inodes */
+ inode_set_ctime(inode, sbi->build_time, sbi->build_time_nsec);
+
+ inode->i_size = le32_to_cpu(dic->i_size);
+ if (erofs_inode_is_data_compressed(vi->datalayout))
+ nblks = le32_to_cpu(dic->i_u.compressed_blocks);
+ else if (vi->datalayout == EROFS_INODE_CHUNK_BASED)
+ vi->chunkformat = le16_to_cpu(dic->i_u.c.format);
+ break;
+ default:
+ erofs_err(inode->i_sb,
+ "unsupported on-disk inode version %u of nid %llu",
+ erofs_inode_version(ifmt), vi->nid);
+ err = -EOPNOTSUPP;
+ goto err_out;
+ }
+
+ if (vi->datalayout == EROFS_INODE_CHUNK_BASED) {
+ if (vi->chunkformat & ~EROFS_CHUNK_FORMAT_ALL) {
+ erofs_err(inode->i_sb,
+ "unsupported chunk format %x of nid %llu",
+ vi->chunkformat, vi->nid);
+ err = -EOPNOTSUPP;
+ goto err_out;
+ }
+ vi->chunkbits = sb->s_blocksize_bits +
+ (vi->chunkformat & EROFS_CHUNK_FORMAT_BLKBITS_MASK);
+ }
+ inode->i_mtime = inode->i_atime = inode_get_ctime(inode);
+
+ inode->i_flags &= ~S_DAX;
+ if (test_opt(&sbi->opt, DAX_ALWAYS) && S_ISREG(inode->i_mode) &&
+ (vi->datalayout == EROFS_INODE_FLAT_PLAIN ||
+ vi->datalayout == EROFS_INODE_CHUNK_BASED))
+ inode->i_flags |= S_DAX;
+
+ if (!nblks)
+ /* measure inode.i_blocks as generic filesystems do */
+ inode->i_blocks = round_up(inode->i_size, sb->s_blocksize) >> 9;
+ else
+ inode->i_blocks = nblks << (sb->s_blocksize_bits - 9);
+ return kaddr;
+
+bogusimode:
+ erofs_err(inode->i_sb, "bogus i_mode (%o) @ nid %llu",
+ inode->i_mode, vi->nid);
+ err = -EFSCORRUPTED;
+err_out:
+ DBG_BUGON(1);
+ kfree(copied);
+ erofs_put_metabuf(buf);
+ return ERR_PTR(err);
+}
+
+static int erofs_fill_symlink(struct inode *inode, void *kaddr,
+ unsigned int m_pofs)
+{
+ struct erofs_inode *vi = EROFS_I(inode);
+ unsigned int bsz = i_blocksize(inode);
+ char *lnk;
+
+ /* if it cannot be handled with the fast symlink scheme */
+ if (vi->datalayout != EROFS_INODE_FLAT_INLINE ||
+ inode->i_size >= bsz || inode->i_size < 0) {
+ inode->i_op = &erofs_symlink_iops;
+ return 0;
+ }
+
+ lnk = kmalloc(inode->i_size + 1, GFP_KERNEL);
+ if (!lnk)
+ return -ENOMEM;
+
+ m_pofs += vi->xattr_isize;
+ /* inline symlink data shouldn't cross block boundary */
+ if (m_pofs + inode->i_size > bsz) {
+ kfree(lnk);
+ erofs_err(inode->i_sb,
+ "inline data cross block boundary @ nid %llu",
+ vi->nid);
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
+ }
+ memcpy(lnk, kaddr + m_pofs, inode->i_size);
+ lnk[inode->i_size] = '\0';
+
+ inode->i_link = lnk;
+ inode->i_op = &erofs_fast_symlink_iops;
+ return 0;
+}
+
+static int erofs_fill_inode(struct inode *inode)
+{
+ struct erofs_inode *vi = EROFS_I(inode);
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
+ void *kaddr;
+ unsigned int ofs;
+ int err = 0;
+
+ trace_erofs_fill_inode(inode);
+
+ /* read inode base data from disk */
+ kaddr = erofs_read_inode(&buf, inode, &ofs);
+ if (IS_ERR(kaddr))
+ return PTR_ERR(kaddr);
+
+ /* setup the new inode */
+ switch (inode->i_mode & S_IFMT) {
+ case S_IFREG:
+ inode->i_op = &erofs_generic_iops;
+ if (erofs_inode_is_data_compressed(vi->datalayout))
+ inode->i_fop = &generic_ro_fops;
+ else
+ inode->i_fop = &erofs_file_fops;
+ break;
+ case S_IFDIR:
+ inode->i_op = &erofs_dir_iops;
+ inode->i_fop = &erofs_dir_fops;
+ inode_nohighmem(inode);
+ break;
+ case S_IFLNK:
+ err = erofs_fill_symlink(inode, kaddr, ofs);
+ if (err)
+ goto out_unlock;
+ inode_nohighmem(inode);
+ break;
+ case S_IFCHR:
+ case S_IFBLK:
+ case S_IFIFO:
+ case S_IFSOCK:
+ inode->i_op = &erofs_generic_iops;
+ init_special_inode(inode, inode->i_mode, inode->i_rdev);
+ goto out_unlock;
+ default:
+ err = -EFSCORRUPTED;
+ goto out_unlock;
+ }
+
+ if (erofs_inode_is_data_compressed(vi->datalayout)) {
+#ifdef CONFIG_EROFS_FS_ZIP
+ if (!erofs_is_fscache_mode(inode->i_sb) &&
+ inode->i_sb->s_blocksize_bits == PAGE_SHIFT) {
+ inode->i_mapping->a_ops = &z_erofs_aops;
+ err = 0;
+ goto out_unlock;
+ }
+#endif
+ err = -EOPNOTSUPP;
+ goto out_unlock;
+ }
+ inode->i_mapping->a_ops = &erofs_raw_access_aops;
+ mapping_set_large_folios(inode->i_mapping);
+#ifdef CONFIG_EROFS_FS_ONDEMAND
+ if (erofs_is_fscache_mode(inode->i_sb))
+ inode->i_mapping->a_ops = &erofs_fscache_access_aops;
+#endif
+
+out_unlock:
+ erofs_put_metabuf(&buf);
+ return err;
+}
+
+/*
+ * ino_t is 32 bits on a 32-bit arch. We have to squash the 64-bit value down
+ * so that it will fit.
+ */
+static ino_t erofs_squash_ino(erofs_nid_t nid)
+{
+ ino_t ino = (ino_t)nid;
+
+ if (sizeof(ino_t) < sizeof(erofs_nid_t))
+ ino ^= nid >> (sizeof(erofs_nid_t) - sizeof(ino_t)) * 8;
+ return ino;
+}
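+
+/*
+ * Illustrative example: on a 32-bit arch (sizeof(ino_t) == 4), a nid of
+ * 0x100000002a is squashed to 0x2a ^ 0x1 = 0x2b, folding the upper 32 bits
+ * into the lower ones; on 64-bit arches the nid is used as-is.
+ */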
+
+static int erofs_iget5_eq(struct inode *inode, void *opaque)
+{
+ return EROFS_I(inode)->nid == *(erofs_nid_t *)opaque;
+}
+
+static int erofs_iget5_set(struct inode *inode, void *opaque)
+{
+ const erofs_nid_t nid = *(erofs_nid_t *)opaque;
+
+ inode->i_ino = erofs_squash_ino(nid);
+ EROFS_I(inode)->nid = nid;
+ return 0;
+}
+
+struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid)
+{
+ struct inode *inode;
+
+ inode = iget5_locked(sb, erofs_squash_ino(nid), erofs_iget5_eq,
+ erofs_iget5_set, &nid);
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+
+ if (inode->i_state & I_NEW) {
+ int err = erofs_fill_inode(inode);
+
+ if (err) {
+ iget_failed(inode);
+ return ERR_PTR(err);
+ }
+ unlock_new_inode(inode);
+ }
+ return inode;
+}
+
+int erofs_getattr(struct mnt_idmap *idmap, const struct path *path,
+ struct kstat *stat, u32 request_mask,
+ unsigned int query_flags)
+{
+ struct inode *const inode = d_inode(path->dentry);
+
+ if (erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout))
+ stat->attributes |= STATX_ATTR_COMPRESSED;
+
+ stat->attributes |= STATX_ATTR_IMMUTABLE;
+ stat->attributes_mask |= (STATX_ATTR_COMPRESSED |
+ STATX_ATTR_IMMUTABLE);
+
+ generic_fillattr(idmap, request_mask, inode, stat);
+ return 0;
+}
+
+const struct inode_operations erofs_generic_iops = {
+ .getattr = erofs_getattr,
+ .listxattr = erofs_listxattr,
+ .get_inode_acl = erofs_get_acl,
+ .fiemap = erofs_fiemap,
+};
+
+const struct inode_operations erofs_symlink_iops = {
+ .get_link = page_get_link,
+ .getattr = erofs_getattr,
+ .listxattr = erofs_listxattr,
+ .get_inode_acl = erofs_get_acl,
+};
+
+const struct inode_operations erofs_fast_symlink_iops = {
+ .get_link = simple_get_link,
+ .getattr = erofs_getattr,
+ .listxattr = erofs_listxattr,
+ .get_inode_acl = erofs_get_acl,
+};
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
new file mode 100644
index 000000000..d8de61350
--- /dev/null
+++ b/fs/erofs/internal.h
@@ -0,0 +1,537 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2017-2018 HUAWEI, Inc.
+ * https://www.huawei.com/
+ * Copyright (C) 2021, Alibaba Cloud
+ */
+#ifndef __EROFS_INTERNAL_H
+#define __EROFS_INTERNAL_H
+
+#include <linux/fs.h>
+#include <linux/dcache.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/bio.h>
+#include <linux/magic.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/iomap.h>
+#include "erofs_fs.h"
+
+/* redefine pr_fmt "erofs: " */
+#undef pr_fmt
+#define pr_fmt(fmt) "erofs: " fmt
+
+__printf(3, 4) void _erofs_err(struct super_block *sb,
+ const char *function, const char *fmt, ...);
+#define erofs_err(sb, fmt, ...) \
+ _erofs_err(sb, __func__, fmt "\n", ##__VA_ARGS__)
+__printf(3, 4) void _erofs_info(struct super_block *sb,
+ const char *function, const char *fmt, ...);
+#define erofs_info(sb, fmt, ...) \
+ _erofs_info(sb, __func__, fmt "\n", ##__VA_ARGS__)
+#ifdef CONFIG_EROFS_FS_DEBUG
+#define DBG_BUGON BUG_ON
+#else
+#define DBG_BUGON(x) ((void)(x))
+#endif /* !CONFIG_EROFS_FS_DEBUG */
+
+/* EROFS_SUPER_MAGIC_V1 to represent the whole file system */
+#define EROFS_SUPER_MAGIC EROFS_SUPER_MAGIC_V1
+
+typedef u64 erofs_nid_t;
+typedef u64 erofs_off_t;
+/* data type for filesystem-wide blocks number */
+typedef u32 erofs_blk_t;
+
+struct erofs_device_info {
+ char *path;
+ struct erofs_fscache *fscache;
+ struct block_device *bdev;
+ struct dax_device *dax_dev;
+ u64 dax_part_off;
+
+ u32 blocks;
+ u32 mapped_blkaddr;
+};
+
+enum {
+ EROFS_SYNC_DECOMPRESS_AUTO,
+ EROFS_SYNC_DECOMPRESS_FORCE_ON,
+ EROFS_SYNC_DECOMPRESS_FORCE_OFF
+};
+
+struct erofs_mount_opts {
+#ifdef CONFIG_EROFS_FS_ZIP
+ /* current strategy of how to use managed cache */
+ unsigned char cache_strategy;
+ /* strategy of sync decompression (0 - auto, 1 - force on, 2 - force off) */
+ unsigned int sync_decompress;
+
+ /* threshold for synchronous decompression */
+ unsigned int max_sync_decompress_pages;
+#endif
+ unsigned int mount_opt;
+};
+
+struct erofs_dev_context {
+ struct idr tree;
+ struct rw_semaphore rwsem;
+
+ unsigned int extra_devices;
+ bool flatdev;
+};
+
+struct erofs_fs_context {
+ struct erofs_mount_opts opt;
+ struct erofs_dev_context *devs;
+ char *fsid;
+ char *domain_id;
+};
+
+/* all filesystem-wide lz4 configurations */
+struct erofs_sb_lz4_info {
+ /* # of pages needed for EROFS lz4 rolling decompression */
+ u16 max_distance_pages;
+ /* maximum possible blocks for pclusters in the filesystem */
+ u16 max_pclusterblks;
+};
+
+struct erofs_domain {
+ refcount_t ref;
+ struct list_head list;
+ struct fscache_volume *volume;
+ char *domain_id;
+};
+
+struct erofs_fscache {
+ struct fscache_cookie *cookie;
+ struct inode *inode; /* anonymous inode for the blob */
+
+ /* used for share domain mode */
+ struct erofs_domain *domain;
+ struct list_head node;
+ refcount_t ref;
+ char *name;
+};
+
+struct erofs_xattr_prefix_item {
+ struct erofs_xattr_long_prefix *prefix;
+ u8 infix_len;
+};
+
+struct erofs_sb_info {
+ struct erofs_mount_opts opt; /* options */
+#ifdef CONFIG_EROFS_FS_ZIP
+ /* list for all registered superblocks, mainly for shrinker */
+ struct list_head list;
+ struct mutex umount_mutex;
+
+ /* managed XArray indexed by physical block number */
+ struct xarray managed_pslots;
+
+ unsigned int shrinker_run_no;
+ u16 available_compr_algs;
+
+ /* pseudo inode to manage cached pages */
+ struct inode *managed_cache;
+
+ struct erofs_sb_lz4_info lz4;
+#endif /* CONFIG_EROFS_FS_ZIP */
+ struct inode *packed_inode;
+ struct erofs_dev_context *devs;
+ struct dax_device *dax_dev;
+ u64 dax_part_off;
+ u64 total_blocks;
+ u32 primarydevice_blocks;
+
+ u32 meta_blkaddr;
+#ifdef CONFIG_EROFS_FS_XATTR
+ u32 xattr_blkaddr;
+ u32 xattr_prefix_start;
+ u8 xattr_prefix_count;
+ struct erofs_xattr_prefix_item *xattr_prefixes;
+ unsigned int xattr_filter_reserved;
+#endif
+ u16 device_id_mask; /* valid bits of device id to be used */
+
+ unsigned char islotbits; /* inode slot unit size in bit shift */
+ unsigned char blkszbits; /* filesystem block size in bit shift */
+
+ u32 sb_size; /* total superblock size */
+ u32 build_time_nsec;
+ u64 build_time;
+
+ /* what we really care about is the nid, rather than the ino */
+ erofs_nid_t root_nid;
+ erofs_nid_t packed_nid;
+ /* used for statfs, f_files - f_favail */
+ u64 inos;
+
+ u8 uuid[16]; /* 128-bit uuid for volume */
+ u8 volume_name[16]; /* volume name */
+ u32 feature_compat;
+ u32 feature_incompat;
+
+ /* sysfs support */
+ struct kobject s_kobj; /* /sys/fs/erofs/<devname> */
+ struct completion s_kobj_unregister;
+
+ /* fscache support */
+ struct fscache_volume *volume;
+ struct erofs_fscache *s_fscache;
+ struct erofs_domain *domain;
+ char *fsid;
+ char *domain_id;
+};
+
+#define EROFS_SB(sb) ((struct erofs_sb_info *)(sb)->s_fs_info)
+#define EROFS_I_SB(inode) ((struct erofs_sb_info *)(inode)->i_sb->s_fs_info)
+
+/* Mount flags set via mount options or defaults */
+#define EROFS_MOUNT_XATTR_USER 0x00000010
+#define EROFS_MOUNT_POSIX_ACL 0x00000020
+#define EROFS_MOUNT_DAX_ALWAYS 0x00000040
+#define EROFS_MOUNT_DAX_NEVER 0x00000080
+
+#define clear_opt(opt, option) ((opt)->mount_opt &= ~EROFS_MOUNT_##option)
+#define set_opt(opt, option) ((opt)->mount_opt |= EROFS_MOUNT_##option)
+#define test_opt(opt, option) ((opt)->mount_opt & EROFS_MOUNT_##option)
+
+static inline bool erofs_is_fscache_mode(struct super_block *sb)
+{
+ return IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && !sb->s_bdev;
+}
+
+enum {
+ EROFS_ZIP_CACHE_DISABLED,
+ EROFS_ZIP_CACHE_READAHEAD,
+ EROFS_ZIP_CACHE_READAROUND
+};
+
+/* basic unit of the workstation of a super_block */
+struct erofs_workgroup {
+ pgoff_t index;
+ struct lockref lockref;
+};
+
+enum erofs_kmap_type {
+ EROFS_NO_KMAP, /* don't map the buffer */
+ EROFS_KMAP, /* use kmap_local_page() to map the buffer */
+};
+
+struct erofs_buf {
+ struct inode *inode;
+ struct page *page;
+ void *base;
+ enum erofs_kmap_type kmap_type;
+};
+#define __EROFS_BUF_INITIALIZER ((struct erofs_buf){ .page = NULL })
+
+#define ROOT_NID(sb) ((sb)->root_nid)
+
+#define erofs_blknr(sb, addr) ((addr) >> (sb)->s_blocksize_bits)
+#define erofs_blkoff(sb, addr) ((addr) & ((sb)->s_blocksize - 1))
+#define erofs_pos(sb, blk) ((erofs_off_t)(blk) << (sb)->s_blocksize_bits)
+#define erofs_iblks(i) (round_up((i)->i_size, i_blocksize(i)) >> (i)->i_blkbits)
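+
+/*
+ * Worked example (illustrative): with a 4KiB block size
+ * (s_blocksize_bits == 12), addr == 0x3008 gives erofs_blknr() == 3 and
+ * erofs_blkoff() == 8, and erofs_pos(sb, 3) maps back to 0x3000.
+ */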
+
+#define EROFS_FEATURE_FUNCS(name, compat, feature) \
+static inline bool erofs_sb_has_##name(struct erofs_sb_info *sbi) \
+{ \
+ return sbi->feature_##compat & EROFS_FEATURE_##feature; \
+}
+
+EROFS_FEATURE_FUNCS(zero_padding, incompat, INCOMPAT_ZERO_PADDING)
+EROFS_FEATURE_FUNCS(compr_cfgs, incompat, INCOMPAT_COMPR_CFGS)
+EROFS_FEATURE_FUNCS(big_pcluster, incompat, INCOMPAT_BIG_PCLUSTER)
+EROFS_FEATURE_FUNCS(chunked_file, incompat, INCOMPAT_CHUNKED_FILE)
+EROFS_FEATURE_FUNCS(device_table, incompat, INCOMPAT_DEVICE_TABLE)
+EROFS_FEATURE_FUNCS(compr_head2, incompat, INCOMPAT_COMPR_HEAD2)
+EROFS_FEATURE_FUNCS(ztailpacking, incompat, INCOMPAT_ZTAILPACKING)
+EROFS_FEATURE_FUNCS(fragments, incompat, INCOMPAT_FRAGMENTS)
+EROFS_FEATURE_FUNCS(dedupe, incompat, INCOMPAT_DEDUPE)
+EROFS_FEATURE_FUNCS(xattr_prefixes, incompat, INCOMPAT_XATTR_PREFIXES)
+EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM)
+EROFS_FEATURE_FUNCS(xattr_filter, compat, COMPAT_XATTR_FILTER)
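+
+/*
+ * For reference, each line above expands to a tiny predicate; e.g.
+ * erofs_sb_has_fragments(sbi) simply tests
+ * sbi->feature_incompat & EROFS_FEATURE_INCOMPAT_FRAGMENTS.
+ */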
+
+/* atomic flag definitions */
+#define EROFS_I_EA_INITED_BIT 0
+#define EROFS_I_Z_INITED_BIT 1
+
+/* bitlock definitions (arranged in reverse order) */
+#define EROFS_I_BL_XATTR_BIT (BITS_PER_LONG - 1)
+#define EROFS_I_BL_Z_BIT (BITS_PER_LONG - 2)
+
+struct erofs_inode {
+ erofs_nid_t nid;
+
+ /* atomic flags (including bitlocks) */
+ unsigned long flags;
+
+ unsigned char datalayout;
+ unsigned char inode_isize;
+ unsigned int xattr_isize;
+
+ unsigned int xattr_name_filter;
+ unsigned int xattr_shared_count;
+ unsigned int *xattr_shared_xattrs;
+
+ union {
+ erofs_blk_t raw_blkaddr;
+ struct {
+ unsigned short chunkformat;
+ unsigned char chunkbits;
+ };
+#ifdef CONFIG_EROFS_FS_ZIP
+ struct {
+ unsigned short z_advise;
+ unsigned char z_algorithmtype[2];
+ unsigned char z_logical_clusterbits;
+ unsigned long z_tailextent_headlcn;
+ union {
+ struct {
+ erofs_off_t z_idataoff;
+ unsigned short z_idata_size;
+ };
+ erofs_off_t z_fragmentoff;
+ };
+ };
+#endif /* CONFIG_EROFS_FS_ZIP */
+ };
+ /* the corresponding vfs inode */
+ struct inode vfs_inode;
+};
+
+#define EROFS_I(ptr) container_of(ptr, struct erofs_inode, vfs_inode)
+
+static inline erofs_off_t erofs_iloc(struct inode *inode)
+{
+ struct erofs_sb_info *sbi = EROFS_I_SB(inode);
+
+ return erofs_pos(inode->i_sb, sbi->meta_blkaddr) +
+ (EROFS_I(inode)->nid << sbi->islotbits);
+}
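+
+/*
+ * Worked example (illustrative, assuming 32-byte inode slots so that
+ * islotbits == 5): with a 4KiB block size, meta_blkaddr == 1 and nid == 37,
+ * erofs_iloc() == 4096 + (37 << 5) = 5280 bytes from the start of the image.
+ */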
+
+static inline unsigned int erofs_inode_version(unsigned int ifmt)
+{
+ return (ifmt >> EROFS_I_VERSION_BIT) & EROFS_I_VERSION_MASK;
+}
+
+static inline unsigned int erofs_inode_datalayout(unsigned int ifmt)
+{
+ return (ifmt >> EROFS_I_DATALAYOUT_BIT) & EROFS_I_DATALAYOUT_MASK;
+}
+
+/*
+ * Unlike grab_cache_page_nowait(), reclaim is never triggered when
+ * allocating new pages.
+ */
+static inline
+struct page *erofs_grab_cache_page_nowait(struct address_space *mapping,
+ pgoff_t index)
+{
+ return pagecache_get_page(mapping, index,
+ FGP_LOCK|FGP_CREAT|FGP_NOFS|FGP_NOWAIT,
+ readahead_gfp_mask(mapping) & ~__GFP_RECLAIM);
+}
+
+/* Has a disk mapping */
+#define EROFS_MAP_MAPPED 0x0001
+/* Located in metadata (could be copied from bd_inode) */
+#define EROFS_MAP_META 0x0002
+/* The extent is encoded */
+#define EROFS_MAP_ENCODED 0x0004
+/* The length of extent is full */
+#define EROFS_MAP_FULL_MAPPED 0x0008
+/* Located in the special packed inode */
+#define EROFS_MAP_FRAGMENT 0x0010
+/* The extent refers to partial decompressed data */
+#define EROFS_MAP_PARTIAL_REF 0x0020
+
+struct erofs_map_blocks {
+ struct erofs_buf buf;
+
+ erofs_off_t m_pa, m_la;
+ u64 m_plen, m_llen;
+
+ unsigned short m_deviceid;
+ char m_algorithmformat;
+ unsigned int m_flags;
+};
+
+/*
+ * Used to get the exact decompressed length, e.g. for fiemap (consider the
+ * lookback approach instead if possible, since it is lighter on metadata).
+ */
+#define EROFS_GET_BLOCKS_FIEMAP 0x0001
+/* Used to map the whole extent if non-negligible data is requested for LZMA */
+#define EROFS_GET_BLOCKS_READMORE 0x0002
+/* Used to map tail extent for tailpacking inline or fragment pcluster */
+#define EROFS_GET_BLOCKS_FINDTAIL 0x0004
+
+enum {
+ Z_EROFS_COMPRESSION_SHIFTED = Z_EROFS_COMPRESSION_MAX,
+ Z_EROFS_COMPRESSION_INTERLACED,
+ Z_EROFS_COMPRESSION_RUNTIME_MAX
+};
+
+struct erofs_map_dev {
+ struct erofs_fscache *m_fscache;
+ struct block_device *m_bdev;
+ struct dax_device *m_daxdev;
+ u64 m_dax_part_off;
+
+ erofs_off_t m_pa;
+ unsigned int m_deviceid;
+};
+
+extern struct file_system_type erofs_fs_type;
+extern const struct super_operations erofs_sops;
+
+extern const struct address_space_operations erofs_raw_access_aops;
+extern const struct address_space_operations z_erofs_aops;
+extern const struct address_space_operations erofs_fscache_access_aops;
+
+extern const struct inode_operations erofs_generic_iops;
+extern const struct inode_operations erofs_symlink_iops;
+extern const struct inode_operations erofs_fast_symlink_iops;
+extern const struct inode_operations erofs_dir_iops;
+
+extern const struct file_operations erofs_file_fops;
+extern const struct file_operations erofs_dir_fops;
+
+extern const struct iomap_ops z_erofs_iomap_report_ops;
+
+/* flags for erofs_fscache_register_cookie() */
+#define EROFS_REG_COOKIE_SHARE 0x0001
+#define EROFS_REG_COOKIE_NEED_NOEXIST 0x0002
+
+void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf,
+ erofs_off_t *offset, int *lengthp);
+void erofs_unmap_metabuf(struct erofs_buf *buf);
+void erofs_put_metabuf(struct erofs_buf *buf);
+void *erofs_bread(struct erofs_buf *buf, erofs_blk_t blkaddr,
+ enum erofs_kmap_type type);
+void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb);
+void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb,
+ erofs_blk_t blkaddr, enum erofs_kmap_type type);
+int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *dev);
+int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+ u64 start, u64 len);
+int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map);
+struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid);
+int erofs_getattr(struct mnt_idmap *idmap, const struct path *path,
+ struct kstat *stat, u32 request_mask,
+ unsigned int query_flags);
+int erofs_namei(struct inode *dir, const struct qstr *name,
+ erofs_nid_t *nid, unsigned int *d_type);
+
+static inline void *erofs_vm_map_ram(struct page **pages, unsigned int count)
+{
+ int retried = 0;
+
+ while (1) {
+ void *p = vm_map_ram(pages, count, -1);
+
+ /* retry two more times (3 times in total) */
+ if (p || ++retried >= 3)
+ return p;
+ vm_unmap_aliases();
+ }
+ return NULL;
+}
+
+int erofs_register_sysfs(struct super_block *sb);
+void erofs_unregister_sysfs(struct super_block *sb);
+int __init erofs_init_sysfs(void);
+void erofs_exit_sysfs(void);
+
+struct page *erofs_allocpage(struct page **pagepool, gfp_t gfp);
+static inline void erofs_pagepool_add(struct page **pagepool, struct page *page)
+{
+ set_page_private(page, (unsigned long)*pagepool);
+ *pagepool = page;
+}
+void erofs_release_pages(struct page **pagepool);
+
+#ifdef CONFIG_EROFS_FS_ZIP
+void erofs_workgroup_put(struct erofs_workgroup *grp);
+struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb,
+ pgoff_t index);
+struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb,
+ struct erofs_workgroup *grp);
+void erofs_workgroup_free_rcu(struct erofs_workgroup *grp);
+void erofs_shrinker_register(struct super_block *sb);
+void erofs_shrinker_unregister(struct super_block *sb);
+int __init erofs_init_shrinker(void);
+void erofs_exit_shrinker(void);
+int __init z_erofs_init_zip_subsystem(void);
+void z_erofs_exit_zip_subsystem(void);
+int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
+ struct erofs_workgroup *egrp);
+int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map,
+ int flags);
+void *erofs_get_pcpubuf(unsigned int requiredpages);
+void erofs_put_pcpubuf(void *ptr);
+int erofs_pcpubuf_growsize(unsigned int nrpages);
+void __init erofs_pcpubuf_init(void);
+void erofs_pcpubuf_exit(void);
+int erofs_init_managed_cache(struct super_block *sb);
+int z_erofs_parse_cfgs(struct super_block *sb, struct erofs_super_block *dsb);
+#else
+static inline void erofs_shrinker_register(struct super_block *sb) {}
+static inline void erofs_shrinker_unregister(struct super_block *sb) {}
+static inline int erofs_init_shrinker(void) { return 0; }
+static inline void erofs_exit_shrinker(void) {}
+static inline int z_erofs_init_zip_subsystem(void) { return 0; }
+static inline void z_erofs_exit_zip_subsystem(void) {}
+static inline void erofs_pcpubuf_init(void) {}
+static inline void erofs_pcpubuf_exit(void) {}
+static inline int erofs_init_managed_cache(struct super_block *sb) { return 0; }
+#endif /* !CONFIG_EROFS_FS_ZIP */
+
+#ifdef CONFIG_EROFS_FS_ZIP_LZMA
+int __init z_erofs_lzma_init(void);
+void z_erofs_lzma_exit(void);
+#else
+static inline int z_erofs_lzma_init(void) { return 0; }
+static inline int z_erofs_lzma_exit(void) { return 0; }
+#endif /* !CONFIG_EROFS_FS_ZIP_LZMA */
+
+#ifdef CONFIG_EROFS_FS_ZIP_DEFLATE
+int __init z_erofs_deflate_init(void);
+void z_erofs_deflate_exit(void);
+#else
+static inline int z_erofs_deflate_init(void) { return 0; }
+static inline int z_erofs_deflate_exit(void) { return 0; }
+#endif /* !CONFIG_EROFS_FS_ZIP_DEFLATE */
+
+#ifdef CONFIG_EROFS_FS_ONDEMAND
+int erofs_fscache_register_fs(struct super_block *sb);
+void erofs_fscache_unregister_fs(struct super_block *sb);
+
+struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb,
+ char *name, unsigned int flags);
+void erofs_fscache_unregister_cookie(struct erofs_fscache *fscache);
+#else
+static inline int erofs_fscache_register_fs(struct super_block *sb)
+{
+ return -EOPNOTSUPP;
+}
+static inline void erofs_fscache_unregister_fs(struct super_block *sb) {}
+
+static inline
+struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb,
+ char *name, unsigned int flags)
+{
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
+static inline void erofs_fscache_unregister_cookie(struct erofs_fscache *fscache)
+{
+}
+#endif
+
+#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
+
+#endif /* __EROFS_INTERNAL_H */
diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c
new file mode 100644
index 000000000..d4f631d39
--- /dev/null
+++ b/fs/erofs/namei.c
@@ -0,0 +1,224 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2017-2018 HUAWEI, Inc.
+ * https://www.huawei.com/
+ * Copyright (C) 2022, Alibaba Cloud
+ */
+#include "xattr.h"
+#include <trace/events/erofs.h>
+
+struct erofs_qstr {
+ const unsigned char *name;
+ const unsigned char *end;
+};
+
+/* assumes that qn->end is accurate and that qn has a trailing '\0' */
+static inline int erofs_dirnamecmp(const struct erofs_qstr *qn,
+ const struct erofs_qstr *qd,
+ unsigned int *matched)
+{
+ unsigned int i = *matched;
+
+ /*
+ * on-disk error: only BUG_ON in debugging mode; otherwise, return 1
+ * to simply skip the invalid name and continue (for the sake of
+ * lookup performance).
+ */
+ DBG_BUGON(qd->name > qd->end);
+
+ /*
+ * qd may not have a trailing '\0'; however, access is safe
+ * as long as it stays below qd->end
+ */
+ while (qd->name + i < qd->end && qd->name[i] != '\0') {
+ if (qn->name[i] != qd->name[i]) {
+ *matched = i;
+ return qn->name[i] > qd->name[i] ? 1 : -1;
+ }
+ ++i;
+ }
+ *matched = i;
+ /* See comments in __d_alloc on the terminating NUL character */
+ return qn->name[i] == '\0' ? 0 : 1;
+}
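+
+/*
+ * Illustrative note: the helper above resumes the comparison at *matched
+ * characters (the prefix already known to be shared) and updates *matched,
+ * which is what lets the binary searches in find_target_dirent() and
+ * erofs_find_target_block() skip re-comparing common prefixes via
+ * min(startprfx, endprfx).
+ */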
+
+#define nameoff_from_disk(off, sz) (le16_to_cpu(off) & ((sz) - 1))
+
+static struct erofs_dirent *find_target_dirent(struct erofs_qstr *name,
+ u8 *data,
+ unsigned int dirblksize,
+ const int ndirents)
+{
+ int head, back;
+ unsigned int startprfx, endprfx;
+ struct erofs_dirent *const de = (struct erofs_dirent *)data;
+
+ /* since the 1st dirent has been evaluated previously */
+ head = 1;
+ back = ndirents - 1;
+ startprfx = endprfx = 0;
+
+ while (head <= back) {
+ const int mid = head + (back - head) / 2;
+ const int nameoff = nameoff_from_disk(de[mid].nameoff,
+ dirblksize);
+ unsigned int matched = min(startprfx, endprfx);
+ struct erofs_qstr dname = {
+ .name = data + nameoff,
+ .end = mid >= ndirents - 1 ?
+ data + dirblksize :
+ data + nameoff_from_disk(de[mid + 1].nameoff,
+ dirblksize)
+ };
+
+ /* string comparison without already matched prefix */
+ int ret = erofs_dirnamecmp(name, &dname, &matched);
+
+ if (!ret) {
+ return de + mid;
+ } else if (ret > 0) {
+ head = mid + 1;
+ startprfx = matched;
+ } else {
+ back = mid - 1;
+ endprfx = matched;
+ }
+ }
+
+ return ERR_PTR(-ENOENT);
+}
+
+static void *erofs_find_target_block(struct erofs_buf *target,
+ struct inode *dir, struct erofs_qstr *name, int *_ndirents)
+{
+ unsigned int bsz = i_blocksize(dir);
+ int head = 0, back = erofs_iblks(dir) - 1;
+ unsigned int startprfx = 0, endprfx = 0;
+ void *candidate = ERR_PTR(-ENOENT);
+
+ while (head <= back) {
+ const int mid = head + (back - head) / 2;
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
+ struct erofs_dirent *de;
+
+ buf.inode = dir;
+ de = erofs_bread(&buf, mid, EROFS_KMAP);
+ if (!IS_ERR(de)) {
+ const int nameoff = nameoff_from_disk(de->nameoff, bsz);
+ const int ndirents = nameoff / sizeof(*de);
+ int diff;
+ unsigned int matched;
+ struct erofs_qstr dname;
+
+ if (!ndirents) {
+ erofs_put_metabuf(&buf);
+ erofs_err(dir->i_sb,
+ "corrupted dir block %d @ nid %llu",
+ mid, EROFS_I(dir)->nid);
+ DBG_BUGON(1);
+ de = ERR_PTR(-EFSCORRUPTED);
+ goto out;
+ }
+
+ matched = min(startprfx, endprfx);
+
+ dname.name = (u8 *)de + nameoff;
+ if (ndirents == 1)
+ dname.end = (u8 *)de + bsz;
+ else
+ dname.end = (u8 *)de +
+ nameoff_from_disk(de[1].nameoff, bsz);
+
+ /* string comparison without already matched prefix */
+ diff = erofs_dirnamecmp(name, &dname, &matched);
+
+ if (!diff) {
+ *_ndirents = 0;
+ goto out;
+ } else if (diff > 0) {
+ head = mid + 1;
+ startprfx = matched;
+
+ if (!IS_ERR(candidate))
+ erofs_put_metabuf(target);
+ *target = buf;
+ candidate = de;
+ *_ndirents = ndirents;
+ } else {
+ erofs_put_metabuf(&buf);
+
+ back = mid - 1;
+ endprfx = matched;
+ }
+ continue;
+ }
+out: /* free if the candidate is valid */
+ if (!IS_ERR(candidate))
+ erofs_put_metabuf(target);
+ return de;
+ }
+ return candidate;
+}
+
+int erofs_namei(struct inode *dir, const struct qstr *name, erofs_nid_t *nid,
+ unsigned int *d_type)
+{
+ int ndirents;
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
+ struct erofs_dirent *de;
+ struct erofs_qstr qn;
+
+ if (!dir->i_size)
+ return -ENOENT;
+
+ qn.name = name->name;
+ qn.end = name->name + name->len;
+ buf.inode = dir;
+
+ ndirents = 0;
+ de = erofs_find_target_block(&buf, dir, &qn, &ndirents);
+ if (IS_ERR(de))
+ return PTR_ERR(de);
+
+ if (ndirents)
+ de = find_target_dirent(&qn, (u8 *)de, i_blocksize(dir),
+ ndirents);
+
+ if (!IS_ERR(de)) {
+ *nid = le64_to_cpu(de->nid);
+ *d_type = de->file_type;
+ }
+ erofs_put_metabuf(&buf);
+ return PTR_ERR_OR_ZERO(de);
+}
+
+static struct dentry *erofs_lookup(struct inode *dir, struct dentry *dentry,
+ unsigned int flags)
+{
+ int err;
+ erofs_nid_t nid;
+ unsigned int d_type;
+ struct inode *inode;
+
+ trace_erofs_lookup(dir, dentry, flags);
+
+ if (dentry->d_name.len > EROFS_NAME_LEN)
+ return ERR_PTR(-ENAMETOOLONG);
+
+ err = erofs_namei(dir, &dentry->d_name, &nid, &d_type);
+
+ if (err == -ENOENT)
+ /* negative dentry */
+ inode = NULL;
+ else if (err)
+ inode = ERR_PTR(err);
+ else
+ inode = erofs_iget(dir->i_sb, nid);
+ return d_splice_alias(inode, dentry);
+}
+
+const struct inode_operations erofs_dir_iops = {
+ .lookup = erofs_lookup,
+ .getattr = erofs_getattr,
+ .listxattr = erofs_listxattr,
+ .get_inode_acl = erofs_get_acl,
+ .fiemap = erofs_fiemap,
+};
diff --git a/fs/erofs/pcpubuf.c b/fs/erofs/pcpubuf.c
new file mode 100644
index 000000000..c7a4b1d77
--- /dev/null
+++ b/fs/erofs/pcpubuf.c
@@ -0,0 +1,148 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) Gao Xiang <xiang@kernel.org>
+ *
+ * For low-latency decompression algorithms (e.g. lz4), reserve consecutive
+ * per-CPU virtual memory (in pages) in advance to hold such in-place I/O
+ * data when in-place decompression fails (for example, because the
+ * in-place margin is not met).
+ */
+#include "internal.h"
+
+struct erofs_pcpubuf {
+ raw_spinlock_t lock;
+ void *ptr;
+ struct page **pages;
+ unsigned int nrpages;
+};
+
+static DEFINE_PER_CPU(struct erofs_pcpubuf, erofs_pcb);
+
+void *erofs_get_pcpubuf(unsigned int requiredpages)
+ __acquires(pcb->lock)
+{
+ struct erofs_pcpubuf *pcb = &get_cpu_var(erofs_pcb);
+
+ raw_spin_lock(&pcb->lock);
+ /* check if the per-CPU buffer is too small */
+ if (requiredpages > pcb->nrpages) {
+ raw_spin_unlock(&pcb->lock);
+ put_cpu_var(erofs_pcb);
+ /* (for sparse checker) pretend pcb->lock is still taken */
+ __acquire(pcb->lock);
+ return NULL;
+ }
+ return pcb->ptr;
+}
+
+void erofs_put_pcpubuf(void *ptr) __releases(pcb->lock)
+{
+ struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, smp_processor_id());
+
+ DBG_BUGON(pcb->ptr != ptr);
+ raw_spin_unlock(&pcb->lock);
+ put_cpu_var(erofs_pcb);
+}
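+
+/*
+ * Expected usage (a sketch, not taken from any specific caller): pair
+ * erofs_get_pcpubuf() with erofs_put_pcpubuf() and do not sleep in
+ * between, since the buffer is protected by a raw spinlock with
+ * preemption disabled:
+ *
+ *	dst = erofs_get_pcpubuf(requiredpages);
+ *	if (dst) {
+ *		... use up to requiredpages pages at dst ...
+ *		erofs_put_pcpubuf(dst);
+ *	}
+ *
+ * A NULL return means the per-CPU buffer is still too small; callers are
+ * expected to fall back (or grow it first via erofs_pcpubuf_growsize()).
+ */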
+
+/* TODO: support CPU hotplug for the per-CPU page buffers */
+int erofs_pcpubuf_growsize(unsigned int nrpages)
+{
+ static DEFINE_MUTEX(pcb_resize_mutex);
+ static unsigned int pcb_nrpages;
+ struct page *pagepool = NULL;
+ int delta, cpu, ret, i;
+
+ mutex_lock(&pcb_resize_mutex);
+ delta = nrpages - pcb_nrpages;
+ ret = 0;
+	/* never shrink the pcpubuf: there is no telling how many fses still rely on the current size */
+ if (delta <= 0)
+ goto out;
+
+ for_each_possible_cpu(cpu) {
+ struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu);
+ struct page **pages, **oldpages;
+ void *ptr, *old_ptr;
+
+ pages = kmalloc_array(nrpages, sizeof(*pages), GFP_KERNEL);
+ if (!pages) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ for (i = 0; i < nrpages; ++i) {
+ pages[i] = erofs_allocpage(&pagepool, GFP_KERNEL);
+ if (!pages[i]) {
+ ret = -ENOMEM;
+ oldpages = pages;
+ goto free_pagearray;
+ }
+ }
+ ptr = vmap(pages, nrpages, VM_MAP, PAGE_KERNEL);
+ if (!ptr) {
+ ret = -ENOMEM;
+ oldpages = pages;
+ goto free_pagearray;
+ }
+ raw_spin_lock(&pcb->lock);
+ old_ptr = pcb->ptr;
+ pcb->ptr = ptr;
+ oldpages = pcb->pages;
+ pcb->pages = pages;
+ i = pcb->nrpages;
+ pcb->nrpages = nrpages;
+ raw_spin_unlock(&pcb->lock);
+
+ if (!oldpages) {
+ DBG_BUGON(old_ptr);
+ continue;
+ }
+
+ if (old_ptr)
+ vunmap(old_ptr);
+free_pagearray:
+ while (i)
+ erofs_pagepool_add(&pagepool, oldpages[--i]);
+ kfree(oldpages);
+ if (ret)
+ break;
+ }
+ pcb_nrpages = nrpages;
+ erofs_release_pages(&pagepool);
+out:
+ mutex_unlock(&pcb_resize_mutex);
+ return ret;
+}
+
+void __init erofs_pcpubuf_init(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu);
+
+ raw_spin_lock_init(&pcb->lock);
+ }
+}
+
+void erofs_pcpubuf_exit(void)
+{
+ int cpu, i;
+
+ for_each_possible_cpu(cpu) {
+ struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu);
+
+ if (pcb->ptr) {
+ vunmap(pcb->ptr);
+ pcb->ptr = NULL;
+ }
+ if (!pcb->pages)
+ continue;
+
+ for (i = 0; i < pcb->nrpages; ++i)
+ if (pcb->pages[i])
+ put_page(pcb->pages[i]);
+ kfree(pcb->pages);
+ pcb->pages = NULL;
+ }
+}
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
new file mode 100644
index 000000000..cc44fb2e0
--- /dev/null
+++ b/fs/erofs/super.c
@@ -0,0 +1,1021 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2017-2018 HUAWEI, Inc.
+ * https://www.huawei.com/
+ * Copyright (C) 2021, Alibaba Cloud
+ */
+#include <linux/module.h>
+#include <linux/statfs.h>
+#include <linux/parser.h>
+#include <linux/seq_file.h>
+#include <linux/crc32c.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
+#include <linux/dax.h>
+#include <linux/exportfs.h>
+#include "xattr.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/erofs.h>
+
+static struct kmem_cache *erofs_inode_cachep __read_mostly;
+
+void _erofs_err(struct super_block *sb, const char *func, const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, fmt);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ pr_err("(device %s): %s: %pV", sb->s_id, func, &vaf);
+ va_end(args);
+}
+
+void _erofs_info(struct super_block *sb, const char *func, const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, fmt);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ pr_info("(device %s): %pV", sb->s_id, &vaf);
+ va_end(args);
+}
+
+static int erofs_superblock_csum_verify(struct super_block *sb, void *sbdata)
+{
+ size_t len = 1 << EROFS_SB(sb)->blkszbits;
+ struct erofs_super_block *dsb;
+ u32 expected_crc, crc;
+
+ if (len > EROFS_SUPER_OFFSET)
+ len -= EROFS_SUPER_OFFSET;
+
+ dsb = kmemdup(sbdata + EROFS_SUPER_OFFSET, len, GFP_KERNEL);
+ if (!dsb)
+ return -ENOMEM;
+
+ expected_crc = le32_to_cpu(dsb->checksum);
+ dsb->checksum = 0;
+	/* the checksum covers the block from EROFS_SUPER_OFFSET onwards (leaving room for x86 boot sectors and other oddities) */
+ crc = crc32c(~0, dsb, len);
+ kfree(dsb);
+
+ if (crc != expected_crc) {
+ erofs_err(sb, "invalid checksum 0x%08x, 0x%08x expected",
+ crc, expected_crc);
+ return -EBADMSG;
+ }
+ return 0;
+}
+
+static void erofs_inode_init_once(void *ptr)
+{
+ struct erofs_inode *vi = ptr;
+
+ inode_init_once(&vi->vfs_inode);
+}
+
+static struct inode *erofs_alloc_inode(struct super_block *sb)
+{
+ struct erofs_inode *vi =
+ alloc_inode_sb(sb, erofs_inode_cachep, GFP_KERNEL);
+
+ if (!vi)
+ return NULL;
+
+ /* zero out everything except vfs_inode */
+ memset(vi, 0, offsetof(struct erofs_inode, vfs_inode));
+ return &vi->vfs_inode;
+}
+
+static void erofs_free_inode(struct inode *inode)
+{
+ struct erofs_inode *vi = EROFS_I(inode);
+
+ if (inode->i_op == &erofs_fast_symlink_iops)
+ kfree(inode->i_link);
+ kfree(vi->xattr_shared_xattrs);
+ kmem_cache_free(erofs_inode_cachep, vi);
+}
+
+static bool check_layout_compatibility(struct super_block *sb,
+ struct erofs_super_block *dsb)
+{
+ const unsigned int feature = le32_to_cpu(dsb->feature_incompat);
+
+ EROFS_SB(sb)->feature_incompat = feature;
+
+ /* check if current kernel meets all mandatory requirements */
+ if (feature & (~EROFS_ALL_FEATURE_INCOMPAT)) {
+ erofs_err(sb, "unidentified incompatible feature %x, please upgrade kernel",
+ feature & ~EROFS_ALL_FEATURE_INCOMPAT);
+ return false;
+ }
+ return true;
+}
+
+/* read variable-sized metadata; *offset is first aligned to a 4-byte boundary */
+void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf,
+ erofs_off_t *offset, int *lengthp)
+{
+ u8 *buffer, *ptr;
+ int len, i, cnt;
+
+ *offset = round_up(*offset, 4);
+ ptr = erofs_bread(buf, erofs_blknr(sb, *offset), EROFS_KMAP);
+ if (IS_ERR(ptr))
+ return ptr;
+
+ len = le16_to_cpu(*(__le16 *)&ptr[erofs_blkoff(sb, *offset)]);
+ if (!len)
+ len = U16_MAX + 1;
+ buffer = kmalloc(len, GFP_KERNEL);
+ if (!buffer)
+ return ERR_PTR(-ENOMEM);
+ *offset += sizeof(__le16);
+ *lengthp = len;
+
+ for (i = 0; i < len; i += cnt) {
+ cnt = min_t(int, sb->s_blocksize - erofs_blkoff(sb, *offset),
+ len - i);
+ ptr = erofs_bread(buf, erofs_blknr(sb, *offset), EROFS_KMAP);
+ if (IS_ERR(ptr)) {
+ kfree(buffer);
+ return ptr;
+ }
+ memcpy(buffer + i, ptr + erofs_blkoff(sb, *offset), cnt);
+ *offset += cnt;
+ }
+ return buffer;
+}
+
+#ifndef CONFIG_EROFS_FS_ZIP
+static int z_erofs_parse_cfgs(struct super_block *sb,
+ struct erofs_super_block *dsb)
+{
+ if (!dsb->u1.available_compr_algs)
+ return 0;
+
+ erofs_err(sb, "compression disabled, unable to mount compressed EROFS");
+ return -EOPNOTSUPP;
+}
+#endif
+
+static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
+ struct erofs_device_info *dif, erofs_off_t *pos)
+{
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+ struct erofs_fscache *fscache;
+ struct erofs_deviceslot *dis;
+ struct block_device *bdev;
+ void *ptr;
+
+ ptr = erofs_read_metabuf(buf, sb, erofs_blknr(sb, *pos), EROFS_KMAP);
+ if (IS_ERR(ptr))
+ return PTR_ERR(ptr);
+ dis = ptr + erofs_blkoff(sb, *pos);
+
+ if (!sbi->devs->flatdev && !dif->path) {
+ if (!dis->tag[0]) {
+ erofs_err(sb, "empty device tag @ pos %llu", *pos);
+ return -EINVAL;
+ }
+ dif->path = kmemdup_nul(dis->tag, sizeof(dis->tag), GFP_KERNEL);
+ if (!dif->path)
+ return -ENOMEM;
+ }
+
+ if (erofs_is_fscache_mode(sb)) {
+ fscache = erofs_fscache_register_cookie(sb, dif->path, 0);
+ if (IS_ERR(fscache))
+ return PTR_ERR(fscache);
+ dif->fscache = fscache;
+ } else if (!sbi->devs->flatdev) {
+ bdev = blkdev_get_by_path(dif->path, BLK_OPEN_READ, sb->s_type,
+ NULL);
+ if (IS_ERR(bdev))
+ return PTR_ERR(bdev);
+ dif->bdev = bdev;
+ dif->dax_dev = fs_dax_get_by_bdev(bdev, &dif->dax_part_off,
+ NULL, NULL);
+ }
+
+ dif->blocks = le32_to_cpu(dis->blocks);
+ dif->mapped_blkaddr = le32_to_cpu(dis->mapped_blkaddr);
+ sbi->total_blocks += dif->blocks;
+ *pos += EROFS_DEVT_SLOT_SIZE;
+ return 0;
+}
+
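+/*
+ * Scan the on-disk extra device table.  If "device=" mount options were
+ * given, each table slot is matched against one of them; otherwise (on a
+ * plain block-device mount) the image is treated as a "flat" layout and no
+ * extra block devices are opened, with the blobs expected to reside in the
+ * primary device.
+ */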
+static int erofs_scan_devices(struct super_block *sb,
+ struct erofs_super_block *dsb)
+{
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+ unsigned int ondisk_extradevs;
+ erofs_off_t pos;
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
+ struct erofs_device_info *dif;
+ int id, err = 0;
+
+ sbi->total_blocks = sbi->primarydevice_blocks;
+ if (!erofs_sb_has_device_table(sbi))
+ ondisk_extradevs = 0;
+ else
+ ondisk_extradevs = le16_to_cpu(dsb->extra_devices);
+
+ if (sbi->devs->extra_devices &&
+ ondisk_extradevs != sbi->devs->extra_devices) {
+ erofs_err(sb, "extra devices don't match (ondisk %u, given %u)",
+ ondisk_extradevs, sbi->devs->extra_devices);
+ return -EINVAL;
+ }
+ if (!ondisk_extradevs)
+ return 0;
+
+ if (!sbi->devs->extra_devices && !erofs_is_fscache_mode(sb))
+ sbi->devs->flatdev = true;
+
+ sbi->device_id_mask = roundup_pow_of_two(ondisk_extradevs + 1) - 1;
+ pos = le16_to_cpu(dsb->devt_slotoff) * EROFS_DEVT_SLOT_SIZE;
+ down_read(&sbi->devs->rwsem);
+ if (sbi->devs->extra_devices) {
+ idr_for_each_entry(&sbi->devs->tree, dif, id) {
+ err = erofs_init_device(&buf, sb, dif, &pos);
+ if (err)
+ break;
+ }
+ } else {
+ for (id = 0; id < ondisk_extradevs; id++) {
+ dif = kzalloc(sizeof(*dif), GFP_KERNEL);
+ if (!dif) {
+ err = -ENOMEM;
+ break;
+ }
+
+ err = idr_alloc(&sbi->devs->tree, dif, 0, 0, GFP_KERNEL);
+ if (err < 0) {
+ kfree(dif);
+ break;
+ }
+ ++sbi->devs->extra_devices;
+
+ err = erofs_init_device(&buf, sb, dif, &pos);
+ if (err)
+ break;
+ }
+ }
+ up_read(&sbi->devs->rwsem);
+ erofs_put_metabuf(&buf);
+ return err;
+}
+
+static int erofs_read_superblock(struct super_block *sb)
+{
+ struct erofs_sb_info *sbi;
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
+ struct erofs_super_block *dsb;
+ void *data;
+ int ret;
+
+ data = erofs_read_metabuf(&buf, sb, 0, EROFS_KMAP);
+ if (IS_ERR(data)) {
+ erofs_err(sb, "cannot read erofs superblock");
+ return PTR_ERR(data);
+ }
+
+ sbi = EROFS_SB(sb);
+ dsb = (struct erofs_super_block *)(data + EROFS_SUPER_OFFSET);
+
+ ret = -EINVAL;
+ if (le32_to_cpu(dsb->magic) != EROFS_SUPER_MAGIC_V1) {
+ erofs_err(sb, "cannot find valid erofs superblock");
+ goto out;
+ }
+
+ sbi->blkszbits = dsb->blkszbits;
+ if (sbi->blkszbits < 9 || sbi->blkszbits > PAGE_SHIFT) {
+ erofs_err(sb, "blkszbits %u isn't supported", sbi->blkszbits);
+ goto out;
+ }
+ if (dsb->dirblkbits) {
+ erofs_err(sb, "dirblkbits %u isn't supported", dsb->dirblkbits);
+ goto out;
+ }
+
+ sbi->feature_compat = le32_to_cpu(dsb->feature_compat);
+ if (erofs_sb_has_sb_chksum(sbi)) {
+ ret = erofs_superblock_csum_verify(sb, data);
+ if (ret)
+ goto out;
+ }
+
+ ret = -EINVAL;
+ if (!check_layout_compatibility(sb, dsb))
+ goto out;
+
+ sbi->sb_size = 128 + dsb->sb_extslots * EROFS_SB_EXTSLOT_SIZE;
+ if (sbi->sb_size > PAGE_SIZE - EROFS_SUPER_OFFSET) {
+ erofs_err(sb, "invalid sb_extslots %u (more than a fs block)",
+ sbi->sb_size);
+ goto out;
+ }
+ sbi->primarydevice_blocks = le32_to_cpu(dsb->blocks);
+ sbi->meta_blkaddr = le32_to_cpu(dsb->meta_blkaddr);
+#ifdef CONFIG_EROFS_FS_XATTR
+ sbi->xattr_blkaddr = le32_to_cpu(dsb->xattr_blkaddr);
+ sbi->xattr_prefix_start = le32_to_cpu(dsb->xattr_prefix_start);
+ sbi->xattr_prefix_count = dsb->xattr_prefix_count;
+ sbi->xattr_filter_reserved = dsb->xattr_filter_reserved;
+#endif
+ sbi->islotbits = ilog2(sizeof(struct erofs_inode_compact));
+ sbi->root_nid = le16_to_cpu(dsb->root_nid);
+ sbi->packed_nid = le64_to_cpu(dsb->packed_nid);
+ sbi->inos = le64_to_cpu(dsb->inos);
+
+ sbi->build_time = le64_to_cpu(dsb->build_time);
+ sbi->build_time_nsec = le32_to_cpu(dsb->build_time_nsec);
+
+ memcpy(&sb->s_uuid, dsb->uuid, sizeof(dsb->uuid));
+
+ ret = strscpy(sbi->volume_name, dsb->volume_name,
+ sizeof(dsb->volume_name));
+ if (ret < 0) { /* -E2BIG */
+ erofs_err(sb, "bad volume name without NIL terminator");
+ ret = -EFSCORRUPTED;
+ goto out;
+ }
+
+ /* parse on-disk compression configurations */
+ ret = z_erofs_parse_cfgs(sb, dsb);
+ if (ret < 0)
+ goto out;
+
+ /* handle multiple devices */
+ ret = erofs_scan_devices(sb, dsb);
+
+ if (erofs_is_fscache_mode(sb))
+ erofs_info(sb, "EXPERIMENTAL fscache-based on-demand read feature in use. Use at your own risk!");
+out:
+ erofs_put_metabuf(&buf);
+ return ret;
+}
+
+static void erofs_default_options(struct erofs_fs_context *ctx)
+{
+#ifdef CONFIG_EROFS_FS_ZIP
+ ctx->opt.cache_strategy = EROFS_ZIP_CACHE_READAROUND;
+ ctx->opt.max_sync_decompress_pages = 3;
+ ctx->opt.sync_decompress = EROFS_SYNC_DECOMPRESS_AUTO;
+#endif
+#ifdef CONFIG_EROFS_FS_XATTR
+ set_opt(&ctx->opt, XATTR_USER);
+#endif
+#ifdef CONFIG_EROFS_FS_POSIX_ACL
+ set_opt(&ctx->opt, POSIX_ACL);
+#endif
+}
+
+enum {
+ Opt_user_xattr,
+ Opt_acl,
+ Opt_cache_strategy,
+ Opt_dax,
+ Opt_dax_enum,
+ Opt_device,
+ Opt_fsid,
+ Opt_domain_id,
+ Opt_err
+};
+
+static const struct constant_table erofs_param_cache_strategy[] = {
+ {"disabled", EROFS_ZIP_CACHE_DISABLED},
+ {"readahead", EROFS_ZIP_CACHE_READAHEAD},
+ {"readaround", EROFS_ZIP_CACHE_READAROUND},
+ {}
+};
+
+static const struct constant_table erofs_dax_param_enums[] = {
+ {"always", EROFS_MOUNT_DAX_ALWAYS},
+ {"never", EROFS_MOUNT_DAX_NEVER},
+ {}
+};
+
+static const struct fs_parameter_spec erofs_fs_parameters[] = {
+ fsparam_flag_no("user_xattr", Opt_user_xattr),
+ fsparam_flag_no("acl", Opt_acl),
+ fsparam_enum("cache_strategy", Opt_cache_strategy,
+ erofs_param_cache_strategy),
+ fsparam_flag("dax", Opt_dax),
+ fsparam_enum("dax", Opt_dax_enum, erofs_dax_param_enums),
+ fsparam_string("device", Opt_device),
+ fsparam_string("fsid", Opt_fsid),
+ fsparam_string("domain_id", Opt_domain_id),
+ {}
+};
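+
+/*
+ * Illustrative example only (the device node and mount point below are
+ * placeholders, not taken from this file): a typical invocation that
+ * exercises these parameters could look like
+ *
+ *	mount -t erofs -o cache_strategy=readaround,dax=never,acl \
+ *		/dev/loop0 /mnt/erofs
+ *
+ * "device=" may be passed multiple times to attach extra blob devices,
+ * while "fsid="/"domain_id=" only matter for the fscache-based mode.
+ */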
+
+static bool erofs_fc_set_dax_mode(struct fs_context *fc, unsigned int mode)
+{
+#ifdef CONFIG_FS_DAX
+ struct erofs_fs_context *ctx = fc->fs_private;
+
+ switch (mode) {
+ case EROFS_MOUNT_DAX_ALWAYS:
+ warnfc(fc, "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
+ set_opt(&ctx->opt, DAX_ALWAYS);
+ clear_opt(&ctx->opt, DAX_NEVER);
+ return true;
+ case EROFS_MOUNT_DAX_NEVER:
+ set_opt(&ctx->opt, DAX_NEVER);
+ clear_opt(&ctx->opt, DAX_ALWAYS);
+ return true;
+ default:
+ DBG_BUGON(1);
+ return false;
+ }
+#else
+ errorfc(fc, "dax options not supported");
+ return false;
+#endif
+}
+
+static int erofs_fc_parse_param(struct fs_context *fc,
+ struct fs_parameter *param)
+{
+ struct erofs_fs_context *ctx = fc->fs_private;
+ struct fs_parse_result result;
+ struct erofs_device_info *dif;
+ int opt, ret;
+
+ opt = fs_parse(fc, erofs_fs_parameters, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_user_xattr:
+#ifdef CONFIG_EROFS_FS_XATTR
+ if (result.boolean)
+ set_opt(&ctx->opt, XATTR_USER);
+ else
+ clear_opt(&ctx->opt, XATTR_USER);
+#else
+ errorfc(fc, "{,no}user_xattr options not supported");
+#endif
+ break;
+ case Opt_acl:
+#ifdef CONFIG_EROFS_FS_POSIX_ACL
+ if (result.boolean)
+ set_opt(&ctx->opt, POSIX_ACL);
+ else
+ clear_opt(&ctx->opt, POSIX_ACL);
+#else
+ errorfc(fc, "{,no}acl options not supported");
+#endif
+ break;
+ case Opt_cache_strategy:
+#ifdef CONFIG_EROFS_FS_ZIP
+ ctx->opt.cache_strategy = result.uint_32;
+#else
+ errorfc(fc, "compression not supported, cache_strategy ignored");
+#endif
+ break;
+ case Opt_dax:
+ if (!erofs_fc_set_dax_mode(fc, EROFS_MOUNT_DAX_ALWAYS))
+ return -EINVAL;
+ break;
+ case Opt_dax_enum:
+ if (!erofs_fc_set_dax_mode(fc, result.uint_32))
+ return -EINVAL;
+ break;
+ case Opt_device:
+ dif = kzalloc(sizeof(*dif), GFP_KERNEL);
+ if (!dif)
+ return -ENOMEM;
+ dif->path = kstrdup(param->string, GFP_KERNEL);
+ if (!dif->path) {
+ kfree(dif);
+ return -ENOMEM;
+ }
+ down_write(&ctx->devs->rwsem);
+ ret = idr_alloc(&ctx->devs->tree, dif, 0, 0, GFP_KERNEL);
+ up_write(&ctx->devs->rwsem);
+ if (ret < 0) {
+ kfree(dif->path);
+ kfree(dif);
+ return ret;
+ }
+ ++ctx->devs->extra_devices;
+ break;
+#ifdef CONFIG_EROFS_FS_ONDEMAND
+ case Opt_fsid:
+ kfree(ctx->fsid);
+ ctx->fsid = kstrdup(param->string, GFP_KERNEL);
+ if (!ctx->fsid)
+ return -ENOMEM;
+ break;
+ case Opt_domain_id:
+ kfree(ctx->domain_id);
+ ctx->domain_id = kstrdup(param->string, GFP_KERNEL);
+ if (!ctx->domain_id)
+ return -ENOMEM;
+ break;
+#else
+ case Opt_fsid:
+ case Opt_domain_id:
+ errorfc(fc, "%s option not supported", erofs_fs_parameters[opt].name);
+ break;
+#endif
+ default:
+ return -ENOPARAM;
+ }
+ return 0;
+}
+
+static struct inode *erofs_nfs_get_inode(struct super_block *sb,
+ u64 ino, u32 generation)
+{
+ return erofs_iget(sb, ino);
+}
+
+static struct dentry *erofs_fh_to_dentry(struct super_block *sb,
+ struct fid *fid, int fh_len, int fh_type)
+{
+ return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
+ erofs_nfs_get_inode);
+}
+
+static struct dentry *erofs_fh_to_parent(struct super_block *sb,
+ struct fid *fid, int fh_len, int fh_type)
+{
+ return generic_fh_to_parent(sb, fid, fh_len, fh_type,
+ erofs_nfs_get_inode);
+}
+
+static struct dentry *erofs_get_parent(struct dentry *child)
+{
+ erofs_nid_t nid;
+ unsigned int d_type;
+ int err;
+
+ err = erofs_namei(d_inode(child), &dotdot_name, &nid, &d_type);
+ if (err)
+ return ERR_PTR(err);
+ return d_obtain_alias(erofs_iget(child->d_sb, nid));
+}
+
+static const struct export_operations erofs_export_ops = {
+ .fh_to_dentry = erofs_fh_to_dentry,
+ .fh_to_parent = erofs_fh_to_parent,
+ .get_parent = erofs_get_parent,
+};
+
+static int erofs_fc_fill_pseudo_super(struct super_block *sb, struct fs_context *fc)
+{
+ static const struct tree_descr empty_descr = {""};
+
+ return simple_fill_super(sb, EROFS_SUPER_MAGIC, &empty_descr);
+}
+
+static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
+{
+ struct inode *inode;
+ struct erofs_sb_info *sbi;
+ struct erofs_fs_context *ctx = fc->fs_private;
+ int err;
+
+ sb->s_magic = EROFS_SUPER_MAGIC;
+ sb->s_flags |= SB_RDONLY | SB_NOATIME;
+ sb->s_maxbytes = MAX_LFS_FILESIZE;
+ sb->s_op = &erofs_sops;
+
+ sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
+ if (!sbi)
+ return -ENOMEM;
+
+ sb->s_fs_info = sbi;
+ sbi->opt = ctx->opt;
+ sbi->devs = ctx->devs;
+ ctx->devs = NULL;
+ sbi->fsid = ctx->fsid;
+ ctx->fsid = NULL;
+ sbi->domain_id = ctx->domain_id;
+ ctx->domain_id = NULL;
+
+ sbi->blkszbits = PAGE_SHIFT;
+ if (erofs_is_fscache_mode(sb)) {
+ sb->s_blocksize = PAGE_SIZE;
+ sb->s_blocksize_bits = PAGE_SHIFT;
+
+ err = erofs_fscache_register_fs(sb);
+ if (err)
+ return err;
+
+ err = super_setup_bdi(sb);
+ if (err)
+ return err;
+ } else {
+ if (!sb_set_blocksize(sb, PAGE_SIZE)) {
+ errorfc(fc, "failed to set initial blksize");
+ return -EINVAL;
+ }
+
+ sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev,
+ &sbi->dax_part_off,
+ NULL, NULL);
+ }
+
+ err = erofs_read_superblock(sb);
+ if (err)
+ return err;
+
+ if (sb->s_blocksize_bits != sbi->blkszbits) {
+ if (erofs_is_fscache_mode(sb)) {
+ errorfc(fc, "unsupported blksize for fscache mode");
+ return -EINVAL;
+ }
+ if (!sb_set_blocksize(sb, 1 << sbi->blkszbits)) {
+ errorfc(fc, "failed to set erofs blksize");
+ return -EINVAL;
+ }
+ }
+
+ if (test_opt(&sbi->opt, DAX_ALWAYS)) {
+ if (!sbi->dax_dev) {
+ errorfc(fc, "DAX unsupported by block device. Turning off DAX.");
+ clear_opt(&sbi->opt, DAX_ALWAYS);
+ } else if (sbi->blkszbits != PAGE_SHIFT) {
+ errorfc(fc, "unsupported blocksize for DAX");
+ clear_opt(&sbi->opt, DAX_ALWAYS);
+ }
+ }
+
+ sb->s_time_gran = 1;
+ sb->s_xattr = erofs_xattr_handlers;
+ sb->s_export_op = &erofs_export_ops;
+
+ if (test_opt(&sbi->opt, POSIX_ACL))
+ sb->s_flags |= SB_POSIXACL;
+ else
+ sb->s_flags &= ~SB_POSIXACL;
+
+#ifdef CONFIG_EROFS_FS_ZIP
+ xa_init(&sbi->managed_pslots);
+#endif
+
+ inode = erofs_iget(sb, ROOT_NID(sbi));
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ if (!S_ISDIR(inode->i_mode)) {
+ erofs_err(sb, "rootino(nid %llu) is not a directory(i_mode %o)",
+ ROOT_NID(sbi), inode->i_mode);
+ iput(inode);
+ return -EINVAL;
+ }
+
+ sb->s_root = d_make_root(inode);
+ if (!sb->s_root)
+ return -ENOMEM;
+
+ erofs_shrinker_register(sb);
+ if (erofs_sb_has_fragments(sbi) && sbi->packed_nid) {
+ sbi->packed_inode = erofs_iget(sb, sbi->packed_nid);
+ if (IS_ERR(sbi->packed_inode)) {
+ err = PTR_ERR(sbi->packed_inode);
+ sbi->packed_inode = NULL;
+ return err;
+ }
+ }
+ err = erofs_init_managed_cache(sb);
+ if (err)
+ return err;
+
+ err = erofs_xattr_prefixes_init(sb);
+ if (err)
+ return err;
+
+ err = erofs_register_sysfs(sb);
+ if (err)
+ return err;
+
+ erofs_info(sb, "mounted with root inode @ nid %llu.", ROOT_NID(sbi));
+ return 0;
+}
+
+static int erofs_fc_anon_get_tree(struct fs_context *fc)
+{
+ return get_tree_nodev(fc, erofs_fc_fill_pseudo_super);
+}
+
+static int erofs_fc_get_tree(struct fs_context *fc)
+{
+ struct erofs_fs_context *ctx = fc->fs_private;
+
+ if (IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && ctx->fsid)
+ return get_tree_nodev(fc, erofs_fc_fill_super);
+
+ return get_tree_bdev(fc, erofs_fc_fill_super);
+}
+
+static int erofs_fc_reconfigure(struct fs_context *fc)
+{
+ struct super_block *sb = fc->root->d_sb;
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+ struct erofs_fs_context *ctx = fc->fs_private;
+
+ DBG_BUGON(!sb_rdonly(sb));
+
+ if (ctx->fsid || ctx->domain_id)
+ erofs_info(sb, "ignoring reconfiguration for fsid|domain_id.");
+
+ if (test_opt(&ctx->opt, POSIX_ACL))
+ fc->sb_flags |= SB_POSIXACL;
+ else
+ fc->sb_flags &= ~SB_POSIXACL;
+
+ sbi->opt = ctx->opt;
+
+ fc->sb_flags |= SB_RDONLY;
+ return 0;
+}
+
+static int erofs_release_device_info(int id, void *ptr, void *data)
+{
+ struct erofs_device_info *dif = ptr;
+
+ fs_put_dax(dif->dax_dev, NULL);
+ if (dif->bdev)
+ blkdev_put(dif->bdev, &erofs_fs_type);
+ erofs_fscache_unregister_cookie(dif->fscache);
+ dif->fscache = NULL;
+ kfree(dif->path);
+ kfree(dif);
+ return 0;
+}
+
+static void erofs_free_dev_context(struct erofs_dev_context *devs)
+{
+ if (!devs)
+ return;
+ idr_for_each(&devs->tree, &erofs_release_device_info, NULL);
+ idr_destroy(&devs->tree);
+ kfree(devs);
+}
+
+static void erofs_fc_free(struct fs_context *fc)
+{
+ struct erofs_fs_context *ctx = fc->fs_private;
+
+ erofs_free_dev_context(ctx->devs);
+ kfree(ctx->fsid);
+ kfree(ctx->domain_id);
+ kfree(ctx);
+}
+
+static const struct fs_context_operations erofs_context_ops = {
+ .parse_param = erofs_fc_parse_param,
+ .get_tree = erofs_fc_get_tree,
+ .reconfigure = erofs_fc_reconfigure,
+ .free = erofs_fc_free,
+};
+
+static const struct fs_context_operations erofs_anon_context_ops = {
+ .get_tree = erofs_fc_anon_get_tree,
+};
+
+static int erofs_init_fs_context(struct fs_context *fc)
+{
+ struct erofs_fs_context *ctx;
+
+ /* pseudo mount for anon inodes */
+ if (fc->sb_flags & SB_KERNMOUNT) {
+ fc->ops = &erofs_anon_context_ops;
+ return 0;
+ }
+
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+ ctx->devs = kzalloc(sizeof(struct erofs_dev_context), GFP_KERNEL);
+ if (!ctx->devs) {
+ kfree(ctx);
+ return -ENOMEM;
+ }
+ fc->fs_private = ctx;
+
+ idr_init(&ctx->devs->tree);
+ init_rwsem(&ctx->devs->rwsem);
+ erofs_default_options(ctx);
+ fc->ops = &erofs_context_ops;
+ return 0;
+}
+
+static void erofs_kill_sb(struct super_block *sb)
+{
+ struct erofs_sb_info *sbi;
+
+ /* pseudo mount for anon inodes */
+ if (sb->s_flags & SB_KERNMOUNT) {
+ kill_anon_super(sb);
+ return;
+ }
+
+ if (erofs_is_fscache_mode(sb))
+ kill_anon_super(sb);
+ else
+ kill_block_super(sb);
+
+ sbi = EROFS_SB(sb);
+ if (!sbi)
+ return;
+
+ erofs_free_dev_context(sbi->devs);
+ fs_put_dax(sbi->dax_dev, NULL);
+ erofs_fscache_unregister_fs(sb);
+ kfree(sbi->fsid);
+ kfree(sbi->domain_id);
+ kfree(sbi);
+ sb->s_fs_info = NULL;
+}
+
+static void erofs_put_super(struct super_block *sb)
+{
+ struct erofs_sb_info *const sbi = EROFS_SB(sb);
+
+ DBG_BUGON(!sbi);
+
+ erofs_unregister_sysfs(sb);
+ erofs_shrinker_unregister(sb);
+ erofs_xattr_prefixes_cleanup(sb);
+#ifdef CONFIG_EROFS_FS_ZIP
+ iput(sbi->managed_cache);
+ sbi->managed_cache = NULL;
+#endif
+ iput(sbi->packed_inode);
+ sbi->packed_inode = NULL;
+ erofs_free_dev_context(sbi->devs);
+ sbi->devs = NULL;
+ erofs_fscache_unregister_fs(sb);
+}
+
+struct file_system_type erofs_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "erofs",
+ .init_fs_context = erofs_init_fs_context,
+ .kill_sb = erofs_kill_sb,
+ .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
+};
+MODULE_ALIAS_FS("erofs");
+
+static int __init erofs_module_init(void)
+{
+ int err;
+
+ erofs_check_ondisk_layout_definitions();
+
+ erofs_inode_cachep = kmem_cache_create("erofs_inode",
+ sizeof(struct erofs_inode), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT,
+ erofs_inode_init_once);
+ if (!erofs_inode_cachep)
+ return -ENOMEM;
+
+ err = erofs_init_shrinker();
+ if (err)
+ goto shrinker_err;
+
+ err = z_erofs_lzma_init();
+ if (err)
+ goto lzma_err;
+
+ err = z_erofs_deflate_init();
+ if (err)
+ goto deflate_err;
+
+ erofs_pcpubuf_init();
+ err = z_erofs_init_zip_subsystem();
+ if (err)
+ goto zip_err;
+
+ err = erofs_init_sysfs();
+ if (err)
+ goto sysfs_err;
+
+ err = register_filesystem(&erofs_fs_type);
+ if (err)
+ goto fs_err;
+
+ return 0;
+
+fs_err:
+ erofs_exit_sysfs();
+sysfs_err:
+ z_erofs_exit_zip_subsystem();
+zip_err:
+ z_erofs_deflate_exit();
+deflate_err:
+ z_erofs_lzma_exit();
+lzma_err:
+ erofs_exit_shrinker();
+shrinker_err:
+ kmem_cache_destroy(erofs_inode_cachep);
+ return err;
+}
+
+static void __exit erofs_module_exit(void)
+{
+ unregister_filesystem(&erofs_fs_type);
+
+	/* Ensure all RCU-freed inodes / pclusters are safe to be destroyed. */
+ rcu_barrier();
+
+ erofs_exit_sysfs();
+ z_erofs_exit_zip_subsystem();
+ z_erofs_deflate_exit();
+ z_erofs_lzma_exit();
+ erofs_exit_shrinker();
+ kmem_cache_destroy(erofs_inode_cachep);
+ erofs_pcpubuf_exit();
+}
+
+static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+ struct super_block *sb = dentry->d_sb;
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+ u64 id = 0;
+
+ if (!erofs_is_fscache_mode(sb))
+ id = huge_encode_dev(sb->s_bdev->bd_dev);
+
+ buf->f_type = sb->s_magic;
+ buf->f_bsize = sb->s_blocksize;
+ buf->f_blocks = sbi->total_blocks;
+ buf->f_bfree = buf->f_bavail = 0;
+
+ buf->f_files = ULLONG_MAX;
+ buf->f_ffree = ULLONG_MAX - sbi->inos;
+
+ buf->f_namelen = EROFS_NAME_LEN;
+
+ buf->f_fsid = u64_to_fsid(id);
+ return 0;
+}
+
+static int erofs_show_options(struct seq_file *seq, struct dentry *root)
+{
+ struct erofs_sb_info *sbi = EROFS_SB(root->d_sb);
+ struct erofs_mount_opts *opt = &sbi->opt;
+
+#ifdef CONFIG_EROFS_FS_XATTR
+ if (test_opt(opt, XATTR_USER))
+ seq_puts(seq, ",user_xattr");
+ else
+ seq_puts(seq, ",nouser_xattr");
+#endif
+#ifdef CONFIG_EROFS_FS_POSIX_ACL
+ if (test_opt(opt, POSIX_ACL))
+ seq_puts(seq, ",acl");
+ else
+ seq_puts(seq, ",noacl");
+#endif
+#ifdef CONFIG_EROFS_FS_ZIP
+ if (opt->cache_strategy == EROFS_ZIP_CACHE_DISABLED)
+ seq_puts(seq, ",cache_strategy=disabled");
+ else if (opt->cache_strategy == EROFS_ZIP_CACHE_READAHEAD)
+ seq_puts(seq, ",cache_strategy=readahead");
+ else if (opt->cache_strategy == EROFS_ZIP_CACHE_READAROUND)
+ seq_puts(seq, ",cache_strategy=readaround");
+#endif
+ if (test_opt(opt, DAX_ALWAYS))
+ seq_puts(seq, ",dax=always");
+ if (test_opt(opt, DAX_NEVER))
+ seq_puts(seq, ",dax=never");
+#ifdef CONFIG_EROFS_FS_ONDEMAND
+ if (sbi->fsid)
+ seq_printf(seq, ",fsid=%s", sbi->fsid);
+ if (sbi->domain_id)
+ seq_printf(seq, ",domain_id=%s", sbi->domain_id);
+#endif
+ return 0;
+}
+
+const struct super_operations erofs_sops = {
+ .put_super = erofs_put_super,
+ .alloc_inode = erofs_alloc_inode,
+ .free_inode = erofs_free_inode,
+ .statfs = erofs_statfs,
+ .show_options = erofs_show_options,
+};
+
+module_init(erofs_module_init);
+module_exit(erofs_module_exit);
+
+MODULE_DESCRIPTION("Enhanced ROM File System");
+MODULE_AUTHOR("Gao Xiang, Chao Yu, Miao Xie, CONSUMER BG, HUAWEI Inc.");
+MODULE_LICENSE("GPL");
diff --git a/fs/erofs/sysfs.c b/fs/erofs/sysfs.c
new file mode 100644
index 000000000..435e515c0
--- /dev/null
+++ b/fs/erofs/sysfs.c
@@ -0,0 +1,277 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C), 2008-2021, OPPO Mobile Comm Corp., Ltd.
+ * https://www.oppo.com/
+ */
+#include <linux/sysfs.h>
+#include <linux/kobject.h>
+
+#include "internal.h"
+
+enum {
+ attr_feature,
+ attr_pointer_ui,
+ attr_pointer_bool,
+};
+
+enum {
+ struct_erofs_sb_info,
+ struct_erofs_mount_opts,
+};
+
+struct erofs_attr {
+ struct attribute attr;
+ short attr_id;
+ int struct_type, offset;
+};
+
+#define EROFS_ATTR(_name, _mode, _id) \
+static struct erofs_attr erofs_attr_##_name = { \
+ .attr = {.name = __stringify(_name), .mode = _mode }, \
+ .attr_id = attr_##_id, \
+}
+#define EROFS_ATTR_FUNC(_name, _mode) EROFS_ATTR(_name, _mode, _name)
+#define EROFS_ATTR_FEATURE(_name) EROFS_ATTR(_name, 0444, feature)
+
+#define EROFS_ATTR_OFFSET(_name, _mode, _id, _struct) \
+static struct erofs_attr erofs_attr_##_name = { \
+ .attr = {.name = __stringify(_name), .mode = _mode }, \
+ .attr_id = attr_##_id, \
+ .struct_type = struct_##_struct, \
+ .offset = offsetof(struct _struct, _name),\
+}
+
+#define EROFS_ATTR_RW(_name, _id, _struct) \
+ EROFS_ATTR_OFFSET(_name, 0644, _id, _struct)
+
+#define EROFS_RO_ATTR(_name, _id, _struct) \
+ EROFS_ATTR_OFFSET(_name, 0444, _id, _struct)
+
+#define EROFS_ATTR_RW_UI(_name, _struct) \
+ EROFS_ATTR_RW(_name, pointer_ui, _struct)
+
+#define EROFS_ATTR_RW_BOOL(_name, _struct) \
+ EROFS_ATTR_RW(_name, pointer_bool, _struct)
+
+#define ATTR_LIST(name) (&erofs_attr_##name.attr)
+
+#ifdef CONFIG_EROFS_FS_ZIP
+EROFS_ATTR_RW_UI(sync_decompress, erofs_mount_opts);
+#endif
+
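+/*
+ * These attributes surface under /sys/fs/erofs/ (the "erofs" kset below is
+ * parented to fs_kobj): per-mount tunables such as "sync_decompress" live
+ * in /sys/fs/erofs/<name>/, where <name> is the block device id or the
+ * fsid (optionally prefixed with the domain_id) for fscache mounts, while
+ * the read-only feature flags live in /sys/fs/erofs/features/.
+ */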
+static struct attribute *erofs_attrs[] = {
+#ifdef CONFIG_EROFS_FS_ZIP
+ ATTR_LIST(sync_decompress),
+#endif
+ NULL,
+};
+ATTRIBUTE_GROUPS(erofs);
+
+/* Features this copy of erofs supports */
+EROFS_ATTR_FEATURE(zero_padding);
+EROFS_ATTR_FEATURE(compr_cfgs);
+EROFS_ATTR_FEATURE(big_pcluster);
+EROFS_ATTR_FEATURE(chunked_file);
+EROFS_ATTR_FEATURE(device_table);
+EROFS_ATTR_FEATURE(compr_head2);
+EROFS_ATTR_FEATURE(sb_chksum);
+EROFS_ATTR_FEATURE(ztailpacking);
+EROFS_ATTR_FEATURE(fragments);
+EROFS_ATTR_FEATURE(dedupe);
+
+static struct attribute *erofs_feat_attrs[] = {
+ ATTR_LIST(zero_padding),
+ ATTR_LIST(compr_cfgs),
+ ATTR_LIST(big_pcluster),
+ ATTR_LIST(chunked_file),
+ ATTR_LIST(device_table),
+ ATTR_LIST(compr_head2),
+ ATTR_LIST(sb_chksum),
+ ATTR_LIST(ztailpacking),
+ ATTR_LIST(fragments),
+ ATTR_LIST(dedupe),
+ NULL,
+};
+ATTRIBUTE_GROUPS(erofs_feat);
+
+static unsigned char *__struct_ptr(struct erofs_sb_info *sbi,
+ int struct_type, int offset)
+{
+ if (struct_type == struct_erofs_sb_info)
+ return (unsigned char *)sbi + offset;
+ if (struct_type == struct_erofs_mount_opts)
+ return (unsigned char *)&sbi->opt + offset;
+ return NULL;
+}
+
+static ssize_t erofs_attr_show(struct kobject *kobj,
+ struct attribute *attr, char *buf)
+{
+ struct erofs_sb_info *sbi = container_of(kobj, struct erofs_sb_info,
+ s_kobj);
+ struct erofs_attr *a = container_of(attr, struct erofs_attr, attr);
+ unsigned char *ptr = __struct_ptr(sbi, a->struct_type, a->offset);
+
+ switch (a->attr_id) {
+ case attr_feature:
+ return sysfs_emit(buf, "supported\n");
+ case attr_pointer_ui:
+ if (!ptr)
+ return 0;
+ return sysfs_emit(buf, "%u\n", *(unsigned int *)ptr);
+ case attr_pointer_bool:
+ if (!ptr)
+ return 0;
+ return sysfs_emit(buf, "%d\n", *(bool *)ptr);
+ }
+ return 0;
+}
+
+static ssize_t erofs_attr_store(struct kobject *kobj, struct attribute *attr,
+ const char *buf, size_t len)
+{
+ struct erofs_sb_info *sbi = container_of(kobj, struct erofs_sb_info,
+ s_kobj);
+ struct erofs_attr *a = container_of(attr, struct erofs_attr, attr);
+ unsigned char *ptr = __struct_ptr(sbi, a->struct_type, a->offset);
+ unsigned long t;
+ int ret;
+
+ switch (a->attr_id) {
+ case attr_pointer_ui:
+ if (!ptr)
+ return 0;
+ ret = kstrtoul(skip_spaces(buf), 0, &t);
+ if (ret)
+ return ret;
+ if (t != (unsigned int)t)
+ return -ERANGE;
+#ifdef CONFIG_EROFS_FS_ZIP
+ if (!strcmp(a->attr.name, "sync_decompress") &&
+ (t > EROFS_SYNC_DECOMPRESS_FORCE_OFF))
+ return -EINVAL;
+#endif
+ *(unsigned int *)ptr = t;
+ return len;
+ case attr_pointer_bool:
+ if (!ptr)
+ return 0;
+ ret = kstrtoul(skip_spaces(buf), 0, &t);
+ if (ret)
+ return ret;
+ if (t != 0 && t != 1)
+ return -EINVAL;
+ *(bool *)ptr = !!t;
+ return len;
+ }
+ return 0;
+}
+
+static void erofs_sb_release(struct kobject *kobj)
+{
+ struct erofs_sb_info *sbi = container_of(kobj, struct erofs_sb_info,
+ s_kobj);
+ complete(&sbi->s_kobj_unregister);
+}
+
+static const struct sysfs_ops erofs_attr_ops = {
+ .show = erofs_attr_show,
+ .store = erofs_attr_store,
+};
+
+static const struct kobj_type erofs_sb_ktype = {
+ .default_groups = erofs_groups,
+ .sysfs_ops = &erofs_attr_ops,
+ .release = erofs_sb_release,
+};
+
+static const struct kobj_type erofs_ktype = {
+ .sysfs_ops = &erofs_attr_ops,
+};
+
+static struct kset erofs_root = {
+ .kobj = {.ktype = &erofs_ktype},
+};
+
+static const struct kobj_type erofs_feat_ktype = {
+ .default_groups = erofs_feat_groups,
+ .sysfs_ops = &erofs_attr_ops,
+};
+
+static struct kobject erofs_feat = {
+ .kset = &erofs_root,
+};
+
+int erofs_register_sysfs(struct super_block *sb)
+{
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+ char *name;
+ char *str = NULL;
+ int err;
+
+ if (erofs_is_fscache_mode(sb)) {
+ if (sbi->domain_id) {
+ str = kasprintf(GFP_KERNEL, "%s,%s", sbi->domain_id,
+ sbi->fsid);
+ if (!str)
+ return -ENOMEM;
+ name = str;
+ } else {
+ name = sbi->fsid;
+ }
+ } else {
+ name = sb->s_id;
+ }
+ sbi->s_kobj.kset = &erofs_root;
+ init_completion(&sbi->s_kobj_unregister);
+ err = kobject_init_and_add(&sbi->s_kobj, &erofs_sb_ktype, NULL, "%s", name);
+ kfree(str);
+ if (err)
+ goto put_sb_kobj;
+ return 0;
+
+put_sb_kobj:
+ kobject_put(&sbi->s_kobj);
+ wait_for_completion(&sbi->s_kobj_unregister);
+ return err;
+}
+
+void erofs_unregister_sysfs(struct super_block *sb)
+{
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+
+ if (sbi->s_kobj.state_in_sysfs) {
+ kobject_del(&sbi->s_kobj);
+ kobject_put(&sbi->s_kobj);
+ wait_for_completion(&sbi->s_kobj_unregister);
+ }
+}
+
+int __init erofs_init_sysfs(void)
+{
+ int ret;
+
+ kobject_set_name(&erofs_root.kobj, "erofs");
+ erofs_root.kobj.parent = fs_kobj;
+ ret = kset_register(&erofs_root);
+ if (ret)
+ goto root_err;
+
+ ret = kobject_init_and_add(&erofs_feat, &erofs_feat_ktype,
+ NULL, "features");
+ if (ret)
+ goto feat_err;
+ return ret;
+
+feat_err:
+ kobject_put(&erofs_feat);
+ kset_unregister(&erofs_root);
+root_err:
+ return ret;
+}
+
+void erofs_exit_sysfs(void)
+{
+ kobject_put(&erofs_feat);
+ kset_unregister(&erofs_root);
+}
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
new file mode 100644
index 000000000..4256a8571
--- /dev/null
+++ b/fs/erofs/utils.c
@@ -0,0 +1,282 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2018 HUAWEI, Inc.
+ * https://www.huawei.com/
+ */
+#include "internal.h"
+
+struct page *erofs_allocpage(struct page **pagepool, gfp_t gfp)
+{
+ struct page *page = *pagepool;
+
+ if (page) {
+ DBG_BUGON(page_ref_count(page) != 1);
+ *pagepool = (struct page *)page_private(page);
+ } else {
+ page = alloc_page(gfp);
+ }
+ return page;
+}
+
+void erofs_release_pages(struct page **pagepool)
+{
+ while (*pagepool) {
+ struct page *page = *pagepool;
+
+ *pagepool = (struct page *)page_private(page);
+ put_page(page);
+ }
+}
+
+#ifdef CONFIG_EROFS_FS_ZIP
+/* global shrink count (for all mounted EROFS instances) */
+static atomic_long_t erofs_global_shrink_cnt;
+
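+/*
+ * Reference counting overview: a workgroup whose lockref drops to zero
+ * stays cached in ->managed_pslots and becomes reclaimable;
+ * erofs_global_shrink_cnt counts such zero-referenced workgroups across
+ * all mounts so the shrinker can report them.  Taking a reference on a
+ * zero-referenced workgroup (below) decrements the counter again, and
+ * erofs_workgroup_put() increments it when the last reference goes away.
+ */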
+static bool erofs_workgroup_get(struct erofs_workgroup *grp)
+{
+ if (lockref_get_not_zero(&grp->lockref))
+ return true;
+
+ spin_lock(&grp->lockref.lock);
+ if (__lockref_is_dead(&grp->lockref)) {
+ spin_unlock(&grp->lockref.lock);
+ return false;
+ }
+
+ if (!grp->lockref.count++)
+ atomic_long_dec(&erofs_global_shrink_cnt);
+ spin_unlock(&grp->lockref.lock);
+ return true;
+}
+
+struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb,
+ pgoff_t index)
+{
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+ struct erofs_workgroup *grp;
+
+repeat:
+ rcu_read_lock();
+ grp = xa_load(&sbi->managed_pslots, index);
+ if (grp) {
+ if (!erofs_workgroup_get(grp)) {
+ /* prefer to relax rcu read side */
+ rcu_read_unlock();
+ goto repeat;
+ }
+
+ DBG_BUGON(index != grp->index);
+ }
+ rcu_read_unlock();
+ return grp;
+}
+
+struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb,
+ struct erofs_workgroup *grp)
+{
+ struct erofs_sb_info *const sbi = EROFS_SB(sb);
+ struct erofs_workgroup *pre;
+
+ DBG_BUGON(grp->lockref.count < 1);
+repeat:
+ xa_lock(&sbi->managed_pslots);
+ pre = __xa_cmpxchg(&sbi->managed_pslots, grp->index,
+ NULL, grp, GFP_NOFS);
+ if (pre) {
+ if (xa_is_err(pre)) {
+ pre = ERR_PTR(xa_err(pre));
+ } else if (!erofs_workgroup_get(pre)) {
+ /* try to legitimize the current in-tree one */
+ xa_unlock(&sbi->managed_pslots);
+ cond_resched();
+ goto repeat;
+ }
+ grp = pre;
+ }
+ xa_unlock(&sbi->managed_pslots);
+ return grp;
+}
+
+static void __erofs_workgroup_free(struct erofs_workgroup *grp)
+{
+ atomic_long_dec(&erofs_global_shrink_cnt);
+ erofs_workgroup_free_rcu(grp);
+}
+
+void erofs_workgroup_put(struct erofs_workgroup *grp)
+{
+ if (lockref_put_or_lock(&grp->lockref))
+ return;
+
+ DBG_BUGON(__lockref_is_dead(&grp->lockref));
+ if (grp->lockref.count == 1)
+ atomic_long_inc(&erofs_global_shrink_cnt);
+ --grp->lockref.count;
+ spin_unlock(&grp->lockref.lock);
+}
+
+static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi,
+ struct erofs_workgroup *grp)
+{
+ int free = false;
+
+ spin_lock(&grp->lockref.lock);
+ if (grp->lockref.count)
+ goto out;
+
+	/*
+	 * Note that all cached pages should be detached before the workgroup
+	 * is deleted from the XArray.  Otherwise, some cached pages could
+	 * still be attached to the orphaned old workgroup while the new one
+	 * is already available in the tree.
+	 */
+ if (erofs_try_to_free_all_cached_pages(sbi, grp))
+ goto out;
+
+	/*
+	 * Erasing from the XArray cannot fail once the workgroup is frozen;
+	 * however, to catch unexpected race conditions early, add a
+	 * DBG_BUGON to observe this in advance.
+	 */
+ DBG_BUGON(__xa_erase(&sbi->managed_pslots, grp->index) != grp);
+
+ lockref_mark_dead(&grp->lockref);
+ free = true;
+out:
+ spin_unlock(&grp->lockref.lock);
+ if (free)
+ __erofs_workgroup_free(grp);
+ return free;
+}
+
+static unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi,
+ unsigned long nr_shrink)
+{
+ struct erofs_workgroup *grp;
+ unsigned int freed = 0;
+ unsigned long index;
+
+ xa_lock(&sbi->managed_pslots);
+ xa_for_each(&sbi->managed_pslots, index, grp) {
+ /* try to shrink each valid workgroup */
+ if (!erofs_try_to_release_workgroup(sbi, grp))
+ continue;
+ xa_unlock(&sbi->managed_pslots);
+
+ ++freed;
+ if (!--nr_shrink)
+ return freed;
+ xa_lock(&sbi->managed_pslots);
+ }
+ xa_unlock(&sbi->managed_pslots);
+ return freed;
+}
+
+/* protected by 'erofs_sb_list_lock' */
+static unsigned int shrinker_run_no;
+
+/* protects the mounted 'erofs_sb_list' */
+static DEFINE_SPINLOCK(erofs_sb_list_lock);
+static LIST_HEAD(erofs_sb_list);
+
+void erofs_shrinker_register(struct super_block *sb)
+{
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+
+ mutex_init(&sbi->umount_mutex);
+
+ spin_lock(&erofs_sb_list_lock);
+ list_add(&sbi->list, &erofs_sb_list);
+ spin_unlock(&erofs_sb_list_lock);
+}
+
+void erofs_shrinker_unregister(struct super_block *sb)
+{
+ struct erofs_sb_info *const sbi = EROFS_SB(sb);
+
+ mutex_lock(&sbi->umount_mutex);
+ /* clean up all remaining workgroups in memory */
+ erofs_shrink_workstation(sbi, ~0UL);
+
+ spin_lock(&erofs_sb_list_lock);
+ list_del(&sbi->list);
+ spin_unlock(&erofs_sb_list_lock);
+ mutex_unlock(&sbi->umount_mutex);
+}
+
+static unsigned long erofs_shrink_count(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ return atomic_long_read(&erofs_global_shrink_cnt);
+}
+
+static unsigned long erofs_shrink_scan(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct erofs_sb_info *sbi;
+ struct list_head *p;
+
+ unsigned long nr = sc->nr_to_scan;
+ unsigned int run_no;
+ unsigned long freed = 0;
+
+ spin_lock(&erofs_sb_list_lock);
+ do {
+ run_no = ++shrinker_run_no;
+ } while (run_no == 0);
+
+ /* Iterate over all mounted superblocks and try to shrink them */
+ p = erofs_sb_list.next;
+ while (p != &erofs_sb_list) {
+ sbi = list_entry(p, struct erofs_sb_info, list);
+
+ /*
+ * We move the ones we do to the end of the list, so we stop
+ * when we see one we have already done.
+ */
+ if (sbi->shrinker_run_no == run_no)
+ break;
+
+ if (!mutex_trylock(&sbi->umount_mutex)) {
+ p = p->next;
+ continue;
+ }
+
+ spin_unlock(&erofs_sb_list_lock);
+ sbi->shrinker_run_no = run_no;
+
+ freed += erofs_shrink_workstation(sbi, nr - freed);
+
+ spin_lock(&erofs_sb_list_lock);
+ /* Get the next list element before we move this one */
+ p = p->next;
+
+ /*
+ * Move this one to the end of the list to provide some
+ * fairness.
+ */
+ list_move_tail(&sbi->list, &erofs_sb_list);
+ mutex_unlock(&sbi->umount_mutex);
+
+ if (freed >= nr)
+ break;
+ }
+ spin_unlock(&erofs_sb_list_lock);
+ return freed;
+}
+
+static struct shrinker erofs_shrinker_info = {
+ .scan_objects = erofs_shrink_scan,
+ .count_objects = erofs_shrink_count,
+ .seeks = DEFAULT_SEEKS,
+};
+
+int __init erofs_init_shrinker(void)
+{
+ return register_shrinker(&erofs_shrinker_info, "erofs-shrinker");
+}
+
+void erofs_exit_shrinker(void)
+{
+ unregister_shrinker(&erofs_shrinker_info);
+}
+#endif	/* CONFIG_EROFS_FS_ZIP */
diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c
new file mode 100644
index 000000000..09d341675
--- /dev/null
+++ b/fs/erofs/xattr.c
@@ -0,0 +1,560 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2017-2018 HUAWEI, Inc.
+ * https://www.huawei.com/
+ * Copyright (C) 2021-2022, Alibaba Cloud
+ */
+#include <linux/security.h>
+#include <linux/xxhash.h>
+#include "xattr.h"
+
+struct erofs_xattr_iter {
+ struct super_block *sb;
+ struct erofs_buf buf;
+ erofs_off_t pos;
+ void *kaddr;
+
+ char *buffer;
+ int buffer_size, buffer_ofs;
+
+ /* getxattr */
+ int index, infix_len;
+ struct qstr name;
+
+ /* listxattr */
+ struct dentry *dentry;
+};
+
+static int erofs_init_inode_xattrs(struct inode *inode)
+{
+ struct erofs_inode *const vi = EROFS_I(inode);
+ struct erofs_xattr_iter it;
+ unsigned int i;
+ struct erofs_xattr_ibody_header *ih;
+ struct super_block *sb = inode->i_sb;
+ int ret = 0;
+
+	/* in most cases, the xattrs of this inode have already been initialized */
+ if (test_bit(EROFS_I_EA_INITED_BIT, &vi->flags)) {
+ /*
+ * paired with smp_mb() at the end of the function to ensure
+ * fields will only be observed after the bit is set.
+ */
+ smp_mb();
+ return 0;
+ }
+
+ if (wait_on_bit_lock(&vi->flags, EROFS_I_BL_XATTR_BIT, TASK_KILLABLE))
+ return -ERESTARTSYS;
+
+ /* someone has initialized xattrs for us? */
+ if (test_bit(EROFS_I_EA_INITED_BIT, &vi->flags))
+ goto out_unlock;
+
+	/*
+	 * Bypass all xattr operations if ->xattr_isize is not greater than
+	 * sizeof(struct erofs_xattr_ibody_header).  In detail:
+	 * 1) if it cannot even hold an erofs_xattr_ibody_header, then
+	 *    ->xattr_isize must be 0 (i.e. the inode has no xattrs at all);
+	 * 2) if it holds exactly an erofs_xattr_ibody_header, the layout is
+	 *    currently undefined on disk (it may be used later with a new
+	 *    sb feature).
+	 */
+ if (vi->xattr_isize == sizeof(struct erofs_xattr_ibody_header)) {
+ erofs_err(sb,
+ "xattr_isize %d of nid %llu is not supported yet",
+ vi->xattr_isize, vi->nid);
+ ret = -EOPNOTSUPP;
+ goto out_unlock;
+ } else if (vi->xattr_isize < sizeof(struct erofs_xattr_ibody_header)) {
+ if (vi->xattr_isize) {
+ erofs_err(sb, "bogus xattr ibody @ nid %llu", vi->nid);
+ DBG_BUGON(1);
+ ret = -EFSCORRUPTED;
+ goto out_unlock; /* xattr ondisk layout error */
+ }
+ ret = -ENOATTR;
+ goto out_unlock;
+ }
+
+ it.buf = __EROFS_BUF_INITIALIZER;
+ erofs_init_metabuf(&it.buf, sb);
+ it.pos = erofs_iloc(inode) + vi->inode_isize;
+
+ /* read in shared xattr array (non-atomic, see kmalloc below) */
+ it.kaddr = erofs_bread(&it.buf, erofs_blknr(sb, it.pos), EROFS_KMAP);
+ if (IS_ERR(it.kaddr)) {
+ ret = PTR_ERR(it.kaddr);
+ goto out_unlock;
+ }
+
+ ih = it.kaddr + erofs_blkoff(sb, it.pos);
+ vi->xattr_name_filter = le32_to_cpu(ih->h_name_filter);
+ vi->xattr_shared_count = ih->h_shared_count;
+ vi->xattr_shared_xattrs = kmalloc_array(vi->xattr_shared_count,
+ sizeof(uint), GFP_KERNEL);
+ if (!vi->xattr_shared_xattrs) {
+ erofs_put_metabuf(&it.buf);
+ ret = -ENOMEM;
+ goto out_unlock;
+ }
+
+ /* let's skip ibody header */
+ it.pos += sizeof(struct erofs_xattr_ibody_header);
+
+ for (i = 0; i < vi->xattr_shared_count; ++i) {
+ it.kaddr = erofs_bread(&it.buf, erofs_blknr(sb, it.pos),
+ EROFS_KMAP);
+ if (IS_ERR(it.kaddr)) {
+ kfree(vi->xattr_shared_xattrs);
+ vi->xattr_shared_xattrs = NULL;
+ ret = PTR_ERR(it.kaddr);
+ goto out_unlock;
+ }
+ vi->xattr_shared_xattrs[i] = le32_to_cpu(*(__le32 *)
+ (it.kaddr + erofs_blkoff(sb, it.pos)));
+ it.pos += sizeof(__le32);
+ }
+ erofs_put_metabuf(&it.buf);
+
+ /* paired with smp_mb() at the beginning of the function. */
+ smp_mb();
+ set_bit(EROFS_I_EA_INITED_BIT, &vi->flags);
+
+out_unlock:
+ clear_and_wake_up_bit(EROFS_I_BL_XATTR_BIT, &vi->flags);
+ return ret;
+}
+
+static bool erofs_xattr_user_list(struct dentry *dentry)
+{
+ return test_opt(&EROFS_SB(dentry->d_sb)->opt, XATTR_USER);
+}
+
+static bool erofs_xattr_trusted_list(struct dentry *dentry)
+{
+ return capable(CAP_SYS_ADMIN);
+}
+
+static int erofs_xattr_generic_get(const struct xattr_handler *handler,
+ struct dentry *unused, struct inode *inode,
+ const char *name, void *buffer, size_t size)
+{
+ if (handler->flags == EROFS_XATTR_INDEX_USER &&
+ !test_opt(&EROFS_I_SB(inode)->opt, XATTR_USER))
+ return -EOPNOTSUPP;
+
+ return erofs_getxattr(inode, handler->flags, name, buffer, size);
+}
+
+const struct xattr_handler erofs_xattr_user_handler = {
+ .prefix = XATTR_USER_PREFIX,
+ .flags = EROFS_XATTR_INDEX_USER,
+ .list = erofs_xattr_user_list,
+ .get = erofs_xattr_generic_get,
+};
+
+const struct xattr_handler erofs_xattr_trusted_handler = {
+ .prefix = XATTR_TRUSTED_PREFIX,
+ .flags = EROFS_XATTR_INDEX_TRUSTED,
+ .list = erofs_xattr_trusted_list,
+ .get = erofs_xattr_generic_get,
+};
+
+#ifdef CONFIG_EROFS_FS_SECURITY
+const struct xattr_handler __maybe_unused erofs_xattr_security_handler = {
+ .prefix = XATTR_SECURITY_PREFIX,
+ .flags = EROFS_XATTR_INDEX_SECURITY,
+ .get = erofs_xattr_generic_get,
+};
+#endif
+
+const struct xattr_handler *erofs_xattr_handlers[] = {
+ &erofs_xattr_user_handler,
+ &erofs_xattr_trusted_handler,
+#ifdef CONFIG_EROFS_FS_SECURITY
+ &erofs_xattr_security_handler,
+#endif
+ NULL,
+};
+
+static int erofs_xattr_copy_to_buffer(struct erofs_xattr_iter *it,
+ unsigned int len)
+{
+ unsigned int slice, processed;
+ struct super_block *sb = it->sb;
+ void *src;
+
+ for (processed = 0; processed < len; processed += slice) {
+ it->kaddr = erofs_bread(&it->buf, erofs_blknr(sb, it->pos),
+ EROFS_KMAP);
+ if (IS_ERR(it->kaddr))
+ return PTR_ERR(it->kaddr);
+
+ src = it->kaddr + erofs_blkoff(sb, it->pos);
+ slice = min_t(unsigned int, sb->s_blocksize -
+ erofs_blkoff(sb, it->pos), len - processed);
+ memcpy(it->buffer + it->buffer_ofs, src, slice);
+ it->buffer_ofs += slice;
+ it->pos += slice;
+ }
+ return 0;
+}
+
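+/*
+ * Long name prefixes: if EROFS_XATTR_LONG_PREFIX is set in e_name_index,
+ * the low bits index sbi->xattr_prefixes[], whose entry supplies the real
+ * base handler index plus an "infix" string that logically sits between
+ * the handler prefix and the name bytes stored on disk.
+ */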
+static int erofs_listxattr_foreach(struct erofs_xattr_iter *it)
+{
+ struct erofs_xattr_entry entry;
+ unsigned int base_index, name_total, prefix_len, infix_len = 0;
+ const char *prefix, *infix = NULL;
+ int err;
+
+ /* 1. handle xattr entry */
+ entry = *(struct erofs_xattr_entry *)
+ (it->kaddr + erofs_blkoff(it->sb, it->pos));
+ it->pos += sizeof(struct erofs_xattr_entry);
+
+ base_index = entry.e_name_index;
+ if (entry.e_name_index & EROFS_XATTR_LONG_PREFIX) {
+ struct erofs_sb_info *sbi = EROFS_SB(it->sb);
+ struct erofs_xattr_prefix_item *pf = sbi->xattr_prefixes +
+ (entry.e_name_index & EROFS_XATTR_LONG_PREFIX_MASK);
+
+ if (pf >= sbi->xattr_prefixes + sbi->xattr_prefix_count)
+ return 0;
+ infix = pf->prefix->infix;
+ infix_len = pf->infix_len;
+ base_index = pf->prefix->base_index;
+ }
+
+ prefix = erofs_xattr_prefix(base_index, it->dentry);
+ if (!prefix)
+ return 0;
+ prefix_len = strlen(prefix);
+ name_total = prefix_len + infix_len + entry.e_name_len + 1;
+
+ if (!it->buffer) {
+ it->buffer_ofs += name_total;
+ return 0;
+ }
+
+ if (it->buffer_ofs + name_total > it->buffer_size)
+ return -ERANGE;
+
+ memcpy(it->buffer + it->buffer_ofs, prefix, prefix_len);
+ memcpy(it->buffer + it->buffer_ofs + prefix_len, infix, infix_len);
+ it->buffer_ofs += prefix_len + infix_len;
+
+ /* 2. handle xattr name */
+ err = erofs_xattr_copy_to_buffer(it, entry.e_name_len);
+ if (err)
+ return err;
+
+ it->buffer[it->buffer_ofs++] = '\0';
+ return 0;
+}
+
+static int erofs_getxattr_foreach(struct erofs_xattr_iter *it)
+{
+ struct super_block *sb = it->sb;
+ struct erofs_xattr_entry entry;
+ unsigned int slice, processed, value_sz;
+
+ /* 1. handle xattr entry */
+ entry = *(struct erofs_xattr_entry *)
+ (it->kaddr + erofs_blkoff(sb, it->pos));
+ it->pos += sizeof(struct erofs_xattr_entry);
+ value_sz = le16_to_cpu(entry.e_value_size);
+
+ /* should also match the infix for long name prefixes */
+ if (entry.e_name_index & EROFS_XATTR_LONG_PREFIX) {
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+ struct erofs_xattr_prefix_item *pf = sbi->xattr_prefixes +
+ (entry.e_name_index & EROFS_XATTR_LONG_PREFIX_MASK);
+
+ if (pf >= sbi->xattr_prefixes + sbi->xattr_prefix_count)
+ return -ENOATTR;
+
+ if (it->index != pf->prefix->base_index ||
+ it->name.len != entry.e_name_len + pf->infix_len)
+ return -ENOATTR;
+
+ if (memcmp(it->name.name, pf->prefix->infix, pf->infix_len))
+ return -ENOATTR;
+
+ it->infix_len = pf->infix_len;
+ } else {
+ if (it->index != entry.e_name_index ||
+ it->name.len != entry.e_name_len)
+ return -ENOATTR;
+
+ it->infix_len = 0;
+ }
+
+ /* 2. handle xattr name */
+ for (processed = 0; processed < entry.e_name_len; processed += slice) {
+ it->kaddr = erofs_bread(&it->buf, erofs_blknr(sb, it->pos),
+ EROFS_KMAP);
+ if (IS_ERR(it->kaddr))
+ return PTR_ERR(it->kaddr);
+
+ slice = min_t(unsigned int,
+ sb->s_blocksize - erofs_blkoff(sb, it->pos),
+ entry.e_name_len - processed);
+ if (memcmp(it->name.name + it->infix_len + processed,
+ it->kaddr + erofs_blkoff(sb, it->pos), slice))
+ return -ENOATTR;
+ it->pos += slice;
+ }
+
+ /* 3. handle xattr value */
+ if (!it->buffer) {
+ it->buffer_ofs = value_sz;
+ return 0;
+ }
+
+ if (it->buffer_size < value_sz)
+ return -ERANGE;
+
+ return erofs_xattr_copy_to_buffer(it, value_sz);
+}
+
+static int erofs_xattr_iter_inline(struct erofs_xattr_iter *it,
+ struct inode *inode, bool getxattr)
+{
+ struct erofs_inode *const vi = EROFS_I(inode);
+ unsigned int xattr_header_sz, remaining, entry_sz;
+ erofs_off_t next_pos;
+ int ret;
+
+ xattr_header_sz = sizeof(struct erofs_xattr_ibody_header) +
+ sizeof(u32) * vi->xattr_shared_count;
+ if (xattr_header_sz >= vi->xattr_isize) {
+ DBG_BUGON(xattr_header_sz > vi->xattr_isize);
+ return -ENOATTR;
+ }
+
+ remaining = vi->xattr_isize - xattr_header_sz;
+ it->pos = erofs_iloc(inode) + vi->inode_isize + xattr_header_sz;
+
+ while (remaining) {
+ it->kaddr = erofs_bread(&it->buf, erofs_blknr(it->sb, it->pos),
+ EROFS_KMAP);
+ if (IS_ERR(it->kaddr))
+ return PTR_ERR(it->kaddr);
+
+ entry_sz = erofs_xattr_entry_size(it->kaddr +
+ erofs_blkoff(it->sb, it->pos));
+ /* xattr on-disk corruption: xattr entry beyond xattr_isize */
+ if (remaining < entry_sz) {
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
+ }
+ remaining -= entry_sz;
+ next_pos = it->pos + entry_sz;
+
+ if (getxattr)
+ ret = erofs_getxattr_foreach(it);
+ else
+ ret = erofs_listxattr_foreach(it);
+ if ((getxattr && ret != -ENOATTR) || (!getxattr && ret))
+ break;
+
+ it->pos = next_pos;
+ }
+ return ret;
+}
+
+static int erofs_xattr_iter_shared(struct erofs_xattr_iter *it,
+ struct inode *inode, bool getxattr)
+{
+ struct erofs_inode *const vi = EROFS_I(inode);
+ struct super_block *const sb = it->sb;
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+ unsigned int i;
+ int ret = -ENOATTR;
+
+ for (i = 0; i < vi->xattr_shared_count; ++i) {
+ it->pos = erofs_pos(sb, sbi->xattr_blkaddr) +
+ vi->xattr_shared_xattrs[i] * sizeof(__le32);
+ it->kaddr = erofs_bread(&it->buf, erofs_blknr(sb, it->pos),
+ EROFS_KMAP);
+ if (IS_ERR(it->kaddr))
+ return PTR_ERR(it->kaddr);
+
+ if (getxattr)
+ ret = erofs_getxattr_foreach(it);
+ else
+ ret = erofs_listxattr_foreach(it);
+ if ((getxattr && ret != -ENOATTR) || (!getxattr && ret))
+ break;
+ }
+ return ret;
+}
+
+int erofs_getxattr(struct inode *inode, int index, const char *name,
+ void *buffer, size_t buffer_size)
+{
+ int ret;
+ unsigned int hashbit;
+ struct erofs_xattr_iter it;
+ struct erofs_inode *vi = EROFS_I(inode);
+ struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb);
+
+ if (!name)
+ return -EINVAL;
+
+ ret = erofs_init_inode_xattrs(inode);
+ if (ret)
+ return ret;
+
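+	/*
+	 * Per-inode xattr name filter: h_name_filter is a small bitmap keyed
+	 * by xxh32(name, EROFS_XATTR_FILTER_SEED + index).  A set bit means
+	 * the name is known to be absent, so the on-disk walk can be skipped;
+	 * a cleared bit only means the xattr may be present.
+	 */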
+	/* the reserved flag is non-zero if the on-disk filter format ever changes */
+ if (erofs_sb_has_xattr_filter(sbi) && !sbi->xattr_filter_reserved) {
+ hashbit = xxh32(name, strlen(name),
+ EROFS_XATTR_FILTER_SEED + index);
+ hashbit &= EROFS_XATTR_FILTER_BITS - 1;
+ if (vi->xattr_name_filter & (1U << hashbit))
+ return -ENOATTR;
+ }
+
+ it.index = index;
+ it.name = (struct qstr)QSTR_INIT(name, strlen(name));
+ if (it.name.len > EROFS_NAME_LEN)
+ return -ERANGE;
+
+ it.sb = inode->i_sb;
+ it.buf = __EROFS_BUF_INITIALIZER;
+ erofs_init_metabuf(&it.buf, it.sb);
+ it.buffer = buffer;
+ it.buffer_size = buffer_size;
+ it.buffer_ofs = 0;
+
+ ret = erofs_xattr_iter_inline(&it, inode, true);
+ if (ret == -ENOATTR)
+ ret = erofs_xattr_iter_shared(&it, inode, true);
+ erofs_put_metabuf(&it.buf);
+ return ret ? ret : it.buffer_ofs;
+}
+
+ssize_t erofs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
+{
+ int ret;
+ struct erofs_xattr_iter it;
+ struct inode *inode = d_inode(dentry);
+
+ ret = erofs_init_inode_xattrs(inode);
+ if (ret == -ENOATTR)
+ return 0;
+ if (ret)
+ return ret;
+
+ it.sb = dentry->d_sb;
+ it.buf = __EROFS_BUF_INITIALIZER;
+ erofs_init_metabuf(&it.buf, it.sb);
+ it.dentry = dentry;
+ it.buffer = buffer;
+ it.buffer_size = buffer_size;
+ it.buffer_ofs = 0;
+
+ ret = erofs_xattr_iter_inline(&it, inode, false);
+ if (!ret || ret == -ENOATTR)
+ ret = erofs_xattr_iter_shared(&it, inode, false);
+ if (ret == -ENOATTR)
+ ret = 0;
+ erofs_put_metabuf(&it.buf);
+ return ret ? ret : it.buffer_ofs;
+}
+
+void erofs_xattr_prefixes_cleanup(struct super_block *sb)
+{
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+ int i;
+
+ if (sbi->xattr_prefixes) {
+ for (i = 0; i < sbi->xattr_prefix_count; i++)
+ kfree(sbi->xattr_prefixes[i].prefix);
+ kfree(sbi->xattr_prefixes);
+ sbi->xattr_prefixes = NULL;
+ }
+}
+
+int erofs_xattr_prefixes_init(struct super_block *sb)
+{
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
+ erofs_off_t pos = (erofs_off_t)sbi->xattr_prefix_start << 2;
+ struct erofs_xattr_prefix_item *pfs;
+ int ret = 0, i, len;
+
+ if (!sbi->xattr_prefix_count)
+ return 0;
+
+ pfs = kzalloc(sbi->xattr_prefix_count * sizeof(*pfs), GFP_KERNEL);
+ if (!pfs)
+ return -ENOMEM;
+
+ if (sbi->packed_inode)
+ buf.inode = sbi->packed_inode;
+ else
+ erofs_init_metabuf(&buf, sb);
+
+ for (i = 0; i < sbi->xattr_prefix_count; i++) {
+ void *ptr = erofs_read_metadata(sb, &buf, &pos, &len);
+
+ if (IS_ERR(ptr)) {
+ ret = PTR_ERR(ptr);
+ break;
+ } else if (len < sizeof(*pfs->prefix) ||
+ len > EROFS_NAME_LEN + sizeof(*pfs->prefix)) {
+ kfree(ptr);
+ ret = -EFSCORRUPTED;
+ break;
+ }
+ pfs[i].prefix = ptr;
+ pfs[i].infix_len = len - sizeof(struct erofs_xattr_long_prefix);
+ }
+
+ erofs_put_metabuf(&buf);
+ sbi->xattr_prefixes = pfs;
+ if (ret)
+ erofs_xattr_prefixes_cleanup(sb);
+ return ret;
+}
+
+#ifdef CONFIG_EROFS_FS_POSIX_ACL
+struct posix_acl *erofs_get_acl(struct inode *inode, int type, bool rcu)
+{
+ struct posix_acl *acl;
+ int prefix, rc;
+ char *value = NULL;
+
+ if (rcu)
+ return ERR_PTR(-ECHILD);
+
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ prefix = EROFS_XATTR_INDEX_POSIX_ACL_ACCESS;
+ break;
+ case ACL_TYPE_DEFAULT:
+ prefix = EROFS_XATTR_INDEX_POSIX_ACL_DEFAULT;
+ break;
+ default:
+ return ERR_PTR(-EINVAL);
+ }
+
+ rc = erofs_getxattr(inode, prefix, "", NULL, 0);
+ if (rc > 0) {
+ value = kmalloc(rc, GFP_KERNEL);
+ if (!value)
+ return ERR_PTR(-ENOMEM);
+ rc = erofs_getxattr(inode, prefix, "", value, rc);
+ }
+
+ if (rc == -ENOATTR)
+ acl = NULL;
+ else if (rc < 0)
+ acl = ERR_PTR(rc);
+ else
+ acl = posix_acl_from_xattr(&init_user_ns, value, rc);
+ kfree(value);
+ return acl;
+}
+#endif
diff --git a/fs/erofs/xattr.h b/fs/erofs/xattr.h
new file mode 100644
index 000000000..f16283cb8
--- /dev/null
+++ b/fs/erofs/xattr.h
@@ -0,0 +1,73 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2017-2018 HUAWEI, Inc.
+ * https://www.huawei.com/
+ */
+#ifndef __EROFS_XATTR_H
+#define __EROFS_XATTR_H
+
+#include "internal.h"
+#include <linux/posix_acl_xattr.h>
+#include <linux/xattr.h>
+
+/* Attribute not found */
+#define ENOATTR ENODATA
+
+#ifdef CONFIG_EROFS_FS_XATTR
+extern const struct xattr_handler erofs_xattr_user_handler;
+extern const struct xattr_handler erofs_xattr_trusted_handler;
+extern const struct xattr_handler erofs_xattr_security_handler;
+
+static inline const char *erofs_xattr_prefix(unsigned int idx,
+ struct dentry *dentry)
+{
+ const struct xattr_handler *handler = NULL;
+
+ static const struct xattr_handler *xattr_handler_map[] = {
+ [EROFS_XATTR_INDEX_USER] = &erofs_xattr_user_handler,
+#ifdef CONFIG_EROFS_FS_POSIX_ACL
+ [EROFS_XATTR_INDEX_POSIX_ACL_ACCESS] = &nop_posix_acl_access,
+ [EROFS_XATTR_INDEX_POSIX_ACL_DEFAULT] = &nop_posix_acl_default,
+#endif
+ [EROFS_XATTR_INDEX_TRUSTED] = &erofs_xattr_trusted_handler,
+#ifdef CONFIG_EROFS_FS_SECURITY
+ [EROFS_XATTR_INDEX_SECURITY] = &erofs_xattr_security_handler,
+#endif
+ };
+
+ if (idx && idx < ARRAY_SIZE(xattr_handler_map))
+ handler = xattr_handler_map[idx];
+
+ if (!xattr_handler_can_list(handler, dentry))
+ return NULL;
+
+ return xattr_prefix(handler);
+}
+
+extern const struct xattr_handler *erofs_xattr_handlers[];
+
+int erofs_xattr_prefixes_init(struct super_block *sb);
+void erofs_xattr_prefixes_cleanup(struct super_block *sb);
+int erofs_getxattr(struct inode *, int, const char *, void *, size_t);
+ssize_t erofs_listxattr(struct dentry *, char *, size_t);
+#else
+static inline int erofs_xattr_prefixes_init(struct super_block *sb) { return 0; }
+static inline void erofs_xattr_prefixes_cleanup(struct super_block *sb) {}
+static inline int erofs_getxattr(struct inode *inode, int index,
+ const char *name, void *buffer,
+ size_t buffer_size)
+{
+ return -EOPNOTSUPP;
+}
+
+#define erofs_listxattr (NULL)
+#define erofs_xattr_handlers (NULL)
+#endif /* !CONFIG_EROFS_FS_XATTR */
+
+#ifdef CONFIG_EROFS_FS_POSIX_ACL
+struct posix_acl *erofs_get_acl(struct inode *inode, int type, bool rcu);
+#else
+#define erofs_get_acl (NULL)
+#endif
+
+#endif
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
new file mode 100644
index 000000000..a33cd6757
--- /dev/null
+++ b/fs/erofs/zdata.c
@@ -0,0 +1,1894 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2018 HUAWEI, Inc.
+ * https://www.huawei.com/
+ * Copyright (C) 2022 Alibaba Cloud
+ */
+#include "compress.h"
+#include <linux/psi.h>
+#include <linux/cpuhotplug.h>
+#include <trace/events/erofs.h>
+
+#define Z_EROFS_PCLUSTER_MAX_PAGES (Z_EROFS_PCLUSTER_MAX_SIZE / PAGE_SIZE)
+#define Z_EROFS_INLINE_BVECS 2
+
+/*
+ * let's leave a type here in case another tagged
+ * pointer is introduced later.
+ */
+typedef void *z_erofs_next_pcluster_t;
+
+struct z_erofs_bvec {
+ struct page *page;
+ int offset;
+ unsigned int end;
+};
+
+#define __Z_EROFS_BVSET(name, total) \
+struct name { \
+ /* point to the next page which contains the following bvecs */ \
+ struct page *nextpage; \
+ struct z_erofs_bvec bvec[total]; \
+}
+__Z_EROFS_BVSET(z_erofs_bvset,);
+__Z_EROFS_BVSET(z_erofs_bvset_inline, Z_EROFS_INLINE_BVECS);
+
+/*
+ * Structure fields follow one of the following exclusion rules.
+ *
+ * I: Modifiable by initialization/destruction paths and read-only
+ * for everyone else;
+ *
+ * L: Field should be protected by the pcluster lock;
+ *
+ * A: Field should be accessed / updated in atomic for parallelized code.
+ */
+struct z_erofs_pcluster {
+ struct erofs_workgroup obj;
+ struct mutex lock;
+
+ /* A: point to next chained pcluster or TAILs */
+ z_erofs_next_pcluster_t next;
+
+ /* L: the maximum decompression size of this round */
+ unsigned int length;
+
+ /* L: total number of bvecs */
+ unsigned int vcnt;
+
+ /* I: page offset of start position of decompression */
+ unsigned short pageofs_out;
+
+ /* I: page offset of inline compressed data */
+ unsigned short pageofs_in;
+
+ union {
+ /* L: inline a certain number of bvec for bootstrap */
+ struct z_erofs_bvset_inline bvset;
+
+ /* I: can be used to free the pcluster by RCU. */
+ struct rcu_head rcu;
+ };
+
+ union {
+ /* I: physical cluster size in pages */
+ unsigned short pclusterpages;
+
+ /* I: tailpacking inline compressed size */
+ unsigned short tailpacking_size;
+ };
+
+ /* I: compression algorithm format */
+ unsigned char algorithmformat;
+
+ /* L: whether partial decompression or not */
+ bool partial;
+
+ /* L: indicate several pageofs_outs or not */
+ bool multibases;
+
+ /* A: compressed bvecs (can be cached or inplaced pages) */
+ struct z_erofs_bvec compressed_bvecs[];
+};
+
+/* the end of a chain of pclusters */
+#define Z_EROFS_PCLUSTER_TAIL ((void *) 0x700 + POISON_POINTER_DELTA)
+#define Z_EROFS_PCLUSTER_NIL (NULL)
+
+struct z_erofs_decompressqueue {
+ struct super_block *sb;
+ atomic_t pending_bios;
+ z_erofs_next_pcluster_t head;
+
+ union {
+ struct completion done;
+ struct work_struct work;
+ struct kthread_work kthread_work;
+ } u;
+ bool eio, sync;
+};
+
+static inline bool z_erofs_is_inline_pcluster(struct z_erofs_pcluster *pcl)
+{
+ return !pcl->obj.index;
+}
+
+static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl)
+{
+ if (z_erofs_is_inline_pcluster(pcl))
+ return 1;
+ return pcl->pclusterpages;
+}
+
+/*
+ * bit 30: I/O error occurred on this page
+ * bit 0 - 29: remaining parts to complete this page
+ */
+#define Z_EROFS_PAGE_EIO (1 << 30)
+
+static inline void z_erofs_onlinepage_init(struct page *page)
+{
+ union {
+ atomic_t o;
+ unsigned long v;
+ } u = { .o = ATOMIC_INIT(1) };
+
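+	/* page->private holds the remaining-parts counter described above, starting at 1 */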
+ set_page_private(page, u.v);
+ smp_wmb();
+ SetPagePrivate(page);
+}
+
+static inline void z_erofs_onlinepage_split(struct page *page)
+{
+ atomic_inc((atomic_t *)&page->private);
+}
+
+static void z_erofs_onlinepage_endio(struct page *page, int err)
+{
+ int orig, v;
+
+ DBG_BUGON(!PagePrivate(page));
+
+ do {
+ orig = atomic_read((atomic_t *)&page->private);
+ v = (orig - 1) | (err ? Z_EROFS_PAGE_EIO : 0);
+ } while (atomic_cmpxchg((atomic_t *)&page->private, orig, v) != orig);
+
+ if (!(v & ~Z_EROFS_PAGE_EIO)) {
+ set_page_private(page, 0);
+ ClearPagePrivate(page);
+ if (!(v & Z_EROFS_PAGE_EIO))
+ SetPageUptodate(page);
+ unlock_page(page);
+ }
+}
+
+#define Z_EROFS_ONSTACK_PAGES 32
+
+/*
+ * since pclustersize is variable with the big pcluster feature, introduce
+ * per-size slab pools for the different pcluster sizes.
+ */
+struct z_erofs_pcluster_slab {
+ struct kmem_cache *slab;
+ unsigned int maxpages;
+ char name[48];
+};
+
+#define _PCLP(n) { .maxpages = n }
+
+static struct z_erofs_pcluster_slab pcluster_pool[] __read_mostly = {
+ _PCLP(1), _PCLP(4), _PCLP(16), _PCLP(64), _PCLP(128),
+ _PCLP(Z_EROFS_PCLUSTER_MAX_PAGES)
+};
+
+struct z_erofs_bvec_iter {
+ struct page *bvpage;
+ struct z_erofs_bvset *bvset;
+ unsigned int nr, cur;
+};
+
+static struct page *z_erofs_bvec_iter_end(struct z_erofs_bvec_iter *iter)
+{
+ if (iter->bvpage)
+ kunmap_local(iter->bvset);
+ return iter->bvpage;
+}
+
+static struct page *z_erofs_bvset_flip(struct z_erofs_bvec_iter *iter)
+{
+ unsigned long base = (unsigned long)((struct z_erofs_bvset *)0)->bvec;
+ /* have to access nextpage in advance, otherwise it will be unmapped */
+ struct page *nextpage = iter->bvset->nextpage;
+ struct page *oldpage;
+
+ DBG_BUGON(!nextpage);
+ oldpage = z_erofs_bvec_iter_end(iter);
+ iter->bvpage = nextpage;
+ iter->bvset = kmap_local_page(nextpage);
+ iter->nr = (PAGE_SIZE - base) / sizeof(struct z_erofs_bvec);
+ iter->cur = 0;
+ return oldpage;
+}
+
+static void z_erofs_bvec_iter_begin(struct z_erofs_bvec_iter *iter,
+ struct z_erofs_bvset_inline *bvset,
+ unsigned int bootstrap_nr,
+ unsigned int cur)
+{
+ *iter = (struct z_erofs_bvec_iter) {
+ .nr = bootstrap_nr,
+ .bvset = (struct z_erofs_bvset *)bvset,
+ };
+
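+	/* fast-forward over full bvset pages until the one that holds entry `cur' */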
+ while (cur > iter->nr) {
+ cur -= iter->nr;
+ z_erofs_bvset_flip(iter);
+ }
+ iter->cur = cur;
+}
+
+static int z_erofs_bvec_enqueue(struct z_erofs_bvec_iter *iter,
+ struct z_erofs_bvec *bvec,
+ struct page **candidate_bvpage,
+ struct page **pagepool)
+{
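+	/* the current bvset page is full: chain another page to hold more bvecs */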
+ if (iter->cur >= iter->nr) {
+ struct page *nextpage = *candidate_bvpage;
+
+ if (!nextpage) {
+ nextpage = erofs_allocpage(pagepool, GFP_NOFS);
+ if (!nextpage)
+ return -ENOMEM;
+ set_page_private(nextpage, Z_EROFS_SHORTLIVED_PAGE);
+ }
+ DBG_BUGON(iter->bvset->nextpage);
+ iter->bvset->nextpage = nextpage;
+ z_erofs_bvset_flip(iter);
+
+ iter->bvset->nextpage = NULL;
+ *candidate_bvpage = NULL;
+ }
+ iter->bvset->bvec[iter->cur++] = *bvec;
+ return 0;
+}
+
+static void z_erofs_bvec_dequeue(struct z_erofs_bvec_iter *iter,
+ struct z_erofs_bvec *bvec,
+ struct page **old_bvpage)
+{
+ if (iter->cur == iter->nr)
+ *old_bvpage = z_erofs_bvset_flip(iter);
+ else
+ *old_bvpage = NULL;
+ *bvec = iter->bvset->bvec[iter->cur++];
+}
+
+static void z_erofs_destroy_pcluster_pool(void)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) {
+ if (!pcluster_pool[i].slab)
+ continue;
+ kmem_cache_destroy(pcluster_pool[i].slab);
+ pcluster_pool[i].slab = NULL;
+ }
+}
+
+static int z_erofs_create_pcluster_pool(void)
+{
+ struct z_erofs_pcluster_slab *pcs;
+ struct z_erofs_pcluster *a;
+ unsigned int size;
+
+ for (pcs = pcluster_pool;
+ pcs < pcluster_pool + ARRAY_SIZE(pcluster_pool); ++pcs) {
+ size = struct_size(a, compressed_bvecs, pcs->maxpages);
+
+ sprintf(pcs->name, "erofs_pcluster-%u", pcs->maxpages);
+ pcs->slab = kmem_cache_create(pcs->name, size, 0,
+ SLAB_RECLAIM_ACCOUNT, NULL);
+ if (pcs->slab)
+ continue;
+
+ z_erofs_destroy_pcluster_pool();
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int nrpages)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) {
+ struct z_erofs_pcluster_slab *pcs = pcluster_pool + i;
+ struct z_erofs_pcluster *pcl;
+
+ if (nrpages > pcs->maxpages)
+ continue;
+
+ pcl = kmem_cache_zalloc(pcs->slab, GFP_NOFS);
+ if (!pcl)
+ return ERR_PTR(-ENOMEM);
+ pcl->pclusterpages = nrpages;
+ return pcl;
+ }
+ return ERR_PTR(-EINVAL);
+}
+
+static void z_erofs_free_pcluster(struct z_erofs_pcluster *pcl)
+{
+ unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) {
+ struct z_erofs_pcluster_slab *pcs = pcluster_pool + i;
+
+ if (pclusterpages > pcs->maxpages)
+ continue;
+
+ kmem_cache_free(pcs->slab, pcl);
+ return;
+ }
+ DBG_BUGON(1);
+}
+
+static struct workqueue_struct *z_erofs_workqueue __read_mostly;
+
+#ifdef CONFIG_EROFS_FS_PCPU_KTHREAD
+static struct kthread_worker __rcu **z_erofs_pcpu_workers;
+
+static void erofs_destroy_percpu_workers(void)
+{
+ struct kthread_worker *worker;
+ unsigned int cpu;
+
+ for_each_possible_cpu(cpu) {
+ worker = rcu_dereference_protected(
+ z_erofs_pcpu_workers[cpu], 1);
+ rcu_assign_pointer(z_erofs_pcpu_workers[cpu], NULL);
+ if (worker)
+ kthread_destroy_worker(worker);
+ }
+ kfree(z_erofs_pcpu_workers);
+}
+
+static struct kthread_worker *erofs_init_percpu_worker(int cpu)
+{
+ struct kthread_worker *worker =
+ kthread_create_worker_on_cpu(cpu, 0, "erofs_worker/%u", cpu);
+
+ if (IS_ERR(worker))
+ return worker;
+ if (IS_ENABLED(CONFIG_EROFS_FS_PCPU_KTHREAD_HIPRI))
+ sched_set_fifo_low(worker->task);
+ return worker;
+}
+
+static int erofs_init_percpu_workers(void)
+{
+ struct kthread_worker *worker;
+ unsigned int cpu;
+
+ z_erofs_pcpu_workers = kcalloc(num_possible_cpus(),
+ sizeof(struct kthread_worker *), GFP_ATOMIC);
+ if (!z_erofs_pcpu_workers)
+ return -ENOMEM;
+
+ for_each_online_cpu(cpu) { /* could miss cpu{off,on}line? */
+ worker = erofs_init_percpu_worker(cpu);
+ if (!IS_ERR(worker))
+ rcu_assign_pointer(z_erofs_pcpu_workers[cpu], worker);
+ }
+ return 0;
+}
+#else
+static inline void erofs_destroy_percpu_workers(void) {}
+static inline int erofs_init_percpu_workers(void) { return 0; }
+#endif
+
+#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_EROFS_FS_PCPU_KTHREAD)
+static DEFINE_SPINLOCK(z_erofs_pcpu_worker_lock);
+static enum cpuhp_state erofs_cpuhp_state;
+
+static int erofs_cpu_online(unsigned int cpu)
+{
+ struct kthread_worker *worker, *old;
+
+ worker = erofs_init_percpu_worker(cpu);
+ if (IS_ERR(worker))
+ return PTR_ERR(worker);
+
+ spin_lock(&z_erofs_pcpu_worker_lock);
+ old = rcu_dereference_protected(z_erofs_pcpu_workers[cpu],
+ lockdep_is_held(&z_erofs_pcpu_worker_lock));
+ if (!old)
+ rcu_assign_pointer(z_erofs_pcpu_workers[cpu], worker);
+ spin_unlock(&z_erofs_pcpu_worker_lock);
+ if (old)
+ kthread_destroy_worker(worker);
+ return 0;
+}
+
+static int erofs_cpu_offline(unsigned int cpu)
+{
+ struct kthread_worker *worker;
+
+ spin_lock(&z_erofs_pcpu_worker_lock);
+ worker = rcu_dereference_protected(z_erofs_pcpu_workers[cpu],
+ lockdep_is_held(&z_erofs_pcpu_worker_lock));
+ rcu_assign_pointer(z_erofs_pcpu_workers[cpu], NULL);
+ spin_unlock(&z_erofs_pcpu_worker_lock);
+
+ synchronize_rcu();
+ if (worker)
+ kthread_destroy_worker(worker);
+ return 0;
+}
+
+static int erofs_cpu_hotplug_init(void)
+{
+ int state;
+
+ state = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
+ "fs/erofs:online", erofs_cpu_online, erofs_cpu_offline);
+ if (state < 0)
+ return state;
+
+ erofs_cpuhp_state = state;
+ return 0;
+}
+
+static void erofs_cpu_hotplug_destroy(void)
+{
+ if (erofs_cpuhp_state)
+ cpuhp_remove_state_nocalls(erofs_cpuhp_state);
+}
+#else /* !CONFIG_HOTPLUG_CPU || !CONFIG_EROFS_FS_PCPU_KTHREAD */
+static inline int erofs_cpu_hotplug_init(void) { return 0; }
+static inline void erofs_cpu_hotplug_destroy(void) {}
+#endif
+
+void z_erofs_exit_zip_subsystem(void)
+{
+ erofs_cpu_hotplug_destroy();
+ erofs_destroy_percpu_workers();
+ destroy_workqueue(z_erofs_workqueue);
+ z_erofs_destroy_pcluster_pool();
+}
+
+int __init z_erofs_init_zip_subsystem(void)
+{
+ int err = z_erofs_create_pcluster_pool();
+
+ if (err)
+ goto out_error_pcluster_pool;
+
+ z_erofs_workqueue = alloc_workqueue("erofs_worker",
+ WQ_UNBOUND | WQ_HIGHPRI, num_possible_cpus());
+ if (!z_erofs_workqueue) {
+ err = -ENOMEM;
+ goto out_error_workqueue_init;
+ }
+
+ err = erofs_init_percpu_workers();
+ if (err)
+ goto out_error_pcpu_worker;
+
+ err = erofs_cpu_hotplug_init();
+ if (err < 0)
+ goto out_error_cpuhp_init;
+ return err;
+
+out_error_cpuhp_init:
+ erofs_destroy_percpu_workers();
+out_error_pcpu_worker:
+ destroy_workqueue(z_erofs_workqueue);
+out_error_workqueue_init:
+ z_erofs_destroy_pcluster_pool();
+out_error_pcluster_pool:
+ return err;
+}
+
+enum z_erofs_pclustermode {
+ Z_EROFS_PCLUSTER_INFLIGHT,
+ /*
+	 * a weak form of Z_EROFS_PCLUSTER_FOLLOWED; the difference is that it
+	 * could be dispatched into the bypass queue later since its managed
+	 * pages are already up-to-date. None of the related online pages can
+	 * be reused for in-place I/O (or bvpage) since the pcluster can be
+	 * decoded directly without I/O submission.
+ */
+ Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE,
+ /*
+ * The pcluster was just linked to a decompression chain by us. It can
+ * also be linked with the remaining pclusters, which means if the
+ * processing page is the tail page of a pcluster, this pcluster can
+ * safely use the whole page (since the previous pcluster is within the
+ * same chain) for in-place I/O, as illustrated below:
+ * ___________________________________________________
+ * | tail (partial) page | head (partial) page |
+ * | (of the current pcl) | (of the previous pcl) |
+ * |___PCLUSTER_FOLLOWED___|_____PCLUSTER_FOLLOWED_____|
+ *
+ * [ (*) the page above can be used as inplace I/O. ]
+ */
+ Z_EROFS_PCLUSTER_FOLLOWED,
+};
+
+struct z_erofs_decompress_frontend {
+ struct inode *const inode;
+ struct erofs_map_blocks map;
+ struct z_erofs_bvec_iter biter;
+
+ struct page *pagepool;
+ struct page *candidate_bvpage;
+ struct z_erofs_pcluster *pcl;
+ z_erofs_next_pcluster_t owned_head;
+ enum z_erofs_pclustermode mode;
+
+ erofs_off_t headoffset;
+
+ /* a pointer used to pick up inplace I/O pages */
+ unsigned int icur;
+};
+
+#define DECOMPRESS_FRONTEND_INIT(__i) { \
+ .inode = __i, .owned_head = Z_EROFS_PCLUSTER_TAIL, \
+ .mode = Z_EROFS_PCLUSTER_FOLLOWED }
+
+static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe)
+{
+ unsigned int cachestrategy = EROFS_I_SB(fe->inode)->opt.cache_strategy;
+
+ if (cachestrategy <= EROFS_ZIP_CACHE_DISABLED)
+ return false;
+
+ if (!(fe->map.m_flags & EROFS_MAP_FULL_MAPPED))
+ return true;
+
+ if (cachestrategy >= EROFS_ZIP_CACHE_READAROUND &&
+ fe->map.m_la < fe->headoffset)
+ return true;
+
+ return false;
+}
+
+static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
+{
+ struct address_space *mc = MNGD_MAPPING(EROFS_I_SB(fe->inode));
+ struct z_erofs_pcluster *pcl = fe->pcl;
+ bool shouldalloc = z_erofs_should_alloc_cache(fe);
+ bool standalone = true;
+ /*
+	 * optimistic allocation without direct reclaim, since in-place I/O
+	 * can be used under memory pressure instead.
+ */
+ gfp_t gfp = (mapping_gfp_mask(mc) & ~__GFP_DIRECT_RECLAIM) |
+ __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
+ unsigned int i;
+
+ if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED)
+ return;
+
+ for (i = 0; i < pcl->pclusterpages; ++i) {
+ struct page *page;
+ void *t; /* mark pages just found for debugging */
+ struct page *newpage = NULL;
+
+ /* the compressed page was loaded before */
+ if (READ_ONCE(pcl->compressed_bvecs[i].page))
+ continue;
+
+ page = find_get_page(mc, pcl->obj.index + i);
+
+ if (page) {
+ t = (void *)((unsigned long)page | 1);
+ } else {
+			/* I/O is needed, not possible to decompress directly */
+ standalone = false;
+ if (!shouldalloc)
+ continue;
+
+ /*
+			 * try to use cached I/O if page allocation succeeds,
+			 * or fall back to in-place I/O instead
+			 * to avoid any direct reclaim.
+ */
+ newpage = erofs_allocpage(&fe->pagepool, gfp);
+ if (!newpage)
+ continue;
+ set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE);
+ t = (void *)((unsigned long)newpage | 1);
+ }
+
+ if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL, t))
+ continue;
+
+ if (page)
+ put_page(page);
+ else if (newpage)
+ erofs_pagepool_add(&fe->pagepool, newpage);
+ }
+
+ /*
+	 * don't do in-place I/O if all compressed pages are available in the
+	 * managed cache, since the pcluster can be moved to the bypass queue.
+ */
+ if (standalone)
+ fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE;
+}
+
+/* called by erofs_shrinker to get rid of all compressed_pages */
+int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
+ struct erofs_workgroup *grp)
+{
+ struct z_erofs_pcluster *const pcl =
+ container_of(grp, struct z_erofs_pcluster, obj);
+ int i;
+
+ DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
+ /*
+	 * the workgroup refcount is now frozen at 0, so there is
+	 * no need to worry about concurrent decompression users.
+ */
+ for (i = 0; i < pcl->pclusterpages; ++i) {
+ struct page *page = pcl->compressed_bvecs[i].page;
+
+ if (!page)
+ continue;
+
+ /* block other users from reclaiming or migrating the page */
+ if (!trylock_page(page))
+ return -EBUSY;
+
+ if (!erofs_page_is_managed(sbi, page))
+ continue;
+
+ /* barrier is implied in the following 'unlock_page' */
+ WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL);
+ detach_page_private(page);
+ unlock_page(page);
+ }
+ return 0;
+}
+
+static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp)
+{
+ struct z_erofs_pcluster *pcl = folio_get_private(folio);
+ bool ret;
+ int i;
+
+ if (!folio_test_private(folio))
+ return true;
+
+ ret = false;
+ spin_lock(&pcl->obj.lockref.lock);
+ if (pcl->obj.lockref.count > 0)
+ goto out;
+
+ DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
+ for (i = 0; i < pcl->pclusterpages; ++i) {
+ if (pcl->compressed_bvecs[i].page == &folio->page) {
+ WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL);
+ ret = true;
+ break;
+ }
+ }
+ if (ret)
+ folio_detach_private(folio);
+out:
+ spin_unlock(&pcl->obj.lockref.lock);
+ return ret;
+}
+
+/*
+ * It will be called only on inode eviction. In case there are still some
+ * decompression requests in progress, wait and reschedule for a bit here.
+ * An extra lock could be introduced instead but it seems unnecessary.
+ */
+static void z_erofs_cache_invalidate_folio(struct folio *folio,
+ size_t offset, size_t length)
+{
+ const size_t stop = length + offset;
+
+ /* Check for potential overflow in debug mode */
+ DBG_BUGON(stop > folio_size(folio) || stop < length);
+
+ if (offset == 0 && stop == folio_size(folio))
+ while (!z_erofs_cache_release_folio(folio, GFP_NOFS))
+ cond_resched();
+}
+
+static const struct address_space_operations z_erofs_cache_aops = {
+ .release_folio = z_erofs_cache_release_folio,
+ .invalidate_folio = z_erofs_cache_invalidate_folio,
+};
+
+int erofs_init_managed_cache(struct super_block *sb)
+{
+ struct inode *const inode = new_inode(sb);
+
+ if (!inode)
+ return -ENOMEM;
+
+ set_nlink(inode, 1);
+ inode->i_size = OFFSET_MAX;
+ inode->i_mapping->a_ops = &z_erofs_cache_aops;
+ mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
+ EROFS_SB(sb)->managed_cache = inode;
+ return 0;
+}
+
+static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe,
+ struct z_erofs_bvec *bvec)
+{
+ struct z_erofs_pcluster *const pcl = fe->pcl;
+
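+	/* scan compressed_bvecs[] backwards and atomically claim an empty slot */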
+ while (fe->icur > 0) {
+ if (!cmpxchg(&pcl->compressed_bvecs[--fe->icur].page,
+ NULL, bvec->page)) {
+ pcl->compressed_bvecs[fe->icur] = *bvec;
+ return true;
+ }
+ }
+ return false;
+}
+
+/* callers must hold the pcluster lock */
+static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe,
+ struct z_erofs_bvec *bvec, bool exclusive)
+{
+ int ret;
+
+ if (exclusive) {
+		/* give priority to in-place I/O for reusing file pages first */
+ if (z_erofs_try_inplace_io(fe, bvec))
+ return 0;
+ /* otherwise, check if it can be used as a bvpage */
+ if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED &&
+ !fe->candidate_bvpage)
+ fe->candidate_bvpage = bvec->page;
+ }
+ ret = z_erofs_bvec_enqueue(&fe->biter, bvec, &fe->candidate_bvpage,
+ &fe->pagepool);
+ fe->pcl->vcnt += (ret >= 0);
+ return ret;
+}
+
+static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f)
+{
+ struct z_erofs_pcluster *pcl = f->pcl;
+ z_erofs_next_pcluster_t *owned_head = &f->owned_head;
+
+ /* type 1, nil pcluster (this pcluster doesn't belong to any chain.) */
+ if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_NIL,
+ *owned_head) == Z_EROFS_PCLUSTER_NIL) {
+ *owned_head = &pcl->next;
+ /* so we can attach this pcluster to our submission chain. */
+ f->mode = Z_EROFS_PCLUSTER_FOLLOWED;
+ return;
+ }
+
+ /* type 2, it belongs to an ongoing chain */
+ f->mode = Z_EROFS_PCLUSTER_INFLIGHT;
+}
+
+static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe)
+{
+ struct erofs_map_blocks *map = &fe->map;
+ bool ztailpacking = map->m_flags & EROFS_MAP_META;
+ struct z_erofs_pcluster *pcl;
+ struct erofs_workgroup *grp;
+ int err;
+
+ if (!(map->m_flags & EROFS_MAP_ENCODED) ||
+ (!ztailpacking && !(map->m_pa >> PAGE_SHIFT))) {
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
+ }
+
+ /* no available pcluster, let's allocate one */
+ pcl = z_erofs_alloc_pcluster(ztailpacking ? 1 :
+ map->m_plen >> PAGE_SHIFT);
+ if (IS_ERR(pcl))
+ return PTR_ERR(pcl);
+
+ spin_lock_init(&pcl->obj.lockref.lock);
+ pcl->obj.lockref.count = 1; /* one ref for this request */
+ pcl->algorithmformat = map->m_algorithmformat;
+ pcl->length = 0;
+ pcl->partial = true;
+
+ /* new pclusters should be claimed as type 1, primary and followed */
+ pcl->next = fe->owned_head;
+ pcl->pageofs_out = map->m_la & ~PAGE_MASK;
+ fe->mode = Z_EROFS_PCLUSTER_FOLLOWED;
+
+ /*
+	 * lock all primary followed pclusters before making them visible to
+	 * others; mutex_trylock *never* fails for a new pcluster.
+ */
+ mutex_init(&pcl->lock);
+ DBG_BUGON(!mutex_trylock(&pcl->lock));
+
+ if (ztailpacking) {
+ pcl->obj.index = 0; /* which indicates ztailpacking */
+ pcl->pageofs_in = erofs_blkoff(fe->inode->i_sb, map->m_pa);
+ pcl->tailpacking_size = map->m_plen;
+ } else {
+ pcl->obj.index = map->m_pa >> PAGE_SHIFT;
+
+ grp = erofs_insert_workgroup(fe->inode->i_sb, &pcl->obj);
+ if (IS_ERR(grp)) {
+ err = PTR_ERR(grp);
+ goto err_out;
+ }
+
+ if (grp != &pcl->obj) {
+ fe->pcl = container_of(grp,
+ struct z_erofs_pcluster, obj);
+ err = -EEXIST;
+ goto err_out;
+ }
+ }
+ fe->owned_head = &pcl->next;
+ fe->pcl = pcl;
+ return 0;
+
+err_out:
+ mutex_unlock(&pcl->lock);
+ z_erofs_free_pcluster(pcl);
+ return err;
+}
+
+static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe)
+{
+ struct erofs_map_blocks *map = &fe->map;
+ struct super_block *sb = fe->inode->i_sb;
+ erofs_blk_t blknr = erofs_blknr(sb, map->m_pa);
+ struct erofs_workgroup *grp = NULL;
+ int ret;
+
+ DBG_BUGON(fe->pcl);
+
+	/* must be Z_EROFS_PCLUSTER_TAIL or point to the previous pcluster */
+ DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL);
+
+ if (!(map->m_flags & EROFS_MAP_META)) {
+ grp = erofs_find_workgroup(sb, blknr);
+ } else if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) {
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
+ }
+
+ if (grp) {
+ fe->pcl = container_of(grp, struct z_erofs_pcluster, obj);
+ ret = -EEXIST;
+ } else {
+ ret = z_erofs_register_pcluster(fe);
+ }
+
+ if (ret == -EEXIST) {
+ mutex_lock(&fe->pcl->lock);
+ z_erofs_try_to_claim_pcluster(fe);
+ } else if (ret) {
+ return ret;
+ }
+
+ z_erofs_bvec_iter_begin(&fe->biter, &fe->pcl->bvset,
+ Z_EROFS_INLINE_BVECS, fe->pcl->vcnt);
+ if (!z_erofs_is_inline_pcluster(fe->pcl)) {
+ /* bind cache first when cached decompression is preferred */
+ z_erofs_bind_cache(fe);
+ } else {
+ void *mptr;
+
+ mptr = erofs_read_metabuf(&map->buf, sb, blknr, EROFS_NO_KMAP);
+ if (IS_ERR(mptr)) {
+ ret = PTR_ERR(mptr);
+ erofs_err(sb, "failed to get inline data %d", ret);
+ return ret;
+ }
+ get_page(map->buf.page);
+ WRITE_ONCE(fe->pcl->compressed_bvecs[0].page, map->buf.page);
+ fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE;
+ }
+ /* file-backed inplace I/O pages are traversed in reverse order */
+ fe->icur = z_erofs_pclusterpages(fe->pcl);
+ return 0;
+}
+
+/*
+ * keep in mind that no referenced pclusters will be freed;
+ * they are only freed after an RCU grace period.
+ */
+static void z_erofs_rcu_callback(struct rcu_head *head)
+{
+ z_erofs_free_pcluster(container_of(head,
+ struct z_erofs_pcluster, rcu));
+}
+
+void erofs_workgroup_free_rcu(struct erofs_workgroup *grp)
+{
+ struct z_erofs_pcluster *const pcl =
+ container_of(grp, struct z_erofs_pcluster, obj);
+
+ call_rcu(&pcl->rcu, z_erofs_rcu_callback);
+}
+
+static void z_erofs_pcluster_end(struct z_erofs_decompress_frontend *fe)
+{
+ struct z_erofs_pcluster *pcl = fe->pcl;
+
+ if (!pcl)
+ return;
+
+ z_erofs_bvec_iter_end(&fe->biter);
+ mutex_unlock(&pcl->lock);
+
+ if (fe->candidate_bvpage)
+ fe->candidate_bvpage = NULL;
+
+ /*
+	 * now that all pending pages have been added, don't hold the pcluster's
+	 * reference any longer unless it is hosted by ourselves.
+ */
+ if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE)
+ erofs_workgroup_put(&pcl->obj);
+
+ fe->pcl = NULL;
+}
+
+static int z_erofs_read_fragment(struct super_block *sb, struct page *page,
+ unsigned int cur, unsigned int end, erofs_off_t pos)
+{
+ struct inode *packed_inode = EROFS_SB(sb)->packed_inode;
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
+ unsigned int cnt;
+ u8 *src;
+
+ if (!packed_inode)
+ return -EFSCORRUPTED;
+
+ buf.inode = packed_inode;
+ for (; cur < end; cur += cnt, pos += cnt) {
+ cnt = min_t(unsigned int, end - cur,
+ sb->s_blocksize - erofs_blkoff(sb, pos));
+ src = erofs_bread(&buf, erofs_blknr(sb, pos), EROFS_KMAP);
+ if (IS_ERR(src)) {
+ erofs_put_metabuf(&buf);
+ return PTR_ERR(src);
+ }
+ memcpy_to_page(page, cur, src + erofs_blkoff(sb, pos), cnt);
+ }
+ erofs_put_metabuf(&buf);
+ return 0;
+}
+
+static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
+ struct page *page)
+{
+ struct inode *const inode = fe->inode;
+ struct erofs_map_blocks *const map = &fe->map;
+ const loff_t offset = page_offset(page);
+ bool tight = true, exclusive;
+ unsigned int cur, end, len, split;
+ int err = 0;
+
+ z_erofs_onlinepage_init(page);
+
+ split = 0;
+ end = PAGE_SIZE;
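+	/* process the page from its end backwards, one (sub-)extent at a time */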
+repeat:
+ if (offset + end - 1 < map->m_la ||
+ offset + end - 1 >= map->m_la + map->m_llen) {
+ z_erofs_pcluster_end(fe);
+ map->m_la = offset + end - 1;
+ map->m_llen = 0;
+ err = z_erofs_map_blocks_iter(inode, map, 0);
+ if (err)
+ goto out;
+ }
+
+ cur = offset > map->m_la ? 0 : map->m_la - offset;
+ /* bump split parts first to avoid several separate cases */
+ ++split;
+
+ if (!(map->m_flags & EROFS_MAP_MAPPED)) {
+ zero_user_segment(page, cur, end);
+ tight = false;
+ goto next_part;
+ }
+
+ if (map->m_flags & EROFS_MAP_FRAGMENT) {
+ erofs_off_t fpos = offset + cur - map->m_la;
+
+ len = min_t(unsigned int, map->m_llen - fpos, end - cur);
+ err = z_erofs_read_fragment(inode->i_sb, page, cur, cur + len,
+ EROFS_I(inode)->z_fragmentoff + fpos);
+ if (err)
+ goto out;
+ tight = false;
+ goto next_part;
+ }
+
+ if (!fe->pcl) {
+ err = z_erofs_pcluster_begin(fe);
+ if (err)
+ goto out;
+ }
+
+ /*
+	 * Ensure the current partial page belongs to this submit chain rather
+	 * than any other concurrent submit chain or the noio (bypass) chain,
+	 * since those chains are handled asynchronously and thus the page
+	 * cannot be used for in-place I/O or bvpage (they must be processed
+	 * in strict order).
+ */
+ tight &= (fe->mode > Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE);
+ exclusive = (!cur && ((split <= 1) || tight));
+ if (cur)
+ tight &= (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED);
+
+ err = z_erofs_attach_page(fe, &((struct z_erofs_bvec) {
+ .page = page,
+ .offset = offset - map->m_la,
+ .end = end,
+ }), exclusive);
+ if (err)
+ goto out;
+
+ z_erofs_onlinepage_split(page);
+ if (fe->pcl->pageofs_out != (map->m_la & ~PAGE_MASK))
+ fe->pcl->multibases = true;
+ if (fe->pcl->length < offset + end - map->m_la) {
+ fe->pcl->length = offset + end - map->m_la;
+ fe->pcl->pageofs_out = map->m_la & ~PAGE_MASK;
+ }
+ if ((map->m_flags & EROFS_MAP_FULL_MAPPED) &&
+ !(map->m_flags & EROFS_MAP_PARTIAL_REF) &&
+ fe->pcl->length == map->m_llen)
+ fe->pcl->partial = false;
+next_part:
+ /* shorten the remaining extent to update progress */
+ map->m_llen = offset + cur - map->m_la;
+ map->m_flags &= ~EROFS_MAP_FULL_MAPPED;
+
+ end = cur;
+ if (end > 0)
+ goto repeat;
+
+out:
+ z_erofs_onlinepage_endio(page, err);
+ return err;
+}
+
+static bool z_erofs_is_sync_decompress(struct erofs_sb_info *sbi,
+ unsigned int readahead_pages)
+{
+ /* auto: enable for read_folio, disable for readahead */
+ if ((sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO) &&
+ !readahead_pages)
+ return true;
+
+ if ((sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_FORCE_ON) &&
+ (readahead_pages <= sbi->opt.max_sync_decompress_pages))
+ return true;
+
+ return false;
+}
+
+static bool z_erofs_page_is_invalidated(struct page *page)
+{
+ return !page->mapping && !z_erofs_is_shortlived_page(page);
+}
+
+struct z_erofs_decompress_backend {
+ struct page *onstack_pages[Z_EROFS_ONSTACK_PAGES];
+ struct super_block *sb;
+ struct z_erofs_pcluster *pcl;
+
+ /* pages with the longest decompressed length for deduplication */
+ struct page **decompressed_pages;
+ /* pages to keep the compressed data */
+ struct page **compressed_pages;
+
+ struct list_head decompressed_secondary_bvecs;
+ struct page **pagepool;
+ unsigned int onstack_used, nr_pages;
+};
+
+struct z_erofs_bvec_item {
+ struct z_erofs_bvec bvec;
+ struct list_head list;
+};
+
+static void z_erofs_do_decompressed_bvec(struct z_erofs_decompress_backend *be,
+ struct z_erofs_bvec *bvec)
+{
+ struct z_erofs_bvec_item *item;
+ unsigned int pgnr;
+
+ if (!((bvec->offset + be->pcl->pageofs_out) & ~PAGE_MASK) &&
+ (bvec->end == PAGE_SIZE ||
+ bvec->offset + bvec->end == be->pcl->length)) {
+ pgnr = (bvec->offset + be->pcl->pageofs_out) >> PAGE_SHIFT;
+ DBG_BUGON(pgnr >= be->nr_pages);
+ if (!be->decompressed_pages[pgnr]) {
+ be->decompressed_pages[pgnr] = bvec->page;
+ return;
+ }
+ }
+
+ /* (cold path) one pcluster is requested multiple times */
+ item = kmalloc(sizeof(*item), GFP_KERNEL | __GFP_NOFAIL);
+ item->bvec = *bvec;
+ list_add(&item->list, &be->decompressed_secondary_bvecs);
+}
+
+static void z_erofs_fill_other_copies(struct z_erofs_decompress_backend *be,
+ int err)
+{
+ unsigned int off0 = be->pcl->pageofs_out;
+ struct list_head *p, *n;
+
+ list_for_each_safe(p, n, &be->decompressed_secondary_bvecs) {
+ struct z_erofs_bvec_item *bvi;
+ unsigned int end, cur;
+ void *dst, *src;
+
+ bvi = container_of(p, struct z_erofs_bvec_item, list);
+ cur = bvi->bvec.offset < 0 ? -bvi->bvec.offset : 0;
+ end = min_t(unsigned int, be->pcl->length - bvi->bvec.offset,
+ bvi->bvec.end);
+ dst = kmap_local_page(bvi->bvec.page);
+ while (cur < end) {
+ unsigned int pgnr, scur, len;
+
+ pgnr = (bvi->bvec.offset + cur + off0) >> PAGE_SHIFT;
+ DBG_BUGON(pgnr >= be->nr_pages);
+
+ scur = bvi->bvec.offset + cur -
+ ((pgnr << PAGE_SHIFT) - off0);
+ len = min_t(unsigned int, end - cur, PAGE_SIZE - scur);
+ if (!be->decompressed_pages[pgnr]) {
+ err = -EFSCORRUPTED;
+ cur += len;
+ continue;
+ }
+ src = kmap_local_page(be->decompressed_pages[pgnr]);
+ memcpy(dst + cur, src + scur, len);
+ kunmap_local(src);
+ cur += len;
+ }
+ kunmap_local(dst);
+ z_erofs_onlinepage_endio(bvi->bvec.page, err);
+ list_del(p);
+ kfree(bvi);
+ }
+}
+
+static void z_erofs_parse_out_bvecs(struct z_erofs_decompress_backend *be)
+{
+ struct z_erofs_pcluster *pcl = be->pcl;
+ struct z_erofs_bvec_iter biter;
+ struct page *old_bvpage;
+ int i;
+
+ z_erofs_bvec_iter_begin(&biter, &pcl->bvset, Z_EROFS_INLINE_BVECS, 0);
+ for (i = 0; i < pcl->vcnt; ++i) {
+ struct z_erofs_bvec bvec;
+
+ z_erofs_bvec_dequeue(&biter, &bvec, &old_bvpage);
+
+ if (old_bvpage)
+ z_erofs_put_shortlivedpage(be->pagepool, old_bvpage);
+
+ DBG_BUGON(z_erofs_page_is_invalidated(bvec.page));
+ z_erofs_do_decompressed_bvec(be, &bvec);
+ }
+
+ old_bvpage = z_erofs_bvec_iter_end(&biter);
+ if (old_bvpage)
+ z_erofs_put_shortlivedpage(be->pagepool, old_bvpage);
+}
+
+static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend *be,
+ bool *overlapped)
+{
+ struct z_erofs_pcluster *pcl = be->pcl;
+ unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
+ int i, err = 0;
+
+ *overlapped = false;
+ for (i = 0; i < pclusterpages; ++i) {
+ struct z_erofs_bvec *bvec = &pcl->compressed_bvecs[i];
+ struct page *page = bvec->page;
+
+ /* compressed pages ought to be present before decompressing */
+ if (!page) {
+ DBG_BUGON(1);
+ continue;
+ }
+ be->compressed_pages[i] = page;
+
+ if (z_erofs_is_inline_pcluster(pcl)) {
+ if (!PageUptodate(page))
+ err = -EIO;
+ continue;
+ }
+
+ DBG_BUGON(z_erofs_page_is_invalidated(page));
+ if (!z_erofs_is_shortlived_page(page)) {
+ if (erofs_page_is_managed(EROFS_SB(be->sb), page)) {
+ if (!PageUptodate(page))
+ err = -EIO;
+ continue;
+ }
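+			/* otherwise it's an in-place I/O page: compressed input overlaps output */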
+ z_erofs_do_decompressed_bvec(be, bvec);
+ *overlapped = true;
+ }
+ }
+
+ if (err)
+ return err;
+ return 0;
+}
+
+static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
+ int err)
+{
+ struct erofs_sb_info *const sbi = EROFS_SB(be->sb);
+ struct z_erofs_pcluster *pcl = be->pcl;
+ unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
+ const struct z_erofs_decompressor *decompressor =
+ &erofs_decompressors[pcl->algorithmformat];
+ unsigned int i, inputsize;
+ int err2;
+ struct page *page;
+ bool overlapped;
+
+ mutex_lock(&pcl->lock);
+ be->nr_pages = PAGE_ALIGN(pcl->length + pcl->pageofs_out) >> PAGE_SHIFT;
+
+	/* allocate (de)compressed page arrays if they cannot be kept on the stack */
+ be->decompressed_pages = NULL;
+ be->compressed_pages = NULL;
+ be->onstack_used = 0;
+ if (be->nr_pages <= Z_EROFS_ONSTACK_PAGES) {
+ be->decompressed_pages = be->onstack_pages;
+ be->onstack_used = be->nr_pages;
+ memset(be->decompressed_pages, 0,
+ sizeof(struct page *) * be->nr_pages);
+ }
+
+ if (pclusterpages + be->onstack_used <= Z_EROFS_ONSTACK_PAGES)
+ be->compressed_pages = be->onstack_pages + be->onstack_used;
+
+ if (!be->decompressed_pages)
+ be->decompressed_pages =
+ kvcalloc(be->nr_pages, sizeof(struct page *),
+ GFP_KERNEL | __GFP_NOFAIL);
+ if (!be->compressed_pages)
+ be->compressed_pages =
+ kvcalloc(pclusterpages, sizeof(struct page *),
+ GFP_KERNEL | __GFP_NOFAIL);
+
+ z_erofs_parse_out_bvecs(be);
+ err2 = z_erofs_parse_in_bvecs(be, &overlapped);
+ if (err2)
+ err = err2;
+ if (err)
+ goto out;
+
+ if (z_erofs_is_inline_pcluster(pcl))
+ inputsize = pcl->tailpacking_size;
+ else
+ inputsize = pclusterpages * PAGE_SIZE;
+
+ err = decompressor->decompress(&(struct z_erofs_decompress_req) {
+ .sb = be->sb,
+ .in = be->compressed_pages,
+ .out = be->decompressed_pages,
+ .pageofs_in = pcl->pageofs_in,
+ .pageofs_out = pcl->pageofs_out,
+ .inputsize = inputsize,
+ .outputsize = pcl->length,
+ .alg = pcl->algorithmformat,
+ .inplace_io = overlapped,
+ .partial_decoding = pcl->partial,
+ .fillgaps = pcl->multibases,
+ }, be->pagepool);
+
+out:
+ /* must handle all compressed pages before actual file pages */
+ if (z_erofs_is_inline_pcluster(pcl)) {
+ page = pcl->compressed_bvecs[0].page;
+ WRITE_ONCE(pcl->compressed_bvecs[0].page, NULL);
+ put_page(page);
+ } else {
+ for (i = 0; i < pclusterpages; ++i) {
+ /* consider shortlived pages added when decompressing */
+ page = be->compressed_pages[i];
+
+ if (erofs_page_is_managed(sbi, page))
+ continue;
+ (void)z_erofs_put_shortlivedpage(be->pagepool, page);
+ WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL);
+ }
+ }
+ if (be->compressed_pages < be->onstack_pages ||
+ be->compressed_pages >= be->onstack_pages + Z_EROFS_ONSTACK_PAGES)
+ kvfree(be->compressed_pages);
+ z_erofs_fill_other_copies(be, err);
+
+ for (i = 0; i < be->nr_pages; ++i) {
+ page = be->decompressed_pages[i];
+ if (!page)
+ continue;
+
+ DBG_BUGON(z_erofs_page_is_invalidated(page));
+
+ /* recycle all individual short-lived pages */
+ if (z_erofs_put_shortlivedpage(be->pagepool, page))
+ continue;
+ z_erofs_onlinepage_endio(page, err);
+ }
+
+ if (be->decompressed_pages != be->onstack_pages)
+ kvfree(be->decompressed_pages);
+
+ pcl->length = 0;
+ pcl->partial = true;
+ pcl->multibases = false;
+ pcl->bvset.nextpage = NULL;
+ pcl->vcnt = 0;
+
+ /* pcluster lock MUST be taken before the following line */
+ WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_NIL);
+ mutex_unlock(&pcl->lock);
+ return err;
+}
+
+static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io,
+ struct page **pagepool)
+{
+ struct z_erofs_decompress_backend be = {
+ .sb = io->sb,
+ .pagepool = pagepool,
+ .decompressed_secondary_bvecs =
+ LIST_HEAD_INIT(be.decompressed_secondary_bvecs),
+ };
+ z_erofs_next_pcluster_t owned = io->head;
+
+ while (owned != Z_EROFS_PCLUSTER_TAIL) {
+ DBG_BUGON(owned == Z_EROFS_PCLUSTER_NIL);
+
+ be.pcl = container_of(owned, struct z_erofs_pcluster, next);
+ owned = READ_ONCE(be.pcl->next);
+
+ z_erofs_decompress_pcluster(&be, io->eio ? -EIO : 0);
+ if (z_erofs_is_inline_pcluster(be.pcl))
+ z_erofs_free_pcluster(be.pcl);
+ else
+ erofs_workgroup_put(&be.pcl->obj);
+ }
+}
+
+static void z_erofs_decompressqueue_work(struct work_struct *work)
+{
+ struct z_erofs_decompressqueue *bgq =
+ container_of(work, struct z_erofs_decompressqueue, u.work);
+ struct page *pagepool = NULL;
+
+ DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL);
+ z_erofs_decompress_queue(bgq, &pagepool);
+ erofs_release_pages(&pagepool);
+ kvfree(bgq);
+}
+
+#ifdef CONFIG_EROFS_FS_PCPU_KTHREAD
+static void z_erofs_decompressqueue_kthread_work(struct kthread_work *work)
+{
+ z_erofs_decompressqueue_work((struct work_struct *)work);
+}
+#endif
+
+static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,
+ int bios)
+{
+ struct erofs_sb_info *const sbi = EROFS_SB(io->sb);
+
+ /* wake up the caller thread for sync decompression */
+ if (io->sync) {
+ if (!atomic_add_return(bios, &io->pending_bios))
+ complete(&io->u.done);
+ return;
+ }
+
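+	/* for async queues, proceed only when no bios remain pending */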
+ if (atomic_add_return(bios, &io->pending_bios))
+ return;
+ /* Use (kthread_)work and sync decompression for atomic contexts only */
+ if (!in_task() || irqs_disabled() || rcu_read_lock_any_held()) {
+#ifdef CONFIG_EROFS_FS_PCPU_KTHREAD
+ struct kthread_worker *worker;
+
+ rcu_read_lock();
+ worker = rcu_dereference(
+ z_erofs_pcpu_workers[raw_smp_processor_id()]);
+ if (!worker) {
+ INIT_WORK(&io->u.work, z_erofs_decompressqueue_work);
+ queue_work(z_erofs_workqueue, &io->u.work);
+ } else {
+ kthread_queue_work(worker, &io->u.kthread_work);
+ }
+ rcu_read_unlock();
+#else
+ queue_work(z_erofs_workqueue, &io->u.work);
+#endif
+ /* enable sync decompression for readahead */
+ if (sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO)
+ sbi->opt.sync_decompress = EROFS_SYNC_DECOMPRESS_FORCE_ON;
+ return;
+ }
+ z_erofs_decompressqueue_work(&io->u.work);
+}
+
+static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl,
+ unsigned int nr,
+ struct page **pagepool,
+ struct address_space *mc)
+{
+ const pgoff_t index = pcl->obj.index;
+ gfp_t gfp = mapping_gfp_mask(mc);
+ bool tocache = false;
+
+ struct address_space *mapping;
+ struct page *oldpage, *page;
+ int justfound;
+
+repeat:
+ page = READ_ONCE(pcl->compressed_bvecs[nr].page);
+ oldpage = page;
+
+ if (!page)
+ goto out_allocpage;
+
+ justfound = (unsigned long)page & 1UL;
+ page = (struct page *)((unsigned long)page & ~1UL);
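+	/* bit 0 tags pages recorded earlier by z_erofs_bind_cache() */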
+
+ /*
+	 * a preallocated cached page, which is used to avoid direct reclaim;
+	 * otherwise, the in-place I/O path will be taken instead.
+ */
+ if (page->private == Z_EROFS_PREALLOCATED_PAGE) {
+ WRITE_ONCE(pcl->compressed_bvecs[nr].page, page);
+ set_page_private(page, 0);
+ tocache = true;
+ goto out_tocache;
+ }
+ mapping = READ_ONCE(page->mapping);
+
+ /*
+	 * file-backed online pages in the pcluster are all locked and stable,
+ * therefore it is impossible for `mapping' to be NULL.
+ */
+ if (mapping && mapping != mc)
+ /* ought to be unmanaged pages */
+ goto out;
+
+ /* directly return for shortlived page as well */
+ if (z_erofs_is_shortlived_page(page))
+ goto out;
+
+ lock_page(page);
+
+ /* only true if page reclaim goes wrong, should never happen */
+ DBG_BUGON(justfound && PagePrivate(page));
+
+	/* the page is still in the managed cache */
+ if (page->mapping == mc) {
+ WRITE_ONCE(pcl->compressed_bvecs[nr].page, page);
+
+ if (!PagePrivate(page)) {
+ /*
+			 * under the current restriction, the page cannot be
+			 * !PagePrivate(page) if it is already recorded in
+			 * compressed_bvecs[].
+ */
+ DBG_BUGON(!justfound);
+
+ justfound = 0;
+ set_page_private(page, (unsigned long)pcl);
+ SetPagePrivate(page);
+ }
+
+ /* no need to submit io if it is already up-to-date */
+ if (PageUptodate(page)) {
+ unlock_page(page);
+ page = NULL;
+ }
+ goto out;
+ }
+
+ /*
+	 * the managed page has been truncated; it's unsafe to
+	 * reuse it, so let's allocate a new cache-managed page.
+ */
+ DBG_BUGON(page->mapping);
+ DBG_BUGON(!justfound);
+
+ tocache = true;
+ unlock_page(page);
+ put_page(page);
+out_allocpage:
+ page = erofs_allocpage(pagepool, gfp | __GFP_NOFAIL);
+ if (oldpage != cmpxchg(&pcl->compressed_bvecs[nr].page,
+ oldpage, page)) {
+ erofs_pagepool_add(pagepool, page);
+ cond_resched();
+ goto repeat;
+ }
+out_tocache:
+ if (!tocache || add_to_page_cache_lru(page, mc, index + nr, gfp)) {
+		/* turn it into a temporary page if that fails (1 ref) */
+ set_page_private(page, Z_EROFS_SHORTLIVED_PAGE);
+ goto out;
+ }
+ attach_page_private(page, pcl);
+ /* drop a refcount added by allocpage (then we have 2 refs here) */
+ put_page(page);
+
+out: /* the only exit (for tracing and debugging) */
+ return page;
+}
+
+static struct z_erofs_decompressqueue *jobqueue_init(struct super_block *sb,
+ struct z_erofs_decompressqueue *fgq, bool *fg)
+{
+ struct z_erofs_decompressqueue *q;
+
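+	/* prefer an async (background) queue; fall back to the caller's foreground one */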
+ if (fg && !*fg) {
+ q = kvzalloc(sizeof(*q), GFP_KERNEL | __GFP_NOWARN);
+ if (!q) {
+ *fg = true;
+ goto fg_out;
+ }
+#ifdef CONFIG_EROFS_FS_PCPU_KTHREAD
+ kthread_init_work(&q->u.kthread_work,
+ z_erofs_decompressqueue_kthread_work);
+#else
+ INIT_WORK(&q->u.work, z_erofs_decompressqueue_work);
+#endif
+ } else {
+fg_out:
+ q = fgq;
+ init_completion(&fgq->u.done);
+ atomic_set(&fgq->pending_bios, 0);
+ q->eio = false;
+ q->sync = true;
+ }
+ q->sb = sb;
+ q->head = Z_EROFS_PCLUSTER_TAIL;
+ return q;
+}
+
+/* define decompression jobqueue types */
+enum {
+ JQ_BYPASS,
+ JQ_SUBMIT,
+ NR_JOBQUEUES,
+};
+
+static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl,
+ z_erofs_next_pcluster_t qtail[],
+ z_erofs_next_pcluster_t owned_head)
+{
+ z_erofs_next_pcluster_t *const submit_qtail = qtail[JQ_SUBMIT];
+ z_erofs_next_pcluster_t *const bypass_qtail = qtail[JQ_BYPASS];
+
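+	/*
+	 * unlink this pcluster from the submission chain (relinking its
+	 * successor there) and append it to the tail of the bypass chain.
+	 */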
+ WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_TAIL);
+
+ WRITE_ONCE(*submit_qtail, owned_head);
+ WRITE_ONCE(*bypass_qtail, &pcl->next);
+
+ qtail[JQ_BYPASS] = &pcl->next;
+}
+
+static void z_erofs_decompressqueue_endio(struct bio *bio)
+{
+ struct z_erofs_decompressqueue *q = bio->bi_private;
+ blk_status_t err = bio->bi_status;
+ struct bio_vec *bvec;
+ struct bvec_iter_all iter_all;
+
+ bio_for_each_segment_all(bvec, bio, iter_all) {
+ struct page *page = bvec->bv_page;
+
+ DBG_BUGON(PageUptodate(page));
+ DBG_BUGON(z_erofs_page_is_invalidated(page));
+
+ if (erofs_page_is_managed(EROFS_SB(q->sb), page)) {
+ if (!err)
+ SetPageUptodate(page);
+ unlock_page(page);
+ }
+ }
+ if (err)
+ q->eio = true;
+ z_erofs_decompress_kickoff(q, -1);
+ bio_put(bio);
+}
+
+static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
+ struct z_erofs_decompressqueue *fgq,
+ bool *force_fg, bool readahead)
+{
+ struct super_block *sb = f->inode->i_sb;
+ struct address_space *mc = MNGD_MAPPING(EROFS_SB(sb));
+ z_erofs_next_pcluster_t qtail[NR_JOBQUEUES];
+ struct z_erofs_decompressqueue *q[NR_JOBQUEUES];
+ z_erofs_next_pcluster_t owned_head = f->owned_head;
+ /* bio is NULL initially, so no need to initialize last_{index,bdev} */
+ pgoff_t last_index;
+ struct block_device *last_bdev;
+ unsigned int nr_bios = 0;
+ struct bio *bio = NULL;
+ unsigned long pflags;
+ int memstall = 0;
+
+ /*
+	 * if managed cache is enabled, a bypass jobqueue is needed:
+	 * pclusters in that queue don't need to be read from the device.
+ */
+ q[JQ_BYPASS] = jobqueue_init(sb, fgq + JQ_BYPASS, NULL);
+ q[JQ_SUBMIT] = jobqueue_init(sb, fgq + JQ_SUBMIT, force_fg);
+
+ qtail[JQ_BYPASS] = &q[JQ_BYPASS]->head;
+ qtail[JQ_SUBMIT] = &q[JQ_SUBMIT]->head;
+
+ /* by default, all need io submission */
+ q[JQ_SUBMIT]->head = owned_head;
+
+ do {
+ struct erofs_map_dev mdev;
+ struct z_erofs_pcluster *pcl;
+ pgoff_t cur, end;
+ unsigned int i = 0;
+ bool bypass = true;
+
+ DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_NIL);
+ pcl = container_of(owned_head, struct z_erofs_pcluster, next);
+ owned_head = READ_ONCE(pcl->next);
+
+ if (z_erofs_is_inline_pcluster(pcl)) {
+ move_to_bypass_jobqueue(pcl, qtail, owned_head);
+ continue;
+ }
+
+ /* no device id here, thus it will always succeed */
+ mdev = (struct erofs_map_dev) {
+ .m_pa = erofs_pos(sb, pcl->obj.index),
+ };
+ (void)erofs_map_dev(sb, &mdev);
+
+ cur = erofs_blknr(sb, mdev.m_pa);
+ end = cur + pcl->pclusterpages;
+
+ do {
+ struct page *page;
+
+ page = pickup_page_for_submission(pcl, i++,
+ &f->pagepool, mc);
+ if (!page)
+ continue;
+
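+			/* submit the pending bio if this block isn't contiguous with it (or is on another device) */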
+ if (bio && (cur != last_index + 1 ||
+ last_bdev != mdev.m_bdev)) {
+submit_bio_retry:
+ submit_bio(bio);
+ if (memstall) {
+ psi_memstall_leave(&pflags);
+ memstall = 0;
+ }
+ bio = NULL;
+ }
+
+ if (unlikely(PageWorkingset(page)) && !memstall) {
+ psi_memstall_enter(&pflags);
+ memstall = 1;
+ }
+
+ if (!bio) {
+ bio = bio_alloc(mdev.m_bdev, BIO_MAX_VECS,
+ REQ_OP_READ, GFP_NOIO);
+ bio->bi_end_io = z_erofs_decompressqueue_endio;
+
+ last_bdev = mdev.m_bdev;
+ bio->bi_iter.bi_sector = (sector_t)cur <<
+ (sb->s_blocksize_bits - 9);
+ bio->bi_private = q[JQ_SUBMIT];
+ if (readahead)
+ bio->bi_opf |= REQ_RAHEAD;
+ ++nr_bios;
+ }
+
+ if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE)
+ goto submit_bio_retry;
+
+ last_index = cur;
+ bypass = false;
+ } while (++cur < end);
+
+ if (!bypass)
+ qtail[JQ_SUBMIT] = &pcl->next;
+ else
+ move_to_bypass_jobqueue(pcl, qtail, owned_head);
+ } while (owned_head != Z_EROFS_PCLUSTER_TAIL);
+
+ if (bio) {
+ submit_bio(bio);
+ if (memstall)
+ psi_memstall_leave(&pflags);
+ }
+
+ /*
+	 * although background decompression is preferred, nothing is pending
+	 * for submission; don't kick off decompression, just drop it directly.
+ */
+ if (!*force_fg && !nr_bios) {
+ kvfree(q[JQ_SUBMIT]);
+ return;
+ }
+ z_erofs_decompress_kickoff(q[JQ_SUBMIT], nr_bios);
+}
+
+static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f,
+ bool force_fg, bool ra)
+{
+ struct z_erofs_decompressqueue io[NR_JOBQUEUES];
+
+ if (f->owned_head == Z_EROFS_PCLUSTER_TAIL)
+ return;
+ z_erofs_submit_queue(f, io, &force_fg, ra);
+
+ /* handle bypass queue (no i/o pclusters) immediately */
+ z_erofs_decompress_queue(&io[JQ_BYPASS], &f->pagepool);
+
+ if (!force_fg)
+ return;
+
+ /* wait until all bios are completed */
+ wait_for_completion_io(&io[JQ_SUBMIT].u.done);
+
+ /* handle synchronous decompress queue in the caller context */
+ z_erofs_decompress_queue(&io[JQ_SUBMIT], &f->pagepool);
+}
+
+/*
+ * Since partial uptodate is still unimplemented, we have to use
+ * approximate readmore strategies as a start.
+ */
+static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f,
+ struct readahead_control *rac, bool backmost)
+{
+ struct inode *inode = f->inode;
+ struct erofs_map_blocks *map = &f->map;
+ erofs_off_t cur, end, headoffset = f->headoffset;
+ int err;
+
+ if (backmost) {
+ if (rac)
+ end = headoffset + readahead_length(rac) - 1;
+ else
+ end = headoffset + PAGE_SIZE - 1;
+ map->m_la = end;
+ err = z_erofs_map_blocks_iter(inode, map,
+ EROFS_GET_BLOCKS_READMORE);
+ if (err)
+ return;
+
+ /* expand ra for the trailing edge if readahead */
+ if (rac) {
+ cur = round_up(map->m_la + map->m_llen, PAGE_SIZE);
+ readahead_expand(rac, headoffset, cur - headoffset);
+ return;
+ }
+ end = round_up(end, PAGE_SIZE);
+ } else {
+ end = round_up(map->m_la, PAGE_SIZE);
+
+ if (!map->m_llen)
+ return;
+ }
+
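+	/* walk this extent's remaining pages backwards down to `end' and read them in */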
+ cur = map->m_la + map->m_llen - 1;
+ while ((cur >= end) && (cur < i_size_read(inode))) {
+ pgoff_t index = cur >> PAGE_SHIFT;
+ struct page *page;
+
+ page = erofs_grab_cache_page_nowait(inode->i_mapping, index);
+ if (page) {
+ if (PageUptodate(page))
+ unlock_page(page);
+ else
+ (void)z_erofs_do_read_page(f, page);
+ put_page(page);
+ }
+
+ if (cur < PAGE_SIZE)
+ break;
+ cur = (index << PAGE_SHIFT) - 1;
+ }
+}
+
+static int z_erofs_read_folio(struct file *file, struct folio *folio)
+{
+ struct inode *const inode = folio->mapping->host;
+ struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
+ struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
+ int err;
+
+ trace_erofs_read_folio(folio, false);
+ f.headoffset = (erofs_off_t)folio->index << PAGE_SHIFT;
+
+ z_erofs_pcluster_readmore(&f, NULL, true);
+ err = z_erofs_do_read_page(&f, &folio->page);
+ z_erofs_pcluster_readmore(&f, NULL, false);
+ z_erofs_pcluster_end(&f);
+
+	/* if some compressed clusters are ready, we need to submit them anyway */
+ z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, 0), false);
+
+ if (err && err != -EINTR)
+ erofs_err(inode->i_sb, "read error %d @ %lu of nid %llu",
+ err, folio->index, EROFS_I(inode)->nid);
+
+ erofs_put_metabuf(&f.map.buf);
+ erofs_release_pages(&f.pagepool);
+ return err;
+}
+
+static void z_erofs_readahead(struct readahead_control *rac)
+{
+ struct inode *const inode = rac->mapping->host;
+ struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
+ struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
+ struct folio *head = NULL, *folio;
+ unsigned int nr_folios;
+ int err;
+
+ f.headoffset = readahead_pos(rac);
+
+ z_erofs_pcluster_readmore(&f, rac, true);
+ nr_folios = readahead_count(rac);
+ trace_erofs_readpages(inode, readahead_index(rac), nr_folios, false);
+
+ while ((folio = readahead_folio(rac))) {
+ folio->private = head;
+ head = folio;
+ }
+
+ /* traverse in reverse order for best metadata I/O performance */
+ while (head) {
+ folio = head;
+ head = folio_get_private(folio);
+
+ err = z_erofs_do_read_page(&f, &folio->page);
+ if (err && err != -EINTR)
+ erofs_err(inode->i_sb, "readahead error at folio %lu @ nid %llu",
+ folio->index, EROFS_I(inode)->nid);
+ }
+ z_erofs_pcluster_readmore(&f, rac, false);
+ z_erofs_pcluster_end(&f);
+
+ z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, nr_folios), true);
+ erofs_put_metabuf(&f.map.buf);
+ erofs_release_pages(&f.pagepool);
+}
+
+const struct address_space_operations z_erofs_aops = {
+ .read_folio = z_erofs_read_folio,
+ .readahead = z_erofs_readahead,
+};
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
new file mode 100644
index 000000000..7a1a24ae4
--- /dev/null
+++ b/fs/erofs/zmap.c
@@ -0,0 +1,774 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2018-2019 HUAWEI, Inc.
+ * https://www.huawei.com/
+ */
+#include "internal.h"
+#include <asm/unaligned.h>
+#include <trace/events/erofs.h>
+
+struct z_erofs_maprecorder {
+ struct inode *inode;
+ struct erofs_map_blocks *map;
+ void *kaddr;
+
+ unsigned long lcn;
+ /* compression extent information gathered */
+ u8 type, headtype;
+ u16 clusterofs;
+ u16 delta[2];
+ erofs_blk_t pblk, compressedblks;
+ erofs_off_t nextpackoff;
+ bool partialref;
+};
+
+static int z_erofs_load_full_lcluster(struct z_erofs_maprecorder *m,
+ unsigned long lcn)
+{
+ struct inode *const inode = m->inode;
+ struct erofs_inode *const vi = EROFS_I(inode);
+ const erofs_off_t pos = Z_EROFS_FULL_INDEX_ALIGN(erofs_iloc(inode) +
+ vi->inode_isize + vi->xattr_isize) +
+ lcn * sizeof(struct z_erofs_lcluster_index);
+ struct z_erofs_lcluster_index *di;
+ unsigned int advise, type;
+
+ m->kaddr = erofs_read_metabuf(&m->map->buf, inode->i_sb,
+ erofs_blknr(inode->i_sb, pos), EROFS_KMAP);
+ if (IS_ERR(m->kaddr))
+ return PTR_ERR(m->kaddr);
+
+ m->nextpackoff = pos + sizeof(struct z_erofs_lcluster_index);
+ m->lcn = lcn;
+ di = m->kaddr + erofs_blkoff(inode->i_sb, pos);
+
+ advise = le16_to_cpu(di->di_advise);
+ type = (advise >> Z_EROFS_LI_LCLUSTER_TYPE_BIT) &
+ ((1 << Z_EROFS_LI_LCLUSTER_TYPE_BITS) - 1);
+ switch (type) {
+ case Z_EROFS_LCLUSTER_TYPE_NONHEAD:
+ m->clusterofs = 1 << vi->z_logical_clusterbits;
+ m->delta[0] = le16_to_cpu(di->di_u.delta[0]);
+ if (m->delta[0] & Z_EROFS_LI_D0_CBLKCNT) {
+ if (!(vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 |
+ Z_EROFS_ADVISE_BIG_PCLUSTER_2))) {
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
+ }
+ m->compressedblks = m->delta[0] &
+ ~Z_EROFS_LI_D0_CBLKCNT;
+ m->delta[0] = 1;
+ }
+ m->delta[1] = le16_to_cpu(di->di_u.delta[1]);
+ break;
+ case Z_EROFS_LCLUSTER_TYPE_PLAIN:
+ case Z_EROFS_LCLUSTER_TYPE_HEAD1:
+ case Z_EROFS_LCLUSTER_TYPE_HEAD2:
+ if (advise & Z_EROFS_LI_PARTIAL_REF)
+ m->partialref = true;
+ m->clusterofs = le16_to_cpu(di->di_clusterofs);
+ if (m->clusterofs >= 1 << vi->z_logical_clusterbits) {
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
+ }
+ m->pblk = le32_to_cpu(di->di_u.blkaddr);
+ break;
+ default:
+ DBG_BUGON(1);
+ return -EOPNOTSUPP;
+ }
+ m->type = type;
+ return 0;
+}
+
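+/*
+ * Each compacted lcluster entry packs `lobits' low bits (clusterofs or
+ * delta) plus a 2-bit lcluster type: read an unaligned LE32 word at bit
+ * offset `pos', return the low bits masked by `lomask' and report the
+ * lcluster type through *type.
+ */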
+static unsigned int decode_compactedbits(unsigned int lobits,
+ unsigned int lomask,
+ u8 *in, unsigned int pos, u8 *type)
+{
+ const unsigned int v = get_unaligned_le32(in + pos / 8) >> (pos & 7);
+ const unsigned int lo = v & lomask;
+
+ *type = (v >> lobits) & 3;
+ return lo;
+}
+
+static int get_compacted_la_distance(unsigned int lclusterbits,
+ unsigned int encodebits,
+ unsigned int vcnt, u8 *in, int i)
+{
+ const unsigned int lomask = (1 << lclusterbits) - 1;
+ unsigned int lo, d1 = 0;
+ u8 type;
+
+ DBG_BUGON(i >= vcnt);
+
+ do {
+ lo = decode_compactedbits(lclusterbits, lomask,
+ in, encodebits * i, &type);
+
+ if (type != Z_EROFS_LCLUSTER_TYPE_NONHEAD)
+ return d1;
+ ++d1;
+ } while (++i < vcnt);
+
+	/* reached the last lcluster (vcnt - 1, Z_EROFS_LCLUSTER_TYPE_NONHEAD) in the pack */
+ if (!(lo & Z_EROFS_LI_D0_CBLKCNT))
+ d1 += lo - 1;
+ return d1;
+}
+
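+/*
+ * A compacted index pack covers `vcnt' lclusters in (vcnt << amortizedshift)
+ * bytes: the trailing __le32 stores the base blkaddr of the pack, and the
+ * remaining bytes hold vcnt bit-packed entries of `encodebits' bits each
+ * (e.g. 2 entries of 16 bits for 4B amortization, or 16 entries of 14 bits
+ * for 2B amortization).
+ */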
+static int unpack_compacted_index(struct z_erofs_maprecorder *m,
+ unsigned int amortizedshift,
+ erofs_off_t pos, bool lookahead)
+{
+ struct erofs_inode *const vi = EROFS_I(m->inode);
+ const unsigned int lclusterbits = vi->z_logical_clusterbits;
+ const unsigned int lomask = (1 << lclusterbits) - 1;
+ unsigned int vcnt, base, lo, encodebits, nblk, eofs;
+ int i;
+ u8 *in, type;
+ bool big_pcluster;
+
+ if (1 << amortizedshift == 4 && lclusterbits <= 14)
+ vcnt = 2;
+ else if (1 << amortizedshift == 2 && lclusterbits == 12)
+ vcnt = 16;
+ else
+ return -EOPNOTSUPP;
+
+	/* note that this is not equivalent to round_up(pos, ..) */
+ m->nextpackoff = round_down(pos, vcnt << amortizedshift) +
+ (vcnt << amortizedshift);
+ big_pcluster = vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1;
+ encodebits = ((vcnt << amortizedshift) - sizeof(__le32)) * 8 / vcnt;
+ eofs = erofs_blkoff(m->inode->i_sb, pos);
+ base = round_down(eofs, vcnt << amortizedshift);
+ in = m->kaddr + base;
+
+ i = (eofs - base) >> amortizedshift;
+
+ lo = decode_compactedbits(lclusterbits, lomask,
+ in, encodebits * i, &type);
+ m->type = type;
+ if (type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) {
+ m->clusterofs = 1 << lclusterbits;
+
+ /* figure out lookahead_distance: delta[1] if needed */
+ if (lookahead)
+ m->delta[1] = get_compacted_la_distance(lclusterbits,
+ encodebits, vcnt, in, i);
+ if (lo & Z_EROFS_LI_D0_CBLKCNT) {
+ if (!big_pcluster) {
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
+ }
+ m->compressedblks = lo & ~Z_EROFS_LI_D0_CBLKCNT;
+ m->delta[0] = 1;
+ return 0;
+ } else if (i + 1 != (int)vcnt) {
+ m->delta[0] = lo;
+ return 0;
+ }
+		/*
+		 * The last lcluster in the pack is special: its lo field holds
+		 * delta[1] rather than delta[0]. Hence, derive delta[0] from the
+		 * previous lcluster instead.
+		 */
+ lo = decode_compactedbits(lclusterbits, lomask,
+ in, encodebits * (i - 1), &type);
+ if (type != Z_EROFS_LCLUSTER_TYPE_NONHEAD)
+ lo = 0;
+ else if (lo & Z_EROFS_LI_D0_CBLKCNT)
+ lo = 1;
+ m->delta[0] = lo + 1;
+ return 0;
+ }
+ m->clusterofs = lo;
+ m->delta[0] = 0;
+	/* figure out the blkaddr (pblk) for HEAD lclusters */
+ if (!big_pcluster) {
+ nblk = 1;
+ while (i > 0) {
+ --i;
+ lo = decode_compactedbits(lclusterbits, lomask,
+ in, encodebits * i, &type);
+ if (type == Z_EROFS_LCLUSTER_TYPE_NONHEAD)
+ i -= lo;
+
+ if (i >= 0)
+ ++nblk;
+ }
+ } else {
+ nblk = 0;
+ while (i > 0) {
+ --i;
+ lo = decode_compactedbits(lclusterbits, lomask,
+ in, encodebits * i, &type);
+ if (type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) {
+ if (lo & Z_EROFS_LI_D0_CBLKCNT) {
+ --i;
+ nblk += lo & ~Z_EROFS_LI_D0_CBLKCNT;
+ continue;
+ }
+				/* a big pcluster shouldn't have a plain d0 == 1 */
+ if (lo <= 1) {
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
+ }
+ i -= lo - 2;
+ continue;
+ }
+ ++nblk;
+ }
+ }
+ in += (vcnt << amortizedshift) - sizeof(__le32);
+ m->pblk = le32_to_cpu(*(__le32 *)in) + nblk;
+ return 0;
+}
+
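+/*
+ * Compacted indexes are laid out right after the (8-byte aligned) map
+ * header as three consecutive regions: an initial run of 4B-amortized
+ * lclusters to reach 32-byte alignment, an optional 2B-amortized region
+ * (if COMPACTED_2B is advised, in multiples of 16 lclusters), and a
+ * trailing 4B-amortized region for the remaining lclusters.
+ */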
+static int z_erofs_load_compact_lcluster(struct z_erofs_maprecorder *m,
+ unsigned long lcn, bool lookahead)
+{
+ struct inode *const inode = m->inode;
+ struct erofs_inode *const vi = EROFS_I(inode);
+ const erofs_off_t ebase = sizeof(struct z_erofs_map_header) +
+ ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, 8);
+ unsigned int totalidx = erofs_iblks(inode);
+ unsigned int compacted_4b_initial, compacted_2b;
+ unsigned int amortizedshift;
+ erofs_off_t pos;
+
+ if (lcn >= totalidx)
+ return -EINVAL;
+
+ m->lcn = lcn;
+	/* number of initial compacted_4b lclusters used to reach 32-byte (compacted_2b) alignment */
+ compacted_4b_initial = (32 - ebase % 32) / 4;
+ if (compacted_4b_initial == 32 / 4)
+ compacted_4b_initial = 0;
+
+ if ((vi->z_advise & Z_EROFS_ADVISE_COMPACTED_2B) &&
+ compacted_4b_initial < totalidx)
+ compacted_2b = rounddown(totalidx - compacted_4b_initial, 16);
+ else
+ compacted_2b = 0;
+
+ pos = ebase;
+ if (lcn < compacted_4b_initial) {
+ amortizedshift = 2;
+ goto out;
+ }
+ pos += compacted_4b_initial * 4;
+ lcn -= compacted_4b_initial;
+
+ if (lcn < compacted_2b) {
+ amortizedshift = 1;
+ goto out;
+ }
+ pos += compacted_2b * 2;
+ lcn -= compacted_2b;
+ amortizedshift = 2;
+out:
+ pos += lcn * (1 << amortizedshift);
+ m->kaddr = erofs_read_metabuf(&m->map->buf, inode->i_sb,
+ erofs_blknr(inode->i_sb, pos), EROFS_KMAP);
+ if (IS_ERR(m->kaddr))
+ return PTR_ERR(m->kaddr);
+ return unpack_compacted_index(m, amortizedshift, pos, lookahead);
+}
+
+static int z_erofs_load_lcluster_from_disk(struct z_erofs_maprecorder *m,
+ unsigned int lcn, bool lookahead)
+{
+ switch (EROFS_I(m->inode)->datalayout) {
+ case EROFS_INODE_COMPRESSED_FULL:
+ return z_erofs_load_full_lcluster(m, lcn);
+ case EROFS_INODE_COMPRESSED_COMPACT:
+ return z_erofs_load_compact_lcluster(m, lcn, lookahead);
+ default:
+ return -EINVAL;
+ }
+}
+
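+/*
+ * Step backwards from a NONHEAD lcluster by delta[0] hops until the
+ * HEAD (or PLAIN) lcluster that starts the extent is reached, then
+ * record its type and the logical start address (m_la) of the extent.
+ */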
+static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m,
+ unsigned int lookback_distance)
+{
+ struct super_block *sb = m->inode->i_sb;
+ struct erofs_inode *const vi = EROFS_I(m->inode);
+ const unsigned int lclusterbits = vi->z_logical_clusterbits;
+
+ while (m->lcn >= lookback_distance) {
+ unsigned long lcn = m->lcn - lookback_distance;
+ int err;
+
+ err = z_erofs_load_lcluster_from_disk(m, lcn, false);
+ if (err)
+ return err;
+
+ switch (m->type) {
+ case Z_EROFS_LCLUSTER_TYPE_NONHEAD:
+ lookback_distance = m->delta[0];
+ if (!lookback_distance)
+ goto err_bogus;
+ continue;
+ case Z_EROFS_LCLUSTER_TYPE_PLAIN:
+ case Z_EROFS_LCLUSTER_TYPE_HEAD1:
+ case Z_EROFS_LCLUSTER_TYPE_HEAD2:
+ m->headtype = m->type;
+ m->map->m_la = (lcn << lclusterbits) | m->clusterofs;
+ return 0;
+ default:
+ erofs_err(sb, "unknown type %u @ lcn %lu of nid %llu",
+ m->type, lcn, vi->nid);
+ DBG_BUGON(1);
+ return -EOPNOTSUPP;
+ }
+ }
+err_bogus:
+ erofs_err(sb, "bogus lookback distance %u @ lcn %lu of nid %llu",
+ lookback_distance, m->lcn, vi->nid);
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
+}
+
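+/*
+ * Work out the on-disk (compressed) length of the extent: without big
+ * pclusters the pcluster is exactly one lcluster; with big pclusters,
+ * the number of compressed blocks is taken from the CBLKCNT mark in the
+ * first NONHEAD lcluster after the HEAD (falling back to one lcluster
+ * if no such mark exists).
+ */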
+static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m,
+ unsigned int initial_lcn)
+{
+ struct super_block *sb = m->inode->i_sb;
+ struct erofs_inode *const vi = EROFS_I(m->inode);
+ struct erofs_map_blocks *const map = m->map;
+ const unsigned int lclusterbits = vi->z_logical_clusterbits;
+ unsigned long lcn;
+ int err;
+
+ DBG_BUGON(m->type != Z_EROFS_LCLUSTER_TYPE_PLAIN &&
+ m->type != Z_EROFS_LCLUSTER_TYPE_HEAD1 &&
+ m->type != Z_EROFS_LCLUSTER_TYPE_HEAD2);
+ DBG_BUGON(m->type != m->headtype);
+
+ if (m->headtype == Z_EROFS_LCLUSTER_TYPE_PLAIN ||
+ ((m->headtype == Z_EROFS_LCLUSTER_TYPE_HEAD1) &&
+ !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) ||
+ ((m->headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2) &&
+ !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_2))) {
+ map->m_plen = 1ULL << lclusterbits;
+ return 0;
+ }
+ lcn = m->lcn + 1;
+ if (m->compressedblks)
+ goto out;
+
+ err = z_erofs_load_lcluster_from_disk(m, lcn, false);
+ if (err)
+ return err;
+
+	/*
+	 * If the 1st NONHEAD lcluster has already been handled initially
+	 * without a valid compressedblks, it must not be a CBLKCNT lcluster;
+	 * otherwise an internal implementation error has been detected.
+	 *
+	 * The following code can handle that case properly anyway, but
+	 * BUG_ON in debugging mode so that developers notice it.
+	 */
+ DBG_BUGON(lcn == initial_lcn &&
+ m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD);
+
+ switch (m->type) {
+ case Z_EROFS_LCLUSTER_TYPE_PLAIN:
+ case Z_EROFS_LCLUSTER_TYPE_HEAD1:
+ case Z_EROFS_LCLUSTER_TYPE_HEAD2:
+ /*
+ * if the 1st NONHEAD lcluster is actually PLAIN or HEAD type
+ * rather than CBLKCNT, it's a 1 lcluster-sized pcluster.
+ */
+ m->compressedblks = 1 << (lclusterbits - sb->s_blocksize_bits);
+ break;
+ case Z_EROFS_LCLUSTER_TYPE_NONHEAD:
+ if (m->delta[0] != 1)
+			goto err_bogus_cblkcnt;
+ if (m->compressedblks)
+ break;
+ fallthrough;
+ default:
+		erofs_err(sb, "cannot find CBLKCNT @ lcn %lu of nid %llu", lcn,
+ vi->nid);
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
+ }
+out:
+ map->m_plen = erofs_pos(sb, m->compressedblks);
+ return 0;
+err_bogus_cblkcnt:
+ erofs_err(sb, "bogus CBLKCNT @ lcn %lu of nid %llu", lcn, vi->nid);
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
+}
+
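+/*
+ * Extend m_llen forward: follow delta[1] from lcluster to lcluster
+ * until the next HEAD lcluster (or EOF) is reached, so that the full
+ * decompressed length of the extent can be reported.
+ */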
+static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m)
+{
+ struct inode *inode = m->inode;
+ struct erofs_inode *vi = EROFS_I(inode);
+ struct erofs_map_blocks *map = m->map;
+ unsigned int lclusterbits = vi->z_logical_clusterbits;
+ u64 lcn = m->lcn, headlcn = map->m_la >> lclusterbits;
+ int err;
+
+ do {
+ /* handle the last EOF pcluster (no next HEAD lcluster) */
+ if ((lcn << lclusterbits) >= inode->i_size) {
+ map->m_llen = inode->i_size - map->m_la;
+ return 0;
+ }
+
+ err = z_erofs_load_lcluster_from_disk(m, lcn, true);
+ if (err)
+ return err;
+
+ if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) {
+ DBG_BUGON(!m->delta[1] &&
+ m->clusterofs != 1 << lclusterbits);
+ } else if (m->type == Z_EROFS_LCLUSTER_TYPE_PLAIN ||
+ m->type == Z_EROFS_LCLUSTER_TYPE_HEAD1 ||
+ m->type == Z_EROFS_LCLUSTER_TYPE_HEAD2) {
+ /* go on until the next HEAD lcluster */
+ if (lcn != headlcn)
+ break;
+ m->delta[1] = 1;
+ } else {
+ erofs_err(inode->i_sb, "unknown type %u @ lcn %llu of nid %llu",
+ m->type, lcn, vi->nid);
+ DBG_BUGON(1);
+ return -EOPNOTSUPP;
+ }
+ lcn += m->delta[1];
+ } while (m->delta[1]);
+
+ map->m_llen = (lcn << lclusterbits) + m->clusterofs - map->m_la;
+ return 0;
+}
+
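+/*
+ * Core mapping routine: load the lcluster containing the requested
+ * offset, look back to its HEAD lcluster if it is a NONHEAD one, then
+ * fill in m_la/m_llen, the physical extent (m_pa/m_plen) and the
+ * algorithm format. EROFS_GET_BLOCKS_FINDTAIL is used at inode setup
+ * time to locate inline (ztailpacking) or fragment tail extents.
+ */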
+static int z_erofs_do_map_blocks(struct inode *inode,
+ struct erofs_map_blocks *map, int flags)
+{
+ struct erofs_inode *const vi = EROFS_I(inode);
+ bool ztailpacking = vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER;
+ bool fragment = vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER;
+ struct z_erofs_maprecorder m = {
+ .inode = inode,
+ .map = map,
+ };
+ int err = 0;
+ unsigned int lclusterbits, endoff, afmt;
+ unsigned long initial_lcn;
+ unsigned long long ofs, end;
+
+ lclusterbits = vi->z_logical_clusterbits;
+ ofs = flags & EROFS_GET_BLOCKS_FINDTAIL ? inode->i_size - 1 : map->m_la;
+ initial_lcn = ofs >> lclusterbits;
+ endoff = ofs & ((1 << lclusterbits) - 1);
+
+ err = z_erofs_load_lcluster_from_disk(&m, initial_lcn, false);
+ if (err)
+ goto unmap_out;
+
+ if (ztailpacking && (flags & EROFS_GET_BLOCKS_FINDTAIL))
+ vi->z_idataoff = m.nextpackoff;
+
+ map->m_flags = EROFS_MAP_MAPPED | EROFS_MAP_ENCODED;
+ end = (m.lcn + 1ULL) << lclusterbits;
+
+ switch (m.type) {
+ case Z_EROFS_LCLUSTER_TYPE_PLAIN:
+ case Z_EROFS_LCLUSTER_TYPE_HEAD1:
+ case Z_EROFS_LCLUSTER_TYPE_HEAD2:
+ if (endoff >= m.clusterofs) {
+ m.headtype = m.type;
+ map->m_la = (m.lcn << lclusterbits) | m.clusterofs;
+			/*
+			 * For ztailpacking files, special EOF lclusters (which
+			 * may consist of three parts at most) are supported in
+			 * order to inline data more effectively.
+			 */
+ if (ztailpacking && end > inode->i_size)
+ end = inode->i_size;
+ break;
+ }
+ /* m.lcn should be >= 1 if endoff < m.clusterofs */
+ if (!m.lcn) {
+ erofs_err(inode->i_sb,
+ "invalid logical cluster 0 at nid %llu",
+ vi->nid);
+ err = -EFSCORRUPTED;
+ goto unmap_out;
+ }
+ end = (m.lcn << lclusterbits) | m.clusterofs;
+ map->m_flags |= EROFS_MAP_FULL_MAPPED;
+ m.delta[0] = 1;
+ fallthrough;
+ case Z_EROFS_LCLUSTER_TYPE_NONHEAD:
+ /* get the corresponding first chunk */
+ err = z_erofs_extent_lookback(&m, m.delta[0]);
+ if (err)
+ goto unmap_out;
+ break;
+ default:
+ erofs_err(inode->i_sb,
+ "unknown type %u @ offset %llu of nid %llu",
+ m.type, ofs, vi->nid);
+ err = -EOPNOTSUPP;
+ goto unmap_out;
+ }
+ if (m.partialref)
+ map->m_flags |= EROFS_MAP_PARTIAL_REF;
+ map->m_llen = end - map->m_la;
+
+ if (flags & EROFS_GET_BLOCKS_FINDTAIL) {
+ vi->z_tailextent_headlcn = m.lcn;
+ /* for non-compact indexes, fragmentoff is 64 bits */
+ if (fragment && vi->datalayout == EROFS_INODE_COMPRESSED_FULL)
+ vi->z_fragmentoff |= (u64)m.pblk << 32;
+ }
+ if (ztailpacking && m.lcn == vi->z_tailextent_headlcn) {
+ map->m_flags |= EROFS_MAP_META;
+ map->m_pa = vi->z_idataoff;
+ map->m_plen = vi->z_idata_size;
+ } else if (fragment && m.lcn == vi->z_tailextent_headlcn) {
+ map->m_flags |= EROFS_MAP_FRAGMENT;
+ } else {
+ map->m_pa = erofs_pos(inode->i_sb, m.pblk);
+ err = z_erofs_get_extent_compressedlen(&m, initial_lcn);
+ if (err)
+ goto unmap_out;
+ }
+
+ if (m.headtype == Z_EROFS_LCLUSTER_TYPE_PLAIN) {
+ if (map->m_llen > map->m_plen) {
+ DBG_BUGON(1);
+ err = -EFSCORRUPTED;
+ goto unmap_out;
+ }
+ afmt = vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER ?
+ Z_EROFS_COMPRESSION_INTERLACED :
+ Z_EROFS_COMPRESSION_SHIFTED;
+ } else {
+ afmt = m.headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2 ?
+ vi->z_algorithmtype[1] : vi->z_algorithmtype[0];
+ if (!(EROFS_I_SB(inode)->available_compr_algs & (1 << afmt))) {
+ erofs_err(inode->i_sb, "inconsistent algorithmtype %u for nid %llu",
+ afmt, vi->nid);
+ err = -EFSCORRUPTED;
+ goto unmap_out;
+ }
+ }
+ map->m_algorithmformat = afmt;
+
+ if ((flags & EROFS_GET_BLOCKS_FIEMAP) ||
+ ((flags & EROFS_GET_BLOCKS_READMORE) &&
+ (map->m_algorithmformat == Z_EROFS_COMPRESSION_LZMA ||
+ map->m_algorithmformat == Z_EROFS_COMPRESSION_DEFLATE) &&
+ map->m_llen >= i_blocksize(inode))) {
+ err = z_erofs_get_extent_decompressedlen(&m);
+ if (!err)
+ map->m_flags |= EROFS_MAP_FULL_MAPPED;
+ }
+
+unmap_out:
+ erofs_unmap_metabuf(&m.map->buf);
+ return err;
+}
+
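+/*
+ * Parse the z_erofs_map_header (located right after the 8-byte aligned
+ * end of the on-disk inode and its xattrs) on first use, validate the
+ * compression configuration and set up fragment/ztailpacking state.
+ * Initialization is serialized by EROFS_I_BL_Z_BIT and published via
+ * smp_mb() + EROFS_I_Z_INITED_BIT.
+ */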
+static int z_erofs_fill_inode_lazy(struct inode *inode)
+{
+ struct erofs_inode *const vi = EROFS_I(inode);
+ struct super_block *const sb = inode->i_sb;
+ int err, headnr;
+ erofs_off_t pos;
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
+ void *kaddr;
+ struct z_erofs_map_header *h;
+
+ if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) {
+ /*
+ * paired with smp_mb() at the end of the function to ensure
+ * fields will only be observed after the bit is set.
+ */
+ smp_mb();
+ return 0;
+ }
+
+ if (wait_on_bit_lock(&vi->flags, EROFS_I_BL_Z_BIT, TASK_KILLABLE))
+ return -ERESTARTSYS;
+
+ err = 0;
+ if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags))
+ goto out_unlock;
+
+ pos = ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, 8);
+ kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(sb, pos), EROFS_KMAP);
+ if (IS_ERR(kaddr)) {
+ err = PTR_ERR(kaddr);
+ goto out_unlock;
+ }
+
+ h = kaddr + erofs_blkoff(sb, pos);
+	/*
+	 * if the highest bit of the 8-byte map header is set, the whole file
+	 * is stored in the packed inode. The remaining bits keep z_fragmentoff.
+	 */
+ if (h->h_clusterbits >> Z_EROFS_FRAGMENT_INODE_BIT) {
+ vi->z_advise = Z_EROFS_ADVISE_FRAGMENT_PCLUSTER;
+ vi->z_fragmentoff = le64_to_cpu(*(__le64 *)h) ^ (1ULL << 63);
+ vi->z_tailextent_headlcn = 0;
+ goto done;
+ }
+ vi->z_advise = le16_to_cpu(h->h_advise);
+ vi->z_algorithmtype[0] = h->h_algorithmtype & 15;
+ vi->z_algorithmtype[1] = h->h_algorithmtype >> 4;
+
+ headnr = 0;
+ if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX ||
+ vi->z_algorithmtype[++headnr] >= Z_EROFS_COMPRESSION_MAX) {
+ erofs_err(sb, "unknown HEAD%u format %u for nid %llu, please upgrade kernel",
+ headnr + 1, vi->z_algorithmtype[headnr], vi->nid);
+ err = -EOPNOTSUPP;
+ goto out_put_metabuf;
+ }
+
+ vi->z_logical_clusterbits = sb->s_blocksize_bits + (h->h_clusterbits & 7);
+ if (!erofs_sb_has_big_pcluster(EROFS_SB(sb)) &&
+ vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 |
+ Z_EROFS_ADVISE_BIG_PCLUSTER_2)) {
+ erofs_err(sb, "per-inode big pcluster without sb feature for nid %llu",
+ vi->nid);
+ err = -EFSCORRUPTED;
+ goto out_put_metabuf;
+ }
+ if (vi->datalayout == EROFS_INODE_COMPRESSED_COMPACT &&
+ !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1) ^
+ !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_2)) {
+ erofs_err(sb, "big pcluster head1/2 of compact indexes should be consistent for nid %llu",
+ vi->nid);
+ err = -EFSCORRUPTED;
+ goto out_put_metabuf;
+ }
+
+ if (vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER) {
+ struct erofs_map_blocks map = {
+ .buf = __EROFS_BUF_INITIALIZER
+ };
+
+ vi->z_idata_size = le16_to_cpu(h->h_idata_size);
+ err = z_erofs_do_map_blocks(inode, &map,
+ EROFS_GET_BLOCKS_FINDTAIL);
+ erofs_put_metabuf(&map.buf);
+
+ if (!map.m_plen ||
+ erofs_blkoff(sb, map.m_pa) + map.m_plen > sb->s_blocksize) {
+ erofs_err(sb, "invalid tail-packing pclustersize %llu",
+ map.m_plen);
+ err = -EFSCORRUPTED;
+ }
+ if (err < 0)
+ goto out_put_metabuf;
+ }
+
+ if (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER &&
+ !(h->h_clusterbits >> Z_EROFS_FRAGMENT_INODE_BIT)) {
+ struct erofs_map_blocks map = {
+ .buf = __EROFS_BUF_INITIALIZER
+ };
+
+ vi->z_fragmentoff = le32_to_cpu(h->h_fragmentoff);
+ err = z_erofs_do_map_blocks(inode, &map,
+ EROFS_GET_BLOCKS_FINDTAIL);
+ erofs_put_metabuf(&map.buf);
+ if (err < 0)
+ goto out_put_metabuf;
+ }
+done:
+ /* paired with smp_mb() at the beginning of the function */
+ smp_mb();
+ set_bit(EROFS_I_Z_INITED_BIT, &vi->flags);
+out_put_metabuf:
+ erofs_put_metabuf(&buf);
+out_unlock:
+ clear_and_wake_up_bit(EROFS_I_BL_Z_BIT, &vi->flags);
+ return err;
+}
+
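+/*
+ * Entry point for mapping compressed inodes: post-EOF requests are left
+ * unmapped, inodes stored entirely in the packed (fragment) inode are
+ * mapped as a single fragment, and everything else is handled by
+ * z_erofs_do_map_blocks().
+ */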
+int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map,
+ int flags)
+{
+ struct erofs_inode *const vi = EROFS_I(inode);
+ int err = 0;
+
+ trace_z_erofs_map_blocks_iter_enter(inode, map, flags);
+
+ /* when trying to read beyond EOF, leave it unmapped */
+ if (map->m_la >= inode->i_size) {
+ map->m_llen = map->m_la + 1 - inode->i_size;
+ map->m_la = inode->i_size;
+ map->m_flags = 0;
+ goto out;
+ }
+
+ err = z_erofs_fill_inode_lazy(inode);
+ if (err)
+ goto out;
+
+ if ((vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER) &&
+ !vi->z_tailextent_headlcn) {
+ map->m_la = 0;
+ map->m_llen = inode->i_size;
+ map->m_flags = EROFS_MAP_MAPPED | EROFS_MAP_FULL_MAPPED |
+ EROFS_MAP_FRAGMENT;
+ goto out;
+ }
+
+ err = z_erofs_do_map_blocks(inode, map, flags);
+out:
+ trace_z_erofs_map_blocks_iter_exit(inode, map, flags, err);
+ return err;
+}
+
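+/*
+ * Report-only iomap_begin (used via z_erofs_iomap_report_ops, e.g. for
+ * FIEMAP): mapped extents report the physical pcluster address, while
+ * fragments and holes are reported with IOMAP_NULL_ADDR.
+ */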
+static int z_erofs_iomap_begin_report(struct inode *inode, loff_t offset,
+ loff_t length, unsigned int flags,
+ struct iomap *iomap, struct iomap *srcmap)
+{
+ int ret;
+ struct erofs_map_blocks map = { .m_la = offset };
+
+ ret = z_erofs_map_blocks_iter(inode, &map, EROFS_GET_BLOCKS_FIEMAP);
+ erofs_put_metabuf(&map.buf);
+ if (ret < 0)
+ return ret;
+
+ iomap->bdev = inode->i_sb->s_bdev;
+ iomap->offset = map.m_la;
+ iomap->length = map.m_llen;
+ if (map.m_flags & EROFS_MAP_MAPPED) {
+ iomap->type = IOMAP_MAPPED;
+ iomap->addr = map.m_flags & EROFS_MAP_FRAGMENT ?
+ IOMAP_NULL_ADDR : map.m_pa;
+ } else {
+ iomap->type = IOMAP_HOLE;
+ iomap->addr = IOMAP_NULL_ADDR;
+		/*
+		 * There is no strict rule on how to describe extents past EOF,
+		 * but it has to be done as below; otherwise, iomap itself will
+		 * get into an endless loop on post-EOF accesses.
+		 *
+		 * Calculate the effective offset by subtracting the extent
+		 * start (map.m_la) from the requested offset, and add it to
+		 * the length. (NB: offset >= map.m_la always holds.)
+		 */
+ if (iomap->offset >= inode->i_size)
+ iomap->length = length + offset - map.m_la;
+ }
+ iomap->flags = 0;
+ return 0;
+}
+
+const struct iomap_ops z_erofs_iomap_report_ops = {
+ .iomap_begin = z_erofs_iomap_begin_report,
+};