author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-11 08:27:49 +0000
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-11 08:27:49 +0000
commit | ace9429bb58fd418f0c81d4c2835699bddf6bde6 (patch)
tree | b2d64bc10158fdd5497876388cd68142ca374ed3 /fs/erofs
parent | Initial commit. (diff)
download | linux-ace9429bb58fd418f0c81d4c2835699bddf6bde6.tar.xz linux-ace9429bb58fd418f0c81d4c2835699bddf6bde6.zip
Adding upstream version 6.6.15.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'fs/erofs')

-rw-r--r-- | fs/erofs/Kconfig | 145
-rw-r--r-- | fs/erofs/Makefile | 9
-rw-r--r-- | fs/erofs/compress.h | 105
-rw-r--r-- | fs/erofs/data.c | 452
-rw-r--r-- | fs/erofs/decompressor.c | 445
-rw-r--r-- | fs/erofs/decompressor_deflate.c | 248
-rw-r--r-- | fs/erofs/decompressor_lzma.c | 294
-rw-r--r-- | fs/erofs/dir.c | 111
-rw-r--r-- | fs/erofs/erofs_fs.h | 461
-rw-r--r-- | fs/erofs/fscache.c | 612
-rw-r--r-- | fs/erofs/inode.c | 395
-rw-r--r-- | fs/erofs/internal.h | 537
-rw-r--r-- | fs/erofs/namei.c | 224
-rw-r--r-- | fs/erofs/pcpubuf.c | 148
-rw-r--r-- | fs/erofs/super.c | 1021
-rw-r--r-- | fs/erofs/sysfs.c | 277
-rw-r--r-- | fs/erofs/utils.c | 282
-rw-r--r-- | fs/erofs/xattr.c | 560
-rw-r--r-- | fs/erofs/xattr.h | 73
-rw-r--r-- | fs/erofs/zdata.c | 1894
-rw-r--r-- | fs/erofs/zmap.c | 774

21 files changed, 9067 insertions(+), 0 deletions(-)
diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig new file mode 100644 index 0000000000..f6dc961e6c --- /dev/null +++ b/fs/erofs/Kconfig @@ -0,0 +1,145 @@ +# SPDX-License-Identifier: GPL-2.0-only + +config EROFS_FS + tristate "EROFS filesystem support" + depends on BLOCK + select FS_IOMAP + select LIBCRC32C + help + EROFS (Enhanced Read-Only File System) is a lightweight read-only + file system with modern designs (e.g. no buffer heads, inline + xattrs/data, chunk-based deduplication, multiple devices, etc.) for + scenarios which need high-performance read-only solutions, e.g. + smartphones with Android OS, LiveCDs and high-density hosts with + numerous containers; + + It also provides fixed-sized output compression support in order to + improve storage density as well as keep relatively higher compression + ratios and implements in-place decompression to reuse the file page + for compressed data temporarily with proper strategies, which is + quite useful to ensure guaranteed end-to-end runtime decompression + performance under extremely memory pressure without extra cost. + + See the documentation at <file:Documentation/filesystems/erofs.rst> + for more details. + + If unsure, say N. + +config EROFS_FS_DEBUG + bool "EROFS debugging feature" + depends on EROFS_FS + help + Print debugging messages and enable more BUG_ONs which check + filesystem consistency and find potential issues aggressively, + which can be used for Android eng build, for example. + + For daily use, say N. + +config EROFS_FS_XATTR + bool "EROFS extended attributes" + depends on EROFS_FS + select XXHASH + default y + help + Extended attributes are name:value pairs associated with inodes by + the kernel or by users (see the attr(5) manual page, or visit + <http://acl.bestbits.at/> for details). + + If unsure, say N. + +config EROFS_FS_POSIX_ACL + bool "EROFS Access Control Lists" + depends on EROFS_FS_XATTR + select FS_POSIX_ACL + default y + help + Posix Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the POSIX ACLs for + Linux website <http://acl.bestbits.at/>. + + If you don't know what Access Control Lists are, say N. + +config EROFS_FS_SECURITY + bool "EROFS Security Labels" + depends on EROFS_FS_XATTR + default y + help + Security labels provide an access control facility to support Linux + Security Models (LSMs) accepted by AppArmor, SELinux, Smack and TOMOYO + Linux. This option enables an extended attribute handler for file + security labels in the erofs filesystem, so that it requires enabling + the extended attribute support in advance. + + If you are not using a security module, say N. + +config EROFS_FS_ZIP + bool "EROFS Data Compression Support" + depends on EROFS_FS + select LZ4_DECOMPRESS + default y + help + Enable fixed-sized output compression for EROFS. + + If you don't want to enable compression feature, say N. + +config EROFS_FS_ZIP_LZMA + bool "EROFS LZMA compressed data support" + depends on EROFS_FS_ZIP + select XZ_DEC + select XZ_DEC_MICROLZMA + help + Saying Y here includes support for reading EROFS file systems + containing LZMA compressed data, specifically called microLZMA. it + gives better compression ratios than the LZ4 algorithm, at the + expense of more CPU overhead. + + LZMA support is an experimental feature for now and so most file + systems will be readable without selecting this option. + + If unsure, say N. 
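As an aside on the options above (not part of this commit): the extra codecs only become selectable once EROFS_FS_ZIP is enabled, which itself depends on EROFS_FS, and LZ4 decompression comes with EROFS_FS_ZIP itself (it selects LZ4_DECOMPRESS), while microLZMA and DEFLATE are opt-in since the help texts mark them experimental. A purely illustrative .config sketch for a kernel with EROFS plus LZ4 and microLZMA read support might look like this; all symbols not listed are assumed to keep their defaults.

```
# Illustrative sketch only; actual values depend on the rest of the
# kernel configuration (e.g. CONFIG_BLOCK must already be enabled).
CONFIG_EROFS_FS=y
CONFIG_EROFS_FS_XATTR=y
CONFIG_EROFS_FS_POSIX_ACL=y
CONFIG_EROFS_FS_ZIP=y
CONFIG_EROFS_FS_ZIP_LZMA=y
```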
+ +config EROFS_FS_ZIP_DEFLATE + bool "EROFS DEFLATE compressed data support" + depends on EROFS_FS_ZIP + select ZLIB_INFLATE + help + Saying Y here includes support for reading EROFS file systems + containing DEFLATE compressed data. It gives better compression + ratios than the default LZ4 format, while it costs more CPU + overhead. + + DEFLATE support is an experimental feature for now and so most + file systems will be readable without selecting this option. + + If unsure, say N. + +config EROFS_FS_ONDEMAND + bool "EROFS fscache-based on-demand read support" + depends on CACHEFILES_ONDEMAND && (EROFS_FS=m && FSCACHE || EROFS_FS=y && FSCACHE=y) + default n + help + This permits EROFS to use fscache-backed data blobs with on-demand + read support. + + If unsure, say N. + +config EROFS_FS_PCPU_KTHREAD + bool "EROFS per-cpu decompression kthread workers" + depends on EROFS_FS_ZIP + help + Saying Y here enables per-CPU kthread workers pool to carry out + async decompression for low latencies on some architectures. + + If unsure, say N. + +config EROFS_FS_PCPU_KTHREAD_HIPRI + bool "EROFS high priority per-CPU kthread workers" + depends on EROFS_FS_ZIP && EROFS_FS_PCPU_KTHREAD + default y + help + This permits EROFS to configure per-CPU kthread workers to run + at higher priority. + + If unsure, say N. diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile new file mode 100644 index 0000000000..994d0b9ded --- /dev/null +++ b/fs/erofs/Makefile @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-$(CONFIG_EROFS_FS) += erofs.o +erofs-objs := super.o inode.o data.o namei.o dir.o utils.o sysfs.o +erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o +erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o pcpubuf.o +erofs-$(CONFIG_EROFS_FS_ZIP_LZMA) += decompressor_lzma.o +erofs-$(CONFIG_EROFS_FS_ZIP_DEFLATE) += decompressor_deflate.o +erofs-$(CONFIG_EROFS_FS_ONDEMAND) += fscache.o diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h new file mode 100644 index 0000000000..279933e007 --- /dev/null +++ b/fs/erofs/compress.h @@ -0,0 +1,105 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2019 HUAWEI, Inc. 
+ * https://www.huawei.com/ + */ +#ifndef __EROFS_FS_COMPRESS_H +#define __EROFS_FS_COMPRESS_H + +#include "internal.h" + +struct z_erofs_decompress_req { + struct super_block *sb; + struct page **in, **out; + + unsigned short pageofs_in, pageofs_out; + unsigned int inputsize, outputsize; + + /* indicate the algorithm will be used for decompression */ + unsigned int alg; + bool inplace_io, partial_decoding, fillgaps; +}; + +struct z_erofs_decompressor { + int (*config)(struct super_block *sb, struct erofs_super_block *dsb, + void *data, int size); + int (*decompress)(struct z_erofs_decompress_req *rq, + struct page **pagepool); + char *name; +}; + +/* some special page->private (unsigned long, see below) */ +#define Z_EROFS_SHORTLIVED_PAGE (-1UL << 2) +#define Z_EROFS_PREALLOCATED_PAGE (-2UL << 2) + +/* + * For all pages in a pcluster, page->private should be one of + * Type Last 2bits page->private + * short-lived page 00 Z_EROFS_SHORTLIVED_PAGE + * preallocated page (tryalloc) 00 Z_EROFS_PREALLOCATED_PAGE + * cached/managed page 00 pointer to z_erofs_pcluster + * online page (file-backed, 01/10/11 sub-index << 2 | count + * some pages can be used for inplace I/O) + * + * page->mapping should be one of + * Type page->mapping + * short-lived page NULL + * preallocated page NULL + * cached/managed page non-NULL or NULL (invalidated/truncated page) + * online page non-NULL + * + * For all managed pages, PG_private should be set with 1 extra refcount, + * which is used for page reclaim / migration. + */ + +/* + * short-lived pages are pages directly from buddy system with specific + * page->private (no need to set PagePrivate since these are non-LRU / + * non-movable pages and bypass reclaim / migration code). + */ +static inline bool z_erofs_is_shortlived_page(struct page *page) +{ + if (page->private != Z_EROFS_SHORTLIVED_PAGE) + return false; + + DBG_BUGON(page->mapping); + return true; +} + +static inline bool z_erofs_put_shortlivedpage(struct page **pagepool, + struct page *page) +{ + if (!z_erofs_is_shortlived_page(page)) + return false; + + /* short-lived pages should not be used by others at the same time */ + if (page_ref_count(page) > 1) { + put_page(page); + } else { + /* follow the pcluster rule above. */ + erofs_pagepool_add(pagepool, page); + } + return true; +} + +#define MNGD_MAPPING(sbi) ((sbi)->managed_cache->i_mapping) +static inline bool erofs_page_is_managed(const struct erofs_sb_info *sbi, + struct page *page) +{ + return page->mapping == MNGD_MAPPING(sbi); +} + +int z_erofs_fixup_insize(struct z_erofs_decompress_req *rq, const char *padbuf, + unsigned int padbufsize); +extern const struct z_erofs_decompressor erofs_decompressors[]; + +/* prototypes for specific algorithms */ +int z_erofs_load_lzma_config(struct super_block *sb, + struct erofs_super_block *dsb, void *data, int size); +int z_erofs_load_deflate_config(struct super_block *sb, + struct erofs_super_block *dsb, void *data, int size); +int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq, + struct page **pagepool); +int z_erofs_deflate_decompress(struct z_erofs_decompress_req *rq, + struct page **pagepool); +#endif diff --git a/fs/erofs/data.c b/fs/erofs/data.c new file mode 100644 index 0000000000..0c2c99c58b --- /dev/null +++ b/fs/erofs/data.c @@ -0,0 +1,452 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2017-2018 HUAWEI, Inc. 
+ * https://www.huawei.com/ + * Copyright (C) 2021, Alibaba Cloud + */ +#include "internal.h" +#include <linux/prefetch.h> +#include <linux/sched/mm.h> +#include <linux/dax.h> +#include <trace/events/erofs.h> + +void erofs_unmap_metabuf(struct erofs_buf *buf) +{ + if (buf->kmap_type == EROFS_KMAP) + kunmap_local(buf->base); + buf->base = NULL; + buf->kmap_type = EROFS_NO_KMAP; +} + +void erofs_put_metabuf(struct erofs_buf *buf) +{ + if (!buf->page) + return; + erofs_unmap_metabuf(buf); + put_page(buf->page); + buf->page = NULL; +} + +/* + * Derive the block size from inode->i_blkbits to make compatible with + * anonymous inode in fscache mode. + */ +void *erofs_bread(struct erofs_buf *buf, erofs_blk_t blkaddr, + enum erofs_kmap_type type) +{ + struct inode *inode = buf->inode; + erofs_off_t offset = (erofs_off_t)blkaddr << inode->i_blkbits; + pgoff_t index = offset >> PAGE_SHIFT; + struct page *page = buf->page; + struct folio *folio; + unsigned int nofs_flag; + + if (!page || page->index != index) { + erofs_put_metabuf(buf); + + nofs_flag = memalloc_nofs_save(); + folio = read_cache_folio(inode->i_mapping, index, NULL, NULL); + memalloc_nofs_restore(nofs_flag); + if (IS_ERR(folio)) + return folio; + + /* should already be PageUptodate, no need to lock page */ + page = folio_file_page(folio, index); + buf->page = page; + } + if (buf->kmap_type == EROFS_NO_KMAP) { + if (type == EROFS_KMAP) + buf->base = kmap_local_page(page); + buf->kmap_type = type; + } else if (buf->kmap_type != type) { + DBG_BUGON(1); + return ERR_PTR(-EFAULT); + } + if (type == EROFS_NO_KMAP) + return NULL; + return buf->base + (offset & ~PAGE_MASK); +} + +void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb) +{ + if (erofs_is_fscache_mode(sb)) + buf->inode = EROFS_SB(sb)->s_fscache->inode; + else + buf->inode = sb->s_bdev->bd_inode; +} + +void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb, + erofs_blk_t blkaddr, enum erofs_kmap_type type) +{ + erofs_init_metabuf(buf, sb); + return erofs_bread(buf, blkaddr, type); +} + +static int erofs_map_blocks_flatmode(struct inode *inode, + struct erofs_map_blocks *map) +{ + erofs_blk_t nblocks, lastblk; + u64 offset = map->m_la; + struct erofs_inode *vi = EROFS_I(inode); + struct super_block *sb = inode->i_sb; + bool tailendpacking = (vi->datalayout == EROFS_INODE_FLAT_INLINE); + + nblocks = erofs_iblks(inode); + lastblk = nblocks - tailendpacking; + + /* there is no hole in flatmode */ + map->m_flags = EROFS_MAP_MAPPED; + if (offset < erofs_pos(sb, lastblk)) { + map->m_pa = erofs_pos(sb, vi->raw_blkaddr) + map->m_la; + map->m_plen = erofs_pos(sb, lastblk) - offset; + } else if (tailendpacking) { + map->m_pa = erofs_iloc(inode) + vi->inode_isize + + vi->xattr_isize + erofs_blkoff(sb, offset); + map->m_plen = inode->i_size - offset; + + /* inline data should be located in the same meta block */ + if (erofs_blkoff(sb, map->m_pa) + map->m_plen > sb->s_blocksize) { + erofs_err(sb, "inline data cross block boundary @ nid %llu", + vi->nid); + DBG_BUGON(1); + return -EFSCORRUPTED; + } + map->m_flags |= EROFS_MAP_META; + } else { + erofs_err(sb, "internal error @ nid: %llu (size %llu), m_la 0x%llx", + vi->nid, inode->i_size, map->m_la); + DBG_BUGON(1); + return -EIO; + } + return 0; +} + +int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map) +{ + struct super_block *sb = inode->i_sb; + struct erofs_inode *vi = EROFS_I(inode); + struct erofs_inode_chunk_index *idx; + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; + u64 chunknr; + 
unsigned int unit; + erofs_off_t pos; + void *kaddr; + int err = 0; + + trace_erofs_map_blocks_enter(inode, map, 0); + map->m_deviceid = 0; + if (map->m_la >= inode->i_size) { + /* leave out-of-bound access unmapped */ + map->m_flags = 0; + map->m_plen = 0; + goto out; + } + + if (vi->datalayout != EROFS_INODE_CHUNK_BASED) { + err = erofs_map_blocks_flatmode(inode, map); + goto out; + } + + if (vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES) + unit = sizeof(*idx); /* chunk index */ + else + unit = EROFS_BLOCK_MAP_ENTRY_SIZE; /* block map */ + + chunknr = map->m_la >> vi->chunkbits; + pos = ALIGN(erofs_iloc(inode) + vi->inode_isize + + vi->xattr_isize, unit) + unit * chunknr; + + kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(sb, pos), EROFS_KMAP); + if (IS_ERR(kaddr)) { + err = PTR_ERR(kaddr); + goto out; + } + map->m_la = chunknr << vi->chunkbits; + map->m_plen = min_t(erofs_off_t, 1UL << vi->chunkbits, + round_up(inode->i_size - map->m_la, sb->s_blocksize)); + + /* handle block map */ + if (!(vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES)) { + __le32 *blkaddr = kaddr + erofs_blkoff(sb, pos); + + if (le32_to_cpu(*blkaddr) == EROFS_NULL_ADDR) { + map->m_flags = 0; + } else { + map->m_pa = erofs_pos(sb, le32_to_cpu(*blkaddr)); + map->m_flags = EROFS_MAP_MAPPED; + } + goto out_unlock; + } + /* parse chunk indexes */ + idx = kaddr + erofs_blkoff(sb, pos); + switch (le32_to_cpu(idx->blkaddr)) { + case EROFS_NULL_ADDR: + map->m_flags = 0; + break; + default: + map->m_deviceid = le16_to_cpu(idx->device_id) & + EROFS_SB(sb)->device_id_mask; + map->m_pa = erofs_pos(sb, le32_to_cpu(idx->blkaddr)); + map->m_flags = EROFS_MAP_MAPPED; + break; + } +out_unlock: + erofs_put_metabuf(&buf); +out: + if (!err) + map->m_llen = map->m_plen; + trace_erofs_map_blocks_exit(inode, map, 0, err); + return err; +} + +int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) +{ + struct erofs_dev_context *devs = EROFS_SB(sb)->devs; + struct erofs_device_info *dif; + int id; + + map->m_bdev = sb->s_bdev; + map->m_daxdev = EROFS_SB(sb)->dax_dev; + map->m_dax_part_off = EROFS_SB(sb)->dax_part_off; + map->m_fscache = EROFS_SB(sb)->s_fscache; + + if (map->m_deviceid) { + down_read(&devs->rwsem); + dif = idr_find(&devs->tree, map->m_deviceid - 1); + if (!dif) { + up_read(&devs->rwsem); + return -ENODEV; + } + if (devs->flatdev) { + map->m_pa += erofs_pos(sb, dif->mapped_blkaddr); + up_read(&devs->rwsem); + return 0; + } + map->m_bdev = dif->bdev; + map->m_daxdev = dif->dax_dev; + map->m_dax_part_off = dif->dax_part_off; + map->m_fscache = dif->fscache; + up_read(&devs->rwsem); + } else if (devs->extra_devices && !devs->flatdev) { + down_read(&devs->rwsem); + idr_for_each_entry(&devs->tree, dif, id) { + erofs_off_t startoff, length; + + if (!dif->mapped_blkaddr) + continue; + startoff = erofs_pos(sb, dif->mapped_blkaddr); + length = erofs_pos(sb, dif->blocks); + + if (map->m_pa >= startoff && + map->m_pa < startoff + length) { + map->m_pa -= startoff; + map->m_bdev = dif->bdev; + map->m_daxdev = dif->dax_dev; + map->m_dax_part_off = dif->dax_part_off; + map->m_fscache = dif->fscache; + break; + } + } + up_read(&devs->rwsem); + } + return 0; +} + +static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, + unsigned int flags, struct iomap *iomap, struct iomap *srcmap) +{ + int ret; + struct super_block *sb = inode->i_sb; + struct erofs_map_blocks map; + struct erofs_map_dev mdev; + + map.m_la = offset; + map.m_llen = length; + + ret = erofs_map_blocks(inode, &map); + if (ret < 0) + return ret; 
+ + mdev = (struct erofs_map_dev) { + .m_deviceid = map.m_deviceid, + .m_pa = map.m_pa, + }; + ret = erofs_map_dev(sb, &mdev); + if (ret) + return ret; + + iomap->offset = map.m_la; + if (flags & IOMAP_DAX) + iomap->dax_dev = mdev.m_daxdev; + else + iomap->bdev = mdev.m_bdev; + iomap->length = map.m_llen; + iomap->flags = 0; + iomap->private = NULL; + + if (!(map.m_flags & EROFS_MAP_MAPPED)) { + iomap->type = IOMAP_HOLE; + iomap->addr = IOMAP_NULL_ADDR; + if (!iomap->length) + iomap->length = length; + return 0; + } + + if (map.m_flags & EROFS_MAP_META) { + void *ptr; + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; + + iomap->type = IOMAP_INLINE; + ptr = erofs_read_metabuf(&buf, sb, + erofs_blknr(sb, mdev.m_pa), EROFS_KMAP); + if (IS_ERR(ptr)) + return PTR_ERR(ptr); + iomap->inline_data = ptr + erofs_blkoff(sb, mdev.m_pa); + iomap->private = buf.base; + } else { + iomap->type = IOMAP_MAPPED; + iomap->addr = mdev.m_pa; + if (flags & IOMAP_DAX) + iomap->addr += mdev.m_dax_part_off; + } + return 0; +} + +static int erofs_iomap_end(struct inode *inode, loff_t pos, loff_t length, + ssize_t written, unsigned int flags, struct iomap *iomap) +{ + void *ptr = iomap->private; + + if (ptr) { + struct erofs_buf buf = { + .page = kmap_to_page(ptr), + .base = ptr, + .kmap_type = EROFS_KMAP, + }; + + DBG_BUGON(iomap->type != IOMAP_INLINE); + erofs_put_metabuf(&buf); + } else { + DBG_BUGON(iomap->type == IOMAP_INLINE); + } + return written; +} + +static const struct iomap_ops erofs_iomap_ops = { + .iomap_begin = erofs_iomap_begin, + .iomap_end = erofs_iomap_end, +}; + +int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len) +{ + if (erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout)) { +#ifdef CONFIG_EROFS_FS_ZIP + return iomap_fiemap(inode, fieinfo, start, len, + &z_erofs_iomap_report_ops); +#else + return -EOPNOTSUPP; +#endif + } + return iomap_fiemap(inode, fieinfo, start, len, &erofs_iomap_ops); +} + +/* + * since we dont have write or truncate flows, so no inode + * locking needs to be held at the moment. 
+ */ +static int erofs_read_folio(struct file *file, struct folio *folio) +{ + return iomap_read_folio(folio, &erofs_iomap_ops); +} + +static void erofs_readahead(struct readahead_control *rac) +{ + return iomap_readahead(rac, &erofs_iomap_ops); +} + +static sector_t erofs_bmap(struct address_space *mapping, sector_t block) +{ + return iomap_bmap(mapping, block, &erofs_iomap_ops); +} + +static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct inode *inode = file_inode(iocb->ki_filp); + + /* no need taking (shared) inode lock since it's a ro filesystem */ + if (!iov_iter_count(to)) + return 0; + +#ifdef CONFIG_FS_DAX + if (IS_DAX(inode)) + return dax_iomap_rw(iocb, to, &erofs_iomap_ops); +#endif + if (iocb->ki_flags & IOCB_DIRECT) { + struct block_device *bdev = inode->i_sb->s_bdev; + unsigned int blksize_mask; + + if (bdev) + blksize_mask = bdev_logical_block_size(bdev) - 1; + else + blksize_mask = i_blocksize(inode) - 1; + + if ((iocb->ki_pos | iov_iter_count(to) | + iov_iter_alignment(to)) & blksize_mask) + return -EINVAL; + + return iomap_dio_rw(iocb, to, &erofs_iomap_ops, + NULL, 0, NULL, 0); + } + return filemap_read(iocb, to, 0); +} + +/* for uncompressed (aligned) files and raw access for other files */ +const struct address_space_operations erofs_raw_access_aops = { + .read_folio = erofs_read_folio, + .readahead = erofs_readahead, + .bmap = erofs_bmap, + .direct_IO = noop_direct_IO, + .release_folio = iomap_release_folio, + .invalidate_folio = iomap_invalidate_folio, +}; + +#ifdef CONFIG_FS_DAX +static vm_fault_t erofs_dax_huge_fault(struct vm_fault *vmf, + unsigned int order) +{ + return dax_iomap_fault(vmf, order, NULL, NULL, &erofs_iomap_ops); +} + +static vm_fault_t erofs_dax_fault(struct vm_fault *vmf) +{ + return erofs_dax_huge_fault(vmf, 0); +} + +static const struct vm_operations_struct erofs_dax_vm_ops = { + .fault = erofs_dax_fault, + .huge_fault = erofs_dax_huge_fault, +}; + +static int erofs_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + if (!IS_DAX(file_inode(file))) + return generic_file_readonly_mmap(file, vma); + + if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) + return -EINVAL; + + vma->vm_ops = &erofs_dax_vm_ops; + vm_flags_set(vma, VM_HUGEPAGE); + return 0; +} +#else +#define erofs_file_mmap generic_file_readonly_mmap +#endif + +const struct file_operations erofs_file_fops = { + .llseek = generic_file_llseek, + .read_iter = erofs_file_read_iter, + .mmap = erofs_file_mmap, + .splice_read = filemap_splice_read, +}; diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c new file mode 100644 index 0000000000..d36b3963c0 --- /dev/null +++ b/fs/erofs/decompressor.c @@ -0,0 +1,445 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2019 HUAWEI, Inc. 
+ * https://www.huawei.com/ + */ +#include "compress.h" +#include <linux/module.h> +#include <linux/lz4.h> + +#ifndef LZ4_DISTANCE_MAX /* history window size */ +#define LZ4_DISTANCE_MAX 65535 /* set to maximum value by default */ +#endif + +#define LZ4_MAX_DISTANCE_PAGES (DIV_ROUND_UP(LZ4_DISTANCE_MAX, PAGE_SIZE) + 1) +#ifndef LZ4_DECOMPRESS_INPLACE_MARGIN +#define LZ4_DECOMPRESS_INPLACE_MARGIN(srcsize) (((srcsize) >> 8) + 32) +#endif + +struct z_erofs_lz4_decompress_ctx { + struct z_erofs_decompress_req *rq; + /* # of encoded, decoded pages */ + unsigned int inpages, outpages; + /* decoded block total length (used for in-place decompression) */ + unsigned int oend; +}; + +static int z_erofs_load_lz4_config(struct super_block *sb, + struct erofs_super_block *dsb, void *data, int size) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); + struct z_erofs_lz4_cfgs *lz4 = data; + u16 distance; + + if (lz4) { + if (size < sizeof(struct z_erofs_lz4_cfgs)) { + erofs_err(sb, "invalid lz4 cfgs, size=%u", size); + return -EINVAL; + } + distance = le16_to_cpu(lz4->max_distance); + + sbi->lz4.max_pclusterblks = le16_to_cpu(lz4->max_pclusterblks); + if (!sbi->lz4.max_pclusterblks) { + sbi->lz4.max_pclusterblks = 1; /* reserved case */ + } else if (sbi->lz4.max_pclusterblks > + erofs_blknr(sb, Z_EROFS_PCLUSTER_MAX_SIZE)) { + erofs_err(sb, "too large lz4 pclusterblks %u", + sbi->lz4.max_pclusterblks); + return -EINVAL; + } + } else { + distance = le16_to_cpu(dsb->u1.lz4_max_distance); + sbi->lz4.max_pclusterblks = 1; + } + + sbi->lz4.max_distance_pages = distance ? + DIV_ROUND_UP(distance, PAGE_SIZE) + 1 : + LZ4_MAX_DISTANCE_PAGES; + return erofs_pcpubuf_growsize(sbi->lz4.max_pclusterblks); +} + +/* + * Fill all gaps with bounce pages if it's a sparse page list. Also check if + * all physical pages are consecutive, which can be seen for moderate CR. + */ +static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx, + struct page **pagepool) +{ + struct z_erofs_decompress_req *rq = ctx->rq; + struct page *availables[LZ4_MAX_DISTANCE_PAGES] = { NULL }; + unsigned long bounced[DIV_ROUND_UP(LZ4_MAX_DISTANCE_PAGES, + BITS_PER_LONG)] = { 0 }; + unsigned int lz4_max_distance_pages = + EROFS_SB(rq->sb)->lz4.max_distance_pages; + void *kaddr = NULL; + unsigned int i, j, top; + + top = 0; + for (i = j = 0; i < ctx->outpages; ++i, ++j) { + struct page *const page = rq->out[i]; + struct page *victim; + + if (j >= lz4_max_distance_pages) + j = 0; + + /* 'valid' bounced can only be tested after a complete round */ + if (!rq->fillgaps && test_bit(j, bounced)) { + DBG_BUGON(i < lz4_max_distance_pages); + DBG_BUGON(top >= lz4_max_distance_pages); + availables[top++] = rq->out[i - lz4_max_distance_pages]; + } + + if (page) { + __clear_bit(j, bounced); + if (!PageHighMem(page)) { + if (!i) { + kaddr = page_address(page); + continue; + } + if (kaddr && + kaddr + PAGE_SIZE == page_address(page)) { + kaddr += PAGE_SIZE; + continue; + } + } + kaddr = NULL; + continue; + } + kaddr = NULL; + __set_bit(j, bounced); + + if (top) { + victim = availables[--top]; + get_page(victim); + } else { + victim = erofs_allocpage(pagepool, + GFP_KERNEL | __GFP_NOFAIL); + set_page_private(victim, Z_EROFS_SHORTLIVED_PAGE); + } + rq->out[i] = victim; + } + return kaddr ? 
1 : 0; +} + +static void *z_erofs_lz4_handle_overlap(struct z_erofs_lz4_decompress_ctx *ctx, + void *inpage, void *out, unsigned int *inputmargin, + int *maptype, bool may_inplace) +{ + struct z_erofs_decompress_req *rq = ctx->rq; + unsigned int omargin, total, i; + struct page **in; + void *src, *tmp; + + if (rq->inplace_io) { + omargin = PAGE_ALIGN(ctx->oend) - ctx->oend; + if (rq->partial_decoding || !may_inplace || + omargin < LZ4_DECOMPRESS_INPLACE_MARGIN(rq->inputsize)) + goto docopy; + + for (i = 0; i < ctx->inpages; ++i) + if (rq->out[ctx->outpages - ctx->inpages + i] != + rq->in[i]) + goto docopy; + kunmap_local(inpage); + *maptype = 3; + return out + ((ctx->outpages - ctx->inpages) << PAGE_SHIFT); + } + + if (ctx->inpages <= 1) { + *maptype = 0; + return inpage; + } + kunmap_local(inpage); + src = erofs_vm_map_ram(rq->in, ctx->inpages); + if (!src) + return ERR_PTR(-ENOMEM); + *maptype = 1; + return src; + +docopy: + /* Or copy compressed data which can be overlapped to per-CPU buffer */ + in = rq->in; + src = erofs_get_pcpubuf(ctx->inpages); + if (!src) { + DBG_BUGON(1); + kunmap_local(inpage); + return ERR_PTR(-EFAULT); + } + + tmp = src; + total = rq->inputsize; + while (total) { + unsigned int page_copycnt = + min_t(unsigned int, total, PAGE_SIZE - *inputmargin); + + if (!inpage) + inpage = kmap_local_page(*in); + memcpy(tmp, inpage + *inputmargin, page_copycnt); + kunmap_local(inpage); + inpage = NULL; + tmp += page_copycnt; + total -= page_copycnt; + ++in; + *inputmargin = 0; + } + *maptype = 2; + return src; +} + +/* + * Get the exact inputsize with zero_padding feature. + * - For LZ4, it should work if zero_padding feature is on (5.3+); + * - For MicroLZMA, it'd be enabled all the time. + */ +int z_erofs_fixup_insize(struct z_erofs_decompress_req *rq, const char *padbuf, + unsigned int padbufsize) +{ + const char *padend; + + padend = memchr_inv(padbuf, 0, padbufsize); + if (!padend) + return -EFSCORRUPTED; + rq->inputsize -= padend - padbuf; + rq->pageofs_in += padend - padbuf; + return 0; +} + +static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx, + u8 *dst) +{ + struct z_erofs_decompress_req *rq = ctx->rq; + bool support_0padding = false, may_inplace = false; + unsigned int inputmargin; + u8 *out, *headpage, *src; + int ret, maptype; + + DBG_BUGON(*rq->in == NULL); + headpage = kmap_local_page(*rq->in); + + /* LZ4 decompression inplace is only safe if zero_padding is enabled */ + if (erofs_sb_has_zero_padding(EROFS_SB(rq->sb))) { + support_0padding = true; + ret = z_erofs_fixup_insize(rq, headpage + rq->pageofs_in, + min_t(unsigned int, rq->inputsize, + rq->sb->s_blocksize - rq->pageofs_in)); + if (ret) { + kunmap_local(headpage); + return ret; + } + may_inplace = !((rq->pageofs_in + rq->inputsize) & + (rq->sb->s_blocksize - 1)); + } + + inputmargin = rq->pageofs_in; + src = z_erofs_lz4_handle_overlap(ctx, headpage, dst, &inputmargin, + &maptype, may_inplace); + if (IS_ERR(src)) + return PTR_ERR(src); + + out = dst + rq->pageofs_out; + /* legacy format could compress extra data in a pcluster. 
*/ + if (rq->partial_decoding || !support_0padding) + ret = LZ4_decompress_safe_partial(src + inputmargin, out, + rq->inputsize, rq->outputsize, rq->outputsize); + else + ret = LZ4_decompress_safe(src + inputmargin, out, + rq->inputsize, rq->outputsize); + + if (ret != rq->outputsize) { + erofs_err(rq->sb, "failed to decompress %d in[%u, %u] out[%u]", + ret, rq->inputsize, inputmargin, rq->outputsize); + + print_hex_dump(KERN_DEBUG, "[ in]: ", DUMP_PREFIX_OFFSET, + 16, 1, src + inputmargin, rq->inputsize, true); + print_hex_dump(KERN_DEBUG, "[out]: ", DUMP_PREFIX_OFFSET, + 16, 1, out, rq->outputsize, true); + + if (ret >= 0) + memset(out + ret, 0, rq->outputsize - ret); + ret = -EIO; + } else { + ret = 0; + } + + if (maptype == 0) { + kunmap_local(headpage); + } else if (maptype == 1) { + vm_unmap_ram(src, ctx->inpages); + } else if (maptype == 2) { + erofs_put_pcpubuf(src); + } else if (maptype != 3) { + DBG_BUGON(1); + return -EFAULT; + } + return ret; +} + +static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, + struct page **pagepool) +{ + struct z_erofs_lz4_decompress_ctx ctx; + unsigned int dst_maptype; + void *dst; + int ret; + + ctx.rq = rq; + ctx.oend = rq->pageofs_out + rq->outputsize; + ctx.outpages = PAGE_ALIGN(ctx.oend) >> PAGE_SHIFT; + ctx.inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT; + + /* one optimized fast path only for non bigpcluster cases yet */ + if (ctx.inpages == 1 && ctx.outpages == 1 && !rq->inplace_io) { + DBG_BUGON(!*rq->out); + dst = kmap_local_page(*rq->out); + dst_maptype = 0; + goto dstmap_out; + } + + /* general decoding path which can be used for all cases */ + ret = z_erofs_lz4_prepare_dstpages(&ctx, pagepool); + if (ret < 0) { + return ret; + } else if (ret > 0) { + dst = page_address(*rq->out); + dst_maptype = 1; + } else { + dst = erofs_vm_map_ram(rq->out, ctx.outpages); + if (!dst) + return -ENOMEM; + dst_maptype = 2; + } + +dstmap_out: + ret = z_erofs_lz4_decompress_mem(&ctx, dst); + if (!dst_maptype) + kunmap_local(dst); + else if (dst_maptype == 2) + vm_unmap_ram(dst, ctx.outpages); + return ret; +} + +static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq, + struct page **pagepool) +{ + const unsigned int inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT; + const unsigned int outpages = + PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; + const unsigned int righthalf = min_t(unsigned int, rq->outputsize, + PAGE_SIZE - rq->pageofs_out); + const unsigned int lefthalf = rq->outputsize - righthalf; + const unsigned int interlaced_offset = + rq->alg == Z_EROFS_COMPRESSION_SHIFTED ? 0 : rq->pageofs_out; + u8 *src; + + if (outpages > 2 && rq->alg == Z_EROFS_COMPRESSION_SHIFTED) { + DBG_BUGON(1); + return -EFSCORRUPTED; + } + + if (rq->out[0] == *rq->in) { + DBG_BUGON(rq->pageofs_out); + return 0; + } + + src = kmap_local_page(rq->in[inpages - 1]) + rq->pageofs_in; + if (rq->out[0]) + memcpy_to_page(rq->out[0], rq->pageofs_out, + src + interlaced_offset, righthalf); + + if (outpages > inpages) { + DBG_BUGON(!rq->out[outpages - 1]); + if (rq->out[outpages - 1] != rq->in[inpages - 1]) { + memcpy_to_page(rq->out[outpages - 1], 0, src + + (interlaced_offset ? 
0 : righthalf), + lefthalf); + } else if (!interlaced_offset) { + memmove(src, src + righthalf, lefthalf); + flush_dcache_page(rq->in[inpages - 1]); + } + } + kunmap_local(src); + return 0; +} + +const struct z_erofs_decompressor erofs_decompressors[] = { + [Z_EROFS_COMPRESSION_SHIFTED] = { + .decompress = z_erofs_transform_plain, + .name = "shifted" + }, + [Z_EROFS_COMPRESSION_INTERLACED] = { + .decompress = z_erofs_transform_plain, + .name = "interlaced" + }, + [Z_EROFS_COMPRESSION_LZ4] = { + .config = z_erofs_load_lz4_config, + .decompress = z_erofs_lz4_decompress, + .name = "lz4" + }, +#ifdef CONFIG_EROFS_FS_ZIP_LZMA + [Z_EROFS_COMPRESSION_LZMA] = { + .config = z_erofs_load_lzma_config, + .decompress = z_erofs_lzma_decompress, + .name = "lzma" + }, +#endif +#ifdef CONFIG_EROFS_FS_ZIP_DEFLATE + [Z_EROFS_COMPRESSION_DEFLATE] = { + .config = z_erofs_load_deflate_config, + .decompress = z_erofs_deflate_decompress, + .name = "deflate" + }, +#endif +}; + +int z_erofs_parse_cfgs(struct super_block *sb, struct erofs_super_block *dsb) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; + unsigned int algs, alg; + erofs_off_t offset; + int size, ret = 0; + + if (!erofs_sb_has_compr_cfgs(sbi)) { + sbi->available_compr_algs = 1 << Z_EROFS_COMPRESSION_LZ4; + return z_erofs_load_lz4_config(sb, dsb, NULL, 0); + } + + sbi->available_compr_algs = le16_to_cpu(dsb->u1.available_compr_algs); + if (sbi->available_compr_algs & ~Z_EROFS_ALL_COMPR_ALGS) { + erofs_err(sb, "unidentified algorithms %x, please upgrade kernel", + sbi->available_compr_algs & ~Z_EROFS_ALL_COMPR_ALGS); + return -EOPNOTSUPP; + } + + erofs_init_metabuf(&buf, sb); + offset = EROFS_SUPER_OFFSET + sbi->sb_size; + alg = 0; + for (algs = sbi->available_compr_algs; algs; algs >>= 1, ++alg) { + void *data; + + if (!(algs & 1)) + continue; + + data = erofs_read_metadata(sb, &buf, &offset, &size); + if (IS_ERR(data)) { + ret = PTR_ERR(data); + break; + } + + if (alg >= ARRAY_SIZE(erofs_decompressors) || + !erofs_decompressors[alg].config) { + erofs_err(sb, "algorithm %d isn't enabled on this kernel", + alg); + ret = -EOPNOTSUPP; + } else { + ret = erofs_decompressors[alg].config(sb, + dsb, data, size); + } + + kfree(data); + if (ret) + break; + } + erofs_put_metabuf(&buf); + return ret; +} diff --git a/fs/erofs/decompressor_deflate.c b/fs/erofs/decompressor_deflate.c new file mode 100644 index 0000000000..0e1946a6bd --- /dev/null +++ b/fs/erofs/decompressor_deflate.c @@ -0,0 +1,248 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include <linux/module.h> +#include <linux/zlib.h> +#include "compress.h" + +struct z_erofs_deflate { + struct z_erofs_deflate *next; + struct z_stream_s z; + u8 bounce[PAGE_SIZE]; +}; + +static DEFINE_SPINLOCK(z_erofs_deflate_lock); +static unsigned int z_erofs_deflate_nstrms, z_erofs_deflate_avail_strms; +static struct z_erofs_deflate *z_erofs_deflate_head; +static DECLARE_WAIT_QUEUE_HEAD(z_erofs_deflate_wq); + +module_param_named(deflate_streams, z_erofs_deflate_nstrms, uint, 0444); + +void z_erofs_deflate_exit(void) +{ + /* there should be no running fs instance */ + while (z_erofs_deflate_avail_strms) { + struct z_erofs_deflate *strm; + + spin_lock(&z_erofs_deflate_lock); + strm = z_erofs_deflate_head; + if (!strm) { + spin_unlock(&z_erofs_deflate_lock); + continue; + } + z_erofs_deflate_head = NULL; + spin_unlock(&z_erofs_deflate_lock); + + while (strm) { + struct z_erofs_deflate *n = strm->next; + + vfree(strm->z.workspace); + kfree(strm); + 
--z_erofs_deflate_avail_strms; + strm = n; + } + } +} + +int __init z_erofs_deflate_init(void) +{ + /* by default, use # of possible CPUs instead */ + if (!z_erofs_deflate_nstrms) + z_erofs_deflate_nstrms = num_possible_cpus(); + + for (; z_erofs_deflate_avail_strms < z_erofs_deflate_nstrms; + ++z_erofs_deflate_avail_strms) { + struct z_erofs_deflate *strm; + + strm = kzalloc(sizeof(*strm), GFP_KERNEL); + if (!strm) + goto out_failed; + + /* XXX: in-kernel zlib cannot shrink windowbits currently */ + strm->z.workspace = vmalloc(zlib_inflate_workspacesize()); + if (!strm->z.workspace) { + kfree(strm); + goto out_failed; + } + + spin_lock(&z_erofs_deflate_lock); + strm->next = z_erofs_deflate_head; + z_erofs_deflate_head = strm; + spin_unlock(&z_erofs_deflate_lock); + } + return 0; + +out_failed: + pr_err("failed to allocate zlib workspace\n"); + z_erofs_deflate_exit(); + return -ENOMEM; +} + +int z_erofs_load_deflate_config(struct super_block *sb, + struct erofs_super_block *dsb, void *data, int size) +{ + struct z_erofs_deflate_cfgs *dfl = data; + + if (!dfl || size < sizeof(struct z_erofs_deflate_cfgs)) { + erofs_err(sb, "invalid deflate cfgs, size=%u", size); + return -EINVAL; + } + + if (dfl->windowbits > MAX_WBITS) { + erofs_err(sb, "unsupported windowbits %u", dfl->windowbits); + return -EOPNOTSUPP; + } + + erofs_info(sb, "EXPERIMENTAL DEFLATE feature in use. Use at your own risk!"); + return 0; +} + +int z_erofs_deflate_decompress(struct z_erofs_decompress_req *rq, + struct page **pagepool) +{ + const unsigned int nrpages_out = + PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; + const unsigned int nrpages_in = + PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT; + struct super_block *sb = rq->sb; + unsigned int insz, outsz, pofs; + struct z_erofs_deflate *strm; + u8 *kin, *kout = NULL; + bool bounced = false; + int no = -1, ni = 0, j = 0, zerr, err; + + /* 1. get the exact DEFLATE compressed size */ + kin = kmap_local_page(*rq->in); + err = z_erofs_fixup_insize(rq, kin + rq->pageofs_in, + min_t(unsigned int, rq->inputsize, + sb->s_blocksize - rq->pageofs_in)); + if (err) { + kunmap_local(kin); + return err; + } + + /* 2. get an available DEFLATE context */ +again: + spin_lock(&z_erofs_deflate_lock); + strm = z_erofs_deflate_head; + if (!strm) { + spin_unlock(&z_erofs_deflate_lock); + wait_event(z_erofs_deflate_wq, READ_ONCE(z_erofs_deflate_head)); + goto again; + } + z_erofs_deflate_head = strm->next; + spin_unlock(&z_erofs_deflate_lock); + + /* 3. 
multi-call decompress */ + insz = rq->inputsize; + outsz = rq->outputsize; + zerr = zlib_inflateInit2(&strm->z, -MAX_WBITS); + if (zerr != Z_OK) { + err = -EIO; + goto failed_zinit; + } + + pofs = rq->pageofs_out; + strm->z.avail_in = min_t(u32, insz, PAGE_SIZE - rq->pageofs_in); + insz -= strm->z.avail_in; + strm->z.next_in = kin + rq->pageofs_in; + strm->z.avail_out = 0; + + while (1) { + if (!strm->z.avail_out) { + if (++no >= nrpages_out || !outsz) { + erofs_err(sb, "insufficient space for decompressed data"); + err = -EFSCORRUPTED; + break; + } + + if (kout) + kunmap_local(kout); + strm->z.avail_out = min_t(u32, outsz, PAGE_SIZE - pofs); + outsz -= strm->z.avail_out; + if (!rq->out[no]) { + rq->out[no] = erofs_allocpage(pagepool, + GFP_KERNEL | __GFP_NOFAIL); + set_page_private(rq->out[no], + Z_EROFS_SHORTLIVED_PAGE); + } + kout = kmap_local_page(rq->out[no]); + strm->z.next_out = kout + pofs; + pofs = 0; + } + + if (!strm->z.avail_in && insz) { + if (++ni >= nrpages_in) { + erofs_err(sb, "invalid compressed data"); + err = -EFSCORRUPTED; + break; + } + + if (kout) { /* unlike kmap(), take care of the orders */ + j = strm->z.next_out - kout; + kunmap_local(kout); + } + kunmap_local(kin); + strm->z.avail_in = min_t(u32, insz, PAGE_SIZE); + insz -= strm->z.avail_in; + kin = kmap_local_page(rq->in[ni]); + strm->z.next_in = kin; + bounced = false; + if (kout) { + kout = kmap_local_page(rq->out[no]); + strm->z.next_out = kout + j; + } + } + + /* + * Handle overlapping: Use bounced buffer if the compressed + * data is under processing; Or use short-lived pages from the + * on-stack pagepool where pages share among the same request + * and not _all_ inplace I/O pages are needed to be doubled. + */ + if (!bounced && rq->out[no] == rq->in[ni]) { + memcpy(strm->bounce, strm->z.next_in, strm->z.avail_in); + strm->z.next_in = strm->bounce; + bounced = true; + } + + for (j = ni + 1; j < nrpages_in; ++j) { + struct page *tmppage; + + if (rq->out[no] != rq->in[j]) + continue; + + DBG_BUGON(erofs_page_is_managed(EROFS_SB(sb), + rq->in[j])); + tmppage = erofs_allocpage(pagepool, + GFP_KERNEL | __GFP_NOFAIL); + set_page_private(tmppage, Z_EROFS_SHORTLIVED_PAGE); + copy_highpage(tmppage, rq->in[j]); + rq->in[j] = tmppage; + } + + zerr = zlib_inflate(&strm->z, Z_SYNC_FLUSH); + if (zerr != Z_OK || !(outsz + strm->z.avail_out)) { + if (zerr == Z_OK && rq->partial_decoding) + break; + if (zerr == Z_STREAM_END && !outsz) + break; + erofs_err(sb, "failed to decompress %d in[%u] out[%u]", + zerr, rq->inputsize, rq->outputsize); + err = -EFSCORRUPTED; + break; + } + } + + if (zlib_inflateEnd(&strm->z) != Z_OK && !err) + err = -EIO; + if (kout) + kunmap_local(kout); +failed_zinit: + kunmap_local(kin); + /* 4. 
push back DEFLATE stream context to the global list */ + spin_lock(&z_erofs_deflate_lock); + strm->next = z_erofs_deflate_head; + z_erofs_deflate_head = strm; + spin_unlock(&z_erofs_deflate_lock); + wake_up(&z_erofs_deflate_wq); + return err; +} diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c new file mode 100644 index 0000000000..ba4ec73f4a --- /dev/null +++ b/fs/erofs/decompressor_lzma.c @@ -0,0 +1,294 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include <linux/xz.h> +#include <linux/module.h> +#include "compress.h" + +struct z_erofs_lzma { + struct z_erofs_lzma *next; + struct xz_dec_microlzma *state; + struct xz_buf buf; + u8 bounce[PAGE_SIZE]; +}; + +/* considering the LZMA performance, no need to use a lockless list for now */ +static DEFINE_SPINLOCK(z_erofs_lzma_lock); +static unsigned int z_erofs_lzma_max_dictsize; +static unsigned int z_erofs_lzma_nstrms, z_erofs_lzma_avail_strms; +static struct z_erofs_lzma *z_erofs_lzma_head; +static DECLARE_WAIT_QUEUE_HEAD(z_erofs_lzma_wq); + +module_param_named(lzma_streams, z_erofs_lzma_nstrms, uint, 0444); + +void z_erofs_lzma_exit(void) +{ + /* there should be no running fs instance */ + while (z_erofs_lzma_avail_strms) { + struct z_erofs_lzma *strm; + + spin_lock(&z_erofs_lzma_lock); + strm = z_erofs_lzma_head; + if (!strm) { + spin_unlock(&z_erofs_lzma_lock); + DBG_BUGON(1); + return; + } + z_erofs_lzma_head = NULL; + spin_unlock(&z_erofs_lzma_lock); + + while (strm) { + struct z_erofs_lzma *n = strm->next; + + if (strm->state) + xz_dec_microlzma_end(strm->state); + kfree(strm); + --z_erofs_lzma_avail_strms; + strm = n; + } + } +} + +int __init z_erofs_lzma_init(void) +{ + unsigned int i; + + /* by default, use # of possible CPUs instead */ + if (!z_erofs_lzma_nstrms) + z_erofs_lzma_nstrms = num_possible_cpus(); + + for (i = 0; i < z_erofs_lzma_nstrms; ++i) { + struct z_erofs_lzma *strm = kzalloc(sizeof(*strm), GFP_KERNEL); + + if (!strm) { + z_erofs_lzma_exit(); + return -ENOMEM; + } + spin_lock(&z_erofs_lzma_lock); + strm->next = z_erofs_lzma_head; + z_erofs_lzma_head = strm; + spin_unlock(&z_erofs_lzma_lock); + ++z_erofs_lzma_avail_strms; + } + return 0; +} + +int z_erofs_load_lzma_config(struct super_block *sb, + struct erofs_super_block *dsb, void *data, int size) +{ + static DEFINE_MUTEX(lzma_resize_mutex); + struct z_erofs_lzma_cfgs *lzma = data; + unsigned int dict_size, i; + struct z_erofs_lzma *strm, *head = NULL; + int err; + + if (!lzma || size < sizeof(struct z_erofs_lzma_cfgs)) { + erofs_err(sb, "invalid lzma cfgs, size=%u", size); + return -EINVAL; + } + if (lzma->format) { + erofs_err(sb, "unidentified lzma format %x, please check kernel version", + le16_to_cpu(lzma->format)); + return -EINVAL; + } + dict_size = le32_to_cpu(lzma->dict_size); + if (dict_size > Z_EROFS_LZMA_MAX_DICT_SIZE || dict_size < 4096) { + erofs_err(sb, "unsupported lzma dictionary size %u", + dict_size); + return -EINVAL; + } + + erofs_info(sb, "EXPERIMENTAL MicroLZMA in use. Use at your own risk!"); + + /* in case 2 z_erofs_load_lzma_config() race to avoid deadlock */ + mutex_lock(&lzma_resize_mutex); + + if (z_erofs_lzma_max_dictsize >= dict_size) { + mutex_unlock(&lzma_resize_mutex); + return 0; + } + + /* 1. 
collect/isolate all streams for the following check */ + for (i = 0; i < z_erofs_lzma_avail_strms; ++i) { + struct z_erofs_lzma *last; + +again: + spin_lock(&z_erofs_lzma_lock); + strm = z_erofs_lzma_head; + if (!strm) { + spin_unlock(&z_erofs_lzma_lock); + wait_event(z_erofs_lzma_wq, + READ_ONCE(z_erofs_lzma_head)); + goto again; + } + z_erofs_lzma_head = NULL; + spin_unlock(&z_erofs_lzma_lock); + + for (last = strm; last->next; last = last->next) + ++i; + last->next = head; + head = strm; + } + + err = 0; + /* 2. walk each isolated stream and grow max dict_size if needed */ + for (strm = head; strm; strm = strm->next) { + if (strm->state) + xz_dec_microlzma_end(strm->state); + strm->state = xz_dec_microlzma_alloc(XZ_PREALLOC, dict_size); + if (!strm->state) + err = -ENOMEM; + } + + /* 3. push back all to the global list and update max dict_size */ + spin_lock(&z_erofs_lzma_lock); + DBG_BUGON(z_erofs_lzma_head); + z_erofs_lzma_head = head; + spin_unlock(&z_erofs_lzma_lock); + wake_up_all(&z_erofs_lzma_wq); + + z_erofs_lzma_max_dictsize = dict_size; + mutex_unlock(&lzma_resize_mutex); + return err; +} + +int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq, + struct page **pagepool) +{ + const unsigned int nrpages_out = + PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; + const unsigned int nrpages_in = + PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT; + unsigned int inlen, outlen, pageofs; + struct z_erofs_lzma *strm; + u8 *kin; + bool bounced = false; + int no, ni, j, err = 0; + + /* 1. get the exact LZMA compressed size */ + kin = kmap(*rq->in); + err = z_erofs_fixup_insize(rq, kin + rq->pageofs_in, + min_t(unsigned int, rq->inputsize, + rq->sb->s_blocksize - rq->pageofs_in)); + if (err) { + kunmap(*rq->in); + return err; + } + + /* 2. get an available lzma context */ +again: + spin_lock(&z_erofs_lzma_lock); + strm = z_erofs_lzma_head; + if (!strm) { + spin_unlock(&z_erofs_lzma_lock); + wait_event(z_erofs_lzma_wq, READ_ONCE(z_erofs_lzma_head)); + goto again; + } + z_erofs_lzma_head = strm->next; + spin_unlock(&z_erofs_lzma_lock); + + /* 3. 
multi-call decompress */ + inlen = rq->inputsize; + outlen = rq->outputsize; + xz_dec_microlzma_reset(strm->state, inlen, outlen, + !rq->partial_decoding); + pageofs = rq->pageofs_out; + strm->buf.in = kin + rq->pageofs_in; + strm->buf.in_pos = 0; + strm->buf.in_size = min_t(u32, inlen, PAGE_SIZE - rq->pageofs_in); + inlen -= strm->buf.in_size; + strm->buf.out = NULL; + strm->buf.out_pos = 0; + strm->buf.out_size = 0; + + for (ni = 0, no = -1;;) { + enum xz_ret xz_err; + + if (strm->buf.out_pos == strm->buf.out_size) { + if (strm->buf.out) { + kunmap(rq->out[no]); + strm->buf.out = NULL; + } + + if (++no >= nrpages_out || !outlen) { + erofs_err(rq->sb, "decompressed buf out of bound"); + err = -EFSCORRUPTED; + break; + } + strm->buf.out_pos = 0; + strm->buf.out_size = min_t(u32, outlen, + PAGE_SIZE - pageofs); + outlen -= strm->buf.out_size; + if (!rq->out[no] && rq->fillgaps) { /* deduped */ + rq->out[no] = erofs_allocpage(pagepool, + GFP_KERNEL | __GFP_NOFAIL); + set_page_private(rq->out[no], + Z_EROFS_SHORTLIVED_PAGE); + } + if (rq->out[no]) + strm->buf.out = kmap(rq->out[no]) + pageofs; + pageofs = 0; + } else if (strm->buf.in_pos == strm->buf.in_size) { + kunmap(rq->in[ni]); + + if (++ni >= nrpages_in || !inlen) { + erofs_err(rq->sb, "compressed buf out of bound"); + err = -EFSCORRUPTED; + break; + } + strm->buf.in_pos = 0; + strm->buf.in_size = min_t(u32, inlen, PAGE_SIZE); + inlen -= strm->buf.in_size; + kin = kmap(rq->in[ni]); + strm->buf.in = kin; + bounced = false; + } + + /* + * Handle overlapping: Use bounced buffer if the compressed + * data is under processing; Otherwise, Use short-lived pages + * from the on-stack pagepool where pages share with the same + * request. + */ + if (!bounced && rq->out[no] == rq->in[ni]) { + memcpy(strm->bounce, strm->buf.in, strm->buf.in_size); + strm->buf.in = strm->bounce; + bounced = true; + } + for (j = ni + 1; j < nrpages_in; ++j) { + struct page *tmppage; + + if (rq->out[no] != rq->in[j]) + continue; + + DBG_BUGON(erofs_page_is_managed(EROFS_SB(rq->sb), + rq->in[j])); + tmppage = erofs_allocpage(pagepool, + GFP_KERNEL | __GFP_NOFAIL); + set_page_private(tmppage, Z_EROFS_SHORTLIVED_PAGE); + copy_highpage(tmppage, rq->in[j]); + rq->in[j] = tmppage; + } + xz_err = xz_dec_microlzma_run(strm->state, &strm->buf); + DBG_BUGON(strm->buf.out_pos > strm->buf.out_size); + DBG_BUGON(strm->buf.in_pos > strm->buf.in_size); + + if (xz_err != XZ_OK) { + if (xz_err == XZ_STREAM_END && !outlen) + break; + erofs_err(rq->sb, "failed to decompress %d in[%u] out[%u]", + xz_err, rq->inputsize, rq->outputsize); + err = -EFSCORRUPTED; + break; + } + } + if (no < nrpages_out && strm->buf.out) + kunmap(rq->out[no]); + if (ni < nrpages_in) + kunmap(rq->in[ni]); + /* 4. push back LZMA stream context to the global list */ + spin_lock(&z_erofs_lzma_lock); + strm->next = z_erofs_lzma_head; + z_erofs_lzma_head = strm; + spin_unlock(&z_erofs_lzma_lock); + wake_up(&z_erofs_lzma_wq); + return err; +} diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c new file mode 100644 index 0000000000..b80abec053 --- /dev/null +++ b/fs/erofs/dir.c @@ -0,0 +1,111 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2017-2018 HUAWEI, Inc. 
+ * https://www.huawei.com/ + * Copyright (C) 2022, Alibaba Cloud + */ +#include "internal.h" + +static int erofs_fill_dentries(struct inode *dir, struct dir_context *ctx, + void *dentry_blk, struct erofs_dirent *de, + unsigned int nameoff, unsigned int maxsize) +{ + const struct erofs_dirent *end = dentry_blk + nameoff; + + while (de < end) { + const char *de_name; + unsigned int de_namelen; + unsigned char d_type; + + d_type = fs_ftype_to_dtype(de->file_type); + + nameoff = le16_to_cpu(de->nameoff); + de_name = (char *)dentry_blk + nameoff; + + /* the last dirent in the block? */ + if (de + 1 >= end) + de_namelen = strnlen(de_name, maxsize - nameoff); + else + de_namelen = le16_to_cpu(de[1].nameoff) - nameoff; + + /* a corrupted entry is found */ + if (nameoff + de_namelen > maxsize || + de_namelen > EROFS_NAME_LEN) { + erofs_err(dir->i_sb, "bogus dirent @ nid %llu", + EROFS_I(dir)->nid); + DBG_BUGON(1); + return -EFSCORRUPTED; + } + + if (!dir_emit(ctx, de_name, de_namelen, + le64_to_cpu(de->nid), d_type)) + return 1; + ++de; + ctx->pos += sizeof(struct erofs_dirent); + } + return 0; +} + +static int erofs_readdir(struct file *f, struct dir_context *ctx) +{ + struct inode *dir = file_inode(f); + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; + struct super_block *sb = dir->i_sb; + unsigned long bsz = sb->s_blocksize; + const size_t dirsize = i_size_read(dir); + unsigned int i = erofs_blknr(sb, ctx->pos); + unsigned int ofs = erofs_blkoff(sb, ctx->pos); + int err = 0; + bool initial = true; + + buf.inode = dir; + while (ctx->pos < dirsize) { + struct erofs_dirent *de; + unsigned int nameoff, maxsize; + + de = erofs_bread(&buf, i, EROFS_KMAP); + if (IS_ERR(de)) { + erofs_err(sb, "fail to readdir of logical block %u of nid %llu", + i, EROFS_I(dir)->nid); + err = PTR_ERR(de); + break; + } + + nameoff = le16_to_cpu(de->nameoff); + if (nameoff < sizeof(struct erofs_dirent) || nameoff >= bsz) { + erofs_err(sb, "invalid de[0].nameoff %u @ nid %llu", + nameoff, EROFS_I(dir)->nid); + err = -EFSCORRUPTED; + break; + } + + maxsize = min_t(unsigned int, dirsize - ctx->pos + ofs, bsz); + + /* search dirents at the arbitrary position */ + if (initial) { + initial = false; + + ofs = roundup(ofs, sizeof(struct erofs_dirent)); + ctx->pos = erofs_pos(sb, i) + ofs; + if (ofs >= nameoff) + goto skip_this; + } + + err = erofs_fill_dentries(dir, ctx, de, (void *)de + ofs, + nameoff, maxsize); + if (err) + break; +skip_this: + ctx->pos = erofs_pos(sb, i) + maxsize; + ++i; + ofs = 0; + } + erofs_put_metabuf(&buf); + return err < 0 ? err : 0; +} + +const struct file_operations erofs_dir_fops = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .iterate_shared = erofs_readdir, +}; diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h new file mode 100644 index 0000000000..a03ec70ba6 --- /dev/null +++ b/fs/erofs/erofs_fs.h @@ -0,0 +1,461 @@ +/* SPDX-License-Identifier: GPL-2.0-only OR Apache-2.0 */ +/* + * EROFS (Enhanced ROM File System) on-disk format definition + * + * Copyright (C) 2017-2018 HUAWEI, Inc. + * https://www.huawei.com/ + * Copyright (C) 2021, Alibaba Cloud + */ +#ifndef __EROFS_FS_H +#define __EROFS_FS_H + +#define EROFS_SUPER_OFFSET 1024 + +#define EROFS_FEATURE_COMPAT_SB_CHKSUM 0x00000001 +#define EROFS_FEATURE_COMPAT_MTIME 0x00000002 +#define EROFS_FEATURE_COMPAT_XATTR_FILTER 0x00000004 + +/* + * Any bits that aren't in EROFS_ALL_FEATURE_INCOMPAT should + * be incompatible with this kernel version. 
+ */ +#define EROFS_FEATURE_INCOMPAT_ZERO_PADDING 0x00000001 +#define EROFS_FEATURE_INCOMPAT_COMPR_CFGS 0x00000002 +#define EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER 0x00000002 +#define EROFS_FEATURE_INCOMPAT_CHUNKED_FILE 0x00000004 +#define EROFS_FEATURE_INCOMPAT_DEVICE_TABLE 0x00000008 +#define EROFS_FEATURE_INCOMPAT_COMPR_HEAD2 0x00000008 +#define EROFS_FEATURE_INCOMPAT_ZTAILPACKING 0x00000010 +#define EROFS_FEATURE_INCOMPAT_FRAGMENTS 0x00000020 +#define EROFS_FEATURE_INCOMPAT_DEDUPE 0x00000020 +#define EROFS_FEATURE_INCOMPAT_XATTR_PREFIXES 0x00000040 +#define EROFS_ALL_FEATURE_INCOMPAT \ + (EROFS_FEATURE_INCOMPAT_ZERO_PADDING | \ + EROFS_FEATURE_INCOMPAT_COMPR_CFGS | \ + EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER | \ + EROFS_FEATURE_INCOMPAT_CHUNKED_FILE | \ + EROFS_FEATURE_INCOMPAT_DEVICE_TABLE | \ + EROFS_FEATURE_INCOMPAT_COMPR_HEAD2 | \ + EROFS_FEATURE_INCOMPAT_ZTAILPACKING | \ + EROFS_FEATURE_INCOMPAT_FRAGMENTS | \ + EROFS_FEATURE_INCOMPAT_DEDUPE | \ + EROFS_FEATURE_INCOMPAT_XATTR_PREFIXES) + +#define EROFS_SB_EXTSLOT_SIZE 16 + +struct erofs_deviceslot { + u8 tag[64]; /* digest(sha256), etc. */ + __le32 blocks; /* total fs blocks of this device */ + __le32 mapped_blkaddr; /* map starting at mapped_blkaddr */ + u8 reserved[56]; +}; +#define EROFS_DEVT_SLOT_SIZE sizeof(struct erofs_deviceslot) + +/* erofs on-disk super block (currently 128 bytes) */ +struct erofs_super_block { + __le32 magic; /* file system magic number */ + __le32 checksum; /* crc32c(super_block) */ + __le32 feature_compat; + __u8 blkszbits; /* filesystem block size in bit shift */ + __u8 sb_extslots; /* superblock size = 128 + sb_extslots * 16 */ + + __le16 root_nid; /* nid of root directory */ + __le64 inos; /* total valid ino # (== f_files - f_favail) */ + + __le64 build_time; /* compact inode time derivation */ + __le32 build_time_nsec; /* compact inode time derivation in ns scale */ + __le32 blocks; /* used for statfs */ + __le32 meta_blkaddr; /* start block address of metadata area */ + __le32 xattr_blkaddr; /* start block address of shared xattr area */ + __u8 uuid[16]; /* 128-bit uuid for volume */ + __u8 volume_name[16]; /* volume name */ + __le32 feature_incompat; + union { + /* bitmap for available compression algorithms */ + __le16 available_compr_algs; + /* customized sliding window size instead of 64k by default */ + __le16 lz4_max_distance; + } __packed u1; + __le16 extra_devices; /* # of devices besides the primary device */ + __le16 devt_slotoff; /* startoff = devt_slotoff * devt_slotsize */ + __u8 dirblkbits; /* directory block size in bit shift */ + __u8 xattr_prefix_count; /* # of long xattr name prefixes */ + __le32 xattr_prefix_start; /* start of long xattr prefixes */ + __le64 packed_nid; /* nid of the special packed inode */ + __u8 xattr_filter_reserved; /* reserved for xattr name filter */ + __u8 reserved2[23]; +}; + +/* + * EROFS inode datalayout (i_format in on-disk inode): + * 0 - uncompressed flat inode without tail-packing inline data: + * 1 - compressed inode with non-compact indexes: + * 2 - uncompressed flat inode with tail-packing inline data: + * 3 - compressed inode with compact indexes: + * 4 - chunk-based inode with (optional) multi-device support: + * 5~7 - reserved + */ +enum { + EROFS_INODE_FLAT_PLAIN = 0, + EROFS_INODE_COMPRESSED_FULL = 1, + EROFS_INODE_FLAT_INLINE = 2, + EROFS_INODE_COMPRESSED_COMPACT = 3, + EROFS_INODE_CHUNK_BASED = 4, + EROFS_INODE_DATALAYOUT_MAX +}; + +static inline bool erofs_inode_is_data_compressed(unsigned int datamode) +{ + return datamode == 
EROFS_INODE_COMPRESSED_COMPACT || + datamode == EROFS_INODE_COMPRESSED_FULL; +} + +/* bit definitions of inode i_format */ +#define EROFS_I_VERSION_MASK 0x01 +#define EROFS_I_DATALAYOUT_MASK 0x07 + +#define EROFS_I_VERSION_BIT 0 +#define EROFS_I_DATALAYOUT_BIT 1 +#define EROFS_I_ALL_BIT 4 + +#define EROFS_I_ALL ((1 << EROFS_I_ALL_BIT) - 1) + +/* indicate chunk blkbits, thus 'chunksize = blocksize << chunk blkbits' */ +#define EROFS_CHUNK_FORMAT_BLKBITS_MASK 0x001F +/* with chunk indexes or just a 4-byte blkaddr array */ +#define EROFS_CHUNK_FORMAT_INDEXES 0x0020 + +#define EROFS_CHUNK_FORMAT_ALL \ + (EROFS_CHUNK_FORMAT_BLKBITS_MASK | EROFS_CHUNK_FORMAT_INDEXES) + +/* 32-byte on-disk inode */ +#define EROFS_INODE_LAYOUT_COMPACT 0 +/* 64-byte on-disk inode */ +#define EROFS_INODE_LAYOUT_EXTENDED 1 + +struct erofs_inode_chunk_info { + __le16 format; /* chunk blkbits, etc. */ + __le16 reserved; +}; + +union erofs_inode_i_u { + /* total compressed blocks for compressed inodes */ + __le32 compressed_blocks; + + /* block address for uncompressed flat inodes */ + __le32 raw_blkaddr; + + /* for device files, used to indicate old/new device # */ + __le32 rdev; + + /* for chunk-based files, it contains the summary info */ + struct erofs_inode_chunk_info c; +}; + +/* 32-byte reduced form of an ondisk inode */ +struct erofs_inode_compact { + __le16 i_format; /* inode format hints */ + +/* 1 header + n-1 * 4 bytes inline xattr to keep continuity */ + __le16 i_xattr_icount; + __le16 i_mode; + __le16 i_nlink; + __le32 i_size; + __le32 i_reserved; + union erofs_inode_i_u i_u; + + __le32 i_ino; /* only used for 32-bit stat compatibility */ + __le16 i_uid; + __le16 i_gid; + __le32 i_reserved2; +}; + +/* 64-byte complete form of an ondisk inode */ +struct erofs_inode_extended { + __le16 i_format; /* inode format hints */ + +/* 1 header + n-1 * 4 bytes inline xattr to keep continuity */ + __le16 i_xattr_icount; + __le16 i_mode; + __le16 i_reserved; + __le64 i_size; + union erofs_inode_i_u i_u; + + __le32 i_ino; /* only used for 32-bit stat compatibility */ + __le32 i_uid; + __le32 i_gid; + __le64 i_mtime; + __le32 i_mtime_nsec; + __le32 i_nlink; + __u8 i_reserved2[16]; +}; + +/* + * inline xattrs (n == i_xattr_icount): + * erofs_xattr_ibody_header(1) + (n - 1) * 4 bytes + * 12 bytes / \ + * / \ + * /-----------------------\ + * | erofs_xattr_entries+ | + * +-----------------------+ + * inline xattrs must starts in erofs_xattr_ibody_header, + * for read-only fs, no need to introduce h_refcount + */ +struct erofs_xattr_ibody_header { + __le32 h_name_filter; /* bit value 1 indicates not-present */ + __u8 h_shared_count; + __u8 h_reserved2[7]; + __le32 h_shared_xattrs[]; /* shared xattr id array */ +}; + +/* Name indexes */ +#define EROFS_XATTR_INDEX_USER 1 +#define EROFS_XATTR_INDEX_POSIX_ACL_ACCESS 2 +#define EROFS_XATTR_INDEX_POSIX_ACL_DEFAULT 3 +#define EROFS_XATTR_INDEX_TRUSTED 4 +#define EROFS_XATTR_INDEX_LUSTRE 5 +#define EROFS_XATTR_INDEX_SECURITY 6 + +/* + * bit 7 of e_name_index is set when it refers to a long xattr name prefix, + * while the remained lower bits represent the index of the prefix. 
+ */ +#define EROFS_XATTR_LONG_PREFIX 0x80 +#define EROFS_XATTR_LONG_PREFIX_MASK 0x7f + +#define EROFS_XATTR_FILTER_BITS 32 +#define EROFS_XATTR_FILTER_DEFAULT UINT32_MAX +#define EROFS_XATTR_FILTER_SEED 0x25BBE08F + +/* xattr entry (for both inline & shared xattrs) */ +struct erofs_xattr_entry { + __u8 e_name_len; /* length of name */ + __u8 e_name_index; /* attribute name index */ + __le16 e_value_size; /* size of attribute value */ + /* followed by e_name and e_value */ + char e_name[]; /* attribute name */ +}; + +/* long xattr name prefix */ +struct erofs_xattr_long_prefix { + __u8 base_index; /* short xattr name prefix index */ + char infix[]; /* infix apart from short prefix */ +}; + +static inline unsigned int erofs_xattr_ibody_size(__le16 i_xattr_icount) +{ + if (!i_xattr_icount) + return 0; + + return sizeof(struct erofs_xattr_ibody_header) + + sizeof(__u32) * (le16_to_cpu(i_xattr_icount) - 1); +} + +#define EROFS_XATTR_ALIGN(size) round_up(size, sizeof(struct erofs_xattr_entry)) + +static inline unsigned int erofs_xattr_entry_size(struct erofs_xattr_entry *e) +{ + return EROFS_XATTR_ALIGN(sizeof(struct erofs_xattr_entry) + + e->e_name_len + le16_to_cpu(e->e_value_size)); +} + +/* represent a zeroed chunk (hole) */ +#define EROFS_NULL_ADDR -1 + +/* 4-byte block address array */ +#define EROFS_BLOCK_MAP_ENTRY_SIZE sizeof(__le32) + +/* 8-byte inode chunk indexes */ +struct erofs_inode_chunk_index { + __le16 advise; /* always 0, don't care for now */ + __le16 device_id; /* back-end storage id (with bits masked) */ + __le32 blkaddr; /* start block address of this inode chunk */ +}; + +/* dirent sorts in alphabet order, thus we can do binary search */ +struct erofs_dirent { + __le64 nid; /* node number */ + __le16 nameoff; /* start offset of file name */ + __u8 file_type; /* file type */ + __u8 reserved; /* reserved */ +} __packed; + +/* + * EROFS file types should match generic FT_* types and + * it seems no need to add BUILD_BUG_ONs since potential + * unmatchness will break other fses as well... + */ + +#define EROFS_NAME_LEN 255 + +/* maximum supported size of a physical compression cluster */ +#define Z_EROFS_PCLUSTER_MAX_SIZE (1024 * 1024) + +/* available compression algorithm types (for h_algorithmtype) */ +enum { + Z_EROFS_COMPRESSION_LZ4 = 0, + Z_EROFS_COMPRESSION_LZMA = 1, + Z_EROFS_COMPRESSION_DEFLATE = 2, + Z_EROFS_COMPRESSION_MAX +}; +#define Z_EROFS_ALL_COMPR_ALGS ((1 << Z_EROFS_COMPRESSION_MAX) - 1) + +/* 14 bytes (+ length field = 16 bytes) */ +struct z_erofs_lz4_cfgs { + __le16 max_distance; + __le16 max_pclusterblks; + u8 reserved[10]; +} __packed; + +/* 14 bytes (+ length field = 16 bytes) */ +struct z_erofs_lzma_cfgs { + __le32 dict_size; + __le16 format; + u8 reserved[8]; +} __packed; + +#define Z_EROFS_LZMA_MAX_DICT_SIZE (8 * Z_EROFS_PCLUSTER_MAX_SIZE) + +/* 6 bytes (+ length field = 8 bytes) */ +struct z_erofs_deflate_cfgs { + u8 windowbits; /* 8..15 for DEFLATE */ + u8 reserved[5]; +} __packed; + +/* + * bit 0 : COMPACTED_2B indexes (0 - off; 1 - on) + * e.g. for 4k logical cluster size, 4B if compacted 2B is off; + * (4B) + 2B + (4B) if compacted 2B is on. 
+ * bit 1 : HEAD1 big pcluster (0 - off; 1 - on) + * bit 2 : HEAD2 big pcluster (0 - off; 1 - on) + * bit 3 : tailpacking inline pcluster (0 - off; 1 - on) + * bit 4 : interlaced plain pcluster (0 - off; 1 - on) + * bit 5 : fragment pcluster (0 - off; 1 - on) + */ +#define Z_EROFS_ADVISE_COMPACTED_2B 0x0001 +#define Z_EROFS_ADVISE_BIG_PCLUSTER_1 0x0002 +#define Z_EROFS_ADVISE_BIG_PCLUSTER_2 0x0004 +#define Z_EROFS_ADVISE_INLINE_PCLUSTER 0x0008 +#define Z_EROFS_ADVISE_INTERLACED_PCLUSTER 0x0010 +#define Z_EROFS_ADVISE_FRAGMENT_PCLUSTER 0x0020 + +#define Z_EROFS_FRAGMENT_INODE_BIT 7 +struct z_erofs_map_header { + union { + /* fragment data offset in the packed inode */ + __le32 h_fragmentoff; + struct { + __le16 h_reserved1; + /* indicates the encoded size of tailpacking data */ + __le16 h_idata_size; + }; + }; + __le16 h_advise; + /* + * bit 0-3 : algorithm type of head 1 (logical cluster type 01); + * bit 4-7 : algorithm type of head 2 (logical cluster type 11). + */ + __u8 h_algorithmtype; + /* + * bit 0-2 : logical cluster bits - 12, e.g. 0 for 4096; + * bit 3-6 : reserved; + * bit 7 : move the whole file into packed inode or not. + */ + __u8 h_clusterbits; +}; + +/* + * On-disk logical cluster type: + * 0 - literal (uncompressed) lcluster + * 1,3 - compressed lcluster (for HEAD lclusters) + * 2 - compressed lcluster (for NONHEAD lclusters) + * + * In detail, + * 0 - literal (uncompressed) lcluster, + * di_advise = 0 + * di_clusterofs = the literal data offset of the lcluster + * di_blkaddr = the blkaddr of the literal pcluster + * + * 1,3 - compressed lcluster (for HEAD lclusters) + * di_advise = 1 or 3 + * di_clusterofs = the decompressed data offset of the lcluster + * di_blkaddr = the blkaddr of the compressed pcluster + * + * 2 - compressed lcluster (for NONHEAD lclusters) + * di_advise = 2 + * di_clusterofs = + * the decompressed data offset in its own HEAD lcluster + * di_u.delta[0] = distance to this HEAD lcluster + * di_u.delta[1] = distance to the next HEAD lcluster + */ +enum { + Z_EROFS_LCLUSTER_TYPE_PLAIN = 0, + Z_EROFS_LCLUSTER_TYPE_HEAD1 = 1, + Z_EROFS_LCLUSTER_TYPE_NONHEAD = 2, + Z_EROFS_LCLUSTER_TYPE_HEAD2 = 3, + Z_EROFS_LCLUSTER_TYPE_MAX +}; + +#define Z_EROFS_LI_LCLUSTER_TYPE_BITS 2 +#define Z_EROFS_LI_LCLUSTER_TYPE_BIT 0 + +/* (noncompact only, HEAD) This pcluster refers to partial decompressed data */ +#define Z_EROFS_LI_PARTIAL_REF (1 << 15) + +/* + * D0_CBLKCNT will be marked _only_ at the 1st non-head lcluster to store the + * compressed block count of a compressed extent (in logical clusters, aka. + * block count of a pcluster). 
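For illustration, the bit fields documented above for struct z_erofs_map_header can be unpacked as in the following sketch (hypothetical ex_* names, not part of the patch):

#include <stdbool.h>

/* unpack the z_erofs_map_header bit fields exactly as documented above */
struct ex_zmap_fields {
	unsigned int algo_head1;	/* bits 0-3 of h_algorithmtype */
	unsigned int algo_head2;	/* bits 4-7 of h_algorithmtype */
	unsigned int lclusterbits;	/* 12 + (bits 0-2 of h_clusterbits) */
	bool whole_file_packed;		/* bit 7 of h_clusterbits */
};

static void ex_unpack_zmap(unsigned char h_algorithmtype,
			   unsigned char h_clusterbits,
			   struct ex_zmap_fields *f)
{
	f->algo_head1 = h_algorithmtype & 0x0f;
	f->algo_head2 = h_algorithmtype >> 4;
	f->lclusterbits = 12 + (h_clusterbits & 0x07);
	f->whole_file_packed = (h_clusterbits >> 7) & 1;
}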
+ */ +#define Z_EROFS_LI_D0_CBLKCNT (1 << 11) + +struct z_erofs_lcluster_index { + __le16 di_advise; + /* where to decompress in the head lcluster */ + __le16 di_clusterofs; + + union { + /* for the HEAD lclusters */ + __le32 blkaddr; + /* + * for the NONHEAD lclusters + * [0] - distance to its HEAD lcluster + * [1] - distance to the next HEAD lcluster + */ + __le16 delta[2]; + } di_u; +}; + +#define Z_EROFS_FULL_INDEX_ALIGN(end) \ + (ALIGN(end, 8) + sizeof(struct z_erofs_map_header) + 8) + +/* check the EROFS on-disk layout strictly at compile time */ +static inline void erofs_check_ondisk_layout_definitions(void) +{ + const __le64 fmh = *(__le64 *)&(struct z_erofs_map_header) { + .h_clusterbits = 1 << Z_EROFS_FRAGMENT_INODE_BIT + }; + + BUILD_BUG_ON(sizeof(struct erofs_super_block) != 128); + BUILD_BUG_ON(sizeof(struct erofs_inode_compact) != 32); + BUILD_BUG_ON(sizeof(struct erofs_inode_extended) != 64); + BUILD_BUG_ON(sizeof(struct erofs_xattr_ibody_header) != 12); + BUILD_BUG_ON(sizeof(struct erofs_xattr_entry) != 4); + BUILD_BUG_ON(sizeof(struct erofs_inode_chunk_info) != 4); + BUILD_BUG_ON(sizeof(struct erofs_inode_chunk_index) != 8); + BUILD_BUG_ON(sizeof(struct z_erofs_map_header) != 8); + BUILD_BUG_ON(sizeof(struct z_erofs_lcluster_index) != 8); + BUILD_BUG_ON(sizeof(struct erofs_dirent) != 12); + /* keep in sync between 2 index structures for better extendibility */ + BUILD_BUG_ON(sizeof(struct erofs_inode_chunk_index) != + sizeof(struct z_erofs_lcluster_index)); + BUILD_BUG_ON(sizeof(struct erofs_deviceslot) != 128); + + BUILD_BUG_ON(BIT(Z_EROFS_LI_LCLUSTER_TYPE_BITS) < + Z_EROFS_LCLUSTER_TYPE_MAX - 1); + /* exclude old compiler versions like gcc 7.5.0 */ + BUILD_BUG_ON(__builtin_constant_p(fmh) ? + fmh != cpu_to_le64(1ULL << 63) : 0); +} + +#endif diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c new file mode 100644 index 0000000000..87ff35bff8 --- /dev/null +++ b/fs/erofs/fscache.c @@ -0,0 +1,612 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2022, Alibaba Cloud + * Copyright (C) 2022, Bytedance Inc. All rights reserved. 
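The erofs_fs.h hunk above closes with compile-time layout checks; a userspace equivalent might look like the sketch below, assuming the on-disk structure definitions above are visible (e.g. from a copied header) — illustrative only, not part of the patch:

#include <assert.h>	/* static_assert (C11) */

/* mirror a few of the BUILD_BUG_ON checks above */
static_assert(sizeof(struct erofs_super_block) == 128,
	      "on-disk superblock must stay 128 bytes");
static_assert(sizeof(struct erofs_inode_compact) == 32 &&
	      sizeof(struct erofs_inode_extended) == 64,
	      "on-disk inode sizes are fixed");
static_assert(sizeof(struct erofs_inode_chunk_index) ==
	      sizeof(struct z_erofs_lcluster_index),
	      "chunk and lcluster indexes must stay in sync");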
+ */
+#include <linux/fscache.h>
+#include "internal.h"
+
+static DEFINE_MUTEX(erofs_domain_list_lock);
+static DEFINE_MUTEX(erofs_domain_cookies_lock);
+static LIST_HEAD(erofs_domain_list);
+static LIST_HEAD(erofs_domain_cookies_list);
+static struct vfsmount *erofs_pseudo_mnt;
+
+struct erofs_fscache_request {
+	struct erofs_fscache_request *primary;
+	struct netfs_cache_resources cache_resources;
+	struct address_space *mapping;	/* The mapping being accessed */
+	loff_t start;			/* Start position */
+	size_t len;			/* Length of the request */
+	size_t submitted;		/* Length submitted so far */
+	short error;			/* 0 or error that occurred */
+	refcount_t ref;
+};
+
+static struct erofs_fscache_request *erofs_fscache_req_alloc(struct address_space *mapping,
+					     loff_t start, size_t len)
+{
+	struct erofs_fscache_request *req;
+
+	req = kzalloc(sizeof(struct erofs_fscache_request), GFP_KERNEL);
+	if (!req)
+		return ERR_PTR(-ENOMEM);
+
+	req->mapping = mapping;
+	req->start = start;
+	req->len = len;
+	refcount_set(&req->ref, 1);
+
+	return req;
+}
+
+static struct erofs_fscache_request *erofs_fscache_req_chain(struct erofs_fscache_request *primary,
+					     size_t len)
+{
+	struct erofs_fscache_request *req;
+
+	/* use primary request for the first submission */
+	if (!primary->submitted) {
+		refcount_inc(&primary->ref);
+		return primary;
+	}
+
+	req = erofs_fscache_req_alloc(primary->mapping,
+			primary->start + primary->submitted, len);
+	if (!IS_ERR(req)) {
+		req->primary = primary;
+		refcount_inc(&primary->ref);
+	}
+	return req;
+}
+
+static void erofs_fscache_req_complete(struct erofs_fscache_request *req)
+{
+	struct folio *folio;
+	bool failed = req->error;
+	pgoff_t start_page = req->start / PAGE_SIZE;
+	pgoff_t last_page = ((req->start + req->len) / PAGE_SIZE) - 1;
+
+	XA_STATE(xas, &req->mapping->i_pages, start_page);
+
+	rcu_read_lock();
+	xas_for_each(&xas, folio, last_page) {
+		if (xas_retry(&xas, folio))
+			continue;
+		if (!failed)
+			folio_mark_uptodate(folio);
+		folio_unlock(folio);
+	}
+	rcu_read_unlock();
+}
+
+static void erofs_fscache_req_put(struct erofs_fscache_request *req)
+{
+	if (refcount_dec_and_test(&req->ref)) {
+		if (req->cache_resources.ops)
+			req->cache_resources.ops->end_operation(&req->cache_resources);
+		if (!req->primary)
+			erofs_fscache_req_complete(req);
+		else
+			erofs_fscache_req_put(req->primary);
+		kfree(req);
+	}
+}
+
+static void erofs_fscache_subreq_complete(void *priv,
+		ssize_t transferred_or_error, bool was_async)
+{
+	struct erofs_fscache_request *req = priv;
+
+	if (IS_ERR_VALUE(transferred_or_error)) {
+		if (req->primary)
+			req->primary->error = transferred_or_error;
+		else
+			req->error = transferred_or_error;
+	}
+	erofs_fscache_req_put(req);
+}
+
+/*
+ * Read data from fscache (cookie, pstart, len), and fill the read data into
+ * page cache described by (req->mapping, lstart, len). @pstart describes the
+ * start physical address in the cache file.
+ */ +static int erofs_fscache_read_folios_async(struct fscache_cookie *cookie, + struct erofs_fscache_request *req, loff_t pstart, size_t len) +{ + enum netfs_io_source source; + struct super_block *sb = req->mapping->host->i_sb; + struct netfs_cache_resources *cres = &req->cache_resources; + struct iov_iter iter; + loff_t lstart = req->start + req->submitted; + size_t done = 0; + int ret; + + DBG_BUGON(len > req->len - req->submitted); + + ret = fscache_begin_read_operation(cres, cookie); + if (ret) + return ret; + + while (done < len) { + loff_t sstart = pstart + done; + size_t slen = len - done; + unsigned long flags = 1 << NETFS_SREQ_ONDEMAND; + + source = cres->ops->prepare_ondemand_read(cres, + sstart, &slen, LLONG_MAX, &flags, 0); + if (WARN_ON(slen == 0)) + source = NETFS_INVALID_READ; + if (source != NETFS_READ_FROM_CACHE) { + erofs_err(sb, "failed to fscache prepare_read (source %d)", source); + return -EIO; + } + + refcount_inc(&req->ref); + iov_iter_xarray(&iter, ITER_DEST, &req->mapping->i_pages, + lstart + done, slen); + + ret = fscache_read(cres, sstart, &iter, NETFS_READ_HOLE_FAIL, + erofs_fscache_subreq_complete, req); + if (ret == -EIOCBQUEUED) + ret = 0; + if (ret) { + erofs_err(sb, "failed to fscache_read (ret %d)", ret); + return ret; + } + + done += slen; + } + DBG_BUGON(done != len); + return 0; +} + +static int erofs_fscache_meta_read_folio(struct file *data, struct folio *folio) +{ + int ret; + struct erofs_fscache *ctx = folio_mapping(folio)->host->i_private; + struct erofs_fscache_request *req; + + req = erofs_fscache_req_alloc(folio_mapping(folio), + folio_pos(folio), folio_size(folio)); + if (IS_ERR(req)) { + folio_unlock(folio); + return PTR_ERR(req); + } + + ret = erofs_fscache_read_folios_async(ctx->cookie, req, + folio_pos(folio), folio_size(folio)); + if (ret) + req->error = ret; + + erofs_fscache_req_put(req); + return ret; +} + +static int erofs_fscache_data_read_slice(struct erofs_fscache_request *primary) +{ + struct address_space *mapping = primary->mapping; + struct inode *inode = mapping->host; + struct super_block *sb = inode->i_sb; + struct erofs_fscache_request *req; + struct erofs_map_blocks map; + struct erofs_map_dev mdev; + struct iov_iter iter; + loff_t pos = primary->start + primary->submitted; + size_t count; + int ret; + + map.m_la = pos; + ret = erofs_map_blocks(inode, &map); + if (ret) + return ret; + + if (map.m_flags & EROFS_MAP_META) { + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; + erofs_blk_t blknr; + size_t offset, size; + void *src; + + /* For tail packing layout, the offset may be non-zero. 
*/ + offset = erofs_blkoff(sb, map.m_pa); + blknr = erofs_blknr(sb, map.m_pa); + size = map.m_llen; + + src = erofs_read_metabuf(&buf, sb, blknr, EROFS_KMAP); + if (IS_ERR(src)) + return PTR_ERR(src); + + iov_iter_xarray(&iter, ITER_DEST, &mapping->i_pages, pos, PAGE_SIZE); + if (copy_to_iter(src + offset, size, &iter) != size) { + erofs_put_metabuf(&buf); + return -EFAULT; + } + iov_iter_zero(PAGE_SIZE - size, &iter); + erofs_put_metabuf(&buf); + primary->submitted += PAGE_SIZE; + return 0; + } + + count = primary->len - primary->submitted; + if (!(map.m_flags & EROFS_MAP_MAPPED)) { + iov_iter_xarray(&iter, ITER_DEST, &mapping->i_pages, pos, count); + iov_iter_zero(count, &iter); + primary->submitted += count; + return 0; + } + + count = min_t(size_t, map.m_llen - (pos - map.m_la), count); + DBG_BUGON(!count || count % PAGE_SIZE); + + mdev = (struct erofs_map_dev) { + .m_deviceid = map.m_deviceid, + .m_pa = map.m_pa, + }; + ret = erofs_map_dev(sb, &mdev); + if (ret) + return ret; + + req = erofs_fscache_req_chain(primary, count); + if (IS_ERR(req)) + return PTR_ERR(req); + + ret = erofs_fscache_read_folios_async(mdev.m_fscache->cookie, + req, mdev.m_pa + (pos - map.m_la), count); + erofs_fscache_req_put(req); + primary->submitted += count; + return ret; +} + +static int erofs_fscache_data_read(struct erofs_fscache_request *req) +{ + int ret; + + do { + ret = erofs_fscache_data_read_slice(req); + if (ret) + req->error = ret; + } while (!ret && req->submitted < req->len); + + return ret; +} + +static int erofs_fscache_read_folio(struct file *file, struct folio *folio) +{ + struct erofs_fscache_request *req; + int ret; + + req = erofs_fscache_req_alloc(folio_mapping(folio), + folio_pos(folio), folio_size(folio)); + if (IS_ERR(req)) { + folio_unlock(folio); + return PTR_ERR(req); + } + + ret = erofs_fscache_data_read(req); + erofs_fscache_req_put(req); + return ret; +} + +static void erofs_fscache_readahead(struct readahead_control *rac) +{ + struct erofs_fscache_request *req; + + if (!readahead_count(rac)) + return; + + req = erofs_fscache_req_alloc(rac->mapping, + readahead_pos(rac), readahead_length(rac)); + if (IS_ERR(req)) + return; + + /* The request completion will drop refs on the folios. */ + while (readahead_folio(rac)) + ; + + erofs_fscache_data_read(req); + erofs_fscache_req_put(req); +} + +static const struct address_space_operations erofs_fscache_meta_aops = { + .read_folio = erofs_fscache_meta_read_folio, +}; + +const struct address_space_operations erofs_fscache_access_aops = { + .read_folio = erofs_fscache_read_folio, + .readahead = erofs_fscache_readahead, +}; + +static void erofs_fscache_domain_put(struct erofs_domain *domain) +{ + mutex_lock(&erofs_domain_list_lock); + if (refcount_dec_and_test(&domain->ref)) { + list_del(&domain->list); + if (list_empty(&erofs_domain_list)) { + kern_unmount(erofs_pseudo_mnt); + erofs_pseudo_mnt = NULL; + } + fscache_relinquish_volume(domain->volume, NULL, false); + mutex_unlock(&erofs_domain_list_lock); + kfree(domain->domain_id); + kfree(domain); + return; + } + mutex_unlock(&erofs_domain_list_lock); +} + +static int erofs_fscache_register_volume(struct super_block *sb) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); + char *domain_id = sbi->domain_id; + struct fscache_volume *volume; + char *name; + int ret = 0; + + name = kasprintf(GFP_KERNEL, "erofs,%s", + domain_id ? 
domain_id : sbi->fsid); + if (!name) + return -ENOMEM; + + volume = fscache_acquire_volume(name, NULL, NULL, 0); + if (IS_ERR_OR_NULL(volume)) { + erofs_err(sb, "failed to register volume for %s", name); + ret = volume ? PTR_ERR(volume) : -EOPNOTSUPP; + volume = NULL; + } + + sbi->volume = volume; + kfree(name); + return ret; +} + +static int erofs_fscache_init_domain(struct super_block *sb) +{ + int err; + struct erofs_domain *domain; + struct erofs_sb_info *sbi = EROFS_SB(sb); + + domain = kzalloc(sizeof(struct erofs_domain), GFP_KERNEL); + if (!domain) + return -ENOMEM; + + domain->domain_id = kstrdup(sbi->domain_id, GFP_KERNEL); + if (!domain->domain_id) { + kfree(domain); + return -ENOMEM; + } + + err = erofs_fscache_register_volume(sb); + if (err) + goto out; + + if (!erofs_pseudo_mnt) { + erofs_pseudo_mnt = kern_mount(&erofs_fs_type); + if (IS_ERR(erofs_pseudo_mnt)) { + err = PTR_ERR(erofs_pseudo_mnt); + goto out; + } + } + + domain->volume = sbi->volume; + refcount_set(&domain->ref, 1); + list_add(&domain->list, &erofs_domain_list); + sbi->domain = domain; + return 0; +out: + kfree(domain->domain_id); + kfree(domain); + return err; +} + +static int erofs_fscache_register_domain(struct super_block *sb) +{ + int err; + struct erofs_domain *domain; + struct erofs_sb_info *sbi = EROFS_SB(sb); + + mutex_lock(&erofs_domain_list_lock); + list_for_each_entry(domain, &erofs_domain_list, list) { + if (!strcmp(domain->domain_id, sbi->domain_id)) { + sbi->domain = domain; + sbi->volume = domain->volume; + refcount_inc(&domain->ref); + mutex_unlock(&erofs_domain_list_lock); + return 0; + } + } + err = erofs_fscache_init_domain(sb); + mutex_unlock(&erofs_domain_list_lock); + return err; +} + +static struct erofs_fscache *erofs_fscache_acquire_cookie(struct super_block *sb, + char *name, unsigned int flags) +{ + struct fscache_volume *volume = EROFS_SB(sb)->volume; + struct erofs_fscache *ctx; + struct fscache_cookie *cookie; + struct super_block *isb; + struct inode *inode; + int ret; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return ERR_PTR(-ENOMEM); + INIT_LIST_HEAD(&ctx->node); + refcount_set(&ctx->ref, 1); + + cookie = fscache_acquire_cookie(volume, FSCACHE_ADV_WANT_CACHE_SIZE, + name, strlen(name), NULL, 0, 0); + if (!cookie) { + erofs_err(sb, "failed to get cookie for %s", name); + ret = -EINVAL; + goto err; + } + fscache_use_cookie(cookie, false); + + /* + * Allocate anonymous inode in global pseudo mount for shareable blobs, + * so that they are accessible among erofs fs instances. + */ + isb = flags & EROFS_REG_COOKIE_SHARE ? 
erofs_pseudo_mnt->mnt_sb : sb; + inode = new_inode(isb); + if (!inode) { + erofs_err(sb, "failed to get anon inode for %s", name); + ret = -ENOMEM; + goto err_cookie; + } + + inode->i_size = OFFSET_MAX; + inode->i_mapping->a_ops = &erofs_fscache_meta_aops; + mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); + inode->i_blkbits = EROFS_SB(sb)->blkszbits; + inode->i_private = ctx; + + ctx->cookie = cookie; + ctx->inode = inode; + return ctx; + +err_cookie: + fscache_unuse_cookie(cookie, NULL, NULL); + fscache_relinquish_cookie(cookie, false); +err: + kfree(ctx); + return ERR_PTR(ret); +} + +static void erofs_fscache_relinquish_cookie(struct erofs_fscache *ctx) +{ + fscache_unuse_cookie(ctx->cookie, NULL, NULL); + fscache_relinquish_cookie(ctx->cookie, false); + iput(ctx->inode); + kfree(ctx->name); + kfree(ctx); +} + +static struct erofs_fscache *erofs_domain_init_cookie(struct super_block *sb, + char *name, unsigned int flags) +{ + struct erofs_fscache *ctx; + struct erofs_domain *domain = EROFS_SB(sb)->domain; + + ctx = erofs_fscache_acquire_cookie(sb, name, flags); + if (IS_ERR(ctx)) + return ctx; + + ctx->name = kstrdup(name, GFP_KERNEL); + if (!ctx->name) { + erofs_fscache_relinquish_cookie(ctx); + return ERR_PTR(-ENOMEM); + } + + refcount_inc(&domain->ref); + ctx->domain = domain; + list_add(&ctx->node, &erofs_domain_cookies_list); + return ctx; +} + +static struct erofs_fscache *erofs_domain_register_cookie(struct super_block *sb, + char *name, unsigned int flags) +{ + struct erofs_fscache *ctx; + struct erofs_domain *domain = EROFS_SB(sb)->domain; + + flags |= EROFS_REG_COOKIE_SHARE; + mutex_lock(&erofs_domain_cookies_lock); + list_for_each_entry(ctx, &erofs_domain_cookies_list, node) { + if (ctx->domain != domain || strcmp(ctx->name, name)) + continue; + if (!(flags & EROFS_REG_COOKIE_NEED_NOEXIST)) { + refcount_inc(&ctx->ref); + } else { + erofs_err(sb, "%s already exists in domain %s", name, + domain->domain_id); + ctx = ERR_PTR(-EEXIST); + } + mutex_unlock(&erofs_domain_cookies_lock); + return ctx; + } + ctx = erofs_domain_init_cookie(sb, name, flags); + mutex_unlock(&erofs_domain_cookies_lock); + return ctx; +} + +struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb, + char *name, + unsigned int flags) +{ + if (EROFS_SB(sb)->domain_id) + return erofs_domain_register_cookie(sb, name, flags); + return erofs_fscache_acquire_cookie(sb, name, flags); +} + +void erofs_fscache_unregister_cookie(struct erofs_fscache *ctx) +{ + struct erofs_domain *domain = NULL; + + if (!ctx) + return; + if (!ctx->domain) + return erofs_fscache_relinquish_cookie(ctx); + + mutex_lock(&erofs_domain_cookies_lock); + if (refcount_dec_and_test(&ctx->ref)) { + domain = ctx->domain; + list_del(&ctx->node); + erofs_fscache_relinquish_cookie(ctx); + } + mutex_unlock(&erofs_domain_cookies_lock); + if (domain) + erofs_fscache_domain_put(domain); +} + +int erofs_fscache_register_fs(struct super_block *sb) +{ + int ret; + struct erofs_sb_info *sbi = EROFS_SB(sb); + struct erofs_fscache *fscache; + unsigned int flags = 0; + + if (sbi->domain_id) + ret = erofs_fscache_register_domain(sb); + else + ret = erofs_fscache_register_volume(sb); + if (ret) + return ret; + + /* + * When shared domain is enabled, using NEED_NOEXIST to guarantee + * the primary data blob (aka fsid) is unique in the shared domain. + * + * For non-shared-domain case, fscache_acquire_volume() invoked by + * erofs_fscache_register_volume() has already guaranteed + * the uniqueness of primary data blob. 
+ * + * Acquired domain/volume will be relinquished in kill_sb() on error. + */ + if (sbi->domain_id) + flags |= EROFS_REG_COOKIE_NEED_NOEXIST; + fscache = erofs_fscache_register_cookie(sb, sbi->fsid, flags); + if (IS_ERR(fscache)) + return PTR_ERR(fscache); + + sbi->s_fscache = fscache; + return 0; +} + +void erofs_fscache_unregister_fs(struct super_block *sb) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); + + erofs_fscache_unregister_cookie(sbi->s_fscache); + + if (sbi->domain) + erofs_fscache_domain_put(sbi->domain); + else + fscache_relinquish_volume(sbi->volume, NULL, false); + + sbi->s_fscache = NULL; + sbi->volume = NULL; + sbi->domain = NULL; +} diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c new file mode 100644 index 0000000000..edc8ec7581 --- /dev/null +++ b/fs/erofs/inode.c @@ -0,0 +1,395 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2017-2018 HUAWEI, Inc. + * https://www.huawei.com/ + * Copyright (C) 2021, Alibaba Cloud + */ +#include "xattr.h" + +#include <trace/events/erofs.h> + +static void *erofs_read_inode(struct erofs_buf *buf, + struct inode *inode, unsigned int *ofs) +{ + struct super_block *sb = inode->i_sb; + struct erofs_sb_info *sbi = EROFS_SB(sb); + struct erofs_inode *vi = EROFS_I(inode); + const erofs_off_t inode_loc = erofs_iloc(inode); + + erofs_blk_t blkaddr, nblks = 0; + void *kaddr; + struct erofs_inode_compact *dic; + struct erofs_inode_extended *die, *copied = NULL; + unsigned int ifmt; + int err; + + blkaddr = erofs_blknr(sb, inode_loc); + *ofs = erofs_blkoff(sb, inode_loc); + + kaddr = erofs_read_metabuf(buf, sb, blkaddr, EROFS_KMAP); + if (IS_ERR(kaddr)) { + erofs_err(sb, "failed to get inode (nid: %llu) page, err %ld", + vi->nid, PTR_ERR(kaddr)); + return kaddr; + } + + dic = kaddr + *ofs; + ifmt = le16_to_cpu(dic->i_format); + + if (ifmt & ~EROFS_I_ALL) { + erofs_err(inode->i_sb, "unsupported i_format %u of nid %llu", + ifmt, vi->nid); + err = -EOPNOTSUPP; + goto err_out; + } + + vi->datalayout = erofs_inode_datalayout(ifmt); + if (vi->datalayout >= EROFS_INODE_DATALAYOUT_MAX) { + erofs_err(inode->i_sb, "unsupported datalayout %u of nid %llu", + vi->datalayout, vi->nid); + err = -EOPNOTSUPP; + goto err_out; + } + + switch (erofs_inode_version(ifmt)) { + case EROFS_INODE_LAYOUT_EXTENDED: + vi->inode_isize = sizeof(struct erofs_inode_extended); + /* check if the extended inode acrosses block boundary */ + if (*ofs + vi->inode_isize <= sb->s_blocksize) { + *ofs += vi->inode_isize; + die = (struct erofs_inode_extended *)dic; + } else { + const unsigned int gotten = sb->s_blocksize - *ofs; + + copied = kmalloc(vi->inode_isize, GFP_NOFS); + if (!copied) { + err = -ENOMEM; + goto err_out; + } + memcpy(copied, dic, gotten); + kaddr = erofs_read_metabuf(buf, sb, blkaddr + 1, + EROFS_KMAP); + if (IS_ERR(kaddr)) { + erofs_err(sb, "failed to get inode payload block (nid: %llu), err %ld", + vi->nid, PTR_ERR(kaddr)); + kfree(copied); + return kaddr; + } + *ofs = vi->inode_isize - gotten; + memcpy((u8 *)copied + gotten, kaddr, *ofs); + die = copied; + } + vi->xattr_isize = erofs_xattr_ibody_size(die->i_xattr_icount); + + inode->i_mode = le16_to_cpu(die->i_mode); + switch (inode->i_mode & S_IFMT) { + case S_IFREG: + case S_IFDIR: + case S_IFLNK: + vi->raw_blkaddr = le32_to_cpu(die->i_u.raw_blkaddr); + break; + case S_IFCHR: + case S_IFBLK: + inode->i_rdev = + new_decode_dev(le32_to_cpu(die->i_u.rdev)); + break; + case S_IFIFO: + case S_IFSOCK: + inode->i_rdev = 0; + break; + default: + goto bogusimode; + } + i_uid_write(inode, 
le32_to_cpu(die->i_uid)); + i_gid_write(inode, le32_to_cpu(die->i_gid)); + set_nlink(inode, le32_to_cpu(die->i_nlink)); + + /* extended inode has its own timestamp */ + inode_set_ctime(inode, le64_to_cpu(die->i_mtime), + le32_to_cpu(die->i_mtime_nsec)); + + inode->i_size = le64_to_cpu(die->i_size); + + /* total blocks for compressed files */ + if (erofs_inode_is_data_compressed(vi->datalayout)) + nblks = le32_to_cpu(die->i_u.compressed_blocks); + else if (vi->datalayout == EROFS_INODE_CHUNK_BASED) + /* fill chunked inode summary info */ + vi->chunkformat = le16_to_cpu(die->i_u.c.format); + kfree(copied); + copied = NULL; + break; + case EROFS_INODE_LAYOUT_COMPACT: + vi->inode_isize = sizeof(struct erofs_inode_compact); + *ofs += vi->inode_isize; + vi->xattr_isize = erofs_xattr_ibody_size(dic->i_xattr_icount); + + inode->i_mode = le16_to_cpu(dic->i_mode); + switch (inode->i_mode & S_IFMT) { + case S_IFREG: + case S_IFDIR: + case S_IFLNK: + vi->raw_blkaddr = le32_to_cpu(dic->i_u.raw_blkaddr); + break; + case S_IFCHR: + case S_IFBLK: + inode->i_rdev = + new_decode_dev(le32_to_cpu(dic->i_u.rdev)); + break; + case S_IFIFO: + case S_IFSOCK: + inode->i_rdev = 0; + break; + default: + goto bogusimode; + } + i_uid_write(inode, le16_to_cpu(dic->i_uid)); + i_gid_write(inode, le16_to_cpu(dic->i_gid)); + set_nlink(inode, le16_to_cpu(dic->i_nlink)); + + /* use build time for compact inodes */ + inode_set_ctime(inode, sbi->build_time, sbi->build_time_nsec); + + inode->i_size = le32_to_cpu(dic->i_size); + if (erofs_inode_is_data_compressed(vi->datalayout)) + nblks = le32_to_cpu(dic->i_u.compressed_blocks); + else if (vi->datalayout == EROFS_INODE_CHUNK_BASED) + vi->chunkformat = le16_to_cpu(dic->i_u.c.format); + break; + default: + erofs_err(inode->i_sb, + "unsupported on-disk inode version %u of nid %llu", + erofs_inode_version(ifmt), vi->nid); + err = -EOPNOTSUPP; + goto err_out; + } + + if (vi->datalayout == EROFS_INODE_CHUNK_BASED) { + if (vi->chunkformat & ~EROFS_CHUNK_FORMAT_ALL) { + erofs_err(inode->i_sb, + "unsupported chunk format %x of nid %llu", + vi->chunkformat, vi->nid); + err = -EOPNOTSUPP; + goto err_out; + } + vi->chunkbits = sb->s_blocksize_bits + + (vi->chunkformat & EROFS_CHUNK_FORMAT_BLKBITS_MASK); + } + inode->i_mtime = inode->i_atime = inode_get_ctime(inode); + + inode->i_flags &= ~S_DAX; + if (test_opt(&sbi->opt, DAX_ALWAYS) && S_ISREG(inode->i_mode) && + (vi->datalayout == EROFS_INODE_FLAT_PLAIN || + vi->datalayout == EROFS_INODE_CHUNK_BASED)) + inode->i_flags |= S_DAX; + + if (!nblks) + /* measure inode.i_blocks as generic filesystems */ + inode->i_blocks = round_up(inode->i_size, sb->s_blocksize) >> 9; + else + inode->i_blocks = nblks << (sb->s_blocksize_bits - 9); + return kaddr; + +bogusimode: + erofs_err(inode->i_sb, "bogus i_mode (%o) @ nid %llu", + inode->i_mode, vi->nid); + err = -EFSCORRUPTED; +err_out: + DBG_BUGON(1); + kfree(copied); + erofs_put_metabuf(buf); + return ERR_PTR(err); +} + +static int erofs_fill_symlink(struct inode *inode, void *kaddr, + unsigned int m_pofs) +{ + struct erofs_inode *vi = EROFS_I(inode); + unsigned int bsz = i_blocksize(inode); + char *lnk; + + /* if it cannot be handled with fast symlink scheme */ + if (vi->datalayout != EROFS_INODE_FLAT_INLINE || + inode->i_size >= bsz || inode->i_size < 0) { + inode->i_op = &erofs_symlink_iops; + return 0; + } + + lnk = kmalloc(inode->i_size + 1, GFP_KERNEL); + if (!lnk) + return -ENOMEM; + + m_pofs += vi->xattr_isize; + /* inline symlink data shouldn't cross block boundary */ + if (m_pofs + 
inode->i_size > bsz) { + kfree(lnk); + erofs_err(inode->i_sb, + "inline data cross block boundary @ nid %llu", + vi->nid); + DBG_BUGON(1); + return -EFSCORRUPTED; + } + memcpy(lnk, kaddr + m_pofs, inode->i_size); + lnk[inode->i_size] = '\0'; + + inode->i_link = lnk; + inode->i_op = &erofs_fast_symlink_iops; + return 0; +} + +static int erofs_fill_inode(struct inode *inode) +{ + struct erofs_inode *vi = EROFS_I(inode); + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; + void *kaddr; + unsigned int ofs; + int err = 0; + + trace_erofs_fill_inode(inode); + + /* read inode base data from disk */ + kaddr = erofs_read_inode(&buf, inode, &ofs); + if (IS_ERR(kaddr)) + return PTR_ERR(kaddr); + + /* setup the new inode */ + switch (inode->i_mode & S_IFMT) { + case S_IFREG: + inode->i_op = &erofs_generic_iops; + if (erofs_inode_is_data_compressed(vi->datalayout)) + inode->i_fop = &generic_ro_fops; + else + inode->i_fop = &erofs_file_fops; + break; + case S_IFDIR: + inode->i_op = &erofs_dir_iops; + inode->i_fop = &erofs_dir_fops; + inode_nohighmem(inode); + break; + case S_IFLNK: + err = erofs_fill_symlink(inode, kaddr, ofs); + if (err) + goto out_unlock; + inode_nohighmem(inode); + break; + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case S_IFSOCK: + inode->i_op = &erofs_generic_iops; + init_special_inode(inode, inode->i_mode, inode->i_rdev); + goto out_unlock; + default: + err = -EFSCORRUPTED; + goto out_unlock; + } + + if (erofs_inode_is_data_compressed(vi->datalayout)) { +#ifdef CONFIG_EROFS_FS_ZIP + if (!erofs_is_fscache_mode(inode->i_sb) && + inode->i_sb->s_blocksize_bits == PAGE_SHIFT) { + inode->i_mapping->a_ops = &z_erofs_aops; + err = 0; + goto out_unlock; + } +#endif + err = -EOPNOTSUPP; + goto out_unlock; + } + inode->i_mapping->a_ops = &erofs_raw_access_aops; + mapping_set_large_folios(inode->i_mapping); +#ifdef CONFIG_EROFS_FS_ONDEMAND + if (erofs_is_fscache_mode(inode->i_sb)) + inode->i_mapping->a_ops = &erofs_fscache_access_aops; +#endif + +out_unlock: + erofs_put_metabuf(&buf); + return err; +} + +/* + * ino_t is 32-bits on 32-bit arch. We have to squash the 64-bit value down + * so that it will fit. 
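The helper that follows folds a 64-bit nid into a 32-bit ino_t; a plain-C sketch of the same folding (hypothetical name, assuming ino_t is 32 bits wide) for illustration only:

#include <stdint.h>

/* fold the 64-bit nid by XOR-ing its high half into the truncated low half,
 * matching what erofs_squash_ino() below does on a 32-bit ino_t */
static uint32_t ex_squash_nid(uint64_t nid)
{
	return (uint32_t)nid ^ (uint32_t)(nid >> 32);
}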
+ */ +static ino_t erofs_squash_ino(erofs_nid_t nid) +{ + ino_t ino = (ino_t)nid; + + if (sizeof(ino_t) < sizeof(erofs_nid_t)) + ino ^= nid >> (sizeof(erofs_nid_t) - sizeof(ino_t)) * 8; + return ino; +} + +static int erofs_iget5_eq(struct inode *inode, void *opaque) +{ + return EROFS_I(inode)->nid == *(erofs_nid_t *)opaque; +} + +static int erofs_iget5_set(struct inode *inode, void *opaque) +{ + const erofs_nid_t nid = *(erofs_nid_t *)opaque; + + inode->i_ino = erofs_squash_ino(nid); + EROFS_I(inode)->nid = nid; + return 0; +} + +struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid) +{ + struct inode *inode; + + inode = iget5_locked(sb, erofs_squash_ino(nid), erofs_iget5_eq, + erofs_iget5_set, &nid); + if (!inode) + return ERR_PTR(-ENOMEM); + + if (inode->i_state & I_NEW) { + int err = erofs_fill_inode(inode); + + if (err) { + iget_failed(inode); + return ERR_PTR(err); + } + unlock_new_inode(inode); + } + return inode; +} + +int erofs_getattr(struct mnt_idmap *idmap, const struct path *path, + struct kstat *stat, u32 request_mask, + unsigned int query_flags) +{ + struct inode *const inode = d_inode(path->dentry); + + if (erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout)) + stat->attributes |= STATX_ATTR_COMPRESSED; + + stat->attributes |= STATX_ATTR_IMMUTABLE; + stat->attributes_mask |= (STATX_ATTR_COMPRESSED | + STATX_ATTR_IMMUTABLE); + + generic_fillattr(idmap, request_mask, inode, stat); + return 0; +} + +const struct inode_operations erofs_generic_iops = { + .getattr = erofs_getattr, + .listxattr = erofs_listxattr, + .get_inode_acl = erofs_get_acl, + .fiemap = erofs_fiemap, +}; + +const struct inode_operations erofs_symlink_iops = { + .get_link = page_get_link, + .getattr = erofs_getattr, + .listxattr = erofs_listxattr, + .get_inode_acl = erofs_get_acl, +}; + +const struct inode_operations erofs_fast_symlink_iops = { + .get_link = simple_get_link, + .getattr = erofs_getattr, + .listxattr = erofs_listxattr, + .get_inode_acl = erofs_get_acl, +}; diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h new file mode 100644 index 0000000000..d8de61350d --- /dev/null +++ b/fs/erofs/internal.h @@ -0,0 +1,537 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2017-2018 HUAWEI, Inc. + * https://www.huawei.com/ + * Copyright (C) 2021, Alibaba Cloud + */ +#ifndef __EROFS_INTERNAL_H +#define __EROFS_INTERNAL_H + +#include <linux/fs.h> +#include <linux/dcache.h> +#include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/bio.h> +#include <linux/magic.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/iomap.h> +#include "erofs_fs.h" + +/* redefine pr_fmt "erofs: " */ +#undef pr_fmt +#define pr_fmt(fmt) "erofs: " fmt + +__printf(3, 4) void _erofs_err(struct super_block *sb, + const char *function, const char *fmt, ...); +#define erofs_err(sb, fmt, ...) \ + _erofs_err(sb, __func__, fmt "\n", ##__VA_ARGS__) +__printf(3, 4) void _erofs_info(struct super_block *sb, + const char *function, const char *fmt, ...); +#define erofs_info(sb, fmt, ...) 
\ + _erofs_info(sb, __func__, fmt "\n", ##__VA_ARGS__) +#ifdef CONFIG_EROFS_FS_DEBUG +#define DBG_BUGON BUG_ON +#else +#define DBG_BUGON(x) ((void)(x)) +#endif /* !CONFIG_EROFS_FS_DEBUG */ + +/* EROFS_SUPER_MAGIC_V1 to represent the whole file system */ +#define EROFS_SUPER_MAGIC EROFS_SUPER_MAGIC_V1 + +typedef u64 erofs_nid_t; +typedef u64 erofs_off_t; +/* data type for filesystem-wide blocks number */ +typedef u32 erofs_blk_t; + +struct erofs_device_info { + char *path; + struct erofs_fscache *fscache; + struct block_device *bdev; + struct dax_device *dax_dev; + u64 dax_part_off; + + u32 blocks; + u32 mapped_blkaddr; +}; + +enum { + EROFS_SYNC_DECOMPRESS_AUTO, + EROFS_SYNC_DECOMPRESS_FORCE_ON, + EROFS_SYNC_DECOMPRESS_FORCE_OFF +}; + +struct erofs_mount_opts { +#ifdef CONFIG_EROFS_FS_ZIP + /* current strategy of how to use managed cache */ + unsigned char cache_strategy; + /* strategy of sync decompression (0 - auto, 1 - force on, 2 - force off) */ + unsigned int sync_decompress; + + /* threshold for decompression synchronously */ + unsigned int max_sync_decompress_pages; +#endif + unsigned int mount_opt; +}; + +struct erofs_dev_context { + struct idr tree; + struct rw_semaphore rwsem; + + unsigned int extra_devices; + bool flatdev; +}; + +struct erofs_fs_context { + struct erofs_mount_opts opt; + struct erofs_dev_context *devs; + char *fsid; + char *domain_id; +}; + +/* all filesystem-wide lz4 configurations */ +struct erofs_sb_lz4_info { + /* # of pages needed for EROFS lz4 rolling decompression */ + u16 max_distance_pages; + /* maximum possible blocks for pclusters in the filesystem */ + u16 max_pclusterblks; +}; + +struct erofs_domain { + refcount_t ref; + struct list_head list; + struct fscache_volume *volume; + char *domain_id; +}; + +struct erofs_fscache { + struct fscache_cookie *cookie; + struct inode *inode; /* anonymous inode for the blob */ + + /* used for share domain mode */ + struct erofs_domain *domain; + struct list_head node; + refcount_t ref; + char *name; +}; + +struct erofs_xattr_prefix_item { + struct erofs_xattr_long_prefix *prefix; + u8 infix_len; +}; + +struct erofs_sb_info { + struct erofs_mount_opts opt; /* options */ +#ifdef CONFIG_EROFS_FS_ZIP + /* list for all registered superblocks, mainly for shrinker */ + struct list_head list; + struct mutex umount_mutex; + + /* managed XArray arranged in physical block number */ + struct xarray managed_pslots; + + unsigned int shrinker_run_no; + u16 available_compr_algs; + + /* pseudo inode to manage cached pages */ + struct inode *managed_cache; + + struct erofs_sb_lz4_info lz4; +#endif /* CONFIG_EROFS_FS_ZIP */ + struct inode *packed_inode; + struct erofs_dev_context *devs; + struct dax_device *dax_dev; + u64 dax_part_off; + u64 total_blocks; + u32 primarydevice_blocks; + + u32 meta_blkaddr; +#ifdef CONFIG_EROFS_FS_XATTR + u32 xattr_blkaddr; + u32 xattr_prefix_start; + u8 xattr_prefix_count; + struct erofs_xattr_prefix_item *xattr_prefixes; + unsigned int xattr_filter_reserved; +#endif + u16 device_id_mask; /* valid bits of device id to be used */ + + unsigned char islotbits; /* inode slot unit size in bit shift */ + unsigned char blkszbits; /* filesystem block size in bit shift */ + + u32 sb_size; /* total superblock size */ + u32 build_time_nsec; + u64 build_time; + + /* what we really care is nid, rather than ino.. 
*/ + erofs_nid_t root_nid; + erofs_nid_t packed_nid; + /* used for statfs, f_files - f_favail */ + u64 inos; + + u8 uuid[16]; /* 128-bit uuid for volume */ + u8 volume_name[16]; /* volume name */ + u32 feature_compat; + u32 feature_incompat; + + /* sysfs support */ + struct kobject s_kobj; /* /sys/fs/erofs/<devname> */ + struct completion s_kobj_unregister; + + /* fscache support */ + struct fscache_volume *volume; + struct erofs_fscache *s_fscache; + struct erofs_domain *domain; + char *fsid; + char *domain_id; +}; + +#define EROFS_SB(sb) ((struct erofs_sb_info *)(sb)->s_fs_info) +#define EROFS_I_SB(inode) ((struct erofs_sb_info *)(inode)->i_sb->s_fs_info) + +/* Mount flags set via mount options or defaults */ +#define EROFS_MOUNT_XATTR_USER 0x00000010 +#define EROFS_MOUNT_POSIX_ACL 0x00000020 +#define EROFS_MOUNT_DAX_ALWAYS 0x00000040 +#define EROFS_MOUNT_DAX_NEVER 0x00000080 + +#define clear_opt(opt, option) ((opt)->mount_opt &= ~EROFS_MOUNT_##option) +#define set_opt(opt, option) ((opt)->mount_opt |= EROFS_MOUNT_##option) +#define test_opt(opt, option) ((opt)->mount_opt & EROFS_MOUNT_##option) + +static inline bool erofs_is_fscache_mode(struct super_block *sb) +{ + return IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && !sb->s_bdev; +} + +enum { + EROFS_ZIP_CACHE_DISABLED, + EROFS_ZIP_CACHE_READAHEAD, + EROFS_ZIP_CACHE_READAROUND +}; + +/* basic unit of the workstation of a super_block */ +struct erofs_workgroup { + pgoff_t index; + struct lockref lockref; +}; + +enum erofs_kmap_type { + EROFS_NO_KMAP, /* don't map the buffer */ + EROFS_KMAP, /* use kmap_local_page() to map the buffer */ +}; + +struct erofs_buf { + struct inode *inode; + struct page *page; + void *base; + enum erofs_kmap_type kmap_type; +}; +#define __EROFS_BUF_INITIALIZER ((struct erofs_buf){ .page = NULL }) + +#define ROOT_NID(sb) ((sb)->root_nid) + +#define erofs_blknr(sb, addr) ((addr) >> (sb)->s_blocksize_bits) +#define erofs_blkoff(sb, addr) ((addr) & ((sb)->s_blocksize - 1)) +#define erofs_pos(sb, blk) ((erofs_off_t)(blk) << (sb)->s_blocksize_bits) +#define erofs_iblks(i) (round_up((i)->i_size, i_blocksize(i)) >> (i)->i_blkbits) + +#define EROFS_FEATURE_FUNCS(name, compat, feature) \ +static inline bool erofs_sb_has_##name(struct erofs_sb_info *sbi) \ +{ \ + return sbi->feature_##compat & EROFS_FEATURE_##feature; \ +} + +EROFS_FEATURE_FUNCS(zero_padding, incompat, INCOMPAT_ZERO_PADDING) +EROFS_FEATURE_FUNCS(compr_cfgs, incompat, INCOMPAT_COMPR_CFGS) +EROFS_FEATURE_FUNCS(big_pcluster, incompat, INCOMPAT_BIG_PCLUSTER) +EROFS_FEATURE_FUNCS(chunked_file, incompat, INCOMPAT_CHUNKED_FILE) +EROFS_FEATURE_FUNCS(device_table, incompat, INCOMPAT_DEVICE_TABLE) +EROFS_FEATURE_FUNCS(compr_head2, incompat, INCOMPAT_COMPR_HEAD2) +EROFS_FEATURE_FUNCS(ztailpacking, incompat, INCOMPAT_ZTAILPACKING) +EROFS_FEATURE_FUNCS(fragments, incompat, INCOMPAT_FRAGMENTS) +EROFS_FEATURE_FUNCS(dedupe, incompat, INCOMPAT_DEDUPE) +EROFS_FEATURE_FUNCS(xattr_prefixes, incompat, INCOMPAT_XATTR_PREFIXES) +EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM) +EROFS_FEATURE_FUNCS(xattr_filter, compat, COMPAT_XATTR_FILTER) + +/* atomic flag definitions */ +#define EROFS_I_EA_INITED_BIT 0 +#define EROFS_I_Z_INITED_BIT 1 + +/* bitlock definitions (arranged in reverse order) */ +#define EROFS_I_BL_XATTR_BIT (BITS_PER_LONG - 1) +#define EROFS_I_BL_Z_BIT (BITS_PER_LONG - 2) + +struct erofs_inode { + erofs_nid_t nid; + + /* atomic flags (including bitlocks) */ + unsigned long flags; + + unsigned char datalayout; + unsigned char inode_isize; + unsigned int 
xattr_isize; + + unsigned int xattr_name_filter; + unsigned int xattr_shared_count; + unsigned int *xattr_shared_xattrs; + + union { + erofs_blk_t raw_blkaddr; + struct { + unsigned short chunkformat; + unsigned char chunkbits; + }; +#ifdef CONFIG_EROFS_FS_ZIP + struct { + unsigned short z_advise; + unsigned char z_algorithmtype[2]; + unsigned char z_logical_clusterbits; + unsigned long z_tailextent_headlcn; + union { + struct { + erofs_off_t z_idataoff; + unsigned short z_idata_size; + }; + erofs_off_t z_fragmentoff; + }; + }; +#endif /* CONFIG_EROFS_FS_ZIP */ + }; + /* the corresponding vfs inode */ + struct inode vfs_inode; +}; + +#define EROFS_I(ptr) container_of(ptr, struct erofs_inode, vfs_inode) + +static inline erofs_off_t erofs_iloc(struct inode *inode) +{ + struct erofs_sb_info *sbi = EROFS_I_SB(inode); + + return erofs_pos(inode->i_sb, sbi->meta_blkaddr) + + (EROFS_I(inode)->nid << sbi->islotbits); +} + +static inline unsigned int erofs_inode_version(unsigned int ifmt) +{ + return (ifmt >> EROFS_I_VERSION_BIT) & EROFS_I_VERSION_MASK; +} + +static inline unsigned int erofs_inode_datalayout(unsigned int ifmt) +{ + return (ifmt >> EROFS_I_DATALAYOUT_BIT) & EROFS_I_DATALAYOUT_MASK; +} + +/* + * Different from grab_cache_page_nowait(), reclaiming is never triggered + * when allocating new pages. + */ +static inline +struct page *erofs_grab_cache_page_nowait(struct address_space *mapping, + pgoff_t index) +{ + return pagecache_get_page(mapping, index, + FGP_LOCK|FGP_CREAT|FGP_NOFS|FGP_NOWAIT, + readahead_gfp_mask(mapping) & ~__GFP_RECLAIM); +} + +/* Has a disk mapping */ +#define EROFS_MAP_MAPPED 0x0001 +/* Located in metadata (could be copied from bd_inode) */ +#define EROFS_MAP_META 0x0002 +/* The extent is encoded */ +#define EROFS_MAP_ENCODED 0x0004 +/* The length of extent is full */ +#define EROFS_MAP_FULL_MAPPED 0x0008 +/* Located in the special packed inode */ +#define EROFS_MAP_FRAGMENT 0x0010 +/* The extent refers to partial decompressed data */ +#define EROFS_MAP_PARTIAL_REF 0x0020 + +struct erofs_map_blocks { + struct erofs_buf buf; + + erofs_off_t m_pa, m_la; + u64 m_plen, m_llen; + + unsigned short m_deviceid; + char m_algorithmformat; + unsigned int m_flags; +}; + +/* + * Used to get the exact decompressed length, e.g. fiemap (consider lookback + * approach instead if possible since it's more metadata lightweight.) 
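As a worked illustration of erofs_iloc() and the i_format helpers shown above, a userspace sketch (hypothetical ex_* names, not part of the patch):

#include <stdint.h>

/* byte offset of an on-disk inode, mirroring erofs_iloc() above:
 * (meta_blkaddr << blkszbits) + (nid << islotbits) */
static uint64_t ex_iloc(uint32_t meta_blkaddr, unsigned int blkszbits,
			uint64_t nid, unsigned int islotbits)
{
	return ((uint64_t)meta_blkaddr << blkszbits) + (nid << islotbits);
}

/* split i_format as erofs_inode_version()/erofs_inode_datalayout() do */
static unsigned int ex_inode_version(unsigned int ifmt)
{
	return ifmt & 0x01;		/* EROFS_I_VERSION_MASK at bit 0 */
}

static unsigned int ex_inode_datalayout(unsigned int ifmt)
{
	return (ifmt >> 1) & 0x07;	/* EROFS_I_DATALAYOUT_MASK at bit 1 */
}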
+ */ +#define EROFS_GET_BLOCKS_FIEMAP 0x0001 +/* Used to map the whole extent if non-negligible data is requested for LZMA */ +#define EROFS_GET_BLOCKS_READMORE 0x0002 +/* Used to map tail extent for tailpacking inline or fragment pcluster */ +#define EROFS_GET_BLOCKS_FINDTAIL 0x0004 + +enum { + Z_EROFS_COMPRESSION_SHIFTED = Z_EROFS_COMPRESSION_MAX, + Z_EROFS_COMPRESSION_INTERLACED, + Z_EROFS_COMPRESSION_RUNTIME_MAX +}; + +struct erofs_map_dev { + struct erofs_fscache *m_fscache; + struct block_device *m_bdev; + struct dax_device *m_daxdev; + u64 m_dax_part_off; + + erofs_off_t m_pa; + unsigned int m_deviceid; +}; + +extern struct file_system_type erofs_fs_type; +extern const struct super_operations erofs_sops; + +extern const struct address_space_operations erofs_raw_access_aops; +extern const struct address_space_operations z_erofs_aops; +extern const struct address_space_operations erofs_fscache_access_aops; + +extern const struct inode_operations erofs_generic_iops; +extern const struct inode_operations erofs_symlink_iops; +extern const struct inode_operations erofs_fast_symlink_iops; +extern const struct inode_operations erofs_dir_iops; + +extern const struct file_operations erofs_file_fops; +extern const struct file_operations erofs_dir_fops; + +extern const struct iomap_ops z_erofs_iomap_report_ops; + +/* flags for erofs_fscache_register_cookie() */ +#define EROFS_REG_COOKIE_SHARE 0x0001 +#define EROFS_REG_COOKIE_NEED_NOEXIST 0x0002 + +void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf, + erofs_off_t *offset, int *lengthp); +void erofs_unmap_metabuf(struct erofs_buf *buf); +void erofs_put_metabuf(struct erofs_buf *buf); +void *erofs_bread(struct erofs_buf *buf, erofs_blk_t blkaddr, + enum erofs_kmap_type type); +void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb); +void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb, + erofs_blk_t blkaddr, enum erofs_kmap_type type); +int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *dev); +int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len); +int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map); +struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid); +int erofs_getattr(struct mnt_idmap *idmap, const struct path *path, + struct kstat *stat, u32 request_mask, + unsigned int query_flags); +int erofs_namei(struct inode *dir, const struct qstr *name, + erofs_nid_t *nid, unsigned int *d_type); + +static inline void *erofs_vm_map_ram(struct page **pages, unsigned int count) +{ + int retried = 0; + + while (1) { + void *p = vm_map_ram(pages, count, -1); + + /* retry two more times (totally 3 times) */ + if (p || ++retried >= 3) + return p; + vm_unmap_aliases(); + } + return NULL; +} + +int erofs_register_sysfs(struct super_block *sb); +void erofs_unregister_sysfs(struct super_block *sb); +int __init erofs_init_sysfs(void); +void erofs_exit_sysfs(void); + +struct page *erofs_allocpage(struct page **pagepool, gfp_t gfp); +static inline void erofs_pagepool_add(struct page **pagepool, struct page *page) +{ + set_page_private(page, (unsigned long)*pagepool); + *pagepool = page; +} +void erofs_release_pages(struct page **pagepool); + +#ifdef CONFIG_EROFS_FS_ZIP +void erofs_workgroup_put(struct erofs_workgroup *grp); +struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb, + pgoff_t index); +struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb, + struct erofs_workgroup *grp); +void 
erofs_workgroup_free_rcu(struct erofs_workgroup *grp); +void erofs_shrinker_register(struct super_block *sb); +void erofs_shrinker_unregister(struct super_block *sb); +int __init erofs_init_shrinker(void); +void erofs_exit_shrinker(void); +int __init z_erofs_init_zip_subsystem(void); +void z_erofs_exit_zip_subsystem(void); +int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, + struct erofs_workgroup *egrp); +int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map, + int flags); +void *erofs_get_pcpubuf(unsigned int requiredpages); +void erofs_put_pcpubuf(void *ptr); +int erofs_pcpubuf_growsize(unsigned int nrpages); +void __init erofs_pcpubuf_init(void); +void erofs_pcpubuf_exit(void); +int erofs_init_managed_cache(struct super_block *sb); +int z_erofs_parse_cfgs(struct super_block *sb, struct erofs_super_block *dsb); +#else +static inline void erofs_shrinker_register(struct super_block *sb) {} +static inline void erofs_shrinker_unregister(struct super_block *sb) {} +static inline int erofs_init_shrinker(void) { return 0; } +static inline void erofs_exit_shrinker(void) {} +static inline int z_erofs_init_zip_subsystem(void) { return 0; } +static inline void z_erofs_exit_zip_subsystem(void) {} +static inline void erofs_pcpubuf_init(void) {} +static inline void erofs_pcpubuf_exit(void) {} +static inline int erofs_init_managed_cache(struct super_block *sb) { return 0; } +#endif /* !CONFIG_EROFS_FS_ZIP */ + +#ifdef CONFIG_EROFS_FS_ZIP_LZMA +int __init z_erofs_lzma_init(void); +void z_erofs_lzma_exit(void); +#else +static inline int z_erofs_lzma_init(void) { return 0; } +static inline int z_erofs_lzma_exit(void) { return 0; } +#endif /* !CONFIG_EROFS_FS_ZIP_LZMA */ + +#ifdef CONFIG_EROFS_FS_ZIP_DEFLATE +int __init z_erofs_deflate_init(void); +void z_erofs_deflate_exit(void); +#else +static inline int z_erofs_deflate_init(void) { return 0; } +static inline int z_erofs_deflate_exit(void) { return 0; } +#endif /* !CONFIG_EROFS_FS_ZIP_DEFLATE */ + +#ifdef CONFIG_EROFS_FS_ONDEMAND +int erofs_fscache_register_fs(struct super_block *sb); +void erofs_fscache_unregister_fs(struct super_block *sb); + +struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb, + char *name, unsigned int flags); +void erofs_fscache_unregister_cookie(struct erofs_fscache *fscache); +#else +static inline int erofs_fscache_register_fs(struct super_block *sb) +{ + return -EOPNOTSUPP; +} +static inline void erofs_fscache_unregister_fs(struct super_block *sb) {} + +static inline +struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb, + char *name, unsigned int flags) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline void erofs_fscache_unregister_cookie(struct erofs_fscache *fscache) +{ +} +#endif + +#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ + +#endif /* __EROFS_INTERNAL_H */ diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c new file mode 100644 index 0000000000..d4f631d39f --- /dev/null +++ b/fs/erofs/namei.c @@ -0,0 +1,224 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2017-2018 HUAWEI, Inc. 
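For reference, the EROFS_FEATURE_FUNCS() macro in the internal.h hunk above generates one inline predicate per feature; expanded by hand, a single instance reads roughly as follows (illustration only, relying on the definitions above):

/* what EROFS_FEATURE_FUNCS(fragments, incompat, INCOMPAT_FRAGMENTS)
 * expands to, give or take whitespace */
static inline bool erofs_sb_has_fragments(struct erofs_sb_info *sbi)
{
	return sbi->feature_incompat & EROFS_FEATURE_INCOMPAT_FRAGMENTS;
}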
+ * https://www.huawei.com/
+ * Copyright (C) 2022, Alibaba Cloud
+ */
+#include "xattr.h"
+#include <trace/events/erofs.h>
+
+struct erofs_qstr {
+	const unsigned char *name;
+	const unsigned char *end;
+};
+
+/* assumes qn->end is accurate and that qn has a trailing '\0' */
+static inline int erofs_dirnamecmp(const struct erofs_qstr *qn,
+				   const struct erofs_qstr *qd,
+				   unsigned int *matched)
+{
+	unsigned int i = *matched;
+
+	/*
+	 * In case of an on-disk error, only BUG_ON in debugging mode;
+	 * otherwise, return 1 to simply skip the invalid name and move on
+	 * (for the sake of lookup performance).
+	 */
+	DBG_BUGON(qd->name > qd->end);
+
+	/* qd may not have a trailing '\0' */
+	/* however, access remains safe as long as it stays below qd->end */
+	while (qd->name + i < qd->end && qd->name[i] != '\0') {
+		if (qn->name[i] != qd->name[i]) {
+			*matched = i;
+			return qn->name[i] > qd->name[i] ? 1 : -1;
+		}
+		++i;
+	}
+	*matched = i;
+	/* See comments in __d_alloc on the terminating NUL character */
+	return qn->name[i] == '\0' ? 0 : 1;
+}
+
+#define nameoff_from_disk(off, sz)	(le16_to_cpu(off) & ((sz) - 1))
+
+static struct erofs_dirent *find_target_dirent(struct erofs_qstr *name,
+					       u8 *data,
+					       unsigned int dirblksize,
+					       const int ndirents)
+{
+	int head, back;
+	unsigned int startprfx, endprfx;
+	struct erofs_dirent *const de = (struct erofs_dirent *)data;
+
+	/* since the 1st dirent has been evaluated previously */
+	head = 1;
+	back = ndirents - 1;
+	startprfx = endprfx = 0;
+
+	while (head <= back) {
+		const int mid = head + (back - head) / 2;
+		const int nameoff = nameoff_from_disk(de[mid].nameoff,
+						      dirblksize);
+		unsigned int matched = min(startprfx, endprfx);
+		struct erofs_qstr dname = {
+			.name = data + nameoff,
+			.end = mid >= ndirents - 1 ?
+ data + dirblksize : + data + nameoff_from_disk(de[mid + 1].nameoff, + dirblksize) + }; + + /* string comparison without already matched prefix */ + int ret = erofs_dirnamecmp(name, &dname, &matched); + + if (!ret) { + return de + mid; + } else if (ret > 0) { + head = mid + 1; + startprfx = matched; + } else { + back = mid - 1; + endprfx = matched; + } + } + + return ERR_PTR(-ENOENT); +} + +static void *erofs_find_target_block(struct erofs_buf *target, + struct inode *dir, struct erofs_qstr *name, int *_ndirents) +{ + unsigned int bsz = i_blocksize(dir); + int head = 0, back = erofs_iblks(dir) - 1; + unsigned int startprfx = 0, endprfx = 0; + void *candidate = ERR_PTR(-ENOENT); + + while (head <= back) { + const int mid = head + (back - head) / 2; + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; + struct erofs_dirent *de; + + buf.inode = dir; + de = erofs_bread(&buf, mid, EROFS_KMAP); + if (!IS_ERR(de)) { + const int nameoff = nameoff_from_disk(de->nameoff, bsz); + const int ndirents = nameoff / sizeof(*de); + int diff; + unsigned int matched; + struct erofs_qstr dname; + + if (!ndirents) { + erofs_put_metabuf(&buf); + erofs_err(dir->i_sb, + "corrupted dir block %d @ nid %llu", + mid, EROFS_I(dir)->nid); + DBG_BUGON(1); + de = ERR_PTR(-EFSCORRUPTED); + goto out; + } + + matched = min(startprfx, endprfx); + + dname.name = (u8 *)de + nameoff; + if (ndirents == 1) + dname.end = (u8 *)de + bsz; + else + dname.end = (u8 *)de + + nameoff_from_disk(de[1].nameoff, bsz); + + /* string comparison without already matched prefix */ + diff = erofs_dirnamecmp(name, &dname, &matched); + + if (!diff) { + *_ndirents = 0; + goto out; + } else if (diff > 0) { + head = mid + 1; + startprfx = matched; + + if (!IS_ERR(candidate)) + erofs_put_metabuf(target); + *target = buf; + candidate = de; + *_ndirents = ndirents; + } else { + erofs_put_metabuf(&buf); + + back = mid - 1; + endprfx = matched; + } + continue; + } +out: /* free if the candidate is valid */ + if (!IS_ERR(candidate)) + erofs_put_metabuf(target); + return de; + } + return candidate; +} + +int erofs_namei(struct inode *dir, const struct qstr *name, erofs_nid_t *nid, + unsigned int *d_type) +{ + int ndirents; + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; + struct erofs_dirent *de; + struct erofs_qstr qn; + + if (!dir->i_size) + return -ENOENT; + + qn.name = name->name; + qn.end = name->name + name->len; + buf.inode = dir; + + ndirents = 0; + de = erofs_find_target_block(&buf, dir, &qn, &ndirents); + if (IS_ERR(de)) + return PTR_ERR(de); + + if (ndirents) + de = find_target_dirent(&qn, (u8 *)de, i_blocksize(dir), + ndirents); + + if (!IS_ERR(de)) { + *nid = le64_to_cpu(de->nid); + *d_type = de->file_type; + } + erofs_put_metabuf(&buf); + return PTR_ERR_OR_ZERO(de); +} + +static struct dentry *erofs_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + int err; + erofs_nid_t nid; + unsigned int d_type; + struct inode *inode; + + trace_erofs_lookup(dir, dentry, flags); + + if (dentry->d_name.len > EROFS_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + + err = erofs_namei(dir, &dentry->d_name, &nid, &d_type); + + if (err == -ENOENT) + /* negative dentry */ + inode = NULL; + else if (err) + inode = ERR_PTR(err); + else + inode = erofs_iget(dir->i_sb, nid); + return d_splice_alias(inode, dentry); +} + +const struct inode_operations erofs_dir_iops = { + .lookup = erofs_lookup, + .getattr = erofs_getattr, + .listxattr = erofs_listxattr, + .get_inode_acl = erofs_get_acl, + .fiemap = erofs_fiemap, +}; diff --git a/fs/erofs/pcpubuf.c 
b/fs/erofs/pcpubuf.c new file mode 100644 index 0000000000..c7a4b1d770 --- /dev/null +++ b/fs/erofs/pcpubuf.c @@ -0,0 +1,148 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) Gao Xiang <xiang@kernel.org> + * + * For low-latency decompression algorithms (e.g. lz4), reserve consecutive + * per-CPU virtual memory (in pages) in advance to store such inplace I/O + * data if inplace decompression is failed (due to unmet inplace margin for + * example). + */ +#include "internal.h" + +struct erofs_pcpubuf { + raw_spinlock_t lock; + void *ptr; + struct page **pages; + unsigned int nrpages; +}; + +static DEFINE_PER_CPU(struct erofs_pcpubuf, erofs_pcb); + +void *erofs_get_pcpubuf(unsigned int requiredpages) + __acquires(pcb->lock) +{ + struct erofs_pcpubuf *pcb = &get_cpu_var(erofs_pcb); + + raw_spin_lock(&pcb->lock); + /* check if the per-CPU buffer is too small */ + if (requiredpages > pcb->nrpages) { + raw_spin_unlock(&pcb->lock); + put_cpu_var(erofs_pcb); + /* (for sparse checker) pretend pcb->lock is still taken */ + __acquire(pcb->lock); + return NULL; + } + return pcb->ptr; +} + +void erofs_put_pcpubuf(void *ptr) __releases(pcb->lock) +{ + struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, smp_processor_id()); + + DBG_BUGON(pcb->ptr != ptr); + raw_spin_unlock(&pcb->lock); + put_cpu_var(erofs_pcb); +} + +/* the next step: support per-CPU page buffers hotplug */ +int erofs_pcpubuf_growsize(unsigned int nrpages) +{ + static DEFINE_MUTEX(pcb_resize_mutex); + static unsigned int pcb_nrpages; + struct page *pagepool = NULL; + int delta, cpu, ret, i; + + mutex_lock(&pcb_resize_mutex); + delta = nrpages - pcb_nrpages; + ret = 0; + /* avoid shrinking pcpubuf, since no idea how many fses rely on */ + if (delta <= 0) + goto out; + + for_each_possible_cpu(cpu) { + struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu); + struct page **pages, **oldpages; + void *ptr, *old_ptr; + + pages = kmalloc_array(nrpages, sizeof(*pages), GFP_KERNEL); + if (!pages) { + ret = -ENOMEM; + break; + } + + for (i = 0; i < nrpages; ++i) { + pages[i] = erofs_allocpage(&pagepool, GFP_KERNEL); + if (!pages[i]) { + ret = -ENOMEM; + oldpages = pages; + goto free_pagearray; + } + } + ptr = vmap(pages, nrpages, VM_MAP, PAGE_KERNEL); + if (!ptr) { + ret = -ENOMEM; + oldpages = pages; + goto free_pagearray; + } + raw_spin_lock(&pcb->lock); + old_ptr = pcb->ptr; + pcb->ptr = ptr; + oldpages = pcb->pages; + pcb->pages = pages; + i = pcb->nrpages; + pcb->nrpages = nrpages; + raw_spin_unlock(&pcb->lock); + + if (!oldpages) { + DBG_BUGON(old_ptr); + continue; + } + + if (old_ptr) + vunmap(old_ptr); +free_pagearray: + while (i) + erofs_pagepool_add(&pagepool, oldpages[--i]); + kfree(oldpages); + if (ret) + break; + } + pcb_nrpages = nrpages; + erofs_release_pages(&pagepool); +out: + mutex_unlock(&pcb_resize_mutex); + return ret; +} + +void __init erofs_pcpubuf_init(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu); + + raw_spin_lock_init(&pcb->lock); + } +} + +void erofs_pcpubuf_exit(void) +{ + int cpu, i; + + for_each_possible_cpu(cpu) { + struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu); + + if (pcb->ptr) { + vunmap(pcb->ptr); + pcb->ptr = NULL; + } + if (!pcb->pages) + continue; + + for (i = 0; i < pcb->nrpages; ++i) + if (pcb->pages[i]) + put_page(pcb->pages[i]); + kfree(pcb->pages); + pcb->pages = NULL; + } +} diff --git a/fs/erofs/super.c b/fs/erofs/super.c new file mode 100644 index 0000000000..cc44fb2e00 --- /dev/null +++ b/fs/erofs/super.c @@ -0,0 +1,1021 @@ 
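The super.c hunk below opens with superblock checksum verification. A userspace sketch of the same check, for illustration only: it assumes a Castagnoli crc32c() routine is available (declared extern here, hypothetical), a little-endian host, and the standard 1024-byte superblock offset with the 32-bit checksum field at byte 4 of the superblock; blk0 points to a buffer holding the whole first filesystem block:

#include <stdint.h>
#include <string.h>

uint32_t crc32c(uint32_t crc, const void *buf, size_t len);	/* assumed helper */

#define EX_SUPER_OFFSET	1024	/* EROFS_SUPER_OFFSET */

/* zero the checksum field, then crc32c(~0, ...) the remainder of the block,
 * mirroring erofs_superblock_csum_verify() below */
static int ex_check_sb_csum(const uint8_t *blk0, uint32_t blksz)
{
	uint8_t copy[4096];
	uint32_t expected, crc, len = blksz;

	if (len > EX_SUPER_OFFSET)
		len -= EX_SUPER_OFFSET;
	if (len > sizeof(copy))
		return -1;
	memcpy(copy, blk0 + EX_SUPER_OFFSET, len);
	memcpy(&expected, copy + 4, sizeof(expected));	/* ->checksum (LE) */
	memset(copy + 4, 0, sizeof(expected));
	crc = crc32c(~0U, copy, len);
	return crc == expected ? 0 : -1;
}

A return of 0 corresponds to a matching checksum; the kernel function below reports a mismatch as -EBADMSG instead.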
+// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2017-2018 HUAWEI, Inc. + * https://www.huawei.com/ + * Copyright (C) 2021, Alibaba Cloud + */ +#include <linux/module.h> +#include <linux/statfs.h> +#include <linux/parser.h> +#include <linux/seq_file.h> +#include <linux/crc32c.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h> +#include <linux/dax.h> +#include <linux/exportfs.h> +#include "xattr.h" + +#define CREATE_TRACE_POINTS +#include <trace/events/erofs.h> + +static struct kmem_cache *erofs_inode_cachep __read_mostly; + +void _erofs_err(struct super_block *sb, const char *func, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + pr_err("(device %s): %s: %pV", sb->s_id, func, &vaf); + va_end(args); +} + +void _erofs_info(struct super_block *sb, const char *func, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + pr_info("(device %s): %pV", sb->s_id, &vaf); + va_end(args); +} + +static int erofs_superblock_csum_verify(struct super_block *sb, void *sbdata) +{ + size_t len = 1 << EROFS_SB(sb)->blkszbits; + struct erofs_super_block *dsb; + u32 expected_crc, crc; + + if (len > EROFS_SUPER_OFFSET) + len -= EROFS_SUPER_OFFSET; + + dsb = kmemdup(sbdata + EROFS_SUPER_OFFSET, len, GFP_KERNEL); + if (!dsb) + return -ENOMEM; + + expected_crc = le32_to_cpu(dsb->checksum); + dsb->checksum = 0; + /* to allow for x86 boot sectors and other oddities. */ + crc = crc32c(~0, dsb, len); + kfree(dsb); + + if (crc != expected_crc) { + erofs_err(sb, "invalid checksum 0x%08x, 0x%08x expected", + crc, expected_crc); + return -EBADMSG; + } + return 0; +} + +static void erofs_inode_init_once(void *ptr) +{ + struct erofs_inode *vi = ptr; + + inode_init_once(&vi->vfs_inode); +} + +static struct inode *erofs_alloc_inode(struct super_block *sb) +{ + struct erofs_inode *vi = + alloc_inode_sb(sb, erofs_inode_cachep, GFP_KERNEL); + + if (!vi) + return NULL; + + /* zero out everything except vfs_inode */ + memset(vi, 0, offsetof(struct erofs_inode, vfs_inode)); + return &vi->vfs_inode; +} + +static void erofs_free_inode(struct inode *inode) +{ + struct erofs_inode *vi = EROFS_I(inode); + + if (inode->i_op == &erofs_fast_symlink_iops) + kfree(inode->i_link); + kfree(vi->xattr_shared_xattrs); + kmem_cache_free(erofs_inode_cachep, vi); +} + +static bool check_layout_compatibility(struct super_block *sb, + struct erofs_super_block *dsb) +{ + const unsigned int feature = le32_to_cpu(dsb->feature_incompat); + + EROFS_SB(sb)->feature_incompat = feature; + + /* check if current kernel meets all mandatory requirements */ + if (feature & (~EROFS_ALL_FEATURE_INCOMPAT)) { + erofs_err(sb, "unidentified incompatible feature %x, please upgrade kernel", + feature & ~EROFS_ALL_FEATURE_INCOMPAT); + return false; + } + return true; +} + +/* read variable-sized metadata, offset will be aligned by 4-byte */ +void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf, + erofs_off_t *offset, int *lengthp) +{ + u8 *buffer, *ptr; + int len, i, cnt; + + *offset = round_up(*offset, 4); + ptr = erofs_bread(buf, erofs_blknr(sb, *offset), EROFS_KMAP); + if (IS_ERR(ptr)) + return ptr; + + len = le16_to_cpu(*(__le16 *)&ptr[erofs_blkoff(sb, *offset)]); + if (!len) + len = U16_MAX + 1; + buffer = kmalloc(len, GFP_KERNEL); + if (!buffer) + return ERR_PTR(-ENOMEM); + *offset += sizeof(__le16); + *lengthp = len; + + for (i = 0; i < len; i += cnt) { + cnt = 
min_t(int, sb->s_blocksize - erofs_blkoff(sb, *offset), + len - i); + ptr = erofs_bread(buf, erofs_blknr(sb, *offset), EROFS_KMAP); + if (IS_ERR(ptr)) { + kfree(buffer); + return ptr; + } + memcpy(buffer + i, ptr + erofs_blkoff(sb, *offset), cnt); + *offset += cnt; + } + return buffer; +} + +#ifndef CONFIG_EROFS_FS_ZIP +static int z_erofs_parse_cfgs(struct super_block *sb, + struct erofs_super_block *dsb) +{ + if (!dsb->u1.available_compr_algs) + return 0; + + erofs_err(sb, "compression disabled, unable to mount compressed EROFS"); + return -EOPNOTSUPP; +} +#endif + +static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, + struct erofs_device_info *dif, erofs_off_t *pos) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); + struct erofs_fscache *fscache; + struct erofs_deviceslot *dis; + struct block_device *bdev; + void *ptr; + + ptr = erofs_read_metabuf(buf, sb, erofs_blknr(sb, *pos), EROFS_KMAP); + if (IS_ERR(ptr)) + return PTR_ERR(ptr); + dis = ptr + erofs_blkoff(sb, *pos); + + if (!sbi->devs->flatdev && !dif->path) { + if (!dis->tag[0]) { + erofs_err(sb, "empty device tag @ pos %llu", *pos); + return -EINVAL; + } + dif->path = kmemdup_nul(dis->tag, sizeof(dis->tag), GFP_KERNEL); + if (!dif->path) + return -ENOMEM; + } + + if (erofs_is_fscache_mode(sb)) { + fscache = erofs_fscache_register_cookie(sb, dif->path, 0); + if (IS_ERR(fscache)) + return PTR_ERR(fscache); + dif->fscache = fscache; + } else if (!sbi->devs->flatdev) { + bdev = blkdev_get_by_path(dif->path, BLK_OPEN_READ, sb->s_type, + NULL); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); + dif->bdev = bdev; + dif->dax_dev = fs_dax_get_by_bdev(bdev, &dif->dax_part_off, + NULL, NULL); + } + + dif->blocks = le32_to_cpu(dis->blocks); + dif->mapped_blkaddr = le32_to_cpu(dis->mapped_blkaddr); + sbi->total_blocks += dif->blocks; + *pos += EROFS_DEVT_SLOT_SIZE; + return 0; +} + +static int erofs_scan_devices(struct super_block *sb, + struct erofs_super_block *dsb) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); + unsigned int ondisk_extradevs; + erofs_off_t pos; + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; + struct erofs_device_info *dif; + int id, err = 0; + + sbi->total_blocks = sbi->primarydevice_blocks; + if (!erofs_sb_has_device_table(sbi)) + ondisk_extradevs = 0; + else + ondisk_extradevs = le16_to_cpu(dsb->extra_devices); + + if (sbi->devs->extra_devices && + ondisk_extradevs != sbi->devs->extra_devices) { + erofs_err(sb, "extra devices don't match (ondisk %u, given %u)", + ondisk_extradevs, sbi->devs->extra_devices); + return -EINVAL; + } + if (!ondisk_extradevs) + return 0; + + if (!sbi->devs->extra_devices && !erofs_is_fscache_mode(sb)) + sbi->devs->flatdev = true; + + sbi->device_id_mask = roundup_pow_of_two(ondisk_extradevs + 1) - 1; + pos = le16_to_cpu(dsb->devt_slotoff) * EROFS_DEVT_SLOT_SIZE; + down_read(&sbi->devs->rwsem); + if (sbi->devs->extra_devices) { + idr_for_each_entry(&sbi->devs->tree, dif, id) { + err = erofs_init_device(&buf, sb, dif, &pos); + if (err) + break; + } + } else { + for (id = 0; id < ondisk_extradevs; id++) { + dif = kzalloc(sizeof(*dif), GFP_KERNEL); + if (!dif) { + err = -ENOMEM; + break; + } + + err = idr_alloc(&sbi->devs->tree, dif, 0, 0, GFP_KERNEL); + if (err < 0) { + kfree(dif); + break; + } + ++sbi->devs->extra_devices; + + err = erofs_init_device(&buf, sb, dif, &pos); + if (err) + break; + } + } + up_read(&sbi->devs->rwsem); + erofs_put_metabuf(&buf); + return err; +} + +static int erofs_read_superblock(struct super_block *sb) +{ + struct erofs_sb_info *sbi; + struct 
erofs_buf buf = __EROFS_BUF_INITIALIZER; + struct erofs_super_block *dsb; + void *data; + int ret; + + data = erofs_read_metabuf(&buf, sb, 0, EROFS_KMAP); + if (IS_ERR(data)) { + erofs_err(sb, "cannot read erofs superblock"); + return PTR_ERR(data); + } + + sbi = EROFS_SB(sb); + dsb = (struct erofs_super_block *)(data + EROFS_SUPER_OFFSET); + + ret = -EINVAL; + if (le32_to_cpu(dsb->magic) != EROFS_SUPER_MAGIC_V1) { + erofs_err(sb, "cannot find valid erofs superblock"); + goto out; + } + + sbi->blkszbits = dsb->blkszbits; + if (sbi->blkszbits < 9 || sbi->blkszbits > PAGE_SHIFT) { + erofs_err(sb, "blkszbits %u isn't supported", sbi->blkszbits); + goto out; + } + if (dsb->dirblkbits) { + erofs_err(sb, "dirblkbits %u isn't supported", dsb->dirblkbits); + goto out; + } + + sbi->feature_compat = le32_to_cpu(dsb->feature_compat); + if (erofs_sb_has_sb_chksum(sbi)) { + ret = erofs_superblock_csum_verify(sb, data); + if (ret) + goto out; + } + + ret = -EINVAL; + if (!check_layout_compatibility(sb, dsb)) + goto out; + + sbi->sb_size = 128 + dsb->sb_extslots * EROFS_SB_EXTSLOT_SIZE; + if (sbi->sb_size > PAGE_SIZE - EROFS_SUPER_OFFSET) { + erofs_err(sb, "invalid sb_extslots %u (more than a fs block)", + sbi->sb_size); + goto out; + } + sbi->primarydevice_blocks = le32_to_cpu(dsb->blocks); + sbi->meta_blkaddr = le32_to_cpu(dsb->meta_blkaddr); +#ifdef CONFIG_EROFS_FS_XATTR + sbi->xattr_blkaddr = le32_to_cpu(dsb->xattr_blkaddr); + sbi->xattr_prefix_start = le32_to_cpu(dsb->xattr_prefix_start); + sbi->xattr_prefix_count = dsb->xattr_prefix_count; + sbi->xattr_filter_reserved = dsb->xattr_filter_reserved; +#endif + sbi->islotbits = ilog2(sizeof(struct erofs_inode_compact)); + sbi->root_nid = le16_to_cpu(dsb->root_nid); + sbi->packed_nid = le64_to_cpu(dsb->packed_nid); + sbi->inos = le64_to_cpu(dsb->inos); + + sbi->build_time = le64_to_cpu(dsb->build_time); + sbi->build_time_nsec = le32_to_cpu(dsb->build_time_nsec); + + memcpy(&sb->s_uuid, dsb->uuid, sizeof(dsb->uuid)); + + ret = strscpy(sbi->volume_name, dsb->volume_name, + sizeof(dsb->volume_name)); + if (ret < 0) { /* -E2BIG */ + erofs_err(sb, "bad volume name without NIL terminator"); + ret = -EFSCORRUPTED; + goto out; + } + + /* parse on-disk compression configurations */ + ret = z_erofs_parse_cfgs(sb, dsb); + if (ret < 0) + goto out; + + /* handle multiple devices */ + ret = erofs_scan_devices(sb, dsb); + + if (erofs_is_fscache_mode(sb)) + erofs_info(sb, "EXPERIMENTAL fscache-based on-demand read feature in use. 
Use at your own risk!"); +out: + erofs_put_metabuf(&buf); + return ret; +} + +static void erofs_default_options(struct erofs_fs_context *ctx) +{ +#ifdef CONFIG_EROFS_FS_ZIP + ctx->opt.cache_strategy = EROFS_ZIP_CACHE_READAROUND; + ctx->opt.max_sync_decompress_pages = 3; + ctx->opt.sync_decompress = EROFS_SYNC_DECOMPRESS_AUTO; +#endif +#ifdef CONFIG_EROFS_FS_XATTR + set_opt(&ctx->opt, XATTR_USER); +#endif +#ifdef CONFIG_EROFS_FS_POSIX_ACL + set_opt(&ctx->opt, POSIX_ACL); +#endif +} + +enum { + Opt_user_xattr, + Opt_acl, + Opt_cache_strategy, + Opt_dax, + Opt_dax_enum, + Opt_device, + Opt_fsid, + Opt_domain_id, + Opt_err +}; + +static const struct constant_table erofs_param_cache_strategy[] = { + {"disabled", EROFS_ZIP_CACHE_DISABLED}, + {"readahead", EROFS_ZIP_CACHE_READAHEAD}, + {"readaround", EROFS_ZIP_CACHE_READAROUND}, + {} +}; + +static const struct constant_table erofs_dax_param_enums[] = { + {"always", EROFS_MOUNT_DAX_ALWAYS}, + {"never", EROFS_MOUNT_DAX_NEVER}, + {} +}; + +static const struct fs_parameter_spec erofs_fs_parameters[] = { + fsparam_flag_no("user_xattr", Opt_user_xattr), + fsparam_flag_no("acl", Opt_acl), + fsparam_enum("cache_strategy", Opt_cache_strategy, + erofs_param_cache_strategy), + fsparam_flag("dax", Opt_dax), + fsparam_enum("dax", Opt_dax_enum, erofs_dax_param_enums), + fsparam_string("device", Opt_device), + fsparam_string("fsid", Opt_fsid), + fsparam_string("domain_id", Opt_domain_id), + {} +}; + +static bool erofs_fc_set_dax_mode(struct fs_context *fc, unsigned int mode) +{ +#ifdef CONFIG_FS_DAX + struct erofs_fs_context *ctx = fc->fs_private; + + switch (mode) { + case EROFS_MOUNT_DAX_ALWAYS: + warnfc(fc, "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); + set_opt(&ctx->opt, DAX_ALWAYS); + clear_opt(&ctx->opt, DAX_NEVER); + return true; + case EROFS_MOUNT_DAX_NEVER: + set_opt(&ctx->opt, DAX_NEVER); + clear_opt(&ctx->opt, DAX_ALWAYS); + return true; + default: + DBG_BUGON(1); + return false; + } +#else + errorfc(fc, "dax options not supported"); + return false; +#endif +} + +static int erofs_fc_parse_param(struct fs_context *fc, + struct fs_parameter *param) +{ + struct erofs_fs_context *ctx = fc->fs_private; + struct fs_parse_result result; + struct erofs_device_info *dif; + int opt, ret; + + opt = fs_parse(fc, erofs_fs_parameters, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_user_xattr: +#ifdef CONFIG_EROFS_FS_XATTR + if (result.boolean) + set_opt(&ctx->opt, XATTR_USER); + else + clear_opt(&ctx->opt, XATTR_USER); +#else + errorfc(fc, "{,no}user_xattr options not supported"); +#endif + break; + case Opt_acl: +#ifdef CONFIG_EROFS_FS_POSIX_ACL + if (result.boolean) + set_opt(&ctx->opt, POSIX_ACL); + else + clear_opt(&ctx->opt, POSIX_ACL); +#else + errorfc(fc, "{,no}acl options not supported"); +#endif + break; + case Opt_cache_strategy: +#ifdef CONFIG_EROFS_FS_ZIP + ctx->opt.cache_strategy = result.uint_32; +#else + errorfc(fc, "compression not supported, cache_strategy ignored"); +#endif + break; + case Opt_dax: + if (!erofs_fc_set_dax_mode(fc, EROFS_MOUNT_DAX_ALWAYS)) + return -EINVAL; + break; + case Opt_dax_enum: + if (!erofs_fc_set_dax_mode(fc, result.uint_32)) + return -EINVAL; + break; + case Opt_device: + dif = kzalloc(sizeof(*dif), GFP_KERNEL); + if (!dif) + return -ENOMEM; + dif->path = kstrdup(param->string, GFP_KERNEL); + if (!dif->path) { + kfree(dif); + return -ENOMEM; + } + down_write(&ctx->devs->rwsem); + ret = idr_alloc(&ctx->devs->tree, dif, 0, 0, GFP_KERNEL); + up_write(&ctx->devs->rwsem); + if 
(ret < 0) { + kfree(dif->path); + kfree(dif); + return ret; + } + ++ctx->devs->extra_devices; + break; +#ifdef CONFIG_EROFS_FS_ONDEMAND + case Opt_fsid: + kfree(ctx->fsid); + ctx->fsid = kstrdup(param->string, GFP_KERNEL); + if (!ctx->fsid) + return -ENOMEM; + break; + case Opt_domain_id: + kfree(ctx->domain_id); + ctx->domain_id = kstrdup(param->string, GFP_KERNEL); + if (!ctx->domain_id) + return -ENOMEM; + break; +#else + case Opt_fsid: + case Opt_domain_id: + errorfc(fc, "%s option not supported", erofs_fs_parameters[opt].name); + break; +#endif + default: + return -ENOPARAM; + } + return 0; +} + +static struct inode *erofs_nfs_get_inode(struct super_block *sb, + u64 ino, u32 generation) +{ + return erofs_iget(sb, ino); +} + +static struct dentry *erofs_fh_to_dentry(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + return generic_fh_to_dentry(sb, fid, fh_len, fh_type, + erofs_nfs_get_inode); +} + +static struct dentry *erofs_fh_to_parent(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + return generic_fh_to_parent(sb, fid, fh_len, fh_type, + erofs_nfs_get_inode); +} + +static struct dentry *erofs_get_parent(struct dentry *child) +{ + erofs_nid_t nid; + unsigned int d_type; + int err; + + err = erofs_namei(d_inode(child), &dotdot_name, &nid, &d_type); + if (err) + return ERR_PTR(err); + return d_obtain_alias(erofs_iget(child->d_sb, nid)); +} + +static const struct export_operations erofs_export_ops = { + .fh_to_dentry = erofs_fh_to_dentry, + .fh_to_parent = erofs_fh_to_parent, + .get_parent = erofs_get_parent, +}; + +static int erofs_fc_fill_pseudo_super(struct super_block *sb, struct fs_context *fc) +{ + static const struct tree_descr empty_descr = {""}; + + return simple_fill_super(sb, EROFS_SUPER_MAGIC, &empty_descr); +} + +static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) +{ + struct inode *inode; + struct erofs_sb_info *sbi; + struct erofs_fs_context *ctx = fc->fs_private; + int err; + + sb->s_magic = EROFS_SUPER_MAGIC; + sb->s_flags |= SB_RDONLY | SB_NOATIME; + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_op = &erofs_sops; + + sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + + sb->s_fs_info = sbi; + sbi->opt = ctx->opt; + sbi->devs = ctx->devs; + ctx->devs = NULL; + sbi->fsid = ctx->fsid; + ctx->fsid = NULL; + sbi->domain_id = ctx->domain_id; + ctx->domain_id = NULL; + + sbi->blkszbits = PAGE_SHIFT; + if (erofs_is_fscache_mode(sb)) { + sb->s_blocksize = PAGE_SIZE; + sb->s_blocksize_bits = PAGE_SHIFT; + + err = erofs_fscache_register_fs(sb); + if (err) + return err; + + err = super_setup_bdi(sb); + if (err) + return err; + } else { + if (!sb_set_blocksize(sb, PAGE_SIZE)) { + errorfc(fc, "failed to set initial blksize"); + return -EINVAL; + } + + sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev, + &sbi->dax_part_off, + NULL, NULL); + } + + err = erofs_read_superblock(sb); + if (err) + return err; + + if (sb->s_blocksize_bits != sbi->blkszbits) { + if (erofs_is_fscache_mode(sb)) { + errorfc(fc, "unsupported blksize for fscache mode"); + return -EINVAL; + } + if (!sb_set_blocksize(sb, 1 << sbi->blkszbits)) { + errorfc(fc, "failed to set erofs blksize"); + return -EINVAL; + } + } + + if (test_opt(&sbi->opt, DAX_ALWAYS)) { + if (!sbi->dax_dev) { + errorfc(fc, "DAX unsupported by block device. 
Turning off DAX."); + clear_opt(&sbi->opt, DAX_ALWAYS); + } else if (sbi->blkszbits != PAGE_SHIFT) { + errorfc(fc, "unsupported blocksize for DAX"); + clear_opt(&sbi->opt, DAX_ALWAYS); + } + } + + sb->s_time_gran = 1; + sb->s_xattr = erofs_xattr_handlers; + sb->s_export_op = &erofs_export_ops; + + if (test_opt(&sbi->opt, POSIX_ACL)) + sb->s_flags |= SB_POSIXACL; + else + sb->s_flags &= ~SB_POSIXACL; + +#ifdef CONFIG_EROFS_FS_ZIP + xa_init(&sbi->managed_pslots); +#endif + + inode = erofs_iget(sb, ROOT_NID(sbi)); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + if (!S_ISDIR(inode->i_mode)) { + erofs_err(sb, "rootino(nid %llu) is not a directory(i_mode %o)", + ROOT_NID(sbi), inode->i_mode); + iput(inode); + return -EINVAL; + } + + sb->s_root = d_make_root(inode); + if (!sb->s_root) + return -ENOMEM; + + erofs_shrinker_register(sb); + if (erofs_sb_has_fragments(sbi) && sbi->packed_nid) { + sbi->packed_inode = erofs_iget(sb, sbi->packed_nid); + if (IS_ERR(sbi->packed_inode)) { + err = PTR_ERR(sbi->packed_inode); + sbi->packed_inode = NULL; + return err; + } + } + err = erofs_init_managed_cache(sb); + if (err) + return err; + + err = erofs_xattr_prefixes_init(sb); + if (err) + return err; + + err = erofs_register_sysfs(sb); + if (err) + return err; + + erofs_info(sb, "mounted with root inode @ nid %llu.", ROOT_NID(sbi)); + return 0; +} + +static int erofs_fc_anon_get_tree(struct fs_context *fc) +{ + return get_tree_nodev(fc, erofs_fc_fill_pseudo_super); +} + +static int erofs_fc_get_tree(struct fs_context *fc) +{ + struct erofs_fs_context *ctx = fc->fs_private; + + if (IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && ctx->fsid) + return get_tree_nodev(fc, erofs_fc_fill_super); + + return get_tree_bdev(fc, erofs_fc_fill_super); +} + +static int erofs_fc_reconfigure(struct fs_context *fc) +{ + struct super_block *sb = fc->root->d_sb; + struct erofs_sb_info *sbi = EROFS_SB(sb); + struct erofs_fs_context *ctx = fc->fs_private; + + DBG_BUGON(!sb_rdonly(sb)); + + if (ctx->fsid || ctx->domain_id) + erofs_info(sb, "ignoring reconfiguration for fsid|domain_id."); + + if (test_opt(&ctx->opt, POSIX_ACL)) + fc->sb_flags |= SB_POSIXACL; + else + fc->sb_flags &= ~SB_POSIXACL; + + sbi->opt = ctx->opt; + + fc->sb_flags |= SB_RDONLY; + return 0; +} + +static int erofs_release_device_info(int id, void *ptr, void *data) +{ + struct erofs_device_info *dif = ptr; + + fs_put_dax(dif->dax_dev, NULL); + if (dif->bdev) + blkdev_put(dif->bdev, &erofs_fs_type); + erofs_fscache_unregister_cookie(dif->fscache); + dif->fscache = NULL; + kfree(dif->path); + kfree(dif); + return 0; +} + +static void erofs_free_dev_context(struct erofs_dev_context *devs) +{ + if (!devs) + return; + idr_for_each(&devs->tree, &erofs_release_device_info, NULL); + idr_destroy(&devs->tree); + kfree(devs); +} + +static void erofs_fc_free(struct fs_context *fc) +{ + struct erofs_fs_context *ctx = fc->fs_private; + + erofs_free_dev_context(ctx->devs); + kfree(ctx->fsid); + kfree(ctx->domain_id); + kfree(ctx); +} + +static const struct fs_context_operations erofs_context_ops = { + .parse_param = erofs_fc_parse_param, + .get_tree = erofs_fc_get_tree, + .reconfigure = erofs_fc_reconfigure, + .free = erofs_fc_free, +}; + +static const struct fs_context_operations erofs_anon_context_ops = { + .get_tree = erofs_fc_anon_get_tree, +}; + +static int erofs_init_fs_context(struct fs_context *fc) +{ + struct erofs_fs_context *ctx; + + /* pseudo mount for anon inodes */ + if (fc->sb_flags & SB_KERNMOUNT) { + fc->ops = &erofs_anon_context_ops; + return 0; + } + + ctx = 
kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + ctx->devs = kzalloc(sizeof(struct erofs_dev_context), GFP_KERNEL); + if (!ctx->devs) { + kfree(ctx); + return -ENOMEM; + } + fc->fs_private = ctx; + + idr_init(&ctx->devs->tree); + init_rwsem(&ctx->devs->rwsem); + erofs_default_options(ctx); + fc->ops = &erofs_context_ops; + return 0; +} + +static void erofs_kill_sb(struct super_block *sb) +{ + struct erofs_sb_info *sbi; + + /* pseudo mount for anon inodes */ + if (sb->s_flags & SB_KERNMOUNT) { + kill_anon_super(sb); + return; + } + + if (erofs_is_fscache_mode(sb)) + kill_anon_super(sb); + else + kill_block_super(sb); + + sbi = EROFS_SB(sb); + if (!sbi) + return; + + erofs_free_dev_context(sbi->devs); + fs_put_dax(sbi->dax_dev, NULL); + erofs_fscache_unregister_fs(sb); + kfree(sbi->fsid); + kfree(sbi->domain_id); + kfree(sbi); + sb->s_fs_info = NULL; +} + +static void erofs_put_super(struct super_block *sb) +{ + struct erofs_sb_info *const sbi = EROFS_SB(sb); + + DBG_BUGON(!sbi); + + erofs_unregister_sysfs(sb); + erofs_shrinker_unregister(sb); + erofs_xattr_prefixes_cleanup(sb); +#ifdef CONFIG_EROFS_FS_ZIP + iput(sbi->managed_cache); + sbi->managed_cache = NULL; +#endif + iput(sbi->packed_inode); + sbi->packed_inode = NULL; + erofs_free_dev_context(sbi->devs); + sbi->devs = NULL; + erofs_fscache_unregister_fs(sb); +} + +struct file_system_type erofs_fs_type = { + .owner = THIS_MODULE, + .name = "erofs", + .init_fs_context = erofs_init_fs_context, + .kill_sb = erofs_kill_sb, + .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, +}; +MODULE_ALIAS_FS("erofs"); + +static int __init erofs_module_init(void) +{ + int err; + + erofs_check_ondisk_layout_definitions(); + + erofs_inode_cachep = kmem_cache_create("erofs_inode", + sizeof(struct erofs_inode), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT, + erofs_inode_init_once); + if (!erofs_inode_cachep) + return -ENOMEM; + + err = erofs_init_shrinker(); + if (err) + goto shrinker_err; + + err = z_erofs_lzma_init(); + if (err) + goto lzma_err; + + err = z_erofs_deflate_init(); + if (err) + goto deflate_err; + + erofs_pcpubuf_init(); + err = z_erofs_init_zip_subsystem(); + if (err) + goto zip_err; + + err = erofs_init_sysfs(); + if (err) + goto sysfs_err; + + err = register_filesystem(&erofs_fs_type); + if (err) + goto fs_err; + + return 0; + +fs_err: + erofs_exit_sysfs(); +sysfs_err: + z_erofs_exit_zip_subsystem(); +zip_err: + z_erofs_deflate_exit(); +deflate_err: + z_erofs_lzma_exit(); +lzma_err: + erofs_exit_shrinker(); +shrinker_err: + kmem_cache_destroy(erofs_inode_cachep); + return err; +} + +static void __exit erofs_module_exit(void) +{ + unregister_filesystem(&erofs_fs_type); + + /* Ensure all RCU free inodes / pclusters are safe to be destroyed. 
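 * (rcu_barrier() below waits for all pending call_rcu() callbacks, including
 * the RCU-deferred inode and pcluster frees, to finish before the kmem
 * caches backing them are destroyed.)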
*/ + rcu_barrier(); + + erofs_exit_sysfs(); + z_erofs_exit_zip_subsystem(); + z_erofs_deflate_exit(); + z_erofs_lzma_exit(); + erofs_exit_shrinker(); + kmem_cache_destroy(erofs_inode_cachep); + erofs_pcpubuf_exit(); +} + +static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct erofs_sb_info *sbi = EROFS_SB(sb); + u64 id = 0; + + if (!erofs_is_fscache_mode(sb)) + id = huge_encode_dev(sb->s_bdev->bd_dev); + + buf->f_type = sb->s_magic; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = sbi->total_blocks; + buf->f_bfree = buf->f_bavail = 0; + + buf->f_files = ULLONG_MAX; + buf->f_ffree = ULLONG_MAX - sbi->inos; + + buf->f_namelen = EROFS_NAME_LEN; + + buf->f_fsid = u64_to_fsid(id); + return 0; +} + +static int erofs_show_options(struct seq_file *seq, struct dentry *root) +{ + struct erofs_sb_info *sbi = EROFS_SB(root->d_sb); + struct erofs_mount_opts *opt = &sbi->opt; + +#ifdef CONFIG_EROFS_FS_XATTR + if (test_opt(opt, XATTR_USER)) + seq_puts(seq, ",user_xattr"); + else + seq_puts(seq, ",nouser_xattr"); +#endif +#ifdef CONFIG_EROFS_FS_POSIX_ACL + if (test_opt(opt, POSIX_ACL)) + seq_puts(seq, ",acl"); + else + seq_puts(seq, ",noacl"); +#endif +#ifdef CONFIG_EROFS_FS_ZIP + if (opt->cache_strategy == EROFS_ZIP_CACHE_DISABLED) + seq_puts(seq, ",cache_strategy=disabled"); + else if (opt->cache_strategy == EROFS_ZIP_CACHE_READAHEAD) + seq_puts(seq, ",cache_strategy=readahead"); + else if (opt->cache_strategy == EROFS_ZIP_CACHE_READAROUND) + seq_puts(seq, ",cache_strategy=readaround"); +#endif + if (test_opt(opt, DAX_ALWAYS)) + seq_puts(seq, ",dax=always"); + if (test_opt(opt, DAX_NEVER)) + seq_puts(seq, ",dax=never"); +#ifdef CONFIG_EROFS_FS_ONDEMAND + if (sbi->fsid) + seq_printf(seq, ",fsid=%s", sbi->fsid); + if (sbi->domain_id) + seq_printf(seq, ",domain_id=%s", sbi->domain_id); +#endif + return 0; +} + +const struct super_operations erofs_sops = { + .put_super = erofs_put_super, + .alloc_inode = erofs_alloc_inode, + .free_inode = erofs_free_inode, + .statfs = erofs_statfs, + .show_options = erofs_show_options, +}; + +module_init(erofs_module_init); +module_exit(erofs_module_exit); + +MODULE_DESCRIPTION("Enhanced ROM File System"); +MODULE_AUTHOR("Gao Xiang, Chao Yu, Miao Xie, CONSUMER BG, HUAWEI Inc."); +MODULE_LICENSE("GPL"); diff --git a/fs/erofs/sysfs.c b/fs/erofs/sysfs.c new file mode 100644 index 0000000000..435e515c07 --- /dev/null +++ b/fs/erofs/sysfs.c @@ -0,0 +1,277 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C), 2008-2021, OPPO Mobile Comm Corp., Ltd. 
+ * https://www.oppo.com/ + */ +#include <linux/sysfs.h> +#include <linux/kobject.h> + +#include "internal.h" + +enum { + attr_feature, + attr_pointer_ui, + attr_pointer_bool, +}; + +enum { + struct_erofs_sb_info, + struct_erofs_mount_opts, +}; + +struct erofs_attr { + struct attribute attr; + short attr_id; + int struct_type, offset; +}; + +#define EROFS_ATTR(_name, _mode, _id) \ +static struct erofs_attr erofs_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = _mode }, \ + .attr_id = attr_##_id, \ +} +#define EROFS_ATTR_FUNC(_name, _mode) EROFS_ATTR(_name, _mode, _name) +#define EROFS_ATTR_FEATURE(_name) EROFS_ATTR(_name, 0444, feature) + +#define EROFS_ATTR_OFFSET(_name, _mode, _id, _struct) \ +static struct erofs_attr erofs_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = _mode }, \ + .attr_id = attr_##_id, \ + .struct_type = struct_##_struct, \ + .offset = offsetof(struct _struct, _name),\ +} + +#define EROFS_ATTR_RW(_name, _id, _struct) \ + EROFS_ATTR_OFFSET(_name, 0644, _id, _struct) + +#define EROFS_RO_ATTR(_name, _id, _struct) \ + EROFS_ATTR_OFFSET(_name, 0444, _id, _struct) + +#define EROFS_ATTR_RW_UI(_name, _struct) \ + EROFS_ATTR_RW(_name, pointer_ui, _struct) + +#define EROFS_ATTR_RW_BOOL(_name, _struct) \ + EROFS_ATTR_RW(_name, pointer_bool, _struct) + +#define ATTR_LIST(name) (&erofs_attr_##name.attr) + +#ifdef CONFIG_EROFS_FS_ZIP +EROFS_ATTR_RW_UI(sync_decompress, erofs_mount_opts); +#endif + +static struct attribute *erofs_attrs[] = { +#ifdef CONFIG_EROFS_FS_ZIP + ATTR_LIST(sync_decompress), +#endif + NULL, +}; +ATTRIBUTE_GROUPS(erofs); + +/* Features this copy of erofs supports */ +EROFS_ATTR_FEATURE(zero_padding); +EROFS_ATTR_FEATURE(compr_cfgs); +EROFS_ATTR_FEATURE(big_pcluster); +EROFS_ATTR_FEATURE(chunked_file); +EROFS_ATTR_FEATURE(device_table); +EROFS_ATTR_FEATURE(compr_head2); +EROFS_ATTR_FEATURE(sb_chksum); +EROFS_ATTR_FEATURE(ztailpacking); +EROFS_ATTR_FEATURE(fragments); +EROFS_ATTR_FEATURE(dedupe); + +static struct attribute *erofs_feat_attrs[] = { + ATTR_LIST(zero_padding), + ATTR_LIST(compr_cfgs), + ATTR_LIST(big_pcluster), + ATTR_LIST(chunked_file), + ATTR_LIST(device_table), + ATTR_LIST(compr_head2), + ATTR_LIST(sb_chksum), + ATTR_LIST(ztailpacking), + ATTR_LIST(fragments), + ATTR_LIST(dedupe), + NULL, +}; +ATTRIBUTE_GROUPS(erofs_feat); + +static unsigned char *__struct_ptr(struct erofs_sb_info *sbi, + int struct_type, int offset) +{ + if (struct_type == struct_erofs_sb_info) + return (unsigned char *)sbi + offset; + if (struct_type == struct_erofs_mount_opts) + return (unsigned char *)&sbi->opt + offset; + return NULL; +} + +static ssize_t erofs_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct erofs_sb_info *sbi = container_of(kobj, struct erofs_sb_info, + s_kobj); + struct erofs_attr *a = container_of(attr, struct erofs_attr, attr); + unsigned char *ptr = __struct_ptr(sbi, a->struct_type, a->offset); + + switch (a->attr_id) { + case attr_feature: + return sysfs_emit(buf, "supported\n"); + case attr_pointer_ui: + if (!ptr) + return 0; + return sysfs_emit(buf, "%u\n", *(unsigned int *)ptr); + case attr_pointer_bool: + if (!ptr) + return 0; + return sysfs_emit(buf, "%d\n", *(bool *)ptr); + } + return 0; +} + +static ssize_t erofs_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len) +{ + struct erofs_sb_info *sbi = container_of(kobj, struct erofs_sb_info, + s_kobj); + struct erofs_attr *a = container_of(attr, struct erofs_attr, attr); + unsigned char *ptr = 
__struct_ptr(sbi, a->struct_type, a->offset); + unsigned long t; + int ret; + + switch (a->attr_id) { + case attr_pointer_ui: + if (!ptr) + return 0; + ret = kstrtoul(skip_spaces(buf), 0, &t); + if (ret) + return ret; + if (t != (unsigned int)t) + return -ERANGE; +#ifdef CONFIG_EROFS_FS_ZIP + if (!strcmp(a->attr.name, "sync_decompress") && + (t > EROFS_SYNC_DECOMPRESS_FORCE_OFF)) + return -EINVAL; +#endif + *(unsigned int *)ptr = t; + return len; + case attr_pointer_bool: + if (!ptr) + return 0; + ret = kstrtoul(skip_spaces(buf), 0, &t); + if (ret) + return ret; + if (t != 0 && t != 1) + return -EINVAL; + *(bool *)ptr = !!t; + return len; + } + return 0; +} + +static void erofs_sb_release(struct kobject *kobj) +{ + struct erofs_sb_info *sbi = container_of(kobj, struct erofs_sb_info, + s_kobj); + complete(&sbi->s_kobj_unregister); +} + +static const struct sysfs_ops erofs_attr_ops = { + .show = erofs_attr_show, + .store = erofs_attr_store, +}; + +static const struct kobj_type erofs_sb_ktype = { + .default_groups = erofs_groups, + .sysfs_ops = &erofs_attr_ops, + .release = erofs_sb_release, +}; + +static const struct kobj_type erofs_ktype = { + .sysfs_ops = &erofs_attr_ops, +}; + +static struct kset erofs_root = { + .kobj = {.ktype = &erofs_ktype}, +}; + +static const struct kobj_type erofs_feat_ktype = { + .default_groups = erofs_feat_groups, + .sysfs_ops = &erofs_attr_ops, +}; + +static struct kobject erofs_feat = { + .kset = &erofs_root, +}; + +int erofs_register_sysfs(struct super_block *sb) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); + char *name; + char *str = NULL; + int err; + + if (erofs_is_fscache_mode(sb)) { + if (sbi->domain_id) { + str = kasprintf(GFP_KERNEL, "%s,%s", sbi->domain_id, + sbi->fsid); + if (!str) + return -ENOMEM; + name = str; + } else { + name = sbi->fsid; + } + } else { + name = sb->s_id; + } + sbi->s_kobj.kset = &erofs_root; + init_completion(&sbi->s_kobj_unregister); + err = kobject_init_and_add(&sbi->s_kobj, &erofs_sb_ktype, NULL, "%s", name); + kfree(str); + if (err) + goto put_sb_kobj; + return 0; + +put_sb_kobj: + kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); + return err; +} + +void erofs_unregister_sysfs(struct super_block *sb) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); + + if (sbi->s_kobj.state_in_sysfs) { + kobject_del(&sbi->s_kobj); + kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); + } +} + +int __init erofs_init_sysfs(void) +{ + int ret; + + kobject_set_name(&erofs_root.kobj, "erofs"); + erofs_root.kobj.parent = fs_kobj; + ret = kset_register(&erofs_root); + if (ret) + goto root_err; + + ret = kobject_init_and_add(&erofs_feat, &erofs_feat_ktype, + NULL, "features"); + if (ret) + goto feat_err; + return ret; + +feat_err: + kobject_put(&erofs_feat); + kset_unregister(&erofs_root); +root_err: + return ret; +} + +void erofs_exit_sysfs(void) +{ + kobject_put(&erofs_feat); + kset_unregister(&erofs_root); +} diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c new file mode 100644 index 0000000000..4256a85719 --- /dev/null +++ b/fs/erofs/utils.c @@ -0,0 +1,282 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2018 HUAWEI, Inc. 
+ * https://www.huawei.com/ + */ +#include "internal.h" + +struct page *erofs_allocpage(struct page **pagepool, gfp_t gfp) +{ + struct page *page = *pagepool; + + if (page) { + DBG_BUGON(page_ref_count(page) != 1); + *pagepool = (struct page *)page_private(page); + } else { + page = alloc_page(gfp); + } + return page; +} + +void erofs_release_pages(struct page **pagepool) +{ + while (*pagepool) { + struct page *page = *pagepool; + + *pagepool = (struct page *)page_private(page); + put_page(page); + } +} + +#ifdef CONFIG_EROFS_FS_ZIP +/* global shrink count (for all mounted EROFS instances) */ +static atomic_long_t erofs_global_shrink_cnt; + +static bool erofs_workgroup_get(struct erofs_workgroup *grp) +{ + if (lockref_get_not_zero(&grp->lockref)) + return true; + + spin_lock(&grp->lockref.lock); + if (__lockref_is_dead(&grp->lockref)) { + spin_unlock(&grp->lockref.lock); + return false; + } + + if (!grp->lockref.count++) + atomic_long_dec(&erofs_global_shrink_cnt); + spin_unlock(&grp->lockref.lock); + return true; +} + +struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb, + pgoff_t index) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); + struct erofs_workgroup *grp; + +repeat: + rcu_read_lock(); + grp = xa_load(&sbi->managed_pslots, index); + if (grp) { + if (!erofs_workgroup_get(grp)) { + /* prefer to relax rcu read side */ + rcu_read_unlock(); + goto repeat; + } + + DBG_BUGON(index != grp->index); + } + rcu_read_unlock(); + return grp; +} + +struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb, + struct erofs_workgroup *grp) +{ + struct erofs_sb_info *const sbi = EROFS_SB(sb); + struct erofs_workgroup *pre; + + DBG_BUGON(grp->lockref.count < 1); +repeat: + xa_lock(&sbi->managed_pslots); + pre = __xa_cmpxchg(&sbi->managed_pslots, grp->index, + NULL, grp, GFP_NOFS); + if (pre) { + if (xa_is_err(pre)) { + pre = ERR_PTR(xa_err(pre)); + } else if (!erofs_workgroup_get(pre)) { + /* try to legitimize the current in-tree one */ + xa_unlock(&sbi->managed_pslots); + cond_resched(); + goto repeat; + } + grp = pre; + } + xa_unlock(&sbi->managed_pslots); + return grp; +} + +static void __erofs_workgroup_free(struct erofs_workgroup *grp) +{ + atomic_long_dec(&erofs_global_shrink_cnt); + erofs_workgroup_free_rcu(grp); +} + +void erofs_workgroup_put(struct erofs_workgroup *grp) +{ + if (lockref_put_or_lock(&grp->lockref)) + return; + + DBG_BUGON(__lockref_is_dead(&grp->lockref)); + if (grp->lockref.count == 1) + atomic_long_inc(&erofs_global_shrink_cnt); + --grp->lockref.count; + spin_unlock(&grp->lockref.lock); +} + +static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi, + struct erofs_workgroup *grp) +{ + int free = false; + + spin_lock(&grp->lockref.lock); + if (grp->lockref.count) + goto out; + + /* + * Note that all cached pages should be detached before deleted from + * the XArray. Otherwise some cached pages could be still attached to + * the orphan old workgroup when the new one is available in the tree. + */ + if (erofs_try_to_free_all_cached_pages(sbi, grp)) + goto out; + + /* + * It's impossible to fail after the workgroup is freezed, + * however in order to avoid some race conditions, add a + * DBG_BUGON to observe this in advance. 
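 * (Concurrent lookups via erofs_workgroup_get() recheck __lockref_is_dead()
 * under lockref.lock, so once the lockref is marked dead below, the erased
 * workgroup can no longer be grabbed.)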
+ */ + DBG_BUGON(__xa_erase(&sbi->managed_pslots, grp->index) != grp); + + lockref_mark_dead(&grp->lockref); + free = true; +out: + spin_unlock(&grp->lockref.lock); + if (free) + __erofs_workgroup_free(grp); + return free; +} + +static unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi, + unsigned long nr_shrink) +{ + struct erofs_workgroup *grp; + unsigned int freed = 0; + unsigned long index; + + xa_lock(&sbi->managed_pslots); + xa_for_each(&sbi->managed_pslots, index, grp) { + /* try to shrink each valid workgroup */ + if (!erofs_try_to_release_workgroup(sbi, grp)) + continue; + xa_unlock(&sbi->managed_pslots); + + ++freed; + if (!--nr_shrink) + return freed; + xa_lock(&sbi->managed_pslots); + } + xa_unlock(&sbi->managed_pslots); + return freed; +} + +/* protected by 'erofs_sb_list_lock' */ +static unsigned int shrinker_run_no; + +/* protects the mounted 'erofs_sb_list' */ +static DEFINE_SPINLOCK(erofs_sb_list_lock); +static LIST_HEAD(erofs_sb_list); + +void erofs_shrinker_register(struct super_block *sb) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); + + mutex_init(&sbi->umount_mutex); + + spin_lock(&erofs_sb_list_lock); + list_add(&sbi->list, &erofs_sb_list); + spin_unlock(&erofs_sb_list_lock); +} + +void erofs_shrinker_unregister(struct super_block *sb) +{ + struct erofs_sb_info *const sbi = EROFS_SB(sb); + + mutex_lock(&sbi->umount_mutex); + /* clean up all remaining workgroups in memory */ + erofs_shrink_workstation(sbi, ~0UL); + + spin_lock(&erofs_sb_list_lock); + list_del(&sbi->list); + spin_unlock(&erofs_sb_list_lock); + mutex_unlock(&sbi->umount_mutex); +} + +static unsigned long erofs_shrink_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + return atomic_long_read(&erofs_global_shrink_cnt); +} + +static unsigned long erofs_shrink_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct erofs_sb_info *sbi; + struct list_head *p; + + unsigned long nr = sc->nr_to_scan; + unsigned int run_no; + unsigned long freed = 0; + + spin_lock(&erofs_sb_list_lock); + do { + run_no = ++shrinker_run_no; + } while (run_no == 0); + + /* Iterate over all mounted superblocks and try to shrink them */ + p = erofs_sb_list.next; + while (p != &erofs_sb_list) { + sbi = list_entry(p, struct erofs_sb_info, list); + + /* + * We move the ones we do to the end of the list, so we stop + * when we see one we have already done. + */ + if (sbi->shrinker_run_no == run_no) + break; + + if (!mutex_trylock(&sbi->umount_mutex)) { + p = p->next; + continue; + } + + spin_unlock(&erofs_sb_list_lock); + sbi->shrinker_run_no = run_no; + + freed += erofs_shrink_workstation(sbi, nr - freed); + + spin_lock(&erofs_sb_list_lock); + /* Get the next list element before we move this one */ + p = p->next; + + /* + * Move this one to the end of the list to provide some + * fairness. 
+ */ + list_move_tail(&sbi->list, &erofs_sb_list); + mutex_unlock(&sbi->umount_mutex); + + if (freed >= nr) + break; + } + spin_unlock(&erofs_sb_list_lock); + return freed; +} + +static struct shrinker erofs_shrinker_info = { + .scan_objects = erofs_shrink_scan, + .count_objects = erofs_shrink_count, + .seeks = DEFAULT_SEEKS, +}; + +int __init erofs_init_shrinker(void) +{ + return register_shrinker(&erofs_shrinker_info, "erofs-shrinker"); +} + +void erofs_exit_shrinker(void) +{ + unregister_shrinker(&erofs_shrinker_info); +} +#endif /* !CONFIG_EROFS_FS_ZIP */ diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c new file mode 100644 index 0000000000..09d341675e --- /dev/null +++ b/fs/erofs/xattr.c @@ -0,0 +1,560 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2017-2018 HUAWEI, Inc. + * https://www.huawei.com/ + * Copyright (C) 2021-2022, Alibaba Cloud + */ +#include <linux/security.h> +#include <linux/xxhash.h> +#include "xattr.h" + +struct erofs_xattr_iter { + struct super_block *sb; + struct erofs_buf buf; + erofs_off_t pos; + void *kaddr; + + char *buffer; + int buffer_size, buffer_ofs; + + /* getxattr */ + int index, infix_len; + struct qstr name; + + /* listxattr */ + struct dentry *dentry; +}; + +static int erofs_init_inode_xattrs(struct inode *inode) +{ + struct erofs_inode *const vi = EROFS_I(inode); + struct erofs_xattr_iter it; + unsigned int i; + struct erofs_xattr_ibody_header *ih; + struct super_block *sb = inode->i_sb; + int ret = 0; + + /* the most case is that xattrs of this inode are initialized. */ + if (test_bit(EROFS_I_EA_INITED_BIT, &vi->flags)) { + /* + * paired with smp_mb() at the end of the function to ensure + * fields will only be observed after the bit is set. + */ + smp_mb(); + return 0; + } + + if (wait_on_bit_lock(&vi->flags, EROFS_I_BL_XATTR_BIT, TASK_KILLABLE)) + return -ERESTARTSYS; + + /* someone has initialized xattrs for us? */ + if (test_bit(EROFS_I_EA_INITED_BIT, &vi->flags)) + goto out_unlock; + + /* + * bypass all xattr operations if ->xattr_isize is not greater than + * sizeof(struct erofs_xattr_ibody_header), in detail: + * 1) it is not enough to contain erofs_xattr_ibody_header then + * ->xattr_isize should be 0 (it means no xattr); + * 2) it is just to contain erofs_xattr_ibody_header, which is on-disk + * undefined right now (maybe use later with some new sb feature). 
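 * In short: 0 means no xattrs at all (-ENOATTR), a nonzero value smaller
 * than the header size is treated as on-disk corruption, and a value equal
 * to the header size is rejected as unsupported for now.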
+ */ + if (vi->xattr_isize == sizeof(struct erofs_xattr_ibody_header)) { + erofs_err(sb, + "xattr_isize %d of nid %llu is not supported yet", + vi->xattr_isize, vi->nid); + ret = -EOPNOTSUPP; + goto out_unlock; + } else if (vi->xattr_isize < sizeof(struct erofs_xattr_ibody_header)) { + if (vi->xattr_isize) { + erofs_err(sb, "bogus xattr ibody @ nid %llu", vi->nid); + DBG_BUGON(1); + ret = -EFSCORRUPTED; + goto out_unlock; /* xattr ondisk layout error */ + } + ret = -ENOATTR; + goto out_unlock; + } + + it.buf = __EROFS_BUF_INITIALIZER; + erofs_init_metabuf(&it.buf, sb); + it.pos = erofs_iloc(inode) + vi->inode_isize; + + /* read in shared xattr array (non-atomic, see kmalloc below) */ + it.kaddr = erofs_bread(&it.buf, erofs_blknr(sb, it.pos), EROFS_KMAP); + if (IS_ERR(it.kaddr)) { + ret = PTR_ERR(it.kaddr); + goto out_unlock; + } + + ih = it.kaddr + erofs_blkoff(sb, it.pos); + vi->xattr_name_filter = le32_to_cpu(ih->h_name_filter); + vi->xattr_shared_count = ih->h_shared_count; + vi->xattr_shared_xattrs = kmalloc_array(vi->xattr_shared_count, + sizeof(uint), GFP_KERNEL); + if (!vi->xattr_shared_xattrs) { + erofs_put_metabuf(&it.buf); + ret = -ENOMEM; + goto out_unlock; + } + + /* let's skip ibody header */ + it.pos += sizeof(struct erofs_xattr_ibody_header); + + for (i = 0; i < vi->xattr_shared_count; ++i) { + it.kaddr = erofs_bread(&it.buf, erofs_blknr(sb, it.pos), + EROFS_KMAP); + if (IS_ERR(it.kaddr)) { + kfree(vi->xattr_shared_xattrs); + vi->xattr_shared_xattrs = NULL; + ret = PTR_ERR(it.kaddr); + goto out_unlock; + } + vi->xattr_shared_xattrs[i] = le32_to_cpu(*(__le32 *) + (it.kaddr + erofs_blkoff(sb, it.pos))); + it.pos += sizeof(__le32); + } + erofs_put_metabuf(&it.buf); + + /* paired with smp_mb() at the beginning of the function. */ + smp_mb(); + set_bit(EROFS_I_EA_INITED_BIT, &vi->flags); + +out_unlock: + clear_and_wake_up_bit(EROFS_I_BL_XATTR_BIT, &vi->flags); + return ret; +} + +static bool erofs_xattr_user_list(struct dentry *dentry) +{ + return test_opt(&EROFS_SB(dentry->d_sb)->opt, XATTR_USER); +} + +static bool erofs_xattr_trusted_list(struct dentry *dentry) +{ + return capable(CAP_SYS_ADMIN); +} + +static int erofs_xattr_generic_get(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *name, void *buffer, size_t size) +{ + if (handler->flags == EROFS_XATTR_INDEX_USER && + !test_opt(&EROFS_I_SB(inode)->opt, XATTR_USER)) + return -EOPNOTSUPP; + + return erofs_getxattr(inode, handler->flags, name, buffer, size); +} + +const struct xattr_handler erofs_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .flags = EROFS_XATTR_INDEX_USER, + .list = erofs_xattr_user_list, + .get = erofs_xattr_generic_get, +}; + +const struct xattr_handler erofs_xattr_trusted_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .flags = EROFS_XATTR_INDEX_TRUSTED, + .list = erofs_xattr_trusted_list, + .get = erofs_xattr_generic_get, +}; + +#ifdef CONFIG_EROFS_FS_SECURITY +const struct xattr_handler __maybe_unused erofs_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .flags = EROFS_XATTR_INDEX_SECURITY, + .get = erofs_xattr_generic_get, +}; +#endif + +const struct xattr_handler *erofs_xattr_handlers[] = { + &erofs_xattr_user_handler, + &erofs_xattr_trusted_handler, +#ifdef CONFIG_EROFS_FS_SECURITY + &erofs_xattr_security_handler, +#endif + NULL, +}; + +static int erofs_xattr_copy_to_buffer(struct erofs_xattr_iter *it, + unsigned int len) +{ + unsigned int slice, processed; + struct super_block *sb = it->sb; + void *src; + + for (processed = 0; 
processed < len; processed += slice) { + it->kaddr = erofs_bread(&it->buf, erofs_blknr(sb, it->pos), + EROFS_KMAP); + if (IS_ERR(it->kaddr)) + return PTR_ERR(it->kaddr); + + src = it->kaddr + erofs_blkoff(sb, it->pos); + slice = min_t(unsigned int, sb->s_blocksize - + erofs_blkoff(sb, it->pos), len - processed); + memcpy(it->buffer + it->buffer_ofs, src, slice); + it->buffer_ofs += slice; + it->pos += slice; + } + return 0; +} + +static int erofs_listxattr_foreach(struct erofs_xattr_iter *it) +{ + struct erofs_xattr_entry entry; + unsigned int base_index, name_total, prefix_len, infix_len = 0; + const char *prefix, *infix = NULL; + int err; + + /* 1. handle xattr entry */ + entry = *(struct erofs_xattr_entry *) + (it->kaddr + erofs_blkoff(it->sb, it->pos)); + it->pos += sizeof(struct erofs_xattr_entry); + + base_index = entry.e_name_index; + if (entry.e_name_index & EROFS_XATTR_LONG_PREFIX) { + struct erofs_sb_info *sbi = EROFS_SB(it->sb); + struct erofs_xattr_prefix_item *pf = sbi->xattr_prefixes + + (entry.e_name_index & EROFS_XATTR_LONG_PREFIX_MASK); + + if (pf >= sbi->xattr_prefixes + sbi->xattr_prefix_count) + return 0; + infix = pf->prefix->infix; + infix_len = pf->infix_len; + base_index = pf->prefix->base_index; + } + + prefix = erofs_xattr_prefix(base_index, it->dentry); + if (!prefix) + return 0; + prefix_len = strlen(prefix); + name_total = prefix_len + infix_len + entry.e_name_len + 1; + + if (!it->buffer) { + it->buffer_ofs += name_total; + return 0; + } + + if (it->buffer_ofs + name_total > it->buffer_size) + return -ERANGE; + + memcpy(it->buffer + it->buffer_ofs, prefix, prefix_len); + memcpy(it->buffer + it->buffer_ofs + prefix_len, infix, infix_len); + it->buffer_ofs += prefix_len + infix_len; + + /* 2. handle xattr name */ + err = erofs_xattr_copy_to_buffer(it, entry.e_name_len); + if (err) + return err; + + it->buffer[it->buffer_ofs++] = '\0'; + return 0; +} + +static int erofs_getxattr_foreach(struct erofs_xattr_iter *it) +{ + struct super_block *sb = it->sb; + struct erofs_xattr_entry entry; + unsigned int slice, processed, value_sz; + + /* 1. handle xattr entry */ + entry = *(struct erofs_xattr_entry *) + (it->kaddr + erofs_blkoff(sb, it->pos)); + it->pos += sizeof(struct erofs_xattr_entry); + value_sz = le16_to_cpu(entry.e_value_size); + + /* should also match the infix for long name prefixes */ + if (entry.e_name_index & EROFS_XATTR_LONG_PREFIX) { + struct erofs_sb_info *sbi = EROFS_SB(sb); + struct erofs_xattr_prefix_item *pf = sbi->xattr_prefixes + + (entry.e_name_index & EROFS_XATTR_LONG_PREFIX_MASK); + + if (pf >= sbi->xattr_prefixes + sbi->xattr_prefix_count) + return -ENOATTR; + + if (it->index != pf->prefix->base_index || + it->name.len != entry.e_name_len + pf->infix_len) + return -ENOATTR; + + if (memcmp(it->name.name, pf->prefix->infix, pf->infix_len)) + return -ENOATTR; + + it->infix_len = pf->infix_len; + } else { + if (it->index != entry.e_name_index || + it->name.len != entry.e_name_len) + return -ENOATTR; + + it->infix_len = 0; + } + + /* 2. handle xattr name */ + for (processed = 0; processed < entry.e_name_len; processed += slice) { + it->kaddr = erofs_bread(&it->buf, erofs_blknr(sb, it->pos), + EROFS_KMAP); + if (IS_ERR(it->kaddr)) + return PTR_ERR(it->kaddr); + + slice = min_t(unsigned int, + sb->s_blocksize - erofs_blkoff(sb, it->pos), + entry.e_name_len - processed); + if (memcmp(it->name.name + it->infix_len + processed, + it->kaddr + erofs_blkoff(sb, it->pos), slice)) + return -ENOATTR; + it->pos += slice; + } + + /* 3. 
handle xattr value */ + if (!it->buffer) { + it->buffer_ofs = value_sz; + return 0; + } + + if (it->buffer_size < value_sz) + return -ERANGE; + + return erofs_xattr_copy_to_buffer(it, value_sz); +} + +static int erofs_xattr_iter_inline(struct erofs_xattr_iter *it, + struct inode *inode, bool getxattr) +{ + struct erofs_inode *const vi = EROFS_I(inode); + unsigned int xattr_header_sz, remaining, entry_sz; + erofs_off_t next_pos; + int ret; + + xattr_header_sz = sizeof(struct erofs_xattr_ibody_header) + + sizeof(u32) * vi->xattr_shared_count; + if (xattr_header_sz >= vi->xattr_isize) { + DBG_BUGON(xattr_header_sz > vi->xattr_isize); + return -ENOATTR; + } + + remaining = vi->xattr_isize - xattr_header_sz; + it->pos = erofs_iloc(inode) + vi->inode_isize + xattr_header_sz; + + while (remaining) { + it->kaddr = erofs_bread(&it->buf, erofs_blknr(it->sb, it->pos), + EROFS_KMAP); + if (IS_ERR(it->kaddr)) + return PTR_ERR(it->kaddr); + + entry_sz = erofs_xattr_entry_size(it->kaddr + + erofs_blkoff(it->sb, it->pos)); + /* xattr on-disk corruption: xattr entry beyond xattr_isize */ + if (remaining < entry_sz) { + DBG_BUGON(1); + return -EFSCORRUPTED; + } + remaining -= entry_sz; + next_pos = it->pos + entry_sz; + + if (getxattr) + ret = erofs_getxattr_foreach(it); + else + ret = erofs_listxattr_foreach(it); + if ((getxattr && ret != -ENOATTR) || (!getxattr && ret)) + break; + + it->pos = next_pos; + } + return ret; +} + +static int erofs_xattr_iter_shared(struct erofs_xattr_iter *it, + struct inode *inode, bool getxattr) +{ + struct erofs_inode *const vi = EROFS_I(inode); + struct super_block *const sb = it->sb; + struct erofs_sb_info *sbi = EROFS_SB(sb); + unsigned int i; + int ret = -ENOATTR; + + for (i = 0; i < vi->xattr_shared_count; ++i) { + it->pos = erofs_pos(sb, sbi->xattr_blkaddr) + + vi->xattr_shared_xattrs[i] * sizeof(__le32); + it->kaddr = erofs_bread(&it->buf, erofs_blknr(sb, it->pos), + EROFS_KMAP); + if (IS_ERR(it->kaddr)) + return PTR_ERR(it->kaddr); + + if (getxattr) + ret = erofs_getxattr_foreach(it); + else + ret = erofs_listxattr_foreach(it); + if ((getxattr && ret != -ENOATTR) || (!getxattr && ret)) + break; + } + return ret; +} + +int erofs_getxattr(struct inode *inode, int index, const char *name, + void *buffer, size_t buffer_size) +{ + int ret; + unsigned int hashbit; + struct erofs_xattr_iter it; + struct erofs_inode *vi = EROFS_I(inode); + struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb); + + if (!name) + return -EINVAL; + + ret = erofs_init_inode_xattrs(inode); + if (ret) + return ret; + + /* reserved flag is non-zero if there's any change of on-disk format */ + if (erofs_sb_has_xattr_filter(sbi) && !sbi->xattr_filter_reserved) { + hashbit = xxh32(name, strlen(name), + EROFS_XATTR_FILTER_SEED + index); + hashbit &= EROFS_XATTR_FILTER_BITS - 1; + if (vi->xattr_name_filter & (1U << hashbit)) + return -ENOATTR; + } + + it.index = index; + it.name = (struct qstr)QSTR_INIT(name, strlen(name)); + if (it.name.len > EROFS_NAME_LEN) + return -ERANGE; + + it.sb = inode->i_sb; + it.buf = __EROFS_BUF_INITIALIZER; + erofs_init_metabuf(&it.buf, it.sb); + it.buffer = buffer; + it.buffer_size = buffer_size; + it.buffer_ofs = 0; + + ret = erofs_xattr_iter_inline(&it, inode, true); + if (ret == -ENOATTR) + ret = erofs_xattr_iter_shared(&it, inode, true); + erofs_put_metabuf(&it.buf); + return ret ? 
ret : it.buffer_ofs; +} + +ssize_t erofs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) +{ + int ret; + struct erofs_xattr_iter it; + struct inode *inode = d_inode(dentry); + + ret = erofs_init_inode_xattrs(inode); + if (ret == -ENOATTR) + return 0; + if (ret) + return ret; + + it.sb = dentry->d_sb; + it.buf = __EROFS_BUF_INITIALIZER; + erofs_init_metabuf(&it.buf, it.sb); + it.dentry = dentry; + it.buffer = buffer; + it.buffer_size = buffer_size; + it.buffer_ofs = 0; + + ret = erofs_xattr_iter_inline(&it, inode, false); + if (!ret || ret == -ENOATTR) + ret = erofs_xattr_iter_shared(&it, inode, false); + if (ret == -ENOATTR) + ret = 0; + erofs_put_metabuf(&it.buf); + return ret ? ret : it.buffer_ofs; +} + +void erofs_xattr_prefixes_cleanup(struct super_block *sb) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); + int i; + + if (sbi->xattr_prefixes) { + for (i = 0; i < sbi->xattr_prefix_count; i++) + kfree(sbi->xattr_prefixes[i].prefix); + kfree(sbi->xattr_prefixes); + sbi->xattr_prefixes = NULL; + } +} + +int erofs_xattr_prefixes_init(struct super_block *sb) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; + erofs_off_t pos = (erofs_off_t)sbi->xattr_prefix_start << 2; + struct erofs_xattr_prefix_item *pfs; + int ret = 0, i, len; + + if (!sbi->xattr_prefix_count) + return 0; + + pfs = kzalloc(sbi->xattr_prefix_count * sizeof(*pfs), GFP_KERNEL); + if (!pfs) + return -ENOMEM; + + if (sbi->packed_inode) + buf.inode = sbi->packed_inode; + else + erofs_init_metabuf(&buf, sb); + + for (i = 0; i < sbi->xattr_prefix_count; i++) { + void *ptr = erofs_read_metadata(sb, &buf, &pos, &len); + + if (IS_ERR(ptr)) { + ret = PTR_ERR(ptr); + break; + } else if (len < sizeof(*pfs->prefix) || + len > EROFS_NAME_LEN + sizeof(*pfs->prefix)) { + kfree(ptr); + ret = -EFSCORRUPTED; + break; + } + pfs[i].prefix = ptr; + pfs[i].infix_len = len - sizeof(struct erofs_xattr_long_prefix); + } + + erofs_put_metabuf(&buf); + sbi->xattr_prefixes = pfs; + if (ret) + erofs_xattr_prefixes_cleanup(sb); + return ret; +} + +#ifdef CONFIG_EROFS_FS_POSIX_ACL +struct posix_acl *erofs_get_acl(struct inode *inode, int type, bool rcu) +{ + struct posix_acl *acl; + int prefix, rc; + char *value = NULL; + + if (rcu) + return ERR_PTR(-ECHILD); + + switch (type) { + case ACL_TYPE_ACCESS: + prefix = EROFS_XATTR_INDEX_POSIX_ACL_ACCESS; + break; + case ACL_TYPE_DEFAULT: + prefix = EROFS_XATTR_INDEX_POSIX_ACL_DEFAULT; + break; + default: + return ERR_PTR(-EINVAL); + } + + rc = erofs_getxattr(inode, prefix, "", NULL, 0); + if (rc > 0) { + value = kmalloc(rc, GFP_KERNEL); + if (!value) + return ERR_PTR(-ENOMEM); + rc = erofs_getxattr(inode, prefix, "", value, rc); + } + + if (rc == -ENOATTR) + acl = NULL; + else if (rc < 0) + acl = ERR_PTR(rc); + else + acl = posix_acl_from_xattr(&init_user_ns, value, rc); + kfree(value); + return acl; +} +#endif diff --git a/fs/erofs/xattr.h b/fs/erofs/xattr.h new file mode 100644 index 0000000000..f16283cb8c --- /dev/null +++ b/fs/erofs/xattr.h @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2017-2018 HUAWEI, Inc. 
+ * https://www.huawei.com/ + */ +#ifndef __EROFS_XATTR_H +#define __EROFS_XATTR_H + +#include "internal.h" +#include <linux/posix_acl_xattr.h> +#include <linux/xattr.h> + +/* Attribute not found */ +#define ENOATTR ENODATA + +#ifdef CONFIG_EROFS_FS_XATTR +extern const struct xattr_handler erofs_xattr_user_handler; +extern const struct xattr_handler erofs_xattr_trusted_handler; +extern const struct xattr_handler erofs_xattr_security_handler; + +static inline const char *erofs_xattr_prefix(unsigned int idx, + struct dentry *dentry) +{ + const struct xattr_handler *handler = NULL; + + static const struct xattr_handler *xattr_handler_map[] = { + [EROFS_XATTR_INDEX_USER] = &erofs_xattr_user_handler, +#ifdef CONFIG_EROFS_FS_POSIX_ACL + [EROFS_XATTR_INDEX_POSIX_ACL_ACCESS] = &nop_posix_acl_access, + [EROFS_XATTR_INDEX_POSIX_ACL_DEFAULT] = &nop_posix_acl_default, +#endif + [EROFS_XATTR_INDEX_TRUSTED] = &erofs_xattr_trusted_handler, +#ifdef CONFIG_EROFS_FS_SECURITY + [EROFS_XATTR_INDEX_SECURITY] = &erofs_xattr_security_handler, +#endif + }; + + if (idx && idx < ARRAY_SIZE(xattr_handler_map)) + handler = xattr_handler_map[idx]; + + if (!xattr_handler_can_list(handler, dentry)) + return NULL; + + return xattr_prefix(handler); +} + +extern const struct xattr_handler *erofs_xattr_handlers[]; + +int erofs_xattr_prefixes_init(struct super_block *sb); +void erofs_xattr_prefixes_cleanup(struct super_block *sb); +int erofs_getxattr(struct inode *, int, const char *, void *, size_t); +ssize_t erofs_listxattr(struct dentry *, char *, size_t); +#else +static inline int erofs_xattr_prefixes_init(struct super_block *sb) { return 0; } +static inline void erofs_xattr_prefixes_cleanup(struct super_block *sb) {} +static inline int erofs_getxattr(struct inode *inode, int index, + const char *name, void *buffer, + size_t buffer_size) +{ + return -EOPNOTSUPP; +} + +#define erofs_listxattr (NULL) +#define erofs_xattr_handlers (NULL) +#endif /* !CONFIG_EROFS_FS_XATTR */ + +#ifdef CONFIG_EROFS_FS_POSIX_ACL +struct posix_acl *erofs_get_acl(struct inode *inode, int type, bool rcu); +#else +#define erofs_get_acl (NULL) +#endif + +#endif diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c new file mode 100644 index 0000000000..a33cd6757f --- /dev/null +++ b/fs/erofs/zdata.c @@ -0,0 +1,1894 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2018 HUAWEI, Inc. + * https://www.huawei.com/ + * Copyright (C) 2022 Alibaba Cloud + */ +#include "compress.h" +#include <linux/psi.h> +#include <linux/cpuhotplug.h> +#include <trace/events/erofs.h> + +#define Z_EROFS_PCLUSTER_MAX_PAGES (Z_EROFS_PCLUSTER_MAX_SIZE / PAGE_SIZE) +#define Z_EROFS_INLINE_BVECS 2 + +/* + * let's leave a type here in case of introducing + * another tagged pointer later. + */ +typedef void *z_erofs_next_pcluster_t; + +struct z_erofs_bvec { + struct page *page; + int offset; + unsigned int end; +}; + +#define __Z_EROFS_BVSET(name, total) \ +struct name { \ + /* point to the next page which contains the following bvecs */ \ + struct page *nextpage; \ + struct z_erofs_bvec bvec[total]; \ +} +__Z_EROFS_BVSET(z_erofs_bvset,); +__Z_EROFS_BVSET(z_erofs_bvset_inline, Z_EROFS_INLINE_BVECS); + +/* + * Structure fields follow one of the following exclusion rules. + * + * I: Modifiable by initialization/destruction paths and read-only + * for everyone else; + * + * L: Field should be protected by the pcluster lock; + * + * A: Field should be accessed / updated in atomic for parallelized code. 
+ */ +struct z_erofs_pcluster { + struct erofs_workgroup obj; + struct mutex lock; + + /* A: point to next chained pcluster or TAILs */ + z_erofs_next_pcluster_t next; + + /* L: the maximum decompression size of this round */ + unsigned int length; + + /* L: total number of bvecs */ + unsigned int vcnt; + + /* I: page offset of start position of decompression */ + unsigned short pageofs_out; + + /* I: page offset of inline compressed data */ + unsigned short pageofs_in; + + union { + /* L: inline a certain number of bvec for bootstrap */ + struct z_erofs_bvset_inline bvset; + + /* I: can be used to free the pcluster by RCU. */ + struct rcu_head rcu; + }; + + union { + /* I: physical cluster size in pages */ + unsigned short pclusterpages; + + /* I: tailpacking inline compressed size */ + unsigned short tailpacking_size; + }; + + /* I: compression algorithm format */ + unsigned char algorithmformat; + + /* L: whether partial decompression or not */ + bool partial; + + /* L: indicate several pageofs_outs or not */ + bool multibases; + + /* A: compressed bvecs (can be cached or inplaced pages) */ + struct z_erofs_bvec compressed_bvecs[]; +}; + +/* the end of a chain of pclusters */ +#define Z_EROFS_PCLUSTER_TAIL ((void *) 0x700 + POISON_POINTER_DELTA) +#define Z_EROFS_PCLUSTER_NIL (NULL) + +struct z_erofs_decompressqueue { + struct super_block *sb; + atomic_t pending_bios; + z_erofs_next_pcluster_t head; + + union { + struct completion done; + struct work_struct work; + struct kthread_work kthread_work; + } u; + bool eio, sync; +}; + +static inline bool z_erofs_is_inline_pcluster(struct z_erofs_pcluster *pcl) +{ + return !pcl->obj.index; +} + +static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl) +{ + if (z_erofs_is_inline_pcluster(pcl)) + return 1; + return pcl->pclusterpages; +} + +/* + * bit 30: I/O error occurred on this page + * bit 0 - 29: remaining parts to complete this page + */ +#define Z_EROFS_PAGE_EIO (1 << 30) + +static inline void z_erofs_onlinepage_init(struct page *page) +{ + union { + atomic_t o; + unsigned long v; + } u = { .o = ATOMIC_INIT(1) }; + + set_page_private(page, u.v); + smp_wmb(); + SetPagePrivate(page); +} + +static inline void z_erofs_onlinepage_split(struct page *page) +{ + atomic_inc((atomic_t *)&page->private); +} + +static void z_erofs_onlinepage_endio(struct page *page, int err) +{ + int orig, v; + + DBG_BUGON(!PagePrivate(page)); + + do { + orig = atomic_read((atomic_t *)&page->private); + v = (orig - 1) | (err ? Z_EROFS_PAGE_EIO : 0); + } while (atomic_cmpxchg((atomic_t *)&page->private, orig, v) != orig); + + if (!(v & ~Z_EROFS_PAGE_EIO)) { + set_page_private(page, 0); + ClearPagePrivate(page); + if (!(v & Z_EROFS_PAGE_EIO)) + SetPageUptodate(page); + unlock_page(page); + } +} + +#define Z_EROFS_ONSTACK_PAGES 32 + +/* + * since pclustersize is variable for big pcluster feature, introduce slab + * pools implementation for different pcluster sizes. 
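+ * (illustrative note: e.g. a 17-page pcluster is allocated from the
+ * 64-page pool, the first slab in the array below that is large enough.)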
+ */ +struct z_erofs_pcluster_slab { + struct kmem_cache *slab; + unsigned int maxpages; + char name[48]; +}; + +#define _PCLP(n) { .maxpages = n } + +static struct z_erofs_pcluster_slab pcluster_pool[] __read_mostly = { + _PCLP(1), _PCLP(4), _PCLP(16), _PCLP(64), _PCLP(128), + _PCLP(Z_EROFS_PCLUSTER_MAX_PAGES) +}; + +struct z_erofs_bvec_iter { + struct page *bvpage; + struct z_erofs_bvset *bvset; + unsigned int nr, cur; +}; + +static struct page *z_erofs_bvec_iter_end(struct z_erofs_bvec_iter *iter) +{ + if (iter->bvpage) + kunmap_local(iter->bvset); + return iter->bvpage; +} + +static struct page *z_erofs_bvset_flip(struct z_erofs_bvec_iter *iter) +{ + unsigned long base = (unsigned long)((struct z_erofs_bvset *)0)->bvec; + /* have to access nextpage in advance, otherwise it will be unmapped */ + struct page *nextpage = iter->bvset->nextpage; + struct page *oldpage; + + DBG_BUGON(!nextpage); + oldpage = z_erofs_bvec_iter_end(iter); + iter->bvpage = nextpage; + iter->bvset = kmap_local_page(nextpage); + iter->nr = (PAGE_SIZE - base) / sizeof(struct z_erofs_bvec); + iter->cur = 0; + return oldpage; +} + +static void z_erofs_bvec_iter_begin(struct z_erofs_bvec_iter *iter, + struct z_erofs_bvset_inline *bvset, + unsigned int bootstrap_nr, + unsigned int cur) +{ + *iter = (struct z_erofs_bvec_iter) { + .nr = bootstrap_nr, + .bvset = (struct z_erofs_bvset *)bvset, + }; + + while (cur > iter->nr) { + cur -= iter->nr; + z_erofs_bvset_flip(iter); + } + iter->cur = cur; +} + +static int z_erofs_bvec_enqueue(struct z_erofs_bvec_iter *iter, + struct z_erofs_bvec *bvec, + struct page **candidate_bvpage, + struct page **pagepool) +{ + if (iter->cur >= iter->nr) { + struct page *nextpage = *candidate_bvpage; + + if (!nextpage) { + nextpage = erofs_allocpage(pagepool, GFP_NOFS); + if (!nextpage) + return -ENOMEM; + set_page_private(nextpage, Z_EROFS_SHORTLIVED_PAGE); + } + DBG_BUGON(iter->bvset->nextpage); + iter->bvset->nextpage = nextpage; + z_erofs_bvset_flip(iter); + + iter->bvset->nextpage = NULL; + *candidate_bvpage = NULL; + } + iter->bvset->bvec[iter->cur++] = *bvec; + return 0; +} + +static void z_erofs_bvec_dequeue(struct z_erofs_bvec_iter *iter, + struct z_erofs_bvec *bvec, + struct page **old_bvpage) +{ + if (iter->cur == iter->nr) + *old_bvpage = z_erofs_bvset_flip(iter); + else + *old_bvpage = NULL; + *bvec = iter->bvset->bvec[iter->cur++]; +} + +static void z_erofs_destroy_pcluster_pool(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) { + if (!pcluster_pool[i].slab) + continue; + kmem_cache_destroy(pcluster_pool[i].slab); + pcluster_pool[i].slab = NULL; + } +} + +static int z_erofs_create_pcluster_pool(void) +{ + struct z_erofs_pcluster_slab *pcs; + struct z_erofs_pcluster *a; + unsigned int size; + + for (pcs = pcluster_pool; + pcs < pcluster_pool + ARRAY_SIZE(pcluster_pool); ++pcs) { + size = struct_size(a, compressed_bvecs, pcs->maxpages); + + sprintf(pcs->name, "erofs_pcluster-%u", pcs->maxpages); + pcs->slab = kmem_cache_create(pcs->name, size, 0, + SLAB_RECLAIM_ACCOUNT, NULL); + if (pcs->slab) + continue; + + z_erofs_destroy_pcluster_pool(); + return -ENOMEM; + } + return 0; +} + +static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int nrpages) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) { + struct z_erofs_pcluster_slab *pcs = pcluster_pool + i; + struct z_erofs_pcluster *pcl; + + if (nrpages > pcs->maxpages) + continue; + + pcl = kmem_cache_zalloc(pcs->slab, GFP_NOFS); + if (!pcl) + return ERR_PTR(-ENOMEM); + pcl->pclusterpages = 
nrpages; + return pcl; + } + return ERR_PTR(-EINVAL); +} + +static void z_erofs_free_pcluster(struct z_erofs_pcluster *pcl) +{ + unsigned int pclusterpages = z_erofs_pclusterpages(pcl); + int i; + + for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) { + struct z_erofs_pcluster_slab *pcs = pcluster_pool + i; + + if (pclusterpages > pcs->maxpages) + continue; + + kmem_cache_free(pcs->slab, pcl); + return; + } + DBG_BUGON(1); +} + +static struct workqueue_struct *z_erofs_workqueue __read_mostly; + +#ifdef CONFIG_EROFS_FS_PCPU_KTHREAD +static struct kthread_worker __rcu **z_erofs_pcpu_workers; + +static void erofs_destroy_percpu_workers(void) +{ + struct kthread_worker *worker; + unsigned int cpu; + + for_each_possible_cpu(cpu) { + worker = rcu_dereference_protected( + z_erofs_pcpu_workers[cpu], 1); + rcu_assign_pointer(z_erofs_pcpu_workers[cpu], NULL); + if (worker) + kthread_destroy_worker(worker); + } + kfree(z_erofs_pcpu_workers); +} + +static struct kthread_worker *erofs_init_percpu_worker(int cpu) +{ + struct kthread_worker *worker = + kthread_create_worker_on_cpu(cpu, 0, "erofs_worker/%u", cpu); + + if (IS_ERR(worker)) + return worker; + if (IS_ENABLED(CONFIG_EROFS_FS_PCPU_KTHREAD_HIPRI)) + sched_set_fifo_low(worker->task); + return worker; +} + +static int erofs_init_percpu_workers(void) +{ + struct kthread_worker *worker; + unsigned int cpu; + + z_erofs_pcpu_workers = kcalloc(num_possible_cpus(), + sizeof(struct kthread_worker *), GFP_ATOMIC); + if (!z_erofs_pcpu_workers) + return -ENOMEM; + + for_each_online_cpu(cpu) { /* could miss cpu{off,on}line? */ + worker = erofs_init_percpu_worker(cpu); + if (!IS_ERR(worker)) + rcu_assign_pointer(z_erofs_pcpu_workers[cpu], worker); + } + return 0; +} +#else +static inline void erofs_destroy_percpu_workers(void) {} +static inline int erofs_init_percpu_workers(void) { return 0; } +#endif + +#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_EROFS_FS_PCPU_KTHREAD) +static DEFINE_SPINLOCK(z_erofs_pcpu_worker_lock); +static enum cpuhp_state erofs_cpuhp_state; + +static int erofs_cpu_online(unsigned int cpu) +{ + struct kthread_worker *worker, *old; + + worker = erofs_init_percpu_worker(cpu); + if (IS_ERR(worker)) + return PTR_ERR(worker); + + spin_lock(&z_erofs_pcpu_worker_lock); + old = rcu_dereference_protected(z_erofs_pcpu_workers[cpu], + lockdep_is_held(&z_erofs_pcpu_worker_lock)); + if (!old) + rcu_assign_pointer(z_erofs_pcpu_workers[cpu], worker); + spin_unlock(&z_erofs_pcpu_worker_lock); + if (old) + kthread_destroy_worker(worker); + return 0; +} + +static int erofs_cpu_offline(unsigned int cpu) +{ + struct kthread_worker *worker; + + spin_lock(&z_erofs_pcpu_worker_lock); + worker = rcu_dereference_protected(z_erofs_pcpu_workers[cpu], + lockdep_is_held(&z_erofs_pcpu_worker_lock)); + rcu_assign_pointer(z_erofs_pcpu_workers[cpu], NULL); + spin_unlock(&z_erofs_pcpu_worker_lock); + + synchronize_rcu(); + if (worker) + kthread_destroy_worker(worker); + return 0; +} + +static int erofs_cpu_hotplug_init(void) +{ + int state; + + state = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, + "fs/erofs:online", erofs_cpu_online, erofs_cpu_offline); + if (state < 0) + return state; + + erofs_cpuhp_state = state; + return 0; +} + +static void erofs_cpu_hotplug_destroy(void) +{ + if (erofs_cpuhp_state) + cpuhp_remove_state_nocalls(erofs_cpuhp_state); +} +#else /* !CONFIG_HOTPLUG_CPU || !CONFIG_EROFS_FS_PCPU_KTHREAD */ +static inline int erofs_cpu_hotplug_init(void) { return 0; } +static inline void erofs_cpu_hotplug_destroy(void) {} +#endif + +void 
z_erofs_exit_zip_subsystem(void) +{ + erofs_cpu_hotplug_destroy(); + erofs_destroy_percpu_workers(); + destroy_workqueue(z_erofs_workqueue); + z_erofs_destroy_pcluster_pool(); +} + +int __init z_erofs_init_zip_subsystem(void) +{ + int err = z_erofs_create_pcluster_pool(); + + if (err) + goto out_error_pcluster_pool; + + z_erofs_workqueue = alloc_workqueue("erofs_worker", + WQ_UNBOUND | WQ_HIGHPRI, num_possible_cpus()); + if (!z_erofs_workqueue) { + err = -ENOMEM; + goto out_error_workqueue_init; + } + + err = erofs_init_percpu_workers(); + if (err) + goto out_error_pcpu_worker; + + err = erofs_cpu_hotplug_init(); + if (err < 0) + goto out_error_cpuhp_init; + return err; + +out_error_cpuhp_init: + erofs_destroy_percpu_workers(); +out_error_pcpu_worker: + destroy_workqueue(z_erofs_workqueue); +out_error_workqueue_init: + z_erofs_destroy_pcluster_pool(); +out_error_pcluster_pool: + return err; +} + +enum z_erofs_pclustermode { + Z_EROFS_PCLUSTER_INFLIGHT, + /* + * a weak form of Z_EROFS_PCLUSTER_FOLLOWED, the difference is that it + * could be dispatched into bypass queue later due to uptodated managed + * pages. All related online pages cannot be reused for inplace I/O (or + * bvpage) since it can be directly decoded without I/O submission. + */ + Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE, + /* + * The pcluster was just linked to a decompression chain by us. It can + * also be linked with the remaining pclusters, which means if the + * processing page is the tail page of a pcluster, this pcluster can + * safely use the whole page (since the previous pcluster is within the + * same chain) for in-place I/O, as illustrated below: + * ___________________________________________________ + * | tail (partial) page | head (partial) page | + * | (of the current pcl) | (of the previous pcl) | + * |___PCLUSTER_FOLLOWED___|_____PCLUSTER_FOLLOWED_____| + * + * [ (*) the page above can be used as inplace I/O. ] + */ + Z_EROFS_PCLUSTER_FOLLOWED, +}; + +struct z_erofs_decompress_frontend { + struct inode *const inode; + struct erofs_map_blocks map; + struct z_erofs_bvec_iter biter; + + struct page *pagepool; + struct page *candidate_bvpage; + struct z_erofs_pcluster *pcl; + z_erofs_next_pcluster_t owned_head; + enum z_erofs_pclustermode mode; + + erofs_off_t headoffset; + + /* a pointer used to pick up inplace I/O pages */ + unsigned int icur; +}; + +#define DECOMPRESS_FRONTEND_INIT(__i) { \ + .inode = __i, .owned_head = Z_EROFS_PCLUSTER_TAIL, \ + .mode = Z_EROFS_PCLUSTER_FOLLOWED } + +static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe) +{ + unsigned int cachestrategy = EROFS_I_SB(fe->inode)->opt.cache_strategy; + + if (cachestrategy <= EROFS_ZIP_CACHE_DISABLED) + return false; + + if (!(fe->map.m_flags & EROFS_MAP_FULL_MAPPED)) + return true; + + if (cachestrategy >= EROFS_ZIP_CACHE_READAROUND && + fe->map.m_la < fe->headoffset) + return true; + + return false; +} + +static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe) +{ + struct address_space *mc = MNGD_MAPPING(EROFS_I_SB(fe->inode)); + struct z_erofs_pcluster *pcl = fe->pcl; + bool shouldalloc = z_erofs_should_alloc_cache(fe); + bool standalone = true; + /* + * optimistic allocation without direct reclaim since inplace I/O + * can be used if low memory otherwise. 
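+ * (__GFP_DIRECT_RECLAIM is masked off from the gfp flags below for
+ * this reason.)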
+ */ + gfp_t gfp = (mapping_gfp_mask(mc) & ~__GFP_DIRECT_RECLAIM) | + __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; + unsigned int i; + + if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED) + return; + + for (i = 0; i < pcl->pclusterpages; ++i) { + struct page *page; + void *t; /* mark pages just found for debugging */ + struct page *newpage = NULL; + + /* the compressed page was loaded before */ + if (READ_ONCE(pcl->compressed_bvecs[i].page)) + continue; + + page = find_get_page(mc, pcl->obj.index + i); + + if (page) { + t = (void *)((unsigned long)page | 1); + } else { + /* I/O is needed, no possible to decompress directly */ + standalone = false; + if (!shouldalloc) + continue; + + /* + * try to use cached I/O if page allocation + * succeeds or fallback to in-place I/O instead + * to avoid any direct reclaim. + */ + newpage = erofs_allocpage(&fe->pagepool, gfp); + if (!newpage) + continue; + set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE); + t = (void *)((unsigned long)newpage | 1); + } + + if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL, t)) + continue; + + if (page) + put_page(page); + else if (newpage) + erofs_pagepool_add(&fe->pagepool, newpage); + } + + /* + * don't do inplace I/O if all compressed pages are available in + * managed cache since it can be moved to the bypass queue instead. + */ + if (standalone) + fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; +} + +/* called by erofs_shrinker to get rid of all compressed_pages */ +int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, + struct erofs_workgroup *grp) +{ + struct z_erofs_pcluster *const pcl = + container_of(grp, struct z_erofs_pcluster, obj); + int i; + + DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); + /* + * refcount of workgroup is now freezed as 0, + * therefore no need to worry about available decompression users. + */ + for (i = 0; i < pcl->pclusterpages; ++i) { + struct page *page = pcl->compressed_bvecs[i].page; + + if (!page) + continue; + + /* block other users from reclaiming or migrating the page */ + if (!trylock_page(page)) + return -EBUSY; + + if (!erofs_page_is_managed(sbi, page)) + continue; + + /* barrier is implied in the following 'unlock_page' */ + WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); + detach_page_private(page); + unlock_page(page); + } + return 0; +} + +static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp) +{ + struct z_erofs_pcluster *pcl = folio_get_private(folio); + bool ret; + int i; + + if (!folio_test_private(folio)) + return true; + + ret = false; + spin_lock(&pcl->obj.lockref.lock); + if (pcl->obj.lockref.count > 0) + goto out; + + DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); + for (i = 0; i < pcl->pclusterpages; ++i) { + if (pcl->compressed_bvecs[i].page == &folio->page) { + WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); + ret = true; + break; + } + } + if (ret) + folio_detach_private(folio); +out: + spin_unlock(&pcl->obj.lockref.lock); + return ret; +} + +/* + * It will be called only on inode eviction. In case that there are still some + * decompression requests in progress, wait with rescheduling for a bit here. + * An extra lock could be introduced instead but it seems unnecessary. 
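+ * (hence the cond_resched() retry loop around the folio release
+ * attempt below.)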
+ */ +static void z_erofs_cache_invalidate_folio(struct folio *folio, + size_t offset, size_t length) +{ + const size_t stop = length + offset; + + /* Check for potential overflow in debug mode */ + DBG_BUGON(stop > folio_size(folio) || stop < length); + + if (offset == 0 && stop == folio_size(folio)) + while (!z_erofs_cache_release_folio(folio, GFP_NOFS)) + cond_resched(); +} + +static const struct address_space_operations z_erofs_cache_aops = { + .release_folio = z_erofs_cache_release_folio, + .invalidate_folio = z_erofs_cache_invalidate_folio, +}; + +int erofs_init_managed_cache(struct super_block *sb) +{ + struct inode *const inode = new_inode(sb); + + if (!inode) + return -ENOMEM; + + set_nlink(inode, 1); + inode->i_size = OFFSET_MAX; + inode->i_mapping->a_ops = &z_erofs_cache_aops; + mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); + EROFS_SB(sb)->managed_cache = inode; + return 0; +} + +static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe, + struct z_erofs_bvec *bvec) +{ + struct z_erofs_pcluster *const pcl = fe->pcl; + + while (fe->icur > 0) { + if (!cmpxchg(&pcl->compressed_bvecs[--fe->icur].page, + NULL, bvec->page)) { + pcl->compressed_bvecs[fe->icur] = *bvec; + return true; + } + } + return false; +} + +/* callers must be with pcluster lock held */ +static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe, + struct z_erofs_bvec *bvec, bool exclusive) +{ + int ret; + + if (exclusive) { + /* give priority for inplaceio to use file pages first */ + if (z_erofs_try_inplace_io(fe, bvec)) + return 0; + /* otherwise, check if it can be used as a bvpage */ + if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED && + !fe->candidate_bvpage) + fe->candidate_bvpage = bvec->page; + } + ret = z_erofs_bvec_enqueue(&fe->biter, bvec, &fe->candidate_bvpage, + &fe->pagepool); + fe->pcl->vcnt += (ret >= 0); + return ret; +} + +static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f) +{ + struct z_erofs_pcluster *pcl = f->pcl; + z_erofs_next_pcluster_t *owned_head = &f->owned_head; + + /* type 1, nil pcluster (this pcluster doesn't belong to any chain.) */ + if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_NIL, + *owned_head) == Z_EROFS_PCLUSTER_NIL) { + *owned_head = &pcl->next; + /* so we can attach this pcluster to our submission chain. */ + f->mode = Z_EROFS_PCLUSTER_FOLLOWED; + return; + } + + /* type 2, it belongs to an ongoing chain */ + f->mode = Z_EROFS_PCLUSTER_INFLIGHT; +} + +static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) +{ + struct erofs_map_blocks *map = &fe->map; + bool ztailpacking = map->m_flags & EROFS_MAP_META; + struct z_erofs_pcluster *pcl; + struct erofs_workgroup *grp; + int err; + + if (!(map->m_flags & EROFS_MAP_ENCODED) || + (!ztailpacking && !(map->m_pa >> PAGE_SHIFT))) { + DBG_BUGON(1); + return -EFSCORRUPTED; + } + + /* no available pcluster, let's allocate one */ + pcl = z_erofs_alloc_pcluster(ztailpacking ? 1 : + map->m_plen >> PAGE_SHIFT); + if (IS_ERR(pcl)) + return PTR_ERR(pcl); + + spin_lock_init(&pcl->obj.lockref.lock); + pcl->obj.lockref.count = 1; /* one ref for this request */ + pcl->algorithmformat = map->m_algorithmformat; + pcl->length = 0; + pcl->partial = true; + + /* new pclusters should be claimed as type 1, primary and followed */ + pcl->next = fe->owned_head; + pcl->pageofs_out = map->m_la & ~PAGE_MASK; + fe->mode = Z_EROFS_PCLUSTER_FOLLOWED; + + /* + * lock all primary followed works before visible to others + * and mutex_trylock *never* fails for a new pcluster. 
+ */ + mutex_init(&pcl->lock); + DBG_BUGON(!mutex_trylock(&pcl->lock)); + + if (ztailpacking) { + pcl->obj.index = 0; /* which indicates ztailpacking */ + pcl->pageofs_in = erofs_blkoff(fe->inode->i_sb, map->m_pa); + pcl->tailpacking_size = map->m_plen; + } else { + pcl->obj.index = map->m_pa >> PAGE_SHIFT; + + grp = erofs_insert_workgroup(fe->inode->i_sb, &pcl->obj); + if (IS_ERR(grp)) { + err = PTR_ERR(grp); + goto err_out; + } + + if (grp != &pcl->obj) { + fe->pcl = container_of(grp, + struct z_erofs_pcluster, obj); + err = -EEXIST; + goto err_out; + } + } + fe->owned_head = &pcl->next; + fe->pcl = pcl; + return 0; + +err_out: + mutex_unlock(&pcl->lock); + z_erofs_free_pcluster(pcl); + return err; +} + +static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe) +{ + struct erofs_map_blocks *map = &fe->map; + struct super_block *sb = fe->inode->i_sb; + erofs_blk_t blknr = erofs_blknr(sb, map->m_pa); + struct erofs_workgroup *grp = NULL; + int ret; + + DBG_BUGON(fe->pcl); + + /* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous pcluster */ + DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL); + + if (!(map->m_flags & EROFS_MAP_META)) { + grp = erofs_find_workgroup(sb, blknr); + } else if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) { + DBG_BUGON(1); + return -EFSCORRUPTED; + } + + if (grp) { + fe->pcl = container_of(grp, struct z_erofs_pcluster, obj); + ret = -EEXIST; + } else { + ret = z_erofs_register_pcluster(fe); + } + + if (ret == -EEXIST) { + mutex_lock(&fe->pcl->lock); + z_erofs_try_to_claim_pcluster(fe); + } else if (ret) { + return ret; + } + + z_erofs_bvec_iter_begin(&fe->biter, &fe->pcl->bvset, + Z_EROFS_INLINE_BVECS, fe->pcl->vcnt); + if (!z_erofs_is_inline_pcluster(fe->pcl)) { + /* bind cache first when cached decompression is preferred */ + z_erofs_bind_cache(fe); + } else { + void *mptr; + + mptr = erofs_read_metabuf(&map->buf, sb, blknr, EROFS_NO_KMAP); + if (IS_ERR(mptr)) { + ret = PTR_ERR(mptr); + erofs_err(sb, "failed to get inline data %d", ret); + return ret; + } + get_page(map->buf.page); + WRITE_ONCE(fe->pcl->compressed_bvecs[0].page, map->buf.page); + fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; + } + /* file-backed inplace I/O pages are traversed in reverse order */ + fe->icur = z_erofs_pclusterpages(fe->pcl); + return 0; +} + +/* + * keep in mind that no referenced pclusters will be freed + * only after a RCU grace period. + */ +static void z_erofs_rcu_callback(struct rcu_head *head) +{ + z_erofs_free_pcluster(container_of(head, + struct z_erofs_pcluster, rcu)); +} + +void erofs_workgroup_free_rcu(struct erofs_workgroup *grp) +{ + struct z_erofs_pcluster *const pcl = + container_of(grp, struct z_erofs_pcluster, obj); + + call_rcu(&pcl->rcu, z_erofs_rcu_callback); +} + +static void z_erofs_pcluster_end(struct z_erofs_decompress_frontend *fe) +{ + struct z_erofs_pcluster *pcl = fe->pcl; + + if (!pcl) + return; + + z_erofs_bvec_iter_end(&fe->biter); + mutex_unlock(&pcl->lock); + + if (fe->candidate_bvpage) + fe->candidate_bvpage = NULL; + + /* + * if all pending pages are added, don't hold its reference + * any longer if the pcluster isn't hosted by ourselves. 
+ */ + if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE) + erofs_workgroup_put(&pcl->obj); + + fe->pcl = NULL; +} + +static int z_erofs_read_fragment(struct super_block *sb, struct page *page, + unsigned int cur, unsigned int end, erofs_off_t pos) +{ + struct inode *packed_inode = EROFS_SB(sb)->packed_inode; + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; + unsigned int cnt; + u8 *src; + + if (!packed_inode) + return -EFSCORRUPTED; + + buf.inode = packed_inode; + for (; cur < end; cur += cnt, pos += cnt) { + cnt = min_t(unsigned int, end - cur, + sb->s_blocksize - erofs_blkoff(sb, pos)); + src = erofs_bread(&buf, erofs_blknr(sb, pos), EROFS_KMAP); + if (IS_ERR(src)) { + erofs_put_metabuf(&buf); + return PTR_ERR(src); + } + memcpy_to_page(page, cur, src + erofs_blkoff(sb, pos), cnt); + } + erofs_put_metabuf(&buf); + return 0; +} + +static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, + struct page *page) +{ + struct inode *const inode = fe->inode; + struct erofs_map_blocks *const map = &fe->map; + const loff_t offset = page_offset(page); + bool tight = true, exclusive; + unsigned int cur, end, len, split; + int err = 0; + + z_erofs_onlinepage_init(page); + + split = 0; + end = PAGE_SIZE; +repeat: + if (offset + end - 1 < map->m_la || + offset + end - 1 >= map->m_la + map->m_llen) { + z_erofs_pcluster_end(fe); + map->m_la = offset + end - 1; + map->m_llen = 0; + err = z_erofs_map_blocks_iter(inode, map, 0); + if (err) + goto out; + } + + cur = offset > map->m_la ? 0 : map->m_la - offset; + /* bump split parts first to avoid several separate cases */ + ++split; + + if (!(map->m_flags & EROFS_MAP_MAPPED)) { + zero_user_segment(page, cur, end); + tight = false; + goto next_part; + } + + if (map->m_flags & EROFS_MAP_FRAGMENT) { + erofs_off_t fpos = offset + cur - map->m_la; + + len = min_t(unsigned int, map->m_llen - fpos, end - cur); + err = z_erofs_read_fragment(inode->i_sb, page, cur, cur + len, + EROFS_I(inode)->z_fragmentoff + fpos); + if (err) + goto out; + tight = false; + goto next_part; + } + + if (!fe->pcl) { + err = z_erofs_pcluster_begin(fe); + if (err) + goto out; + } + + /* + * Ensure the current partial page belongs to this submit chain rather + * than other concurrent submit chains or the noio(bypass) chain since + * those chains are handled asynchronously thus the page cannot be used + * for inplace I/O or bvpage (should be processed in a strict order.) 
+ */ + tight &= (fe->mode > Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE); + exclusive = (!cur && ((split <= 1) || tight)); + if (cur) + tight &= (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED); + + err = z_erofs_attach_page(fe, &((struct z_erofs_bvec) { + .page = page, + .offset = offset - map->m_la, + .end = end, + }), exclusive); + if (err) + goto out; + + z_erofs_onlinepage_split(page); + if (fe->pcl->pageofs_out != (map->m_la & ~PAGE_MASK)) + fe->pcl->multibases = true; + if (fe->pcl->length < offset + end - map->m_la) { + fe->pcl->length = offset + end - map->m_la; + fe->pcl->pageofs_out = map->m_la & ~PAGE_MASK; + } + if ((map->m_flags & EROFS_MAP_FULL_MAPPED) && + !(map->m_flags & EROFS_MAP_PARTIAL_REF) && + fe->pcl->length == map->m_llen) + fe->pcl->partial = false; +next_part: + /* shorten the remaining extent to update progress */ + map->m_llen = offset + cur - map->m_la; + map->m_flags &= ~EROFS_MAP_FULL_MAPPED; + + end = cur; + if (end > 0) + goto repeat; + +out: + z_erofs_onlinepage_endio(page, err); + return err; +} + +static bool z_erofs_is_sync_decompress(struct erofs_sb_info *sbi, + unsigned int readahead_pages) +{ + /* auto: enable for read_folio, disable for readahead */ + if ((sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO) && + !readahead_pages) + return true; + + if ((sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_FORCE_ON) && + (readahead_pages <= sbi->opt.max_sync_decompress_pages)) + return true; + + return false; +} + +static bool z_erofs_page_is_invalidated(struct page *page) +{ + return !page->mapping && !z_erofs_is_shortlived_page(page); +} + +struct z_erofs_decompress_backend { + struct page *onstack_pages[Z_EROFS_ONSTACK_PAGES]; + struct super_block *sb; + struct z_erofs_pcluster *pcl; + + /* pages with the longest decompressed length for deduplication */ + struct page **decompressed_pages; + /* pages to keep the compressed data */ + struct page **compressed_pages; + + struct list_head decompressed_secondary_bvecs; + struct page **pagepool; + unsigned int onstack_used, nr_pages; +}; + +struct z_erofs_bvec_item { + struct z_erofs_bvec bvec; + struct list_head list; +}; + +static void z_erofs_do_decompressed_bvec(struct z_erofs_decompress_backend *be, + struct z_erofs_bvec *bvec) +{ + struct z_erofs_bvec_item *item; + unsigned int pgnr; + + if (!((bvec->offset + be->pcl->pageofs_out) & ~PAGE_MASK) && + (bvec->end == PAGE_SIZE || + bvec->offset + bvec->end == be->pcl->length)) { + pgnr = (bvec->offset + be->pcl->pageofs_out) >> PAGE_SHIFT; + DBG_BUGON(pgnr >= be->nr_pages); + if (!be->decompressed_pages[pgnr]) { + be->decompressed_pages[pgnr] = bvec->page; + return; + } + } + + /* (cold path) one pcluster is requested multiple times */ + item = kmalloc(sizeof(*item), GFP_KERNEL | __GFP_NOFAIL); + item->bvec = *bvec; + list_add(&item->list, &be->decompressed_secondary_bvecs); +} + +static void z_erofs_fill_other_copies(struct z_erofs_decompress_backend *be, + int err) +{ + unsigned int off0 = be->pcl->pageofs_out; + struct list_head *p, *n; + + list_for_each_safe(p, n, &be->decompressed_secondary_bvecs) { + struct z_erofs_bvec_item *bvi; + unsigned int end, cur; + void *dst, *src; + + bvi = container_of(p, struct z_erofs_bvec_item, list); + cur = bvi->bvec.offset < 0 ? 
-bvi->bvec.offset : 0; + end = min_t(unsigned int, be->pcl->length - bvi->bvec.offset, + bvi->bvec.end); + dst = kmap_local_page(bvi->bvec.page); + while (cur < end) { + unsigned int pgnr, scur, len; + + pgnr = (bvi->bvec.offset + cur + off0) >> PAGE_SHIFT; + DBG_BUGON(pgnr >= be->nr_pages); + + scur = bvi->bvec.offset + cur - + ((pgnr << PAGE_SHIFT) - off0); + len = min_t(unsigned int, end - cur, PAGE_SIZE - scur); + if (!be->decompressed_pages[pgnr]) { + err = -EFSCORRUPTED; + cur += len; + continue; + } + src = kmap_local_page(be->decompressed_pages[pgnr]); + memcpy(dst + cur, src + scur, len); + kunmap_local(src); + cur += len; + } + kunmap_local(dst); + z_erofs_onlinepage_endio(bvi->bvec.page, err); + list_del(p); + kfree(bvi); + } +} + +static void z_erofs_parse_out_bvecs(struct z_erofs_decompress_backend *be) +{ + struct z_erofs_pcluster *pcl = be->pcl; + struct z_erofs_bvec_iter biter; + struct page *old_bvpage; + int i; + + z_erofs_bvec_iter_begin(&biter, &pcl->bvset, Z_EROFS_INLINE_BVECS, 0); + for (i = 0; i < pcl->vcnt; ++i) { + struct z_erofs_bvec bvec; + + z_erofs_bvec_dequeue(&biter, &bvec, &old_bvpage); + + if (old_bvpage) + z_erofs_put_shortlivedpage(be->pagepool, old_bvpage); + + DBG_BUGON(z_erofs_page_is_invalidated(bvec.page)); + z_erofs_do_decompressed_bvec(be, &bvec); + } + + old_bvpage = z_erofs_bvec_iter_end(&biter); + if (old_bvpage) + z_erofs_put_shortlivedpage(be->pagepool, old_bvpage); +} + +static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend *be, + bool *overlapped) +{ + struct z_erofs_pcluster *pcl = be->pcl; + unsigned int pclusterpages = z_erofs_pclusterpages(pcl); + int i, err = 0; + + *overlapped = false; + for (i = 0; i < pclusterpages; ++i) { + struct z_erofs_bvec *bvec = &pcl->compressed_bvecs[i]; + struct page *page = bvec->page; + + /* compressed pages ought to be present before decompressing */ + if (!page) { + DBG_BUGON(1); + continue; + } + be->compressed_pages[i] = page; + + if (z_erofs_is_inline_pcluster(pcl)) { + if (!PageUptodate(page)) + err = -EIO; + continue; + } + + DBG_BUGON(z_erofs_page_is_invalidated(page)); + if (!z_erofs_is_shortlived_page(page)) { + if (erofs_page_is_managed(EROFS_SB(be->sb), page)) { + if (!PageUptodate(page)) + err = -EIO; + continue; + } + z_erofs_do_decompressed_bvec(be, bvec); + *overlapped = true; + } + } + + if (err) + return err; + return 0; +} + +static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, + int err) +{ + struct erofs_sb_info *const sbi = EROFS_SB(be->sb); + struct z_erofs_pcluster *pcl = be->pcl; + unsigned int pclusterpages = z_erofs_pclusterpages(pcl); + const struct z_erofs_decompressor *decompressor = + &erofs_decompressors[pcl->algorithmformat]; + unsigned int i, inputsize; + int err2; + struct page *page; + bool overlapped; + + mutex_lock(&pcl->lock); + be->nr_pages = PAGE_ALIGN(pcl->length + pcl->pageofs_out) >> PAGE_SHIFT; + + /* allocate (de)compressed page arrays if cannot be kept on stack */ + be->decompressed_pages = NULL; + be->compressed_pages = NULL; + be->onstack_used = 0; + if (be->nr_pages <= Z_EROFS_ONSTACK_PAGES) { + be->decompressed_pages = be->onstack_pages; + be->onstack_used = be->nr_pages; + memset(be->decompressed_pages, 0, + sizeof(struct page *) * be->nr_pages); + } + + if (pclusterpages + be->onstack_used <= Z_EROFS_ONSTACK_PAGES) + be->compressed_pages = be->onstack_pages + be->onstack_used; + + if (!be->decompressed_pages) + be->decompressed_pages = + kvcalloc(be->nr_pages, sizeof(struct page *), + GFP_KERNEL | __GFP_NOFAIL); + 
if (!be->compressed_pages) + be->compressed_pages = + kvcalloc(pclusterpages, sizeof(struct page *), + GFP_KERNEL | __GFP_NOFAIL); + + z_erofs_parse_out_bvecs(be); + err2 = z_erofs_parse_in_bvecs(be, &overlapped); + if (err2) + err = err2; + if (err) + goto out; + + if (z_erofs_is_inline_pcluster(pcl)) + inputsize = pcl->tailpacking_size; + else + inputsize = pclusterpages * PAGE_SIZE; + + err = decompressor->decompress(&(struct z_erofs_decompress_req) { + .sb = be->sb, + .in = be->compressed_pages, + .out = be->decompressed_pages, + .pageofs_in = pcl->pageofs_in, + .pageofs_out = pcl->pageofs_out, + .inputsize = inputsize, + .outputsize = pcl->length, + .alg = pcl->algorithmformat, + .inplace_io = overlapped, + .partial_decoding = pcl->partial, + .fillgaps = pcl->multibases, + }, be->pagepool); + +out: + /* must handle all compressed pages before actual file pages */ + if (z_erofs_is_inline_pcluster(pcl)) { + page = pcl->compressed_bvecs[0].page; + WRITE_ONCE(pcl->compressed_bvecs[0].page, NULL); + put_page(page); + } else { + for (i = 0; i < pclusterpages; ++i) { + /* consider shortlived pages added when decompressing */ + page = be->compressed_pages[i]; + + if (erofs_page_is_managed(sbi, page)) + continue; + (void)z_erofs_put_shortlivedpage(be->pagepool, page); + WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); + } + } + if (be->compressed_pages < be->onstack_pages || + be->compressed_pages >= be->onstack_pages + Z_EROFS_ONSTACK_PAGES) + kvfree(be->compressed_pages); + z_erofs_fill_other_copies(be, err); + + for (i = 0; i < be->nr_pages; ++i) { + page = be->decompressed_pages[i]; + if (!page) + continue; + + DBG_BUGON(z_erofs_page_is_invalidated(page)); + + /* recycle all individual short-lived pages */ + if (z_erofs_put_shortlivedpage(be->pagepool, page)) + continue; + z_erofs_onlinepage_endio(page, err); + } + + if (be->decompressed_pages != be->onstack_pages) + kvfree(be->decompressed_pages); + + pcl->length = 0; + pcl->partial = true; + pcl->multibases = false; + pcl->bvset.nextpage = NULL; + pcl->vcnt = 0; + + /* pcluster lock MUST be taken before the following line */ + WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_NIL); + mutex_unlock(&pcl->lock); + return err; +} + +static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io, + struct page **pagepool) +{ + struct z_erofs_decompress_backend be = { + .sb = io->sb, + .pagepool = pagepool, + .decompressed_secondary_bvecs = + LIST_HEAD_INIT(be.decompressed_secondary_bvecs), + }; + z_erofs_next_pcluster_t owned = io->head; + + while (owned != Z_EROFS_PCLUSTER_TAIL) { + DBG_BUGON(owned == Z_EROFS_PCLUSTER_NIL); + + be.pcl = container_of(owned, struct z_erofs_pcluster, next); + owned = READ_ONCE(be.pcl->next); + + z_erofs_decompress_pcluster(&be, io->eio ? 
-EIO : 0); + if (z_erofs_is_inline_pcluster(be.pcl)) + z_erofs_free_pcluster(be.pcl); + else + erofs_workgroup_put(&be.pcl->obj); + } +} + +static void z_erofs_decompressqueue_work(struct work_struct *work) +{ + struct z_erofs_decompressqueue *bgq = + container_of(work, struct z_erofs_decompressqueue, u.work); + struct page *pagepool = NULL; + + DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL); + z_erofs_decompress_queue(bgq, &pagepool); + erofs_release_pages(&pagepool); + kvfree(bgq); +} + +#ifdef CONFIG_EROFS_FS_PCPU_KTHREAD +static void z_erofs_decompressqueue_kthread_work(struct kthread_work *work) +{ + z_erofs_decompressqueue_work((struct work_struct *)work); +} +#endif + +static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, + int bios) +{ + struct erofs_sb_info *const sbi = EROFS_SB(io->sb); + + /* wake up the caller thread for sync decompression */ + if (io->sync) { + if (!atomic_add_return(bios, &io->pending_bios)) + complete(&io->u.done); + return; + } + + if (atomic_add_return(bios, &io->pending_bios)) + return; + /* Use (kthread_)work and sync decompression for atomic contexts only */ + if (!in_task() || irqs_disabled() || rcu_read_lock_any_held()) { +#ifdef CONFIG_EROFS_FS_PCPU_KTHREAD + struct kthread_worker *worker; + + rcu_read_lock(); + worker = rcu_dereference( + z_erofs_pcpu_workers[raw_smp_processor_id()]); + if (!worker) { + INIT_WORK(&io->u.work, z_erofs_decompressqueue_work); + queue_work(z_erofs_workqueue, &io->u.work); + } else { + kthread_queue_work(worker, &io->u.kthread_work); + } + rcu_read_unlock(); +#else + queue_work(z_erofs_workqueue, &io->u.work); +#endif + /* enable sync decompression for readahead */ + if (sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO) + sbi->opt.sync_decompress = EROFS_SYNC_DECOMPRESS_FORCE_ON; + return; + } + z_erofs_decompressqueue_work(&io->u.work); +} + +static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl, + unsigned int nr, + struct page **pagepool, + struct address_space *mc) +{ + const pgoff_t index = pcl->obj.index; + gfp_t gfp = mapping_gfp_mask(mc); + bool tocache = false; + + struct address_space *mapping; + struct page *oldpage, *page; + int justfound; + +repeat: + page = READ_ONCE(pcl->compressed_bvecs[nr].page); + oldpage = page; + + if (!page) + goto out_allocpage; + + justfound = (unsigned long)page & 1UL; + page = (struct page *)((unsigned long)page & ~1UL); + + /* + * preallocated cached pages, which is used to avoid direct reclaim + * otherwise, it will go inplace I/O path instead. + */ + if (page->private == Z_EROFS_PREALLOCATED_PAGE) { + WRITE_ONCE(pcl->compressed_bvecs[nr].page, page); + set_page_private(page, 0); + tocache = true; + goto out_tocache; + } + mapping = READ_ONCE(page->mapping); + + /* + * file-backed online pages in plcuster are all locked steady, + * therefore it is impossible for `mapping' to be NULL. + */ + if (mapping && mapping != mc) + /* ought to be unmanaged pages */ + goto out; + + /* directly return for shortlived page as well */ + if (z_erofs_is_shortlived_page(page)) + goto out; + + lock_page(page); + + /* only true if page reclaim goes wrong, should never happen */ + DBG_BUGON(justfound && PagePrivate(page)); + + /* the page is still in manage cache */ + if (page->mapping == mc) { + WRITE_ONCE(pcl->compressed_bvecs[nr].page, page); + + if (!PagePrivate(page)) { + /* + * impossible to be !PagePrivate(page) for + * the current restriction as well if + * the page is already in compressed_bvecs[]. 
+ */ + DBG_BUGON(!justfound); + + justfound = 0; + set_page_private(page, (unsigned long)pcl); + SetPagePrivate(page); + } + + /* no need to submit io if it is already up-to-date */ + if (PageUptodate(page)) { + unlock_page(page); + page = NULL; + } + goto out; + } + + /* + * the managed page has been truncated, it's unsafe to + * reuse this one, let's allocate a new cache-managed page. + */ + DBG_BUGON(page->mapping); + DBG_BUGON(!justfound); + + tocache = true; + unlock_page(page); + put_page(page); +out_allocpage: + page = erofs_allocpage(pagepool, gfp | __GFP_NOFAIL); + if (oldpage != cmpxchg(&pcl->compressed_bvecs[nr].page, + oldpage, page)) { + erofs_pagepool_add(pagepool, page); + cond_resched(); + goto repeat; + } +out_tocache: + if (!tocache || add_to_page_cache_lru(page, mc, index + nr, gfp)) { + /* turn into temporary page if fails (1 ref) */ + set_page_private(page, Z_EROFS_SHORTLIVED_PAGE); + goto out; + } + attach_page_private(page, pcl); + /* drop a refcount added by allocpage (then we have 2 refs here) */ + put_page(page); + +out: /* the only exit (for tracing and debugging) */ + return page; +} + +static struct z_erofs_decompressqueue *jobqueue_init(struct super_block *sb, + struct z_erofs_decompressqueue *fgq, bool *fg) +{ + struct z_erofs_decompressqueue *q; + + if (fg && !*fg) { + q = kvzalloc(sizeof(*q), GFP_KERNEL | __GFP_NOWARN); + if (!q) { + *fg = true; + goto fg_out; + } +#ifdef CONFIG_EROFS_FS_PCPU_KTHREAD + kthread_init_work(&q->u.kthread_work, + z_erofs_decompressqueue_kthread_work); +#else + INIT_WORK(&q->u.work, z_erofs_decompressqueue_work); +#endif + } else { +fg_out: + q = fgq; + init_completion(&fgq->u.done); + atomic_set(&fgq->pending_bios, 0); + q->eio = false; + q->sync = true; + } + q->sb = sb; + q->head = Z_EROFS_PCLUSTER_TAIL; + return q; +} + +/* define decompression jobqueue types */ +enum { + JQ_BYPASS, + JQ_SUBMIT, + NR_JOBQUEUES, +}; + +static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl, + z_erofs_next_pcluster_t qtail[], + z_erofs_next_pcluster_t owned_head) +{ + z_erofs_next_pcluster_t *const submit_qtail = qtail[JQ_SUBMIT]; + z_erofs_next_pcluster_t *const bypass_qtail = qtail[JQ_BYPASS]; + + WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_TAIL); + + WRITE_ONCE(*submit_qtail, owned_head); + WRITE_ONCE(*bypass_qtail, &pcl->next); + + qtail[JQ_BYPASS] = &pcl->next; +} + +static void z_erofs_decompressqueue_endio(struct bio *bio) +{ + struct z_erofs_decompressqueue *q = bio->bi_private; + blk_status_t err = bio->bi_status; + struct bio_vec *bvec; + struct bvec_iter_all iter_all; + + bio_for_each_segment_all(bvec, bio, iter_all) { + struct page *page = bvec->bv_page; + + DBG_BUGON(PageUptodate(page)); + DBG_BUGON(z_erofs_page_is_invalidated(page)); + + if (erofs_page_is_managed(EROFS_SB(q->sb), page)) { + if (!err) + SetPageUptodate(page); + unlock_page(page); + } + } + if (err) + q->eio = true; + z_erofs_decompress_kickoff(q, -1); + bio_put(bio); +} + +static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, + struct z_erofs_decompressqueue *fgq, + bool *force_fg, bool readahead) +{ + struct super_block *sb = f->inode->i_sb; + struct address_space *mc = MNGD_MAPPING(EROFS_SB(sb)); + z_erofs_next_pcluster_t qtail[NR_JOBQUEUES]; + struct z_erofs_decompressqueue *q[NR_JOBQUEUES]; + z_erofs_next_pcluster_t owned_head = f->owned_head; + /* bio is NULL initially, so no need to initialize last_{index,bdev} */ + pgoff_t last_index; + struct block_device *last_bdev; + unsigned int nr_bios = 0; + struct bio *bio = NULL; + unsigned 
long pflags; + int memstall = 0; + + /* + * if managed cache is enabled, bypass jobqueue is needed, + * no need to read from device for all pclusters in this queue. + */ + q[JQ_BYPASS] = jobqueue_init(sb, fgq + JQ_BYPASS, NULL); + q[JQ_SUBMIT] = jobqueue_init(sb, fgq + JQ_SUBMIT, force_fg); + + qtail[JQ_BYPASS] = &q[JQ_BYPASS]->head; + qtail[JQ_SUBMIT] = &q[JQ_SUBMIT]->head; + + /* by default, all need io submission */ + q[JQ_SUBMIT]->head = owned_head; + + do { + struct erofs_map_dev mdev; + struct z_erofs_pcluster *pcl; + pgoff_t cur, end; + unsigned int i = 0; + bool bypass = true; + + DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_NIL); + pcl = container_of(owned_head, struct z_erofs_pcluster, next); + owned_head = READ_ONCE(pcl->next); + + if (z_erofs_is_inline_pcluster(pcl)) { + move_to_bypass_jobqueue(pcl, qtail, owned_head); + continue; + } + + /* no device id here, thus it will always succeed */ + mdev = (struct erofs_map_dev) { + .m_pa = erofs_pos(sb, pcl->obj.index), + }; + (void)erofs_map_dev(sb, &mdev); + + cur = erofs_blknr(sb, mdev.m_pa); + end = cur + pcl->pclusterpages; + + do { + struct page *page; + + page = pickup_page_for_submission(pcl, i++, + &f->pagepool, mc); + if (!page) + continue; + + if (bio && (cur != last_index + 1 || + last_bdev != mdev.m_bdev)) { +submit_bio_retry: + submit_bio(bio); + if (memstall) { + psi_memstall_leave(&pflags); + memstall = 0; + } + bio = NULL; + } + + if (unlikely(PageWorkingset(page)) && !memstall) { + psi_memstall_enter(&pflags); + memstall = 1; + } + + if (!bio) { + bio = bio_alloc(mdev.m_bdev, BIO_MAX_VECS, + REQ_OP_READ, GFP_NOIO); + bio->bi_end_io = z_erofs_decompressqueue_endio; + + last_bdev = mdev.m_bdev; + bio->bi_iter.bi_sector = (sector_t)cur << + (sb->s_blocksize_bits - 9); + bio->bi_private = q[JQ_SUBMIT]; + if (readahead) + bio->bi_opf |= REQ_RAHEAD; + ++nr_bios; + } + + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) + goto submit_bio_retry; + + last_index = cur; + bypass = false; + } while (++cur < end); + + if (!bypass) + qtail[JQ_SUBMIT] = &pcl->next; + else + move_to_bypass_jobqueue(pcl, qtail, owned_head); + } while (owned_head != Z_EROFS_PCLUSTER_TAIL); + + if (bio) { + submit_bio(bio); + if (memstall) + psi_memstall_leave(&pflags); + } + + /* + * although background is preferred, no one is pending for submission. + * don't issue decompression but drop it directly instead. + */ + if (!*force_fg && !nr_bios) { + kvfree(q[JQ_SUBMIT]); + return; + } + z_erofs_decompress_kickoff(q[JQ_SUBMIT], nr_bios); +} + +static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f, + bool force_fg, bool ra) +{ + struct z_erofs_decompressqueue io[NR_JOBQUEUES]; + + if (f->owned_head == Z_EROFS_PCLUSTER_TAIL) + return; + z_erofs_submit_queue(f, io, &force_fg, ra); + + /* handle bypass queue (no i/o pclusters) immediately */ + z_erofs_decompress_queue(&io[JQ_BYPASS], &f->pagepool); + + if (!force_fg) + return; + + /* wait until all bios are completed */ + wait_for_completion_io(&io[JQ_SUBMIT].u.done); + + /* handle synchronous decompress queue in the caller context */ + z_erofs_decompress_queue(&io[JQ_SUBMIT], &f->pagepool); +} + +/* + * Since partial uptodate is still unimplemented for now, we have to use + * approximate readmore strategies as a start. 
+ */ +static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f, + struct readahead_control *rac, bool backmost) +{ + struct inode *inode = f->inode; + struct erofs_map_blocks *map = &f->map; + erofs_off_t cur, end, headoffset = f->headoffset; + int err; + + if (backmost) { + if (rac) + end = headoffset + readahead_length(rac) - 1; + else + end = headoffset + PAGE_SIZE - 1; + map->m_la = end; + err = z_erofs_map_blocks_iter(inode, map, + EROFS_GET_BLOCKS_READMORE); + if (err) + return; + + /* expand ra for the trailing edge if readahead */ + if (rac) { + cur = round_up(map->m_la + map->m_llen, PAGE_SIZE); + readahead_expand(rac, headoffset, cur - headoffset); + return; + } + end = round_up(end, PAGE_SIZE); + } else { + end = round_up(map->m_la, PAGE_SIZE); + + if (!map->m_llen) + return; + } + + cur = map->m_la + map->m_llen - 1; + while ((cur >= end) && (cur < i_size_read(inode))) { + pgoff_t index = cur >> PAGE_SHIFT; + struct page *page; + + page = erofs_grab_cache_page_nowait(inode->i_mapping, index); + if (page) { + if (PageUptodate(page)) + unlock_page(page); + else + (void)z_erofs_do_read_page(f, page); + put_page(page); + } + + if (cur < PAGE_SIZE) + break; + cur = (index << PAGE_SHIFT) - 1; + } +} + +static int z_erofs_read_folio(struct file *file, struct folio *folio) +{ + struct inode *const inode = folio->mapping->host; + struct erofs_sb_info *const sbi = EROFS_I_SB(inode); + struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); + int err; + + trace_erofs_read_folio(folio, false); + f.headoffset = (erofs_off_t)folio->index << PAGE_SHIFT; + + z_erofs_pcluster_readmore(&f, NULL, true); + err = z_erofs_do_read_page(&f, &folio->page); + z_erofs_pcluster_readmore(&f, NULL, false); + z_erofs_pcluster_end(&f); + + /* if some compressed cluster ready, need submit them anyway */ + z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, 0), false); + + if (err && err != -EINTR) + erofs_err(inode->i_sb, "read error %d @ %lu of nid %llu", + err, folio->index, EROFS_I(inode)->nid); + + erofs_put_metabuf(&f.map.buf); + erofs_release_pages(&f.pagepool); + return err; +} + +static void z_erofs_readahead(struct readahead_control *rac) +{ + struct inode *const inode = rac->mapping->host; + struct erofs_sb_info *const sbi = EROFS_I_SB(inode); + struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); + struct folio *head = NULL, *folio; + unsigned int nr_folios; + int err; + + f.headoffset = readahead_pos(rac); + + z_erofs_pcluster_readmore(&f, rac, true); + nr_folios = readahead_count(rac); + trace_erofs_readpages(inode, readahead_index(rac), nr_folios, false); + + while ((folio = readahead_folio(rac))) { + folio->private = head; + head = folio; + } + + /* traverse in reverse order for best metadata I/O performance */ + while (head) { + folio = head; + head = folio_get_private(folio); + + err = z_erofs_do_read_page(&f, &folio->page); + if (err && err != -EINTR) + erofs_err(inode->i_sb, "readahead error at folio %lu @ nid %llu", + folio->index, EROFS_I(inode)->nid); + } + z_erofs_pcluster_readmore(&f, rac, false); + z_erofs_pcluster_end(&f); + + z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, nr_folios), true); + erofs_put_metabuf(&f.map.buf); + erofs_release_pages(&f.pagepool); +} + +const struct address_space_operations z_erofs_aops = { + .read_folio = z_erofs_read_folio, + .readahead = z_erofs_readahead, +}; diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c new file mode 100644 index 0000000000..7a1a24ae4a --- /dev/null +++ b/fs/erofs/zmap.c @@ 
-0,0 +1,774 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2018-2019 HUAWEI, Inc. + * https://www.huawei.com/ + */ +#include "internal.h" +#include <asm/unaligned.h> +#include <trace/events/erofs.h> + +struct z_erofs_maprecorder { + struct inode *inode; + struct erofs_map_blocks *map; + void *kaddr; + + unsigned long lcn; + /* compression extent information gathered */ + u8 type, headtype; + u16 clusterofs; + u16 delta[2]; + erofs_blk_t pblk, compressedblks; + erofs_off_t nextpackoff; + bool partialref; +}; + +static int z_erofs_load_full_lcluster(struct z_erofs_maprecorder *m, + unsigned long lcn) +{ + struct inode *const inode = m->inode; + struct erofs_inode *const vi = EROFS_I(inode); + const erofs_off_t pos = Z_EROFS_FULL_INDEX_ALIGN(erofs_iloc(inode) + + vi->inode_isize + vi->xattr_isize) + + lcn * sizeof(struct z_erofs_lcluster_index); + struct z_erofs_lcluster_index *di; + unsigned int advise, type; + + m->kaddr = erofs_read_metabuf(&m->map->buf, inode->i_sb, + erofs_blknr(inode->i_sb, pos), EROFS_KMAP); + if (IS_ERR(m->kaddr)) + return PTR_ERR(m->kaddr); + + m->nextpackoff = pos + sizeof(struct z_erofs_lcluster_index); + m->lcn = lcn; + di = m->kaddr + erofs_blkoff(inode->i_sb, pos); + + advise = le16_to_cpu(di->di_advise); + type = (advise >> Z_EROFS_LI_LCLUSTER_TYPE_BIT) & + ((1 << Z_EROFS_LI_LCLUSTER_TYPE_BITS) - 1); + switch (type) { + case Z_EROFS_LCLUSTER_TYPE_NONHEAD: + m->clusterofs = 1 << vi->z_logical_clusterbits; + m->delta[0] = le16_to_cpu(di->di_u.delta[0]); + if (m->delta[0] & Z_EROFS_LI_D0_CBLKCNT) { + if (!(vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 | + Z_EROFS_ADVISE_BIG_PCLUSTER_2))) { + DBG_BUGON(1); + return -EFSCORRUPTED; + } + m->compressedblks = m->delta[0] & + ~Z_EROFS_LI_D0_CBLKCNT; + m->delta[0] = 1; + } + m->delta[1] = le16_to_cpu(di->di_u.delta[1]); + break; + case Z_EROFS_LCLUSTER_TYPE_PLAIN: + case Z_EROFS_LCLUSTER_TYPE_HEAD1: + case Z_EROFS_LCLUSTER_TYPE_HEAD2: + if (advise & Z_EROFS_LI_PARTIAL_REF) + m->partialref = true; + m->clusterofs = le16_to_cpu(di->di_clusterofs); + if (m->clusterofs >= 1 << vi->z_logical_clusterbits) { + DBG_BUGON(1); + return -EFSCORRUPTED; + } + m->pblk = le32_to_cpu(di->di_u.blkaddr); + break; + default: + DBG_BUGON(1); + return -EOPNOTSUPP; + } + m->type = type; + return 0; +} + +static unsigned int decode_compactedbits(unsigned int lobits, + unsigned int lomask, + u8 *in, unsigned int pos, u8 *type) +{ + const unsigned int v = get_unaligned_le32(in + pos / 8) >> (pos & 7); + const unsigned int lo = v & lomask; + + *type = (v >> lobits) & 3; + return lo; +} + +static int get_compacted_la_distance(unsigned int lclusterbits, + unsigned int encodebits, + unsigned int vcnt, u8 *in, int i) +{ + const unsigned int lomask = (1 << lclusterbits) - 1; + unsigned int lo, d1 = 0; + u8 type; + + DBG_BUGON(i >= vcnt); + + do { + lo = decode_compactedbits(lclusterbits, lomask, + in, encodebits * i, &type); + + if (type != Z_EROFS_LCLUSTER_TYPE_NONHEAD) + return d1; + ++d1; + } while (++i < vcnt); + + /* vcnt - 1 (Z_EROFS_LCLUSTER_TYPE_NONHEAD) item */ + if (!(lo & Z_EROFS_LI_D0_CBLKCNT)) + d1 += lo - 1; + return d1; +} + +static int unpack_compacted_index(struct z_erofs_maprecorder *m, + unsigned int amortizedshift, + erofs_off_t pos, bool lookahead) +{ + struct erofs_inode *const vi = EROFS_I(m->inode); + const unsigned int lclusterbits = vi->z_logical_clusterbits; + const unsigned int lomask = (1 << lclusterbits) - 1; + unsigned int vcnt, base, lo, encodebits, nblk, eofs; + int i; + u8 *in, type; + bool 
big_pcluster; + + if (1 << amortizedshift == 4 && lclusterbits <= 14) + vcnt = 2; + else if (1 << amortizedshift == 2 && lclusterbits == 12) + vcnt = 16; + else + return -EOPNOTSUPP; + + /* it doesn't equal to round_up(..) */ + m->nextpackoff = round_down(pos, vcnt << amortizedshift) + + (vcnt << amortizedshift); + big_pcluster = vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1; + encodebits = ((vcnt << amortizedshift) - sizeof(__le32)) * 8 / vcnt; + eofs = erofs_blkoff(m->inode->i_sb, pos); + base = round_down(eofs, vcnt << amortizedshift); + in = m->kaddr + base; + + i = (eofs - base) >> amortizedshift; + + lo = decode_compactedbits(lclusterbits, lomask, + in, encodebits * i, &type); + m->type = type; + if (type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) { + m->clusterofs = 1 << lclusterbits; + + /* figure out lookahead_distance: delta[1] if needed */ + if (lookahead) + m->delta[1] = get_compacted_la_distance(lclusterbits, + encodebits, vcnt, in, i); + if (lo & Z_EROFS_LI_D0_CBLKCNT) { + if (!big_pcluster) { + DBG_BUGON(1); + return -EFSCORRUPTED; + } + m->compressedblks = lo & ~Z_EROFS_LI_D0_CBLKCNT; + m->delta[0] = 1; + return 0; + } else if (i + 1 != (int)vcnt) { + m->delta[0] = lo; + return 0; + } + /* + * since the last lcluster in the pack is special, + * of which lo saves delta[1] rather than delta[0]. + * Hence, get delta[0] by the previous lcluster indirectly. + */ + lo = decode_compactedbits(lclusterbits, lomask, + in, encodebits * (i - 1), &type); + if (type != Z_EROFS_LCLUSTER_TYPE_NONHEAD) + lo = 0; + else if (lo & Z_EROFS_LI_D0_CBLKCNT) + lo = 1; + m->delta[0] = lo + 1; + return 0; + } + m->clusterofs = lo; + m->delta[0] = 0; + /* figout out blkaddr (pblk) for HEAD lclusters */ + if (!big_pcluster) { + nblk = 1; + while (i > 0) { + --i; + lo = decode_compactedbits(lclusterbits, lomask, + in, encodebits * i, &type); + if (type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) + i -= lo; + + if (i >= 0) + ++nblk; + } + } else { + nblk = 0; + while (i > 0) { + --i; + lo = decode_compactedbits(lclusterbits, lomask, + in, encodebits * i, &type); + if (type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) { + if (lo & Z_EROFS_LI_D0_CBLKCNT) { + --i; + nblk += lo & ~Z_EROFS_LI_D0_CBLKCNT; + continue; + } + /* bigpcluster shouldn't have plain d0 == 1 */ + if (lo <= 1) { + DBG_BUGON(1); + return -EFSCORRUPTED; + } + i -= lo - 2; + continue; + } + ++nblk; + } + } + in += (vcnt << amortizedshift) - sizeof(__le32); + m->pblk = le32_to_cpu(*(__le32 *)in) + nblk; + return 0; +} + +static int z_erofs_load_compact_lcluster(struct z_erofs_maprecorder *m, + unsigned long lcn, bool lookahead) +{ + struct inode *const inode = m->inode; + struct erofs_inode *const vi = EROFS_I(inode); + const erofs_off_t ebase = sizeof(struct z_erofs_map_header) + + ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, 8); + unsigned int totalidx = erofs_iblks(inode); + unsigned int compacted_4b_initial, compacted_2b; + unsigned int amortizedshift; + erofs_off_t pos; + + if (lcn >= totalidx) + return -EINVAL; + + m->lcn = lcn; + /* used to align to 32-byte (compacted_2b) alignment */ + compacted_4b_initial = (32 - ebase % 32) / 4; + if (compacted_4b_initial == 32 / 4) + compacted_4b_initial = 0; + + if ((vi->z_advise & Z_EROFS_ADVISE_COMPACTED_2B) && + compacted_4b_initial < totalidx) + compacted_2b = rounddown(totalidx - compacted_4b_initial, 16); + else + compacted_2b = 0; + + pos = ebase; + if (lcn < compacted_4b_initial) { + amortizedshift = 2; + goto out; + } + pos += compacted_4b_initial * 4; + lcn -= compacted_4b_initial; + + if (lcn < 
compacted_2b) { + amortizedshift = 1; + goto out; + } + pos += compacted_2b * 2; + lcn -= compacted_2b; + amortizedshift = 2; +out: + pos += lcn * (1 << amortizedshift); + m->kaddr = erofs_read_metabuf(&m->map->buf, inode->i_sb, + erofs_blknr(inode->i_sb, pos), EROFS_KMAP); + if (IS_ERR(m->kaddr)) + return PTR_ERR(m->kaddr); + return unpack_compacted_index(m, amortizedshift, pos, lookahead); +} + +static int z_erofs_load_lcluster_from_disk(struct z_erofs_maprecorder *m, + unsigned int lcn, bool lookahead) +{ + switch (EROFS_I(m->inode)->datalayout) { + case EROFS_INODE_COMPRESSED_FULL: + return z_erofs_load_full_lcluster(m, lcn); + case EROFS_INODE_COMPRESSED_COMPACT: + return z_erofs_load_compact_lcluster(m, lcn, lookahead); + default: + return -EINVAL; + } +} + +static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m, + unsigned int lookback_distance) +{ + struct super_block *sb = m->inode->i_sb; + struct erofs_inode *const vi = EROFS_I(m->inode); + const unsigned int lclusterbits = vi->z_logical_clusterbits; + + while (m->lcn >= lookback_distance) { + unsigned long lcn = m->lcn - lookback_distance; + int err; + + err = z_erofs_load_lcluster_from_disk(m, lcn, false); + if (err) + return err; + + switch (m->type) { + case Z_EROFS_LCLUSTER_TYPE_NONHEAD: + lookback_distance = m->delta[0]; + if (!lookback_distance) + goto err_bogus; + continue; + case Z_EROFS_LCLUSTER_TYPE_PLAIN: + case Z_EROFS_LCLUSTER_TYPE_HEAD1: + case Z_EROFS_LCLUSTER_TYPE_HEAD2: + m->headtype = m->type; + m->map->m_la = (lcn << lclusterbits) | m->clusterofs; + return 0; + default: + erofs_err(sb, "unknown type %u @ lcn %lu of nid %llu", + m->type, lcn, vi->nid); + DBG_BUGON(1); + return -EOPNOTSUPP; + } + } +err_bogus: + erofs_err(sb, "bogus lookback distance %u @ lcn %lu of nid %llu", + lookback_distance, m->lcn, vi->nid); + DBG_BUGON(1); + return -EFSCORRUPTED; +} + +static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m, + unsigned int initial_lcn) +{ + struct super_block *sb = m->inode->i_sb; + struct erofs_inode *const vi = EROFS_I(m->inode); + struct erofs_map_blocks *const map = m->map; + const unsigned int lclusterbits = vi->z_logical_clusterbits; + unsigned long lcn; + int err; + + DBG_BUGON(m->type != Z_EROFS_LCLUSTER_TYPE_PLAIN && + m->type != Z_EROFS_LCLUSTER_TYPE_HEAD1 && + m->type != Z_EROFS_LCLUSTER_TYPE_HEAD2); + DBG_BUGON(m->type != m->headtype); + + if (m->headtype == Z_EROFS_LCLUSTER_TYPE_PLAIN || + ((m->headtype == Z_EROFS_LCLUSTER_TYPE_HEAD1) && + !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) || + ((m->headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2) && + !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_2))) { + map->m_plen = 1ULL << lclusterbits; + return 0; + } + lcn = m->lcn + 1; + if (m->compressedblks) + goto out; + + err = z_erofs_load_lcluster_from_disk(m, lcn, false); + if (err) + return err; + + /* + * If the 1st NONHEAD lcluster has already been handled initially w/o + * valid compressedblks, which means at least it mustn't be CBLKCNT, or + * an internal implemenatation error is detected. + * + * The following code can also handle it properly anyway, but let's + * BUG_ON in the debugging mode only for developers to notice that. + */ + DBG_BUGON(lcn == initial_lcn && + m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD); + + switch (m->type) { + case Z_EROFS_LCLUSTER_TYPE_PLAIN: + case Z_EROFS_LCLUSTER_TYPE_HEAD1: + case Z_EROFS_LCLUSTER_TYPE_HEAD2: + /* + * if the 1st NONHEAD lcluster is actually PLAIN or HEAD type + * rather than CBLKCNT, it's a 1 lcluster-sized pcluster. 
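+ * (so compressedblks below is set to exactly one lcluster worth
+ * of blocks.)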
+		 */
+		m->compressedblks = 1 << (lclusterbits - sb->s_blocksize_bits);
+		break;
+	case Z_EROFS_LCLUSTER_TYPE_NONHEAD:
+		if (m->delta[0] != 1)
+			goto err_bonus_cblkcnt;
+		if (m->compressedblks)
+			break;
+		fallthrough;
+	default:
+		erofs_err(sb, "cannot find CBLKCNT @ lcn %lu of nid %llu", lcn,
+			  vi->nid);
+		DBG_BUGON(1);
+		return -EFSCORRUPTED;
+	}
+out:
+	map->m_plen = erofs_pos(sb, m->compressedblks);
+	return 0;
+err_bonus_cblkcnt:
+	erofs_err(sb, "bogus CBLKCNT @ lcn %lu of nid %llu", lcn, vi->nid);
+	DBG_BUGON(1);
+	return -EFSCORRUPTED;
+}
+
+static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m)
+{
+	struct inode *inode = m->inode;
+	struct erofs_inode *vi = EROFS_I(inode);
+	struct erofs_map_blocks *map = m->map;
+	unsigned int lclusterbits = vi->z_logical_clusterbits;
+	u64 lcn = m->lcn, headlcn = map->m_la >> lclusterbits;
+	int err;
+
+	do {
+		/* handle the last EOF pcluster (no next HEAD lcluster) */
+		if ((lcn << lclusterbits) >= inode->i_size) {
+			map->m_llen = inode->i_size - map->m_la;
+			return 0;
+		}
+
+		err = z_erofs_load_lcluster_from_disk(m, lcn, true);
+		if (err)
+			return err;
+
+		if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) {
+			DBG_BUGON(!m->delta[1] &&
+				  m->clusterofs != 1 << lclusterbits);
+		} else if (m->type == Z_EROFS_LCLUSTER_TYPE_PLAIN ||
+			   m->type == Z_EROFS_LCLUSTER_TYPE_HEAD1 ||
+			   m->type == Z_EROFS_LCLUSTER_TYPE_HEAD2) {
+			/* go on until the next HEAD lcluster */
+			if (lcn != headlcn)
+				break;
+			m->delta[1] = 1;
+		} else {
+			erofs_err(inode->i_sb, "unknown type %u @ lcn %llu of nid %llu",
+				  m->type, lcn, vi->nid);
+			DBG_BUGON(1);
+			return -EOPNOTSUPP;
+		}
+		lcn += m->delta[1];
+	} while (m->delta[1]);
+
+	map->m_llen = (lcn << lclusterbits) + m->clusterofs - map->m_la;
+	return 0;
+}
+
+static int z_erofs_do_map_blocks(struct inode *inode,
+				 struct erofs_map_blocks *map, int flags)
+{
+	struct erofs_inode *const vi = EROFS_I(inode);
+	bool ztailpacking = vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER;
+	bool fragment = vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER;
+	struct z_erofs_maprecorder m = {
+		.inode = inode,
+		.map = map,
+	};
+	int err = 0;
+	unsigned int lclusterbits, endoff, afmt;
+	unsigned long initial_lcn;
+	unsigned long long ofs, end;
+
+	lclusterbits = vi->z_logical_clusterbits;
+	ofs = flags & EROFS_GET_BLOCKS_FINDTAIL ? inode->i_size - 1 : map->m_la;
+	initial_lcn = ofs >> lclusterbits;
+	endoff = ofs & ((1 << lclusterbits) - 1);
+
+	err = z_erofs_load_lcluster_from_disk(&m, initial_lcn, false);
+	if (err)
+		goto unmap_out;
+
+	if (ztailpacking && (flags & EROFS_GET_BLOCKS_FINDTAIL))
+		vi->z_idataoff = m.nextpackoff;
+
+	map->m_flags = EROFS_MAP_MAPPED | EROFS_MAP_ENCODED;
+	end = (m.lcn + 1ULL) << lclusterbits;
+
+	switch (m.type) {
+	case Z_EROFS_LCLUSTER_TYPE_PLAIN:
+	case Z_EROFS_LCLUSTER_TYPE_HEAD1:
+	case Z_EROFS_LCLUSTER_TYPE_HEAD2:
+		if (endoff >= m.clusterofs) {
+			m.headtype = m.type;
+			map->m_la = (m.lcn << lclusterbits) | m.clusterofs;
+			/*
+			 * For ztailpacking files, in order to inline data more
+			 * effectively, special EOF lclusters are now supported
+			 * which can have three parts at most.
+			 */
+			if (ztailpacking && end > inode->i_size)
+				end = inode->i_size;
+			break;
+		}
+		/* m.lcn should be >= 1 if endoff < m.clusterofs */
+		if (!m.lcn) {
+			erofs_err(inode->i_sb,
+				  "invalid logical cluster 0 at nid %llu",
+				  vi->nid);
+			err = -EFSCORRUPTED;
+			goto unmap_out;
+		}
+		end = (m.lcn << lclusterbits) | m.clusterofs;
+		map->m_flags |= EROFS_MAP_FULL_MAPPED;
+		m.delta[0] = 1;
+		fallthrough;
+	case Z_EROFS_LCLUSTER_TYPE_NONHEAD:
+		/* get the corresponding first chunk */
+		err = z_erofs_extent_lookback(&m, m.delta[0]);
+		if (err)
+			goto unmap_out;
+		break;
+	default:
+		erofs_err(inode->i_sb,
+			  "unknown type %u @ offset %llu of nid %llu",
+			  m.type, ofs, vi->nid);
+		err = -EOPNOTSUPP;
+		goto unmap_out;
+	}
+	if (m.partialref)
+		map->m_flags |= EROFS_MAP_PARTIAL_REF;
+	map->m_llen = end - map->m_la;
+
+	if (flags & EROFS_GET_BLOCKS_FINDTAIL) {
+		vi->z_tailextent_headlcn = m.lcn;
+		/* for non-compact indexes, fragmentoff is 64 bits */
+		if (fragment && vi->datalayout == EROFS_INODE_COMPRESSED_FULL)
+			vi->z_fragmentoff |= (u64)m.pblk << 32;
+	}
+	if (ztailpacking && m.lcn == vi->z_tailextent_headlcn) {
+		map->m_flags |= EROFS_MAP_META;
+		map->m_pa = vi->z_idataoff;
+		map->m_plen = vi->z_idata_size;
+	} else if (fragment && m.lcn == vi->z_tailextent_headlcn) {
+		map->m_flags |= EROFS_MAP_FRAGMENT;
+	} else {
+		map->m_pa = erofs_pos(inode->i_sb, m.pblk);
+		err = z_erofs_get_extent_compressedlen(&m, initial_lcn);
+		if (err)
+			goto unmap_out;
+	}
+
+	if (m.headtype == Z_EROFS_LCLUSTER_TYPE_PLAIN) {
+		if (map->m_llen > map->m_plen) {
+			DBG_BUGON(1);
+			err = -EFSCORRUPTED;
+			goto unmap_out;
+		}
+		afmt = vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER ?
+			Z_EROFS_COMPRESSION_INTERLACED :
+			Z_EROFS_COMPRESSION_SHIFTED;
+	} else {
+		afmt = m.headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2 ?
+			vi->z_algorithmtype[1] : vi->z_algorithmtype[0];
+		if (!(EROFS_I_SB(inode)->available_compr_algs & (1 << afmt))) {
+			erofs_err(inode->i_sb, "inconsistent algorithmtype %u for nid %llu",
+				  afmt, vi->nid);
+			err = -EFSCORRUPTED;
+			goto unmap_out;
+		}
+	}
+	map->m_algorithmformat = afmt;
+
+	if ((flags & EROFS_GET_BLOCKS_FIEMAP) ||
+	    ((flags & EROFS_GET_BLOCKS_READMORE) &&
+	     (map->m_algorithmformat == Z_EROFS_COMPRESSION_LZMA ||
+	      map->m_algorithmformat == Z_EROFS_COMPRESSION_DEFLATE) &&
+	     map->m_llen >= i_blocksize(inode))) {
+		err = z_erofs_get_extent_decompressedlen(&m);
+		if (!err)
+			map->m_flags |= EROFS_MAP_FULL_MAPPED;
+	}
+
+unmap_out:
+	erofs_unmap_metabuf(&m.map->buf);
+	return err;
+}
+
+static int z_erofs_fill_inode_lazy(struct inode *inode)
+{
+	struct erofs_inode *const vi = EROFS_I(inode);
+	struct super_block *const sb = inode->i_sb;
+	int err, headnr;
+	erofs_off_t pos;
+	struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
+	void *kaddr;
+	struct z_erofs_map_header *h;
+
+	if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) {
+		/*
+		 * paired with smp_mb() at the end of the function to ensure
+		 * fields will only be observed after the bit is set.
+		 */
+		smp_mb();
+		return 0;
+	}
+
+	if (wait_on_bit_lock(&vi->flags, EROFS_I_BL_Z_BIT, TASK_KILLABLE))
+		return -ERESTARTSYS;
+
+	err = 0;
+	if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags))
+		goto out_unlock;
+
+	pos = ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, 8);
+	kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(sb, pos), EROFS_KMAP);
+	if (IS_ERR(kaddr)) {
+		err = PTR_ERR(kaddr);
+		goto out_unlock;
+	}
+
+	h = kaddr + erofs_blkoff(sb, pos);
+	/*
+	 * If the highest bit of the 8-byte map header is set, the whole file
+	 * is stored in the packed inode. The remaining bits keep z_fragmentoff.
+	 */
+	if (h->h_clusterbits >> Z_EROFS_FRAGMENT_INODE_BIT) {
+		vi->z_advise = Z_EROFS_ADVISE_FRAGMENT_PCLUSTER;
+		vi->z_fragmentoff = le64_to_cpu(*(__le64 *)h) ^ (1ULL << 63);
+		vi->z_tailextent_headlcn = 0;
+		goto done;
+	}
+	vi->z_advise = le16_to_cpu(h->h_advise);
+	vi->z_algorithmtype[0] = h->h_algorithmtype & 15;
+	vi->z_algorithmtype[1] = h->h_algorithmtype >> 4;
+
+	headnr = 0;
+	if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX ||
+	    vi->z_algorithmtype[++headnr] >= Z_EROFS_COMPRESSION_MAX) {
+		erofs_err(sb, "unknown HEAD%u format %u for nid %llu, please upgrade kernel",
+			  headnr + 1, vi->z_algorithmtype[headnr], vi->nid);
+		err = -EOPNOTSUPP;
+		goto out_put_metabuf;
+	}
+
+	vi->z_logical_clusterbits = sb->s_blocksize_bits + (h->h_clusterbits & 7);
+	if (!erofs_sb_has_big_pcluster(EROFS_SB(sb)) &&
+	    vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 |
+			    Z_EROFS_ADVISE_BIG_PCLUSTER_2)) {
+		erofs_err(sb, "per-inode big pcluster without sb feature for nid %llu",
+			  vi->nid);
+		err = -EFSCORRUPTED;
+		goto out_put_metabuf;
+	}
+	if (vi->datalayout == EROFS_INODE_COMPRESSED_COMPACT &&
+	    !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1) ^
+	    !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_2)) {
+		erofs_err(sb, "big pcluster head1/2 of compact indexes should be consistent for nid %llu",
+			  vi->nid);
+		err = -EFSCORRUPTED;
+		goto out_put_metabuf;
+	}
+
+	if (vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER) {
+		struct erofs_map_blocks map = {
+			.buf = __EROFS_BUF_INITIALIZER
+		};
+
+		vi->z_idata_size = le16_to_cpu(h->h_idata_size);
+		err = z_erofs_do_map_blocks(inode, &map,
+					    EROFS_GET_BLOCKS_FINDTAIL);
+		erofs_put_metabuf(&map.buf);
+
+		if (!map.m_plen ||
+		    erofs_blkoff(sb, map.m_pa) + map.m_plen > sb->s_blocksize) {
+			erofs_err(sb, "invalid tail-packing pclustersize %llu",
+				  map.m_plen);
+			err = -EFSCORRUPTED;
+		}
+		if (err < 0)
+			goto out_put_metabuf;
+	}
+
+	if (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER &&
+	    !(h->h_clusterbits >> Z_EROFS_FRAGMENT_INODE_BIT)) {
+		struct erofs_map_blocks map = {
+			.buf = __EROFS_BUF_INITIALIZER
+		};
+
+		vi->z_fragmentoff = le32_to_cpu(h->h_fragmentoff);
+		err = z_erofs_do_map_blocks(inode, &map,
+					    EROFS_GET_BLOCKS_FINDTAIL);
+		erofs_put_metabuf(&map.buf);
+		if (err < 0)
+			goto out_put_metabuf;
+	}
+done:
+	/* paired with smp_mb() at the beginning of the function */
+	smp_mb();
+	set_bit(EROFS_I_Z_INITED_BIT, &vi->flags);
+out_put_metabuf:
+	erofs_put_metabuf(&buf);
+out_unlock:
+	clear_and_wake_up_bit(EROFS_I_BL_Z_BIT, &vi->flags);
+	return err;
+}
+
+int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map,
+			    int flags)
+{
+	struct erofs_inode *const vi = EROFS_I(inode);
+	int err = 0;
+
+	trace_z_erofs_map_blocks_iter_enter(inode, map, flags);
+
+	/* when trying to read beyond EOF, leave it unmapped */
+	if (map->m_la >= inode->i_size) {
+		map->m_llen = map->m_la + 1 - inode->i_size;
+		map->m_la = inode->i_size;
+		map->m_flags = 0;
+		goto out;
+	}
+
+	err = z_erofs_fill_inode_lazy(inode);
+	if (err)
+		goto out;
+
+	if ((vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER) &&
+	    !vi->z_tailextent_headlcn) {
+		map->m_la = 0;
+		map->m_llen = inode->i_size;
+		map->m_flags = EROFS_MAP_MAPPED | EROFS_MAP_FULL_MAPPED |
+			       EROFS_MAP_FRAGMENT;
+		goto out;
+	}
+
+	err = z_erofs_do_map_blocks(inode, map, flags);
+out:
+	trace_z_erofs_map_blocks_iter_exit(inode, map, flags, err);
+	return err;
+}
+
+static int z_erofs_iomap_begin_report(struct inode *inode, loff_t offset,
+				      loff_t length, unsigned int flags,
+				      struct iomap *iomap, struct iomap *srcmap)
+{
+	int ret;
+	struct erofs_map_blocks map = { .m_la = offset };
+
+	ret = z_erofs_map_blocks_iter(inode, &map, EROFS_GET_BLOCKS_FIEMAP);
+	erofs_put_metabuf(&map.buf);
+	if (ret < 0)
+		return ret;
+
+	iomap->bdev = inode->i_sb->s_bdev;
+	iomap->offset = map.m_la;
+	iomap->length = map.m_llen;
+	if (map.m_flags & EROFS_MAP_MAPPED) {
+		iomap->type = IOMAP_MAPPED;
+		iomap->addr = map.m_flags & EROFS_MAP_FRAGMENT ?
+			      IOMAP_NULL_ADDR : map.m_pa;
+	} else {
+		iomap->type = IOMAP_HOLE;
+		iomap->addr = IOMAP_NULL_ADDR;
+		/*
+		 * There is no strict rule on how to describe extents past
+		 * EOF, but we need to handle it as below; otherwise, iomap
+		 * itself will get into an endless loop past EOF.
+		 *
+		 * Calculate the effective offset by subtracting the extent
+		 * start (map.m_la) from the requested offset, and add it to
+		 * the length. (NB: offset >= map.m_la always holds.)
+		 */
+		if (iomap->offset >= inode->i_size)
+			iomap->length = length + offset - map.m_la;
+	}
+	iomap->flags = 0;
+	return 0;
+}
+
+const struct iomap_ops z_erofs_iomap_report_ops = {
+	.iomap_begin = z_erofs_iomap_begin_report,
+};
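
As a rough illustration of the index placement arithmetic in z_erofs_load_compact_lcluster() above (not part of the patch itself): the sketch below re-derives the compacted_4b_initial / compacted_2b split and the resulting amortizedshift in plain userspace C. The helper name compact_index_pos() and the ebase/totalidx values in main() are hypothetical and chosen only for demonstration; the lcn bounds check and error handling are omitted.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Mirror of the compacted-index placement logic shown above: an initial run
 * of 4-byte-amortized indexes pads the stream out to a 32-byte boundary,
 * then (if COMPACTED_2B is advised) 2-byte-amortized indexes cover a
 * multiple of 16 lclusters, and any remainder falls back to 4 bytes each.
 */
static uint64_t compact_index_pos(uint64_t ebase, unsigned int totalidx,
				  bool compacted_2b_advised, unsigned int lcn,
				  unsigned int *amortizedshift)
{
	unsigned int compacted_4b_initial = (32 - ebase % 32) / 4;
	unsigned int compacted_2b = 0;
	uint64_t pos = ebase;

	if (compacted_4b_initial == 32 / 4)
		compacted_4b_initial = 0;

	if (compacted_2b_advised && compacted_4b_initial < totalidx)
		compacted_2b = (totalidx - compacted_4b_initial) / 16 * 16;

	if (lcn < compacted_4b_initial) {
		*amortizedshift = 2;		/* 4 bytes per index */
	} else {
		pos += compacted_4b_initial * 4;
		lcn -= compacted_4b_initial;
		if (lcn < compacted_2b) {
			*amortizedshift = 1;	/* 2 bytes per index */
		} else {
			pos += compacted_2b * 2;
			lcn -= compacted_2b;
			*amortizedshift = 2;
		}
	}
	return pos + ((uint64_t)lcn << *amortizedshift);
}

int main(void)
{
	/* hypothetical layout: index area starting at byte 4104, 100 lclusters */
	uint64_t ebase = 4104;
	unsigned int totalidx = 100, shift;

	for (unsigned int lcn = 0; lcn < totalidx; lcn += 25) {
		uint64_t pos = compact_index_pos(ebase, totalidx, true,
						 lcn, &shift);

		printf("lcn %3u -> pos %llu (amortizedshift %u)\n",
		       lcn, (unsigned long long)pos, shift);
	}
	return 0;
}

The 16-index rounding of the 2-byte area matches the vcnt == 16 pack size that unpack_compacted_index() handles for amortizedshift == 1, just as the 4-byte area corresponds to its vcnt == 2 case.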