Diffstat (limited to 'fs/ext4')
-rw-r--r-- | fs/ext4/acl.h            |   5
-rw-r--r-- | fs/ext4/dir.c            |   2
-rw-r--r-- | fs/ext4/ext4.h           |  13
-rw-r--r-- | fs/ext4/ext4_jbd2.c      |   2
-rw-r--r-- | fs/ext4/extents.c        |   9
-rw-r--r-- | fs/ext4/extents_status.c |   2
-rw-r--r-- | fs/ext4/fast_commit.c    |   6
-rw-r--r-- | fs/ext4/file.c           |   9
-rw-r--r-- | fs/ext4/fsmap.c          |   8
-rw-r--r-- | fs/ext4/inline.c         |   6
-rw-r--r-- | fs/ext4/inode.c          |  99
-rw-r--r-- | fs/ext4/ioctl.c          |   3
-rw-r--r-- | fs/ext4/mballoc-test.c   | 677
-rw-r--r-- | fs/ext4/mballoc.c        | 361
-rw-r--r-- | fs/ext4/mballoc.h        |  14
-rw-r--r-- | fs/ext4/move_extent.c    |   4
-rw-r--r-- | fs/ext4/namei.c          |  74
-rw-r--r-- | fs/ext4/page-io.c        |   3
-rw-r--r-- | fs/ext4/readpage.c       |   1
-rw-r--r-- | fs/ext4/resize.c         |   2
-rw-r--r-- | fs/ext4/super.c          | 154
-rw-r--r-- | fs/ext4/sysfs.c          | 174
-rw-r--r-- | fs/ext4/xattr.c          | 182
23 files changed, 1273 insertions, 537 deletions
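A large share of the mballoc.c hunks below converts the buddy cache from struct page to struct folio, replacing find_or_create_page()/page_address() with __filemap_get_folio()/folio_address(). The following is only a rough sketch of that lookup pattern, not part of the patch: the helper name and parameters are made up for illustration, while the real callers in the diff are ext4_mb_get_buddy_page_lock() and ext4_mb_load_buddy_gfp().

    #include <linux/err.h>
    #include <linux/pagemap.h>

    /*
     * Illustrative sketch only: look up (or create) the page-cache folio
     * holding a metadata block and return a pointer to the block's data.
     * On failure an ERR_PTR-encoded error is returned, mirroring how the
     * converted mballoc code checks IS_ERR(folio) instead of NULL.
     */
    static void *example_get_locked_block(struct address_space *mapping,
                                          pgoff_t index, unsigned int offset,
                                          gfp_t gfp, struct folio **foliop)
    {
            struct folio *folio;

            folio = __filemap_get_folio(mapping, index,
                                        FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp);
            if (IS_ERR(folio))
                    return folio;   /* ERR_PTR-encoded error */

            /* caller releases with folio_unlock()/folio_put() */
            *foliop = folio;
            return folio_address(folio) + offset;
    }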
diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h index ef4c19e5f5..0c5a79c3b5 100644 --- a/fs/ext4/acl.h +++ b/fs/ext4/acl.h @@ -68,11 +68,6 @@ extern int ext4_init_acl(handle_t *, struct inode *, struct inode *); static inline int ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) { - /* usually, the umask is applied by posix_acl_create(), but if - ext4 ACL support is disabled at compile time, we need to do - it here, because posix_acl_create() will never be called */ - inode->i_mode &= ~current_umask(); - return 0; } #endif /* CONFIG_EXT4_FS_POSIX_ACL */ diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 3985f8c33f..ff4514e462 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -192,7 +192,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) (PAGE_SHIFT - inode->i_blkbits); if (!ra_has_index(&file->f_ra, index)) page_cache_sync_readahead( - sb->s_bdev->bd_inode->i_mapping, + sb->s_bdev->bd_mapping, &file->f_ra, file, index, 1); file->f_ra.prev_pos = (loff_t)index << PAGE_SHIFT; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 4fb938ba3f..efed7f0987 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -213,11 +213,14 @@ enum criteria { #define EXT4_MB_USE_RESERVED 0x2000 /* Do strict check for free blocks while retrying block allocation */ #define EXT4_MB_STRICT_CHECK 0x4000 -/* Large fragment size list lookup succeeded at least once for cr = 0 */ +/* Large fragment size list lookup succeeded at least once for + * CR_POWER2_ALIGNED */ #define EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED 0x8000 -/* Avg fragment size rb tree lookup succeeded at least once for cr = 1 */ +/* Avg fragment size rb tree lookup succeeded at least once for + * CR_GOAL_LEN_FAST */ #define EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED 0x00010000 -/* Avg fragment size rb tree lookup succeeded at least once for cr = 1.5 */ +/* Avg fragment size rb tree lookup succeeded at least once for + * CR_BEST_AVAIL_LEN */ #define EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED 0x00020000 struct ext4_allocation_request { @@ -1344,7 +1347,7 @@ struct ext4_super_block { /*60*/ __le32 s_feature_incompat; /* incompatible feature set */ __le32 s_feature_ro_compat; /* readonly-compatible feature set */ /*68*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */ -/*78*/ char s_volume_name[EXT4_LABEL_MAX]; /* volume name */ +/*78*/ char s_volume_name[EXT4_LABEL_MAX] __nonstring; /* volume name */ /*88*/ char s_last_mounted[64] __nonstring; /* directory where last mounted */ /*C8*/ __le32 s_algorithm_usage_bitmap; /* For compression */ /* @@ -1550,7 +1553,7 @@ struct ext4_sb_info { unsigned long s_commit_interval; u32 s_max_batch_time; u32 s_min_batch_time; - struct bdev_handle *s_journal_bdev_handle; + struct file *s_journal_bdev_file; #ifdef CONFIG_QUOTA /* Names of quota files with journalled quota */ char __rcu *s_qf_names[EXT4_MAXQUOTAS]; diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 5d8055161a..da4a824563 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -206,7 +206,7 @@ static void ext4_journal_abort_handle(const char *caller, unsigned int line, static void ext4_check_bdev_write_error(struct super_block *sb) { - struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping; + struct address_space *mapping = sb->s_bdev->bd_mapping; struct ext4_sb_info *sbi = EXT4_SB(sb); int err; diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 7669d154c0..e067f2dd03 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3402,9 +3402,10 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, struct 
ext4_extent *ex, *abut_ex; ext4_lblk_t ee_block, eof_block; unsigned int ee_len, depth, map_len = map->m_len; - int allocated = 0, max_zeroout = 0; int err = 0; int split_flag = EXT4_EXT_DATA_VALID2; + int allocated = 0; + unsigned int max_zeroout = 0; ext_debug(inode, "logical block %llu, max_blocks %u\n", (unsigned long long)map->m_lblk, map_len); @@ -4111,10 +4112,10 @@ insert_hole: * * Need to be called with * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block - * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem) + * (ie, flags is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem) * * return > 0, number of blocks already mapped/allocated - * if create == 0 and these are pre-allocated blocks + * if flags doesn't contain EXT4_GET_BLOCKS_CREATE and these are pre-allocated blocks * buffer head is unmapped * otherwise blocks are mapped * @@ -4218,7 +4219,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, /* * requested block isn't allocated yet; - * we couldn't try to create block if create flag is zero + * we couldn't try to create block if flags doesn't contain EXT4_GET_BLOCKS_CREATE */ if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { ext4_lblk_t len; diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 4a00e2f019..3a53dbb85e 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -310,6 +310,8 @@ void ext4_es_find_extent_range(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t end, struct extent_status *es) { + es->es_lblk = es->es_len = es->es_pblk = 0; + if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) return; diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 87c009e0c5..d3a67bc06d 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -649,6 +649,12 @@ void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t star if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) return; + if (ext4_has_inline_data(inode)) { + ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, + handle); + return; + } + args.start = start; args.end = end; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 54d6ff2258..c89e434db6 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -844,8 +844,7 @@ static int ext4_sample_last_mounted(struct super_block *sb, if (err) goto out_journal; lock_buffer(sbi->s_sbh); - strncpy(sbi->s_es->s_last_mounted, cp, - sizeof(sbi->s_es->s_last_mounted)); + strtomem_pad(sbi->s_es->s_last_mounted, cp, 0); ext4_superblock_csum_set(sb); unlock_buffer(sbi->s_sbh); ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); @@ -885,8 +884,7 @@ static int ext4_file_open(struct inode *inode, struct file *filp) return ret; } - filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | - FMODE_DIO_PARALLEL_WRITE; + filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT; return dquot_file_open(inode, filp); } @@ -938,7 +936,6 @@ const struct file_operations ext4_file_operations = { .compat_ioctl = ext4_compat_ioctl, #endif .mmap = ext4_file_mmap, - .mmap_supported_flags = MAP_SYNC, .open = ext4_file_open, .release = ext4_release_file, .fsync = ext4_sync_file, @@ -946,6 +943,8 @@ const struct file_operations ext4_file_operations = { .splice_read = ext4_file_splice_read, .splice_write = iter_file_splice_write, .fallocate = ext4_fallocate, + .fop_flags = FOP_MMAP_SYNC | FOP_BUFFER_RASYNC | + FOP_DIO_PARALLEL_WRITE, }; const struct inode_operations ext4_file_inode_operations = { diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c index 11e6f33677..df853c4d3a 100644 
--- a/fs/ext4/fsmap.c +++ b/fs/ext4/fsmap.c @@ -576,9 +576,9 @@ static bool ext4_getfsmap_is_valid_device(struct super_block *sb, if (fm->fmr_device == 0 || fm->fmr_device == UINT_MAX || fm->fmr_device == new_encode_dev(sb->s_bdev->bd_dev)) return true; - if (EXT4_SB(sb)->s_journal_bdev_handle && + if (EXT4_SB(sb)->s_journal_bdev_file && fm->fmr_device == - new_encode_dev(EXT4_SB(sb)->s_journal_bdev_handle->bdev->bd_dev)) + new_encode_dev(file_bdev(EXT4_SB(sb)->s_journal_bdev_file)->bd_dev)) return true; return false; } @@ -648,9 +648,9 @@ int ext4_getfsmap(struct super_block *sb, struct ext4_fsmap_head *head, memset(handlers, 0, sizeof(handlers)); handlers[0].gfd_dev = new_encode_dev(sb->s_bdev->bd_dev); handlers[0].gfd_fn = ext4_getfsmap_datadev; - if (EXT4_SB(sb)->s_journal_bdev_handle) { + if (EXT4_SB(sb)->s_journal_bdev_file) { handlers[1].gfd_dev = new_encode_dev( - EXT4_SB(sb)->s_journal_bdev_handle->bdev->bd_dev); + file_bdev(EXT4_SB(sb)->s_journal_bdev_file)->bd_dev); handlers[1].gfd_fn = ext4_getfsmap_logdev; } diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index d5bd1e3a5d..e7a09a9983 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1410,7 +1410,11 @@ int ext4_inlinedir_to_tree(struct file *dir_file, hinfo->hash = EXT4_DIRENT_HASH(de); hinfo->minor_hash = EXT4_DIRENT_MINOR_HASH(de); } else { - ext4fs_dirhash(dir, de->name, de->name_len, hinfo); + err = ext4fs_dirhash(dir, de->name, de->name_len, hinfo); + if (err) { + ret = err; + goto out; + } } if ((hinfo->hash < start_hash) || ((hinfo->hash == start_hash) && diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 31604907af..238e196338 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -453,6 +453,35 @@ static void ext4_map_blocks_es_recheck(handle_t *handle, } #endif /* ES_AGGRESSIVE_TEST */ +static int ext4_map_query_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map) +{ + unsigned int status; + int retval; + + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + retval = ext4_ext_map_blocks(handle, inode, map, 0); + else + retval = ext4_ind_map_blocks(handle, inode, map, 0); + + if (retval <= 0) + return retval; + + if (unlikely(retval != map->m_len)) { + ext4_warning(inode->i_sb, + "ES len assertion failed for inode " + "%lu: retval %d != map->m_len %d", + inode->i_ino, retval, map->m_len); + WARN_ON(1); + } + + status = map->m_flags & EXT4_MAP_UNWRITTEN ? + EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; + ext4_es_insert_extent(inode, map->m_lblk, map->m_len, + map->m_pblk, status); + return retval; +} + /* * The ext4_map_blocks() function tries to look up the requested blocks, * and returns if the blocks are already mapped. @@ -465,9 +494,10 @@ static void ext4_map_blocks_es_recheck(handle_t *handle, * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping * based files * - * On success, it returns the number of blocks being mapped or allocated. if - * create==0 and the blocks are pre-allocated and unwritten, the resulting @map - * is marked as unwritten. If the create == 1, it will mark @map as mapped. + * On success, it returns the number of blocks being mapped or allocated. + * If flags doesn't contain EXT4_GET_BLOCKS_CREATE the blocks are + * pre-allocated and unwritten, the resulting @map is marked as unwritten. + * If the flags contain EXT4_GET_BLOCKS_CREATE, it will mark @map as mapped. 
* * It returns 0 if plain look up failed (blocks have not been allocated), in * that case, @map is returned as unmapped but we still do fill map->m_len to @@ -589,8 +619,7 @@ found: * Returns if the blocks have already allocated * * Note that if blocks have been preallocated - * ext4_ext_get_block() returns the create = 0 - * with buffer head unmapped. + * ext4_ext_map_blocks() returns with buffer head unmapped */ if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) /* @@ -1708,6 +1737,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, if (ext4_es_is_hole(&es)) goto add_delayed; +found: /* * Delayed extent could be allocated by fallocate. * So we need to check it. @@ -1744,36 +1774,34 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, down_read(&EXT4_I(inode)->i_data_sem); if (ext4_has_inline_data(inode)) retval = 0; - else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - retval = ext4_ext_map_blocks(NULL, inode, map, 0); else - retval = ext4_ind_map_blocks(NULL, inode, map, 0); - if (retval < 0) { - up_read(&EXT4_I(inode)->i_data_sem); + retval = ext4_map_query_blocks(NULL, inode, map); + up_read(&EXT4_I(inode)->i_data_sem); + if (retval) return retval; - } - if (retval > 0) { - unsigned int status; - if (unlikely(retval != map->m_len)) { - ext4_warning(inode->i_sb, - "ES len assertion failed for inode " - "%lu: retval %d != map->m_len %d", - inode->i_ino, retval, map->m_len); - WARN_ON(1); +add_delayed: + down_write(&EXT4_I(inode)->i_data_sem); + /* + * Page fault path (ext4_page_mkwrite does not take i_rwsem) + * and fallocate path (no folio lock) can race. Make sure we + * lookup the extent status tree here again while i_data_sem + * is held in write mode, before inserting a new da entry in + * the extent status tree. + */ + if (ext4_es_lookup_extent(inode, iblock, NULL, &es)) { + if (!ext4_es_is_hole(&es)) { + up_write(&EXT4_I(inode)->i_data_sem); + goto found; + } + } else if (!ext4_has_inline_data(inode)) { + retval = ext4_map_query_blocks(NULL, inode, map); + if (retval) { + up_write(&EXT4_I(inode)->i_data_sem); + return retval; } - - status = map->m_flags & EXT4_MAP_UNWRITTEN ? - EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; - ext4_es_insert_extent(inode, map->m_lblk, map->m_len, - map->m_pblk, status); - up_read(&EXT4_I(inode)->i_data_sem); - return retval; } - up_read(&EXT4_I(inode)->i_data_sem); -add_delayed: - down_write(&EXT4_I(inode)->i_data_sem); retval = ext4_insert_delayed_block(inode, map->m_lblk); up_write(&EXT4_I(inode)->i_data_sem); if (retval) @@ -1865,7 +1893,7 @@ static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio) len = folio_size(folio); if (folio_pos(folio) + len > size && !ext4_verity_in_progress(mpd->inode)) - len = size & ~PAGE_MASK; + len = size & (len - 1); err = ext4_bio_write_folio(&mpd->io_submit, folio, len); if (!err) mpd->wbc->nr_to_write--; @@ -2334,7 +2362,7 @@ static int mpage_journal_page_buffers(handle_t *handle, if (folio_pos(folio) + len > size && !ext4_verity_in_progress(inode)) - len = size - folio_pos(folio); + len = size & (len - 1); return ext4_journal_folio_buffers(handle, folio, len); } @@ -2945,6 +2973,11 @@ static int ext4_da_do_write_end(struct address_space *mapping, bool disksize_changed = false; loff_t new_i_size; + if (unlikely(!folio_buffers(folio))) { + folio_unlock(folio); + folio_put(folio); + return -EIO; + } /* * block_write_end() will mark the inode as dirty with I_DIRTY_PAGES * flag, which all that's needed to trigger page writeback. 
@@ -3527,7 +3560,6 @@ static const struct address_space_operations ext4_aops = { .bmap = ext4_bmap, .invalidate_folio = ext4_invalidate_folio, .release_folio = ext4_release_folio, - .direct_IO = noop_direct_IO, .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_folio = generic_error_remove_folio, @@ -3544,7 +3576,6 @@ static const struct address_space_operations ext4_journalled_aops = { .bmap = ext4_bmap, .invalidate_folio = ext4_journalled_invalidate_folio, .release_folio = ext4_release_folio, - .direct_IO = noop_direct_IO, .migrate_folio = buffer_migrate_folio_norefs, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_folio = generic_error_remove_folio, @@ -3561,7 +3592,6 @@ static const struct address_space_operations ext4_da_aops = { .bmap = ext4_bmap, .invalidate_folio = ext4_invalidate_folio, .release_folio = ext4_release_folio, - .direct_IO = noop_direct_IO, .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_folio = generic_error_remove_folio, @@ -3570,7 +3600,6 @@ static const struct address_space_operations ext4_da_aops = { static const struct address_space_operations ext4_dax_aops = { .writepages = ext4_dax_writepages, - .direct_IO = noop_direct_IO, .dirty_folio = noop_dirty_folio, .bmap = ext4_bmap, .swap_activate = ext4_iomap_swap_activate, diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 7160a71044..e8bf5972dd 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -1150,9 +1150,8 @@ static int ext4_ioctl_getlabel(struct ext4_sb_info *sbi, char __user *user_label */ BUILD_BUG_ON(EXT4_LABEL_MAX >= FSLABEL_MAX); - memset(label, 0, sizeof(label)); lock_buffer(sbi->s_sbh); - strncpy(label, sbi->s_es->s_volume_name, EXT4_LABEL_MAX); + memtostr_pad(label, sbi->s_es->s_volume_name); unlock_buffer(sbi->s_sbh); if (copy_to_user(user_label, label, sizeof(label))) diff --git a/fs/ext4/mballoc-test.c b/fs/ext4/mballoc-test.c index f94901fd38..bb2a223b20 100644 --- a/fs/ext4/mballoc-test.c +++ b/fs/ext4/mballoc-test.c @@ -5,6 +5,7 @@ #include <kunit/test.h> #include <kunit/static_stub.h> +#include <linux/random.h> #include "ext4.h" @@ -20,41 +21,159 @@ struct mbt_ctx { }; struct mbt_ext4_super_block { - struct super_block sb; + struct ext4_super_block es; + struct ext4_sb_info sbi; struct mbt_ctx mbt_ctx; }; -#define MBT_CTX(_sb) (&(container_of((_sb), struct mbt_ext4_super_block, sb)->mbt_ctx)) +#define MBT_SB(_sb) (container_of((_sb)->s_fs_info, struct mbt_ext4_super_block, sbi)) +#define MBT_CTX(_sb) (&MBT_SB(_sb)->mbt_ctx) #define MBT_GRP_CTX(_sb, _group) (&MBT_CTX(_sb)->grp_ctx[_group]) +static struct inode *mbt_alloc_inode(struct super_block *sb) +{ + struct ext4_inode_info *ei; + + ei = kmalloc(sizeof(struct ext4_inode_info), GFP_KERNEL); + if (!ei) + return NULL; + + INIT_LIST_HEAD(&ei->i_orphan); + init_rwsem(&ei->xattr_sem); + init_rwsem(&ei->i_data_sem); + inode_init_once(&ei->vfs_inode); + ext4_fc_init_inode(&ei->vfs_inode); + + return &ei->vfs_inode; +} + +static void mbt_free_inode(struct inode *inode) +{ + kfree(EXT4_I(inode)); +} + +static const struct super_operations mbt_sops = { + .alloc_inode = mbt_alloc_inode, + .free_inode = mbt_free_inode, +}; + +static void mbt_kill_sb(struct super_block *sb) +{ + generic_shutdown_super(sb); +} + +static struct file_system_type mbt_fs_type = { + .name = "mballoc test", + .kill_sb = mbt_kill_sb, +}; + +static int mbt_mb_init(struct super_block *sb) +{ + ext4_fsblk_t block; + int ret; + + /* needed by 
ext4_mb_init->bdev_nonrot(sb->s_bdev) */ + sb->s_bdev = kzalloc(sizeof(*sb->s_bdev), GFP_KERNEL); + if (sb->s_bdev == NULL) + return -ENOMEM; + + sb->s_bdev->bd_queue = kzalloc(sizeof(struct request_queue), GFP_KERNEL); + if (sb->s_bdev->bd_queue == NULL) { + kfree(sb->s_bdev); + return -ENOMEM; + } + + /* + * needed by ext4_mb_init->ext4_mb_init_backend-> sbi->s_buddy_cache = + * new_inode(sb); + */ + INIT_LIST_HEAD(&sb->s_inodes); + sb->s_op = &mbt_sops; + + ret = ext4_mb_init(sb); + if (ret != 0) + goto err_out; + + block = ext4_count_free_clusters(sb); + ret = percpu_counter_init(&EXT4_SB(sb)->s_freeclusters_counter, block, + GFP_KERNEL); + if (ret != 0) + goto err_mb_release; + + ret = percpu_counter_init(&EXT4_SB(sb)->s_dirtyclusters_counter, 0, + GFP_KERNEL); + if (ret != 0) + goto err_freeclusters; + + return 0; + +err_freeclusters: + percpu_counter_destroy(&EXT4_SB(sb)->s_freeclusters_counter); +err_mb_release: + ext4_mb_release(sb); +err_out: + kfree(sb->s_bdev->bd_queue); + kfree(sb->s_bdev); + return ret; +} + +static void mbt_mb_release(struct super_block *sb) +{ + percpu_counter_destroy(&EXT4_SB(sb)->s_dirtyclusters_counter); + percpu_counter_destroy(&EXT4_SB(sb)->s_freeclusters_counter); + ext4_mb_release(sb); + kfree(sb->s_bdev->bd_queue); + kfree(sb->s_bdev); +} + +static int mbt_set(struct super_block *sb, void *data) +{ + return 0; +} + static struct super_block *mbt_ext4_alloc_super_block(void) { - struct ext4_super_block *es = kzalloc(sizeof(*es), GFP_KERNEL); - struct ext4_sb_info *sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); - struct mbt_ext4_super_block *fsb = kzalloc(sizeof(*fsb), GFP_KERNEL); + struct mbt_ext4_super_block *fsb; + struct super_block *sb; + struct ext4_sb_info *sbi; - if (fsb == NULL || sbi == NULL || es == NULL) + fsb = kzalloc(sizeof(*fsb), GFP_KERNEL); + if (fsb == NULL) + return NULL; + + sb = sget(&mbt_fs_type, NULL, mbt_set, 0, NULL); + if (IS_ERR(sb)) goto out; - sbi->s_es = es; - fsb->sb.s_fs_info = sbi; - return &fsb->sb; + sbi = &fsb->sbi; + + sbi->s_blockgroup_lock = + kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); + if (!sbi->s_blockgroup_lock) + goto out_deactivate; + + bgl_lock_init(sbi->s_blockgroup_lock); + sbi->s_es = &fsb->es; + sb->s_fs_info = sbi; + + up_write(&sb->s_umount); + return sb; + +out_deactivate: + deactivate_locked_super(sb); out: kfree(fsb); - kfree(sbi); - kfree(es); return NULL; } static void mbt_ext4_free_super_block(struct super_block *sb) { - struct mbt_ext4_super_block *fsb = - container_of(sb, struct mbt_ext4_super_block, sb); + struct mbt_ext4_super_block *fsb = MBT_SB(sb); struct ext4_sb_info *sbi = EXT4_SB(sb); - kfree(sbi->s_es); - kfree(sbi); + kfree(sbi->s_blockgroup_lock); + deactivate_super(sb); kfree(fsb); } @@ -82,6 +201,9 @@ static void mbt_init_sb_layout(struct super_block *sb, sbi->s_clusters_per_group = layout->blocks_per_group >> layout->cluster_bits; sbi->s_desc_size = layout->desc_size; + sbi->s_desc_per_block_bits = + sb->s_blocksize_bits - (fls(layout->desc_size) - 1); + sbi->s_desc_per_block = 1 << sbi->s_desc_per_block_bits; es->s_first_data_block = cpu_to_le32(0); es->s_blocks_count_lo = cpu_to_le32(layout->blocks_per_group * @@ -91,9 +213,13 @@ static void mbt_init_sb_layout(struct super_block *sb, static int mbt_grp_ctx_init(struct super_block *sb, struct mbt_grp_ctx *grp_ctx) { + ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); + grp_ctx->bitmap_bh.b_data = kzalloc(EXT4_BLOCK_SIZE(sb), GFP_KERNEL); if (grp_ctx->bitmap_bh.b_data == NULL) return -ENOMEM; + 
mb_set_bits(grp_ctx->bitmap_bh.b_data, max, sb->s_blocksize * 8 - max); + ext4_free_group_clusters_set(sb, &grp_ctx->desc, max); return 0; } @@ -112,6 +238,13 @@ static void mbt_ctx_mark_used(struct super_block *sb, ext4_group_t group, mb_set_bits(grp_ctx->bitmap_bh.b_data, start, len); } +static void *mbt_ctx_bitmap(struct super_block *sb, ext4_group_t group) +{ + struct mbt_grp_ctx *grp_ctx = MBT_GRP_CTX(sb, group); + + return grp_ctx->bitmap_bh.b_data; +} + /* called after mbt_init_sb_layout */ static int mbt_ctx_init(struct super_block *sb) { @@ -133,6 +266,8 @@ static int mbt_ctx_init(struct super_block *sb) * block which will fail ext4_sb_block_valid check. */ mb_set_bits(ctx->grp_ctx[0].bitmap_bh.b_data, 0, 1); + ext4_free_group_clusters_set(sb, &ctx->grp_ctx[0].desc, + EXT4_CLUSTERS_PER_GROUP(sb) - 1); return 0; out: @@ -167,6 +302,13 @@ static int ext4_wait_block_bitmap_stub(struct super_block *sb, ext4_group_t block_group, struct buffer_head *bh) { + /* + * real ext4_wait_block_bitmap will set these flags and + * functions like ext4_mb_init_cache will verify the flags. + */ + set_buffer_uptodate(bh); + set_bitmap_uptodate(bh); + set_buffer_verified(bh); return 0; } @@ -232,6 +374,14 @@ static int mbt_kunit_init(struct kunit *test) kunit_activate_static_stub(test, ext4_mb_mark_context, ext4_mb_mark_context_stub); + + /* stub function will be called in mbt_mb_init->ext4_mb_init */ + if (mbt_mb_init(sb) != 0) { + mbt_ctx_release(sb); + mbt_ext4_free_super_block(sb); + return -ENOMEM; + } + return 0; } @@ -239,6 +389,7 @@ static void mbt_kunit_exit(struct kunit *test) { struct super_block *sb = (struct super_block *)test->priv; + mbt_mb_release(sb); mbt_ctx_release(sb); mbt_ext4_free_super_block(sb); } @@ -246,14 +397,19 @@ static void mbt_kunit_exit(struct kunit *test) static void test_new_blocks_simple(struct kunit *test) { struct super_block *sb = (struct super_block *)test->priv; - struct inode inode = { .i_sb = sb, }; + struct inode *inode; struct ext4_allocation_request ar; ext4_group_t i, goal_group = TEST_GOAL_GROUP; int err = 0; ext4_fsblk_t found; struct ext4_sb_info *sbi = EXT4_SB(sb); - ar.inode = &inode; + inode = kunit_kzalloc(test, sizeof(*inode), GFP_KERNEL); + if (!inode) + return; + + inode->i_sb = sb; + ar.inode = inode; /* get block at goal */ ar.goal = ext4_group_first_block_no(sb, goal_group); @@ -297,6 +453,486 @@ static void test_new_blocks_simple(struct kunit *test) "unexpectedly get block when no block is available"); } +#define TEST_RANGE_COUNT 8 + +struct test_range { + ext4_grpblk_t start; + ext4_grpblk_t len; +}; + +static void +mbt_generate_test_ranges(struct super_block *sb, struct test_range *ranges, + int count) +{ + ext4_grpblk_t start, len, max; + int i; + + max = EXT4_CLUSTERS_PER_GROUP(sb) / count; + for (i = 0; i < count; i++) { + start = get_random_u32() % max; + len = get_random_u32() % max; + len = min(len, max - start); + + ranges[i].start = start + i * max; + ranges[i].len = len; + } +} + +static void +validate_free_blocks_simple(struct kunit *test, struct super_block *sb, + ext4_group_t goal_group, ext4_grpblk_t start, + ext4_grpblk_t len) +{ + void *bitmap; + ext4_grpblk_t bit, max = EXT4_CLUSTERS_PER_GROUP(sb); + ext4_group_t i; + + for (i = 0; i < ext4_get_groups_count(sb); i++) { + if (i == goal_group) + continue; + + bitmap = mbt_ctx_bitmap(sb, i); + bit = mb_find_next_zero_bit(bitmap, max, 0); + KUNIT_ASSERT_EQ_MSG(test, bit, max, + "free block on unexpected group %d", i); + } + + bitmap = mbt_ctx_bitmap(sb, goal_group); + bit = 
mb_find_next_zero_bit(bitmap, max, 0); + KUNIT_ASSERT_EQ(test, bit, start); + + bit = mb_find_next_bit(bitmap, max, bit + 1); + KUNIT_ASSERT_EQ(test, bit, start + len); +} + +static void +test_free_blocks_simple_range(struct kunit *test, ext4_group_t goal_group, + ext4_grpblk_t start, ext4_grpblk_t len) +{ + struct super_block *sb = (struct super_block *)test->priv; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct inode *inode; + ext4_fsblk_t block; + + inode = kunit_kzalloc(test, sizeof(*inode), GFP_KERNEL); + if (!inode) + return; + inode->i_sb = sb; + + if (len == 0) + return; + + block = ext4_group_first_block_no(sb, goal_group) + + EXT4_C2B(sbi, start); + ext4_free_blocks_simple(inode, block, len); + validate_free_blocks_simple(test, sb, goal_group, start, len); + mbt_ctx_mark_used(sb, goal_group, 0, EXT4_CLUSTERS_PER_GROUP(sb)); +} + +static void test_free_blocks_simple(struct kunit *test) +{ + struct super_block *sb = (struct super_block *)test->priv; + ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); + ext4_group_t i; + struct test_range ranges[TEST_RANGE_COUNT]; + + for (i = 0; i < ext4_get_groups_count(sb); i++) + mbt_ctx_mark_used(sb, i, 0, max); + + mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT); + for (i = 0; i < TEST_RANGE_COUNT; i++) + test_free_blocks_simple_range(test, TEST_GOAL_GROUP, + ranges[i].start, ranges[i].len); +} + +static void +test_mark_diskspace_used_range(struct kunit *test, + struct ext4_allocation_context *ac, + ext4_grpblk_t start, + ext4_grpblk_t len) +{ + struct super_block *sb = (struct super_block *)test->priv; + int ret; + void *bitmap; + ext4_grpblk_t i, max; + + /* ext4_mb_mark_diskspace_used will BUG if len is 0 */ + if (len == 0) + return; + + ac->ac_b_ex.fe_group = TEST_GOAL_GROUP; + ac->ac_b_ex.fe_start = start; + ac->ac_b_ex.fe_len = len; + + bitmap = mbt_ctx_bitmap(sb, TEST_GOAL_GROUP); + memset(bitmap, 0, sb->s_blocksize); + ret = ext4_mb_mark_diskspace_used(ac, NULL, 0); + KUNIT_ASSERT_EQ(test, ret, 0); + + max = EXT4_CLUSTERS_PER_GROUP(sb); + i = mb_find_next_bit(bitmap, max, 0); + KUNIT_ASSERT_EQ(test, i, start); + i = mb_find_next_zero_bit(bitmap, max, i + 1); + KUNIT_ASSERT_EQ(test, i, start + len); + i = mb_find_next_bit(bitmap, max, i + 1); + KUNIT_ASSERT_EQ(test, max, i); +} + +static void test_mark_diskspace_used(struct kunit *test) +{ + struct super_block *sb = (struct super_block *)test->priv; + struct inode *inode; + struct ext4_allocation_context ac; + struct test_range ranges[TEST_RANGE_COUNT]; + int i; + + mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT); + + inode = kunit_kzalloc(test, sizeof(*inode), GFP_KERNEL); + if (!inode) + return; + inode->i_sb = sb; + + ac.ac_status = AC_STATUS_FOUND; + ac.ac_sb = sb; + ac.ac_inode = inode; + for (i = 0; i < TEST_RANGE_COUNT; i++) + test_mark_diskspace_used_range(test, &ac, ranges[i].start, + ranges[i].len); +} + +static void mbt_generate_buddy(struct super_block *sb, void *buddy, + void *bitmap, struct ext4_group_info *grp) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + uint32_t order, off; + void *bb, *bb_h; + int max; + + memset(buddy, 0xff, sb->s_blocksize); + memset(grp, 0, offsetof(struct ext4_group_info, + bb_counters[MB_NUM_ORDERS(sb)])); + + bb = bitmap; + max = EXT4_CLUSTERS_PER_GROUP(sb); + bb_h = buddy + sbi->s_mb_offsets[1]; + + off = mb_find_next_zero_bit(bb, max, 0); + grp->bb_first_free = off; + while (off < max) { + grp->bb_counters[0]++; + grp->bb_free++; + + if (!(off & 1) && !mb_test_bit(off + 1, bb)) { + grp->bb_free++; + grp->bb_counters[0]--; + 
mb_clear_bit(off >> 1, bb_h); + grp->bb_counters[1]++; + grp->bb_largest_free_order = 1; + off++; + } + + off = mb_find_next_zero_bit(bb, max, off + 1); + } + + for (order = 1; order < MB_NUM_ORDERS(sb) - 1; order++) { + bb = buddy + sbi->s_mb_offsets[order]; + bb_h = buddy + sbi->s_mb_offsets[order + 1]; + max = max >> 1; + off = mb_find_next_zero_bit(bb, max, 0); + + while (off < max) { + if (!(off & 1) && !mb_test_bit(off + 1, bb)) { + mb_set_bits(bb, off, 2); + grp->bb_counters[order] -= 2; + mb_clear_bit(off >> 1, bb_h); + grp->bb_counters[order + 1]++; + grp->bb_largest_free_order = order + 1; + off++; + } + + off = mb_find_next_zero_bit(bb, max, off + 1); + } + } + + max = EXT4_CLUSTERS_PER_GROUP(sb); + off = mb_find_next_zero_bit(bitmap, max, 0); + while (off < max) { + grp->bb_fragments++; + + off = mb_find_next_bit(bitmap, max, off + 1); + if (off + 1 >= max) + break; + + off = mb_find_next_zero_bit(bitmap, max, off + 1); + } +} + +static void +mbt_validate_group_info(struct kunit *test, struct ext4_group_info *grp1, + struct ext4_group_info *grp2) +{ + struct super_block *sb = (struct super_block *)test->priv; + int i; + + KUNIT_ASSERT_EQ(test, grp1->bb_first_free, + grp2->bb_first_free); + KUNIT_ASSERT_EQ(test, grp1->bb_fragments, + grp2->bb_fragments); + KUNIT_ASSERT_EQ(test, grp1->bb_free, grp2->bb_free); + KUNIT_ASSERT_EQ(test, grp1->bb_largest_free_order, + grp2->bb_largest_free_order); + + for (i = 1; i < MB_NUM_ORDERS(sb); i++) { + KUNIT_ASSERT_EQ_MSG(test, grp1->bb_counters[i], + grp2->bb_counters[i], + "bb_counters[%d] diffs, expected %d, generated %d", + i, grp1->bb_counters[i], + grp2->bb_counters[i]); + } +} + +static void +do_test_generate_buddy(struct kunit *test, struct super_block *sb, void *bitmap, + void *mbt_buddy, struct ext4_group_info *mbt_grp, + void *ext4_buddy, struct ext4_group_info *ext4_grp) +{ + int i; + + mbt_generate_buddy(sb, mbt_buddy, bitmap, mbt_grp); + + for (i = 0; i < MB_NUM_ORDERS(sb); i++) + ext4_grp->bb_counters[i] = 0; + /* needed by validation in ext4_mb_generate_buddy */ + ext4_grp->bb_free = mbt_grp->bb_free; + memset(ext4_buddy, 0xff, sb->s_blocksize); + ext4_mb_generate_buddy(sb, ext4_buddy, bitmap, TEST_GOAL_GROUP, + ext4_grp); + + KUNIT_ASSERT_EQ(test, memcmp(mbt_buddy, ext4_buddy, sb->s_blocksize), + 0); + mbt_validate_group_info(test, mbt_grp, ext4_grp); +} + +static void test_mb_generate_buddy(struct kunit *test) +{ + struct super_block *sb = (struct super_block *)test->priv; + void *bitmap, *expected_bb, *generate_bb; + struct ext4_group_info *expected_grp, *generate_grp; + struct test_range ranges[TEST_RANGE_COUNT]; + int i; + + bitmap = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, bitmap); + expected_bb = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, expected_bb); + generate_bb = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, generate_bb); + expected_grp = kunit_kzalloc(test, offsetof(struct ext4_group_info, + bb_counters[MB_NUM_ORDERS(sb)]), GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, expected_grp); + generate_grp = ext4_get_group_info(sb, TEST_GOAL_GROUP); + KUNIT_ASSERT_NOT_NULL(test, generate_grp); + + mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT); + for (i = 0; i < TEST_RANGE_COUNT; i++) { + mb_set_bits(bitmap, ranges[i].start, ranges[i].len); + do_test_generate_buddy(test, sb, bitmap, expected_bb, + expected_grp, generate_bb, generate_grp); + } +} + +static void 
+test_mb_mark_used_range(struct kunit *test, struct ext4_buddy *e4b, + ext4_grpblk_t start, ext4_grpblk_t len, void *bitmap, + void *buddy, struct ext4_group_info *grp) +{ + struct super_block *sb = (struct super_block *)test->priv; + struct ext4_free_extent ex; + int i; + + /* mb_mark_used only accepts non-zero len */ + if (len == 0) + return; + + ex.fe_start = start; + ex.fe_len = len; + ex.fe_group = TEST_GOAL_GROUP; + + ext4_lock_group(sb, TEST_GOAL_GROUP); + mb_mark_used(e4b, &ex); + ext4_unlock_group(sb, TEST_GOAL_GROUP); + + mb_set_bits(bitmap, start, len); + /* bypass bb_free validatoin in ext4_mb_generate_buddy */ + grp->bb_free -= len; + memset(buddy, 0xff, sb->s_blocksize); + for (i = 0; i < MB_NUM_ORDERS(sb); i++) + grp->bb_counters[i] = 0; + ext4_mb_generate_buddy(sb, buddy, bitmap, 0, grp); + + KUNIT_ASSERT_EQ(test, memcmp(buddy, e4b->bd_buddy, sb->s_blocksize), + 0); + mbt_validate_group_info(test, grp, e4b->bd_info); +} + +static void test_mb_mark_used(struct kunit *test) +{ + struct ext4_buddy e4b; + struct super_block *sb = (struct super_block *)test->priv; + void *bitmap, *buddy; + struct ext4_group_info *grp; + int ret; + struct test_range ranges[TEST_RANGE_COUNT]; + int i; + + /* buddy cache assumes that each page contains at least one block */ + if (sb->s_blocksize > PAGE_SIZE) + kunit_skip(test, "blocksize exceeds pagesize"); + + bitmap = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, bitmap); + buddy = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, buddy); + grp = kunit_kzalloc(test, offsetof(struct ext4_group_info, + bb_counters[MB_NUM_ORDERS(sb)]), GFP_KERNEL); + + ret = ext4_mb_load_buddy(sb, TEST_GOAL_GROUP, &e4b); + KUNIT_ASSERT_EQ(test, ret, 0); + + grp->bb_free = EXT4_CLUSTERS_PER_GROUP(sb); + mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT); + for (i = 0; i < TEST_RANGE_COUNT; i++) + test_mb_mark_used_range(test, &e4b, ranges[i].start, + ranges[i].len, bitmap, buddy, grp); + + ext4_mb_unload_buddy(&e4b); +} + +static void +test_mb_free_blocks_range(struct kunit *test, struct ext4_buddy *e4b, + ext4_grpblk_t start, ext4_grpblk_t len, void *bitmap, + void *buddy, struct ext4_group_info *grp) +{ + struct super_block *sb = (struct super_block *)test->priv; + int i; + + /* mb_free_blocks will WARN if len is 0 */ + if (len == 0) + return; + + ext4_lock_group(sb, e4b->bd_group); + mb_free_blocks(NULL, e4b, start, len); + ext4_unlock_group(sb, e4b->bd_group); + + mb_clear_bits(bitmap, start, len); + /* bypass bb_free validatoin in ext4_mb_generate_buddy */ + grp->bb_free += len; + memset(buddy, 0xff, sb->s_blocksize); + for (i = 0; i < MB_NUM_ORDERS(sb); i++) + grp->bb_counters[i] = 0; + ext4_mb_generate_buddy(sb, buddy, bitmap, 0, grp); + + KUNIT_ASSERT_EQ(test, memcmp(buddy, e4b->bd_buddy, sb->s_blocksize), + 0); + mbt_validate_group_info(test, grp, e4b->bd_info); + +} + +static void test_mb_free_blocks(struct kunit *test) +{ + struct ext4_buddy e4b; + struct super_block *sb = (struct super_block *)test->priv; + void *bitmap, *buddy; + struct ext4_group_info *grp; + struct ext4_free_extent ex; + int ret; + int i; + struct test_range ranges[TEST_RANGE_COUNT]; + + /* buddy cache assumes that each page contains at least one block */ + if (sb->s_blocksize > PAGE_SIZE) + kunit_skip(test, "blocksize exceeds pagesize"); + + bitmap = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, bitmap); + buddy = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL); 
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, buddy); + grp = kunit_kzalloc(test, offsetof(struct ext4_group_info, + bb_counters[MB_NUM_ORDERS(sb)]), GFP_KERNEL); + + ret = ext4_mb_load_buddy(sb, TEST_GOAL_GROUP, &e4b); + KUNIT_ASSERT_EQ(test, ret, 0); + + ex.fe_start = 0; + ex.fe_len = EXT4_CLUSTERS_PER_GROUP(sb); + ex.fe_group = TEST_GOAL_GROUP; + + ext4_lock_group(sb, TEST_GOAL_GROUP); + mb_mark_used(&e4b, &ex); + ext4_unlock_group(sb, TEST_GOAL_GROUP); + + grp->bb_free = 0; + memset(bitmap, 0xff, sb->s_blocksize); + + mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT); + for (i = 0; i < TEST_RANGE_COUNT; i++) + test_mb_free_blocks_range(test, &e4b, ranges[i].start, + ranges[i].len, bitmap, buddy, grp); + + ext4_mb_unload_buddy(&e4b); +} + +#define COUNT_FOR_ESTIMATE 100000 +static void test_mb_mark_used_cost(struct kunit *test) +{ + struct ext4_buddy e4b; + struct super_block *sb = (struct super_block *)test->priv; + struct ext4_free_extent ex; + int ret; + struct test_range ranges[TEST_RANGE_COUNT]; + int i, j; + unsigned long start, end, all = 0; + + /* buddy cache assumes that each page contains at least one block */ + if (sb->s_blocksize > PAGE_SIZE) + kunit_skip(test, "blocksize exceeds pagesize"); + + ret = ext4_mb_load_buddy(sb, TEST_GOAL_GROUP, &e4b); + KUNIT_ASSERT_EQ(test, ret, 0); + + ex.fe_group = TEST_GOAL_GROUP; + for (j = 0; j < COUNT_FOR_ESTIMATE; j++) { + mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT); + start = jiffies; + for (i = 0; i < TEST_RANGE_COUNT; i++) { + if (ranges[i].len == 0) + continue; + + ex.fe_start = ranges[i].start; + ex.fe_len = ranges[i].len; + ext4_lock_group(sb, TEST_GOAL_GROUP); + mb_mark_used(&e4b, &ex); + ext4_unlock_group(sb, TEST_GOAL_GROUP); + } + end = jiffies; + all += (end - start); + + for (i = 0; i < TEST_RANGE_COUNT; i++) { + if (ranges[i].len == 0) + continue; + + ext4_lock_group(sb, TEST_GOAL_GROUP); + mb_free_blocks(NULL, &e4b, ranges[i].start, + ranges[i].len); + ext4_unlock_group(sb, TEST_GOAL_GROUP); + } + } + + kunit_info(test, "costed jiffies %lu\n", all); + ext4_mb_unload_buddy(&e4b); +} + static const struct mbt_ext4_block_layout mbt_test_layouts[] = { { .blocksize_bits = 10, @@ -334,6 +970,13 @@ KUNIT_ARRAY_PARAM(mbt_layouts, mbt_test_layouts, mbt_show_layout); static struct kunit_case mbt_test_cases[] = { KUNIT_CASE_PARAM(test_new_blocks_simple, mbt_layouts_gen_params), + KUNIT_CASE_PARAM(test_free_blocks_simple, mbt_layouts_gen_params), + KUNIT_CASE_PARAM(test_mb_generate_buddy, mbt_layouts_gen_params), + KUNIT_CASE_PARAM(test_mb_mark_used, mbt_layouts_gen_params), + KUNIT_CASE_PARAM(test_mb_free_blocks, mbt_layouts_gen_params), + KUNIT_CASE_PARAM(test_mark_diskspace_used, mbt_layouts_gen_params), + KUNIT_CASE_PARAM_ATTR(test_mb_mark_used_cost, mbt_layouts_gen_params, + { .speed = KUNIT_SPEED_SLOW }), {} }; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 00b0839b53..9dda9cd68a 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -831,6 +831,8 @@ static int mb_avg_fragment_size_order(struct super_block *sb, ext4_grpblk_t len) return 0; if (order == MB_NUM_ORDERS(sb)) order--; + if (WARN_ON_ONCE(order > MB_NUM_ORDERS(sb))) + order = MB_NUM_ORDERS(sb) - 1; return order; } @@ -1008,6 +1010,8 @@ static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context * goal length. 
*/ order = fls(ac->ac_g_ex.fe_len) - 1; + if (WARN_ON_ONCE(order - 1 > MB_NUM_ORDERS(ac->ac_sb))) + order = MB_NUM_ORDERS(ac->ac_sb); min_order = order - sbi->s_mb_best_avail_max_trim_order; if (min_order < 0) min_order = 0; @@ -1076,23 +1080,11 @@ static inline int should_optimize_scan(struct ext4_allocation_context *ac) } /* - * Return next linear group for allocation. If linear traversal should not be - * performed, this function just returns the same group + * Return next linear group for allocation. */ static ext4_group_t -next_linear_group(struct ext4_allocation_context *ac, ext4_group_t group, - ext4_group_t ngroups) +next_linear_group(ext4_group_t group, ext4_group_t ngroups) { - if (!should_optimize_scan(ac)) - goto inc_and_return; - - if (ac->ac_groups_linear_remaining) { - ac->ac_groups_linear_remaining--; - goto inc_and_return; - } - - return group; -inc_and_return: /* * Artificially restricted ngroups for non-extent * files makes group > ngroups possible on first loop. @@ -1118,8 +1110,19 @@ static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac, { *new_cr = ac->ac_criteria; - if (!should_optimize_scan(ac) || ac->ac_groups_linear_remaining) { - *group = next_linear_group(ac, *group, ngroups); + if (!should_optimize_scan(ac)) { + *group = next_linear_group(*group, ngroups); + return; + } + + /* + * Optimized scanning can return non adjacent groups which can cause + * seek overhead for rotational disks. So try few linear groups before + * trying optimized scan. + */ + if (ac->ac_groups_linear_remaining) { + *group = next_linear_group(*group, ngroups); + ac->ac_groups_linear_remaining--; return; } @@ -1131,8 +1134,9 @@ static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac, ext4_mb_choose_next_group_best_avail(ac, new_cr, group); } else { /* - * TODO: For CR=2, we can arrange groups in an rb tree sorted by - * bb_free. But until that happens, we should never come here. + * TODO: For CR_GOAL_LEN_SLOW, we can arrange groups in an + * rb tree sorted by bb_free. But until that happens, we should + * never come here. */ WARN_ON(1); } @@ -1270,7 +1274,7 @@ static void mb_regenerate_buddy(struct ext4_buddy *e4b) * for this page; do not hold this lock when calling this routine! 
*/ -static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) +static int ext4_mb_init_cache(struct folio *folio, char *incore, gfp_t gfp) { ext4_group_t ngroups; unsigned int blocksize; @@ -1288,13 +1292,13 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) char *bitmap; struct ext4_group_info *grinfo; - inode = page->mapping->host; + inode = folio->mapping->host; sb = inode->i_sb; ngroups = ext4_get_groups_count(sb); blocksize = i_blocksize(inode); blocks_per_page = PAGE_SIZE / blocksize; - mb_debug(sb, "init page %lu\n", page->index); + mb_debug(sb, "init folio %lu\n", folio->index); groups_per_page = blocks_per_page >> 1; if (groups_per_page == 0) @@ -1309,9 +1313,9 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) } else bh = &bhs; - first_group = page->index * blocks_per_page / 2; + first_group = folio->index * blocks_per_page / 2; - /* read all groups the page covers into the cache */ + /* read all groups the folio covers into the cache */ for (i = 0, group = first_group; i < groups_per_page; i++, group++) { if (group >= ngroups) break; @@ -1322,10 +1326,11 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) /* * If page is uptodate then we came here after online resize * which added some new uninitialized group info structs, so - * we must skip all initialized uptodate buddies on the page, + * we must skip all initialized uptodate buddies on the folio, * which may be currently in use by an allocating task. */ - if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) { + if (folio_test_uptodate(folio) && + !EXT4_MB_GRP_NEED_INIT(grinfo)) { bh[i] = NULL; continue; } @@ -1349,7 +1354,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) err = err2; } - first_block = page->index * blocks_per_page; + first_block = folio->index * blocks_per_page; for (i = 0; i < blocks_per_page; i++) { group = (first_block + i) >> 1; if (group >= ngroups) @@ -1370,7 +1375,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) * above * */ - data = page_address(page) + (i * blocksize); + data = folio_address(folio) + (i * blocksize); bitmap = bh[group - first_group]->b_data; /* @@ -1385,8 +1390,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) if ((first_block + i) & 1) { /* this is block of buddy */ BUG_ON(incore == NULL); - mb_debug(sb, "put buddy for group %u in page %lu/%x\n", - group, page->index, i * blocksize); + mb_debug(sb, "put buddy for group %u in folio %lu/%x\n", + group, folio->index, i * blocksize); trace_ext4_mb_buddy_bitmap_load(sb, group); grinfo->bb_fragments = 0; memset(grinfo->bb_counters, 0, @@ -1404,8 +1409,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) } else { /* this is block of bitmap */ BUG_ON(incore != NULL); - mb_debug(sb, "put bitmap for group %u in page %lu/%x\n", - group, page->index, i * blocksize); + mb_debug(sb, "put bitmap for group %u in folio %lu/%x\n", + group, folio->index, i * blocksize); trace_ext4_mb_bitmap_load(sb, group); /* see comments in ext4_mb_put_pa() */ @@ -1423,7 +1428,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) incore = data; } } - SetPageUptodate(page); + folio_mark_uptodate(folio); out: if (bh) { @@ -1439,7 +1444,7 @@ out: * Lock the buddy and bitmap pages. This make sure other parallel init_group * on the same buddy page doesn't happen whild holding the buddy page lock. 
* Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap - * are on the same page e4b->bd_buddy_page is NULL and return value is 0. + * are on the same page e4b->bd_buddy_folio is NULL and return value is 0. */ static int ext4_mb_get_buddy_page_lock(struct super_block *sb, ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp) @@ -1447,10 +1452,10 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb, struct inode *inode = EXT4_SB(sb)->s_buddy_cache; int block, pnum, poff; int blocks_per_page; - struct page *page; + struct folio *folio; - e4b->bd_buddy_page = NULL; - e4b->bd_bitmap_page = NULL; + e4b->bd_buddy_folio = NULL; + e4b->bd_bitmap_folio = NULL; blocks_per_page = PAGE_SIZE / sb->s_blocksize; /* @@ -1461,12 +1466,13 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb, block = group * 2; pnum = block / blocks_per_page; poff = block % blocks_per_page; - page = find_or_create_page(inode->i_mapping, pnum, gfp); - if (!page) - return -ENOMEM; - BUG_ON(page->mapping != inode->i_mapping); - e4b->bd_bitmap_page = page; - e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); + folio = __filemap_get_folio(inode->i_mapping, pnum, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); + if (IS_ERR(folio)) + return PTR_ERR(folio); + BUG_ON(folio->mapping != inode->i_mapping); + e4b->bd_bitmap_folio = folio; + e4b->bd_bitmap = folio_address(folio) + (poff * sb->s_blocksize); if (blocks_per_page >= 2) { /* buddy and bitmap are on the same page */ @@ -1474,23 +1480,24 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb, } /* blocks_per_page == 1, hence we need another page for the buddy */ - page = find_or_create_page(inode->i_mapping, block + 1, gfp); - if (!page) - return -ENOMEM; - BUG_ON(page->mapping != inode->i_mapping); - e4b->bd_buddy_page = page; + folio = __filemap_get_folio(inode->i_mapping, block + 1, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); + if (IS_ERR(folio)) + return PTR_ERR(folio); + BUG_ON(folio->mapping != inode->i_mapping); + e4b->bd_buddy_folio = folio; return 0; } static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b) { - if (e4b->bd_bitmap_page) { - unlock_page(e4b->bd_bitmap_page); - put_page(e4b->bd_bitmap_page); + if (e4b->bd_bitmap_folio) { + folio_unlock(e4b->bd_bitmap_folio); + folio_put(e4b->bd_bitmap_folio); } - if (e4b->bd_buddy_page) { - unlock_page(e4b->bd_buddy_page); - put_page(e4b->bd_buddy_page); + if (e4b->bd_buddy_folio) { + folio_unlock(e4b->bd_buddy_folio); + folio_put(e4b->bd_buddy_folio); } } @@ -1505,7 +1512,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp) struct ext4_group_info *this_grp; struct ext4_buddy e4b; - struct page *page; + struct folio *folio; int ret = 0; might_sleep(); @@ -1532,16 +1539,16 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp) goto err; } - page = e4b.bd_bitmap_page; - ret = ext4_mb_init_cache(page, NULL, gfp); + folio = e4b.bd_bitmap_folio; + ret = ext4_mb_init_cache(folio, NULL, gfp); if (ret) goto err; - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { ret = -EIO; goto err; } - if (e4b.bd_buddy_page == NULL) { + if (e4b.bd_buddy_folio == NULL) { /* * If both the bitmap and buddy are in * the same page we don't need to force @@ -1551,11 +1558,11 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp) goto err; } /* init buddy cache */ - page = e4b.bd_buddy_page; - ret = ext4_mb_init_cache(page, e4b.bd_bitmap, gfp); + folio = e4b.bd_buddy_folio; + ret = 
ext4_mb_init_cache(folio, e4b.bd_bitmap, gfp); if (ret) goto err; - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { ret = -EIO; goto err; } @@ -1577,7 +1584,7 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, int block; int pnum; int poff; - struct page *page; + struct folio *folio; int ret; struct ext4_group_info *grp; struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -1595,8 +1602,8 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, e4b->bd_info = grp; e4b->bd_sb = sb; e4b->bd_group = group; - e4b->bd_buddy_page = NULL; - e4b->bd_bitmap_page = NULL; + e4b->bd_buddy_folio = NULL; + e4b->bd_bitmap_folio = NULL; if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { /* @@ -1617,102 +1624,103 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, pnum = block / blocks_per_page; poff = block % blocks_per_page; - /* we could use find_or_create_page(), but it locks page - * what we'd like to avoid in fast path ... */ - page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED); - if (page == NULL || !PageUptodate(page)) { - if (page) + /* Avoid locking the folio in the fast path ... */ + folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_ACCESSED, 0); + if (IS_ERR(folio) || !folio_test_uptodate(folio)) { + if (!IS_ERR(folio)) /* - * drop the page reference and try - * to get the page with lock. If we + * drop the folio reference and try + * to get the folio with lock. If we * are not uptodate that implies - * somebody just created the page but - * is yet to initialize the same. So + * somebody just created the folio but + * is yet to initialize it. So * wait for it to initialize. */ - put_page(page); - page = find_or_create_page(inode->i_mapping, pnum, gfp); - if (page) { - if (WARN_RATELIMIT(page->mapping != inode->i_mapping, - "ext4: bitmap's paging->mapping != inode->i_mapping\n")) { + folio_put(folio); + folio = __filemap_get_folio(inode->i_mapping, pnum, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); + if (!IS_ERR(folio)) { + if (WARN_RATELIMIT(folio->mapping != inode->i_mapping, + "ext4: bitmap's mapping != inode->i_mapping\n")) { /* should never happen */ - unlock_page(page); + folio_unlock(folio); ret = -EINVAL; goto err; } - if (!PageUptodate(page)) { - ret = ext4_mb_init_cache(page, NULL, gfp); + if (!folio_test_uptodate(folio)) { + ret = ext4_mb_init_cache(folio, NULL, gfp); if (ret) { - unlock_page(page); + folio_unlock(folio); goto err; } - mb_cmp_bitmaps(e4b, page_address(page) + + mb_cmp_bitmaps(e4b, folio_address(folio) + (poff * sb->s_blocksize)); } - unlock_page(page); + folio_unlock(folio); } } - if (page == NULL) { - ret = -ENOMEM; + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); goto err; } - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { ret = -EIO; goto err; } - /* Pages marked accessed already */ - e4b->bd_bitmap_page = page; - e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); + /* Folios marked accessed already */ + e4b->bd_bitmap_folio = folio; + e4b->bd_bitmap = folio_address(folio) + (poff * sb->s_blocksize); block++; pnum = block / blocks_per_page; poff = block % blocks_per_page; - page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED); - if (page == NULL || !PageUptodate(page)) { - if (page) - put_page(page); - page = find_or_create_page(inode->i_mapping, pnum, gfp); - if (page) { - if (WARN_RATELIMIT(page->mapping != inode->i_mapping, - "ext4: buddy bitmap's page->mapping != inode->i_mapping\n")) { + folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_ACCESSED, 
0); + if (IS_ERR(folio) || !folio_test_uptodate(folio)) { + if (!IS_ERR(folio)) + folio_put(folio); + folio = __filemap_get_folio(inode->i_mapping, pnum, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); + if (!IS_ERR(folio)) { + if (WARN_RATELIMIT(folio->mapping != inode->i_mapping, + "ext4: buddy bitmap's mapping != inode->i_mapping\n")) { /* should never happen */ - unlock_page(page); + folio_unlock(folio); ret = -EINVAL; goto err; } - if (!PageUptodate(page)) { - ret = ext4_mb_init_cache(page, e4b->bd_bitmap, + if (!folio_test_uptodate(folio)) { + ret = ext4_mb_init_cache(folio, e4b->bd_bitmap, gfp); if (ret) { - unlock_page(page); + folio_unlock(folio); goto err; } } - unlock_page(page); + folio_unlock(folio); } } - if (page == NULL) { - ret = -ENOMEM; + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); goto err; } - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { ret = -EIO; goto err; } - /* Pages marked accessed already */ - e4b->bd_buddy_page = page; - e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); + /* Folios marked accessed already */ + e4b->bd_buddy_folio = folio; + e4b->bd_buddy = folio_address(folio) + (poff * sb->s_blocksize); return 0; err: - if (page) - put_page(page); - if (e4b->bd_bitmap_page) - put_page(e4b->bd_bitmap_page); + if (!IS_ERR_OR_NULL(folio)) + folio_put(folio); + if (e4b->bd_bitmap_folio) + folio_put(e4b->bd_bitmap_folio); e4b->bd_buddy = NULL; e4b->bd_bitmap = NULL; @@ -1727,10 +1735,10 @@ static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, static void ext4_mb_unload_buddy(struct ext4_buddy *e4b) { - if (e4b->bd_bitmap_page) - put_page(e4b->bd_bitmap_page); - if (e4b->bd_buddy_page) - put_page(e4b->bd_buddy_page); + if (e4b->bd_bitmap_folio) + folio_put(e4b->bd_bitmap_folio); + if (e4b->bd_buddy_folio) + folio_put(e4b->bd_buddy_folio); } @@ -2040,13 +2048,12 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) int ord; int mlen = 0; int max = 0; - int cur; int start = ex->fe_start; int len = ex->fe_len; unsigned ret = 0; int len0 = len; void *buddy; - bool split = false; + int ord_start, ord_end; BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3)); BUG_ON(e4b->bd_group != ex->fe_group); @@ -2071,16 +2078,12 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) /* let's maintain buddy itself */ while (len) { - if (!split) - ord = mb_find_order_for_block(e4b, start); + ord = mb_find_order_for_block(e4b, start); if (((start >> ord) << ord) == start && len >= (1 << ord)) { /* the whole chunk may be allocated at once! 
*/ mlen = 1 << ord; - if (!split) - buddy = mb_find_buddy(e4b, ord, &max); - else - split = false; + buddy = mb_find_buddy(e4b, ord, &max); BUG_ON((start >> ord) >= max); mb_set_bit(start >> ord, buddy); e4b->bd_info->bb_counters[ord]--; @@ -2094,20 +2097,29 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) if (ret == 0) ret = len | (ord << 16); - /* we have to split large buddy */ BUG_ON(ord <= 0); buddy = mb_find_buddy(e4b, ord, &max); mb_set_bit(start >> ord, buddy); e4b->bd_info->bb_counters[ord]--; - ord--; - cur = (start >> ord) & ~1U; - buddy = mb_find_buddy(e4b, ord, &max); - mb_clear_bit(cur, buddy); - mb_clear_bit(cur + 1, buddy); - e4b->bd_info->bb_counters[ord]++; - e4b->bd_info->bb_counters[ord]++; - split = true; + ord_start = (start >> ord) << ord; + ord_end = ord_start + (1 << ord); + /* first chunk */ + if (start > ord_start) + ext4_mb_mark_free_simple(e4b->bd_sb, e4b->bd_buddy, + ord_start, start - ord_start, + e4b->bd_info); + + /* last chunk */ + if (start + len < ord_end) { + ext4_mb_mark_free_simple(e4b->bd_sb, e4b->bd_buddy, + start + len, + ord_end - (start + len), + e4b->bd_info); + break; + } + len = start + len - ord_end; + start = ord_end; } mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); @@ -2149,10 +2161,10 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac, * double allocate blocks. The reference is dropped * in ext4_mb_release_context */ - ac->ac_bitmap_page = e4b->bd_bitmap_page; - get_page(ac->ac_bitmap_page); - ac->ac_buddy_page = e4b->bd_buddy_page; - get_page(ac->ac_buddy_page); + ac->ac_bitmap_folio = e4b->bd_bitmap_folio; + folio_get(ac->ac_bitmap_folio); + ac->ac_buddy_folio = e4b->bd_buddy_folio; + folio_get(ac->ac_buddy_folio); /* store last allocated for subsequent stream allocation */ if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { spin_lock(&sbi->s_md_lock); @@ -2675,7 +2687,7 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, int ret; /* - * cr=CR_POWER2_ALIGNED/CR_GOAL_LEN_FAST is a very optimistic + * CR_POWER2_ALIGNED/CR_GOAL_LEN_FAST is a very optimistic * search to find large good chunks almost for free. If buddy * data is not ready, then this optimization makes no sense. 
But * we never skip the first block group in a flex_bg, since this @@ -2856,6 +2868,7 @@ repeat: group = ac->ac_g_ex.fe_group; ac->ac_groups_linear_remaining = sbi->s_mb_max_linear_groups; prefetch_grp = group; + nr = 0; for (i = 0, new_cr = cr; i < ngroups; i++, ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups)) { @@ -3015,8 +3028,8 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) { struct super_block *sb = pde_data(file_inode(seq->file)); ext4_group_t group = (ext4_group_t) ((unsigned long) v); - int i; - int err, buddy_loaded = 0; + int i, err; + char nbuf[16]; struct ext4_buddy e4b; struct ext4_group_info *grinfo; unsigned char blocksize_bits = min_t(unsigned char, @@ -3043,17 +3056,17 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) { err = ext4_mb_load_buddy(sb, group, &e4b); if (err) { - seq_printf(seq, "#%-5u: I/O error\n", group); + seq_printf(seq, "#%-5u: %s\n", group, ext4_decode_error(NULL, err, nbuf)); return 0; } - buddy_loaded = 1; + ext4_mb_unload_buddy(&e4b); } + /* + * We care only about free space counters in the group info and + * these are safe to access even after the buddy has been unloaded + */ memcpy(&sg, grinfo, i); - - if (buddy_loaded) - ext4_mb_unload_buddy(&e4b); - seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free, sg.info.bb_fragments, sg.info.bb_first_free); for (i = 0; i <= 13; i++) @@ -3186,7 +3199,6 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) } static void *ext4_mb_seq_structs_summary_start(struct seq_file *seq, loff_t *pos) -__acquires(&EXT4_SB(sb)->s_mb_rb_lock) { struct super_block *sb = pde_data(file_inode(seq->file)); unsigned long position; @@ -3440,10 +3452,11 @@ static int ext4_mb_init_backend(struct super_block *sb) } if (sbi->s_mb_prefetch > ext4_get_groups_count(sb)) sbi->s_mb_prefetch = ext4_get_groups_count(sb); - /* now many real IOs to prefetch within a single allocation at cr=0 - * given cr=0 is an CPU-related optimization we shouldn't try to - * load too many groups, at some point we should start to use what - * we've got in memory. + /* + * now many real IOs to prefetch within a single allocation at + * CR_POWER2_ALIGNED. Given CR_POWER2_ALIGNED is an CPU-related + * optimization we shouldn't try to load too many groups, at some point + * we should start to use what we've got in memory. 
* with an average random access time 5ms, it'd take a second to get * 200 groups (* N with flex_bg), so let's make this limit 4 */ @@ -3832,8 +3845,7 @@ void ext4_mb_release(struct super_block *sb) } static inline int ext4_issue_discard(struct super_block *sb, - ext4_group_t block_group, ext4_grpblk_t cluster, int count, - struct bio **biop) + ext4_group_t block_group, ext4_grpblk_t cluster, int count) { ext4_fsblk_t discard_block; @@ -3842,13 +3854,8 @@ static inline int ext4_issue_discard(struct super_block *sb, count = EXT4_C2B(EXT4_SB(sb), count); trace_ext4_discard_blocks(sb, (unsigned long long) discard_block, count); - if (biop) { - return __blkdev_issue_discard(sb->s_bdev, - (sector_t)discard_block << (sb->s_blocksize_bits - 9), - (sector_t)count << (sb->s_blocksize_bits - 9), - GFP_NOFS, biop); - } else - return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0); + + return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0); } static void ext4_free_data_in_buddy(struct super_block *sb, @@ -3890,8 +3897,8 @@ static void ext4_free_data_in_buddy(struct super_block *sb, /* No more items in the per group rb tree * balance refcounts from ext4_mb_free_metadata() */ - put_page(e4b.bd_buddy_page); - put_page(e4b.bd_bitmap_page); + folio_put(e4b.bd_buddy_folio); + folio_put(e4b.bd_bitmap_folio); } ext4_unlock_group(sb, entry->efd_group); ext4_mb_unload_buddy(&e4b); @@ -5995,10 +6002,10 @@ static void ext4_mb_release_context(struct ext4_allocation_context *ac) ext4_mb_put_pa(ac, ac->ac_sb, pa); } - if (ac->ac_bitmap_page) - put_page(ac->ac_bitmap_page); - if (ac->ac_buddy_page) - put_page(ac->ac_buddy_page); + if (ac->ac_bitmap_folio) + folio_put(ac->ac_bitmap_folio); + if (ac->ac_buddy_folio) + folio_put(ac->ac_buddy_folio); if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) mutex_unlock(&ac->ac_lg->lg_mutex); ext4_mb_collect_stats(ac); @@ -6314,8 +6321,8 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, struct rb_node *parent = NULL, *new_node; BUG_ON(!ext4_handle_valid(handle)); - BUG_ON(e4b->bd_bitmap_page == NULL); - BUG_ON(e4b->bd_buddy_page == NULL); + BUG_ON(e4b->bd_bitmap_folio == NULL); + BUG_ON(e4b->bd_buddy_folio == NULL); new_node = &new_entry->efd_node; cluster = new_entry->efd_start_cluster; @@ -6326,8 +6333,8 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, * otherwise we'll refresh it from * on-disk bitmap and lose not-yet-available * blocks */ - get_page(e4b->bd_buddy_page); - get_page(e4b->bd_bitmap_page); + folio_get(e4b->bd_buddy_folio); + folio_get(e4b->bd_bitmap_folio); } while (*n) { parent = *n; @@ -6496,8 +6503,14 @@ do_more: } else { if (test_opt(sb, DISCARD)) { err = ext4_issue_discard(sb, block_group, bit, - count_clusters, NULL); - if (err && err != -EOPNOTSUPP) + count_clusters); + /* + * Ignore EOPNOTSUPP error. This is consistent with + * what happens when using journal. 
+ */ + if (err == -EOPNOTSUPP) + err = 0; + if (err) ext4_msg(sb, KERN_WARNING, "discard request in" " group:%u block:%d count:%lu failed" " with %d", block_group, bit, count, @@ -6747,7 +6760,7 @@ __acquires(bitlock) */ mb_mark_used(e4b, &ex); ext4_unlock_group(sb, group); - ret = ext4_issue_discard(sb, group, start, count, NULL); + ret = ext4_issue_discard(sb, group, start, count); ext4_lock_group(sb, group); mb_free_blocks(NULL, e4b, start, ex.fe_len); return ret; diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 56938532b4..d8553f1498 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -187,14 +187,14 @@ struct ext4_allocation_context { struct ext4_free_extent ac_f_ex; /* - * goal len can change in CR1.5, so save the original len. This is - * used while adjusting the PA window and for accounting. + * goal len can change in CR_BEST_AVAIL_LEN, so save the original len. + * This is used while adjusting the PA window and for accounting. */ ext4_grpblk_t ac_orig_goal_len; __u32 ac_flags; /* allocation hints */ + __u32 ac_groups_linear_remaining; __u16 ac_groups_scanned; - __u16 ac_groups_linear_remaining; __u16 ac_found; __u16 ac_cX_found[EXT4_MB_NUM_CRS]; __u16 ac_tail; @@ -204,8 +204,8 @@ struct ext4_allocation_context { __u8 ac_2order; /* if request is to allocate 2^N blocks and * N > 0, the field stores N, otherwise 0 */ __u8 ac_op; /* operation, for history only */ - struct page *ac_bitmap_page; - struct page *ac_buddy_page; + struct folio *ac_bitmap_folio; + struct folio *ac_buddy_folio; struct ext4_prealloc_space *ac_pa; struct ext4_locality_group *ac_lg; }; @@ -215,9 +215,9 @@ struct ext4_allocation_context { #define AC_STATUS_BREAK 3 struct ext4_buddy { - struct page *bd_buddy_page; + struct folio *bd_buddy_folio; void *bd_buddy; - struct page *bd_bitmap_page; + struct folio *bd_bitmap_folio; void *bd_bitmap; struct ext4_group_info *bd_info; struct super_block *bd_sb; diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 7cd4afa4de..204f53b236 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -199,10 +199,8 @@ mext_page_mkuptodate(struct folio *folio, unsigned from, unsigned to) continue; if (!buffer_mapped(bh)) { err = ext4_get_block(inode, block, bh, 0); - if (err) { - folio_set_error(folio); + if (err) return err; - } if (!buffer_mapped(bh)) { folio_zero_range(folio, block_start, blocksize); set_buffer_uptodate(bh); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 58fee3c6fe..1311ad0464 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -151,10 +151,11 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, return bh; } - if (!bh && (type == INDEX || type == DIRENT_HTREE)) { + /* The first directory block must not be a hole. */ + if (!bh && (type == INDEX || type == DIRENT_HTREE || block == 0)) { ext4_error_inode(inode, func, line, block, - "Directory hole found for htree %s block", - (type == INDEX) ? "index" : "leaf"); + "Directory hole found for htree %s block %u", + (type == INDEX) ? "index" : "leaf", block); return ERR_PTR(-EFSCORRUPTED); } if (!bh) @@ -1762,7 +1763,6 @@ static struct buffer_head *ext4_lookup_entry(struct inode *dir, struct buffer_head *bh; err = ext4_fname_prepare_lookup(dir, dentry, &fname); - generic_set_encrypted_ci_d_ops(dentry); if (err == -ENOENT) return NULL; if (err) @@ -2218,6 +2218,52 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, return err ? 
err : err2; } +static bool ext4_check_dx_root(struct inode *dir, struct dx_root *root) +{ + struct fake_dirent *fde; + const char *error_msg; + unsigned int rlen; + unsigned int blocksize = dir->i_sb->s_blocksize; + char *blockend = (char *)root + dir->i_sb->s_blocksize; + + fde = &root->dot; + if (unlikely(fde->name_len != 1)) { + error_msg = "invalid name_len for '.'"; + goto corrupted; + } + if (unlikely(strncmp(root->dot_name, ".", fde->name_len))) { + error_msg = "invalid name for '.'"; + goto corrupted; + } + rlen = ext4_rec_len_from_disk(fde->rec_len, blocksize); + if (unlikely((char *)fde + rlen >= blockend)) { + error_msg = "invalid rec_len for '.'"; + goto corrupted; + } + + fde = &root->dotdot; + if (unlikely(fde->name_len != 2)) { + error_msg = "invalid name_len for '..'"; + goto corrupted; + } + if (unlikely(strncmp(root->dotdot_name, "..", fde->name_len))) { + error_msg = "invalid name for '..'"; + goto corrupted; + } + rlen = ext4_rec_len_from_disk(fde->rec_len, blocksize); + if (unlikely((char *)fde + rlen >= blockend)) { + error_msg = "invalid rec_len for '..'"; + goto corrupted; + } + + return true; + +corrupted: + EXT4_ERROR_INODE(dir, "Corrupt dir, %s, running e2fsck is recommended", + error_msg); + return false; +} + /* * This converts a one block unindexed directory to a 3 block indexed * directory, and adds the dentry to the indexed directory. @@ -2252,17 +2298,17 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, brelse(bh); return retval; } + root = (struct dx_root *) bh->b_data; + if (!ext4_check_dx_root(dir, root)) { + brelse(bh); + return -EFSCORRUPTED; + } /* The 0th block becomes the root, move the dirents out */ fde = &root->dotdot; de = (struct ext4_dir_entry_2 *)((char *)fde + ext4_rec_len_from_disk(fde->rec_len, blocksize)); - if ((char *) de >= (((char *) root) + blocksize)) { - EXT4_ERROR_INODE(dir, "invalid rec_len for '..'"); - brelse(bh); - return -EFSCORRUPTED; - } len = ((char *) root) + (blocksize - csum_size) - (char *) de; /* Allocate new block for the 0th block's dirents */ @@ -3084,10 +3130,7 @@ bool ext4_empty_dir(struct inode *inode) EXT4_ERROR_INODE(inode, "invalid size"); return false; } - /* The first directory block must not be a hole, - * so treat it as DIRENT_HTREE - */ - bh = ext4_read_dirblock(inode, 0, DIRENT_HTREE); + bh = ext4_read_dirblock(inode, 0, EITHER); if (IS_ERR(bh)) return false; @@ -3532,10 +3575,7 @@ static struct buffer_head *ext4_get_first_dir_block(handle_t *handle, struct ext4_dir_entry_2 *de; unsigned int offset; - /* The first directory block must not be a hole, so - * treat it as DIRENT_HTREE - */ - bh = ext4_read_dirblock(inode, 0, DIRENT_HTREE); + bh = ext4_read_dirblock(inode, 0, EITHER); if (IS_ERR(bh)) { *retval = PTR_ERR(bh); return NULL; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 312bc68133..ad5543866d 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -117,7 +117,6 @@ static void ext4_finish_bio(struct bio *bio) if (bio->bi_status) { int err = blk_status_to_errno(bio->bi_status); - folio_set_error(folio); mapping_set_error(folio->mapping, err); } bh = head = folio_buffers(folio); @@ -441,8 +440,6 @@ int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *folio, BUG_ON(!folio_test_locked(folio)); BUG_ON(folio_test_writeback(folio)); - folio_clear_error(folio); - /* * Comments copied from block_write_full_folio: * diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index 21e8f0aebb..8494492582 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c 
@@ -289,7 +289,6 @@ int ext4_mpage_readpages(struct inode *inode, if (ext4_map_blocks(NULL, inode, &map, 0) < 0) { set_error_page: - folio_set_error(folio); folio_zero_segment(folio, 0, folio_size(folio)); folio_unlock(folio); diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 3c0d12382e..0ba9837d65 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -2085,7 +2085,7 @@ retry: } } - if ((!resize_inode && !meta_bg) || n_blocks_count == o_blocks_count) { + if ((!resize_inode && !meta_bg && n_desc_blocks > o_desc_blocks) || n_blocks_count == o_blocks_count) { err = ext4_convert_meta_bg(sb, resize_inode); if (err) goto out; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 4866657c6d..c682fb927b 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -244,7 +244,7 @@ static struct buffer_head *__ext4_sb_bread_gfp(struct super_block *sb, struct buffer_head *ext4_sb_bread(struct super_block *sb, sector_t block, blk_opf_t op_flags) { - gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_inode->i_mapping, + gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_mapping, ~__GFP_FS) | __GFP_MOVABLE; return __ext4_sb_bread_gfp(sb, block, op_flags, gfp); @@ -253,7 +253,7 @@ struct buffer_head *ext4_sb_bread(struct super_block *sb, sector_t block, struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb, sector_t block) { - gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_inode->i_mapping, + gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_mapping, ~__GFP_FS); return __ext4_sb_bread_gfp(sb, block, 0, gfp); @@ -492,22 +492,6 @@ static void ext4_maybe_update_superblock(struct super_block *sb) schedule_work(&EXT4_SB(sb)->s_sb_upd_work); } -/* - * The del_gendisk() function uninitializes the disk-specific data - * structures, including the bdi structure, without telling anyone - * else. Once this happens, any attempt to call mark_buffer_dirty() - * (for example, by ext4_commit_super), will cause a kernel OOPS. - * This is a kludge to prevent these oops until we can put in a proper - * hook in del_gendisk() to inform the VFS and file system layers. - */ -static int block_device_ejected(struct super_block *sb) -{ - struct inode *bd_inode = sb->s_bdev->bd_inode; - struct backing_dev_info *bdi = inode_to_bdi(bd_inode); - - return bdi->dev == NULL; -} - static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) { struct super_block *sb = journal->j_private; @@ -1359,14 +1343,14 @@ static void ext4_put_super(struct super_block *sb) sync_blockdev(sb->s_bdev); invalidate_bdev(sb->s_bdev); - if (sbi->s_journal_bdev_handle) { + if (sbi->s_journal_bdev_file) { /* * Invalidate the journal device's buffers. We don't want them * floating about in memory - the physical journal device may * hotswapped, and it breaks the `ro-after' testing code. 
*/ - sync_blockdev(sbi->s_journal_bdev_handle->bdev); - invalidate_bdev(sbi->s_journal_bdev_handle->bdev); + sync_blockdev(file_bdev(sbi->s_journal_bdev_file)); + invalidate_bdev(file_bdev(sbi->s_journal_bdev_file)); } ext4_xattr_destroy_cache(sbi->s_ea_inode_cache); @@ -1500,8 +1484,7 @@ static int __init init_inodecache(void) { ext4_inode_cachep = kmem_cache_create_usercopy("ext4_inode_cache", sizeof(struct ext4_inode_info), 0, - (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD| - SLAB_ACCOUNT), + SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT, offsetof(struct ext4_inode_info, i_data), sizeof_field(struct ext4_inode_info, i_data), init_once); @@ -1724,10 +1707,6 @@ static const struct constant_table ext4_param_dax[] = { {} }; -/* String parameter that allows empty argument */ -#define fsparam_string_empty(NAME, OPT) \ - __fsparam(fs_param_is_string, NAME, OPT, fs_param_can_be_empty, NULL) - /* * Mount option specification * We don't use fsparam_flag_no because of the way we set the @@ -2079,8 +2058,7 @@ static int unnote_qf_name(struct fs_context *fc, int qtype) { struct ext4_fs_context *ctx = fc->fs_private; - if (ctx->s_qf_names[qtype]) - kfree(ctx->s_qf_names[qtype]); + kfree(ctx->s_qf_names[qtype]); ctx->s_qf_names[qtype] = NULL; ctx->qname_spec |= 1 << qtype; @@ -2485,8 +2463,7 @@ static int parse_options(struct fs_context *fc, char *options) param.size = v_len; ret = ext4_parse_param(fc, ¶m); - if (param.string) - kfree(param.string); + kfree(param.string); if (ret < 0) return ret; } @@ -4233,7 +4210,7 @@ int ext4_calculate_overhead(struct super_block *sb) * Add the internal journal blocks whether the journal has been * loaded or not */ - if (sbi->s_journal && !sbi->s_journal_bdev_handle) + if (sbi->s_journal && !sbi->s_journal_bdev_file) overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_total_len); else if (ext4_has_feature_journal(sb) && !sbi->s_journal && j_inum) { /* j_inum for internal journal is non-zero */ @@ -4422,22 +4399,6 @@ static int ext4_handle_clustersize(struct super_block *sb) } sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) - le32_to_cpu(es->s_log_block_size); - sbi->s_clusters_per_group = - le32_to_cpu(es->s_clusters_per_group); - if (sbi->s_clusters_per_group > sb->s_blocksize * 8) { - ext4_msg(sb, KERN_ERR, - "#clusters per group too big: %lu", - sbi->s_clusters_per_group); - return -EINVAL; - } - if (sbi->s_blocks_per_group != - (sbi->s_clusters_per_group * (clustersize / sb->s_blocksize))) { - ext4_msg(sb, KERN_ERR, "blocks per group (%lu) and " - "clusters per group (%lu) inconsistent", - sbi->s_blocks_per_group, - sbi->s_clusters_per_group); - return -EINVAL; - } } else { if (clustersize != sb->s_blocksize) { ext4_msg(sb, KERN_ERR, @@ -4451,9 +4412,21 @@ static int ext4_handle_clustersize(struct super_block *sb) sbi->s_blocks_per_group); return -EINVAL; } - sbi->s_clusters_per_group = sbi->s_blocks_per_group; sbi->s_cluster_bits = 0; } + sbi->s_clusters_per_group = le32_to_cpu(es->s_clusters_per_group); + if (sbi->s_clusters_per_group > sb->s_blocksize * 8) { + ext4_msg(sb, KERN_ERR, "#clusters per group too big: %lu", + sbi->s_clusters_per_group); + return -EINVAL; + } + if (sbi->s_blocks_per_group != + (sbi->s_clusters_per_group * (clustersize / sb->s_blocksize))) { + ext4_msg(sb, KERN_ERR, + "blocks per group (%lu) and clusters per group (%lu) inconsistent", + sbi->s_blocks_per_group, sbi->s_clusters_per_group); + return -EINVAL; + } sbi->s_cluster_ratio = clustersize / sb->s_blocksize; /* Do we have standard group size of clustersize * 8 blocks ? 
*/ @@ -5346,7 +5319,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) sb->s_qcop = &ext4_qctl_operations; sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ; #endif - memcpy(&sb->s_uuid, es->s_uuid, sizeof(es->s_uuid)); + super_set_uuid(sb, es->s_uuid, sizeof(es->s_uuid)); + super_set_sysfs_name_bdev(sb); INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ mutex_init(&sbi->s_orphan_lock); @@ -5484,6 +5458,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) goto failed_mount4; } + generic_set_sb_d_ops(sb); sb->s_root = d_make_root(root); if (!sb->s_root) { ext4_msg(sb, KERN_ERR, "get root dentry failed"); @@ -5555,19 +5530,15 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) if (err) goto failed_mount6; - err = ext4_register_sysfs(sb); - if (err) - goto failed_mount7; - err = ext4_init_orphan_info(sb); if (err) - goto failed_mount8; + goto failed_mount7; #ifdef CONFIG_QUOTA /* Enable quota usage during mount. */ if (ext4_has_feature_quota(sb) && !sb_rdonly(sb)) { err = ext4_enable_quotas(sb); if (err) - goto failed_mount9; + goto failed_mount8; } #endif /* CONFIG_QUOTA */ @@ -5576,7 +5547,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) * used to detect the metadata async write error. */ spin_lock_init(&sbi->s_bdev_wb_lock); - errseq_check_and_advance(&sb->s_bdev->bd_inode->i_mapping->wb_err, + errseq_check_and_advance(&sb->s_bdev->bd_mapping->wb_err, &sbi->s_bdev_wb_err); EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; ext4_orphan_cleanup(sb, es); @@ -5593,7 +5564,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) ext4_msg(sb, KERN_INFO, "recovery complete"); err = ext4_mark_recovery_complete(sb, es); if (err) - goto failed_mount10; + goto failed_mount9; } if (test_opt(sb, DISCARD) && !bdev_max_discard_sectors(sb->s_bdev)) @@ -5610,15 +5581,17 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) atomic_set(&sbi->s_warning_count, 0); atomic_set(&sbi->s_msg_count, 0); + /* Register sysfs after all initializations are complete. 
*/ + err = ext4_register_sysfs(sb); + if (err) + goto failed_mount9; + return 0; -failed_mount10: +failed_mount9: ext4_quotas_off(sb, EXT4_MAXQUOTAS); -failed_mount9: __maybe_unused +failed_mount8: __maybe_unused ext4_release_orphan_info(sb); -failed_mount8: - ext4_unregister_sysfs(sb); - kobject_put(&sbi->s_kobj); failed_mount7: ext4_unregister_li_request(sb); failed_mount6: @@ -5670,9 +5643,9 @@ failed_mount: #endif fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy); brelse(sbi->s_sbh); - if (sbi->s_journal_bdev_handle) { - invalidate_bdev(sbi->s_journal_bdev_handle->bdev); - bdev_release(sbi->s_journal_bdev_handle); + if (sbi->s_journal_bdev_file) { + invalidate_bdev(file_bdev(sbi->s_journal_bdev_file)); + bdev_fput(sbi->s_journal_bdev_file); } out_fail: invalidate_bdev(sb->s_bdev); @@ -5842,30 +5815,30 @@ static journal_t *ext4_open_inode_journal(struct super_block *sb, return journal; } -static struct bdev_handle *ext4_get_journal_blkdev(struct super_block *sb, +static struct file *ext4_get_journal_blkdev(struct super_block *sb, dev_t j_dev, ext4_fsblk_t *j_start, ext4_fsblk_t *j_len) { struct buffer_head *bh; struct block_device *bdev; - struct bdev_handle *bdev_handle; + struct file *bdev_file; int hblock, blocksize; ext4_fsblk_t sb_block; unsigned long offset; struct ext4_super_block *es; int errno; - bdev_handle = bdev_open_by_dev(j_dev, + bdev_file = bdev_file_open_by_dev(j_dev, BLK_OPEN_READ | BLK_OPEN_WRITE | BLK_OPEN_RESTRICT_WRITES, sb, &fs_holder_ops); - if (IS_ERR(bdev_handle)) { + if (IS_ERR(bdev_file)) { ext4_msg(sb, KERN_ERR, "failed to open journal device unknown-block(%u,%u) %ld", - MAJOR(j_dev), MINOR(j_dev), PTR_ERR(bdev_handle)); - return bdev_handle; + MAJOR(j_dev), MINOR(j_dev), PTR_ERR(bdev_file)); + return bdev_file; } - bdev = bdev_handle->bdev; + bdev = file_bdev(bdev_file); blocksize = sb->s_blocksize; hblock = bdev_logical_block_size(bdev); if (blocksize < hblock) { @@ -5877,7 +5850,7 @@ static struct bdev_handle *ext4_get_journal_blkdev(struct super_block *sb, sb_block = EXT4_MIN_BLOCK_SIZE / blocksize; offset = EXT4_MIN_BLOCK_SIZE % blocksize; - set_blocksize(bdev, blocksize); + set_blocksize(bdev_file, blocksize); bh = __bread(bdev, sb_block, blocksize); if (!bh) { ext4_msg(sb, KERN_ERR, "couldn't read superblock of " @@ -5912,12 +5885,12 @@ static struct bdev_handle *ext4_get_journal_blkdev(struct super_block *sb, *j_start = sb_block + 1; *j_len = ext4_blocks_count(es); brelse(bh); - return bdev_handle; + return bdev_file; out_bh: brelse(bh); out_bdev: - bdev_release(bdev_handle); + bdev_fput(bdev_file); return ERR_PTR(errno); } @@ -5927,14 +5900,14 @@ static journal_t *ext4_open_dev_journal(struct super_block *sb, journal_t *journal; ext4_fsblk_t j_start; ext4_fsblk_t j_len; - struct bdev_handle *bdev_handle; + struct file *bdev_file; int errno = 0; - bdev_handle = ext4_get_journal_blkdev(sb, j_dev, &j_start, &j_len); - if (IS_ERR(bdev_handle)) - return ERR_CAST(bdev_handle); + bdev_file = ext4_get_journal_blkdev(sb, j_dev, &j_start, &j_len); + if (IS_ERR(bdev_file)) + return ERR_CAST(bdev_file); - journal = jbd2_journal_init_dev(bdev_handle->bdev, sb->s_bdev, j_start, + journal = jbd2_journal_init_dev(file_bdev(bdev_file), sb->s_bdev, j_start, j_len, sb->s_blocksize); if (IS_ERR(journal)) { ext4_msg(sb, KERN_ERR, "failed to create device journal"); @@ -5949,14 +5922,14 @@ static journal_t *ext4_open_dev_journal(struct super_block *sb, goto out_journal; } journal->j_private = sb; - EXT4_SB(sb)->s_journal_bdev_handle = bdev_handle; + 
EXT4_SB(sb)->s_journal_bdev_file = bdev_file; ext4_init_journal_params(sb, journal); return journal; out_journal: jbd2_journal_destroy(journal); out_bdev: - bdev_release(bdev_handle); + bdev_fput(bdev_file); return ERR_PTR(errno); } @@ -6134,8 +6107,8 @@ static void ext4_update_super(struct super_block *sb) __ext4_update_tstamp(&es->s_first_error_time, &es->s_first_error_time_hi, sbi->s_first_error_time); - strncpy(es->s_first_error_func, sbi->s_first_error_func, - sizeof(es->s_first_error_func)); + strtomem_pad(es->s_first_error_func, + sbi->s_first_error_func, 0); es->s_first_error_line = cpu_to_le32(sbi->s_first_error_line); es->s_first_error_ino = @@ -6148,8 +6121,7 @@ static void ext4_update_super(struct super_block *sb) __ext4_update_tstamp(&es->s_last_error_time, &es->s_last_error_time_hi, sbi->s_last_error_time); - strncpy(es->s_last_error_func, sbi->s_last_error_func, - sizeof(es->s_last_error_func)); + strtomem_pad(es->s_last_error_func, sbi->s_last_error_func, 0); es->s_last_error_line = cpu_to_le32(sbi->s_last_error_line); es->s_last_error_ino = cpu_to_le32(sbi->s_last_error_ino); es->s_last_error_block = cpu_to_le64(sbi->s_last_error_block); @@ -6176,8 +6148,6 @@ static int ext4_commit_super(struct super_block *sb) if (!sbh) return -EINVAL; - if (block_device_ejected(sb)) - return -ENODEV; ext4_update_super(sb); @@ -7326,12 +7296,12 @@ static inline int ext3_feature_set_ok(struct super_block *sb) static void ext4_kill_sb(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); - struct bdev_handle *handle = sbi ? sbi->s_journal_bdev_handle : NULL; + struct file *bdev_file = sbi ? sbi->s_journal_bdev_file : NULL; kill_block_super(sb); - if (handle) - bdev_release(handle); + if (bdev_file) + bdev_fput(bdev_file); } static struct file_system_type ext4_fs_type = { diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 6d332dff79..ddb54608ca 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -29,7 +29,10 @@ typedef enum { attr_trigger_test_error, attr_first_error_time, attr_last_error_time, + attr_clusters_in_group, + attr_mb_order, attr_feature, + attr_pointer_pi, attr_pointer_ui, attr_pointer_ul, attr_pointer_u64, @@ -104,7 +107,7 @@ static ssize_t reserved_clusters_store(struct ext4_sb_info *sbi, int ret; ret = kstrtoull(skip_spaces(buf), 0, &val); - if (ret || val >= clusters) + if (ret || val >= clusters || (s64)val < 0) return -EINVAL; atomic64_set(&sbi->s_resv_clusters, val); @@ -178,6 +181,9 @@ static struct ext4_attr ext4_attr_##_name = { \ #define EXT4_RO_ATTR_ES_STRING(_name,_elname,_size) \ EXT4_ATTR_STRING(_name, 0444, _size, ext4_super_block, _elname) +#define EXT4_RW_ATTR_SBI_PI(_name,_elname) \ + EXT4_ATTR_OFFSET(_name, 0644, pointer_pi, ext4_sb_info, _elname) + #define EXT4_RW_ATTR_SBI_UI(_name,_elname) \ EXT4_ATTR_OFFSET(_name, 0644, pointer_ui, ext4_sb_info, _elname) @@ -207,23 +213,25 @@ EXT4_ATTR_FUNC(sra_exceeded_retry_limit, 0444); EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, inode_readahead, ext4_sb_info, s_inode_readahead_blks); +EXT4_ATTR_OFFSET(mb_group_prealloc, 0644, clusters_in_group, + ext4_sb_info, s_mb_group_prealloc); +EXT4_ATTR_OFFSET(mb_best_avail_max_trim_order, 0644, mb_order, + ext4_sb_info, s_mb_best_avail_max_trim_order); EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal); EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats); EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan); EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan); EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); 
-EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); EXT4_RW_ATTR_SBI_UI(mb_max_linear_groups, s_mb_max_linear_groups); EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb); EXT4_ATTR(trigger_fs_error, 0200, trigger_test_error); -EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval); -EXT4_RW_ATTR_SBI_UI(err_ratelimit_burst, s_err_ratelimit_state.burst); -EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.interval); -EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst); -EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval); -EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst); -EXT4_RW_ATTR_SBI_UI(mb_best_avail_max_trim_order, s_mb_best_avail_max_trim_order); +EXT4_RW_ATTR_SBI_PI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval); +EXT4_RW_ATTR_SBI_PI(err_ratelimit_burst, s_err_ratelimit_state.burst); +EXT4_RW_ATTR_SBI_PI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.interval); +EXT4_RW_ATTR_SBI_PI(warning_ratelimit_burst, s_warning_ratelimit_state.burst); +EXT4_RW_ATTR_SBI_PI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval); +EXT4_RW_ATTR_SBI_PI(msg_ratelimit_burst, s_msg_ratelimit_state.burst); #ifdef CONFIG_EXT4_DEBUG EXT4_RW_ATTR_SBI_UL(simulate_fail, s_simulate_fail); #endif @@ -366,13 +374,45 @@ static ssize_t __print_tstamp(char *buf, __le32 lo, __u8 hi) #define print_tstamp(buf, es, tstamp) \ __print_tstamp(buf, (es)->tstamp, (es)->tstamp ## _hi) +static ssize_t ext4_generic_attr_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, char *buf) +{ + void *ptr = calc_ptr(a, sbi); + + if (!ptr) + return 0; + + switch (a->attr_id) { + case attr_inode_readahead: + case attr_clusters_in_group: + case attr_mb_order: + case attr_pointer_pi: + case attr_pointer_ui: + if (a->attr_ptr == ptr_ext4_super_block_offset) + return sysfs_emit(buf, "%u\n", le32_to_cpup(ptr)); + return sysfs_emit(buf, "%u\n", *((unsigned int *) ptr)); + case attr_pointer_ul: + return sysfs_emit(buf, "%lu\n", *((unsigned long *) ptr)); + case attr_pointer_u8: + return sysfs_emit(buf, "%u\n", *((unsigned char *) ptr)); + case attr_pointer_u64: + if (a->attr_ptr == ptr_ext4_super_block_offset) + return sysfs_emit(buf, "%llu\n", le64_to_cpup(ptr)); + return sysfs_emit(buf, "%llu\n", *((unsigned long long *) ptr)); + case attr_pointer_string: + return sysfs_emit(buf, "%.*s\n", a->attr_size, (char *) ptr); + case attr_pointer_atomic: + return sysfs_emit(buf, "%d\n", atomic_read((atomic_t *) ptr)); + } + return 0; +} + static ssize_t ext4_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, s_kobj); struct ext4_attr *a = container_of(attr, struct ext4_attr, attr); - void *ptr = calc_ptr(a, sbi); switch (a->attr_id) { case attr_delayed_allocation_blocks: @@ -391,45 +431,6 @@ static ssize_t ext4_attr_show(struct kobject *kobj, return sysfs_emit(buf, "%llu\n", (unsigned long long) percpu_counter_sum(&sbi->s_sra_exceeded_retry_limit)); - case attr_inode_readahead: - case attr_pointer_ui: - if (!ptr) - return 0; - if (a->attr_ptr == ptr_ext4_super_block_offset) - return sysfs_emit(buf, "%u\n", - le32_to_cpup(ptr)); - else - return sysfs_emit(buf, "%u\n", - *((unsigned int *) ptr)); - case attr_pointer_ul: - if (!ptr) - return 0; - return sysfs_emit(buf, "%lu\n", - *((unsigned long *) ptr)); - case attr_pointer_u8: - if (!ptr) - return 0; - return sysfs_emit(buf, "%u\n", - 
*((unsigned char *) ptr)); - case attr_pointer_u64: - if (!ptr) - return 0; - if (a->attr_ptr == ptr_ext4_super_block_offset) - return sysfs_emit(buf, "%llu\n", - le64_to_cpup(ptr)); - else - return sysfs_emit(buf, "%llu\n", - *((unsigned long long *) ptr)); - case attr_pointer_string: - if (!ptr) - return 0; - return sysfs_emit(buf, "%.*s\n", a->attr_size, - (char *) ptr); - case attr_pointer_atomic: - if (!ptr) - return 0; - return sysfs_emit(buf, "%d\n", - atomic_read((atomic_t *) ptr)); case attr_feature: return sysfs_emit(buf, "supported\n"); case attr_first_error_time: @@ -438,29 +439,34 @@ static ssize_t ext4_attr_show(struct kobject *kobj, return print_tstamp(buf, sbi->s_es, s_last_error_time); case attr_journal_task: return journal_task_show(sbi, buf); + default: + return ext4_generic_attr_show(a, sbi, buf); } - - return 0; } -static ssize_t ext4_attr_store(struct kobject *kobj, - struct attribute *attr, - const char *buf, size_t len) +static ssize_t ext4_generic_attr_store(struct ext4_attr *a, + struct ext4_sb_info *sbi, + const char *buf, size_t len) { - struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, - s_kobj); - struct ext4_attr *a = container_of(attr, struct ext4_attr, attr); - void *ptr = calc_ptr(a, sbi); - unsigned long t; int ret; + unsigned int t; + unsigned long lt; + void *ptr = calc_ptr(a, sbi); + + if (!ptr) + return 0; switch (a->attr_id) { - case attr_reserved_clusters: - return reserved_clusters_store(sbi, buf, len); + case attr_pointer_pi: + ret = kstrtouint(skip_spaces(buf), 0, &t); + if (ret) + return ret; + if ((int)t < 0) + return -EINVAL; + *((unsigned int *) ptr) = t; + return len; case attr_pointer_ui: - if (!ptr) - return 0; - ret = kstrtoul(skip_spaces(buf), 0, &t); + ret = kstrtouint(skip_spaces(buf), 0, &t); if (ret) return ret; if (a->attr_ptr == ptr_ext4_super_block_offset) @@ -468,20 +474,50 @@ static ssize_t ext4_attr_store(struct kobject *kobj, else *((unsigned int *) ptr) = t; return len; + case attr_mb_order: + ret = kstrtouint(skip_spaces(buf), 0, &t); + if (ret) + return ret; + if (t > 64) + return -EINVAL; + *((unsigned int *) ptr) = t; + return len; + case attr_clusters_in_group: + ret = kstrtouint(skip_spaces(buf), 0, &t); + if (ret) + return ret; + if (t > sbi->s_clusters_per_group) + return -EINVAL; + *((unsigned int *) ptr) = t; + return len; case attr_pointer_ul: - if (!ptr) - return 0; - ret = kstrtoul(skip_spaces(buf), 0, &t); + ret = kstrtoul(skip_spaces(buf), 0, <); if (ret) return ret; - *((unsigned long *) ptr) = t; + *((unsigned long *) ptr) = lt; return len; + } + return 0; +} + +static ssize_t ext4_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t len) +{ + struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, + s_kobj); + struct ext4_attr *a = container_of(attr, struct ext4_attr, attr); + + switch (a->attr_id) { + case attr_reserved_clusters: + return reserved_clusters_store(sbi, buf, len); case attr_inode_readahead: return inode_readahead_blks_store(sbi, buf, len); case attr_trigger_test_error: return trigger_test_error(sbi, buf, len); + default: + return ext4_generic_attr_store(a, sbi, buf, len); } - return 0; } static void ext4_sb_release(struct kobject *kobj) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 82dc5e673d..46ce2f21fe 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1433,6 +1433,12 @@ retry: goto out; memcpy(bh->b_data, buf, csize); + /* + * Zero out block tail to avoid writing uninitialized memory + * to disk. 
+ */ + if (csize < blocksize) + memset(bh->b_data + csize, 0, blocksize - csize); set_buffer_uptodate(bh); ext4_handle_dirty_metadata(handle, ea_inode, bh); @@ -1565,46 +1571,49 @@ ext4_xattr_inode_cache_find(struct inode *inode, const void *value, /* * Add value of the EA in an inode. */ -static int ext4_xattr_inode_lookup_create(handle_t *handle, struct inode *inode, - const void *value, size_t value_len, - struct inode **ret_inode) +static struct inode *ext4_xattr_inode_lookup_create(handle_t *handle, + struct inode *inode, const void *value, size_t value_len) { struct inode *ea_inode; u32 hash; int err; + /* Account inode & space to quota even if sharing... */ + err = ext4_xattr_inode_alloc_quota(inode, value_len); + if (err) + return ERR_PTR(err); + hash = ext4_xattr_inode_hash(EXT4_SB(inode->i_sb), value, value_len); ea_inode = ext4_xattr_inode_cache_find(inode, value, value_len, hash); if (ea_inode) { err = ext4_xattr_inode_inc_ref(handle, ea_inode); - if (err) { - iput(ea_inode); - return err; - } - - *ret_inode = ea_inode; - return 0; + if (err) + goto out_err; + return ea_inode; } /* Create an inode for the EA value */ ea_inode = ext4_xattr_inode_create(handle, inode, hash); - if (IS_ERR(ea_inode)) - return PTR_ERR(ea_inode); + if (IS_ERR(ea_inode)) { + ext4_xattr_inode_free_quota(inode, NULL, value_len); + return ea_inode; + } err = ext4_xattr_inode_write(handle, ea_inode, value, value_len); if (err) { if (ext4_xattr_inode_dec_ref(handle, ea_inode)) ext4_warning_inode(ea_inode, "cleanup dec ref error %d", err); - iput(ea_inode); - return err; + goto out_err; } if (EA_INODE_CACHE(inode)) mb_cache_entry_create(EA_INODE_CACHE(inode), GFP_NOFS, hash, ea_inode->i_ino, true /* reusable */); - - *ret_inode = ea_inode; - return 0; + return ea_inode; +out_err: + iput(ea_inode); + ext4_xattr_inode_free_quota(inode, NULL, value_len); + return ERR_PTR(err); } /* @@ -1616,6 +1625,7 @@ static int ext4_xattr_inode_lookup_create(handle_t *handle, struct inode *inode, static int ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s, handle_t *handle, struct inode *inode, + struct inode *new_ea_inode, bool is_block) { struct ext4_xattr_entry *last, *next; @@ -1623,7 +1633,6 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, size_t min_offs = s->end - s->base, name_len = strlen(i->name); int in_inode = i->in_inode; struct inode *old_ea_inode = NULL; - struct inode *new_ea_inode = NULL; size_t old_size, new_size; int ret; @@ -1708,43 +1717,11 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, old_ea_inode = NULL; goto out; } - } - if (i->value && in_inode) { - WARN_ON_ONCE(!i->value_len); - ret = ext4_xattr_inode_alloc_quota(inode, i->value_len); - if (ret) - goto out; - - ret = ext4_xattr_inode_lookup_create(handle, inode, i->value, - i->value_len, - &new_ea_inode); - if (ret) { - new_ea_inode = NULL; - ext4_xattr_inode_free_quota(inode, NULL, i->value_len); - goto out; - } - } - - if (old_ea_inode) { /* We are ready to release ref count on the old_ea_inode. */ ret = ext4_xattr_inode_dec_ref(handle, old_ea_inode); - if (ret) { - /* Release newly required ref count on new_ea_inode. 
*/ - if (new_ea_inode) { - int err; - - err = ext4_xattr_inode_dec_ref(handle, - new_ea_inode); - if (err) - ext4_warning_inode(new_ea_inode, - "dec ref new_ea_inode err=%d", - err); - ext4_xattr_inode_free_quota(inode, new_ea_inode, - i->value_len); - } + if (ret) goto out; - } ext4_xattr_inode_free_quota(inode, old_ea_inode, le32_to_cpu(here->e_value_size)); @@ -1868,7 +1845,6 @@ update_hash: ret = 0; out: iput(old_ea_inode); - iput(new_ea_inode); return ret; } @@ -1931,9 +1907,21 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, size_t old_ea_inode_quota = 0; unsigned int ea_ino; - #define header(x) ((struct ext4_xattr_header *)(x)) + /* If we need EA inode, prepare it before locking the buffer */ + if (i->value && i->in_inode) { + WARN_ON_ONCE(!i->value_len); + + ea_inode = ext4_xattr_inode_lookup_create(handle, inode, + i->value, i->value_len); + if (IS_ERR(ea_inode)) { + error = PTR_ERR(ea_inode); + ea_inode = NULL; + goto cleanup; + } + } + if (s->base) { int offset = (char *)s->here - bs->bh->b_data; @@ -1942,6 +1930,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, EXT4_JTR_NONE); if (error) goto cleanup; + lock_buffer(bs->bh); if (header(s->base)->h_refcount == cpu_to_le32(1)) { @@ -1968,7 +1957,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, } ea_bdebug(bs->bh, "modifying in-place"); error = ext4_xattr_set_entry(i, s, handle, inode, - true /* is_block */); + ea_inode, true /* is_block */); ext4_xattr_block_csum_set(inode, bs->bh); unlock_buffer(bs->bh); if (error == -EFSCORRUPTED) @@ -2036,33 +2025,22 @@ clone_block: s->end = s->base + sb->s_blocksize; } - error = ext4_xattr_set_entry(i, s, handle, inode, true /* is_block */); + error = ext4_xattr_set_entry(i, s, handle, inode, ea_inode, + true /* is_block */); if (error == -EFSCORRUPTED) goto bad_block; if (error) goto cleanup; - if (i->value && s->here->e_value_inum) { - /* - * A ref count on ea_inode has been taken as part of the call to - * ext4_xattr_set_entry() above. We would like to drop this - * extra ref but we have to wait until the xattr block is - * initialized and has its own ref count on the ea_inode. - */ - ea_ino = le32_to_cpu(s->here->e_value_inum); - error = ext4_xattr_inode_iget(inode, ea_ino, - le32_to_cpu(s->here->e_hash), - &ea_inode); - if (error) { - ea_inode = NULL; +inserted: + if (!IS_LAST_ENTRY(s->first)) { + new_bh = ext4_xattr_block_cache_find(inode, header(s->base), &ce); + if (IS_ERR(new_bh)) { + error = PTR_ERR(new_bh); + new_bh = NULL; goto cleanup; } - } -inserted: - if (!IS_LAST_ENTRY(s->first)) { - new_bh = ext4_xattr_block_cache_find(inode, header(s->base), - &ce); if (new_bh) { /* We found an identical block in the cache. */ if (new_bh == bs->bh) @@ -2211,17 +2189,16 @@ getblk_failed: cleanup: if (ea_inode) { - int error2; - - error2 = ext4_xattr_inode_dec_ref(handle, ea_inode); - if (error2) - ext4_warning_inode(ea_inode, "dec ref error=%d", - error2); + if (error) { + int error2; - /* If there was an error, revert the quota charge. 
*/ - if (error) + error2 = ext4_xattr_inode_dec_ref(handle, ea_inode); + if (error2) + ext4_warning_inode(ea_inode, "dec ref error=%d", + error2); ext4_xattr_inode_free_quota(inode, ea_inode, i_size_read(ea_inode)); + } iput(ea_inode); } if (ce) @@ -2279,14 +2256,38 @@ int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, { struct ext4_xattr_ibody_header *header; struct ext4_xattr_search *s = &is->s; + struct inode *ea_inode = NULL; int error; if (!EXT4_INODE_HAS_XATTR_SPACE(inode)) return -ENOSPC; - error = ext4_xattr_set_entry(i, s, handle, inode, false /* is_block */); - if (error) + /* If we need EA inode, prepare it before locking the buffer */ + if (i->value && i->in_inode) { + WARN_ON_ONCE(!i->value_len); + + ea_inode = ext4_xattr_inode_lookup_create(handle, inode, + i->value, i->value_len); + if (IS_ERR(ea_inode)) + return PTR_ERR(ea_inode); + } + error = ext4_xattr_set_entry(i, s, handle, inode, ea_inode, + false /* is_block */); + if (error) { + if (ea_inode) { + int error2; + + error2 = ext4_xattr_inode_dec_ref(handle, ea_inode); + if (error2) + ext4_warning_inode(ea_inode, "dec ref error=%d", + error2); + + ext4_xattr_inode_free_quota(inode, ea_inode, + i_size_read(ea_inode)); + iput(ea_inode); + } return error; + } header = IHDR(inode, ext4_raw_inode(&is->iloc)); if (!IS_LAST_ENTRY(s->first)) { header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC); @@ -2295,6 +2296,7 @@ int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, header->h_magic = cpu_to_le32(0); ext4_clear_inode_state(inode, EXT4_STATE_XATTR); } + iput(ea_inode); return 0; } @@ -3103,8 +3105,8 @@ ext4_xattr_cmp(struct ext4_xattr_header *header1, * * Find an identical extended attribute block. * - * Returns a pointer to the block found, or NULL if such a block was - * not found or an error occurred. + * Returns a pointer to the block found, or NULL if such a block was not + * found, or an error pointer if an error occurred while reading ea block. */ static struct buffer_head * ext4_xattr_block_cache_find(struct inode *inode, @@ -3126,11 +3128,11 @@ ext4_xattr_block_cache_find(struct inode *inode, bh = ext4_sb_bread(inode->i_sb, ce->e_value, REQ_PRIO); if (IS_ERR(bh)) { - if (PTR_ERR(bh) == -ENOMEM) - return NULL; - bh = NULL; - EXT4_ERROR_INODE(inode, "block %lu read error", - (unsigned long)ce->e_value); + if (PTR_ERR(bh) != -ENOMEM) + EXT4_ERROR_INODE(inode, "block %lu read error", + (unsigned long)ce->e_value); + mb_cache_entry_put(ea_block_cache, ce); + return bh; } else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) { *pce = ce; return bh; |